View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.classify.feature;
23  
24  import java.util.Iterator;
25  
26  import org.apache.commons.collections.buffer.CircularFifoBuffer;
27  import org.apache.commons.lang.builder.ToStringBuilder;
28  
29  import de.fu_berlin.ties.TiesConfiguration;
30  
31  /***
32   * Transforms a feature vector using a simple implementation of the <em>sparse
33   * binary polynomial hashing (SBPH)</em> technique introduced by
34   * <a href="http://crm114.sourceforge.net/">CRM114</a>. This transformer
35   * discard all comment-only features (indeed all comments). It slides of window
36   * of {@linkplain #getLength() length <em>N</em>} over the remaining original
37   * features. For each window position, it counts in binary from 1 to
38   * 2<sup><em>N</em></sup>. For each odd number, a joint feature is generated
39   * where original features at "1" positions are visible and original features at
40   * "0" positions are hidden. Separators prior to the first feature are
41   * discarded, but all inner separators are kept. E.g. if <em>N</em>=3 and a
42   * pipe character "|" is used as {@linkplain #getSeparator() separator}, from
43   * the original features "a", "b", "c", four joint features will be generated
44   * at the last position: "c" (binary 1=001), "b|c" (binary 3=011), "a||c"
45   * (binary 5=101), "a|b|c" (binary 7=111). Thus 2<sup><em>N</em>-1</sup>
46   * joint features are generated for each original (non-comment) feature (except
47   * for the very first features).
48   *
49   * <p>Instances of this class are thread-safe.
50   *
51   * @author Christian Siefkes
52   * @version $Revision: 1.3 $, $Date: 2004/08/23 17:08:21 $, $Author: siefkes $
53   */
54  public class SBPHTransformer extends FeatureTransformer {
55  
56      /***
57       * The {@linkplain #getSeparator() separator} used by default (a space
58       * character).
59       */
60      public static final String DEFAULT_SEPARATOR = " ";
61  
62      /***
63       * The maximum number of original features joined.
64       */
65      private final int length;
66  
67      /***
68       * The string used to separate original features (by default a space
69       * character). This string <strong>should never occur</strong> within
70       * original features.
71       */
72      private final String separator;
73  
74      /***
75       * Stores bit values up to {@linkplain #getLength() length <em>N</em>},
76       * for bit operations: 1, 2, 4, 8, 16 etc. The <em>n</em>-th element in this
77       * vector contains the value 2<sup>n+1</sup>.
78       */
79      private final int[] bitValues;
80  
81      /***
82       * The maximum number of new features to generate for each position:
83       * 2<sup>(<em>{@linkplain #getLength() N} - 1</em>)</sup>.
84       */
85      private final int maxNumber;
86  
87      /***
88       * Creates a new instance.
89       *
90       * @param precTrans the preceding transformer to use if this transformer
91       * is part of a <em>chain</em>; <code>null</code> otherwise
92       * @param len the maximum number of original features joined
93       * @param sep the string used to separate original features -- this string
94       * <strong>should never occur</strong> within original features
95       */
96      public SBPHTransformer(final FeatureTransformer precTrans, final int len,
97              final String sep) {
98          super(precTrans);
99          length = len;
100         separator = sep;
101 
102         // pre-calculate values for bit operations (1, 2, 4, 8...)
103         bitValues = new int[length];
104         int bitVal = 1;
105 
106         for (int i = 0; i < length; i++) {
107             bitValues[i] = bitVal;
108             bitVal *= 2;
109         }
110 
111         // calculate number of features to generate at each position
112         maxNumber = numFeatures(length);
113     }
114 
115     /***
116      * Creates a new instance.
117      *
118      * @param precTrans the preceding transformer to use if this transformer
119      * is part of a <em>chain</em>; <code>null</code> otherwise
120      * @param config used to configure this instance
121      */
122     public SBPHTransformer(final FeatureTransformer precTrans,
123             final TiesConfiguration config) {
124         this(precTrans, config.getInt("transformer.sbph.length"),
125             config.getString("transformer.sbph.separator", DEFAULT_SEPARATOR));
126     }
127 
128     /***
129      * {@inheritDoc}
130      */
131     protected FeatureVector doTransform(final FeatureVector orgFeatures) {
132         final FeatureVector result = new DefaultFeatureVector();
133         final Iterator orgIter = orgFeatures.iterator();
134         final CircularFifoBuffer lastNFeatureReps =
135             new CircularFifoBuffer(length);
136         Iterator bufferIter;
137         int bufferSize;
138         String orgRep;
139         Feature orgF;
140         StringBuffer[] newReps;
141         int numNew;
142         int currentBit;
143         int i;
144 
145         while (orgIter.hasNext()) {
146             orgF = (Feature) orgIter.next();
147             orgRep = orgF.getRepresentation();
148 
149             // ignore comment-only features, keep only representations
150             if (orgRep != null) {
151                 lastNFeatureReps.add(orgRep);
152                 bufferSize = lastNFeatureReps.size();
153 
154                 // how many features to generate?
155                 if (bufferSize == length) {
156                     // use pre-calculated value
157                     numNew = maxNumber;
158                 } else {
159                     numNew = numFeatures(bufferSize);
160                 }
161 
162                 // count bits from bufferSize down to 1
163                 currentBit = bitValues[bufferSize - 1];
164                 newReps = new StringBuffer[numNew];
165                 bufferIter = lastNFeatureReps.iterator();
166                 boolean bufferHasNext = bufferIter.hasNext();
167 
168                 // create polynomial feature representations, all in parallel
169                 while (bufferHasNext) {
170                     orgRep = (String) bufferIter.next();
171 
172                     for (i = 0; i < newReps.length; i++) {
173                     // i-th buffer corresponds to binary number 2i+1
174                         if ((((2 * i) + 1) & currentBit) > 0) {
175                             // bit is set: add feature representation
176                             if (newReps[i] == null) {
177                                 newReps[i] = new StringBuffer(orgRep);
178                             } else {
179                                 newReps[i].append(orgRep);
180                             }
181                         }
182 
183                         bufferHasNext = bufferIter.hasNext();
184 
185                         // add separator, except prior to first included feature
186                         // and after last feature
187                         if ((newReps[i] != null) && bufferHasNext) {
188                             newReps[i].append(separator);
189                         }
190                     }
191 
192                     currentBit /= 2;
193                 }
194 
195                 // append created features to result vector
196                 for (i = 0; i < newReps.length; i++) {
197                     result.add(new DefaultFeature(newReps[i].toString()));
198                 }
199             }
200         }
201         return result;
202     }
203 
204     /***
205      * Returns the maximum number of original features joined.
206      * @return the value of the attribute
207      */
208     public int getLength() {
209         return length;
210     }
211 
212     /***
213      * Returns the string used to separate original features (by default a space
214      * character). This string <strong>should never occur</strong> within
215      * original features.
216      *
217      * @return the value of the attribute
218      */
219     public String getSeparator() {
220         return separator;
221     }
222 
223     /***
224      * Returns the number of jointed features to generate for a given number of
225      * features to join.
226      *
227      * @param n the number of features to join
228      * @return 2<sup>(<em>n</em>-1)</sup>
229      */
230     private int numFeatures(final int n) {
231         return (int) Math.pow(2, n - 1);
232     }
233 
234     /***
235      * Returns a string representation of this object.
236      *
237      * @return a textual representation
238      */
239     public String toString() {
240         return new ToStringBuilder(this)
241             .appendSuper(super.toString())
242             .append("length", length)
243             .append("separator", separator)
244             .toString();
245     }
246 
247 }