1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.classify.feature;
23
24 import java.util.Iterator;
25
26 import org.apache.commons.collections.buffer.CircularFifoBuffer;
27 import org.apache.commons.lang.builder.ToStringBuilder;
28
29 import de.fu_berlin.ties.TiesConfiguration;
30
31 /***
32 * Transforms a feature vector using a simple implementation of the <em>sparse
33 * binary polynomial hashing (SBPH)</em> technique introduced by
34 * <a href="http://crm114.sourceforge.net/">CRM114</a>. This transformer
35 * discard all comment-only features (indeed all comments). It slides of window
36 * of {@linkplain #getLength() length <em>N</em>} over the remaining original
37 * features. For each window position, it counts in binary from 1 to
38 * 2<sup><em>N</em></sup>. For each odd number, a joint feature is generated
39 * where original features at "1" positions are visible and original features at
40 * "0" positions are hidden. Separators prior to the first feature are
41 * discarded, but all inner separators are kept. E.g. if <em>N</em>=3 and a
42 * pipe character "|" is used as {@linkplain #getSeparator() separator}, from
43 * the original features "a", "b", "c", four joint features will be generated
44 * at the last position: "c" (binary 1=001), "b|c" (binary 3=011), "a||c"
45 * (binary 5=101), "a|b|c" (binary 7=111). Thus 2<sup><em>N</em>-1</sup>
46 * joint features are generated for each original (non-comment) feature (except
47 * for the very first features).
48 *
49 * <p>Instances of this class are thread-safe.
50 *
51 * @author Christian Siefkes
52 * @version $Revision: 1.3 $, $Date: 2004/08/23 17:08:21 $, $Author: siefkes $
53 */
54 public class SBPHTransformer extends FeatureTransformer {
55
56 /***
57 * The {@linkplain #getSeparator() separator} used by default (a space
58 * character).
59 */
60 public static final String DEFAULT_SEPARATOR = " ";
61
62 /***
63 * The maximum number of original features joined.
64 */
65 private final int length;
66
67 /***
68 * The string used to separate original features (by default a space
69 * character). This string <strong>should never occur</strong> within
70 * original features.
71 */
72 private final String separator;
73
74 /***
75 * Stores bit values up to {@linkplain #getLength() length <em>N</em>},
76 * for bit operations: 1, 2, 4, 8, 16 etc. The <em>n</em>-th element in this
77 * vector contains the value 2<sup>n+1</sup>.
78 */
79 private final int[] bitValues;
80
81 /***
82 * The maximum number of new features to generate for each position:
83 * 2<sup>(<em>{@linkplain #getLength() N} - 1</em>)</sup>.
84 */
85 private final int maxNumber;
86
87 /***
88 * Creates a new instance.
89 *
90 * @param precTrans the preceding transformer to use if this transformer
91 * is part of a <em>chain</em>; <code>null</code> otherwise
92 * @param len the maximum number of original features joined
93 * @param sep the string used to separate original features -- this string
94 * <strong>should never occur</strong> within original features
95 */
96 public SBPHTransformer(final FeatureTransformer precTrans, final int len,
97 final String sep) {
98 super(precTrans);
99 length = len;
100 separator = sep;
101
102
103 bitValues = new int[length];
104 int bitVal = 1;
105
106 for (int i = 0; i < length; i++) {
107 bitValues[i] = bitVal;
108 bitVal *= 2;
109 }
110
111
112 maxNumber = numFeatures(length);
113 }
114
115 /***
116 * Creates a new instance.
117 *
118 * @param precTrans the preceding transformer to use if this transformer
119 * is part of a <em>chain</em>; <code>null</code> otherwise
120 * @param config used to configure this instance
121 */
122 public SBPHTransformer(final FeatureTransformer precTrans,
123 final TiesConfiguration config) {
124 this(precTrans, config.getInt("transformer.sbph.length"),
125 config.getString("transformer.sbph.separator", DEFAULT_SEPARATOR));
126 }
127
128 /***
129 * {@inheritDoc}
130 */
131 protected FeatureVector doTransform(final FeatureVector orgFeatures) {
132 final FeatureVector result = new DefaultFeatureVector();
133 final Iterator orgIter = orgFeatures.iterator();
134 final CircularFifoBuffer lastNFeatureReps =
135 new CircularFifoBuffer(length);
136 Iterator bufferIter;
137 int bufferSize;
138 String orgRep;
139 Feature orgF;
140 StringBuffer[] newReps;
141 int numNew;
142 int currentBit;
143 int i;
144
145 while (orgIter.hasNext()) {
146 orgF = (Feature) orgIter.next();
147 orgRep = orgF.getRepresentation();
148
149
150 if (orgRep != null) {
151 lastNFeatureReps.add(orgRep);
152 bufferSize = lastNFeatureReps.size();
153
154
155 if (bufferSize == length) {
156
157 numNew = maxNumber;
158 } else {
159 numNew = numFeatures(bufferSize);
160 }
161
162
163 currentBit = bitValues[bufferSize - 1];
164 newReps = new StringBuffer[numNew];
165 bufferIter = lastNFeatureReps.iterator();
166 boolean bufferHasNext = bufferIter.hasNext();
167
168
169 while (bufferHasNext) {
170 orgRep = (String) bufferIter.next();
171
172 for (i = 0; i < newReps.length; i++) {
173
174 if ((((2 * i) + 1) & currentBit) > 0) {
175
176 if (newReps[i] == null) {
177 newReps[i] = new StringBuffer(orgRep);
178 } else {
179 newReps[i].append(orgRep);
180 }
181 }
182
183 bufferHasNext = bufferIter.hasNext();
184
185
186
187 if ((newReps[i] != null) && bufferHasNext) {
188 newReps[i].append(separator);
189 }
190 }
191
192 currentBit /= 2;
193 }
194
195
196 for (i = 0; i < newReps.length; i++) {
197 result.add(new DefaultFeature(newReps[i].toString()));
198 }
199 }
200 }
201 return result;
202 }
203
204 /***
205 * Returns the maximum number of original features joined.
206 * @return the value of the attribute
207 */
208 public int getLength() {
209 return length;
210 }
211
212 /***
213 * Returns the string used to separate original features (by default a space
214 * character). This string <strong>should never occur</strong> within
215 * original features.
216 *
217 * @return the value of the attribute
218 */
219 public String getSeparator() {
220 return separator;
221 }
222
223 /***
224 * Returns the number of jointed features to generate for a given number of
225 * features to join.
226 *
227 * @param n the number of features to join
228 * @return 2<sup>(<em>n</em>-1)</sup>
229 */
230 private int numFeatures(final int n) {
231 return (int) Math.pow(2, n - 1);
232 }
233
234 /***
235 * Returns a string representation of this object.
236 *
237 * @return a textual representation
238 */
239 public String toString() {
240 return new ToStringBuilder(this)
241 .appendSuper(super.toString())
242 .append("length", length)
243 .append("separator", separator)
244 .toString();
245 }
246
247 }