1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.classify.feature;
23
24 import java.util.Iterator;
25
26 import org.apache.commons.collections.buffer.CircularFifoBuffer;
27 import org.apache.commons.lang.builder.ToStringBuilder;
28 import org.dom4j.Element;
29
30 import de.fu_berlin.ties.TiesConfiguration;
31 import de.fu_berlin.ties.io.ObjectElement;
32 import de.fu_berlin.ties.util.Util;
33
34 /***
35 * Transforms a feature vector using a simple implementation of the <em>sparse
36 * binary polynomial hashing (SBPH)</em> technique introduced by
37 * <a href="http://crm114.sourceforge.net/">CRM114</a>. This transformer
38 * discard all comment-only features (indeed all comments). It slides of window
39 * of {@linkplain #getLength() length <em>N</em>} over the remaining original
40 * features. For each window position, it counts in binary from 1 to
41 * 2<sup><em>N</em></sup>. For each odd number, a joint feature is generated
42 * where original features at "1" positions are visible and original features at
43 * "0" positions are hidden. Separators prior to the first feature are
44 * discarded, but all inner separators are kept. E.g. if <em>N</em>=3 and a
45 * pipe character "|" is used as {@linkplain #getSeparator() separator}, from
46 * the original features "a", "b", "c", four joint features will be generated
47 * at the last position: "c" (binary 1=001), "b|c" (binary 3=011), "a||c"
48 * (binary 5=101), "a|b|c" (binary 7=111). Thus 2<sup><em>N</em>-1</sup>
49 * joint features are generated for each original (non-comment) feature (except
50 * for the very first features).
51 *
52 * <p>Instances of this class are thread-safe.
53 *
54 * @author Christian Siefkes
55 * @version $Revision: 1.10 $, $Date: 2006/10/21 16:03:57 $, $Author: siefkes $
56 */
57 public class SBPHTransformer extends FeatureTransformer {
58
59 /***
60 * The {@linkplain #getSeparator() separator} used by default (a space
61 * character).
62 */
63 public static final String DEFAULT_SEPARATOR = " ";
64
65 /***
66 * Helper method for initializing the array of pre-calculated bit values.
67 *
68 * @param length the maximum number of original features joined
69 * @return the array of pre-calculated bit values to use
70 */
71 private static int[] initBitValues(final int length) {
72
73 final int[] result = new int[length];
74 int bitVal = 1;
75
76 for (int i = 0; i < length; i++) {
77 result[i] = bitVal;
78 bitVal *= 2;
79 }
80 return result;
81 }
82
83
84 /***
85 * The maximum number of original features joined.
86 */
87 private final int length;
88
89 /***
90 * The string used to separate original features (by default a space
91 * character). This string <strong>should never occur</strong> within
92 * original features.
93 */
94 private final String separator;
95
96 /***
97 * Stores bit values up to {@linkplain #getLength() length <em>N</em>},
98 * for bit operations: 1, 2, 4, 8, 16 etc. The <em>n</em>-th element in this
99 * vector contains the value 2<sup>n+1</sup>.
100 */
101 private final int[] bitValues;
102
103 /***
104 * The maximum number of new features to generate for each position:
105 * 2<sup>(<em>{@linkplain #getLength() N} - 1</em>)</sup>.
106 */
107 private final int maxNumber;
108
109 /***
110 * Creates a new instance from an XML element, fulfilling the
111 * recommandation of the {@link de.fu_berlin.ties.io.XMLStorable} interface.
112 *
113 * @param element the XML element containing the serialized representation
114 * @throws InstantiationException if the given element does not contain
115 * a valid transformer description
116 */
117 public SBPHTransformer(final Element element)
118 throws InstantiationException {
119
120 super(element);
121 length =
122 Util.asInt(element.attributeValue(OSBTransformer.ATTRIB_LENGTH));
123 separator = element.attributeValue(OSBTransformer.ATTRIB_SEPARATOR);
124 bitValues = initBitValues(length);
125 maxNumber = numFeatures(length);
126 }
127
128 /***
129 * Creates a new instance.
130 *
131 * @param precTrans the preceding transformer to use if this transformer
132 * is part of a <em>chain</em>; <code>null</code> otherwise
133 * @param len the maximum number of original features joined
134 * @param sep the string used to separate original features -- this string
135 * <strong>should never occur</strong> within original features
136 */
137 public SBPHTransformer(final FeatureTransformer precTrans, final int len,
138 final String sep) {
139 super(precTrans);
140 length = len;
141 separator = sep;
142 bitValues = initBitValues(length);
143 maxNumber = numFeatures(length);
144 }
145
146 /***
147 * Creates a new instance.
148 *
149 * @param precTrans the preceding transformer to use if this transformer
150 * is part of a <em>chain</em>; <code>null</code> otherwise
151 * @param config used to configure this instance
152 */
153 public SBPHTransformer(final FeatureTransformer precTrans,
154 final TiesConfiguration config) {
155 this(precTrans, config.getInt("transformer.sbph.length"),
156 config.getString("transformer.sbph.separator", DEFAULT_SEPARATOR));
157 }
158
159 /***
160 * {@inheritDoc}
161 */
162 protected FeatureVector doTransform(final FeatureVector orgFeatures) {
163 final FeatureVector result = new DefaultFeatureVector();
164 final Iterator orgIter = orgFeatures.iterator();
165 final CircularFifoBuffer lastNFeatureReps =
166 new CircularFifoBuffer(length);
167 Iterator bufferIter;
168 int bufferSize;
169 String orgRep;
170 Feature orgF;
171 StringBuilder[] newReps;
172 int numNew;
173 int currentBit;
174 int i;
175
176 while (orgIter.hasNext()) {
177 orgF = (Feature) orgIter.next();
178 orgRep = orgF.getRepresentation();
179
180
181 if (orgRep != null) {
182 lastNFeatureReps.add(orgRep);
183 bufferSize = lastNFeatureReps.size();
184
185
186 if (bufferSize == length) {
187
188 numNew = maxNumber;
189 } else {
190 numNew = numFeatures(bufferSize);
191 }
192
193
194 currentBit = bitValues[bufferSize - 1];
195 newReps = new StringBuilder[numNew];
196 bufferIter = lastNFeatureReps.iterator();
197 boolean bufferHasNext = bufferIter.hasNext();
198
199
200 while (bufferHasNext) {
201 orgRep = (String) bufferIter.next();
202
203 for (i = 0; i < newReps.length; i++) {
204
205 if ((((2 * i) + 1) & currentBit) > 0) {
206
207 if (newReps[i] == null) {
208 newReps[i] = new StringBuilder(orgRep);
209 } else {
210 newReps[i].append(orgRep);
211 }
212 }
213
214 bufferHasNext = bufferIter.hasNext();
215
216
217
218 if ((newReps[i] != null) && bufferHasNext) {
219 newReps[i].append(separator);
220 }
221 }
222
223 currentBit /= 2;
224 }
225
226
227 for (i = 0; i < newReps.length; i++) {
228 result.add(new DefaultFeature(newReps[i].toString()));
229 }
230 }
231 }
232 return result;
233 }
234
235 /***
236 * Returns the maximum number of original features joined.
237 * @return the value of the attribute
238 */
239 public int getLength() {
240 return length;
241 }
242
243 /***
244 * Returns the string used to separate original features (by default a space
245 * character). This string <strong>should never occur</strong> within
246 * original features.
247 *
248 * @return the value of the attribute
249 */
250 public String getSeparator() {
251 return separator;
252 }
253
254 /***
255 * Returns the number of jointed features to generate for a given number of
256 * features to join.
257 *
258 * @param n the number of features to join
259 * @return 2<sup>(<em>n</em>-1)</sup>
260 */
261 private int numFeatures(final int n) {
262 return (int) Math.pow(2, n - 1);
263 }
264
265 /***
266 * {@inheritDoc}
267 */
268 public ObjectElement toElement() {
269
270 final ObjectElement result = super.toElement();
271 result.addAttribute(OSBTransformer.ATTRIB_LENGTH,
272 Integer.toString(length));
273 result.addAttribute(OSBTransformer.ATTRIB_SEPARATOR, separator);
274 return result;
275 }
276
277 /***
278 * Returns a string representation of this object.
279 *
280 * @return a textual representation
281 */
282 public String toString() {
283 return new ToStringBuilder(this)
284 .appendSuper(super.toString())
285 .append("length", length)
286 .append("separator", separator)
287 .toString();
288 }
289
290 }