1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.classify.feature;
23
24 import java.util.Iterator;
25
26 import org.apache.commons.collections.buffer.CircularFifoBuffer;
27 import org.apache.commons.lang.builder.ToStringBuilder;
28 import org.dom4j.Element;
29 import org.dom4j.QName;
30
31 import de.fu_berlin.ties.TiesConfiguration;
32 import de.fu_berlin.ties.io.ObjectElement;
33 import de.fu_berlin.ties.util.Util;
34 import de.fu_berlin.ties.xml.dom.DOMUtils;
35
36 /***
37 * Transforms a feature vector using the <em>orthogonal sparse bigrams
38 * (OSB)</em> technique developed by Fidelis Assis. This transformer
39 * discard all comment-only features. It slides of window of
40 * {@linkplain #getLength() length <em>N</em>} over the remaining original
41 * features. At each window position it generates <em>N</em>-1 joint features
42 * as exemplified above (assumping the pipe character "|" is used as
43 * {@linkplain #getSeparator() separator} and <em>N</em>=5:
44 *
45 * <pre>
46 * - - - w4 | w5
47 * - - w3 | | w5
48 * - w2 | | | w5
49 * w1 | | | | w5
50 * </pre>
51 *
52 * <p>If {@link #isPreserving()}, the original features are preserved as well;
53 * otherwise they are discarded.
54 *
55 * <p>Instances of this class are thread-safe.
56 *
57 * @author Christian Siefkes
58 * @version $Revision: 1.19 $, $Date: 2006/10/21 16:03:57 $, $Author: siefkes $
59 */
60 public class OSBTransformer extends FeatureTransformer {
61
62 /***
63 * Attribute name used for XML serialization.
64 */
65 static final QName ATTRIB_LENGTH = DOMUtils.defaultName("length");
66
67 /***
68 * Attribute name used for XML serialization.
69 */
70 private static final QName ATTRIB_PRESERVE =
71 DOMUtils.defaultName("preserve");
72
73 /***
74 * Attribute name used for XML serialization.
75 */
76 static final QName ATTRIB_SEPARATOR = DOMUtils.defaultName("separator");
77
78 /***
79 * Attribute name used for XML serialization.
80 */
81
82
83
84 /***
85 * Attribute name used for XML serialization.
86 */
87
88
89
90
91 /***
92 * Helper method that checks that the value to be used for the
93 * {@linkplain #getLength() length} attribute is valid.
94 *
95 * @param len to length value to check
96 * @throws IllegalArgumentException if <code>len < 2</code>
97 */
98 private static void ensureLength(final int len)
99 throws IllegalArgumentException {
100 if (len < 2) {
101 throw new IllegalArgumentException(
102 "OSB length must be at least 2:" + len);
103 }
104 }
105
106 /***
107 * Helper method for initializing the array of separator sequences.
108 *
109 * @param separator the separator string to use
110 * @param len the maximum number of original features joined.
111 * @return the initialized array of separator sequences
112 */
113 private static String[] initSeparators(final String separator,
114 final int len) {
115
116 final String[] result = new String[len - 1];
117 final StringBuilder sepBuilder = new StringBuilder();
118
119 for (int i = 0; i < result.length; i++) {
120
121 sepBuilder.append(separator);
122 result[i] = sepBuilder.toString();
123 }
124
125 return result;
126 }
127
128 /***
129 * Helper method for initializing the array of strength values, ensuring
130 * there are sufficient values.
131 *
132 * @param rawStrengthArray the raw array of strength values
133 * @param len the length of the array to return
134 * @return the initialized array of actual strength values
135 * @throws IllegalArgumentException if <code>rawStrengthArray</code>
136 * is empty
137 */
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175 /***
176 * The maximum number of original features joined.
177 */
178 private final int length;
179
180 /***
181 * Whether to preserve the original features as well or only to use joint
182 * features.
183 */
184 private final boolean preserving;
185
186 /***
187 * The string used to separate original features (by default a space
188 * character). This string <strong>should never occur</strong> within
189 * original features.
190 */
191 private final String sep;
192
193 /***
194 * A prebuild array of separator sequences of various length. The 0th
195 * element contains {@link #sep} character(s) one time, the 1st two times
196 * etc.
197 */
198 private final String[] separators;
199
200 /***
201 * Array of strength values used for bigrams with different distances.
202 */
203
204
205 /***
206 * Strength value used for unigrams (single tokens) if they are
207 * {@linkplain #isPreserving() preserved}.
208 */
209
210
211
212 /***
213 * Creates a new instance from an XML element, fulfilling the
214 * recommandation of the {@link de.fu_berlin.ties.io.XMLStorable} interface.
215 *
216 * @param element the XML element containing the serialized representation
217 * @throws InstantiationException if the given element does not contain
218 * a valid transformer description
219 */
220 public OSBTransformer(final Element element)
221 throws InstantiationException {
222
223 super(element);
224 final int rawLength = Util.asInt(element.attributeValue(ATTRIB_LENGTH));
225 ensureLength(rawLength);
226 length = rawLength;
227 preserving = Util.asBoolean(element.attributeValue(ATTRIB_PRESERVE));
228 sep = element.attributeValue(ATTRIB_SEPARATOR);
229 separators = initSeparators(sep, length);
230
231
232
233
234
235
236 }
237
238 /***
239 * Creates a new instance.
240 *
241 * @param precTrans the preceding transformer to use if this transformer
242 * is part of a <em>chain</em>; <code>null</code> otherwise
243 * @param len the maximum number of original features joined; minimum value
244 * is 2
245 * @param sepString the string used to separate original features -- this
246 * string <strong>should never occur</strong> within original features
247 * @param preserve whether to preserve the original features as well or
248 * only to use joint features
249 * @throws IllegalArgumentException if <code>len < 2</code> or if
250 * <code>strengthArray</code> is empty
251 */
252 public OSBTransformer(final FeatureTransformer precTrans, final int len,
253 final String sepString, final boolean preserve)
254 throws IllegalArgumentException {
255 super(precTrans);
256
257
258 ensureLength(len);
259 length = len;
260 preserving = preserve;
261 sep = sepString;
262 separators = initSeparators(sep, length);
263
264
265
266
267
268
269
270
271 }
272
273 /***
274 * Creates a new instance.
275 *
276 * @param precTrans the preceding transformer to use if this transformer
277 * is part of a <em>chain</em>; <code>null</code> otherwise
278 * @param config used to configure this instance
279 */
280 public OSBTransformer(final FeatureTransformer precTrans,
281 final TiesConfiguration config) {
282 this(precTrans, config.getInt("transformer.osb.length"),
283 config.getString("transformer.osb.separator",
284 SBPHTransformer.DEFAULT_SEPARATOR),
285 config.getBoolean("transformer.osb.preserve")
286
287
288
289 );
290 }
291
292 /***
293 * {@inheritDoc}
294 */
295 protected FeatureVector doTransform(final FeatureVector orgFeatures) {
296 final FeatureVector result = new DefaultFeatureVector();
297 final Iterator orgIter = orgFeatures.iterator();
298 final CircularFifoBuffer priorFeatureReps =
299 new CircularFifoBuffer(length - 1);
300 Iterator bufferIter;
301 String orgRep;
302 final StringBuilder newRep = new StringBuilder();
303 Feature orgF;
304 int distance;
305
306
307 while (orgIter.hasNext()) {
308 orgF = (Feature) orgIter.next();
309 orgRep = orgF.getRepresentation();
310
311
312 if (orgRep != null) {
313
314 if (preserving) {
315
316 result.add(orgF);
317
318
319
320
321
322 }
323
324 distance = priorFeatureReps.size();
325 bufferIter = priorFeatureReps.iterator();
326
327
328 while (bufferIter.hasNext()) {
329
330 newRep.setLength(0);
331 newRep.append((String) bufferIter.next());
332
333
334
335 distance--;
336
337
338 newRep.append(separators[distance]);
339 newRep.append(orgRep);
340 result.add(new DefaultFeature(newRep.toString(), null));
341
342 }
343
344
345 priorFeatureReps.add(orgRep);
346 }
347
348
349 }
350
351
352
353
354
355
356
357 return result;
358 }
359
360 /***
361 * Returns the maximum number of original features joined.
362 * @return the value of the attribute
363 */
364 public int getLength() {
365 return length;
366 }
367
368 /***
369 * Returns the string used to separate original features (by default a space
370 * character). This string <strong>should never occur</strong> within
371 * original features.
372 *
373 * @return the value of the attribute
374 */
375 public String getSeparator() {
376 return sep;
377 }
378
379 /***
380 * Whether original features are preserved as well in addition to the
381 * generated joint features.
382 *
383 * @return the value of the attribute
384 */
385 public boolean isPreserving() {
386 return preserving;
387 }
388
389 /***
390 * {@inheritDoc}
391 */
392 public ObjectElement toElement() {
393
394 final ObjectElement result = super.toElement();
395 result.addAttribute(ATTRIB_LENGTH, Integer.toString(length));
396 result.addAttribute(ATTRIB_SEPARATOR, sep);
397 result.addAttribute(ATTRIB_PRESERVE, Boolean.toString(preserving));
398
399
400
401 return result;
402 }
403
404 /***
405 * Returns a string representation of this object.
406 *
407 * @return a textual representation
408 */
409 public String toString() {
410 final ToStringBuilder builder = new ToStringBuilder(this)
411 .appendSuper(super.toString())
412 .append("length", length)
413 .append("separator", sep)
414 .append("preserving originals", preserving);
415
416
417
418
419
420 return builder.toString();
421 }
422
423 }