1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.classify.feature;
23
24 import java.util.Iterator;
25
26 import org.apache.commons.collections.buffer.CircularFifoBuffer;
27 import org.apache.commons.lang.builder.ToStringBuilder;
28
29 import de.fu_berlin.ties.TiesConfiguration;
30 import de.fu_berlin.ties.util.CollectionUtils;
31 import de.fu_berlin.ties.util.Util;
32
33 /***
34 * Transforms a feature vector using the <em>orthogonal sparse bigrams
35 * (OSB)</em> technique developed by Fidelis Assis. This transformer
36 * discard all comment-only features. It slides of window of
37 * {@linkplain #getLength() length <em>N</em>} over the remaining original
38 * features. At each window position it generates <em>N</em>-1 joint features
39 * as exemplified above (assumping the pipe character "|" is used as
40 * {@linkplain #getSeparator() separator} and <em>N</em>=5:
41 *
42 * <pre>
43 * - - - w4 | w5
44 * - - w3 | | w5
45 * - w2 | | | w5
46 * w1 | | | | w5
47 * </pre>
48 *
49 * <p>If {@link #isPreserving()}, the original features are preserved as well;
50 * otherwise they are discarded.
51 *
52 * <p>Instances of this class are thread-safe.
53 *
54 * @author Christian Siefkes
55 * @version $Revision: 1.6 $, $Date: 2004/12/06 17:57:30 $, $Author: siefkes $
56 */
57 public class OSBTransformer extends FeatureTransformer {
58
59 /***
60 * The maximum number of original features joined.
61 */
62 private final int length;
63
64 /***
65 * Whether to preserve the original features as well or only to use joint
66 * features.
67 */
68 private final boolean preserving;
69
70 /***
71 * The string used to separate original features (by default a space
72 * character). This string <strong>should never occur</strong> within
73 * original features.
74 */
75 private final String separator;
76
77 /***
78 * Array of strength values used for bigrams with different distances.
79 */
80 private final float[] strengths;
81
82 /***
83 * Strength value used for unigrams (single tokens) if they are
84 * {@linkplain #isPreserving() preserved}.
85 */
86 private final float unigramStrength;
87
88 /***
89 * Creates a new instance.
90 *
91 * @param precTrans the preceding transformer to use if this transformer
92 * is part of a <em>chain</em>; <code>null</code> otherwise
93 * @param len the maximum number of original features joined; minimum value
94 * is 2
95 * @param sep the string used to separate original features -- this string
96 * <strong>should never occur</strong> within original features
97 * @param preserve whether to preserve the original features as well or
98 * only to use joint features
99 * @param strengthArray Array of strength values used for bigrams with
100 * different distances
101 * @param singleTokenStrength Strength value used for unigrams (single
102 * tokens); ignored if <code>preserve</code> is <code>false</code>
103 * @throws IllegalArgumentException if <code>len < 2</code> or if
104 * <code>strengthArray</code> is empty
105 */
106 public OSBTransformer(final FeatureTransformer precTrans, final int len,
107 final String sep, final boolean preserve,
108 final float[] strengthArray,
109 final float singleTokenStrength)
110 throws IllegalArgumentException {
111 super(precTrans);
112
113
114 if (len < 2) {
115 throw new IllegalArgumentException(
116 "OSB length must be at least 2:" + len);
117 }
118 length = len;
119 separator = sep;
120 preserving = preserve;
121 unigramStrength = singleTokenStrength;
122
123
124 if (strengthArray.length >= len - 1) {
125 strengths = strengthArray;
126 } else if (strengthArray.length > 0) {
127 strengths = new float[len - 1];
128 int i = 0;
129
130 while (i < strengthArray.length) {
131 strengths[i] = strengthArray[i];
132 i++;
133 }
134
135 while (i < strengths.length) {
136
137 strengths[i] = strengthArray[strengthArray.length - 1];
138 i++;
139 }
140
141 if (strengthArray.length > 1) {
142 Util.LOG.warn("Strength array has only " + strengthArray.length
143 + " elements instead of " + len
144 + " ones -- re-using the last one");
145 } else {
146 Util.LOG.debug("Using uniform strength " + strengthArray[0]
147 + " for all bigrams");
148 }
149 } else {
150 throw new IllegalArgumentException(
151 "Empty strength array");
152 }
153 }
154
155 /***
156 * Creates a new instance.
157 *
158 * @param precTrans the preceding transformer to use if this transformer
159 * is part of a <em>chain</em>; <code>null</code> otherwise
160 * @param config used to configure this instance
161 */
162 public OSBTransformer(final FeatureTransformer precTrans,
163 final TiesConfiguration config) {
164 this(precTrans, config.getInt("transformer.osb.length"),
165 config.getString("transformer.osb.separator",
166 SBPHTransformer.DEFAULT_SEPARATOR),
167 config.getBoolean("transformer.osb.preserve"),
168 CollectionUtils.asFloatArray(config.getStringArray(
169 "transformer.osb.strengths")),
170 config.getFloat("transformer.osb.strength.unigram"));
171 }
172
173 /***
174 * {@inheritDoc}
175 */
176 protected FeatureVector doTransform(final FeatureVector orgFeatures) {
177 final FeatureVector result = new DefaultFeatureVector();
178 final Iterator orgIter = orgFeatures.iterator();
179 final CircularFifoBuffer priorFeatureReps =
180 new CircularFifoBuffer(length - 1);
181 Iterator bufferIter;
182 String orgRep;
183 Feature orgF;
184 StringBuffer newRep = new StringBuffer();
185 int distance;
186 int i;
187
188 while (orgIter.hasNext()) {
189 orgF = (Feature) orgIter.next();
190 orgRep = orgF.getRepresentation();
191
192
193 if (orgRep != null) {
194
195 if (preserving) {
196 if (orgF.getStrength() == unigramStrength) {
197 result.add(orgF);
198 } else {
199
200 result.add(new DefaultFeature(orgF.getRepresentation(),
201 orgF.getComment(), unigramStrength));
202 }
203 }
204
205 distance = priorFeatureReps.size();
206 bufferIter = priorFeatureReps.iterator();
207
208
209 while (bufferIter.hasNext()) {
210
211 newRep.setLength(0);
212 newRep.append(bufferIter.next());
213
214
215 for (i = 0; i < distance; i++) {
216 newRep.append(separator);
217 }
218
219
220
221
222 distance--;
223
224 newRep.append(orgRep);
225
226 result.add(new DefaultFeature(newRep.toString(), null,
227 strengths[distance]));
228 }
229
230
231 priorFeatureReps.add(orgRep);
232 }
233 }
234
235
236
237
238
239 return result;
240 }
241
242 /***
243 * Returns the maximum number of original features joined.
244 * @return the value of the attribute
245 */
246 public int getLength() {
247 return length;
248 }
249
250 /***
251 * Returns the string used to separate original features (by default a space
252 * character). This string <strong>should never occur</strong> within
253 * original features.
254 *
255 * @return the value of the attribute
256 */
257 public String getSeparator() {
258 return separator;
259 }
260
261 /***
262 * Whether original features are preserved as well in addition to the
263 * generated joint features.
264 *
265 * @return the value of the attribute
266 */
267 public boolean isPreserving() {
268 return preserving;
269 }
270
271 /***
272 * Returns a string representation of this object.
273 *
274 * @return a textual representation
275 */
276 public String toString() {
277 final ToStringBuilder builder = new ToStringBuilder(this)
278 .appendSuper(super.toString())
279 .append("length", length)
280 .append("separator", separator)
281 .append("preserving originals", preserving)
282 .append("strengths", strengths);
283
284 if (preserving) {
285 builder.append("unigram strength", unigramStrength);
286 }
287 return builder.toString();
288 }
289
290 }