/*
 * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
 * Development of this software is supported by the German Research Society,
 * Berlin-Brandenburg Graduate School in Distributed Information Systems
 * (DFG grant no. GRK 316).
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, visit
 * http://www.gnu.org/licenses/gpl.html or write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
22  package de.fu_berlin.ties.classify.feature;
23  
24  import java.util.Iterator;
25  
26  import org.apache.commons.collections.buffer.CircularFifoBuffer;
27  import org.apache.commons.lang.builder.ToStringBuilder;
28  import org.dom4j.Element;
29  import org.dom4j.QName;
30  
31  import de.fu_berlin.ties.TiesConfiguration;
32  import de.fu_berlin.ties.io.ObjectElement;
33  import de.fu_berlin.ties.util.Util;
34  import de.fu_berlin.ties.xml.dom.DOMUtils;
35  
36  /***
37   * Transforms a feature vector using the <em>orthogonal sparse bigrams
38   * (OSB)</em> technique developed by Fidelis Assis. This transformer
39   * discard all comment-only features. It slides of window of
40   * {@linkplain #getLength() length <em>N</em>} over the remaining original
41   * features. At each window position it generates <em>N</em>-1 joint features
42   * as exemplified above (assumping the pipe character "|" is used as
43   * {@linkplain #getSeparator() separator} and <em>N</em>=5:
44   *
45   * <pre>
46   *    -   -   -  w4 | w5
47   *    -   -   w3  | | w5
48   *    -  w2   |   | | w5
49   *   w1   |   |   | | w5
50   * </pre>
51   *
52   * <p>If {@link #isPreserving()}, the original features are preserved as well;
53   * otherwise they are discarded.
54   *
55   * <p>Instances of this class are thread-safe.
56   *
57   * @author Christian Siefkes
58   * @version $Revision: 1.19 $, $Date: 2006/10/21 16:03:57 $, $Author: siefkes $
59   */
60  public class OSBTransformer extends FeatureTransformer {
61  
62      /***
63       * Attribute name used for XML serialization.
64       */
65      static final QName ATTRIB_LENGTH = DOMUtils.defaultName("length");
66  
67      /***
68       * Attribute name used for XML serialization.
69       */
70      private static final QName ATTRIB_PRESERVE =
71          DOMUtils.defaultName("preserve");
72  
73      /***
74       * Attribute name used for XML serialization.
75       */
76      static final QName ATTRIB_SEPARATOR = DOMUtils.defaultName("separator");
77  
78      /***
79       * Attribute name used for XML serialization.
80       */
81  /*    private static final QName ATTRIB_STRENGTHS =
82          DOMUtils.defaultName("strengths"); */
83  
84      /***
85       * Attribute name used for XML serialization.
86       */
87  /*    private static final QName ATTRIB_UNIGRAM_STRENGTH =
88          DOMUtils.defaultName("unigramStrength"); */
89  
90  
91      /***
92       * Helper method that checks that the value to be used for the
93       * {@linkplain #getLength() length} attribute is valid.
94       *
95       * @param len to length value to check
96       * @throws IllegalArgumentException if <code>len &lt; 2</code>
97       */
98      private static void ensureLength(final int len)
99      throws IllegalArgumentException {
100         if (len < 2) {
101             throw new IllegalArgumentException(
102                 "OSB length must be at least 2:" + len);
103         }
104     }
105 
106     /***
107      * Helper method for initializing the array of separator sequences.
108      *
109      * @param separator the separator string to use
110      * @param len the maximum number of original features joined.
111      * @return the initialized array of separator sequences
112      */
113     private static String[] initSeparators(final String separator,
114             final int len) {
115         // build array of separator sequences
116         final String[] result = new String[len - 1];
117         final StringBuilder sepBuilder = new StringBuilder();
118 
119         for (int i = 0; i < result.length; i++) {
120             // append additional instance of sep char
121             sepBuilder.append(separator);
122             result[i] = sepBuilder.toString();
123         }
124 
125         return result;
126     }
127 
128     /***
129      * Helper method for initializing the array of strength values, ensuring
130      * there are sufficient values.
131      *
132      * @param rawStrengthArray the raw array of strength values
133      * @param len the length of the array to return
134      * @return the initialized array of actual strength values
135      * @throws IllegalArgumentException if <code>rawStrengthArray</code>
136      * is empty
137      */
138 /*    private static float[] initStrengths(final float[] rawStrengthArray,
139             final int len) throws IllegalArgumentException {
140         // store strengh array, 
141         final float[] result;
142         if (rawStrengthArray.length >= len - 1) {
143             result = rawStrengthArray;
144         } else if (rawStrengthArray.length > 0) {
145             result = new float[len - 1];
146             int i = 0;
147 
148             while (i < rawStrengthArray.length) {
149                 result[i] = rawStrengthArray[i];
150                 i++;
151             }
152 
153             while (i < result.length) {
154                 // re-use last specified value for rest of array
155                 result[i] = rawStrengthArray[rawStrengthArray.length - 1];
156                 i++;
157             }
158 
159             if (rawStrengthArray.length > 1) {
160                 Util.LOG.warn("Strength array has only "
161                         + rawStrengthArray.length + " elements instead of "
162                         + len + " ones -- re-using the last one");
163             } else {
164                 Util.LOG.debug("Using uniform strength " + rawStrengthArray[0]
165                         + " for all bigrams");
166             }
167         } else {
168             throw new IllegalArgumentException(
169                 "Empty strength array");
170         }
171         return result;
172     } */
173 
174 
175     /***
176      * The maximum number of original features joined.
177      */
178     private final int length;
179 
180     /***
181      * Whether to preserve the original features as well or only to use joint
182      * features.
183      */
184     private final boolean preserving;
185 
186     /***
187      * The string used to separate original features (by default a space
188      * character). This string <strong>should never occur</strong> within
189      * original features.
190      */
191     private final String sep;
192 
193     /***
194      * A prebuild array of separator sequences of various length. The 0th
195      * element contains {@link #sep} character(s) one time, the 1st two times
196      * etc.
197      */
198     private final String[] separators;
199 
200     /***
201      * Array of strength values used for bigrams with different distances.
202      */
203 //    private final float[] strengths;
204 
205     /***
206      * Strength value used for unigrams (single tokens) if they are
207      * {@linkplain #isPreserving() preserved}.
208      */
209 //    private final float unigramStrength;
210 
211 
212     /***
213      * Creates a new instance from an XML element, fulfilling the
214      * recommandation of the {@link de.fu_berlin.ties.io.XMLStorable} interface.
215      *
216      * @param element the XML element containing the serialized representation
217      * @throws InstantiationException if the given element does not contain
218      * a valid transformer description
219      */
220     public OSBTransformer(final Element element)
221     throws InstantiationException {
222         // delegate to superclass + check and init fields
223         super(element);
224         final int rawLength = Util.asInt(element.attributeValue(ATTRIB_LENGTH));
225         ensureLength(rawLength);
226         length = rawLength;
227         preserving = Util.asBoolean(element.attributeValue(ATTRIB_PRESERVE));
228         sep = element.attributeValue(ATTRIB_SEPARATOR);
229         separators = initSeparators(sep, length);
230 
231         /*        unigramStrength =
232             Util.asFloat(element.attributeValue(ATTRIB_UNIGRAM_STRENGTH));
233         final float[] rawStrengths =
234             CollUtils.asFloatArray(element.attributeValue(ATTRIB_STRENGTHS));
235         strengths = initStrengths(rawStrengths, rawLength); */
236     }
237 
238     /***
239      * Creates a new instance.
240      *
241      * @param precTrans the preceding transformer to use if this transformer
242      * is part of a <em>chain</em>; <code>null</code> otherwise
243      * @param len the maximum number of original features joined; minimum value
244      * is 2
245      * @param sepString the string used to separate original features -- this
246      * string <strong>should never occur</strong> within original features
247      * @param preserve whether to preserve the original features as well or
248      * only to use joint features
249      * @throws IllegalArgumentException if <code>len &lt; 2</code> or if
250      * <code>strengthArray</code> is empty
251      */
252     public OSBTransformer(final FeatureTransformer precTrans, final int len,
253                           final String sepString, final boolean preserve)
254     throws IllegalArgumentException {
255         super(precTrans);
256 
257         // check + store arguments
258         ensureLength(len);
259         length = len;
260         preserving = preserve;
261         sep = sepString;
262         separators = initSeparators(sep, length);
263 /*
264      * @param strengthArray Array of strength values used for bigrams with
265      * different distances
266      * @param singleTokenStrength Strength value used for unigrams (single
267      * tokens); ignored if <code>preserve</code> is <code>false</code>
268 ... final float[] strengthArray, final float singleTokenStrength
269         unigramStrength = singleTokenStrength;
270         strengths = initStrengths(strengthArray, len); */
271     }
272 
273     /***
274      * Creates a new instance.
275      *
276      * @param precTrans the preceding transformer to use if this transformer
277      * is part of a <em>chain</em>; <code>null</code> otherwise
278      * @param config used to configure this instance
279      */
280     public OSBTransformer(final FeatureTransformer precTrans,
281             final TiesConfiguration config) {
282         this(precTrans, config.getInt("transformer.osb.length"),
283             config.getString("transformer.osb.separator",
284                 SBPHTransformer.DEFAULT_SEPARATOR),
285             config.getBoolean("transformer.osb.preserve") /*,
286             CollUtils.asFloatArray(config.getStringArray(
287                                 "transformer.osb.strengths")),
288             config.getFloat("transformer.osb.strength.unigram") */
289             );
290     }
291 
292     /***
293      * {@inheritDoc}
294      */
295     protected FeatureVector doTransform(final FeatureVector orgFeatures) {
296         final FeatureVector result = new DefaultFeatureVector();
297         final Iterator orgIter = orgFeatures.iterator();
298         final CircularFifoBuffer priorFeatureReps =
299             new CircularFifoBuffer(length - 1);
300         Iterator bufferIter;
301         String orgRep;
302         final StringBuilder newRep = new StringBuilder();
303         Feature orgF;
304         int distance;
305         //int commentFeatures = 0;
306 
307         while (orgIter.hasNext()) {
308             orgF = (Feature) orgIter.next();
309             orgRep = orgF.getRepresentation();
310 
311             // ignore comment-only features
312             if (orgRep != null) {
313                 // append original feature if configured
314                 if (preserving) {
315 //                    if (orgF.getStrength() == unigramStrength) {
316                         result.add(orgF);
317 /*                    } else {
318                         // create copy of feature with specified strength
319                         result.add(new DefaultFeature(orgF.getRepresentation(),
320                                         orgF.getComment(), unigramStrength));
321                     } */
322                 }
323 
324                 distance = priorFeatureReps.size();
325                 bufferIter = priorFeatureReps.iterator();
326 
327                 // create feature pairs
328                 while (bufferIter.hasNext()) {
329                     // reset string buffer + append prior feature from buffer
330                     newRep.setLength(0);
331                     newRep.append((String) bufferIter.next());
332 
333                     // decrement distance for next run and to look up the
334                     // separator (stored in array from 0 to N-1)
335                     distance--;
336 
337                     // append suitable separators + current representation
338                     newRep.append(separators[distance]);
339                     newRep.append(orgRep);
340                     result.add(new DefaultFeature(newRep.toString(), null));
341 //                                    strengths[distance]));
342                 }
343 
344                 // store processed feature
345                 priorFeatureReps.add(orgRep);
346             } /* else {
347                 commentFeatures++;
348             } */
349         }
350 
351 /*        // tracing for Bill + counting comment features
352         Util.LOG.debug("Input features: " + orgFeatures.size());
353         Util.LOG.debug("OSB features: " + result.size());
354         Util.LOG.debug("Non-comment input features: "
355                 + (orgFeatures.size() - commentFeatures));*/
356 
357         return result;
358     }
359 
360     /***
361      * Returns the maximum number of original features joined.
362      * @return the value of the attribute
363      */
364     public int getLength() {
365         return length;
366     }
367 
368     /***
369      * Returns the string used to separate original features (by default a space
370      * character). This string <strong>should never occur</strong> within
371      * original features.
372      *
373      * @return the value of the attribute
374      */
375     public String getSeparator() {
376         return sep;
377     }
378 
379     /***
380      * Whether original features are preserved as well in addition to the
381      * generated joint features.
382      *
383      * @return the value of the attribute
384      */
385     public boolean isPreserving() {
386         return preserving;
387     }
388 
389     /***
390      * {@inheritDoc}
391      */
392     public ObjectElement toElement() {
393         // delegate to superclass + add attributes
394         final ObjectElement result = super.toElement();
395         result.addAttribute(ATTRIB_LENGTH, Integer.toString(length));
396         result.addAttribute(ATTRIB_SEPARATOR, sep);
397         result.addAttribute(ATTRIB_PRESERVE, Boolean.toString(preserving));
398 /*        result.addAttribute(ATTRIB_STRENGTHS, CollUtils.flatten(strengths));
399         result.addAttribute(ATTRIB_UNIGRAM_STRENGTH,
400                 Float.toString(unigramStrength)); */
401         return result;
402     }
403 
404     /***
405      * Returns a string representation of this object.
406      *
407      * @return a textual representation
408      */
409     public String toString() {
410         final ToStringBuilder builder = new ToStringBuilder(this)
411             .appendSuper(super.toString())
412             .append("length", length)
413             .append("separator", sep)
414             .append("preserving originals", preserving);
415 /*            .append("strengths", strengths);
416 
417         if (preserving) {
418             builder.append("unigram strength", unigramStrength);
419         } */
420         return builder.toString();
421     }
422 
423 }