View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.classify.feature;
23  
24  import java.util.Iterator;
25  
26  import org.apache.commons.collections.buffer.CircularFifoBuffer;
27  import org.apache.commons.lang.builder.ToStringBuilder;
28  
29  import de.fu_berlin.ties.TiesConfiguration;
30  import de.fu_berlin.ties.util.CollectionUtils;
31  import de.fu_berlin.ties.util.Util;
32  
33  /***
34   * Transforms a feature vector using the <em>orthogonal sparse bigrams
35   * (OSB)</em> technique developed by Fidelis Assis. This transformer
36   * discards all comment-only features. It slides a window of
37   * {@linkplain #getLength() length <em>N</em>} over the remaining original
38   * features. At each window position it generates <em>N</em>-1 joint features
39   * as exemplified below (assuming the pipe character "|" is used as
40   * {@linkplain #getSeparator() separator} and <em>N</em>=5):
41   *
42   * <pre>
43   *    -   -   -  w4 | w5
44   *    -   -   w3  | | w5
45   *    -  w2   |   | | w5
46   *   w1   |   |   | | w5
47   * </pre>
48   *
49   * <p>If {@link #isPreserving()}, the original features are preserved as well;
50   * otherwise they are discarded.
51   *
52   * <p>Instances of this class are thread-safe.
53   *
54   * @author Christian Siefkes
55   * @version $Revision: 1.6 $, $Date: 2004/12/06 17:57:30 $, $Author: siefkes $
56   */
57  public class OSBTransformer extends FeatureTransformer {
58  
59      /***
60       * The maximum number of original features joined.
61       */
62      private final int length;
63  
64      /***
65       * Whether to preserve the original features as well or only to use joint
66       * features.
67       */
68      private final boolean preserving;
69  
70      /***
71       * The string used to separate original features (by default a space
72       * character). This string <strong>should never occur</strong> within
73       * original features.
74       */
75      private final String separator;
76  
77      /***
78       * Array of strength values used for bigrams with different distances.
79       */
80      private final float[] strengths;
81  
82      /***
83       * Strength value used for unigrams (single tokens) if they are
84       * {@linkplain #isPreserving() preserved}.
85       */
86      private final float unigramStrength;
87  
88      /***
89       * Creates a new instance.
90       *
91       * @param precTrans the preceding transformer to use if this transformer
92       * is part of a <em>chain</em>; <code>null</code> otherwise
93       * @param len the maximum number of original features joined; minimum value
94       * is 2
95       * @param sep the string used to separate original features -- this string
96       * <strong>should never occur</strong> within original features
97       * @param preserve whether to preserve the original features as well or
98       * only to use joint features
99       * @param strengthArray Array of strength values used for bigrams with
100      * different distances
101      * @param singleTokenStrength Strength value used for unigrams (single
102      * tokens); ignored if <code>preserve</code> is <code>false</code>
103      * @throws IllegalArgumentException if <code>len &lt; 2</code> or if
104      * <code>strengthArray</code> is empty
105      */
106     public OSBTransformer(final FeatureTransformer precTrans, final int len,
107                           final String sep, final boolean preserve,
108                           final float[] strengthArray,
109                           final float singleTokenStrength)
110     throws IllegalArgumentException {
111         super(precTrans);
112 
113         // check + store arguments
114         if (len < 2) {
115             throw new IllegalArgumentException(
116                 "OSB length must be at least 2:" + len);
117         }
118         length = len;
119         separator = sep;
120         preserving = preserve;
121         unigramStrength = singleTokenStrength;
122 
123         // store strengh array, ensuring there are sufficient values
124         if (strengthArray.length >= len - 1) {
125             strengths = strengthArray;
126         } else if (strengthArray.length > 0) {
127             strengths = new float[len - 1];
128             int i = 0;
129 
130             while (i < strengthArray.length) {
131                 strengths[i] = strengthArray[i];
132                 i++;
133             }
134 
135             while (i < strengths.length) {
136                 // re-use last specified value for rest of array
137                 strengths[i] = strengthArray[strengthArray.length - 1];
138                 i++;
139             }
140 
141             if (strengthArray.length > 1) {
142                 Util.LOG.warn("Strength array has only " + strengthArray.length
143                         + " elements instead of " + len
144                         + " ones -- re-using the last one");
145             } else {
146                 Util.LOG.debug("Using uniform strength " + strengthArray[0]
147                         + " for all bigrams");
148             }
149         } else {
150             throw new IllegalArgumentException(
151                 "Empty strength array");
152         }
153     }
154 
155     /***
156      * Creates a new instance.
157      *
158      * @param precTrans the preceding transformer to use if this transformer
159      * is part of a <em>chain</em>; <code>null</code> otherwise
160      * @param config used to configure this instance
161      */
162     public OSBTransformer(final FeatureTransformer precTrans,
163             final TiesConfiguration config) {
164         this(precTrans, config.getInt("transformer.osb.length"),
165             config.getString("transformer.osb.separator",
166                 SBPHTransformer.DEFAULT_SEPARATOR),
167             config.getBoolean("transformer.osb.preserve"),
168             CollectionUtils.asFloatArray(config.getStringArray(
169                                 "transformer.osb.strengths")),
170             config.getFloat("transformer.osb.strength.unigram"));
171     }
172 
173     /***
174      * {@inheritDoc}
175      */
176     protected FeatureVector doTransform(final FeatureVector orgFeatures) {
177         final FeatureVector result = new DefaultFeatureVector();
178         final Iterator orgIter = orgFeatures.iterator();
179         final CircularFifoBuffer priorFeatureReps =
180             new CircularFifoBuffer(length - 1);
181         Iterator bufferIter;
182         String orgRep;
183         Feature orgF;
184         StringBuffer newRep = new StringBuffer();
185         int distance;
186         int i;
187 
188         while (orgIter.hasNext()) {
189             orgF = (Feature) orgIter.next();
190             orgRep = orgF.getRepresentation();
191 
192             // ignore comment-only features
193             if (orgRep != null) {
194                 // append original feature if configured
195                 if (preserving) {
196                     if (orgF.getStrength() == unigramStrength) {
197                         result.add(orgF);
198                     } else {
199                         // create copy of feature with specified strength
200                         result.add(new DefaultFeature(orgF.getRepresentation(),
201                                         orgF.getComment(), unigramStrength));
202                     }
203                 }
204 
205                 distance = priorFeatureReps.size();
206                 bufferIter = priorFeatureReps.iterator();
207 
208                 // create feature pairs
209                 while (bufferIter.hasNext()) {
210                     // reset string buffer + append prior feature from buffer
211                     newRep.setLength(0);
212                     newRep.append(bufferIter.next());
213 
214                     // append suitable number of separators
215                     for (i = 0; i < distance; i++) {
216                         newRep.append(separator);
217                     }
218 
219                     // decrement distance for next run and to look up the
220                     // current strength (stored in array from 0 to N-1, while
221                     // 1 to N separators are used)
222                     distance--;
223                     // append currently processed feature
224                     newRep.append(orgRep);
225                     // store result with specified strength
226                     result.add(new DefaultFeature(newRep.toString(), null,
227                                     strengths[distance]));
228                 }
229 
230                 // store processed feature
231                 priorFeatureReps.add(orgRep);
232             }
233         }
234 
235 /*        // tracing for Bill
236         Util.LOG.debug("Input features: " + orgFeatures.size());
237         Util.LOG.debug("OSB features: " + result.size()); */
238 
239         return result;
240     }
241 
242     /***
243      * Returns the maximum number of original features joined.
244      * @return the value of the attribute
245      */
246     public int getLength() {
247         return length;
248     }
249 
250     /***
251      * Returns the string used to separate original features (by default a space
252      * character). This string <strong>should never occur</strong> within
253      * original features.
254      *
255      * @return the value of the attribute
256      */
257     public String getSeparator() {
258         return separator;
259     }
260 
261     /***
262      * Whether original features are preserved as well in addition to the
263      * generated joint features.
264      *
265      * @return the value of the attribute
266      */
267     public boolean isPreserving() {
268         return preserving;
269     }
270 
271     /***
272      * Returns a string representation of this object.
273      *
274      * @return a textual representation
275      */
276     public String toString() {
277         final ToStringBuilder builder = new ToStringBuilder(this)
278             .appendSuper(super.toString())
279             .append("length", length)
280             .append("separator", separator)
281             .append("preserving originals", preserving)
282             .append("strengths", strengths);
283 
284         if (preserving) {
285             builder.append("unigram strength", unigramStrength);
286         }
287         return builder.toString();
288     }
289 
290 }