View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.classify.feature;
23  
24  import java.util.Iterator;
25  
26  import org.apache.commons.collections.buffer.CircularFifoBuffer;
27  import org.apache.commons.lang.builder.ToStringBuilder;
28  import org.dom4j.Element;
29  
30  import de.fu_berlin.ties.TiesConfiguration;
31  import de.fu_berlin.ties.io.ObjectElement;
32  import de.fu_berlin.ties.util.Util;
33  
34  /***
35   * Transforms a feature vector using a simple implementation of the <em>sparse
36   * binary polynomial hashing (SBPH)</em> technique introduced by
37   * <a href="http://crm114.sourceforge.net/">CRM114</a>. This transformer
38   * discard all comment-only features (indeed all comments). It slides of window
39   * of {@linkplain #getLength() length <em>N</em>} over the remaining original
40   * features. For each window position, it counts in binary from 1 to
41   * 2<sup><em>N</em></sup>. For each odd number, a joint feature is generated
42   * where original features at "1" positions are visible and original features at
43   * "0" positions are hidden. Separators prior to the first feature are
44   * discarded, but all inner separators are kept. E.g. if <em>N</em>=3 and a
45   * pipe character "|" is used as {@linkplain #getSeparator() separator}, from
46   * the original features "a", "b", "c", four joint features will be generated
47   * at the last position: "c" (binary 1=001), "b|c" (binary 3=011), "a||c"
48   * (binary 5=101), "a|b|c" (binary 7=111). Thus 2<sup><em>N</em>-1</sup>
49   * joint features are generated for each original (non-comment) feature (except
50   * for the very first features).
51   *
52   * <p>Instances of this class are thread-safe.
53   *
54   * @author Christian Siefkes
55   * @version $Revision: 1.10 $, $Date: 2006/10/21 16:03:57 $, $Author: siefkes $
56   */
57  public class SBPHTransformer extends FeatureTransformer {
58  
59      /***
60       * The {@linkplain #getSeparator() separator} used by default (a space
61       * character).
62       */
63      public static final String DEFAULT_SEPARATOR = " ";
64  
65      /***
66       * Helper method for initializing the array of pre-calculated bit values.
67       *
68       * @param length the maximum number of original features joined
69       * @return the array of pre-calculated bit values to use
70       */
71      private static int[] initBitValues(final int length) {
72          // pre-calculate values for bit operations (1, 2, 4, 8...)
73          final int[] result = new int[length];
74          int bitVal = 1;
75  
76          for (int i = 0; i < length; i++) {
77              result[i] = bitVal;
78              bitVal *= 2;
79          }
80          return result;
81      }
82  
83  
84      /***
85       * The maximum number of original features joined.
86       */
87      private final int length;
88  
89      /***
90       * The string used to separate original features (by default a space
91       * character). This string <strong>should never occur</strong> within
92       * original features.
93       */
94      private final String separator;
95  
96      /***
97       * Stores bit values up to {@linkplain #getLength() length <em>N</em>},
98       * for bit operations: 1, 2, 4, 8, 16 etc. The <em>n</em>-th element in this
99       * vector contains the value 2<sup>n+1</sup>.
100      */
101     private final int[] bitValues;
102 
103     /***
104      * The maximum number of new features to generate for each position:
105      * 2<sup>(<em>{@linkplain #getLength() N} - 1</em>)</sup>.
106      */
107     private final int maxNumber;
108 
109     /***
110      * Creates a new instance from an XML element, fulfilling the
111      * recommandation of the {@link de.fu_berlin.ties.io.XMLStorable} interface.
112      *
113      * @param element the XML element containing the serialized representation
114      * @throws InstantiationException if the given element does not contain
115      * a valid transformer description
116      */
117     public SBPHTransformer(final Element element)
118     throws InstantiationException {
119         // delegate to superclass + check and init fields
120         super(element);
121         length =
122             Util.asInt(element.attributeValue(OSBTransformer.ATTRIB_LENGTH));
123         separator = element.attributeValue(OSBTransformer.ATTRIB_SEPARATOR);
124         bitValues = initBitValues(length);
125         maxNumber = numFeatures(length);
126     }
127 
128     /***
129      * Creates a new instance.
130      *
131      * @param precTrans the preceding transformer to use if this transformer
132      * is part of a <em>chain</em>; <code>null</code> otherwise
133      * @param len the maximum number of original features joined
134      * @param sep the string used to separate original features -- this string
135      * <strong>should never occur</strong> within original features
136      */
137     public SBPHTransformer(final FeatureTransformer precTrans, final int len,
138             final String sep) {
139         super(precTrans);
140         length = len;
141         separator = sep;
142         bitValues = initBitValues(length);
143         maxNumber = numFeatures(length);
144     }
145 
146     /***
147      * Creates a new instance.
148      *
149      * @param precTrans the preceding transformer to use if this transformer
150      * is part of a <em>chain</em>; <code>null</code> otherwise
151      * @param config used to configure this instance
152      */
153     public SBPHTransformer(final FeatureTransformer precTrans,
154             final TiesConfiguration config) {
155         this(precTrans, config.getInt("transformer.sbph.length"),
156             config.getString("transformer.sbph.separator", DEFAULT_SEPARATOR));
157     }
158 
159     /***
160      * {@inheritDoc}
161      */
162     protected FeatureVector doTransform(final FeatureVector orgFeatures) {
163         final FeatureVector result = new DefaultFeatureVector();
164         final Iterator orgIter = orgFeatures.iterator();
165         final CircularFifoBuffer lastNFeatureReps =
166             new CircularFifoBuffer(length);
167         Iterator bufferIter;
168         int bufferSize;
169         String orgRep;
170         Feature orgF;
171         StringBuilder[] newReps;
172         int numNew;
173         int currentBit;
174         int i;
175 
176         while (orgIter.hasNext()) {
177             orgF = (Feature) orgIter.next();
178             orgRep = orgF.getRepresentation();
179 
180             // ignore comment-only features, keep only representations
181             if (orgRep != null) {
182                 lastNFeatureReps.add(orgRep);
183                 bufferSize = lastNFeatureReps.size();
184 
185                 // how many features to generate?
186                 if (bufferSize == length) {
187                     // use pre-calculated value
188                     numNew = maxNumber;
189                 } else {
190                     numNew = numFeatures(bufferSize);
191                 }
192 
193                 // count bits from bufferSize down to 1
194                 currentBit = bitValues[bufferSize - 1];
195                 newReps = new StringBuilder[numNew];
196                 bufferIter = lastNFeatureReps.iterator();
197                 boolean bufferHasNext = bufferIter.hasNext();
198 
199                 // create polynomial feature representations, all in parallel
200                 while (bufferHasNext) {
201                     orgRep = (String) bufferIter.next();
202 
203                     for (i = 0; i < newReps.length; i++) {
204                     // i-th buffer corresponds to binary number 2i+1
205                         if ((((2 * i) + 1) & currentBit) > 0) {
206                             // bit is set: add feature representation
207                             if (newReps[i] == null) {
208                                 newReps[i] = new StringBuilder(orgRep);
209                             } else {
210                                 newReps[i].append(orgRep);
211                             }
212                         }
213 
214                         bufferHasNext = bufferIter.hasNext();
215 
216                         // add separator, except prior to first included feature
217                         // and after last feature
218                         if ((newReps[i] != null) && bufferHasNext) {
219                             newReps[i].append(separator);
220                         }
221                     }
222 
223                     currentBit /= 2;
224                 }
225 
226                 // append created features to result vector
227                 for (i = 0; i < newReps.length; i++) {
228                     result.add(new DefaultFeature(newReps[i].toString()));
229                 }
230             }
231         }
232         return result;
233     }
234 
235     /***
236      * Returns the maximum number of original features joined.
237      * @return the value of the attribute
238      */
239     public int getLength() {
240         return length;
241     }
242 
243     /***
244      * Returns the string used to separate original features (by default a space
245      * character). This string <strong>should never occur</strong> within
246      * original features.
247      *
248      * @return the value of the attribute
249      */
250     public String getSeparator() {
251         return separator;
252     }
253 
254     /***
255      * Returns the number of jointed features to generate for a given number of
256      * features to join.
257      *
258      * @param n the number of features to join
259      * @return 2<sup>(<em>n</em>-1)</sup>
260      */
261     private int numFeatures(final int n) {
262         return (int) Math.pow(2, n - 1);
263     }
264 
265     /***
266      * {@inheritDoc}
267      */
268     public ObjectElement toElement() {
269         // delegate to superclass + add attributes
270         final ObjectElement result = super.toElement();
271         result.addAttribute(OSBTransformer.ATTRIB_LENGTH,
272                 Integer.toString(length));
273         result.addAttribute(OSBTransformer.ATTRIB_SEPARATOR, separator);
274         return result;
275     }
276 
277     /***
278      * Returns a string representation of this object.
279      *
280      * @return a textual representation
281      */
282     public String toString() {
283         return new ToStringBuilder(this)
284             .appendSuper(super.toString())
285             .append("length", length)
286             .append("separator", separator)
287             .toString();
288     }
289 
290 }