View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.classify.feature;
23  
24  import java.util.Collection;
25  import java.util.Iterator;
26  
27  import org.apache.commons.lang.builder.ToStringBuilder;
28  
29  import de.fu_berlin.ties.text.TextTokenizer;
30  import de.fu_berlin.ties.text.TextUtils;
31  
32  /***
33   * A feature vector contains the features representing an instance. Subclasses
34   * must implement the {@link #store()} method to provide a collection for
35   * storing the features.
36   *
37   * <p>Instances of this class are thread-safe if and only if the provided
38   * collection is thread-safe (normally this won't be the case).
39   *
40   * @author Christian Siefkes
41   * @version $Revision: 1.6 $, $Date: 2004/09/06 17:22:10 $, $Author: siefkes $
42   */
43  public abstract class FeatureVector {
44  
45      /***
46       * The sum of the {@linkplain #strength(Feature) strength values}
47       * of all features currently contained in this vector.
48       */
49      private double summedStrength = 0.0;
50  
51      /***
52       * Creates a new instance.
53       */
54      public FeatureVector() {
55          super();
56      }
57  
58      /***
59       * Adds a feature to this vector.
60       *
61       * @param feature the feature to add
62       */
63      public void add(final Feature feature) {
64          preAddHook(feature);
65          store().add(feature);
66  
67          // add feature strength to summed strength
68          summedStrength += strength(feature);
69      }
70  
71      /***
72       * Adds all of the features in the specified Collection to this vector,
73       * in the order they are returned by the specified Collection's Iterator.
74       *
75       * @param coll a collection of features to add
76       * @throws ClassCastException if the collection contains elements that are
77       * not {@link Feature}s
78       */
79      public void addAll(final Collection coll) throws ClassCastException {
80          Feature feature;
81          final Iterator iter = coll.iterator();
82  
83          while (iter.hasNext()) {
84              feature = (Feature) iter.next();
85              add(feature);
86          }
87      }
88  
89      /***
90       * Adds all of the features in the specified feature vector to this vector,
91       * in the order they are returned by the specified feature vector.
92       *
93       * @param fv the vector of features to add
94       */
95      public void addAll(final FeatureVector fv) {
96          addAll(fv.store());
97      }
98  
99      /***
100      * Tokenizes a text, creating and adding a feature for each token.
101      *
102      * @param text the text to tokenize
103      * @param tokenizer the tokenizer to use
104      */
105     public void addAllTokens(final CharSequence text,
106                              final TextTokenizer tokenizer) {
107         tokenizer.reset(text);
108         String token;
109 
110         while ((token = tokenizer.nextToken()) != null) {
111             add(new DefaultFeature(token));
112         }
113     }
114 
115     /***
116      * Returns a sum of the {@linkplain #strength(Feature) strength values}
117      * of all features contained in this vector.
118      *
119      * @return the summed strength of all features
120      */
121     public double getSummedStrength() {
122         return summedStrength;
123     }
124 
125     /***
126      * Flattens the contained features into a single character sequence, without
127      * including comments. Features are separated by newlines.
128      *
129      * @return the resulting character sequence, created by printing calling
130      * {@link Feature#getRepresentation()} on each feature in the vector; each
131      * feature representation is followed by a newline
132      */
133     public CharSequence flatten() {
134         return flatten(false);
135     }
136 
137     /***
138      * Flattens a list of features into a single character sequence. Features
139      * are separated by newlines.
140      *
141      * @param inclComments whether or not to include comments
142      * @return the resulting character sequence, created by printing calling
143      * {@link Feature#getRepresentation()} on each feature in the vector; each
144      * feature representation is followed by a newline
145      */
146     public CharSequence flatten(final boolean inclComments) {
147         final StringBuffer result = new StringBuffer();
148         final Iterator iter = iterator();
149         Feature currentFeature;
150         String currentRep;
151 
152         while (iter.hasNext()) {
153             currentFeature = (Feature) iter.next();
154             if (inclComments) {
155                 currentRep = currentFeature.getFullRepresentation();
156             } else {
157                 currentRep = currentFeature.getRepresentation();
158             }
159 
160             // representation w/o comment might be null
161             if (currentRep != null) {
162                 result.append(currentRep);
163                 result.append(TextUtils.LINE_SEPARATOR);
164             }
165         }
166         return result;
167     }
168 
169     /***
170      * Returns an iterator over the {@link Feature}s stored in this vector.
171      *
172      * @return an iterator over the stored features
173      */
174     public Iterator iterator() {
175         return store().iterator();
176     }
177 
178     /***
179      * Empty method that can by implemented by child classes, for example if
180      * they calculate strength values in some special way. This method is
181      * called at the start of each {@link #add(Feature)} operation.
182      *
183      * @param feature the feature to add
184      */
185     protected void preAddHook(final Feature feature) {
186     }
187 
188     /***
189      * Modifies the summed strength of all features contained in this instance.
190      * This method exists for the convenience of child classes that calculate
191      * strength values in some special way.
192      *
193      * @param newSum the new value of the attribute
194      */
195     protected void setSummedStrength(final double newSum) {
196         summedStrength = newSum;
197     }
198 
199     /***
200      * Returns the number of features stored in this vector.
201      *
202      * @return the number of features
203      */
204     public int size() {
205         return store().size();
206     }
207 
208     /***
209      * Returns a strength value for a feature contained in this vector.
210      * This implementation simply delegates to {@link Feature#getStrength()}.
211      *
212      * @param feature the feature to consider
213      * @return a strength value for the specified feature
214      */
215     public double strength(final Feature feature) {
216         return feature.getStrength();
217     }
218 
219     /***
220      * Returns the collection used for storing the features. The properties
221      * of a feature vector depend on the kind of collection that is used.
222      *
223      * @return the collection used for storing the features.
224      */
225     protected abstract Collection<Feature> store();
226 
227     /***
228      * Returns a string representation of this object.
229      *
230      * @return a textual representation
231      */
232     public String toString() {
233         return new ToStringBuilder(this)
234             .append("feature store", store())
235             .toString();
236     }
237 
238 }