View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.eval;
23  
24  import java.util.HashSet;
25  import java.util.Iterator;
26  import java.util.Set;
27  
28  import de.fu_berlin.ties.classify.feature.Feature;
29  import de.fu_berlin.ties.classify.feature.FeatureVector;
30  import de.fu_berlin.ties.io.BaseStorable;
31  import de.fu_berlin.ties.io.FieldMap;
32  import de.fu_berlin.ties.util.Util;
33  
34  /***
35   * Keeps track of the average number of features and of unique features in
36   * context representations and of the average number of contexts in documents.
37   * Comment-only features are excluded when counting features; comments are
38   * ignored when comparing features.
39   *
40   * <p>Instances of this class are not thread-safe and must be synchronized
41   * externally, if required.
42   *
43   * @author Christian Siefkes
44   * @version $Revision: 1.11 $, $Date: 2006/10/21 16:04:11 $, $Author: siefkes $
45   */
46  public class FeatureCount extends BaseStorable implements FeatureCountView {
47  
48      /***
49       * Serialization key for context representations.
50       */
51      public static final String KEY_CONTEXTS = "Contexts";
52  
53      /***
54       * Serialization key for documents.
55       */
56      public static final String KEY_DOCUMENTS = "Documents";
57  
58      /***
59       * Serialization key for features.
60       */
61      public static final String KEY_FEATURES = "Features";
62  
63      /***
64       * Serialization key for characters.
65       */
66      public static final String KEY_CHARS = "Characters";
67  
68      /***
69       * Serialization key for unique features.
70       */
71      public static final String KEY_UNIQUE_FEATURES = "Unique Features";
72  
73      /***
74       * Serialization key for the average number of context representations in a
75       * document.
76       */
77      public static final String KEY_AVERAGE_CONTEXTS = "Average Contexts";
78  
79      /***
80       * Serialization key for the average number of features in a context
81       * representation.
82       */
83      public static final String KEY_AVERAGE_FEATURES = "Average Features";
84  
85      /***
86       * Serialization key for the average number of unique features in a context
87       * representation.
88       */
89      public static final String KEY_AVERAGE_UNIQUE_FEATURES =
90          "Average Unique Features";
91  
92      /***
93       * Serialization key for the average number of characters in a context
94       * representation.
95       */
96      public static final String KEY_CHARS_PER_CONTEXT = "Characters per Context";
97  
98      /***
99       * Serialization key for the average number of characters in a feature.
100      */
101     public static final String KEY_CHARS_PER_FEATURE = "Characters per Feature";
102 
103     /***
104      * The number of characters counted so far.
105      */
106     private long characters = 0;
107 
108     /***
109      * The number of context representations evaluated so far.
110      */
111     private long contexts = 0;
112 
113     /***
114      * The number of documents counted so far.
115      */
116     private long documents = 0;
117 
118     /***
119      * The number of non-comment features encountered so far.
120      */
121     private long featureSum = 0;
122 
123     /***
124      * The number of non-comment non-duplicate features encountered so far.
125      * Duplicates within the same context representation are ignored; but
126      * equal features in different representations are not recognized as
127      * duplicate.
128      */
129     private long uniqueFeatureSum = 0;
130 
131     /***
132      * Creates a new instance.
133      */
134     public FeatureCount() {
135         super();
136     }
137 
138     /***
139      * Creates a new instance from a field map, fulfilling the
140      * {@link de.fu_berlin.ties.io.Storable} contract.
141      *
142      * @param fieldMap map containing the serialized fields
143      * @throws IllegalArgumentException if at least one of the parameters is
144      * negative or missing
145      */
146     public FeatureCount(final FieldMap fieldMap)
147             throws IllegalArgumentException {
148         this();
149 
150         // read values
151         documents = Util.asLong(fieldMap.get(KEY_DOCUMENTS));
152         contexts = Util.asLong(fieldMap.get(KEY_CONTEXTS));
153         featureSum = Util.asLong(fieldMap.get(KEY_FEATURES));
154         uniqueFeatureSum = Util.asLong(fieldMap.get(KEY_UNIQUE_FEATURES));
155         characters = Util.asLong(fieldMap.get(KEY_CHARS));
156 
157         // ensure that no value is negative or missing
158         Util.ensureNonNegative(contexts, KEY_DOCUMENTS);
159         Util.ensureNonNegative(contexts, KEY_CONTEXTS);
160         Util.ensureNonNegative(featureSum, KEY_FEATURES);
161         Util.ensureNonNegative(uniqueFeatureSum, KEY_UNIQUE_FEATURES);
162         Util.ensureNonNegative(characters, KEY_CHARS);
163     }
164 
165     /***
166      * Counts a document (increases the {@linkplain #getDocuments() number of
167      * documents} by one.
168      */
169     public void countDocument() {
170         documents++;
171     }
172 
173     /***
174      * Calculates and returns the average number of context representations in a
175      * document.
176      *
177      * @return the average number of context representations
178      */
179     public double getAverageContexts() {
180         return (double) contexts / documents;
181     }
182 
183     /***
184      * Calculates and returns the average number of non-comment features in a
185      * context representation.
186      *
187      * @return the average number of features
188      */
189     public double getAverageFeatures() {
190         return (double) featureSum / contexts;
191     }
192 
193     /***
194      * Calculates and returns the average number of unique non-comment features
195      * in a context representation.
196      *
197      * @return the average number of features
198      */
199     public double getAverageUniqueFeatures() {
200         return (double) uniqueFeatureSum / contexts;
201     }
202 
203     /***
204      * Returns the number of characters counted so far. Only characters
205      * <em>within</em> features are counted; separators between different
206      * features are ignored.
207      *
208      * @return the value of the attribute
209      */
210     public long getCharacters() {
211         return characters;
212     }
213 
214     /***
215      * Calculates and returns the average number of characters in a context
216      * representation. Only characters <em>within</em> features are considered;
217      * separators between different features are ignored.
218      *
219      * @return the average number of characters in a context
220      */
221     public double getCharactersPerContext() {
222         return (double) characters / contexts;
223     }
224 
225     /***
226      * Calculates and returns the average number of characters in a feature.
227      *
228      * @return the average number of characters in a feature
229      */
230     public double getCharactersPerFeature() {
231         return (double) characters / featureSum;
232     }
233 
234     /***
235      * Returns the number of representations evaluated so far.
236      * @return the value of the attribute
237      */
238     public long getContexts() {
239         return contexts;
240     }
241 
242     /***
243      * Returns the number of documents counted so far.
244      * @return the value of the attribute
245      */
246     public long getDocuments() {
247         return documents;
248     }
249 
250 
251     /***
252      * Returns the number of non-comment features encountered so far.
253      * @return the value of the attribute
254      */
255     public long getFeatureSum() {
256         return featureSum;
257     }
258 
259     /***
260      * Returns the number of non-comment non-duplicate features encountered so
261      * far. Duplicates within the same context representation are ignored; but
262      * equal features in different representations are not recognized as
263      * duplicate.
264      *
265      * @return the value of the attribute
266      */
267     public long getUniqueFeatureSum() {
268         return uniqueFeatureSum;
269     }
270 
271     /***
272      * Stores all relevant fields of this object in a field map for
273      * serialization. An equivalent object can be created by calling
274      * {@link de.fu_berlin.ties.io.FieldMap#createObject(Class)} on the created
275      * field map. The calculated averages are also stored (they are ignored when
276      * {@linkplain #FeatureCount(FieldMap) deserializing} a stored instance).
277      *
278      * @return the created field map
279      */
280     public FieldMap storeFields() {
281         final FieldMap result = new FieldMap();
282         result.put(KEY_AVERAGE_CONTEXTS, new Double(getAverageContexts()));
283         result.put(KEY_AVERAGE_FEATURES, new Double(getAverageFeatures()));
284         result.put(KEY_AVERAGE_UNIQUE_FEATURES,
285             new Double(getAverageUniqueFeatures()));
286         result.put(KEY_CHARS_PER_CONTEXT,
287             new Double(getCharactersPerContext()));
288         result.put(KEY_CHARS_PER_FEATURE,
289             new Double(getCharactersPerFeature()));
290         result.put(KEY_DOCUMENTS, new Long(documents));
291         result.put(KEY_CONTEXTS, new Long(contexts));
292         result.put(KEY_FEATURES, new Long(featureSum));
293         result.put(KEY_UNIQUE_FEATURES, new Long(uniqueFeatureSum));
294         result.put(KEY_CHARS, new Long(characters));
295         return result;
296     }
297 
298     /***
299      * Evaluates a feature vector and updates the statistics accordingly.
300      * Comment-only features are excluded when counting features; comments are
301      * ignored when comparing features.
302      *
303      * @param features a feature vector representing a context
304      * @throws ClassCastException if the list contains objects that aren't
305      * {@link Feature}s
306      */
307     public void update(final FeatureVector features) throws ClassCastException {
308         final Iterator featureIter = features.iterator();
309         Feature currentFeature;
310         String currentRep;
311         int newFeatures = 0;
312         Set<String> uniqueFeatureReps =
313             new HashSet<String>(features.size() / 2);
314 
315         while (featureIter.hasNext()) {
316             currentFeature = (Feature) featureIter.next();
317             currentRep = currentFeature.getRepresentation();
318 
319             // ignore comment-only features
320             if (currentRep != null) {
321                 newFeatures++;
322 
323                 // ignore comments by storing only the representation
324                 uniqueFeatureReps.add(currentRep);
325 
326                 // increase character count
327                 characters += currentRep.length();
328             }
329         }
330 
331         // update the statistics
332         contexts++;
333         featureSum += newFeatures;
334         uniqueFeatureSum += uniqueFeatureReps.size();
335     }
336 
337 }