1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.eval;
23
24 import java.util.HashSet;
25 import java.util.Iterator;
26 import java.util.Set;
27
28 import de.fu_berlin.ties.classify.feature.Feature;
29 import de.fu_berlin.ties.classify.feature.FeatureVector;
30 import de.fu_berlin.ties.io.BaseStorable;
31 import de.fu_berlin.ties.io.FieldMap;
32 import de.fu_berlin.ties.util.Util;
33
34 /***
35 * Keeps track of the average number of features and of unique features in
36 * context representations and of the average number of contexts in documents.
37 * Comment-only features are excluded when counting features; comments are
38 * ignored when comparing features.
39 *
40 * <p>Instances of this class are not thread-safe and must be synchronized
41 * externally, if required.
42 *
43 * @author Christian Siefkes
44 * @version $Revision: 1.11 $, $Date: 2006/10/21 16:04:11 $, $Author: siefkes $
45 */
46 public class FeatureCount extends BaseStorable implements FeatureCountView {
47
48 /***
49 * Serialization key for context representations.
50 */
51 public static final String KEY_CONTEXTS = "Contexts";
52
53 /***
54 * Serialization key for documents.
55 */
56 public static final String KEY_DOCUMENTS = "Documents";
57
58 /***
59 * Serialization key for features.
60 */
61 public static final String KEY_FEATURES = "Features";
62
63 /***
64 * Serialization key for characters.
65 */
66 public static final String KEY_CHARS = "Characters";
67
68 /***
69 * Serialization key for unique features.
70 */
71 public static final String KEY_UNIQUE_FEATURES = "Unique Features";
72
73 /***
74 * Serialization key for the average number of context representations in a
75 * document.
76 */
77 public static final String KEY_AVERAGE_CONTEXTS = "Average Contexts";
78
79 /***
80 * Serialization key for the average number of features in a context
81 * representation.
82 */
83 public static final String KEY_AVERAGE_FEATURES = "Average Features";
84
85 /***
86 * Serialization key for the average number of unique features in a context
87 * representation.
88 */
89 public static final String KEY_AVERAGE_UNIQUE_FEATURES =
90 "Average Unique Features";
91
92 /***
93 * Serialization key for the average number of characters in a context
94 * representation.
95 */
96 public static final String KEY_CHARS_PER_CONTEXT = "Characters per Context";
97
98 /***
99 * Serialization key for the average number of characters in a feature.
100 */
101 public static final String KEY_CHARS_PER_FEATURE = "Characters per Feature";
102
103 /***
104 * The number of characters counted so far.
105 */
106 private long characters = 0;
107
108 /***
109 * The number of context representations evaluated so far.
110 */
111 private long contexts = 0;
112
113 /***
114 * The number of documents counted so far.
115 */
116 private long documents = 0;
117
118 /***
119 * The number of non-comment features encountered so far.
120 */
121 private long featureSum = 0;
122
123 /***
124 * The number of non-comment non-duplicate features encountered so far.
125 * Duplicates within the same context representation are ignored; but
126 * equal features in different representations are not recognized as
127 * duplicate.
128 */
129 private long uniqueFeatureSum = 0;
130
131 /***
132 * Creates a new instance.
133 */
134 public FeatureCount() {
135 super();
136 }
137
138 /***
139 * Creates a new instance from a field map, fulfilling the
140 * {@link de.fu_berlin.ties.io.Storable} contract.
141 *
142 * @param fieldMap map containing the serialized fields
143 * @throws IllegalArgumentException if at least one of the parameters is
144 * negative or missing
145 */
146 public FeatureCount(final FieldMap fieldMap)
147 throws IllegalArgumentException {
148 this();
149
150
151 documents = Util.asLong(fieldMap.get(KEY_DOCUMENTS));
152 contexts = Util.asLong(fieldMap.get(KEY_CONTEXTS));
153 featureSum = Util.asLong(fieldMap.get(KEY_FEATURES));
154 uniqueFeatureSum = Util.asLong(fieldMap.get(KEY_UNIQUE_FEATURES));
155 characters = Util.asLong(fieldMap.get(KEY_CHARS));
156
157
158 Util.ensureNonNegative(contexts, KEY_DOCUMENTS);
159 Util.ensureNonNegative(contexts, KEY_CONTEXTS);
160 Util.ensureNonNegative(featureSum, KEY_FEATURES);
161 Util.ensureNonNegative(uniqueFeatureSum, KEY_UNIQUE_FEATURES);
162 Util.ensureNonNegative(characters, KEY_CHARS);
163 }
164
165 /***
166 * Counts a document (increases the {@linkplain #getDocuments() number of
167 * documents} by one.
168 */
169 public void countDocument() {
170 documents++;
171 }
172
173 /***
174 * Calculates and returns the average number of context representations in a
175 * document.
176 *
177 * @return the average number of context representations
178 */
179 public double getAverageContexts() {
180 return (double) contexts / documents;
181 }
182
183 /***
184 * Calculates and returns the average number of non-comment features in a
185 * context representation.
186 *
187 * @return the average number of features
188 */
189 public double getAverageFeatures() {
190 return (double) featureSum / contexts;
191 }
192
193 /***
194 * Calculates and returns the average number of unique non-comment features
195 * in a context representation.
196 *
197 * @return the average number of features
198 */
199 public double getAverageUniqueFeatures() {
200 return (double) uniqueFeatureSum / contexts;
201 }
202
203 /***
204 * Returns the number of characters counted so far. Only characters
205 * <em>within</em> features are counted; separators between different
206 * features are ignored.
207 *
208 * @return the value of the attribute
209 */
210 public long getCharacters() {
211 return characters;
212 }
213
214 /***
215 * Calculates and returns the average number of characters in a context
216 * representation. Only characters <em>within</em> features are considered;
217 * separators between different features are ignored.
218 *
219 * @return the average number of characters in a context
220 */
221 public double getCharactersPerContext() {
222 return (double) characters / contexts;
223 }
224
225 /***
226 * Calculates and returns the average number of characters in a feature.
227 *
228 * @return the average number of characters in a feature
229 */
230 public double getCharactersPerFeature() {
231 return (double) characters / featureSum;
232 }
233
234 /***
235 * Returns the number of representations evaluated so far.
236 * @return the value of the attribute
237 */
238 public long getContexts() {
239 return contexts;
240 }
241
242 /***
243 * Returns the number of documents counted so far.
244 * @return the value of the attribute
245 */
246 public long getDocuments() {
247 return documents;
248 }
249
250
251 /***
252 * Returns the number of non-comment features encountered so far.
253 * @return the value of the attribute
254 */
255 public long getFeatureSum() {
256 return featureSum;
257 }
258
259 /***
260 * Returns the number of non-comment non-duplicate features encountered so
261 * far. Duplicates within the same context representation are ignored; but
262 * equal features in different representations are not recognized as
263 * duplicate.
264 *
265 * @return the value of the attribute
266 */
267 public long getUniqueFeatureSum() {
268 return uniqueFeatureSum;
269 }
270
271 /***
272 * Stores all relevant fields of this object in a field map for
273 * serialization. An equivalent object can be created by calling
274 * {@link de.fu_berlin.ties.io.FieldMap#createObject(Class)} on the created
275 * field map. The calculated averages are also stored (they are ignored when
276 * {@linkplain #FeatureCount(FieldMap) deserializing} a stored instance).
277 *
278 * @return the created field map
279 */
280 public FieldMap storeFields() {
281 final FieldMap result = new FieldMap();
282 result.put(KEY_AVERAGE_CONTEXTS, new Double(getAverageContexts()));
283 result.put(KEY_AVERAGE_FEATURES, new Double(getAverageFeatures()));
284 result.put(KEY_AVERAGE_UNIQUE_FEATURES,
285 new Double(getAverageUniqueFeatures()));
286 result.put(KEY_CHARS_PER_CONTEXT,
287 new Double(getCharactersPerContext()));
288 result.put(KEY_CHARS_PER_FEATURE,
289 new Double(getCharactersPerFeature()));
290 result.put(KEY_DOCUMENTS, new Long(documents));
291 result.put(KEY_CONTEXTS, new Long(contexts));
292 result.put(KEY_FEATURES, new Long(featureSum));
293 result.put(KEY_UNIQUE_FEATURES, new Long(uniqueFeatureSum));
294 result.put(KEY_CHARS, new Long(characters));
295 return result;
296 }
297
298 /***
299 * Evaluates a feature vector and updates the statistics accordingly.
300 * Comment-only features are excluded when counting features; comments are
301 * ignored when comparing features.
302 *
303 * @param features a feature vector representing a context
304 * @throws ClassCastException if the list contains objects that aren't
305 * {@link Feature}s
306 */
307 public void update(final FeatureVector features) throws ClassCastException {
308 final Iterator featureIter = features.iterator();
309 Feature currentFeature;
310 String currentRep;
311 int newFeatures = 0;
312 Set<String> uniqueFeatureReps =
313 new HashSet<String>(features.size() / 2);
314
315 while (featureIter.hasNext()) {
316 currentFeature = (Feature) featureIter.next();
317 currentRep = currentFeature.getRepresentation();
318
319
320 if (currentRep != null) {
321 newFeatures++;
322
323
324 uniqueFeatureReps.add(currentRep);
325
326
327 characters += currentRep.length();
328 }
329 }
330
331
332 contexts++;
333 featureSum += newFeatures;
334 uniqueFeatureSum += uniqueFeatureReps.size();
335 }
336
337 }