1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.util.ArrayList;
25 import java.util.Collection;
26 import java.util.Collections;
27 import java.util.HashMap;
28 import java.util.Iterator;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.SortedSet;
32 import java.util.TreeSet;
33
34 import org.apache.commons.collections.IteratorUtils;
35 import org.apache.commons.lang.builder.ToStringBuilder;
36
37 import de.fu_berlin.ties.classify.PredictionComparator;
38 import de.fu_berlin.ties.io.FieldContainer;
39 import de.fu_berlin.ties.io.FieldMap;
40 import de.fu_berlin.ties.io.RestorableContainer;
41 import de.fu_berlin.ties.util.CollectionUtils;
42 import de.fu_berlin.ties.util.MultiValueMap;
43 import de.fu_berlin.ties.util.Util;
44
45 /***
46 * A container of {@link de.fu_berlin.ties.extract.Extraction}s of different
47 * classes. Instances of this class are not thread-safe.
48 *
49 * <p>Extraction containers can be serialized
50 * ({@link #storeEntries(FieldContainer)}), but not deserialized.
51 *
52 * <p>Instances of this class are not thread-safe and have to be synchronized
53 * externally, if required.
54 *
55 * @author Christian Siefkes
56 * @version $Revision: 1.17 $, $Date: 2004/12/06 17:58:36 $, $Author: siefkes $
57 */
58 public class ExtractionContainer implements RestorableContainer {
59
60 /***
61 * A list of all extractions stored in this container.
62 */
63 private final List<Extraction> allExtractions = new ArrayList<Extraction>();
64
65 /***
66 * The target structure specifying the classes to recognize.
67 */
68 private final TargetStructure targetStructure;
69
70 /***
71 * A multi-map storing lists of extractions for each extraction class name.
72 */
73 private final MultiValueMap<String, Extraction> typedExtractions =
74 new MultiValueMap<String, Extraction>();
75
76 /***
77 * Creates a new empty instance.
78 *
79 * @param targetStruct the target structure specifying the classes to
80 * recognize
81 */
82 public ExtractionContainer(final TargetStructure targetStruct) {
83 super();
84 targetStructure = targetStruct;
85 }
86
87 /***
88 * Creates a new instance from a field container, delegating to
89 * {@link #restoreEntries(FieldContainer)}.
90 *
91 * @param targetStruct the target structure specifying the classes to
92 * recognize
93 * @param fContainer the field container to read
94 * @throws IllegalArgumentException if <code>fContainer</code> contains
95 * extractions of an unsupported
96 * (@link de.fu_berlin.ties.classify.Prediction#getType())
97 */
98 public ExtractionContainer(final TargetStructure targetStruct,
99 final FieldContainer fContainer) throws IllegalArgumentException {
100 this(targetStruct);
101 restoreEntries(fContainer);
102 }
103
104 /***
105 * Adds an extraction to this container.
106 *
107 * @param extraction the extraction to add
108 * @throws IllegalArgumentException if the type of the extraction
109 * (@link de.fu_berlin.ties.classify.Prediction#getType()) is not
110 * in the set of class names defined by the target structure
111 */
112 public void add(final Extraction extraction)
113 throws IllegalArgumentException {
114
115 if (!targetStructure.getClassNames().contains(extraction.getType())) {
116 throw new IllegalArgumentException(
117 "Container doesn't allow extractions of type "
118 + extraction.getType());
119 }
120
121
122 allExtractions.add(extraction);
123 typedExtractions.put(extraction.getType(), extraction);
124 }
125
126 /***
127 * Returns an iterator over all extractions in insertion order. The
128 * iterator is immutable and cannot be used to modify the underlying
129 * collection.
130 *
131 * @return an iterator over all extractions
132 */
133 public Iterator iterator() {
134 return IteratorUtils.unmodifiableIterator(allExtractions.iterator());
135 }
136
137 /***
138 * Returns an iterator over the extractions of a specified class, in
139 * insertion order. The iterator is immutable and cannot be used to modify
140 * the underlying collection.
141 *
142 * @param className the name of the class of extractions to iterate
143 * @return an iterator over the extractions of the given class (if any,
144 * otherwise an empty iterator will be returned)
145 */
146 public Iterator<Extraction> iterator(final String className) {
147 Collection<Extraction> typedList = typedExtractions.get(className);
148 if (typedList == null) {
149
150 typedList = new ArrayList<Extraction>(0);
151 }
152
153
154 return Collections.unmodifiableCollection(typedList).iterator();
155 }
156
157 /***
158 * Returns the target structure specifying the classes to recognize.
159 * @return the used target structure
160 */
161 public TargetStructure getTargetStructure() {
162 return targetStructure;
163 }
164
165 /***
166 * Returns the last extraction added to this container.
167 *
168 * @return the last extraction added to this container;
169 * or <code>null</code> if the container is empty
170 */
171 public Extraction last() {
172 if (allExtractions.isEmpty()) {
173 return null;
174 } else {
175 return allExtractions.get(allExtractions.size() - 1);
176 }
177 }
178
179 /***
180 * Returns a list of the last <em>n</em> extractions added to this
181 * container. Modifications of the returned list will not affect this
182 * container and vice versa.
183 *
184 * @param number the number of extractions to copy
185 * @return a list containing the last {@link Extraction}s
186 */
187 public List lastN(final int number) {
188 return CollectionUtils.lastN(allExtractions, number);
189 }
190
191 /***
192 * Returns a list of the last <em>n</em> extractions of a specified class
193 * added to this container. Modifications of the returned list will not
194 * affect this container and vice versa.
195 *
196 * @param extractionClass the class of extractions
197 * @param number the number of extractions to copy
198 * @return a list containing the last {@link Extraction}s of the given
199 * class; will be empty if there are no extractions of this class
200 */
201 public List<Extraction> lastN(final String extractionClass,
202 final int number) {
203 final List<Extraction> typedList = (List<Extraction>)
204 typedExtractions.get(extractionClass);
205 return CollectionUtils.lastN(typedList, number);
206 }
207
208 /***
209 * Removes an extraction from this container, if it is present.
210 *
211 * @param ext the extraction to remove
212 * @return <code>true</code> if the container contained the specified
213 * extraction
214 */
215 public boolean remove(final Extraction ext) {
216
217 final boolean removedFromTyped =
218 typedExtractions.remove(ext.getType(), ext) != null;
219
220 if (removedFromTyped) {
221
222 final boolean removedFromAll = allExtractions.remove(ext);
223
224 if (!removedFromAll) {
225
226 throw new RuntimeException("Implementation error: Removed "
227 + ext + " from typed extractions (" + typedExtractions
228 + ") but not from all extractions (" + allExtractions
229 + ")");
230 }
231 }
232
233 return removedFromTyped;
234 }
235
236 /***
237 * Removes the last extraction {@link #add(Extraction) added} to this
238 * container.
239 *
240 * @return the removed extraction
241 * @throws IllegalStateException if this method is called while the
242 * container is empty
243 */
244 public Extraction removeLast() throws IllegalStateException {
245 if (allExtractions.isEmpty()) {
246 throw new IllegalStateException(
247 "removeLast called on empty ExtractionContainer");
248 } else {
249 Extraction removedExt =
250 allExtractions.remove(allExtractions.size() - 1);
251 final Collection<Extraction> typeCollection =
252 typedExtractions.get(removedExt.getType());
253 final boolean removedFromTyped = typeCollection.remove(removedExt);
254
255 if (!removedFromTyped) {
256
257 throw new RuntimeException("Implementation error: Removed "
258 + removedExt + " from all extractions (" + allExtractions
259 + ") but not from typed extractions (" + typedExtractions
260 + ")");
261 }
262
263 return removedExt;
264 }
265 }
266
267 /***
268 * Restores extractions stored in a field container and adds them to this
269 * instance. The provided field container must have been filled by calling
270 * {@link #storeEntries(FieldContainer)} on an extraction container with
271 * identical {@linkplain #getTargetStructure() target structure}, or
272 * errors are likely.
273 *
274 * @param fContainer the field container to read
275 * @throws IllegalArgumentException if <code>fContainer</code> contains
276 * extractions of an unsupported
277 * (@link de.fu_berlin.ties.classify.Prediction#getType())
278 */
279 public void restoreEntries(final FieldContainer fContainer)
280 throws IllegalArgumentException {
281 final Iterator entryIter = fContainer.entryIterator();
282 FieldMap currentFM;
283 Extraction currentExt;
284 String currentType;
285 final SortedSet<String> ignoredTypes = new TreeSet<String>();
286
287
288 while (entryIter.hasNext()) {
289 currentFM = (FieldMap) entryIter.next();
290 currentExt = new Extraction(currentFM);
291 currentType = currentExt.getType();
292
293 if (targetStructure.getClassNames().contains(currentType)) {
294 add(currentExt);
295 } else if (!ignoredTypes.contains(currentType)) {
296
297
298 ignoredTypes.add(currentType);
299 }
300 }
301
302
303 if (!ignoredTypes.isEmpty()) {
304 Util.LOG.debug("Ignoring extraction(s) of unknown types: "
305 + ignoredTypes);
306 }
307 }
308
309 /***
310 * Adds all extractions stored in this instance to a field container for
311 * serialization. Extractions are serialized in insertion order, as
312 * required for {@linkplain Trainer training}.
313 *
314 * @param fContainer the field container to fill
315 */
316 public void storeEntries(final FieldContainer fContainer) {
317
318 final Iterator extIter = allExtractions.iterator();
319 Extraction currentExt;
320
321 while (extIter.hasNext()) {
322 currentExt = (Extraction) extIter.next();
323 fContainer.add(currentExt);
324 }
325 }
326
327 /***
328 * Returns a string representation of this object.
329 * @return a textual representation
330 */
331 public String toString() {
332 return new ToStringBuilder(this)
333 .append("target structure", targetStructure)
334 .append("typed extractions", typedExtractions)
335 .toString();
336 }
337
338 /***
339 * Unsets the positions of all stored extractions. In case of duplicates
340 * (extractions that only differed in their position), the most probably
341 * one is kept.
342 */
343 public void unsetPositions() {
344 final Iterator<String> classIter = typedExtractions.keySet().iterator();
345 List<Extraction> extList;
346 Iterator<Extraction> extIter;
347 String className;
348 Extraction ext;
349 Extraction otherExt;
350 String text;
351 Map<String, Extraction> bestExtractions =
352 new HashMap<String, Extraction>();
353 List<Extraction> duplicates = new ArrayList<Extraction>();
354 List<Extraction> allDuplicates =
355 new ArrayList<Extraction>();
356 PredictionComparator predComp = new PredictionComparator();
357
358
359 while (classIter.hasNext()) {
360
361 bestExtractions.clear();
362 duplicates.clear();
363 className = classIter.next();
364 extList = (List<Extraction>) typedExtractions.get(className);
365 extIter = extList.iterator();
366
367 while (extIter.hasNext()) {
368 ext = extIter.next();
369 text = ext.getText();
370
371 if (bestExtractions.containsKey(text)) {
372 otherExt = bestExtractions.get(text);
373
374
375 if (predComp.compare(otherExt, ext) < 0) {
376
377 bestExtractions.put(text, ext);
378 duplicates.add(otherExt);
379
380 if (ext.getProbability().getProb() >= 0.0) {
381 Util.LOG.debug("Discarded duplicate " + otherExt
382 + " in favor of " + ext);
383 }
384 } else {
385
386 duplicates.add(ext);
387
388 if (ext.getProbability().getProb() >= 0.0) {
389 Util.LOG.debug("Discarded duplicate " + ext
390 + " in favor of " + otherExt);
391 }
392 }
393 } else {
394
395 bestExtractions.put(text, ext);
396 }
397 }
398
399
400 extList.removeAll(duplicates);
401 allDuplicates.addAll(duplicates);
402 }
403
404
405 allExtractions.removeAll(allDuplicates);
406
407
408 final Iterator<Extraction> allIter = allExtractions.iterator();
409 while (allIter.hasNext()) {
410 ext = allIter.next();
411 ext.setFirstTokenRepIgnored(true);
412 }
413 }
414
415 }