1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.util.ArrayList;
25 import java.util.Collection;
26 import java.util.Collections;
27 import java.util.HashMap;
28 import java.util.IdentityHashMap;
29 import java.util.Iterator;
30 import java.util.List;
31 import java.util.Map;
32 import java.util.Set;
33 import java.util.SortedSet;
34 import java.util.TreeMap;
35 import java.util.TreeSet;
36
37 import org.apache.commons.lang.builder.ToStringBuilder;
38
39 import de.fu_berlin.ties.classify.PredictionComparator;
40 import de.fu_berlin.ties.io.FieldContainer;
41 import de.fu_berlin.ties.io.FieldMap;
42 import de.fu_berlin.ties.io.RestorableContainer;
43 import de.fu_berlin.ties.util.CollUtils;
44 import de.fu_berlin.ties.util.MultiValueMap;
45 import de.fu_berlin.ties.util.Util;
46
47 /***
48 * A container of {@link de.fu_berlin.ties.extract.Extraction}s of different
49 * classes. Instances of this class are not thread-safe.
50 *
51 * <p>Extraction containers can be serialized
52 * ({@link #storeEntries(FieldContainer)}), but not deserialized.
53 *
54 * <p>Instances of this class are not thread-safe and have to be synchronized
55 * externally, if required.
56 *
57 * @author Christian Siefkes
58 * @version $Revision: 1.35 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
59 */
60 public class ExtractionContainer implements RestorableContainer {
61
62 /***
63 * A list of all extractions stored in this container.
64 */
65 private final List<Extraction> allExtractions = new ArrayList<Extraction>();
66
67 /***
68 * The target structure specifying the classes to recognize.
69 */
70 private final TargetStructure targetStructure;
71
72 /***
73 * A multi-map storing lists of extractions for each extraction class name.
74 */
75 private final MultiValueMap<String, Extraction> typedExtractions =
76 new MultiValueMap<String, Extraction>();
77
78 /***
79 * Creates a new empty instance.
80 *
81 * @param targetStruct the target structure specifying the classes to
82 * recognize; a dummy target structure (no classes defined) means that
83 * all classes are accepted
84 */
85 public ExtractionContainer(final TargetStructure targetStruct) {
86 super();
87 targetStructure = targetStruct;
88 }
89
90 /***
91 * Creates a new instance from a field container, delegating to
92 * {@link #restoreEntries(FieldContainer)}.
93 *
94 * @param targetStruct the target structure specifying the classes to
95 * recognize
96 * @param fContainer the field container to read
97 * @throws IllegalArgumentException if <code>fContainer</code> contains
98 * extractions of an unsupported
99 * (@link de.fu_berlin.ties.classify.Prediction#getType())
100 */
101 public ExtractionContainer(final TargetStructure targetStruct,
102 final FieldContainer fContainer) throws IllegalArgumentException {
103 this(targetStruct);
104 restoreEntries(fContainer);
105 }
106
107 /***
108 * Adds an extraction to this container.
109 *
110 * @param extraction the extraction to add
111 * @throws IllegalArgumentException if the type of the extraction
112 * (@link de.fu_berlin.ties.classify.Prediction#getType()) is not
113 * in the set of class names defined by the target structure
114 */
115 public void add(final Extraction extraction)
116 throws IllegalArgumentException {
117
118 if (!targetStructure.validClassName(extraction.getType())) {
119 throw new IllegalArgumentException(
120 "Container doesn't allow extractions of type "
121 + extraction.getType());
122 }
123
124
125 allExtractions.add(extraction);
126 typedExtractions.put(extraction.getType(), extraction);
127 }
128
129 /***
130 * Adds all extractions from an iterator to this container.
131 *
132 * @param iterator an iterator over the extractions to add
133 * @throws IllegalArgumentException if the type of an extraction
134 * (@link de.fu_berlin.ties.classify.Prediction#getType()) is not
135 * in the set of class names defined by the target structure
136 */
137 public void addAll(final Iterator<Extraction> iterator)
138 throws IllegalArgumentException {
139 while (iterator.hasNext()) {
140 add(iterator.next());
141 }
142 }
143
144 /***
145 * Returns an iterator over all extractions in insertion order. The
146 * iterator is immutable and cannot be used to modify the underlying
147 * collection.
148 *
149 * @return an iterator over all extractions
150 */
151 public Iterator<Extraction> iterator() {
152 return Collections.unmodifiableList(allExtractions).iterator();
153 }
154
155 /***
156 * Returns an iterator over the extractions of a specified class, in
157 * insertion order. The iterator is immutable and cannot be used to modify
158 * the underlying collection.
159 *
160 * @param className the name of the class of extractions to iterate
161 * @return an iterator over the extractions of the given class (if any,
162 * otherwise an empty iterator will be returned)
163 */
164 public Iterator<Extraction> iterator(final String className) {
165 Collection<Extraction> typedList = typedExtractions.get(className);
166 if (typedList == null) {
167
168 typedList = new ArrayList<Extraction>(0);
169 }
170
171
172 return Collections.unmodifiableCollection(typedList).iterator();
173 }
174
175 /***
176 * This method filters overlapping extraction. Whenever two extractions
177 * overlap, the less probably one of them is removed. This process
178 * greedily continues until no overlapping extractions remain.
179 */
180 public void filterOverlapping() {
181
182 final MultiValueMap<Integer, Extraction> positionMap =
183 new MultiValueMap<Integer, Extraction>(
184 new TreeMap<Integer, Collection<Extraction>>());
185 final Iterator<Extraction> extractionIter = allExtractions.iterator();
186 Extraction extraction;
187 int firstIndex, lastIndex;
188 int i;
189
190 while (extractionIter.hasNext()) {
191 extraction = extractionIter.next();
192 firstIndex = extraction.getIndex();
193 lastIndex = extraction.getLastIndex();
194
195
196 if (firstIndex < 0 || lastIndex < 0) {
197 Util.LOG.warn("First or last index position for " + extraction
198 + " is negative (not specified) -- "
199 + "overlap filtering will be unreliable: " + firstIndex
200 + ", " + lastIndex);
201 }
202 if (lastIndex < firstIndex) {
203 throw new IllegalArgumentException(
204 "Illegal index positions for " + extraction
205 + ": first index " + firstIndex
206 + " is larger than last index " + lastIndex);
207 }
208
209
210 for (i = firstIndex; i <= lastIndex; i++) {
211 positionMap.put(i, extraction);
212 }
213 }
214
215 final Iterator<Integer> positionIter = positionMap.keySet().iterator();
216 Integer position;
217 int posToClear;
218 Collection<Extraction> collectedExtractions, extractionsAtPosToClear;
219
220 final IdentityHashMap<Extraction, Object> extractionsToRemove =
221 new IdentityHashMap<Extraction, Object>();
222 Set<Extraction> extractionsToRemoveSet;
223 Iterator<Extraction> extractionsIter, extractionsToRemoveIter;
224 Extraction currentExt, mostLikelyExt;
225 final PredictionComparator comp = new PredictionComparator();
226 boolean result;
227 int numExtractionsToRemove, numRemovedExtractions;
228
229
230
231 while (positionIter.hasNext()) {
232 position = positionIter.next();
233 collectedExtractions = positionMap.get(position);
234
235 if (collectedExtractions.size() > 1) {
236 mostLikelyExt = null;
237 extractionsIter = collectedExtractions.iterator();
238
239
240 while (extractionsIter.hasNext()) {
241 currentExt = extractionsIter.next();
242
243 if (mostLikelyExt == null
244 || comp.compare(mostLikelyExt, currentExt) < 0) {
245
246 mostLikelyExt = currentExt;
247 }
248 }
249
250
251 extractionsIter = collectedExtractions.iterator();
252 while (extractionsIter.hasNext()) {
253 currentExt = extractionsIter.next();
254
255 if (currentExt != mostLikelyExt) {
256 extractionsToRemove.put(currentExt, null);
257 }
258 }
259
260 extractionsToRemoveSet = extractionsToRemove.keySet();
261 extractionsToRemoveIter = extractionsToRemoveSet.iterator();
262
263 while (extractionsToRemoveIter.hasNext()) {
264 currentExt = extractionsToRemoveIter.next();
265 Util.LOG.debug("Removing " + currentExt
266 + " because it overlaps with more probably "
267 + mostLikelyExt);
268
269
270 firstIndex = currentExt.getIndex();
271 lastIndex = currentExt.getLastIndex();
272
273 for (posToClear = firstIndex; posToClear <= lastIndex;
274 posToClear++) {
275 extractionsAtPosToClear = positionMap.get(posToClear);
276 result = CollUtils.removeByIdentity(
277 extractionsAtPosToClear, currentExt);
278
279 if (!result) {
280
281 throw new RuntimeException(
282 "Implementation error: "
283 + "Could not remove extraction "
284 + currentExt + " from position "
285 + posToClear);
286 }
287 }
288 }
289
290
291 numRemovedExtractions = removeAll(extractionsToRemoveSet);
292 numExtractionsToRemove = extractionsToRemoveSet.size();
293
294 if (numRemovedExtractions != numExtractionsToRemove) {
295
296 throw new RuntimeException("Implementation error: "
297 + "Removed only " + numRemovedExtractions
298 + " instead of " + numExtractionsToRemove
299 + " while filtering overlapping extractions");
300 }
301
302
303 extractionsToRemove.clear();
304 }
305 }
306 }
307
308 /***
309 * Returns the target structure specifying the classes to recognize.
310 * @return the used target structure
311 */
312 public TargetStructure getTargetStructure() {
313 return targetStructure;
314 }
315
316 /***
317 * Returns the last extraction added to this container.
318 *
319 * @return the last extraction added to this container;
320 * or <code>null</code> if the container is empty
321 */
322 public Extraction last() {
323 if (allExtractions.isEmpty()) {
324 return null;
325 } else {
326 return allExtractions.get(allExtractions.size() - 1);
327 }
328 }
329
330 /***
331 * Returns a list of the last <em>n</em> extractions added to this
332 * container. Modifications of the returned list will not affect this
333 * container and vice versa.
334 *
335 * @param number the number of extractions to copy
336 * @return a list containing the last {@link Extraction}s
337 */
338 public List lastN(final int number) {
339 return CollUtils.lastN(allExtractions, number);
340 }
341
342 /***
343 * Returns a list of the last <em>n</em> extractions of a specified class
344 * added to this container. Modifications of the returned list will not
345 * affect this container and vice versa.
346 *
347 * @param extractionClass the class of extractions
348 * @param number the number of extractions to copy
349 * @return a list containing the last {@link Extraction}s of the given
350 * class; will be empty if there are no extractions of this class
351 */
352 public List<Extraction> lastN(final String extractionClass,
353 final int number) {
354 final List<Extraction> typedList = (List<Extraction>)
355 typedExtractions.get(extractionClass);
356 return CollUtils.lastN(typedList, number);
357 }
358
359 /***
360 * Removes an extraction from this container, if it is present.
361 *
362 *
363 * @param ext the extraction to remove
364 * @return <code>true</code> if the container contained the specified
365 * extraction
366 */
367 public boolean remove(final Extraction ext) {
368
369 final boolean removedFromTyped = CollUtils.removeByIdentity(
370 typedExtractions.get(ext.getType()), ext);
371
372 if (removedFromTyped) {
373
374 final boolean removedFromAll =
375 CollUtils.removeByIdentity(allExtractions, ext);
376
377 if (!removedFromAll) {
378
379 throw new RuntimeException("Implementation error: Removed "
380 + ext + " from typed extractions (" + typedExtractions
381 + ") but not from all extractions (" + allExtractions
382 + ")");
383 }
384 }
385
386 return removedFromTyped;
387 }
388
389 /***
390 * Removes all given extractions from this container, if they are present.
391 *
392 * @param set the set of extractions to remove
393 * @return the number of extractions removed from this container; will be in
394 * the range from 0 (if none of the extractions was found in this container)
395 * to {@link Set#size() set.size()} (if all have been found)
396 */
397 public int removeAll(final Set<Extraction> set) {
398 Extraction ext;
399 int foundInAll = 0;
400 final Iterator<Extraction> allIter = allExtractions.iterator();
401
402
403 while (allIter.hasNext()) {
404 ext = allIter.next();
405
406 if (set.contains(ext)) {
407
408 allIter.remove();
409 foundInAll++;
410 }
411 }
412
413 int foundInTyped = 0;
414 final Iterator<Extraction> typedIter =
415 typedExtractions.values().iterator();
416
417
418 while (typedIter.hasNext()) {
419 ext = typedIter.next();
420
421 if (set.contains(ext)) {
422
423 typedIter.remove();
424 foundInTyped++;
425 }
426 }
427
428
429 if (foundInAll == foundInTyped) {
430 return foundInAll;
431 } else {
432
433 throw new RuntimeException(
434 "Implementation errors: removeAll removed " + foundInAll
435 + " extractions from all but " + foundInTyped
436 + " from typed extractions");
437 }
438 }
439
440 /***
441 * Removes the last extraction {@link #add(Extraction) added} to this
442 * container.
443 *
444 * @return the removed extraction
445 * @throws IllegalStateException if this method is called while the
446 * container is empty
447 */
448 public Extraction removeLast() throws IllegalStateException {
449 if (allExtractions.isEmpty()) {
450 throw new IllegalStateException(
451 "removeLast called on empty ExtractionContainer");
452 } else {
453 Extraction removedExt =
454 allExtractions.remove(allExtractions.size() - 1);
455 final Collection<Extraction> typeCollection =
456 typedExtractions.get(removedExt.getType());
457 final boolean removedFromTyped =
458 CollUtils.removeByIdentity(typeCollection, removedExt);
459
460 if (!removedFromTyped) {
461
462 throw new RuntimeException("Implementation error: Removed "
463 + removedExt + " from all extractions (" + allExtractions
464 + ") but not from typed extractions (" + typedExtractions
465 + ")");
466 }
467
468 return removedExt;
469 }
470 }
471
472 /***
473 * Restores extractions stored in a field container and adds them to this
474 * instance. The provided field container must have been filled by calling
475 * {@link #storeEntries(FieldContainer)} on an extraction container with
476 * identical {@linkplain #getTargetStructure() target structure}, or
477 * errors are likely.
478 *
479 * @param fContainer the field container to read
480 * @throws IllegalArgumentException if <code>fContainer</code> contains
481 * extractions of an unsupported
482 * (@link de.fu_berlin.ties.classify.Prediction#getType())
483 */
484 public void restoreEntries(final FieldContainer fContainer)
485 throws IllegalArgumentException {
486 final Iterator entryIter = fContainer.entryIterator();
487 FieldMap currentFM;
488 Extraction currentExt;
489 String currentType;
490 final SortedSet<String> ignoredTypes = new TreeSet<String>();
491
492
493 while (entryIter.hasNext()) {
494 currentFM = (FieldMap) entryIter.next();
495 currentExt = new Extraction(currentFM);
496 currentType = currentExt.getType();
497
498 if (targetStructure.validClassName(currentType)) {
499 add(currentExt);
500 } else if (!ignoredTypes.contains(currentType)) {
501
502
503 ignoredTypes.add(currentType);
504 }
505 }
506
507
508 if (!ignoredTypes.isEmpty()) {
509 Util.LOG.debug("Ignoring extraction(s) of unknown types: "
510 + ignoredTypes);
511 }
512 }
513
514 /***
515 * Returns the number of extractions stored in this container.
516 * @return the number of extractions stored in this container
517 */
518 public int size() {
519 return allExtractions.size();
520 }
521
522 /***
523 * Adds all extractions stored in this instance to a field container for
524 * serialization. Extractions are serialized in insertion order, as
525 * required for {@linkplain Trainer training}.
526 *
527 * @param fContainer the field container to fill
528 */
529 public void storeEntries(final FieldContainer fContainer) {
530
531 final Iterator extIter = allExtractions.iterator();
532 Extraction currentExt;
533
534 while (extIter.hasNext()) {
535 currentExt = (Extraction) extIter.next();
536 fContainer.add(currentExt);
537 }
538 }
539
540 /***
541 * Returns a string representation of this object.
542 * @return a textual representation
543 */
544 public String toString() {
545 return new ToStringBuilder(this)
546 .append("target structure", targetStructure)
547 .append("typed extractions", typedExtractions)
548 .toString();
549 }
550
551 /***
552 * Unsets the positions of all stored extractions. In case of duplicates
553 * (extractions that only differed in their position), the most probably
554 * one is kept.
555 */
556 public void unsetPositions() {
557 final Iterator<String> classIter = typedExtractions.keySet().iterator();
558 List<Extraction> extList;
559 Iterator<Extraction> extIter;
560 String className;
561 Extraction ext;
562 Extraction otherExt;
563 String text;
564 Map<String, Extraction> bestExtractions =
565 new HashMap<String, Extraction>();
566 List<Extraction> duplicates = new ArrayList<Extraction>();
567 List<Extraction> allDuplicates =
568 new ArrayList<Extraction>();
569 PredictionComparator predComp = new PredictionComparator();
570
571
572 while (classIter.hasNext()) {
573
574 bestExtractions.clear();
575 duplicates.clear();
576 className = classIter.next();
577 extList = (List<Extraction>) typedExtractions.get(className);
578 extIter = extList.iterator();
579
580 while (extIter.hasNext()) {
581 ext = extIter.next();
582 text = ext.getText();
583
584 if (bestExtractions.containsKey(text)) {
585 otherExt = bestExtractions.get(text);
586
587
588 if (predComp.compare(otherExt, ext) < 0) {
589
590 bestExtractions.put(text, ext);
591 duplicates.add(otherExt);
592
593 if (ext.getProbability().getProb() >= 0.0) {
594 Util.LOG.debug("Discarded duplicate " + otherExt
595 + " in favor of " + ext);
596 }
597 } else {
598
599 duplicates.add(ext);
600
601 if (ext.getProbability().getProb() >= 0.0) {
602 Util.LOG.debug("Discarded duplicate " + ext
603 + " in favor of " + otherExt);
604 }
605 }
606 } else {
607
608 bestExtractions.put(text, ext);
609 }
610 }
611
612
613 extList.removeAll(duplicates);
614 allDuplicates.addAll(duplicates);
615 }
616
617
618 allExtractions.removeAll(allDuplicates);
619
620
621 final Iterator<Extraction> allIter = allExtractions.iterator();
622 while (allIter.hasNext()) {
623 ext = allIter.next();
624 ext.setFirstTokenRepIgnored(true);
625 }
626 }
627
628 }