View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.util.ArrayList;
25  import java.util.Collection;
26  import java.util.Collections;
27  import java.util.HashMap;
28  import java.util.IdentityHashMap;
29  import java.util.Iterator;
30  import java.util.List;
31  import java.util.Map;
32  import java.util.Set;
33  import java.util.SortedSet;
34  import java.util.TreeMap;
35  import java.util.TreeSet;
36  
37  import org.apache.commons.lang.builder.ToStringBuilder;
38  
39  import de.fu_berlin.ties.classify.PredictionComparator;
40  import de.fu_berlin.ties.io.FieldContainer;
41  import de.fu_berlin.ties.io.FieldMap;
42  import de.fu_berlin.ties.io.RestorableContainer;
43  import de.fu_berlin.ties.util.CollUtils;
44  import de.fu_berlin.ties.util.MultiValueMap;
45  import de.fu_berlin.ties.util.Util;
46  
47  /***
48   * A container of {@link de.fu_berlin.ties.extract.Extraction}s of different
49   * classes. Instances of this class are not thread-safe.
50   *
51   * <p>Extraction containers can be serialized
52   * ({@link #storeEntries(FieldContainer)}), but not deserialized.
53   *
54   * <p>Instances of this class are not thread-safe and have to be synchronized
55   * externally, if required.
56   *
57   * @author Christian Siefkes
58   * @version $Revision: 1.35 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
59   */
60  public class ExtractionContainer implements RestorableContainer {
61  
62      /***
63       * A list of all extractions stored in this container.
64       */
65      private final List<Extraction> allExtractions = new ArrayList<Extraction>();
66  
67      /***
68       * The target structure specifying the classes to recognize.
69       */
70      private final TargetStructure targetStructure;
71  
72      /***
73       * A multi-map storing lists of extractions for each extraction class name.
74       */
75      private final MultiValueMap<String, Extraction> typedExtractions =
76          new MultiValueMap<String, Extraction>();
77  
78      /***
79       * Creates a new empty instance.
80       *
81       * @param targetStruct the target structure specifying the classes to
82       * recognize; a dummy target structure (no classes defined) means that
83       * all classes are accepted
84       */
85      public ExtractionContainer(final TargetStructure targetStruct) {
86          super();
87          targetStructure = targetStruct;
88      }
89  
90      /***
91       * Creates a new instance from a field container, delegating to
92       * {@link #restoreEntries(FieldContainer)}.
93       *
94       * @param targetStruct the target structure specifying the classes to
95       * recognize
96       * @param fContainer the field container to read
97       * @throws IllegalArgumentException if <code>fContainer</code> contains
98       * extractions of an unsupported
99       * (@link de.fu_berlin.ties.classify.Prediction#getType())
100      */
101     public ExtractionContainer(final TargetStructure targetStruct,
102             final FieldContainer fContainer) throws IllegalArgumentException {
103         this(targetStruct);
104         restoreEntries(fContainer);
105     }
106 
107     /***
108      * Adds an extraction to this container.
109      *
110      * @param extraction the extraction to add
111      * @throws IllegalArgumentException if the type of the extraction
112      * (@link de.fu_berlin.ties.classify.Prediction#getType()) is not
113      * in the set of class names defined by the target structure
114      */
115     public void add(final Extraction extraction)
116             throws IllegalArgumentException {
117         // check class
118         if (!targetStructure.validClassName(extraction.getType())) {
119             throw new IllegalArgumentException(
120                 "Container doesn't allow extractions of type "
121                 + extraction.getType());
122         }
123 
124         // store extraction
125         allExtractions.add(extraction);
126         typedExtractions.put(extraction.getType(), extraction);
127     }
128 
129     /***
130      * Adds all extractions from an iterator to this container.
131      *
132      * @param iterator an iterator over the extractions to add
133      * @throws IllegalArgumentException if the type of an extraction
134      * (@link de.fu_berlin.ties.classify.Prediction#getType()) is not
135      * in the set of class names defined by the target structure
136      */
137     public void addAll(final Iterator<Extraction> iterator)
138             throws IllegalArgumentException {
139         while (iterator.hasNext()) {
140             add(iterator.next());
141         }
142     }
143 
144     /***
145      * Returns an iterator over all extractions in insertion order. The
146      * iterator is immutable and cannot be used to modify the underlying
147      * collection.
148      *
149      * @return an iterator over all extractions
150      */
151     public Iterator<Extraction> iterator() {
152         return Collections.unmodifiableList(allExtractions).iterator();
153     }
154 
155     /***
156      * Returns an iterator over the extractions of a specified class, in
157      * insertion order. The iterator is immutable and cannot be used to modify
158      * the underlying collection.
159      *
160      * @param className the name of the class of extractions to iterate
161      * @return an iterator over the extractions of the given class (if any,
162      * otherwise an empty iterator will be returned)
163      */
164     public Iterator<Extraction> iterator(final String className) {
165         Collection<Extraction> typedList = typedExtractions.get(className);
166         if (typedList == null) {
167             // create empty list
168             typedList = new ArrayList<Extraction>(0);
169         }
170 
171         // return iterator on immutable view
172         return Collections.unmodifiableCollection(typedList).iterator();
173     }
174 
175     /***
176      * This method filters overlapping extraction. Whenever two extractions
177      * overlap, the less probably one of them is removed. This process
178      * greedily continues until no overlapping extractions remain.
179      */
180     public void filterOverlapping() {
181         // a map from index positions to all extractions covering this position
182         final MultiValueMap<Integer, Extraction> positionMap =
183             new MultiValueMap<Integer, Extraction>(
184                     new TreeMap<Integer, Collection<Extraction>>());
185         final Iterator<Extraction> extractionIter = allExtractions.iterator();
186         Extraction extraction;
187         int firstIndex, lastIndex;
188         int i;
189 
190         while (extractionIter.hasNext()) {
191             extraction = extractionIter.next();
192             firstIndex = extraction.getIndex();
193             lastIndex = extraction.getLastIndex();
194 
195             // check that index positions are valid
196             if (firstIndex < 0 || lastIndex < 0) {
197                 Util.LOG.warn("First or last index position for " + extraction
198                         + " is negative (not specified) -- "
199                         + "overlap filtering will be unreliable: " + firstIndex
200                         + ", " + lastIndex);
201             }
202             if (lastIndex < firstIndex) {
203                 throw new IllegalArgumentException(
204                         "Illegal index positions for " + extraction
205                         + ": first index " + firstIndex
206                         + " is larger than last index " + lastIndex);
207             }
208 
209             // store extraction for all covered index positions
210             for (i = firstIndex; i <= lastIndex; i++) {
211                 positionMap.put(i, extraction);
212             }
213         }
214 
215         final Iterator<Integer> positionIter = positionMap.keySet().iterator();
216         Integer position;
217         int posToClear;
218         Collection<Extraction> collectedExtractions, extractionsAtPosToClear;
219         // use an identity map (with null values) for identity-based removal
220         final IdentityHashMap<Extraction, Object> extractionsToRemove =
221             new IdentityHashMap<Extraction, Object>();
222         Set<Extraction> extractionsToRemoveSet;
223         Iterator<Extraction> extractionsIter, extractionsToRemoveIter;
224         Extraction currentExt, mostLikelyExt;
225         final PredictionComparator comp = new PredictionComparator();
226         boolean result;
227         int numExtractionsToRemove, numRemovedExtractions;
228 
229         // iterate all intex positions and discard all except the most probable
230         // extraction in case of overlaps
231         while (positionIter.hasNext()) {
232             position = positionIter.next();
233             collectedExtractions = positionMap.get(position);
234 
235             if (collectedExtractions.size() > 1) {
236                 mostLikelyExt = null;
237                 extractionsIter = collectedExtractions.iterator();
238 
239                 // find out which extraction has the highest probability
240                 while (extractionsIter.hasNext()) {
241                     currentExt = extractionsIter.next();
242 
243                     if (mostLikelyExt == null
244                             || comp.compare(mostLikelyExt, currentExt) < 0) {
245                         // current extraction is more likely
246                         mostLikelyExt = currentExt;
247                     }
248                 }
249 
250                 // remove all extractions except the most probably one
251                 extractionsIter = collectedExtractions.iterator();
252                 while (extractionsIter.hasNext()) {
253                     currentExt = extractionsIter.next();
254 
255                     if (currentExt != mostLikelyExt) {
256                         extractionsToRemove.put(currentExt, null);
257                     }
258                 }
259 
260                 extractionsToRemoveSet = extractionsToRemove.keySet();
261                 extractionsToRemoveIter = extractionsToRemoveSet.iterator();
262 
263                 while (extractionsToRemoveIter.hasNext()) {
264                     currentExt = extractionsToRemoveIter.next();
265                     Util.LOG.debug("Removing " + currentExt
266                             + " because it overlaps with more probably "
267                             + mostLikelyExt);
268 
269                     // remove from all covered positions in the map
270                     firstIndex = currentExt.getIndex();
271                     lastIndex = currentExt.getLastIndex();
272 
273                     for (posToClear = firstIndex; posToClear <= lastIndex;
274                             posToClear++) {
275                         extractionsAtPosToClear = positionMap.get(posToClear);
276                         result = CollUtils.removeByIdentity(
277                                 extractionsAtPosToClear, currentExt);
278 
279                         if (!result) {
280                             // not supposed to happen
281                             throw new RuntimeException(
282                                 "Implementation error: "
283                                 + "Could not remove extraction "
284                                 + currentExt + " from position "
285                                 + posToClear);
286                         }
287                     }
288                 } // while extractionsToRemoveIter
289 
290                 // remove all extractions from this container
291                 numRemovedExtractions = removeAll(extractionsToRemoveSet);
292                 numExtractionsToRemove = extractionsToRemoveSet.size();
293 
294                 if (numRemovedExtractions != numExtractionsToRemove) {
295                     // not supposed to happen
296                     throw new RuntimeException("Implementation error: "
297                         + "Removed only " + numRemovedExtractions
298                         + " instead of " + numExtractionsToRemove
299                         + " while filtering overlapping extractions");
300                 }
301 
302                 // reset collection
303                 extractionsToRemove.clear();
304             }
305         }
306     }
307 
308     /***
309      * Returns the target structure specifying the classes to recognize.
310      * @return the used target structure
311      */
312     public TargetStructure getTargetStructure() {
313         return targetStructure;
314     }
315 
316     /***
317      * Returns the last extraction added to this container.
318      *
319      * @return the last extraction added to this container;
320      * or <code>null</code> if the container is empty
321      */
322     public Extraction last() {
323         if (allExtractions.isEmpty()) {
324             return null;
325         } else {
326             return allExtractions.get(allExtractions.size() - 1);
327         }
328     }
329 
330     /***
331      * Returns a list of the last <em>n</em> extractions added to this
332      * container. Modifications of the returned list will not affect this
333      * container and vice versa.
334      *
335      * @param number the number of extractions to copy
336      * @return a list containing the last {@link Extraction}s
337      */
338     public List lastN(final int number) {
339         return CollUtils.lastN(allExtractions, number);
340     }
341 
342     /***
343      * Returns a list of the last <em>n</em> extractions of a specified class
344      * added to this container. Modifications of the returned list will not
345      * affect this container and vice versa.
346      *
347      * @param extractionClass the class of extractions
348      * @param number the number of extractions to copy
349      * @return a list containing the last {@link Extraction}s of the given
350      * class; will be empty if there are no extractions of this class
351      */
352     public List<Extraction> lastN(final String extractionClass,
353             final int number) {
354         final List<Extraction> typedList = (List<Extraction>)
355             typedExtractions.get(extractionClass);
356         return CollUtils.lastN(typedList, number);
357     }
358 
359     /***
360      * Removes an extraction from this container, if it is present.
361      * 
362      *
363      * @param ext the extraction to remove
364      * @return <code>true</code> if the container contained the specified
365      * extraction
366      */
367     public boolean remove(final Extraction ext) {
368         // first remove from typed extraction (shorter list)
369         final boolean removedFromTyped = CollUtils.removeByIdentity(
370                 typedExtractions.get(ext.getType()), ext);
371 
372         if (removedFromTyped) {
373             // also remove from all extractions
374             final boolean removedFromAll =
375                 CollUtils.removeByIdentity(allExtractions, ext);
376 
377             if (!removedFromAll) {
378                 // this should never happen
379                 throw new RuntimeException("Implementation error: Removed "
380                     + ext + " from typed extractions (" + typedExtractions
381                     + ") but not from all extractions (" + allExtractions
382                     + ")");
383             }
384         }
385 
386         return removedFromTyped;
387     }
388 
389     /***
390      * Removes all given extractions from this container, if they are present.
391      *
392      * @param set the set of extractions to remove
393      * @return the number of extractions removed from this container; will be in
394      * the range from 0 (if none of the extractions was found in this container)
395      * to {@link Set#size() set.size()} (if all have been found) 
396      */
397     public int removeAll(final Set<Extraction> set) {
398         Extraction ext;
399         int foundInAll = 0;
400         final Iterator<Extraction> allIter = allExtractions.iterator();
401 
402         // iterate all extractions and remove all that can be found
403         while (allIter.hasNext()) {
404             ext = allIter.next();
405 
406             if (set.contains(ext)) {
407                 // remove this extraction
408                 allIter.remove();
409                 foundInAll++;
410             }
411         }
412 
413         int foundInTyped = 0;
414         final Iterator<Extraction> typedIter =
415             typedExtractions.values().iterator();
416 
417         // iterate typed extractions and remove all that can be found
418         while (typedIter.hasNext()) {
419             ext = typedIter.next();
420 
421             if (set.contains(ext)) {
422                 // remove this extraction
423                 typedIter.remove();
424                 foundInTyped++;
425             }
426         }
427 
428         // compare that numbers are identical and return
429         if (foundInAll == foundInTyped) {
430             return foundInAll;
431         } else {
432             // not supposed to happen
433             throw new RuntimeException(
434                     "Implementation errors: removeAll removed " + foundInAll
435                     + " extractions from all but " + foundInTyped
436                     + " from typed extractions");
437         }
438     }
439 
440     /***
441      * Removes the last extraction {@link #add(Extraction) added} to this
442      * container.
443      *
444      * @return the removed extraction
445      * @throws IllegalStateException if this method is called while the
446      * container is empty
447      */
448     public Extraction removeLast() throws IllegalStateException {
449         if (allExtractions.isEmpty()) {
450             throw new IllegalStateException(
451                 "removeLast called on empty ExtractionContainer");
452         } else {
453             Extraction removedExt =
454                 allExtractions.remove(allExtractions.size() - 1);
455             final Collection<Extraction> typeCollection =
456                 typedExtractions.get(removedExt.getType());
457             final boolean removedFromTyped =
458                 CollUtils.removeByIdentity(typeCollection, removedExt);
459 
460             if (!removedFromTyped) {
461                 // this should never happen
462                 throw new RuntimeException("Implementation error: Removed "
463                     + removedExt + " from all extractions (" + allExtractions
464                     + ") but not from typed extractions (" + typedExtractions
465                     + ")");
466             }
467 
468             return removedExt;
469         }
470     }
471 
472     /***
473      * Restores extractions stored in a field container and adds them to this
474      * instance. The provided field container must have been filled by calling
475      * {@link #storeEntries(FieldContainer)} on an extraction container with
476      * identical {@linkplain #getTargetStructure() target structure}, or
477      * errors are likely.
478      *
479      * @param fContainer the field container to read
480      * @throws IllegalArgumentException if <code>fContainer</code> contains
481      * extractions of an unsupported
482      * (@link de.fu_berlin.ties.classify.Prediction#getType())
483      */
484     public void restoreEntries(final FieldContainer fContainer)
485             throws IllegalArgumentException {
486         final Iterator entryIter = fContainer.entryIterator();
487         FieldMap currentFM;
488         Extraction currentExt;
489         String currentType;
490         final SortedSet<String> ignoredTypes = new TreeSet<String>();
491 
492         // deserialize and add extractions
493         while (entryIter.hasNext()) {
494             currentFM = (FieldMap) entryIter.next();
495             currentExt = new Extraction(currentFM);
496             currentType = currentExt.getType();
497 
498             if (targetStructure.validClassName(currentType)) {
499                 add(currentExt);
500             } else if (!ignoredTypes.contains(currentType)) {
501                 // we ignore extractions of other types so it possible to work
502                 // with a subset of all stored extractions
503                 ignoredTypes.add(currentType);
504             }
505         }
506 
507         // log debug message if we encountered unknown/ignored types
508         if (!ignoredTypes.isEmpty()) {
509             Util.LOG.debug("Ignoring extraction(s) of unknown types: "
510                 + ignoredTypes);
511         }
512     }
513 
514     /***
515      * Returns the number of extractions stored in this container.
516      * @return the number of extractions stored in this container
517      */
518     public int size() {
519         return allExtractions.size();
520     }
521 
522     /***
523      * Adds all extractions stored in this instance to a field container for
524      * serialization. Extractions are serialized in insertion order, as
525      * required for {@linkplain Trainer training}.
526      *
527      * @param fContainer the field container to fill
528      */
529     public void storeEntries(final FieldContainer fContainer) {
530         // serialize extraction in insertion order (required for training)
531         final Iterator extIter = allExtractions.iterator();
532         Extraction currentExt;
533 
534         while (extIter.hasNext()) {
535             currentExt = (Extraction) extIter.next();
536             fContainer.add(currentExt);
537         }
538     }
539 
540     /***
541      * Returns a string representation of this object.
542      * @return a textual representation
543      */
544     public String toString() {
545         return new ToStringBuilder(this)
546             .append("target structure", targetStructure)
547             .append("typed extractions", typedExtractions)
548             .toString();
549     }
550 
551     /***
552      * Unsets the positions of all stored extractions. In case of duplicates
553      * (extractions that only differed in their position), the most probably
554      * one is kept.
555      */
556     public void unsetPositions() {
557         final Iterator<String> classIter = typedExtractions.keySet().iterator();
558         List<Extraction> extList;
559         Iterator<Extraction> extIter;
560         String className;
561         Extraction ext;
562         Extraction otherExt;
563         String text;
564         Map<String, Extraction> bestExtractions =
565             new HashMap<String, Extraction>();
566         List<Extraction> duplicates = new ArrayList<Extraction>(); // each class
567         List<Extraction> allDuplicates =
568             new ArrayList<Extraction>(); // all classes
569         PredictionComparator predComp = new PredictionComparator();
570 
571         // iterate extractions of each class to detect duplicates
572         while (classIter.hasNext()) {
573             // reset collections for new class
574             bestExtractions.clear();
575             duplicates.clear();
576             className = classIter.next();
577             extList = (List<Extraction>) typedExtractions.get(className);
578             extIter = extList.iterator();
579 
580             while (extIter.hasNext()) {
581                 ext = extIter.next();
582                 text = ext.getText();
583 
584                 if (bestExtractions.containsKey(text)) {
585                     otherExt = bestExtractions.get(text);
586 
587                     // found duplicate -- keep more likely one
588                     if (predComp.compare(otherExt, ext) < 0) {
589                         // new one is more likely
590                         bestExtractions.put(text, ext);
591                         duplicates.add(otherExt);
592 
593                         if (ext.getProbability().getProb() >= 0.0) {
594                             Util.LOG.debug("Discarded duplicate " + otherExt
595                                     + " in favor of " + ext);
596                         }
597                     } else {
598                         // old one is at least as likely
599                         duplicates.add(ext);
600 
601                         if (ext.getProbability().getProb() >= 0.0) {
602                             Util.LOG.debug("Discarded duplicate " + ext
603                                     + " in favor of " + otherExt);
604                         }
605                     }
606                 } else {
607                     // first extraction with this type and text
608                     bestExtractions.put(text, ext);
609                 }
610             } // while extIter
611 
612             // remove duplicates from typedExtractions
613             extList.removeAll(duplicates);
614             allDuplicates.addAll(duplicates);
615         } // while classIter
616 
617         // remove duplicated from allExtractions
618         allExtractions.removeAll(allDuplicates);
619 
620         // set first token rep. of all remaining extractions to -1
621         final Iterator<Extraction> allIter = allExtractions.iterator();
622         while (allIter.hasNext()) {
623             ext = allIter.next();
624             ext.setFirstTokenRepIgnored(true);
625         }
626     }
627 
628 }