View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.util.ArrayList;
25  import java.util.Collection;
26  import java.util.Collections;
27  import java.util.HashMap;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.Map;
31  import java.util.SortedSet;
32  import java.util.TreeSet;
33  
34  import org.apache.commons.collections.IteratorUtils;
35  import org.apache.commons.lang.builder.ToStringBuilder;
36  
37  import de.fu_berlin.ties.classify.PredictionComparator;
38  import de.fu_berlin.ties.io.FieldContainer;
39  import de.fu_berlin.ties.io.FieldMap;
40  import de.fu_berlin.ties.io.RestorableContainer;
41  import de.fu_berlin.ties.util.CollectionUtils;
42  import de.fu_berlin.ties.util.MultiValueMap;
43  import de.fu_berlin.ties.util.Util;
44  
45  /***
46   * A container of {@link de.fu_berlin.ties.extract.Extraction}s of different
47   * classes. Instances of this class are not thread-safe.
48   *
49   * <p>Extraction containers can be serialized
50   * ({@link #storeEntries(FieldContainer)}), but not deserialized.
51   *
52   * <p>Instances of this class are not thread-safe and have to be synchronized
53   * externally, if required.
54   *
55   * @author Christian Siefkes
56   * @version $Revision: 1.17 $, $Date: 2004/12/06 17:58:36 $, $Author: siefkes $
57   */
58  public class ExtractionContainer implements RestorableContainer {
59  
60      /***
61       * A list of all extractions stored in this container.
62       */
63      private final List<Extraction> allExtractions = new ArrayList<Extraction>();
64  
65      /***
66       * The target structure specifying the classes to recognize.
67       */
68      private final TargetStructure targetStructure;
69  
70      /***
71       * A multi-map storing lists of extractions for each extraction class name.
72       */
73      private final MultiValueMap<String, Extraction> typedExtractions =
74          new MultiValueMap<String, Extraction>();
75  
76      /***
77       * Creates a new empty instance.
78       *
79       * @param targetStruct the target structure specifying the classes to
80       * recognize
81       */
82      public ExtractionContainer(final TargetStructure targetStruct) {
83          super();
84          targetStructure = targetStruct;
85      }
86  
87      /***
88       * Creates a new instance from a field container, delegating to
89       * {@link #restoreEntries(FieldContainer)}.
90       *
91       * @param targetStruct the target structure specifying the classes to
92       * recognize
93       * @param fContainer the field container to read
94       * @throws IllegalArgumentException if <code>fContainer</code> contains
95       * extractions of an unsupported
96       * (@link de.fu_berlin.ties.classify.Prediction#getType())
97       */
98      public ExtractionContainer(final TargetStructure targetStruct,
99              final FieldContainer fContainer) throws IllegalArgumentException {
100         this(targetStruct);
101         restoreEntries(fContainer);
102     }
103 
104     /***
105      * Adds an extraction to this container.
106      *
107      * @param extraction the extraction to add
108      * @throws IllegalArgumentException if the type of the extraction
109      * (@link de.fu_berlin.ties.classify.Prediction#getType()) is not
110      * in the set of class names defined by the target structure
111      */
112     public void add(final Extraction extraction)
113             throws IllegalArgumentException {
114         // check class
115         if (!targetStructure.getClassNames().contains(extraction.getType())) {
116             throw new IllegalArgumentException(
117                 "Container doesn't allow extractions of type "
118                 + extraction.getType());
119         }
120 
121         // store extraction
122         allExtractions.add(extraction);
123         typedExtractions.put(extraction.getType(), extraction);
124     }
125 
126     /***
127      * Returns an iterator over all extractions in insertion order. The
128      * iterator is immutable and cannot be used to modify the underlying
129      * collection.
130      *
131      * @return an iterator over all extractions
132      */
133     public Iterator iterator() {
134         return IteratorUtils.unmodifiableIterator(allExtractions.iterator());
135     }
136 
137     /***
138      * Returns an iterator over the extractions of a specified class, in
139      * insertion order. The iterator is immutable and cannot be used to modify
140      * the underlying collection.
141      *
142      * @param className the name of the class of extractions to iterate
143      * @return an iterator over the extractions of the given class (if any,
144      * otherwise an empty iterator will be returned)
145      */
146     public Iterator<Extraction> iterator(final String className) {
147         Collection<Extraction> typedList = typedExtractions.get(className);
148         if (typedList == null) {
149             // create empty list
150             typedList = new ArrayList<Extraction>(0);
151         }
152 
153         // return iterator on immutable view
154         return Collections.unmodifiableCollection(typedList).iterator();
155     }
156 
157     /***
158      * Returns the target structure specifying the classes to recognize.
159      * @return the used target structure
160      */
161     public TargetStructure getTargetStructure() {
162         return targetStructure;
163     }
164 
165     /***
166      * Returns the last extraction added to this container.
167      *
168      * @return the last extraction added to this container;
169      * or <code>null</code> if the container is empty
170      */
171     public Extraction last() {
172         if (allExtractions.isEmpty()) {
173             return null;
174         } else {
175             return allExtractions.get(allExtractions.size() - 1);
176         }
177     }
178 
179     /***
180      * Returns a list of the last <em>n</em> extractions added to this
181      * container. Modifications of the returned list will not affect this
182      * container and vice versa.
183      *
184      * @param number the number of extractions to copy
185      * @return a list containing the last {@link Extraction}s
186      */
187     public List lastN(final int number) {
188         return CollectionUtils.lastN(allExtractions, number);
189     }
190 
191     /***
192      * Returns a list of the last <em>n</em> extractions of a specified class
193      * added to this container. Modifications of the returned list will not
194      * affect this container and vice versa.
195      *
196      * @param extractionClass the class of extractions
197      * @param number the number of extractions to copy
198      * @return a list containing the last {@link Extraction}s of the given
199      * class; will be empty if there are no extractions of this class
200      */
201     public List<Extraction> lastN(final String extractionClass,
202             final int number) {
203         final List<Extraction> typedList = (List<Extraction>)
204             typedExtractions.get(extractionClass);
205         return CollectionUtils.lastN(typedList, number);
206     }
207 
208     /***
209      * Removes an extraction from this container, if it is present.
210      *
211      * @param ext the extraction to remove
212      * @return <code>true</code> if the container contained the specified
213      * extraction
214      */
215     public boolean remove(final Extraction ext) {
216         // first remove from typed extraction (shorter list)
217         final boolean removedFromTyped =
218             typedExtractions.remove(ext.getType(), ext) != null;
219 
220         if (removedFromTyped) {
221             // also remove from all extractions
222             final boolean removedFromAll = allExtractions.remove(ext);
223 
224             if (!removedFromAll) {
225                 // this should never happen
226                 throw new RuntimeException("Implementation error: Removed "
227                     + ext + " from typed extractions (" + typedExtractions
228                     + ") but not from all extractions (" + allExtractions
229                     + ")");
230             }
231         }
232 
233         return removedFromTyped;
234     }
235 
236     /***
237      * Removes the last extraction {@link #add(Extraction) added} to this
238      * container.
239      *
240      * @return the removed extraction
241      * @throws IllegalStateException if this method is called while the
242      * container is empty
243      */
244     public Extraction removeLast() throws IllegalStateException {
245         if (allExtractions.isEmpty()) {
246             throw new IllegalStateException(
247                 "removeLast called on empty ExtractionContainer");
248         } else {
249             Extraction removedExt =
250                 allExtractions.remove(allExtractions.size() - 1);
251             final Collection<Extraction> typeCollection =
252                 typedExtractions.get(removedExt.getType());
253             final boolean removedFromTyped = typeCollection.remove(removedExt);
254 
255             if (!removedFromTyped) {
256                 // this should never happen
257                 throw new RuntimeException("Implementation error: Removed "
258                     + removedExt + " from all extractions (" + allExtractions
259                     + ") but not from typed extractions (" + typedExtractions
260                     + ")");
261             }
262 
263             return removedExt;
264         }
265     }
266 
267     /***
268      * Restores extractions stored in a field container and adds them to this
269      * instance. The provided field container must have been filled by calling
270      * {@link #storeEntries(FieldContainer)} on an extraction container with
271      * identical {@linkplain #getTargetStructure() target structure}, or
272      * errors are likely.
273      *
274      * @param fContainer the field container to read
275      * @throws IllegalArgumentException if <code>fContainer</code> contains
276      * extractions of an unsupported
277      * (@link de.fu_berlin.ties.classify.Prediction#getType())
278      */
279     public void restoreEntries(final FieldContainer fContainer)
280             throws IllegalArgumentException {
281         final Iterator entryIter = fContainer.entryIterator();
282         FieldMap currentFM;
283         Extraction currentExt;
284         String currentType;
285         final SortedSet<String> ignoredTypes = new TreeSet<String>();
286 
287         // deserialize and add extractions
288         while (entryIter.hasNext()) {
289             currentFM = (FieldMap) entryIter.next();
290             currentExt = new Extraction(currentFM);
291             currentType = currentExt.getType();
292 
293             if (targetStructure.getClassNames().contains(currentType)) {
294                 add(currentExt);
295             } else if (!ignoredTypes.contains(currentType)) {
296                 // we ignore extractions of other types so it possible to work
297                 // with a subset of all stored extractions
298                 ignoredTypes.add(currentType);
299             }
300         }
301 
302         // log debug message if we encountered unknown/ignored types
303         if (!ignoredTypes.isEmpty()) {
304             Util.LOG.debug("Ignoring extraction(s) of unknown types: "
305                 + ignoredTypes);
306         }
307     }
308 
309     /***
310      * Adds all extractions stored in this instance to a field container for
311      * serialization. Extractions are serialized in insertion order, as
312      * required for {@linkplain Trainer training}.
313      *
314      * @param fContainer the field container to fill
315      */
316     public void storeEntries(final FieldContainer fContainer) {
317         // serialize extraction in insertion order (required for training)
318         final Iterator extIter = allExtractions.iterator();
319         Extraction currentExt;
320 
321         while (extIter.hasNext()) {
322             currentExt = (Extraction) extIter.next();
323             fContainer.add(currentExt);
324         }
325     }
326 
327     /***
328      * Returns a string representation of this object.
329      * @return a textual representation
330      */
331     public String toString() {
332         return new ToStringBuilder(this)
333             .append("target structure", targetStructure)
334             .append("typed extractions", typedExtractions)
335             .toString();
336     }
337 
338     /***
339      * Unsets the positions of all stored extractions. In case of duplicates
340      * (extractions that only differed in their position), the most probably
341      * one is kept.
342      */
343     public void unsetPositions() {
344         final Iterator<String> classIter = typedExtractions.keySet().iterator();
345         List<Extraction> extList;
346         Iterator<Extraction> extIter;
347         String className;
348         Extraction ext;
349         Extraction otherExt;
350         String text;
351         Map<String, Extraction> bestExtractions =
352             new HashMap<String, Extraction>();
353         List<Extraction> duplicates = new ArrayList<Extraction>(); // each class
354         List<Extraction> allDuplicates =
355             new ArrayList<Extraction>(); // all classes
356         PredictionComparator predComp = new PredictionComparator();
357 
358         // iterate extractions of each class to detect duplicates
359         while (classIter.hasNext()) {
360             // reset collections for new class
361             bestExtractions.clear();
362             duplicates.clear();
363             className = classIter.next();
364             extList = (List<Extraction>) typedExtractions.get(className);
365             extIter = extList.iterator();
366 
367             while (extIter.hasNext()) {
368                 ext = extIter.next();
369                 text = ext.getText();
370 
371                 if (bestExtractions.containsKey(text)) {
372                     otherExt = bestExtractions.get(text);
373 
374                     // found duplicate -- keep more likely one
375                     if (predComp.compare(otherExt, ext) < 0) {
376                         // new one is more likely
377                         bestExtractions.put(text, ext);
378                         duplicates.add(otherExt);
379 
380                         if (ext.getProbability().getProb() >= 0.0) {
381                             Util.LOG.debug("Discarded duplicate " + otherExt
382                                     + " in favor of " + ext);
383                         }
384                     } else {
385                         // old one is at least as likely
386                         duplicates.add(ext);
387 
388                         if (ext.getProbability().getProb() >= 0.0) {
389                             Util.LOG.debug("Discarded duplicate " + ext
390                                     + " in favor of " + otherExt);
391                         }
392                     }
393                 } else {
394                     // first extraction with this type and text
395                     bestExtractions.put(text, ext);
396                 }
397             } // while extIter
398 
399             // remove duplicates from typedExtractions
400             extList.removeAll(duplicates);
401             allDuplicates.addAll(duplicates);
402         } // while classIter
403 
404         // remove duplicated from allExtractions
405         allExtractions.removeAll(allDuplicates);
406 
407         // set first token rep. of all remaining extractions to -1
408         final Iterator<Extraction> allIter = allExtractions.iterator();
409         while (allIter.hasNext()) {
410             ext = allIter.next();
411             ext.setFirstTokenRepIgnored(true);
412         }
413     }
414 
415 }