View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.io.Writer;
28  import java.util.Arrays;
29  import java.util.HashSet;
30  import java.util.Set;
31  
32  import org.apache.commons.lang.builder.ToStringBuilder;
33  import org.dom4j.Document;
34  import org.dom4j.Element;
35  import org.dom4j.NodeFilter;
36  
37  import de.fu_berlin.ties.ContextMap;
38  import de.fu_berlin.ties.DocumentReader;
39  import de.fu_berlin.ties.ProcessingException;
40  import de.fu_berlin.ties.TiesConfiguration;
41  
42  import de.fu_berlin.ties.io.FieldContainer;
43  import de.fu_berlin.ties.io.IOUtils;
44  import de.fu_berlin.ties.text.TokenContainer;
45  import de.fu_berlin.ties.text.TokenDetails;
46  import de.fu_berlin.ties.text.TokenizerFactory;
47  import de.fu_berlin.ties.util.Util;
48  import de.fu_berlin.ties.xml.dom.DOMUtils;
49  import de.fu_berlin.ties.xml.dom.DocumentWalker;
50  import de.fu_berlin.ties.xml.dom.ElementNameFilter;
51  import de.fu_berlin.ties.xml.dom.ElementProcessor;
52  
53  /***
54   * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of answer
55   * keys from an annotated text (in XML format).
56   *
57   * <p>Instances of this class are thread-safe and can process several documents
58   * in parallel.
59   *
60   * @author Christian Siefkes
61   * @version $Revision: 1.28 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
62   */
63  public class AnswerBuilder extends DocumentReader implements ElementProcessor {
64  
65      /***
66       * Context key referring to the extraction container used for storing the
67       * answer keys.
68       */
69      public static final String KEY_ANSWERS = "answer-keys";
70  
71      /***
72       * The recommended file extension to use for storing answer keys.
73       */
74      public static final String EXT_ANSWERS = "ans";
75  
76  
77      /***
78       * Reads back answer keys stored by the
79       * {@link #process(Document, Writer, ContextMap)} method of an instance of
80       * this class.
81       *
82       * @param targetStruct the target structure used when creating the answer
83       * keys
84       * @param file the file containing the answer keys
85       * @param config configuration used to determine the character set of the
86       * keys (cf. {@link IOUtils#openReader(File,
87       * org.apache.commons.configuration.Configuration)}
88       * @return an extraction container of the answer keys
89       * @throws IllegalArgumentException if the
90       * (@linkplain de.fu_berlin.ties.classify.Prediction#getType() type) of
91       * some answer keys in the answer keys doesn't fit the target structure
92       * @throws IOException if an I/O error occurs while reading the file
93       */
94      public static ExtractionContainer readAnswerKeys(
95              final TargetStructure targetStruct, final File file,
96              final TiesConfiguration config)
97              throws IllegalArgumentException, IOException {
98          if (file.exists()) {
99              final Reader reader = IOUtils.openReader(file, config);
100             final FieldContainer fContainer =
101                 FieldContainer.createFieldContainer(config);
102             fContainer.read(reader);
103             return new ExtractionContainer(targetStruct, fContainer);
104         } else {
105             // return an empty extraction container
106             Util.LOG.info("Answer key file " + file
107                     + " does not exists -- assuming there are no answer keys");
108             return new ExtractionContainer(targetStruct);
109         }
110     }
111 
112     /***
113      * Reads the answer keys corresponding to a file. The answer keys must be
114      * in a file ending in {@link AnswerBuilder#EXT_ANSWERS} instead of the
115      * extension of the original file.
116      *
117      * @param targetStruct the target structure used when creating the answer
118      * keys
119      * @param orgFile the file whose answer keys should be returned
120      * @param config configuration used to determine the character set of the
121      * keys (cf. {@link IOUtils#openReader(File,
122      * org.apache.commons.configuration.Configuration)}
123      * @return an extraction container of the answer keys
124      * @throws IllegalArgumentException if the
125      * (@linkplain de.fu_berlin.ties.classify.Prediction#getType() type) of
126      * some answer keys in the answer keys doesn't fit the target structure
127      * @throws IOException if an I/O error occurs while reading the file
128      */
129     public static ExtractionContainer readCorrespondingAnswerKeys(
130             final TargetStructure targetStruct, final File orgFile,
131             final TiesConfiguration config)
132             throws IllegalArgumentException, IOException {
133         // replace extension + delegate
134         final File answerFile = new File(orgFile.getParentFile(),
135                 IOUtils.getBaseName(orgFile) + IOUtils.EXT_SEPARATOR
136                 + EXT_ANSWERS);
137         return readAnswerKeys(targetStruct, answerFile, config);
138     }
139 
140 
141     /***
142      * The names of the elements to read attributes from.
143      */
144     private final Set<String> elementNames;
145 
146     /***
147      * Optional attribute determining the type of extraction.
148      */
149     private final String typeAttrib;
150 
151     /***
152      * The target structure specifying the classes to recognize.
153      */
154     private final TargetStructure targetStructure;
155 
156     /***
157      * Used to instantiate tokenizers.
158      */
159     private final TokenizerFactory factory;
160 
161 
162     /***
163      * Creates a new instance, using a default extension and configuring the
164      * target structure from the {@linkplain TiesConfiguration#CONF standard
165      * configuration}.
166      */
167     public AnswerBuilder() {
168         this(EXT_ANSWERS);
169     }
170 
171     /***
172      * Creates a new instance, configuring the target structure from the
173      * {@linkplain TiesConfiguration#CONF standard configuration}.
174      *
175      * @param outExt the extension to use for output files
176      */
177     public AnswerBuilder(final String outExt) {
178         this(outExt, TiesConfiguration.CONF);
179     }
180 
181     /***
182      * Creates a new instance, configuring the target structure from the
183      * provided configuration.
184      *
185      * @param outExt the extension to use for output files
186      * @param config the configuration to use
187      */
188     public AnswerBuilder(final String outExt, final TiesConfiguration config) {
189         this(outExt, new TargetStructure(config), new TokenizerFactory(config),
190                 config.getStringArray("answers.element"),
191                 config.getString("answers.attrib", null),
192                 config);
193     }
194 
195     /***
196      * Creates a new instance.
197      *
198      * @param outExt the extension to use for output files
199      * @param targetStruct the target structure specifying the classes to
200      * recognize
201      * @param tFactory used to instantiate tokenizers
202      * @param myElementNames if not <code>null</code> or empty, answer types are
203      * read from the <code>myTypeAttrib</code> attribute of the elements
204      * specified in this list instead of using element names as types
205      * @param myTypeAttrib Name of attribute to read element types from if
206      * the elemant name shouldn't be used; must be <code>null</code> iff
207      * <code>myElementNames</code> is <code>null</code> or empty
208      * @param config the configuration to use
209      */
210     public AnswerBuilder(final String outExt,
211             final TargetStructure targetStruct, final TokenizerFactory tFactory,
212             final String[] myElementNames, final String myTypeAttrib,
213             final TiesConfiguration config) {
214         super(outExt, config);
215         targetStructure = targetStruct;
216         factory = tFactory;
217         typeAttrib = myTypeAttrib;
218 
219         if (typeAttrib != null) {
220             // check given attribute of given elements
221             elementNames = new HashSet<String>(Arrays.asList(myElementNames));
222         } else {
223             // check elements matching types in target structure
224             elementNames = targetStructure.getClassNames();
225         }
226     }
227 
228 
229     /***
230      * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of
231      * answer keys from from an annotated XML document.
232      *
233      * @param document the document to read
234      * @return a container of the answer keys of this document
235      * @throws IOException if an I/O error occurs
236      * @throws ProcessingException if an error occurs during processing
237      */
238     public ExtractionContainer buildAnswers(final Document document)
239             throws IOException, ProcessingException {
240         // create container of answer keys and store in context
241         final ExtractionContainer answerKeys =
242             new ExtractionContainer(targetStructure);
243         final ContextMap context = new ContextMap();
244         context.put(KEY_ANSWERS, answerKeys);
245 
246         final NodeFilter filter = new ElementNameFilter(elementNames);
247         final DocumentWalker walker = new DocumentWalker(filter, this, factory);
248 
249         // the walker will call back (process method) for each element matching
250         // a class name
251         walker.walk(document, context);
252 
253         // return the container of answer keys
254         return answerKeys;
255     }
256 
257     /***
258      * Returns the target structure specifying the classes to recognize.
259      * @return the used target structure
260      */
261     public TargetStructure getTargetStructure() {
262         return targetStructure;
263     }
264 
265     /***
266      * Buildings an {@link de.fu_berlin.ties.extract.ExtractionContainer} of
267      * answer keys from from an annotated XML document.
268      *
269      * @param document the document to read
270      * @param writer the writer to write the processed text to; flushed
271      * but not closed by this method
272      * @param context a map of objects that are made available for processing
273      * @throws IOException if an I/O error occurs
274      * @throws ProcessingException if an error occurs during processing
275      */
276     public void process(final Document document, final Writer writer,
277             final ContextMap context) throws IOException, ProcessingException {
278         // delegate to build method
279         final ExtractionContainer answers = buildAnswers(document);
280 
281         // serialize results
282         final FieldContainer storage =
283             FieldContainer.createFieldContainer(getConfig());
284         answers.storeEntries(storage);
285         storage.store(writer);
286     }
287 
288     /***
289      * Builds an answer key from an element in an XML document.
290      *
291      * @param element the element to process
292      * @param tokenContainer a container storing all tokens seen in the
293      * document so far; {@link TokenContainer#getLast()} contains the textual
294      * content of the element and its child elements
295      * @param context a map of objects that are made available for processing;
296      * the {@link #KEY_ANSWERS} key must map to an extraction container used for
297      * storing the answer keys
298      */
299     public void processElement(final Element element,
300             final TokenContainer tokenContainer, final ContextMap context) {
301         // create and store an extraction of the given type
302         final TokenDetails details = new TokenDetails(tokenContainer.getLast(),
303                 tokenContainer.getFirstTokenInLastRep(),
304                 tokenContainer.getFirstTokenInLastIndex(), false);
305 
306         // use type attribute if configured, otherwise element name
307         final String extractionType = (typeAttrib != null)
308                 ? element.attributeValue(typeAttrib) : DOMUtils.name(element);
309         final Extraction extraction =
310             new Extraction(extractionType, details);
311         final ExtractionContainer answerKeys =
312             (ExtractionContainer) context.get(KEY_ANSWERS);
313         answerKeys.add(extraction);
314     }
315 
316     /***
317      * Returns a string representation of this object.
318      * @return a textual representation
319      */
320     public String toString() {
321         return new ToStringBuilder(this)
322             .appendSuper(super.toString())
323             .append("target structure", targetStructure)
324             .toString();
325     }
326 
327 }