View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.filter;
23  
24  import java.io.File;
25  import java.io.IOException;
26  
27  import org.apache.commons.lang.StringUtils;
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  import org.dom4j.Document;
30  import org.dom4j.Element;
31  
32  import de.fu_berlin.ties.ContextMap;
33  import de.fu_berlin.ties.ProcessingException;
34  import de.fu_berlin.ties.TiesConfiguration;
35  import de.fu_berlin.ties.extract.AnswerBuilder;
36  import de.fu_berlin.ties.extract.ExtractionContainer;
37  import de.fu_berlin.ties.extract.ExtractionLocator;
38  import de.fu_berlin.ties.extract.TargetStructure;
39  import de.fu_berlin.ties.io.IOUtils;
40  import de.fu_berlin.ties.text.TokenCounter;
41  import de.fu_berlin.ties.text.TokenDetails;
42  import de.fu_berlin.ties.text.TokenizerFactory;
43  import de.fu_berlin.ties.util.Util;
44  import de.fu_berlin.ties.xml.dom.DOMUtils;
45  import de.fu_berlin.ties.xml.dom.TokenProcessor;
46  import de.fu_berlin.ties.xml.dom.TokenWalker;
47  
48  /***
49   * A variant of the prediction rewriter that uses predictions from
50   * another process (e.g. named entities) to provide additional semantic
51   * information. This variant does not modify the element structure of the
52   * document, but stores the predictions as XML attributes.
53   *
54   * <p>You should generally use this class instead of
55   * {@link de.fu_berlin.ties.filter.PredictionRewriter} since it generally has
56   * superior results.
57   *  Instances of this class are not thread-safe and must not be used to
58   * process multiple documents in parallel. 
59   * 
60   * @author Christian Siefkes
61   * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:20 $, $Author: siefkes $
62   */
63  public class PredictionRewriter2 implements DocumentRewriter, TokenProcessor {
64  
65      /***
66       * Name of the attribute to add.
67       */
68      public static final String ATTRIB_PRED = "pred";
69  
70      /***
71       * Configuration key: "None" marker to use for tokens that do not belong to
72       * any prediction -- if empty or missing, these tokens are not tagged.
73       */
74      public static final String CONFIG_PRED_NONE = "rewriter.pred.none";
75  
76  
77      /***
78       * Used to configure this instance.
79       */
80      private final TiesConfiguration config;
81  
82      /***
83       * Extension of the files containing predictions.
84       */
85      private final String extension;
86  
87      /***
88       * "None" marker to use for tokens that do not belong to any prediction --
89       * if <code>null</code>, these tokens are not tagged (default behavior).
90       */
91      private final String noneMarker;
92  
93      /***
94       * Prediction locator for the current document.
95       */
96      private ExtractionLocator predLocator;
97  
98      /***
99       * Target structure to use for reading predictions.
100      */
101     private final TargetStructure targetStruct;
102 
103     /***
104      * Factory used to create tokenizers.
105      */
106     private final TokenizerFactory tFactory;
107 
108     /***
109      * Counts how often tokens are repeated in a document -- required to
110      * localize predictions.
111      */
112     private final TokenCounter tCount = new TokenCounter();
113 
114 
115     /***
116      * Creates a new instance.
117      *
118      * @param conf used to configure this instance; must not be
119      * <code>null</code>
120      * @throws ProcessingException if an error occurs while initializing the
121      * combination strategies
122      */
123     public PredictionRewriter2(final TiesConfiguration conf)
124     throws ProcessingException {
125         this(conf.getString(PredictionRewriter.CONFIG_PRED_EXT),
126                 conf.getStringArray("rewriter.pred.classes"),
127                 conf.getString(CONFIG_PRED_NONE, null),
128                 new TokenizerFactory(conf), conf);
129     }
130 
131     /***
132      * Creates a new instance.
133      *
134      * @param fileExtension extension of the files containing predictions
135      * @param predictionClasses names of the prediction classes to use --
136      * if empty array, all are used
137      * @param myNoneMarker "none" marker to use for tokens that do not belong to
138      * any prediction -- if empty or <code>null</code>, these tokens are not
139      * tagged
140      * @param factory used to instantiate tokenizers
141      * @param conf used to configure this instance; must not be
142      * <code>null</code>
143      * @throws ProcessingException if an error occurs while initializing the
144      * combination strategies
145      */
146     public PredictionRewriter2(final String fileExtension,
147             final String[] predictionClasses, final String myNoneMarker,
148             final TokenizerFactory factory, final TiesConfiguration conf)
149     throws ProcessingException {
150         super();
151         config = conf;
152         extension = fileExtension;
153         targetStruct = new TargetStructure(predictionClasses);
154         tFactory = factory;
155 
156         // convert empty string to null
157         if (StringUtils.isEmpty(myNoneMarker)) {
158             noneMarker = null;
159         } else {
160             noneMarker = myNoneMarker;
161             Util.LOG.debug("PredictionRewriter2: setting 'none' marker to '"
162                     + noneMarker + "'");
163         }
164     }
165 
166 
167     /***
168      * Initializes a document to process, reading the corresponding prediction
169      * file(s).
170      *
171      * @param filename the file name of the current document
172      * @return <code>true</code> iff any predictions for this document exist
173      * @throws IOException if an I/O error occurs
174      */
175     private boolean initDocument(final File filename)
176     throws IOException {
177         final File directory = filename.getParentFile();
178         final String localName = filename.getName();
179         final String prefix =
180             IOUtils.getBaseName(localName) + IOUtils.EXT_SEPARATOR;
181         File predFile;
182         ExtractionContainer predictions;
183 
184         // reset token counter
185         tCount.clear();
186 
187         // terminate old locator, if any
188         if (predLocator != null) {
189             predLocator.reachedEndOfDocument();
190         }
191 
192         // read prediction file
193         predFile = new File(directory, prefix + extension);
194 
195         if (predFile.exists()) {
196             predictions = AnswerBuilder.readAnswerKeys(targetStruct,
197                     predFile, config);
198             predLocator = new ExtractionLocator(predictions,
199                     tFactory.createTokenizer(""));
200             return true;
201         } else {
202             Util.LOG.info("No '" + extension + "' file found for "
203                     + localName + " -- assuming there are no predictions");
204             predLocator = null;
205             return false;
206         }
207     }
208 
209     /***
210      * {@inheritDoc}
211      */
212     public void processToken(final Element element, final String left,
213             final TokenDetails details, final String right,
214             final ContextMap context) throws IOException {
215         final String token = details.getToken();
216         tCount.add(false, token);
217         final int tokenRep = tCount.getLastRep();
218         final String predType;
219 
220         // lookup and flatten predictions of all types
221         if (predLocator != null) {
222             // check for start of extraction
223             predLocator.startOfExtraction(token, tokenRep);
224 
225             if (predLocator.inExtraction()) {
226                 predType = predLocator.getCurrentExtraction().getType();
227 
228                 // update extraction (remove current token from remaining)
229                 predLocator.updateExtraction(token, tokenRep);
230 
231                 // check whether this is the last token of the extraction
232                 if (predLocator.endOfExtraction()) {
233                     // switch to next extraction
234                     predLocator.switchToNextExtraction();
235                 }
236             } else {
237                 // outside: use noneMarker, if any
238                 predType = noneMarker;
239             }
240         } else {
241             // no extractions: use noneMarker, if any
242             predType = noneMarker;
243         }
244 
245         if (predType != null) {
246             final String oldAttribValue = element.attributeValue(ATTRIB_PRED);
247 
248             // Set pred=TYPE attribute if not yet present. None markers might
249             // be replaced, while other previously set values will be honored.
250             if (oldAttribValue == null || oldAttribValue.equals(noneMarker)) {
251                 element.addAttribute(ATTRIB_PRED, predType);
252             } else if (!oldAttribValue.equals(predType)) {
253                 // that's unfortunate but possible (not an error)
254                 Util.LOG.debug("Could not add " + predType
255                         + " prediction since there is a " + ATTRIB_PRED
256                         + "='" + oldAttribValue + "' attribute "
257                         + DOMUtils.showToken(element, token));
258             }
259         }
260     }
261 
262     /***
263      * {@inheritDoc}
264      */
265     public Document rewrite(final Document document, final File filename)
266             throws IOException, ProcessingException {
267         final ContextMap dummyContext = new ContextMap();
268 
269         if (initDocument(filename)) {
270             // walk thru document to add prediction attributes
271             final TokenWalker walker = new TokenWalker(this, tFactory);
272             walker.walk(document, dummyContext);
273         }
274 
275         return document;
276     }
277 
278     /***
279      * Returns a string representation of this object.
280      *
281      * @return a textual representation
282      */
283     public String toString() {
284         return new ToStringBuilder(this)
285             .append("extension", extension)
286             .toString();
287     }
288 
289 }
290