View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.xml.convert;
23  
24  import java.util.ArrayList;
25  import java.util.Iterator;
26  import java.util.List;
27  
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  import org.dom4j.Attribute;
30  import org.dom4j.Document;
31  import org.dom4j.Element;
32  import org.dom4j.Node;
33  import org.dom4j.QName;
34  
35  import de.fu_berlin.ties.ContextMap;
36  import de.fu_berlin.ties.DocumentProcessor;
37  import de.fu_berlin.ties.ProcessingException;
38  import de.fu_berlin.ties.TiesConfiguration;
39  import de.fu_berlin.ties.classify.Prediction;
40  import de.fu_berlin.ties.classify.PredictionDistribution;
41  import de.fu_berlin.ties.classify.Probability;
42  import de.fu_berlin.ties.combi.CombinationState;
43  import de.fu_berlin.ties.combi.CombinationStrategy;
44  import de.fu_berlin.ties.combi.StrategyAdapter;
45  import de.fu_berlin.ties.extract.TargetStructure;
46  import de.fu_berlin.ties.text.TextUtils;
47  import de.fu_berlin.ties.text.TokenDetails;
48  import de.fu_berlin.ties.xml.dom.DOMUtils;
49  
50  /***
51   * Unflattens an XML document, reading labels for a
52   * {@link de.fu_berlin.ties.combi.CombinationStrategy} from an XML attribute
53   * ("class" by default). The value of this attribute will only be considered
54   * for leaf elements, i.e. elements without child elements. If it is missing,
55   * {@link de.fu_berlin.ties.combi.CombinationState#OUTSIDE} will be assumed.
56   * Attributes used for unflattening will be deleted from the resulting
57   * document.
58   *
59   * <p>For example, using IOB2 tagging, the document:
60   *
61   * <pre>
62   * &lt;document>
63   *   &lt;token class="O">Please&lt;/token>
64   *   &lt;token class="O">consult&lt;/token>
65   *   &lt;token class="B-person">Mr.&lt;/token>
66   *   &lt;token class="I-person">John&lt;/token>
67   *   &lt;token class="I-person">Smith&lt;/token>
68   *   &lt;token>for&lt;/token>
69   *   &lt;token>assistance&lt;/token>
70   * &lt;/document>
71   * </pre>
72   *
73   * will be unflattened as follows:
74   *
75   * <pre>
76   * &lt;document>
77   *   &lt;token>Please&lt;/token>
78   *   &lt;token>consult&lt;/token>
79   *   &lt;person>
80   *     &lt;token>Mr.&lt;/token>
81   *     &lt;token>John&lt;/token>
82   *     &lt;token>Smith&lt;/token>
83   *   &lt;/person>
84   *   &lt;token>for&lt;/token>
85   *   &lt;token>assistance&lt;/token>
86   * &lt;/document>
87   * </pre>
88   *
89   * @author Christian Siefkes
90   * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:31 $, $Author: siefkes $
91   */
92  public class AttributeUnflatten extends DocumentProcessor {
93  
94      /***
95       * The used combination strategy.
96       */
97      private final CombinationStrategy strategy;
98  
99      /***
100      * The used strategy adapter.
101      */
102     private final StrategyAdapter adapter;
103 
104     /***
105      * The attribute used to read the labels from.
106      */
107     private final QName labelAttribute;
108 
109     /***
110      * Creates a new instance, using the {@linkplain TiesConfiguration#CONF
111      * standard configuration}.
112      *
113      * @param outExt the extension to use for output files.
114      * @throws ProcessingException if an error occurs while initializing the
115      * strategy or the adapter
116      */
117     public AttributeUnflatten(final String outExt)  throws ProcessingException {
118         this(outExt, TiesConfiguration.CONF);
119     }
120 
121     /***
122      * Creates a new instance, configuring all fields from the provided
123      * configuration.
124      *
125      * @param outExt the extension to use for output files.
126      * @param conf used to configure this instance
127      * @throws ProcessingException if an error occurs while initializing the
128      * strategy or the adapter
129      */
130     public AttributeUnflatten(final String outExt, final TiesConfiguration conf)
131     throws ProcessingException {
132         this(outExt, CombinationStrategy.createStrategy(
133                 new TargetStructure(conf).getClassNames(), conf),
134                 new StrategyAdapter(conf),
135                 DOMUtils.defaultName(conf.getString("unflatten.attribute")),
136                 conf);
137     }
138 
139     /***
140      * Creates a new instance.
141      *
142      * @param outExt the extension to use for output files
143      * @param combiStrategy the combination strategy to use; must not be
144      * <code>null</code>
145      * @param stratAdapter used to translate the labels returned by the used
146      * combination strategy if necessary; must not be <code>null</code> but a
147      * {@linkplain StrategyAdapter#createDummyAdapter() dummy adapter} can be
148      * used
149      * @param labelAttrib the attribute used to read the labels from
150      * @param conf used to configure this instance
151      */
152     public AttributeUnflatten(final String outExt,
153             final CombinationStrategy combiStrategy,
154             final StrategyAdapter stratAdapter, final QName labelAttrib,
155             final TiesConfiguration conf) {
156         super(outExt, conf);
157         strategy = combiStrategy;
158         adapter = stratAdapter;
159         labelAttribute = labelAttrib;
160     }
161 
162     /***
163      * {@inheritDoc} This implementation delegates to the
164      * {@link #unflatten(Document)} method.
165      */
166     public Document process(final Document document, final ContextMap context)
167             throws ProcessingException {
168         // delegate to unflatten method
169         unflatten(document);
170         return document;
171     }
172 
173     /***
174      * Returns a string representation of this object.
175      *
176      * @return a textual representation
177      */
178     public String toString() {
179         return new ToStringBuilder(this)
180             .append("strategy", strategy)
181             .append("adapter", adapter)
182             .append("label attribute", labelAttribute)
183             .toString();
184     }
185 
186     /***
187      * Unflattens an XML document using the combination strategy and the
188      * strategy adapter stored in this instance.
189      *
190      * @param document the document to unflatten; will be modified by this
191      * method
192      * @throws ProcessingException if the input document contains an illegal
193      * sequence of labels which cannot be processed by the used strategy
194      */
195     public void unflatten(final Document document) throws ProcessingException {
196         // reset strategy + delegate
197         strategy.reset();
198         unflattenElement(document.getRootElement());
199     }
200 
201     /***
202      * Unflattens an XML element using the combination strategy and the
203      * strategy adapter stored in this instance. This method calls itself
204      * recursively as required.
205      *
206      * @param element the element to unflatten; will be modified by this
207      * method
208      * @throws ProcessingException if the input element contains an illegal
209      * sequence of labels which cannot be processed by the used strategy
210      */
211     private void unflattenElement(final Element element)
212     throws ProcessingException {
213         // reset combi strategy for each new element
214         strategy.reset();
215         final List<?> origContent = element.content();
216         final List<?> copiedContent = new ArrayList<Object>(origContent);
217 
218         // delete original content (will be recreated later)
219         origContent.clear();
220         final Iterator contentIter = copiedContent.iterator();
221         Node child;
222         Element childElement;
223         Element itemElement = null;
224         Attribute labelAttrib;
225         String labelValue;
226         String[] splittedLabelValues, translatedLabelValues;
227         PredictionDistribution[] predDists;
228         final Probability dummyProb = new Probability(1.0);
229         final TokenDetails dummyToken = new TokenDetails("", 0, 0, false);
230         CombinationState combiState;
231 
232         // iterate and process content nodes
233         while (contentIter.hasNext()) {
234             child = (Node) contentIter.next();
235 
236             if (child.getNodeType() == Element.ELEMENT_NODE) {
237                 // this is an element
238                 childElement = (Element) child;
239 
240                 if (childElement.elementIterator().hasNext()) {
241                     // child elements has its own children:
242                     // process recursively + re-add child
243                     unflattenElement(childElement);
244                     element.add(childElement);
245                 } else {
246                     // read value of label attrib (if exists)
247                     labelAttrib = childElement.attribute(labelAttribute);
248                     labelValue = labelAttrib.getValue();
249 
250                     if (labelValue == null) {
251                         combiState = CombinationState.OUTSIDE;
252                     } else {
253                         // delete attribute
254                         childElement.remove(labelAttrib);
255 
256                         // split value at whitespace + wrap + translate
257                         splittedLabelValues = TextUtils.splitLines(labelValue);
258                         translatedLabelValues =
259                             adapter.translate(splittedLabelValues);
260                         predDists = new PredictionDistribution[
261                                 translatedLabelValues.length];
262 
263                         for (int i = 0; i < predDists.length; i++) {
264                             predDists[i] = new PredictionDistribution(
265                                     new Prediction(translatedLabelValues[i],
266                                             dummyProb));
267                         }
268 
269                         // determine combination state + update strategy
270                         combiState =
271                             strategy.translateResult(predDists, dummyToken);
272                         strategy.updateState(combiState, predDists, dummyToken);
273                     }
274 
275                     if (combiState.equals(CombinationState.OUTSIDE)) {
276                         // outside any item: re-add as child of parent
277                         element.add(childElement);
278                     } else if (combiState.isBegin()) {
279                         // begin of a new item: create item + add child there
280                         itemElement = element.addElement(DOMUtils.defaultName(
281                                 combiState.getType()));
282                         itemElement.add(childElement);
283                     } else {
284                         // continuation of item: attach child to item element
285                         itemElement.add(childElement);
286                     }
287                 }
288             } else {
289                 // not an element: re-add as is
290                 element.add(child);
291             }
292 
293         }
294     }
295 
296 }