1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml.convert;
23
24 import java.util.ArrayList;
25 import java.util.Iterator;
26 import java.util.List;
27
28 import org.apache.commons.lang.builder.ToStringBuilder;
29 import org.dom4j.Attribute;
30 import org.dom4j.Document;
31 import org.dom4j.Element;
32 import org.dom4j.Node;
33 import org.dom4j.QName;
34
35 import de.fu_berlin.ties.ContextMap;
36 import de.fu_berlin.ties.DocumentProcessor;
37 import de.fu_berlin.ties.ProcessingException;
38 import de.fu_berlin.ties.TiesConfiguration;
39 import de.fu_berlin.ties.classify.Prediction;
40 import de.fu_berlin.ties.classify.PredictionDistribution;
41 import de.fu_berlin.ties.classify.Probability;
42 import de.fu_berlin.ties.combi.CombinationState;
43 import de.fu_berlin.ties.combi.CombinationStrategy;
44 import de.fu_berlin.ties.combi.StrategyAdapter;
45 import de.fu_berlin.ties.extract.TargetStructure;
46 import de.fu_berlin.ties.text.TextUtils;
47 import de.fu_berlin.ties.text.TokenDetails;
48 import de.fu_berlin.ties.xml.dom.DOMUtils;
49
50 /***
51 * Unflattens an XML document, reading labels for a
52 * {@link de.fu_berlin.ties.combi.CombinationStrategy} from an XML attribute
53 * ("class" by default). The value of this attribute will only be considered
54 * for leaf elements, i.e. elements without child elements. If it is missing,
55 * {@link de.fu_berlin.ties.combi.CombinationState#OUTSIDE} will be assumed.
56 * Attributes used for unflattening will be deleted from the resulting
57 * document.
58 *
59 * <p>For example, using IOB2 tagging, the document:
60 *
61 * <pre>
62 * <document>
63 * <token class="O">Please</token>
64 * <token class="O">consult</token>
65 * <token class="B-person">Mr.</token>
66 * <token class="I-person">John</token>
67 * <token class="I-person">Smith</token>
68 * <token>for</token>
69 * <token>assistance</token>
70 * </document>
71 * </pre>
72 *
73 * will be unflattened as follows:
74 *
75 * <pre>
76 * <document>
77 * <token>Please</token>
78 * <token>consult</token>
79 * <person>
80 * <token>Mr.</token>
81 * <token>John</token>
82 * <token>Smith</token>
83 * </person>
84 * <token>for</token>
85 * <token>assistance</token>
86 * </document>
87 * </pre>
88 *
89 * @author Christian Siefkes
90 * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:31 $, $Author: siefkes $
91 */
92 public class AttributeUnflatten extends DocumentProcessor {
93
94 /***
95 * The used combination strategy.
96 */
97 private final CombinationStrategy strategy;
98
99 /***
100 * The used strategy adapter.
101 */
102 private final StrategyAdapter adapter;
103
104 /***
105 * The attribute used to read the labels from.
106 */
107 private final QName labelAttribute;
108
109 /***
110 * Creates a new instance, using the {@linkplain TiesConfiguration#CONF
111 * standard configuration}.
112 *
113 * @param outExt the extension to use for output files.
114 * @throws ProcessingException if an error occurs while initializing the
115 * strategy or the adapter
116 */
117 public AttributeUnflatten(final String outExt) throws ProcessingException {
118 this(outExt, TiesConfiguration.CONF);
119 }
120
121 /***
122 * Creates a new instance, configuring all fields from the provided
123 * configuration.
124 *
125 * @param outExt the extension to use for output files.
126 * @param conf used to configure this instance
127 * @throws ProcessingException if an error occurs while initializing the
128 * strategy or the adapter
129 */
130 public AttributeUnflatten(final String outExt, final TiesConfiguration conf)
131 throws ProcessingException {
132 this(outExt, CombinationStrategy.createStrategy(
133 new TargetStructure(conf).getClassNames(), conf),
134 new StrategyAdapter(conf),
135 DOMUtils.defaultName(conf.getString("unflatten.attribute")),
136 conf);
137 }
138
139 /***
140 * Creates a new instance.
141 *
142 * @param outExt the extension to use for output files
143 * @param combiStrategy the combination strategy to use; must not be
144 * <code>null</code>
145 * @param stratAdapter used to translate the labels returned by the used
146 * combination strategy if necessary; must not be <code>null</code> but a
147 * {@linkplain StrategyAdapter#createDummyAdapter() dummy adapter} can be
148 * used
149 * @param labelAttrib the attribute used to read the labels from
150 * @param conf used to configure this instance
151 */
152 public AttributeUnflatten(final String outExt,
153 final CombinationStrategy combiStrategy,
154 final StrategyAdapter stratAdapter, final QName labelAttrib,
155 final TiesConfiguration conf) {
156 super(outExt, conf);
157 strategy = combiStrategy;
158 adapter = stratAdapter;
159 labelAttribute = labelAttrib;
160 }
161
162 /***
163 * {@inheritDoc} This implementation delegates to the
164 * {@link #unflatten(Document)} method.
165 */
166 public Document process(final Document document, final ContextMap context)
167 throws ProcessingException {
168
169 unflatten(document);
170 return document;
171 }
172
173 /***
174 * Returns a string representation of this object.
175 *
176 * @return a textual representation
177 */
178 public String toString() {
179 return new ToStringBuilder(this)
180 .append("strategy", strategy)
181 .append("adapter", adapter)
182 .append("label attribute", labelAttribute)
183 .toString();
184 }
185
186 /***
187 * Unflattens an XML document using the combination strategy and the
188 * strategy adapter stored in this instance.
189 *
190 * @param document the document to unflatten; will be modified by this
191 * method
192 * @throws ProcessingException if the input document contains an illegal
193 * sequence of labels which cannot be processed by the used strategy
194 */
195 public void unflatten(final Document document) throws ProcessingException {
196
197 strategy.reset();
198 unflattenElement(document.getRootElement());
199 }
200
201 /***
202 * Unflattens an XML element using the combination strategy and the
203 * strategy adapter stored in this instance. This method calls itself
204 * recursively as required.
205 *
206 * @param element the element to unflatten; will be modified by this
207 * method
208 * @throws ProcessingException if the input element contains an illegal
209 * sequence of labels which cannot be processed by the used strategy
210 */
211 private void unflattenElement(final Element element)
212 throws ProcessingException {
213
214 strategy.reset();
215 final List<?> origContent = element.content();
216 final List<?> copiedContent = new ArrayList<Object>(origContent);
217
218
219 origContent.clear();
220 final Iterator contentIter = copiedContent.iterator();
221 Node child;
222 Element childElement;
223 Element itemElement = null;
224 Attribute labelAttrib;
225 String labelValue;
226 String[] splittedLabelValues, translatedLabelValues;
227 PredictionDistribution[] predDists;
228 final Probability dummyProb = new Probability(1.0);
229 final TokenDetails dummyToken = new TokenDetails("", 0, 0, false);
230 CombinationState combiState;
231
232
233 while (contentIter.hasNext()) {
234 child = (Node) contentIter.next();
235
236 if (child.getNodeType() == Element.ELEMENT_NODE) {
237
238 childElement = (Element) child;
239
240 if (childElement.elementIterator().hasNext()) {
241
242
243 unflattenElement(childElement);
244 element.add(childElement);
245 } else {
246
247 labelAttrib = childElement.attribute(labelAttribute);
248 labelValue = labelAttrib.getValue();
249
250 if (labelValue == null) {
251 combiState = CombinationState.OUTSIDE;
252 } else {
253
254 childElement.remove(labelAttrib);
255
256
257 splittedLabelValues = TextUtils.splitLines(labelValue);
258 translatedLabelValues =
259 adapter.translate(splittedLabelValues);
260 predDists = new PredictionDistribution[
261 translatedLabelValues.length];
262
263 for (int i = 0; i < predDists.length; i++) {
264 predDists[i] = new PredictionDistribution(
265 new Prediction(translatedLabelValues[i],
266 dummyProb));
267 }
268
269
270 combiState =
271 strategy.translateResult(predDists, dummyToken);
272 strategy.updateState(combiState, predDists, dummyToken);
273 }
274
275 if (combiState.equals(CombinationState.OUTSIDE)) {
276
277 element.add(childElement);
278 } else if (combiState.isBegin()) {
279
280 itemElement = element.addElement(DOMUtils.defaultName(
281 combiState.getType()));
282 itemElement.add(childElement);
283 } else {
284
285 itemElement.add(childElement);
286 }
287 }
288 } else {
289
290 element.add(child);
291 }
292
293 }
294 }
295
296 }