1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.context;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.util.Collection;
27 import java.util.Collections;
28 import java.util.HashSet;
29 import java.util.Iterator;
30 import java.util.LinkedList;
31 import java.util.List;
32 import java.util.ListIterator;
33 import java.util.Map;
34 import java.util.Set;
35 import java.util.SortedMap;
36 import java.util.TreeMap;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39
40 import org.apache.commons.collections.Bag;
41 import org.apache.commons.collections.bag.HashBag;
42 import org.apache.commons.collections.KeyValue;
43 import org.apache.commons.collections.MultiHashMap;
44 import org.apache.commons.collections.MultiMap;
45 import org.apache.commons.collections.SortedBag;
46 import org.apache.commons.collections.bag.TreeBag;
47 import org.apache.commons.collections.map.LinkedMap;
48 import org.apache.commons.lang.ArrayUtils;
49 import org.apache.commons.lang.StringUtils;
50 import org.apache.commons.lang.builder.ToStringBuilder;
51 import org.dom4j.Attribute;
52 import org.dom4j.Element;
53
54 import de.fu_berlin.ties.ProcessingException;
55 import de.fu_berlin.ties.TiesConfiguration;
56 import de.fu_berlin.ties.xml.dom.DOMUtils;
57 import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
58 import de.fu_berlin.ties.classify.feature.Feature;
59 import de.fu_berlin.ties.classify.feature.FeatureVector;
60 import de.fu_berlin.ties.context.sensor.BaseSensor;
61 import de.fu_berlin.ties.context.sensor.Sensor;
62 import de.fu_berlin.ties.io.IOUtils;
63 import de.fu_berlin.ties.text.TextUtils;
64 import de.fu_berlin.ties.text.TokenizerFactory;
65 import de.fu_berlin.ties.util.Util;
66
67 /***
68 * The context representation used by default. This class is thread-safe.
69 *
70 * @author Christian Siefkes
71 * @version $Revision: 1.69 $, $Date: 2006/10/21 16:04:03 $, $Author: siefkes $
72 */
73 public class DefaultRepresentation extends AbstractRepresentation {
74
75 /***
76 * Ancestor axis.
77 */
78 protected static final String AXIS_ANCESTOR = "an";
79
80 /***
81 * Descendant-or-self axis.
82 */
83 protected static final String AXIS_DESC_OR_SELF = "ds";
84
85 /***
86 * Following sibling axis.
87 */
88 protected static final String AXIS_FOLLOW_SIBLING = "fs";
89
90 /***
91 * Preceeding sibling axis.
92 */
93 protected static final String AXIS_PREC_SIBLING = "ps";
94
95 /***
96 * The pseudo-axis of prior recognitions.
97 */
98 protected static final String AXIS_PRIOR = "prior";
99
100 /***
101 * A sequence map mapping used by
102 * {@link #calculateValuesFromText(String, String, List)} to determine the
103 * "tokenType" value. Maps description strings to {@link Pattern}s to
104 * match. The description string for the first matching pattern is used.
105 * Initialized in the static constuctor. The value "mixed" is reserved for
106 * tokens that are not matched by any of the patterns.
107 */
108 protected static final LinkedMap TOKEN_TYPE_PATTERNS;
109
110 /***
111 * Static constructor: initializes the map of {@link #TOKEN_TYPE_PATTERNS}.
112 */
113 static {
114
115
116 final String lowercaseLetter = "[//p{Ll}//p{Lm}]";
117
118 final String uppercaseLetter = "[//p{Lu}//p{Lt}//p{Lm}]";
119
120 final String anyLetter = "[//p{L}//p{M}]";
121
122 TOKEN_TYPE_PATTERNS = new LinkedMap();
123
124
125 TOKEN_TYPE_PATTERNS.put("lower",
126 Pattern.compile(lowercaseLetter + "+"));
127 TOKEN_TYPE_PATTERNS.put("CAPS",
128 Pattern.compile(uppercaseLetter + "+"));
129
130 TOKEN_TYPE_PATTERNS.put("Cap",
131 Pattern.compile(uppercaseLetter + lowercaseLetter + "+"));
132 TOKEN_TYPE_PATTERNS.put("mixedCase", Pattern.compile(anyLetter + "+"));
133
134 TOKEN_TYPE_PATTERNS.put("CAPSDot",
135 Pattern.compile(uppercaseLetter + "+//."));
136 TOKEN_TYPE_PATTERNS.put("alphaDot",
137 Pattern.compile(anyLetter + "+//."));
138
139 TOKEN_TYPE_PATTERNS.put("alpha+Punc",
140 Pattern.compile("[//p{L}//p{M}//p{Pd}]+//."));
141
142
143
144 TOKEN_TYPE_PATTERNS.put("openPunc", Pattern.compile("//p{Ps}+"));
145 TOKEN_TYPE_PATTERNS.put("closePunc", Pattern.compile("//p{Pe}+"));
146
147
148
149
150
151
152 TOKEN_TYPE_PATTERNS.put("punc", Pattern.compile("//p{P}+"));
153
154
155
156 TOKEN_TYPE_PATTERNS.put("num", Pattern.compile("(?:0|[1-9]//d*)"));
157
158 TOKEN_TYPE_PATTERNS.put("digits", Pattern.compile("//p{N}+"));
159 TOKEN_TYPE_PATTERNS.put("digits+.", Pattern.compile("[//p{N}.]+"));
160 TOKEN_TYPE_PATTERNS.put("digits+.,",
161 Pattern.compile("[//p{N}.,]+"));
162
163
164
165 TOKEN_TYPE_PATTERNS.put("alphanum",
166 Pattern.compile("[//p{L}//p{M}//p{N}//p{Pd}.,]+"));
167
168
169
170 TOKEN_TYPE_PATTERNS.put("math", Pattern.compile("//p{Sm}+"));
171 TOKEN_TYPE_PATTERNS.put("curr", Pattern.compile("//p{Sc}+"));
172
173 TOKEN_TYPE_PATTERNS.put("symb", Pattern.compile("//p{S}+"));
174
175
176 TOKEN_TYPE_PATTERNS.put("ctrl", Pattern.compile("//p{C}+"));
177 }
178
179 /***
180 * The maximum number of ancestors to include in the context representation.
181 */
182 private final int ancestorNumber;
183
184 /***
185 * The name of the attribute to use for calculating head values,
186 * cf. {@link #calculateHeadValues(Element, List)}. The name must be
187 * compatible to {@link DOMUtils#name(Attribute)}.
188 */
189 private final String headAttribute;
190
191 /***
192 * The name of the element to use for calculating head values,
193 * cf. {@link #calculateHeadValues(Element, List)}. The name must be
194 * compatible to {@link DOMUtils#name(Element)}.
195 */
196 private final String headElement;
197
198 /***
199 * An unmodifiable set of names of default attributes. Contains Strings
200 * in a format compatible to {@link DOMUtils#name(Attribute)}.
201 */
202 private final Set defaultAttributes;
203
204 /***
205 * The number of preceding recognitions to represent in detail.
206 */
207 private final int detailedRecognitions;
208
209 /***
210 * The basic number of preceding and following siblings to include in the
211 * context representation.
212 */
213 private final int siblingNumber;
214
215 /***
216 * The maximum length of prefixed and suffixes.
217 */
218 private final int maximumPrefixLength;
219
220 /***
221 * An array of sensors used to add semantic information.
222 */
223 private final Sensor[] sensors;
224
225 /***
226 * Creates a new instance based on the
227 * {@linkplain TiesConfiguration#CONF standard configuration}.
228 *
229 * @throws ProcessingException if an error occurs while initializing this
230 * instance
231 */
232 public DefaultRepresentation() throws ProcessingException {
233 this(TiesConfiguration.CONF);
234 }
235
236 /***
237 * Creates a new instance based on the provided configuration.
238 *
239 * @param config used to configure this instance
240 * @throws ProcessingException if an error occurs while initializing this
241 * instance
242 */
243 public DefaultRepresentation(final TiesConfiguration config)
244 throws ProcessingException {
245 this(config.getInt(CONFIG_RECOGN_NUM),
246 config.getInt("representation.recogn.detailed"),
247 config.getInt("representation.ancestor.num"),
248 config.getInt("representation.sibling.num"),
249 config.getInt(CONFIG_SPLIT_MAXIMUM),
250 config.getInt("representation.prefix.maxlength"),
251 config.getString("representation.head.element"),
252 config.getString("representation.head.attrib"),
253 config.getStringArray("representation.default.attribs"),
254 config.getInt(CONFIG_STORE_NTH),
255 config.getString(IOUtils.KEY_LOCAL_CHARSET, null),
256 config.getStringArray("representation.sensors"),
257 config);
258 }
259
260 /***
261 * Creates a new instance based on the provided configuration.
262 *
263 * @param config used to configure this instance
264 * @param suffix optional suffix for
265 * {@linkplain de.fu_berlin.ties.TiesConfiguration#adaptKey(String, String)
266 * adapting configuration keys} if not <code>null</code> (currently ignored)
267 * @throws ProcessingException if an error occurs while initializing this
268 * instance
269 */
270 public DefaultRepresentation(final TiesConfiguration config,
271 final String suffix)
272 throws ProcessingException {
273 this(config);
274 }
275
276 /***
277 * Creates a new instance. All element/attribute names must be compatible
278 * to {@link DOMUtils#name(Element)}.
279 *
280 * @param recogNum the number of preceding recognitions to represent
281 * @param detailedRecogs the number of preceding recognitions to represent
282 * in detail
283 * @param numberOfAncestors the maximum number of ancestors to include in
284 * the context representation
285 * @param numberOfSiblings the basic number of preceding and following
286 * siblings to include
287 * @param splitMax the maximum number of subsequences to keep when
288 * a feature value must be split (at whitespace).
289 * @param prefixMax the maximum length of prefixed and suffixes
290 * @param headElementName the name of the element to use for
291 * calculating head values (cf. {@link #calculateHeadValues(Element, List)})
292 * @param headAttribName the name of the attribute to use for
293 * calculating head values
294 * @param defaultAttribs array of names of default attributes
295 * @param outCharset the output character set to use (only used to
296 * store some configurations for inspection purposes, if <code>n</code>
297 * > 0); if <code>null</code>, the default charset of the current
298 * platform is used
299 * @param n Each <em>n</em>-th context representation is stored if > 0;
300 * otherwise no representation is stored
301 * @param sensorNames array of fully specified names of classes
302 * implementing the {@link Sensor} interface to be used to look up semantic
303 * information
304 * @param config used to configure the sensors
305 * @throws ProcessingException if an error occurs while initializing this
306 * instance
307 */
308 public DefaultRepresentation(final int recogNum, final int detailedRecogs,
309 final int numberOfAncestors, final int numberOfSiblings,
310 final int splitMax, final int prefixMax,
311 final String headElementName, final String headAttribName,
312 final String[] defaultAttribs, final int n,
313 final String outCharset, final String[] sensorNames,
314 final TiesConfiguration config) throws ProcessingException {
315 super(recogNum, splitMax, n, outCharset);
316 detailedRecognitions = detailedRecogs;
317 ancestorNumber = numberOfAncestors;
318 siblingNumber = numberOfSiblings;
319 maximumPrefixLength = prefixMax;
320
321 headElement = headElementName;
322 headAttribute = headAttribName;
323
324
325 final Set<String> defAttribSet = new HashSet<String>();
326 for (int i = 0; i < defaultAttribs.length; i++) {
327 defAttribSet.add(defaultAttribs[i]);
328 }
329 defaultAttributes = Collections.unmodifiableSet(defAttribSet);
330
331 sensors = BaseSensor.createSensors(sensorNames, config);
332 }
333
334 /***
335 * Builds the context representation of text in an element. Returns a
336 * feature vector of all context features considered relevant for
337 * representation.
338 *
339 * @param element the element whose context should be represented
340 * @param leftText textual content to the left of (preceding)
341 * <code>mainText</code>, might be empty
342 * @param mainText the main textual content to represent, might be empty
343 * @param rightText textual content to the right of (following)
344 * <code>mainText</code>, might be empty
345 * @param priorRecognitions a buffer of the last {@link Recognition}s from
346 * the document, created by calling {@link Representation#initDocument};
347 * might be <code>null</code>
348 * @param featureCache a cache of (local) feature, should be re-used
349 * between all calls for the nodes in a single document (but must not be
350 * re-used when building the context of nodes in different documents!)
351 * @param logPurpose the type of contexts of main interest to the caller
352 * (e.g. "Token" or "Sentence"), used for logging
353 * @return a vector of features considered relevant for representation
354 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
355 * contains objects that aren't {@link Recognition}s
356 * @throws IllegalArgumentException if the specified node is of an
357 * unsupported type
358 */
359 protected FeatureVector doBuildContext(final Element element,
360 final String leftText, final String mainText,
361 final String rightText, final PriorRecognitions priorRecognitions,
362 final Map<Element, List<LocalFeature>> featureCache,
363 final String logPurpose)
364 throws ClassCastException, IllegalArgumentException {
365 LinkedList<Feature> selfFeatures = new LinkedList<Feature>();
366 LinkedList<Feature> precedingFeatures = new LinkedList<Feature>();
367 LinkedList<Feature> followingFeatures = new LinkedList<Feature>();
368 LinkedList<Feature> ancestorFeatures = new LinkedList<Feature>();
369 LinkedList<Feature> ancestorSiblingFeatures = new LinkedList<Feature>();
370 final FeatureVector fullRep = new DefaultFeatureVector();
371
372
373 List priorFeatures;
374 if ((getRecognitionNumber() > 0) && (priorRecognitions != null)) {
375 priorFeatures = buildPrior(priorRecognitions);
376 } else {
377 priorFeatures = null;
378 }
379
380
381 ElementPosition position =
382 handleSiblings("", element, getSiblingNumber(),
383 precedingFeatures, followingFeatures, featureCache);
384
385
386
387
388
389 selfFeatures.add(new GlobalFeature("Descendant-or-self axis"));
390 selfFeatures.add(new GlobalFeature(AXIS_DESC_OR_SELF,
391 LocalFeature.MARKER_START));
392 buildFeatures(AXIS_DESC_OR_SELF, element, position, true,
393 selfFeatures, true, featureCache);
394 buildTextFeatures(AXIS_DESC_OR_SELF, element, leftText.trim(),
395 mainText.trim(), rightText.trim(), selfFeatures);
396
397
398
399
400
401 handleAncestors(element, getAncestorNumber(),
402 getSiblingNumber() - 1, ancestorFeatures,
403 ancestorSiblingFeatures, new HashBag(), featureCache);
404 ancestorFeatures.addFirst(new GlobalFeature(AXIS_ANCESTOR,
405 LocalFeature.MARKER_START));
406 ancestorFeatures.addFirst(new GlobalFeature("Ancestor axis"));
407
408
409
410
411
412 if (priorFeatures != null) {
413 fullRep.addAll(priorFeatures);
414 priorFeatures = null;
415 }
416
417
418 fullRep.addAll(precedingFeatures);
419 precedingFeatures = null;
420 fullRep.addAll(selfFeatures);
421 fullRep.addAll(followingFeatures);
422 followingFeatures = null;
423
424
425 fullRep.addAll(ancestorFeatures);
426 ancestorFeatures = null;
427 fullRep.addAll(selfFeatures);
428 selfFeatures = null;
429 fullRep.addAll(ancestorSiblingFeatures);
430 ancestorSiblingFeatures = null;
431
432
433
434
435
436 final List filteredRep = filterRepresentation(fullRep);
437
438
439 removeExtraMarkers(filteredRep);
440 fullRep.addAll(filteredRep);
441
442
443
444
445
446
447 return fullRep;
448 }
449
450 /***
451 * Builds the features of an element and appends them to the
452 * specified <code>featureList</code>. Handles attributes and calculated
453 * values and the element itself. Child elements are only handled when
454 * <code>recurseInsteadOfText</code> it <code>true</code> -- the axis name
455 * is not changed for child elements.
456 *
457 * @param axisName the name of the axis, used to start each feature
458 * @param element the element to process
459 * @param position wraps the position of the element within its parent
460 * element and related data, used for calculating positional features;
461 * if <code>null</code>, no positional features are calculated
462 * @param recurseInsteadOfText if <code>true</code> child elements
463 * are recursively processed, but no
464 * {@linkplain #calculateValuesFromText(String, String, List) values from
465 * text} are calculated; otherwise text is processed but child elements are
466 * not
467 * @param featureList the list of {@link GlobalFeature}s to add features to
468 * @param addAtEnd whether to add the new features at the end of at the
469 * beginning of the <code>featureList</code>
470 * @param cache a cache of local feature, mapping {@link Element}s to
471 * {@link List}s of {@link LocalFeature}
472 */
473 protected void buildFeatures(final String axisName, final Element element,
474 final ElementPosition position, final boolean recurseInsteadOfText,
475 final LinkedList<Feature> featureList, final boolean addAtEnd,
476 final Map<Element, List<LocalFeature>> cache) {
477 final List<LocalFeature> localFeatures;
478
479 if ((!recurseInsteadOfText) && cache.containsKey(element)) {
480
481 localFeatures = cache.get(element);
482
483
484 } else {
485
486 localFeatures = buildLocalFeatures(element, position,
487 recurseInsteadOfText);
488
489
490 if (!recurseInsteadOfText) {
491 cache.put(element, localFeatures);
492 }
493 }
494
495
496 final Iterator<LocalFeature> localIter = localFeatures.iterator();
497 GlobalFeature.globalize(axisName, localIter, featureList, addAtEnd);
498
499
500 if (recurseInsteadOfText) {
501
502 final Iterator childIter = element.elementIterator();
503 Element child;
504 while (childIter.hasNext()) {
505 child = (Element) childIter.next();
506 buildFeatures(axisName, child, null, recurseInsteadOfText,
507 featureList, addAtEnd, cache);
508 }
509 }
510 }
511
512 /***
513 * Builds the local features of an element. Handles attributes and
514 * calculated values and the element itself, but no child elements.
515 * The {@link LocalFeature}s can be stored in a cache for re-use; they must
516 * be combined with an axis name for getting a global feature.
517 *
518 * @param element the element to process
519 * @param position wraps the position of the element within its parent
520 * element and related data, used for calculating positional features;
521 * if <code>null</code>, no positional features are calculated
522 * @param ignoreText if <code>true</code>, no {@linkplain
523 * #calculateValuesFromText(String, String, List) values from text} or
524 * {@linkplain #calculateHeadValues(Element, List) head values} are
525 * calculated
526 * @return returns a List of created {@link LocalFeature}s
527 */
528 protected List<LocalFeature> buildLocalFeatures(final Element element,
529 final ElementPosition position, final boolean ignoreText) {
530 final List<LocalFeature> result = new LinkedList<LocalFeature>();
531 final String name = DOMUtils.name(element);
532
533
534
535 final SortedMap<String, String> attribMap =
536 new TreeMap<String, String>();
537 final Iterator attribIter = element.attributeIterator();
538 Attribute currentAttrib;
539 String attName;
540 boolean foundDefaultAttrib = false;
541
542 while (attribIter.hasNext()) {
543 currentAttrib = (Attribute) attribIter.next();
544 attName = DOMUtils.name(currentAttrib);
545 attribMap.put(attName, currentAttrib.getValue());
546
547 if (!foundDefaultAttrib
548 && getDefaultAttributes().contains(attName)) {
549
550 foundDefaultAttrib = true;
551 }
552 }
553
554
555 if (!foundDefaultAttrib) {
556 result.add(LocalFeature.createElementFeature(name));
557
558 }
559
560 final Iterator<String> sortedIter = attribMap.keySet().iterator();
561 String attValue;
562
563 while (sortedIter.hasNext()) {
564 attName = sortedIter.next();
565 attValue = attribMap.get(attName);
566
567 result.addAll(LocalFeature.createAttributeFeatures(name, attName,
568 attValue, getSplitMaximum()));
569
570 }
571
572
573
574
575 if (position != null) {
576 calculatePositionalValues(name, position, result);
577
578 }
579
580 if (!ignoreText) {
581
582 final String trimmedText = element.getTextTrim();
583
584 if ((trimmedText != null) && (trimmedText.length() > 0)) {
585
586 calculateValuesFromText(name, trimmedText, result);
587 } else {
588
589 calculateHeadValues(element, result);
590 }
591 }
592
593 return result;
594 }
595
596 /***
597 * Builds the pseudo-axis of prior recognitions.
598 *
599 * @param priorRecognitions a buffer of the last {@link Recognition}s from
600 * the document, created by calling {@link Representation#initDocument}
601 * @return a list of {@link GlobalFeature}s representing prior recognitions
602 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
603 * contains objects that aren't {@link Recognition}s
604 */
605 protected List<Feature> buildPrior(
606 final PriorRecognitions priorRecognitions)
607 throws ClassCastException {
608 final LinkedList<Feature> result =
609 new LinkedList<Feature>();
610
611
612 result.add(new GlobalFeature("Prior recognitions"));
613 result.add(new GlobalFeature(AXIS_PRIOR, LocalFeature.MARKER_START));
614
615
616 final Iterator recogIter = priorRecognitions.iterator();
617 PriorRecognitions.Pair currentPair;
618 Recognition currentRecognition;
619 String type;
620 String trimmedText;
621 LinkedList<Feature> recogFeatures;
622 final List<LocalFeature> calculatedValues =
623 new LinkedList<LocalFeature>();
624
625
626 final int briefRecogs =
627 priorRecognitions.size() - getDetailedRecognitions();
628
629
630 if (priorRecognitions.size() > getRecognitionNumber()) {
631 Util.LOG.error("Buffer of prior recognitions too large ("
632 + priorRecognitions.size() + " instead of "
633 + getRecognitionNumber() + " elements) -- use "
634 + "Representation.createRecognitionBuffer() to create it");
635 }
636
637
638 for (int i = 0; recogIter.hasNext(); i++) {
639 currentPair = (PriorRecognitions.Pair) recogIter.next();
640 currentRecognition = currentPair.getRecognition();
641 type = currentRecognition.getType();
642
643
644 recogFeatures = currentPair.getCachedFeatures();
645 if (recogFeatures == null) {
646
647 recogFeatures = new LinkedList<Feature>();
648
649
650 recogFeatures.add(new GlobalFeature(AXIS_PRIOR,
651 LocalFeature.createElementFeature(type)));
652
653 if (i >= briefRecogs) {
654
655
656 trimmedText = currentRecognition.getText().trim();
657 if (StringUtils.isNotEmpty(trimmedText)) {
658 calculateValuesFromText(type, trimmedText,
659 calculatedValues);
660
661
662 GlobalFeature.globalize(AXIS_PRIOR,
663 calculatedValues.iterator(), recogFeatures,
664 true);
665 calculatedValues.clear();
666 }
667 }
668
669
670 if (currentRecognition.isSealed()) {
671 currentPair.setCachedFeatures(recogFeatures);
672 }
673 } else {
674
675
676 if (i < briefRecogs) {
677
678
679 while (recogFeatures.size() > 1) {
680 recogFeatures.removeLast();
681 }
682 }
683 }
684
685
686 result.addAll(recogFeatures);
687
688 if (!recogIter.hasNext()) {
689
690 result.addLast(new GlobalFeature(AXIS_PRIOR,
691 LocalFeature.createCalculatedFeature(type, "finished",
692 String.valueOf(currentRecognition.isSealed()))));
693 }
694 }
695 return result;
696 }
697
698 /***
699 * Builds the context representation of text in an element, differentiating
700 * between three kinds of textual contents: a left part, a main part, and
701 * a right part. Each text part is optional (if all of them are empty,
702 * this method does nothing).
703 *
704 * @param axisName the name of the axis, used to start each feature
705 * @param element the element whose context should be represented
706 * @param trimmedLeft trimmed textual content to the left of (preceding)
707 * <code>trimmedMain</code>, might be empty
708 * @param trimmedMain trimmed main textual content to represent, might
709 * be empty
710 * @param trimmedRight trimmed textual content to the right of (following)
711 * <code>trimmedMain</code>, might be empty
712 * @param featureList a list of {@link GlobalFeature}s to add the values to
713 */
714 protected void buildTextFeatures(final String axisName,
715 final Element element, final String trimmedLeft,
716 final String trimmedMain, final String trimmedRight,
717 final LinkedList<Feature> featureList) {
718 final String name = DOMUtils.name(element);
719 final List<LocalFeature> localTextFeatures =
720 new LinkedList<LocalFeature>();
721
722
723
724 final String innerMarker = FeatureType.CALCULATED.getMark();
725
726 if (trimmedLeft.length() > 0) {
727 calculateValuesFromText(name + innerMarker + "left", trimmedLeft,
728 localTextFeatures);
729 }
730 if (trimmedMain.length() > 0) {
731 calculateValuesFromText(name, trimmedMain, localTextFeatures);
732 }
733 if (trimmedRight.length() > 0) {
734 calculateValuesFromText(name + innerMarker + "right", trimmedRight,
735 localTextFeatures);
736 }
737
738
739 GlobalFeature.globalize(axisName, localTextFeatures.iterator(),
740 featureList, true);
741 }
742
743 /***
744 * Creates values that depend on "head" children of an element, if the
745 * element contains any of them. This method is meant for element that
746 * don't contain any textual content, so their children are checked
747 * instead.
748 *
749 * This implementation proceeds as follows:
750 * <ul>
751 * <li>If the specified element is of type {@link #getHeadElement()}, the
752 * value of the {@link #getHeadAttribute()} or else the textual content of
753 * its right-most child element is stored (in a calculated value named
754 * "head"). Child elements are iterated from right to left unless one
755 * containing an appropriate attribute or textual content is found (or none
756 * are left).
757 * <li>Otherwise, if the element contains children of the type
758 * {@link #getHeadElement()}, the first and last child element of this type
759 * are recursively processed and the results stored in calculated names
760 * named "lhead" and "rhead". Of there is only a single child of appropriate
761 * type, the "lhead" value is omitted.
762 * <li>Otherwise no values are calculated.
763 * </ul>
764 *
765 * <p>If a value contains whitespace, only the final subsequence following
766 * all whitespace is preserved.
767 *
768 * @param element the element to process
769 * @param values a list of {@link LocalFeature}s to add the calculated
770 * values to
771 */
772 protected void calculateHeadValues(final Element element,
773 final List<LocalFeature> values) {
774 final String elementName = DOMUtils.name(element);
775 if (getHeadElement().equals(elementName)) {
776
777 final String headValue = determineHeadValue(element);
778
779
780 if (headValue.length() > 0) {
781 values.add(LocalFeature.createCalculatedFeature(elementName,
782 "head", headValue));
783 }
784 } else {
785
786 final List suitableChildren = DOMUtils.elementsByName(element,
787 getHeadElement());
788 final int numberOfChildren = suitableChildren.size();
789
790 if (numberOfChildren >= 1) {
791
792 final Element rightMostChild =
793 (Element) suitableChildren.get(numberOfChildren - 1);
794 final String rHeadValue = determineHeadValue(rightMostChild);
795
796
797 if (rHeadValue.length() > 0) {
798 values.add(LocalFeature.createCalculatedFeature(
799 elementName, "rhead", rHeadValue));
800 }
801
802 if (numberOfChildren >= 2) {
803
804 final Element leftMostChild =
805 (Element) suitableChildren.get(0);
806 final String lHeadValue = determineHeadValue(leftMostChild);
807
808
809 if (lHeadValue.length() > 0) {
810 values.add(LocalFeature.createCalculatedFeature(
811 elementName, "lhead", lHeadValue));
812 }
813 }
814 }
815 }
816 }
817
818 /***
819 * Calculates values that depend on the position of an element within its
820 * parent.
821 *
822 * @param elementName the name of the element to process, as returned by
823 * {@link DOMUtils#name(Element)}
824 * @param position wraps the position of the element within its parent
825 * element and related data, must not be <code>null</code>
826 * @param values a list to add the calculated values to
827 */
828 protected void calculatePositionalValues(final String elementName,
829 final ElementPosition position, final List<LocalFeature> values) {
830
831 values.add(LocalFeature.createCalculatedFeature(elementName, "pos",
832 Integer.toString(position.getOverallPosition())));
833
834
835 values.add(LocalFeature.createCalculatedFeature(elementName,
836 "typedPos", Integer.toString(position.getTypedPosition())));
837
838
839 values.add(LocalFeature.createCalculatedFeature(elementName,
840 "roughPos", determineRoughPosition(position.getOverallPosition(),
841 position.getAllChildren())));
842 values.add(LocalFeature.createCalculatedFeature(elementName,
843 "roughTypedPos", determineRoughPosition(position.getTypedPosition(),
844 position.getTypedChildren())));
845 }
846
847 /***
848 * Calculates values that depend on the textual content of an element:
849 * prefixes, suffixes, length data, and token type. Also includes the text
850 * itself.
851 *
852 * @param elementName the name of the element to process, as returned by
853 * {@link DOMUtils#name(Element)}
854 * @param trimmedText the trimmed textual content of the element to
855 * process, must not be empty
856 * @param values a list of {@link LocalFeature}s to add the calculated
857 * values to
858 * @throws IllegalArgumentException if the empty string was given as
859 * <code>trimmedText</code>
860 */
861 protected void calculateValuesFromText(final String elementName,
862 final String trimmedText, final List<LocalFeature> values)
863 throws IllegalArgumentException {
864 final int textLength = trimmedText.length();
865 if (textLength == 0) {
866 throw new IllegalArgumentException(
867 "Specifed trimmed text must not be empty!");
868 }
869
870
871 final String[] tokens = TextUtils.splitString(trimmedText, -1);
872 final String firstTokenLC = tokens[0].toLowerCase();
873 final String lastTokenLC;
874
875 if (tokens.length == 1) {
876
877 lastTokenLC = firstTokenLC;
878 } else {
879 lastTokenLC = tokens[tokens.length - 1].toLowerCase();
880 }
881
882
883
884 final int maxPrefixLength = Math.min(maximumPrefixLength,
885 firstTokenLC.length() - 1);
886 for (int i = 1; i <= maxPrefixLength; i++) {
887 values.add(LocalFeature.createCalculatedFeature(elementName,
888 "prefix" + i, firstTokenLC.substring(0, i)));
889 }
890
891
892
893 final int maxSuffixLength = Math.min(maximumPrefixLength,
894 lastTokenLC.length() - 1);
895 for (int i = 1; i <= maxSuffixLength; i++) {
896 values.add(LocalFeature.createCalculatedFeature(elementName,
897 "suffix" + i, lastTokenLC.substring(lastTokenLC.length() - i)));
898 }
899
900
901 values.add(LocalFeature.createCalculatedFeature(elementName, "length",
902 Integer.toString(textLength)));
903
904
905 final long sqrtLength = Math.round(Math.sqrt(textLength));
906 values.add(LocalFeature.createCalculatedFeature(elementName,
907 "sqrtLength", Long.toString(sqrtLength)));
908
909
910 final Set typeSet = TOKEN_TYPE_PATTERNS.keySet();
911 final Iterator typeIter = typeSet.iterator();
912 String currentDesc;
913 Pattern currentPattern;
914 Matcher currentMatcher;
915 boolean foundMatch = false;
916 String matchingType = "mixed";
917
918
919 while (typeIter.hasNext() && !foundMatch) {
920 currentDesc = (String) typeIter.next();
921 currentPattern = (Pattern) TOKEN_TYPE_PATTERNS.get(currentDesc);
922 currentMatcher = currentPattern.matcher(tokens[tokens.length - 1]);
923
924 if (currentMatcher.matches()) {
925 foundMatch = true;
926 matchingType = currentDesc;
927 }
928 }
929
930
931 final String fullType = (tokens.length > 1)
932 ? matchingType + "+multi" : matchingType;
933 values.add(LocalFeature.createCalculatedFeature(elementName,
934 "tokenType", fullType));
935
936
937 KeyValue[] sensorData;
938 String prefix;
939 int j;
940
941 for (int i = 0; i < sensors.length; i++) {
942
943
944 prefix = i + FeatureType.CALCULATED.getMark();
945 sensorData = sensors[i].lookup(trimmedText);
946
947 for (j = 0; j < sensorData.length; j++) {
948 values.add(LocalFeature.createCalculatedFeature(elementName,
949 prefix + sensorData[j].getKey(),
950 Util.asString(sensorData[j].getValue())));
951 }
952 }
953
954
955 values.addAll(LocalFeature.createCalculatedFeatures(elementName, "lc",
956 trimmedText.toLowerCase(), getSplitMaximum()));
957 values.addAll(LocalFeature.createTextFeatures(elementName, trimmedText,
958 getSplitMaximum()));
959 }
960
961 /***
962 * Helper method for determining the head value for an element of type
963 * {@link #getHeadElement()}. See
964 * {@link #calculateHeadValues(Element, List)} for a description of the
965 * algorithm.
966 *
967 * @param element the element to process, must be of type
968 * {@link #getHeadElement()}
969 * @return the calculated head value, or the empty string if no value
970 * could be calculated (the specified element doesn't have any children,
971 * or the right-most child contains neither {@link #getHeadAttribute()} nor
972 * textual content)
973 */
974 protected String determineHeadValue(final Element element) {
975 String fullResult = null;
976 final List children = element.elements();
977 final ListIterator childIter = children.listIterator(children.size());
978 Element currentChild;
979 Attribute headAttrib;
980 boolean done = false;
981
982
983
984 while (childIter.hasPrevious() && !done) {
985 currentChild = (Element) childIter.previous();
986 headAttrib = DOMUtils.attributeByName(currentChild,
987 getHeadAttribute());
988
989
990 if (headAttrib != null) {
991 fullResult = headAttrib.getText();
992 } else {
993 fullResult = element.getTextTrim();
994 }
995
996
997 done = (fullResult != null) && (fullResult.length() > 0);
998 }
999
1000 final String result;
1001 if (fullResult != null) {
1002
1003 final String[] splits = TextUtils.splitString(fullResult, 1);
1004 result = splits[0];
1005 } else {
1006 result = "";
1007 }
1008 return result;
1009 }
1010
1011 /***
1012 * Helper method called by
1013 * {@link #calculatePositionalValues(String, ElementPosition, List)} to
1014 * collapse a position in to one of five values.
1015 * <dl>
1016 * <dt>first</dt>
1017 * <dd>for the first element</dd>
1018 * <dt>early</dt>
1019 * <dd>if position is within the first third of all elements (but not the
1020 * first one), upper limit included</dd>
1021 * <dt>middle</dt>
1022 * <dd>if position is within the second third of all elements, limits
1023 * excluded</dd>
1024 * <dt>late</dt>
1025 * <dd>if position is within the last third of all elements (but not the
1026 * last one), lower limit included</dd>
1027 * <dt>last</dt>
1028 * <dd>for the last element</dd>
1029 * </dl>
1030 *
1031 * @param position the position counted from 0, should be non-negative and
1032 * smaller than <code>elementCount</code> (otherwise the results are
1033 * undefined)
1034 * @param elementCount the number of elements
1035 * @return a String containing one of the five values described above
1036 */
1037 protected String determineRoughPosition(final int position,
1038 final int elementCount) {
1039
1040 final int realPosition = position + 1;
1041 final String result;
1042
1043 if (realPosition == 1) {
1044 result = "first";
1045 } else if (realPosition == elementCount) {
1046 result = "last";
1047 } else {
1048 final double oneThird = elementCount / 3.0;
1049 if (realPosition <= oneThird) {
1050 result = "early";
1051 } else if (realPosition >= (2 * oneThird)) {
1052 result = "late";
1053 } else {
1054 result = "middle";
1055 }
1056 }
1057 return result;
1058 }
1059
1060 /***
1061 * Creates a filtered view of a context representation. A filtered
1062 * representation is created for each {@linkplain FeatureType type of
1063 * features} that occurs at least twice;
1064 * attributes and calculated values occuring only once are omitted.
1065 * For textual content, another filtered representation is created.
1066 *
1067 * <p>Features representing markers ({@link FeatureType#MARKER}),
1068 * stand-alone elements ({@link FeatureType#ELEMENT}) and default attributes
1069 * ({@link #getDefaultAttributes()}) are included in all filtered
1070 * representations. Comment-only features are ignored.
1071 *
1072 * @param originalRep a feature vector containing the representation
1073 * to filter
1074 * @return a list of {@link de.fu_berlin.ties.classify.feature.Feature}s
1075 * combining the filtered representations created by this method
1076 */
1077 protected List<Feature> filterRepresentation(
1078 final FeatureVector originalRep) {
1079
1080
1081 final List<Feature> universalFeatures = new LinkedList<Feature>();
1082
1083 final LinkedList<Feature> textualFeatures = new LinkedList<Feature>();
1084
1085
1086 final MultiMap featureMap = new MultiHashMap();
1087
1088 final SortedBag featureCount = new TreeBag();
1089
1090 final Iterator iter = originalRep.iterator();
1091 GlobalFeature currentGlobal;
1092 LocalFeature currentLocal;
1093 FeatureType currentType;
1094 Iterator keyIter;
1095 String currentKey;
1096 Iterator<Feature> universalIter;
1097 Iterator innerIter;
1098 Feature innerFeature;
1099
1100
1101 while (iter.hasNext()) {
1102 currentGlobal = (GlobalFeature) iter.next();
1103 currentLocal = currentGlobal.getLocalFeature();
1104
1105 if (currentLocal != null) {
1106 currentType = currentLocal.getType();
1107 if (((currentType == FeatureType.ATTRIBUTE)
1108 && (getDefaultAttributes().contains(
1109 currentLocal.getName())))
1110 || (currentType == FeatureType.MARKER)) {
1111
1112 universalFeatures.add(currentGlobal);
1113 textualFeatures.add(currentGlobal);
1114 keyIter = featureMap.keySet().iterator();
1115 while (keyIter.hasNext()) {
1116 currentKey = (String) keyIter.next();
1117 featureMap.put(currentKey, currentGlobal);
1118 }
1119
1120 } else if ((currentType == FeatureType.ATTRIBUTE)
1121 || (currentType == FeatureType.ELEMENT)
1122 || ((currentType == FeatureType.CALCULATED)
1123 && currentLocal.getName().endsWith("head"))) {
1124
1125 currentKey = currentType.getMark();
1126
1127 if (!featureMap.containsKey(currentKey)) {
1128
1129 universalIter = universalFeatures.iterator();
1130 while (universalIter.hasNext()) {
1131 featureMap.put(currentKey, universalIter.next());
1132 }
1133 }
1134
1135
1136 featureMap.put(currentKey, currentGlobal);
1137 featureCount.add(currentKey);
1138 } else if (currentType == FeatureType.TEXT) {
1139
1140 textualFeatures.add(currentGlobal);
1141 }
1142
1143 }
1144 }
1145
1146
1147 final List<Feature> result = new LinkedList<Feature>();
1148 final Iterator sortedIter = featureCount.uniqueSet().iterator();
1149 Collection currentView;
1150
1151
1152 while (sortedIter.hasNext()) {
1153 currentKey = (String) sortedIter.next();
1154
1155
1156 if (featureCount.getCount(currentKey) > 2) {
1157 result.add(new GlobalFeature("Filtered view for " + currentKey
1158 + " features"));
1159 currentView = (Collection) featureMap.get(currentKey);
1160 innerIter = currentView.iterator();
1161
1162 while (innerIter.hasNext()) {
1163 innerFeature = (Feature) innerIter.next();
1164 result.add(innerFeature);
1165 }
1166 }
1167 }
1168
1169
1170 result.add(new GlobalFeature("Filtered view for textual features"));
1171 result.addAll(textualFeatures);
1172
1173 return result;
1174 }
1175
1176 /***
1177 * Returns the maximum number of ancestors to include in the context
1178 * representation.
1179 *
1180 * @return the number of ancestors to include
1181 */
1182 public int getAncestorNumber() {
1183 return ancestorNumber;
1184 }
1185
1186 /***
1187 * Returns the unmodifiable set of names of default attributes. The
1188 * contained names are Strings in a format compatible to
1189 * {@link DOMUtils#name(Attribute)}.
1190 *
1191 * @return the set of default attributes
1192 */
1193 public Set getDefaultAttributes() {
1194 return defaultAttributes;
1195 }
1196
1197 /***
1198 * Returns the number of preceding recognitions to represent in detail.
1199 * @return the value of the attribute
1200 */
1201 public int getDetailedRecognitions() {
1202 return detailedRecognitions;
1203 }
1204
1205 /***
1206 * Returns the name of the element to use for calculating head values.
1207 *
1208 * @return the name of the attribute
1209 * @see #calculateHeadValues(Element, List)
1210 */
1211 public String getHeadAttribute() {
1212 return headAttribute;
1213 }
1214
1215 /***
1216 * Returns the name of the attribute to use for calculating head values.
1217 *
1218 * @return the name of the element
1219 * @see #calculateHeadValues(Element, List)
1220 */
1221 public String getHeadElement() {
1222 return headElement;
1223 }
1224
1225 /***
1226 * Returns the basic number of preceding and following siblings to include
1227 * in the context representation.
1228 *
1229 * @return the number of siblings to include
1230 */
1231 public int getSiblingNumber() {
1232 return siblingNumber;
1233 }
1234
1235 /***
1236 * Handles ancestors and ancestor siblings of an element.
1237 *
1238 * @param element the element to process
1239 * @param ancestorsToAdd the number of ancestors to add, must be > 0;
1240 * if > 1, this method calls itself recursively, decreasing the number
1241 * by 1
1242 * @param ancestorSiblingsToAdd the number of preceding/following siblings
1243 * of the current ancestors to add; if 0 or negative, no siblings are added;
1244 * a recursive call decreases this parameter by 1 if any ancestor siblings
1245 * were found (if no siblings were found, the number passes unchanged)
1246 * @param ancestorFeatures the list of {@link GlobalFeature}s to prepend
1247 * the features on the ancestors to
1248 * @param ancestorSiblingFeatures the list of {@link GlobalFeature}s to
1249 * append the features on the ancestors siblings to
1250 * @param processedAncestorNames a bag that will typically be empty when
1251 * first calling this method (will be filled by recursive calls)
1252 * @param cache a cache of local feature, mapping {@link Element}s to
1253 * {@link List}s of {@link LocalFeature}
1254 * @throws IllegalArgumentException if <code>ancestorsToAdd</code> is 0 or
1255 * negative
1256 */
1257 protected void handleAncestors(final Element element,
1258 final int ancestorsToAdd, final int ancestorSiblingsToAdd,
1259 final LinkedList<Feature> ancestorFeatures,
1260 final LinkedList<Feature> ancestorSiblingFeatures,
1261 final Bag processedAncestorNames,
1262 final Map<Element, List<LocalFeature>> cache)
1263 throws IllegalArgumentException {
1264 if (ancestorsToAdd <= 0) {
1265 throw new IllegalArgumentException(
1266 "Cannot add a non-positive number of ancestors ("
1267 + ancestorsToAdd + ")!");
1268 }
1269
1270
1271 final Element parent = element.getParent();
1272 if (parent != null) {
1273
1274 final ElementPosition position;
1275
1276
1277 if (ancestorSiblingsToAdd > 0) {
1278
1279 final String parentName = DOMUtils.name(parent);
1280 processedAncestorNames.add(parentName);
1281 final int typeCount
1282 = processedAncestorNames.getCount(parentName);
1283 final String nameToQuote;
1284
1285 if (typeCount > 2) {
1286
1287
1288 nameToQuote = Integer.toString(typeCount) + parentName;
1289 } else {
1290
1291 nameToQuote = parentName;
1292 }
1293 final String axisPrefix =
1294 AXIS_ANCESTOR + LocalFeature.quote(nameToQuote);
1295
1296
1297
1298 final LinkedList<Feature> siblingFeatures =
1299 new LinkedList<Feature>();
1300 position = handleSiblings(axisPrefix, parent,
1301 ancestorSiblingsToAdd, siblingFeatures, siblingFeatures,
1302 cache);
1303 ancestorSiblingFeatures.addAll(siblingFeatures);
1304 } else {
1305 position = null;
1306 }
1307
1308
1309 buildFeatures(AXIS_ANCESTOR, parent, position, false,
1310 ancestorFeatures, false, cache);
1311
1312
1313 if (ancestorsToAdd > 1) {
1314 final int parentSilbingsToAdd;
1315
1316
1317 if ((position != null) && (position.getProcessedPreceding()
1318 + position.getProcessedPreceding() > 0)) {
1319
1320 parentSilbingsToAdd = ancestorSiblingsToAdd - 1;
1321 } else {
1322
1323 parentSilbingsToAdd = ancestorSiblingsToAdd;
1324 }
1325
1326 handleAncestors(parent, ancestorsToAdd - 1 ,
1327 parentSilbingsToAdd, ancestorFeatures,
1328 ancestorSiblingFeatures, processedAncestorNames, cache);
1329 }
1330 }
1331 }
1332
1333 /***
1334 * Adds the preceding and following siblings of an element.
1335 *
1336 * @param axisPrefix the prefix of the axis name, used to start each
1337 * feature; specify the empty string if no prefix should be used
1338 * @param element the element to process
1339 * @param baseNumber the basic number of siblings to keep; the actual
1340 * number of siblings kept might vary
1341 * @param precedingFeatures the list of {@link GlobalFeature}s to prepend
1342 * the features on the preceding siblings to
1343 * @param followingFeatures the list of {@link GlobalFeature}s to append
1344 * the features on the following siblings to
1345 * @param cache a cache of local feature, mapping {@link Element}s to
1346 * {@link List}s of {@link LocalFeature}
1347 * @return a wrapper of the position of the element with its parent element
1348 * and related data, required by
1349 * {@link #calculatePositionalValues(String, ElementPosition, List)}, or
1350 * <code>null</code> if there is no parent element
1351 */
1352 protected ElementPosition handleSiblings(final String axisPrefix,
1353 final Element element, final int baseNumber,
1354 final LinkedList<Feature> precedingFeatures,
1355 final LinkedList<Feature> followingFeatures,
1356 final Map<Element, List<LocalFeature>> cache) {
1357 final Element parent = element.getParent();
1358 final String name = DOMUtils.name(element);
1359 final ElementPosition result;
1360
1361 if (parent != null) {
1362
1363 final Iterator childIter = parent.elementIterator();
1364 Element currentChild;
1365 boolean foundElement = false;
1366 LinkedList<Element> allPrecedingSiblings =
1367 new LinkedList<Element>();
1368 LinkedList<Element> allFollowingSiblings =
1369 new LinkedList<Element>();
1370
1371
1372 int allChildren = 0;
1373 int overallPosition = 0;
1374 int typedChildren = 0;
1375 int typedPosition = 0;
1376 boolean sameType;
1377
1378 while (childIter.hasNext()) {
1379 currentChild = (Element) childIter.next();
1380
1381
1382 allChildren++;
1383 if (name.equals(DOMUtils.name(currentChild))) {
1384 sameType = true;
1385 typedChildren++;
1386 } else {
1387 sameType = false;
1388 }
1389
1390 if (currentChild == element) {
1391 foundElement = true;
1392 } else {
1393 if (!foundElement) {
1394
1395 allPrecedingSiblings.addLast(currentChild);
1396
1397
1398 overallPosition++;
1399 if (sameType) {
1400 typedPosition++;
1401 }
1402 } else {
1403
1404 allFollowingSiblings.addLast(currentChild);
1405 }
1406 }
1407 }
1408
1409
1410 final List selectedPrecedingSiblings = selectPrecedingSiblings(
1411 element, allPrecedingSiblings, baseNumber);
1412 final List selectedFollowingSiblings = selectFollowingSiblings(
1413 element, allFollowingSiblings, baseNumber);
1414
1415
1416 result = new ElementPosition(allChildren, overallPosition,
1417 typedChildren, typedPosition, selectedPrecedingSiblings.size(),
1418 selectedFollowingSiblings.size());
1419
1420
1421 final StringBuilder precedingAxisName =
1422 new StringBuilder(axisPrefix);
1423 final StringBuilder followingAxisName =
1424 new StringBuilder(axisPrefix);
1425 final String prefixComment;
1426 if (axisPrefix.length() > 0) {
1427 prefixComment = " with '" + axisPrefix + "' prefix";
1428 precedingAxisName.append(GlobalFeature.SEP);
1429 followingAxisName.append(GlobalFeature.SEP);
1430 } else {
1431 prefixComment = "";
1432 }
1433 precedingAxisName.append(AXIS_PREC_SIBLING);
1434 followingAxisName.append(AXIS_FOLLOW_SIBLING);
1435
1436 ListIterator siblingIter;
1437 Element currentSibling;
1438
1439
1440 siblingIter = selectedPrecedingSiblings.listIterator(
1441 selectedPrecedingSiblings.size());
1442 while (siblingIter.hasPrevious()) {
1443 currentSibling = (Element) siblingIter.previous();
1444 buildFeatures(precedingAxisName.toString(), currentSibling,
1445 null, false, precedingFeatures, false, cache);
1446 }
1447
1448 precedingFeatures.addFirst(
1449 new GlobalFeature(precedingAxisName.toString(),
1450 LocalFeature.MARKER_START));
1451 precedingFeatures.addFirst(new GlobalFeature(
1452 "Preceding siblings axis" + prefixComment));
1453
1454
1455 followingFeatures.addLast(new GlobalFeature(
1456 "Following siblings axis" + prefixComment));
1457 followingFeatures.addLast(
1458 new GlobalFeature(followingAxisName.toString(),
1459 LocalFeature.MARKER_START));
1460
1461 siblingIter = selectedFollowingSiblings.listIterator();
1462 while (siblingIter.hasNext()) {
1463 currentSibling = (Element) siblingIter.next();
1464 buildFeatures(followingAxisName.toString(), currentSibling,
1465 null, false, followingFeatures, true, cache);
1466 }
1467 } else {
1468
1469 result = null;
1470 }
1471 return result;
1472 }
1473
1474 /***
1475 * {@inheritDoc}
1476 */
1477 public PriorRecognitions initDocument(final File filename,
1478 final TokenizerFactory tFactory)
1479 throws ProcessingException, IOException {
1480
1481 for (int i = 0; i < sensors.length; i++) {
1482 sensors[i].initDocument(filename, tFactory);
1483 }
1484
1485 return super.initDocument(filename, tFactory);
1486 }
1487
1488 /***
1489 * Modifies a list of {@link GlobalFeature}s to remove extraneous
1490 * {@link FeatureType#MARKER} features. Keeps only the last one of several
1491 * sequential marker features; trailing marker features are removed as well.
1492 *
1493 * @param features the list of features to modify
1494 */
1495 protected void removeExtraMarkers(final List features) {
1496
1497 final ListIterator backwardIter =
1498 features.listIterator(features.size());
1499 GlobalFeature currentGlobal;
1500 LocalFeature currentLocal;
1501 FeatureType currentType;
1502
1503
1504 FeatureType lastType = FeatureType.MARKER;
1505
1506 while (backwardIter.hasPrevious()) {
1507 currentGlobal = (GlobalFeature) backwardIter.previous();
1508 currentLocal = currentGlobal.getLocalFeature();
1509
1510 if (currentLocal != null) {
1511 currentType = currentLocal.getType();
1512
1513 if ((currentType == FeatureType.MARKER)
1514 && (lastType == FeatureType.MARKER)) {
1515
1516 backwardIter.remove();
1517 }
1518
1519 lastType = currentType;
1520 }
1521 }
1522 }
1523
1524 /***
1525 * Selects the siblings to keep among all following siblings. This
1526 * implementation always keeps the <code>baseNumber</code> first siblings.
1527 * One of the selected siblings may have a different type (name as returned
1528 * by {@link DOMUtils#name(Element)}) than the main element --
1529 * if there are more with different types, they are skipped.
1530 *
1531 * @param mainElement the element whose siblings should be selected
1532 * @param allFollowingSiblings the list of all following siblings
1533 * @param baseNumber the basic number of siblings to keep; the actual
1534 * number of siblings kept might vary
1535 * @return the subset of following siblings to keep; an empty list if
1536 * there are no siblings to keep
1537 */
1538 protected List<Element> selectFollowingSiblings(final Element mainElement,
1539 final LinkedList<Element> allFollowingSiblings,
1540 final int baseNumber) {
1541 final String elementType = DOMUtils.name(mainElement);
1542 final LinkedList<Element> selectedFollowingSiblings =
1543 new LinkedList<Element>();
1544 final Iterator<Element> siblingIter = allFollowingSiblings.iterator();
1545 Element currentSibling;
1546 boolean addedSiblingOfDifferentType = false;
1547 boolean doAdd;
1548
1549
1550 while (siblingIter.hasNext()
1551 && (selectedFollowingSiblings.size() < baseNumber)) {
1552 currentSibling = siblingIter.next();
1553
1554 if (elementType.equals(DOMUtils.name(currentSibling))) {
1555 doAdd = true;
1556 } else {
1557
1558 doAdd = !addedSiblingOfDifferentType;
1559 addedSiblingOfDifferentType = true;
1560 }
1561
1562 if (doAdd) {
1563 selectedFollowingSiblings.addLast(currentSibling);
1564 }
1565 }
1566
1567 return selectedFollowingSiblings;
1568 }
1569
1570 /***
1571 * Selects the siblings to keep among all preceding siblings. This
1572 * implementation always keeps the first sibling and the
1573 * <code>baseNumber</code> last siblings. One of the last siblings may
1574 * have a different type (name as returned by
1575 * {@link DOMUtils#name(Element)}) than the main element --
1576 * if there are more with different types, they are skipped.
1577 * If none of the <code>baseNumber</code> last siblings has a different
1578 * types, the last siblings with a different type (name) is also kept.
1579 *
1580 * @param mainElement the element whose siblings should be selected
1581 * @param allPrecedingSiblings the list of all preceding siblings
1582 * @param baseNumber the basic number of siblings to keep; the actual
1583 * number of siblings kept might vary
1584 * @return the subset of preceding siblings to keep; an empty list if
1585 * there are no siblings to keep
1586 */
1587 protected List<Element> selectPrecedingSiblings(final Element mainElement,
1588 final LinkedList<Element> allPrecedingSiblings,
1589 final int baseNumber) {
1590 final String elementType = DOMUtils.name(mainElement);
1591 final LinkedList<Element> selectedPrecedingSiblings =
1592 new LinkedList<Element>();
1593 final ListIterator<Element> siblingIter =
1594 allPrecedingSiblings.listIterator(allPrecedingSiblings.size());
1595 Element currentSibling;
1596 boolean addedSiblingOfDifferentType = false;
1597 boolean doAdd;
1598
1599
1600 while (siblingIter.hasPrevious()
1601 && (selectedPrecedingSiblings.size() < baseNumber)) {
1602 currentSibling = siblingIter.previous();
1603
1604 if (elementType.equals(DOMUtils.name(currentSibling))) {
1605 doAdd = true;
1606 } else {
1607
1608 doAdd = !addedSiblingOfDifferentType;
1609 addedSiblingOfDifferentType = true;
1610 }
1611
1612 if (doAdd) {
1613 selectedPrecedingSiblings.addFirst(currentSibling);
1614 }
1615 }
1616
1617
1618 if (!allPrecedingSiblings.isEmpty()) {
1619 final Element firstSibling = allPrecedingSiblings.getFirst();
1620 final Element firstSelected = selectedPrecedingSiblings.getFirst();
1621
1622
1623 if (firstSelected != firstSibling) {
1624 selectedPrecedingSiblings.addFirst(firstSibling);
1625 }
1626 }
1627
1628 return selectedPrecedingSiblings;
1629 }
1630
1631 /***
1632 * Returns a string representation of this object.
1633 *
1634 * @return a textual representation
1635 */
1636 public String toString() {
1637 return new ToStringBuilder(this)
1638 .appendSuper(super.toString())
1639 .append("detailed recognitions", detailedRecognitions)
1640 .append("number of ancestors", ancestorNumber)
1641 .append("number of siblings", siblingNumber)
1642 .append("maximum prefix length", maximumPrefixLength)
1643 .append("head element", headElement)
1644 .append("head attribute", headAttribute)
1645 .append("default attributes", defaultAttributes)
1646 .append("sensors", ArrayUtils.toString(sensors))
1647 .toString();
1648 }
1649
1650 }