1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.context;
23
24 import java.util.Collection;
25 import java.util.Collections;
26 import java.util.HashSet;
27 import java.util.Iterator;
28 import java.util.LinkedList;
29 import java.util.List;
30 import java.util.ListIterator;
31 import java.util.Map;
32 import java.util.Set;
33 import java.util.SortedMap;
34 import java.util.TreeMap;
35 import java.util.regex.Matcher;
36 import java.util.regex.Pattern;
37
38 import org.apache.commons.collections.Bag;
39 import org.apache.commons.collections.bag.HashBag;
40 import org.apache.commons.collections.KeyValue;
41 import org.apache.commons.collections.MultiHashMap;
42 import org.apache.commons.collections.MultiMap;
43 import org.apache.commons.collections.SortedBag;
44 import org.apache.commons.collections.bag.TreeBag;
45 import org.apache.commons.collections.map.LinkedMap;
46 import org.apache.commons.lang.ArrayUtils;
47 import org.apache.commons.lang.StringUtils;
48 import org.apache.commons.lang.builder.ToStringBuilder;
49 import org.dom4j.Attribute;
50 import org.dom4j.Element;
51
52 import de.fu_berlin.ties.ProcessingException;
53 import de.fu_berlin.ties.TiesConfiguration;
54 import de.fu_berlin.ties.xml.dom.DOMUtils;
55 import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
56 import de.fu_berlin.ties.classify.feature.Feature;
57 import de.fu_berlin.ties.classify.feature.FeatureVector;
58 import de.fu_berlin.ties.context.sensor.BaseSensor;
59 import de.fu_berlin.ties.context.sensor.Sensor;
60 import de.fu_berlin.ties.io.IOUtils;
61 import de.fu_berlin.ties.text.TextUtils;
62 import de.fu_berlin.ties.util.Util;
63
64 /***
65 * The context representation used by default. This class is thread-safe.
66 *
67 * @author Christian Siefkes
68 * @version $Revision: 1.60 $, $Date: 2004/10/12 15:55:47 $, $Author: siefkes $
69 */
70 public class DefaultRepresentation extends AbstractRepresentation {
71
72 /***
73 * Ancestor axis.
74 */
75 protected static final String AXIS_ANCESTOR = "an";
76
77 /***
78 * Descendant-or-self axis.
79 */
80 protected static final String AXIS_DESC_OR_SELF = "ds";
81
82 /***
83 * Following sibling axis.
84 */
85 protected static final String AXIS_FOLLOW_SIBLING = "fs";
86
87 /***
88 * Preceeding sibling axis.
89 */
90 protected static final String AXIS_PREC_SIBLING = "ps";
91
92 /***
93 * The pseudo-axis of prior recognitions.
94 */
95 protected static final String AXIS_PRIOR = "prior";
96
97 /***
98 * A sequence map mapping used by
99 * {@link #calculateValuesFromText(String, String, List)} to determine the
100 * "tokenType" value. Maps description strings to {@link Pattern}s to
101 * match. The description string for the first matching pattern is used.
102 * Initialized in the static constuctor. The value "mixed" is reserved for
103 * tokens that are not matched by any of the patterns.
104 */
105 protected static final LinkedMap TOKEN_TYPE_PATTERNS;
106
107 /***
108 * Static constructor: initializes the map of {@link #TOKEN_TYPE_PATTERNS}.
109 */
110 static {
111
112
113 final String lowercaseLetter = "[//p{Ll}//p{Lm}]";
114
115 final String uppercaseLetter = "[//p{Lu}//p{Lt}//p{Lm}]";
116
117 final String anyLetter = "[//p{L}//p{M}]";
118
119 TOKEN_TYPE_PATTERNS = new LinkedMap();
120
121
122 TOKEN_TYPE_PATTERNS.put("lower",
123 Pattern.compile(lowercaseLetter + "+"));
124 TOKEN_TYPE_PATTERNS.put("CAPS",
125 Pattern.compile(uppercaseLetter + "+"));
126
127 TOKEN_TYPE_PATTERNS.put("Cap",
128 Pattern.compile(uppercaseLetter + lowercaseLetter + "+"));
129 TOKEN_TYPE_PATTERNS.put("mixedCase", Pattern.compile(anyLetter + "+"));
130
131 TOKEN_TYPE_PATTERNS.put("CAPSDot",
132 Pattern.compile(uppercaseLetter + "+//."));
133 TOKEN_TYPE_PATTERNS.put("alphaDot",
134 Pattern.compile(anyLetter + "+//."));
135
136 TOKEN_TYPE_PATTERNS.put("alpha+Punc",
137 Pattern.compile("[//p{L}//p{M}//p{Pd}]+//."));
138
139
140
141 TOKEN_TYPE_PATTERNS.put("openPunc", Pattern.compile("//p{Ps}+"));
142 TOKEN_TYPE_PATTERNS.put("closePunc", Pattern.compile("//p{Pe}+"));
143
144
145
146
147
148
149 TOKEN_TYPE_PATTERNS.put("punc", Pattern.compile("//p{P}+"));
150
151
152
153 TOKEN_TYPE_PATTERNS.put("num", Pattern.compile("(?:0|[1-9]//d*)"));
154
155 TOKEN_TYPE_PATTERNS.put("digits", Pattern.compile("//p{N}+"));
156 TOKEN_TYPE_PATTERNS.put("digits+.", Pattern.compile("[//p{N}.]+"));
157 TOKEN_TYPE_PATTERNS.put("digits+.,",
158 Pattern.compile("[//p{N}.,]+"));
159
160
161
162 TOKEN_TYPE_PATTERNS.put("alphanum",
163 Pattern.compile("[//p{L}//p{M}//p{N}//p{Pd}.,]+"));
164
165
166
167 TOKEN_TYPE_PATTERNS.put("math", Pattern.compile("//p{Sm}+"));
168 TOKEN_TYPE_PATTERNS.put("curr", Pattern.compile("//p{Sc}+"));
169
170 TOKEN_TYPE_PATTERNS.put("symb", Pattern.compile("//p{S}+"));
171
172
173 TOKEN_TYPE_PATTERNS.put("ctrl", Pattern.compile("//p{C}+"));
174 }
175
176 /***
177 * The maximum number of ancestors to include in the context representation.
178 */
179 private final int ancestorNumber;
180
181 /***
182 * The name of the attribute to use for calculating head values,
183 * cf. {@link #calculateHeadValues(Element, List)}. The name must be
184 * compatible to {@link DOMUtils#name(Attribute)}.
185 */
186 private final String headAttribute;
187
188 /***
189 * The name of the element to use for calculating head values,
190 * cf. {@link #calculateHeadValues(Element, List)}. The name must be
191 * compatible to {@link DOMUtils#name(Element)}.
192 */
193 private final String headElement;
194
195 /***
196 * An unmodifiable set of names of default attributes. Contains Strings
197 * in a format compatible to {@link DOMUtils#name(Attribute)}.
198 */
199 private final Set defaultAttributes;
200
201 /***
202 * The number of preceding recognitions to represent in detail.
203 */
204 private final int detailedRecognitions;
205
206 /***
207 * The basic number of preceding and following siblings to include in the
208 * context representation.
209 */
210 private final int siblingNumber;
211
212 /***
213 * The maximum length of prefixed and suffixes.
214 */
215 private final int maximumPrefixLength;
216
217 /***
218 * An array of sensors used to add semantic information.
219 */
220 private final Sensor[] sensors;
221
222 /***
223 * Creates a new instance based on the
224 * {@linkplain TiesConfiguration#CONF standard configuration}.
225 *
226 * @throws ProcessingException if an error occurs while initializing this
227 * instance
228 */
229 public DefaultRepresentation() throws ProcessingException {
230 this(TiesConfiguration.CONF);
231 }
232
233 /***
234 * Creates a new instance based on the provided configuration.
235 *
236 * @param config used to configure this instance
237 * @throws ProcessingException if an error occurs while initializing this
238 * instance
239 */
240 public DefaultRepresentation(final TiesConfiguration config)
241 throws ProcessingException {
242 this(config.getInt(CONFIG_RECOGN_NUM),
243 config.getInt("representation.recogn.detailed"),
244 config.getInt("representation.ancestor.num"),
245 config.getInt("representation.sibling.num"),
246 config.getInt(CONFIG_SPLIT_MAXIMUM),
247 config.getInt("representation.prefix.maxlength"),
248 config.getString("representation.head.element"),
249 config.getString("representation.head.attrib"),
250 config.getStringArray("representation.default.attribs"),
251 config.getInt(CONFIG_STORE_NTH),
252 config.getString(IOUtils.KEY_LOCAL_CHARSET, null),
253 config.getStringArray("representation.sensors"),
254 config);
255 }
256
257 /***
258 * Creates a new instance. All element/attribute names must be compatible
259 * to {@link DOMUtils#name(Element)}.
260 *
261 * @param recogNum the number of preceding recognitions to represent
262 * @param detailedRecogs the number of preceding recognitions to represent
263 * in detail
264 * @param numberOfAncestors the maximum number of ancestors to include in
265 * the context representation
266 * @param numberOfSiblings the basic number of preceding and following
267 * siblings to include
268 * @param splitMax the maximum number of subsequences to keep when
269 * a feature value must be split (at whitespace).
270 * @param prefixMax the maximum length of prefixed and suffixes
271 * @param headElementName the name of the element to use for
272 * calculating head values (cf. {@link #calculateHeadValues(Element, List)})
273 * @param headAttribName the name of the attribute to use for
274 * calculating head values
275 * @param defaultAttribs array of names of default attributes
276 * @param outCharset the output character set to use (only used to
277 * store some configurations for inspection purposes, if <code>n</code>
278 * > 0); if <code>null</code>, the default charset of the current
279 * platform is used
280 * @param n Each <em>n</em>-th context representation is stored if > 0;
281 * otherwise no representation is stored
282 * @param sensorNames array of fully specified names of classes
283 * implementing the {@link Sensor} interface to be used to look up semantic
284 * information
285 * @param config used to configure the sensors
286 * @throws ProcessingException if an error occurs while initializing this
287 * instance
288 */
289 public DefaultRepresentation(final int recogNum, final int detailedRecogs,
290 final int numberOfAncestors, final int numberOfSiblings,
291 final int splitMax, final int prefixMax,
292 final String headElementName, final String headAttribName,
293 final String[] defaultAttribs, final int n,
294 final String outCharset, final String[] sensorNames,
295 final TiesConfiguration config) throws ProcessingException {
296 super(recogNum, splitMax, n, outCharset);
297 detailedRecognitions = detailedRecogs;
298 ancestorNumber = numberOfAncestors;
299 siblingNumber = numberOfSiblings;
300 maximumPrefixLength = prefixMax;
301
302 headElement = headElementName;
303 headAttribute = headAttribName;
304
305
306 final Set<String> defAttribSet = new HashSet<String>();
307 for (int i = 0; i < defaultAttribs.length; i++) {
308 defAttribSet.add(defaultAttribs[i]);
309 }
310 defaultAttributes = Collections.unmodifiableSet(defAttribSet);
311
312 sensors = BaseSensor.createSensors(sensorNames, config);
313 }
314
315 /***
316 * Builds the context representation of text in an element. Returns a
317 * feature vector of all context features considered relevant for
318 * representation.
319 *
320 * @param element the element whose context should be represented
321 * @param leftText textual content to the left of (preceding)
322 * <code>mainText</code>, might be empty
323 * @param mainText the main textual content to represent, might be empty
324 * @param rightText textual content to the right of (following)
325 * <code>mainText</code>, might be empty
326 * @param priorRecognitions a buffer of the last {@link Recognition}s from
327 * the document, created by calling {@link #createRecognitionBuffer()};
328 * might be <code>null</code>
329 * @param featureCache a cache of (local) feature, should be re-used
330 * between all calls for the nodes in a single document (but must not be
331 * re-used when building the context of nodes in different documents!)
332 * @param logPurpose the type of contexts of main interest to the caller
333 * (e.g. "Token" or "Sentence"), used for logging
334 * @return a vector of features considered relevant for representation
335 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
336 * contains objects that aren't {@link Recognition}s
337 * @throws IllegalArgumentException if the specified node is of an
338 * unsupported type
339 */
340 protected FeatureVector doBuildContext(final Element element,
341 final String leftText, final String mainText,
342 final String rightText, final PriorRecognitions priorRecognitions,
343 final Map<Element, List<LocalFeature>> featureCache,
344 final String logPurpose)
345 throws ClassCastException, IllegalArgumentException {
346 LinkedList<Feature> selfFeatures = new LinkedList<Feature>();
347 LinkedList<Feature> precedingFeatures = new LinkedList<Feature>();
348 LinkedList<Feature> followingFeatures = new LinkedList<Feature>();
349 LinkedList<Feature> ancestorFeatures = new LinkedList<Feature>();
350 LinkedList<Feature> ancestorSiblingFeatures = new LinkedList<Feature>();
351 final FeatureVector fullRep = new DefaultFeatureVector();
352
353
354 List priorFeatures;
355 if ((getRecognitionNumber() > 0) && (priorRecognitions != null)) {
356 priorFeatures = buildPrior(priorRecognitions);
357 } else {
358 priorFeatures = null;
359 }
360
361
362 ElementPosition position =
363 handleSiblings("", element, getSiblingNumber(),
364 precedingFeatures, followingFeatures, featureCache);
365
366
367
368
369
370 selfFeatures.add(new GlobalFeature("Descendant-or-self axis"));
371 selfFeatures.add(new GlobalFeature(AXIS_DESC_OR_SELF,
372 LocalFeature.MARKER_START));
373 buildFeatures(AXIS_DESC_OR_SELF, element, position, true,
374 selfFeatures, true, featureCache);
375 buildTextFeatures(AXIS_DESC_OR_SELF, element, leftText.trim(),
376 mainText.trim(), rightText.trim(), selfFeatures);
377
378
379
380
381
382 handleAncestors(element, getAncestorNumber(),
383 getSiblingNumber() - 1, ancestorFeatures,
384 ancestorSiblingFeatures, new HashBag(), featureCache);
385 ancestorFeatures.addFirst(new GlobalFeature(AXIS_ANCESTOR,
386 LocalFeature.MARKER_START));
387 ancestorFeatures.addFirst(new GlobalFeature("Ancestor axis"));
388
389
390
391
392
393 if (priorFeatures != null) {
394 fullRep.addAll(priorFeatures);
395 priorFeatures = null;
396 }
397
398
399 fullRep.addAll(precedingFeatures);
400 precedingFeatures = null;
401 fullRep.addAll(selfFeatures);
402 fullRep.addAll(followingFeatures);
403 followingFeatures = null;
404
405
406 fullRep.addAll(ancestorFeatures);
407 ancestorFeatures = null;
408 fullRep.addAll(selfFeatures);
409 selfFeatures = null;
410 fullRep.addAll(ancestorSiblingFeatures);
411 ancestorSiblingFeatures = null;
412
413
414
415
416
417 final List filteredRep = filterRepresentation(fullRep);
418
419
420 removeExtraMarkers(filteredRep);
421 fullRep.addAll(filteredRep);
422
423 Util.LOG.debug("Build " + logPurpose + " representation for "
424 + DOMUtils.showToken(element, mainText) + " (full size: "
425 + fullRep.size() + ", cache size: " + featureCache.size()
426 + ")");
427
428 return fullRep;
429 }
430
431 /***
432 * Builds the features of an element and appends them to the
433 * specified <code>featureList</code>. Handles attributes and calculated
434 * values and the element itself. Child elements are only handled when
435 * <code>recurseInsteadOfText</code> it <code>true</code> -- the axis name
436 * is not changed for child elements.
437 *
438 * @param axisName the name of the axis, used to start each feature
439 * @param element the element to process
440 * @param position wraps the position of the element within its parent
441 * element and related data, used for calculating positional features;
442 * if <code>null</code>, no positional features are calculated
443 * @param recurseInsteadOfText if <code>true</code> child elements
444 * are recursively processed, but no
445 * {@linkplain #calculateValuesFromText(String, String, List) values from
446 * text} are calculated; otherwise text is processed but child elements are
447 * not
448 * @param featureList the list of {@link GlobalFeature}s to add features to
449 * @param addAtEnd whether to add the new features at the end of at the
450 * beginning of the <code>featureList</code>
451 * @param cache a cache of local feature, mapping {@link Element}s to
452 * {@link List}s of {@link LocalFeature}
453 */
454 protected void buildFeatures(final String axisName, final Element element,
455 final ElementPosition position, final boolean recurseInsteadOfText,
456 final LinkedList<Feature> featureList, final boolean addAtEnd,
457 final Map<Element, List<LocalFeature>> cache) {
458 final List<LocalFeature> localFeatures;
459
460 if ((!recurseInsteadOfText) && cache.containsKey(element)) {
461
462 localFeatures = cache.get(element);
463
464
465 } else {
466
467 localFeatures = buildLocalFeatures(element, position,
468 recurseInsteadOfText);
469
470
471 if (!recurseInsteadOfText) {
472 cache.put(element, localFeatures);
473 }
474 }
475
476
477 final Iterator<LocalFeature> localIter = localFeatures.iterator();
478 GlobalFeature.globalize(axisName, localIter, featureList, addAtEnd);
479
480
481 if (recurseInsteadOfText) {
482
483 final Iterator childIter = element.elementIterator();
484 Element child;
485 while (childIter.hasNext()) {
486 child = (Element) childIter.next();
487 buildFeatures(axisName, child, null, recurseInsteadOfText,
488 featureList, addAtEnd, cache);
489 }
490 }
491 }
492
493 /***
494 * Builds the local features of an element. Handles attributes and
495 * calculated values and the element itself, but no child elements.
496 * The {@link LocalFeature}s can be stored in a cache for re-use; they must
497 * be combined with an axis name for getting a global feature.
498 *
499 * @param element the element to process
500 * @param position wraps the position of the element within its parent
501 * element and related data, used for calculating positional features;
502 * if <code>null</code>, no positional features are calculated
503 * @param ignoreText if <code>true</code>, no {@linkplain
504 * #calculateValuesFromText(String, String, List) values from text} or
505 * {@linkplain #calculateHeadValues(Element, List) head values} are
506 * calculated
507 * @return returns a List of created {@link LocalFeature}s
508 */
509 protected List<LocalFeature> buildLocalFeatures(final Element element,
510 final ElementPosition position, final boolean ignoreText) {
511 final List<LocalFeature> result = new LinkedList<LocalFeature>();
512 final String name = DOMUtils.name(element);
513
514
515
516 final SortedMap<String, String> attribMap =
517 new TreeMap<String, String>();
518 final Iterator attribIter = element.attributeIterator();
519 Attribute currentAttrib;
520 String attName;
521 boolean foundDefaultAttrib = false;
522
523 while (attribIter.hasNext()) {
524 currentAttrib = (Attribute) attribIter.next();
525 attName = DOMUtils.name(currentAttrib);
526 attribMap.put(attName, currentAttrib.getValue());
527
528 if (!foundDefaultAttrib
529 && getDefaultAttributes().contains(attName)) {
530
531 foundDefaultAttrib = true;
532 }
533 }
534
535
536 if (!foundDefaultAttrib) {
537 result.add(LocalFeature.createElementFeature(name));
538
539 }
540
541 final Iterator<String> sortedIter = attribMap.keySet().iterator();
542 String attValue;
543
544 while (sortedIter.hasNext()) {
545 attName = sortedIter.next();
546 attValue = attribMap.get(attName);
547
548 result.addAll(LocalFeature.createAttributeFeatures(name, attName,
549 attValue, getSplitMaximum()));
550
551 }
552
553
554
555
556 if (position != null) {
557 calculatePositionalValues(name, position, result);
558
559 }
560
561 if (!ignoreText) {
562
563 final String trimmedText = element.getTextTrim();
564
565 if ((trimmedText != null) && (trimmedText.length() > 0)) {
566
567 calculateValuesFromText(name, trimmedText, result);
568 } else {
569
570 calculateHeadValues(element, result);
571 }
572 }
573
574 return result;
575 }
576
577 /***
578 * Builds the pseudo-axis of prior recognitions.
579 *
580 * @param priorRecognitions a buffer of the last {@link Recognition}s from
581 * the document, created by calling {@link #createRecognitionBuffer()}
582 * @return a list of {@link GlobalFeature}s representing prior recognitions
583 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
584 * contains objects that aren't {@link Recognition}s
585 */
586 protected List<Feature> buildPrior(
587 final PriorRecognitions priorRecognitions)
588 throws ClassCastException {
589 final LinkedList<Feature> result =
590 new LinkedList<Feature>();
591
592
593 result.add(new GlobalFeature("Prior recognitions"));
594 result.add(new GlobalFeature(AXIS_PRIOR, LocalFeature.MARKER_START));
595
596
597 final Iterator recogIter = priorRecognitions.iterator();
598 PriorRecognitions.Pair currentPair;
599 Recognition currentRecognition;
600 String type;
601 String trimmedText;
602 LinkedList<Feature> recogFeatures;
603 final List<LocalFeature> calculatedValues =
604 new LinkedList<LocalFeature>();
605
606
607 final int briefRecogs =
608 priorRecognitions.size() - getDetailedRecognitions();
609
610
611 if (priorRecognitions.size() > getRecognitionNumber()) {
612 Util.LOG.error("Buffer of prior recognitions too large ("
613 + priorRecognitions.size() + " instead of "
614 + getRecognitionNumber() + " elements) -- use "
615 + "Representation.createRecognitionBuffer() to create it");
616 }
617
618
619 for (int i = 0; recogIter.hasNext(); i++) {
620 currentPair = (PriorRecognitions.Pair) recogIter.next();
621 currentRecognition = currentPair.getRecognition();
622 type = currentRecognition.getType();
623
624
625 recogFeatures = currentPair.getCachedFeatures();
626 if (recogFeatures == null) {
627
628 recogFeatures = new LinkedList<Feature>();
629
630
631 recogFeatures.add(new GlobalFeature(AXIS_PRIOR,
632 LocalFeature.createElementFeature(type)));
633
634 if (i >= briefRecogs) {
635
636
637 trimmedText = currentRecognition.getText().trim();
638 if (StringUtils.isNotEmpty(trimmedText)) {
639 calculateValuesFromText(type, trimmedText,
640 calculatedValues);
641
642
643 GlobalFeature.globalize(AXIS_PRIOR,
644 calculatedValues.iterator(), recogFeatures,
645 true);
646 calculatedValues.clear();
647 }
648 }
649
650
651 if (currentRecognition.isSealed()) {
652 currentPair.setCachedFeatures(recogFeatures);
653 }
654 } else {
655
656
657 if (i < briefRecogs) {
658
659
660 while (recogFeatures.size() > 1) {
661 recogFeatures.removeLast();
662 }
663 }
664 }
665
666
667 result.addAll(recogFeatures);
668
669 if (!recogIter.hasNext()) {
670
671 result.addLast(new GlobalFeature(AXIS_PRIOR,
672 LocalFeature.createCalculatedFeature(type, "finished",
673 String.valueOf(currentRecognition.isSealed()))));
674 }
675 }
676 return result;
677 }
678
679 /***
680 * Builds the context representation of text in an element, differentiating
681 * between three kinds of textual contents: a left part, a main part, and
682 * a right part. Each text part is optional (if all of them are empty,
683 * this method does nothing).
684 *
685 * @param axisName the name of the axis, used to start each feature
686 * @param element the element whose context should be represented
687 * @param trimmedLeft trimmed textual content to the left of (preceding)
688 * <code>trimmedMain</code>, might be empty
689 * @param trimmedMain trimmed main textual content to represent, might
690 * be empty
691 * @param trimmedRight trimmed textual content to the right of (following)
692 * <code>trimmedMain</code>, might be empty
693 * @param featureList a list of {@link GlobalFeature}s to add the values to
694 */
695 protected void buildTextFeatures(final String axisName,
696 final Element element, final String trimmedLeft,
697 final String trimmedMain, final String trimmedRight,
698 final LinkedList<Feature> featureList) {
699 final String name = DOMUtils.name(element);
700 final List<LocalFeature> localTextFeatures =
701 new LinkedList<LocalFeature>();
702
703
704
705 final String innerMarker = FeatureType.CALCULATED.getMark();
706
707 if (trimmedLeft.length() > 0) {
708 calculateValuesFromText(name + innerMarker + "left", trimmedLeft,
709 localTextFeatures);
710 }
711 if (trimmedMain.length() > 0) {
712 calculateValuesFromText(name, trimmedMain, localTextFeatures);
713 }
714 if (trimmedRight.length() > 0) {
715 calculateValuesFromText(name + innerMarker + "right", trimmedRight,
716 localTextFeatures);
717 }
718
719
720 GlobalFeature.globalize(axisName, localTextFeatures.iterator(),
721 featureList, true);
722 }
723
724 /***
725 * Creates values that depend on "head" children of an element, if the
726 * element contains any of them. This method is meant for element that
727 * don't contain any textual content, so their children are checked
728 * instead.
729 *
730 * This implementation proceeds as follows:
731 * <ul>
732 * <li>If the specified element is of type {@link #getHeadElement()}, the
733 * value of the {@link #getHeadAttribute()} or else the textual content of
734 * its right-most child element is stored (in a calculated value named
735 * "head"). Child elements are iterated from right to left unless one
736 * containing an appropriate attribute or textual content is found (or none
737 * are left).
738 * <li>Otherwise, if the element contains children of the type
739 * {@link #getHeadElement()}, the first and last child element of this type
740 * are recursively processed and the results stored in calculated names
741 * named "lhead" and "rhead". Of there is only a single child of appropriate
742 * type, the "lhead" value is omitted.
743 * <li>Otherwise no values are calculated.
744 * </ul>
745 *
746 * <p>If a value contains whitespace, only the final subsequence following
747 * all whitespace is preserved.
748 *
749 * @param element the element to process
750 * @param values a list of {@link LocalFeature}s to add the calculated
751 * values to
752 */
753 protected void calculateHeadValues(final Element element,
754 final List<LocalFeature> values) {
755 final String elementName = DOMUtils.name(element);
756 if (getHeadElement().equals(elementName)) {
757
758 final String headValue = determineHeadValue(element);
759
760
761 if (headValue.length() > 0) {
762 values.add(LocalFeature.createCalculatedFeature(elementName,
763 "head", headValue));
764 }
765 } else {
766
767 final List suitableChildren = DOMUtils.elementsByName(element,
768 getHeadElement());
769 final int numberOfChildren = suitableChildren.size();
770
771 if (numberOfChildren >= 1) {
772
773 final Element rightMostChild =
774 (Element) suitableChildren.get(numberOfChildren - 1);
775 final String rHeadValue = determineHeadValue(rightMostChild);
776
777
778 if (rHeadValue.length() > 0) {
779 values.add(LocalFeature.createCalculatedFeature(
780 elementName, "rhead", rHeadValue));
781 }
782
783 if (numberOfChildren >= 2) {
784
785 final Element leftMostChild =
786 (Element) suitableChildren.get(0);
787 final String lHeadValue = determineHeadValue(leftMostChild);
788
789
790 if (lHeadValue.length() > 0) {
791 values.add(LocalFeature.createCalculatedFeature(
792 elementName, "lhead", lHeadValue));
793 }
794 }
795 }
796 }
797 }
798
799 /***
800 * Calculates values that depend on the position of an element within its
801 * parent.
802 *
803 * @param elementName the name of the element to process, as returned by
804 * {@link DOMUtils#name(Element)}
805 * @param position wraps the position of the element within its parent
806 * element and related data, must not be <code>null</code>
807 * @param values a list to add the calculated values to
808 */
809 protected void calculatePositionalValues(final String elementName,
810 final ElementPosition position, final List<LocalFeature> values) {
811
812 values.add(LocalFeature.createCalculatedFeature(elementName, "pos",
813 Integer.toString(position.getOverallPosition())));
814
815
816 values.add(LocalFeature.createCalculatedFeature(elementName,
817 "typedPos", Integer.toString(position.getTypedPosition())));
818
819
820 values.add(LocalFeature.createCalculatedFeature(elementName,
821 "roughPos", determineRoughPosition(position.getOverallPosition(),
822 position.getAllChildren())));
823 values.add(LocalFeature.createCalculatedFeature(elementName,
824 "roughTypedPos", determineRoughPosition(position.getTypedPosition(),
825 position.getTypedChildren())));
826 }
827
828 /***
829 * Calculates values that depend on the textual content of an element:
830 * prefixes, suffixes, length data, and token type. Also includes the text
831 * itself.
832 *
833 * @param elementName the name of the element to process, as returned by
834 * {@link DOMUtils#name(Element)}
835 * @param trimmedText the trimmed textual content of the element to
836 * process, must not be empty
837 * @param values a list of {@link LocalFeature}s to add the calculated
838 * values to
839 * @throws IllegalArgumentException if the empty string was given as
840 * <code>trimmedText</code>
841 */
842 protected void calculateValuesFromText(final String elementName,
843 final String trimmedText, final List<LocalFeature> values)
844 throws IllegalArgumentException {
845 final int textLength = trimmedText.length();
846 if (textLength == 0) {
847 throw new IllegalArgumentException(
848 "Specifed trimmed text must not be empty!");
849 }
850
851
852 final String[] tokens = TextUtils.splitString(trimmedText, -1);
853 final String firstTokenLC = tokens[0].toLowerCase();
854 final String lastTokenLC;
855
856 if (tokens.length == 1) {
857
858 lastTokenLC = firstTokenLC;
859 } else {
860 lastTokenLC = tokens[tokens.length - 1].toLowerCase();
861 }
862
863
864
865 final int maxPrefixLength = Math.min(maximumPrefixLength,
866 firstTokenLC.length() - 1);
867 for (int i = 1; i <= maxPrefixLength; i++) {
868 values.add(LocalFeature.createCalculatedFeature(elementName,
869 "prefix" + i, firstTokenLC.substring(0, i)));
870 }
871
872
873
874 final int maxSuffixLength = Math.min(maximumPrefixLength,
875 lastTokenLC.length() - 1);
876 for (int i = 1; i <= maxSuffixLength; i++) {
877 values.add(LocalFeature.createCalculatedFeature(elementName,
878 "suffix" + i, lastTokenLC.substring(lastTokenLC.length() - i)));
879 }
880
881
882 values.add(LocalFeature.createCalculatedFeature(elementName, "length",
883 Integer.toString(textLength)));
884
885
886 final long sqrtLength = Math.round(Math.sqrt(textLength));
887 values.add(LocalFeature.createCalculatedFeature(elementName,
888 "sqrtLength", Long.toString(sqrtLength)));
889
890
891 final Set typeSet = TOKEN_TYPE_PATTERNS.keySet();
892 final Iterator typeIter = typeSet.iterator();
893 String currentDesc;
894 Pattern currentPattern;
895 Matcher currentMatcher;
896 boolean foundMatch = false;
897 String matchingType = "mixed";
898
899
900 while (typeIter.hasNext() && !foundMatch) {
901 currentDesc = (String) typeIter.next();
902 currentPattern = (Pattern) TOKEN_TYPE_PATTERNS.get(currentDesc);
903 currentMatcher = currentPattern.matcher(tokens[tokens.length - 1]);
904
905 if (currentMatcher.matches()) {
906 foundMatch = true;
907 matchingType = currentDesc;
908 }
909 }
910
911
912 final String fullType = (tokens.length > 1)
913 ? matchingType + "+multi" : matchingType;
914 values.add(LocalFeature.createCalculatedFeature(elementName,
915 "tokenType", fullType));
916
917
918 KeyValue[] sensorData;
919 String prefix;
920 int j;
921
922 for (int i = 0; i < sensors.length; i++) {
923
924
925 prefix = i + FeatureType.CALCULATED.getMark();
926 sensorData = sensors[i].lookup(trimmedText);
927
928 for (j = 0; j < sensorData.length; j++) {
929 values.add(LocalFeature.createCalculatedFeature(elementName,
930 prefix + sensorData[j].getKey(),
931 Util.asString(sensorData[j].getValue())));
932 }
933 }
934
935
936 values.addAll(LocalFeature.createCalculatedFeatures(elementName, "lc",
937 trimmedText.toLowerCase(), getSplitMaximum()));
938 values.addAll(LocalFeature.createTextFeatures(elementName, trimmedText,
939 getSplitMaximum()));
940 }
941
942 /***
943 * Helper method for determining the head value for an element of type
944 * {@link #getHeadElement()}. See
945 * {@link #calculateHeadValues(Element, List)} for a description of the
946 * algorithm.
947 *
948 * @param element the element to process, must be of type
949 * {@link #getHeadElement()}
950 * @return the calculated head value, or the empty string if no value
951 * could be calculated (the specified element doesn't have any children,
952 * or the right-most child contains neither {@link #getHeadAttribute()} nor
953 * textual content)
954 */
955 protected String determineHeadValue(final Element element) {
956 String fullResult = null;
957 final List children = element.elements();
958 final ListIterator childIter = children.listIterator(children.size());
959 Element currentChild;
960 Attribute headAttrib;
961 boolean done = false;
962
963
964
965 while (childIter.hasPrevious() && !done) {
966 currentChild = (Element) childIter.previous();
967 headAttrib = DOMUtils.attributeByName(currentChild,
968 getHeadAttribute());
969
970
971 if (headAttrib != null) {
972 fullResult = headAttrib.getText();
973 } else {
974 fullResult = element.getTextTrim();
975 }
976
977
978 done = (fullResult != null) && (fullResult.length() > 0);
979 }
980
981 final String result;
982 if (fullResult != null) {
983
984 final String[] splits = TextUtils.splitString(fullResult, 1);
985 result = splits[0];
986 } else {
987 result = "";
988 }
989 return result;
990 }
991
992 /***
993 * Helper method called by
994 * {@link #calculatePositionalValues(String, ElementPosition, List)} to
995 * collapse a position in to one of five values.
996 * <dl>
997 * <dt>first</dt>
998 * <dd>for the first element</dd>
999 * <dt>early</dt>
1000 * <dd>if position is within the first third of all elements (but not the
1001 * first one), upper limit included</dd>
1002 * <dt>middle</dt>
1003 * <dd>if position is within the second third of all elements, limits
1004 * excluded</dd>
1005 * <dt>late</dt>
1006 * <dd>if position is within the last third of all elements (but not the
1007 * last one), lower limit included</dd>
1008 * <dt>last</dt>
1009 * <dd>for the last element</dd>
1010 * </dl>
1011 *
1012 * @param position the position counted from 0, should be non-negative and
1013 * smaller than <code>elementCount</code> (otherwise the results are
1014 * undefined)
1015 * @param elementCount the number of elements
1016 * @return a String containing one of the five values described above
1017 */
1018 protected String determineRoughPosition(final int position,
1019 final int elementCount) {
1020
1021 final int realPosition = position + 1;
1022 final String result;
1023
1024 if (realPosition == 1) {
1025 result = "first";
1026 } else if (realPosition == elementCount) {
1027 result = "last";
1028 } else {
1029 final double oneThird = elementCount / 3.0;
1030 if (realPosition <= oneThird) {
1031 result = "early";
1032 } else if (realPosition >= (2 * oneThird)) {
1033 result = "late";
1034 } else {
1035 result = "middle";
1036 }
1037 }
1038 return result;
1039 }
1040
1041 /***
1042 * Creates a filtered view of a context representation. A filtered
1043 * representation is created for each {@linkplain FeatureType type of
1044 * features} that occurs at least twice;
1045 * attributes and calculated values occuring only once are omitted.
1046 * For textual content, another filtered representation is created.
1047 *
1048 * <p>Features representing markers ({@link FeatureType#MARKER}),
1049 * stand-alone elements ({@link FeatureType#ELEMENT}) and default attributes
1050 * ({@link #getDefaultAttributes()}) are included in all filtered
1051 * representations. Comment-only features are ignored.
1052 *
1053 * @param originalRep a feature vector containing the representation
1054 * to filter
1055 * @return a list of {@link de.fu_berlin.ties.classify.feature.Feature}s
1056 * combining the filtered representations created by this method
1057 */
1058 protected List<Feature> filterRepresentation(
1059 final FeatureVector originalRep) {
1060
1061
1062 final List<Feature> universalFeatures = new LinkedList<Feature>();
1063
1064 final LinkedList<Feature> textualFeatures = new LinkedList<Feature>();
1065
1066
1067 final MultiMap featureMap = new MultiHashMap();
1068
1069 final SortedBag featureCount = new TreeBag();
1070
1071 final Iterator iter = originalRep.iterator();
1072 GlobalFeature currentGlobal;
1073 LocalFeature currentLocal;
1074 FeatureType currentType;
1075 Iterator keyIter;
1076 String currentKey;
1077 Iterator<Feature> universalIter;
1078 Iterator innerIter;
1079 Feature innerFeature;
1080
1081
1082 while (iter.hasNext()) {
1083 currentGlobal = (GlobalFeature) iter.next();
1084 currentLocal = currentGlobal.getLocalFeature();
1085
1086 if (currentLocal != null) {
1087 currentType = currentLocal.getType();
1088 if (((currentType == FeatureType.ATTRIBUTE)
1089 && (getDefaultAttributes().contains(
1090 currentLocal.getName())))
1091 || (currentType == FeatureType.MARKER)) {
1092
1093 universalFeatures.add(currentGlobal);
1094 textualFeatures.add(currentGlobal);
1095 keyIter = featureMap.keySet().iterator();
1096 while (keyIter.hasNext()) {
1097 currentKey = (String) keyIter.next();
1098 featureMap.put(currentKey, currentGlobal);
1099 }
1100
1101 } else if ((currentType == FeatureType.ATTRIBUTE)
1102 || (currentType == FeatureType.ELEMENT)
1103 || ((currentType == FeatureType.CALCULATED)
1104 && currentLocal.getName().endsWith("head"))) {
1105
1106 currentKey = currentType.getMark();
1107
1108 if (!featureMap.containsKey(currentKey)) {
1109
1110 universalIter = universalFeatures.iterator();
1111 while (universalIter.hasNext()) {
1112 featureMap.put(currentKey, universalIter.next());
1113 }
1114 }
1115
1116
1117 featureMap.put(currentKey, currentGlobal);
1118 featureCount.add(currentKey);
1119 } else if (currentType == FeatureType.TEXT) {
1120
1121 textualFeatures.add(currentGlobal);
1122 }
1123
1124 }
1125 }
1126
1127
1128 final List<Feature> result = new LinkedList<Feature>();
1129 final Iterator sortedIter = featureCount.uniqueSet().iterator();
1130 Collection currentView;
1131
1132
1133 while (sortedIter.hasNext()) {
1134 currentKey = (String) sortedIter.next();
1135
1136
1137 if (featureCount.getCount(currentKey) > 2) {
1138 result.add(new GlobalFeature("Filtered view for " + currentKey
1139 + " features"));
1140 currentView = (Collection) featureMap.get(currentKey);
1141 innerIter = currentView.iterator();
1142
1143 while (innerIter.hasNext()) {
1144 innerFeature = (Feature) innerIter.next();
1145 result.add(innerFeature);
1146 }
1147 }
1148 }
1149
1150
1151 result.add(new GlobalFeature("Filtered view for textual features"));
1152 result.addAll(textualFeatures);
1153
1154 return result;
1155 }
1156
1157 /***
1158 * Returns the maximum number of ancestors to include in the context
1159 * representation.
1160 *
1161 * @return the number of ancestors to include
1162 */
1163 public int getAncestorNumber() {
1164 return ancestorNumber;
1165 }
1166
1167 /***
1168 * Returns the unmodifiable set of names of default attributes. The
1169 * contained names are Strings in a format compatible to
1170 * {@link DOMUtils#name(Attribute)}.
1171 *
1172 * @return the set of default attributes
1173 */
1174 public Set getDefaultAttributes() {
1175 return defaultAttributes;
1176 }
1177
1178 /***
1179 * Returns the number of preceding recognitions to represent in detail.
1180 * @return the value of the attribute
1181 */
1182 public int getDetailedRecognitions() {
1183 return detailedRecognitions;
1184 }
1185
1186 /***
1187 * Returns the name of the element to use for calculating head values.
1188 *
1189 * @return the name of the attribute
1190 * @see #calculateHeadValues(Element, List)
1191 */
1192 public String getHeadAttribute() {
1193 return headAttribute;
1194 }
1195
1196 /***
1197 * Returns the name of the attribute to use for calculating head values.
1198 *
1199 * @return the name of the element
1200 * @see #calculateHeadValues(Element, List)
1201 */
1202 public String getHeadElement() {
1203 return headElement;
1204 }
1205
1206 /***
1207 * Returns the basic number of preceding and following siblings to include
1208 * in the context representation.
1209 *
1210 * @return the number of siblings to include
1211 */
1212 public int getSiblingNumber() {
1213 return siblingNumber;
1214 }
1215
1216 /***
1217 * Handles ancestors and ancestor siblings of an element.
1218 *
1219 * @param element the element to process
1220 * @param ancestorsToAdd the number of ancestors to add, must be > 0;
1221 * if > 1, this method calls itself recursively, decreasing the number
1222 * by 1
1223 * @param ancestorSiblingsToAdd the number of preceding/following siblings
1224 * of the current ancestors to add; if 0 or negative, no siblings are added;
1225 * a recursive call decreases this parameter by 1 if any ancestor siblings
1226 * were found (if no siblings were found, the number passes unchanged)
1227 * @param ancestorFeatures the list of {@link GlobalFeature}s to prepend
1228 * the features on the ancestors to
1229 * @param ancestorSiblingFeatures the list of {@link GlobalFeature}s to
1230 * append the features on the ancestors siblings to
1231 * @param processedAncestorNames a bag that will typically be empty when
1232 * first calling this method (will be filled by recursive calls)
1233 * @param cache a cache of local feature, mapping {@link Element}s to
1234 * {@link List}s of {@link LocalFeature}
1235 * @throws IllegalArgumentException if <code>ancestorsToAdd</code> is 0 or
1236 * negative
1237 */
1238 protected void handleAncestors(final Element element,
1239 final int ancestorsToAdd, final int ancestorSiblingsToAdd,
1240 final LinkedList<Feature> ancestorFeatures,
1241 final LinkedList<Feature> ancestorSiblingFeatures,
1242 final Bag processedAncestorNames,
1243 final Map<Element, List<LocalFeature>> cache)
1244 throws IllegalArgumentException {
1245 if (ancestorsToAdd <= 0) {
1246 throw new IllegalArgumentException(
1247 "Cannot add a non-positive number of ancestors ("
1248 + ancestorsToAdd + ")!");
1249 }
1250
1251
1252 final Element parent = element.getParent();
1253 if (parent != null) {
1254
1255 final ElementPosition position;
1256
1257
1258 if (ancestorSiblingsToAdd > 0) {
1259
1260 final String parentName = DOMUtils.name(parent);
1261 processedAncestorNames.add(parentName);
1262 final int typeCount
1263 = processedAncestorNames.getCount(parentName);
1264 final String nameToQuote;
1265
1266 if (typeCount > 2) {
1267
1268
1269 nameToQuote = Integer.toString(typeCount) + parentName;
1270 } else {
1271
1272 nameToQuote = parentName;
1273 }
1274 final String axisPrefix =
1275 AXIS_ANCESTOR + LocalFeature.quote(nameToQuote);
1276
1277
1278
1279 final LinkedList<Feature> siblingFeatures =
1280 new LinkedList<Feature>();
1281 position = handleSiblings(axisPrefix, parent,
1282 ancestorSiblingsToAdd, siblingFeatures, siblingFeatures,
1283 cache);
1284 ancestorSiblingFeatures.addAll(siblingFeatures);
1285 } else {
1286 position = null;
1287 }
1288
1289
1290 buildFeatures(AXIS_ANCESTOR, parent, position, false,
1291 ancestorFeatures, false, cache);
1292
1293
1294 if (ancestorsToAdd > 1) {
1295 final int parentSilbingsToAdd;
1296
1297
1298 if ((position != null) && (position.getProcessedPreceding()
1299 + position.getProcessedPreceding() > 0)) {
1300
1301 parentSilbingsToAdd = ancestorSiblingsToAdd - 1;
1302 } else {
1303
1304 parentSilbingsToAdd = ancestorSiblingsToAdd;
1305 }
1306
1307 handleAncestors(parent, ancestorsToAdd - 1 ,
1308 parentSilbingsToAdd, ancestorFeatures,
1309 ancestorSiblingFeatures, processedAncestorNames, cache);
1310 }
1311 }
1312 }
1313
1314 /***
1315 * Adds the preceding and following siblings of an element.
1316 *
1317 * @param axisPrefix the prefix of the axis name, used to start each
1318 * feature; specify the empty string if no prefix should be used
1319 * @param element the element to process
1320 * @param baseNumber the basic number of siblings to keep; the actual
1321 * number of siblings kept might vary
1322 * @param precedingFeatures the list of {@link GlobalFeature}s to prepend
1323 * the features on the preceding siblings to
1324 * @param followingFeatures the list of {@link GlobalFeature}s to append
1325 * the features on the following siblings to
1326 * @param cache a cache of local feature, mapping {@link Element}s to
1327 * {@link List}s of {@link LocalFeature}
1328 * @return a wrapper of the position of the element with its parent element
1329 * and related data, required by
1330 * {@link #calculatePositionalValues(String, ElementPosition, List)}, or
1331 * <code>null</code> if there is no parent element
1332 */
1333 protected ElementPosition handleSiblings(final String axisPrefix,
1334 final Element element, final int baseNumber,
1335 final LinkedList<Feature> precedingFeatures,
1336 final LinkedList<Feature> followingFeatures,
1337 final Map<Element, List<LocalFeature>> cache) {
1338 final Element parent = element.getParent();
1339 final String name = DOMUtils.name(element);
1340 final ElementPosition result;
1341
1342 if (parent != null) {
1343
1344 final Iterator childIter = parent.elementIterator();
1345 Element currentChild;
1346 boolean foundElement = false;
1347 LinkedList<Element> allPrecedingSiblings =
1348 new LinkedList<Element>();
1349 LinkedList<Element> allFollowingSiblings =
1350 new LinkedList<Element>();
1351
1352
1353 int allChildren = 0;
1354 int overallPosition = 0;
1355 int typedChildren = 0;
1356 int typedPosition = 0;
1357 boolean sameType;
1358
1359 while (childIter.hasNext()) {
1360 currentChild = (Element) childIter.next();
1361
1362
1363 allChildren++;
1364 if (name.equals(DOMUtils.name(currentChild))) {
1365 sameType = true;
1366 typedChildren++;
1367 } else {
1368 sameType = false;
1369 }
1370
1371 if (currentChild == element) {
1372 foundElement = true;
1373 } else {
1374 if (!foundElement) {
1375
1376 allPrecedingSiblings.addLast(currentChild);
1377
1378
1379 overallPosition++;
1380 if (sameType) {
1381 typedPosition++;
1382 }
1383 } else {
1384
1385 allFollowingSiblings.addLast(currentChild);
1386 }
1387 }
1388 }
1389
1390
1391 final List selectedPrecedingSiblings = selectPrecedingSiblings(
1392 element, allPrecedingSiblings, baseNumber);
1393 final List selectedFollowingSiblings = selectFollowingSiblings(
1394 element, allFollowingSiblings, baseNumber);
1395
1396
1397 result = new ElementPosition(allChildren, overallPosition,
1398 typedChildren, typedPosition, selectedPrecedingSiblings.size(),
1399 selectedFollowingSiblings.size());
1400
1401
1402 final StringBuffer precedingAxisName = new StringBuffer(axisPrefix);
1403 final StringBuffer followingAxisName = new StringBuffer(axisPrefix);
1404 final String prefixComment;
1405 if (axisPrefix.length() > 0) {
1406 prefixComment = " with '" + axisPrefix + "' prefix";
1407 precedingAxisName.append(GlobalFeature.SEP);
1408 followingAxisName.append(GlobalFeature.SEP);
1409 } else {
1410 prefixComment = "";
1411 }
1412 precedingAxisName.append(AXIS_PREC_SIBLING);
1413 followingAxisName.append(AXIS_FOLLOW_SIBLING);
1414
1415 ListIterator siblingIter;
1416 Element currentSibling;
1417
1418
1419 siblingIter = selectedPrecedingSiblings.listIterator(
1420 selectedPrecedingSiblings.size());
1421 while (siblingIter.hasPrevious()) {
1422 currentSibling = (Element) siblingIter.previous();
1423 buildFeatures(precedingAxisName.toString(), currentSibling,
1424 null, false, precedingFeatures, false, cache);
1425 }
1426
1427 precedingFeatures.addFirst(
1428 new GlobalFeature(precedingAxisName.toString(),
1429 LocalFeature.MARKER_START));
1430 precedingFeatures.addFirst(new GlobalFeature(
1431 "Preceding siblings axis" + prefixComment));
1432
1433
1434 followingFeatures.addLast(new GlobalFeature(
1435 "Following siblings axis" + prefixComment));
1436 followingFeatures.addLast(
1437 new GlobalFeature(followingAxisName.toString(),
1438 LocalFeature.MARKER_START));
1439
1440 siblingIter = selectedFollowingSiblings.listIterator();
1441 while (siblingIter.hasNext()) {
1442 currentSibling = (Element) siblingIter.next();
1443 buildFeatures(followingAxisName.toString(), currentSibling,
1444 null, false, followingFeatures, true, cache);
1445 }
1446 } else {
1447
1448 result = null;
1449 }
1450 return result;
1451 }
1452
1453 /***
1454 * Modifies a list of {@link GlobalFeature}s to remove extraneous
1455 * {@link FeatureType#MARKER} features. Keeps only the last one of several
1456 * sequential marker features; trailing marker features are removed as well.
1457 *
1458 * @param features the list of features to modify
1459 */
1460 protected void removeExtraMarkers(final List features) {
1461
1462 final ListIterator backwardIter =
1463 features.listIterator(features.size());
1464 GlobalFeature currentGlobal;
1465 LocalFeature currentLocal;
1466 FeatureType currentType;
1467
1468
1469 FeatureType lastType = FeatureType.MARKER;
1470
1471 while (backwardIter.hasPrevious()) {
1472 currentGlobal = (GlobalFeature) backwardIter.previous();
1473 currentLocal = currentGlobal.getLocalFeature();
1474
1475 if (currentLocal != null) {
1476 currentType = currentLocal.getType();
1477
1478 if ((currentType == FeatureType.MARKER)
1479 && (lastType == FeatureType.MARKER)) {
1480
1481 backwardIter.remove();
1482 }
1483
1484 lastType = currentType;
1485 }
1486 }
1487 }
1488
1489 /***
1490 * Selects the siblings to keep among all following siblings. This
1491 * implementation always keeps the <code>baseNumber</code> first siblings.
1492 * One of the selected siblings may have a different type (name as returned
1493 * by {@link DOMUtils#name(Element)}) than the main element --
1494 * if there are more with different types, they are skipped.
1495 *
1496 * @param mainElement the element whose siblings should be selected
1497 * @param allFollowingSiblings the list of all following siblings
1498 * @param baseNumber the basic number of siblings to keep; the actual
1499 * number of siblings kept might vary
1500 * @return the subset of following siblings to keep; an empty list if
1501 * there are no siblings to keep
1502 */
1503 protected List<Element> selectFollowingSiblings(final Element mainElement,
1504 final LinkedList<Element> allFollowingSiblings,
1505 final int baseNumber) {
1506 final String elementType = DOMUtils.name(mainElement);
1507 final LinkedList<Element> selectedFollowingSiblings =
1508 new LinkedList<Element>();
1509 final Iterator<Element> siblingIter = allFollowingSiblings.iterator();
1510 Element currentSibling;
1511 boolean addedSiblingOfDifferentType = false;
1512 boolean doAdd;
1513
1514
1515 while (siblingIter.hasNext()
1516 && (selectedFollowingSiblings.size() < baseNumber)) {
1517 currentSibling = siblingIter.next();
1518
1519 if (elementType.equals(DOMUtils.name(currentSibling))) {
1520 doAdd = true;
1521 } else {
1522
1523 doAdd = !addedSiblingOfDifferentType;
1524 addedSiblingOfDifferentType = true;
1525 }
1526
1527 if (doAdd) {
1528 selectedFollowingSiblings.addLast(currentSibling);
1529 }
1530 }
1531
1532 return selectedFollowingSiblings;
1533 }
1534
1535 /***
1536 * Selects the siblings to keep among all preceding siblings. This
1537 * implementation always keeps the first sibling and the
1538 * <code>baseNumber</code> last siblings. One of the last siblings may
1539 * have a different type (name as returned by
1540 * {@link DOMUtils#name(Element)}) than the main element --
1541 * if there are more with different types, they are skipped.
1542 * If none of the <code>baseNumber</code> last siblings has a different
1543 * types, the last siblings with a different type (name) is also kept.
1544 *
1545 * @param mainElement the element whose siblings should be selected
1546 * @param allPrecedingSiblings the list of all preceding siblings
1547 * @param baseNumber the basic number of siblings to keep; the actual
1548 * number of siblings kept might vary
1549 * @return the subset of preceding siblings to keep; an empty list if
1550 * there are no siblings to keep
1551 */
1552 protected List<Element> selectPrecedingSiblings(final Element mainElement,
1553 final LinkedList<Element> allPrecedingSiblings,
1554 final int baseNumber) {
1555 final String elementType = DOMUtils.name(mainElement);
1556 final LinkedList<Element> selectedPrecedingSiblings =
1557 new LinkedList<Element>();
1558 final ListIterator<Element> siblingIter =
1559 allPrecedingSiblings.listIterator(allPrecedingSiblings.size());
1560 Element currentSibling;
1561 boolean addedSiblingOfDifferentType = false;
1562 boolean doAdd;
1563
1564
1565 while (siblingIter.hasPrevious()
1566 && (selectedPrecedingSiblings.size() < baseNumber)) {
1567 currentSibling = siblingIter.previous();
1568
1569 if (elementType.equals(DOMUtils.name(currentSibling))) {
1570 doAdd = true;
1571 } else {
1572
1573 doAdd = !addedSiblingOfDifferentType;
1574 addedSiblingOfDifferentType = true;
1575 }
1576
1577 if (doAdd) {
1578 selectedPrecedingSiblings.addFirst(currentSibling);
1579 }
1580 }
1581
1582
1583 if (!allPrecedingSiblings.isEmpty()) {
1584 final Element firstSibling = allPrecedingSiblings.getFirst();
1585 final Element firstSelected = selectedPrecedingSiblings.getFirst();
1586
1587
1588 if (firstSelected != firstSibling) {
1589 selectedPrecedingSiblings.addFirst(firstSibling);
1590 }
1591 }
1592
1593 return selectedPrecedingSiblings;
1594 }
1595
1596 /***
1597 * Returns a string representation of this object.
1598 *
1599 * @return a textual representation
1600 */
1601 public String toString() {
1602 return new ToStringBuilder(this)
1603 .appendSuper(super.toString())
1604 .append("detailed recognitions", detailedRecognitions)
1605 .append("number of ancestors", ancestorNumber)
1606 .append("number of siblings", siblingNumber)
1607 .append("maximum prefix length", maximumPrefixLength)
1608 .append("head element", headElement)
1609 .append("head attribute", headAttribute)
1610 .append("default attributes", defaultAttributes)
1611 .append("sensors", ArrayUtils.toString(sensors))
1612 .toString();
1613 }
1614
1615 }