View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.context;
23  
24  import java.util.Collection;
25  import java.util.Collections;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.LinkedList;
29  import java.util.List;
30  import java.util.ListIterator;
31  import java.util.Map;
32  import java.util.Set;
33  import java.util.SortedMap;
34  import java.util.TreeMap;
35  import java.util.regex.Matcher;
36  import java.util.regex.Pattern;
37  
38  import org.apache.commons.collections.Bag;
39  import org.apache.commons.collections.bag.HashBag;
40  import org.apache.commons.collections.KeyValue;
41  import org.apache.commons.collections.MultiHashMap;
42  import org.apache.commons.collections.MultiMap;
43  import org.apache.commons.collections.SortedBag;
44  import org.apache.commons.collections.bag.TreeBag;
45  import org.apache.commons.collections.map.LinkedMap;
46  import org.apache.commons.lang.ArrayUtils;
47  import org.apache.commons.lang.StringUtils;
48  import org.apache.commons.lang.builder.ToStringBuilder;
49  import org.dom4j.Attribute;
50  import org.dom4j.Element;
51  
52  import de.fu_berlin.ties.ProcessingException;
53  import de.fu_berlin.ties.TiesConfiguration;
54  import de.fu_berlin.ties.xml.dom.DOMUtils;
55  import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
56  import de.fu_berlin.ties.classify.feature.Feature;
57  import de.fu_berlin.ties.classify.feature.FeatureVector;
58  import de.fu_berlin.ties.context.sensor.BaseSensor;
59  import de.fu_berlin.ties.context.sensor.Sensor;
60  import de.fu_berlin.ties.io.IOUtils;
61  import de.fu_berlin.ties.text.TextUtils;
62  import de.fu_berlin.ties.util.Util;
63  
64  /***
65   * The context representation used by default. This class is thread-safe.
66   *
67   * @author Christian Siefkes
68   * @version $Revision: 1.60 $, $Date: 2004/10/12 15:55:47 $, $Author: siefkes $
69   */
70  public class DefaultRepresentation extends AbstractRepresentation {
71  
72      /***
73       * Ancestor axis.
74       */
75      protected static final String AXIS_ANCESTOR = "an";
76  
77      /***
78       * Descendant-or-self axis.
79       */
80      protected static final String AXIS_DESC_OR_SELF = "ds";
81  
82      /***
83       * Following sibling axis.
84       */
85      protected static final String AXIS_FOLLOW_SIBLING = "fs";
86  
87      /***
88       * Preceeding sibling axis.
89       */
90      protected static final String AXIS_PREC_SIBLING = "ps";
91  
92      /***
93       * The pseudo-axis of prior recognitions.
94       */
95      protected static final String AXIS_PRIOR = "prior";
96  
97      /***
98       * A sequence map mapping used by
99       * {@link #calculateValuesFromText(String, String, List)} to determine the
100      * "tokenType" value. Maps description strings to {@link Pattern}s to
101      * match. The description string for the first matching pattern is used.
102      * Initialized in the static constuctor. The value "mixed" is reserved for
103      * tokens that are not matched by any of the patterns.
104      */
105     protected static final LinkedMap TOKEN_TYPE_PATTERNS;
106 
107     /***
108      * Static constructor: initializes the map of {@link #TOKEN_TYPE_PATTERNS}.
109      */
110     static {
111         // some constants for defining the patterns:
112         // lowercase or modifier letter
113         final String lowercaseLetter = "[//p{Ll}//p{Lm}]";
114         // uppercase, titlecase, or modifier letter
115         final String uppercaseLetter = "[//p{Lu}//p{Lt}//p{Lm}]";
116         // any Unicode letter or mark
117         final String anyLetter = "[//p{L}//p{M}]";
118 
119         TOKEN_TYPE_PATTERNS = new LinkedMap();
120 
121         // different kinds of words (letters):
122         TOKEN_TYPE_PATTERNS.put("lower",
123             Pattern.compile(lowercaseLetter + "+"));
124         TOKEN_TYPE_PATTERNS.put("CAPS",
125             Pattern.compile(uppercaseLetter + "+"));
126         // lowercase letters after a single uppercase or titlecase letter
127         TOKEN_TYPE_PATTERNS.put("Cap",
128             Pattern.compile(uppercaseLetter + lowercaseLetter + "+"));
129         TOKEN_TYPE_PATTERNS.put("mixedCase", Pattern.compile(anyLetter + "+"));
130         // letters + trailing dot
131         TOKEN_TYPE_PATTERNS.put("CAPSDot",
132             Pattern.compile(uppercaseLetter + "+//."));
133         TOKEN_TYPE_PATTERNS.put("alphaDot",
134             Pattern.compile(anyLetter + "+//."));
135         // letters (incl. marks) + dashes ("-" etc.)
136         TOKEN_TYPE_PATTERNS.put("alpha+Punc",
137             Pattern.compile("[//p{L}//p{M}//p{Pd}]+//."));
138 
139         // different kinds of punctation
140         // opening and closing puncuation
141         TOKEN_TYPE_PATTERNS.put("openPunc", Pattern.compile("//p{Ps}+"));
142         TOKEN_TYPE_PATTERNS.put("closePunc", Pattern.compile("//p{Pe}+"));
143         /* not used -- unknown in JDK 1.4.2
144         // puncation for opening or closing quotes
145         TOKEN_TYPE_PATTERNS.put("initialQuote", Pattern.compile("//p{Pi}+"));
146         TOKEN_TYPE_PATTERNS.put("finalQuote", Pattern.compile("//p{Pf}+"));
147         */
148         // any punctuation
149         TOKEN_TYPE_PATTERNS.put("punc", Pattern.compile("//p{P}+"));
150 
151         // different kinds of numbers
152         // numbers cannot start with 0 (a single 0 is ok)
153         TOKEN_TYPE_PATTERNS.put("num", Pattern.compile("(?:0|[1-9]//d*)"));
154         // digits can start with '0' and can also contain other Unicode digits
155         TOKEN_TYPE_PATTERNS.put("digits", Pattern.compile("//p{N}+"));
156         TOKEN_TYPE_PATTERNS.put("digits+.", Pattern.compile("[//p{N}.]+"));
157         TOKEN_TYPE_PATTERNS.put("digits+.,",
158             Pattern.compile("[//p{N}.,]+"));
159         //TOKEN_TYPE_PATTERNS.put("digits+:", Pattern.compile("[//p{N}:]+"));
160 
161         // alphanumeric also includes dot + comma + dashes
162         TOKEN_TYPE_PATTERNS.put("alphanum",
163             Pattern.compile("[//p{L}//p{M}//p{N}//p{Pd}.,]+"));
164 
165         // different kinds of symbols
166         // mathematical and currency symbols
167         TOKEN_TYPE_PATTERNS.put("math", Pattern.compile("//p{Sm}+"));
168         TOKEN_TYPE_PATTERNS.put("curr", Pattern.compile("//p{Sc}+"));
169         // any symbols
170         TOKEN_TYPE_PATTERNS.put("symb", Pattern.compile("//p{S}+"));
171 
172         // control and other characters
173         TOKEN_TYPE_PATTERNS.put("ctrl", Pattern.compile("//p{C}+"));
174     }
175 
176     /***
177      * The maximum number of ancestors to include in the context representation.
178      */
179     private final int ancestorNumber;
180 
181     /***
182      * The name of the attribute to use for calculating head values,
183      * cf. {@link #calculateHeadValues(Element, List)}. The name must be
184      * compatible to {@link DOMUtils#name(Attribute)}.
185      */
186     private final String headAttribute;
187 
188     /***
189      * The name of the element to use for calculating head values,
190      * cf. {@link #calculateHeadValues(Element, List)}. The name must be
191      * compatible to {@link DOMUtils#name(Element)}.
192      */
193     private final String headElement;
194 
195     /***
196      * An unmodifiable set of names of default attributes. Contains Strings
197      * in a format compatible to {@link DOMUtils#name(Attribute)}.
198      */
199     private final Set defaultAttributes;
200 
201     /***
202      * The number of preceding recognitions to represent in detail.
203      */
204     private final int detailedRecognitions;
205 
206     /***
207      * The basic number of preceding and following siblings to include in the
208      * context representation.
209      */
210     private final int siblingNumber;
211 
212     /***
213      * The maximum length of prefixed and suffixes.
214      */
215     private final int maximumPrefixLength;
216 
217     /***
218      * An array of sensors used to add semantic information.
219      */
220     private final Sensor[] sensors;
221 
222     /***
223      * Creates a new instance based on the
224      * {@linkplain TiesConfiguration#CONF standard configuration}.
225      *
226      * @throws ProcessingException if an error occurs while initializing this
227      * instance
228      */
229     public DefaultRepresentation() throws ProcessingException {
230         this(TiesConfiguration.CONF);
231     }
232 
233     /***
234      * Creates a new instance based on the provided configuration.
235      *
236      * @param config used to configure this instance
237      * @throws ProcessingException if an error occurs while initializing this
238      * instance
239      */
240     public DefaultRepresentation(final TiesConfiguration config)
241     throws ProcessingException {
242         this(config.getInt(CONFIG_RECOGN_NUM),
243             config.getInt("representation.recogn.detailed"),
244             config.getInt("representation.ancestor.num"),
245             config.getInt("representation.sibling.num"),
246             config.getInt(CONFIG_SPLIT_MAXIMUM),
247             config.getInt("representation.prefix.maxlength"),
248             config.getString("representation.head.element"),
249             config.getString("representation.head.attrib"),
250             config.getStringArray("representation.default.attribs"),
251             config.getInt(CONFIG_STORE_NTH),
252             config.getString(IOUtils.KEY_LOCAL_CHARSET, null),
253             config.getStringArray("representation.sensors"),
254             config);
255     }
256 
257     /***
258      * Creates a new instance. All element/attribute names must be compatible
259      * to {@link DOMUtils#name(Element)}.
260      *
261      * @param recogNum the number of preceding recognitions to represent
262      * @param detailedRecogs the number of preceding recognitions to represent
263      * in detail
264      * @param numberOfAncestors the maximum number of ancestors to include in
265      * the context representation
266      * @param numberOfSiblings the basic number of preceding and following
267      * siblings to include
268      * @param splitMax the maximum number of subsequences to keep when
269      * a feature value must be split (at whitespace).
270      * @param prefixMax the maximum length of prefixed and suffixes
271      * @param headElementName the name of the element to use for
272      * calculating head values (cf. {@link #calculateHeadValues(Element, List)})
273      * @param headAttribName the name of the attribute to use for
274      * calculating head values
275      * @param defaultAttribs array of names of default attributes
276      * @param outCharset the output character set to use (only used to
277      * store some configurations for inspection purposes, if <code>n</code>
278      * &gt; 0); if <code>null</code>, the default charset of the current
279      * platform is used
280      * @param n Each <em>n</em>-th context representation is stored if &gt; 0;
281      * otherwise no representation is stored
282      * @param sensorNames array of fully specified names of classes
283      * implementing the {@link Sensor} interface to be used to look up semantic
284      * information
285      * @param config used to configure the sensors
286      * @throws ProcessingException if an error occurs while initializing this
287      * instance
288      */
289     public DefaultRepresentation(final int recogNum, final int detailedRecogs,
290             final int numberOfAncestors, final int numberOfSiblings,
291             final int splitMax, final int prefixMax,
292             final String headElementName, final String headAttribName,
293             final String[] defaultAttribs, final int n,
294             final String outCharset, final String[] sensorNames,
295             final TiesConfiguration config) throws ProcessingException {
296         super(recogNum, splitMax, n, outCharset);
297         detailedRecognitions = detailedRecogs;
298         ancestorNumber = numberOfAncestors;
299         siblingNumber = numberOfSiblings;
300         maximumPrefixLength = prefixMax;
301 
302         headElement = headElementName;
303         headAttribute = headAttribName;
304 
305         // create unmodifyable set of default attributes
306         final Set<String> defAttribSet = new HashSet<String>();
307         for (int i = 0; i < defaultAttribs.length; i++) {
308             defAttribSet.add(defaultAttribs[i]);
309         }
310         defaultAttributes = Collections.unmodifiableSet(defAttribSet);
311         // initialize sensors
312         sensors = BaseSensor.createSensors(sensorNames, config);
313     }
314 
315     /***
316      * Builds the context representation of text in an element. Returns a
317      * feature vector of all context features considered relevant for
318      * representation.
319      *
320      * @param element the element whose context should be represented
321      * @param leftText textual content to the left of (preceding)
322      * <code>mainText</code>, might be empty
323      * @param mainText the main textual content to represent, might be empty
324      * @param rightText textual content to the right of (following)
325      * <code>mainText</code>, might be empty
326      * @param priorRecognitions a buffer of the last {@link Recognition}s from
327      * the document, created by calling {@link #createRecognitionBuffer()};
328      * might be <code>null</code>
329      * @param featureCache a cache of (local) feature, should be re-used
330      * between all calls for the nodes in a single document (but must not be
331      * re-used when building the context of nodes in different documents!)
332      * @param logPurpose the type of contexts of main interest to the caller
333      * (e.g. "Token" or "Sentence"), used for logging
334      * @return a vector of features considered relevant for representation
335      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
336      * contains objects that aren't {@link Recognition}s
337      * @throws IllegalArgumentException if the specified node is of an
338      * unsupported type
339      */
340     protected FeatureVector doBuildContext(final Element element,
341             final String leftText, final String mainText,
342             final String rightText, final PriorRecognitions priorRecognitions,
343             final Map<Element, List<LocalFeature>> featureCache,
344             final String logPurpose)
345             throws ClassCastException, IllegalArgumentException {
346         LinkedList<Feature> selfFeatures = new LinkedList<Feature>();
347         LinkedList<Feature> precedingFeatures = new LinkedList<Feature>();
348         LinkedList<Feature> followingFeatures = new LinkedList<Feature>();
349         LinkedList<Feature> ancestorFeatures = new LinkedList<Feature>();
350         LinkedList<Feature> ancestorSiblingFeatures = new LinkedList<Feature>();
351         final FeatureVector fullRep = new DefaultFeatureVector();
352 
353         // build pseudo-axis of prior recognitions, if any
354         List priorFeatures;
355         if ((getRecognitionNumber() > 0) && (priorRecognitions != null)) {
356             priorFeatures = buildPrior(priorRecognitions);
357         } else {
358             priorFeatures = null;
359         }
360 
361         // build sibling axes
362         ElementPosition position =
363             handleSiblings("", element, getSiblingNumber(),
364             precedingFeatures, followingFeatures, featureCache);
365 /*           Util.LOG.debug("Build preceding (size: " + precedingFeatures.size()
366                 + ") + followingFeatures (size: + " + followingFeatures.size()
367                 + "; cache size: " + featureCache.size() + ")"); */
368 
369         // build descendant-or-self axis, prepending start marker
370         selfFeatures.add(new GlobalFeature("Descendant-or-self axis"));
371         selfFeatures.add(new GlobalFeature(AXIS_DESC_OR_SELF,
372             LocalFeature.MARKER_START));
373         buildFeatures(AXIS_DESC_OR_SELF, element, position, true,
374             selfFeatures, true, featureCache);
375         buildTextFeatures(AXIS_DESC_OR_SELF, element, leftText.trim(),
376             mainText.trim(), rightText.trim(), selfFeatures);
377 /*            Util.LOG.debug("Build descendant-or-self (size: "
378                 + selfFeatures.size() + ", cache size: " + featureCache.size()
379                 + ")"); */
380 
381         // build ancestor (prepending start marker) + ancestor-siblings axes
382         handleAncestors(element, getAncestorNumber(),
383             getSiblingNumber() - 1, ancestorFeatures,
384             ancestorSiblingFeatures, new HashBag(), featureCache);
385         ancestorFeatures.addFirst(new GlobalFeature(AXIS_ANCESTOR,
386             LocalFeature.MARKER_START));
387         ancestorFeatures.addFirst(new GlobalFeature("Ancestor axis"));
388 /*            Util.LOG.debug("Build ancestors (size: " + ancestorFeatures.size()
389                 + ") incl. siblings (size: + " + ancestorSiblingFeatures.size()
390                 + "; cache size: " + featureCache.size() + ")"); */
391 
392         // combine all lists into full representation, start w/prior recog.s
393         if (priorFeatures != null) {
394             fullRep.addAll(priorFeatures);
395             priorFeatures = null;
396         }
397 
398         // preceding siblings + self + following siblings
399         fullRep.addAll(precedingFeatures);
400         precedingFeatures = null;
401         fullRep.addAll(selfFeatures);
402         fullRep.addAll(followingFeatures);
403         followingFeatures = null;
404 
405         // ancestors + self + ancestor-siblings:
406         fullRep.addAll(ancestorFeatures);
407         ancestorFeatures = null;
408         fullRep.addAll(selfFeatures);
409         selfFeatures = null;
410         fullRep.addAll(ancestorSiblingFeatures);
411         ancestorSiblingFeatures = null;
412 /*            Util.LOG.debug("Combined axes into single representation (size: "
413                 + fullRep.size() + ", cache size: " + featureCache.size()
414                 + ")"); */
415 
416         // create and append filtered representations
417         final List filteredRep = filterRepresentation(fullRep);
418 
419         // remove unnecessary markers from filtered rep.s and add them
420         removeExtraMarkers(filteredRep);
421         fullRep.addAll(filteredRep);
422 
423         Util.LOG.debug("Build " + logPurpose + " representation for "
424             + DOMUtils.showToken(element, mainText) + " (full size: "
425             + fullRep.size() + ", cache size: " + featureCache.size()
426             + ")");
427 
428         return fullRep;
429     }
430 
431     /***
432      * Builds the features of an element and appends them to the
433      * specified <code>featureList</code>. Handles attributes and calculated
434      * values and the element itself. Child elements are only handled when
435      * <code>recurseInsteadOfText</code> it <code>true</code> -- the axis name
436      * is not changed for child elements.
437      *
438      * @param axisName the name of the axis, used to start each feature
439      * @param element the element to process
440      * @param position wraps the position of the element within its parent
441      * element and related data, used for calculating positional features;
442      * if <code>null</code>, no positional features are calculated
443      * @param recurseInsteadOfText if <code>true</code> child elements
444      * are recursively processed, but no
445      * {@linkplain #calculateValuesFromText(String, String, List) values from
446      * text} are calculated; otherwise text is processed but child elements are
447      * not
448      * @param featureList the list of {@link GlobalFeature}s to add features to
449      * @param addAtEnd whether to add the new features at the end of at the
450      * beginning of the <code>featureList</code>
451      * @param cache a cache of local feature, mapping {@link Element}s to
452      * {@link List}s of {@link LocalFeature}
453      */
454     protected void buildFeatures(final String axisName, final Element element,
455             final ElementPosition position, final boolean recurseInsteadOfText,
456             final LinkedList<Feature> featureList, final boolean addAtEnd,
457             final Map<Element, List<LocalFeature>> cache) {
458         final List<LocalFeature> localFeatures;
459 
460         if ((!recurseInsteadOfText) && cache.containsKey(element)) {
461             // we can only use the cache when textual contents are included
462             localFeatures = cache.get(element);
463 /*            Util.LOG.debug("Re-used cached features for "
464                 + DOMUtils.showElement(element)); */
465         } else {
466             // build local features, with or without textual features
467             localFeatures = buildLocalFeatures(element, position,
468                 recurseInsteadOfText);
469 
470             // cache them if the are complete (including textual features
471             if (!recurseInsteadOfText) {
472                 cache.put(element, localFeatures);
473             }
474         }
475 
476         // convert to global features and append to list
477         final Iterator<LocalFeature> localIter = localFeatures.iterator();
478         GlobalFeature.globalize(axisName, localIter, featureList, addAtEnd);
479         //Util.LOG.debug("Globalized local features");
480 
481         if (recurseInsteadOfText) {
482             // process child elements, without calculating positional values
483             final Iterator childIter = element.elementIterator();
484             Element child;
485             while (childIter.hasNext()) {
486                 child = (Element) childIter.next();
487                 buildFeatures(axisName, child, null, recurseInsteadOfText,
488                     featureList, addAtEnd, cache);
489             }
490         }
491     }
492 
493     /***
494      * Builds the local features of an element. Handles attributes and
495      * calculated values and the element itself, but no child elements.
496      * The {@link LocalFeature}s can be stored in a cache for re-use; they must
497      * be combined with an axis name for getting a global feature.
498      *
499      * @param element the element to process
500      * @param position wraps the position of the element within its parent
501      * element and related data, used for calculating positional features;
502      * if <code>null</code>, no positional features are calculated
503      * @param ignoreText if <code>true</code>, no {@linkplain
504      * #calculateValuesFromText(String, String, List) values from text} or
505      * {@linkplain #calculateHeadValues(Element, List) head values} are
506      * calculated
507      * @return returns a List of created {@link LocalFeature}s
508      */
509     protected List<LocalFeature> buildLocalFeatures(final Element element,
510             final ElementPosition position, final boolean ignoreText) {
511         final List<LocalFeature> result = new LinkedList<LocalFeature>();
512         final String name = DOMUtils.name(element);
513 
514         // Process attributes sorted according to their names.
515         // Map contains name/value mappings.
516         final SortedMap<String, String> attribMap =
517             new TreeMap<String, String>();
518         final Iterator attribIter = element.attributeIterator();
519         Attribute currentAttrib;
520         String attName;
521         boolean foundDefaultAttrib = false;
522 
523         while (attribIter.hasNext()) {
524             currentAttrib = (Attribute) attribIter.next();
525             attName = DOMUtils.name(currentAttrib);
526             attribMap.put(attName, currentAttrib.getValue());
527 
528             if (!foundDefaultAttrib
529                     && getDefaultAttributes().contains(attName)) {
530                 // at least one of the default attributes is present
531                 foundDefaultAttrib = true;
532             }
533         }
534 
535         // no default attribute present: include stand-alone element name
536         if (!foundDefaultAttrib) {
537             result.add(LocalFeature.createElementFeature(name));
538             //Util.LOG.debug("Created element feature for " + name);
539         }
540 
541         final Iterator<String> sortedIter = attribMap.keySet().iterator();
542         String attValue;
543 
544         while (sortedIter.hasNext()) {
545             attName = sortedIter.next();
546             attValue = attribMap.get(attName);
547             //Util.LOG.debug("Creating " + attName + " attribute feature");
548             result.addAll(LocalFeature.createAttributeFeatures(name, attName,
549                 attValue, getSplitMaximum()));
550             //Util.LOG.debug("Created " + attName + " attribute feature");
551         }
552 
553         //Util.LOG.debug("Created attribute features");
554 
555         // calculate positional values, if there is a parent
556         if (position != null) {
557             calculatePositionalValues(name, position, result);
558             //Util.LOG.debug("Calculated positional values");
559         }
560 
561         if (!ignoreText) {
562             // check textual content
563             final String trimmedText = element.getTextTrim();
564 
565             if ((trimmedText != null) && (trimmedText.length() > 0)) {
566                 // calculate values from text incl. text itself
567                 calculateValuesFromText(name, trimmedText, result);
568             } else {
569                 // check whether there are head values instead
570                  calculateHeadValues(element, result);
571             }
572         }
573 
574         return result;
575     }
576 
577     /***
578      * Builds the pseudo-axis of prior recognitions.
579      *
580      * @param priorRecognitions a buffer of the last {@link Recognition}s from
581      * the document, created by calling {@link #createRecognitionBuffer()}
582      * @return a list of {@link GlobalFeature}s representing prior recognitions
583      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
584      * contains objects that aren't {@link Recognition}s
585      */
586     protected List<Feature> buildPrior(
587             final PriorRecognitions priorRecognitions)
588             throws ClassCastException {
589         final LinkedList<Feature> result =
590             new LinkedList<Feature>();
591 
592         // start with comment and start marker
593         result.add(new GlobalFeature("Prior recognitions"));
594         result.add(new GlobalFeature(AXIS_PRIOR, LocalFeature.MARKER_START));
595 
596         // prepare to iterate prior recognitions
597         final Iterator recogIter = priorRecognitions.iterator();
598         PriorRecognitions.Pair currentPair;
599         Recognition currentRecognition;
600         String type;
601         String trimmedText;
602         LinkedList<Feature> recogFeatures;
603         final List<LocalFeature> calculatedValues =
604             new LinkedList<LocalFeature>();
605 
606         // number of recognitions to describe without details
607         final int briefRecogs =
608             priorRecognitions.size() - getDetailedRecognitions();
609 
610         // log an error if the buffer is too large
611         if (priorRecognitions.size() > getRecognitionNumber()) {
612             Util.LOG.error("Buffer of prior recognitions too large ("
613                 + priorRecognitions.size() + " instead of "
614                 + getRecognitionNumber() + " elements) -- use "
615                 + "Representation.createRecognitionBuffer() to create it");
616         }
617 
618         // process prior recognitions
619         for (int i = 0; recogIter.hasNext(); i++) {
620             currentPair = (PriorRecognitions.Pair) recogIter.next();
621             currentRecognition = currentPair.getRecognition();
622             type = currentRecognition.getType();
623 
624             // check whether features are cached
625             recogFeatures = currentPair.getCachedFeatures();
626             if (recogFeatures == null) {
627                 // features are not yet cached
628                 recogFeatures = new LinkedList<Feature>();
629 
630                 // include feature type for all recognitions
631                 recogFeatures.add(new GlobalFeature(AXIS_PRIOR,
632                     LocalFeature.createElementFeature(type)));
633 
634                 if (i >= briefRecogs) {
635                     // include further details for last n=detailedRecognitions:
636                     // calculate values from the text incl. text itself (if any)
637                     trimmedText = currentRecognition.getText().trim();
638                     if (StringUtils.isNotEmpty(trimmedText)) {
639                         calculateValuesFromText(type, trimmedText,
640                                 calculatedValues);
641 
642                             // store as global features and clear list
643                             GlobalFeature.globalize(AXIS_PRIOR,
644                                     calculatedValues.iterator(), recogFeatures,
645                                     true);
646                             calculatedValues.clear();
647                     }
648                 }
649 
650                 // cache features if the recognition is sealed
651                 if (currentRecognition.isSealed()) {
652                     currentPair.setCachedFeatures(recogFeatures);
653                 }
654             } else {
655                 // features are cached
656 
657                 if (i < briefRecogs) {
658                     // only include type (very first feature) for oldest
659                     // recognitions -- discard all other features
660                     while (recogFeatures.size() > 1) {
661                         recogFeatures.removeLast();
662                     }
663                 }
664             }
665 
666             // add features of recognition to result list
667             result.addAll(recogFeatures);
668 
669             if (!recogIter.hasNext()) {
670                 // last recognition: check whether it is finished (sealed)
671                 result.addLast(new GlobalFeature(AXIS_PRIOR,
672                     LocalFeature.createCalculatedFeature(type, "finished",
673                         String.valueOf(currentRecognition.isSealed()))));
674             }
675         }
676         return result;
677     }
678 
679     /***
680      * Builds the context representation of text in an element, differentiating
681      * between three kinds of textual contents: a left part, a main part, and
682      * a right part. Each text part is optional (if all of them are empty,
683      * this method does nothing).
684      *
685      * @param axisName the name of the axis, used to start each feature
686      * @param element the element whose context should be represented
687      * @param trimmedLeft trimmed textual content to the left of (preceding)
688      * <code>trimmedMain</code>, might be empty
689      * @param trimmedMain trimmed main textual content to represent, might
690      * be empty
691      * @param trimmedRight trimmed textual content to the right of (following)
692      * <code>trimmedMain</code>, might be empty
693      * @param featureList a list of {@link GlobalFeature}s to add the values to
694      */
695     protected void buildTextFeatures(final String axisName,
696             final Element element, final String trimmedLeft,
697             final String trimmedMain, final String trimmedRight,
698             final LinkedList<Feature> featureList) {
699         final String name = DOMUtils.name(element);
700         final List<LocalFeature> localTextFeatures =
701             new LinkedList<LocalFeature>();
702 
703         // the feature mark for calculated values to used to introduce left
704         // + right features
705         final String innerMarker = FeatureType.CALCULATED.getMark();
706 
707         if (trimmedLeft.length() > 0) {
708             calculateValuesFromText(name + innerMarker + "left", trimmedLeft,
709                 localTextFeatures);
710         }
711         if (trimmedMain.length() > 0) {
712             calculateValuesFromText(name, trimmedMain, localTextFeatures);
713         }
714         if (trimmedRight.length() > 0) {
715             calculateValuesFromText(name + innerMarker + "right", trimmedRight,
716                 localTextFeatures);
717         }
718 
719         // globalize + append features
720         GlobalFeature.globalize(axisName, localTextFeatures.iterator(),
721             featureList, true);
722     }
723 
724     /***
725      * Creates values that depend on "head" children of an element, if the
726      * element contains any of them. This method is meant for element that
727      * don't contain any textual content, so their children are checked
728      * instead.
729      *
730      * This implementation proceeds as follows:
731      * <ul>
732      * <li>If the specified element is of type {@link #getHeadElement()}, the
733      * value of the {@link #getHeadAttribute()} or else the textual content of
734      * its right-most child element is stored (in a calculated value named
735      * "head"). Child elements are iterated from right to left unless one
736      * containing an appropriate attribute or textual content is found (or none
737      * are left).
738      * <li>Otherwise, if the element contains children of the type
739      * {@link #getHeadElement()}, the first and last child element of this type
740      * are recursively processed and the results stored in calculated names
741      * named "lhead" and "rhead". Of there is only a single child of appropriate
742      * type, the "lhead" value is omitted.
743      * <li>Otherwise no values are calculated.
744      * </ul>
745      *
746      * <p>If a value contains whitespace, only the final subsequence following
747      * all whitespace is preserved.
748      *
749      * @param element the element to process
750      * @param values a list of {@link LocalFeature}s to add the calculated
751      * values to
752      */
753     protected void calculateHeadValues(final Element element,
754             final List<LocalFeature> values) {
755         final String elementName = DOMUtils.name(element);
756         if (getHeadElement().equals(elementName)) {
757             // element is of suitable type
758             final String headValue = determineHeadValue(element);
759 
760             // store if not empty
761             if (headValue.length() > 0) {
762                 values.add(LocalFeature.createCalculatedFeature(elementName,
763                     "head", headValue));
764             }
765         } else {
766             // process children of suitable type
767             final List suitableChildren = DOMUtils.elementsByName(element,
768                 getHeadElement());
769             final int numberOfChildren = suitableChildren.size();
770 
771             if (numberOfChildren >= 1) {
772                 // process right-most child
773                 final Element rightMostChild =
774                     (Element) suitableChildren.get(numberOfChildren - 1);
775                 final String rHeadValue = determineHeadValue(rightMostChild);
776 
777                 // store if not empty
778                 if (rHeadValue.length() > 0) {
779                     values.add(LocalFeature.createCalculatedFeature(
780                         elementName, "rhead", rHeadValue));
781                 }
782 
783                 if (numberOfChildren >= 2) {
784                     // process left-most child
785                     final Element leftMostChild =
786                         (Element) suitableChildren.get(0);
787                     final String lHeadValue = determineHeadValue(leftMostChild);
788 
789                     // store if not empty
790                     if (lHeadValue.length() > 0) {
791                         values.add(LocalFeature.createCalculatedFeature(
792                             elementName, "lhead", lHeadValue));
793                     }
794                 }
795             }
796         }
797     }
798 
799     /***
800      * Calculates values that depend on the position of an element within its
801      * parent.
802      *
803      * @param elementName the name of the element to process, as returned by
804      * {@link DOMUtils#name(Element)}
805      * @param position wraps the position of the element within its parent
806      * element and related data, must not be <code>null</code>
807      * @param values a list to add the calculated values to
808      */
809     protected void calculatePositionalValues(final String elementName,
810             final ElementPosition position, final List<LocalFeature> values) {
811         // store exact position, counting all children
812         values.add(LocalFeature.createCalculatedFeature(elementName, "pos",
813             Integer.toString(position.getOverallPosition())));
814 
815         // store exact position, counting only children with same type
816         values.add(LocalFeature.createCalculatedFeature(elementName,
817             "typedPos", Integer.toString(position.getTypedPosition())));
818 
819         // store rough positions
820         values.add(LocalFeature.createCalculatedFeature(elementName,
821             "roughPos", determineRoughPosition(position.getOverallPosition(),
822                 position.getAllChildren())));
823         values.add(LocalFeature.createCalculatedFeature(elementName,
824             "roughTypedPos", determineRoughPosition(position.getTypedPosition(),
825                 position.getTypedChildren())));
826     }
827 
828     /***
829      * Calculates values that depend on the textual content of an element:
830      * prefixes, suffixes, length data, and token type. Also includes the text
831      * itself.
832      *
833      * @param elementName the name of the element to process, as returned by
834      * {@link DOMUtils#name(Element)}
835      * @param trimmedText the trimmed textual content of the element to
836      * process, must not be empty
837      * @param values a list of {@link LocalFeature}s to add the calculated
838      * values to
839      * @throws IllegalArgumentException if the empty string was given as
840      * <code>trimmedText</code>
841      */
842     protected void calculateValuesFromText(final String elementName,
843             final String trimmedText, final List<LocalFeature> values)
844             throws IllegalArgumentException {
845         final int textLength = trimmedText.length();
846         if (textLength == 0) {
847             throw new IllegalArgumentException(
848                 "Specifed trimmed text must not be empty!");
849         }
850 
851         // split at whitespace, if any
852         final String[] tokens = TextUtils.splitString(trimmedText, -1);
853         final String firstTokenLC = tokens[0].toLowerCase();
854         final String lastTokenLC;
855 
856         if (tokens.length == 1) {
857             // no whitespace -- single token
858             lastTokenLC = firstTokenLC;
859         } else {
860             lastTokenLC = tokens[tokens.length - 1].toLowerCase();
861         }
862 
863         // store starting 1--x letters from first token (but only if the
864         // token contains enough letters; using lower case)
865         final int maxPrefixLength = Math.min(maximumPrefixLength,
866             firstTokenLC.length() - 1);
867         for (int i = 1; i <= maxPrefixLength; i++) {
868             values.add(LocalFeature.createCalculatedFeature(elementName,
869                  "prefix" + i, firstTokenLC.substring(0, i)));
870         }
871 
872         // store trailing 1--x letters from last token (but only if the
873         // token contains enough letters; using lower case)
874         final int maxSuffixLength = Math.min(maximumPrefixLength,
875             lastTokenLC.length() - 1);
876         for (int i = 1; i <= maxSuffixLength; i++) {
877             values.add(LocalFeature.createCalculatedFeature(elementName,
878                 "suffix" + i, lastTokenLC.substring(lastTokenLC.length() - i)));
879         }
880 
881         // store length
882         values.add(LocalFeature.createCalculatedFeature(elementName, "length",
883             Integer.toString(textLength)));
884 
885         // also store rounded square root of length as a less sparse feature
886         final long sqrtLength = Math.round(Math.sqrt(textLength));
887         values.add(LocalFeature.createCalculatedFeature(elementName,
888             "sqrtLength", Long.toString(sqrtLength)));
889 
890         // determine the token type (of last token, if there are several)
891         final Set typeSet = TOKEN_TYPE_PATTERNS.keySet();
892         final Iterator typeIter = typeSet.iterator();
893         String currentDesc;
894         Pattern currentPattern;
895         Matcher currentMatcher;
896         boolean foundMatch = false;
897         String matchingType = "mixed"; // use if no match is found
898 
899         // try each pattern until one of them matches (or none are left)
900         while (typeIter.hasNext() && !foundMatch) {
901             currentDesc = (String) typeIter.next();
902             currentPattern = (Pattern) TOKEN_TYPE_PATTERNS.get(currentDesc);
903             currentMatcher = currentPattern.matcher(tokens[tokens.length - 1]);
904 
905             if (currentMatcher.matches()) {
906                 foundMatch = true;
907                 matchingType = currentDesc; // overwrite value
908             }
909         }
910 
911         // append "+multi" if there are several whitespace-separated tokens
912         final String fullType = (tokens.length > 1)
913             ? matchingType + "+multi" : matchingType;
914         values.add(LocalFeature.createCalculatedFeature(elementName,
915             "tokenType", fullType));
916 
917         // lookup information from sensors (for the whole text, not per token)
918         KeyValue[] sensorData;
919         String prefix;
920         int j;
921 
922         for (int i = 0; i < sensors.length; i++) {
923             // prefix keys of each sensor with "i$" (e.g. "0$") toi avoid
924             // naming collisions
925             prefix = i + FeatureType.CALCULATED.getMark();
926             sensorData = sensors[i].lookup(trimmedText);
927 
928             for (j = 0; j < sensorData.length; j++) {
929                 values.add(LocalFeature.createCalculatedFeature(elementName,
930                     prefix + sensorData[j].getKey(),
931                     Util.asString(sensorData[j].getValue())));
932             }
933         }
934 
935         // add textual content itself -- in lower case and "as is"
936         values.addAll(LocalFeature.createCalculatedFeatures(elementName, "lc",
937             trimmedText.toLowerCase(), getSplitMaximum()));
938         values.addAll(LocalFeature.createTextFeatures(elementName, trimmedText,
939             getSplitMaximum()));
940     }
941 
942     /***
943      * Helper method for determining the head value for an element of type
944      * {@link #getHeadElement()}. See
945      * {@link #calculateHeadValues(Element, List)} for a description of the
946      * algorithm.
947      *
948      * @param element the element to process, must be of type
949      * {@link #getHeadElement()}
950      * @return the calculated head value, or the empty string if no value
951      * could be calculated (the specified element doesn't have any children,
952      * or the right-most child contains neither {@link #getHeadAttribute()} nor
953      * textual content)
954      */
955     protected String determineHeadValue(final Element element) {
956         String fullResult = null;
957         final List children = element.elements();
958         final ListIterator childIter = children.listIterator(children.size());
959         Element currentChild;
960         Attribute headAttrib;
961         boolean done = false;
962 
963         // iterate children from right to left unless required attribute or
964         // textual content is present
965         while (childIter.hasPrevious() && !done) {
966             currentChild = (Element) childIter.previous();
967             headAttrib = DOMUtils.attributeByName(currentChild,
968                 getHeadAttribute());
969 
970             // check attribute value or else trimmed textual content
971             if (headAttrib != null) {
972                 fullResult = headAttrib.getText();
973             } else {
974                 fullResult = element.getTextTrim();
975             }
976 
977             // we're done when we got an non-empty string
978             done = (fullResult != null) && (fullResult.length() > 0);
979         }
980 
981         final String result;
982         if (fullResult != null) {
983             // split text, only keeping last subsequence after any whitespace
984             final String[] splits = TextUtils.splitString(fullResult, 1);
985             result = splits[0];
986         } else {
987             result = "";
988         }
989         return result;
990     }
991 
992     /***
993      * Helper method called by
994      * {@link #calculatePositionalValues(String, ElementPosition, List)} to
995      * collapse a position in to one of five values.
996      * <dl>
997      * <dt>first</dt>
998      * <dd>for the first element</dd>
999      * <dt>early</dt>
1000      * <dd>if position is within the first third of all elements (but not the
1001      * first one), upper limit included</dd>
1002      * <dt>middle</dt>
1003      * <dd>if position is within the second third of all elements, limits
1004      * excluded</dd>
1005      * <dt>late</dt>
1006      * <dd>if position is within the last third of all elements (but not the
1007      * last one), lower limit included</dd>
1008      * <dt>last</dt>
1009      * <dd>for the last element</dd>
1010      * </dl>
1011      *
1012      * @param position the position counted from 0, should be non-negative and
1013      * smaller than <code>elementCount</code> (otherwise the results are
1014      * undefined)
1015      * @param elementCount the number of elements
1016      * @return a String containing one of the five values described above
1017      */
1018     protected String determineRoughPosition(final int position,
1019             final int elementCount) {
1020         // for easier counting
1021         final int realPosition = position + 1;
1022         final String result;
1023 
1024         if (realPosition == 1) {
1025             result = "first";
1026         } else if (realPosition == elementCount) {
1027             result = "last";
1028         } else {
1029             final double oneThird = elementCount / 3.0;
1030             if (realPosition <= oneThird) {
1031                 result = "early";
1032             } else if (realPosition >= (2 * oneThird)) {
1033                 result = "late";
1034             } else {
1035                 result = "middle";
1036             }
1037         }
1038         return result;
1039     }
1040 
1041     /***
1042      * Creates a filtered view of a context representation. A filtered
1043      * representation is created for each {@linkplain FeatureType type of
1044      * features} that occurs at least twice;
1045      * attributes and calculated values occuring only once are omitted.
1046      * For textual content, another filtered representation is created.
1047      *
1048      * <p>Features representing markers ({@link FeatureType#MARKER}),
1049      * stand-alone elements ({@link FeatureType#ELEMENT}) and default attributes
1050      * ({@link #getDefaultAttributes()}) are included in all filtered
1051      * representations. Comment-only features are ignored.
1052      *
1053      * @param originalRep a feature vector containing the representation
1054      * to filter
1055      * @return a list of {@link de.fu_berlin.ties.classify.feature.Feature}s
1056      * combining the filtered representations created by this method
1057      */
1058     protected List<Feature> filterRepresentation(
1059             final FeatureVector originalRep) {
1060         // features to include in all views (markers, stand-alone elements,
1061         // default attributes)
1062         final List<Feature> universalFeatures = new LinkedList<Feature>();
1063         // textual features
1064         final LinkedList<Feature> textualFeatures = new LinkedList<Feature>();
1065         // multi-map mapping mark + name of attributes/calculated values to
1066         // LinkedLists of filtered features
1067         final MultiMap featureMap = new MultiHashMap();
1068         // count features of each attribute/calculated type
1069         final SortedBag featureCount = new TreeBag();
1070 
1071         final Iterator iter = originalRep.iterator();
1072         GlobalFeature currentGlobal;
1073         LocalFeature currentLocal;
1074         FeatureType currentType;
1075         Iterator keyIter;
1076         String currentKey;
1077         Iterator<Feature> universalIter;
1078         Iterator innerIter;
1079         Feature innerFeature;
1080 
1081         // iterate original representation
1082         while (iter.hasNext()) {
1083             currentGlobal = (GlobalFeature) iter.next();
1084             currentLocal = currentGlobal.getLocalFeature();
1085 
1086             if (currentLocal != null) { // otherwise it's a comment -- ignored
1087                 currentType = currentLocal.getType();
1088                 if (((currentType == FeatureType.ATTRIBUTE)
1089                             && (getDefaultAttributes().contains(
1090                                 currentLocal.getName())))
1091                         || (currentType == FeatureType.MARKER)) {
1092                     // include in all views
1093                     universalFeatures.add(currentGlobal);
1094                     textualFeatures.add(currentGlobal);
1095                     keyIter = featureMap.keySet().iterator();
1096                     while (keyIter.hasNext()) {
1097                         currentKey = (String) keyIter.next();
1098                         featureMap.put(currentKey, currentGlobal);
1099                     }
1100                 // use attributes, elements and calculated ...head values
1101                 } else if ((currentType == FeatureType.ATTRIBUTE)
1102                           || (currentType == FeatureType.ELEMENT)
1103                           || ((currentType == FeatureType.CALCULATED)
1104                               && currentLocal.getName().endsWith("head"))) {
1105                     // determine key: use type marker
1106                     currentKey = currentType.getMark();
1107 
1108                     if (!featureMap.containsKey(currentKey)) {
1109                         // create new view, copying collected universal entries
1110                         universalIter = universalFeatures.iterator();
1111                         while (universalIter.hasNext()) {
1112                             featureMap.put(currentKey, universalIter.next());
1113                         }
1114                     }
1115 
1116                     // include in appropriate view + increase count
1117                     featureMap.put(currentKey, currentGlobal);
1118                     featureCount.add(currentKey);
1119                 } else if (currentType == FeatureType.TEXT) {
1120                     // include in textual view
1121                     textualFeatures.add(currentGlobal);
1122                 }
1123                 // elements are ignored in filtered rep.s
1124             }
1125         }
1126 
1127         // combine all views
1128         final List<Feature> result = new LinkedList<Feature>();
1129         final Iterator sortedIter = featureCount.uniqueSet().iterator();
1130         Collection currentView;
1131 
1132         // add view for each suitable attribute/valculated value
1133         while (sortedIter.hasNext()) {
1134             currentKey = (String) sortedIter.next();
1135 
1136             // ignore attributes/calculated values that occur only once or twice
1137             if (featureCount.getCount(currentKey) > 2) {
1138                 result.add(new GlobalFeature("Filtered view for " + currentKey
1139                     + " features"));
1140                 currentView = (Collection) featureMap.get(currentKey);
1141                 innerIter = currentView.iterator();
1142 
1143                 while (innerIter.hasNext()) {
1144                     innerFeature = (Feature) innerIter.next();
1145                     result.add(innerFeature);                    
1146                 }
1147             }
1148         }
1149 
1150         // add view for textual content
1151         result.add(new GlobalFeature("Filtered view for textual features"));
1152         result.addAll(textualFeatures);
1153 
1154         return result;
1155     }
1156 
1157     /***
1158      * Returns the maximum number of ancestors to include in the context
1159      * representation.
1160      *
1161      * @return the number of ancestors to include
1162      */
1163     public int getAncestorNumber() {
1164         return ancestorNumber;
1165     }
1166 
1167     /***
1168      * Returns the unmodifiable set of names of default attributes. The
1169      * contained names are Strings in a format compatible to
1170      * {@link DOMUtils#name(Attribute)}.
1171      *
1172      * @return the set of default attributes
1173      */
1174     public Set getDefaultAttributes() {
1175         return defaultAttributes;
1176     }
1177 
1178     /***
1179      * Returns the number of preceding recognitions to represent in detail.
1180      * @return the value of the attribute
1181      */
1182     public int getDetailedRecognitions() {
1183         return detailedRecognitions;
1184     }
1185 
1186     /***
1187      * Returns the name of the element to use for calculating head values.
1188      *
1189      * @return the name of the attribute
1190      * @see #calculateHeadValues(Element, List)
1191      */
1192     public String getHeadAttribute() {
1193         return headAttribute;
1194     }
1195 
1196     /***
1197      * Returns the name of the attribute to use for calculating head values.
1198      *
1199      * @return the name of the element
1200      * @see #calculateHeadValues(Element, List)
1201      */
1202     public String getHeadElement() {
1203         return headElement;
1204     }
1205 
1206     /***
1207      * Returns the basic number of preceding and following siblings to include
1208      * in the context representation.
1209      *
1210      * @return the number of siblings to include
1211      */
1212     public int getSiblingNumber() {
1213         return siblingNumber;
1214     }
1215 
1216     /***
1217      * Handles ancestors and ancestor siblings of an element.
1218      *
1219      * @param element the element to process
1220      * @param ancestorsToAdd the number of ancestors to add, must be &gt; 0;
1221      * if &gt; 1, this method calls itself recursively, decreasing the number
1222      * by 1
1223      * @param ancestorSiblingsToAdd the number of preceding/following siblings
1224      * of the current ancestors to add; if 0 or negative, no siblings are added;
1225      * a recursive call decreases this parameter by 1 if any ancestor siblings
1226      * were found (if no siblings were found, the number passes unchanged)
1227      * @param ancestorFeatures the list of {@link GlobalFeature}s to prepend
1228      * the features on the ancestors to
1229      * @param ancestorSiblingFeatures the list of {@link GlobalFeature}s to
1230      * append the features on the ancestors siblings to
1231      * @param processedAncestorNames a bag that will typically be empty when
1232      * first calling this method (will be filled by recursive calls)
1233      * @param cache a cache of local feature, mapping {@link Element}s to
1234      * {@link List}s of {@link LocalFeature}
1235      * @throws IllegalArgumentException if <code>ancestorsToAdd</code> is 0 or
1236      * negative
1237      */
1238     protected void handleAncestors(final Element element,
1239             final int ancestorsToAdd, final int ancestorSiblingsToAdd,
1240             final LinkedList<Feature> ancestorFeatures,
1241             final LinkedList<Feature> ancestorSiblingFeatures,
1242             final Bag processedAncestorNames,
1243             final Map<Element, List<LocalFeature>> cache)
1244             throws IllegalArgumentException {
1245         if (ancestorsToAdd <= 0) {
1246             throw new IllegalArgumentException(
1247                 "Cannot add a non-positive number of ancestors ("
1248                     + ancestorsToAdd + ")!");
1249         }
1250 
1251         // handle parent
1252         final Element parent = element.getParent();
1253         if (parent != null) {
1254             // otherwise there is nothing to do (reached highest node)
1255             final ElementPosition position;
1256 
1257             // process siblings of parent, if required
1258             if (ancestorSiblingsToAdd > 0) {
1259                 // use bag of names to determine axis prefix:
1260                 final String parentName = DOMUtils.name(parent);
1261                 processedAncestorNames.add(parentName);
1262                 final int typeCount
1263                     = processedAncestorNames.getCount(parentName);
1264                 final String nameToQuote;
1265 
1266                 if (typeCount > 2) {
1267                     // repeated occurrence of this element type:
1268                     // prepend repetition counter to element name
1269                     nameToQuote = Integer.toString(typeCount) + parentName;
1270                 } else {
1271                     // first occurrence of this element type: just use the name
1272                     nameToQuote = parentName;
1273                 }
1274                 final String axisPrefix =
1275                     AXIS_ANCESTOR + LocalFeature.quote(nameToQuote);
1276 
1277                 // store both sides in shared temporary list which is then
1278                 // appended to the specified list
1279                 final LinkedList<Feature> siblingFeatures =
1280                     new LinkedList<Feature>();
1281                 position = handleSiblings(axisPrefix, parent,
1282                     ancestorSiblingsToAdd, siblingFeatures, siblingFeatures,
1283                     cache);
1284                 ancestorSiblingFeatures.addAll(siblingFeatures);
1285             } else {
1286                 position = null;
1287             }
1288 
1289             // process parent itself, prepending to list
1290             buildFeatures(AXIS_ANCESTOR, parent, position, false,
1291                 ancestorFeatures, false, cache);
1292 
1293             // recursively handle ancestors of parent, if required
1294             if (ancestorsToAdd > 1) {
1295                 final int parentSilbingsToAdd;
1296 
1297                 // determine how many siblings to process at the next step
1298                 if ((position != null) && (position.getProcessedPreceding()
1299                         + position.getProcessedPreceding() > 0)) {
1300                     // processed one or more ancestor siblings: decrease number
1301                     parentSilbingsToAdd = ancestorSiblingsToAdd - 1;
1302                 } else {
1303                     // no ancestor siblings processed: pass number unchange
1304                     parentSilbingsToAdd = ancestorSiblingsToAdd;
1305                 }
1306 
1307                 handleAncestors(parent, ancestorsToAdd - 1 ,
1308                     parentSilbingsToAdd, ancestorFeatures,
1309                     ancestorSiblingFeatures, processedAncestorNames, cache);
1310             }
1311         }
1312     }
1313 
1314     /***
1315      * Adds the preceding and following siblings of an element.
1316      *
1317      * @param axisPrefix the prefix of the axis name, used to start each
1318      * feature; specify the empty string if no prefix should be used
1319      * @param element the element to process
1320      * @param baseNumber the basic number of siblings to keep; the actual
1321      * number of siblings kept might vary
1322      * @param precedingFeatures the list of {@link GlobalFeature}s to prepend
1323      * the features on the preceding siblings to
1324      * @param followingFeatures the list of {@link GlobalFeature}s to append
1325      * the features on the following siblings to
1326      * @param cache a cache of local feature, mapping {@link Element}s to
1327      * {@link List}s of {@link LocalFeature}
1328      * @return a wrapper of the position of the element with its parent element
1329      * and related data, required by
1330      * {@link #calculatePositionalValues(String, ElementPosition, List)}, or
1331      * <code>null</code> if there is no parent element
1332      */
1333     protected ElementPosition handleSiblings(final String axisPrefix,
1334             final Element element, final int baseNumber,
1335             final LinkedList<Feature> precedingFeatures,
1336             final LinkedList<Feature> followingFeatures,
1337             final Map<Element, List<LocalFeature>> cache) {
1338         final Element parent = element.getParent();
1339         final String name = DOMUtils.name(element);
1340         final ElementPosition result;
1341 
1342         if (parent != null) {
1343             // otherwise there is nothing to do
1344             final Iterator childIter = parent.elementIterator();
1345             Element currentChild;
1346             boolean foundElement = false;
1347             LinkedList<Element> allPrecedingSiblings =
1348                 new LinkedList<Element>();
1349             LinkedList<Element> allFollowingSiblings =
1350                 new LinkedList<Element>();
1351 
1352             // counters for object to return
1353             int allChildren = 0;
1354             int overallPosition = 0;
1355             int typedChildren = 0;
1356             int typedPosition = 0;
1357             boolean sameType;
1358 
1359             while (childIter.hasNext()) {
1360                 currentChild = (Element) childIter.next();
1361 
1362                 // update child count
1363                 allChildren++;
1364                 if (name.equals(DOMUtils.name(currentChild))) {
1365                     sameType = true;
1366                     typedChildren++;
1367                 } else {
1368                     sameType = false;
1369                 }
1370 
1371                 if (currentChild == element) {
1372                     foundElement = true;
1373                 } else {
1374                     if (!foundElement) {
1375                         // add to preceding siblings
1376                         allPrecedingSiblings.addLast(currentChild);
1377 
1378                         // update position count
1379                         overallPosition++;
1380                         if (sameType) {
1381                             typedPosition++;
1382                         }
1383                     } else {
1384                         // add to following siblings
1385                         allFollowingSiblings.addLast(currentChild);
1386                     }
1387                 }
1388             }
1389 
1390             // select siblings to keep
1391             final List selectedPrecedingSiblings = selectPrecedingSiblings(
1392                    element, allPrecedingSiblings, baseNumber);
1393             final List selectedFollowingSiblings = selectFollowingSiblings(
1394                     element, allFollowingSiblings, baseNumber);
1395 
1396             // create object to return
1397             result = new ElementPosition(allChildren, overallPosition,
1398                 typedChildren, typedPosition, selectedPrecedingSiblings.size(),
1399                 selectedFollowingSiblings.size());
1400 
1401             // name axes
1402             final StringBuffer precedingAxisName = new StringBuffer(axisPrefix);
1403             final StringBuffer followingAxisName = new StringBuffer(axisPrefix);
1404             final String prefixComment;
1405             if (axisPrefix.length() > 0) {
1406                 prefixComment = " with '" + axisPrefix + "' prefix";
1407                 precedingAxisName.append(GlobalFeature.SEP);
1408                 followingAxisName.append(GlobalFeature.SEP);
1409             } else {
1410                 prefixComment = ""; // no prefix
1411             }
1412             precedingAxisName.append(AXIS_PREC_SIBLING);
1413             followingAxisName.append(AXIS_FOLLOW_SIBLING);
1414 
1415             ListIterator siblingIter;
1416             Element currentSibling;
1417 
1418             // prepend preceding siblings, without calculating positional values
1419             siblingIter = selectedPrecedingSiblings.listIterator(
1420                 selectedPrecedingSiblings.size());
1421             while (siblingIter.hasPrevious()) {
1422                 currentSibling = (Element) siblingIter.previous();
1423                 buildFeatures(precedingAxisName.toString(), currentSibling,
1424                         null, false, precedingFeatures, false, cache);
1425             }
1426 
1427             precedingFeatures.addFirst(
1428                 new GlobalFeature(precedingAxisName.toString(),
1429                     LocalFeature.MARKER_START));
1430             precedingFeatures.addFirst(new GlobalFeature(
1431                 "Preceding siblings axis" + prefixComment));
1432 
1433             // append following siblings, without calculating positional values
1434             followingFeatures.addLast(new GlobalFeature(
1435                             "Following siblings axis" + prefixComment));
1436             followingFeatures.addLast(
1437                 new GlobalFeature(followingAxisName.toString(),
1438                     LocalFeature.MARKER_START));
1439 
1440             siblingIter = selectedFollowingSiblings.listIterator();
1441             while (siblingIter.hasNext()) {
1442                 currentSibling = (Element) siblingIter.next();
1443                 buildFeatures(followingAxisName.toString(), currentSibling,
1444                         null, false, followingFeatures, true, cache);
1445             }
1446         } else {
1447             // no parent
1448             result = null;
1449         }
1450         return result;
1451     }
1452 
1453     /***
1454      * Modifies a list of {@link GlobalFeature}s to remove extraneous
1455      * {@link FeatureType#MARKER} features. Keeps only the last one of several
1456      * sequential marker features; trailing marker features are removed as well.
1457      *
1458      * @param features the list of features to modify
1459      */
1460     protected void removeExtraMarkers(final List features) {
1461         // iterate backward for easier operation
1462         final ListIterator backwardIter =
1463             features.listIterator(features.size());
1464         GlobalFeature currentGlobal;
1465         LocalFeature currentLocal;
1466         FeatureType currentType;
1467 
1468         // initialized this way to remove trailing marker features
1469         FeatureType lastType = FeatureType.MARKER;
1470 
1471         while (backwardIter.hasPrevious()) {
1472             currentGlobal = (GlobalFeature) backwardIter.previous();
1473             currentLocal = currentGlobal.getLocalFeature();
1474 
1475             if (currentLocal != null) { // otherwise it's a comment -- ignored
1476                 currentType = currentLocal.getType();
1477 
1478                 if ((currentType == FeatureType.MARKER)
1479                         && (lastType == FeatureType.MARKER)) {
1480                     // remove extraneous marker feature
1481                     backwardIter.remove();
1482                 }
1483 
1484                 lastType = currentType;
1485             }
1486         }
1487     }
1488 
1489     /***
1490      * Selects the siblings to keep among all following siblings. This
1491      * implementation always keeps the <code>baseNumber</code> first siblings.
1492      * One of the selected siblings may have a different type (name as returned
1493      * by {@link DOMUtils#name(Element)}) than the main element --
1494      * if there are more with different types, they are skipped.
1495      *
1496      * @param mainElement the element whose siblings should be selected
1497      * @param allFollowingSiblings the list of all following siblings
1498      * @param baseNumber the basic number of siblings to keep; the actual
1499      * number of siblings kept might vary
1500      * @return the subset of following siblings to keep; an empty list if
1501      * there are no siblings to keep
1502      */
1503     protected List<Element> selectFollowingSiblings(final Element mainElement,
1504             final LinkedList<Element> allFollowingSiblings,
1505             final int baseNumber) {
1506         final String elementType = DOMUtils.name(mainElement);
1507         final LinkedList<Element> selectedFollowingSiblings =
1508             new LinkedList<Element>();
1509         final Iterator<Element> siblingIter = allFollowingSiblings.iterator();
1510         Element currentSibling;
1511         boolean addedSiblingOfDifferentType = false;
1512         boolean doAdd;
1513 
1514         // keep up to |baseNumber| siblings from start of list
1515         while (siblingIter.hasNext()
1516                 && (selectedFollowingSiblings.size() < baseNumber)) {
1517             currentSibling = siblingIter.next();
1518 
1519             if (elementType.equals(DOMUtils.name(currentSibling))) {
1520                 doAdd = true;
1521             } else {
1522                 // only add if this is the first differently typed sibling
1523                 doAdd = !addedSiblingOfDifferentType;
1524                 addedSiblingOfDifferentType = true;
1525             }
1526 
1527             if (doAdd) {
1528                 selectedFollowingSiblings.addLast(currentSibling);
1529             }
1530         }
1531 
1532         return selectedFollowingSiblings;
1533     }
1534 
1535     /***
1536      * Selects the siblings to keep among all preceding siblings. This
1537      * implementation always keeps the first sibling and the
1538      * <code>baseNumber</code> last siblings. One of the last siblings may
1539      * have a different type (name as returned by
1540      * {@link DOMUtils#name(Element)}) than the main element --
1541      * if there are more with different types, they are skipped.
1542      * If none of the <code>baseNumber</code> last siblings has a different
1543      * types, the last siblings with a different type (name) is also kept.
1544      *
1545      * @param mainElement the element whose siblings should be selected
1546      * @param allPrecedingSiblings the list of all preceding siblings
1547      * @param baseNumber the basic number of siblings to keep; the actual
1548      * number of siblings kept might vary
1549      * @return the subset of preceding siblings to keep; an empty list if
1550      * there are no siblings to keep
1551      */
1552     protected List<Element> selectPrecedingSiblings(final Element mainElement,
1553             final LinkedList<Element> allPrecedingSiblings,
1554             final int baseNumber) {
1555         final String elementType = DOMUtils.name(mainElement);
1556         final LinkedList<Element> selectedPrecedingSiblings =
1557             new LinkedList<Element>();
1558         final ListIterator<Element> siblingIter =
1559             allPrecedingSiblings.listIterator(allPrecedingSiblings.size());
1560         Element currentSibling;
1561         boolean addedSiblingOfDifferentType = false;
1562         boolean doAdd;
1563 
1564         // keep up to |baseNumber| siblings from end of list
1565         while (siblingIter.hasPrevious()
1566                 && (selectedPrecedingSiblings.size() < baseNumber)) {
1567             currentSibling = siblingIter.previous();
1568 
1569             if (elementType.equals(DOMUtils.name(currentSibling))) {
1570                 doAdd = true;
1571             } else {
1572                 // only add if this is the first differently typed sibling
1573                 doAdd = !addedSiblingOfDifferentType;
1574                 addedSiblingOfDifferentType = true;
1575             }
1576 
1577             if (doAdd) {
1578                 selectedPrecedingSiblings.addFirst(currentSibling);
1579             }
1580         }
1581 
1582         // add first sibling, if not yet done
1583         if (!allPrecedingSiblings.isEmpty()) {
1584             final Element firstSibling = allPrecedingSiblings.getFirst();
1585             final Element firstSelected = selectedPrecedingSiblings.getFirst();
1586 
1587             // no need to check size of selectedPrecedingSiblings: must be > 0
1588             if (firstSelected != firstSibling) {
1589                 selectedPrecedingSiblings.addFirst(firstSibling);
1590             }
1591         }
1592 
1593         return selectedPrecedingSiblings;
1594     }
1595 
1596     /***
1597       * Returns a string representation of this object.
1598       *
1599       * @return a textual representation
1600       */
1601      public String toString() {
1602          return new ToStringBuilder(this)
1603              .appendSuper(super.toString())
1604              .append("detailed recognitions", detailedRecognitions)
1605              .append("number of ancestors", ancestorNumber)
1606              .append("number of siblings", siblingNumber)
1607              .append("maximum prefix length", maximumPrefixLength)
1608              .append("head element", headElement)
1609              .append("head attribute", headAttribute)
1610              .append("default attributes", defaultAttributes)
1611              .append("sensors", ArrayUtils.toString(sensors))
1612              .toString();
1613      }
1614 
1615 }