View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.context;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.util.Collection;
27  import java.util.Collections;
28  import java.util.HashSet;
29  import java.util.Iterator;
30  import java.util.LinkedList;
31  import java.util.List;
32  import java.util.ListIterator;
33  import java.util.Map;
34  import java.util.Set;
35  import java.util.SortedMap;
36  import java.util.TreeMap;
37  import java.util.regex.Matcher;
38  import java.util.regex.Pattern;
39  
40  import org.apache.commons.collections.Bag;
41  import org.apache.commons.collections.bag.HashBag;
42  import org.apache.commons.collections.KeyValue;
43  import org.apache.commons.collections.MultiHashMap;
44  import org.apache.commons.collections.MultiMap;
45  import org.apache.commons.collections.SortedBag;
46  import org.apache.commons.collections.bag.TreeBag;
47  import org.apache.commons.collections.map.LinkedMap;
48  import org.apache.commons.lang.ArrayUtils;
49  import org.apache.commons.lang.StringUtils;
50  import org.apache.commons.lang.builder.ToStringBuilder;
51  import org.dom4j.Attribute;
52  import org.dom4j.Element;
53  
54  import de.fu_berlin.ties.ProcessingException;
55  import de.fu_berlin.ties.TiesConfiguration;
56  import de.fu_berlin.ties.xml.dom.DOMUtils;
57  import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
58  import de.fu_berlin.ties.classify.feature.Feature;
59  import de.fu_berlin.ties.classify.feature.FeatureVector;
60  import de.fu_berlin.ties.context.sensor.BaseSensor;
61  import de.fu_berlin.ties.context.sensor.Sensor;
62  import de.fu_berlin.ties.io.IOUtils;
63  import de.fu_berlin.ties.text.TextUtils;
64  import de.fu_berlin.ties.text.TokenizerFactory;
65  import de.fu_berlin.ties.util.Util;
66  
67  /***
68   * The context representation used by default. This class is thread-safe.
69   *
70   * @author Christian Siefkes
71   * @version $Revision: 1.69 $, $Date: 2006/10/21 16:04:03 $, $Author: siefkes $
72   */
73  public class DefaultRepresentation extends AbstractRepresentation {
74  
75      /***
76       * Ancestor axis.
77       */
78      protected static final String AXIS_ANCESTOR = "an";
79  
80      /***
81       * Descendant-or-self axis.
82       */
83      protected static final String AXIS_DESC_OR_SELF = "ds";
84  
85      /***
86       * Following sibling axis.
87       */
88      protected static final String AXIS_FOLLOW_SIBLING = "fs";
89  
90      /***
91       * Preceeding sibling axis.
92       */
93      protected static final String AXIS_PREC_SIBLING = "ps";
94  
95      /***
96       * The pseudo-axis of prior recognitions.
97       */
98      protected static final String AXIS_PRIOR = "prior";
99  
100     /***
101      * A sequence map mapping used by
102      * {@link #calculateValuesFromText(String, String, List)} to determine the
103      * "tokenType" value. Maps description strings to {@link Pattern}s to
104      * match. The description string for the first matching pattern is used.
105      * Initialized in the static constuctor. The value "mixed" is reserved for
106      * tokens that are not matched by any of the patterns.
107      */
108     protected static final LinkedMap TOKEN_TYPE_PATTERNS;
109 
110     /***
111      * Static constructor: initializes the map of {@link #TOKEN_TYPE_PATTERNS}.
112      */
113     static {
114         // some constants for defining the patterns:
115         // lowercase or modifier letter
116         final String lowercaseLetter = "[//p{Ll}//p{Lm}]";
117         // uppercase, titlecase, or modifier letter
118         final String uppercaseLetter = "[//p{Lu}//p{Lt}//p{Lm}]";
119         // any Unicode letter or mark
120         final String anyLetter = "[//p{L}//p{M}]";
121 
122         TOKEN_TYPE_PATTERNS = new LinkedMap();
123 
124         // different kinds of words (letters):
125         TOKEN_TYPE_PATTERNS.put("lower",
126             Pattern.compile(lowercaseLetter + "+"));
127         TOKEN_TYPE_PATTERNS.put("CAPS",
128             Pattern.compile(uppercaseLetter + "+"));
129         // lowercase letters after a single uppercase or titlecase letter
130         TOKEN_TYPE_PATTERNS.put("Cap",
131             Pattern.compile(uppercaseLetter + lowercaseLetter + "+"));
132         TOKEN_TYPE_PATTERNS.put("mixedCase", Pattern.compile(anyLetter + "+"));
133         // letters + trailing dot
134         TOKEN_TYPE_PATTERNS.put("CAPSDot",
135             Pattern.compile(uppercaseLetter + "+//."));
136         TOKEN_TYPE_PATTERNS.put("alphaDot",
137             Pattern.compile(anyLetter + "+//."));
138         // letters (incl. marks) + dashes ("-" etc.)
139         TOKEN_TYPE_PATTERNS.put("alpha+Punc",
140             Pattern.compile("[//p{L}//p{M}//p{Pd}]+//."));
141 
142         // different kinds of punctation
143         // opening and closing puncuation
144         TOKEN_TYPE_PATTERNS.put("openPunc", Pattern.compile("//p{Ps}+"));
145         TOKEN_TYPE_PATTERNS.put("closePunc", Pattern.compile("//p{Pe}+"));
146         /* not used -- unknown in JDK 1.4.2
147         // puncation for opening or closing quotes
148         TOKEN_TYPE_PATTERNS.put("initialQuote", Pattern.compile("//p{Pi}+"));
149         TOKEN_TYPE_PATTERNS.put("finalQuote", Pattern.compile("//p{Pf}+"));
150         */
151         // any punctuation
152         TOKEN_TYPE_PATTERNS.put("punc", Pattern.compile("//p{P}+"));
153 
154         // different kinds of numbers
155         // numbers cannot start with 0 (a single 0 is ok)
156         TOKEN_TYPE_PATTERNS.put("num", Pattern.compile("(?:0|[1-9]//d*)"));
157         // digits can start with '0' and can also contain other Unicode digits
158         TOKEN_TYPE_PATTERNS.put("digits", Pattern.compile("//p{N}+"));
159         TOKEN_TYPE_PATTERNS.put("digits+.", Pattern.compile("[//p{N}.]+"));
160         TOKEN_TYPE_PATTERNS.put("digits+.,",
161             Pattern.compile("[//p{N}.,]+"));
162         //TOKEN_TYPE_PATTERNS.put("digits+:", Pattern.compile("[//p{N}:]+"));
163 
164         // alphanumeric also includes dot + comma + dashes
165         TOKEN_TYPE_PATTERNS.put("alphanum",
166             Pattern.compile("[//p{L}//p{M}//p{N}//p{Pd}.,]+"));
167 
168         // different kinds of symbols
169         // mathematical and currency symbols
170         TOKEN_TYPE_PATTERNS.put("math", Pattern.compile("//p{Sm}+"));
171         TOKEN_TYPE_PATTERNS.put("curr", Pattern.compile("//p{Sc}+"));
172         // any symbols
173         TOKEN_TYPE_PATTERNS.put("symb", Pattern.compile("//p{S}+"));
174 
175         // control and other characters
176         TOKEN_TYPE_PATTERNS.put("ctrl", Pattern.compile("//p{C}+"));
177     }
178 
179     /***
180      * The maximum number of ancestors to include in the context representation.
181      */
182     private final int ancestorNumber;
183 
184     /***
185      * The name of the attribute to use for calculating head values,
186      * cf. {@link #calculateHeadValues(Element, List)}. The name must be
187      * compatible to {@link DOMUtils#name(Attribute)}.
188      */
189     private final String headAttribute;
190 
191     /***
192      * The name of the element to use for calculating head values,
193      * cf. {@link #calculateHeadValues(Element, List)}. The name must be
194      * compatible to {@link DOMUtils#name(Element)}.
195      */
196     private final String headElement;
197 
198     /***
199      * An unmodifiable set of names of default attributes. Contains Strings
200      * in a format compatible to {@link DOMUtils#name(Attribute)}.
201      */
202     private final Set defaultAttributes;
203 
204     /***
205      * The number of preceding recognitions to represent in detail.
206      */
207     private final int detailedRecognitions;
208 
209     /***
210      * The basic number of preceding and following siblings to include in the
211      * context representation.
212      */
213     private final int siblingNumber;
214 
215     /***
216      * The maximum length of prefixed and suffixes.
217      */
218     private final int maximumPrefixLength;
219 
220     /***
221      * An array of sensors used to add semantic information.
222      */
223     private final Sensor[] sensors;
224 
225     /***
226      * Creates a new instance based on the
227      * {@linkplain TiesConfiguration#CONF standard configuration}.
228      *
229      * @throws ProcessingException if an error occurs while initializing this
230      * instance
231      */
232     public DefaultRepresentation() throws ProcessingException {
233         this(TiesConfiguration.CONF);
234     }
235 
236     /***
237      * Creates a new instance based on the provided configuration.
238      *
239      * @param config used to configure this instance
240      * @throws ProcessingException if an error occurs while initializing this
241      * instance
242      */
243     public DefaultRepresentation(final TiesConfiguration config)
244     throws ProcessingException {
245         this(config.getInt(CONFIG_RECOGN_NUM),
246             config.getInt("representation.recogn.detailed"),
247             config.getInt("representation.ancestor.num"),
248             config.getInt("representation.sibling.num"),
249             config.getInt(CONFIG_SPLIT_MAXIMUM),
250             config.getInt("representation.prefix.maxlength"),
251             config.getString("representation.head.element"),
252             config.getString("representation.head.attrib"),
253             config.getStringArray("representation.default.attribs"),
254             config.getInt(CONFIG_STORE_NTH),
255             config.getString(IOUtils.KEY_LOCAL_CHARSET, null),
256             config.getStringArray("representation.sensors"),
257             config);
258     }
259 
260     /***
261      * Creates a new instance based on the provided configuration.
262      *
263      * @param config used to configure this instance
264      * @param suffix optional suffix for
265      * {@linkplain de.fu_berlin.ties.TiesConfiguration#adaptKey(String, String)
266      * adapting configuration keys} if not <code>null</code> (currently ignored)
267      * @throws ProcessingException if an error occurs while initializing this
268      * instance
269      */
270     public DefaultRepresentation(final TiesConfiguration config,
271             final String suffix)
272     throws ProcessingException {
273         this(config);
274     }
275 
276     /***
277      * Creates a new instance. All element/attribute names must be compatible
278      * to {@link DOMUtils#name(Element)}.
279      *
280      * @param recogNum the number of preceding recognitions to represent
281      * @param detailedRecogs the number of preceding recognitions to represent
282      * in detail
283      * @param numberOfAncestors the maximum number of ancestors to include in
284      * the context representation
285      * @param numberOfSiblings the basic number of preceding and following
286      * siblings to include
287      * @param splitMax the maximum number of subsequences to keep when
288      * a feature value must be split (at whitespace).
289      * @param prefixMax the maximum length of prefixed and suffixes
290      * @param headElementName the name of the element to use for
291      * calculating head values (cf. {@link #calculateHeadValues(Element, List)})
292      * @param headAttribName the name of the attribute to use for
293      * calculating head values
294      * @param defaultAttribs array of names of default attributes
295      * @param outCharset the output character set to use (only used to
296      * store some configurations for inspection purposes, if <code>n</code>
297      * &gt; 0); if <code>null</code>, the default charset of the current
298      * platform is used
299      * @param n Each <em>n</em>-th context representation is stored if &gt; 0;
300      * otherwise no representation is stored
301      * @param sensorNames array of fully specified names of classes
302      * implementing the {@link Sensor} interface to be used to look up semantic
303      * information
304      * @param config used to configure the sensors
305      * @throws ProcessingException if an error occurs while initializing this
306      * instance
307      */
308     public DefaultRepresentation(final int recogNum, final int detailedRecogs,
309             final int numberOfAncestors, final int numberOfSiblings,
310             final int splitMax, final int prefixMax,
311             final String headElementName, final String headAttribName,
312             final String[] defaultAttribs, final int n,
313             final String outCharset, final String[] sensorNames,
314             final TiesConfiguration config) throws ProcessingException {
315         super(recogNum, splitMax, n, outCharset);
316         detailedRecognitions = detailedRecogs;
317         ancestorNumber = numberOfAncestors;
318         siblingNumber = numberOfSiblings;
319         maximumPrefixLength = prefixMax;
320 
321         headElement = headElementName;
322         headAttribute = headAttribName;
323 
324         // create unmodifyable set of default attributes
325         final Set<String> defAttribSet = new HashSet<String>();
326         for (int i = 0; i < defaultAttribs.length; i++) {
327             defAttribSet.add(defaultAttribs[i]);
328         }
329         defaultAttributes = Collections.unmodifiableSet(defAttribSet);
330         // initialize sensors
331         sensors = BaseSensor.createSensors(sensorNames, config);
332     }
333 
334     /***
335      * Builds the context representation of text in an element. Returns a
336      * feature vector of all context features considered relevant for
337      * representation.
338      *
339      * @param element the element whose context should be represented
340      * @param leftText textual content to the left of (preceding)
341      * <code>mainText</code>, might be empty
342      * @param mainText the main textual content to represent, might be empty
343      * @param rightText textual content to the right of (following)
344      * <code>mainText</code>, might be empty
345      * @param priorRecognitions a buffer of the last {@link Recognition}s from
346      * the document, created by calling {@link Representation#initDocument};
347      * might be <code>null</code>
348      * @param featureCache a cache of (local) feature, should be re-used
349      * between all calls for the nodes in a single document (but must not be
350      * re-used when building the context of nodes in different documents!)
351      * @param logPurpose the type of contexts of main interest to the caller
352      * (e.g. "Token" or "Sentence"), used for logging
353      * @return a vector of features considered relevant for representation
354      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
355      * contains objects that aren't {@link Recognition}s
356      * @throws IllegalArgumentException if the specified node is of an
357      * unsupported type
358      */
359     protected FeatureVector doBuildContext(final Element element,
360             final String leftText, final String mainText,
361             final String rightText, final PriorRecognitions priorRecognitions,
362             final Map<Element, List<LocalFeature>> featureCache,
363             final String logPurpose)
364             throws ClassCastException, IllegalArgumentException {
365         LinkedList<Feature> selfFeatures = new LinkedList<Feature>();
366         LinkedList<Feature> precedingFeatures = new LinkedList<Feature>();
367         LinkedList<Feature> followingFeatures = new LinkedList<Feature>();
368         LinkedList<Feature> ancestorFeatures = new LinkedList<Feature>();
369         LinkedList<Feature> ancestorSiblingFeatures = new LinkedList<Feature>();
370         final FeatureVector fullRep = new DefaultFeatureVector();
371 
372         // build pseudo-axis of prior recognitions, if any
373         List priorFeatures;
374         if ((getRecognitionNumber() > 0) && (priorRecognitions != null)) {
375             priorFeatures = buildPrior(priorRecognitions);
376         } else {
377             priorFeatures = null;
378         }
379 
380         // build sibling axes
381         ElementPosition position =
382             handleSiblings("", element, getSiblingNumber(),
383             precedingFeatures, followingFeatures, featureCache);
384 /*           Util.LOG.debug("Build preceding (size: " + precedingFeatures.size()
385                 + ") + followingFeatures (size: + " + followingFeatures.size()
386                 + "; cache size: " + featureCache.size() + ")"); */
387 
388         // build descendant-or-self axis, prepending start marker
389         selfFeatures.add(new GlobalFeature("Descendant-or-self axis"));
390         selfFeatures.add(new GlobalFeature(AXIS_DESC_OR_SELF,
391             LocalFeature.MARKER_START));
392         buildFeatures(AXIS_DESC_OR_SELF, element, position, true,
393             selfFeatures, true, featureCache);
394         buildTextFeatures(AXIS_DESC_OR_SELF, element, leftText.trim(),
395             mainText.trim(), rightText.trim(), selfFeatures);
396 /*            Util.LOG.debug("Build descendant-or-self (size: "
397                 + selfFeatures.size() + ", cache size: " + featureCache.size()
398                 + ")"); */
399 
400         // build ancestor (prepending start marker) + ancestor-siblings axes
401         handleAncestors(element, getAncestorNumber(),
402             getSiblingNumber() - 1, ancestorFeatures,
403             ancestorSiblingFeatures, new HashBag(), featureCache);
404         ancestorFeatures.addFirst(new GlobalFeature(AXIS_ANCESTOR,
405             LocalFeature.MARKER_START));
406         ancestorFeatures.addFirst(new GlobalFeature("Ancestor axis"));
407 /*            Util.LOG.debug("Build ancestors (size: " + ancestorFeatures.size()
408                 + ") incl. siblings (size: + " + ancestorSiblingFeatures.size()
409                 + "; cache size: " + featureCache.size() + ")"); */
410 
411         // combine all lists into full representation, start w/prior recog.s
412         if (priorFeatures != null) {
413             fullRep.addAll(priorFeatures);
414             priorFeatures = null;
415         }
416 
417         // preceding siblings + self + following siblings
418         fullRep.addAll(precedingFeatures);
419         precedingFeatures = null;
420         fullRep.addAll(selfFeatures);
421         fullRep.addAll(followingFeatures);
422         followingFeatures = null;
423 
424         // ancestors + self + ancestor-siblings:
425         fullRep.addAll(ancestorFeatures);
426         ancestorFeatures = null;
427         fullRep.addAll(selfFeatures);
428         selfFeatures = null;
429         fullRep.addAll(ancestorSiblingFeatures);
430         ancestorSiblingFeatures = null;
431 /*            Util.LOG.debug("Combined axes into single representation (size: "
432                 + fullRep.size() + ", cache size: " + featureCache.size()
433                 + ")"); */
434 
435         // create and append filtered representations
436         final List filteredRep = filterRepresentation(fullRep);
437 
438         // remove unnecessary markers from filtered rep.s and add them
439         removeExtraMarkers(filteredRep);
440         fullRep.addAll(filteredRep);
441 
442 /*        Util.LOG.debug("Build " + logPurpose + " representation for "
443             + DOMUtils.showToken(element, mainText) + " (full size: "
444             + fullRep.size() + ", cache size: " + featureCache.size()
445             + ")"); */
446 
447         return fullRep;
448     }
449 
450     /***
451      * Builds the features of an element and appends them to the
452      * specified <code>featureList</code>. Handles attributes and calculated
453      * values and the element itself. Child elements are only handled when
454      * <code>recurseInsteadOfText</code> it <code>true</code> -- the axis name
455      * is not changed for child elements.
456      *
457      * @param axisName the name of the axis, used to start each feature
458      * @param element the element to process
459      * @param position wraps the position of the element within its parent
460      * element and related data, used for calculating positional features;
461      * if <code>null</code>, no positional features are calculated
462      * @param recurseInsteadOfText if <code>true</code> child elements
463      * are recursively processed, but no
464      * {@linkplain #calculateValuesFromText(String, String, List) values from
465      * text} are calculated; otherwise text is processed but child elements are
466      * not
467      * @param featureList the list of {@link GlobalFeature}s to add features to
468      * @param addAtEnd whether to add the new features at the end of at the
469      * beginning of the <code>featureList</code>
470      * @param cache a cache of local feature, mapping {@link Element}s to
471      * {@link List}s of {@link LocalFeature}
472      */
473     protected void buildFeatures(final String axisName, final Element element,
474             final ElementPosition position, final boolean recurseInsteadOfText,
475             final LinkedList<Feature> featureList, final boolean addAtEnd,
476             final Map<Element, List<LocalFeature>> cache) {
477         final List<LocalFeature> localFeatures;
478 
479         if ((!recurseInsteadOfText) && cache.containsKey(element)) {
480             // we can only use the cache when textual contents are included
481             localFeatures = cache.get(element);
482 /*            Util.LOG.debug("Re-used cached features for "
483                 + DOMUtils.showElement(element)); */
484         } else {
485             // build local features, with or without textual features
486             localFeatures = buildLocalFeatures(element, position,
487                 recurseInsteadOfText);
488 
489             // cache them if the are complete (including textual features)
490             if (!recurseInsteadOfText) {
491                 cache.put(element, localFeatures);
492             }
493         }
494 
495         // convert to global features and append to list
496         final Iterator<LocalFeature> localIter = localFeatures.iterator();
497         GlobalFeature.globalize(axisName, localIter, featureList, addAtEnd);
498         //Util.LOG.debug("Globalized local features");
499 
500         if (recurseInsteadOfText) {
501             // process child elements, without calculating positional values
502             final Iterator childIter = element.elementIterator();
503             Element child;
504             while (childIter.hasNext()) {
505                 child = (Element) childIter.next();
506                 buildFeatures(axisName, child, null, recurseInsteadOfText,
507                     featureList, addAtEnd, cache);
508             }
509         }
510     }
511 
512     /***
513      * Builds the local features of an element. Handles attributes and
514      * calculated values and the element itself, but no child elements.
515      * The {@link LocalFeature}s can be stored in a cache for re-use; they must
516      * be combined with an axis name for getting a global feature.
517      *
518      * @param element the element to process
519      * @param position wraps the position of the element within its parent
520      * element and related data, used for calculating positional features;
521      * if <code>null</code>, no positional features are calculated
522      * @param ignoreText if <code>true</code>, no {@linkplain
523      * #calculateValuesFromText(String, String, List) values from text} or
524      * {@linkplain #calculateHeadValues(Element, List) head values} are
525      * calculated
526      * @return returns a List of created {@link LocalFeature}s
527      */
528     protected List<LocalFeature> buildLocalFeatures(final Element element,
529             final ElementPosition position, final boolean ignoreText) {
530         final List<LocalFeature> result = new LinkedList<LocalFeature>();
531         final String name = DOMUtils.name(element);
532 
533         // Process attributes sorted according to their names.
534         // Map contains name/value mappings.
535         final SortedMap<String, String> attribMap =
536             new TreeMap<String, String>();
537         final Iterator attribIter = element.attributeIterator();
538         Attribute currentAttrib;
539         String attName;
540         boolean foundDefaultAttrib = false;
541 
542         while (attribIter.hasNext()) {
543             currentAttrib = (Attribute) attribIter.next();
544             attName = DOMUtils.name(currentAttrib);
545             attribMap.put(attName, currentAttrib.getValue());
546 
547             if (!foundDefaultAttrib
548                     && getDefaultAttributes().contains(attName)) {
549                 // at least one of the default attributes is present
550                 foundDefaultAttrib = true;
551             }
552         }
553 
554         // no default attribute present: include stand-alone element name
555         if (!foundDefaultAttrib) {
556             result.add(LocalFeature.createElementFeature(name));
557             //Util.LOG.debug("Created element feature for " + name);
558         }
559 
560         final Iterator<String> sortedIter = attribMap.keySet().iterator();
561         String attValue;
562 
563         while (sortedIter.hasNext()) {
564             attName = sortedIter.next();
565             attValue = attribMap.get(attName);
566             //Util.LOG.debug("Creating " + attName + " attribute feature");
567             result.addAll(LocalFeature.createAttributeFeatures(name, attName,
568                 attValue, getSplitMaximum()));
569             //Util.LOG.debug("Created " + attName + " attribute feature");
570         }
571 
572         //Util.LOG.debug("Created attribute features");
573 
574         // calculate positional values, if there is a parent
575         if (position != null) {
576             calculatePositionalValues(name, position, result);
577             //Util.LOG.debug("Calculated positional values");
578         }
579 
580         if (!ignoreText) {
581             // check textual content
582             final String trimmedText = element.getTextTrim();
583 
584             if ((trimmedText != null) && (trimmedText.length() > 0)) {
585                 // calculate values from text incl. text itself
586                 calculateValuesFromText(name, trimmedText, result);
587             } else {
588                 // check whether there are head values instead
589                  calculateHeadValues(element, result);
590             }
591         }
592 
593         return result;
594     }
595 
596     /***
597      * Builds the pseudo-axis of prior recognitions.
598      *
599      * @param priorRecognitions a buffer of the last {@link Recognition}s from
600      * the document, created by calling {@link Representation#initDocument}
601      * @return a list of {@link GlobalFeature}s representing prior recognitions
602      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
603      * contains objects that aren't {@link Recognition}s
604      */
605     protected List<Feature> buildPrior(
606             final PriorRecognitions priorRecognitions)
607             throws ClassCastException {
608         final LinkedList<Feature> result =
609             new LinkedList<Feature>();
610 
611         // start with comment and start marker
612         result.add(new GlobalFeature("Prior recognitions"));
613         result.add(new GlobalFeature(AXIS_PRIOR, LocalFeature.MARKER_START));
614 
615         // prepare to iterate prior recognitions
616         final Iterator recogIter = priorRecognitions.iterator();
617         PriorRecognitions.Pair currentPair;
618         Recognition currentRecognition;
619         String type;
620         String trimmedText;
621         LinkedList<Feature> recogFeatures;
622         final List<LocalFeature> calculatedValues =
623             new LinkedList<LocalFeature>();
624 
625         // number of recognitions to describe without details
626         final int briefRecogs =
627             priorRecognitions.size() - getDetailedRecognitions();
628 
629         // log an error if the buffer is too large
630         if (priorRecognitions.size() > getRecognitionNumber()) {
631             Util.LOG.error("Buffer of prior recognitions too large ("
632                 + priorRecognitions.size() + " instead of "
633                 + getRecognitionNumber() + " elements) -- use "
634                 + "Representation.createRecognitionBuffer() to create it");
635         }
636 
637         // process prior recognitions
638         for (int i = 0; recogIter.hasNext(); i++) {
639             currentPair = (PriorRecognitions.Pair) recogIter.next();
640             currentRecognition = currentPair.getRecognition();
641             type = currentRecognition.getType();
642 
643             // check whether features are cached
644             recogFeatures = currentPair.getCachedFeatures();
645             if (recogFeatures == null) {
646                 // features are not yet cached
647                 recogFeatures = new LinkedList<Feature>();
648 
649                 // include feature type for all recognitions
650                 recogFeatures.add(new GlobalFeature(AXIS_PRIOR,
651                     LocalFeature.createElementFeature(type)));
652 
653                 if (i >= briefRecogs) {
654                     // include further details for last n=detailedRecognitions:
655                     // calculate values from the text incl. text itself (if any)
656                     trimmedText = currentRecognition.getText().trim();
657                     if (StringUtils.isNotEmpty(trimmedText)) {
658                         calculateValuesFromText(type, trimmedText,
659                                 calculatedValues);
660 
661                             // store as global features and clear list
662                             GlobalFeature.globalize(AXIS_PRIOR,
663                                     calculatedValues.iterator(), recogFeatures,
664                                     true);
665                             calculatedValues.clear();
666                     }
667                 }
668 
669                 // cache features if the recognition is sealed
670                 if (currentRecognition.isSealed()) {
671                     currentPair.setCachedFeatures(recogFeatures);
672                 }
673             } else {
674                 // features are cached
675 
676                 if (i < briefRecogs) {
677                     // only include type (very first feature) for oldest
678                     // recognitions -- discard all other features
679                     while (recogFeatures.size() > 1) {
680                         recogFeatures.removeLast();
681                     }
682                 }
683             }
684 
685             // add features of recognition to result list
686             result.addAll(recogFeatures);
687 
688             if (!recogIter.hasNext()) {
689                 // last recognition: check whether it is finished (sealed)
690                 result.addLast(new GlobalFeature(AXIS_PRIOR,
691                     LocalFeature.createCalculatedFeature(type, "finished",
692                         String.valueOf(currentRecognition.isSealed()))));
693             }
694         }
695         return result;
696     }
697 
698     /***
699      * Builds the context representation of text in an element, differentiating
700      * between three kinds of textual contents: a left part, a main part, and
701      * a right part. Each text part is optional (if all of them are empty,
702      * this method does nothing).
703      *
704      * @param axisName the name of the axis, used to start each feature
705      * @param element the element whose context should be represented
706      * @param trimmedLeft trimmed textual content to the left of (preceding)
707      * <code>trimmedMain</code>, might be empty
708      * @param trimmedMain trimmed main textual content to represent, might
709      * be empty
710      * @param trimmedRight trimmed textual content to the right of (following)
711      * <code>trimmedMain</code>, might be empty
712      * @param featureList a list of {@link GlobalFeature}s to add the values to
713      */
714     protected void buildTextFeatures(final String axisName,
715             final Element element, final String trimmedLeft,
716             final String trimmedMain, final String trimmedRight,
717             final LinkedList<Feature> featureList) {
718         final String name = DOMUtils.name(element);
719         final List<LocalFeature> localTextFeatures =
720             new LinkedList<LocalFeature>();
721 
722         // the feature mark for calculated values to used to introduce left
723         // + right features
724         final String innerMarker = FeatureType.CALCULATED.getMark();
725 
726         if (trimmedLeft.length() > 0) {
727             calculateValuesFromText(name + innerMarker + "left", trimmedLeft,
728                 localTextFeatures);
729         }
730         if (trimmedMain.length() > 0) {
731             calculateValuesFromText(name, trimmedMain, localTextFeatures);
732         }
733         if (trimmedRight.length() > 0) {
734             calculateValuesFromText(name + innerMarker + "right", trimmedRight,
735                 localTextFeatures);
736         }
737 
738         // globalize + append features
739         GlobalFeature.globalize(axisName, localTextFeatures.iterator(),
740             featureList, true);
741     }
742 
743     /***
744      * Creates values that depend on "head" children of an element, if the
745      * element contains any of them. This method is meant for element that
746      * don't contain any textual content, so their children are checked
747      * instead.
748      *
749      * This implementation proceeds as follows:
750      * <ul>
751      * <li>If the specified element is of type {@link #getHeadElement()}, the
752      * value of the {@link #getHeadAttribute()} or else the textual content of
753      * its right-most child element is stored (in a calculated value named
754      * "head"). Child elements are iterated from right to left unless one
755      * containing an appropriate attribute or textual content is found (or none
756      * are left).
757      * <li>Otherwise, if the element contains children of the type
758      * {@link #getHeadElement()}, the first and last child element of this type
759      * are recursively processed and the results stored in calculated names
760      * named "lhead" and "rhead". Of there is only a single child of appropriate
761      * type, the "lhead" value is omitted.
762      * <li>Otherwise no values are calculated.
763      * </ul>
764      *
765      * <p>If a value contains whitespace, only the final subsequence following
766      * all whitespace is preserved.
767      *
768      * @param element the element to process
769      * @param values a list of {@link LocalFeature}s to add the calculated
770      * values to
771      */
772     protected void calculateHeadValues(final Element element,
773             final List<LocalFeature> values) {
774         final String elementName = DOMUtils.name(element);
775         if (getHeadElement().equals(elementName)) {
776             // element is of suitable type
777             final String headValue = determineHeadValue(element);
778 
779             // store if not empty
780             if (headValue.length() > 0) {
781                 values.add(LocalFeature.createCalculatedFeature(elementName,
782                     "head", headValue));
783             }
784         } else {
785             // process children of suitable type
786             final List suitableChildren = DOMUtils.elementsByName(element,
787                 getHeadElement());
788             final int numberOfChildren = suitableChildren.size();
789 
790             if (numberOfChildren >= 1) {
791                 // process right-most child
792                 final Element rightMostChild =
793                     (Element) suitableChildren.get(numberOfChildren - 1);
794                 final String rHeadValue = determineHeadValue(rightMostChild);
795 
796                 // store if not empty
797                 if (rHeadValue.length() > 0) {
798                     values.add(LocalFeature.createCalculatedFeature(
799                         elementName, "rhead", rHeadValue));
800                 }
801 
802                 if (numberOfChildren >= 2) {
803                     // process left-most child
804                     final Element leftMostChild =
805                         (Element) suitableChildren.get(0);
806                     final String lHeadValue = determineHeadValue(leftMostChild);
807 
808                     // store if not empty
809                     if (lHeadValue.length() > 0) {
810                         values.add(LocalFeature.createCalculatedFeature(
811                             elementName, "lhead", lHeadValue));
812                     }
813                 }
814             }
815         }
816     }
817 
818     /***
819      * Calculates values that depend on the position of an element within its
820      * parent.
821      *
822      * @param elementName the name of the element to process, as returned by
823      * {@link DOMUtils#name(Element)}
824      * @param position wraps the position of the element within its parent
825      * element and related data, must not be <code>null</code>
826      * @param values a list to add the calculated values to
827      */
828     protected void calculatePositionalValues(final String elementName,
829             final ElementPosition position, final List<LocalFeature> values) {
830         // store exact position, counting all children
831         values.add(LocalFeature.createCalculatedFeature(elementName, "pos",
832             Integer.toString(position.getOverallPosition())));
833 
834         // store exact position, counting only children with same type
835         values.add(LocalFeature.createCalculatedFeature(elementName,
836             "typedPos", Integer.toString(position.getTypedPosition())));
837 
838         // store rough positions
839         values.add(LocalFeature.createCalculatedFeature(elementName,
840             "roughPos", determineRoughPosition(position.getOverallPosition(),
841                 position.getAllChildren())));
842         values.add(LocalFeature.createCalculatedFeature(elementName,
843             "roughTypedPos", determineRoughPosition(position.getTypedPosition(),
844                 position.getTypedChildren())));
845     }
846 
847     /***
848      * Calculates values that depend on the textual content of an element:
849      * prefixes, suffixes, length data, and token type. Also includes the text
850      * itself.
851      *
852      * @param elementName the name of the element to process, as returned by
853      * {@link DOMUtils#name(Element)}
854      * @param trimmedText the trimmed textual content of the element to
855      * process, must not be empty
856      * @param values a list of {@link LocalFeature}s to add the calculated
857      * values to
858      * @throws IllegalArgumentException if the empty string was given as
859      * <code>trimmedText</code>
860      */
861     protected void calculateValuesFromText(final String elementName,
862             final String trimmedText, final List<LocalFeature> values)
863             throws IllegalArgumentException {
864         final int textLength = trimmedText.length();
865         if (textLength == 0) {
866             throw new IllegalArgumentException(
867                 "Specifed trimmed text must not be empty!");
868         }
869 
870         // split at whitespace, if any
871         final String[] tokens = TextUtils.splitString(trimmedText, -1);
872         final String firstTokenLC = tokens[0].toLowerCase();
873         final String lastTokenLC;
874 
875         if (tokens.length == 1) {
876             // no whitespace -- single token
877             lastTokenLC = firstTokenLC;
878         } else {
879             lastTokenLC = tokens[tokens.length - 1].toLowerCase();
880         }
881 
882         // store starting 1--x letters from first token (but only if the
883         // token contains enough letters; using lower case)
884         final int maxPrefixLength = Math.min(maximumPrefixLength,
885             firstTokenLC.length() - 1);
886         for (int i = 1; i <= maxPrefixLength; i++) {
887             values.add(LocalFeature.createCalculatedFeature(elementName,
888                  "prefix" + i, firstTokenLC.substring(0, i)));
889         }
890 
891         // store trailing 1--x letters from last token (but only if the
892         // token contains enough letters; using lower case)
893         final int maxSuffixLength = Math.min(maximumPrefixLength,
894             lastTokenLC.length() - 1);
895         for (int i = 1; i <= maxSuffixLength; i++) {
896             values.add(LocalFeature.createCalculatedFeature(elementName,
897                 "suffix" + i, lastTokenLC.substring(lastTokenLC.length() - i)));
898         }
899 
900         // store length
901         values.add(LocalFeature.createCalculatedFeature(elementName, "length",
902             Integer.toString(textLength)));
903 
904         // also store rounded square root of length as a less sparse feature
905         final long sqrtLength = Math.round(Math.sqrt(textLength));
906         values.add(LocalFeature.createCalculatedFeature(elementName,
907             "sqrtLength", Long.toString(sqrtLength)));
908 
909         // determine the token type (of last token, if there are several)
910         final Set typeSet = TOKEN_TYPE_PATTERNS.keySet();
911         final Iterator typeIter = typeSet.iterator();
912         String currentDesc;
913         Pattern currentPattern;
914         Matcher currentMatcher;
915         boolean foundMatch = false;
916         String matchingType = "mixed"; // use if no match is found
917 
918         // try each pattern until one of them matches (or none are left)
919         while (typeIter.hasNext() && !foundMatch) {
920             currentDesc = (String) typeIter.next();
921             currentPattern = (Pattern) TOKEN_TYPE_PATTERNS.get(currentDesc);
922             currentMatcher = currentPattern.matcher(tokens[tokens.length - 1]);
923 
924             if (currentMatcher.matches()) {
925                 foundMatch = true;
926                 matchingType = currentDesc; // overwrite value
927             }
928         }
929 
930         // append "+multi" if there are several whitespace-separated tokens
931         final String fullType = (tokens.length > 1)
932             ? matchingType + "+multi" : matchingType;
933         values.add(LocalFeature.createCalculatedFeature(elementName,
934             "tokenType", fullType));
935 
936         // lookup information from sensors (for the whole text, not per token)
937         KeyValue[] sensorData;
938         String prefix;
939         int j;
940 
941         for (int i = 0; i < sensors.length; i++) {
942             // prefix keys of each sensor with "i$" (e.g. "0$") to avoid
943             // naming collisions
944             prefix = i + FeatureType.CALCULATED.getMark();
945             sensorData = sensors[i].lookup(trimmedText);
946 
947             for (j = 0; j < sensorData.length; j++) {
948                 values.add(LocalFeature.createCalculatedFeature(elementName,
949                     prefix + sensorData[j].getKey(),
950                     Util.asString(sensorData[j].getValue())));
951             }
952         }
953 
954         // add textual content itself -- in lower case and "as is"
955         values.addAll(LocalFeature.createCalculatedFeatures(elementName, "lc",
956             trimmedText.toLowerCase(), getSplitMaximum()));
957         values.addAll(LocalFeature.createTextFeatures(elementName, trimmedText,
958             getSplitMaximum()));
959     }
960 
961     /***
962      * Helper method for determining the head value for an element of type
963      * {@link #getHeadElement()}. See
964      * {@link #calculateHeadValues(Element, List)} for a description of the
965      * algorithm.
966      *
967      * @param element the element to process, must be of type
968      * {@link #getHeadElement()}
969      * @return the calculated head value, or the empty string if no value
970      * could be calculated (the specified element doesn't have any children,
971      * or the right-most child contains neither {@link #getHeadAttribute()} nor
972      * textual content)
973      */
974     protected String determineHeadValue(final Element element) {
975         String fullResult = null;
976         final List children = element.elements();
977         final ListIterator childIter = children.listIterator(children.size());
978         Element currentChild;
979         Attribute headAttrib;
980         boolean done = false;
981 
982         // iterate children from right to left unless required attribute or
983         // textual content is present
984         while (childIter.hasPrevious() && !done) {
985             currentChild = (Element) childIter.previous();
986             headAttrib = DOMUtils.attributeByName(currentChild,
987                 getHeadAttribute());
988 
989             // check attribute value or else trimmed textual content
990             if (headAttrib != null) {
991                 fullResult = headAttrib.getText();
992             } else {
993                 fullResult = element.getTextTrim();
994             }
995 
996             // we're done when we got an non-empty string
997             done = (fullResult != null) && (fullResult.length() > 0);
998         }
999 
1000         final String result;
1001         if (fullResult != null) {
1002             // split text, only keeping last subsequence after any whitespace
1003             final String[] splits = TextUtils.splitString(fullResult, 1);
1004             result = splits[0];
1005         } else {
1006             result = "";
1007         }
1008         return result;
1009     }
1010 
1011     /***
1012      * Helper method called by
1013      * {@link #calculatePositionalValues(String, ElementPosition, List)} to
1014      * collapse a position in to one of five values.
1015      * <dl>
1016      * <dt>first</dt>
1017      * <dd>for the first element</dd>
1018      * <dt>early</dt>
1019      * <dd>if position is within the first third of all elements (but not the
1020      * first one), upper limit included</dd>
1021      * <dt>middle</dt>
1022      * <dd>if position is within the second third of all elements, limits
1023      * excluded</dd>
1024      * <dt>late</dt>
1025      * <dd>if position is within the last third of all elements (but not the
1026      * last one), lower limit included</dd>
1027      * <dt>last</dt>
1028      * <dd>for the last element</dd>
1029      * </dl>
1030      *
1031      * @param position the position counted from 0, should be non-negative and
1032      * smaller than <code>elementCount</code> (otherwise the results are
1033      * undefined)
1034      * @param elementCount the number of elements
1035      * @return a String containing one of the five values described above
1036      */
1037     protected String determineRoughPosition(final int position,
1038             final int elementCount) {
1039         // for easier counting
1040         final int realPosition = position + 1;
1041         final String result;
1042 
1043         if (realPosition == 1) {
1044             result = "first";
1045         } else if (realPosition == elementCount) {
1046             result = "last";
1047         } else {
1048             final double oneThird = elementCount / 3.0;
1049             if (realPosition <= oneThird) {
1050                 result = "early";
1051             } else if (realPosition >= (2 * oneThird)) {
1052                 result = "late";
1053             } else {
1054                 result = "middle";
1055             }
1056         }
1057         return result;
1058     }
1059 
1060     /***
1061      * Creates a filtered view of a context representation. A filtered
1062      * representation is created for each {@linkplain FeatureType type of
1063      * features} that occurs at least twice;
1064      * attributes and calculated values occuring only once are omitted.
1065      * For textual content, another filtered representation is created.
1066      *
1067      * <p>Features representing markers ({@link FeatureType#MARKER}),
1068      * stand-alone elements ({@link FeatureType#ELEMENT}) and default attributes
1069      * ({@link #getDefaultAttributes()}) are included in all filtered
1070      * representations. Comment-only features are ignored.
1071      *
1072      * @param originalRep a feature vector containing the representation
1073      * to filter
1074      * @return a list of {@link de.fu_berlin.ties.classify.feature.Feature}s
1075      * combining the filtered representations created by this method
1076      */
1077     protected List<Feature> filterRepresentation(
1078             final FeatureVector originalRep) {
1079         // features to include in all views (markers, stand-alone elements,
1080         // default attributes)
1081         final List<Feature> universalFeatures = new LinkedList<Feature>();
1082         // textual features
1083         final LinkedList<Feature> textualFeatures = new LinkedList<Feature>();
1084         // multi-map mapping mark + name of attributes/calculated values to
1085         // LinkedLists of filtered features
1086         final MultiMap featureMap = new MultiHashMap();
1087         // count features of each attribute/calculated type
1088         final SortedBag featureCount = new TreeBag();
1089 
1090         final Iterator iter = originalRep.iterator();
1091         GlobalFeature currentGlobal;
1092         LocalFeature currentLocal;
1093         FeatureType currentType;
1094         Iterator keyIter;
1095         String currentKey;
1096         Iterator<Feature> universalIter;
1097         Iterator innerIter;
1098         Feature innerFeature;
1099 
1100         // iterate original representation
1101         while (iter.hasNext()) {
1102             currentGlobal = (GlobalFeature) iter.next();
1103             currentLocal = currentGlobal.getLocalFeature();
1104 
1105             if (currentLocal != null) { // otherwise it's a comment -- ignored
1106                 currentType = currentLocal.getType();
1107                 if (((currentType == FeatureType.ATTRIBUTE)
1108                             && (getDefaultAttributes().contains(
1109                                 currentLocal.getName())))
1110                         || (currentType == FeatureType.MARKER)) {
1111                     // include in all views
1112                     universalFeatures.add(currentGlobal);
1113                     textualFeatures.add(currentGlobal);
1114                     keyIter = featureMap.keySet().iterator();
1115                     while (keyIter.hasNext()) {
1116                         currentKey = (String) keyIter.next();
1117                         featureMap.put(currentKey, currentGlobal);
1118                     }
1119                 // use attributes, elements and calculated ...head values
1120                 } else if ((currentType == FeatureType.ATTRIBUTE)
1121                           || (currentType == FeatureType.ELEMENT)
1122                           || ((currentType == FeatureType.CALCULATED)
1123                               && currentLocal.getName().endsWith("head"))) {
1124                     // determine key: use type marker
1125                     currentKey = currentType.getMark();
1126 
1127                     if (!featureMap.containsKey(currentKey)) {
1128                         // create new view, copying collected universal entries
1129                         universalIter = universalFeatures.iterator();
1130                         while (universalIter.hasNext()) {
1131                             featureMap.put(currentKey, universalIter.next());
1132                         }
1133                     }
1134 
1135                     // include in appropriate view + increase count
1136                     featureMap.put(currentKey, currentGlobal);
1137                     featureCount.add(currentKey);
1138                 } else if (currentType == FeatureType.TEXT) {
1139                     // include in textual view
1140                     textualFeatures.add(currentGlobal);
1141                 }
1142                 // elements are ignored in filtered rep.s
1143             }
1144         }
1145 
1146         // combine all views
1147         final List<Feature> result = new LinkedList<Feature>();
1148         final Iterator sortedIter = featureCount.uniqueSet().iterator();
1149         Collection currentView;
1150 
1151         // add view for each suitable attribute/valculated value
1152         while (sortedIter.hasNext()) {
1153             currentKey = (String) sortedIter.next();
1154 
1155             // ignore attributes/calculated values that occur only once or twice
1156             if (featureCount.getCount(currentKey) > 2) {
1157                 result.add(new GlobalFeature("Filtered view for " + currentKey
1158                     + " features"));
1159                 currentView = (Collection) featureMap.get(currentKey);
1160                 innerIter = currentView.iterator();
1161 
1162                 while (innerIter.hasNext()) {
1163                     innerFeature = (Feature) innerIter.next();
1164                     result.add(innerFeature);
1165                 }
1166             }
1167         }
1168 
1169         // add view for textual content
1170         result.add(new GlobalFeature("Filtered view for textual features"));
1171         result.addAll(textualFeatures);
1172 
1173         return result;
1174     }
1175 
1176     /***
1177      * Returns the maximum number of ancestors to include in the context
1178      * representation.
1179      *
1180      * @return the number of ancestors to include
1181      */
1182     public int getAncestorNumber() {
1183         return ancestorNumber;
1184     }
1185 
1186     /***
1187      * Returns the unmodifiable set of names of default attributes. The
1188      * contained names are Strings in a format compatible to
1189      * {@link DOMUtils#name(Attribute)}.
1190      *
1191      * @return the set of default attributes
1192      */
1193     public Set getDefaultAttributes() {
1194         return defaultAttributes;
1195     }
1196 
1197     /***
1198      * Returns the number of preceding recognitions to represent in detail.
1199      * @return the value of the attribute
1200      */
1201     public int getDetailedRecognitions() {
1202         return detailedRecognitions;
1203     }
1204 
1205     /***
1206      * Returns the name of the element to use for calculating head values.
1207      *
1208      * @return the name of the attribute
1209      * @see #calculateHeadValues(Element, List)
1210      */
1211     public String getHeadAttribute() {
1212         return headAttribute;
1213     }
1214 
1215     /***
1216      * Returns the name of the attribute to use for calculating head values.
1217      *
1218      * @return the name of the element
1219      * @see #calculateHeadValues(Element, List)
1220      */
1221     public String getHeadElement() {
1222         return headElement;
1223     }
1224 
1225     /***
1226      * Returns the basic number of preceding and following siblings to include
1227      * in the context representation.
1228      *
1229      * @return the number of siblings to include
1230      */
1231     public int getSiblingNumber() {
1232         return siblingNumber;
1233     }
1234 
1235     /***
1236      * Handles ancestors and ancestor siblings of an element.
1237      *
1238      * @param element the element to process
1239      * @param ancestorsToAdd the number of ancestors to add, must be &gt; 0;
1240      * if &gt; 1, this method calls itself recursively, decreasing the number
1241      * by 1
1242      * @param ancestorSiblingsToAdd the number of preceding/following siblings
1243      * of the current ancestors to add; if 0 or negative, no siblings are added;
1244      * a recursive call decreases this parameter by 1 if any ancestor siblings
1245      * were found (if no siblings were found, the number passes unchanged)
1246      * @param ancestorFeatures the list of {@link GlobalFeature}s to prepend
1247      * the features on the ancestors to
1248      * @param ancestorSiblingFeatures the list of {@link GlobalFeature}s to
1249      * append the features on the ancestors siblings to
1250      * @param processedAncestorNames a bag that will typically be empty when
1251      * first calling this method (will be filled by recursive calls)
1252      * @param cache a cache of local feature, mapping {@link Element}s to
1253      * {@link List}s of {@link LocalFeature}
1254      * @throws IllegalArgumentException if <code>ancestorsToAdd</code> is 0 or
1255      * negative
1256      */
1257     protected void handleAncestors(final Element element,
1258             final int ancestorsToAdd, final int ancestorSiblingsToAdd,
1259             final LinkedList<Feature> ancestorFeatures,
1260             final LinkedList<Feature> ancestorSiblingFeatures,
1261             final Bag processedAncestorNames,
1262             final Map<Element, List<LocalFeature>> cache)
1263             throws IllegalArgumentException {
1264         if (ancestorsToAdd <= 0) {
1265             throw new IllegalArgumentException(
1266                 "Cannot add a non-positive number of ancestors ("
1267                     + ancestorsToAdd + ")!");
1268         }
1269 
1270         // handle parent
1271         final Element parent = element.getParent();
1272         if (parent != null) {
1273             // otherwise there is nothing to do (reached highest node)
1274             final ElementPosition position;
1275 
1276             // process siblings of parent, if required
1277             if (ancestorSiblingsToAdd > 0) {
1278                 // use bag of names to determine axis prefix:
1279                 final String parentName = DOMUtils.name(parent);
1280                 processedAncestorNames.add(parentName);
1281                 final int typeCount
1282                     = processedAncestorNames.getCount(parentName);
1283                 final String nameToQuote;
1284 
1285                 if (typeCount > 2) {
1286                     // repeated occurrence of this element type:
1287                     // prepend repetition counter to element name
1288                     nameToQuote = Integer.toString(typeCount) + parentName;
1289                 } else {
1290                     // first occurrence of this element type: just use the name
1291                     nameToQuote = parentName;
1292                 }
1293                 final String axisPrefix =
1294                     AXIS_ANCESTOR + LocalFeature.quote(nameToQuote);
1295 
1296                 // store both sides in shared temporary list which is then
1297                 // appended to the specified list
1298                 final LinkedList<Feature> siblingFeatures =
1299                     new LinkedList<Feature>();
1300                 position = handleSiblings(axisPrefix, parent,
1301                     ancestorSiblingsToAdd, siblingFeatures, siblingFeatures,
1302                     cache);
1303                 ancestorSiblingFeatures.addAll(siblingFeatures);
1304             } else {
1305                 position = null;
1306             }
1307 
1308             // process parent itself, prepending to list
1309             buildFeatures(AXIS_ANCESTOR, parent, position, false,
1310                 ancestorFeatures, false, cache);
1311 
1312             // recursively handle ancestors of parent, if required
1313             if (ancestorsToAdd > 1) {
1314                 final int parentSilbingsToAdd;
1315 
1316                 // determine how many siblings to process at the next step
1317                 if ((position != null) && (position.getProcessedPreceding()
1318                         + position.getProcessedPreceding() > 0)) {
1319                     // processed one or more ancestor siblings: decrease number
1320                     parentSilbingsToAdd = ancestorSiblingsToAdd - 1;
1321                 } else {
1322                     // no ancestor siblings processed: pass number unchange
1323                     parentSilbingsToAdd = ancestorSiblingsToAdd;
1324                 }
1325 
1326                 handleAncestors(parent, ancestorsToAdd - 1 ,
1327                     parentSilbingsToAdd, ancestorFeatures,
1328                     ancestorSiblingFeatures, processedAncestorNames, cache);
1329             }
1330         }
1331     }
1332 
1333     /***
1334      * Adds the preceding and following siblings of an element.
1335      *
1336      * @param axisPrefix the prefix of the axis name, used to start each
1337      * feature; specify the empty string if no prefix should be used
1338      * @param element the element to process
1339      * @param baseNumber the basic number of siblings to keep; the actual
1340      * number of siblings kept might vary
1341      * @param precedingFeatures the list of {@link GlobalFeature}s to prepend
1342      * the features on the preceding siblings to
1343      * @param followingFeatures the list of {@link GlobalFeature}s to append
1344      * the features on the following siblings to
1345      * @param cache a cache of local feature, mapping {@link Element}s to
1346      * {@link List}s of {@link LocalFeature}
1347      * @return a wrapper of the position of the element with its parent element
1348      * and related data, required by
1349      * {@link #calculatePositionalValues(String, ElementPosition, List)}, or
1350      * <code>null</code> if there is no parent element
1351      */
1352     protected ElementPosition handleSiblings(final String axisPrefix,
1353             final Element element, final int baseNumber,
1354             final LinkedList<Feature> precedingFeatures,
1355             final LinkedList<Feature> followingFeatures,
1356             final Map<Element, List<LocalFeature>> cache) {
1357         final Element parent = element.getParent();
1358         final String name = DOMUtils.name(element);
1359         final ElementPosition result;
1360 
1361         if (parent != null) {
1362             // otherwise there is nothing to do
1363             final Iterator childIter = parent.elementIterator();
1364             Element currentChild;
1365             boolean foundElement = false;
1366             LinkedList<Element> allPrecedingSiblings =
1367                 new LinkedList<Element>();
1368             LinkedList<Element> allFollowingSiblings =
1369                 new LinkedList<Element>();
1370 
1371             // counters for object to return
1372             int allChildren = 0;
1373             int overallPosition = 0;
1374             int typedChildren = 0;
1375             int typedPosition = 0;
1376             boolean sameType;
1377 
1378             while (childIter.hasNext()) {
1379                 currentChild = (Element) childIter.next();
1380 
1381                 // update child count
1382                 allChildren++;
1383                 if (name.equals(DOMUtils.name(currentChild))) {
1384                     sameType = true;
1385                     typedChildren++;
1386                 } else {
1387                     sameType = false;
1388                 }
1389 
1390                 if (currentChild == element) {
1391                     foundElement = true;
1392                 } else {
1393                     if (!foundElement) {
1394                         // add to preceding siblings
1395                         allPrecedingSiblings.addLast(currentChild);
1396 
1397                         // update position count
1398                         overallPosition++;
1399                         if (sameType) {
1400                             typedPosition++;
1401                         }
1402                     } else {
1403                         // add to following siblings
1404                         allFollowingSiblings.addLast(currentChild);
1405                     }
1406                 }
1407             }
1408 
1409             // select siblings to keep
1410             final List selectedPrecedingSiblings = selectPrecedingSiblings(
1411                    element, allPrecedingSiblings, baseNumber);
1412             final List selectedFollowingSiblings = selectFollowingSiblings(
1413                     element, allFollowingSiblings, baseNumber);
1414 
1415             // create object to return
1416             result = new ElementPosition(allChildren, overallPosition,
1417                 typedChildren, typedPosition, selectedPrecedingSiblings.size(),
1418                 selectedFollowingSiblings.size());
1419 
1420             // name axes
1421             final StringBuilder precedingAxisName =
1422                 new StringBuilder(axisPrefix);
1423             final StringBuilder followingAxisName =
1424                 new StringBuilder(axisPrefix);
1425             final String prefixComment;
1426             if (axisPrefix.length() > 0) {
1427                 prefixComment = " with '" + axisPrefix + "' prefix";
1428                 precedingAxisName.append(GlobalFeature.SEP);
1429                 followingAxisName.append(GlobalFeature.SEP);
1430             } else {
1431                 prefixComment = ""; // no prefix
1432             }
1433             precedingAxisName.append(AXIS_PREC_SIBLING);
1434             followingAxisName.append(AXIS_FOLLOW_SIBLING);
1435 
1436             ListIterator siblingIter;
1437             Element currentSibling;
1438 
1439             // prepend preceding siblings, without calculating positional values
1440             siblingIter = selectedPrecedingSiblings.listIterator(
1441                 selectedPrecedingSiblings.size());
1442             while (siblingIter.hasPrevious()) {
1443                 currentSibling = (Element) siblingIter.previous();
1444                 buildFeatures(precedingAxisName.toString(), currentSibling,
1445                         null, false, precedingFeatures, false, cache);
1446             }
1447 
1448             precedingFeatures.addFirst(
1449                 new GlobalFeature(precedingAxisName.toString(),
1450                     LocalFeature.MARKER_START));
1451             precedingFeatures.addFirst(new GlobalFeature(
1452                 "Preceding siblings axis" + prefixComment));
1453 
1454             // append following siblings, without calculating positional values
1455             followingFeatures.addLast(new GlobalFeature(
1456                             "Following siblings axis" + prefixComment));
1457             followingFeatures.addLast(
1458                 new GlobalFeature(followingAxisName.toString(),
1459                     LocalFeature.MARKER_START));
1460 
1461             siblingIter = selectedFollowingSiblings.listIterator();
1462             while (siblingIter.hasNext()) {
1463                 currentSibling = (Element) siblingIter.next();
1464                 buildFeatures(followingAxisName.toString(), currentSibling,
1465                         null, false, followingFeatures, true, cache);
1466             }
1467         } else {
1468             // no parent
1469             result = null;
1470         }
1471         return result;
1472     }
1473 
1474     /***
1475      * {@inheritDoc}
1476      */
1477     public PriorRecognitions initDocument(final File filename,
1478             final TokenizerFactory tFactory)
1479     throws ProcessingException, IOException {
1480         // delegate to sensors and to superclass
1481         for (int i = 0; i < sensors.length; i++) {
1482             sensors[i].initDocument(filename, tFactory);
1483         }
1484 
1485         return super.initDocument(filename, tFactory);
1486     }
1487 
1488     /***
1489      * Modifies a list of {@link GlobalFeature}s to remove extraneous
1490      * {@link FeatureType#MARKER} features. Keeps only the last one of several
1491      * sequential marker features; trailing marker features are removed as well.
1492      *
1493      * @param features the list of features to modify
1494      */
1495     protected void removeExtraMarkers(final List features) {
1496         // iterate backward for easier operation
1497         final ListIterator backwardIter =
1498             features.listIterator(features.size());
1499         GlobalFeature currentGlobal;
1500         LocalFeature currentLocal;
1501         FeatureType currentType;
1502 
1503         // initialized this way to remove trailing marker features
1504         FeatureType lastType = FeatureType.MARKER;
1505 
1506         while (backwardIter.hasPrevious()) {
1507             currentGlobal = (GlobalFeature) backwardIter.previous();
1508             currentLocal = currentGlobal.getLocalFeature();
1509 
1510             if (currentLocal != null) { // otherwise it's a comment -- ignored
1511                 currentType = currentLocal.getType();
1512 
1513                 if ((currentType == FeatureType.MARKER)
1514                         && (lastType == FeatureType.MARKER)) {
1515                     // remove extraneous marker feature
1516                     backwardIter.remove();
1517                 }
1518 
1519                 lastType = currentType;
1520             }
1521         }
1522     }
1523 
1524     /***
1525      * Selects the siblings to keep among all following siblings. This
1526      * implementation always keeps the <code>baseNumber</code> first siblings.
1527      * One of the selected siblings may have a different type (name as returned
1528      * by {@link DOMUtils#name(Element)}) than the main element --
1529      * if there are more with different types, they are skipped.
1530      *
1531      * @param mainElement the element whose siblings should be selected
1532      * @param allFollowingSiblings the list of all following siblings
1533      * @param baseNumber the basic number of siblings to keep; the actual
1534      * number of siblings kept might vary
1535      * @return the subset of following siblings to keep; an empty list if
1536      * there are no siblings to keep
1537      */
1538     protected List<Element> selectFollowingSiblings(final Element mainElement,
1539             final LinkedList<Element> allFollowingSiblings,
1540             final int baseNumber) {
1541         final String elementType = DOMUtils.name(mainElement);
1542         final LinkedList<Element> selectedFollowingSiblings =
1543             new LinkedList<Element>();
1544         final Iterator<Element> siblingIter = allFollowingSiblings.iterator();
1545         Element currentSibling;
1546         boolean addedSiblingOfDifferentType = false;
1547         boolean doAdd;
1548 
1549         // keep up to |baseNumber| siblings from start of list
1550         while (siblingIter.hasNext()
1551                 && (selectedFollowingSiblings.size() < baseNumber)) {
1552             currentSibling = siblingIter.next();
1553 
1554             if (elementType.equals(DOMUtils.name(currentSibling))) {
1555                 doAdd = true;
1556             } else {
1557                 // only add if this is the first differently typed sibling
1558                 doAdd = !addedSiblingOfDifferentType;
1559                 addedSiblingOfDifferentType = true;
1560             }
1561 
1562             if (doAdd) {
1563                 selectedFollowingSiblings.addLast(currentSibling);
1564             }
1565         }
1566 
1567         return selectedFollowingSiblings;
1568     }
1569 
1570     /***
1571      * Selects the siblings to keep among all preceding siblings. This
1572      * implementation always keeps the first sibling and the
1573      * <code>baseNumber</code> last siblings. One of the last siblings may
1574      * have a different type (name as returned by
1575      * {@link DOMUtils#name(Element)}) than the main element --
1576      * if there are more with different types, they are skipped.
1577      * If none of the <code>baseNumber</code> last siblings has a different
1578      * types, the last siblings with a different type (name) is also kept.
1579      *
1580      * @param mainElement the element whose siblings should be selected
1581      * @param allPrecedingSiblings the list of all preceding siblings
1582      * @param baseNumber the basic number of siblings to keep; the actual
1583      * number of siblings kept might vary
1584      * @return the subset of preceding siblings to keep; an empty list if
1585      * there are no siblings to keep
1586      */
1587     protected List<Element> selectPrecedingSiblings(final Element mainElement,
1588             final LinkedList<Element> allPrecedingSiblings,
1589             final int baseNumber) {
1590         final String elementType = DOMUtils.name(mainElement);
1591         final LinkedList<Element> selectedPrecedingSiblings =
1592             new LinkedList<Element>();
1593         final ListIterator<Element> siblingIter =
1594             allPrecedingSiblings.listIterator(allPrecedingSiblings.size());
1595         Element currentSibling;
1596         boolean addedSiblingOfDifferentType = false;
1597         boolean doAdd;
1598 
1599         // keep up to |baseNumber| siblings from end of list
1600         while (siblingIter.hasPrevious()
1601                 && (selectedPrecedingSiblings.size() < baseNumber)) {
1602             currentSibling = siblingIter.previous();
1603 
1604             if (elementType.equals(DOMUtils.name(currentSibling))) {
1605                 doAdd = true;
1606             } else {
1607                 // only add if this is the first differently typed sibling
1608                 doAdd = !addedSiblingOfDifferentType;
1609                 addedSiblingOfDifferentType = true;
1610             }
1611 
1612             if (doAdd) {
1613                 selectedPrecedingSiblings.addFirst(currentSibling);
1614             }
1615         }
1616 
1617         // add first sibling, if not yet done
1618         if (!allPrecedingSiblings.isEmpty()) {
1619             final Element firstSibling = allPrecedingSiblings.getFirst();
1620             final Element firstSelected = selectedPrecedingSiblings.getFirst();
1621 
1622             // no need to check size of selectedPrecedingSiblings: must be > 0
1623             if (firstSelected != firstSibling) {
1624                 selectedPrecedingSiblings.addFirst(firstSibling);
1625             }
1626         }
1627 
1628         return selectedPrecedingSiblings;
1629     }
1630 
1631     /***
1632       * Returns a string representation of this object.
1633       *
1634       * @return a textual representation
1635       */
1636      public String toString() {
1637          return new ToStringBuilder(this)
1638              .appendSuper(super.toString())
1639              .append("detailed recognitions", detailedRecognitions)
1640              .append("number of ancestors", ancestorNumber)
1641              .append("number of siblings", siblingNumber)
1642              .append("maximum prefix length", maximumPrefixLength)
1643              .append("head element", headElement)
1644              .append("head attribute", headAttribute)
1645              .append("default attributes", defaultAttributes)
1646              .append("sensors", ArrayUtils.toString(sensors))
1647              .toString();
1648      }
1649 
1650 }