1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.context;
23
24 import java.util.Iterator;
25 import java.util.Map;
26
27 import org.apache.commons.lang.builder.ToStringBuilder;
28 import org.dom4j.Attribute;
29 import org.dom4j.Element;
30
31 import de.fu_berlin.ties.ProcessingException;
32 import de.fu_berlin.ties.TiesConfiguration;
33 import de.fu_berlin.ties.classify.feature.DefaultFeature;
34 import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
35 import de.fu_berlin.ties.classify.feature.FeatureVector;
36 import de.fu_berlin.ties.io.IOUtils;
37 import de.fu_berlin.ties.text.TextTokenizer;
38 import de.fu_berlin.ties.text.TextUtils;
39 import de.fu_berlin.ties.text.TokenizerFactory;
40 import de.fu_berlin.ties.xml.dom.DOMUtils;
41
42 /***
43 * A simple representation of an text in an element in an XML document.
44 * Instances of this class are thread-safe.
45 *
46 * @author Christian Siefkes
47 * @version $Revision: 1.7 $, $Date: 2006/10/21 16:04:03 $, $Author: siefkes $
48 */
49 public class SimpleRepresentation extends AbstractRepresentation {
50
51 /***
52 * Used to tokenize the textual content of the elements to represent.
53 * Synchronized on itself.
54 */
55 private final TextTokenizer tokenizer;
56
57 /***
58 * Creates a new instance based on the
59 * {@linkplain TiesConfiguration#CONF standard configuration}.
60 *
61 * @throws ProcessingException if an error occurs while initializing this
62 * instance
63 */
64 public SimpleRepresentation() throws ProcessingException {
65 this(TiesConfiguration.CONF);
66 }
67
68 /***
69 * Creates a new instance based on the provided configuration.
70 *
71 * @param config used to configure this instance
72 * @throws ProcessingException if an error occurs while initializing this
73 * instance
74 */
75 public SimpleRepresentation(final TiesConfiguration config)
76 throws ProcessingException {
77 this(config, null);
78 }
79
80 /***
81 * Creates a new instance based on the provided configuration.
82 *
83 * @param config used to configure this instance
84 * @param suffix this suffix can be appended to the used configuration
85 * parameters to give values that are specific for this instance; may be
86 * <code>null</code>
87 * @throws ProcessingException if an error occurs while initializing this
88 * instance
89 */
90 public SimpleRepresentation(final TiesConfiguration config,
91 final String suffix) throws ProcessingException {
92 this(config.getInt(config.adaptKey(CONFIG_RECOGN_NUM, suffix)),
93 config.getInt(config.adaptKey(CONFIG_SPLIT_MAXIMUM, suffix)),
94 config.getInt(config.adaptKey(CONFIG_STORE_NTH, suffix)),
95 config.getString(config.adaptKey(IOUtils.KEY_LOCAL_CHARSET, suffix),
96 null),
97 new TokenizerFactory(config, suffix).createTokenizer(""));
98 }
99
100 /***
101 * Creates a new instance.
102 *
103 * @param recogNum the number of preceding recognitions to represent
104 * @param splitMax the maximum number of subsequences to keep when
105 * a feature value must be split (at whitespace)
106 * @param n Each <em>n</em>-th context representation is stored if > 0;
107 * otherwise no representation is stored
108 * @param outCharset the output character set to use (only used to
109 * store some configurations for inspection purposes, if <code>n</code>
110 * > 0); if <code>null</code>, the default charset of the current
111 * platform is used
112 * @param textTokenizer the tokenizer to use
113 */
114 public SimpleRepresentation(final int recogNum, final int splitMax,
115 final int n, final String outCharset,
116 final TextTokenizer textTokenizer) {
117 super(recogNum, splitMax, n, outCharset);
118 tokenizer = textTokenizer;
119 }
120
121 /***
122 * Creates a feature and adds it to a feature vector. The feature is
123 * created by joining <code>prefix</code> and <code>value</code>, separated
124 * by a colon as separator character.
125 *
126 * @param features the feature vector to append to
127 * @param prefix the prefix of the new feature
128 * @param value the main value of the new feature
129 */
130 protected void addFeature(final FeatureVector features, final String prefix,
131 final String value) {
132 features.add(new DefaultFeature(prefix + ':' + value));
133 }
134
135 /***
136 * Adds feature(s) representing text to a feature vector, using the
137 * instance tokenizer for splitting the text into tokens.
138 *
139 * @param features the feature vector to append to
140 * @param prefix the prefix of the new feature(s)
141 * @param text to text to tokenize and add
142 */
143 protected void addText(final FeatureVector features, final String prefix,
144 final String text) {
145
146 synchronized (tokenizer) {
147 tokenizer.reset(text);
148 String token;
149
150 while ((token = tokenizer.nextToken()) != null) {
151 addFeature(features, prefix, token);
152 }
153 }
154 }
155
156 /***
157 * {@inheritDoc}
158 */
159 protected FeatureVector doBuildContext(final Element element,
160 final String leftText, final String mainText,
161 final String rightText, final PriorRecognitions priorRecognitions,
162 final Map featureCache, final String logPurpose)
163 throws ClassCastException {
164 final FeatureVector result = new DefaultFeatureVector();
165
166
167 addFeature(result, "type", DOMUtils.name(element));
168
169
170 final Iterator attribIter = element.attributeIterator();
171 Attribute attrib;
172 String name;
173 String[] splittedVal;
174 int i;
175
176 while (attribIter.hasNext()) {
177 attrib = (Attribute) attribIter.next();
178 name = DOMUtils.name(attrib);
179 splittedVal = TextUtils.splitString(attrib.getValue(),
180 getSplitMaximum());
181
182 for (i = 0; i < splittedVal.length; i++) {
183 addFeature(result, '@' + name, splittedVal[i]);
184 }
185 }
186
187
188 addText(result, "left", leftText);
189 addText(result, "text", mainText);
190 addText(result, "right", rightText);
191
192
193 if (priorRecognitions != null) {
194 PriorRecognitions.Pair pair;
195 Recognition recognition;
196 Iterator priorIter = priorRecognitions.iterator();
197
198 while (priorIter.hasNext()) {
199 pair = (PriorRecognitions.Pair) priorIter.next();
200 recognition = pair.getRecognition();
201 addFeature(result, "prior", recognition.getType());
202 }
203 }
204
205 return result;
206 }
207
208 /***
209 * Returns a string representation of this object.
210 *
211 * @return a textual representation
212 */
213 public String toString() {
214 return new ToStringBuilder(this)
215 .appendSuper(super.toString())
216 .append("tokenizer", tokenizer)
217 .toString();
218 }
219
220 }