1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.text;
23
24 import java.util.Iterator;
25
26 import org.apache.commons.collections.Bag;
27 import org.apache.commons.collections.bag.HashBag;
28 import org.apache.commons.lang.builder.ToStringBuilder;
29
30 /***
31 * A container that keeps track of the tokens in a document. Instances of this
32 * class are not thread-safe; if you want to share a single instance between
33 * different threads, you have to ensure proper synchronization.
34 *
35 * @author Christian Siefkes
36 * @version $Revision: 1.5 $, $Date: 2004/11/17 15:25:11 $, $Author: siefkes $
37 */
38 public class TokenContainer {
39
40 /***
41 * Returns the repetition of the first token of the {@link #getLast() last}
42 * {@link #add(String) added} string in the original text (counting starts
43 * with 0, as the first occurrence is the "0th repetition").
44 */
45 private int firstTokenInLastRep;
46
47 /***
48 * Returns the index of the first token of the {@link #getLast() last}
49 * {@link #add(String) added} string in the original text (indexing starts
50 * with 0).
51 */
52 private int firstTokenInLastIndex;
53
54 /***
55 * Whether there is whitespace before/at the start of the string
56 * added this container by the last {@link #add(String)} operation.
57 */
58 private boolean whitespaceBeforeLast;
59
60 /***
61 * Whether there is whitespace after/at the end of the string
62 * added this container by the last {@link #add(String)} operation.
63 */
64 private boolean whitespaceAfterLast;
65
66 /***
67 * A multi-set of all the tokens added to this container.
68 */
69 private final Bag tokens = new HashBag();
70
71 /***
72 * A multi-set of the last batch of tokens added to this container.
73 */
74 private final Bag lastTokens = new HashBag();
75
76 /***
77 * The last string added to this container.
78 */
79 private String last = null;
80
81 /***
82 * The tokenizer used to split strings into tokens.
83 */
84 private final TextTokenizer tokenizer;
85
86 /***
87 * Creates a new instance.
88 *
89 * @param tFactory used to instantiate the employed tokenizer
90 */
91 public TokenContainer(final TokenizerFactory tFactory) {
92 super();
93 tokenizer = tFactory.createTokenizer("");
94 }
95
96 /***
97 * Adds text to this container. The specified string is split into a series
98 * of tokens, the token count is increased accordingly for all contained
99 * tokens.
100 *
101 * @param text the text to add
102 */
103 public void add(final String text) {
104
105 lastTokens.clear();
106
107
108 StringBuffer lastBuffer = new StringBuffer();
109 String token;
110 tokenizer.reset(text);
111 boolean isFirst = true;
112
113 while ((token = tokenizer.nextToken()) != null) {
114 if (isFirst) {
115
116
117 firstTokenInLastRep = tokens.getCount(token);
118 firstTokenInLastIndex = tokens.size();
119
120
121
122
123 whitespaceBeforeLast =
124 whitespaceAfterLast || tokenizer.hasPrecedingWhitespace();
125 isFirst = false;
126 } else {
127
128 if (tokenizer.hasPrecedingWhitespace()) {
129 lastBuffer.append(' ');
130 }
131 }
132
133
134 lastBuffer.append(token);
135 tokens.add(token);
136 lastTokens.add(token);
137 }
138
139
140 whitespaceAfterLast = tokenizer.hasPrecedingWhitespace();
141
142
143 last = lastBuffer.toString();
144 }
145
146 /***
147 * Returns the cardinality of the given token in this container.
148 *
149 * @param token the token to check
150 * @return the number of copies of the specified token in this container,
151 * <code>>= 0</code>
152 */
153 public int getCount(final String token) {
154 return tokens.getCount(token);
155 }
156
157 /***
158 * Returns the index of the first token of the {@link #getLast() last}
159 * {@link #add(String) added} string in the original text (indexing starts
160 * with 0).
161 *
162 * @return the value of the attribute
163 */
164 public int getFirstTokenInLastIndex() {
165 return firstTokenInLastIndex;
166 }
167
168 /***
169 * Returns the repetition of the first token of the {@link #getLast() last}
170 * {@link #add(String) added} string in the original text (counting starts
171 * with 0, as the first occurrence is the "0th repetition").
172 *
173 * @return the value of the attribute
174 */
175 public int getFirstTokenInLastRep() {
176 return firstTokenInLastRep;
177 }
178
179 /***
180 * Returns the cardinality of the given token in the text added by the last
181 * {@link #add(String)} operation.
182 *
183 * @param token the token to check
184 * @return the number of copies of the specified token in the text added
185 * last, <code>>= 0</code>
186 */
187 public int getLastCount(final String token) {
188 return lastTokens.getCount(token);
189 }
190
191 /***
192 * Returns a trimmed and whitespace-normalized representation of the string
193 * added this container by the last {@link #add(String)} operation.
194 * Starting and trailing whitespace is removed; each internal whitespace
195 * is converted into a single space charater.
196 *
197 * @return the normalized representation of the last string added to this
198 * container
199 */
200 public String getLast() {
201 return last;
202 }
203
204 /***
205 * Whether there is whitespace after the {@link #getLast() last} added
206 * string.
207 *
208 * @return <code>true</code> iff there is whitespace after/at the end of
209 * the string
210 */
211 public boolean isWhitespaceAfterLast() {
212 return whitespaceAfterLast;
213 }
214
215 /***
216 * Whether there is whitespace before the {@link #getLast() last} added
217 * string.
218 *
219 * @return <code>true</code> iff there is whitespace before/at the start of
220 * the string
221 */
222 public boolean isWhitespaceBeforeLast() {
223 return whitespaceBeforeLast;
224 }
225
226 /***
227 * Whether the text added by the last {@link #add(String)} operation
228 * contains the specified token.
229 *
230 * @param token the token to check
231 * @return <code>true</code> iff the specified argument is contained as a
232 * word or number token in the last added string.
233 */
234 public boolean lastContains(final String token) {
235 return lastTokens.contains(token);
236 }
237
238 /***
239 * Returns an iterator over the word and number tokens added by the last
240 * {@link #add(String)} operation. The iterator contains each token only
241 * once (no matter how often it occurred in the last string); the tokens
242 * are iterated in no particular order.
243 *
244 * @return an iterator over the last added tokens
245 */
246 public Iterator lastIterator() {
247 return lastTokens.uniqueSet().iterator();
248 }
249
250 /***
251 * Returns the token number of tokens counted by this instances (including
252 * duplicates).
253 *
254 * @return the number of tokens counted
255 */
256 public int size() {
257 return tokens.size();
258 }
259
260 /***
261 * Returns a string representation of this object.
262 *
263 * @return a textual representation
264 */
265 public String toString() {
266 return new ToStringBuilder(this)
267 .append("tokens", tokens)
268 .append("last", last)
269 .append("whitespace before last", whitespaceBeforeLast)
270 .append("whitespace after last", whitespaceAfterLast)
271 .append("repetition of first token in last", firstTokenInLastRep)
272 .append("index of first token in last", firstTokenInLastIndex)
273 .append("tokenizer", tokenizer)
274 .toString();
275 }
276
277 }