View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.text;
23  
24  import java.util.Iterator;
25  
26  import org.apache.commons.collections.Bag;
27  import org.apache.commons.collections.bag.HashBag;
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  
30  /***
31   * A container that keeps track of the tokens in a document. Instances of this
32   * class are not thread-safe; if you want to share a single instance between
33   * different thread, you have to ensure proper synchronization.
34   *
35   * @author Christian Siefkes
36   * @version $Revision: 1.9 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
37   */
38  public class TokenContainer {
39  
40      /***
41       * Returns the repetition of the first token of the {@link #getLast() last}
42       * {@link #add(String) added} string in the original text (counting starts
43       * with 0, as the first occurrence is the "0th repetition").
44       */
45      private int firstTokenInLastRep;
46  
47      /***
48       * Returns the index of the first token of the {@link #getLast() last}
49       * {@link #add(String) added} string in the original text (indexing starts
50       * with 0).
51       */
52      private int firstTokenInLastIndex;
53  
54      /***
55       * Whether there is whitespace before/at the start of the string
56       * added this container by the last {@link #add(String)} operation.
57       */
58      private boolean whitespaceBeforeLast;
59  
60      /***
61       * Whether there is whitespace after/at the end of the string
62       * added this container by the last {@link #add(String)} operation.
63       */
64      private boolean whitespaceAfterLast;
65  
66      /***
67       * A multi-set of all the tokens added to this container.
68       */
69      private final Bag tokens = new HashBag();
70  
71      /***
72       * A multi-set of the last batch of tokens added to this container.
73       */
74      private final Bag lastTokens = new HashBag();
75  
76      /***
77       * The last string added to this container.
78       */
79      private String last = null;
80  
81      /***
82       * The tokenizer used to split strings into tokens.
83       */
84      private final TextTokenizer tokenizer;
85  
86      /***
87       * Creates a new instance.
88       *
89       * @param tFactory used to instantiate the employed tokenizer
90       */
91      public TokenContainer(final TokenizerFactory tFactory) {
92          super();
93          tokenizer = tFactory.createTokenizer("");
94      }
95  
96      /***
97       * Adds text to this container. The specified string is split into a series
98       * of tokens, the token count is increased accordingly for all contained
99       * tokens.
100      *
101      * @param text the text to add
102      */
103     public void add(final String text) {
104         // reset lastTokens
105         lastTokens.clear();
106 
107         // tokenize text and update token count (in both multi-sets)
108         final StringBuilder lastBuffer = new StringBuilder();
109         String token;
110         tokenizer.reset(text);
111         boolean isFirst = true;
112 
113         while ((token = tokenizer.nextToken()) != null) {
114             if (isFirst) {
115                 // very first token. Determine repetition+index: count prior to
116                 // adding the token, as counting starts with 0
117                 firstTokenInLastRep = tokens.getCount(token);
118                 firstTokenInLastIndex = tokens.size();
119 
120                 // whitespace before: true iff there was whitespace after the
121                 // previous "last" or if there is whitespace at the begin of
122                 // this one
123                 whitespaceBeforeLast =
124                     whitespaceAfterLast || tokenizer.hasPrecedingWhitespace();
125                 isFirst = false;
126             } else {
127                 // already got token(s)
128                 if (tokenizer.hasPrecedingWhitespace()) {
129                     lastBuffer.append(' ');
130                 }
131             }
132 
133             // add token to lastBuffer; count it in boths sets
134             lastBuffer.append(token);
135             tokens.add(token);
136             lastTokens.add(token);
137         }
138 
139         // remember if there is whitespace after the last token
140         whitespaceAfterLast = tokenizer.hasPrecedingWhitespace();
141 
142         // store normalized string as "last"
143         last = lastBuffer.toString();
144     }
145 
146     /***
147      * Returns the cardinality of the given token in this container.
148      *
149      * @param token the token to check
150      * @return the number of copies of the specified token in this container,
151      * <code>&gt;= 0</code>
152      */
153     public int getCount(final String token) {
154         return tokens.getCount(token);
155     }
156 
157     /***
158      * Returns the index of the first token of the {@link #getLast() last}
159      * {@link #add(String) added} string in the original text (indexing starts
160      * with 0).
161      *
162      * @return the value of the attribute
163      */
164     public int getFirstTokenInLastIndex() {
165         return firstTokenInLastIndex;
166     }
167 
168     /***
169      * Returns the repetition of the first token of the {@link #getLast() last}
170      * {@link #add(String) added} string in the original text (counting starts
171      * with 0, as the first occurrence is the "0th repetition").
172      *
173      * @return the value of the attribute
174      */
175     public int getFirstTokenInLastRep() {
176         return firstTokenInLastRep;
177     }
178 
179     /***
180      * Returns the cardinality of the given token in the text added by the last
181      * {@link #add(String)} operation.
182      *
183      * @param token the token to check
184      * @return the number of copies of the specified token in the text added
185      * last, <code>&gt;= 0</code>
186      */
187     public int getLastCount(final String token) {
188         return lastTokens.getCount(token);
189     }
190 
191     /***
192      * Returns a trimmed and whitespace-normalized representation of the string
193      * added this container by the last {@link #add(String)} operation.
194      * Starting and trailing whitespace is removed; each internal whitespace
195      * is converted into a single space charater.
196      *
197      * @return the normalized representation of the last string added to this
198      * container
199      */
200     public String getLast() {
201         return last;
202     }
203 
204     /***
205      * Whether there is whitespace after the {@link #getLast() last} added
206      * string.
207      *
208      * @return <code>true</code> iff there is whitespace after/at the end of
209      * the string
210      */
211     public boolean isWhitespaceAfterLast() {
212         return whitespaceAfterLast;
213     }
214 
215     /***
216      * Whether there is whitespace before the {@link #getLast() last} added
217      * string.
218      *
219      * @return <code>true</code> iff there is whitespace before/at the start of
220      * the string
221      */
222     public boolean isWhitespaceBeforeLast() {
223         return whitespaceBeforeLast;
224     }
225 
226     /***
227      * Whether the text added by the last {@link #add(String)} operation
228      * contains the specified token.
229      *
230      * @param token the token to check
231      * @return <code>true</code> iff the specified argument is contained as a
232      * word or number token in the last added string.
233      */
234     public boolean lastContains(final String token) {
235         return lastTokens.contains(token);
236     }
237 
238     /***
239      * Returns an iterator over the word and number tokens added by the last
240      * {@link #add(String)} operation. The iterator contains each token only
241      * once (no matter how often it occurred in the last string); the tokens
242      * are iterated in no particular order.
243      *
244      * @return an iterator over the last added tokens
245      */
246     public Iterator lastIterator() {
247         return lastTokens.uniqueSet().iterator();
248     }
249 
250     /***
251      * Returns the token number of tokens counted by this instances (including
252      * duplicates).
253      * 
254      * @return the number of tokens counted
255      */
256     public int size() {
257         return tokens.size();
258     }
259 
260     /***
261      * Returns a string representation of this object.
262      *
263      * @return a textual representation
264      */
265     public String toString() {
266         return new ToStringBuilder(this)
267             .append("tokens", tokens)
268             .append("last", last)
269             .append("whitespace before last", whitespaceBeforeLast)
270             .append("whitespace after last", whitespaceAfterLast)
271             .append("repetition of first token in last", firstTokenInLastRep)
272             .append("index of first token in last", firstTokenInLastIndex)
273             .append("tokenizer", tokenizer)
274             .toString();
275     }
276 
277 }