1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.text;
23
24 import org.apache.commons.collections.Bag;
25 import org.apache.commons.collections.bag.HashBag;
26 import org.apache.commons.lang.builder.ToStringBuilder;
27
28 /***
29 * A simple container that keeps track of the tokens in a document. This class
30 * differs from {@link de.fu_berlin.ties.text.TokenContainer} by not doing
31 * any tokenization itself, relying on external tokenization instead.
32 *
33 * <p>Instances of this class are not thread-safe; if you want to share a single
34 * instance between different thread, you have to ensure proper synchronization.
35 *
36 * @author Christian Siefkes
37 * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
38 */
39 public class TokenCounter {
40
41 /***
42 * Whether there is whitespace before the {@link #getLast() last} added
43 * token.
44 */
45 private boolean whitespaceBeforeLast;
46
47 /***
48 * Whether there is whitespace after the {@link #getLast() last} added
49 * token.
50 */
51 private boolean whitespaceAfterLast;
52
53 /***
54 * A multi-set of all the tokens added to this container.
55 */
56 private final Bag tokens = new HashBag();
57
58 /***
59 * The last token added to this container.
60 */
61 private String last = null;
62
63 /***
64 * Creates a new instance.
65 */
66 public TokenCounter() {
67 super();
68 }
69
70 /***
71 * Clears all tokens stored in this container.
72 */
73 public void clear() {
74 tokens.clear();
75 last = null;
76 whitespaceBeforeLast = false;
77 whitespaceAfterLast = false;
78 }
79
80 /***
81 * Adds a token to this instance.
82 *
83 * @param whitespaceBefore whether there is whitespace before the token
84 * @param token the token to add
85 */
86 public void add(final boolean whitespaceBefore, final String token) {
87
88
89 whitespaceBeforeLast = whitespaceAfterLast || whitespaceBefore;
90
91
92 tokens.add(token);
93 last = token;
94 }
95
96 /***
97 * Adds whitespace to this instance, setting
98 * {@link #isWhitespaceAfterLast()} to <code>true</code>.
99 */
100 public void addWhitespace() {
101 whitespaceAfterLast = true;
102 }
103
104 /***
105 * Returns the cardinality of the given token in this container.
106 *
107 * @param token the token to check
108 * @return the number of copies of the specified token in this container,
109 * <code>>= 0</code>
110 */
111 public int getCount(final String token) {
112 return tokens.getCount(token);
113 }
114
115 /***
116 * Returns the repetition of the {@link #getLast() last} added token in the
117 * original text (counting starts with 0, as the first occurrence is the
118 * "0th repetition").
119 *
120 * @return the value of the attribute
121 */
122 public int getLastRep() {
123
124 return tokens.getCount(last) - 1;
125 }
126
127 /***
128 * Returns the last added token.
129 *
130 * @return the value of the attribute
131 */
132 public String getLast() {
133 return last;
134 }
135
136 /***
137 * Whether there is whitespace after the {@link #getLast() last} added
138 * token.
139 *
140 * @return <code>true</code> iff there is whitespace after the token
141 */
142 public boolean isWhitespaceAfterLast() {
143 return whitespaceAfterLast;
144 }
145
146 /***
147 * Whether there is whitespace before the {@link #getLast() last} added
148 * token.
149 *
150 * @return <code>true</code> iff there is whitespace before the token
151 */
152 public boolean isWhitespaceBeforeLast() {
153 return whitespaceBeforeLast;
154 }
155
156 /***
157 * Returns the token number of tokens counted by this instances (including
158 * duplicates).
159 *
160 * @return the number of tokens counted
161 */
162 public int size() {
163 return tokens.size();
164 }
165
166 /***
167 * Returns a string representation of this object.
168 *
169 * @return a textual representation
170 */
171 public String toString() {
172 return new ToStringBuilder(this)
173 .append("tokens", tokens)
174 .append("last", last)
175 .append("whitespace before last", whitespaceBeforeLast)
176 .append("whitespace after last", whitespaceAfterLast)
177 .toString();
178 }
179
180 }