1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.text;
23
24 import org.apache.commons.collections.Bag;
25 import org.apache.commons.collections.bag.HashBag;
26 import org.apache.commons.lang.builder.ToStringBuilder;
27
28 /***
29 * A simple container that keeps track of the tokens in a document. This class
30 * differs from {@link de.fu_berlin.ties.text.TokenContainer} by not doing
31 * any tokenization itself, relying on external tokenization instead.
32 *
33 * <p>Instances of this class are not thread-safe; if you want to share a single
34 * instance between different thread, you have to ensure proper synchronization.
35 *
36 * @author Christian Siefkes
37 * @version $Revision: 1.5 $, $Date: 2004/11/09 16:29:34 $, $Author: siefkes $
38 */
39 public class TokenCounter {
40
41 /***
42 * Whether there is whitespace before the {@link #getLast() last} added
43 * token.
44 */
45 private boolean whitespaceBeforeLast;
46
47 /***
48 * Whether there is whitespace after the {@link #getLast() last} added
49 * token.
50 */
51 private boolean whitespaceAfterLast;
52
53 /***
54 * A multi-set of all the tokens added to this container.
55 */
56 private final Bag tokens = new HashBag();
57
58 /***
59 * The last token added to this container.
60 */
61 private String last = null;
62
63 /***
64 * Creates a new instance.
65 */
66 public TokenCounter() {
67 super();
68 }
69
70 /***
71 * Adds a token to this instance.
72 *
73 * @param whitespaceBefore whether there is whitespace before the token
74 * @param token the token to add
75 */
76 public void add(final boolean whitespaceBefore, final String token) {
77
78
79 whitespaceBeforeLast = whitespaceAfterLast || whitespaceBefore;
80
81
82 tokens.add(token);
83 last = token;
84 }
85
86 /***
87 * Adds whitespace to this instance, setting
88 * {@link #isWhitespaceAfterLast()} to <code>true</code>.
89 */
90 public void addWhitespace() {
91 whitespaceAfterLast = true;
92 }
93
94 /***
95 * Returns the cardinality of the given token in this container.
96 *
97 * @param token the token to check
98 * @return the number of copies of the specified token in this container,
99 * <code>>= 0</code>
100 */
101 public int getCount(final String token) {
102 return tokens.getCount(token);
103 }
104
105 /***
106 * Returns the repetition of the {@link #getLast() last} added token in the
107 * original text (counting starts with 0, as the first occurrence is the
108 * "0th repetition").
109 *
110 * @return the value of the attribute
111 */
112 public int getLastRep() {
113
114 return tokens.getCount(last) - 1;
115 }
116
117 /***
118 * Returns the last added token.
119 *
120 * @return the value of the attribute
121 */
122 public String getLast() {
123 return last;
124 }
125
126 /***
127 * Whether there is whitespace after the {@link #getLast() last} added
128 * token.
129 *
130 * @return <code>true</code> iff there is whitespace after the token
131 */
132 public boolean isWhitespaceAfterLast() {
133 return whitespaceAfterLast;
134 }
135
136 /***
137 * Whether there is whitespace before the {@link #getLast() last} added
138 * token.
139 *
140 * @return <code>true</code> iff there is whitespace before the token
141 */
142 public boolean isWhitespaceBeforeLast() {
143 return whitespaceBeforeLast;
144 }
145
146 /***
147 * Returns the token number of tokens counted by this instances (including
148 * duplicates).
149 *
150 * @return the number of tokens counted
151 */
152 public int size() {
153 return tokens.size();
154 }
155
156 /***
157 * Returns a string representation of this object.
158 *
159 * @return a textual representation
160 */
161 public String toString() {
162 return new ToStringBuilder(this)
163 .append("tokens", tokens)
164 .append("last", last)
165 .append("whitespace before last", whitespaceBeforeLast)
166 .append("whitespace after last", whitespaceAfterLast)
167 .toString();
168 }
169
170 }