View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.text;
23  
24  import org.apache.commons.collections.Bag;
25  import org.apache.commons.collections.bag.HashBag;
26  import org.apache.commons.lang.builder.ToStringBuilder;
27  
28  /***
29   * A simple container that keeps track of the tokens in a document. This class
30   * differs from {@link de.fu_berlin.ties.text.TokenContainer} by not doing
31   * any tokenization itself, relying on external tokenization instead.
32   *
33   * <p>Instances of this class are not thread-safe; if you want to share a single
34   * instance between different thread, you have to ensure proper synchronization.
35   *
36   * @author Christian Siefkes
37   * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
38   */
39  public class TokenCounter {
40  
41      /***
42       * Whether there is whitespace before the {@link #getLast() last} added
43       * token.
44       */
45      private boolean whitespaceBeforeLast;
46  
47      /***
48       * Whether there is whitespace after the {@link #getLast() last} added
49       * token.
50       */
51      private boolean whitespaceAfterLast;
52  
53      /***
54       * A multi-set of all the tokens added to this container.
55       */
56      private final Bag tokens = new HashBag();
57  
58      /***
59       * The last token added to this container.
60       */
61      private String last = null;
62  
63      /***
64       * Creates a new instance.
65       */
66      public TokenCounter() {
67          super();
68      }
69  
70      /***
71       * Clears all tokens stored in this container.
72       */
73      public void clear() {
74          tokens.clear();
75          last = null;
76          whitespaceBeforeLast = false;
77          whitespaceAfterLast = false;
78      }
79  
80      /***
81       * Adds a token to this instance.
82       *
83       * @param whitespaceBefore whether there is whitespace before the token
84       * @param token the token to add
85       */
86      public void add(final boolean whitespaceBefore, final String token) {
87          // whitespace before: true iff there was whitespace after the
88          // previous "last" or if there is whitespace before this one
89          whitespaceBeforeLast = whitespaceAfterLast || whitespaceBefore;
90  
91          // count + store token
92          tokens.add(token);
93          last = token;
94      }
95  
96      /***
97       * Adds whitespace to this instance, setting
98       * {@link #isWhitespaceAfterLast()} to <code>true</code>.
99       */
100     public void addWhitespace() {
101         whitespaceAfterLast = true;
102     }
103 
104     /***
105      * Returns the cardinality of the given token in this container.
106      *
107      * @param token the token to check
108      * @return the number of copies of the specified token in this container,
109      * <code>&gt;= 0</code>
110      */
111     public int getCount(final String token) {
112         return tokens.getCount(token);
113     }
114 
115     /***
116      * Returns the repetition of the {@link #getLast() last} added token in the
117      * original text (counting starts with 0, as the first occurrence is the
118      * "0th repetition").
119      *
120      * @return the value of the attribute
121      */
122     public int getLastRep() {
123         // subtract 1, since counting starts with 0
124         return tokens.getCount(last) - 1;
125     }
126 
127     /***
128      * Returns the last added token.
129      *
130      * @return the value of the attribute
131      */
132     public String getLast() {
133         return last;
134     }
135 
136     /***
137      * Whether there is whitespace after the {@link #getLast() last} added
138      * token.
139      *
140      * @return <code>true</code> iff there is whitespace after the token
141      */
142     public boolean isWhitespaceAfterLast() {
143         return whitespaceAfterLast;
144     }
145 
146     /***
147      * Whether there is whitespace before the {@link #getLast() last} added
148      * token.
149      *
150      * @return <code>true</code> iff there is whitespace before the token
151      */
152     public boolean isWhitespaceBeforeLast() {
153         return whitespaceBeforeLast;
154     }
155 
156     /***
157      * Returns the token number of tokens counted by this instances (including
158      * duplicates).
159      * 
160      * @return the number of tokens counted
161      */
162     public int size() {
163         return tokens.size();
164     }
165 
166     /***
167      * Returns a string representation of this object.
168      *
169      * @return a textual representation
170      */
171     public String toString() {
172         return new ToStringBuilder(this)
173             .append("tokens", tokens)
174             .append("last", last)
175             .append("whitespace before last", whitespaceBeforeLast)
176             .append("whitespace after last", whitespaceAfterLast)
177             .toString();
178     }
179 
180 }