View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.text;
23  
24  import org.apache.commons.collections.Bag;
25  import org.apache.commons.collections.bag.HashBag;
26  import org.apache.commons.lang.builder.ToStringBuilder;
27  
28  /***
29   * A simple container that keeps track of the tokens in a document. This class
30   * differs from {@link de.fu_berlin.ties.text.TokenContainer} by not doing
31   * any tokenization itself, relying on external tokenization instead.
32   *
33   * <p>Instances of this class are not thread-safe; if you want to share a single
34   * instance between different threads, you have to ensure proper synchronization.
35   *
36   * @author Christian Siefkes
37   * @version $Revision: 1.5 $, $Date: 2004/11/09 16:29:34 $, $Author: siefkes $
38   */
39  public class TokenCounter {
40  
41      /***
42       * Whether there is whitespace before the {@link #getLast() last} added
43       * token.
44       */
45      private boolean whitespaceBeforeLast;
46  
47      /***
48       * Whether there is whitespace after the {@link #getLast() last} added
49       * token.
50       */
51      private boolean whitespaceAfterLast;
52  
53      /***
54       * A multi-set of all the tokens added to this container.
55       */
56      private final Bag tokens = new HashBag();
57  
58      /***
59       * The last token added to this container.
60       */
61      private String last = null;
62  
63      /***
64       * Creates a new instance.
65       */
66      public TokenCounter() {
67          super();
68      }
69  
70      /***
71       * Adds a token to this instance.
72       *
73       * @param whitespaceBefore whether there is whitespace before the token
74       * @param token the token to add
75       */
76      public void add(final boolean whitespaceBefore, final String token) {
77          // whitespace before: true iff there was whitespace after the
78          // previous "last" or if there is whitespace before this one
79          whitespaceBeforeLast = whitespaceAfterLast || whitespaceBefore;
80  
81          // count + store token
82          tokens.add(token);
83          last = token;
84      }
85  
86      /***
87       * Adds whitespace to this instance, setting
88       * {@link #isWhitespaceAfterLast()} to <code>true</code>.
89       */
90      public void addWhitespace() {
91          whitespaceAfterLast = true;
92      }
93  
94      /***
95       * Returns the cardinality of the given token in this container.
96       *
97       * @param token the token to check
98       * @return the number of copies of the specified token in this container,
99       * <code>&gt;= 0</code>
100      */
101     public int getCount(final String token) {
102         return tokens.getCount(token);
103     }
104 
105     /***
106      * Returns the repetition of the {@link #getLast() last} added token in the
107      * original text (counting starts with 0, as the first occurrence is the
108      * "0th repetition").
109      *
110      * @return the value of the attribute
111      */
112     public int getLastRep() {
113         // subtract 1, since counting starts with 0
114         return tokens.getCount(last) - 1;
115     }
116 
117     /***
118      * Returns the last added token.
119      *
120      * @return the value of the attribute
121      */
122     public String getLast() {
123         return last;
124     }
125 
126     /***
127      * Whether there is whitespace after the {@link #getLast() last} added
128      * token.
129      *
130      * @return <code>true</code> iff there is whitespace after the token
131      */
132     public boolean isWhitespaceAfterLast() {
133         return whitespaceAfterLast;
134     }
135 
136     /***
137      * Whether there is whitespace before the {@link #getLast() last} added
138      * token.
139      *
140      * @return <code>true</code> iff there is whitespace before the token
141      */
142     public boolean isWhitespaceBeforeLast() {
143         return whitespaceBeforeLast;
144     }
145 
146     /***
147      * Returns the token number of tokens counted by this instances (including
148      * duplicates).
149      * 
150      * @return the number of tokens counted
151      */
152     public int size() {
153         return tokens.size();
154     }
155 
156     /***
157      * Returns a string representation of this object.
158      *
159      * @return a textual representation
160      */
161     public String toString() {
162         return new ToStringBuilder(this)
163             .append("tokens", tokens)
164             .append("last", last)
165             .append("whitespace before last", whitespaceBeforeLast)
166             .append("whitespace after last", whitespaceAfterLast)
167             .toString();
168     }
169 
170 }