View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.classify.feature;
23  
24  import java.util.Collection;
25  import java.util.Iterator;
26  
27  import org.apache.commons.collections.Bag;
28  import org.apache.commons.collections.bag.HashBag;
29  
30  /***
31   * This feature vector implementation stores a multi-set of features.
32   * The order of features is not preserved. Internally it uses a
33   * {@link org.apache.commons.collections.bag.HashBag} as storage.
34   *
35   * <p>Instances of this class are not thread-safe and must be synchronized
36   * externally, if required.
37   *
38   * @author Christian Siefkes
39   * @version $Revision: 1.11 $, $Date: 2004/09/06 17:22:10 $, $Author: siefkes $
40   */
41  public class FeatureSet extends FeatureVector {
42  
43      /***
44       * Constants specifying that feature frequencies are not considered
45       * when calculating {@linkplain FeatureVector#strength(Feature) strength}
46       * values.
47       */
48      public static final String STRENGTH_CONSTANT = "constant";
49  
50      /***
51       * Constants specifying that the logarithm of feature frequencies is
52       * considered when calculating {@linkplain FeatureVector#strength(Feature)
53       * strength} values (1.0 is added to log(<em>f</em>) to avoid the result
54       * becoming 0).
55       */
56      public static final String STRENGTH_LOG = "log";
57  
58      /***
59       * Constants specifying that the square root of feature frequencies is
60       * considered when calculating {@linkplain FeatureVector#strength(Feature)
61       * strength} values.
62       */
63      public static final String STRENGTH_SQUARE_ROOT = "sqrt";
64  
65      /***
66       * Constants specifying that feature frequencies are considered linear
67       * (as is) when calculating {@linkplain FeatureVector#strength(Feature)
68       * strength} values (a feature occurring twice as frequently will be
69       * twice as strong).
70       */
71      public static final String STRENGTH_LINEAR = "linear";
72  
73  
74      /***
75       * Store features in an multi-set (bag).
76       */
77      private final Bag store = new HashBag();
78  
79      /***
80       * The type of method used to consider feature frequencies when determining
81       * {@linkplain #strength(Feature) strength} values.
82       */
83      private final String strengthType;
84  
85      /***
86       * Creates a new instance.
87       *
88       * @param strengthMethod The type of method used to consider feature
89       * frequencies when determining {@linkplain #strength(Feature) strength}
90       * values -- should be one of the <code>STRENGTH</code> constants
91       * defined in this class
92       */
93      public FeatureSet(final String strengthMethod) {
94          super();
95  
96          // convert to lower case + internalize to allow efficient comparison
97          strengthType = strengthMethod.toLowerCase().intern();
98      }
99  
100     /***
101      * {@inheritDoc}
102      */
103     public Iterator iterator() {
104         // iterate each feature only once
105         return store.uniqueSet().iterator();
106     }
107 
108     /***
109      * Returns the type of method used to consider feature frequencies when
110      * determining {@linkplain #strength(Feature) strength} values.
111      *
112      * @return the value of the attribute -- should be one of the
113      * <code>STRENGTH</code> constants defined in this class
114      */
115     public String getStrengthType() {
116         return strengthType;
117     }
118 
119     /***
120      * Implementation of the hook provided by the superclass to ensure that
121      * the {@linkplain FeatureVector#getSummedStrength() summed strength}
122      * is updated correctly.
123      *
124      * @param feature the feature to add
125      */
126     protected void preAddHook(final Feature feature) {
127         if (store.contains(feature)) {
128             // subtract old feature strength from sum -- the superclass will
129             // re-add the new (overall) strength after adding the feature
130             setSummedStrength(getSummedStrength() - strength(feature));
131         }
132     }
133 
134     /***
135      * {@inheritDoc}
136      */
137     public int size() {
138         // count each feature only once
139         return store.uniqueSet().size();
140     }
141 
142     /***
143      * {@inheritDoc}
144      */
145     protected Collection<Feature> store() {
146         return store;
147     }
148 
149     /***
150      * Returns a strength value for a feature contained in this vector.
151      * This implementation simply delegates to {@link Feature#getStrength()}.
152      *
153      * @param feature the feature to consider
154      * @return a strength value for the specified feature
155      */
156     public double strength(final Feature feature) {
157         final double baseStrength = super.strength(feature);
158         final int frequency = store.getCount(feature);
159         final double result;
160 
161         // strengthType is internalized so we can do identity comparisons
162         if (strengthType == STRENGTH_CONSTANT) {
163             result = baseStrength;
164         } else if (strengthType == STRENGTH_LOG) {
165             // adding 1 so the result will never be 0.0 (but 1.0 for singletons)
166             result = baseStrength * (Math.log(frequency) + 1.0);
167         } else if (strengthType == STRENGTH_SQUARE_ROOT) {
168             result = baseStrength * Math.sqrt(frequency);
169         } else if (strengthType == STRENGTH_LINEAR) {
170             result = baseStrength * frequency;
171         } else {
172             throw new IllegalArgumentException("Unsupported strength type: "
173                     + strengthType);
174         }
175 
176 /*        if (result != 1.0) {
177             Util.LOG.debug("Strength of feature " + feature + ": " + result
178                     + " (base strength: " + baseStrength + ", frequency: "
179                     + frequency + ", calculation method: " + strengthType);
180         } */
181 
182         return result;
183     }
184 
185 }