View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.extract.reestimate;
23  
24  import org.apache.commons.collections.Bag;
25  import org.apache.commons.collections.bag.HashBag;
26  
27  import de.fu_berlin.ties.TiesConfiguration;
28  import de.fu_berlin.ties.classify.Probability;
29  import de.fu_berlin.ties.extract.Extraction;
30  import de.fu_berlin.ties.util.Util;
31  
32  /***
33   * Reestimates the probability of extractions based on the length (number of
34   * tokens).
35   *
36   * @author Christian Siefkes
37   * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:17 $, $Author: siefkes $
38   */
39  public class LengthEstimator extends Reestimator {
40  
41      /***
42       * A bag counting the number of occurrences of extractions for each type.
43       * Keys are type Strings.
44       */
45      private final Bag all = new HashBag();
46  
47      /***
48       * A bag counting the number of occurrences of extractions of a specific
49       * length for each type. Keys are combinations of type and token count
50       * created via the {@link #joinKey(String, int)} method.
51       */
52      private final Bag ofLength = new HashBag();
53  
54      /***
55       * Creates a new instance.
56       *
57       * @param precReestimator the preceding re-estimator to use if this
58       * re-estimator is part of a <em>chain</em>; <code>null</code> otherwise
59       * @param config the configuration to use
60       */
61      public LengthEstimator(final Reestimator precReestimator,
62              final TiesConfiguration config) {
63          super(precReestimator, config);
64      }
65  
66      /***
67       * Helper method that creates a key combining type and token count of an
68       * extraction.
69       *
70       * @param type the type of the extraction
71       * @param length the number of tokens contained in the extraction
72       * @return a String combining  type and token count
73       */
74      private String joinKey(final String type, final int length) {
75          // use a space character as separator
76          return type + ' ' + length;
77      }
78  
79      /***
80       * {@inheritDoc}
81       */
82      protected Extraction doReestimate(final Extraction extraction) {
83          // estimate length prob. and modify extraction probability accordingly
84          final double reestimationProb =
85              doReestimate(extraction.getType(), extraction.tokenCount());
86          extraction.modifyProbability(new Probability(reestimationProb));
87          return extraction;
88      }
89  
90      /***
91       * Re-estimates the probability of an extraction, based on on an type and
92       * length.
93       *
94       * @param type the type of the extraction
95       * @param length the number of tokens contained in the extraction
96       * @return the re-estimation probability
97       */
98      protected double doReestimate(final String type, final int length) {
99          final int allOfThisType = all.getCount(type);
100         final int ofTypeAndLength = ofLength.getCount(joinKey(type, length));
101 
102         final double reestimationProb =
103             ((double) ofTypeAndLength) / allOfThisType;
104         Util.LOG.debug("Length-based probability re-estimation: "
105                 + reestimationProb + " (" + allOfThisType
106                 + " extractions of type "+ type + ", " + ofTypeAndLength
107                 + " of which have length " + length);
108         return reestimationProb;
109     }
110 
111     /***
112      * {@inheritDoc}
113      */
114     protected void doTrain(final Extraction extraction) {
115         doTrain(extraction.getType(), extraction.tokenCount());
116     }
117 
118     /***
119      * Trains this re-estimator of an extraction, based on on an type and
120      * length.
121      *
122      * @param type the type of the extraction
123      * @param length the number of tokens contained in the extraction
124      */
125     protected void doTrain(final String type, final int length) {
126         // update statistics
127         all.add(type);
128         ofLength.add(joinKey(type, length));
129         Util.LOG.debug("Length-based re-estimator: trained " + type
130                 + " extraction of length " +length);
131     }
132 
133 }