1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract.reestimate;
23
24 import org.apache.commons.collections.Bag;
25 import org.apache.commons.collections.bag.HashBag;
26
27 import de.fu_berlin.ties.TiesConfiguration;
28 import de.fu_berlin.ties.classify.Probability;
29 import de.fu_berlin.ties.extract.Extraction;
30 import de.fu_berlin.ties.util.Util;
31
32 /***
33 * Reestimates the probability of extractions based on the length (number of
34 * tokens).
35 *
36 * @author Christian Siefkes
37 * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:17 $, $Author: siefkes $
38 */
39 public class LengthEstimator extends Reestimator {
40
41 /***
42 * A bag counting the number of occurrences of extractions for each type.
43 * Keys are type Strings.
44 */
45 private final Bag all = new HashBag();
46
47 /***
48 * A bag counting the number of occurrences of extractions of a specific
49 * length for each type. Keys are combinations of type and token count
50 * created via the {@link #joinKey(String, int)} method.
51 */
52 private final Bag ofLength = new HashBag();
53
54 /***
55 * Creates a new instance.
56 *
57 * @param precReestimator the preceding re-estimator to use if this
58 * re-estimator is part of a <em>chain</em>; <code>null</code> otherwise
59 * @param config the configuration to use
60 */
61 public LengthEstimator(final Reestimator precReestimator,
62 final TiesConfiguration config) {
63 super(precReestimator, config);
64 }
65
66 /***
67 * Helper method that creates a key combining type and token count of an
68 * extraction.
69 *
70 * @param type the type of the extraction
71 * @param length the number of tokens contained in the extraction
72 * @return a String combining type and token count
73 */
74 private String joinKey(final String type, final int length) {
75
76 return type + ' ' + length;
77 }
78
79 /***
80 * {@inheritDoc}
81 */
82 protected Extraction doReestimate(final Extraction extraction) {
83
84 final double reestimationProb =
85 doReestimate(extraction.getType(), extraction.tokenCount());
86 extraction.modifyProbability(new Probability(reestimationProb));
87 return extraction;
88 }
89
90 /***
91 * Re-estimates the probability of an extraction, based on on an type and
92 * length.
93 *
94 * @param type the type of the extraction
95 * @param length the number of tokens contained in the extraction
96 * @return the re-estimation probability
97 */
98 protected double doReestimate(final String type, final int length) {
99 final int allOfThisType = all.getCount(type);
100 final int ofTypeAndLength = ofLength.getCount(joinKey(type, length));
101
102 final double reestimationProb =
103 ((double) ofTypeAndLength) / allOfThisType;
104 Util.LOG.debug("Length-based probability re-estimation: "
105 + reestimationProb + " (" + allOfThisType
106 + " extractions of type "+ type + ", " + ofTypeAndLength
107 + " of which have length " + length);
108 return reestimationProb;
109 }
110
111 /***
112 * {@inheritDoc}
113 */
114 protected void doTrain(final Extraction extraction) {
115 doTrain(extraction.getType(), extraction.tokenCount());
116 }
117
118 /***
119 * Trains this re-estimator of an extraction, based on on an type and
120 * length.
121 *
122 * @param type the type of the extraction
123 * @param length the number of tokens contained in the extraction
124 */
125 protected void doTrain(final String type, final int length) {
126
127 all.add(type);
128 ofLength.add(joinKey(type, length));
129 Util.LOG.debug("Length-based re-estimator: trained " + type
130 + " extraction of length " +length);
131 }
132
133 }