1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract.reestimate;
23
24 import java.util.HashMap;
25 import java.util.Map;
26
27 import org.apache.commons.lang.builder.ToStringBuilder;
28
29 import de.fu_berlin.ties.TiesConfiguration;
30 import de.fu_berlin.ties.extract.Extraction;
31 import de.fu_berlin.ties.util.Util;
32
33 /***
34 * A very simple re-estimator that discards any extractions that are longer
35 * than the longest extraction of the same type seen in the training corpus,
36 * multipied with a tolerance factor.
37 *
38 * @author Christian Siefkes
39 * @version $Revision: 1.9 $, $Date: 2006/10/21 16:04:17 $, $Author: siefkes $
40 */
41 public class LengthFilter extends Reestimator {
42
43 /***
44 * The default length used for unknown types: {@value}.
45 */
46 public static final int DEFAULT_LENGTH = 3;
47
48
49 /***
50 * Stores the maximum length of each extraction type.
51 */
52 private final Map<String, Integer> maxLengths =
53 new HashMap<String, Integer>();
54
55 /***
56 * The filter discards extractions longer than the longest trained
57 * extraction multiplied with this tolerance factor.
58 */
59 private final double tolerance;
60
61
62 /***
63 * Creates a new instance.
64 *
65 * @param precReestimator the preceding re-estimator to use if this
66 * re-estimator is part of a <em>chain</em>; <code>null</code> otherwise
67 * @param config the configuration to use
68 */
69 public LengthFilter(final Reestimator precReestimator,
70 final TiesConfiguration config) {
71 super(precReestimator, config);
72 tolerance = config.getDouble("lengthfilter.tolerance");
73 }
74
75 /***
76 * Returns the maximum length tolerated for extractions of a given type.
77 * The returned value is calculated by multiplying the maximum extraction
78 * length seen during training (so far) with the tolerance factor and
79 * rounding the result down to the nearest integer.
80 *
81 * @param type the extraction type
82 * @return the maximum tolerated length for extractions of this type
83 */
84 public int toleratedLength(final String type) {
85 final Integer maxLength = maxLengths.get(type);
86
87 if (maxLength != null) {
88 return (int) Math.floor(tolerance * maxLength.intValue());
89 } else {
90 Util.LOG.debug("LengthFilter: returning default length "
91 + DEFAULT_LENGTH + " since type " + type + " is unknown");
92 return DEFAULT_LENGTH;
93 }
94 }
95
96 /***
97 * {@inheritDoc}
98 */
99 protected Extraction doReestimate(final Extraction extraction) {
100 final Integer maxLength = maxLengths.get(extraction.getType());
101
102 if (maxLength != null) {
103 if (extraction.tokenCount() > tolerance * maxLength.intValue()) {
104 Util.LOG.debug("LengthFilter: discarding extraction of length "
105 + extraction.tokenCount()
106 + " since it is longer than longest trained "
107 + extraction.getType() + " extraction: "
108 + maxLength.intValue() + " * " + tolerance);
109 return null;
110 } else {
111 return extraction;
112 }
113 } else {
114 Util.LOG.warn("LengthFilter: never saw an extraction of type "
115 + extraction.getType() + " during training");
116 return extraction;
117 }
118 }
119
120 /***
121 * {@inheritDoc}
122 */
123 protected void doTrain(final Extraction extraction) {
124 final Integer maxLength = maxLengths.get(extraction.getType());
125 final int newLength = extraction.tokenCount();
126
127 if (maxLength == null || maxLength.intValue() < newLength) {
128
129 maxLengths.put(extraction.getType(), newLength);
130 }
131 }
132
133 /***
134 * Returns a string representation of this object.
135 * @return a textual representation
136 */
137 public String toString() {
138 return new ToStringBuilder(this)
139 .appendSuper(super.toString())
140 .append("tolerance", tolerance)
141 .toString();
142 }
143
144 }