View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.extract.reestimate;
23  
24  import java.util.HashMap;
25  import java.util.Map;
26  
27  import org.apache.commons.lang.builder.ToStringBuilder;
28  
29  import de.fu_berlin.ties.TiesConfiguration;
30  import de.fu_berlin.ties.extract.Extraction;
31  import de.fu_berlin.ties.util.Util;
32  
33  /***
34   * A very simple re-estimator that discards any extractions that are longer
35   * than the longest extraction of the same type seen in the training corpus,
36   * multipied with a tolerance factor.
37   *
38   * @author Christian Siefkes
39   * @version $Revision: 1.9 $, $Date: 2006/10/21 16:04:17 $, $Author: siefkes $
40   */
41  public class LengthFilter extends Reestimator {
42  
43      /***
44       * The default length used for unknown types: {@value}.
45       */
46      public static final int DEFAULT_LENGTH = 3;
47  
48  
49      /***
50       * Stores the maximum length of each extraction type.
51       */
52      private final Map<String, Integer> maxLengths =
53          new HashMap<String, Integer>();
54  
55      /***
56       * The filter discards extractions longer than the longest trained
57       * extraction multiplied with this tolerance factor.
58       */
59      private final double tolerance;
60  
61  
62      /***
63       * Creates a new instance.
64       *
65       * @param precReestimator the preceding re-estimator to use if this
66       * re-estimator is part of a <em>chain</em>; <code>null</code> otherwise
67       * @param config the configuration to use
68       */
69      public LengthFilter(final Reestimator precReestimator,
70              final TiesConfiguration config) {
71          super(precReestimator, config);
72          tolerance = config.getDouble("lengthfilter.tolerance");
73      }
74  
75      /***
76       * Returns the maximum length tolerated for extractions of a given type.
77       * The returned value is calculated by multiplying the maximum extraction
78       * length seen during training (so far) with the tolerance factor and
79       * rounding the result down to the nearest integer.
80       *
81       * @param type the extraction type
82       * @return the maximum tolerated length for extractions of this type
83       */
84      public int toleratedLength(final String type) {
85          final Integer maxLength = maxLengths.get(type);
86  
87          if (maxLength != null) {
88              return (int) Math.floor(tolerance * maxLength.intValue());
89          } else {
90              Util.LOG.debug("LengthFilter: returning default length "
91                      + DEFAULT_LENGTH + " since type " + type + " is unknown");
92              return DEFAULT_LENGTH;
93          }
94      }
95  
96      /***
97       * {@inheritDoc}
98       */
99      protected Extraction doReestimate(final Extraction extraction) {
100         final Integer maxLength = maxLengths.get(extraction.getType());
101 
102         if (maxLength != null) {
103             if (extraction.tokenCount() > tolerance * maxLength.intValue()) {
104                 Util.LOG.debug("LengthFilter: discarding extraction of length "
105                         + extraction.tokenCount()
106                         + " since it is longer than longest trained "
107                         + extraction.getType() + " extraction: "
108                         + maxLength.intValue() + " * " + tolerance);
109                 return null;
110             } else {
111                 return extraction;
112             }
113         } else {
114             Util.LOG.warn("LengthFilter: never saw an extraction of type "
115                     + extraction.getType() + " during training");
116             return extraction;
117         }
118     }
119 
120     /***
121      * {@inheritDoc}
122      */
123     protected void doTrain(final Extraction extraction) {
124         final Integer maxLength = maxLengths.get(extraction.getType());
125         final int newLength = extraction.tokenCount();
126 
127         if (maxLength == null || maxLength.intValue() < newLength) {
128             // update max. length
129             maxLengths.put(extraction.getType(), newLength);
130         }
131     }
132 
133     /***
134      * Returns a string representation of this object.
135      * @return a textual representation
136      */
137     public String toString() {
138         return new ToStringBuilder(this)
139             .appendSuper(super.toString())
140             .append("tolerance", tolerance)
141             .toString();
142     }
143 
144 }