View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.context;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Writer;
27  import java.util.List;
28  import java.util.Map;
29  
30  import org.apache.commons.lang.builder.ToStringBuilder;
31  import org.dom4j.Element;
32  
33  import de.fu_berlin.ties.classify.feature.FeatureVector;
34  import de.fu_berlin.ties.io.IOUtils;
35  import de.fu_berlin.ties.util.MutableInt;
36  import de.fu_berlin.ties.util.Util;
37  import de.fu_berlin.ties.xml.dom.DOMUtils;
38  
39  /***
40   * Provides basic functionality shared by different representations.
41   *
42   * @author Christian Siefkes
43   * @version $Revision: 1.2 $, $Date: 2004/09/06 17:22:41 $, $Author: siefkes $
44   */
45  public abstract class AbstractRepresentation extends Representation {
46  
47      /***
48       * Configuration key: The number of preceding recognitions to represent.
49       */
50      public static final String CONFIG_RECOGN_NUM = "representation.recogn.num";
51  
52      /***
53       * Configuration key: The maximum number of subsequences to keep when a
54       * feature value must be split.
55       */
56      public static final String CONFIG_SPLIT_MAXIMUM =
57          "representation.split.maximum";
58  
59      /***
60       * Configuration key: Each <em>n</em>-th context representation is stored
61       * for debugging and inspection purposes,if &gt; 0.
62       */
63      public static final String CONFIG_STORE_NTH = "representation.store.nth";
64  
65      /***
66       * The maximum number of subsequences to keep when a feature value must
67       * be split (at whitespace).
68       */
69      private final int splitMaximum;
70  
71      /***
72       * Counts how many representations were already generated (if some of them
73       * are {@linkplain #storeN stored}).
74       */
75      private long repCount = 0;
76  
77      /***
78       * Each <em>storeN</em>-th context representation is stored for debugging
79       * and inspection purposes (if &gt; 0, otherwise no representation is
80       * stored).
81       */
82      private final int storeN;
83  
84      /***
85       * Last integer suffix used to store a context representation, if enabled
86       * ({@link #storeN} &gt; 0). Also used to synchronize the {@link #repCount}.
87       */
88      private final MutableInt lastStoredSuffix = new MutableInt(0);
89  
90      /***
91       * The output character set to use (only used to
92       * {@linkplain #storeIfNth(FeatureVector, String) store} some configurations
93       * for inspection purposes).
94       */
95      private final String outputCharset;
96  
97      /***
98       * Creates a new instance.
99       *
100      * @param recogNum the number of preceding recognitions to represent
101      * @param splitMax the maximum number of subsequences to keep when
102      * a feature value must be split (at whitespace).
103      * @param n Each <em>n</em>-th context representation is stored if &gt; 0;
104      * otherwise no representation is stored
105      * @param outCharset the output character set to use (only used to
106      * store some configurations for inspection purposes, if <code>n</code>
107      * &gt; 0); if <code>null</code>, the default charset of the current
108      * platform is used
109      */
110     public AbstractRepresentation(final int recogNum, final int splitMax,
111             final int n, final String outCharset) {
112         super(recogNum);
113         splitMaximum = splitMax;
114         storeN = n;
115         outputCharset = outCharset;
116     }
117 
118     /***
119      * Builds the context representation of text in an element. Returns a
120      * feature vector of all context features considered relevant for
121      * representation.
122      *
123      * @param element the element whose context should be represented
124      * @param leftText textual content to the left of (preceding)
125      * <code>mainText</code>, might be empty
126      * @param mainText the main textual content to represent, might be empty
127      * @param rightText textual content to the right of (following)
128      * <code>mainText</code>, might be empty
129      * @param priorRecognitions a buffer of the last {@link Recognition}s from
130      * the document, created by calling {@link #createRecognitionBuffer()};
131      * might be <code>null</code>
132      * @param featureCache a cache of (local) feature, should be re-used
133      * between all calls for the nodes in a single document (but must not be
134      * re-used when building the context of nodes in different documents!)
135      * @param logPurpose the type of contexts of main interest to the caller
136      * (e.g. "Token" or "Sentence"), used for logging
137      * @return a vector of features considered relevant for representation
138      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
139      * contains objects that aren't {@link Recognition}s
140      * @throws IllegalArgumentException if the specified node is of an
141      * unsupported type
142      */
143     public final FeatureVector buildContext(final Element element,
144             final String leftText, final String mainText,
145             final String rightText, final PriorRecognitions priorRecognitions,
146             final Map<Element, List<LocalFeature>> featureCache,
147             final String logPurpose)
148     throws ClassCastException, IllegalArgumentException {
149         FeatureVector result = null;
150 
151         try {
152             result = doBuildContext(element, leftText, mainText, rightText,
153                     priorRecognitions, featureCache, logPurpose);
154 
155             // store each n-th representation, if configured
156             if (storeN > 0) {
157                 storeIfNth(result, logPurpose);
158             }
159 
160             return result;
161         } catch (Error e) {
162             // log details and rethrow
163             Util.LOG.error("Error while building " + logPurpose 
164                     + " context representation for " 
165                     + DOMUtils.showToken(element, mainText)
166                     + "(current size of rep.: " + result.size()
167                     + ", size of feature cache: " + featureCache.size() + ")",
168                     e);
169             Util.LOG.debug("Snapshot of context rep.: " + result);
170             //Util.LOG.debug("Snapshot of feature cache: " + featureCache);
171             throw e;
172         }
173     }
174 
175     /***
176      * Builds the context representation of text in an element. Returns a
177      * feature vector of all context features considered relevant for
178      * representation.
179      *
180      * @param element the element whose context should be represented
181      * @param leftText textual content to the left of (preceding)
182      * <code>mainText</code>, might be empty
183      * @param mainText the main textual content to represent, might be empty
184      * @param rightText textual content to the right of (following)
185      * <code>mainText</code>, might be empty
186      * @param priorRecognitions a buffer of the last {@link Recognition}s from
187      * the document, created by calling {@link #createRecognitionBuffer()};
188      * might be <code>null</code>
189      * @param featureCache a cache of (local) feature, should be re-used
190      * between all calls for the nodes in a single document (but must not be
191      * re-used when building the context of nodes in different documents!)
192      * @param logPurpose the type of contexts of main interest to the caller
193      * (e.g. "Token" or "Sentence"), used for logging
194      * @return a vector of features considered relevant for representation
195      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
196      * contains objects that aren't {@link Recognition}s
197      * @throws IllegalArgumentException if the specified node is of an
198      * unsupported type
199      */
200     protected abstract FeatureVector doBuildContext(final Element element,
201             final String leftText, final String mainText,
202             final String rightText, final PriorRecognitions priorRecognitions,
203             final Map<Element, List<LocalFeature>> featureCache,
204             final String logPurpose)
205     throws ClassCastException, IllegalArgumentException;
206 
207     /***
208      * Returns the maximum number of subsequences to keep when a feature
209      * value must be split (at whitespace).
210      *
211      * @return the maximum number
212      */
213     public int getSplitMaximum() {
214         return splitMaximum;
215     }
216 
217     /***
218      * Each <em>storeN</em>-th context representation is stored for debugging
219      * and inspection purposes (if &gt; 0, otherwise no representation is
220      * stored).
221      *
222      * @return the value of the attribute
223      */
224     public int getStoreN() {
225         return storeN;
226     }
227 
228     /***
229      * Checks whether the given representation should be stored for inspection
230      * purposes and stores it, if required. This method should be only called
231      * if {@link #storeN} &gt; 0.
232      *
233      * @param representation the representation to store, if required
234      * @param type the type of contexts of main interest to the caller
235      * (e.g. "Token" or "Sentence"), used when storing the representation
236      */
237     private void storeIfNth(final FeatureVector representation,
238             final String type) {
239         File storeFile = null;
240         // locally store count because the global value might be unstable
241         long localCount = -1;
242 
243         // must synchronize while checking count and determining file name
244         synchronized (lastStoredSuffix) {
245             ++repCount;
246             localCount = repCount;
247             if (repCount % storeN == 0) {
248                 // we should store this one -- determine next available
249                 // file name in default directory
250                 storeFile = IOUtils.createOutFile(IOUtils.getDefaultDirectory(),
251                     type + "Context", "rep", lastStoredSuffix);
252             }
253         }
254 
255         // no need to synchronize actual storing process (if required)
256         if (storeFile != null) {
257             Writer writer = null;
258             try {
259                 writer = IOUtils.openWriter(storeFile, outputCharset);
260                 final CharSequence flatRep =
261                     representation.flatten(true);
262                 writer.write(flatRep.toString());
263                 writer.flush();
264                 Util.LOG.info("Stored " + type + " representation no. "
265                         + localCount + " in file " + storeFile
266                         + " for inspection purposes");
267             } catch (IOException ie) {
268                 Util.LOG.error("Error while storing " + type
269                         + " representation no. " + localCount + " in file "
270                         + storeFile + " for inspection purposes", ie);
271             } finally {
272                 IOUtils.tryToClose(writer);
273             }
274         }
275     }
276 
277     /***
278      * Returns a string representation of this object.
279      *
280      * @return a textual representation
281      */
282     public String toString() {
283         return new ToStringBuilder(this)
284             .appendSuper(super.toString())
285             .append("split maximum", splitMaximum)
286             .append("store n-th", storeN)
287             .toString();
288     }
289 
290 }