View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.context;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Writer;
27  import java.util.List;
28  import java.util.Map;
29  
30  import org.apache.commons.lang.builder.ToStringBuilder;
31  import org.dom4j.Element;
32  
33  import de.fu_berlin.ties.classify.feature.FeatureVector;
34  import de.fu_berlin.ties.io.IOUtils;
35  import de.fu_berlin.ties.util.MutableInt;
36  import de.fu_berlin.ties.util.Util;
37  import de.fu_berlin.ties.xml.dom.DOMUtils;
38  
39  /***
40   * Provides basic functionality shared by different representations.
41   *
42   * @author Christian Siefkes
43   * @version $Revision: 1.8 $, $Date: 2006/10/21 16:04:03 $, $Author: siefkes $
44   */
45  public abstract class AbstractRepresentation extends Representation {
46  
47      /***
48       * Configuration key: The number of preceding recognitions to represent.
49       */
50      public static final String CONFIG_RECOGN_NUM = "representation.recogn.num";
51  
52      /***
53       * Configuration key: The maximum number of subsequences to keep when a
54       * feature value must be split.
55       */
56      public static final String CONFIG_SPLIT_MAXIMUM =
57          "representation.split.maximum";
58  
59      /***
60       * Configuration key: Each <em>n</em>-th context representation is stored
61       * for debugging and inspection purposes,if &gt; 0.
62       */
63      public static final String CONFIG_STORE_NTH = "representation.store.nth";
64  
65      /***
66       * The maximum number of subsequences to keep when a feature value must
67       * be split (at whitespace).
68       */
69      private final int splitMaximum;
70  
71      /***
72       * Counts how many representations were already generated (if some of them
73       * are {@linkplain #storeN stored}).
74       */
75      private long repCount = 0;
76  
77      /***
78       * Each <em>storeN</em>-th context representation is stored for debugging
79       * and inspection purposes (if &gt; 0, otherwise no representation is
80       * stored).
81       */
82      private final int storeN;
83  
84      /***
85       * Last integer suffix used to store a context representation, if enabled
86       * ({@link #storeN} &gt; 0). Also used to synchronize the {@link #repCount}.
87       */
88      private final MutableInt lastStoredSuffix = new MutableInt(0);
89  
90      /***
91       * The output character set to use (only used to
92       * {@linkplain #storeIfNth(FeatureVector, String) store} some configurations
93       * for inspection purposes).
94       */
95      private final String outputCharset;
96  
97  
98      /***
99       * Creates a new instance.
100      *
101      * @param recogNum the number of preceding recognitions to represent
102      * @param splitMax the maximum number of subsequences to keep when
103      * a feature value must be split (at whitespace).
104      * @param n Each <em>n</em>-th context representation is stored if &gt; 0;
105      * otherwise no representation is stored
106      * @param outCharset the output character set to use (only used to
107      * store some configurations for inspection purposes, if <code>n</code>
108      * &gt; 0); if <code>null</code>, the default charset of the current
109      * platform is used
110      */
111     public AbstractRepresentation(final int recogNum, final int splitMax,
112             final int n, final String outCharset) {
113         super(recogNum);
114         splitMaximum = splitMax;
115         storeN = n;
116         outputCharset = outCharset;
117     }
118 
119     /***
120      * Builds the context representation of text in an element. Returns a
121      * feature vector of all context features considered relevant for
122      * representation.
123      *
124      * @param element the element whose context should be represented
125      * @param leftText textual content to the left of (preceding)
126      * <code>mainText</code>, might be empty
127      * @param mainText the main textual content to represent, might be empty
128      * @param rightText textual content to the right of (following)
129      * <code>mainText</code>, might be empty
130      * @param priorRecognitions a buffer of the last {@link Recognition}s from
131      * the document, created by calling {@link Representation#initDocument};
132      * might be <code>null</code>
133      * @param featureCache a cache of (local) feature, should be re-used
134      * between all calls for the nodes in a single document (but must not be
135      * re-used when building the context of nodes in different documents!)
136      * @param logPurpose the type of contexts of main interest to the caller
137      * (e.g. "Token" or "Sentence"), used for logging
138      * @return a vector of features considered relevant for representation
139      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
140      * contains objects that aren't {@link Recognition}s
141      * @throws IllegalArgumentException if the specified node is of an
142      * unsupported type
143      */
144     public final FeatureVector buildContext(final Element element,
145             final String leftText, final String mainText,
146             final String rightText, final PriorRecognitions priorRecognitions,
147             final Map<Element, List<LocalFeature>> featureCache,
148             final String logPurpose)
149     throws ClassCastException, IllegalArgumentException {
150         FeatureVector result = null;
151 
152         try {
153             result = doBuildContext(element, leftText, mainText, rightText,
154                     priorRecognitions, featureCache, logPurpose);
155 
156             // store each n-th representation, if configured
157             if (storeN > 0) {
158                 storeIfNth(result, logPurpose);
159             }
160 
161             return result;
162         } catch (Error e) {
163             // log details and rethrow
164             Util.LOG.error("Error while building " + logPurpose 
165                     + " context representation for " 
166                     + DOMUtils.showToken(element, mainText) + " ("
167                     + (result != null ? "current size of rep.: " + result.size()
168                             : "rep. is <null>")
169                     + ", size of feature cache: " + featureCache.size() + ")",
170                     e);
171             Util.LOG.debug("Snapshot of context rep.: " + result);
172             //Util.LOG.debug("Snapshot of feature cache: " + featureCache);
173             throw e;
174         }
175     }
176 
177     /***
178      * Builds the context representation of text in an element. Returns a
179      * feature vector of all context features considered relevant for
180      * representation.
181      *
182      * @param element the element whose context should be represented
183      * @param leftText textual content to the left of (preceding)
184      * <code>mainText</code>, might be empty
185      * @param mainText the main textual content to represent, might be empty
186      * @param rightText textual content to the right of (following)
187      * <code>mainText</code>, might be empty
188      * @param priorRecognitions a buffer of the last {@link Recognition}s from
189      * the document, created by calling {@link Representation#initDocument};
190      * might be <code>null</code>
191      * @param featureCache a cache of (local) feature, should be re-used
192      * between all calls for the nodes in a single document (but must not be
193      * re-used when building the context of nodes in different documents!)
194      * @param logPurpose the type of contexts of main interest to the caller
195      * (e.g. "Token" or "Sentence"), used for logging
196      * @return a vector of features considered relevant for representation
197      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
198      * contains objects that aren't {@link Recognition}s
199      * @throws IllegalArgumentException if the specified node is of an
200      * unsupported type
201      */
202     protected abstract FeatureVector doBuildContext(final Element element,
203             final String leftText, final String mainText,
204             final String rightText, final PriorRecognitions priorRecognitions,
205             final Map<Element, List<LocalFeature>> featureCache,
206             final String logPurpose)
207     throws ClassCastException, IllegalArgumentException;
208 
209     /***
210      * Returns the maximum number of subsequences to keep when a feature
211      * value must be split (at whitespace).
212      *
213      * @return the maximum number
214      */
215     public int getSplitMaximum() {
216         return splitMaximum;
217     }
218 
219     /***
220      * Each <em>storeN</em>-th context representation is stored for debugging
221      * and inspection purposes (if &gt; 0, otherwise no representation is
222      * stored).
223      *
224      * @return the value of the attribute
225      */
226     public int getStoreN() {
227         return storeN;
228     }
229 
230     /***
231      * Checks whether the given representation should be stored for inspection
232      * purposes and stores it, if required. This method should be only called
233      * if {@link #storeN} &gt; 0.
234      *
235      * @param representation the representation to store, if required
236      * @param type the type of contexts of main interest to the caller
237      * (e.g. "Token" or "Sentence"), used when storing the representation
238      */
239     private void storeIfNth(final FeatureVector representation,
240             final String type) {
241         File storeFile = null;
242         // locally store count because the global value might be unstable
243         long localCount = -1;
244 
245         // must synchronize while checking count and determining file name
246         synchronized (lastStoredSuffix) {
247             ++repCount;
248             localCount = repCount;
249             if (repCount % storeN == 0) {
250                 // we should store this one -- determine next available
251                 // file name in default directory
252                 storeFile = IOUtils.createOutFile(IOUtils.getDefaultDirectory(),
253                     type + "Context", "rep", lastStoredSuffix);
254             }
255         }
256 
257         // no need to synchronize actual storing process (if required)
258         if (storeFile != null) {
259             Writer writer = null;
260             try {
261                 writer = IOUtils.openWriter(storeFile, outputCharset);
262                 final CharSequence flatRep =
263                     representation.flatten(true);
264                 writer.write(flatRep.toString());
265                 writer.flush();
266                 Util.LOG.info("Stored " + type + " representation no. "
267                         + localCount + " in file " + storeFile
268                         + " for inspection purposes");
269             } catch (IOException ie) {
270                 Util.LOG.error("Error while storing " + type
271                         + " representation no. " + localCount + " in file "
272                         + storeFile + " for inspection purposes", ie);
273             } finally {
274                 IOUtils.tryToClose(writer);
275             }
276         }
277     }
278 
279     /***
280      * Returns a string representation of this object.
281      *
282      * @return a textual representation
283      */
284     public String toString() {
285         return new ToStringBuilder(this)
286             .appendSuper(super.toString())
287             .append("split maximum", splitMaximum)
288             .append("store n-th", storeN)
289             .toString();
290     }
291 
292 }