View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.context.sensor;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.util.Collection;
28  import java.util.HashSet;
29  import java.util.Iterator;
30  import java.util.SortedSet;
31  import java.util.TreeSet;
32  
33  import org.apache.commons.collections.KeyValue;
34  import org.apache.commons.collections.keyvalue.DefaultKeyValue;
35  import org.apache.commons.configuration.Configuration;
36  import org.apache.commons.lang.StringUtils;
37  import org.apache.commons.lang.builder.ToStringBuilder;
38  
39  import de.fu_berlin.ties.TiesConfiguration;
40  import de.fu_berlin.ties.io.IOUtils;
41  import de.fu_berlin.ties.text.TextUtils;
42  import de.fu_berlin.ties.util.MultiValueMap;
43  import de.fu_berlin.ties.util.SortedMultiValueMap;
44  import de.fu_berlin.ties.util.Util;
45  
46  /***
47   * A list sensor uses a one or several gazetteers to look up semantic
48   * information. Gazetteers are text files in a very simple format, they just
49   * contain one entry per line. Gazetters must be in the character set configured
50   * by the {@link de.fu_berlin.ties.io.IOUtils#KEY_LOCAL_CHARSET} (or in the
51   * local default character set if there is no entry for this key).
52   *
53   * @author Christian Siefkes
54   * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:06 $, $Author: siefkes $
55   */
56  public class ListSensor extends BaseSensor {
57  
58      /***
59       * Gazetteer files are resolved relative to this path, if given.
60       */
61      private final File basePath;
62  
63      /***
64       * Gazetteer entries are looked up case-sensitive if this is
65       * <code>true</code>.
66       */
67      private final boolean caseSensitive;
68  
69      /***
70       * The key/value pair to use as negative marker if no positive information
71       * is present for a token; or <code>null</code> if no negative marker
72       * should be used.
73       */
74      private final KeyValue negativeMarker;
75  
76      /***
77       * The keys of all gazetteer types managed by this instance; only stored
78       * if "false" should be added for all negative (not present) types.
79       */
80      private final SortedSet<String> typeSet;
81  
82      /***
83       * Sorted multi-map containing mapping each known token to a sorted
84       * collection of values to be returned for this token.
85       */
86      private final MultiValueMap<String, String> store =
87          new SortedMultiValueMap<String, String>();
88  
89      /***
90       * Creates a new instance.
91       *
92       * @param conf the configuration to use
93       * @throws IOException if an I/O error occurs while reading the gazetteers
94       * to use
95       */
96      public ListSensor(final TiesConfiguration conf) throws IOException {
97          super(conf);
98  
99          // init fields
100         caseSensitive = conf.getBoolean("sensor.list.case");
101         final String basePathName = conf.getString("sensor.list.basepath",
102             null);
103         basePath = (basePathName == null)
104             ? null
105             : new File(basePathName);
106 
107         // initialize negative marker resp. type set if it should be used
108         if (conf.getBoolean("sensor.list.negative")) {
109             final String negativeMarkerKey =
110                 conf.getString("sensor.list.negative.value", null);
111 
112             if (StringUtils.isNotEmpty(negativeMarkerKey)) {
113                 // use marker key if _no_ positive information is present
114                 negativeMarker = new DefaultKeyValue(negativeMarkerKey, null);
115                 typeSet = null;
116             } else {
117                 // otherwise use "true" or "false" for _every_ type
118                 typeSet = new TreeSet<String>();
119                 negativeMarker = null;
120             }
121         } else {
122             // use neither
123             negativeMarker = null;
124             typeSet = null;
125         }
126 
127         // load gazetteer files
128         final Configuration fileMappings = conf.subset("sensor.list.map");
129         final Iterator keyIter = fileMappings.getKeys();
130         String key;
131         String fileName;
132         File gazetteerFile;
133         Reader fileReader;
134         String fileContents;
135         String[] entries;
136         String currentEntry;
137         HashSet<String> entryHash;
138 
139         while (keyIter.hasNext()) {
140             key = (String) keyIter.next();
141             fileName = fileMappings.getString(key).trim();
142 
143             // ignore empty values (could be used to reset a mapping)
144             if (fileName.length() > 0) {
145                 gazetteerFile = new File(fileName);
146 
147                 if ((basePath != null) && (!gazetteerFile.isAbsolute())) {
148                     // non-absolute paths are resolved relative to the base path
149                     gazetteerFile = new File(basePath, fileName);
150                 }
151 
152                 // open file using local resp. default charset
153                 fileReader = IOUtils.openReader(gazetteerFile, conf);
154                 fileContents = IOUtils.readToString(fileReader);
155                 IOUtils.tryToClose(fileReader);
156 
157                 // split file into entries (one per non-empty line)
158                 entries = TextUtils.splitLines(fileContents);
159                 entryHash = new HashSet<String>(entries.length);
160 
161                 // add key to collection of values for each entry
162                 for (int i = 0; i < entries.length; i++) {
163                     // entries are converted to lower case if case is ignored
164                     currentEntry = caseSensitive
165                         ? entries[i]
166                         : entries[i].toLowerCase();
167 
168                     // use hash to avoid that entries are added twice (e.g. if
169                     // present in different capitalizations and case is ignored)
170                     if (!entryHash.contains(currentEntry)) {
171                         store.put(currentEntry, key);
172                         entryHash.add(currentEntry);
173                     }
174                 }
175 
176                 if (typeSet != null) {
177                     typeSet.add(key);
178                 }
179 
180                 Util.LOG.debug("Loaded " + gazetteerFile + " file: "
181                     + entries.length + " entries");
182             }
183         }
184     }
185 
186     /***
187      * {@inheritDoc}
188      */
189     public KeyValue[] lookup(final String token) {
190         final KeyValue[] result;
191         // look up the token in the store (in lower case if case is ignored)
192         final Collection<String> mappings = store.get(
193             caseSensitive ? token : token.toLowerCase());
194 
195         if (typeSet != null) {
196             result = new KeyValue[typeSet.size()];
197             final Iterator typeIter = typeSet.iterator();
198             String currentKey;
199             boolean foundKey;
200 
201             // add "true" or "false" for every list type
202             for (int i = 0; typeIter.hasNext(); i++) {
203                 currentKey = (String) typeIter.next();
204                 foundKey = (mappings != null) && mappings.contains(currentKey);
205                 result[i] = new DefaultKeyValue(currentKey,
206                     Boolean.toString(foundKey));
207             }
208         } else {
209             // only use positive types (and single negative marker if specified)
210             if ((mappings != null) && (mappings.size() > 0)) {
211                 // found at least one mapping
212                 result = new KeyValue[mappings.size()];
213                 final Iterator mappingsIter = mappings.iterator();
214                 String currentKey;
215 
216                 for (int i = 0; mappingsIter.hasNext(); i++) {
217                     currentKey = (String) mappingsIter.next();
218                     // use null values for each key
219                     result[i] = new DefaultKeyValue(currentKey, null);
220                 }
221             } else {
222                 if (negativeMarker != null) {
223                     // return negative Marker as only element
224                     result = new KeyValue[] {negativeMarker};
225                 } else {
226                     // return empty array
227                     result = new KeyValue[0];
228                 }
229             }
230         }
231 
232         return result;
233     }
234 
235     /***
236      * Returns a string representation of this object.
237      *
238      * @return a textual representation
239      */
240     public String toString() {
241         return new ToStringBuilder(this)
242             .append("base path", basePath)
243             .append("case sensitive", caseSensitive)
244             .append("negative marker key", negativeMarker.getKey())
245             .append("store size", store.size())
246             .toString();
247     }
248 
249 }