1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.context.sensor;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.util.Collection;
28 import java.util.HashSet;
29 import java.util.Iterator;
30 import java.util.SortedSet;
31 import java.util.TreeSet;
32
33 import org.apache.commons.collections.KeyValue;
34 import org.apache.commons.collections.keyvalue.DefaultKeyValue;
35 import org.apache.commons.configuration.Configuration;
36 import org.apache.commons.lang.StringUtils;
37 import org.apache.commons.lang.builder.ToStringBuilder;
38
39 import de.fu_berlin.ties.TiesConfiguration;
40 import de.fu_berlin.ties.io.IOUtils;
41 import de.fu_berlin.ties.text.TextUtils;
42 import de.fu_berlin.ties.util.MultiValueMap;
43 import de.fu_berlin.ties.util.SortedMultiValueMap;
44 import de.fu_berlin.ties.util.Util;
45
46 /***
47 * A list sensor uses a one or several gazetteers to look up semantic
48 * information. Gazetteers are text files in a very simple format, they just
49 * contain one entry per line. Gazetters must be in the character set configured
50 * by the {@link de.fu_berlin.ties.io.IOUtils#KEY_LOCAL_CHARSET} (or in the
51 * local default character set if there is no entry for this key).
52 *
53 * @author Christian Siefkes
54 * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:06 $, $Author: siefkes $
55 */
56 public class ListSensor extends BaseSensor {
57
58 /***
59 * Gazetteer files are resolved relative to this path, if given.
60 */
61 private final File basePath;
62
63 /***
64 * Gazetteer entries are looked up case-sensitive if this is
65 * <code>true</code>.
66 */
67 private final boolean caseSensitive;
68
69 /***
70 * The key/value pair to use as negative marker if no positive information
71 * is present for a token; or <code>null</code> if no negative marker
72 * should be used.
73 */
74 private final KeyValue negativeMarker;
75
76 /***
77 * The keys of all gazetteer types managed by this instance; only stored
78 * if "false" should be added for all negative (not present) types.
79 */
80 private final SortedSet<String> typeSet;
81
82 /***
83 * Sorted multi-map containing mapping each known token to a sorted
84 * collection of values to be returned for this token.
85 */
86 private final MultiValueMap<String, String> store =
87 new SortedMultiValueMap<String, String>();
88
89 /***
90 * Creates a new instance.
91 *
92 * @param conf the configuration to use
93 * @throws IOException if an I/O error occurs while reading the gazetteers
94 * to use
95 */
96 public ListSensor(final TiesConfiguration conf) throws IOException {
97 super(conf);
98
99
100 caseSensitive = conf.getBoolean("sensor.list.case");
101 final String basePathName = conf.getString("sensor.list.basepath",
102 null);
103 basePath = (basePathName == null)
104 ? null
105 : new File(basePathName);
106
107
108 if (conf.getBoolean("sensor.list.negative")) {
109 final String negativeMarkerKey =
110 conf.getString("sensor.list.negative.value", null);
111
112 if (StringUtils.isNotEmpty(negativeMarkerKey)) {
113
114 negativeMarker = new DefaultKeyValue(negativeMarkerKey, null);
115 typeSet = null;
116 } else {
117
118 typeSet = new TreeSet<String>();
119 negativeMarker = null;
120 }
121 } else {
122
123 negativeMarker = null;
124 typeSet = null;
125 }
126
127
128 final Configuration fileMappings = conf.subset("sensor.list.map");
129 final Iterator keyIter = fileMappings.getKeys();
130 String key;
131 String fileName;
132 File gazetteerFile;
133 Reader fileReader;
134 String fileContents;
135 String[] entries;
136 String currentEntry;
137 HashSet<String> entryHash;
138
139 while (keyIter.hasNext()) {
140 key = (String) keyIter.next();
141 fileName = fileMappings.getString(key).trim();
142
143
144 if (fileName.length() > 0) {
145 gazetteerFile = new File(fileName);
146
147 if ((basePath != null) && (!gazetteerFile.isAbsolute())) {
148
149 gazetteerFile = new File(basePath, fileName);
150 }
151
152
153 fileReader = IOUtils.openReader(gazetteerFile, conf);
154 fileContents = IOUtils.readToString(fileReader);
155 IOUtils.tryToClose(fileReader);
156
157
158 entries = TextUtils.splitLines(fileContents);
159 entryHash = new HashSet<String>(entries.length);
160
161
162 for (int i = 0; i < entries.length; i++) {
163
164 currentEntry = caseSensitive
165 ? entries[i]
166 : entries[i].toLowerCase();
167
168
169
170 if (!entryHash.contains(currentEntry)) {
171 store.put(currentEntry, key);
172 entryHash.add(currentEntry);
173 }
174 }
175
176 if (typeSet != null) {
177 typeSet.add(key);
178 }
179
180 Util.LOG.debug("Loaded " + gazetteerFile + " file: "
181 + entries.length + " entries");
182 }
183 }
184 }
185
186 /***
187 * {@inheritDoc}
188 */
189 public KeyValue[] lookup(final String token) {
190 final KeyValue[] result;
191
192 final Collection<String> mappings = store.get(
193 caseSensitive ? token : token.toLowerCase());
194
195 if (typeSet != null) {
196 result = new KeyValue[typeSet.size()];
197 final Iterator typeIter = typeSet.iterator();
198 String currentKey;
199 boolean foundKey;
200
201
202 for (int i = 0; typeIter.hasNext(); i++) {
203 currentKey = (String) typeIter.next();
204 foundKey = (mappings != null) && mappings.contains(currentKey);
205 result[i] = new DefaultKeyValue(currentKey,
206 Boolean.toString(foundKey));
207 }
208 } else {
209
210 if ((mappings != null) && (mappings.size() > 0)) {
211
212 result = new KeyValue[mappings.size()];
213 final Iterator mappingsIter = mappings.iterator();
214 String currentKey;
215
216 for (int i = 0; mappingsIter.hasNext(); i++) {
217 currentKey = (String) mappingsIter.next();
218
219 result[i] = new DefaultKeyValue(currentKey, null);
220 }
221 } else {
222 if (negativeMarker != null) {
223
224 result = new KeyValue[] {negativeMarker};
225 } else {
226
227 result = new KeyValue[0];
228 }
229 }
230 }
231
232 return result;
233 }
234
235 /***
236 * Returns a string representation of this object.
237 *
238 * @return a textual representation
239 */
240 public String toString() {
241 return new ToStringBuilder(this)
242 .append("base path", basePath)
243 .append("case sensitive", caseSensitive)
244 .append("negative marker key", negativeMarker.getKey())
245 .append("store size", store.size())
246 .toString();
247 }
248
249 }