1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.text;
23
24 import java.io.IOException;
25 import java.io.Reader;
26 import java.util.regex.Pattern;
27
28 import de.fu_berlin.ties.TiesConfiguration;
29 import de.fu_berlin.ties.classify.feature.DefaultFeature;
30 import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
31 import de.fu_berlin.ties.classify.feature.FeatureVector;
32 import de.fu_berlin.ties.io.IOUtils;
33
34 /***
35 * A tokenizing extractor that prepends field names to each token. The default
36 * implementation is meant for e-mail (or newsgroup) messages: each e-mail
37 * header is converted into a field (using the header name as prefix, e.g.
38 * "Subject:"); the whole body (including attachments) is treated as a single
39 * field (using the empty string as prefix). The first first token is also
40 * considered the beginning of a new field (e.g. start of the "From " line).
41 *
42 * @author Christian Siefkes
43 * @version $Revision: 1.6 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
44 */
45 public class FieldTokenizingExtractor extends TokenizingExtractor {
46
47 /***
48 * Pattern matching an RFC2822-style header name: a sequence of printable
49 * characters terminated by a colon and not containing any other colons.
50 */
51 protected static final Pattern FIELDNAME =
52 Pattern.compile("[^//p{Z}//p{C}:]+:");
53
54 /***
55 * Pattern matching the whitespace that must occur in front of an
56 * RFC2822-style header name: must end in a line break.
57 */
58 protected static final Pattern PRE_FIELDNAME_WS = Pattern.compile(
59 ".*" + TextUtils.NEWLINE_PATTERN.toString(), Pattern.DOTALL);
60
61 /***
62 * Pattern matching the whitespace marks the end of all regular field names
63 * (ie the begin of the message body): must contain two consecutive line
64 * break.
65 */
66 protected static final Pattern END_OF_FIELDS_WS =
67
68 Pattern.compile(".*" + TextUtils.NEWLINE_PATTERN.toString() + "{2,}+.*",
69 Pattern.DOTALL);
70
71 /***
72 * The name of the final field (using after {@link #END_OF_FIELDS_WS}
73 * matched): {@value} (the empty string).
74 */
75 protected static final String FINAL_FIELDNAME = "";
76
77 /***
78 * Separator character inserted between field name and actual token:
79 * {@value}.
80 */
81 protected static final char FIELD_SEP = '_';
82
83
84 /***
85 * The name of the current field.
86 */
87 private String fieldName;
88
89
90 /***
91 * Creates a new instance.
92 *
93 * @param conf used to configure this instance
94 * @param suffix optional suffix for
95 * {@linkplain de.fu_berlin.ties.TiesConfiguration#adaptKey(String, String)
96 * adapting configuration keys} if not <code>null</code>
97 */
98 public FieldTokenizingExtractor(final TiesConfiguration conf,
99 final String suffix) {
100 super(conf, suffix);
101 }
102
103
104 /***
105 * {@inheritDoc}
106 */
107 public FeatureVector buildFeatures(final Reader reader) throws IOException {
108 final FeatureVector features = new DefaultFeatureVector();
109 final TextTokenizer tokenizer = getTokenizer();
110 boolean foundFinalField = false;
111 boolean addToken;
112 String token, precWS, tokenRep;
113
114
115 fieldName = null;
116 tokenizer.reset(IOUtils.readToString(reader));
117
118 while ((token = tokenizer.nextToken()) != null) {
119 if (fieldName == null) {
120
121 fieldName = token;
122 addToken = false;
123 } else {
124 if (foundFinalField) {
125 addToken = true;
126 } else {
127
128 precWS = tokenizer.precedingWhitespace();
129
130 if (END_OF_FIELDS_WS.matcher(precWS).matches()) {
131
132 fieldName = FINAL_FIELDNAME;
133 addToken = true;
134 foundFinalField = true;
135 } else if (PRE_FIELDNAME_WS.matcher(precWS).matches()
136 && FIELDNAME.matcher(token).matches()) {
137
138 fieldName = token;
139 addToken = false;
140 } else {
141
142 addToken = true;
143 }
144 }
145 }
146
147 if (addToken) {
148 tokenRep = fieldName + FIELD_SEP + token;
149 features.add(new DefaultFeature(tokenRep));
150 }
151 }
152
153
154 return features;
155 }
156
157 }