View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.text;
23  
24  import java.io.IOException;
25  import java.io.Reader;
26  import java.util.regex.Pattern;
27  
28  import de.fu_berlin.ties.TiesConfiguration;
29  import de.fu_berlin.ties.classify.feature.DefaultFeature;
30  import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
31  import de.fu_berlin.ties.classify.feature.FeatureVector;
32  import de.fu_berlin.ties.io.IOUtils;
33  
34  /***
35   * A tokenizing extractor that prepends field names to each token. The default
36   * implementation is meant for e-mail (or newsgroup) messages: each e-mail
37   * header is converted into a field (using the header name as prefix, e.g.
38   * "Subject:"); the whole body (including attachments) is treated as a single
39   * field (using the empty string as prefix). The first first token is also
40   * considered the beginning of a new field (e.g. start of the "From " line).
41   * 
42   * @author Christian Siefkes
43   * @version $Revision: 1.6 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
44   */
45  public class FieldTokenizingExtractor extends TokenizingExtractor {
46  
47      /***
48       * Pattern matching an RFC2822-style header name: a sequence of printable
49       * characters terminated by a colon and not containing any other colons.
50       */
51      protected static final Pattern FIELDNAME =
52          Pattern.compile("[^//p{Z}//p{C}:]+:");
53  
54      /***
55       * Pattern matching the whitespace that must occur in front of an
56       * RFC2822-style header name: must end in a line break.
57       */
58      protected static final Pattern PRE_FIELDNAME_WS = Pattern.compile(
59              ".*" + TextUtils.NEWLINE_PATTERN.toString(), Pattern.DOTALL);
60  
61      /***
62       * Pattern matching the whitespace marks the end of all regular field names
63       * (ie the begin of the message body): must contain two consecutive line
64       * break.
65       */
66      protected static final Pattern END_OF_FIELDS_WS =
67          // match newlines possessively to avoid treating \r\n as _2_ newlines
68          Pattern.compile(".*" + TextUtils.NEWLINE_PATTERN.toString() + "{2,}+.*",
69                  Pattern.DOTALL);
70  
71      /***
72       * The name of the final field (using after {@link #END_OF_FIELDS_WS}
73       * matched): {@value} (the empty string).
74       */
75      protected static final String FINAL_FIELDNAME = "";
76  
77      /***
78       * Separator character inserted between field name and actual token:
79       * {@value}.
80       */
81      protected static final char FIELD_SEP = '_';
82  
83  
84      /***
85       * The name of the current field.
86       */
87      private String fieldName;
88  
89  
90      /***
91       * Creates a new instance.
92       *
93       * @param conf used to configure this instance
94       * @param suffix optional suffix for
95       * {@linkplain de.fu_berlin.ties.TiesConfiguration#adaptKey(String, String)
96       * adapting configuration keys} if not <code>null</code>
97       */
98      public FieldTokenizingExtractor(final TiesConfiguration conf,
99              final String suffix) {
100         super(conf, suffix);
101     }
102 
103 
104     /***
105      * {@inheritDoc}
106      */
107     public FeatureVector buildFeatures(final Reader reader) throws IOException {
108         final FeatureVector features = new DefaultFeatureVector();
109         final TextTokenizer tokenizer = getTokenizer();
110         boolean foundFinalField = false;
111         boolean addToken;
112         String token, precWS, tokenRep;
113 
114         // reset field name and tokenizer
115         fieldName = null;
116         tokenizer.reset(IOUtils.readToString(reader));
117 
118         while ((token = tokenizer.nextToken()) != null) {
119             if (fieldName == null) {
120                 // very first token: always treat as field name
121                 fieldName = token;
122                 addToken = false;
123             } else {
124                 if (foundFinalField) {
125                     addToken = true;
126                 } else {
127                     // check for start of new field
128                     precWS = tokenizer.precedingWhitespace();
129 
130                     if (END_OF_FIELDS_WS.matcher(precWS).matches()) {
131                         // reached end of header
132                         fieldName = FINAL_FIELDNAME;
133                         addToken = true;
134                         foundFinalField = true;
135                     } else if (PRE_FIELDNAME_WS.matcher(precWS).matches()
136                             && FIELDNAME.matcher(token).matches()) {
137                         // new header field
138                         fieldName = token;
139                         addToken = false;
140                     } else {
141                         // regular token
142                         addToken = true;
143                     }
144                 }
145             }
146 
147             if (addToken) {
148                 tokenRep = fieldName + FIELD_SEP + token;
149                 features.add(new DefaultFeature(tokenRep));
150             }
151         }
152 
153 //        Util.LOG.info("Generated features: " + features.flatten());
154         return features;
155     }
156 
157 }