View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.demo;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.InputStreamReader;
28  import java.io.Reader;
29  import java.util.ArrayList;
30  import java.util.Enumeration;
31  import java.util.Iterator;
32  import java.util.List;
33  import java.util.regex.Matcher;
34  import java.util.regex.Pattern;
35  import java.util.zip.ZipEntry;
36  import java.util.zip.ZipFile;
37  
38  import org.apache.commons.lang.builder.ToStringBuilder;
39  
40  import de.fu_berlin.ties.io.IOUtils;
41  import de.fu_berlin.ties.text.TextUtils;
42  
43  /***
44   * This class manages a set of sample spam and nonspam mails. It is initialized
45   * from a ZIP file containing the semple mails. The ZIP files must contain two
46   * directories named <code>spam</code> and <code>nonspam</code>, all files
47   * within these directories are supposed to be sample mails. All messaged are
48   * supposed to use the default character set of the current platform.
49   *
50   * @author Christian Siefkes
51   * @version $Revision: 1.6 $, $Date: 2006/10/21 16:04:09 $, $Author: siefkes $
52   */
53  public class SampleMails {
54  
55      /***
56       * Subject generated if a message does not contain a "Subject:" header.
57       */
58      public static final String NO_SUBJECT = "[No subject]";
59  
60      /***
61       * Path separator used in ZIP files: always a {@value}, following the Unix
62       * convention.
63       */
64      private static final char ZIP_PATH_SEPARATOR = '/';
65  
66      /***
67       * Pattern matching the contents of mail's <code>Subject:</code> line
68       * (including continuation lines, if any).
69       */
70      private static final Pattern SUBJECT_PATTERN = Pattern.compile(
71              "^Subject://s*(.*" + TextUtils.NEWLINE_PATTERN.pattern()
72                  + "(?:" + TextUtils.SINGLE_LINE_WS.pattern() + "+//S+.*"
73                  + TextUtils.NEWLINE_PATTERN.pattern() + ")*)",
74              Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
75  
76  
77      /***
78       * A ZIP file containing the sample mails.
79       */
80      private final ZipFile zipFile;
81  
82      /***
83       * A list of spam messages contained in the ZIP file.
84       */
85      private final List<ZipEntry> spamEntries;
86  
87      /***
88       * A list of ham messages contained in the ZIP file.
89       */
90      private final List<ZipEntry> nonspamEntries;
91  
92      /***
93       * Array containing the subjects of all ham mails.
94       */
95      private String[] nonspamSubjects = null; // lazy initialization
96  
97      /***
98       * Array containing the subjects of all spam mails.
99       */
100     private String[] spamSubjects = null; // lazy initialization
101 
102 
103     /***
104      * Creates a new instance.
105      *
106      * @param sampleFile a ZIP file containing the sample mails
107      * @throws IOException if the file cannot be read or is not a valid ZIP file
108      */
109     public SampleMails(final File sampleFile) throws IOException {
110         super();
111         zipFile = new ZipFile(sampleFile);
112 
113         // assuming numbers of spam and nonspam mails are roughly similar
114         final int entryCount = zipFile.size();
115         spamEntries = new ArrayList<ZipEntry>(entryCount / 2);
116         nonspamEntries = new ArrayList<ZipEntry>(entryCount / 2);
117 
118         final String spamPrefix =
119             SpamFilterDemo.CLASS_SPAM + ZIP_PATH_SEPARATOR;
120         final String nonspamPrefix =
121             SpamFilterDemo.CLASS_NONSPAM + ZIP_PATH_SEPARATOR;
122         final Enumeration<? extends ZipEntry> entries = zipFile.entries();
123         ZipEntry currentEntry;
124         String currentName;
125 
126         // determine spam and nonspam entries in ZIP file
127         while (entries.hasMoreElements()) {
128             currentEntry = entries.nextElement();
129 
130             if (!currentEntry.isDirectory()) { // skip directories
131                 currentName = currentEntry.getName();
132 
133                 if (currentName.startsWith(spamPrefix)) {
134                     spamEntries.add(currentEntry);
135                 } else if (currentName.startsWith(nonspamPrefix)) {
136                     nonspamEntries.add(currentEntry);
137                 }
138             }
139         }
140     }
141 
142     /***
143      * Creates a new instance.
144      *
145      * @param sampleFileName the name of a ZIP file containing the sample mails
146      * @throws IOException if the file cannot be read or is not a valid ZIP file
147      */
148     public SampleMails(final String sampleFileName) throws IOException {
149         this(new File(sampleFileName));
150     }
151 
152 
153     /***
154      * Helper method that reads the contens of a ZIP file entry into a String,
155      * using the platform's default character set.
156      *
157      * @param entry the ZIP file entry to read
158      * @return the contents of the entry
159      * @throws IOException if an error occurs while reading from the ZIP file
160      */
161     private String getEntry(final ZipEntry entry) throws IOException {
162         final InputStream in = zipFile.getInputStream(entry);
163         final Reader reader = new InputStreamReader(in);
164 
165         try {
166             final String result = IOUtils.readToString(reader);
167             return result;
168         } finally {
169             IOUtils.tryToClose(reader);
170         }
171     }
172 
173     /***
174      * Returns the contents of a ham message.
175      *
176      * @param index the index of the message to read
177      * @return the contents of the specified message
178      * @throws IndexOutOfBoundsException if index is out of range
179      * <code>(index &lt; 0 ||  index &gt;= {@link #nonspamCount()})</code>
180      * @throws IOException if an I/O error occurs
181      */
182     public String getNonspam(final int index)
183     throws IndexOutOfBoundsException, IOException {
184         return getEntry(nonspamEntries.get(index));
185     }
186 
187     /***
188      * Returns the contents of a spam message.
189      *
190      * @param index the index of the message to read
191      * @return the contents of the specified message
192      * @throws IndexOutOfBoundsException if index is out of range
193      * <code>(index &lt; 0 || index &gt;= {@link #spamCount()})</code>
194      * @throws IOException if an I/O error occurs
195      */
196     public String getSpam(final int index)
197     throws IndexOutOfBoundsException, IOException {
198         return getEntry(spamEntries.get(index));
199     }
200 
201     /***
202      * Helper method that lists the subjects of all mails from a given list.
203      * If a message does not contain a "Subject:" header, the
204      * {@link #NO_SUBJECT} String is used instead.
205      *
206      * @param entries the list of mails to process
207      * @return an array containing the subjects of the given mails
208      * @throws IOException if an error occurs while reading from the ZIP file
209      */
210     private String[] listSubjects(final List<ZipEntry> entries)
211     throws IOException {
212         final String[] result = new String[entries.size()];
213         final Matcher subjectMatcher = SUBJECT_PATTERN.matcher("");
214         final Iterator<ZipEntry> entryIter = entries.iterator();
215         int i = 0;
216         ZipEntry currentEntry;
217         String currentMessage;
218         String currentSubject;
219 
220         while (entryIter.hasNext()) {
221             currentEntry = entryIter.next();
222             currentMessage = getEntry(currentEntry);
223             subjectMatcher.reset(currentMessage);
224 
225             if (subjectMatcher.find()) {
226                 // normalize contents of the Subject (matched by group 1)
227                 currentSubject = TextUtils.normalize(subjectMatcher.group(1));
228             } else {
229                 // no subject
230                 currentSubject = NO_SUBJECT;
231                 //Util.LOG.warn("Missing subject: " + currentEntry.getName());
232             }
233             result[i++] = currentSubject;
234         }
235         return result;
236     }
237 
238     /***
239      * Returns the number of ham messages managed by this instance.
240      *
241      * @return the number of ham messages
242      */
243     public int nonspamCount() {
244         return nonspamEntries.size();
245     }
246 
247     /***
248      * Lists the subjects of all ham mails.
249      *
250      * @return an array of length {@link #nonspamCount()} containing the
251      * subjects of all ham mails
252      * @throws IOException if an error occurs while reading from the ZIP file
253      */
254     public String[] nonspamSubjects() throws IOException {
255         final String[] result;
256 
257         if (nonspamSubjects == null) {
258             result = listSubjects(nonspamEntries);
259             nonspamSubjects = result; // cache subjects
260         } else {
261             result = nonspamSubjects; // use cached subjects
262         }
263         return result;
264     }
265 
266     /***
267      * Returns the number of spam messages managed by this instance.
268      *
269      * @return the number of spam messages
270      */
271     public int spamCount() {
272         return spamEntries.size();
273     }
274 
275     /***
276      * Lists the subjects of all spam mails.
277      *
278      * @return an array of length {@link #spamCount()} containing the subjects
279      * of all spam mails
280      * @throws IOException if an error occurs while reading from the ZIP file
281      */
282     public String[] spamSubjects() throws IOException {
283         final String[] result;
284 
285         if (spamSubjects == null) {
286             result = listSubjects(spamEntries);
287             spamSubjects = result; // cache subjects
288         } else {
289             result = spamSubjects; // use cached subjects
290         }
291         return result;
292     }
293 
294     /***
295      * Returns a string representation of this object.
296      *
297      * @return a textual representation
298      */
299     public String toString() {
300         return new ToStringBuilder(this)
301             .append("zip file", zipFile.getName())
302             .append("spam count", spamCount())
303             .append("nonspam count", spamCount())
304             .toString();
305     }
306 
307 }