1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.demo;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.InputStreamReader;
28 import java.io.Reader;
29 import java.util.ArrayList;
30 import java.util.Enumeration;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.regex.Matcher;
34 import java.util.regex.Pattern;
35 import java.util.zip.ZipEntry;
36 import java.util.zip.ZipFile;
37
38 import org.apache.commons.lang.builder.ToStringBuilder;
39
40 import de.fu_berlin.ties.io.IOUtils;
41 import de.fu_berlin.ties.text.TextUtils;
42
43 /***
44 * This class manages a set of sample spam and nonspam mails. It is initialized
45 * from a ZIP file containing the semple mails. The ZIP files must contain two
46 * directories named <code>spam</code> and <code>nonspam</code>, all files
47 * within these directories are supposed to be sample mails. All messaged are
48 * supposed to use the default character set of the current platform.
49 *
50 * @author Christian Siefkes
51 * @version $Revision: 1.6 $, $Date: 2006/10/21 16:04:09 $, $Author: siefkes $
52 */
53 public class SampleMails {
54
55 /***
56 * Subject generated if a message does not contain a "Subject:" header.
57 */
58 public static final String NO_SUBJECT = "[No subject]";
59
60 /***
61 * Path separator used in ZIP files: always a {@value}, following the Unix
62 * convention.
63 */
64 private static final char ZIP_PATH_SEPARATOR = '/';
65
66 /***
67 * Pattern matching the contents of mail's <code>Subject:</code> line
68 * (including continuation lines, if any).
69 */
70 private static final Pattern SUBJECT_PATTERN = Pattern.compile(
71 "^Subject://s*(.*" + TextUtils.NEWLINE_PATTERN.pattern()
72 + "(?:" + TextUtils.SINGLE_LINE_WS.pattern() + "+//S+.*"
73 + TextUtils.NEWLINE_PATTERN.pattern() + ")*)",
74 Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
75
76
77 /***
78 * A ZIP file containing the sample mails.
79 */
80 private final ZipFile zipFile;
81
82 /***
83 * A list of spam messages contained in the ZIP file.
84 */
85 private final List<ZipEntry> spamEntries;
86
87 /***
88 * A list of ham messages contained in the ZIP file.
89 */
90 private final List<ZipEntry> nonspamEntries;
91
92 /***
93 * Array containing the subjects of all ham mails.
94 */
95 private String[] nonspamSubjects = null;
96
97 /***
98 * Array containing the subjects of all spam mails.
99 */
100 private String[] spamSubjects = null;
101
102
103 /***
104 * Creates a new instance.
105 *
106 * @param sampleFile a ZIP file containing the sample mails
107 * @throws IOException if the file cannot be read or is not a valid ZIP file
108 */
109 public SampleMails(final File sampleFile) throws IOException {
110 super();
111 zipFile = new ZipFile(sampleFile);
112
113
114 final int entryCount = zipFile.size();
115 spamEntries = new ArrayList<ZipEntry>(entryCount / 2);
116 nonspamEntries = new ArrayList<ZipEntry>(entryCount / 2);
117
118 final String spamPrefix =
119 SpamFilterDemo.CLASS_SPAM + ZIP_PATH_SEPARATOR;
120 final String nonspamPrefix =
121 SpamFilterDemo.CLASS_NONSPAM + ZIP_PATH_SEPARATOR;
122 final Enumeration<? extends ZipEntry> entries = zipFile.entries();
123 ZipEntry currentEntry;
124 String currentName;
125
126
127 while (entries.hasMoreElements()) {
128 currentEntry = entries.nextElement();
129
130 if (!currentEntry.isDirectory()) {
131 currentName = currentEntry.getName();
132
133 if (currentName.startsWith(spamPrefix)) {
134 spamEntries.add(currentEntry);
135 } else if (currentName.startsWith(nonspamPrefix)) {
136 nonspamEntries.add(currentEntry);
137 }
138 }
139 }
140 }
141
142 /***
143 * Creates a new instance.
144 *
145 * @param sampleFileName the name of a ZIP file containing the sample mails
146 * @throws IOException if the file cannot be read or is not a valid ZIP file
147 */
148 public SampleMails(final String sampleFileName) throws IOException {
149 this(new File(sampleFileName));
150 }
151
152
153 /***
154 * Helper method that reads the contens of a ZIP file entry into a String,
155 * using the platform's default character set.
156 *
157 * @param entry the ZIP file entry to read
158 * @return the contents of the entry
159 * @throws IOException if an error occurs while reading from the ZIP file
160 */
161 private String getEntry(final ZipEntry entry) throws IOException {
162 final InputStream in = zipFile.getInputStream(entry);
163 final Reader reader = new InputStreamReader(in);
164
165 try {
166 final String result = IOUtils.readToString(reader);
167 return result;
168 } finally {
169 IOUtils.tryToClose(reader);
170 }
171 }
172
173 /***
174 * Returns the contents of a ham message.
175 *
176 * @param index the index of the message to read
177 * @return the contents of the specified message
178 * @throws IndexOutOfBoundsException if index is out of range
179 * <code>(index < 0 || index >= {@link #nonspamCount()})</code>
180 * @throws IOException if an I/O error occurs
181 */
182 public String getNonspam(final int index)
183 throws IndexOutOfBoundsException, IOException {
184 return getEntry(nonspamEntries.get(index));
185 }
186
187 /***
188 * Returns the contents of a spam message.
189 *
190 * @param index the index of the message to read
191 * @return the contents of the specified message
192 * @throws IndexOutOfBoundsException if index is out of range
193 * <code>(index < 0 || index >= {@link #spamCount()})</code>
194 * @throws IOException if an I/O error occurs
195 */
196 public String getSpam(final int index)
197 throws IndexOutOfBoundsException, IOException {
198 return getEntry(spamEntries.get(index));
199 }
200
201 /***
202 * Helper method that lists the subjects of all mails from a given list.
203 * If a message does not contain a "Subject:" header, the
204 * {@link #NO_SUBJECT} String is used instead.
205 *
206 * @param entries the list of mails to process
207 * @return an array containing the subjects of the given mails
208 * @throws IOException if an error occurs while reading from the ZIP file
209 */
210 private String[] listSubjects(final List<ZipEntry> entries)
211 throws IOException {
212 final String[] result = new String[entries.size()];
213 final Matcher subjectMatcher = SUBJECT_PATTERN.matcher("");
214 final Iterator<ZipEntry> entryIter = entries.iterator();
215 int i = 0;
216 ZipEntry currentEntry;
217 String currentMessage;
218 String currentSubject;
219
220 while (entryIter.hasNext()) {
221 currentEntry = entryIter.next();
222 currentMessage = getEntry(currentEntry);
223 subjectMatcher.reset(currentMessage);
224
225 if (subjectMatcher.find()) {
226
227 currentSubject = TextUtils.normalize(subjectMatcher.group(1));
228 } else {
229
230 currentSubject = NO_SUBJECT;
231
232 }
233 result[i++] = currentSubject;
234 }
235 return result;
236 }
237
238 /***
239 * Returns the number of ham messages managed by this instance.
240 *
241 * @return the number of ham messages
242 */
243 public int nonspamCount() {
244 return nonspamEntries.size();
245 }
246
247 /***
248 * Lists the subjects of all ham mails.
249 *
250 * @return an array of length {@link #nonspamCount()} containing the
251 * subjects of all ham mails
252 * @throws IOException if an error occurs while reading from the ZIP file
253 */
254 public String[] nonspamSubjects() throws IOException {
255 final String[] result;
256
257 if (nonspamSubjects == null) {
258 result = listSubjects(nonspamEntries);
259 nonspamSubjects = result;
260 } else {
261 result = nonspamSubjects;
262 }
263 return result;
264 }
265
266 /***
267 * Returns the number of spam messages managed by this instance.
268 *
269 * @return the number of spam messages
270 */
271 public int spamCount() {
272 return spamEntries.size();
273 }
274
275 /***
276 * Lists the subjects of all spam mails.
277 *
278 * @return an array of length {@link #spamCount()} containing the subjects
279 * of all spam mails
280 * @throws IOException if an error occurs while reading from the ZIP file
281 */
282 public String[] spamSubjects() throws IOException {
283 final String[] result;
284
285 if (spamSubjects == null) {
286 result = listSubjects(spamEntries);
287 spamSubjects = result;
288 } else {
289 result = spamSubjects;
290 }
291 return result;
292 }
293
294 /***
295 * Returns a string representation of this object.
296 *
297 * @return a textual representation
298 */
299 public String toString() {
300 return new ToStringBuilder(this)
301 .append("zip file", zipFile.getName())
302 .append("spam count", spamCount())
303 .append("nonspam count", spamCount())
304 .toString();
305 }
306
307 }