package de.fu_berlin.ties.text;

import de.fu_berlin.ties.TiesConfiguration;
import de.fu_berlin.ties.classify.feature.DefaultFeature;
import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
import de.fu_berlin.ties.classify.feature.FeatureVector;
import de.fu_berlin.ties.io.IOUtils;
import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;

/* loaded from: input_file:de/fu_berlin/ties/text/FieldTokenizingExtractor.class */
/**
 * Tokenizing extractor that prefixes each emitted token with the name of the
 * field it currently belongs to.
 *
 * <p>The very first token of the input is always taken as a field name. While
 * the field section is active, a token that fully matches {@link #FIELDNAME}
 * and whose preceding whitespace fully matches {@link #PRE_FIELDNAME_WS}
 * starts a new field. Once the whitespace before a token fully matches
 * {@link #END_OF_FIELDS_WS}, the field section is over and every remaining
 * token is attributed to {@link #FINAL_FIELDNAME}.
 *
 * <p>The current field name is kept in an instance field that is reset and
 * mutated on every {@link #buildFeatures(Reader)} call.
 */
public class FieldTokenizingExtractor extends TokenizingExtractor {

    /**
     * Shape of a field-name token: one or more characters that are neither
     * separators ({@code \p{Z}}), nor in the {@code \p{C}} category, nor a
     * colon, followed by a single colon.
     */
    protected static final Pattern FIELDNAME = Pattern.compile("[^\\p{Z}\\p{C}:]+:");

    /**
     * Whitespace that may precede a field name: arbitrary content followed by
     * {@link TextUtils#NEWLINE_PATTERN}. Compiled with {@link Pattern#DOTALL}.
     */
    protected static final Pattern PRE_FIELDNAME_WS =
            Pattern.compile(".*" + TextUtils.NEWLINE_PATTERN.toString(), Pattern.DOTALL);

    /**
     * Whitespace that terminates the field section: arbitrary content around
     * {@link TextUtils#NEWLINE_PATTERN} with the possessive quantifier
     * {@code {2,}+} appended. Compiled with {@link Pattern#DOTALL}.
     */
    protected static final Pattern END_OF_FIELDS_WS =
            Pattern.compile(".*" + TextUtils.NEWLINE_PATTERN.toString() + "{2,}+.*", Pattern.DOTALL);

    /** Field name assigned to every token after the field section has ended. */
    protected static final String FINAL_FIELDNAME = "";

    /** Character placed between the field name and the token in a feature. */
    protected static final char FIELD_SEP = '_';

    /** Name of the field that tokens are currently attributed to. */
    private String fieldName;

    /**
     * Creates a new instance; both arguments are passed to the superclass
     * constructor unchanged.
     *
     * @param tiesConfiguration forwarded to {@link TokenizingExtractor}
     * @param str forwarded to {@link TokenizingExtractor}
     */
    public FieldTokenizingExtractor(TiesConfiguration tiesConfiguration, String str) {
        super(tiesConfiguration, str);
    }

    /**
     * Reads the whole input, tokenizes it and builds one feature per non-field-name
     * token, each of the form {@code fieldName + FIELD_SEP + token}.
     *
     * @param reader source of the text to process
     * @return the vector holding all generated features
     * @throws IOException declared for failures while reading the input
     */
    @Override
    public FeatureVector buildFeatures(Reader reader) throws IOException {
        final DefaultFeatureVector features = new DefaultFeatureVector();
        final TextTokenizer tokenizer = getTokenizer();
        boolean fieldsFinished = false;
        this.fieldName = null;
        tokenizer.reset(IOUtils.readToString(reader));

        String token;
        while ((token = tokenizer.nextToken()) != null) {
            if (this.fieldName == null) {
                // The first token is unconditionally treated as a field name.
                this.fieldName = token;
                continue;
            }

            if (!fieldsFinished) {
                // Whitespace is only inspected while the field section is active.
                final String gap = tokenizer.precedingWhitespace();
                if (END_OF_FIELDS_WS.matcher(gap).matches()) {
                    // Field section is over; this token already counts as text.
                    this.fieldName = FINAL_FIELDNAME;
                    fieldsFinished = true;
                } else if (PRE_FIELDNAME_WS.matcher(gap).matches()
                        && FIELDNAME.matcher(token).matches()) {
                    // A new field starts here; the name itself yields no feature.
                    this.fieldName = token;
                    continue;
                }
            }

            features.add(new DefaultFeature(this.fieldName + FIELD_SEP + token));
        }
        return features;
    }
}
