package de.fu_berlin.ties.extract;

import de.fu_berlin.ties.ContextMap;
import de.fu_berlin.ties.ProcessingException;
import de.fu_berlin.ties.TextProcessor;
import de.fu_berlin.ties.TiesConfiguration;
import de.fu_berlin.ties.classify.TrainableClassifier;
import de.fu_berlin.ties.combi.CombinationState;
import de.fu_berlin.ties.combi.CombinationStrategy;
import de.fu_berlin.ties.context.Representation;
import de.fu_berlin.ties.eval.Accuracy;
import de.fu_berlin.ties.text.TextTokenizer;
import de.fu_berlin.ties.text.TokenizerFactory;
import de.fu_berlin.ties.util.Util;
import de.fu_berlin.ties.xml.dom.TokenProcessor;
import de.fu_berlin.ties.xml.dom.TokenWalker;
import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import org.apache.commons.collections.Bag;
import org.apache.commons.collections.HashBag;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.builder.ToStringBuilder;
import org.dom4j.Document;
import org.dom4j.Element;

/* loaded from: input_file:de/fu_berlin/ties/extract/Trainer.class */
/**
 * Walks the tokens of a document and, for each token, derives the expected
 * combination state from a list of correct extractions ("answer keys").
 * Depending on the configuration, that state is then used to train the
 * classifier on every token, to train it only on misclassified tokens
 * (while counting hits and misses), or merely to verify that the combination
 * strategy can translate the state back and forth.
 *
 * <p>Instances keep per-document state in fields (current extraction,
 * remaining tokens, ...); nothing in this class synchronizes access to it.
 */
public class Trainer extends ExtractorBase implements TokenProcessor {
    /**
     * Configuration key of the boolean flag that selects "train only errors"
     * mode: the classifier is trained via {@code trainOnError} and an
     * {@link Accuracy} is maintained.
     */
    public static final String CONFIG_TOE = "train.only-errors";

    /**
     * Configuration key of the boolean flag that disables training; each
     * token's state is only re-translated through the combination strategy
     * and mismatches are logged.
     */
    public static final String CONFIG_TEST_ONLY = "train.test-only";

    /** Whether the classifier is trained only on tokens it gets wrong. */
    private final boolean trainingOnlyErrors;

    /** Whether training is skipped and only the strategy is checked. */
    private final boolean testingOnly;

    /** Hit/miss counter; only created by {@link #train} in TOE mode. */
    private Accuracy accuracy;

    /** Iterator over the correct extractions of the current document. */
    private Iterator extractionIter;

    /** The correct extraction that is being matched or awaited, if any. */
    private Extraction currentExtraction;

    /** The extraction being rebuilt token by token from the document. */
    private Extraction partialExtraction;

    /** First token of {@link #currentExtraction}; {@code null} if none. */
    private String firstToken;

    /** Whether the walk is currently inside {@link #currentExtraction}. */
    private boolean inExtraction;

    /** Tokens of {@link #currentExtraction} not yet seen in the document. */
    private Bag remainingTokens;

    /** Tokenizer used to split the text of each correct extraction. */
    private final TextTokenizer tokenizer;

    /**
     * Creates an instance using {@code "tmp"} as first constructor argument
     * and the standard configuration {@link TiesConfiguration#CONF}.
     *
     * @throws IllegalArgumentException propagated from the delegate constructor
     * @throws ProcessingException propagated from the delegate constructor
     */
    public Trainer() throws IllegalArgumentException, ProcessingException {
        this("tmp", TiesConfiguration.CONF);
    }

    /**
     * Creates an instance using the standard configuration
     * {@link TiesConfiguration#CONF}.
     *
     * @param str forwarded unchanged to the {@link ExtractorBase} constructor
     * @throws IllegalArgumentException propagated from the delegate constructor
     * @throws ProcessingException propagated from the delegate constructor
     */
    public Trainer(String str) throws IllegalArgumentException, ProcessingException {
        this(str, TiesConfiguration.CONF);
    }

    /**
     * Creates an instance without an explicit file ({@code null} is passed on).
     *
     * @param str forwarded unchanged to the {@link ExtractorBase} constructor
     * @param tiesConfiguration configuration to read the mode flags from
     * @throws IllegalArgumentException propagated from the delegate constructor
     * @throws ProcessingException propagated from the delegate constructor
     */
    public Trainer(String str, TiesConfiguration tiesConfiguration) throws IllegalArgumentException, ProcessingException {
        this(str, null, tiesConfiguration);
    }

    /**
     * Creates an instance whose mode flags are read from the given
     * configuration ({@link #CONFIG_TOE} and {@link #CONFIG_TEST_ONLY}).
     *
     * @param str forwarded unchanged to the {@link ExtractorBase} constructor
     * @param file forwarded unchanged to the {@link ExtractorBase} constructor;
     *        may be {@code null} (see the two-argument constructor)
     * @param tiesConfiguration configuration to read the mode flags from
     * @throws IllegalArgumentException propagated from the super constructor
     * @throws ProcessingException propagated from the super constructor
     */
    public Trainer(String str, File file, TiesConfiguration tiesConfiguration) throws IllegalArgumentException, ProcessingException {
        super(str, file, tiesConfiguration);
        this.accuracy = null;
        this.remainingTokens = new HashBag();
        this.trainingOnlyErrors = tiesConfiguration.getBoolean(CONFIG_TOE);
        this.testingOnly = tiesConfiguration.getBoolean(CONFIG_TEST_ONLY);
        this.tokenizer = getFactory().createTokenizer(StringUtils.EMPTY);
    }

    /**
     * Creates an instance from explicit components; the mode flags are read
     * from the standard configuration {@link TiesConfiguration#CONF}.
     *
     * @param str forwarded unchanged to the {@link ExtractorBase} constructor
     * @param targetStructure forwarded to the super constructor
     * @param trainableClassifier forwarded to the super constructor
     * @param representation forwarded to the super constructor
     * @param combinationStrategy forwarded to the super constructor
     * @param tokenizerFactory forwarded to the super constructor
     */
    public Trainer(String str, TargetStructure targetStructure, TrainableClassifier trainableClassifier, Representation representation, CombinationStrategy combinationStrategy, TokenizerFactory tokenizerFactory) {
        this(str, targetStructure, trainableClassifier, representation, combinationStrategy, tokenizerFactory, TiesConfiguration.CONF.getBoolean(CONFIG_TOE), TiesConfiguration.CONF.getBoolean(CONFIG_TEST_ONLY), TiesConfiguration.CONF);
    }

    /**
     * Creates an instance from explicit components and explicit mode flags.
     *
     * @param str forwarded unchanged to the {@link ExtractorBase} constructor
     * @param targetStructure forwarded to the super constructor
     * @param trainableClassifier forwarded to the super constructor
     * @param representation forwarded to the super constructor
     * @param combinationStrategy forwarded to the super constructor
     * @param tokenizerFactory forwarded to the super constructor
     * @param z whether to train only on errors (see {@link #CONFIG_TOE})
     * @param z2 whether to test only (see {@link #CONFIG_TEST_ONLY})
     * @param tiesConfiguration forwarded to the super constructor
     */
    public Trainer(String str, TargetStructure targetStructure, TrainableClassifier trainableClassifier, Representation representation, CombinationStrategy combinationStrategy, TokenizerFactory tokenizerFactory, boolean z, boolean z2, TiesConfiguration tiesConfiguration) {
        super(str, targetStructure, trainableClassifier, representation, combinationStrategy, tokenizerFactory, tiesConfiguration);
        this.accuracy = null;
        this.remainingTokens = new HashBag();
        this.trainingOnlyErrors = z;
        this.testingOnly = z2;
        this.tokenizer = getFactory().createTokenizer(StringUtils.EMPTY);
    }

    /**
     * Returns whether this instance only checks the combination strategy
     * instead of training the classifier.
     *
     * @return the "testing only" flag
     */
    public boolean isTestingOnly() {
        return this.testingOnly;
    }

    /**
     * Returns whether the classifier is trained only on tokens it
     * misclassifies.
     *
     * @return the "training only errors" flag
     */
    public boolean isTrainingOnlyErrors() {
        return this.trainingOnlyErrors;
    }

    /**
     * Reads the answer keys corresponding to the document file named by the
     * context ({@link TextProcessor#KEY_DIRECTORY} and
     * {@link TextProcessor#KEY_LOCAL_NAME}) and trains on the document.
     * The accuracy returned by {@link #train} is discarded.
     *
     * @param document the document to train on
     * @param writer not used by this implementation
     * @param contextMap supplies the directory and local name of the document
     * @throws IOException propagated from reading the answer keys or training
     * @throws ProcessingException propagated from reading the answer keys or
     *         training
     */
    @Override
    public void process(Document document, Writer writer, ContextMap contextMap) throws IOException, ProcessingException {
        final File directory = (File) contextMap.get(TextProcessor.KEY_DIRECTORY);
        final String localName = (String) contextMap.get(TextProcessor.KEY_LOCAL_NAME);
        final File documentFile = new File(directory, localName);
        final ExtractionContainer answerKeys =
                AnswerBuilder.readCorrespondingAnswerKeys(getTargetStructure(), documentFile, getConfig());
        train(document, answerKeys);
    }

    /**
     * Handles a single token of the walked document: determines the expected
     * state of the token from the current correct extraction, then trains or
     * tests with that state and advances to the next extraction once all of
     * its tokens have been consumed.
     *
     * @param element forwarded to {@code updateState}
     * @param str forwarded to {@code updateState}
     * @param str2 the current token; compared against the tokens of the
     *        current correct extraction
     * @param str3 forwarded to {@code updateState}
     * @param i the repetition index of the token ("token rep"); an extraction
     *        can only start if its first token rep is not greater than this
     * @param z forwarded to {@code Extraction.append} together with the token
     * @param contextMap not used by this implementation
     * @throws ProcessingException propagated from the classifier or strategy
     */
    @Override
    public void processToken(Element element, String str, String str2, String str3, int i, boolean z, ContextMap contextMap) throws ProcessingException {
        updateState(element, str, str2, str3);

        // An extraction starts here if we are waiting for one, the token is
        // its first token and its expected position has been reached.
        final boolean startsExtraction = !this.inExtraction
                && this.currentExtraction != null
                && str2.equals(this.firstToken)
                && this.currentExtraction.getFirstTokenRep() <= i;
        if (startsExtraction) {
            this.inExtraction = true;
            Util.LOG.debug("Starting extraction (" + this.firstToken + " token)");
        }

        final String extractionType;
        final boolean noTokensLeft;
        if (this.inExtraction) {
            if (!this.remainingTokens.remove(str2, 1)) {
                // Answer key and document disagree: give up on this
                // extraction and treat the token as outside.
                Util.LOG.error("Correct extractions don't match document: still missing tokens " + this.remainingTokens + " from extraction " + this.currentExtraction + " but current token '" + str2 + "' (token rep=" + i + ") doesn't match");
                this.remainingTokens.clear();
                extractionType = null;
            } else {
                extractionType = this.currentExtraction.getType();
                if (startsExtraction) {
                    this.partialExtraction = new Extraction(extractionType, str2, this.currentExtraction.getFirstTokenRep());
                    getPriorRecognitions().add(this.partialExtraction);
                } else {
                    this.partialExtraction.append(str2, z);
                }
            }
            noTokensLeft = this.remainingTokens.isEmpty();
        } else {
            extractionType = null;
            noTokensLeft = false;
        }

        final CombinationState currentState;
        if (extractionType == null) {
            currentState = CombinationState.OUTSIDE;
            // Outside of any extraction: the previous partial one is complete.
            if (this.partialExtraction != null && !this.partialExtraction.isSealed()) {
                this.partialExtraction.seal();
            }
        } else {
            currentState = new CombinationState(extractionType, startsExtraction, noTokensLeft);
        }

        final String translatedState = getStrategy().translateCurrentState(currentState);
        Util.LOG.debug("Current state: " + currentState + "; translated state: '" + translatedState + "'");

        if (this.testingOnly) {
            // Round-trip check: translating back must yield the same type
            // and, for non-outside states, the same begin flag.
            final CombinationState retranslated = getStrategy().translateResult(translatedState);
            final boolean sameType = StringUtils.equals(currentState.getType(), retranslated.getType());
            if (!sameType || (currentState.getType() != null && currentState.isBegin() != retranslated.isBegin())) {
                Util.LOG.error("Error in combination strategy: incorrect re-translation " + retranslated + " of current state " + currentState);
            }
        } else if (!this.trainingOnlyErrors) {
            ((TrainableClassifier) getClassifier()).train(getFeatures(), translatedState);
            Util.LOG.debug("Trained classifier");
        } else if (((TrainableClassifier) getClassifier()).trainOnError(getFeatures(), translatedState, getActiveClasses()) == null) {
            // A null result is counted as a correct prediction.
            this.accuracy.incTrueCount();
        } else {
            this.accuracy.incFalseCount();
        }

        getStrategy().updateState(currentState);

        // All tokens consumed (or extraction abandoned): move to the next one.
        if (this.inExtraction && this.remainingTokens.isEmpty()) {
            updateCurrentExtraction();
        }
    }

    /**
     * Returns a string representation listing the mode flags, the tokenizer
     * and the current matching state, appended to the superclass output.
     *
     * @return a textual representation of this object
     */
    @Override
    public String toString() {
        return new ToStringBuilder(this)
                .appendSuper(super.toString())
                .append("training only errors", this.trainingOnlyErrors)
                .append("testing only", this.testingOnly)
                .append("tokenizer", this.tokenizer)
                .append("in extraction", this.inExtraction)
                .append("current extraction", this.currentExtraction)
                .append("first token", this.firstToken)
                .append("remaining tokens", this.remainingTokens)
                .toString();
    }

    /**
     * Walks all tokens of a document and processes each of them against the
     * given correct extractions (see {@link #processToken}). Mismatches
     * between document and extractions that remain at the end of the
     * document are logged as errors.
     *
     * @param document the document to walk
     * @param extractionContainer the correct extractions for the document;
     *        its iterator is consumed in order
     * @return the accuracy collected during this call if training only
     *         errors; {@code null} otherwise
     * @throws IOException propagated from the token walker
     * @throws ProcessingException propagated from the token walker or from
     *         token processing
     */
    public Accuracy train(Document document, ExtractionContainer extractionContainer) throws IOException, ProcessingException {
        initFields();

        // A previous call aborted by an exception may have left tokens
        // behind; without this reset updateCurrentExtraction() would throw
        // IllegalStateException on every subsequent document.
        this.remainingTokens.clear();

        this.extractionIter = extractionContainer.iterator();
        updateCurrentExtraction();

        final TokenWalker walker = new TokenWalker(this, getFactory());
        if (this.trainingOnlyErrors) {
            this.accuracy = new Accuracy();
        }
        walker.walk(document, null);

        if (!this.remainingTokens.isEmpty()) {
            if (this.inExtraction) {
                Util.LOG.error("Document ended while training extraction: " + this.currentExtraction + ", unprocessed tokens: " + this.remainingTokens);
            } else {
                Util.LOG.error("Document ended while waiting for start of extraction: " + this.currentExtraction);
            }
            this.remainingTokens.clear();
        }

        if (this.extractionIter.hasNext()) {
            Util.LOG.error("Unprocessed extractions at end of document: " + this.extractionIter.next());
        }

        if (!this.trainingOnlyErrors) {
            return null;
        }
        Util.LOG.debug("Finished training in TOE mode: " + this.accuracy + ", " + getClassifier().toString());
        return this.accuracy;
    }

    /**
     * Advances to the next correct extraction, if any, and refills
     * {@link #remainingTokens} and {@link #firstToken} from its tokenized
     * text. If no extraction is left, the current extraction becomes
     * {@code null}.
     *
     * @throws IllegalStateException if tokens of the previous extraction are
     *         still waiting to be matched
     */
    private void updateCurrentExtraction() throws IllegalStateException {
        if (!this.remainingTokens.isEmpty()) {
            throw new IllegalStateException("Cannot update current extraction while there are remaining tokens: " + this.remainingTokens);
        }

        this.inExtraction = false;
        this.firstToken = null;

        if (!this.extractionIter.hasNext()) {
            this.currentExtraction = null;
            return;
        }

        this.currentExtraction = (Extraction) this.extractionIter.next();
        this.tokenizer.reset(this.currentExtraction.getText());

        // The tokenizer signals the end of the text with a null token.
        for (String token = this.tokenizer.nextToken(); token != null; token = this.tokenizer.nextToken()) {
            if (this.firstToken == null) {
                this.firstToken = token;
            }
            this.remainingTokens.add(token);
        }
    }
}
