package de.fu_berlin.ties.extract;

import de.fu_berlin.ties.DocumentReader;
import de.fu_berlin.ties.ProcessingException;
import de.fu_berlin.ties.TiesConfiguration;
import de.fu_berlin.ties.classify.Classifier;
import de.fu_berlin.ties.classify.Reranker;
import de.fu_berlin.ties.classify.TrainableClassifier;
import de.fu_berlin.ties.classify.feature.FeatureVector;
import de.fu_berlin.ties.combi.CombinationStrategy;
import de.fu_berlin.ties.context.ContextDetails;
import de.fu_berlin.ties.context.DefaultRepresentation;
import de.fu_berlin.ties.context.LocalFeature;
import de.fu_berlin.ties.context.PriorRecognitions;
import de.fu_berlin.ties.context.Representation;
import de.fu_berlin.ties.eval.FMetricsView;
import de.fu_berlin.ties.extract.amend.FinalReextractor;
import de.fu_berlin.ties.extract.reestimate.Reestimator;
import de.fu_berlin.ties.filter.DocumentRewriter;
import de.fu_berlin.ties.filter.EmbeddingElements;
import de.fu_berlin.ties.filter.FilterEvaluator;
import de.fu_berlin.ties.filter.FilteringTokenWalker;
import de.fu_berlin.ties.filter.RepresentationFilter;
import de.fu_berlin.ties.filter.SkipHandler;
import de.fu_berlin.ties.filter.TrainableFilter;
import de.fu_berlin.ties.text.TextUtils;
import de.fu_berlin.ties.text.TokenizerFactory;
import de.fu_berlin.ties.util.CollUtils;
import de.fu_berlin.ties.util.Util;
import de.fu_berlin.ties.xml.dom.ElementNameFilter;
import de.fu_berlin.ties.xml.dom.TokenProcessor;
import de.fu_berlin.ties.xml.dom.TokenWalker;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.builder.ToStringBuilder;
import org.dom4j.Document;
import org.dom4j.Element;

/* loaded from: input_file:de/fu_berlin/ties/extract/ExtractorBase.class */
/**
 * Abstract base class for extractors. It bundles the shared collaborators
 * (classifiers, representation, combination strategy, re-estimator,
 * re-extractor, tokenizer factory, document rewriters and an optional
 * sentence filter) together with the per-document state that is rebuilt by
 * {@link #initFields(File)} and updated by
 * {@link #updateState(Element, String, String, String)}.
 *
 * <p>Instances hold mutable per-document state and no synchronization is
 * performed in this class.
 */
public abstract class ExtractorBase extends DocumentReader implements SkipHandler, TokenProcessor {

    /** Configuration key listing the element names handed to the sentence filter. */
    public static final String CONFIG_ELEMENTS = "filter.elements";

    /** Configuration key listing the element names the sentence filter should avoid. */
    public static final String CONFIG_AVOID = "filter.avoid";

    /** Configuration key listing punctuation and symbol tokens that are treated as relevant. */
    public static final String CONFIG_RELEVANT_PUNCTUATION = "extract.punctuation.relevant";

    /** Prefix of the configuration subset passed to the sentence filter's {@link Reranker}. */
    public static final String CONFIG_SENTENCE = "sent";

    /** Suffix passed to {@link TrainableClassifier#createClassifier} when creating the classifiers. */
    public static final String CONFIG_SUFFIX_IE = "ie";

    /** The classifiers, one per class set returned by the strategy's {@code allClasses()}. */
    private final Classifier[] classifiers;

    /** Context details collected for the current document; cleared by {@link #initFields(File)}. */
    private final List<ContextDetails> contextDetails;

    /** The re-estimator (chain) in use; may be {@code null} (see {@link #toString()}). */
    private final Reestimator reestimator;

    /** The final re-extractor in use; may be {@code null} (see {@link #toString()}). */
    private final FinalReextractor reextractor;

    /** Builds the feature vectors (contexts) of tokens. */
    private final Representation representation;

    /** The target structure whose class names are handed to the combination strategy. */
    private final TargetStructure targetStructure;

    /** Factory handed to the token walker and to the representation. */
    private final TokenizerFactory factory;

    /** Optional sentence filter; {@code null} disables sentence filtering. */
    private final TrainableFilter sentenceFilter;

    /** Rewriters applied in order by {@link #filterDocument(Document, File)}; may be {@code null}. */
    private final DocumentRewriter[] documentRewriters;

    /** Active classes as reported by the strategy during the last state update. */
    private Set[] activeClasses;

    /** Per-document cache of local features, passed to the representation. */
    private Map<Element, List<LocalFeature>> featureCache;

    /** Feature vector built during the last state update. */
    private FeatureVector features;

    /** Punctuation and symbol tokens that are considered relevant. */
    private final Set<String> relevantPunctuation;

    /** Prior recognitions of the current document, as returned by the representation. */
    private PriorRecognitions priorRecognitions;

    /** The combination strategy in use. */
    private final CombinationStrategy strategy;

    /** Token walker of the current document; {@code null} until {@link #initFields(File)} ran. */
    private TokenWalker walker;

    /**
     * Creates the document rewriters named by the {@code "rewriters"}
     * configuration entry. Each entry is a class name that is instantiated
     * via {@link Util#createObject}, passing the configuration as argument.
     *
     * @param tiesConfiguration the configuration to read the class names from
     * @return the created rewriters; an empty array if none are configured
     * @throws ProcessingException if a configured class cannot be found, is
     * not a {@link DocumentRewriter}, or cannot be instantiated
     */
    public static DocumentRewriter[] createDocumentRewriters(TiesConfiguration tiesConfiguration) throws ProcessingException {
        final String[] classNames = tiesConfiguration.getStringArray("rewriters");
        if (TiesConfiguration.arrayIsEmpty(classNames)) {
            return new DocumentRewriter[0];
        }

        final DocumentRewriter[] rewriters = new DocumentRewriter[classNames.length];
        final Object[] constructorArgs = {tiesConfiguration};
        for (int i = 0; i < rewriters.length; i++) {
            rewriters[i] = instantiateRewriter(classNames[i], constructorArgs);
            Util.LOG.debug("Initialized document rewriter no." + i + ": " + classNames[i]);
        }
        return rewriters;
    }

    /**
     * Loads and instantiates a single document rewriter.
     *
     * @param className the fully qualified name of the rewriter class
     * @param constructorArgs the arguments handed to {@link Util#createObject}
     * @return the new rewriter
     * @throws ProcessingException if the class cannot be found, is not a
     * {@link DocumentRewriter}, or cannot be instantiated
     */
    private static DocumentRewriter instantiateRewriter(String className, Object[] constructorArgs) throws ProcessingException {
        final String failure = "Cannot initialize document rewriter " + className;
        try {
            final Class<?> rewriterClass = Class.forName(className);
            // Check the type up front so that a misconfigured class name is
            // reported like every other configuration problem instead of
            // escaping as a raw ClassCastException after instantiation.
            if (!DocumentRewriter.class.isAssignableFrom(rewriterClass)) {
                throw new ProcessingException(failure,
                        new ClassCastException(className + " is not a " + DocumentRewriter.class.getName()));
            }
            return (DocumentRewriter) Util.createObject(rewriterClass, constructorArgs, TiesConfiguration.class);
        } catch (ClassNotFoundException e) {
            throw new ProcessingException(failure, e);
        } catch (InstantiationException e) {
            throw new ProcessingException(failure, e);
        }
    }

    /**
     * Creates the sentence filter if the configuration contains a non-empty
     * {@link #CONFIG_ELEMENTS} entry.
     *
     * @param tiesConfiguration the configuration to use
     * @param representation the representation handed to the filter
     * @return the created filter, or {@code null} if sentence filtering is
     * not configured
     * @throws ProcessingException if the filter cannot be initialized
     */
    private static TrainableFilter createSentenceFilter(TiesConfiguration tiesConfiguration, Representation representation) throws ProcessingException {
        if (!tiesConfiguration.containsKey(CONFIG_ELEMENTS)) {
            return null;
        }
        final String[] filteredElements = tiesConfiguration.getStringArray(CONFIG_ELEMENTS);
        if (TiesConfiguration.arrayIsEmpty(filteredElements)) {
            return null;
        }

        final RepresentationFilter filter = new RepresentationFilter(
                tiesConfiguration,
                new ElementNameFilter(filteredElements),
                new ElementNameFilter(tiesConfiguration.getStringArray(CONFIG_AVOID)),
                new Reranker(tiesConfiguration.subset(CONFIG_SENTENCE)),
                representation,
                "Sentence");
        Util.LOG.debug("Initialized representation filter for sentence filtering: " + filter);
        return filter;
    }

    /**
     * Creates a new instance using the standard configuration
     * ({@link TiesConfiguration#CONF}).
     *
     * @param str passed through to the {@link DocumentReader} constructor
     * @throws IllegalArgumentException declared by the delegated constructor
     * @throws ProcessingException if a component cannot be initialized
     */
    public ExtractorBase(String str) throws IllegalArgumentException, ProcessingException {
        this(str, TiesConfiguration.CONF);
    }

    /**
     * Creates a new instance from the given configuration, passing
     * {@code null} as file to the three-argument constructor.
     *
     * @param str passed through to the {@link DocumentReader} constructor
     * @param tiesConfiguration the configuration to use
     * @throws IllegalArgumentException declared by the delegated constructor
     * @throws ProcessingException if a component cannot be initialized
     */
    public ExtractorBase(String str, TiesConfiguration tiesConfiguration) throws IllegalArgumentException, ProcessingException {
        this(str, null, tiesConfiguration);
    }

    /**
     * Creates a new instance, building all collaborators from the given
     * configuration.
     *
     * @param str passed through to the {@link DocumentReader} constructor
     * @param file handed to {@link TrainableClassifier#createClassifier} for
     * each classifier; may be {@code null} (the two-argument constructor
     * passes {@code null})
     * @param tiesConfiguration the configuration to use
     * @throws IllegalArgumentException declared for callers; not thrown
     * directly by this constructor
     * @throws ProcessingException if a component cannot be initialized
     */
    public ExtractorBase(String str, File file, TiesConfiguration tiesConfiguration) throws IllegalArgumentException, ProcessingException {
        super(str, tiesConfiguration);
        this.contextDetails = new ArrayList<ContextDetails>();
        this.targetStructure = new TargetStructure(tiesConfiguration);
        this.representation = new DefaultRepresentation(tiesConfiguration);
        this.strategy = CombinationStrategy.createStrategy(this.targetStructure.getClassNames(), tiesConfiguration);
        this.reestimator = Reestimator.createReestimators(tiesConfiguration);
        this.reextractor = this.strategy.initReextractor(this.reestimator);
        this.documentRewriters = createDocumentRewriters(tiesConfiguration);
        this.sentenceFilter = createSentenceFilter(tiesConfiguration, this.representation);

        // One classifier per set of classes used by the combination strategy.
        final Set[] classSets = this.strategy.allClasses();
        this.classifiers = new Classifier[classSets.length];
        for (int i = 0; i < classSets.length; i++) {
            this.classifiers[i] = TrainableClassifier.createClassifier(
                    CollUtils.asStringSet(classSets[i]), file, tiesConfiguration, CONFIG_SUFFIX_IE);
        }

        this.relevantPunctuation = CollUtils.arrayAsSet(tiesConfiguration.getStringArray(CONFIG_RELEVANT_PUNCTUATION));
        Util.LOG.debug("Initialized set of relevant punctuation + symbol tokens to " + this.relevantPunctuation);
        this.factory = new TokenizerFactory(tiesConfiguration);
    }

    /**
     * Creates a new instance from ready-made collaborators. All arguments
     * are stored as given, without copying; in particular the given
     * {@code set} is modified by later {@link #markRelevant(String)} calls.
     *
     * @param str passed through to the {@link DocumentReader} constructor
     * @param targetStructure the target structure
     * @param classifierArr the classifiers to use
     * @param representation the representation used to build contexts
     * @param combinationStrategy the combination strategy
     * @param finalReextractor the final re-extractor; may be {@code null}
     * @param tokenizerFactory the tokenizer factory
     * @param reestimator the re-estimator; may be {@code null}
     * @param documentRewriterArr the document rewriters; may be {@code null}
     * @param trainableFilter the sentence filter, or {@code null} to disable
     * sentence filtering
     * @param set the punctuation and symbol tokens considered relevant
     * @param tiesConfiguration the configuration to use
     */
    public ExtractorBase(String str, TargetStructure targetStructure, Classifier[] classifierArr, Representation representation, CombinationStrategy combinationStrategy, FinalReextractor finalReextractor, TokenizerFactory tokenizerFactory, Reestimator reestimator, DocumentRewriter[] documentRewriterArr, TrainableFilter trainableFilter, Set<String> set, TiesConfiguration tiesConfiguration) {
        super(str, tiesConfiguration);
        this.contextDetails = new ArrayList<ContextDetails>();
        this.targetStructure = targetStructure;
        this.classifiers = classifierArr;
        this.representation = representation;
        this.strategy = combinationStrategy;
        this.reextractor = finalReextractor;
        this.factory = tokenizerFactory;
        this.reestimator = reestimator;
        this.documentRewriters = documentRewriterArr;
        this.sentenceFilter = trainableFilter;
        this.relevantPunctuation = set;
        Util.LOG.debug("Initialized set of relevant punctuation + symbol tokens to " + this.relevantPunctuation);
    }

    /**
     * Appends a context to the list of context details. A warning is logged
     * if the index of the context differs from the current list size; the
     * context is added in any case.
     *
     * @param contextDetails the context to add
     */
    public void addContextDetails(ContextDetails contextDetails) {
        final int expectedIndex = this.contextDetails.size();
        if (expectedIndex != contextDetails.getIndex()) {
            Util.LOG.warn("Length of context details list is " + expectedIndex + ", but index of next context is " + contextDetails.getIndex() + " (these numbers should be identical since there is one context per index position)");
        }
        this.contextDetails.add(contextDetails);
    }

    /**
     * Creates the token walker that is used when sentence filtering is
     * enabled.
     *
     * @param trainableFilter the sentence filter to use
     * @return the created walker
     */
    protected abstract FilteringTokenWalker createFilteringTokenWalker(TrainableFilter trainableFilter);

    /**
     * Creates the token walker for a document: a filtering walker if
     * sentence filtering is enabled, a plain {@link TokenWalker} otherwise.
     *
     * @return the created walker
     * @throws ProcessingException if the walker cannot be created
     */
    private TokenWalker createTokenWalker() throws ProcessingException {
        if (isSentenceFiltering()) {
            return createFilteringTokenWalker(this.sentenceFilter);
        }
        return new TokenWalker(this, getFactory());
    }

    /**
     * Evaluates sentence filtering for the current document.
     *
     * @param embeddingElements handed to {@link FilterEvaluator#evaluate}
     * @return the evaluation result, or {@code null} if sentence filtering
     * is disabled or no walker has been created yet
     */
    public FMetricsView evaluateSentenceFiltering(EmbeddingElements embeddingElements) {
        if (isSentenceFiltering() && this.walker != null) {
            // With sentence filtering enabled the walker was created by
            // createFilteringTokenWalker, hence the cast.
            return FilterEvaluator.evaluate(embeddingElements, (FilteringTokenWalker) this.walker);
        }
        return null;
    }

    /**
     * Applies all document rewriters in order, feeding the output of each
     * rewriter into the next one.
     *
     * @param document the document to rewrite
     * @param file handed to each rewriter along with the document
     * @return the rewritten document; the given document itself if there
     * are no rewriters
     * @throws IOException if thrown by a rewriter
     * @throws ProcessingException if thrown by a rewriter
     */
    public Document filterDocument(Document document, File file) throws IOException, ProcessingException {
        Document rewritten = document;
        if (this.documentRewriters != null) {
            for (int i = 0; i < this.documentRewriters.length; i++) {
                rewritten = this.documentRewriters[i].rewrite(rewritten, file);
            }
        }
        return rewritten;
    }

    /**
     * Returns the active classes stored by the last state update.
     *
     * @return the active classes; {@code null} before the first update
     */
    public Set[] getActiveClasses() {
        return this.activeClasses;
    }

    /**
     * Returns the classifiers.
     *
     * @return the internal classifier array (not a copy)
     */
    public Classifier[] getClassifiers() {
        return this.classifiers;
    }

    /**
     * Returns the context details collected so far.
     *
     * @return the internal list (not a copy)
     */
    public List<ContextDetails> getContextDetails() {
        return this.contextDetails;
    }

    /**
     * Returns the document rewriters.
     *
     * @return the internal rewriter array (not a copy); may be {@code null}
     */
    public DocumentRewriter[] getDocumentRewriters() {
        return this.documentRewriters;
    }

    /**
     * Returns the tokenizer factory.
     *
     * @return the factory
     */
    public TokenizerFactory getFactory() {
        return this.factory;
    }

    /**
     * Returns the feature vector built by the last state update.
     *
     * @return the feature vector; {@code null} before the first update
     */
    public FeatureVector getFeatures() {
        return this.features;
    }

    /**
     * Returns the prior recognitions of the current document.
     *
     * @return the prior recognitions; {@code null} before
     * {@link #initFields(File)} was called
     */
    public PriorRecognitions getPriorRecognitions() {
        return this.priorRecognitions;
    }

    /**
     * Returns the re-estimator.
     *
     * @return the re-estimator; may be {@code null}
     */
    public Reestimator getReestimator() {
        return this.reestimator;
    }

    /**
     * Returns the final re-extractor.
     *
     * @return the re-extractor; may be {@code null}
     */
    public FinalReextractor getReextractor() {
        return this.reextractor;
    }

    /**
     * Returns the representation used to build contexts.
     *
     * @return the representation
     */
    public Representation getRepresentation() {
        return this.representation;
    }

    /**
     * Returns the sentence filter.
     *
     * @return the filter, or {@code null} if sentence filtering is disabled
     */
    public TrainableFilter getSentenceFilter() {
        return this.sentenceFilter;
    }

    /**
     * Returns the combination strategy.
     *
     * @return the strategy
     */
    public CombinationStrategy getStrategy() {
        return this.strategy;
    }

    /**
     * Returns the target structure.
     *
     * @return the target structure
     */
    public TargetStructure getTargetStructure() {
        return this.targetStructure;
    }

    /**
     * Returns the token walker of the current document.
     *
     * @return the walker; {@code null} before {@link #initFields(File)} was
     * called
     */
    public TokenWalker getWalker() {
        return this.walker;
    }

    /**
     * Re-initializes the per-document state: creates a fresh feature cache,
     * initializes the representation for the document, creates a new token
     * walker, resets the combination strategy and clears the collected
     * context details.
     *
     * @param file handed to the representation's {@code initDocument}
     * @throws ProcessingException if initialization fails
     * @throws IOException if an I/O error occurs
     */
    public void initFields(File file) throws ProcessingException, IOException {
        this.featureCache = new HashMap<Element, List<LocalFeature>>();
        this.priorRecognitions = this.representation.initDocument(file, getFactory());
        this.walker = createTokenWalker();
        this.strategy.reset();
        this.contextDetails.clear();
    }

    /**
     * Checks whether a token is relevant: every token that is not
     * punctuation or a symbol is, and so is any token contained in the set
     * of relevant punctuation.
     *
     * @param str the token to check
     * @return {@code true} if the token is relevant
     */
    public boolean isRelevant(String str) {
        if (TextUtils.punctuationOrSymbol(str)) {
            return this.relevantPunctuation.contains(str);
        }
        return true;
    }

    /**
     * Whether sentence filtering is enabled, i.e. a sentence filter is set.
     *
     * @return {@code true} if a sentence filter is present
     */
    public boolean isSentenceFiltering() {
        return this.sentenceFilter != null;
    }

    /**
     * Adds a token to the set of relevant punctuation.
     *
     * @param str the token to mark as relevant
     */
    public void markRelevant(String str) {
        this.relevantPunctuation.add(str);
    }

    /**
     * Hook to be implemented by subclasses; it is not invoked from within
     * this class.
     */
    protected abstract void resetStrategy();

    /**
     * {@link SkipHandler} callback; this implementation does nothing.
     */
    @Override
    public void skip() {
    }

    /**
     * Returns a string representation listing the classifiers and, if
     * present, the re-estimator and the re-extractor.
     *
     * @return a textual representation of this object
     */
    @Override
    public String toString() {
        final ToStringBuilder builder = new ToStringBuilder(this);
        builder.append("classifiers", (Object[]) this.classifiers);
        if (this.reestimator != null) {
            builder.append("re-estimator", this.reestimator);
        }
        if (this.reextractor != null) {
            builder.append("re-extractor", this.reextractor);
        }
        return builder.toString();
    }

    /**
     * Updates the per-token state: builds the feature vector for the given
     * token via the representation and fetches the currently active classes
     * from the combination strategy.
     *
     * @param element first argument handed to the representation's
     * {@code buildContext}
     * @param str second argument handed to {@code buildContext}
     * @param str2 third argument handed to {@code buildContext}
     * @param str3 fourth argument handed to {@code buildContext}
     */
    public void updateState(Element element, String str, String str2, String str3) {
        this.features = getRepresentation().buildContext(
                element, str, str2, str3, this.priorRecognitions, this.featureCache, "Token");
        this.activeClasses = getStrategy().activeClasses();
    }

    /**
     * Returns a read-only view of the relevant punctuation and symbol
     * tokens.
     *
     * @return an unmodifiable view of the internal set
     */
    public Set<String> viewRelevantPunctuation() {
        return Collections.unmodifiableSet(this.relevantPunctuation);
    }
}
