package de.fu_berlin.ties.preprocess;

import de.fu_berlin.ties.ContextMap;
import de.fu_berlin.ties.ParsingException;
import de.fu_berlin.ties.ProcessingException;
import de.fu_berlin.ties.TextProcessor;
import de.fu_berlin.ties.TiesConfiguration;
import de.fu_berlin.ties.classify.ClassTrain;
import de.fu_berlin.ties.io.ContentType;
import de.fu_berlin.ties.io.IOUtils;
import de.fu_berlin.ties.text.TextUtils;
import de.fu_berlin.ties.util.ExternalCommand;
import de.fu_berlin.ties.util.Util;
import de.fu_berlin.ties.xml.TagConstituent;
import de.fu_berlin.ties.xml.XMLAdjuster;
import de.fu_berlin.ties.xml.XMLConstituent;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.builder.ToStringBuilder;
import org.w3c.tidy.Lexer;
import org.w3c.tidy.Node;
import org.w3c.tidy.Out;
import org.w3c.tidy.OutImpl;
import org.w3c.tidy.PPrint;
import org.w3c.tidy.Tidy;

/* loaded from: input_file:de/fu_berlin/ties/preprocess/PreProcessor.class */
public class PreProcessor extends TextProcessor {
    /**
     * Configuration key prefix for external converter commands. In
     * {@code doProcess} it is joined with the MIME type of the input (via
     * {@code TiesConfiguration.joinKey}) to look up the command line that
     * is run on documents of that type.
     */
    public static final String CONFIG_HTMLCONV_COMMAND = "html-converter.command";

    /**
     * Configuration key of the boolean switch that enables
     * {@code preprocessText} for input whose MIME type is
     * {@code ContentType.MIME_PLAIN}.
     */
    public static final String CONFIG_PREPROCESS_TEXT = "preprocess.text";

    /**
     * Configuration key of a string array holding the class names of the
     * {@code TextProcessor}s that are instantiated by the constructor and
     * applied, in order, at the end of {@code doProcess}.
     */
    public static final String CONFIG_PREPROCESS_TAGGER = "preprocess.tagger";

    /**
     * Value the no-argument constructor passes on as first constructor
     * argument (presumably the output file extension -- confirm against
     * {@code TextProcessor}).
     */
    public static final String FILE_EXT = "aug";

    /** Regex replacement: the whole match ({@code $0}) followed by a {@code <br/>} tag. */
    private static final String NEWLINE_REPLACEMENT = "$0<br/>";

    /** Local name of the element inside which every newline gets a break tag. */
    private static final String PRE_TAG = "pre";

    /** Local name of the element inside which consecutive short lines get a break tag. */
    private static final String DD_TAG = "dd";

    /**
     * Regex fragment for a "term": one or more characters that are neither
     * whitespace nor a colon, followed by a colon.
     */
    private static final String DL_TERM = "[^\\s:]+:";

    /**
     * Regex fragment for one list entry: a term, at least one
     * single-line whitespace character, and text starting with a
     * non-whitespace character; optionally followed by continuation lines
     * that start with the indentation captured as group 1 plus at least one
     * more whitespace character. The back-reference {@code \1} is only
     * meaningful inside {@link #DL_PARA}, which defines group 1.
     */
    private static final String DL_ENTRY = DL_TERM + TextUtils.SINGLE_LINE_WS.pattern() + "+\\S.*(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "\\1" + TextUtils.SINGLE_LINE_WS.pattern() + "+.+)*";

    /**
     * Matches a whole paragraph that looks like a definition list. The
     * paragraph must begin at the start of the input or after a blank line;
     * group 1 captures the common indentation; at least two entries with that
     * indentation are required; a final term without text is allowed; and the
     * paragraph must be followed by the end of the input or a blank line.
     */
    private static final Pattern DL_PARA = Pattern.compile("(?:\\A|" + TextUtils.NEWLINE_PATTERN.pattern() + TextUtils.SINGLE_LINE_WS.pattern() + "*" + TextUtils.NEWLINE_PATTERN.pattern() + ")(" + TextUtils.SINGLE_LINE_WS.pattern() + "*)" + DL_ENTRY + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "\\1" + DL_ENTRY + ")+(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "\\1" + DL_TERM + TextUtils.SINGLE_LINE_WS.pattern() + "*)?(?=\\Z|" + TextUtils.NEWLINE_PATTERN.pattern() + TextUtils.SINGLE_LINE_WS.pattern() + "*" + TextUtils.NEWLINE_PATTERN.pattern() + ")");

    /**
     * Regex fragment for a "short" line: optional surrounding whitespace
     * around content that is either a single non-whitespace character or
     * 3 to 40 characters beginning and ending with a non-whitespace character.
     * NOTE(review): content of exactly two characters does not match because
     * the inner part requires at least one character -- confirm whether that
     * is intended.
     */
    private static final String SHORT_LINE = TextUtils.SINGLE_LINE_WS.pattern() + "*\\S(?:.{1,38}\\S)?" + TextUtils.SINGLE_LINE_WS.pattern() + "*";

    /**
     * Matches a short line together with its terminating newline, provided
     * the following line is short as well (checked by look-ahead, so the
     * second line is not consumed). The flag value 8 is
     * {@code Pattern.MULTILINE}, which lets {@code ^} and {@code $} match at
     * line boundaries.
     */
    private static final Pattern TWO_SHORT_LINES = Pattern.compile("^" + SHORT_LINE + TextUtils.NEWLINE_PATTERN.pattern() + "(?=" + SHORT_LINE + "$)", 8);

    /**
     * Matches an XML declaration that carries only {@code version="1.0"}:
     * group 1 is the opening part up to and including the version attribute,
     * group 2 is the closing {@code ?>}. Used by {@code cleanHTML} to insert
     * an encoding attribute between the two groups.
     */
    private static final Pattern XML_DECLARATION = Pattern.compile("(<\\?xml\\s+version=\"1\\.0\")\\s*(\\?>)");

    /** Value of the {@link #CONFIG_PREPROCESS_TEXT} setting read at construction time. */
    private final boolean preprocessingText;

    /** JTidy instance used by {@code cleanHTML}; also serves as the lock object there. */
    private final Tidy tidy;

    /** Pretty printer created from the configuration of {@link #tidy}. */
    private final PPrint tidyPrinter;

    /** Processors applied in order at the end of {@code doProcess}; {@code null} if none are configured. */
    private final TextProcessor[] tagger;

    /** Used by {@code insertLineBreaks} to split markup into its raw constituents. */
    private final XMLAdjuster xmlAdjuster;

    /**
     * Creates an instance that passes {@link #FILE_EXT} as first argument to
     * {@link #PreProcessor(String)}.
     */
    public PreProcessor() {
        this(FILE_EXT);
    }

    /**
     * Creates an instance that uses the shared {@code TiesConfiguration.CONF}
     * configuration.
     *
     * @param str handed on unchanged to
     *        {@link #PreProcessor(String, TiesConfiguration)} (presumably the
     *        output file extension -- confirm against {@code TextProcessor})
     * @throws IllegalArgumentException if a configured tagger cannot be
     *         instantiated
     */
    public PreProcessor(String str) throws IllegalArgumentException {
        this(str, TiesConfiguration.CONF);
    }

    /**
     * Creates an instance from the given configuration: reads the
     * {@link #CONFIG_PREPROCESS_TEXT} switch, instantiates the processors
     * listed under {@link #CONFIG_PREPROCESS_TAGGER}, and sets up the JTidy
     * parser, its pretty printer and the XML adjuster.
     *
     * @param str handed on unchanged to the {@code TextProcessor} constructor
     *        (presumably the output file extension -- confirm)
     * @param tiesConfiguration the configuration to read the settings from;
     *        also handed to the superclass and to the {@code XMLAdjuster}
     * @throws IllegalArgumentException if a configured tagger cannot be
     *         instantiated
     */
    public PreProcessor(String str, TiesConfiguration tiesConfiguration) throws IllegalArgumentException {
        super(str, tiesConfiguration);
        this.preprocessingText = tiesConfiguration.getBoolean(CONFIG_PREPROCESS_TEXT);
        this.tagger = instantiateTaggers(tiesConfiguration.getStringArray(CONFIG_PREPROCESS_TAGGER));
        this.tidy = newConfiguredTidy();
        this.tidyPrinter = new PPrint(this.tidy.getConfiguration());
        this.xmlAdjuster = new XMLAdjuster(null, null, null, false, false, false, false, tiesConfiguration);
    }

    /**
     * Instantiates one {@code TextProcessor} per class name.
     *
     * @param classNames the configured class names
     * @return the processors in configuration order, or {@code null} if
     *         {@code TiesConfiguration.arrayIsEmpty} reports the array as empty
     * @throws IllegalArgumentException if any class cannot be loaded or
     *         instantiated; the original exception is kept as cause
     */
    private static TextProcessor[] instantiateTaggers(String[] classNames) {
        if (TiesConfiguration.arrayIsEmpty(classNames)) {
            return null;
        }
        // Every tagger is created with this single constructor argument
        // (presumably its output extension -- confirm).
        final String[] constructorArgs = {"tt"};
        final TextProcessor[] taggers = new TextProcessor[classNames.length];
        for (int idx = 0; idx < classNames.length; idx++) {
            try {
                final Class<?> taggerClass = Class.forName(classNames[idx]);
                taggers[idx] = (TextProcessor) Util.createObject(taggerClass, constructorArgs);
            } catch (Exception e) {
                throw new IllegalArgumentException("Tagger initialization failed", e);
            }
        }
        return taggers;
    }

    /**
     * Creates the JTidy instance and applies the fixed option set used for
     * cleaning HTML.
     *
     * @return the configured instance
     */
    private static Tidy newConfiguredTidy() {
        final Tidy configured = new Tidy();
        try {
            // NOTE(review): this writer is opened for every instance and never
            // closed; the class offers no hook to release it.
            final Writer logFile = new FileWriter("ties-tidy.log");
            configured.setErrout(new PrintWriter(logFile, true));
        } catch (IOException e2) {
            // Best effort only: Tidy keeps its default error output.
            Util.LOG.warn("PreProcessor: couldn't redirect Tidy output to ties-tidy.log", e2);
        }
        // 3 is presumably JTidy's code for UTF-8 -- confirm against
        // org.w3c.tidy.Configuration.
        configured.setCharEncoding(3);
        configured.setEncloseBlockText(true);
        configured.setEncloseText(true);
        configured.setDocType("omit");
        configured.setDropEmptyParas(true);
        configured.setLogicalEmphasis(true);
        configured.setMakeClean(true);
        configured.setOnlyErrors(true);
        configured.setQuoteNbsp(false);
        configured.setQuiet(true);
        configured.setRawOut(true);
        configured.setShowWarnings(false);
        configured.setTidyMark(false);
        configured.setWraplen(0);
        configured.setXmlPi(true);
        configured.setXmlPIs(true);
        configured.setXmlOut(true);
        return configured;
    }

    /**
     * Runs the given HTML through JTidy and returns the pretty-printed
     * result. If a character set name is given, an {@code encoding}
     * attribute with that name is inserted into a bare
     * {@code <?xml version="1.0"?>} declaration of the result.
     *
     * <p>Parsing and printing are done while holding the lock on the shared
     * Tidy instance.
     *
     * @param str the HTML text to clean
     * @param str2 the character set name to declare; if {@code null} or empty
     *        a warning is logged and the declaration is left as it is
     * @return the cleaned document
     * @throws IOException if converting to or from
     *         {@code IOUtils.STANDARD_UNICODE_CHARSET} fails
     */
    public final String cleanHTML(String str, String str2) throws IOException {
        ByteArrayInputStream tidyInput = new ByteArrayInputStream(str.getBytes(IOUtils.STANDARD_UNICODE_CHARSET));
        ByteArrayOutputStream tidyOutput = new ByteArrayOutputStream(str.length());
        synchronized (this.tidy) {
            // Parse without letting Tidy write anything itself; the tree is
            // printed explicitly below.
            Node document = this.tidy.parse(tidyInput, (OutputStream) null);
            OutImpl printerOut = new OutImpl();
            // Same numeric encoding code the constructor passes to
            // setCharEncoding (presumably UTF-8 -- confirm).
            ((Out) printerOut).encoding = 3;
            ((Out) printerOut).out = tidyOutput;
            this.tidyPrinter.printTree(printerOut, (short) 0, 0, (Lexer) null, document);
            this.tidyPrinter.flushLine(printerOut, 0);
        }
        String cleaned = tidyOutput.toString(IOUtils.STANDARD_UNICODE_CHARSET);
        if (StringUtils.isNotEmpty(str2)) {
            // The charset name is caller-supplied text, not a replacement
            // pattern: quote it so that '$' or '\' in it cannot be taken for a
            // group reference or an escape.
            String replacement = "$1 encoding=\"" + Matcher.quoteReplacement(str2) + "\"$2";
            return XML_DECLARATION.matcher(cleaned).replaceFirst(replacement);
        }
        Util.LOG.warn("No character set specified -- cannot fix XML declaration of XHTML document");
        return cleaned;
    }

    /**
     * Reads the whole input, preprocesses it depending on its MIME type,
     * runs the configured taggers over it and writes the result.
     *
     * <ul>
     * <li>Input of type {@code ContentType.MIME_PLAIN} is first passed to
     *     {@code preprocessText} if text preprocessing is enabled.</li>
     * <li>Input of type {@code ContentType.MIME_HTML} is cleaned by
     *     {@code cleanHTML}, then break tags are inserted.</li>
     * <li>Input of any other known type for which a converter command is
     *     configured is piped through that command, then break tags are
     *     inserted (the command's output is not passed to
     *     {@code cleanHTML}).</li>
     * <li>Anything else is left untouched at this stage.</li>
     * </ul>
     *
     * @param reader source of the document
     * @param writer target of the processed document; flushed, not closed
     * @param contextMap supplies the MIME type and the local character set,
     *        and is handed on to every tagger
     * @throws IOException if reading, converting or writing fails
     * @throws ProcessingException if the markup cannot be processed
     */
    @Override // de.fu_berlin.ties.TextProcessor
    protected final void doProcess(Reader reader, Writer writer, ContextMap contextMap) throws IOException, ProcessingException {
        String content = IOUtils.readToString(reader);
        final String mimeType = (String) contextMap.get(ContentType.KEY_MIME_TYPE);

        if (this.preprocessingText && ContentType.MIME_PLAIN.equals(mimeType)) {
            content = preprocessText(content);
        }

        // Set as soon as the content is known to be markup that should
        // receive break tags.
        boolean markup = false;
        if (ContentType.MIME_HTML.equals(mimeType)) {
            content = cleanHTML(content, (String) contextMap.get(IOUtils.KEY_LOCAL_CHARSET));
            markup = true;
        } else if (mimeType != null) {
            final String converterKey = TiesConfiguration.joinKey(CONFIG_HTMLCONV_COMMAND, mimeType);
            if (getConfig().containsKey(converterKey)) {
                final ExternalCommand converter = new ExternalCommand(getConfig().getStringArray(converterKey));
                content = converter.execute(content);
                markup = true;
            }
        }
        if (markup) {
            content = insertLineBreaks(content);
        }

        if (this.tagger != null) {
            // Each tagger consumes the output of the previous one.
            for (TextProcessor step : this.tagger) {
                final StringWriter stepOutput = new StringWriter(content.length());
                step.process(new StringReader(content), stepOutput, contextMap);
                content = stepOutput.toString();
            }
        }

        writer.write(content);
        writer.flush();
    }

    /**
     * Inserts {@code <br/>} tags into text that relies on literal line
     * breaks: after every newline inside {@code pre} elements, and between
     * two consecutive short lines inside {@code dd} elements. Elements are
     * recognized by their plain name or by a name ending in {@code :pre} /
     * {@code :dd}. All other constituents are copied unchanged.
     *
     * @param charSequence the markup to process
     * @return the markup with break tags inserted
     * @throws ParsingException if the markup cannot be split into
     *         constituents
     */
    private String insertLineBreaks(CharSequence charSequence) throws ParsingException {
        StringBuilder result = new StringBuilder();
        boolean insidePre = false;
        boolean insideDd = false;
        // Reusable matchers; TextUtils.replaceAll is given the actual text.
        Matcher newlineMatcher = TextUtils.NEWLINE_PATTERN.matcher("");
        Matcher shortLinesMatcher = TWO_SHORT_LINES.matcher("");

        for (XMLConstituent constituent = this.xmlAdjuster.rawConstituents(charSequence, false); constituent != null; constituent = constituent.nextConstituent()) {
            short type = constituent.getType();

            // Types 0 and 1 carry a tag name; 0 switches the state on and 1
            // switches it off (presumably start tag / end tag -- confirm
            // against the XMLConstituent type constants).
            if (type == 0 || type == 1) {
                String tagName = ((TagConstituent) constituent).getName();
                if (tagName.equals(PRE_TAG) || tagName.endsWith(":pre")) {
                    insidePre = type == 0;
                } else if (tagName.equals(DD_TAG) || tagName.endsWith(":dd")) {
                    insideDd = type == 0;
                }
            }

            String original = constituent.getRepresentantion();
            String processed = original;
            // Type 8 is the only type that gets rewritten (presumably textual
            // content -- confirm).
            if (type == 8) {
                if (insidePre) {
                    processed = TextUtils.replaceAll(original, newlineMatcher, NEWLINE_REPLACEMENT);
                    // Compare by content: whether replaceAll hands back the
                    // same instance for unchanged text is not guaranteed here.
                    if (!processed.equals(original)) {
                        Util.LOG.debug("Inserted break tags into preformatted text:" + TextUtils.LINE_SEPARATOR + processed);
                    }
                } else if (insideDd) {
                    processed = TextUtils.replaceAll(original, shortLinesMatcher, NEWLINE_REPLACEMENT);
                    if (!processed.equals(original)) {
                        Util.LOG.debug("Inserted break tags between short lines in <dd> element:" + TextUtils.LINE_SEPARATOR + processed);
                    }
                }
            }
            result.append(processed);
        }
        return result.toString();
    }

    /**
     * Reformats paragraphs of plain text that look like definition lists
     * (see {@code DL_PARA}). Within each such paragraph two replacements
     * are applied in sequence: first, line starts consisting of the list
     * indentation plus further whitespace are rewritten to the list
     * indentation plus two spaces; second, text following a term on the same
     * line is moved to a new line indented by those two extra spaces.
     *
     * <p>If no such paragraph is found the input is returned exactly as
     * given; otherwise the result is built from the trimmed input.
     *
     * @param str the plain text to preprocess
     * @return the reformatted text, or {@code str} itself if nothing matched
     */
    private String preprocessText(String str) {
        final String trimmed = str.trim();
        final Matcher paragraphs = DL_PARA.matcher(trimmed);
        if (!paragraphs.find()) {
            return str;
        }

        final String newline = TextUtils.NEWLINE_PATTERN.pattern();
        final String lineWs = TextUtils.SINGLE_LINE_WS.pattern();
        final StringBuilder result = new StringBuilder();
        int copiedUpTo = 0;
        boolean found = true;
        while (found) {
            // Copy the unchanged text between the previous and this paragraph.
            result.append(trimmed, copiedUpTo, paragraphs.start());
            copiedUpTo = paragraphs.end();

            final String indent = paragraphs.group(1);
            final String paragraph = paragraphs.group();
            final String deeperIndent = indent + "  ";

            // NOTE(review): ClassTrain.CORRECT_CLASS is used as a regex
            // fragment directly after a whitespace class; presumably its value
            // is the quantifier "+" -- confirm, this looks like a constant
            // substituted for a literal.
            final Pattern overIndentedStart = Pattern.compile("(\\A|" + newline + ")" + indent + lineWs + ClassTrain.CORRECT_CLASS);
            final String reindented = TextUtils.replaceAll(paragraph, overIndentedStart, "$1" + deeperIndent);

            final Pattern termBeforeText = Pattern.compile("((?:\\A|" + newline + ")" + indent + DL_TERM + ")" + lineWs + ClassTrain.CORRECT_CLASS + "(?=\\S)");
            final String reformatted = TextUtils.replaceAll(reindented, termBeforeText, "$1" + TextUtils.LINE_SEPARATOR + deeperIndent);

            Util.LOG.debug("Reformatted paragraph containing definition list: " + reformatted);
            result.append(reformatted);
            found = paragraphs.find();
        }
        // Copy whatever follows the last matched paragraph.
        result.append(trimmed, copiedUpTo, trimmed.length());
        return result.toString();
    }

    /**
     * Returns a description consisting of the superclass description, the
     * text preprocessing switch and, if any are configured, the taggers.
     *
     * @return a textual representation of this object
     */
    @Override // de.fu_berlin.ties.TextProcessor
    public String toString() {
        final ToStringBuilder description = new ToStringBuilder(this);
        description.appendSuper(super.toString());
        description.append("preprocessing text", this.preprocessingText);
        if (this.tagger != null) {
            description.append("tagger", this.tagger);
        }
        return description.toString();
    }
}
