/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.pipe;

import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.pipe.Input2CharSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.MalletLogger;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Pipe that converts a character sequence containing SGML-style markup into
 * a pair of parallel token sequences: the lexed text tokens (stored as the
 * instance data) and one label token per text token (stored as the instance
 * target).
 *
 * <p>Markup is recognized with the pattern {@code </?([^>]*)>}. Text that
 * follows an opening tag is labeled with everything between the tag's
 * {@code <} and {@code >}; text that follows a closing tag, or that precedes
 * the first tag, is labeled with the background tag.
 */
public class SGML2TokenSequence
extends Pipe
implements Serializable {
    private static final Logger logger = MalletLogger.getLogger(SGML2TokenSequence.class.getName());
    /** Matches any opening or closing tag; group 1 is the text after the optional slash. */
    Pattern sgmlPattern = Pattern.compile("</?([^>]*)>");
    /** Lexer used to split the text found between tags into tokens. */
    CharSequenceLexer lexer;
    /** Label given to tokens that are not inside an opening tag's scope. */
    String backgroundTag;
    /** When true, {@link #pipe(Instance)} also stores the tokenization as the instance source. */
    private boolean saveSource = true;
    private static final long serialVersionUID = 1L;
    /** Version number written at the head of the custom serialized form. */
    private static final int CURRENT_SERIAL_VERSION = 1;

    /**
     * Creates a pipe with an explicit lexer, background tag and source-saving flag.
     *
     * @param lexer lexer applied to each stretch of text between tags
     * @param backgroundTag label for tokens outside any opening tag
     * @param saveSource whether {@link #pipe(Instance)} should set the instance source
     */
    public SGML2TokenSequence(CharSequenceLexer lexer, String backgroundTag, boolean saveSource) {
        this.lexer = lexer;
        this.backgroundTag = backgroundTag;
        this.saveSource = saveSource;
    }

    /**
     * Creates a pipe with the given lexer and background tag; the source is saved.
     *
     * @param lexer lexer applied to each stretch of text between tags
     * @param backgroundTag label for tokens outside any opening tag
     */
    public SGML2TokenSequence(CharSequenceLexer lexer, String backgroundTag) {
        this(lexer, backgroundTag, true);
    }

    /**
     * Creates a pipe whose lexer is built from a regular expression; the source is saved.
     *
     * @param regex expression handed to the {@link CharSequenceLexer} constructor
     * @param backgroundTag label for tokens outside any opening tag
     */
    public SGML2TokenSequence(String regex, String backgroundTag) {
        this(new CharSequenceLexer(regex), backgroundTag, true);
    }

    /** Creates a pipe with a default-constructed lexer and the background tag {@code "O"}. */
    public SGML2TokenSequence() {
        this(new CharSequenceLexer(), "O");
    }

    /**
     * Splits the carrier's data into text tokens and per-token labels.
     *
     * <p>The carrier's data is cast to {@link CharSequence}. On return the
     * data is a {@link StringTokenization} of the text tokens, the target is
     * a {@link TokenSequence} holding one label per text token, and, when
     * source saving is enabled, the source is set to that same tokenization.
     *
     * @param carrier instance whose data is a {@code CharSequence}
     * @return the same instance, with data, target and possibly source replaced
     */
    public Instance pipe(Instance carrier) {
        CharSequence string = (CharSequence)carrier.getData();
        StringTokenization dataTokens = new StringTokenization(string);
        TokenSequence targetTokens = new TokenSequence();
        // Label for the text segment currently being lexed, and the label
        // that takes effect once the tag just matched has been passed.
        String tag = this.backgroundTag;
        String nextTag = this.backgroundTag;
        Matcher m = this.sgmlPattern.matcher(string);
        int textStart = 0;
        int textEnd = 0;
        int nextStart = 0;
        boolean done = false;
        logger.fine(this.sgmlPattern.pattern());
        logger.finer(string.toString());
        while (!done) {
            done = !m.find();
            if (done) {
                // No more markup: the final segment runs to the end of the input.
                textEnd = string.length();
            } else {
                String sgml = m.group();
                logger.finer("SGML = " + sgml);
                int groupCount = m.groupCount();
                logger.finer(Integer.toString(groupCount));
                // A closing tag returns to the background label; an opening
                // tag's label is everything between '<' and '>'.
                nextTag = sgml.charAt(1) == '/' ? this.backgroundTag : sgml.substring(1, sgml.length() - 1);
                logger.finer("nextTag: " + nextTag);
                nextStart = m.end();
                textEnd = m.start();
                logger.finer("Text start/end " + textStart + " " + textEnd);
            }
            if (textEnd - textStart > 0) {
                logger.finer("Tag = " + tag);
                logger.finer("Target = " + string.subSequence(textStart, textEnd));
                this.lexer.setCharSequence(string.subSequence(textStart, textEnd));
                while (this.lexer.hasNext()) {
                    this.lexer.next();
                    // Lexer offsets are relative to the sub-sequence; shift
                    // them so the span indexes into the full input.
                    int tokStart = textStart + this.lexer.getStartOffset();
                    int tokEnd = textStart + this.lexer.getEndOffset();
                    dataTokens.add(new StringSpan(string, tokStart, tokEnd));
                    targetTokens.add(new Token(tag));
                }
            }
            textStart = nextStart;
            tag = nextTag;
        }
        carrier.setData(dataTokens);
        carrier.setTarget(targetTokens);
        if (this.saveSource) {
            carrier.setSource(dataTokens);
        }
        return carrier;
    }

    /**
     * Runs a default pipe over each file named on the command line and logs
     * every label/token pair.
     *
     * @param args paths of the files to process
     */
    public static void main(String[] args) {
        try {
            SerialPipes p = new SerialPipes(new Pipe[]{new Input2CharSequence(), new SGML2TokenSequence()});
            for (int i = 0; i < args.length; ++i) {
                Instance carrier = p.instanceFrom(new Instance(new File(args[i]), null, null, null));
                TokenSequence data = (TokenSequence)carrier.getData();
                TokenSequence target = (TokenSequence)carrier.getTarget();
                logger.finer("===");
                logger.info(args[i]);
                for (int j = 0; j < data.size(); ++j) {
                    logger.info(((Token)target.get(j)).getText() + " " + ((Token)data.get(j)).getText());
                }
            }
        }
        catch (Exception e) {
            System.out.println(e);
            e.printStackTrace();
        }
    }

    /**
     * Writes the custom serialized form: version number, tag pattern, lexer,
     * background tag and the source-saving flag, in that order.
     *
     * @param out stream to write to
     * @throws IOException if the underlying stream fails
     */
    private void writeObject(ObjectOutputStream out) throws IOException {
        out.writeInt(CURRENT_SERIAL_VERSION);
        out.writeObject(this.sgmlPattern);
        out.writeObject(this.lexer);
        out.writeObject(this.backgroundTag);
        out.writeBoolean(this.saveSource);
    }

    /**
     * Reads the custom serialized form written by {@code writeObject}.
     *
     * <p>Version 0 streams are treated as having no source-saving flag, so it
     * defaults to {@code true}. Later versions carry the flag, which must be
     * read back explicitly: field initializers do not run during
     * deserialization, so the field would otherwise be left {@code false}.
     *
     * @param in stream to read from
     * @throws IOException if the underlying stream fails
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        int version = in.readInt();
        this.sgmlPattern = (Pattern)in.readObject();
        this.lexer = (CharSequenceLexer)in.readObject();
        this.backgroundTag = (String)in.readObject();
        if (version == 0) {
            this.saveSource = true;
        } else {
            this.saveSource = in.readBoolean();
        }
    }
}

