/***************************************************************************
 *   Copyright (C) 2004-2008 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 1.0.4. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 ***************************************************************************/
#include "c2bBibParser.h"

#include "authorString.h"
#include "c2b.h"
#include "c2bBibPreparser.h"
#include "c2bCiteIDMaker.h"
#include "c2bClipEdit.h"
#include "c2bHeuristicBibParser.h"
#include "c2bPreprocess.h"
#include "c2bSettings.h"

#include <QApplication>
#include <QComboBox>
#include <QFile>
#include <QMessageBox>


// Builds the parser: fetches the settings and main widget handles, checks that
// the configured journal abbreviation file can be read (informing the user if
// it is unset or unreadable), and creates the journal database, the preparser,
// the stream preprocessor and the heuristic parser.
c2bBibParser::c2bBibParser(QObject* parent) : bibParser(parent)
{
    settings = c2bSettingsP;
    c2bMain = c2b::mainWidget();

    // Journal name database: probe the file first, only to report problems
    QString journalPath = settings->fileName("cb2Bib/JournalFile");
    if (!journalPath.isEmpty())
    {
        // Opening (rather than testing existence) gives a usable errorString
        QFile probe(journalPath);
        if (probe.open(QIODevice::ReadOnly | QIODevice::Text))
            probe.close();
        else
            QMessageBox::warning(c2bMain, tr("Warning - cb2Bib"),
                                 tr("Unable to open the Journal Abbreviation file %1 for reading.\nError: '%2'.\n\n"
                                    "Note: Abbreviation files are specified through the cb2Bib Configure dialog.")
                                 .arg(journalPath).arg(probe.errorString()),
                                 QMessageBox::Ok);
    }
    else
        QMessageBox::information(c2bMain, tr("Information - cb2Bib"),
                                 tr("No Abbreviation Journal file has been specified.\n\n"
                                    "Note: Abbreviation files are specified through the cb2Bib Configure dialog."),
                                 QMessageBox::Ok);
    // The database is created in every case, even with an empty or unreadable path
    jDB = new journalDB(journalPath);

    // Preparser object; its output signal is re-emitted by this parser
    prParser = new c2bBibPreparser(this);
    connect(prParser, SIGNAL(preparserDataAvailable(const QString&)), this, SIGNAL(preparserDataAvailable(const QString&)));

    // Stream preprocess object
    prProc = new c2bPreprocess(c2bMain);

    // Heuristic bibliographic parser
    heuristicParser = new c2bHeuristicBibParser(this);
}

// Deletes the journal database and the heuristic parser created in the
// constructor. prParser and prProc are not deleted here.
// NOTE(review): presumably they are released through their parent objects -- confirm.
c2bBibParser::~c2bBibParser()
{
    delete jDB;
    delete heuristicParser;
}


// Writes a BibTeX string from current GUI values as it would be saved to file.
// Special formatting is done here:
//  - sig: when true, a "% cb2Bib <version>" header line is prepended and an
//    extra blank line is appended after the entry.
//  - values are converted to LaTeX if cb2Bib/ConvertReferenceToLaTeX is set.
//  - title and booktitle get an additional pair of braces if
//    cb2Bib/UseDoubleBraces is set and the value is not already braced.
//  - month is written unbraced if cb2Bib/PostprocessMonth is set, and braced
//    like any other field otherwise.
// Empty fields are skipped. Returns the formatted entry.
QString c2bBibParser::makeBib(bool sig)
{
    QString BibString;
    if (sig)
        BibString = "% cb2Bib " + C2B_VERSION + "\n";
    BibString += "@" + bibTypes->currentText() + "{" + bibFields["id"]->text();
    // Settings are constant for the whole entry: read them once
    const bool titleDoubleBraces = settings->value("cb2Bib/UseDoubleBraces").toBool();
    const bool PostprocessMonth = settings->value("cb2Bib/PostprocessMonth").toBool();
    const bool convertToLaTeX = settings->value("cb2Bib/ConvertReferenceToLaTeX").toBool();
    for (QStringList::Iterator it = bibFieldList.begin(); it != bibFieldList.end(); ++it)
    {
        QString fvalue = bibFields[*it]->text();
        if (fvalue.isEmpty())
            continue;
        if (convertToLaTeX)
            c2bUtils::c2bToBib(fvalue);
        QString fd = *it;
        // Padding that aligns the '=' signs for field names up to 12 characters
        QString dum;
        dum.fill(' ', 12 - fd.length());
        if (fd == "title" || fd == "booktitle")
            if (titleDoubleBraces)
                if (!fvalue.contains(QRegExp("^\\{.+\\}$")))
                    fvalue = "{" + fvalue + "}";
        if (fd == "month" && PostprocessMonth)
            // Postprocessed months are written without braces
            BibString += ",\n" + fd + dum + " =   " + fvalue;
        else
            // All other fields, and a month that was not postprocessed, are
            // braced. Previously a non-postprocessed month was silently dropped.
            BibString += ",\n" + fd + dum + " = {" + fvalue + "}";
    }
    BibString += "\n}\n";
    if (sig)
        BibString += "\n";
    return BibString;
}

// Writes a BibTeX string from struct bibReference.
// No special formatting is done here: every non-empty field listed in
// bibFieldList is written braced, in list order.
// The entry header carries the reference cite key (ref.keyName); previously
// the key was omitted and the entry started as "@type{,".
QString c2bBibParser::makeBib(const bibReference& ref)
{
    QString BibString;
    BibString += "@" + ref.typeName + "{" + ref.keyName;
    for (QStringList::Iterator it = bibFieldList.begin(); it != bibFieldList.end(); ++it)
    {
        QString fvalue = ref.value(*it);
        if (fvalue.isEmpty())
            continue;
        QString fd = *it;
        // Padding that aligns the '=' signs for field names up to 12 characters
        QString dum;
        dum.fill(' ', 12 - fd.length());
        BibString += ",\n" + fd + dum + " = {" + fvalue + "}";
    }
    BibString += "\n}\n";
    return BibString;
}

// Copies the current GUI state into a bibReference: the selected type, the
// text of the "id" editor as key, and every editor line that is not empty.
const bibReference c2bBibParser::currentReference() const
{
    bibReference snapshot;
    snapshot.typeName = bibTypes->currentText();
    snapshot.keyName = bibFields.value("id")->text();
    QHash<QString, c2bLineEdit*>::const_iterator editor = bibFields.constBegin();
    for (; editor != bibFields.constEnd(); ++editor)
    {
        QString contents = editor.value()->text();
        if (!contents.isEmpty())
            snapshot.insert(editor.key(), contents);
    }
    return snapshot;
}

// Builds a short excerpt of at most 15 words from text. Tags, the
// bibliographic metadata block, URLs, all-uppercase words, digits, non-word
// characters, one and two letter words, a set of common words and month names
// are blanked out first; the hints are placed in front of what remains.
QString c2bBibParser::excerpt(const QString& text, const QStringList& hints)
{
    QString words = removeTags(text);
    words.replace(QRegExp("\\[Bibliographic Metadata.+/Bibliographic Metadata\\]"), " ");
    words.replace(QRegExp("(http://|https://|ftp://|www\\.|ftp\\.)(www\\.|ftp\\.){0,1}\\S+"), " ");
    words.replace(QRegExp("\\b[A-Z]+\\b"), " ");
    words.replace(QRegExp("\\d"), " ");
    words.replace(QRegExp("\\W"), " ");
    words.replace(QRegExp("\\b\\w{1,2}\\b"), " ");
    words.replace(QRegExp("\\b(about|and|are|com|for|from|how|into|that|the|their|this|where|with|www)\\b", Qt::CaseInsensitive), " ");
    words.replace(QRegExp("\\b(january|february|march|april|may|june|july|august|september|october|november|december)\\b",
                          Qt::CaseInsensitive), " ");
    // Hints come first, so they are the last to be cut by the word limit
    words = (hints.join(" ") + " " + words).simplified();
    const QStringList tokens = words.split(" ", QString::SkipEmptyParts);
    const QStringList leading = tokens.mid(0, 15);
    return leading.join(" ");
}

// Post processing of a single field value.
// field selects the processing; besides plain field names it accepts the
// pseudo fields "addauthors", "addeditor" and "addtitle", which combine the
// processed value with the current contents of the author, editor and title
// editors. Returns the processed value with simplified white space, except
// for "file", which is only trimmed.
QString c2bBibParser::parse(const QString& field, QString value)
{
    if (field == "file")
        return value.trimmed();

    value = removeTags(value);
    if (value.isEmpty())
        return value;

    c2bUtils::fullBibToC2b(value);
    const bool useFullNames = settings->value("cb2Bib/UseFullNames").toBool();

    if (field == "author" || field == "addauthors")
    {
        // "addauthors" appends to the authors already in the editor
        QString previous;
        if (field == "addauthors")
            previous = bibFields["author"]->text();
        c2bUtils::debug(value);
        authorString names(value, useFullNames);
        if (previous.isEmpty())
            value = names.bib();
        else
            value = previous + " and " + names.bib();
    }
    else if (field == "editor" || field == "addeditor")
    {
        // "addeditor" appends to the editors already in the editor line
        QString previous;
        if (field == "addeditor")
            previous = bibFields["editor"]->text();
        c2bUtils::debug(value);
        // Drop the "(Ed)", "(Eds)", "(Editor)", "(Editors)" markers before name processing
        value.remove(QRegExp("\\((Editor|Editors|Ed|Eds)\\)", Qt::CaseInsensitive));
        authorString names(value, useFullNames);
        if (previous.isEmpty())
            value = names.bib();
        else
            value = previous + " and " + names.bib();
    }
    else if (field == "journal")
    {
        // The second lookup in each branch is performed but its result is not used
        QString counterpart;
        if (settings->value("cb2Bib/SetJournalsToFullname").toBool())
        {
            value = jDB->retrieveFull(value);
            counterpart = jDB->retrieve(value);
        }
        else
        {
            value = jDB->retrieve(value);
            counterpart = jDB->retrieveFull(value);
        }
    }
    else if (field == "month")
    {
        if (settings->value("cb2Bib/PostprocessMonth").toBool())
            value = MDB->retrieve(value);
    }
    else if (field == "pages" || field == "volume" || field == "number")
    {
        // Pages, volume, and number share the same hyphenation processing
        value = setPages(value);
    }
    else if (field == "title" || field == "booktitle" || field == "addtitle")
    {
        if (c2bUtils::isUpperCaseString(value))
            value = c2bUtils::setCapitalization(value);
        if (field == "addtitle")
        {
            // "addtitle" is appended to the current title as a subtitle
            QString currentTitle = bibFields["title"]->text();
            if (!currentTitle.isEmpty())
                value = currentTitle + ": " + value;
        }
    }
    return value.simplified();
}

void c2bBibParser::setJournal()
{
    bibFields["journal"]->setText(parse("journal", bibFields["journal"]->text()));
}

// Returns a copy of text in which the value of every 'journal = {...}' or
// 'journal = "..."' assignment is replaced by fullJournal() of that value.
// A wait cursor is shown and progress is reported through c2b::showMessage
// while the text is processed.
QString c2bBibParser::setJournalsToFull(const QString& text)
{
    QApplication::setOverrideCursor(QCursor(Qt::WaitCursor));
    QString substituted_text = text;
    QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"](.*)[\\}\"]", Qt::CaseInsensitive);
    jnre.setMinimal(true);
    int pos = 0;
    uint nj = 0;
    while (pos >= 0)
    {
        pos = jnre.indexIn(substituted_text, pos);
        if (pos > -1)
        {
            const int matched_length = jnre.matchedLength();
            // QRegExp::pos() returns -1 for an empty capture, i.e., an empty journal value
            const int jn_pos = jnre.pos(1);
            QString jn = jnre.cap(1);
            if (jn_pos > -1)
            {
                // Replace the captured span only. Replacing the name by search inside
                // the whole match would also alter any occurrence of the name within
                // the 'journal' keyword itself.
                QString full_jn = fullJournal(jn);
                substituted_text.replace(jn_pos, jn.length(), full_jn);
                pos += matched_length - jn.length() + full_jn.length();
            }
            else
                // Nothing to substitute: continue the search past this match
                pos += matched_length;
            nj++;
        }
        c2b::showMessage(tr("Processed %1 journal names...").arg(nj));
        QCoreApplication::processEvents();
    }
    QApplication::restoreOverrideCursor();
    c2b::showMessage(tr("Processed %1 journal names.").arg(nj));
    return (substituted_text);
}

// Returns a copy of text in which the value of every 'journal = {...}' or
// 'journal = "..."' assignment is replaced by abbreviatedJournal() of that
// value. A wait cursor is shown and progress is reported through
// c2b::showMessage while the text is processed.
QString c2bBibParser::setJournalsToAbbreviated(const QString& text)
{
    QApplication::setOverrideCursor(QCursor(Qt::WaitCursor));
    QString substituted_text = text;
    QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"](.*)[\\}\"]", Qt::CaseInsensitive);
    jnre.setMinimal(true);
    int pos = 0;
    uint nj = 0;
    while (pos >= 0)
    {
        pos = jnre.indexIn(substituted_text, pos);
        if (pos > -1)
        {
            const int matched_length = jnre.matchedLength();
            // QRegExp::pos() returns -1 for an empty capture, i.e., an empty journal value
            const int jn_pos = jnre.pos(1);
            QString jn = jnre.cap(1);
            if (jn_pos > -1)
            {
                // Replace the captured span only. Replacing the name by search inside
                // the whole match would also alter any occurrence of the name within
                // the 'journal' keyword itself.
                QString abbreviated_jn = abbreviatedJournal(jn);
                substituted_text.replace(jn_pos, jn.length(), abbreviated_jn);
                pos += matched_length - jn.length() + abbreviated_jn.length();
            }
            else
                // Nothing to substitute: continue the search past this match
                pos += matched_length;
            nj++;
        }
        c2b::showMessage(tr("Processed %1 journal names...").arg(nj));
        QCoreApplication::processEvents();
    }
    QApplication::restoreOverrideCursor();
    c2b::showMessage(tr("Processed %1 journal names.").arg(nj));
    return (substituted_text);
}

// Post processing of reference types: selects the combo box entry whose text
// equals type. If no entry matches, entry 0 is selected and its text is
// overwritten with type.
void c2bBibParser::setRefType(QString type)
{
    int match = -1;
    for (int idx = 0; idx < bibTypes->count() && match < 0; ++idx)
        if (bibTypes->itemText(idx) == type)
            match = idx;

    if (match >= 0)
    {
        bibTypes->setCurrentIndex(match);
        return;
    }
    // Unknown type: entry 0 holds it
    bibTypes->setCurrentIndex(0);
    bibTypes->setItemText(0, type);
}

void c2bBibParser::clearFields()
{
    QStringList::Iterator it = bibFieldList.begin();
    while (it != bibFieldList.end())
    {
        bibFields[*it]->clear();
        it++;
    }
    bibTypes->setCurrentIndex(1);
    auto_recognized = false;
}


/****************************************************************************

AUTOMATIC BIB CAPTION

*****************************************************************************/

// Forwards text to the preparser object created in the constructor. That
// object's preparserDataAvailable signal is re-emitted by this parser (see
// the connection made in the constructor).
void c2bBibParser::preparse(const QString& text)
{
    prParser->preparse(text);
}

/** \page bibproc Extracting Data from the Clipboard

    Clipboard contents are processed according to the following rules:

    - Perform external, user-defined preparsing on input stream. See \ref c2bconf_clipboard.

    - Perform user-defined substitutions on input stream. See \ref c2bconf_clipboard.

    - Check if input stream is already a BibTeX entry. If so, process entry.

    - Check if input stream is a PubMed - Medline Journal entry. If so, process entry.

    - Preprocess author names: PI JOAN III -> Pi III, J.
    (care of name prefixes, suffixes, and removal of ambiguities).


    If otherwise,

    - Extract DOI \n (DOI, URL and FILE/PDF extraction is a preprocessing step,
    performed before the automatic recognition takes place.)

    - Extract URL

    - Remove leading and trailing white spaces, TABs and CRs.

    - "\r\n", "\n" and/or "\r" replaced by the line indicator tag "<NewLineN>".

    - Replace "\t" and five or more consecutive "\s" by the tabular tag "<TabN>".

    - Simplify White Spaces

    - Start the automatic recognition engine.


    If the automatic recognition engine fails, optionally, a heuristic guessing
    will be performed. See also \ref heuristic_guess and \ref metadata.

*/
void c2bBibParser::parse(QString text)
{
    // Fills the field editors from a clipboard text. The text is preprocessed
    // and then tried, in this order, as a BibTeX entry, as a PubMed - Medline
    // entry, and against each template of the user RegExp file. The first
    // format that matches sets auto_recognized and ends the processing. If
    // none matches, an heuristic guess is optionally performed. The outcome
    // is stored in c2bAutoRecogStr and shown as a message.
    prProc->preprocessText(text);

    if (hasBibTeX(text))
    {
        clipEditor->setText(text);
        const bibReference bRef = wholeReference(text);
        bibReferenceIterator i(bRef);
        while (i.hasNext())
        {
            i.next();
            bibFields[i.key()]->setText(parse(i.key(), i.value()));
        }
        setRefType(bRef.typeName);
        c2bAutoRecogStr = tr("Processed as 'BibTeX'.");
        auto_recognized = true;
        c2b::showMessage(c2bAutoRecogStr);
        return;
    }

    QRegExp ml("^PMID\\s*-");
    if (text.contains(ml))
    {
        setRefType("article");
        clipEditor->setText(text);
        // Turn the "TAG - value" lines into a single line of "][" separated items
        text.replace(QRegExp("[\\n\\r]\\s*([A-Z]{1,4}\\s*-)"), "][\\1");
        text = text.simplified();
        // Without full author items (FAU), promote the AU items to FAU
        if (!text.contains(QRegExp("\\[FAU\\s+-")))
            text.replace(QRegExp("\\[(AU\\s*-\\s*[-'\\w]+)"), "[F\\1 ");
        QStringList fList = text.split("][");
        QString kw = "";
        // Link set according to: "Creating a Web Link to the Entrez Databases",
        // http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helplinks.chapter.linkshelp#linkshelp.Retrieve_PubMed_Cita
        QString pubmed_url = "http://www.ncbi.nlm.nih.gov/pubmed/%1";
        QRegExp fld("^([A-Z]{1,4})\\s{0,1}-\\s*(.+)$");
        fld.setPatternSyntax(QRegExp::RegExp2);
        for (QStringList::Iterator it = fList.begin(); it != fList.end(); ++it)
        {
            if (fld.indexIn(*it) == -1)
                continue;
            QString tag = fld.cap(1);
            QString value = fld.cap(2);
            if (tag == "AB")
                bibFields["abstract"]->setText(parse("abstract", value));
            else if (tag == "FAU")
                bibFields["author"]->setText(parse("addauthors", medlToc2b(value)));
            else if (tag == "TA")
                bibFields["journal"]->setText(parse("journal", value));
            else if (tag == "IP")
                bibFields["number"]->setText(parse("number", value));
            else if (tag == "PG")
                bibFields["pages"]->setText(parse("pages", value));
            else if (tag == "TI")
                bibFields["title"]->setText(parse("title", value));
            else if (tag == "PMID")
                bibFields["url"]->setText(parse("url", pubmed_url.arg(value)));
            else if (tag == "VI")
                bibFields["volume"]->setText(parse("volume", value));
            else if (tag == "AID")
            {
                if (value.contains("[doi]"))
                    bibFields["doi"]->setText(parse("doi", value.remove("[doi]")));
            }
            else if (tag == "DP")
                bibFields["year"]->setText(parse("year", value.replace(QRegExp("^([\\d\\s]+).*$"), "\\1")));
            else if (tag == "MH")
                kw += "; " + value.trimmed();
        }
        if (!kw.isEmpty())
            // Remove the leading "; " separator
            bibFields["keywords"]->setText(parse("keywords", kw.remove(0, 2)));
        c2bAutoRecogStr = QString("Processed as 'PubMed - Medline Journals'.");
        auto_recognized = true;
        c2b::showMessage(c2bAutoRecogStr);
        return;
    }

    // DOI, arXiv identifier and URL are extracted before the automatic recognition
    QRegExp rxdoi("(10.[\\d\\.]+/\\S+)");
    int ndoi = rxdoi.indexIn(text);
    if (ndoi > -1)
    {
        QString cdoi = rxdoi.cap(1);
        // This happens when publishers set doi to title in metadata: <title>doi:10. ... </title>
        if (cdoi.endsWith("</title>"))
            cdoi.chop(8);
        bibFields["doi"]->setText(parse("doi", cdoi));
    }

    QRegExp rxarxiv("arXiv:([\\w\\./-]+)");
    int narxiv = rxarxiv.indexIn(text);
    if (narxiv > -1)
    {
        bibFields["journal"]->setText(parse("journal", rxarxiv.cap(0)));
        bibFields["url"]->setText(parse("url", QString("http://arxiv.org/abs/%1").arg(rxarxiv.cap(1))));
    }

    // A URL found in the text overrides the arXiv link set above
    QRegExp rxhtml("((http://|https://|ftp://|www\\.|ftp\\.)(www\\.|ftp\\.){0,1}\\S+)");
    int nhtml = rxhtml.indexIn(text);
    if (nhtml > -1)
        bibFields["url"]->setText(parse("url", rxhtml.cap(1)));

    QString tagged_text = setTags(text);
    clipEditor->setText(text, tagged_text);

    QString regular_expression_f = settings->fileName("cb2Bib/RegularExpressionFile");
    if (regular_expression_f.isEmpty())
    {
        QMessageBox::information(c2bMain, tr("Information - cb2Bib"),
                                 tr("No RegExp filename has been specified.\n\n"
                                    "Note: RegExp files are specified through the cb2Bib Configure dialog."),
                                 QMessageBox::Ok);
        return;
    }
    QFile file(regular_expression_f);
    if (!file.open(QIODevice::ReadOnly | QIODevice::Text))
    {
        QMessageBox::warning(c2bMain, tr("Warning - cb2Bib"),
                             tr("Unable to open the RegExp file %1 for reading.\nError: '%2'.\n\n"
                                "Note: RegExp files are specified through the cb2Bib Configure dialog. "
                                "After manual matching, patterns can be edited and stored in a "
                                "RegExp file for future autodetection.")
                             .arg(regular_expression_f).arg(file.errorString()),
                             QMessageBox::Ok);
        return;
    }

    QString ItemX;
    QString line;
    QString reftype;
    QString fieldset;
    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    stream.setAutoDetectUnicode(true);
    int nfilters = 0;

    // Each template takes four lines: name, reference type, field names, and
    // pattern. Empty lines and lines starting with '#' are skipped in between.
    while (!stream.atEnd())
    {
        line = stream.readLine();
        if (!(line.isEmpty() || line.contains(QRegExp("^#"))))
        {
            reftype = stream.readLine();
            fieldset = stream.readLine();
            ItemX = stream.readLine();

            c2bUtils::debug(tr("The RegExp file contains1: |%1|").arg(line));
            c2bUtils::debug(tr("The RegExp file contains2: |%1|").arg(reftype));
            c2bUtils::debug(tr("The RegExp file contains3: |%1|").arg(fieldset));
            c2bUtils::debug(tr("The RegExp file contains4: |%1|").arg(ItemX));

            QRegExp rx(ItemX);
            rx.setMinimal(true);
            if (!rx.isValid())
                // The pattern comes from a user file and may contain '%': it must
                // be passed as an argument, never as the printf format itself
                qDebug("%s", tr("[cb2Bib] RegExp |%1| is not valid.").arg(ItemX).toLatin1().constData());

            QStringList list = fieldset.split(" ");
            nfilters++;
            int nfields = rx.numCaptures();
            int ncap = rx.indexIn(tagged_text);
            c2bUtils::debug(tr("Readable Fields: |%1|").arg(nfields));
            c2bUtils::debug(tr("Captures: |%1|").arg(ncap));

            if (ncap > -1)
            {
                // A template may name fewer fields than its pattern captures:
                // read only the captures that have a field name
                const int nreadable = qMin(nfields, list.count());
                for (int i = 0; i < nreadable; i++)
                {
                    QString listi = list.at(i);
                    int ii = i + 1;
                    c2bUtils::debug(QString("Fields in Template %1: |%2|").arg(i).arg(rx.cap(ii)));
                    if (listi.contains(field_re))
                    {
                        if (listi == "author")
                            // Reminder: "addauthors" requires to init bibFields["author"]
                            bibFields[listi]->setText(parse("addauthors", rx.cap(ii)));
                        else if (listi == "editor")
                            // Reminder: "addeditor" requires to init bibFields["editor"]
                            bibFields[listi]->setText(parse("addeditor", rx.cap(ii)));
                        else if (listi == "title")
                            // Reminder: "addtitle" requires to init bibFields["title"]
                            bibFields[listi]->setText(parse("addtitle", rx.cap(ii)));
                        else
                            bibFields[listi]->setText(parse(listi, rx.cap(ii)));
                    }
                }
                setRefType(reftype);
                c2bAutoRecogStr = tr("Processed as '%1'.").arg(line);
                auto_recognized = true;
                c2b::showMessage(c2bAutoRecogStr);
                file.close();
                return;
            }
        }
    }
    file.close();

    // Heuristic Bib Parsing
    if (settings->value("cb2Bib/DoHeuristicGuess").toBool())
    {
        QString clean_text = removeTags(tagged_text);
        heuristicParser->guessFields(clean_text, tagged_text);
        c2bAutoRecogStr = tr("Applied %1 filters: No automatic format detection. %2 fields guessed.")
                          .arg(nfilters).arg(countFields());
    }
    else
        c2bAutoRecogStr = tr("Applied %1 filters: No automatic format detection.").arg(nfilters);
    c2b::showMessage(c2bAutoRecogStr);
}

void c2bBibParser::guessFields(const QString& text)
{
    QString clean_text = text.simplified();
    QString tagged_text = setTags(text);
    heuristicParser->guessFields(clean_text, tagged_text);
    c2bAutoRecogStr = tr("%2 fields guessed.").arg(countFields());
    c2b::showMessage(c2bAutoRecogStr);
}

// Returns the trimmed text with every line break replaced by a numbered
// <NewLineN> tag and every tab or run of five or more white spaces replaced
// by a numbered <TabN> tag; white space is simplified at the end.
const QString c2bBibParser::setTags(const QString& text) const
{
    // Unify the three line break conventions into one marker
    QString work = text.trimmed();
    work.replace(QRegExp("\\r\\n"), "<found_new_line>");       // Windows new line
    work.replace(QRegExp("\\n"), "<found_new_line>");          // Linux new line, LF
    work.replace(QRegExp("\\r"), "<found_new_line>");          // OSX new line, CR

    // Number the line breaks: the Nth break becomes <NewLineN>
    const QStringList lines = work.split("<found_new_line>");
    const int lastLine = lines.count() - 1;
    QString withNewLines;
    for (int k = 0; k < lastLine; ++k)
    {
        withNewLines += lines.at(k);
        withNewLines += QString("<NewLine%1>").arg(k + 1);
    }
    withNewLines += lines.at(lastLine);

    // Number the tabular separators: the Nth separator becomes <TabN>
    const QStringList cells = withNewLines.split(QRegExp("(\\s{5,}|\\t)"));
    const int lastCell = cells.count() - 1;
    QString withTabs;
    for (int k = 0; k < lastCell; ++k)
    {
        withTabs += cells.at(k);
        withTabs += QString("<Tab%1>").arg(k + 1);
    }
    withTabs += cells.at(lastCell);

    return withTabs.simplified();
}

// Returns text without the "[[" and "]]" marks and with the <NewLineN> and
// <TabN> tags turned into white space, which is then simplified.
const QString c2bBibParser::removeTags(const QString& text) const
{
    QString stripped(text);
    stripped.remove("[[").remove("]]");
    stripped.replace(QRegExp("<NewLine\\d+>"), " ").replace(QRegExp("<Tab\\d+>"), " ");
    return stripped.simplified();
}

// Attaches the parser to the GUI: stores the field editors, the reference
// type combo box and the clipboard editor, fills the combo box with the
// known types, forwards edits as bibModified(), and creates the cite ID maker.
void c2bBibParser::connectBibWidgets(QHash<QString, c2bLineEdit*>& fields, QComboBox* types, c2bClipEdit* editor)
{
    bibFields = fields;
    bibTypes = types;
    clipEditor = editor;

    // Reference types; entry 1 is the initial selection
    bibTypes->insertItems(0, TypesList);
    bibTypes->setCurrentIndex(1);

    // Any change in a field editor, the id, or the type marks the reference as modified
    for (QStringList::ConstIterator name = bibFieldList.constBegin(); name != bibFieldList.constEnd(); ++name)
        connect(bibFields[*name], SIGNAL(textChanged(const QString &)), this, SIGNAL(bibModified()));
    connect(bibFields["id"], SIGNAL(textChanged(const QString &)), this, SIGNAL(bibModified()));
    connect(bibTypes, SIGNAL(editTextChanged(const QString &)), this, SIGNAL(bibModified()));
    // Pressing return in the journal editor reprocesses the journal name
    connect(bibFields["journal"], SIGNAL(returnPressed()), this, SLOT(setJournal()));

    // CiteID maker
    citeIDM = new c2bCiteIDMaker(bibFields, this);
}

int c2bBibParser::countFields()
{
    // Counting Non Empty Fields
    QStringList::Iterator it = bibFieldList.begin();
    int n = 0;
    while (it != bibFieldList.end())
    {
        if (!bibFields[*it]->text().isEmpty())
            n++;
        it++;
    }
    return (n);
}
