/***************************************************************************
 *   Copyright (C) 2004-2008 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 1.0.4. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 ***************************************************************************/
#include "c2bHeuristicBibParser.h"

#include "c2bBibParser.h"
#include "journalDB.h"


// Binds the heuristic parser to the c2bBibParser p it works for (its field
// container, field list and journal database) and prepares the two regular
// expression pieces shared by the numeric guessers.
c2bHeuristicBibParser::c2bHeuristicBibParser(c2bBibParser* p) :
        bp(p), bibFields(p->bibFields), bibFieldList(p->bibFieldList), jDB(p->journals())
{
    // Characters treated as a hyphen: ASCII '-', 8211, 8722 and 65533.
    // Char 65533 appears many times in pdftotext outputs on windows
    // (seems an encoding error, though).
    QString dashes("-");
    dashes += QChar(8211);
    dashes += QChar(8722);
    dashes += QChar(65533);

    // A run of such characters, including the whitespace around it
    _hyphens = QRegExp("\\s*[" + dashes + "]+\\s*");
    // A single number or a hyphenated pair of numbers
    _hyphen_nums = "(?:\\d+|\\d+-\\d+)";
}

// Nothing is released here: the body is empty, and the c2bBibParser pointer
// stored in bp by the constructor is not deleted by this class.
c2bHeuristicBibParser::~c2bHeuristicBibParser()
{}


// Runs every heuristic field guesser over the input text.
//   clean_text  - passed unchanged to guessJournal and guessFromMetadata.
//   tagged_text - passed to guessAbstract, guessKeywords and guessTitle, whose
//                 patterns look for <NewLine...> markers.
// The numeric guessers (year, volume, number, pages, ISBN and the combined
// volume/year/pages formats) receive a copy of clean_text in which every
// match of _hyphens has been replaced by a plain "-".
void c2bHeuristicBibParser::guessFields(const QString& clean_text, const QString& tagged_text)
{
    // Normalize hyphen-like characters so that number ranges read as "n-m"
    QString clean_num = clean_text;
    clean_num.replace(_hyphens, "-");
    // Order is important to increase the chances of a proper recognition
    guessAbstract(tagged_text);
    guessKeywords(tagged_text);
    // guessYear goes before guessNumber and the combined guessers below:
    // they read the year field it fills in.
    guessYear(clean_num);
    // NOTE(review): _debug_guess is not defined in this file; presumably a
    // debugging hook - confirm against the class header.
    _debug_guess("guessYear");
    guessVolume(clean_num);
    _debug_guess("guessVolume");
    guessNumber(clean_num);
    _debug_guess("guessNumber");
    guessPages(clean_num);
    _debug_guess("guessPages");
    guessTitle(tagged_text);
    guessISBN(clean_num);
    guessJournal(clean_text);
    // The combined guessers check the _reliable_* flags set by guessVolume,
    // guessNumber and guessPages, and may overwrite their results.
    guessVolumeYearPages(clean_num);
    _debug_guess("guessVolumeYearPages");
    guessYearVolumePages(clean_num);
    _debug_guess("guessYearVolumePages");
    guessVolumePagesYear(clean_num);
    _debug_guess("guessVolumePagesYear");
    // Runs last, so any field it finds replaces the guess made above
    guessFromMetadata(clean_text);
}

/** \page heuristic_guess Field Recognition Rules

- <b>Abstract</b>
  - If <tt>Abstract:{0,1}</tt> is found.
  - If <tt>Summary:{0,1}</tt> is found.

*/
void c2bHeuristicBibParser::guessAbstract(const QString& text)
{
    QRegExp rxH("<NewLine\\d+>\\s*Abstract:{0,1}\\s*<NewLine\\d+>(.+)(<NewLine|$)", Qt::CaseInsensitive);
    rxH.setMinimal(true);
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        QString val = rxH.cap(1).remove(QRegExp("^:"));
        bibFields["abstract"]->setText(bp->parse("abstract", val));
        return;
    }
    rxH = QRegExp("<NewLine\\d+>\\s*Abstract:{0,1}(.+)(<NewLine|$)", Qt::CaseInsensitive);
    rxH.setMinimal(true);
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        QString val = rxH.cap(1).remove(QRegExp("^:"));
        bibFields["abstract"]->setText(bp->parse("abstract", val));
        return;
    }
    rxH = QRegExp("<NewLine\\d+>\\s*Summary:{0,1}\\s*<NewLine\\d+>(.+)(<NewLine\\d+>\\s*<NewLine){0,1}.+$", Qt::CaseInsensitive);
    rxH.setMinimal(true);
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        QString val = rxH.cap(1).remove(QRegExp("^:"));
        bibFields["abstract"]->setText(bp->parse("abstract", val));
        return;
    }
    rxH = QRegExp("<NewLine\\d+>\\s*Summary:{0,1}(.+)(<NewLine\\d+>\\s*<NewLine){0,1}.+$", Qt::CaseInsensitive);
    rxH.setMinimal(true);
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        QString val = rxH.cap(1).remove(QRegExp("^:"));
        bibFields["abstract"]->setText(bp->parse("abstract", val));
    }
}

/** \page heuristic_guess

- <b>Keywords</b>
  - If <tt>Key\\s{0,1}words:{0,1}</tt> is found.

*/
void c2bHeuristicBibParser::guessKeywords(const QString& text)
{
    QRegExp rxH("<NewLine\\d+>\\s*Key\\s{0,1}words:{0,1}(.+)(<NewLine|$)", Qt::CaseInsensitive);
    rxH.setMinimal(true);
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        QString val = rxH.cap(1).remove(QRegExp("^:"));
        bibFields["keywords"]->setText(bp->parse("keywords", val));
    }
}

/** \page heuristic_guess

- <b>Volume</b>
  - If <tt>Volume:{0,1}</tt> is found.
  - If <tt>Vol.{0,1}</tt> is found.
  - If <tt>\\b(\\d+)[,:]\\s*\\d+\\W+\\d+</tt> is found.
  - If <tt>\\b(\\d+)\\s*\\(\\d+\\)</tt> is found.
  - If <tt>\\b(\\d+)[,:]\\s*\\d+\\b</tt> is found.

*/
void c2bHeuristicBibParser::guessVolume(const QString& text)
{
    _reliable_volume = true;
    QRegExp rxH("Volumes{0,1}:{0,1}\\s*(" + _hyphen_nums + ")", Qt::CaseInsensitive);
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        return;
    }
    rxH = QRegExp("Vols{0,1}\\.{0,1}\\s*(" + _hyphen_nums + ")", Qt::CaseInsensitive);
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        return;
    }
    _reliable_volume = false;
    rxH = QRegExp("\\b(\\d+)[,:]\\s*\\d+\\W+\\d+", Qt::CaseInsensitive);
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        return;
    }
    rxH = QRegExp("\\b(\\d+)\\s*\\(\\d+\\)");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        return;
    }
    rxH = QRegExp("\\b(\\d+)[,:]\\s*\\d+\\b");
    nH = rxH.indexIn(text);
    if (nH > -1)
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
}

/** \page heuristic_guess

- <b>Number</b>
  - If <tt>Numbers{0,1}\\:{0,1}\\s*(\\d+|\\d+-\\d+)</tt> is found.
  - If <tt>Nos{0,1}\\.{0,1}\\s*(\\d+|\\d+-\\d+)</tt> is found.
  - If <tt>Issues{0,1}\\:{0,1}\\s*(\\d+|\\d+-\\d+)</tt> is found.
  - If <tt>\\d\\s*\\((\\d+)\\)[^\\.]</tt> is found.

*/
void c2bHeuristicBibParser::guessNumber(const QString& text)
{
    _reliable_number = true;
    QRegExp rxH("Numbers{0,1}\\:{0,1}\\s*(" + _hyphen_nums + ")", Qt::CaseInsensitive);
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["number"]->setText(bp->parse("number", rxH.cap(1)));
        return;
    }
    rxH = QRegExp("Nos{0,1}\\.{0,1}\\s*(" + _hyphen_nums + ")", Qt::CaseInsensitive);
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["number"]->setText(bp->parse("number", rxH.cap(1)));
        return;
    }
    rxH = QRegExp("Issues{0,1}\\:{0,1}\\s*(" + _hyphen_nums + ")", Qt::CaseInsensitive);
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["number"]->setText(bp->parse("number", rxH.cap(1)));
        return;
    }
    _reliable_number = false;
    rxH = QRegExp("\\d\\s*\\((\\d+)\\)[^\\.]");
    nH = rxH.indexIn(text);
    if (nH > -1)
        if (rxH.cap(1) != bibFields["year"]->text()) // Avoid confusing (number) and (year)
            bibFields["number"]->setText(bp->parse("number", rxH.cap(1)));
}

/** \page heuristic_guess

- <b>Pages</b>
  - If <tt>\\bPages{0,1}[:\\.]{0,1}\\s*(\\d+|\\d+-\\d+)</tt> is found.
  - If <tt>\\b(\\d+)[\\s-](\\d+pp)\\b</tt> is found.
  - If <tt>\\bpp\\.{0,1}\\s+(\\d+|\\d+-\\d+)</tt> is found.
  - If <tt>\\bp\\.{0,1}\\s+(\\d+|\\d+-\\d+)</tt> is found.
  - If <tt>\\b(\\d+)\\s*-{1,2}\\s*(\\d+)\\b</tt> is found.

*/
void c2bHeuristicBibParser::guessPages(const QString& text)
{
    _reliable_pages = true;
    QRegExp rxH = QRegExp("\\bPages{0,1}[:\\.]{0,1}\\s*(" + _hyphen_nums + ")");
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(1)));
        return;
    }
    rxH = QRegExp("\\b(\\d+)[\\s-](\\d+pp)\\b");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(1) + "-" + rxH.cap(2)));
        return;
    }
    rxH = QRegExp("\\bpp\\.{0,1}\\s+(" + _hyphen_nums + ")");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(1)));
        return;
    }
    _reliable_pages = false;
    rxH = QRegExp("\\bp\\.{0,1}\\s+(" + _hyphen_nums + ")");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(1)));
        return;
    }
    rxH = QRegExp("\\b(\\d+)\\s*-{1,2}\\s*(\\d+)\\b");
    nH = 0;
    while (nH >= 0)
    {
        nH = rxH.indexIn(text, nH);
        if (nH > -1)
        {
            if (!rxH.cap(1).startsWith("0"))
            {
                int fp = rxH.cap(1).toInt();
                int lp = rxH.cap(2).toInt();
                if (lp - fp < 150)
                {
                    bibFields["pages"]->setText(bp->parse("pages", rxH.cap(1) + "-" + rxH.cap(2)));
                    return;
                }
            }
            nH  += rxH.matchedLength();
        }
    }
}

/** \page heuristic_guess

- <b>Year</b>
  - If <tt>\\b(19|20)(\\d\\d)\\b</tt> is found.

*/
void c2bHeuristicBibParser::guessYear(const QString& text)
{
    QRegExp rxH("\\b(19|20)(\\d\\d)\\b");
    int nH = rxH.indexIn(text);
    if (nH > -1)
        bibFields["year"]->setText(bp->parse("year", rxH.cap(1) + rxH.cap(2)));
}

/** \page heuristic_guess

- <b>Title</b>
  - If <tt>\\bTitle:{0,1}</tt> is found.

*/
void c2bHeuristicBibParser::guessTitle(const QString& text)
{
    QRegExp rxH("\\bTitle:{0,1}\\s*<NewLine\\d+>(.+)(<NewLine|$)", Qt::CaseInsensitive);
    rxH.setMinimal(true);
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        QString val = rxH.cap(1).remove(QRegExp("^:"));
        bibFields["title"]->setText(bp->parse("title", val));
        return;
    }
    rxH = QRegExp("\\bTitle:{0,1}(.+)(<NewLine|$)", Qt::CaseInsensitive);
    rxH.setMinimal(true);
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        QString val = rxH.cap(1).remove(QRegExp("^:"));
        bibFields["title"]->setText(bp->parse("title", val));
    }
}

/** \page heuristic_guess

- <b>ISBN</b>
  - If <tt>\\bISBN\\b(?:[ -]\\d+){0,1}:{0,1}(?:-\\d+){0,1}\\s*(\\d+-[\\d-]+-\\d+)</tt> is found.
  - If <tt>\\bISBN\\b(?:[ -]\\d+){0,1}:{0,1}(?:-\\d+){0,1}\\s*(\\d+)</tt> is found.

*/
void c2bHeuristicBibParser::guessISBN(const QString& text)
{
    QRegExp rxH("\\bISBN\\b(?:[ -]\\d+){0,1}:{0,1}(?:-\\d+){0,1}\\s*(\\d+-[\\d-]+-\\d+)", Qt::CaseInsensitive);
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["isbn"]->setText(bp->parse("isbn", rxH.cap(1)));
        return;
    }
    rxH = QRegExp("\\bISBN\\b(?:[ -]\\d+){0,1}:{0,1}(?:-\\d+){0,1}\\s*(\\d+)", Qt::CaseInsensitive);
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["isbn"]->setText(bp->parse("isbn", rxH.cap(1)));
        return;
    }
}

/** \page heuristic_guess

- <b>Journal</b>
  - Check cb2Bib internal database.

*/
void c2bHeuristicBibParser::guessJournal(const QString& text)
{
    QString stext = text;
    stext.replace(QRegExp("\\W"), " ");
    stext = stext.simplified();
    QString jn = "";

    for (int i = 0; i < jDB.nitems; i++)
    {
        if (stext.contains(jDB.JAbbrev_simp_w.at(i), Qt::CaseInsensitive))
        {
            if (jn.length() < jDB.JAbbrev_simp_w.at(i).length())
                jn = jDB.JAbbrev_simp_w.at(i);
        }
    }
    stext.replace(QRegExp("\\W"), "");
    for (int i = 0; i < jDB.nitems; i++)
    {
        if (stext.contains(jDB.JExtended_simp.at(i), Qt::CaseInsensitive))
        {
            if (jn.length() < jDB.JAbbrev_simp_w.at(i).length())
                jn = jDB.JAbbrev_simp_w.at(i);
        }
    }

    if (!jn.isEmpty())
        bibFields["journal"]->setText(bp->parse("journal", jn));
}

void c2bHeuristicBibParser::guessFromMetadata(const QString& text)
{
    if (!text.contains("[Bibliographic Metadata"))
        return;
    QRegExp bf;
    bf.setMinimal(true);
    bf.setCaseSensitivity(Qt::CaseSensitive);
    QString p = "<%1>(.+)</%1>";
    for (int i = 0; i < bibFieldList.count(); ++i)
    {
        bf.setPattern(p.arg(bibFieldList.at(i)));
        if (bf.indexIn(text) > -1)
            bibFields[bibFieldList.at(i)]->setText(bp->parse(bibFieldList.at(i), bf.cap(1)));
    }
}

void c2bHeuristicBibParser::guessVolumePagesYear(const QString& text)
{
    // Does several volume pages year formats
    if (_reliable_pages && _reliable_volume && _reliable_number)
        return;
    // J. Sci., 108 (15), 3206, 2004
    // J. Sci., 108 (15), 3206 2004
    QRegExp rxH("(\\d+)\\s*\\((" + _hyphen_nums + ")\\)\\s*[,:]\\s*(" + _hyphen_nums + ")[,\\s]+(19|20)(\\d\\d)");
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        bibFields["number"]->setText(bp->parse("number", rxH.cap(2)));
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(3)));
        bibFields["year"]->setText(bp->parse("year", rxH.cap(4) + rxH.cap(5)));
        return;
    }
    // J. Sci., 108 (15), 3206 (2004)
    rxH = QRegExp("(\\d+)\\s*\\((" + _hyphen_nums + ")\\)\\s*[,:]\\s*(" + _hyphen_nums + ")[,\\s]*\\((19|20)(\\d\\d)\\)");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        bibFields["number"]->setText(bp->parse("number", rxH.cap(2)));
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(3)));
        bibFields["year"]->setText(bp->parse("year", rxH.cap(4) + rxH.cap(5)));
        return;
    }
    if (_reliable_pages && _reliable_volume)
        return;
    // J. Sci. 124, 204109 2006
    // J. Sci. 124, 204109, 2006
    rxH = QRegExp("(\\d+),\\s*(" + _hyphen_nums + ")[,\\s]+(19|20)(\\d\\d)");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(2)));
        bibFields["year"]->setText(bp->parse("year", rxH.cap(3) + rxH.cap(4)));
        return;
    }
    // 120, 8425 - 8433 (2004)
    // J. Sci. 30, 2745 (1984)
    rxH = QRegExp("(\\d+),\\s*(" + _hyphen_nums + ")[,\\s]*\\((19|20)(\\d\\d)\\)");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(2)));
        bibFields["year"]->setText(bp->parse("year", rxH.cap(3) + rxH.cap(4)));
        return;
    }
}

void c2bHeuristicBibParser::guessVolumeYearPages(const QString& text)
{
    // Does several volume year pages formats
    if (_reliable_pages && _reliable_volume)
        return;
    // J. Sci. 203 (2003) 209.
    QRegExp rxH("(\\d+)\\s*\\(" + bibFields["year"]->text() + "\\)\\s*(" + _hyphen_nums + ")");
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(2)));
        return;
    }
}

void c2bHeuristicBibParser::guessYearVolumePages(const QString& text)
{
    // Does several year volume pages formats
    if (_reliable_pages && _reliable_volume && _reliable_number)
        return;
    // J. Sci. 1995, 247(4):536-40.
    QRegExp rxH(bibFields["year"]->text() + "\\s*[,:;]\\s*(\\d+)\\((" + _hyphen_nums + ")\\)\\s*[,:;]\\s*(" + _hyphen_nums + ")");
    int nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        bibFields["number"]->setText(bp->parse("number", rxH.cap(2)));
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(3)));
        return;
    }
    if (_reliable_pages && _reliable_volume)
        return;
    // J. Sci. 2005, 103, 818
    // J. Sci. 2002;9:101–106.5.
    rxH = QRegExp(bibFields["year"]->text() + "\\s*[,:;]\\s*(\\d+)\\s*[,:;]\\s*(" + _hyphen_nums + ")");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(2)));
        return;
    }
    // (2006) J. Sci. 39:3047
    rxH = QRegExp("\\(" + bibFields["year"]->text() + "\\)\\D{5,30}(\\d+)\\s*[,:;]\\s*(" + _hyphen_nums + ")");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(2)));
        return;
    }
    // 2006 J. Sci. 39 3047
    rxH = QRegExp(bibFields["year"]->text() + "\\D{5,30}(\\d+)\\s*[,:; ]\\s*(" + _hyphen_nums + ")");
    nH = rxH.indexIn(text);
    if (nH > -1)
    {
        bibFields["volume"]->setText(bp->parse("volume", rxH.cap(1)));
        bibFields["pages"]->setText(bp->parse("pages", rxH.cap(2)));
        return;
    }
}
