/***************************************************************************
 *   Copyright (C) 2004-2008 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 1.0.4. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 ***************************************************************************/
#include "authorString.h"
#include "c2bUtils.h"

#include <QObject>


/** \page authorproc Processing of author's names

    The cb2Bib automatically processes the author names string. It uses a set
    of heuristic rules. First, the name list separator is identified. Second,
    it is decided whether author names are in standard order, in reversed
    order, or in the 'Abcd, E., F. Ghij, ...' mixed order.

*/
// Builds an author string from the raw author list Alist. The raw text is
// kept as the QString base value, while the result of toBib() is stored in
// the 'authors' member.
authorString::authorString(QString Alist, bool full_form) : QString(Alist)
{
    // Set before calling toBib(): processFirstMiddle() reads this flag
    fullForm = full_form;

    const QString processed(toBib(Alist));
    authors = processed;
    c2bUtils::debug(QObject::tr("Authors Final: |%1|").arg(authors));
}

// Destructor: performs no explicit cleanup.
authorString::~authorString()
{}


/** \page authorproc

    Clean up author list string:

    - Escape BibTeX to Unicode

   - Remove digits from authors string

   - Remove any character except <tt>-',;&\\.\\s\\w</tt>

   - Simplify White Spaces

   - Consider composing prefixes <tt>(da|de|del|der|di|do|dos|van|vande|von)</tt>

   - Consider composing suffixes <tt>(II|III|Jr)</tt>

   - Some publishers use superscripts to refer to multiple author affiliations.
   Text clipboard copying loses superscript formatting. Author strings are
   cleaned of 'orphan' lowercase, single letters in a preprocessing step.
   Everything matching the pattern <b>[a-z]</b> is removed. Fortunately,
   abbreviated initials are most normally input as uppercase letters, thus
   permitting a correct superscript clean up. \n <em>Caution:</em> Lowercase,
   single, a to z letters are removed from author's string.\n <em>Caution:</em>
   Superscripts <b>will be added to author Last Name</b> if no separation is
   provided. Users should care about it and correct these cases.


   Rules to identify separators:
   - Contains comma and semicolon -> ';'
   - Contains pattern <tt>'^Abcd, E.-F.,'</tt> -> ','
   - Contains pattern <tt>'^Abcd,'</tt> -> 'and'
   - Contains comma -> ','
   - Contains semicolon -> ';'
   - Any other -> 'and'

*/
// Converts a raw, free-form author list into a single string where authors
// are joined by " and " and each author is written as first/middle names
// followed by the last name.
//
// Steps:
//   1. Clean the input (digits and unexpected characters become blanks).
//   2. Temporarily glue name prefixes (da, de, van, ...) and suffixes
//      (II, III, Jr) to their neighbour with '_' so they survive splitting.
//   3. Choose the author separator (';', ',' or " and ").
//   4. Decide whether names are in standard, reversed or mixed order.
//   5. Rebuild each author name and join the list.
//
// Side effect: sets the member authorsInUC, which capitalize() reads.
QString authorString::toBib(QString raw)
{
    raw.replace(QRegExp("\\d"), " ");
    raw.replace(QRegExp("[^-',;&\\.\\s\\w]"), " ");
    raw = raw.simplified();
    //  Composite Names temporary unified
    // Attention: prefixes and suffixes also appear in bibParser::medlToc2b  (Medline)
    // '+' was removed from the input by the cleaning above, so it is used as
    // an intermediate marker before being turned into '_'
    raw.replace(QRegExp("\\b(da|de|del|der|di|do|dos|van|vande|von)\\s", Qt::CaseInsensitive), "\\1+");
    raw.replace("+", "_");
    raw.replace(QRegExp("(\\w),{0,1}\\s(II|III|Jr)\\b"), "\\1+\\2");
    raw.replace("+", "_");

    bool Comma = raw.contains(',');
    bool SemiColon = raw.contains(';');
    QRegExp RevN("^([-'\\w]+)\\,");     // Input starts as 'Abcd,'
    bool zRevN = raw.contains(RevN);

    QString sep;
    if (Comma && SemiColon)
        sep = ';';  //  Multiple Authors, separated by semicolon, reversed naming
    else if (Comma)
    {
        if (zRevN)
        {
            if (raw.contains(QRegExp("^([-'\\w]+)\\,(\\s*-{0,1}\\b\\w\\b\\.{0,1}){1,3}\\,")))
            {
                // Drop the comma between the first last name and its initials,
                // so that the remaining commas only separate authors
                raw.replace(QRegExp("^([-'\\w]+)\\,"), "\\1 ");
                sep = ',';          //  Mixed naming 'Smith, J.-L., R. Jones, and K. Gibbons'
            }
            else
                sep = " and ";      //  Reversed naming
        }
        else                        //  Standard naming
            sep = ',';
    }
    else if (SemiColon)
        sep = ';';  //  Multiple Authors, separated by SemiColon
    else
        sep = " and ";
    c2bUtils::debug(QObject::tr("Separator: |%1|").arg(sep));


    // Unify 'and' and '&' with the chosen separator
    raw.replace(QRegExp("\\band\\b", Qt::CaseInsensitive), sep);
    raw.replace(QRegExp("\\s&\\s", Qt::CaseInsensitive), sep);
    c2bUtils::debug("1--|" + raw + "|");
    raw.replace(QRegExp("\\b[a-z]\\b"), " ");      // Cleaning of affiliation 'superscripts'
    c2bUtils::debug("2--|" + raw + "|");
    raw.replace(QRegExp("[^\\w\\.]+$"), "");       // Removing of duplicate commas and semicolons
    raw.replace(QRegExp(",\\s*"), ",");
    c2bUtils::debug("3--|" + raw + "|");
    raw.replace(QRegExp(",+"), ",");
    raw.replace(QRegExp(";\\s*"), ";");
    raw.replace(QRegExp(";+"), ";");
    c2bUtils::debug("4--|" + raw + "|");
    bool containLC = containLowerCaseLetter(raw);
    bool containUC = containUpperCaseLetter(raw);
    authorsInUC = containUC && !containLC;
    if (authorsInUC)
        c2bUtils::debug("Input Authors in Uppercase");
    QStringList list;
    if (sep == " and ")
        list = raw.split(QRegExp("\\band\\b"));
    else
        list = raw.split(sep);

    //  Setting author ordering
    //  The ordering is guessed from the first author, and checked against
    //  the last author to detect the mixed case
    QStringList::Iterator it = list.begin();
    QString AuthorName = *it;
    bool zRevName = RevName(AuthorName);
    bool zRevNChoice = (Comma && SemiColon) || zRevN || zRevName;
    it = list.end();
    AuthorName = *(--it);
    AuthorName.replace(QRegExp("^\\s+"), "");
    bool zRevNameLastA = RevName(AuthorName);
    bool zRevNChoiceLastA = (Comma && SemiColon) || AuthorName.contains(RevN) || zRevNameLastA;
    bool zRevMixed = (zRevNChoice == true && zRevNChoiceLastA == false);
    if (zRevMixed)                  //  Mixed naming 'Smith, J., R. Jones'
        c2bUtils::debug("mixed order");

    //  Process each Author Name
    for (it = list.begin(); it != list.end(); ++it)
    {
        c2bUtils::debug(*it);
        QString Item = *it;
        Item.replace(QRegExp("\\.{0,1}\\s{0,1}-"), "-");      // Abbreviated cases, eg M.-H. Something
        Item.replace(QRegExp("[^-'\\w]+"), " ");      // Only these  characters compose a name
        Item = Item.simplified();

        //  Split Author Name
        QStringList spItem = Item.split(" ");
        int n = spItem.count();

        //  [iini, iend) is the range of first and middle names, ilname the last name
        int iini, iend, ilname;
        if (zRevNChoice)
        {
            iini = 1;
            iend = n;
            ilname = 0;
            c2bUtils::debug("reversed order");
        }
        else
        {
            iini = 0;
            iend = n - 1;
            ilname = n - 1;
            c2bUtils::debug("standard order");
        }

        QString Name = "";
        for (int i = iini; i < iend; i++)  // Process first and middle names
        {
            c2bUtils::debug("First and Midle: " + spItem.at(i));
            if (spItem.at(i).contains('-'))    // Composite names
            {
                // Every hyphen-separated part is processed, so that names
                // with more than two parts do not lose their trailing parts
                const QStringList sspItem = spItem.at(i).split("-");
                const int nparts = sspItem.count();
                for (int p = 0; p < nparts; p++)
                    Name += processFirstMiddle(sspItem.at(p)) + (p + 1 < nparts ? "-" : " ");
            }
            else  // Regular names
            {
                int lfm = spItem.at(i).length();
                if (n == 2 && !authorsInUC && lfm > 1)
                {
                    QString FirstMiddle = spItem.at(i);
                    bool FirstcontainLC = containLowerCaseLetter(FirstMiddle);
                    if (!FirstcontainLC)                 // Here it seems ISI style (Last, FST)
                        for (int l = 0; l < lfm; l++)    // Always abbreviated, no call to processFirstMiddle
                            Name += FirstMiddle[l] + ". ";
                    else
                        Name += processFirstMiddle(spItem.at(i)) + " ";
                }
                else
                    Name += processFirstMiddle(spItem.at(i)) + " ";
            }
        }
        QString LastName = capitalize(spItem.at(ilname));    // Adding last name
        Name += LastName;
        *it = Name;
        c2bUtils::debug(Name);
        if (zRevMixed)             //  Mixed naming 'Smith, J., R. Jones'
            zRevNChoice = false;   //  Only the first author is reversed
    }

    raw = list.join(" and ");
    //  Restore Composite Names white spaces
    raw.replace(QRegExp("(da|de|del|der|di|do|dos|van|vande|von)_", Qt::CaseInsensitive), "\\1 ");
    // Suffix can be lower case here. '_III' must be restored before '_II':
    // the case insensitive '_II' also matches the beginning of '_Iii' (as
    // produced by capitalize() for uppercase input) and would yield ' IIi'
    raw.replace(QRegExp("_III", Qt::CaseInsensitive), " III");
    raw.replace(QRegExp("_II", Qt::CaseInsensitive), " II");
    raw.replace(QRegExp("_Jr", Qt::CaseInsensitive), " Jr");
    return(raw);
}

// Formats one first or middle name token.
// - Abbreviated form (fullForm is false): the leading character plus a period.
// - Full form: tokens longer than one character are passed through
//   capitalize(); shorter tokens are taken as initials and get a period.
const QString authorString::processFirstMiddle(const QString& first_middle)
{
    if (!fullForm)
        return first_middle.left(1) + ".";

    if (first_middle.length() > 1)
        return capitalize(first_middle);

    return first_middle + ".";
}

// Capitalizes an author's name.
// The name is returned unchanged unless the input authors were all in
// uppercase (authorsInUC). In that case the name is lowercased, and then its
// first letter, the letter following the first blank, hyphen or apostrophe,
// and the letter following the first '_' are set to uppercase.
// An empty name is returned as is.
const QString authorString::capitalize(const QString& name)
{
    QString proc_name = name;
    // Nothing to do for mixed case input; an empty name has no character to capitalize
    if (!authorsInUC || proc_name.isEmpty())
        return(proc_name);

    proc_name = proc_name.toLower();
    proc_name[0] = proc_name.at(0).toUpper();
    const int len = proc_name.length();
    int ii = proc_name.indexOf(QRegExp("[\\s-']"));     // As before, assume just one part
    // A separator at position 0 is ignored; one at the last position has no letter after it
    if (ii > 0 && ii + 1 < len)
        proc_name[ii + 1] = proc_name.at(ii + 1).toUpper();
    ii = proc_name.indexOf("_");                        // RegExp couldn't take '_'
    if (ii > 0 && ii + 1 < len)
        proc_name[ii + 1] = proc_name.at(ii + 1).toUpper();
    return(proc_name);
}

/** \page authorproc

    Rules to identify ordering:
    - Contains comma and semicolon -> Reversed
    - Pattern <tt>'^Abcd,'</tt> -> Reversed
    - Pattern <tt>'^Abcd EF Ghi'</tt> -> Standard
    - Pattern <tt>'^Abcd EF'</tt> -> Reversed
    - Pattern <tt>'^Abcd E.F.'</tt> -> Reversed
    - Any other pattern -> Standard

*/
bool authorString::RevName(QString ALine)
{
    // Tells whether a single author name is written in reversed order,
    // last name first, as in "Him DF" or "Him D. F."
    // A three word name such as "Him DF Last" is taken as standard order
    const QString name = ALine.simplified();

    // Last name followed only by dotted initials, 'Abcd E.F.'
    QRegExp dottedInitials("^([-'\\w]+) ((\\w\\.\\s*)+)$");
    if (dottedInitials.indexIn(name) != -1 && dottedInitials.cap(3) != "and")
        return true;

    // ISI doesn't contain point - return for safety
    if (name.contains("."))
        return false;

    // Three words, 'Abcd EF Ghi'
    QRegExp threeWords("^([-'\\w]+) ([-'\\w]+) ([-'\\w]+)");
    if (threeWords.indexIn(name) != -1 && threeWords.cap(3) != "and")
        return false;

    // Two words, 'Abcd EF'
    QRegExp twoWords("^([-'\\w]+) ([-'\\w]+)");
    if (twoWords.indexIn(name) == -1)
        return false;

    const QString Last = twoWords.cap(1);
    const QString First = twoWords.cap(2);
    c2bUtils::debug(QObject::tr("ISI:  |%1| |%2|").arg(Last).arg(First));
    // Reversed only if the second word has no lowercase letter while the
    // first word has at least one
    return !containLowerCaseLetter(First) && containLowerCaseLetter(Last);
}

// Returns true if ALine holds at least one lowercase letter, not counting
// the letters of any standalone lowercase word 'and'.
bool authorString::containLowerCaseLetter(QString ALine)
{
    // Remove possible 'and' separator before scanning
    ALine.replace(QRegExp("\\band\\b"), "");
    for (QString::const_iterator ch = ALine.constBegin(); ch != ALine.constEnd(); ++ch)
    {
        if (ch->isLetter() && ch->category() == QChar::Letter_Lowercase)
            return true;
    }
    return false;
}

// Returns true if ALine holds at least one uppercase letter.
bool authorString::containUpperCaseLetter(QString ALine)
{
    QString::const_iterator ch = ALine.constBegin();
    const QString::const_iterator stop = ALine.constEnd();
    while (ch != stop)
    {
        if (ch->isLetter() && ch->category() == QChar::Letter_Uppercase)
            return true;
        ++ch;
    }
    return false;
}
