/***************************************************************************
 *   Copyright (C) 2004-2006 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 0.6.1
 *   See LICENSE file that comes with this distribution
 ***************************************************************************/
#include "astring.h"
#include "c2butils.h"

/*! \page authorproc Processing of author names
 
The cb2Bib automatically processes the author names string
using a set of heuristic rules. First, the name list separator is
identified. Second, it is decided whether the author names are in
standard order, in reversed order, or in the mixed
'Abcd, E., F. Ghij, ...' order.
 
*/

// Builds an AString from the raw author list Alist.
// The QString base keeps the raw text; the processed form is stored in
// 'authors'. full_form is remembered in 'fullForm' and later read by
// processFirstMiddle() to decide whether first/middle names are abbreviated.
AString::AString( QString Alist, bool full_form ) : QString( Alist )
{
    // fullForm must be set before toBib() runs, since toBib() reaches
    // processFirstMiddle(), which reads it.
    this->fullForm = full_form;
    this->authors = this->toBib( Alist );
    c2bDebug( QString( "Authors Final: |%1|" ).arg( this->authors ) );
}

// Nothing to release explicitly in this destructor body.
AString::~AString()
{
}

// Returns the processed author list computed by toBib() at construction.
QString AString::bib() const
{
    return authors;
}

// Converts a raw author list into a list of names joined by " and ".
//
// Steps: clean the string, protect composite prefixes/suffixes with '_',
// guess the list separator, split the list, guess the name ordering
// (standard, reversed or mixed), rebuild each name as
// "First Middle Last" and finally restore the protected white spaces.
//
// raw      Author list as captured by the caller.
// returns  The processed author list; a null string if no author name
//          could be extracted from raw.
//
// Side effect: sets authorsInUC, which capitalize() reads.
QString AString::toBib( QString raw )
{
    /*! \page authorproc

    Clean up author list string:

     - Escape BibTeX to Unicode

     - Remove digits from authors string

     - Remove any character except <tt>-',;&\\.\\s\\w</tt>

     - Simplify White Spaces

    */
    raw = Bibtoc2b( raw );
    raw.replace( QRegExp( "\\d" ), " " );
    raw.replace( QRegExp( "[^-',;&\\.\\s\\w]" ), " " );
    raw = raw.simplifyWhiteSpace();
    //  Composite Names temporary unified
    /*! \page authorproc

     - Consider composing prefixes <tt>(da|de|del|der|di|do|dos|van|vande|von)</tt>

    */
    // Attention: prefixes and suffixes also appear in bibParser::medlToc2b  (Medline)
    // '+' cannot be present at this point (removed by the clean up above),
    // so it is used as a temporary mark before being turned into '_'.
    raw.replace( QRegExp( "\\b(da|de|del|der|di|do|dos|van|vande|von)\\s", FALSE ), "\\1+" );
    raw.replace( "+", "_" );
    /*! \page authorproc

     - Consider composing suffixes <tt>(II|III|Jr)</tt>

    */
    raw.replace( QRegExp( "(\\w),{0,1}\\s(II|III|Jr)\\b" ), "\\1+\\2" );
    raw.replace( "+", "_" );

    /*! \page authorproc

    - Some publishers use superscripts to refer to multiple author affiliations.
    Text clipboard copying loses superscript formatting.
    Author strings are cleaned of 'orphan' lowercase, single letters in a
    preprocessing step.
    Everything following the pattern <b>[a-z]</b> is removed.
    Fortunately, abbreviated initials are most normally input as uppercase
    letters, thus permitting a correct superscript clean up. \n
    <em>Caution:</em> Lowercase, single, a to z letters
    are removed from author's string.\n
    <em>Caution:</em> Superscripts <b>will be added to author Last Name</b> if
    no separation is provided. Users should care about it and correct these cases.

    */

    /*! \page authorproc

        Rules to identify separators:
        - Contains comma and semicolon -> ';'
        - Contains pattern <tt>'^Abcd, E.-F.,'</tt> -> ','
        - Contains pattern <tt>'^Abcd,'</tt> -> 'and'
        - Contains comma -> ','
        - Contains semicolon -> ';'
        - Any other -> 'and'

    */

    bool Comma = raw.contains( ',' );
    bool SemiColon = raw.contains( ';' );
    QRegExp RevN( "^([-'\\w]+)\\," );
    bool zRevN = raw.contains( RevN );

    QString sep;
    if ( Comma && SemiColon )
        sep = ';';  //  Multiple Authors, separated by semicolon, reversed naming
    else if ( Comma )
    {
        if ( zRevN )
        {
            if ( raw.contains( QRegExp("^([-'\\w]+)\\,(\\s*-{0,1}\\b\\w\\b\\.{0,1}){1,3}\\,") ) )
            {
                // Drop the first comma so that the remaining commas separate authors
                raw.replace( QRegExp( "^([-'\\w]+)\\," ), "\\1 " );
                sep = ',';          //  Mixed naming 'Smith, J.-L., R. Jones, and K. Gibbons'
            }
            else
                sep = " and ";      //  Reversed naming
        }
        else                        //  Standard naming
            sep = ',';
    }
    else if ( SemiColon )
        sep = ';';  //  Multiple Authors, separated by SemiColon
    else
        sep = " and ";
    c2bDebug(QString("Separator: |%1|").arg(sep));


    // Unify 'and' and '&' with the chosen separator
    raw.replace( QRegExp( "\\band\\b", FALSE ), sep );
    raw.replace( QRegExp( "\\s&\\s", FALSE ), sep );
    c2bDebug("1--|"+raw+"|");
    raw.replace( QRegExp( "\\b[a-z]\\b" ), " " );  // Cleaning of affiliation 'superscripts'
    c2bDebug("2--|"+raw+"|");
    raw.replace( QRegExp( "[^\\w\\.]+$" ), "" );   // Removing of duplicate commas and semicolons
    raw.replace( QRegExp( ",\\s*" ), "," );
    c2bDebug("3--|"+raw+"|");
    raw.replace( QRegExp( ",+" ), "," );
    raw.replace( QRegExp( ";\\s*" ), ";" );
    raw.replace( QRegExp( ";+" ), ";" );
    c2bDebug("4--|"+raw+"|");
    bool containLC = containLowerCaseLetter( raw );
    bool containUC = containUpperCaseLetter( raw );
    authorsInUC = containUC && !containLC;
    if ( authorsInUC )
        c2bDebug("Input Authors in Uppercase");
    QStringList list;
    if ( sep == " and " )
        list = QStringList::split( QRegExp("\\band\\b"), raw );
    else
        list = QStringList::split( sep, raw );

    // Nothing left after the clean up (e.g. empty or digits-only input):
    // the first/last element accesses below would dereference end().
    if ( list.isEmpty() )
        return( QString::null );

    //  Setting author ordering
    QStringList::Iterator it = list.begin();
    QString AuthorName = *it;
    bool zRevName = RevName( AuthorName );
    bool zRevNChoice = ( Comma && SemiColon ) || zRevN || zRevName;
    it = list.end();
    AuthorName = *(--it);
    AuthorName.replace( QRegExp( "^\\s+" ), "" );
    bool zRevNameLastA = RevName( AuthorName );
    bool zRevNChoiceLastA = ( Comma && SemiColon ) || AuthorName.contains( RevN ) || zRevNameLastA;
    bool zRevMixed = ( zRevNChoice == TRUE && zRevNChoiceLastA == FALSE );
    if ( zRevMixed )                  //  Mixed naming 'Smith, J., R. Jones'
        c2bDebug( "mixed order" );

    //  Process each Author Name
    QStringList processed;            //  Rebuilt names; blank items are not carried over
    for ( QStringList::Iterator ait = list.begin(); ait != list.end(); ++ait )
    {
        c2bDebug(*ait);
        QString Item = *ait;
        Item.replace( QRegExp( "\\.{0,1}\\s{0,1}-" ), "-" );  // Abbreviated cases, eg M.-H. Something
        Item.replace( QRegExp( "[^-'\\w]+" ), " " );  // Only these  characters compose a name
        Item = Item.simplifyWhiteSpace();

        //  Split Author Name
        QStringList spItem = QStringList::split( " ", Item );
        int n = spItem.count();
        // An item holding only blanks or punctuation has no name parts.
        // Skip it: in standard order ilname would otherwise be -1.
        if ( n == 0 )
            continue;

        int iini, iend, ilname;
        if ( zRevNChoice )
        {
            iini = 1;
            iend = n;
            ilname = 0;
            c2bDebug( "reversed order" );
        }
        else
        {
            iini = 0;
            iend = n-1;
            ilname = n-1;
            c2bDebug("standard order");
        }

        QString Name = "";
        for ( int i = iini; i < iend; i++ )  // Process first and middle names
        {
            c2bDebug("First and Midle: " + spItem[i]);
            if ( spItem[i].contains( '-' ) )  // Composite names
            {
                // split() drops empty entries, so 'J-' yields one part and '-'
                // yields none; process whatever parts are really there.
                QStringList sspItem = QStringList::split( "-", spItem[i] );
                QStringList procParts;
                for ( QStringList::Iterator pit = sspItem.begin(); pit != sspItem.end(); ++pit )
                    procParts.append( processFirstMiddle( *pit ) );
                if ( !procParts.isEmpty() )
                    Name += procParts.join( "-" ) + " ";
            }
            else  // Regular names
            {
                int lfm = spItem[i].length();
                if ( n == 2 && !authorsInUC && lfm > 1 )
                {
                    QString FirstMiddle = spItem[i];
                    bool FirstcontainLC = containLowerCaseLetter( FirstMiddle );
                    if ( !FirstcontainLC )                 // Here it seems ISI style (Last, FST)
                        for ( int l = 0; l < lfm; l++ )    // Always abbreviated, no call to processFirstMiddle
                            Name += FirstMiddle[l] + ". ";
                    else
                        Name += processFirstMiddle( spItem[i] ) + " ";
                }
                else
                    Name += processFirstMiddle( spItem[i] ) + " ";
            }
        }
        QString LastName = capitalize( spItem[ilname] );  // Adding last name
        Name += LastName;
        processed.append( Name );
        c2bDebug(Name);
        if ( zRevMixed )             //  Mixed naming 'Smith, J., R. Jones'
            zRevNChoice = FALSE;     //  Only the first author is reversed
    }

    raw = processed.join( " and " );
    //  Restore Composite Names white spaces
    raw.replace( QRegExp( "(da|de|del|der|di|do|dos|van|vande|von)_", FALSE ), "\\1 " );
    raw.replace( QRegExp( "_II", FALSE ), " II" );  // Suffix can be lower case here
    raw.replace( QRegExp( "_III", FALSE ), " III" );
    raw.replace( QRegExp( "_Jr", FALSE ), " Jr" );
    return( raw );
}

// Formats one first or middle name part.
//
// first_middle  A single name part, without abbreviation period.
// returns       - fullForm unset: the first character followed by '.'
//               - fullForm set, more than one character: capitalize() result
//               - fullForm set, at most one character: the part followed by '.'
const QString AString::processFirstMiddle( const QString& first_middle )
{
    // Abbreviated output requested: keep the initial only
    if ( !fullForm )
        return( first_middle.left(1) + "." );

    // Full form requested and a whole name is available
    if ( first_middle.length() > 1 )
        return( capitalize( first_middle ) );

    // Full form requested but the input is already an initial
    return( first_middle + "." );
}

// Capitalizes an author's name.
//
// The name is changed only when authorsInUC is set, i.e. when the input
// author list was found to be all uppercase. In that case the name is
// lowercased and then the first character, the character following the
// first separator matched by the regular expression below, and the
// character following the first '_' are uppercased.
//
// name     Name to capitalize.
// returns  The capitalized name, or name unchanged if authorsInUC is unset
//          or name is empty.
const QString AString::capitalize( const QString& name )
{
    QString proc_name = name;
    // Non-const QString::operator[] pads the string with QChar::null when the
    // index is out of range, so an empty name must not be indexed.
    if ( !authorsInUC || proc_name.isEmpty() )
        return( proc_name );

    proc_name = proc_name.lower();
    proc_name[0] = proc_name[0].upper();
    const int len = static_cast<int>( proc_name.length() );

    int ii = proc_name.find( QRegExp( "[\\s-']" ) ); // As before, assume just one part
    // ii > 0: separator found and not leading; ii + 1 < len: a character
    // really follows it (a trailing separator would otherwise append a null).
    if ( ii > 0 && ii + 1 < len )
        proc_name[ii + 1] = proc_name[ii + 1].upper();

    ii = proc_name.find( "_" );                      // RegExp couldn't take '_'
    if ( ii > 0 && ii + 1 < len )
        proc_name[ii + 1] = proc_name[ii + 1].upper();

    return( proc_name );
}

bool AString::RevName( QString ALine )
{
    // Tells whether a single author name looks reversed ("Last First"),
    // as in the ISI-like list "Him DF, Her SR, ".
    // Names such as "Him DF Last" (three words) are taken as standard order.
    // Returns TRUE for reversed order, FALSE otherwise.
    /*! \page authorproc

    Rules to identify ordering:
    - Contains comma and semicolon -> Reversed
    - Pattern <tt>'^Abcd,'</tt> -> Reversed
    - Pattern <tt>'^Abcd EF Ghi'</tt> -> Standard
    - Pattern <tt>'^Abcd EF'</tt> -> Reversed
    - Pattern <tt>'^Abcd E.F.'</tt> -> Reversed
    - Any other pattern -> Standard

    */

    const QString line = ALine.simplifyWhiteSpace();

    // 'Abcd E.F.': one word followed only by dotted initials
    QRegExp dottedInitials( "^([-'\\w]+) ((\\w\\.\\s*)+)$" );
    dottedInitials.setMinimal( FALSE );
    if ( dottedInitials.search( line ) > -1 && dottedInitials.cap(3) != "and" )
        return TRUE;

    // Any other name holding a period is not considered reversed
    if ( line.contains( "." ) )
        return FALSE;

    // 'Abcd EF Ghi': three words, unless the third one is the 'and' separator
    QRegExp threeWords( "^([-'\\w]+) ([-'\\w]+) ([-'\\w]+)" );
    threeWords.setMinimal( FALSE );
    if ( threeWords.search( line ) > -1 && threeWords.cap(3) != "and" )
        return FALSE;

    // 'Abcd EF': two words
    QRegExp twoWords( "^([-'\\w]+) ([-'\\w]+)" );
    twoWords.setMinimal( FALSE );
    if ( twoWords.search( line ) < 0 )
        return FALSE;

    const QString Last = twoWords.cap(1);
    const QString First = twoWords.cap(2);
    c2bDebug( QString("ISI:  |%1| |%2|").arg(Last).arg(First) );
    // Reversed only if the second word has no lowercase letter (initials)
    // while the first word has at least one (a regular last name).
    return !containLowerCaseLetter( First ) && containLowerCaseLetter( Last );
}

// Returns TRUE if ALine holds at least one lowercase letter, not counting
// the letters of a standalone lowercase word 'and'.
bool AString::containLowerCaseLetter( QString ALine )
{
    // ALine is a by-value copy: strip a possible 'and' separator in place
    ALine.replace( QRegExp( "\\band\\b" ), "" );

    const uint len = ALine.length();
    for ( uint pos = 0; pos < len; ++pos )
    {
        const QChar ch = ALine.at( pos );
        if ( ch.isLetter() && ch.category() == QChar::Letter_Lowercase )
            return TRUE;
    }
    return FALSE;
}

// Returns TRUE if ALine holds at least one uppercase letter.
bool AString::containUpperCaseLetter( QString ALine )
{
    const uint len = ALine.length();
    uint pos = 0;
    while ( pos < len )
    {
        const QChar ch = ALine.at( pos );
        if ( ch.isLetter() && ch.category() == QChar::Letter_Uppercase )
            return TRUE;
        ++pos;
    }
    return FALSE;
}
