/***************************************************************************
 *   Copyright (C) 2005 by Roberto Cappuccio and the Kat team              *
 *   Roberto Cappuccio : roberto.cappuccio@gmail.com                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
 ***************************************************************************/

#include <kdebug.h>
#include <qregexp.h>
#include <kcharsets.h>

#include "fulltext_html.h"
#include "kat_export.h"

extern "C"
{
    /**
     * C-linkage factory for this extractor.
     *
     * Allocates a HtmlExtractor for the given file with `new` and hands it
     * back through the FulltextExtractor base pointer. Nothing in this file
     * deletes the object; presumably the caller takes ownership (confirm
     * against the plugin loader).
     *
     * @param path path of the HTML file the extractor will read
     * @return the newly allocated extractor
     */
    KAT_EXPORT FulltextExtractor* new_extractor( const QString& path )
    {
        FulltextExtractor* extractor = new HtmlExtractor( path );
        return extractor;
    }
}

/**
 * Builds an extractor for the file at @p path.
 *
 * The file is opened read-only right away; the outcome is remembered in
 * m_open so that extract() can skip reading when the open failed.
 *
 * @param path path of the HTML file to extract text from
 */
HtmlExtractor::HtmlExtractor( const QString& path ) :
    m_bDone( false ),       // nothing has been extracted yet
    m_file( path ),
    m_stream( &m_file )     // the stream reads from m_file
{
    // Open in the body, after m_file and m_stream have been constructed.
    m_open = m_file.open( IO_ReadOnly );
}

/**
 * Closes the file that the constructor attempted to open.
 *
 * close() is called unconditionally, i.e. also when the open failed.
 */
HtmlExtractor::~HtmlExtractor()
{
    m_file.close();
}

/**
 * Extracts the text content of the HTML file.
 *
 * The whole file is processed on the first call: script elements and all
 * other tags are removed, whitespace is collapsed, character entities are
 * resolved, and the result is returned wrapped in
 * "<fulltext>" ... "</fulltext>".
 *
 * Every subsequent call sets @p data to a null string, which signals that
 * there is nothing more to extract.
 *
 * @param data receives the extracted text. It is left untouched when the
 *             file could not be opened in the constructor.
 */
void HtmlExtractor::extract( QString& data )
{
    kdDebug() << "HTMLExtractor start" << endl;
    if ( m_bDone ) {
        data = QString::null;
        return;
    }
    m_bDone = true;

    if ( m_open ) {
        QString d = m_stream.read();

        // Strip complete script elements, body included. The body is matched
        // with a minimal ".*" rather than "[^>]*": script code routinely
        // contains '>' (comparisons, "-->"), and such scripts would otherwise
        // not match and their source would leak into the extracted text.
        // Minimal matching stops at the first closing tag, so text between
        // two separate script elements is preserved.
        QRegExp scriptBlock( "<script[^>]*>.*</script>", false );
        scriptBlock.setMinimal( true );
        d.replace( scriptBlock, " " );

        // Turn <br> into a newline; note that simplifyWhiteSpace() below
        // collapses it into a single space again.
        d.replace( QRegExp( "<br>" ), "\n" );
        d.replace( QRegExp( "<[^>]*>" ), " " ); // strip all remaining HTML tags
        d = d.simplifyWhiteSpace();
        d = KCharsets::resolveEntities( d );

        // Assign instead of streaming into data, so that the result fully
        // replaces whatever the caller passed in.
        data = "<fulltext>" + d + "</fulltext>";
    }

    kdDebug() << "HTMLExtractor end" << endl;
}
