#!/usr/bin/env python
"""
Try your best to parse the crappy HTML coming out of RBC Direct Investing.

You'd think that after all these years and numerous 'improvements' to their web
interface over the years, that they would bother providing OFX export, something
simple to implement that anyone with a ledger would use, but no. We have to
parse HTML, bad HTML. Such is the reality of dealing with banks. (At least they
didn't get caught up in the subprime mess, that offers some consolation.)

Go to the RBC Direct Investing account and use the Web Developer extension to
save the Generate Source (from the Source menu) for each month of activity. This
script can extract the table and produce valid Ledger format text.

This has got to be the absolute worst HTML I have ever seen. Also, note that you
need to save the generated source, not the source you see directly.
"""

# stdlib imports
import sys, re, cgi, logging
from datetime import date
from decimal import Decimal
from xml.sax.saxutils import unescape
from pprint import pprint, pformat

# other imports
from BeautifulSoup import BeautifulSoup, Tag

# beancount imports
from beancount import cmdline
from beancount.ledger import read_ofx_accounts_map

# fallback imports
from beancount.fallback.collections2 import namedtuple


def main():
    """
    Command-line entry point.

    Parses the command line through beancount's cmdline helper, reads every
    HTML file named in the remaining arguments with parse_file(), maps the
    account id found in each file to a ledger account through the map returned
    by read_ofx_accounts_map(), and prints all the entries to stdout in Ledger
    syntax, ordered by settlement date and then by their position within their
    file.

    Raises KeyError if a file's account id has no entry in the accounts map.
    """
    import optparse
    parser = optparse.OptionParser(__doc__.strip())

    parser.add_option('-d', '--debug', action='store_true',
                      help="Generate debug output.")

    # The options are published as a module global because print_as_ledger()
    # consults opts.debug.
    global opts
    opts, ledger, args = cmdline.main(parser, 1)

    if len(args) == 0:
        parser.error("You must specify a ledger file and a list of HTML "
                     "filenames to parse.")

    accounts_map = read_ofx_accounts_map(ledger)

    # Accumulate (entry, account) pairs across all the input files.
    pairs = []
    for fn in args:
        print >> sys.stderr, "\n\nParsing file %s\n" % fn
        acctid, postings = parse_file(fn)

        # Map account
        try:
            acc = accounts_map[acctid]
        except KeyError:
            raise KeyError("Missing account name in map for %s" % acctid)

        for posting in postings:
            pairs.append((posting, acc))

    def settlement_order(pair):
        # Sort on the entry only; 'priority' keeps the order found in the file
        # for entries settling on the same day.
        entry = pair[0]
        return (entry.settlement, entry.priority)

    for p, acc in sorted(pairs, key=settlement_order):
        print_as_ledger(p, acc.fullname)

def infer_account_names(acct, postings):
    """
    Given the name of an account and a list of postings, figure out if the
    account name contains one of the currencies we saw, and if so, infer
    alternate account names for it.

    This is to handle the case of two accounts::

      Assets:Investments:RBC-Broker:Account-USD
      Assets:Investments:RBC-Broker:Account-CAD

    Arguments:
      acct: the account name, a string.
      postings: an iterable of objects with a 'currency' attribute.

    Returns a dict mapping each currency seen in 'postings' to the account
    name with every occurrence of any seen currency replaced by that currency.
    If the name contains none of the currencies, every currency maps to the
    name unchanged. The dict is empty when there are no postings.
    """
    currencies = frozenset(x.currency for x in postings)

    # Escape literal percent signs before turning the name into a format
    # template; otherwise a name containing '%' makes the substitution below
    # fail (or silently swallow characters).
    template = acct.replace('%', '%%')
    for cur in currencies:
        template = template.replace(cur, '%(currency)s')

    return dict((cur, template % {'currency': cur})
                for cur in currencies)

def print_as_ledger(p, acct):
    """
    Write a single Entry to stdout as one Ledger transaction.

    The header line is the settlement date (followed by '=<date>' when the
    entry's date differs from it), then ' ! ' and a description made of the
    non-empty action, symbol and description fields joined with ' -- '. It is
    followed by up to two posting lines, both against 'acct': one for the
    quantity of the security (with '@ <price> <currency>' when the entry has a
    price) and one for the cash amount. A blank line ends the transaction.

    Arguments:
      p: an Entry, as produced by parse_file().
      acct: the account name to print on every posting line.

    When the global 'opts.debug' is set, all the values used for output are
    first dumped as ';;' comment lines, ahead of the transaction.

    Raises AssertionError if the entry has no currency, or if it has both a
    quantity and a price but an action other than 'Buy', 'Sell' or 'DIV F6'.
    """
    write = sys.stdout.write

    # NOTE(review): re.match() anchors at the beginning of the string, so this
    # only triggers when the description *starts* with 1000THS; re.search()
    # may have been intended. Left as is, because changing it would alter the
    # quantities being printed.
    thou = re.match('\\b1000THS\\b', p.description)

    vdict = p._asdict().copy()
    vdict['quantity'] = (p.quantity / 1000) if thou else p.quantity
    vdict['account'] = acct
    # Symbols that contain digits are quoted in the output.
    vdict['symbol'] = ('"%s"' % p.symbol) if re.search('[0-9]', p.symbol) else p.symbol

    # Emit the debug dump before the header: it used to be written right after
    # the date, without a newline, which glued the first comment to the date
    # and pushed the description onto a line of its own.
    if opts.debug:
        for k, v in vdict.items():
            write(';; %s : %s\n' % (k, repr(v)))

    write(fmtdate(p.settlement))
    if p.date != p.settlement:
        write('=')
        write(fmtdate(p.date))

    desc = ' -- '.join(filter(None, (p.action, p.symbol, p.description)))
    write(' ! %s\n' % desc)

    # All numbers are rendered with %s: they are Decimal objects (or None), so
    # no per-field float format needs to be selected.
    assert p.currency, p
    if p.quantity:
        if p.price:
            assert p.action in ('Buy', 'Sell', 'DIV F6')
            write('  %(account)-70s %(quantity)s %(symbol)s @ %(price)s %(currency)s\n' % vdict)
        else:
            write('  %(account)-70s %(quantity)s %(symbol)s\n' % vdict)

    if p.amount:
        write('  %(account)-70s %(amount)s %(currency)s\n' % vdict)

    write('\n')


# One transaction, as parsed by parse_file() from a pair of table rows.
#
#   date, settlement: datetime.date objects (see parse_date()).
#   symbol, description: the two halves of the 'symboldescription' cell, split
#     on its first '-'.
#   action, currency: the text of the corresponding cells.
#   quantity, price, amount: a Decimal, None when the cell reads
#     'Not applicable', or the empty string when the cell is empty.
#   priority: the 1-based position of the entry within its file; main() uses it
#     to order entries that share a settlement date.
Entry = namedtuple('Entry',
                   'date symbol description action quantity price amount currency settlement priority')


def _find_cell_text(row, column):
    """
    Return the stripped text of the cell of 'row' that belongs to 'column'.

    The cell is looked up as a <th>, and failing that as a <td>, whose
    'headers' attribute is 'header_<column>'. Only the first child of the cell
    is considered; if that child is itself a tag, its own first child is used
    instead. A cell without content yields an empty string.
    """
    assert row
    header_id = "header_%s" % column
    cell = row.find('th', headers=header_id) or row.find('td', headers=header_id)
    el = cell.contents[0] if cell.contents else ''
    if isinstance(el, Tag):
        el = el.contents[0] if el.contents else ''
    return unicode(el).strip()


def _parse_optional_number(text):
    """
    Convert the text of a numeric cell: an empty string is returned as is,
    'Not applicable' becomes None, and anything else goes through tonum().
    """
    if not text:
        return text
    if text == 'Not applicable':
        return None
    return tonum(text)


def parse_file(fn):
    """
    Parse one saved RBC Direct Investing activity page.

    Arguments:
      fn: the name of the HTML file to read.

    Returns a pair (acctid, postings). 'acctid' is the 'value' attribute of
    the 'SelectedAccount' input found in the 'selectControl' row. 'postings'
    is a list of Entry tuples, one for each pair of consecutive rows in the
    <tbody> that follows the parent of the 'dataTableHeaderRow' row; their
    'priority' field numbers them from 1, in document order.

    Raises ValueError if the table ends with a row that has no companion row,
    or if a symbol/description cell contains no '-' separator.
    """
    # BeautifulSoup accepts an open file and parses it in its constructor, so
    # the handle can be released as soon as the soup is built. try/finally is
    # used rather than 'with' to stay valid on older Python 2 interpreters.
    f = open(fn)
    try:
        soup = BeautifulSoup(f,
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
    finally:
        f.close()
    # NOTE(review): the result of prettify() is discarded; the call is kept
    # only because the original code made it.
    soup.prettify()

    select_row = soup.find('tr', id="selectControl")
    account_input = select_row.find('input', id="SelectedAccount")
    acctid = account_input['value']

    # Find the header row (it's the only one).
    header_row = soup.find('tr', 'dataTableHeaderRow')

    postings = []

    priority = 1
    tbody = header_row.parent.findNextSibling('tbody')
    row = tbody.find('tr')
    while row is not None:
        # Work your way down the table, two rows at a time.
        tr1 = row
        tr2 = tr1.findNextSibling('tr')
        if tr2 is None:
            # Fail loudly rather than drop a transaction or crash on None.
            raise ValueError("Unpaired trailing row in the activity table "
                             "of %s" % fn)
        row = tr2.findNextSibling('tr')

        # Parse the first row.
        trade_date = parse_date(_find_cell_text(tr1, 'date'))
        symbol_desc = _find_cell_text(tr1, 'symboldescription')

        # Parse the second row.
        action = _find_cell_text(tr2, 'action')
        quantity = _parse_optional_number(_find_cell_text(tr2, 'quantity'))
        price = _parse_optional_number(_find_cell_text(tr2, 'price'))
        amount = _parse_optional_number(_find_cell_text(tr2, 'amount'))
        currency = _find_cell_text(tr2, 'currency')
        settlement = parse_date(_find_cell_text(tr2, 'settlement'))

        # The cell reads '<symbol> - <description>'; only the first dash
        # separates the two, the description may contain more of them.
        symbol, description = [x.strip() for x in symbol_desc.split('-', 1)]

        # Reinvestments carry their price in the description, not in the
        # price column.
        mo = re.search(r'REINVEST @ \$(.*)', description)
        if mo:
            assert not price
            price = tonum(mo.group(1))

        p = Entry(trade_date, symbol, description, action, quantity, price,
                  amount, currency, settlement, priority)
        postings.append(p)
        priority += 1

    return acctid, postings

## FIXME: What is the appropriate way to do this for all entities?
## other_entities = {'&nbsp;': ''}
def clean_str(s):
    """
    Return 's' without its leading and trailing whitespace.
    """
    ## Entity-aware alternative, not enabled (see the FIXME above):
    ## return unescape(s, other_entities).strip()
    trimmed = s.strip()
    return trimmed


# Lowercase three-letter month abbreviations; the index of a name, plus one, is
# its month number.
_months = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

def parse_date(dstr):
    """
    Parse a date written as 'DD Mon YYYY', e.g. '05 Jan 2009'.

    The month must be one of the three-letter abbreviations in '_months', in
    any letter case. Returns a datetime.date object.

    Raises ValueError if the string does not start with such a date or if the
    month name is not recognized.
    """
    mo = re.match(r'(\d\d) (.*) (\d\d\d\d)', dstr)
    if mo is None:
        # Report the offending text instead of failing on None.group().
        raise ValueError("Unrecognized date: %r" % (dstr,))
    day, month_name, year = mo.groups()
    try:
        mth = _months.index(month_name.lower()) + 1
    except ValueError:
        raise ValueError("Unrecognized month %r in date: %r"
                         % (month_name, dstr))
    return date(int(year), mth, int(day))

def fmtdate(d):
    """
    Render the date 'd' as 'YYYY-MM-DD', the form used in the Ledger output.
    """
    pattern = '%Y-%m-%d'
    return d.strftime(pattern)

def tonum(s):
    """
    Convert a number written with comma thousands separators (e.g.
    '1,234.56') to a Decimal.
    """
    # Splitting on the commas and gluing the pieces back drops the separators.
    without_commas = ''.join(s.split(','))
    return Decimal(without_commas)





# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
