This snippet converts an HTML table into a list of lists using BeautifulSoup.

def removeextraspaces(string):
    """Collapse runs of spaces to one space and trim surrounding whitespace.

    Only the plain space character is collapsed; tabs, newlines and other
    whitespace in the middle of the text are left untouched. The final
    ``strip()`` removes any kind of leading or trailing whitespace.
    """
    # Splitting on a single space yields an empty piece for every extra
    # space in a run; dropping those pieces and rejoining leaves exactly
    # one space between the pieces that remain.
    pieces = [piece for piece in string.split(' ') if piece]
    return ' '.join(pieces).strip()

def html2text(node):
    """Return the text held by a parsed HTML node, flattened to one string.

    ``node`` is either a text node (anything without a ``contents``
    attribute, treated as a string) or an element exposing ``contents``
    (its child nodes) and ``isSelfClosing``.

    * Text nodes have newlines and non-breaking spaces turned into plain
      spaces so that later space-collapsing can treat them uniformly.
    * Self-closing elements (``<br/>``, ``<img/>``, ...) become a single
      space so the text on either side does not run together.
    * Any other element is the concatenation of its children's text.
    """
    if not hasattr(node, 'contents'):
        # The published snippet read replace('n', ' '), which blanks every
        # letter "n"; the escape backslash was evidently lost. The second
        # replacement appeared as a space-for-space no-op, presumably a
        # mangled non-breaking space, so it is spelled as an escape here.
        return node.replace('\n', ' ').replace('\xa0', ' ')
    if node.isSelfClosing:
        return ' '
    return ''.join(html2text(child) for child in node.contents)

def content(array):
    """Return the cleaned-up text of every node in ``array``, in order.

    Each node is flattened with ``html2text`` and the result is normalised
    with ``removeextraspaces``. The output list has one string per input
    node.
    """
    texts = []
    for node in array:
        raw_text = html2text(node)
        texts.append(removeextraspaces(raw_text))
    return texts

def table2list(table):
    """Convert a parsed HTML table into a list of rows of cell texts.

    ``table`` is a parsed element offering ``findChildren(name)``. Every
    ``tr`` found under it yields one inner list holding the cleaned text
    of that row's ``td`` cells (see ``content``). Only ``td`` cells are
    collected, so a row made purely of ``th`` header cells yields an
    empty list.
    """
    # Rows are <tr> elements. The original iterated over 'td' here, which
    # walked the cells themselves and then searched for cells nested inside
    # each cell, so it never produced one list per table row.
    return [content(row.findChildren('td')) for row in table.findChildren('tr')]
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. ( Log Out / Change )

Google+ photo

You are commenting using your Google+ account. ( Log Out / Change )

Twitter picture

You are commenting using your Twitter account. ( Log Out / Change )

Facebook photo

You are commenting using your Facebook account. ( Log Out / Change )

w

Connecting to %s