lxml not working with Django, ScraperWiki
I'm working on a Django app that goes through the Illinois General Assembly website to scrape PDFs. Deployed on my desktop it works fine until urllib2 times out, but when I try to deploy it on my Bluehost server, the lxml part of the code throws an error. Any help would be appreciated.
import scraperwiki
from bs4 import BeautifulSoup
import urllib2
import lxml.etree
import re
from django.core.management.base import BaseCommand
from legi.models import Votes

class Command(BaseCommand):
    def handle(self, *args, **options):
        chmbrs = ['http://www.ilga.gov/house/', 'http://www.ilga.gov/senate/']
        for chmbr in chmbrs:
            site = chmbr
            url = urllib2.urlopen(site)
            content = url.read()
            soup = BeautifulSoup(content)
            links = []
            linkStats = []
            x = 0
            y = 0
            table = soup.find('table', cellpadding=3)
            # collect the bill-list links for this chamber
            for a in soup.findAll('a', href=True):
                if re.findall('Bills', a['href']):
                    l = (site + a['href'] + '&Primary=True')
                    links.append(str(l))
                    x += 1
                    print x
            # from each bill list, collect the bill-status links
            for link in links:
                url = urllib2.urlopen(link)
                content = url.read()
                soup = BeautifulSoup(content)
                table = soup.find('table', cellpadding=3)
                for a in table.findAll('a', href=True):
                    if re.findall('BillStatus', a['href']):
                        linkStats.append(str('http://ilga.gov' + a['href']))
            # follow each bill-status page to its vote-history PDFs
            for linkStat in linkStats:
                url = urllib2.urlopen(linkStat)
                content = url.read()
                soup = BeautifulSoup(content)
                for a in soup.findAll('a', href=True):
                    if re.findall('votehistory', a['href']):
                        vl = 'http://ilga.gov/legislation/' + a['href']
                        url = urllib2.urlopen(vl)
                        content = url.read()
                        soup = BeautifulSoup(content)
                        for b in soup.findAll('a', href=True):
                            if re.findall('votehistory', b['href']):
                                llink = 'http://ilga.gov' + b['href']
                                try:
                                    u = urllib2.urlopen(llink)
                                    x = scraperwiki.pdftoxml(u.read())
                                    root = lxml.etree.fromstring(x)
                                    pages = list(root)
                                    chamber = str()
                                    for page in pages:
                                        print "working_1"
                                        for el in page:
                                            print "working_2"
                                            if el.tag == 'text':
                                                if int(el.attrib['top']) == 168:
                                                    chamber = el.text
                                                if re.findall("Senate Vote", chamber):
                                                    if int(el.attrib['top']) >= 203 and int(el.attrib['top']) < 231:
                                                        title = el.text
                                                        if re.findall('House', title):
                                                            title = re.findall('[0-9]+', title)
                                                            title = "HB" + title[0]
                                                        elif re.findall('Senate', title):
                                                            title = re.findall('[0-9]+', title)
                                                            title = "SB" + title[0]
                                                    if int(el.attrib['top']) > 350 and int(el.attrib['top']) < 650:
                                                        r = el.text
                                                        names = re.findall(r'[A-z-\u00F1]{3,}', r)
                                                        vs = re.findall(r'[A-Z]{1,2}\s', r)
                                                        for name in names:
                                                            legi = name
                                                        for vote in vs:
                                                            v = vote
                                                        if Votes.objects.filter(legislation=title).exists() == False:
                                                            c = Votes(legislation=title, legislator=legi, vote=v)
                                                            c.save()
                                                            print 'saved'
                                                        else:
                                                            print 'not saved'
                                                elif int(el.attrib['top']) == 189:
                                                    chamber = el.text
                                                    if re.findall("HOUSE ROLL CALL", chamber):
                                                        if int(el.attrib['top']) > 200 and int(el.attrib['top']) < 215:
                                                            title = el.text
                                                            if re.findall('House', title):
                                                                title = re.findall('[0-9]+', title)
                                                                title = "HB" + title[0]
                                                            elif re.findall('Senate', title):
                                                                title = re.findall('[0-9]+', title)
                                                                title = "SB" + title[0]
                                                        if int(el.attrib['top']) > 385 and int(el.attrib['top']) < 1000:
                                                            r = el.text
                                                            names = re.findall(r'[A-z-\u00F1]{3,}', r)
                                                            votes = re.findall(r'[A-Z]{1,2}\s', r)
                                                            for name in names:
                                                                legi = name
                                                            for vote in votes:
                                                                v = vote
                                                            if Votes.objects.filter(legislation=title).exists() == False:
                                                                c = Votes(legislation=title, legislator=legi, vote=v)
                                                                c.save()
                                                                print 'saved'
                                                            else:
                                                                print 'not saved'
                                except:
                                    pass
EDIT 1: Here's the error trace:
Traceback (most recent call last):
  File "manage.py", line 10, in <module>
    execute_from_command_line(sys.argv)
  File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/__init__.py", line 399, in execute_from_command_line
    utility.execute()
  File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/__init__.py", line 392, in execute
    self.fetch_command(subcommand).run_from_argv(self.argv)
  File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/base.py", line 242, in run_from_argv
    self.execute(*args, **options.__dict__)
  File "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/base.py", line 285, in execute
    output = self.handle(*args, **options)
  File "/home7/maythirt/gab/legi/management/commands/vote.py", line 51, in handle
    root = lxml.etree.fromstring(x)
  File "lxml.etree.pyx", line 3032, in lxml.etree.fromstring (src/lxml/lxml.etree.c:68121)
  File "parser.pxi", line 1786, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:102470)
  File "parser.pxi", line 1674, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:101299)
  File "parser.pxi", line 1074, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:96481)
  File "parser.pxi", line 582, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:91290)
  File "parser.pxi", line 683, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:92476)
  File "parser.pxi", line 633, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:91939)
lxml.etree.XMLSyntaxError: None
As Jonathan mentioned, it may be the output of scraperwiki.pdftoxml() that's causing the problem. Display or log the value of x to confirm it.
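For example, a minimal sketch of that check, dropped around the existing fromstring() call (the debug file path here is arbitrary, just somewhere you can read on the server):

    u = urllib2.urlopen(llink)
    x = scraperwiki.pdftoxml(u.read())

    # dump what pdftoxml() returned before handing it to lxml, so a bad or
    # empty conversion on the server is visible
    with open('/tmp/pdftoxml_debug.xml', 'w') as f:
        f.write(x)
    print len(x), repr(x[:200])

    root = lxml.etree.fromstring(x)

If x turns out to be empty or not XML at all, the XMLSyntaxError comes from the conversion step rather than from lxml itself.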
Specifically, pdftoxml() runs the external program pdftohtml and uses temporary files to store the PDF and the XML.
What I'd check is:
- Is pdftohtml correctly set up on the server?
- If it is, does the conversion to XML work when you run it directly in a shell on the server, against a PDF the code is failing on? The command it executes is:
pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "input.pdf" "output.xml"
If there's an issue when you run the command directly, that's where the problem lies. With the way pdftohtml is run inside the scraperwiki code, there's no easy way for you to tell whether the command fails.
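If it's more convenient to test from Python than from a shell, a rough sketch along the same lines is to run pdftohtml yourself via subprocess and look at the exit code and stderr ("input.pdf" and "output.xml" are placeholder names for a PDF you copy onto the server):

    import subprocess

    # run the same conversion pdftoxml() performs, but keep the exit status
    # and stderr visible instead of hiding them
    cmd = ['pdftohtml', '-xml', '-nodrm', '-zoom', '1.5',
           '-enc', 'UTF-8', '-noframes', 'input.pdf', 'output.xml']
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    print 'exit code:', p.returncode
    print 'stderr:', err

A non-zero exit code or an error on stderr (for example, a missing pdftohtml binary or a library it can't load) would explain why lxml is handed something it can't parse.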