lxml not working with django, scraperwiki -


I'm working on a Django app that goes through the Illinois General Assembly's website to scrape PDFs. While deployed on my desktop it works fine until urllib2 times out. When I try to deploy it on my Bluehost server, the lxml part of the code throws an error. Any help would be appreciated.

import scraperwiki bs4 import beautifulsoup import urllib2 import lxml.etree import re django.core.management.base import basecommand legi.models import votes  class command(basecommand):     def handle(self, *args, **options):         chmbrs =['http://www.ilga.gov/house/', 'http://www.ilga.gov/senate/']         chmbr in chmbrs:             site = chmbr                 url = urllib2.urlopen(site)             content = url.read()             soup = beautifulsoup(content)             links = []             linkstats = []             x=0             y=0             table = soup.find('table', cellpadding=3)             in soup.findall('a',href=true):                 if re.findall('bills', a['href']):                     l = (site + a['href']+'&primary=true')                     links.append(str(l))                     x+=1                     print x             link in links:                 url = urllib2.urlopen(link)                 content = url.read()                 soup = beautifulsoup(content)                 table = soup.find('table', cellpadding=3)                 in table.findall('a',href=true):                     if re.findall('billstatus', a['href']):                         linkstats.append(str('http://ilga.gov'+a['href']))             linkstat in linkstats:                 url = urllib2.urlopen(linkstat)                 content = url.read()                 soup = beautifulsoup(content)                 in soup.findall('a',href=true):                     if re.findall('votehistory', a['href']):                         vl = 'http://ilga.gov/legislation/'+a['href']                         url = urllib2.urlopen(vl)                         content = url.read()                         soup = beautifulsoup(content)                         b in soup.findall('a',href=true):                             if re.findall('votehistory', b['href']):                                 llink = 'http://ilga.gov'+b['href']                                 try:                    
                 u = urllib2.urlopen(llink)                                     x = scraperwiki.pdftoxml(u.read())                                     root = lxml.etree.fromstring(x)                                     pages = list(root)                                     chamber = str()                                     page in pages:                                         print "working_1"                                         el in page:                                             print "working_2"                                             if el.tag == 'text':                                                 if int(el.attrib['top']) == 168:                                                     chamber = el.text                                                 if re.findall("senate vote", chamber):                                                     if int(el.attrib['top']) >= 203 , int(el.attrib['top']) < 231:                                                         title = el.text                                                         if (re.findall('house', title)):                                                             title = (re.findall('[0-9]+', title))                                                             title = "hb"+title[0]                                                         elif (re.findall('senate', title)):                                                             title = (re.findall('[0-9]+', title))                                                             title = "sb"+title[0]                                                     if int(el.attrib['top']) >350 , int(el.attrib['top']) <650:                                                         r = el.text                                                         names = re.findall(r'[a-z-\u00f1]{3,}',r)                                                         vs = re.findall(r'[a-z]{1,2}\s',r)                                                         name in names:                 
                                            legi = name                                                             vote in vs:                                                                 v = vote                                                             if votes.objects.filter(legislation=title).exists() == false:                                                                 c = votes(legislation=title, legislator=legi, vote=v)                                                                 c.save()                                                                     print 'saved'                                                             else:                                                                 print 'not saved'                                                                                                        elif int(el.attrib['top']) == 189:                                                     chamber = el.text                                                 if re.findall("house roll call", chamber):                                                     if int(el.attrib['top']) > 200 , int(el.attrib['top']) <215:                                                         title = el.text                                                         if (re.findall('house', title)):                                                             title = (re.findall('[0-9]+', title))                                                             title = "hb"+title[0]                                                         elif (re.findall('senate', title)):                                                             title = (re.findall('[0-9]+', title))                                                             title = "sb"+title[0]                                                     if int(el.attrib['top']) >385 , int(el.attrib['top']) <1000:                                                         r = el.text                                                   
      names = re.findall(r'[a-z-\u00f1]{3,}',r)                                                         votes = re.findall(r'[a-z]{1,2}\s',r)                                                         name in names:                                                             legi = name                                                             vote in votes:                                                                 v = vote                                                             if votes.objects.filter(legislation=title).exists() == false:                                                                 c = votes(legislation=title, legislator=legi, vote=v)                                                                 c.save()                                                                 print 'saved'                                                             else:                                                                 print 'not saved'                                  except:                                     pass 

Edit 1: here's the error trace:

    traceback (most recent call last):   file "manage.py", line 10, in <module>     execute_from_command_line(sys.argv)   file "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/__init__.py", line 399, in execute_from_command_line     utility.execute()   file "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/__init__.py", line 392, in execute     self.fetch_command(subcommand).run_from_argv(self.argv)   file "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/base.py", line 242, in run_from_argv     self.execute(*args, **options.__dict__)   file "/home7/maythirt/python27/lib/python2.7/site-packages/django/core/management/base.py", line 285, in execute     output = self.handle(*args, **options)   file "/home7/maythirt/gab/legi/management/commands/vote.py", line 51, in handle     root = lxml.etree.fromstring(x)   file "lxml.etree.pyx", line 3032, in lxml.etree.fromstring (src/lxml/lxml.etree.c:68121)   file "parser.pxi", line 1786, in lxml.etree._parsememorydocument (src/lxml/lxml.etree.c:102470)   file "parser.pxi", line 1674, in lxml.etree._parsedoc (src/lxml/lxml.etree.c:101299)   file "parser.pxi", line 1074, in lxml.etree._baseparser._parsedoc (src/lxml/lxml.etree.c:96481)   file "parser.pxi", line 582, in lxml.etree._parsercontext._handleparseresultdoc (src/lxml/lxml.etree.c:91290)   file "parser.pxi", line 683, in lxml.etree._handleparseresult (src/lxml/lxml.etree.c:92476)   file "parser.pxi", line 633, in lxml.etree._raiseparseerror (src/lxml/lxml.etree.c:91939) lxml.etree.xmlsyntaxerror: none 

As Jonathan mentioned, it may be the output of `scraperwiki.pdftoxml()` that's causing the problem. You could display or log the value of `x` to confirm it.

Specifically, `pdftoxml()` runs an external program, `pdftohtml`, and uses temporary files to store the PDF and the XML.

What I'd check is:

  1. Is `pdftohtml` correctly set up on your server?
  2. If so, does the conversion to XML work if you run it directly in a shell on the server, on a PDF that your code is failing on? The command it's executing is `pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes "input.pdf" "output.xml"`

If there's an issue when you run the command directly, then that's where your issue lies. Given the way `pdftohtml` is run in the scraperwiki code, there's no easy way you'd be able to tell if the command fails.


Comments

Popular posts from this blog

ios - Change Storyboard View using Seague -

commonjs - How to write a typescript definition file for a node module that exports a function? -

openid - Okta: Failed to get authorization code through API call -