Java: SAX Parsing a huge XML file -
i have 35 gb xml file (yes, organizations , have no control on it) sax parse. found example here:
http://www.java2s.com/code/java/xml/saxdemo.htm
of how run sax parser , avoid loading everything. however, out of memory error immediatly. why happens , how can make code scalable xml file size?
here code:
import org.apache.log4j.logger; import org.xml.sax.attributelist; import org.xml.sax.inputsource; import org.xml.sax.saxexception; import org.xml.sax.saxparseexception; import javax.xml.parsers.parserconfigurationexception; import javax.xml.parsers.saxparserfactory; import java.io.file; import java.io.filereader; import java.io.ioexception; public class xmlsaxtools extends org.xml.sax.helpers.defaulthandler { /** * logging facility */ static logger logger = logger.getlogger(xmlsaxtools.class); private string filename = "c:/data/hugefile.xml"; private int counter = 0; /** main method sets things parsing */ public void test() throws ioexception, saxexception, parserconfigurationexception { // create jaxp "parser factory" creating sax parsers javax.xml.parsers.saxparserfactory spf = saxparserfactory.newinstance(); // configure parser factory type of parsers require spf.setvalidating(false); // no validation required // use parser factory create saxparser object // note saxparser jaxp class, not sax class javax.xml.parsers.saxparser sp = spf.newsaxparser(); // create sax input source file argument org.xml.sax.inputsource input = new inputsource(new filereader(filename)); // give inputsource absolute url file, // can resolve relative urls in <!doctype> declaration, e.g. input.setsystemid("file://" + new file(filename).getabsolutepath()); // create instance of class; defines handler methods xmlsaxtools handler = new xmlsaxtools(); // finally, tell parser parse input , notify handler sp.parse(input, handler); // instead of using saxparser.parse() method, part of // jaxp api, use sax1 api directly. note // difference between jaxp class javax.xml.parsers.saxparser , // sax1 class org.xml.sax.parser // // org.xml.sax.parser parser = sp.getparser(); // sax parser // parser.setdocumenthandler(handler); // set main handler // parser.seterrorhandler(handler); // set error handler // parser.parse(input); // parse! } stringbuffer accumulator = new stringbuffer(); // accumulate parsed text string servletname; // name of servlet string servletclass; // class name of servlet string servletid; // value of id attribute of <servlet> tag // when parser encounters plain text (not xml elements), calls // method, accumulates them in string buffer public void characters(char[] buffer, int start, int length) { accumulator.append(buffer, start, length); } // every time parser encounters beginning of new element, // calls method, resets string buffer public void startelement(string name, attributelist attributes) { accumulator.setlength(0); // ready accumulate new text if (name.equals("item")) { logger.info("item tag opened"); counter++; } } // when parser encounters end of element, calls method public void endelement(string name) { if (name.equals("item")) { logger.info("item tag closed. counter: " + counter); } } /** method called when warnings occur */ public void warning(saxparseexception exception) { system.err.println("warning: line " + exception.getlinenumber() + ": " + exception.getmessage()); } /** method called when errors occur */ public void error(saxparseexception exception) { system.err.println("error: line " + exception.getlinenumber() + ": " + exception.getmessage()); } /** method called when non-recoverable errors occur. */ public void fatalerror(saxparseexception exception) throws saxexception { system.err.println("fatal: line " + exception.getlinenumber() + ": " + exception.getmessage()); throw (exception); } public static void main(string[] args){ xmlsaxtools t = new xmlsaxtools(); try { t.test(); } catch (exception e){ logger.error("exception in xmlsaxtools: " + e.getmessage()); e.printstacktrace(); } }
}
you filling accumulator
without ever emptying - unlikely want.
just using sax not sufficient ensure not run out of memory - still need implement code finds, selects , processes do need xml , discards rest.
here's simple parser designed run in separate thread. communicates calling thread via n arrayblockingqueue<string> queue
defined in enclosing class.
the huge data files have deal <batch> ... few thousand items ... </batch>
. parser pulls each item out , presents them one-at-a-time through blocking queue. 1 day turn them xom element
s atm uses string
s.
notice how clears down temporary data fields when enque
called ensure don't run out of memory:
private class parser extends defaulthandler { // track depth of xml - whenever hit level 1 add accumulated xml queue. private int level = 0; // current xml fragment. private final stringbuilder xml = new stringbuilder(); // we've had start tag no data yet. private boolean tagwithnodata = false; /* * called when starting of element reached. example if have tag * called <title> ... </title>, method called when <title> tag * encountered while parsing current xml file. attributelist parameter has * list of attributes declared current element in xml file. */ @override public void startelement(final string uri, final string localname, final string name, final attributes atrbts) throws saxexception { checkforabort(); // have got level 1 yet? if (level == 1) { // emit built ones. try { enqueue(); } catch (interruptedexception ex) { throwables.rethrow(ex); } } // add on. if (level > 0) { // name. xml.append("<").append(name); // attributes. (int = 0; < atrbts.getlength(); i++) { final string att = atrbts.getvalue(i); xml.append(" ").append(atrbts.getqname(i)).append("=\"").append(xml.to(att)).append("\""); } // done. xml.append(">"); // remember we've not had data yet. tagwithnodata = true; } // next element sub-element. level += 1; } /* * called when ending of current element reached. example in * above explanation, method called when </title> tag reached */ @override public void endelement(final string uri, final string localname, final string name) throws saxexception { checkforabort(); if (level > 1) { if (tagwithnodata) { // no data. make > /> xml.insert(xml.length() - 1, "/"); // i've closed 1 enclosing 1 has data (i.e. one). tagwithnodata = false; } else { // had data, finish properly. xml.append("</").append(name).append(">"); } } // done level. level -= 1; if (level == 1) { // finished , @ level 1. try { // enqueue results. enqueue(); } catch (interruptedexception ex) { throwables.rethrow(ex); } } } /* * called when data part encountered. */ @override public void characters(final char buf[], final int offset, final int len) throws saxexception { checkforabort(); // want trimmed. final string chs = new string(buf, offset, len).trim(); if (chs.length() > 0) { // grab data. xml.append(xml.to(chs)); tagwithnodata = false; } } /* * called when parser starts parsing current xml file. */ @override public void startdocument() throws saxexception { checkforabort(); tagwithnodata = false; } /* * called when parser completes parsing current xml file. */ @override public void enddocument() throws saxexception { checkforabort(); try { // enqueue results. enqueue(); } catch (interruptedexception ex) { throwables.rethrow(ex); } } private void enqueue() throws interruptedexception, saxexception { // may have been closed while blocking on queue. checkforabort(); final string x = xml.tostring().trim(); if (x.length() > 0) { // add queue. queue.put(x); // clear out. xml.setlength(0); tagwithnodata = false; } // may have been closed while blocking on queue. checkforabort(); } private void checkforabort() throws xmlinnerdocumentiteratorabortedexception { if (iteratorfinished) { logger.debug("aborting!!!"); throw new xmlinnerdocumentiterator.xmlinnerdocumentiteratorabortedexception("aborted!"); } } } }
Comments
Post a Comment