|
---|
|
|
|
在项目里遇到了一些需要解析的 XML 文件超过 30M 甚至 60M 的情况。现在已经不好去追究为什么不在一开始生成 XML 的时候就把文件做小一点,但是遇到这个问题后,我只能去解决它。解决问题的同时又害怕重复发明轮子,所以我也去看了一下现有的 XML 解析工具:jdom 的 SAXBuilder 和 dom4j 的 SAXReader 都是把 XML 文件一次性读入内存,XML 文件过大时会报内存溢出的异常;SAXParser 虽然可以分批读入解析,但它也是一次解析完,假设 XML 文件中有一万条数据,解析后就必须在内存中放这么多的对象。个人觉得这样有些不灵活,就自己做了个小东西来切分 XML 文件。但前提是这个 XML 文件得有文件头 <?xml version="1.0" encoding="GBK"?>,并且 encoding 必须跟文件的实际编码格式一致,不然解析的时候会出现乱码。个人水平有限,但很希望得到大家的指正,希望大家不要吝啬手中的砖头。
Java代码
- package searchRing.ring.util.xmlBufferTool;
-
- import java.io.*;
- import java.util.regex.Pattern;
- import java.util.regex.Matcher;
-
-
/**
 * Splits a large, flat XML document into small, well-formed chunks so that it
 * can be parsed piece by piece instead of being loaded into memory at once.
 *
 * <p>The expected layout is an optional XML declaration, one root element and a
 * long sequence of sibling "record" elements directly below it:</p>
 *
 * <pre>
 * &lt;?xml version="1.0" encoding="GBK"?&gt;
 * &lt;rings&gt;
 *   &lt;ring&gt;...&lt;/ring&gt;
 *   &lt;ring&gt;...&lt;/ring&gt;
 * &lt;/rings&gt;
 * </pre>
 *
 * <p>Each call to {@link #popDividedDataAfterParser()} returns the next group of
 * records wrapped in the root element, or {@code null} when nothing is left.</p>
 *
 * <p>Limitations: the root and record start tags are detected with a simple
 * pattern that only recognises tags made of plain letters without attributes,
 * and both must appear within the first blocks read from the file. The
 * {@code encoding} of the XML declaration must match the real file encoding;
 * when it is absent the platform default charset is used.</p>
 *
 * <p>Instances are not thread-safe. The underlying file is closed automatically
 * once the end of the file is reached, or earlier through {@link #close()}.</p>
 */
public class XMLBufferTool implements Closeable {

    /** Number of blocks pulled from the file on every refill. */
    private static final int DEFAULT_BLOCK_COUNT = 10;

    /** Refilling stops once at least this many records are waiting (soft limit). */
    private static final int DEFAULT_MAX_OUTPUT_SIZE = 50;

    /** Size, in chars, of one block read from the file. */
    private static final int BLOCK_SIZE = 1024;

    /** Number of leading bytes inspected when looking for the declared encoding. */
    private static final int SNIFF_SIZE = 512;

    /** A start tag made of plain letters and carrying no attributes. */
    private static final Pattern elementPattern = Pattern.compile("<[a-zA-Z]+>");

    /** The XML declaration, e.g. {@code <?xml version="1.0" encoding="GBK"?>}. */
    private static final Pattern charSetPattern = Pattern.compile("<\\?xml\\s[^?>]*\\?>");

    /** The {@code encoding} pseudo-attribute of the XML declaration; group 1 is the name. */
    private static final Pattern encodingPattern =
            Pattern.compile("encoding\\s*=\\s*[\"']([A-Za-z][A-Za-z0-9._-]*)[\"']");

    /** Text read from the file that has not been handed to the output yet. */
    private StringBuffer xmlContentBuffer;

    /* just used to store and output the data divided */
    XMLOutputBuffer xmlOutput;

    /** The XML declaration found at the top of the file, or "" when there is none. */
    private String charSetTitle = "";

    /** Start tag of the root element, or "" when none was detected. */
    private String rootElemetMark = "";

    /** Start tag of the record elements, or "" when none was detected. */
    private String childElementMark = "";

    /** Set once the reader has reported end of file and the tail has been flushed. */
    private boolean endOfInput = false;

    /* Kept package-private, as in the original version. */
    InputStreamReader bufferedReader;
    InputStream fileInputStream;

    /**
     * Opens the file, detects its declared encoding and reads the prolog
     * (declaration, root start tag and the first record start tag).
     *
     * @param xmlFilePath path of the XML file to split
     * @throws IllegalStateException if the file cannot be opened or read, or if
     *                               its declared encoding is not supported; the
     *                               underlying {@link IOException} is the cause
     */
    public XMLBufferTool(String xmlFilePath) {
        this.xmlContentBuffer = new StringBuffer();

        try {
            // Sniff the encoding first so that a failure here cannot leak the main stream.
            String charSet = getCharSet(xmlFilePath);

            this.fileInputStream = new FileInputStream(xmlFilePath);
            if (charSet != null) {
                this.bufferedReader = new InputStreamReader(fileInputStream, charSet);
            } else {
                this.bufferedReader = new InputStreamReader(fileInputStream);
            }

            preparePaser();
        } catch (IOException e) {
            closeQuietly();
            // Fail fast: continuing with a half-initialised instance only produced
            // a NullPointerException further down the line.
            throw new IllegalStateException("Cannot read XML file: " + xmlFilePath, e);
        }
    }

    /**
     * @return the XML declaration found at the top of the file, or an empty
     *         string when the file has none
     */
    public String getCharSetTitle() {
        return charSetTitle;
    }

    /**
     * @return the start tag of the root element (e.g. {@code <rings>}), or an
     *         empty string when no root element was detected
     */
    public String getRootElemetMark() {
        return rootElemetMark;
    }

    /**
     * Releases the underlying file. Safe to call more than once.
     *
     * @throws IOException if closing the stream fails
     */
    public void close() throws IOException {
        if (bufferedReader != null) {
            bufferedReader.close(); // also closes the wrapped file stream
        } else if (fileInputStream != null) {
            fileInputStream.close();
        }
    }

    /** Closes the file and ignores a failure to do so. */
    private void closeQuietly() {
        try {
            close();
        } catch (IOException ignored) {
            // Nothing useful can be done; any earlier failure is the one worth reporting.
        }
    }

    /**
     * Reads the encoding named in the XML declaration at the top of the file.
     *
     * @param filePath path of the XML file
     * @return the declared encoding name, or {@code null} when the file is empty,
     *         has no XML declaration, or the declaration names no encoding
     * @throws IOException if the file cannot be opened or read
     */
    private String getCharSet(String filePath) throws IOException {
        byte[] head = new byte[SNIFF_SIZE];
        int length = 0;

        FileInputStream in = new FileInputStream(filePath);
        try {
            int n;
            while (length < head.length
                    && (n = in.read(head, length, head.length - length)) > 0) {
                length += n;
            }
        } finally {
            in.close();
        }

        if (length == 0) {
            return null;
        }

        // The declaration is plain ASCII in every ASCII-compatible encoding, so a
        // byte-per-char decoding is enough to read it (UTF-16 files are not detected).
        String text = new String(head, 0, length, "ISO-8859-1");

        Matcher declaration = charSetPattern.matcher(text);
        if (!declaration.find()) {
            return null;
        }

        // Look at the encoding pseudo-attribute only; picking "the first quoted
        // value" would mistake standalone="yes" for a charset name.
        Matcher encoding = encodingPattern.matcher(declaration.group());
        return encoding.find() ? encoding.group(1) : null;
    }

    /**
     * Reads the beginning of the file and extracts the XML declaration, the root
     * start tag and the record start tag, then queues the records read so far.
     *
     * @throws IOException if reading from the file fails
     */
    private void preparePaser() throws IOException {
        readSomeLine(DEFAULT_BLOCK_COUNT);

        Matcher m = charSetPattern.matcher(xmlContentBuffer);
        if (m.find()) {
            this.charSetTitle = m.group();
            this.xmlContentBuffer.delete(0, m.end());
        }

        m = elementPattern.matcher(xmlContentBuffer);
        if (m.find()) {
            this.rootElemetMark = m.group();
            this.xmlContentBuffer.delete(0, m.end());
        }

        // The first plain start tag after the root is taken as the record tag.
        m = elementPattern.matcher(xmlContentBuffer);
        if (m.find()) {
            this.childElementMark = m.group();
        }
        this.xmlOutput = new XMLOutputBuffer(this.childElementMark);

        parserBuffer();
    }

    /**
     * Appends up to {@code lineCount} blocks of {@link #BLOCK_SIZE} chars to the
     * content buffer. (Despite the name, the unit is a block, not a text line.)
     *
     * @param lineCount maximum number of blocks to read
     * @return the size of the last block read, or a negative value when the end
     *         of the file was reached
     * @throws IOException if reading from the file fails
     */
    private int readSomeLine(int lineCount) throws IOException {
        char[] buffer = new char[BLOCK_SIZE];
        int blocks = 0;
        int index = 0;
        /* be careful of the sequence of the boolean calculation:
           the counter must be tested first so that no block is read and then dropped */
        while (blocks++ < lineCount && (index = this.bufferedReader.read(buffer)) > 0) {
            xmlContentBuffer.append(buffer, 0, index);
        }
        return index;
    }

    /**
     * Moves every record that is certainly complete from the content buffer to
     * the output. Everything from the last record start tag onwards stays behind
     * because that record may still be truncated.
     */
    private void parserBuffer() {
        if (this.childElementMark.length() == 0) {
            return; // no record tag known: nothing can be split safely
        }

        int lastIndex = this.xmlContentBuffer.lastIndexOf(this.childElementMark);
        if (lastIndex > 0) {
            this.xmlOutput.append(this.xmlContentBuffer.substring(0, lastIndex));
            this.xmlContentBuffer.delete(0, lastIndex);
        }
    }

    /**
     * Called once at end of file: hands the held-back tail (the final record) to
     * the output, without the closing root tag, which is re-added per chunk.
     */
    private void flushRemainder() {
        String closingRoot = this.rootElemetMark.replaceFirst("<", "</");

        int end = -1;
        if (closingRoot.length() > 0) {
            end = this.xmlContentBuffer.lastIndexOf(closingRoot);
        }
        if (end < 0) {
            end = this.xmlContentBuffer.length();
        }

        this.xmlOutput.append(this.xmlContentBuffer.substring(0, end));
        this.xmlContentBuffer.setLength(0);
    }

    /**
     * Returns the next chunk of records, wrapped in the root element so that the
     * chunk is a well-formed document on its own (without the XML declaration;
     * see {@link #getCharSetTitle()}).
     *
     * <p>A chunk normally holds at least {@value #DEFAULT_MAX_OUTPUT_SIZE}
     * records; it may hold more, and the last chunk may hold fewer.</p>
     *
     * @return the next chunk, or {@code null} when all records have been
     *         returned or no record element was detected in the file
     * @throws IOException if reading from the file fails
     */
    public StringBuffer popDividedDataAfterParser() throws IOException {
        if (this.childElementMark.length() == 0) {
            return null;
        }

        while (!this.endOfInput && this.xmlOutput.getItemCount() < DEFAULT_MAX_OUTPUT_SIZE) {
            if (readSomeLine(DEFAULT_BLOCK_COUNT) < 0) {
                // End of file: the held-back tail is final, so emit it as well.
                this.endOfInput = true;
                flushRemainder();
                closeQuietly();
            } else {
                parserBuffer();
            }
        }

        if (this.xmlOutput.getItemCount() == 0) {
            return null;
        }

        StringBuffer returnSB = this.xmlOutput.getXmlOutput();
        this.xmlOutput.clearBuffer();
        return returnSB.insert(0, this.rootElemetMark)
                .append(this.rootElemetMark.replaceFirst("<", "</"));
    }

    /**
     * Small demo: prints the number of {@code <ring>} records in the first chunk
     * of the file, followed by the chunk itself.
     *
     * @param args optional; {@code args[0]} is the path of the XML file
     * @throws Exception if the file cannot be read
     */
    public static void main(String args[]) throws Exception {
        String str = args.length > 0 ? args[0] : "F:/ringInfoXML/ringTime.xml";

        XMLBufferTool xmlb = new XMLBufferTool(str);
        try {
            StringBuffer s = xmlb.popDividedDataAfterParser();
            if (s == null) {
                System.out.println(0);
                return;
            }

            int i = 0;
            Matcher m = Pattern.compile("<ring>").matcher(s);
            while (m.find()) {
                i++;
            }

            System.out.println(i);
            System.out.println(s);
        } finally {
            xmlb.close();
        }
    }

    /** Collects record text and keeps count of the record start tags it contains. */
    private static class XMLOutputBuffer {
        private StringBuffer xmlOutput;
        private int itemCount;

        /** Literal start tag that marks the beginning of one record. */
        private final String mark;

        XMLOutputBuffer(String markStr) {
            this.mark = markStr == null ? "" : markStr;
            this.xmlOutput = new StringBuffer();
            this.itemCount = 0;
        }

        /**
         * Appends text and counts the record start tags inside it.
         *
         * @param str text to append; {@code null} and "" are ignored
         */
        public void append(String str) {
            if (str == null || "".equals(str)) {
                return;
            }
            this.xmlOutput.append(str);

            if (this.mark.length() == 0) {
                return; // an empty mark would "match" at every position
            }
            int from = 0;
            while ((from = str.indexOf(this.mark, from)) >= 0) {
                this.itemCount++;
                from += this.mark.length();
            }
        }

        /** Starts a fresh buffer; a buffer handed out earlier is left untouched. */
        public void clearBuffer() {
            this.xmlOutput = new StringBuffer();
            this.itemCount = 0;
        }

        public StringBuffer getXmlOutput() {
            return xmlOutput;
        }

        public int getItemCount() {
            return itemCount;
        }
    }

}
|
|