- 解析xml数据
- import java.util.ArrayList;
- import java.util.List;
- import org.xml.sax.Attributes;
- import org.xml.sax.ContentHandler;
- import org.xml.sax.Locator;
- import org.xml.sax.SAXException;
- import com.mongodb.DBObject;
- /*
- * @author
- * @time 2015-11-8
- * 主要是是implements ContentHandler,主要实现接口ContentHandler中的startDocument()、endDocument()、startElement()、endElement()
- * 另外自定义方法writeToMongoDB()、storeDBMongo()
- *
- */
- public class MyContentHandler implements ContentHandler {
- private StringBuffer buf;
- private String ctitle;
- private String cns;
- private String cid;
- private String ctext;
- private String ctimestamp;
- private int idnumber=0;
- List<Data> listdata=new ArrayList<Data>();
- List list=new ArrayList();
- @Override
- public void setDocumentLocator(Locator locator) {
- // TODO Auto-generated method stub
- }
- @Override
- public void startDocument() throws SAXException {
- // TODO Auto-generated method stub
- buf=new StringBuffer();
- System.out.println("*******解析开始*******");
- }
- @Override
- public void endDocument() throws SAXException {
- // TODO Auto-generated method stub
- try {
- writeToMongoDB();
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- System.out.println("*******解析结束*******");
- }
- //把数据导入MongoDB数据库中
- private void writeToMongoDB() throws Exception {
- // TODO Auto-generated method stub
- List<DBObject> dblist=new ArrayList<DBObject>();
- for(Data d:listdata){
- dblist.add(BSONT.mapToBSON(d.toJSONMap()));
- }
- MongoDBT.writeListToMongo("IP",27017,"databaseName","collectionName",dblist);
- }
- @Override
- public void startPrefixMapping(String prefix,String uri)
- throws SAXException {
- // TODO Auto-generated method stub
- }
- @Override
- public void endPrefixMapping(String prefix) throws SAXException {
- // TODO Auto-generated method stub
- }
- @Override
- public void startElement(String uri,String localName,String qName,Attributes attributes) throws SAXException {
- // TODO Auto-generated method stub
- if(qName=="page"){
- idnumber=1;
- }
- if(qName=="title"){
- ctitle=qName;
- }else if(qName=="ns"){
- cns=qName;
- }else if(qName=="id"&&idnumber==1){
- cid=qName;
- idnumber=0;
- }else if(qName=="timestamp"){
- ctimestamp=qName;
- }else if(qName=="text"){
- ctext=qName;
- }
- }
- @Override
- public void endElement(String uri,String qName)
- throws SAXException {
- // TODO Auto-generated method stub
- if(ctitle==qName){
- String sss=buf.toString();
- ctitle="";
- list.add(sss);
- buf.setLength(0);
- }else if(cns==qName){
- cns="";
- String sss=buf.toString();
- list.add(sss);
- buf.setLength(0);
- }else if(cid==qName){
- cid="";
- String sss=buf.toString();
- list.add(sss);
- buf.setLength(0);
- }else if(ctimestamp==qName){
- ctimestamp="";
- String sss=buf.toString();
- list.add(sss);
- buf.setLength(0);
- }else if(ctext==qName){
- ctext="";
- String sss=buf.toString();
- list.add(sss);
- buf.setLength(0);
- //有些sss中虽然有重定向标记,但没有“[[”和“]]”,那么就会出现String的index不在范围内的问题
- if((sss.toUpperCase().contains("#REDIRECT")||sss.contains("#重定向"))&&sss.contains("[[")&&sss.contains("]]")){
- int i=sss.indexOf("[[");
- int j=sss.indexOf("]]");
- String s=sss.substring(i+2,j);
- list.add(s);
- list.add("redirect");
- }else{
- list.add("");
- list.add("article");
- }
- }
- if(qName=="page"){
- storeDBMongo(list);
- }
- }
- private void storeDBMongo(List lt) {
- // TODO Auto-generated method stub
- for(int i=0;i<list.size();i++){
- System.out.println(lt.get(i));
- }
- try {
- Data data=new Data();
- data.setTitle(list.get(0).toString());
- data.setNamespace(list.get(1).toString());
- data.setId(list.get(2).toString());
- data.setLastEsited(list.get(3).toString());
- data.setMarkup(list.get(4).toString());
- data.setTarget(list.get(5).toString());
- data.setType(list.get(6).toString());
- listdata.add(data);
- if(listdata.size()>=300){
- writeToMongoDB();
- listdata.clear();
- }
- list.clear();
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- @Override
- public void characters(char[] ch,int start,int length)
- throws SAXException {
- // TODO Auto-generated method stub
- if(ctitle=="title"){
- buf.append(new String(ch,start,length));
- }else if(cns=="ns"){
- buf.append(new String(ch,length));
- }else if(cid=="id"){
- buf.append(new String(ch,length));
- list.add(new String(ch,length));
- }else if(ctimestamp=="timestamp"){
- buf.append(new String(ch,length));
- }else if(ctext=="text"){
- buf.append(new String(ch,length));
- }
- }
- @Override
- public void ignorableWhitespace(char[] ch,int length)
- throws SAXException {
- // TODO Auto-generated method stub
- }
- @Override
- public void processingInstruction(String target,String data)
- throws SAXException {
- // TODO Auto-generated method stub
- }
- @Override
- public void skippedEntity(String name) throws SAXException {
- // TODO Auto-generated method stub
- }
- }
以下是自定义类 Data、JSONT,以及错误处理类 MyErrorHandler:
- import java.util.HashMap;
- import java.util.Map;
/**
 * Plain value holder for one parsed page. All properties are strings and default
 * to {@code null} until their setter is called.
 */
public class Data {
    private String id;
    private String namespace;
    private String type;
    private String title;
    private String markup;
    private String lastEsited;
    private String target;

    /** @return the page id */
    public String getId() {
        return this.id;
    }

    /** @param id the page id */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the namespace value */
    public String getNamespace() {
        return this.namespace;
    }

    /** @param namespace the namespace value */
    public void setNamespace(String namespace) {
        this.namespace = namespace;
    }

    /** @return the record type */
    public String getType() {
        return this.type;
    }

    /** @param type the record type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the page title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the page markup */
    public String getMarkup() {
        return this.markup;
    }

    /** @param markup the page markup */
    public void setMarkup(String markup) {
        this.markup = markup;
    }

    /** @return the value stored under the {@code lastEsited} property */
    public String getLastEsited() {
        return this.lastEsited;
    }

    /** @param lastEsited the value to store under the {@code lastEsited} property */
    public void setLastEsited(String lastEsited) {
        this.lastEsited = lastEsited;
    }

    /** @return the target value */
    public String getTarget() {
        return this.target;
    }

    /** @param target the target value */
    public void setTarget(String target) {
        this.target = target;
    }

    /**
     * Copies the seven properties into a new map keyed by property name.
     * Properties that were never set appear with a {@code null} value.
     *
     * @return a new, independent map on every call
     */
    public Map<String,Object> toJSONMap(){
        Map<String,Object> fields = new HashMap<String,Object>();
        fields.put("id", id);
        fields.put("namespace", namespace);
        fields.put("type", type);
        fields.put("title", title);
        fields.put("markup", markup);
        fields.put("lastEsited", lastEsited);
        fields.put("target", target);
        return fields;
    }
}
- /*
- * NextMap-Crawler Module
- *
- * Copyright (C) 2002-2014,Institute of Geographic Sciences and Natural Resources Research,* Chinese Academy of Sciences
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- */
- import java.io.IOException;
- import java.io.StringWriter;
- import java.util.List;
- import java.util.Map;
- import com.fasterxml.jackson.databind.ObjectMapper;
- /**
- *
- * @author zhuhaichuan
- * @date 2015-11-8
- *
- *
- */
- public class JSONT {
- public static String mapToJSONString(Map map) {
- StringWriter sw = new StringWriter();
- try {
- ObjectMapper mapper = new ObjectMapper();
- mapper.writeValue(sw,map);
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return sw.toString();
- }
- /**
- *
- * @param list
- * @return
- */
- public static String listToJSONString(List list) {
- StringWriter sw = new StringWriter();
- try {
- ObjectMapper mapper = new ObjectMapper();
- mapper.writeValue(sw,list);
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return sw.toString();
- }
- /**
- *
- * @param list
- * @return
- */
- public static String beanToJSONString(Object bean) {
- StringWriter sw = new StringWriter();
- try {
- ObjectMapper mapper = new ObjectMapper();
- mapper.writeValue(sw,bean);
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return sw.toString();
- }
- /**
- *
- * @param jsonstr
- * @return
- */
- public static Map jsonToMap(String jsonstr) {
- Map map = null;
- try {
- ObjectMapper mapper = new ObjectMapper();
- map = mapper.readValue(jsonstr,Map.class);
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return map;
- }
- /**
- *
- * @param jsonstr
- * @return
- */
- public static List jsonToList(String jsonstr) {
- List list = null;
- try {
- ObjectMapper mapper = new ObjectMapper();
- list = mapper.readValue(jsonstr,List.class);
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return list;
- }
- }
- import org.xml.sax.ErrorHandler;
- import org.xml.sax.SAXException;
- import org.xml.sax.SAXParseException;
/**
 * SAX {@link ErrorHandler} that prints every reported problem to standard output
 * (banner, line number, column number, message, closing rule) and never throws.
 */
public class MyErrorHandler implements ErrorHandler {

    /** Prints a parser warning. */
    @Override
    public void warning(SAXParseException exception) throws SAXException {
        report("*******WARNING******", "exception信息:", "********************", exception);
    }

    /** Prints a recoverable parser error. */
    @Override
    public void error(SAXParseException exception) throws SAXException {
        report("******* ERROR ******", "exception信息:", "********************", exception);
    }

    /** Prints a fatal parser error. */
    @Override
    public void fatalError(SAXParseException exception) throws SAXException {
        report("******** FATAL ERROR ********", "exception信息", "*****************************", exception);
    }

    /**
     * Writes the five-line report shared by all three callbacks.
     *
     * @param banner       first line of the report
     * @param messageLabel text printed directly before the exception message
     * @param rule         last line of the report
     * @param problem      the exception supplied by the parser
     */
    private static void report(String banner, String messageLabel, String rule, SAXParseException problem) {
        System.out.println(banner);
        System.out.println("行号:" + problem.getLineNumber());
        System.out.println("列号:" + problem.getColumnNumber());
        System.out.println(messageLabel + problem.getMessage());
        System.out.println(rule);
    }
}
自定义MongoDBT类
- import java.util.ArrayList;
- import java.util.List;
- import com.mongodb.DB;
- import com.mongodb.DBCollection;
- import com.mongodb.DBObject;
- import com.mongodb.Mongo;
- public class MongoDBT {
- public static void writeListToMongo(String ip,int port,String dbname,String collname,List<DBObject> list) throws Exception{
- Mongo mongo=new Mongo(ip,port);
- DB db=mongo.getDB(dbname);
- DBCollection collection=db.getCollection(collname);
- List<DBObject> dblist=new ArrayList<DBObject>();
- for(int i=0;i<list.size();i++){
- dblist.add(list.get(i));
- }
- collection.insert(dblist);
- mongo.close();
- }
- }