这几天手上有个活,解析xml,众所周知xml的解析方法有:
- DOM
- SAX
- linq to xml
- plinq
测试用xml和生成代码
/// <summary>
/// Generates the benchmark input file "VeryHugeXmlFile.xml" (relative path,
/// UTF-8, indented). The document has a single Root element containing
/// N Person elements, each with Id, Name, Sex and Age children.
/// </summary>
static void CreateFile()
{
    // Number of Person records to emit.
    const int N = 5000000;
    Random rand = new Random();
    using (var writer = new XmlTextWriter("VeryHugeXmlFile.xml", Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;

        writer.WriteStartDocument();
        writer.WriteStartElement("Root");
        for (int count = 1; count <= N; count++)
        {
            writer.WriteStartElement("Person");
            // Id is the sequential 1-based record number.
            writer.WriteElementString("Id", count.ToString());
            // Name is a random non-negative integer rendered as text.
            writer.WriteElementString("Name", rand.Next().ToString());
            // rand.Next(0, 2) yields 0 or 1, so the two values are equally likely.
            writer.WriteElementString("Sex", rand.Next(0, 2) == 0 ? "男" : "女");
            // The upper bound is exclusive, so Age is in 1..100.
            writer.WriteElementString("Age", rand.Next(1, 101).ToString());
            writer.WriteEndElement();
        }
        writer.WriteEndElement();
        writer.WriteEndDocument();
    }
}
<?xml version="1.0" encoding="utf-8"?>
<Root>
  <Person>
    <Id>1</Id>
    <Name>897639886</Name>
    <Sex>女</Sex>
    <Age>80</Age>
  </Person>
  <Person>
    <Id>2</Id>
    <Name>2012162696</Name>
    <Sex>女</Sex>
    <Age>60</Age>
  </Person>
  <Person>
    ...
测试代码
统计时间(只是粗略统计了一下运行时间)
/// <summary>
/// Runs <paramref name="way"/> once against <paramref name="file"/> and writes
/// the elapsed wall-clock time in milliseconds to the console.
/// This is a single rough measurement, not a statistically sound benchmark.
/// </summary>
/// <param name="way">The parsing routine to time; it receives <paramref name="file"/>.</param>
/// <param name="file">Path passed through unchanged to <paramref name="way"/>.</param>
static void Watch(Action<string> way, string file)
{
    Stopwatch watch = Stopwatch.StartNew();
    way(file);
    watch.Stop();
    Console.WriteLine(watch.ElapsedMilliseconds);
}
DOM
/// <summary>
/// DOM approach: loads the whole document into an <see cref="XmlDocument"/>
/// and prints how many Person elements match the filter.
/// </summary>
/// <param name="file">Path of the XML file to load.</param>
static void DomWay(string file)
{
    // XPath form of the article's first benchmark filter:
    //   Id > 5000 && sex == "男" && age > 15 && age < 50
    // Replace this expression to test a different condition.
    const string xpath = "/Root/Person[Id > 5000 and Sex = '男' and Age > 15 and Age < 50]";

    XmlDocument doc = new XmlDocument();
    doc.Load(file);

    Console.WriteLine(doc.SelectNodes(xpath).Count);
}
SAX
/// <summary>
/// Streaming approach: walks the file forward-only with an
/// <see cref="XmlTextReader"/> and prints how many Person elements match the
/// filter. Only the current Person's four values are kept, not the document.
/// </summary>
/// <param name="file">Path of the XML file to read.</param>
static void SaxWay(string file)
{
    using (XmlTextReader reader = new XmlTextReader(file))
    {
        // Drop whitespace nodes so traversal does not depend on whether the
        // file is indented; the reader then moves element to element.
        reader.WhitespaceHandling = WhitespaceHandling.None;

        int count = 0;
        while (reader.Read())
        {
            if (reader.NodeType != XmlNodeType.Element || reader.Name != "Person")
            {
                continue;
            }

            int? id = null;
            int? name = null;
            string sex = null;
            int? age = null;

            if (!reader.IsEmptyElement)
            {
                // Step onto the first child (or onto </Person> if there is none).
                reader.Read();

                // Each ReadElementContentAs*/Skip call consumes one child and
                // leaves the reader on the next sibling or on </Person>,
                // so no extra Read() is needed between children.
                while (reader.NodeType == XmlNodeType.Element)
                {
                    switch (reader.Name)
                    {
                        case "Id":
                            id = reader.ReadElementContentAsInt();
                            break;
                        case "Name":
                            name = reader.ReadElementContentAsInt();
                            break;
                        case "Sex":
                            sex = reader.ReadElementContentAsString();
                            break;
                        case "Age":
                            age = reader.ReadElementContentAsInt();
                            break;
                        default:
                            // Ignore unknown children instead of misreading them.
                            reader.Skip();
                            break;
                    }
                }
            }

            // Count only complete records, as the original listing did.
            if (id.HasValue && name.HasValue && sex != null && age.HasValue)
            {
                // The article's first benchmark filter; replace to test another condition.
                if (id.Value > 5000 && sex == "男" && age.Value > 15 && age.Value < 50)
                {
                    count++;
                }
            }
        }

        Console.WriteLine(count);
    }
}
Linq to Xml
/// <summary>
/// LINQ to XML approach: loads the whole document into an
/// <see cref="XElement"/> tree and prints how many Person elements match the
/// filter.
/// </summary>
/// <param name="file">Path of the XML file to load.</param>
static void LinqWay(string file)
{
    var root = XElement.Load(file);

    // The nullable conversions return null for a missing child instead of
    // throwing; comparisons against null are false, so incomplete records
    // are simply not counted.
    // The where clause is the article's first benchmark filter; replace it
    // to test a different condition.
    var person = from p in root.Elements("Person")
                 let id = (int?)p.Element("Id")
                 let sex = (string)p.Element("Sex")
                 let age = (int?)p.Element("Age")
                 where id > 5000 && sex == "男" && age > 15 && age < 50
                 select id;

    Console.WriteLine(person.Count());
}
PLinq to Xml
/// <summary>
/// PLINQ approach: loads the whole document into an <see cref="XElement"/>
/// tree, then evaluates the filter over the Person elements in parallel and
/// prints how many match. Only the filtering is parallel; loading is not.
/// </summary>
/// <param name="file">Path of the XML file to load.</param>
static void PLinqWay(string file)
{
    var root = XElement.Load(file);

    // The nullable conversions return null for a missing child instead of
    // throwing; comparisons against null are false, so incomplete records
    // are simply not counted.
    // The where clause is the article's first benchmark filter; replace it
    // to test a different condition.
    var person = from p in root.Elements("Person").AsParallel()
                 let id = (int?)p.Element("Id")
                 let sex = (string)p.Element("Sex")
                 let age = (int?)p.Element("Age")
                 where id > 5000 && sex == "男" && age > 15 && age < 50
                 select id;

    Console.WriteLine(person.Count());
}
统计结果
在6核8G内存机器上,测试程序设置为x64和release模式,在xml查询结果相同的情况下取运行时间(ms),没有详细采集cpu和内存数据
测试了两种过滤条件,区别是第二种多加了一个素数判断(IsPrimeInt)。
| 方式 | Id > 5000 && sex == "男" && age > 15 && age < 50 | Id > 5000 && sex == "男" && age > 15 && age < 50 && IsPrimeInt(name) |
| --- | --- | --- |
| sax | 13857 | 40010 |
| linq | 27336 | 53760 |
| plinq | 24550 | 28846 |
| dom | 31737 | 0(未采集) |
由于dom方式所用的xpath表达式不支持嵌入自定义函数(如IsPrimeInt),所以第二个测试没有采集结果。
小结
sax:速度优先,内存占用少,但是代码复杂度高。
linq:速度较sax慢,但是代码优雅,维护容易
plinq:同上,在非计算密集型模式中,不比linq和sax模式好多少。但是在计算密集下,后来居上
内存方面仅是肉眼观察了任务管理器,sax基本内存曲线为水平线,而linq&plinq在load的时候分配内存,可能其内部也是用了dom。
仓促行文,其中必有不实之处,望各位劳神指教。