需求:
1、写一个动态正则;
2、只要写出日志的Schema就可以获取到日志的正则。
package com.donews.util
import java.util.regex.Pattern
import scala.collection.mutable.ArrayBuffer
/**
* Created by yuhui on 2016/8/5.
*/
/***
例子 : www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" China 22 Beijing
第一版本 "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" $country $region $city"
例子 : www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
第二版本 "$domain $ip - $remote_user [$timestamp] \"$http_url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""
例子 : www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" "http://www.donews.com/media/201408/2834414.shtm" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"
第三版本 $domain $http_x_forwarded_for - $remote_user [$timestamp] "$http_url" "$url" $status $body_bytes_sent "$http_referer" "$http_user_agent" "$e_ip" "$country" "$region" "$city"
*/
/**
 * Builds a regular expression from an nginx-style log-format string (the
 * "schema") and uses it to split log lines into one captured group per
 * space-separated token of the format.
 *
 * Typical use: assign `regex = tran(format)` once, then call `lineToGroup`
 * for every log line.
 */
object DynamicRegex {

  /** Log-format string set by `main`; kept as a public var for existing callers. */
  var cmd = ""

  /** Regex source read by `lineToGroup`; assign the result of `tran` before parsing. */
  var regex = ""

  /** Matches a token made only of non-word characters, such as the literal "-". */
  private val PunctuationOnly = Pattern.compile("^(\\W+)$")

  /** Matches one format variable: a dollar sign followed by word characters. */
  private val Variable = Pattern.compile("\\$\\w+")

  /** Characters that carry a special meaning inside a regex character class. */
  private val ClassMeta = Set('\\', '[', ']', '^', '-', '&')

  /** Last compiled pattern together with the source it was compiled from. */
  @volatile private var compiledCache: (String, Pattern) = ("", Pattern.compile(""))

  /**
   * Translates a log-format string into an anchored regular expression.
   *
   * The format is split on single spaces and every token becomes one capture
   * group; the groups are joined with `\s`. An empty token (produced by two
   * consecutive spaces) contributes no group, so the regex then expects two
   * whitespace characters at that position.
   *
   * Token rules:
   *  - only non-word characters (e.g. "-")      : `(\W+)`
   *  - starts with a variable                   : `([\S]+)`; several variables in
   *    one token are joined by a one-character class built from each separator
   *  - literal text around variable(s)          : escaped literals with `.+` in
   *    place of every variable, e.g. `(\[.+\])`
   *  - no variable at all (e.g. "GET")          : the escaped literal itself
   *
   * @param cmd the log-format string
   * @return the regex source, starting with `^` and ending with `$`
   */
  def tran(cmd: String): String =
    cmd.split(" ").map(tokenPattern).mkString("^", "\\s", "$")

  /** Builds the capture group for a single format token (see `tran` for the rules). */
  private def tokenPattern(token: String): String =
    if (token.isEmpty) ""
    else if (PunctuationOnly.matcher(token).find()) "(\\W+)"
    else {
      val literals = literalSegments(token)
      if (literals.length == 1) {
        // No variable in the token: match it literally instead of failing.
        "(" + escape(token) + ")"
      } else if (literals.head.isEmpty) {
        // Token starts with a variable. Only the gaps between variables are
        // used; trailing literal text is absorbed by the final [\S]+.
        val separators = literals.slice(1, literals.length - 1)
        separators
          .map(sep => "[\\S]+" + separatorClass(sep))
          .mkString("(", "", "[\\S]+)")
      } else {
        // Literal prefix present: keep every literal piece and let ".+" stand
        // for each variable so values containing spaces still match.
        literals.map(escape).mkString("(", ".+", ")")
      }
    }

  /**
   * Splits a token into the literal pieces that surround its variables.
   * A token with n variables yields n + 1 pieces, some of which may be empty.
   */
  private def literalSegments(token: String): Vector[String] = {
    val matcher = Variable.matcher(token)
    val pieces = Vector.newBuilder[String]
    var last = 0
    while (matcher.find()) {
      pieces += token.substring(last, matcher.start())
      last = matcher.end()
    }
    pieces += token.substring(last)
    pieces.result()
  }

  /**
   * Wraps a separator in a character class, escaping class metacharacters.
   * The class matches any single character of the separator; an empty
   * separator yields an empty string.
   */
  private def separatorClass(separator: String): String =
    if (separator.isEmpty) ""
    else {
      val sb = new StringBuilder("[")
      separator.foreach { ch =>
        if (ClassMeta(ch)) sb.append('\\')
        sb.append(ch)
      }
      sb.append(']').toString
    }

  /**
   * Escapes literal text for use inside a regular expression.
   *
   * ASCII punctuation is prefixed with a backslash, except the double quote,
   * which is left as is. Letters, digits, underscore and non-ASCII characters
   * are copied unchanged, because a backslash in front of a letter or digit
   * would turn it into a regex construct such as `\d`.
   *
   * @param original the literal text
   * @return the escaped text
   */
  def escape(original: String): String = {
    val out = new StringBuilder(original.length * 2)
    original.foreach { ch =>
      if (needsEscape(ch)) out.append('\\')
      out.append(ch)
    }
    out.toString
  }

  /** True for ASCII characters that are neither word characters nor the double quote. */
  private def needsEscape(ch: Char): Boolean =
    ch < 128 && ch != '"' && ch != '_' && !Character.isLetterOrDigit(ch)

  /** Returns the compiled form of `regex`, recompiling only when its source changed. */
  private def compiledRegex: Pattern = {
    val source = regex
    val cached = compiledCache
    if (cached._1 == source) cached._2
    else {
      val fresh = Pattern.compile(source)
      compiledCache = (source, fresh)
      fresh
    }
  }

  /**
   * Applies the current `regex` to a log line and collects the captured groups.
   *
   * @param line one log line
   * @return the groups of every match, in order; empty when the line does not match
   */
  def lineToGroup(line: String): ArrayBuffer[String] = {
    val groups = ArrayBuffer[String]()
    val matcher = compiledRegex.matcher(line)
    while (matcher.find()) {
      for (i <- 1 to matcher.groupCount()) {
        groups += matcher.group(i)
      }
    }
    groups
  }

  /** Demo: prints the regex generated for a sample format, then each group of a sample line. */
  def main(args: Array[String]): Unit = {
    cmd = "$domain $http_x_forwarded_for - $remote_user [$timestamp] \"$http_url\" \"$url\" $status $body_bytes_sent \"$http_referer\" \"$http_user_agent\" \"$e_ip\" \"$country\" \"$region\" \"$city\""
    regex = tran(cmd)
    println(regex)
    val log = "www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] \"GET /media/201408/2834414.shtm HTTP/1.1\" \"http://www.donews.com/media/201408/2834414.shtm\" 200 11296 \"-\" \"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\" \"-\" \"China\" \"22\" \"Beijing\""
    lineToGroup(log).foreach(x => println(x))
  }
}
输出结果:
^([\S]+)\s([\S]+)\s(\W+)\s([\S]+)\s(\[.+\])\s(".+")\s(".+")\s([\S]+)\s([\S]+)\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")\s(".+")$ www.donews.com 123.125.71.72 - - [28/Nov/2016:11:08:50 +0800] "GET /media/201408/2834414.shtm HTTP/1.1" "http://www.donews.com/media/201408/2834414.shtm" 200 11296 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" "-" "China" "22" "Beijing"