使用goquery抓取天气的demo。数据量有点多。目前按省份存储天气数据。存储到csv文件中。
- package main
- import (
- "code.google.com/p/mahonia"
- "encoding/csv"
- "fmt"
- "github.com/PuerkitoBio/goquery"
- "net/http"
- "os"
- "strings"
- "time"
- )
// logLevel is the verbosity threshold of the built-in logger.
type logLevel int

const (
	levelDebug logLevel = iota // print everything, including Debug output
	levelInfo                  // suppress Debug output
)

// logger is a minimal replacement for the external logging package this file
// used to reference but no longer imports. It writes to standard error and
// offers only the two methods the scraper calls.
type logger struct {
	Level logLevel
}

// Debug prints args on one line, but only when the logger is at debug level.
func (l logger) Debug(args ...interface{}) {
	if l.Level > levelDebug {
		return
	}
	fmt.Fprintln(os.Stderr, append([]interface{}{"[DEBUG]"}, args...)...)
}

// Log prints args on one line unconditionally; the scraper uses it to report
// errors.
func (l logger) Log(args ...interface{}) {
	fmt.Fprintln(os.Stderr, append([]interface{}{"[LOG]"}, args...)...)
}

// log is the package-wide logger, configured at debug level.
var log = logger{Level: levelDebug}
// Scraper settings.
const (
	// YEAR is the calendar year encoded in the output CSV file names; it is
	// presumably also the year whose history pages are meant to be fetched.
	YEAR = 2013

	// SleepTime is the pause inserted after each monthly page request,
	// expressed in milliseconds.
	SleepTime = 100
)
- func main() {
- sc,cc := GetCity()
- var weatherInfoAll []*WeaterInfo
- for key,value := range sc {
- filePath := fmt.Sprintf("%d%s.csv",YEAR,key)
- _,err := os.Stat(filePath)
- if err == nil {
- continue
- }
- weatherInfoAll = make([]*WeaterInfo,100000)
- for _,city := range value {
- name := cc[city]
- log.Debug("get ",key,city)
- client := &http.Client{}
- weatherInfoYear := GetWeather(client,city,name)
- weatherInfoAll = append(weatherInfoAll,weatherInfoYear...)
- }
- SaveToCSV(key,weatherInfoAll)
- }
- }
- //返回数据为省份=>城市名 城市名=>拼音.html
- func GetCity() (sc map[string][]string,cc map[string]string) {
- url := "http://www.tianqihoubao.com/lishi/"
- request,err := http.NewRequest("GET",url,nil)
- if err != nil {
- log.Log(err)
- return
- }
- request.Header.Add("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML,like Gecko) Ubuntu Chromium/39.0.2171.65 Chrome/39.0.2171.65 Safari/537.36")
- request.Header.Add("referer","http://www.tianqihoubao.com/")
- resp,err := http.DefaultClient.Do(request)
- if err != nil {
- log.Log(err)
- return
- }
- document,err := goquery.NewDocumentFromResponse(resp)
- if err != nil {
- log.Log(err)
- return
- }
- gbk := mahonia.NewDecoder("gbk")
- sc = make(map[string][]string)
- cc = make(map[string]string)
- document.Find(".citychk").Find("dl").Each(func(index int,s *goquery.Selection) {
- province := gbk.ConvertString(s.Find("dt").Find("b").Text())
- citys := make([]string,20)
- s.Find("dd").Find("a").Each(func(index int,se *goquery.Selection) {
- uri,exists := se.Attr("href")
- if !exists {
- return
- }
- name := gbk.ConvertString(se.Text())
- uri = strings.Replace(uri,".html","",-1)
- citys = append(citys,name)
- cc[name] = uri
- })
- sc[province] = citys
- })
- return
- }
// WeaterInfo is one scraped weather record for a city on a single day.
// Every field holds the text as taken from the page; the fields are written,
// in this order, as the columns of the CSV output.
type WeaterInfo struct {
	Province string // province name
	City     string // city name
	Date     string // date of the record
	Info     string // weather condition
	Temp     string // temperature
	Wind     string // wind force and direction
}
- func GetWeather(client *http.Client,province,name string) []*WeaterInfo {
- baseUrl := fmt.Sprintf("http://www.tianqihoubao.com%s/month/%%s",name)
- weaterInfoYear := make([]*WeaterInfo,380)
- for i := 1; i <= 12; i++ {
- url := fmt.Sprintf(baseUrl,fmt.Sprintf("%d%02d.html",i))
- weaterInfos := GetWeatherInfo(client,url)
- weaterInfoYear = append(weaterInfoYear,weaterInfos...)
- time.Sleep(time.Millisecond * SleepTime)
- }
- return weaterInfoYear
- }
- func GetWeatherInfo(client *http.Client,url string) (weaterInfos []*WeaterInfo) {
- request,err := client.Do(request)
- if err != nil {
- log.Log(err)
- return
- }
- document,err := goquery.NewDocumentFromResponse(resp)
- if err != nil {
- log.Log(err)
- return
- }
- gbk := mahonia.NewDecoder("gbk")
- weaterInfos = make([]*WeaterInfo,31)
- document.Find("#content").Find("tbody").Find("tr").Each(func(index int,s *goquery.Selection) {
- //排除第一个
- if index == 0 {
- return
- }
- var date,info,temp,wind string
- s.Find("td").Each(func(index int,se *goquery.Selection) {
- if index == 0 {
- date = gbk.ConvertString(se.Find("a").Text())
- }
- if index == 1 {
- info = gbk.ConvertString(se.Text())
- }
- if index == 2 {
- temp = gbk.ConvertString(se.Text())
- }
- if index == 3 {
- wind = gbk.ConvertString(se.Text())
- }
- })
- weatherInfo := &WeaterInfo{
- Province: province,City: city,Date: date,Info: info,Temp: temp,Wind: wind,}
- weaterInfos = append(weaterInfos,weatherInfo)
- })
- return
- }
- func SaveToCSV(file string,weatherInfos []*WeaterInfo) (err error) {
- filePath := fmt.Sprintf("%d%s.csv",file)
- _,err = os.Stat(filePath)
- if err == nil {
- return
- }
- f,err := os.Create(filePath)
- if err != nil {
- log.Log(err)
- return
- }
- defer f.Close()
- f.WriteString("\xEF\xBB\xBF") //UTF-8
- w := csv.NewWriter(f)
- w.Write([]string{"省份","城市","日期","天气状况","气温","风力风向"})
- for i,weatherInfo := range weatherInfos {
- if i%1000 == 0 {
- w.Flush() //刷入文件
- }
- strs := []string{TrimSpace(weatherInfo.Province),TrimSpace(weatherInfo.City),TrimSpace(weatherInfo.Date),TrimSpace(weatherInfo.Info),TrimSpace(weatherInfo.Temp),TrimSpace(weatherInfo.Wind)}
- w.Write(strs)
- }
- w.Flush()
- return
- }
// whitespaceStripper deletes line feeds and ASCII spaces in a single pass.
var whitespaceStripper = strings.NewReplacer("\n", "", " ", "")

// TrimSpace returns value with every line feed and every ASCII space removed,
// wherever they occur in the string (not only at the ends). Other whitespace,
// such as tabs, is left untouched.
func TrimSpace(value string) string {
	return whitespaceStripper.Replace(value)
}
日志库删掉了,因为看起来不是很好。当然,也不是说这段代码写得好,只是临时写的东西。