我想编写自己的自定义 Scrapy 链接提取器来提取链接。
Scrapy 文档说它有两个内置的链接提取器。
解决方法
下面是一个自定义链接提取器的示例:
class RCP_RegexLinkExtractor(SgmlLinkExtractor):
    """Link extractor that pulls links out of the response text with ``linkre``."""

    def _extract_links(self, response_text, response_url, response_encoding,
                       base_url=None):
        """Return the unique links found in ``response_text``.

        Args:
            response_text: Text that is searched with ``linkre.findall``.
            response_url: URL of the response; used to resolve relative links
                when no explicit ``base_url`` is given.
            response_encoding: Encoding used to decode every matched URL and
                link text before it is cleaned.
            base_url: Optional base for resolving links. When ``None`` it is
                ``self.base_url`` joined onto ``response_url`` if
                ``self.base_url`` is truthy, otherwise ``response_url``.

        Returns:
            A list of ``Link`` objects, one per distinct ``(url, text)`` pair.
            The pairs are collected in a set, so the order is unspecified.
        """
        if base_url is None:
            if self.base_url:
                base_url = urljoin(response_url, self.base_url)
            else:
                base_url = response_url

        def to_absolute_url(raw_url):
            # Decode, tidy the link, drop HTML entities, then resolve it
            # against the base URL.
            decoded = raw_url.decode(response_encoding)
            return urljoin(base_url, remove_entities(clean_link(decoded)))

        def to_plain_text(raw_text):
            # Decode, strip markup and escape characters, trim whitespace.
            decoded = raw_text.decode(response_encoding)
            return replace_escape_chars(remove_tags(decoded)).strip()

        # Each match is unpacked as a 3-tuple; the middle item is not used.
        matches = linkre.findall(response_text)
        unique_pairs = {
            (to_absolute_url(raw_url), to_plain_text(raw_text))
            for raw_url, _, raw_text in matches
        }
        return [Link(url, text) for url, text in unique_pairs]
# Crawl rules: follow only the 2012 Romney-vs-Obama state poll pages on
# realclearpolitics.com and hand them to the named spider callbacks.
rules = (
    Rule(
        RCP_RegexLinkExtractor(
            # A single pattern string (the parentheses do not make a tuple).
            # Regex explanation:
            #   [a-z]{2} - matches a two character state abbreviation
            #   [a-z]+   - matches a state name (one or more letters)
            #   [0-9]{4} - matches a 4 number unique webpage identifier
            allow=(r"epolls/2012/president/[a-z]{2}/[a-z]+_romney_vs_obama-[0-9]{4}\.html"),
            allow_domains=('realclearpolitics.com',),
        ),
        callback='parseStatePolls',
        # `follow` is deliberately not passed; the original note marks
        # None as its default.
        process_links='processLinks',
        process_request='processRequest',
    ),
)