crawler4j 爬爬知多少-白红宇

crawler4j 爬爬知多少

阅读量：6682 次

发布时间：2019-06-25

本文共 7460 字，大约阅读时间需要 24 分钟。

　　1. Crawler是什么？

　　crawler4j是一个开源的java爬虫类库，可以用来构建多线程的web爬虫来抓取页面内容。

　　2. 如何获取Crawler？

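　　crawler4j的官方地址在 https://github.com/yasserg/crawler4j ，目前版本为4.1。如果你使用Maven，可以在pom.xml中加入下面的依赖；也可以直接从官方地址下载jar包。

<dependency>
    <groupId>edu.uci.ics</groupId>
    <artifactId>crawler4j</artifactId>
    <version>4.1</version>
</dependency>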

　　3. Crawler怎么用？

　　crawler4j的使用分为两个步骤：一是实现一个继承自edu.uci.ics.crawler4j.crawler.WebCrawler的爬虫类；二是通过CrawlController调用实现的爬虫类。

package com.favccxx.favsoft.favcrawler;import java.util.Set;import java.util.regex.Pattern;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import edu.uci.ics.crawler4j.crawler.Page;import edu.uci.ics.crawler4j.crawler.WebCrawler;import edu.uci.ics.crawler4j.parser.HtmlParseData;import edu.uci.ics.crawler4j.url.WebURL;public class FavWebCrawler extends WebCrawler {	private static final Logger logger = LoggerFactory.getLogger(WebCrawler.class);	private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg" + "|png|mp3|mp3|zip|gz))$");	@Override	public boolean shouldVisit(Page referringPage, WebURL url) {		String href = url.getURL().toLowerCase();		return !FILTERS.matcher(href).matches() && href.startsWith("http://www.oschina.net/");	}	/**	 * 处理抓取到的页面时，调用该方法	 	 */	@Override	public void visit(Page page) {		int docid = page.getWebURL().getDocid();		String url = page.getWebURL().getURL();		String domain = page.getWebURL().getDomain();		String path = page.getWebURL().getPath();		String subDomain = page.getWebURL().getSubDomain();		String parentUrl = page.getWebURL().getParentUrl();		String anchor = page.getWebURL().getAnchor();		logger.debug("Docid: {}", docid);		logger.info("URL: {}", url);		logger.debug("Domain: '{}'", domain);		logger.debug("Sub-domain: '{}'", subDomain);		logger.debug("Path: '{}'", path);		logger.debug("Parent page: {}", parentUrl);		logger.debug("Anchor text: {}", anchor);		if (page.getParseData() instanceof HtmlParseData) {			HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();			String text = htmlParseData.getText();			String html = htmlParseData.getHtml();			Set
    
      links = htmlParseData.getOutgoingUrls();			logger.debug("Text length: " + text.length());			logger.debug("Html length: " + html.length());			logger.debug("Number of outgoing links: " + links.size());		}	}}

package com.favccxx.favsoft.favcrawler;import java.util.Set;import java.util.regex.Pattern;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import edu.uci.ics.crawler4j.crawler.CrawlConfig;import edu.uci.ics.crawler4j.crawler.CrawlController;import edu.uci.ics.crawler4j.crawler.Page;import edu.uci.ics.crawler4j.crawler.WebCrawler;import edu.uci.ics.crawler4j.fetcher.PageFetcher;import edu.uci.ics.crawler4j.parser.HtmlParseData;import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;import edu.uci.ics.crawler4j.url.WebURL;public class MyCrawler extends WebCrawler {		private static final Logger logger = LoggerFactory.getLogger(WebCrawler.class);		private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"  + "|png|mp3|mp3|zip|gz))$");		 @Override     public boolean shouldVisit(Page referringPage, WebURL url) {         String href = url.getURL().toLowerCase();         return !FILTERS.matcher(href).matches()                && href.startsWith("http://www.oschina.net/");     }     /**      * This function is called when a page is fetched and ready      * to be processed by your program.      
*/     @Override     public void visit(Page page) {    	     	 int docid = page.getWebURL().getDocid(); 	    String url = page.getWebURL().getURL(); 	    String domain = page.getWebURL().getDomain(); 	    String path = page.getWebURL().getPath(); 	    String subDomain = page.getWebURL().getSubDomain(); 	    String parentUrl = page.getWebURL().getParentUrl(); 	    String anchor = page.getWebURL().getAnchor();// 	   page.getWebURL().getTag() 	     	    System.out.println("********************************");// 	    // 	    System.out.println("Docid: {}" + docid);// 	    System.out.println("URL: {}"+ url);// 	    System.out.println("Domain: '{}'"+ domain);// 	   	System.out.println("Sub-domain: '{}'"+ subDomain);// 	  	System.out.println("Path: '{}'"+ path);// 	  	System.out.println("Parent page: {}"+ parentUrl);// 		System.out.println("Anchor text: {}"+ anchor); 	    logger.debug("Docid: {}", docid); 	    logger.info("URL: {}", url); 	    logger.debug("Domain: '{}'", domain); 	    logger.debug("Sub-domain: '{}'", subDomain); 	    logger.debug("Path: '{}'", path); 	    logger.debug("Parent page: {}", parentUrl); 	    logger.debug("Anchor text: {}", anchor);    	     	 //         String url = page.getWebURL().getURL();         System.out.println("URL: " + url);         if (page.getParseData() instanceof HtmlParseData) {             HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();             String text = htmlParseData.getText();             String html = htmlParseData.getHtml();             Set
    
      links = htmlParseData.getOutgoingUrls();                          System.out.println("--------------------------");//             System.out.println(text);             System.out.println("--------------------------");             System.out.println("Text length: " + text.length());             System.out.println("Html length: " + html.length());             System.out.println("Number of outgoing links: " + links.size());         }    }         public static void main(String[] args) throws Exception{    	 String crawlStorageFolder = "/data/crawl/root";         int numberOfCrawlers = 7;         CrawlConfig config = new CrawlConfig();         config.setCrawlStorageFolder(crawlStorageFolder);         /*          * Instantiate the controller for this crawl.          */         PageFetcher pageFetcher = new PageFetcher(config);         RobotstxtConfig robotstxtConfig = new RobotstxtConfig();         RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);         CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);         /*          * For each crawl, you need to add some seed urls. These are the first          * URLs that are fetched and then the crawler starts following links          * which are found in these pages          */         controller.addSeed("http://www.oschina.net/");//         controller.addSeed("http://www.ics.uci.edu/~welling/");//         controller.addSeed("http://www.ics.uci.edu/");         /*          * Start the crawl. This is a blocking operation, meaning that your code          * will reach the line after this only when crawling is finished.          */         controller.start(MyCrawler.class, numberOfCrawlers);    }}

　　4. Crawler常用配置

　　crawler4j的配置文件都位于edu.uci.ics.crawler4j.crawler.CrawlConfig中，各配置属性的详细说明如下。

crawlStorageFolder：临时存储抓取来的文件的地方，相当于文件中转站。

resumableCrawling：是否恢复上一次被异常中止/崩溃的抓取任务的开关，默认不开启。开启该开关后需要持久化抓取状态，因此会降低抓取的效率。

maxDepthOfCrawling：抓取的最大深度。默认为-1，即无限深度。

maxPagesToFetch：抓取的最大页面数。默认为-1，即无限抓取。

userAgentString：抓取时向web服务器发送的用户代理（User-Agent）。默认为“crawler4j (https://github.com/yasserg/crawler4j/)”。

politenessDelay：（同一主机的两个请求间的）延迟毫秒数。默认为200。

includeHttpsPages：是否包含Https页面。默认包含。

includeBinaryContentInCrawling：是否包含二进制文件，如image，audio等。默认为不抓取。

maxConnectionsPerHost：每个主机的最大连接数，默认为100。

maxTotalConnections：所有主机合计的最大连接数，默认为100。

socketTimeout：socket超时毫秒数，默认为20000。

connectionTimeout：连接超时毫秒数，默认为30000。

maxOutgoingLinksToFollow：每个页面的最大外链数，默认为5000。

maxDownloadSize：每个页面的最大下载容量，默认1048576字节（1MB），超过该大小的页面不会被抓取。

followRedirects：是否抓取重定向的页面，默认抓取。

proxyHost：代理主机地址，仅在使用代理上网时使用。

proxyPort：代理端口号。

proxyUsername：代理用户名。

proxyPassword：代理密码。

authInfos：授权用户信息。

转载地址：http://qkxao.baihongyu.com/

你可能感兴趣的文章