本文共 7460 字,大约阅读时间需要 24 分钟。
1. Crawler是什么?
crawler4j是一个开源的java爬虫类库,可以用来构建多线程的web爬虫来抓取页面内容。
2. 如何获取Crawler?
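crawler4j的官方地址在 https://github.com/yasserg/crawler4j ,目前版本为4.1。如果你使用Maven,可以通过下面的pom依赖引入:<dependency><groupId>edu.uci.ics</groupId><artifactId>crawler4j</artifactId><version>4.1</version></dependency>;如不使用Maven,也可以直接下载对应的jar包。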
3. Crawler怎么用?
crawler4j的使用分为两个步骤:一是实现一个继承自edu.uci.ics.crawler4j.crawler.WebCrawler的爬虫类;二是通过CrawlController调用实现的爬虫类。
package com.favccxx.favsoft.favcrawler;import java.util.Set;import java.util.regex.Pattern;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import edu.uci.ics.crawler4j.crawler.Page;import edu.uci.ics.crawler4j.crawler.WebCrawler;import edu.uci.ics.crawler4j.parser.HtmlParseData;import edu.uci.ics.crawler4j.url.WebURL;public class FavWebCrawler extends WebCrawler { private static final Logger logger = LoggerFactory.getLogger(WebCrawler.class); private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg" + "|png|mp3|mp3|zip|gz))$"); @Override public boolean shouldVisit(Page referringPage, WebURL url) { String href = url.getURL().toLowerCase(); return !FILTERS.matcher(href).matches() && href.startsWith("http://www.oschina.net/"); } /** * 处理抓取到的页面时,调用该方法 */ @Override public void visit(Page page) { int docid = page.getWebURL().getDocid(); String url = page.getWebURL().getURL(); String domain = page.getWebURL().getDomain(); String path = page.getWebURL().getPath(); String subDomain = page.getWebURL().getSubDomain(); String parentUrl = page.getWebURL().getParentUrl(); String anchor = page.getWebURL().getAnchor(); logger.debug("Docid: {}", docid); logger.info("URL: {}", url); logger.debug("Domain: '{}'", domain); logger.debug("Sub-domain: '{}'", subDomain); logger.debug("Path: '{}'", path); logger.debug("Parent page: {}", parentUrl); logger.debug("Anchor text: {}", anchor); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Setlinks = htmlParseData.getOutgoingUrls(); logger.debug("Text length: " + text.length()); logger.debug("Html length: " + html.length()); logger.debug("Number of outgoing links: " + links.size()); } }}
package com.favccxx.favsoft.favcrawler;import java.util.Set;import java.util.regex.Pattern;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import edu.uci.ics.crawler4j.crawler.CrawlConfig;import edu.uci.ics.crawler4j.crawler.CrawlController;import edu.uci.ics.crawler4j.crawler.Page;import edu.uci.ics.crawler4j.crawler.WebCrawler;import edu.uci.ics.crawler4j.fetcher.PageFetcher;import edu.uci.ics.crawler4j.parser.HtmlParseData;import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;import edu.uci.ics.crawler4j.url.WebURL;public class MyCrawler extends WebCrawler { private static final Logger logger = LoggerFactory.getLogger(WebCrawler.class); private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg" + "|png|mp3|mp3|zip|gz))$"); @Override public boolean shouldVisit(Page referringPage, WebURL url) { String href = url.getURL().toLowerCase(); return !FILTERS.matcher(href).matches() && href.startsWith("http://www.oschina.net/"); } /** * This function is called when a page is fetched and ready * to be processed by your program. 
*/ @Override public void visit(Page page) { int docid = page.getWebURL().getDocid(); String url = page.getWebURL().getURL(); String domain = page.getWebURL().getDomain(); String path = page.getWebURL().getPath(); String subDomain = page.getWebURL().getSubDomain(); String parentUrl = page.getWebURL().getParentUrl(); String anchor = page.getWebURL().getAnchor();// page.getWebURL().getTag() System.out.println("********************************");// // System.out.println("Docid: {}" + docid);// System.out.println("URL: {}"+ url);// System.out.println("Domain: '{}'"+ domain);// System.out.println("Sub-domain: '{}'"+ subDomain);// System.out.println("Path: '{}'"+ path);// System.out.println("Parent page: {}"+ parentUrl);// System.out.println("Anchor text: {}"+ anchor); logger.debug("Docid: {}", docid); logger.info("URL: {}", url); logger.debug("Domain: '{}'", domain); logger.debug("Sub-domain: '{}'", subDomain); logger.debug("Path: '{}'", path); logger.debug("Parent page: {}", parentUrl); logger.debug("Anchor text: {}", anchor); // String url = page.getWebURL().getURL(); System.out.println("URL: " + url); if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); Setlinks = htmlParseData.getOutgoingUrls(); System.out.println("--------------------------");// System.out.println(text); System.out.println("--------------------------"); System.out.println("Text length: " + text.length()); System.out.println("Html length: " + html.length()); System.out.println("Number of outgoing links: " + links.size()); } } public static void main(String[] args) throws Exception{ String crawlStorageFolder = "/data/crawl/root"; int numberOfCrawlers = 7; CrawlConfig config = new CrawlConfig(); config.setCrawlStorageFolder(crawlStorageFolder); /* * Instantiate the controller for this crawl. 
*/ PageFetcher pageFetcher = new PageFetcher(config); RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer); /* * For each crawl, you need to add some seed urls. These are the first * URLs that are fetched and then the crawler starts following links * which are found in these pages */ controller.addSeed("http://www.oschina.net/");// controller.addSeed("http://www.ics.uci.edu/~welling/");// controller.addSeed("http://www.ics.uci.edu/"); /* * Start the crawl. This is a blocking operation, meaning that your code * will reach the line after this only when crawling is finished. */ controller.start(MyCrawler.class, numberOfCrawlers); }}
4. Crawler常用配置
crawler4j的配置项都定义在edu.uci.ics.crawler4j.crawler.CrawlConfig类中,各配置属性的详细说明如下。
crawlStorageFolder:临时存储抓取来的文件的地方,相当于文件中转站。 resumableCrawling:是否开启可恢复抓取,即在上一次抓取异常停止/崩溃后从中断处继续抓取,默认不开启。如果开启该开关,毫无疑问会降低抓取的效率。 maxDepthOfCrawling:抓取的最大深度。默认为-1,即无限深度。 maxPagesToFetch:抓取的最大页面数。默认为-1,即无限抓取。 userAgentString:抓取web服务器时使用的用户代理。默认为“crawler4j (https://github.com/yasserg/crawler4j/)”。 politenessDelay:(同一主机的两个请求间的)延迟毫秒数。默认为200。 includeHttpsPages:是否包含Https页面。默认包含。 includeBinaryContentInCrawling:是否包含二进制文件,如image,audio等。默认为不抓取。 maxConnectionsPerHost:每个主机的最大连接数,默认为100。 maxTotalConnections:所有主机合计的最大连接数,默认为100。 socketTimeout:socket超时毫秒数,默认为20000。 connectionTimeout:连接超时毫秒数,默认为30000。 maxOutgoingLinksToFollow:每个页面的最大外链数,默认为5000。 maxDownloadSize:每个页面的最大下载容量,默认为1048576字节(即1MB),超过该大小的页面不会被抓取。 followRedirects:是否抓取重定向的页面,默认抓取。 proxyHost:代理主机地址,仅在使用代理上网时使用。 proxyPort:代理端口号。 proxyUsername:代理用户名。 proxyPassword:代理密码。 authInfos:授权用户信息。
转载地址:http://qkxao.baihongyu.com/