Batch-submitting crawled pages to a search engine with the producer-consumer pattern
Source: 程序員人生 | Published: 2015-04-29 07:54:56
1. Crawler: crawler4j
   References: http://blog.csdn.net/longzuyuan/article/details/8894912
   http://blog.csdn.net/sadfishsc/article/details/20614105
   Reference book: 《自己動手寫網(wǎng)絡(luò)爬蟲》 (Write Your Own Web Crawler), by 羅剛
2. Search server: Solr 4.10
3. Multithreading
   Reference book: Java Concurrency in Practice
   Related JDK API: http://www.yq1012.com/api/, in particular the blocking queue class BlockingQueue<E>
Task: crawl job postings from some Chinese recruitment sites. When the crawler threads have accumulated a certain number of pages, or after a certain amount of time, the pages are submitted to Solr in one batch (which improves indexing performance), and crawling then continues in a loop, giving better resource utilization. The idea: N crawler threads act as producers, and the thread that submits to the search engine acts as the consumer.
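The handoff between the two sides is just a shared blocking queue: crawler threads put parsed documents on the queue, and a single consumer thread takes them off and commits them in batches. A stripped-down sketch of that structure, with illustrative class and field names only (the real classes follow below):

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class HandoffSketch {
    // Shared, thread-safe queue between the producers and the consumer.
    static final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();

    public static void main(String[] args) {
        // Consumer: take() blocks until a producer has put something on the queue.
        new Thread(new Runnable() {
            public void run() {
                try {
                    while (true) {
                        String doc = queue.take();
                        System.out.println("commit " + doc);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        }).start();

        // Producers: several crawler threads put their results on the queue.
        for (int i = 0; i < 5; i++) {
            final int id = i;
            new Thread(new Runnable() {
                public void run() {
                    queue.add("page-" + id);
                }
            }).start();
        }
    }
}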
Crawler code (excerpt):
package crawler;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class Crawler4jTest {
    public static void main(String[] args) {
        try {
            String crawlStorageFolder = "./tmp";
            int numberOfCrawlers = 5;
            CrawlConfig config = new CrawlConfig();
            // Politeness: wait 1000 ms between requests so we send at most one request per second.
            config.setPolitenessDelay(1000);
            // Crawl depth, counted from the seed URL: the seed A is 1, a page B found on A is 2, a page C found on B is 3.
            config.setMaxDepthOfCrawling(5);
            // Maximum number of pages to fetch; the default (-1) means no limit.
            config.setMaxPagesToFetch(50);
            // If a proxy server is needed:
            // config.setProxyHost("proxyserver.example.com");
            // config.setProxyPort(8080);
            // If the proxy server requires authentication:
            // config.setProxyUsername(username); config.setProxyPassword(password);
            /*
             * This flag makes the crawl resumable (it can be restarted after being interrupted).
             * Note: with resumable crawling enabled, you must manually delete the contents of the
             * storage folder before starting a fresh crawl.
             */
            config.setResumableCrawling(false);
            config.setCrawlStorageFolder(crawlStorageFolder);

            PageFetcher pageFetcher = new PageFetcher(config);
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
            controller.addSeed("http://www.lagou.com");

            // Start the consumer thread that batches documents and commits them to Solr.
            CommitConsumer consumer = new CommitConsumer();
            new Thread(consumer).start();

            // Start the producer side: numberOfCrawlers crawler threads.
            controller.start(WomiCrawler.class, numberOfCrawlers);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package crawler;

import java.util.regex.Pattern;

import org.apache.solr.common.SolrInputDocument;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class WomiCrawler extends WebCrawler {

    // Skip binary and media resources.
    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
            + "|png|tiff?|mid|mp2|mp3|mp4"
            + "|wav|avi|mov|mpeg|ram|m4v|pdf"
            + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    // Only pages under this prefix (job detail pages) are visited.
    private final static String URL_PREFIX = "http://www.lagou.com/jobs/";

    /**
     * shouldVisit decides whether a discovered URL should be crawled (visited).
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches() && href.startsWith(URL_PREFIX);
    }

    /**
     * visit processes the page the URL points to; the Page parameter wraps all of the page's data.
     */
    @Override
    public void visit(Page page) {
        try {
            SolrInputDocument doc = new SolrInputDocument();
            int docid = page.getWebURL().getDocid();
            String url = page.getWebURL().getURL();
            String parentUrl = page.getWebURL().getParentUrl();
            String anchor = page.getWebURL().getAnchor();
            doc.addField("id", docid + "");
            doc.addField("url", url + "");
            doc.addField("host", url + "");
            doc.addField("title", anchor + "");
            doc.addField("author", anchor + "");
            System.out.println("Docid: " + docid);
            System.out.println("URL: " + url);
            System.out.println("Parent page: " + parentUrl);
            System.out.println("Anchor: " + anchor);
            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
                String text = htmlParseData.getText();
                doc.addField("content", text);
            }
            // Producer side: put the document on the shared blocking queue for the consumer.
            Lock lock = Lock.getInstance();
            lock.lstDocument.add(doc);
            lock.num++;
            System.out.println("Pages crawled: num == " + lock.num);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Shared object lock / queue code (excerpt):
package crawler;

import java.util.concurrent.LinkedBlockingQueue;

import org.apache.solr.common.SolrInputDocument;

public class Lock {

    // volatile is required for double-checked locking to be safe.
    private static volatile Lock lock;

    public static Lock getInstance() {
        if (lock == null) {
            synchronized (Lock.class) {
                if (lock == null) {
                    lock = new Lock();
                }
            }
        }
        return lock;
    }

    private Lock() {}

    // Number of pages crawled.
    public int num = 0;

    // Number of commits to Solr.
    public int commitNum = 0;

    // Shared document queue for the producer-consumer handoff.
    public LinkedBlockingQueue<SolrInputDocument> lstDocument = new LinkedBlockingQueue<SolrInputDocument>();
}
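Incidentally, the double-checked locking above is only correct when the field is declared volatile (as in the listing). A simpler way to get the same lazy, thread-safe singleton in Java is the initialization-on-demand holder idiom; a minimal sketch of how the singleton part of Lock could be written that way (the counters and queue are omitted here):

public class Lock {

    // The holder class is not loaded until getInstance() is first called,
    // and class initialization is thread-safe per the JVM specification.
    private static class Holder {
        private static final Lock INSTANCE = new Lock();
    }

    public static Lock getInstance() {
        return Holder.INSTANCE;
    }

    private Lock() {}
}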
Consumer code (excerpt):
package crawler;

import java.util.LinkedList;
import java.util.List;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.common.SolrInputDocument;

import search.solr.IndexerUtil;

public class CommitConsumer implements Runnable {

    private SolrServer server = IndexerUtil.getHttpSolrServer("crawl");

    // Local batch buffer; flushed to Solr every 5 documents.
    private List<SolrInputDocument> list = new LinkedList<SolrInputDocument>();

    private int commit = 0;

    public void run() {
        try {
            SolrInputDocument doc = null;
            // take() blocks until a producer has put a document on the queue.
            while ((doc = Lock.getInstance().lstDocument.take()) != null) {
                list.add(doc);
                if (list.size() == 5) {
                    commit++;
                    server.add(list);
                    server.commit();
                    list.clear();
                    System.out.println("Commits: " + commit);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
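The consumer above flushes only when the batch reaches 5 documents, while the description at the top also mentions committing after a certain amount of time. A minimal sketch of a size-or-time flush, reusing the same Lock queue and IndexerUtil helper from the listings above; the 5-document and 10-second thresholds are illustrative only:

package crawler;

import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.common.SolrInputDocument;

import search.solr.IndexerUtil;

public class TimedCommitConsumer implements Runnable {

    private static final int BATCH_SIZE = 5;        // flush when this many docs are buffered...
    private static final long MAX_WAIT_MS = 10000;  // ...or when this much time has passed

    private final SolrServer server = IndexerUtil.getHttpSolrServer("crawl");
    private final List<SolrInputDocument> batch = new LinkedList<SolrInputDocument>();

    public void run() {
        long lastFlush = System.currentTimeMillis();
        try {
            while (true) {
                // Wait up to one second for the next document instead of blocking forever,
                // so the time-based flush below still runs when the queue is idle.
                SolrInputDocument doc =
                        Lock.getInstance().lstDocument.poll(1, TimeUnit.SECONDS);
                if (doc != null) {
                    batch.add(doc);
                }
                boolean sizeReached = batch.size() >= BATCH_SIZE;
                boolean timeReached = !batch.isEmpty()
                        && System.currentTimeMillis() - lastFlush >= MAX_WAIT_MS;
                if (sizeReached || timeReached) {
                    server.add(batch);
                    server.commit();
                    batch.clear();
                    lastFlush = System.currentTimeMillis();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}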