public class Crawler extends Object
| 限定符和类型 | 字段和说明 |
|---|---|
protected DBManager |
dbManager |
protected long |
executeInterval |
protected Executor |
executor |
protected Fetcher |
fetcher |
protected CrawlDatums |
forcedSeeds |
static org.slf4j.Logger |
LOG |
protected int |
maxExecuteCount |
protected NextFilter |
nextFilter |
protected boolean |
resumable |
static int |
RUNNING |
protected CrawlDatums |
seeds |
protected int |
status |
static int |
STOPED |
protected int |
threads |
protected int |
topN |
| 限定符和类型 | 方法和说明 |
|---|---|
void |
addSeed(CrawlDatum datum)
等同于 addSeed(datum, false)
|
void |
addSeed(CrawlDatum datum,
boolean force)
添加种子任务
|
void |
addSeed(CrawlDatums datums)
等同于 addSeed(datums,false)
|
void |
addSeed(CrawlDatums datums,
boolean force)
添加种子集合
|
void |
addSeed(Links links)
与addSeed(CrawlDatums datums)类似
|
void |
addSeed(Links links,
boolean force)
与addSeed(CrawlDatums datums, boolean force) 类似
|
void |
addSeed(Links links,
String type)
与addSeed(CrawlDatums datums)类似
|
void |
addSeed(Links links,
String type,
boolean force)
与addSeed(CrawlDatums datums, boolean force) 类似
|
void |
addSeed(String url)
与addSeed(CrawlDatum datum)类似
|
void |
addSeed(String url,
boolean force)
与addSeed(CrawlDatum datum, boolean force)类似
|
void |
addSeed(String url,
String type)
与addSeed(CrawlDatum datum)类似
|
void |
addSeed(String url,
String type,
boolean force)
与addSeed(CrawlDatum datum, boolean force)类似
|
DBManager |
getDBManager()
返回任务管理器
|
long |
getExecuteInterval()
获取执行间隔
|
Executor |
getExecutor()
获取执行器
|
int |
getMaxExecuteCount()
获取每个爬取任务的最大执行次数
|
NextFilter |
getNextFilter() |
int |
getThreads()
返回线程数
|
int |
getTopN()
返回每次迭代爬取的网页数量上限
|
protected void |
inject() |
void |
injectForcedSeeds() |
boolean |
isResumable()
返回是否断点爬取
|
void |
setDBManager(DBManager dbManager)
设置任务管理器
|
void |
setExecuteInterval(long executeInterval)
设置执行间隔
|
void |
setExecutor(Executor executor)
设置执行器
|
void |
setMaxExecuteCount(int maxExecuteCount)
设置每个爬取任务的最大执行次数,爬取或解析失败都会导致执行失败。
|
void |
setNextFilter(NextFilter nextFilter) |
void |
setResumable(boolean resumable)
设置是否断点爬取
|
void |
setThreads(int threads)
设置线程数
|
void |
setTopN(int topN)
设置每次迭代爬取的网页数量上限
|
void |
start(int depth)
开始爬取,迭代次数为depth
|
void |
stop()
停止爬虫
|
String |
toString() |
public static final org.slf4j.Logger LOG
protected int status
public static final int RUNNING
public static final int STOPED
protected boolean resumable
protected int threads
protected int topN
protected long executeInterval
protected CrawlDatums seeds
protected CrawlDatums forcedSeeds
protected Fetcher fetcher
protected int maxExecuteCount
protected Executor executor
protected NextFilter nextFilter
protected DBManager dbManager
public void start(int depth)
throws Exception
depth - 迭代次数
Exception - 异常
public void stop()
public void addSeed(CrawlDatum datum, boolean force)
datum - 种子任务
force - 如果添加的种子是已爬取的任务,当force为true时,会强制注入种子,当force为false时,会忽略该种子
public void addSeed(CrawlDatum datum)
datum - 种子任务
public void addSeed(CrawlDatums datums, boolean force)
datums - 种子集合
force - 如果添加的种子是已爬取的任务,当force为true时,会强制注入种子,当force为false时,会忽略该种子
public void addSeed(CrawlDatums datums)
datums - 种子任务集合
public void addSeed(Links links, String type, boolean force)
links - 种子URL集合
type - 种子的type标识信息
force - 是否强制注入
public void addSeed(Links links, boolean force)
links - 种子URL集合
force - 是否强制注入
public void addSeed(Links links, String type)
links - 种子URL集合
type - 种子的type标识信息
public void addSeed(Links links)
links - 种子URL集合
public void addSeed(String url, String type, boolean force)
url - 种子URL
type - 种子的type标识信息
force - 是否强制注入
public void addSeed(String url, boolean force)
url - 种子URL
force - 是否强制注入
public void addSeed(String url, String type)
type - 种子的type标识信息
url - 种子URL
public void addSeed(String url)
url - 种子URL
public boolean isResumable()
public void setResumable(boolean resumable)
resumable - 是否断点爬取
public int getThreads()
public void setThreads(int threads)
threads - 线程数
public int getMaxExecuteCount()
public void setMaxExecuteCount(int maxExecuteCount)
maxExecuteCount - 每个爬取任务的最大执行次数
public Executor getExecutor()
public void setExecutor(Executor executor)
executor - 执行器
public int getTopN()
public void setTopN(int topN)
topN - 每次迭代爬取的网页数量上限
public long getExecuteInterval()
public void setExecuteInterval(long executeInterval)
executeInterval - 执行间隔
public DBManager getDBManager()
public void setDBManager(DBManager dbManager)
dbManager - 任务管理器
public NextFilter getNextFilter()
public void setNextFilter(NextFilter nextFilter)
Copyright © 2017. All Rights Reserved.