You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.
|
|
|
|
#encoding:utf-8
|
|
|
|
|
# Crawler configuration file
|
|
|
|
|
##########################################
|
|
|
|
|
# The filtering rules are designed as follows:
|
|
|
|
|
# If any [exclusion keyword] appears, the entry is not shown.
|
|
|
|
|
# The entry must contain all of the [mandatory keywords]; otherwise it is not shown.
|
|
|
|
|
# When the two conditions above are met, the entry is shown if any [inclusion keyword] appears.
|
|
|
|
|
# If there is no [inclusion keyword], the entry is not shown.
|
|
|
|
|
##########################################
|
|
|
|
|
|
|
|
|
|
# [Exclusion keywords] If any one of these appears, the entry is filtered out. Highest priority.
|
|
|
|
|
# Exclusion keywords: a single match rejects the entry, and this check
# outranks the other two keyword lists.
# "停止招生" means "enrollment suspended".
exclude = ["停止招生"]
|
|
|
|
|
|
|
|
|
|
# [Mandatory keywords] These must appear, otherwise the entry is not shown. Medium priority.
|
|
|
|
|
# Mandatory keywords: every keyword listed here has to be present,
# otherwise the entry is not shown (medium priority).
focus_include = ["2020"]
|
|
|
|
|
|
|
|
|
|
# [Inclusion keywords] If any one of these appears, the entry is shown. Low priority.
|
|
|
|
|
# Inclusion keywords: provided the exclusion and mandatory checks pass,
# one match from this list is enough for the entry to be shown
# (lowest priority).
include = [
    "计算机",  # computer
    "软件",  # software
    "电子信息",  # electronic information
    "人工智能",  # artificial intelligence
    "网络",  # network
    "大数据",  # big data
]
|
|
|
|
|
|
|
|
|
|
# How many pages to search
|
|
|
|
|
# Number of pages the crawler should look through.
# NOTE(review): whether this is a page count or the index of the last page
# depends on the consuming code -- confirm there.
page_end = 50
|
|
|
|
|
|
|
|
|
|
# Time interval between crawler requests. Never set this too low, or expect your IP to get banned.
|
|
|
|
|
# Pause between crawler requests; keep it generous to avoid an IP ban.
# NOTE(review): the unit is presumably seconds -- confirm against the code
# that consumes this value.
interval = 3
|
|
|
|
|
|
|
|
|
|
# Cache file location
|
|
|
|
|
# Where the cache file lives.
# NOTE(review): the leading "/" makes this an absolute path at the
# filesystem root unless the consumer prepends a directory to it -- confirm
# against the code that reads this setting before changing it.
cache_file = "/cache.dat"
|
|
|
|
|
|
|
|
|
|
# Whether to enable debug reports
|
|
|
|
|
# Debug-report switch; off by default.
debug_info = False
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # This module only holds settings. When it is executed directly, tell
    # the user to run spider.py instead.
    print("execute spider.py!!!\n NOT config.py")
|