first commit
Commit 98aa364647
README.md (new file)
@@ -0,0 +1,11 @@
# 小木虫 Postgraduate Adjustment (考研调剂) Info Spider

```git clone https://git.mmuaa.com/Kidultff/redistribute-spider.git```

Set the options in config.py, following the comments in the file.

Then run ```python spider.py```

The posts you want are printed to the screen, filtered the way you asked for.

You can also run ```python spider.py > output.txt```

to save the results to a text file instead.

![Screenshot](https://www.mmuaa.com/wp-content/uploads/image/20200331/1585639857878391.png "Screenshot")
config.py (new file)
@@ -0,0 +1,33 @@

```python
# encoding: utf-8
# Spider configuration file.

##########################################
# Filtering rules:
# 1. If any [exclude] word appears, the post is hidden.
# 2. Every [focus_include] word must appear, or the post is hidden.
# 3. If rules 1-2 pass and any [include] word appears, the post is shown.
# 4. A post that matches no [include] word is hidden.
##########################################

# [exclude] words. Any single match filters the post out. Highest priority.
exclude = ['停止招生']  # '停止招生' = "admissions closed"

# [focus_include] words. Every word must appear, or the post is hidden. Medium priority.
focus_include = ['2020']

# [include] words. Any single match shows the post. Lowest priority.
include = ['计算机', '软件', '电子信息', '人工智能', '网络', '大数据']

# How many listing pages to crawl.
page_end = 50

# Seconds between requests. Do not set this too low, or you will just sit there getting your IP banned.
interval = 3

# Cache file name, relative to the script directory.
cache_file = '/cache.dat'

# Whether to print debug output.
debug_info = False

if __name__ == "__main__":
    print("execute spider.py!!!\n NOT config.py")
```
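To make the precedence concrete, here is a standalone sketch of the three-tier rule set with the default word lists above (illustration only, not part of the commit; the actual implementation is `fitter_word` in spider.py below):

```python
# Minimal sketch of the filter precedence, assuming the default lists above.
exclude = ['停止招生']
focus_include = ['2020']
include = ['计算机', '软件', '人工智能']

def matches(text):
    if any(w in text for w in exclude):            # rule 1: any exclude word hides the post
        return False
    if not all(w in text for w in focus_include):  # rule 2: every must-word is required
        return False
    return any(w in text for w in include)         # rules 3-4: need at least one include word

assert not matches('2020调剂 计算机 停止招生')   # an exclude word always wins
assert not matches('2019调剂 计算机')            # missing the must-word '2020'
assert matches('2020调剂 人工智能方向')          # passes all three tiers
```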
spider.py (new file)
@@ -0,0 +1,108 @@

```python
# encoding: utf-8
from time import time, sleep
import sys
from config import *

try:
    import requests as r
    from bs4 import BeautifulSoup as bs
except ImportError:
    print("must install:\nrequests bs4\nuse pip install requests bs4")
    sys.exit(1)

# Resolve the cache path relative to the script directory.
cache_file = sys.path[0] + cache_file

class xmcSpider:
    def __init__(self):
        self.__last = 0  # timestamp of the last request, used for throttling
        self.__saved_urls = self.__load_cache()

    def __load_cache(self):
        saved_urls = {}  # cache dict: url -> info
        try:
            with open(cache_file, 'r') as f:  # read cache
                cache = f.readlines()
        except FileNotFoundError:  # first run: no cache file yet
            return saved_urls
        for item in cache:
            if "#|#" not in item:
                continue
            _item = item.replace("\n", "").split("#|#")
            saved_urls[_item[0]] = _item[1]
        return saved_urls
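
    # Each cache line stores one post as "<url>#|#<info>", for example
    # (post id and text are illustrative):
    # http://muchong.com/t-12345678-1#|#某大学 2020考研调剂 计算机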

    def GetHtmlObj(self, href, type=0):  # type: 0 => listing page, 1 => post page
        if debug_info:
            print("[debug] Getting %s" % href)
        try:
            if type == 0:
                url = "http://muchong.com/f-430-%s-typeid-2304" % href  # adjustment-board listing, page `href`
            else:
                url = "http://muchong.com/%s" % href  # a single post
            # Throttle: wait until at least `interval` seconds have passed since the last request.
            while int(time()) - self.__last < interval:
                sleep(1)
            self.__last = int(time())
            html = r.get(url).text
            obj = bs(html, "html.parser")
            return obj
        except Exception:
            if debug_info:
                print("[ERROR] GetHtmlObj(%s, %d)" % (href, type))
            return False
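
    # For example, GetHtmlObj("3") fetches http://muchong.com/f-430-3-typeid-2304
    # (listing page 3), while GetHtmlObj("t-12345678-1", 1) would fetch a single
    # post (post id illustrative).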

    def GetIndex(self, obj):
        try:
            index = obj.find_all('th', class_='thread-name')[1:]  # [1:] skips the sticky [竞价] (bidding) entry
            urls = []
            for item in index:
                url = item.find_all('a', class_="a_subject")[0]['href']
                if "t-" in url:  # keep only links to posts
                    urls.append(url)
            return urls
        except Exception:
            if debug_info:
                print("[ERROR] GetIndex()")
            return False

    def GetItem(self, url):
        try:
            save_urls_key = "http://muchong.com" + url
            if save_urls_key not in self.__saved_urls:  # cache miss: fetch the post
                obj = self.GetHtmlObj(url, 1)
                inf = obj.find_all('table', class_='adjust_table')[0].text.replace(' ', '').replace('\n', ' ').replace('\r', '')
                detail = obj.find_all('div', class_="t_fsz")[0].td.text  # post body (currently unused)
                self.__saved_urls[save_urls_key] = inf  # keep it in memory so the lookup below succeeds
                with open(cache_file, 'a') as f:  # append to the cache file
                    if debug_info:
                        print("[debug] Save to file: %s" % save_urls_key)
                    f.write("%s#|#%s\n" % (save_urls_key, inf))
            else:  # cache hit
                if debug_info:
                    print("[debug] hit cache: %s" % save_urls_key)
            return save_urls_key, self.__saved_urls[save_urls_key]
        except Exception:
            if debug_info:
                print("[ERROR] GetItem(%s)" % url)
            return False

    def fitter_word(self, _info):
        for word in exclude:  # rule 1: any exclude word hides the post
            if word in _info:
                return False
        for word in focus_include:  # rule 2: every must-word is required
            if word not in _info:
                return False
        for word in include:  # rule 3: at least one include word shows the post
            if word in _info:
                return True
        return False  # rule 4: nothing matched

if __name__ == '__main__':
    spider = xmcSpider()
    for i in range(1, page_end + 1):  # crawl listing pages 1..page_end
        htmlobj = spider.GetHtmlObj(str(i))
        urls = spider.GetIndex(htmlobj)
        if not urls:  # also covers a failed GetHtmlObj, since GetIndex then returns False
            continue
        for url in urls:
            info = spider.GetItem(url)
            if info and spider.fitter_word(info[0] + info[1]):
                print("Link: %s\nDetails: %s\n" % (info[0], info[1]))
```
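The same flow can also be driven by hand for a single listing page; a minimal sketch, assuming config.py is filled in as above (importing spider only defines the class, since the crawl sits behind the `__main__` guard):

```python
from spider import xmcSpider

spider = xmcSpider()
page = spider.GetHtmlObj("1")            # listing page 1
for url in spider.GetIndex(page) or []:  # `or []` guards a False return on errors
    item = spider.GetItem(url)           # (link, info) tuple, or False
    if item and spider.fitter_word(item[0] + item[1]):
        print(item[0])                   # print just the matching links
```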