first commit

This commit is contained in:
斐斐 2020-03-31 15:34:34 +08:00
commit 98aa364647
4 changed files with 3077 additions and 0 deletions

11
README.md Normal file
View File

@@ -0,0 +1,11 @@
# 小木虫考研调剂信息爬虫
```git clone https://git.mmuaa.com/Kidultff/redistribute-spider.git```
在config.py中按照提示信息进行设置
然后```python spider.py```
你想要的信息将按照你需要的打印到屏幕上
你也可以```python spider.py > output.txt```
你想要的信息将被保存到txt文档里
![截屏](https://www.mmuaa.com/wp-content/uploads/image/20200331/1585639857878391.png "截屏")

2925
cache.dat Normal file

File diff suppressed because it is too large Load Diff

33
config.py Normal file
View File

@@ -0,0 +1,33 @@
#encoding:utf-8
# Spider configuration file.
##########################################
# Filtering rules applied by spider.py (fitter_word):
# - If ANY [exclude] word appears, the post is hidden.
# - ALL [focus_include] words must appear, otherwise the post is hidden.
# - Given the two rules above, the post is shown if ANY [include] word appears.
# - If no [include] word appears, the post is hidden.
##########################################
# [exclude] words: a post containing any of these is filtered out. Highest priority.
exclude = ['停止招生']
# [focus_include] words: every one of these must appear, or the post is hidden. Medium priority.
focus_include = ['2020']
# [include] words: a post containing any of these is shown. Lowest priority.
include = ['计算机', '软件', '电子信息', '人工智能', '网络', '大数据']
# Number of forum index pages to crawl.
page_end= 50
# Delay (seconds) between HTTP requests. Do NOT set this too low, or expect the IP to get banned.
interval= 3
# Cache file path; spider.py prepends sys.path[0] (the script's directory).
cache_file = '/cache.dat'
# Enable debug output.
debug_info = False
if __name__ == "__main__":
    # This module is configuration only; run spider.py instead.
    print("execute spider.py!!!\n NOT config.py")

108
spider.py Normal file
View File

@@ -0,0 +1,108 @@
#encoding:utf-8
from time import time, sleep
import sys
from config import *  # spider settings: exclude/focus_include/include, page_end, interval, cache_file, debug_info
try:
    import requests as r
    from bs4 import BeautifulSoup as bs
except ImportError:
    # Narrowed from a bare `except:` so only a genuinely missing dependency is
    # reported (a bare except would also swallow KeyboardInterrupt/SystemExit).
    print("must install:\nrequests bs4\nuse pip install requests bs4")
    # Exit with a failure status; plain exit() would report success (0).
    sys.exit(1)
# Resolve the cache file relative to the script's directory.
cache_file = sys.path[0] + cache_file
class xmcSpider:
    """Spider for muchong.com (小木虫) graduate-admission transfer posts.

    Fetches forum index pages, extracts post URLs, downloads each post's
    text and caches it in ``cache_file`` so already-seen posts are not
    re-downloaded on later runs.
    """
    def __init__(self):
        # Timestamp (epoch seconds) of the last HTTP request, for rate limiting.
        self.__last = 0
        # Maps full post URL -> cached post text.
        self.__saved_urls = self.__load_cache()
    def __load_cache(self):
        """Read the cache file and return a dict of {url: post text}.

        Cache lines have the form ``<url>#|#<text>``; lines without the
        ``#|#`` separator are ignored.
        """
        saved_urls = {}  # cache dict
        try:
            with open(cache_file, 'r') as f:  # read cache
                cache = f.readlines()
        except FileNotFoundError:
            # First run: no cache file yet — start empty instead of crashing.
            return saved_urls
        for item in cache:
            if "#|#" not in item:
                continue
            _item = item.replace("\n", "").split("#|#")
            saved_urls[_item[0]] = _item[1]
        return saved_urls
    def GetHtmlObj(self, href, type = 0):
        """Download a page and return it as a BeautifulSoup object.

        href -- index page number (type == 0) or post path (type == 1).
        type -- 0 => forum index page, 1 => individual post page.
                (Name shadows the builtin; kept for caller compatibility.)
        Returns False on any error.
        """
        if debug_info:
            print("[debug] Getting %s" % href)
        try:
            if type == 0:
                url = "http://muchong.com/f-430-%s-typeid-2304" % href
            else:
                url = "http://muchong.com/%s" % href
            # Rate limit: wait until at least `interval` seconds have passed
            # since the previous request, to avoid an IP ban.
            while int(time()) - self.__last < interval:
                sleep(1)
            self.__last = int(time())
            html = r.get(url).text
            obj = bs(html, "html.parser")
            return obj
        except Exception:  # narrowed from bare except (no longer eats Ctrl-C)
            if debug_info:
                print("[ERROR] GetHtmlObj(%s, %d)" % (href, type))
            return False
    def GetIndex(self, obj):
        """Extract post URLs ("t-..." links) from an index-page soup.

        Returns a list of relative URLs, or False on error.
        """
        try:
            index = obj.find_all('th', class_='thread-name')[1:]  # [1:] remove [竞价]
            urls = []
            for item in index:
                url = item.find_all('a', class_ = "a_subject")[0]['href']
                if "t-" in url:
                    urls.append(url)
            return urls
        except Exception:
            if debug_info:
                print("[ERROR] GetIndex()")
            return False
    def GetItem(self, url):
        """Return (full_url, post_text) for a post, fetching and caching it if new.

        Returns False on any error (network failure, unexpected page layout).
        """
        try:
            save_urls_key = "http://muchong.com" + url
            if save_urls_key not in self.__saved_urls:  # not cached yet
                obj = self.GetHtmlObj(url, 1)
                inf = obj.find_all('table', class_='adjust_table')[0].text.replace(' ', '').replace('\n', ' ').replace('\r', '')
                # Lookup kept as a layout check: an unexpected page raises here
                # and the post is reported as an error (return False).
                detail = obj.find_all('div', class_="t_fsz")[0].td.text
                info = (save_urls_key, inf)
                # BUG FIX: record the new post in memory and return it below.
                # Previously a freshly fetched post was only written to the
                # cache file and never returned, so new posts were never shown.
                self.__saved_urls[save_urls_key] = inf
                with open(cache_file, 'a') as f:  # append to cache file
                    if debug_info:
                        print("[debug] Save to file: %s" % save_urls_key)
                    f.write("%s#|#%s\n" % (info[0], info[1]))
                return info
            # Cache hit: serve from memory.
            if debug_info:
                print("[debug] hit cache: %s" % save_urls_key)
            return save_urls_key, self.__saved_urls[save_urls_key]
        except Exception:
            if debug_info:
                print("[ERROR] GetItem(%s)" % url)
            return False
    def fitter_word(self, _info):
        """Apply the configured keyword filters to a post's text.

        True only if _info contains no `exclude` word, every `focus_include`
        word, and at least one `include` word.
        """
        for word in exclude:  # any exclude word hides the post
            if word in _info:
                return False
        for word in focus_include:  # every focus word is required
            if word not in _info:
                return False
        for word in include:  # at least one include word is required
            if word in _info:
                return True
        return False
if __name__ == '__main__':
    spider = xmcSpider()
    # Crawl index pages 1..page_end, fetch each listed post, and print the
    # ones that pass the configured keyword filters.
    for i in range(1, page_end+1):
        htmlobj = spider.GetHtmlObj(str(i))
        if not htmlobj:
            # BUG FIX: skip BEFORE parsing. Previously GetIndex(False) was
            # called first and only "worked" because its broad except hid
            # the resulting AttributeError.
            continue
        urls = spider.GetIndex(htmlobj)
        if not urls:
            continue
        for url in urls:  # for each post url on this index page
            info = spider.GetItem(url)
            if info and spider.fitter_word(info[0] + info[1]):
                print("链接:%s\n详情:%s\n" % (info[0], info[1]))