first commit
This commit is contained in:
commit
98aa364647
11
README.md
Normal file
11
README.md
Normal file
@ -0,0 +1,11 @@
# 小木虫考研调剂信息爬虫

```git clone https://git.mmuaa.com/Kidultff/redistribute-spider.git```

在config.py中按照提示信息进行设置

然后```python spider.py```

你想要的信息将按照你需要的打印到屏幕上

你也可以```python spider.py > output.txt```

你想要的信息将被保存到txt文档里

![截屏](https://www.mmuaa.com/wp-content/uploads/image/20200331/1585639857878391.png "截屏")
33
config.py
Normal file
33
config.py
Normal file
@ -0,0 +1,33 @@
|
||||
#encoding:utf-8
|
||||
#爬虫配置文件
|
||||
##########################################
|
||||
#筛选条件设计为:
|
||||
#出现任意【排除词】,则不显示
|
||||
#必须包含全部【必须词】,否则不显示
|
||||
#满足前两条件下,出现任意【包含词】,则显示
|
||||
#若没有【包含词】,则不显示
|
||||
##########################################
|
||||
|
||||
#【排除词】。出现任意一个词,将被筛掉。优先级最高
|
||||
exclude = ['停止招生']
|
||||
|
||||
#【必须词】。必须出现这个词,否则不显示。优先级为中
|
||||
focus_include = ['2020']
|
||||
|
||||
#【包含词】。出现任意一个词,将显示。优先级为低
|
||||
include = ['计算机', '软件', '电子信息', '人工智能', '网络', '大数据']
|
||||
|
||||
#找多少页
|
||||
page_end= 50
|
||||
|
||||
#爬虫的时间间隔,切勿将本选项设置过低,否则坐等被封IP
|
||||
interval= 3
|
||||
|
||||
#缓存文件目录
|
||||
cache_file = '/cache.dat'
|
||||
|
||||
#是否开启调试报告
|
||||
debug_info = False
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("execute spider.py!!!\n NOT config.py")
|
108
spider.py
Normal file
108
spider.py
Normal file
@ -0,0 +1,108 @@
#encoding:utf-8
from time import time, sleep
import sys

# Pulls in the filter lists and tuning knobs defined in config.py
# (exclude, focus_include, include, page_end, interval, cache_file, debug_info).
from config import *

try:
    import requests as r
    from bs4 import BeautifulSoup as bs
except ImportError:
    # Was a bare `except:`; only a missing dependency should trigger this path.
    print("must install:\nrequests bs4\nuse pip install requests bs4")
    sys.exit(1)

# Anchor the cache file next to the script. NOTE(review): plain string
# concatenation is kept on purpose -- cache_file starts with '/', so
# os.path.join(sys.path[0], cache_file) would discard sys.path[0] and
# silently change where the cache lives.
cache_file = sys.path[0] + cache_file
class xmcSpider:
    """Crawler for the muchong.com (小木虫) postgraduate-adjustment board.

    Fetched threads are cached on disk (``cache_file``) as lines of the
    form ``url#|#summary`` so repeated runs skip already-seen posts.
    Relies on module-level config: cache_file, debug_info, interval and
    the requests/BeautifulSoup aliases ``r`` / ``bs``.
    """

    def __init__(self):
        self.__last = 0  # unix timestamp of the last HTTP request (rate limiting)
        self.__saved_urls = self.__load_cache()  # url -> summary text

    def __load_cache(self):
        """Read the cache file into a ``{url: summary}`` dict.

        BUG FIX: the original crashed with FileNotFoundError on the very
        first run, before any cache file existed; now an empty cache is
        returned instead.
        """
        saved_urls = {}
        try:
            with open(cache_file, 'r') as f:
                for line in f:
                    if "#|#" not in line:
                        continue  # skip malformed lines
                    parts = line.replace("\n", "").split("#|#")
                    saved_urls[parts[0]] = parts[1]
        except FileNotFoundError:
            pass  # no cache yet: start empty
        return saved_urls

    def GetHtmlObj(self, href, type = 0):
        """Fetch a page and return a BeautifulSoup object, or False on failure.

        type 0 => ``href`` is a board index page number,
        type 1 => ``href`` is a thread path.
        Enforces at least ``interval`` seconds between consecutive requests.
        """
        if debug_info:
            print("[debug] Getting %s" % href)
        try:
            if type == 0:
                url = "http://muchong.com/f-430-%s-typeid-2304" % href
            else:
                url = "http://muchong.com/%s" % href
            # Crude rate limiter: sleep until `interval` seconds have elapsed
            # since the previous request.
            while int(time()) - self.__last < interval:
                sleep(1)
            self.__last = int(time())
            html = r.get(url).text
            return bs(html, "html.parser")
        except Exception:  # network / parse errors: signal failure to caller
            if debug_info:
                print("[ERROR] GetHtmlObj(%s, %d)" % (href, type))
            return False

    def GetIndex(self, obj):
        """Extract thread URLs from a board index page; False on failure.

        Only ``t-`` links (regular threads) are kept; the first entry is
        skipped because it is the pinned [竞价] row.
        """
        try:
            index = obj.find_all('th', class_='thread-name')[1:]
            urls = []
            for item in index:
                url = item.find_all('a', class_ = "a_subject")[0]['href']
                if "t-" in url:
                    urls.append(url)
            return urls
        except Exception:
            if debug_info:
                print("[ERROR] GetIndex()")
            return False

    def GetItem(self, url):
        """Return ``(full_url, summary)`` for a thread; False on any error.

        Cache hits are served from memory. BUG FIX: on a cache miss the
        original wrote the fetched data to disk but never returned it and
        never updated the in-memory cache, so fresh posts were dropped.
        """
        try:
            full_url = "http://muchong.com" + url
            if full_url in self.__saved_urls:  # cache hit
                if debug_info:
                    print("[debug] hit cache: %s" % full_url)
                return full_url, self.__saved_urls[full_url]
            # Cache miss: fetch and parse the thread page.
            obj = self.GetHtmlObj(url, 1)
            inf = obj.find_all('table', class_='adjust_table')[0].text.replace(' ', '').replace('\n', ' ').replace('\r', '')
            # Parsed but unused in the original; kept so malformed pages
            # still fall through to the except branch exactly as before.
            detail = obj.find_all('div', class_="t_fsz")[0].td.text
            self.__saved_urls[full_url] = inf  # keep the memory cache in sync
            with open(cache_file, 'a') as f:   # append to the disk cache
                if debug_info:
                    print("[debug] Save to file: %s" % full_url)
                f.write("%s#|#%s\n" % (full_url, inf))
            return full_url, inf
        except Exception:
            if debug_info:
                print("[ERROR] GetItem(%s)" % url)
            return False

    def fitter_word(self, _info):
        """Apply the config word filters to a blob of text.

        Priority order: any ``exclude`` word hides the post; every
        ``focus_include`` word is required; finally any ``include`` word
        shows it. Returns True when the post should be displayed.
        """
        if any(word in _info for word in exclude):
            return False
        if any(word not in _info for word in focus_include):
            return False
        return any(word in _info for word in include)
if __name__ == '__main__':
    spider = xmcSpider()
    for page in range(1, page_end + 1):
        htmlobj = spider.GetHtmlObj(str(page))
        if not htmlobj:
            # Fetch failed; the original still called GetIndex on the
            # False object before checking -- skip straight away instead.
            continue
        urls = spider.GetIndex(htmlobj)
        if not urls:  # parse failed or page had no usable threads
            continue
        for url in urls:
            info = spider.GetItem(url)  # (full_url, summary) or False
            if info and spider.fitter_word(info[0] + info[1]):
                print("链接:%s\n详情:%s\n" % (info[0], info[1]))
Reference in New Issue
Block a user