redistribute-spider/spider.py

# encoding: utf-8
import os
import sys
from time import time, sleep
from config import *

try:
    import requests as r
    from bs4 import BeautifulSoup as bs
except ImportError:
    print("Missing dependencies: requests, bs4\nInstall them with: pip install requests bs4")
    sys.exit(1)

cache_file = os.path.join(sys.path[0], cache_file)  # resolve the cache path relative to the script directory
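
# config.py is not included in this file. From the names used below, a minimal
# sketch would look like the following (all values are illustrative
# assumptions, not the author's actual settings):
#
#     cache_file = "cache.txt"   # cache file name, resolved next to spider.py
#     debug_info = True          # print [debug]/[ERROR] messages
#     interval = 5               # minimum seconds between HTTP requests
#     page_end = 3               # last forum index page to crawl
#     exclude = []               # reject a post if any of these words appears
#     focus_include = []         # require every one of these words
#     include = []               # accept if at least one of these words appears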
class xmcSpider:
    def __init__(self):
        self.__last = 0  # timestamp of the last request, used for throttling
        self.__saved_urls = self.__load_cache()

    def __load_cache(self):
        saved_urls = {}  # cache dict: url -> flattened post info
        if not os.path.exists(cache_file):  # first run: no cache file yet
            return saved_urls
        with open(cache_file, 'r') as f:  # read cache records
            for item in f.readlines():
                if "#|#" not in item:
                    continue
                _item = item.replace("\n", "").split("#|#")
                saved_urls[_item[0]] = _item[1]
        return saved_urls
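
    # Each cache record is one line in the format written by GetItem below:
    #
    #     http://muchong.com/t-...#|#<flattened table text>
    #
    # Lines without the "#|#" delimiter are ignored when the cache is loaded.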
    def GetHtmlObj(self, href, page_type=0):  # page_type: 0 => forum index, 1 => thread page
        if debug_info:
            print("[debug] Getting %s" % href)
        try:
            if page_type == 0:
                url = "http://muchong.com/f-430-%s-typeid-2304" % href
            else:
                url = "http://muchong.com/%s" % href
            while int(time()) - self.__last < interval:  # throttle: wait out the configured interval
                sleep(1)
            self.__last = int(time())
            html = r.get(url).text
            return bs(html, "html.parser")
        except Exception:
            if debug_info:
                print("[ERROR] GetHtmlObj(%s, %d)" % (href, page_type))
            return False
    def GetIndex(self, obj):
        try:
            index = obj.find_all('th', class_='thread-name')[1:]  # [1:] skips the pinned [竞价] (bidding) row
            urls = []
            for item in index:
                url = item.find_all('a', class_="a_subject")[0]['href']
                if "t-" in url:  # keep only thread links
                    urls.append(url)
            return urls
        except Exception:
            if debug_info:
                print("[ERROR] GetIndex()")
            return False
    def GetItem(self, url):
        try:
            save_urls_key = "http://muchong.com" + url
            if save_urls_key not in self.__saved_urls:  # cache miss: fetch the thread page
                obj = self.GetHtmlObj(url, 1)
                if not obj:
                    return False
                inf = obj.find_all('table', class_='adjust_table')[0].text.replace(' ', '').replace('\n', ' ').replace('\r', '')
                info = (save_urls_key, inf)
                self.__saved_urls[save_urls_key] = inf  # keep the in-memory cache in sync
                with open(cache_file, 'a') as f:  # append the new record to the cache file
                    if debug_info:
                        print("[debug] Save to file: %s" % save_urls_key)
                    f.write("%s#|#%s\n" % (info[0], info[1]))
                return info
            else:  # cache hit
                if debug_info:
                    print("[debug] hit cache: %s" % save_urls_key)
                return save_urls_key, self.__saved_urls[save_urls_key]
        except Exception:
            if debug_info:
                print("[ERROR] GetItem(%s)" % url)
            return False
    def fitter_word(self, _info):  # True only if _info passes all three word filters
        for word in exclude:  # reject on any excluded word
            if word in _info:
                return False
        for word in focus_include:  # every focus word must be present
            if word not in _info:
                return False
        for word in include:  # accept on the first included word
            if word in _info:
                return True
        return False
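
    # A worked example of the filter above (word lists are illustrative, not
    # the author's config): with exclude = ["closed"], focus_include = ["quota"]
    # and include = ["CS", "EE"], the text "CS quota still open" passes, while
    # "CS quota closed" fails the exclude check and "CS still open" fails the
    # focus_include check.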
if __name__ == '__main__':
    spider = xmcSpider()
    for i in range(1, page_end + 1):  # walk index pages 1..page_end
        htmlobj = spider.GetHtmlObj(str(i))
        if not htmlobj:
            continue
        urls = spider.GetIndex(htmlobj)
        if not urls:
            continue
        for url in urls:
            info = spider.GetItem(url)
            if info and spider.fitter_word(info[0] + info[1]):
                print("Link: %s\nDetails: %s\n" % (info[0], info[1]))