redistribute-spider/spider.py

# encoding: utf-8
import os
import sys
from time import time, sleep
from config import *

try:
    import requests as r
    from bs4 import BeautifulSoup as bs
except ImportError:
    print("Missing dependencies: requests, bs4\nInstall them with: pip install requests bs4")
    sys.exit(1)

cache_file = os.path.join(sys.path[0], cache_file)  # resolve the cache path relative to the script directory
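
# config.py is not included in this file. From the names used below, a minimal
# sketch would look like the following (all values are illustrative
# assumptions, not the author's actual settings):
#
#     cache_file = "cache.txt"   # cache file name, resolved next to spider.py
#     debug_info = True          # print [debug]/[ERROR] messages
#     interval = 5               # minimum seconds between HTTP requests
#     page_end = 3               # last forum index page to crawl
#     exclude = []               # reject a post if any of these words appears
#     focus_include = []         # require every one of these words
#     include = []               # accept if at least one of these words appears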
class xmcSpider:
    def __init__(self):
        self.__last = 0  # timestamp of the last request, used for throttling
        self.__saved_urls = self.__load_cache()

    def __load_cache(self):
        saved_urls = {}  # cache dict: url -> flattened post info
        if not os.path.exists(cache_file):  # first run: no cache file yet
            return saved_urls
        with open(cache_file, 'r') as f:  # read cache records
            for item in f.readlines():
                if "#|#" not in item:
                    continue
                _item = item.replace("\n", "").split("#|#")
                saved_urls[_item[0]] = _item[1]
        return saved_urls
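
    # Each cache record is one line in the format written by GetItem below:
    #
    #     http://muchong.com/t-...#|#<flattened table text>
    #
    # Lines without the "#|#" delimiter are ignored when the cache is loaded.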
    def GetHtmlObj(self, href, page_type=0):  # page_type: 0 => forum index, 1 => thread page
        if debug_info:
            print("[debug] Getting %s" % href)
        try:
            if page_type == 0:
                url = "http://muchong.com/f-430-%s-typeid-2304" % href
            else:
                url = "http://muchong.com/%s" % href
            while int(time()) - self.__last < interval:  # throttle: wait out the configured interval
                sleep(1)
            self.__last = int(time())
            html = r.get(url).text
            return bs(html, "html.parser")
        except Exception:
            if debug_info:
                print("[ERROR] GetHtmlObj(%s, %d)" % (href, page_type))
            return False
    def GetIndex(self, obj):
        try:
            index = obj.find_all('th', class_='thread-name')[1:]  # [1:] skips the pinned [竞价] (bidding) row
            urls = []
            for item in index:
                url = item.find_all('a', class_="a_subject")[0]['href']
                if "t-" in url:  # keep only thread links
                    urls.append(url)
            return urls
        except Exception:
            if debug_info:
                print("[ERROR] GetIndex()")
            return False
    def GetItem(self, url):
        try:
            save_urls_key = "http://muchong.com" + url
            if save_urls_key not in self.__saved_urls:  # cache miss: fetch the thread page
                obj = self.GetHtmlObj(url, 1)
                if not obj:
                    return False
                inf = obj.find_all('table', class_='adjust_table')[0].text.replace(' ', '').replace('\n', ' ').replace('\r', '')
                info = (save_urls_key, inf)
                self.__saved_urls[save_urls_key] = inf  # keep the in-memory cache in sync
                with open(cache_file, 'a') as f:  # append the new record to the cache file
                    if debug_info:
                        print("[debug] Save to file: %s" % save_urls_key)
                    f.write("%s#|#%s\n" % (info[0], info[1]))
                return info
            else:  # cache hit
                if debug_info:
                    print("[debug] hit cache: %s" % save_urls_key)
                return save_urls_key, self.__saved_urls[save_urls_key]
        except Exception:
            if debug_info:
                print("[ERROR] GetItem(%s)" % url)
            return False
    def fitter_word(self, _info):  # True only if _info passes all three word filters
        for word in exclude:  # reject on any excluded word
            if word in _info:
                return False
        for word in focus_include:  # every focus word must be present
            if word not in _info:
                return False
        for word in include:  # accept on the first included word
            if word in _info:
                return True
        return False
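
    # A worked example of the filter above (word lists are illustrative, not
    # the author's config): with exclude = ["closed"], focus_include = ["quota"]
    # and include = ["CS", "EE"], the text "CS quota still open" passes, while
    # "CS quota closed" fails the exclude check and "CS still open" fails the
    # focus_include check.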
if __name__ == '__main__':
    spider = xmcSpider()
    for i in range(1, page_end + 1):  # walk index pages 1..page_end
        htmlobj = spider.GetHtmlObj(str(i))
        if not htmlobj:
            continue
        urls = spider.GetIndex(htmlobj)
        if not urls:
            continue
        for url in urls:
            info = spider.GetItem(url)
            if info and spider.fitter_word(info[0] + info[1]):
                print("Link: %s\nDetails: %s\n" % (info[0], info[1]))