import re

try:
    import urlparse  # Python 2
except ImportError:
    # Python 3 moved this module to urllib.parse. The ``urlparse`` name is
    # kept bound so any other code in this file that uses it keeps working.
    import urllib.parse as urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Pull image URLs and the next page to crawl out of an HTML document."""

    def __init__(self):
        # Raw ``href`` values (as written in the markup, not yet resolved)
        # that _get_new_url has already handed out.
        self.old_request_url = set()

    def paser(self, page_url, cont):
        """Parse one downloaded page.

        The method name (sic) is kept unchanged so existing callers work.

        Args:
            page_url: URL the page was fetched from; used as the base for
                resolving relative links.
            cont: Page content handed to BeautifulSoup.

        Returns:
            ``None`` when either argument is ``None``; otherwise a tuple
            ``(new_url, data)`` where ``new_url`` is the next unvisited
            ranking-page URL (or ``None``) and ``data`` is a list of
            absolute image URLs.
        """
        if page_url is None or cont is None:
            return None
        soup = BeautifulSoup(cont, 'html.parser', from_encoding='utf-8')
        new_url = self._get_new_url(page_url, soup)
        data = self._get_data(page_url, soup)
        return new_url, data

    def _get_data(self, page_url, soup):
        """Return absolute URLs of every ``<img>`` with a ``data-original`` attribute."""
        image_urls = []
        for img in soup.find_all('img'):
            src = img.attrs.get('data-original')
            if src is not None:
                image_urls.append(urlparse.urljoin(page_url, src))
        return image_urls

    def _get_new_url(self, page_url, soup):
        """Return the first ranking-page link not handed out before.

        Only ``<a>`` tags whose ``href`` matches ``/meizi/rank/`` are
        considered. The chosen ``href`` is recorded in
        ``self.old_request_url`` so it is not returned again.

        Returns:
            The link resolved against ``page_url``, or ``None`` when every
            matching link has already been returned (or none exist).
        """
        links = soup.find_all('a', href=re.compile(r"/meizi/rank/"))
        for link in links:
            href = link['href']
            if href not in self.old_request_url:
                self.old_request_url.add(href)
                return urlparse.urljoin(page_url, href)
        # Explicit fall-through: nothing new to crawl on this page.
        return None
# Data saving
class OutPut(object):
    """Append entries to ``data.txt``, one per line, skipping duplicates."""

    def create(self, data):
        """Write each new item of ``data`` as a line at the end of ``data.txt``.

        Items already present in the file, or repeated within ``data``, are
        written only once. A missing ``data.txt`` is treated as empty and is
        created by the append.

        Args:
            data: Iterable of strings; ``None`` or an empty sequence is a
                no-op.
        """
        import errno  # local import so this class does not depend on file-level imports

        if not data:
            return

        # Read the existing lines once, stripped of their line endings, so
        # the membership test compares like with like.
        try:
            with open('data.txt') as fin:
                seen = set(line.rstrip('\r\n') for line in fin)
        except IOError as err:
            # Only "file does not exist" means "nothing saved yet"; any
            # other I/O failure is a real error and must surface.
            if err.errno != errno.ENOENT:
                raise
            seen = set()

        with open('data.txt', 'a') as fout:
            for item in data:
                if item not in seen:
                    seen.add(item)
                    fout.write(item + '\n')