I. Create the reptileWuxia.py file, using the BeautifulSoup module
1. Environment setup:
1) Install Python 3.6.1.
2) Configure the environment variables.
Test: python --version
3) Install BeautifulSoup:
pip install beautifulsoup4
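To confirm the installation works, a quick smoke test (the HTML snippet here is made up purely for illustration):

# verify that beautifulsoup4 imports and parses HTML
from bs4 import BeautifulSoup

soup = BeautifulSoup('<html><p class="t">hello</p></html>', 'html.parser')
print(soup.find('p', {'class': 't'}).get_text())  # prints: hello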
II. Code implementation
The implementation uses multithreading: pages are fetched in parallel with concurrent.futures.ThreadPoolExecutor, as in the sketch below.
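For orientation, this is the concurrency pattern the full script relies on: submit jobs to a ThreadPoolExecutor and collect results with as_completed. A minimal sketch, where fetch_one and the urls list are placeholders and not part of the actual scraper:

# minimal thread-pool sketch; fetch_one and urls are illustrative placeholders
from concurrent import futures

def fetch_one(url):
    return url  # stands in for a real download-and-parse step

urls = ['http://example.com/a', 'http://example.com/b']
with futures.ThreadPoolExecutor(max_workers=10) as executor:
    futureList = [executor.submit(fetch_one, u) for u in urls]
    for future in futures.as_completed(futureList):
        print(future.result())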
# coding=utf-8
import urllib.request
import re
import time
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
from concurrent import futures


class BookProperty:
    # simple holder for book metadata (currently unused by the scraper)
    def __init__(self, name, url, worksDict):
        self.name = name
        self.url = url
        self.worksDict = worksDict


class OutputContent:
    def createDirectory(self, directory):
        localDir = os.path.join(os.getcwd(), directory)
        if not os.path.exists(localDir):
            os.makedirs(localDir)
        return localDir

    def createFile(self, newfile):
        # create (or truncate) the file so repeated runs start clean
        f = open(newfile, 'w', encoding='utf-8')
        f.close()

    def writeContent(self, fileName, chapterList):
        try:
            self.createFile(fileName)
            with open(fileName, 'a', encoding='utf-8') as f:
                f.writelines(chapterList)
        except Exception as e:
            print('save file error: ' + str(e))


class ParserPage:
    # Load a page and return a BeautifulSoup object, or None on failure.
    def loadPage(self, url):
        html = None
        soup = None
        try:
            request = urllib.request.urlopen(url)
            # the target site serves GB2312-encoded pages
            html = request.read().decode('gb2312', 'ignore')
        except Exception as e:
            print(e)
        if html is None:
            return None
        try:
            soup = BeautifulSoup(html, 'html.parser')
        except Exception as ex:
            print(ex)
        return soup

    # With two arguments, resolve a relative href against a base URL;
    # with one, return just the scheme and domain.
    def __urlHandle(self, *url):
        if len(url) > 1:
            return urljoin(url[0], url[1])
        result = urlparse(url[0])
        return result.scheme + '://' + result.netloc

    def __parseAuthorWorks(self, url, soup):
        worksDict = {}
        # class names are matched case-insensitively
        linkList = soup.find_all(class_=re.compile('style2|style3', re.IGNORECASE))
        for linkTag in linkList:
            aTag = linkTag.contents
            if len(aTag) > 0 and aTag[0].name == 'a' and aTag[0].get_text() != '':
                href = self.__urlHandle(url, aTag[0].get('href'))
                worksDict.update({href: aTag[0].get_text()})  # url -> author (or book) name
        return worksDict

    # Get all works of one author and their entry URLs.
    def parserOneAuthorWorks(self, url):
        soup = self.loadPage(url)
        if soup is None:
            return
        authorName = 'Novel'  # fallback directory name if no author name is found
        navList = soup.select('.LinkPath')  # breadcrumb containing the author name
        if len(navList) > 1:
            authorName = navList[1].get_text()
        worksDict = self.__parseAuthorWorks(url, soup)
        return {'authorName': authorName, 'worksDict': worksDict}

    # Get all authors and each author's entry URL; returns a dict.
    def parserAllAuthorName(self, url, authorName):
        soup = self.loadPage(url)
        if soup is None:
            return
        authorDict = self.__parseAuthorWorks(url, soup)
        return {'authorName': authorName, 'url': url, 'worksDict': authorDict}

    # Parse a table-of-contents page and collect the chapter URLs.
    def parserCatalogue(self, url):
        soup = self.loadPage(url)
        if soup is None:
            return
        domain = self.__urlHandle(url)
        # the book title (.STYLE17) differs from page to page, so it is not used
        aList = soup.find_all('a', {'class': '1'})
        urls = []
        for aTag in aList:
            urls.append(domain + aTag.attrs['href'])
        return urls

    # Parse one chapter page and find the next-page URL.
    def parserOnePage(self, url):
        soup = self.loadPage(url)
        if soup is None:
            return
        content = self.__parserPageContent(soup)
        nextUrl = self.__isNextPage(soup, url)
        return {'content': content[0], 'nextUrl': nextUrl}

    # Variant for books that have no table-of-contents page.
    def parseOnePageNotCatalog(self, url):
        soup = self.loadPage(url)
        if soup is None:
            return
        content = self.__parserPageContent(soup)
        nextUrl = self.__isNextPage(soup, url, content[1])
        return {'content': content[0], 'nextUrl': nextUrl}

    def __parserPageContent(self, soup):
        h3Tag = soup.find('h3')
        spanTag = soup.find('span')
        chapterData = chapterName = None
        if h3Tag is not None:
            chapterName = h3Tag.get_text()  # chapter title
            chapterData = chapterName + '\n'
        if spanTag is not None:
            chapterContent = spanTag.get_text()  # chapter body
            if chapterContent is not None:
                chapterContent = ''.join(chapterContent.split())  # strip all whitespace
                if chapterData is not None:
                    chapterData = chapterData + chapterContent + '\n'
                else:
                    chapterData = chapterContent + '\n'
        return chapterData, chapterName

    # Return the next-page URL (the catalogue does not list every page, and the
    # href carries no domain). For books without a catalogue, stop when the next
    # link's title no longer matches the current chapter's.
    def __isNextPage(self, *args):
        nextUrl = None
        nextATag = args[0].find('a', {'class': 'LinkNextArticle'})
        if nextATag is not None:
            domain = self.__urlHandle(args[1])
            nextUrl = domain + nextATag.attrs['href']
            if len(args) > 2 and args[2] is not None:
                nextText = ''.join(nextATag.get_text().split())
                chapterName = ''.join(args[2].split())
                if nextText[0:2] != chapterName[0:2]:
                    nextUrl = None
        return nextUrl


class ReptileManager:
    def __init__(self, url):
        self.url = url
        self.parser = ParserPage()
        self.output = OutputContent()

    # Scrape one book into one text file.
    def reptileBook(self, url, fileName):
        urls = self.parser.parserCatalogue(url)
        if urls is None:
            return
        contentList = []
        if len(urls) > 0:
            nextUrl = None
            for url in urls:
                result = self.parser.parserOnePage(url)
                if result is None:
                    continue
                nextUrl = result['nextUrl']
                contentList.append(result['content'])
            # the catalogue may not list every page; keep following next-page links
            while nextUrl:
                result = self.parser.parserOnePage(nextUrl)
                if result is None:
                    break
                nextUrl = result['nextUrl']
                contentList.append(result['content'])
        else:
            # no catalogue page: walk the chapters via next-page links only
            result = self.parser.parseOnePageNotCatalog(url)
            if result is None:
                return
            contentList.append(result['content'])
            nextUrl = result['nextUrl']
            while nextUrl:
                result = self.parser.parseOnePageNotCatalog(nextUrl)
                if result is None:
                    break
                contentList.append(result['content'])
                nextUrl = result['nextUrl']
        if contentList:
            self.output.writeContent(fileName, contentList)  # save to file
        return fileName

    # Scrape all works of a single author.
    def reptileOneAuthorWorksBooks(self):
        print('parse started: ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        works = self.parser.parserOneAuthorWorks(self.url)
        if works is not None:
            self.__reptileMuchBooks(works)
        print('parse finished: ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

    # Collect every author's works (names and URLs); returns a list of dicts.
    def reptileAllAuthorAllWorks(self, url):
        worksList = []
        futureList = []
        result = self.parser.parserAllAuthorName(url, '')
        if result is None:
            return worksList
        with futures.ThreadPoolExecutor(max_workers=10) as executor:
            for k, v in result['worksDict'].items():
                future = executor.submit(self.parser.parserAllAuthorName, k, v)
                futureList.append(future)
            for future in futures.as_completed(futureList):
                worksList.append(future.result())
        return worksList

    # Scrape every work of every author.
    def reptileAllAuthorBooks(self):
        print('started: ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
        worksList = self.reptileAllAuthorAllWorks(self.url)
        i = 0
        print(len(worksList))
        for works in worksList:
            i += 1
            if i > 89:  # resume point: skip authors scraped on an earlier run
                self.__reptileMuchBooks(works)
        print('finished: ' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

    # Scrape one author's books concurrently, one file per book.
    def __reptileMuchBooks(self, works):
        a = 0
        futureList = []
        filePath = self.output.createDirectory(works['authorName'])
        with futures.ThreadPoolExecutor(max_workers=10) as executor:
            for k, v in works['worksDict'].items():
                a += 1
                fileName = os.path.join(filePath, str(a) + v + '.txt')
                future = executor.submit(self.reptileBook, k, fileName)
                futureList.append(future)
            for future in futures.as_completed(futureList):
                print(future.result())


if __name__ == '__main__':
    reptile = ReptileManager('this is URL')
    reptile.reptileAllAuthorBooks()
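To actually run the script, the 'this is URL' placeholder must be replaced with the author-index page of the target site. The gulongbbs.com listing in the references below was apparently the test target; whether its markup still matches the class names used above (style2/style3, LinkPath, LinkNextArticle) is an assumption:

# hypothetical entry point; the exact index URL depends on the target site
reptile = ReptileManager('https://www.gulongbbs.com/wuxia/')
reptile.reptileAllAuthorBooks()        # scrape every author's works
# reptile.reptileOneAuthorWorksBooks() # or just one author's page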
References:
https://beautifulsoup.readthedocs.io/zh_CN/latest/#
https://www.jianshu.com/p/62145aed2d49
https://www.jianshu.com/p/b9b3d66aa0be
https://github.com/yanbober/SmallReptileTraining/tree/master/ConcurrentSpider
https://www.gulongbbs.com/wuxia/ (used for testing)