# Required libraries
import os
from itertools import chain
from multiprocessing import Pool

import requests
from lxml import etree
# Request headers sent with every HTTP request made by this script.
# NOTE(review): the User-Agent value contains no spaces, unlike a standard
# browser UA string -- it looks whitespace-stripped; confirm the intended value.
headers = {
    'User-Agent': 'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/65.0.3325.181Safari/537.36'
}

# Directory that receives one .txt file per downloaded book.
pathname = './全书网/'
# exist_ok=True replaces the separate exists()/mkdir() pair: this statement runs
# on every import of the module, and a check-then-create sequence can fail with
# FileExistsError when two processes import it at the same time.
os.makedirs(pathname, exist_ok=True)
# Fetch the book listing
def _listing_book_urls(tree):
    """Return the first link of every book entry in a parsed listing page.

    Entries that contain no link are skipped instead of raising IndexError.
    """
    # NOTE(review): the class value "seeWellcf" may have lost a space when the
    # file's whitespace was stripped -- confirm against the live page markup.
    entries = tree.xpath('//ul[@class="seeWellcf"]/li')
    links_per_entry = (entry.xpath('./a/@href') for entry in entries)
    # Plain str values keep the task arguments trivially picklable for the pool.
    return [str(links[0]) for links in links_per_entry if links]


def _fetch_listing_books(url):
    """Download one listing page and return its book URLs.

    Returns an empty list (after printing the failure) when the page cannot be
    fetched or parsed, so that one bad page does not abort the whole crawl.
    """
    try:
        # Without a timeout a stalled connection would block this task forever.
        response = requests.get(url=url, headers=headers, timeout=30)
        return _listing_book_urls(etree.HTML(response.text))
    except Exception as exc:
        print('get_booklistfailed', url, exc)
        return []


def _run_tasks(func, items):
    """Apply ``func`` to every item through the global ``pool`` if one exists.

    ``pool`` is only bound when the module runs as a script; when it is absent
    the items are processed serially in the current process.
    """
    worker_pool = globals().get('pool')
    if worker_pool is None:
        return [func(item) for item in items]
    return worker_pool.map(func, items)


def get_booklist(url):
    """Download every book reachable from the listing page at ``url``.

    The books on ``url`` are processed first. If the page exposes a "last page"
    link, listing pages 2..last are fetched and their books processed as well.
    The follow-up listing URLs are hard-coded to category 3 of
    www.quanshuwang.com regardless of ``url``.

    All dispatching happens in the calling process; pool workers only fetch and
    parse, because pool methods must be called from the process that created
    the pool.

    Args:
        url: Address of the first listing page.

    Returns:
        None. Failures are reported on stdout and never raised.
    """
    try:
        response = requests.get(url=url, headers=headers, timeout=30)
        tree = etree.HTML(response.text)
        _run_tasks(get_book, _listing_book_urls(tree))

        last_page = tree.xpath('//a[@class="last"]/text()')
        if not last_page:
            # No pagination link: the listing consists of this page only.
            return
        page_urls = [
            'http://www.quanshuwang.com/list/3_{}.html'.format(page)
            for page in range(2, int(last_page[0]) + 1)
        ]
        books_per_page = _run_tasks(_fetch_listing_books, page_urls)
        _run_tasks(get_book, list(chain.from_iterable(books_per_page)))
    except Exception as exc:
        # Keep the crawl best-effort, but report what failed and why.
        print('get_booklistfailed', url, exc)
# Fetch one book
def get_book(url):
    """Download a single book unless its text file already exists.

    Fetches the book's detail page (decoded as gb18030), reads the book name,
    and skips the download when ``pathname + book_name + '.txt'`` is already
    present. Otherwise the first link inside the ``b-oper`` block is handed to
    ``get_mulu``.

    Args:
        url: Address of the book's detail page.

    Returns:
        None. Failures are reported on stdout and never raised.
    """
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url=url, headers=headers, timeout=30)
        tree = etree.HTML(response.content.decode("gb18030"))
        book_name = tree.xpath('//div[@class="b-info"]/h1/text()')[0]
        if os.path.exists(pathname + book_name + '.txt'):
            print(book_name + '.书籍已存在,如需重新下载请删除原文件')
            return None
        toc_url = tree.xpath('//div[@class="b-oper"]/a/@href')[0]
        get_mulu(toc_url)
    except Exception as exc:
        # Keep the crawl best-effort, but report which book failed and why.
        print('get_bookfailed', url, exc)
# Fetch a book's table of contents
def get_mulu(url):
    """Walk a book's chapter list and download every chapter in page order.

    Each list entry's first link is passed to ``get_content``. An entry without
    a link is skipped so it cannot abort the remaining chapters.

    Args:
        url: Address of the book's table-of-contents page.

    Returns:
        None. Failures are reported on stdout and never raised.
    """
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url=url, headers=headers, timeout=30)
        tree = etree.HTML(response.text)
        # NOTE(review): the class value "clearfixdirconone" may have lost a
        # space when the file's whitespace was stripped -- confirm against the
        # live page markup.
        for entry in tree.xpath('//div[@class="clearfixdirconone"]/li'):
            links = entry.xpath('./a/@href')
            if not links:
                continue
            get_content(links[0])
    except Exception as exc:
        # Keep the crawl best-effort, but report which page failed and why.
        print('get_mulufailed', url, exc)
# Fetch one chapter and append it to the book's file
def get_content(url):
    """Download one chapter and append it to the book's text file.

    The page is decoded as gb18030. The book title selects the output file
    (``pathname + title + '.txt'``); the chapter heading and the joined text
    nodes of the content block are appended to it as UTF-8.

    Args:
        url: Address of the chapter page.

    Returns:
        None. Failures are reported on stdout and never raised.
    """
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url=url, headers=headers, timeout=30)
        tree = etree.HTML(response.content.decode("gb18030"))
        title = tree.xpath('//a[@class="article_title"]/text()')[0]
        # NOTE(review): the class value "ljieqi_title" may have lost a space
        # when the file's whitespace was stripped -- confirm against the live
        # page markup.
        chapter = tree.xpath('//strong[@class="ljieqi_title"]/text()')[0]
        body = ''.join(tree.xpath('//div[@class="mainContenr"]/text()'))
        # Append-only: the file is never read back here, so 'a' is sufficient.
        with open(pathname + title + '.txt', 'a', encoding='utf-8') as f:
            f.write(chapter + '\n\n' + body + '\n\n')
        print('正在下载:', chapter)
    except Exception as exc:
        # Keep the crawl best-effort, but report which chapter failed and why.
        print('get_contentfailed', url, exc)
# Program entry point
if __name__ == '__main__':
    url = 'http://www.quanshuwang.com/list/3_1.html'
    # Create the process pool. It is bound to the module-level name `pool`,
    # which get_booklist looks up; the `with` block shuts the workers down once
    # the crawl returns or raises, instead of leaving them running.
    with Pool() as pool:
        # Start the crawl
        get_booklist(url)
# Console output
# If you repost this article, please credit its source and URL: http://www.divcss5.com/html/h54896.shtml