# 电子书抓取脚本 | Coderun

**Published by:** [deval](https://paragraph.com/@coderun/)
**Published on:** 2022-05-07
**URL:** https://paragraph.com/@coderun/coderun

## Content

https://www.coderun.zone/2021/01/23/%E7%94%B5%E5%AD%90%E4%B9%A6%E6%8A%93%E5%8F%96%E8%84%9A%E6%9C%AC/

这是我在进入行业的第一个程序 “留作纪念,仅供学习”

```python
#python 版本 2.7.18
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import random
import requests
import re
import os
import time
import sys
import datetime

header = [
    {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'},
    {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'},
    {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}]
```

header 是用来伪装成浏览器发送请求,一般加上最好,header 信息可以通过浏览器查看,也可在网上搜索得到。

```python
for x in range(8427249,8429942):
    u = str(x)
    print('当前动态URL:'.decode('utf-8').encode('gbk')+'.......................'+datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'..............'+u+'')#print(text_1)
    attempts = 0
    success = False
    while attempts < 50 and not success:
        try:
            req = requests.get('需要抓取的网址'+ u +'.html',headers = header[random.randint(0,4)], timeout=60)# 向目标网站发送 get 请求
            success = True
        except:
            attempts += 1
            print('抛出异常.......重试中..................'.decode('utf-8').encode('gbk')+datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'')
            time.sleep(10)
            if attempts == 50:
                break # 永久等待
    result = req.content
    result = result.decode('gbk') # 查看网页源代码 看到 charset=gbk,即网页是用的 gbk 编码,故要用 gbk 的编码方式来解码,否则中文就会乱码。
    # print(result)
    title_re = re.compile(r'<h1>(.*?)</h1>')
    title = re.findall(title_re,result)
    #print(title)
    text_re = re.compile(r'<div id="htmlContent" class="contentbox clear">([\s\S]*?)</div>')
    text = re.findall(text_re,result)
    text = '\r\n'.join(text) # 把两部分的正文连接成同一个字符串
    text = text.split('\r\n')
    title = title[0]
    text.insert(0,title)# 插入标题
    #print(text)
    text_1 = [] # 添加一个空列表,用来装处理后的正文
    for sentence in text:
        sentence = sentence.strip() # 去掉每一句两边的空格
        if ' ' in sentence:
            sentence = sentence.replace(' ','') # 去掉句子中的
            if '<br />' in sentence:
                sentence = sentence.replace('<br />','') # 去掉句子中的 <br />
                text_1.append(sentence)
            else:
                text_1.append(sentence)
        elif '<br />' in sentence:
            sentence = sentence.replace('<br />','')
            text_1.append(sentence)
        elif '-->><p class="text-danger text-center mg0">本章未完,点击下一页继续阅读</p>' in sentence:
            text_1.append(sentence)
        else:
            text_1.append(sentence)
    count = text_1.count('') # 统计列表中的空字符串
    for i in range(count):
        text_1.remove('') # 移除所有的空字符串
    #print(text_1)
    file = open('./test.txt','a')
    for i in range(len(text_1)):
        s = str(text_1[i]).replace('[','').replace(']','')#去除[],这两行按数据不同,可以选择
        s = s.replace("'",'').replace(',','').replace('&nbsp;&nbsp;&nbsp;&nbsp;','') +'\n' #去除单引号,逗号,每行末尾追加换行符
        file.write(s)
    file.close()
    print('抓取成功!'.decode('utf-8').encode('gbk')+'..............'+datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'')
    time.sleep(10)
```

## Publication Information

- [deval](https://paragraph.com/@coderun/): Publication homepage
- [All Posts](https://paragraph.com/@coderun/): More posts from this publication
- [RSS Feed](https://api.paragraph.com/blogs/rss/@coderun): Subscribe to updates