# 电子书抓取脚本 | Coderun

By [deval](https://paragraph.com/@coderun) · 2022-05-07

---

[https://www.coderun.zone/2021/01/23/%E7%94%B5%E5%AD%90%E4%B9%A6%E6%8A%93%E5%8F%96%E8%84%9A%E6%9C%AC/](https://www.coderun.zone/2021/01/23/%E7%94%B5%E5%AD%90%E4%B9%A6%E6%8A%93%E5%8F%96%E8%84%9A%E6%9C%AC/)

这是我在进入行业的第一个程序

“留作纪念，仅供学习”

#python 版本 2.7.18 import sys reload(sys) sys.setdefaultencoding('utf-8') import random import requests import re import os import time import sys import datetime header = \[ {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'}, {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}, {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}, {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}, {'User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}\]

header 是用来伪装成浏览器发送请求，一般加上最好，header 信息可以通过浏览器查看，也可在网上搜索得到。
=========================================================

for x in range(8427249,8429942): u = str(x) print('当前动态URL：'.decode('utf-8').encode('gbk')+'.......................'+datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'..............'+u+'')#print(text\_1) attempts = 0 success = False while attempts < 50 and not success: try: req = requests.get('需要抓取的网址'+ u +'.html',headers = header\[random.randint(0,4)\], timeout=60)# 向目标网站发送 get 请求 success = True except: attempts += 1 print('抛出异常.......重试中..................'.decode('utf-8').encode('gbk')+datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'') time.sleep(10) if attempts == 50: break # 永久等待

    result = req.content
    result = result.decode('gbk')    #  查看网页源代码 看到 charset=gbk，即网页是用的 gbk 编码，故要用 gkb 的编码方式来解码，否则中文就会乱码。
    # print(result)
    title_re = re.compile(r'<h1>(.*?)</h1>') 
    title = re.findall(title_re,result)  
    #print(title)
    text_re = re.compile(r'<div id="htmlContent" class="contentbox clear">([\s\S]*?)</div>')
    text = re.findall(text_re,result)
    text = '\r\n'.join(text)  # 把两部分的正文连接成同一个个字符串
    text = text.split('\r\n')
    title = title[0]
    text.insert(0,title)# 插入标题
    #print(text)
    text_1 = []     # 添加一个空列表，用来装处理后的正文
    for sentence in text:
        sentence = sentence.strip()     # 去掉每一句两边的空格
        if ' ' in sentence:
            sentence = sentence.replace(' ','')    # 去掉句子中的  
            if '<br />' in sentence:
                sentence = sentence.replace('<br />','')    # 去掉句子中的 <br />
                text_1.append(sentence)
            else:
                text_1.append(sentence)
                
        elif '<br />' in sentence:
            sentence = sentence.replace('<br />','')
            text_1.append(sentence)
        elif '-->><p class="text-danger text-center mg0">本章未完，点击下一页继续阅读</p>' in sentence:
           
            text_1.append(sentence)
        else:
            text_1.append(sentence)
            count = text_1.count('')        # 统计列表中的空字符串
    for i in range(count):
        text_1.remove('')           # 移除所有的空字符串
    #print(text_1)
    
    file = open('./test.txt','a')
    for i in range(len(text_1)):
        s = str(text_1[i]).replace('[','').replace(']','')#去除[],这两行按数据不同，可以选择
        s = s.replace("'",'').replace(',','').replace('&nbsp;&nbsp;&nbsp;&nbsp;','') +'\n'   #去除单引号，逗号，每行末尾追加换行符
        file.write(s)
    file.close()
    print('抓取成功!'.decode('utf-8').encode('gbk')+'..............'+datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'')
    time.sleep(10)

---

*Originally published on [deval](https://paragraph.com/@coderun/coderun)*
