<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/">
    <channel>
        <title>deval</title>
        <link>https://paragraph.com/@coderun</link>
        <description>Posts by deval on Paragraph (@coderun)</description>
        <lastBuildDate>Mon, 27 Apr 2026 20:56:13 GMT</lastBuildDate>
        <docs>https://validator.w3.org/feed/docs/rss2.html</docs>
        <generator>https://github.com/jpmonette/feed</generator>
        <language>en</language>
        <image>
            <title>deval</title>
            <url>https://storage.googleapis.com/papyrus_images/1cc63e7f3c898ec9b13ff2da08ea30b722a1ce7f2cd15dc9798a0741f38dfd5a.jpg</url>
            <link>https://paragraph.com/@coderun</link>
        </image>
        <copyright>All rights reserved</copyright>
        <item>
            <title><![CDATA[电子书抓取脚本 | Coderun]]></title>
            <link>https://paragraph.com/@coderun/coderun</link>
            <guid isPermaLink="false">v3b4kdKqWRr9dXTquR3y</guid>
            <pubDate>Sat, 07 May 2022 04:05:19 GMT</pubDate>
            <description><![CDATA[https://www.coderun.zone/2021/01/23/%E7%94%B5%E5%AD%90%E4%B9%A6%E6%8A%93%E5%8F%96%E8%84%9A%E6%9C%AC/ 这是我在进入行业的第一个程序 “留作纪念，仅供学习” #python 版本 2.7.18 import sys reload(sys) sys.setdefaultencoding(&apos;utf-8&apos;) import random import requests import re import os import time import sys import datetime header = [ {&apos;User-Agent&apos;: &apos;Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36&apos;}, {&apos;User-...]]></description>
            <content:encoded><![CDATA[<p><a target="_blank" rel="noopener noreferrer nofollow ugc" class="dont-break-out" href="https://www.coderun.zone/2021/01/23/%E7%94%B5%E5%AD%90%E4%B9%A6%E6%8A%93%E5%8F%96%E8%84%9A%E6%9C%AC/">https://www.coderun.zone/2021/01/23/%E7%94%B5%E5%AD%90%E4%B9%A6%E6%8A%93%E5%8F%96%E8%84%9A%E6%9C%AC/</a></p><p>这是我在进入行业的第一个程序</p><p>“留作纪念，仅供学习”</p><p>#python 版本 2.7.18 import sys reload(sys) sys.setdefaultencoding(&apos;utf-8&apos;) import random import requests import re import os import time import sys import datetime header = [ {&apos;User-Agent&apos;: &apos;Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36&apos;}, {&apos;User-Agent&apos;:&apos;Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50&apos;}, {&apos;User-Agent&apos;:&apos;Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)&apos;}, {&apos;User-Agent&apos;:&apos;Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1&apos;}, {&apos;User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36&apos;}]</p><h1 id="h-header-header" class="text-4xl font-header !mt-8 !mb-4 first:!mt-0 first:!mb-0">header 是用来伪装成浏览器发送请求，一般加上最好，header 信息可以通过浏览器查看，也可在网上搜索得到。</h1><p>for x in range(8427249,8429942): u = str(x) print(&apos;当前动态URL：&apos;.decode(&apos;utf-8&apos;).encode(&apos;gbk&apos;)+&apos;.......................&apos;+datetime.datetime.now().strftime(&quot;%Y-%m-%d %H:%M:%S&quot;)+&apos;..............&apos;+u+&apos;&apos;)#print(text_1) attempts = 0 success = False while attempts &lt; 50 and not success: try: req = requests.get(&apos;需要抓取的网址&apos;+ u +&apos;.html&apos;,headers = header[random.randint(0,4)], timeout=60)# 向目标网站发送 get 请求 success = True except: attempts += 1 
print(&apos;抛出异常.......重试中..................&apos;.decode(&apos;utf-8&apos;).encode(&apos;gbk&apos;)+datetime.datetime.now().strftime(&quot;%Y-%m-%d %H:%M:%S&quot;)+&apos;&apos;) time.sleep(10) if attempts == 50: break # 永久等待</p><pre data-type="codeBlock" text="result = req.content
result = result.decode(&apos;gbk&apos;)    #  查看网页源代码 看到 charset=gbk，即网页是用的 gbk 编码，故要用 gbk 的编码方式来解码，否则中文就会乱码。
# print(result)
title_re = re.compile(r&apos;&lt;h1&gt;(.*?)&lt;/h1&gt;&apos;) 
title = re.findall(title_re,result)  
#print(title)
text_re = re.compile(r&apos;&lt;div id=&quot;htmlContent&quot; class=&quot;contentbox clear&quot;&gt;([\s\S]*?)&lt;/div&gt;&apos;)
text = re.findall(text_re,result)
text = &apos;\r\n&apos;.join(text)  # 把两部分的正文连接成同一个字符串
text = text.split(&apos;\r\n&apos;)
title = title[0]
text.insert(0,title)# 插入标题
#print(text)
text_1 = []     # 添加一个空列表，用来装处理后的正文
for sentence in text:
    sentence = sentence.strip()     # 去掉每一句两边的空格
    if &apos; &apos; in sentence:
        sentence = sentence.replace(&apos; &apos;,&apos;&apos;)    # 去掉句子中的  
        if &apos;&lt;br /&gt;&apos; in sentence:
            sentence = sentence.replace(&apos;&lt;br /&gt;&apos;,&apos;&apos;)    # 去掉句子中的 &lt;br /&gt;
            text_1.append(sentence)
        else:
            text_1.append(sentence)
            
    elif &apos;&lt;br /&gt;&apos; in sentence:
        sentence = sentence.replace(&apos;&lt;br /&gt;&apos;,&apos;&apos;)
        text_1.append(sentence)
    elif &apos;--&gt;&gt;&lt;p class=&quot;text-danger text-center mg0&quot;&gt;本章未完，点击下一页继续阅读&lt;/p&gt;&apos; in sentence:
       
        text_1.append(sentence)
    else:
        text_1.append(sentence)
        count = text_1.count(&apos;&apos;)        # 统计列表中的空字符串
for i in range(count):
    text_1.remove(&apos;&apos;)           # 移除所有的空字符串
#print(text_1)

file = open(&apos;./test.txt&apos;,&apos;a&apos;)
for i in range(len(text_1)):
    s = str(text_1[i]).replace(&apos;[&apos;,&apos;&apos;).replace(&apos;]&apos;,&apos;&apos;)#去除[],这两行按数据不同，可以选择
    s = s.replace(&quot;&apos;&quot;,&apos;&apos;).replace(&apos;,&apos;,&apos;&apos;).replace(&apos;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&apos;,&apos;&apos;) +&apos;\n&apos;   #去除单引号，逗号，每行末尾追加换行符
    file.write(s)
file.close()
print(&apos;抓取成功!&apos;.decode(&apos;utf-8&apos;).encode(&apos;gbk&apos;)+&apos;..............&apos;+datetime.datetime.now().strftime(&quot;%Y-%m-%d %H:%M:%S&quot;)+&apos;&apos;)
time.sleep(10)
"><code>result <span class="hljs-operator">=</span> req.content
result <span class="hljs-operator">=</span> result.decode(<span class="hljs-string">'gbk'</span>)    #  查看网页源代码 看到 charset<span class="hljs-operator">=</span>gbk，即网页是用的 gbk 编码，故要用 gbk 的编码方式来解码，否则中文就会乱码。
# print(result)
title_re <span class="hljs-operator">=</span> re.compile(r<span class="hljs-string">'&#x3C;h1>(.*?)&#x3C;/h1>'</span>) 
title <span class="hljs-operator">=</span> re.findall(title_re,result)  
#print(title)
text_re <span class="hljs-operator">=</span> re.compile(r<span class="hljs-string">'&#x3C;div id="htmlContent" class="contentbox clear">([\s\S]*?)&#x3C;/div>'</span>)
text <span class="hljs-operator">=</span> re.findall(text_re,result)
text <span class="hljs-operator">=</span> <span class="hljs-string">'\r\n'</span>.join(text)  # 把两部分的正文连接成同一个字符串
text <span class="hljs-operator">=</span> text.split(<span class="hljs-string">'\r\n'</span>)
title <span class="hljs-operator">=</span> title[<span class="hljs-number">0</span>]
text.insert(<span class="hljs-number">0</span>,title)# 插入标题
#print(text)
text_1 <span class="hljs-operator">=</span> []     # 添加一个空列表，用来装处理后的正文
<span class="hljs-keyword">for</span> sentence in text:
    sentence <span class="hljs-operator">=</span> sentence.strip()     # 去掉每一句两边的空格
    <span class="hljs-keyword">if</span> <span class="hljs-string">' '</span> in sentence:
        sentence <span class="hljs-operator">=</span> sentence.replace(<span class="hljs-string">' '</span>,<span class="hljs-string">''</span>)    # 去掉句子中的  
        <span class="hljs-keyword">if</span> <span class="hljs-string">'&#x3C;br />'</span> in sentence:
            sentence <span class="hljs-operator">=</span> sentence.replace(<span class="hljs-string">'&#x3C;br />'</span>,<span class="hljs-string">''</span>)    # 去掉句子中的 <span class="hljs-operator">&#x3C;</span>br <span class="hljs-operator">/</span><span class="hljs-operator">></span>
            text_1.append(sentence)
        <span class="hljs-keyword">else</span>:
            text_1.append(sentence)
            
    elif <span class="hljs-string">'&#x3C;br />'</span> in sentence:
        sentence <span class="hljs-operator">=</span> sentence.replace(<span class="hljs-string">'&#x3C;br />'</span>,<span class="hljs-string">''</span>)
        text_1.append(sentence)
    elif <span class="hljs-string">'-->>&#x3C;p class="text-danger text-center mg0">本章未完，点击下一页继续阅读&#x3C;/p>'</span> in sentence:
       
        text_1.append(sentence)
    <span class="hljs-keyword">else</span>:
        text_1.append(sentence)
        count <span class="hljs-operator">=</span> text_1.count(<span class="hljs-string">''</span>)        # 统计列表中的空字符串
<span class="hljs-keyword">for</span> i in range(count):
    text_1.remove(<span class="hljs-string">''</span>)           # 移除所有的空字符串
#print(text_1)

file <span class="hljs-operator">=</span> open(<span class="hljs-string">'./test.txt'</span>,<span class="hljs-string">'a'</span>)
<span class="hljs-keyword">for</span> i in range(len(text_1)):
    s <span class="hljs-operator">=</span> str(text_1[i]).replace(<span class="hljs-string">'['</span>,<span class="hljs-string">''</span>).replace(<span class="hljs-string">']'</span>,<span class="hljs-string">''</span>)#去除[],这两行按数据不同，可以选择
    s <span class="hljs-operator">=</span> s.replace(<span class="hljs-string">"'"</span>,<span class="hljs-string">''</span>).replace(<span class="hljs-string">','</span>,<span class="hljs-string">''</span>).replace(<span class="hljs-string">'&#x26;nbsp;&#x26;nbsp;&#x26;nbsp;&#x26;nbsp;'</span>,<span class="hljs-string">''</span>) <span class="hljs-operator">+</span><span class="hljs-string">'\n'</span>   #去除单引号，逗号，每行末尾追加换行符
    file.write(s)
file.close()
print(<span class="hljs-string">'抓取成功!'</span>.decode(<span class="hljs-string">'utf-8'</span>).encode(<span class="hljs-string">'gbk'</span>)<span class="hljs-operator">+</span><span class="hljs-string">'..............'</span><span class="hljs-operator">+</span>datetime.datetime.now().strftime(<span class="hljs-string">"%Y-%m-%d %H:%M:%S"</span>)<span class="hljs-operator">+</span><span class="hljs-string">''</span>)
time.sleep(<span class="hljs-number">10</span>)
</code></pre>]]></content:encoded>
            <author>coderun@newsletter.paragraph.com (deval)</author>
            <enclosure url="https://storage.googleapis.com/papyrus_images/da6624fb7702b575e4a7b713305ae73ed0fea12ed0a3af2c2d1598b918ad5d54.jpg" length="0" type="image/jpeg"/>
        </item>
    </channel>
</rss>