https://www.coderun.zone/2021/01/23/%E7%94%B5%E5%AD%90%E4%B9%A6%E6%8A%93%E5%8F%96%E8%84%9A%E6%9C%AC/
这是我在进入行业的第一个程序
“留作纪念,仅供学习”
# Python version: 2.7.18
import sys

# Python 2 only: make UTF-8 the implicit str <-> unicode codec so that mixed
# byte/text concatenation and str(unicode_value) do not raise on Chinese text.
# The builtin reload() and sys.setdefaultencoding() do not exist on Python 3,
# where text is already unicode, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

import datetime
import os
import random
import re
import time

import requests

# Pool of request headers; the scraper picks one entry per request so that
# consecutive requests do not all carry the same User-Agent.
# Every entry must be a dict. The last entry used to be written as
# {'User-Agent: Mozilla/5.0 ...'} (a one-element set, because the quote and
# colon were misplaced), which cannot be used as a headers mapping.
header = [
    {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'},
]
# --- Scraper configuration ---------------------------------------------------
# Placeholder for the site prefix; a page URL is BASE_URL + <page id> + '.html'.
BASE_URL = u'需要抓取的网址'
FIRST_PAGE_ID = 8427249
LAST_PAGE_ID = 8429942      # exclusive upper bound, as in range()
OUTPUT_PATH = './test.txt'
MAX_ATTEMPTS = 50           # download attempts per page before giving up
RETRY_DELAY = 10            # seconds to wait after a failed attempt
REQUEST_TIMEOUT = 60        # seconds, passed to requests.get
PAGE_DELAY = 10             # seconds to wait between two pages

# Patterns are compiled once here instead of once per page.
TITLE_RE = re.compile(r'<h1>(.*?)</h1>')
CONTENT_RE = re.compile(
    r'<div id="htmlContent" class="contentbox clear">([\s\S]*?)</div>')
LINE_BREAK_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)
NEWLINE_RE = re.compile(r'\r\n|\r|\n')
# Characters dropped from every line right before it is written: [ ] ' , space
OUTPUT_NOISE_RE = re.compile(r"[\[\]', ]")
# Blank-space tokens removed from the chapter text.
# NOTE(review): the original literal renders as a plain space; it looks like
# it was an HTML entity swallowed by the blog renderer, so the entity and the
# non-breaking space character are removed as well -- confirm against a page.
BLANK_TOKENS = (u'&nbsp;', u'\xa0', u' ')


def timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def fetch_page(url, max_attempts=MAX_ATTEMPTS, retry_delay=RETRY_DELAY,
               timeout=REQUEST_TIMEOUT):
    """Download *url* and return the raw response body, or None on failure.

    Each attempt sends a GET request with a randomly chosen entry of the
    module-level ``header`` list. Only ``requests`` errors are retried, so
    Ctrl-C and programming errors are no longer swallowed by a bare except.
    None is returned once *max_attempts* attempts have failed; the caller
    must not fall through and reuse an earlier response.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(
                url, headers=random.choice(header), timeout=timeout)
        except requests.exceptions.RequestException:
            print(u'抛出异常.......重试中..................' + timestamp())
            if attempt < max_attempts:
                time.sleep(retry_delay)
        else:
            return response.content
    return None


def extract_lines(html):
    """Return the cleaned text lines of one chapter page, title first.

    *html* is the already decoded page source. The first ``<h1>`` is the
    title; every ``htmlContent`` div contributes body text, which is split
    on line breaks. ``<br>`` tags, blank-space tokens and surrounding
    whitespace are removed from each line and empty lines are dropped.
    Lines carrying the "chapter continues on the next page" marker are kept,
    as before. An empty list is returned when the page has no ``<h1>``.
    """
    titles = TITLE_RE.findall(html)
    if not titles:
        return []
    body = '\r\n'.join(CONTENT_RE.findall(html))
    cleaned = []
    for raw in [titles[0]] + NEWLINE_RE.split(body):
        # Tags go first: stripping ' ' beforehand would rewrite '<br />' as
        # '<br/>' and the tag would stay in the output.
        line = LINE_BREAK_TAG_RE.sub('', raw)
        for token in BLANK_TOKENS:
            line = line.replace(token, '')
        line = line.strip()
        if line:
            cleaned.append(line)
    return cleaned


def format_line(line):
    """Return *line* without ``[ ] ' ,`` and spaces, terminated by a newline.

    This filter is optional and depends on the scraped data; remove it if
    commas or quotes in the text must be preserved.
    """
    return OUTPUT_NOISE_RE.sub('', line) + '\n'


def append_chapter(lines, path=OUTPUT_PATH):
    """Append *lines* to the UTF-8 text file at *path*, one per line."""
    # Local import: io.open behaves the same on Python 2 and 3 and encodes
    # explicitly instead of relying on the process default encoding.
    import io
    with io.open(path, 'a', encoding='utf-8') as out:
        out.writelines(format_line(line) for line in lines)


def main():
    """Scrape every page id in [FIRST_PAGE_ID, LAST_PAGE_ID) into OUTPUT_PATH."""
    for page_id in range(FIRST_PAGE_ID, LAST_PAGE_ID):
        page = str(page_id)
        # Text is printed as unicode so the interpreter encodes it for the
        # actual console instead of always emitting GBK bytes.
        print(u'当前动态URL:' + '.......................' + timestamp()
              + '..............' + page)
        raw = fetch_page(BASE_URL + page + '.html')
        if raw is None:
            # All attempts failed: skip this page rather than re-parsing the
            # previous response (or crashing when there is none yet).
            print(u'重试次数已用尽,跳过该页..............' + timestamp())
            continue
        # The site declares charset=gbk. 'replace' keeps one stray byte from
        # aborting the whole run.
        lines = extract_lines(raw.decode('gbk', 'replace'))
        if lines:
            append_chapter(lines)
            print(u'抓取成功!' + '..............' + timestamp())
        else:
            print(u'未找到标题,跳过该页..............' + timestamp())
        time.sleep(PAGE_DELAY)


if __name__ == '__main__':
    main()

