Crawling links from the Moerats (萌鼠) archive page
import requests
from bs4 import BeautifulSoup
import json

url = "https://www.moerats.com/archive.html"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

f = requests.get(url, headers=headers)
html = f.text
soup = BeautifulSoup(html, 'lxml')           # lxml parser; needs the lxml package installed
name = soup.find_all('a', class_="text-lt")  # every archive link on this page carries class="text-lt"
print(html)
print(name)

item = {}  # use a name other than the built-in dict
for i in name:
    item['name'] = i.get_text()   # link text is the post title
    item['url'] = i.get('href')   # href attribute is the post URL
    fw = open('dictjson.text', 'a')
    shuju = json.dumps(item)      # one JSON object per line
    fw.write(shuju + '\n')
    fw2 = open('strdict.text', 'a')
    fw2.write(str(item) + '\n')   # plain str() of the dict, for comparison
    fw.close()
    fw2.close()
The request is made with the requests library and the page is parsed with BeautifulSoup.
The final dictionary is converted to JSON and to a plain str and written into two separate text files.
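A quick way to see the difference between the two formats is to read the JSON-lines file back: each line parses directly with json.loads, which the str(dict) lines would not. A minimal sketch, assuming the dictjson.text file produced by the script above:

import json

# Read dictjson.text back into a list of dicts (one JSON object per line).
with open('dictjson.text', 'r') as f:
    posts = [json.loads(line) for line in f if line.strip()]

for post in posts:
    print(post['name'], post['url'])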
Scraping this site's (southcat.net) archive page
import requests
from bs4 import BeautifulSoup
import re
import json

url = 'https://southcat.net/archives/'
headers = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'}

f = requests.get(url, headers=headers)
html = f.text
soup = BeautifulSoup(html, 'lxml')
name = soup.find_all('div', class_="archives-list-meta")  # each of these divs wraps one archive entry
# note: find_all('href', ...) would look for <href> tags, which don't exist
# (href is an attribute), so that line always returned an empty list and was dropped
# print(html)
# print(name)

item = {}
for i in name:
    item['name'] = i.get_text()  # get_text() on the whole div, so extra text comes along with the title (see the issue noted below)
    item['url'] = re.findall(r'https://southcat.net/[1-9]\d*', str(i))  # pull the post URL out with a regex
    print(item)
    fw = open('dictjson1.text', 'a')
    shuju = json.dumps(item)
    fw.write(shuju + '\n')
    fw.close()
Because the a tags on this site's archive page don't have a class set the way the Moerats page does, scraping it took a bit more effort. One issue remains: the URL is matched with a regular expression, and the title isn't matched cleanly, so some leftover content is still there that hasn't been stripped out.
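One possible fix is a sketch like the following (untested against the live page): if each archives-list-meta div wraps an <a> tag, the title and URL can be read from that tag directly, avoiding both the regex and the leftover text. The filename dictjson2.text and the nested-<a> layout are assumptions, not something confirmed by the code above.

import requests
from bs4 import BeautifulSoup
import json

# Sketch of an alternative extraction: assumes each
# <div class="archives-list-meta"> contains an <a> tag whose text is the
# post title and whose href is the post URL.
url = 'https://southcat.net/archives/'
headers = {'user-agent': 'Mozilla/5.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')

with open('dictjson2.text', 'a') as fw:
    for div in soup.find_all('div', class_='archives-list-meta'):
        a = div.find('a')
        if a is None:
            continue  # skip entries without a link
        item = {'name': a.get_text(strip=True),  # strip=True trims surrounding whitespace
                'url': a.get('href')}
        fw.write(json.dumps(item) + '\n')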