关于Python爬虫的初次尝试

萌鼠归档页面链接爬取

# Crawl the moerats.com archive page and save every post's title and link,
# once as JSON lines and once as str(dict) lines, into two output files.
import requests
from bs4 import BeautifulSoup
from lxml import html
import json
import xml

url = "https://www.moerats.com/archive.html"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, 'lxml')
# Every archive entry on this page is an <a class="text-lt"> element.
links = soup.find_all('a', class_="text-lt")
#print(resp.text)
#print(links)

# Open both output files once, outside the loop: the original re-opened
# them on every iteration and only the last handles were ever closed.
with open('dictjson.text', 'a', encoding='utf-8') as fw, \
     open('strdict.text', 'a', encoding='utf-8') as fw2:
    for a in links:
        entry = {'name': a.get_text(), 'url': a.get('href')}
        # ensure_ascii=False keeps Chinese titles human-readable in the file
        fw.write(json.dumps(entry, ensure_ascii=False) + '\n')
        fw2.write(str(entry) + '\n')

使用requests库进行请求 BeautifulSoup进行网页的解析

最终字典使用转换为json和str的方式分别存储进了两个不同的text文件中

本站归档页抓取

# Crawl the southcat.net archive page and save each entry's title and the
# post URLs found inside it, one JSON object per line.
import requests
from bs4 import BeautifulSoup
import re
import bs4
import json

url = 'https://southcat.net/archives/'
headers = {
    'user-agent' : 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'}
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, 'lxml')
# Each archive entry lives in a <div class="archives-list-meta">.
# NOTE: the original also called soup.find_all('href', ...) — 'href' is an
# attribute, not a tag, so that call always returned [] and is dropped here.
entries = soup.find_all('div', class_="archives-list-meta")

# Post URLs look like https://southcat.net/<number>. Raw string avoids the
# invalid-escape warning for \d; the dot is escaped; compiled once, outside
# the loop.
post_url = re.compile(r'https://southcat\.net/[1-9]\d*')

# Open the output file once (the original re-opened it every iteration).
with open('dictjson1.text', 'a', encoding='utf-8') as fw:
    for div in entries:
        entry = {
            'name': div.get_text(),
            'url': post_url.findall(str(div)),
        }
        print(entry)
        # ensure_ascii=False keeps Chinese titles readable in the file
        fw.write(json.dumps(entry, ensure_ascii=False) + '\n')

因为本站归档页面的a标签没有像萌鼠那样设置class,抓取稍微费了点功夫。此外还存在一个问题:网址是用正则匹配出来的,而标题并没有正确地完成匹配,残留了一部分内容没有删除掉。

原创文章,作者:南猫,如若转载,请注明出处:https://southcat.net/2524/


不妨点个广告再走嘛

Leave a Reply

Your email address will not be published. Required fields are marked *