Generating RSS by Scraping the Page
Generation steps:
- Run gen-json.js to produce blogs.json, which holds one record per post (the expected shape of a record is sketched right after this list)
- Run gen-rss.py to read blogs.json and produce the feed file index.xml
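Each blogs.json record carries the four fields that gen-rss.py turns into one <item>: title, link, date, and desp. Below is a minimal sketch of that shape with made-up example values (the real ones come from the scraped index page and the local HTML files), plus a quick sanity check:

import json

# Hypothetical example of one blogs.json record; real values are produced by gen-json.js.
example = {
    "title": "Some post title",                # text of the <a> element in the index list
    "link": "2022/some-post.html",             # href, relative to https://www.yidajiabei.xyz/blog/
    "date": "2022-03-08",                      # YYYY-MM-DD rebuilt from the link and the item text
    "desp": "&lt;p&gt;escaped body&lt;/p&gt;"  # <body> content with < and > escaped for XML
}

# Check that a generated blogs.json has exactly the fields gen-rss.py expects.
with open("blogs.json", encoding="utf8") as fp:
    for record in json.load(fp):
        assert set(record) == {"title", "link", "date", "desp"}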
gen-json.js:
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');

// Blog index page to scrape for blogs.json
const url = 'https://www.yidajiabei.xyz/blog/index.html';

async function scrapeData() {
  try {
    // Fetch the HTML of the page we want to scrape
    const { data } = await axios.get(url);
    // Load the HTML we fetched in the previous line
    const $ = cheerio.load(data);
    // Select all the list items under the org-ul class
    const listItems = $('.org-ul li');
    // Holds the data for every post
    const posts = [];
    // Loop through the list items we selected
    listItems.each((idx, el) => {
      // Object holding the data for one post
      const post = { title: '', link: '', date: '', desp: '' };
      // Title and link come from the <a> element of the list item
      post.title = $(el).children('a').text();
      post.link = $(el).children('a').attr('href');
      // Keep only the digits of the list item text and regroup them as MM-DD
      post.date = $(el)
        .text()
        .replace(/[^0-9][^\s]+/g, '')
        .split('-')
        .join('')
        .match(/.{1,2}/g)
        .join('-');
      // Prefix the year taken from the first path segment of the link (e.g. 2022/slug.html)
      const year = post.link.split('/')[0];
      if (/^20\d{2}$/.test(year)) {
        post.date = year + '-' + post.date;
      }
      // Read the local HTML file of the post, keep the <body> content,
      // and escape < and > so it can be embedded in the RSS XML
      post.desp = fs
        .readFileSync('/home/archie/repo/blog/blog/' + post.link)
        .toString()
        .match(/<body[^>]*>([\w|\W]*)<\/body>/im)[1]
        .replaceAll('<', '&lt;')
        .replaceAll('>', '&gt;')
        .trim();
      posts.push(post);
    });
    // Write the posts array to blogs.json
    fs.writeFile('blogs.json', JSON.stringify(posts, null, 2), (err) => {
      if (err) {
        console.error(err);
        return;
      }
      console.log('Successfully written data to blogs.json');
    });
  } catch (err) {
    console.error(err);
  }
}
// Invoke the above function
scrapeData();
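The two fragile parts of gen-json.js are rebuilding the date (month and day come from the digits of the list item text, the year from the first path segment of the link) and escaping the post body so it can sit inside the RSS XML. Here is a rough Python sketch of those two steps only, handy for testing the logic outside Node; the sample link and item text are made-up placeholders, not values taken from the real page:

import re
from html import escape

def rebuild_date(link, item_text):
    # Keep only the digits of the item text and regroup them as MM-DD,
    # then prefix the year found in the link (mirrors gen-json.js).
    digits = re.sub(r'[^0-9]', '', item_text)
    month_day = '-'.join(re.findall(r'.{1,2}', digits))
    year = link.split('/')[0]
    return f'{year}-{month_day}' if re.fullmatch(r'20\d{2}', year) else month_day

def escape_body(html_page):
    # Extract the <body> content and escape it for embedding in <description>.
    body = re.search(r'<body[^>]*>(.*)</body>', html_page, re.S | re.I).group(1)
    return escape(body).strip()

print(rebuild_date('2022/some-post.html', '03-08'))   # -> 2022-03-08
print(escape_body('<html><body><p>a &amp; b</p></body></html>'))

Note that html.escape() also escapes &, which the replaceAll calls in gen-json.js do not; doing the same there would keep stray ampersands in a post body from producing invalid XML.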
gen-rss.py:
#!/usr/bin/env python

import json
import datetime as dt
from datetime import datetime

domain = 'www.yidajiabei.xyz'

site = 'https://%s/blog/' % (domain)

def get_end_of_day(date):
    # Return 08:00 local time on the given day as a timezone-aware datetime
    return datetime.strptime(
        f'{date} 08:00:00', '%Y-%m-%d %H:%M:%S').astimezone()

# Read the JSON file; json.load returns the list of post dicts
with open('./scripts/blogs.json', 'r', encoding='utf8') as fp:
    json_data = json.load(fp)
    # Build one <item> element per post; pubDate must be an RFC 822 date
    itemLines = [
        "<item><title>" + str(post['title']) + "</title>"
        + "<link>" + site + str(post['link']) + "</link>"
        + "<guid isPermaLink=\"true\">" + site + str(post['link']) + "</guid>"
        + "<pubDate>" + get_end_of_day(str(post['date'])).strftime("%a, %d %b %Y %H:%M:%S %z") + "</pubDate>"
        + "<description type=\"html\">" + str(post['desp']) + "</description></item>\n"
        for post in json_data
    ]

with open('./blog/index.xml', 'w', encoding='utf8') as fh:
    fh.write("""<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"><channel>
<title>一大加贝</title>
<link>https://www.yidajiabei.xyz/blog/</link>
<description>学习技术,热爱生活</description>
<generator>Python script written by myself</generator>
<language>zh-CN</language>
<managingEditor>me@yidajiabei.xyz (Jim Gao)</managingEditor>
<webMaster>me@yidajiabei.xyz (Jim Gao)</webMaster>
<copyright>在保留本文作者及本文链接的前提下,非商业用途随意转载分享。</copyright>
<lastBuildDate>""")
    fh.write(dt.datetime.now().strftime("%a, %d %b %Y %H:%M:%S +0800"))
    fh.write("""</lastBuildDate>
<atom:link rel="self" type="application/rss+xml" href="https://www.yidajiabei.xyz/blog/index.xml"/>\n""")
    # Only the first 20 items are written to the feed
    fh.writelines(itemLines[0:20])
    fh.write("""</channel>
</rss>""")
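One detail worth keeping an eye on: <pubDate> and <lastBuildDate> must be RFC 822 dates such as "Tue, 08 Mar 2022 08:00:00 +0800", and strftime's %a/%b names are locale-dependent, so a non-English locale would quietly break the feed. A small alternative sketch using the standard library's email.utils.format_datetime, which emits that format directly from an aware datetime:

from datetime import datetime
from email.utils import format_datetime

def rfc822_pubdate(date_str):
    # 08:00 local time on the given day, formatted per RFC 822 regardless of locale.
    aware = datetime.strptime(f'{date_str} 08:00:00', '%Y-%m-%d %H:%M:%S').astimezone()
    return format_datetime(aware)

print(rfc822_pubdate('2022-03-08'))                   # e.g. Tue, 08 Mar 2022 08:00:00 +0800
print(format_datetime(datetime.now().astimezone()))   # usable as <lastBuildDate>

If adopted, this would simply replace the two hand-written strftime patterns in gen-rss.py.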
References: