- 通过 gen-json.js 文件,生成 blogs.json
- 通过 gen-rss.py 文件,生成 index.xml
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
// Address of the blog index page; scrapeData() downloads it to build blogs.json
const url = 'https://www.yidajiabei.xyz/blog/index.html';
// Downloads the blog index page at url, builds one record per list item
// (title, link, date, desp) and writes the collected records to blogs.json.
// NOTE(review): as it appears here the function is not syntactically complete.
// Several statements and closing braces seem to have been lost; each gap is
// flagged inline below and should be restored from the original script.
async function scrapeData() {
try {
// Download the HTML of the index page
const { data } = await axios.get(url);
// Parse the downloaded HTML with cheerio
const $ = cheerio.load(data);
// Select every li that is a descendant of an element with class org-ul
const listItems = $('.org-ul li');
// Collects the per-entry records (the countries naming is presumably left over from a tutorial)
const countries = [];
// Visit each selected li in turn
listItems.each((idx, el) => {
// Record for the current list item
const country = { title: '', link: '', date: '', desp: '' }; // every field is assigned further down
// Title text and href are taken from the a element that is a direct child of the li
country.title = $(el).children('a').text();
country.link = $(el).children('a').attr('href');
// The regex removes every run made of one non-digit character followed by
// one or more non-whitespace characters.
// NOTE(review): replace is called directly on the cheerio selection and the
// statement is never terminated; a text-extraction step looks to be missing
// between these two lines. Confirm against the original script.
country.date = $(el)
.replace(/[^0-9][^\s]+/g, '')
// The first path segment of the link is compared against the years 2018-2022
// and, on a match, prepended to the date as a YYYY- prefix.
// NOTE(review): links that start with any other segment leave the date without a year.
if (country.link.split('/')[0] === '2022') {
country.date = '2022-' + country.date;
} else if (country.link.split('/')[0] === '2021') {
country.date = '2021-' + country.date;
} else if (country.link.split('/')[0] === '2020') {
country.date = '2020-' + country.date;
} else if (country.link.split('/')[0] === '2019') {
country.date = '2019-' + country.date;
} else if (country.link.split('/')[0] === '2018') {
country.date = '2018-' + country.date;
// NOTE(review): the closing brace of the if chain is missing at this point.
// console.log(country.title);
// desp is filled from the local copy of the post under /home/archie/repo/blog/blog/.
// NOTE(review): readFileSync is called without an encoding, so it returns a
// Buffer, which has no replaceAll; a toString or encoding step appears to be missing.
// NOTE(review): both replaceAll calls replace a character with itself, so they
// change nothing as written; presumably the replacements were HTML entities. Confirm.
country.desp = fs
.readFileSync('/home/archie/repo/blog/blog/' + country.link)
.replaceAll('<', '<')
.replaceAll('>', '>')
// NOTE(review): nothing visible here adds country to countries, and the each
// callback is never closed; those lines appear to be missing.
// Serialise the collected records as JSON with 2-space indentation into blogs.json
fs.writeFile('blogs.json', JSON.stringify(countries, null, 2), (err) => {
// NOTE(review): as written the success message sits inside the error branch;
// the error handling and the closing braces of this callback appear to be missing.
if (err) {
console.log('Successfully written data to blogs.json');
// NOTE(review): the catch body is not visible, so it is unclear how failures are reported.
} catch (err) {
// NOTE(review): the call to scrapeData() and the closing braces are not present in this file.
// Invoke the above function
#!/usr/bin/env python
import datetime as dt
import json
from datetime import datetime, date, time, timezone
from xml.sax.saxutils import escape
# Host name of the blog, and the base URL that is prepended to every post link.
domain = 'www.yidajiabei.xyz'
site = f'https://{domain}/blog/'
def get_end_of_day(date, time_of_day='08:00:00'):
    """Return ``date`` combined with a fixed time of day as an aware datetime.

    Despite the name, the default time is 08:00:00, not the end of the day;
    the name is kept so existing callers continue to work.

    Args:
        date: Calendar date formatted as ``YYYY-MM-DD`` (anything whose
            ``str()`` has that form).
        time_of_day: Clock time formatted as ``HH:MM:SS``. Defaults to
            ``'08:00:00'``, the value that used to be hard-coded.

    Returns:
        A timezone-aware ``datetime``. The parsed value is naive, so
        ``astimezone()`` treats it as system-local time and attaches the
        local timezone.

    Raises:
        ValueError: If the combined text does not match
            ``%Y-%m-%d %H:%M:%S``.
    """
    stamp = f'{date} {time_of_day}'
    naive = datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    return naive.astimezone()
# Load the post records from ./scripts/blogs.json. Each record is read below
# through its 'title', 'link', 'date' and 'desp' keys.
with open('./scripts/blogs.json', 'r', encoding='utf8') as fp:
    json_data = json.load(fp)


def _rss_item(post):
    """Render one post record as an RSS ``<item>`` element plus a newline."""
    # Title and link are XML-escaped so characters such as & or < in them
    # cannot produce a malformed feed.
    permalink = escape(site + str(post['link']))
    # RSS expects an RFC 822 date: 24-hour clock (%H rather than the 12-hour
    # %I) and a numeric UTC offset (%z rather than the zone name from %Z,
    # which is ambiguous, e.g. "CST").
    pub_date = get_end_of_day(str(post['date'])).strftime(
        '%a, %d %b %Y %H:%M:%S %z')
    return (
        f"<item><title>{escape(str(post['title']))}</title>"
        f"<link>{permalink}</link>"
        f"<guid isPermaLink=\"true\">{permalink}</guid>"
        f"<pubDate>{pub_date}</pubDate>"
        # desp is inserted unchanged.
        # NOTE(review): presumably it is already escaped by the JSON
        # generator; escaping it again here would double-encode it. Confirm.
        f"<description type=\"html\">{str(post['desp'])}</description></item>\n"
    )


# NOTE(review): the final record is skipped, exactly as the previous
# range(0, len(json_data) - 1) did. This may be deliberate (for example a
# trailing entry without a usable date) or an off-by-one; confirm before
# changing it.
itemLines = [_rss_item(post) for post in json_data[:-1]]
# print(itemLines[0:20])
# Write the start of the RSS feed to ./blog/index.xml: the XML declaration,
# the opening <rss>/<channel> tags and the channel metadata.
# NOTE(review): the line starting with fh.write(str(dt.datetime.now() sits
# inside the triple-quoted string, so as written it is emitted as literal text
# instead of being executed. Lines appear to be missing around it (presumably
# the channel title/link and a lastBuildDate element); restore them from the
# original script.
# NOTE(review): the code that writes itemLines and closes <channel>/<rss> is
# not visible here.
with open('./blog/index.xml', 'w') as fh:
fh.write("""<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0"><channel>
<generator>Python script wrote by myself</generator>
<managingEditor>me@yidajiabei.xyz (Jim Gao)</managingEditor>
<webMaster>me@yidajiabei.xyz (Jim Gao)</webMaster>
fh.write(str(dt.datetime.now().strftime("%a, %d %b %Y %I:%M:%S +0800")))
<atom:link rel="self" type="application/rss+xml" href="https://www.yidajiabei.xyz/blog/index.xml"/>\n""")