add ci rank

This commit is contained in:
xinglie.lkf
2019-07-29 17:59:45 +08:00
parent 55e275742f
commit 7ca8bd8032
24 changed files with 168793 additions and 0 deletions

35
rank/README.md Normal file
View File

@ -0,0 +1,35 @@
诗词搜索结果
-----
根据该 issue([关于诗词的知名度](https://github.com/chinese-poetry/chinese-poetry/issues/115))整理的数据:以“作者 作品名”作为搜索关键字,通过搜索引擎搜索,并把搜索结果数量作为排行的一个维度制作而成。目前收集了“百度、必应、360搜索和谷歌”等`4`个引擎的数据。
一般来讲,搜索结果越多,表示该诗词越知名。
## 说明
该目录下收集了`ci`、`json`两个文件夹下的诗词搜索数据,且该目录下的文件与`ci`、`json`中的文件一一对应,比如`./ci/ci.song.rank.8000.json`对应`../ci/ci.song.8000.json`。
不但文件一一对应,文件中的内容也是一一对应的,即`./ci/ci.song.rank.8000.json`中数组的第`n`条和`../ci/ci.song.8000.json`中数组的第`n`条对应。
Q: 为什么没有把该结果和诗词数据放在一起?
A: 为了保持诗词数据的纯洁,并非所有数据都适合塞进去。对于这种并非所有人都需要的数据,通过一一对应关系,可以很方便地给原诗词扩展数据,同时也保证了原诗词的纯洁度。所以当需要根据知名度对诗词排序时,把该数据附加到原有诗词数据上即可。
## 数据形式
每个 JSON 文件1000条记录. 为了举例, 删除了余下999条.
```js
[
{
"author": "石孝友",
"rhythmic": "玉楼春",
"baidu": 77500, //百度搜索结果条数
"so360": 1060,//360搜索结果条数
"google": 991000,//谷歌搜索结果条数
"bing": 20//必应搜索结果条数
}
]
```
搜索引擎的结果数据仅供参考,不同时间搜出来的数据未必一致

8002
rank/ci/ci.song.rank.0.json Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,402 @@
[
{
"baidu": 1050,
"so360": 40,
"google": 19500,
"bing": 510,
"author": "臧馀庆",
"rhythmic": "感皇恩"
},
{
"baidu": 1050,
"so360": 40,
"google": 19500,
"bing": 510,
"author": "臧馀庆",
"rhythmic": "感皇恩"
},
{
"baidu": 1050,
"so360": 40,
"google": 19500,
"bing": 510,
"author": "臧馀庆",
"rhythmic": "感皇恩"
},
{
"baidu": 382000,
"so360": 67,
"google": 1210000,
"bing": 51900,
"author": "胡于",
"rhythmic": "鹧鸪天"
},
{
"baidu": 382000,
"so360": 67,
"google": 1210000,
"bing": 51900,
"author": "胡于",
"rhythmic": "鹧鸪天"
},
{
"baidu": 382000,
"so360": 67,
"google": 1210000,
"bing": 51900,
"author": "胡于",
"rhythmic": "鹧鸪天"
},
{
"baidu": 382000,
"so360": 67,
"google": 1210000,
"bing": 51900,
"author": "胡于",
"rhythmic": "鹧鸪天"
},
{
"baidu": 59,
"so360": 30,
"google": 8,
"bing": 886,
"author": "申国章",
"rhythmic": "鹧鸪天"
},
{
"baidu": 1050,
"so360": 45,
"google": 27300,
"bing": 592,
"author": "陈日章",
"rhythmic": "鹧鸪天"
},
{
"baidu": 46,
"so360": 52,
"google": 28100,
"bing": 4980,
"author": "李景良",
"rhythmic": "鹧鸪天"
},
{
"baidu": 90,
"so360": 48,
"google": 27200,
"bing": 525000,
"author": "张思济",
"rhythmic": "鹧鸪天"
},
{
"baidu": 31300,
"so360": 546,
"google": 1660000,
"bing": 135000,
"author": "李夫人",
"rhythmic": "减字木兰花"
},
{
"baidu": 75100,
"so360": 2370,
"google": 5720000,
"bing": 27600,
"author": "李夫人",
"rhythmic": "蝶恋花"
},
{
"baidu": 57,
"so360": 68,
"google": 230000,
"bing": 2170000,
"author": "李夫人",
"rhythmic": "瑞鹧鸪"
},
{
"baidu": 60,
"so360": 55,
"google": 116000,
"bing": 575,
"author": "张藻",
"rhythmic": "望海潮"
},
{
"baidu": 4560,
"so360": 149,
"google": 1400,
"bing": 3730,
"author": "胡文卿",
"rhythmic": "虞美人"
},
{
"baidu": 4560,
"so360": 149,
"google": 1400,
"bing": 3730,
"author": "胡文卿",
"rhythmic": "虞美人"
},
{
"baidu": 4560,
"so360": 149,
"google": 1400,
"bing": 3730,
"author": "胡文卿",
"rhythmic": "虞美人"
},
{
"baidu": 4560,
"so360": 149,
"google": 1400,
"bing": 3730,
"author": "胡文卿",
"rhythmic": "虞美人"
},
{
"baidu": 4560,
"so360": 149,
"google": 1400,
"bing": 3730,
"author": "胡文卿",
"rhythmic": "虞美人"
},
{
"baidu": 4180,
"so360": 77,
"google": 2250,
"bing": 13300,
"author": "胡文卿",
"rhythmic": "阮郎归"
},
{
"baidu": 4180,
"so360": 77,
"google": 2250,
"bing": 13300,
"author": "胡文卿",
"rhythmic": "阮郎归"
},
{
"baidu": 4180,
"so360": 77,
"google": 2250,
"bing": 13300,
"author": "胡文卿",
"rhythmic": "阮郎归"
},
{
"baidu": 92,
"so360": 44,
"google": 27100,
"bing": 1320,
"author": "杨道居",
"rhythmic": "蝶恋花"
},
{
"baidu": 3220,
"so360": 55,
"google": 2660,
"bing": 1140,
"author": "史佐尧",
"rhythmic": "苏幕遮"
},
{
"baidu": 1070,
"so360": 36,
"google": 3310000,
"bing": 41600,
"author": "徐去非",
"rhythmic": "满庭芳"
},
{
"baidu": 528,
"so360": 50,
"google": 5260000,
"bing": 2460000,
"author": "徐去非",
"rhythmic": "锦被堆"
},
{
"baidu": 59,
"so360": 12,
"google": 2390000,
"bing": 26700,
"author": "徐去非",
"rhythmic": "卷珠帘・蝶恋花"
},
{
"baidu": 8,
"so360": 491,
"google": 6,
"bing": 26,
"author": "舒大成",
"rhythmic": "点绛唇"
},
{
"baidu": 3660,
"so360": 35,
"google": 12700,
"bing": 572,
"author": "贾少卿",
"rhythmic": "临江仙"
},
{
"baidu": 2640,
"so360": 77,
"google": 1350,
"bing": 844000,
"author": "陆汉广",
"rhythmic": "江城子"
},
{
"baidu": 95,
"so360": 39,
"google": 1880000,
"bing": 581000,
"author": "王阜民",
"rhythmic": "临江仙"
},
{
"baidu": 61,
"so360": 28,
"google": 6240000,
"bing": 678000,
"author": "霍安人",
"rhythmic": "感皇恩"
},
{
"baidu": 73,
"so360": 57,
"google": 2270000,
"bing": 16500,
"author": "霍安人",
"rhythmic": "醉蓬莱"
},
{
"baidu": 86,
"so360": 33,
"google": 2750000,
"bing": 13200,
"author": "霍安人",
"rhythmic": "满庭芳"
},
{
"baidu": 64,
"so360": 30,
"google": 481000,
"bing": 7550000,
"author": "希叟",
"rhythmic": "瑞鹤仙"
},
{
"baidu": 2040,
"so360": 4040000,
"google": 27300,
"bing": 4930000,
"author": "沈元实",
"rhythmic": "水调歌头"
},
{
"baidu": 79000,
"so360": 66,
"google": 86400,
"bing": 14500,
"author": "游子蒙",
"rhythmic": "满江红"
},
{
"baidu": 79000,
"so360": 66,
"google": 86400,
"bing": 14500,
"author": "游子蒙",
"rhythmic": "满江红"
},
{
"baidu": 71,
"so360": 31,
"google": 66100,
"bing": 402,
"author": "贾逋",
"rhythmic": "清平乐"
},
{
"baidu": 97,
"so360": 258,
"google": 27700,
"bing": 2750,
"author": "吴文若",
"rhythmic": "蝶恋花"
},
{
"baidu": 290000,
"so360": 508,
"google": 7010000,
"bing": 25400,
"author": "去非",
"rhythmic": "满庭芳"
},
{
"baidu": 853,
"so360": 33,
"google": 6870000,
"bing": 621,
"author": "潘熊飞",
"rhythmic": "南乡子"
},
{
"baidu": 33200,
"so360": 73,
"google": 26900,
"bing": 2350,
"author": "黄庭佐",
"rhythmic": "水调歌头"
},
{
"baidu": 7100,
"so360": 6330,
"google": 238000,
"bing": 63,
"author": "赵□",
"rhythmic": "失调名"
},
{
"baidu": 11,
"so360": 32,
"google": 455000,
"bing": 19400,
"author": "吴氏3",
"rhythmic": "失调名"
},
{
"baidu": 90,
"so360": 529,
"google": 5410000,
"bing": 13700,
"author": "吴氏3",
"rhythmic": "南乡子"
},
{
"baidu": 90,
"so360": 529,
"google": 5410000,
"bing": 13700,
"author": "吴氏3",
"rhythmic": "南乡子"
},
{
"baidu": 176000,
"so360": 157,
"google": 10500000,
"bing": 8820000,
"author": "吴氏3",
"rhythmic": "多丽"
},
{
"baidu": 88,
"so360": 604,
"google": 344000,
"bing": 55200,
"author": "吴氏3",
"rhythmic": "渔家傲"
}
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

410
rank/server.js Normal file
View File

@ -0,0 +1,410 @@
let puppeteer = require('puppeteer');
let fs = require('fs');
let path = require('path');
let sep = path.sep;
// Fixed part (in milliseconds) of the pause inserted after every keyword that
// had to be looked up remotely (i.e. was not served from the cache).
let delayBaseTime = 3 * 1000;// base delay time
// Upper bound (in milliseconds) of the random extra pause added on top of
// delayBaseTime; it is multiplied by Math.random() at the call sites.
let delayMaxTime = 2 * 1000;
// Query cache, keyed by the URI-encoded search keyword. It prevents the search
// engines from being queried too often with the same keyword and blocking us.
// Created without a prototype so arbitrary keywords are safe to use as keys.
let cache = Object.create(null);
// Disabled: list of proxy servers once used to spread requests over several browsers.
// let todayProxies = [
// '218.60.8.99:3129',
// '212.64.51.13:8888',
// '125.62.27.53:3128'
// ];
/**
 * Returns a promise that fulfils (with no value) once the given time has passed.
 * @param {number} second - how long to sleep, in seconds (fractions allowed)
 * @returns {Promise<void>}
 */
let wait = second => {
    let milliseconds = second * 1000;
    return new Promise(done => {
        setTimeout(done, milliseconds);
    });
};
//使用chrome模拟访问
/**
 * Drives one shared headless Chrome instance (via puppeteer) to fetch pages.
 * The browser is stored on `this.$local` between calls.
 */
let headless = {
    /**
     * Launches the shared browser on first use; does nothing when one is
     * already stored on `this.$local`.
     * @returns {Promise<void>}
     */
    async before() {
        if (!this.$local) {
            this.$local = await puppeteer.launch({
                headless: true,
                defaultViewport: {
                    width: 1440,
                    height: 900
                }
            });
        }
    },
    /**
     * Opens every URL in its own tab in parallel and returns the response
     * bodies in the same order as `urls`.
     *
     * The whole batch is attempted up to 3 times. Every tab opened by an
     * attempt is closed exactly once (and the close is awaited) before the
     * next attempt starts or the method returns.
     *
     * @param {...string} urls - addresses to load
     * @returns {Promise<string[]>} one body per URL, or `[]` when all
     *     attempts failed (callers treat missing entries as "no result")
     */
    async toUrl(...urls) {
        let browser = this.$local;
        // Close failures are logged instead of thrown: a tab that is already
        // gone must not turn a successful fetch into an error.
        let closeAll = pages => Promise.all(pages.map(page =>
            Promise.resolve()
                .then(() => page.close())
                .catch(err => {
                    console.log('close page failed', err && err.message);
                })
        ));
        let retry = 3;
        while (retry > 0) {
            let opened = [];
            try {
                // Wait for every newPage() to settle before reacting to a
                // failure, so tabs that open late are still tracked and closed.
                let outcomes = await Promise.all(urls.map(() =>
                    Promise.resolve()
                        .then(() => browser.newPage())
                        .then(page => {
                            opened.push(page);
                            return { page };
                        }, error => ({ error }))
                ));
                let failed = outcomes.find(o => !o.page);
                if (failed) {
                    throw failed.error;
                }
                let responses = await Promise.all(outcomes.map((o, i) => o.page.goto(urls[i], {
                    waitUntil: 'networkidle0'
                })));
                return await Promise.all(responses.map((response, i) => {
                    if (!response) {
                        // goto() can resolve without a response object
                        throw new Error('no response for ' + urls[i]);
                    }
                    return response.text();
                }));
            } catch (err) {
                console.log('retry', retry, err && err.message);
                retry--;
            } finally {
                await closeAll(opened);
            }
        }
        return [];
    },
    /**
     * Closes the shared browser, if any, and forgets it.
     * @returns {Promise<void>|undefined} the pending close when a browser was
     *     open, so callers may await it; `undefined` otherwise
     */
    after() {
        if (this.$local) {
            let closing = this.$local.close();
            delete this.$local;
            return closing;
        }
    }
};
/**
 * Helpers for collecting search-result counts and for simple file access.
 */
let rank = {
    /**
     * Queries Baidu, 360 Search, Google and Bing for one keyword and extracts
     * the "about N results" figure from each result page.
     *
     * Any failure (browser launch, page fetch) rejects the returned promise
     * instead of leaving it pending forever.
     *
     * @param {string} kd - search keyword, already URI-encoded by the caller
     * @returns {Promise<{baidu: number, so360: number, google: number, bing: number}>}
     *     result counts; an engine whose count could not be found reports 0
     */
    async remote(kd) {
        await headless.before();
        let bd = `https://www.baidu.com/s?wd=${kd}&rsv_spt=1`;
        let so = `https://www.so.com/s?q=${kd}`;
        let google = `https://www.google.com/search?q=${kd}`;
        let bing = `https://cn.bing.com/search?q=${kd}&FORM=BESBTB`;
        let texts = await headless.toUrl(bd, so, google, bing);
        // One pattern per engine, in the same order as the URLs above.
        let regs = [
            /百度为您找到相关结果约([\d,]+)个/,
            /找到相关结果约([\d,]+)个/,
            /(?:找到约|About)\s*([\d,]+)\s*(?:条结果|results)/,
            /<span\s+class="sb_count">([\d,]+)\s+(?:results|条结果)<\/span>/
        ];
        let nums = [0, 0, 0, 0];
        texts.forEach((text, i) => {
            // Drop inline scripts/styles so their contents cannot match.
            let visible = text.replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/g, '');
            let found = regs[i].exec(visible);
            if (found) {
                nums[i] = parseInt(found[1].replace(/,/g, ''), 10);
            }
        });
        let [bdNum, soNum, googleNum, bingNum] = nums;
        return {
            baidu: bdNum,
            so360: soNum,
            google: googleNum,
            bing: bingNum
        };
    },
    /**
     * Reads a whole file synchronously.
     * @param {string} file - path of the file
     * @returns {string} file contents as a string
     */
    read(file) {
        return fs.readFileSync(file).toString();
    },
    /**
     * Walks `folder` recursively and calls `callback` with the path of every
     * non-directory entry. A folder that does not exist is silently skipped.
     * @param {string} folder - directory to walk
     * @param {(file: string) => void} callback - invoked once per file
     */
    list(folder, callback) {
        if (!fs.existsSync(folder)) {
            return;
        }
        for (let file of fs.readdirSync(folder)) {
            let p = folder + path.sep + file;
            if (fs.lstatSync(p).isDirectory()) {
                // Recurse into sub-directories (previously called an
                // undefined `walk`, which threw a ReferenceError).
                rank.list(p, callback);
            } else {
                callback(p);
            }
        }
    },
    /**
     * Writes `content` to `to`, creating missing parent directories first.
     * @param {string} to - destination file path
     * @param {string} content - data to write
     */
    write(to, content) {
        let folders = path.dirname(to).split(path.sep);
        let p = '';
        for (let folder of folders) {
            p += folder + path.sep;
            if (!fs.existsSync(p)) {
                fs.mkdirSync(p);
            }
        }
        fs.writeFileSync(to, content);
    }
};
/**
 * Produces a `./ci/ci.song.rank.N.json` file for every `../ci/ci.song.N.json`
 * source file that does not have one yet.
 *
 * Each source record is searched as "<author> <rhythmic>"; the resulting
 * counts plus `author` and `rhythmic` are written in source order. Records
 * that still contain a 0 count are additionally written to
 * `./ci/ci.song.zero.N.json`. Progress is persisted to `./s.cache`.
 *
 * The work runs as plain sequential async/await, so any failure (unreadable
 * file, failed lookup) rejects the returned promise rather than leaving it
 * pending forever.
 *
 * @returns {Promise<void>} fulfils when every pending file has been written
 */
let ciTask = async () => {
    let ciReg = /ci\.song\.\d+\.json$/;
    let ciRankReg = /ci\.song\.rank\.\d+\.json$/;
    // True when any enumerable field of the record is exactly 0.
    let hasZero = record => {
        for (let key in record) {
            if (record[key] === 0) {
                return true;
            }
        }
        return false;
    };
    let pause = ms => new Promise(done => {
        setTimeout(done, ms);
    });

    // Source files that already have a rank file are skipped.
    let exist = {};
    rank.list('./ci', f => {
        if (ciRankReg.test(f)) {
            exist[path.basename(f).replace('ci.song.rank', 'ci.song')] = 1;
        }
    });
    let readList = {};
    rank.list('../ci', f => {
        if (ciReg.test(f) && !exist[path.basename(f)]) {
            readList[path.resolve(f)] = 1;
        }
    });

    // Restore earlier progress, dropping entries that contain a 0 count so
    // those keywords are looked up again.
    if (fs.existsSync('./s.cache')) {
        let saved = JSON.parse(rank.read('./s.cache'));
        for (let key in saved) {
            if (!hasZero(saved[key])) {
                cache[key] = saved[key];
            }
        }
    }

    // Looks a keyword up, repeating up to 3 times while any engine reports 0.
    let lookup = async kd => {
        let data;
        let zeroRetry = 3;
        do {
            data = await rank.remote(kd);
            if (!hasZero(data)) {
                break;
            }
            console.log('has zero retry', zeroRetry);
            await wait(3);
            zeroRetry--;
        } while (zeroRetry);
        return data;
    };

    // Processes one source file; resolves to [all records, records with a 0].
    let singleWork = async file => {
        let ranks = [];
        let zeros = [];
        let list = JSON.parse(rank.read(file));
        for (let start = 0; start < list.length; start++) {
            let r = list[start];
            let kd = encodeURIComponent(`${r.author} ${r.rhythmic}`);
            let data = cache[kd];
            let delay = 0;
            if (!data) {
                data = await lookup(kd);
                data.author = r.author;
                data.rhythmic = r.rhythmic;
                cache[kd] = data;
                // Only throttle after a real remote lookup.
                delay = delayBaseTime + delayMaxTime * Math.random();
            }
            if (hasZero(data)) {
                zeros.push(data);
            }
            if (start && ((start % 5) === 0)) {
                rank.write('./s.cache', JSON.stringify(cache));
            }
            ranks.push(data);
            console.log(start + '/' + list.length);
            await pause(delay);
        }
        rank.write('./s.cache', JSON.stringify(cache));
        return [ranks, zeros];
    };

    let loadList = Object.keys(readList);
    while (loadList.length) {
        let one = loadList.pop();
        console.log('remain ', loadList.length);
        let f = path.basename(one);
        let aim = f.replace('ci.song', 'ci.song.rank');
        let zeroAim = f.replace('ci.song', 'ci.song.zero');
        let [ranks, zeros] = await singleWork(one);
        rank.write('./ci/' + aim, JSON.stringify(ranks, null, 4));
        if (zeros.length) {
            rank.write('./ci/' + zeroAim, JSON.stringify(zeros, null, 4));
        }
        cache = Object.create(null);// the file is written, so the cache can be cleared
    }
};
/**
 * Produces a `./poet/poet.<song|tang>.rank.N.json` file for every
 * `../json/poet.<song|tang>.N.json` source file that does not have one yet.
 *
 * Each source record is searched as "<author> <title>"; the resulting counts
 * plus `author` and `title` are written in source order. Records that still
 * contain a 0 count are additionally written to the matching
 * `poet.<song|tang>.zero.N.json`. Progress is persisted to `./s.cache`.
 *
 * The work runs as plain sequential async/await, so any failure (unreadable
 * file, failed lookup) rejects the returned promise rather than leaving it
 * pending forever.
 *
 * @returns {Promise<void>} fulfils when every pending file has been written
 */
let poetTask = async () => {
    let poetReg = /poet\.(?:song|tang)\.\d+\.json$/;
    let poetRankReg = /poet\.(?:song|tang)\.rank\.\d+\.json$/;
    // True when any enumerable field of the record is exactly 0.
    let hasZero = record => {
        for (let key in record) {
            if (record[key] === 0) {
                return true;
            }
        }
        return false;
    };
    let pause = ms => new Promise(done => {
        setTimeout(done, ms);
    });

    // Source files that already have a rank file are skipped.
    let exist = {};
    rank.list('./poet', f => {
        if (poetRankReg.test(f)) {
            exist[path.basename(f).replace(/poet\.(song|tang)\.rank/, 'poet.$1')] = 1;
        }
    });
    let readList = {};
    rank.list('../json', f => {
        if (poetReg.test(f) && !exist[path.basename(f)]) {
            readList[path.resolve(f)] = 1;
        }
    });

    // Restore earlier progress, dropping entries that contain a 0 count so
    // those keywords are looked up again.
    if (fs.existsSync('./s.cache')) {
        let saved = JSON.parse(rank.read('./s.cache'));
        for (let key in saved) {
            if (!hasZero(saved[key])) {
                cache[key] = saved[key];
            }
        }
    }

    // Looks a keyword up, repeating up to 3 times while Baidu reports 0:
    // Baidu sometimes returns nothing and needs to be queried again.
    let lookup = async kd => {
        let data;
        let zeroRetry = 3;
        do {
            data = await rank.remote(kd);
            if (data.baidu !== 0) {
                break;
            }
            console.log('baidu zero result, retry', zeroRetry);
            await wait(5 + Math.random() * 10);
            zeroRetry--;
        } while (zeroRetry);
        return data;
    };

    // Processes one source file; resolves to [all records, records with a 0].
    let singleWork = async file => {
        let ranks = [];
        let zeros = [];
        let list = JSON.parse(rank.read(file));
        for (let start = 0; start < list.length; start++) {
            let r = list[start];
            let kd = encodeURIComponent(`${r.author} ${r.title}`);
            let data = cache[kd];
            let delay = 0;
            if (!data) {
                data = await lookup(kd);
                data.author = r.author;
                data.title = r.title;
                cache[kd] = data;
                // Only throttle after a real remote lookup.
                delay = delayBaseTime + delayMaxTime * Math.random();
            }
            if (hasZero(data)) {
                zeros.push(data);
            }
            if (start && ((start % 5) === 0)) {
                rank.write('./s.cache', JSON.stringify(cache));
            }
            ranks.push(data);
            console.log(start + '/' + list.length);
            await pause(delay);
        }
        rank.write('./s.cache', JSON.stringify(cache));
        return [ranks, zeros];
    };

    let loadList = Object.keys(readList);
    while (loadList.length) {
        let one = loadList.pop();
        console.log('remain ', loadList.length);
        let f = path.basename(one);
        let aim = f.replace(/poet\.(song|tang)/, 'poet.$1.rank');
        let zeroAim = f.replace(/poet\.(song|tang)/, 'poet.$1.zero');
        let [ranks, zeros] = await singleWork(one);
        rank.write('./poet/' + aim, JSON.stringify(ranks, null, 4));
        if (zeros.length) {
            rank.write('./poet/' + zeroAim, JSON.stringify(zeros, null, 4));
        }
        cache = Object.create(null);// the file is written, so the cache can be cleared
    }
};
// Entry point: run the ci ranking task, always shut the browser down, and
// remove the progress cache only after a fully successful run (a failed run
// keeps ./s.cache so the next run can resume from it).
(async () => {
    try {
        await ciTask();
        //await poetTask();
    } finally {
        // Awaiting is harmless when after() returns nothing.
        await headless.after();
    }
    // fs.unlink without a callback throws on current Node versions, and the
    // cache file does not exist when there was nothing to process.
    if (fs.existsSync('./s.cache')) {
        fs.unlinkSync('./s.cache');
    }
    console.log('complete');
})().catch(err => {
    console.error(err);
    process.exitCode = 1;
});