update poet

This commit is contained in:
xinglie.lkf
2019-08-19 11:08:28 +08:00
parent 7ca8bd8032
commit 8e350d34d8
2 changed files with 246 additions and 191 deletions

View File

@ -11,8 +11,8 @@
不但文件一一对应,文件中的内容也是一一对应的,即`./ci/ci.song.rank.8000.json`中数组中的第`n`条和`../ci/ci.song.8000.json`数组中的第`n`条对应
Q: 为什么没有把该结果和诗词数据放在一起?
A: 为了保持诗词数据的纯洁,并非所有数据都适合塞进去。对于这种非所有人需要的数据,通过一一对应关系,可以很方便的给原诗词扩展数据,同时也保证了原诗词的纯洁度。所以当需要对诗词根据知名度排序的需求时,可以把该数据附加到原有诗词数据上即可。
#### 为什么没有把该结果和诗词数据放在一起?
为了保持诗词数据的纯洁,并非所有数据都适合塞进去。对于这种非所有人需要的数据,通过一一对应关系,可以很方便的给原诗词扩展数据,同时也保证了原诗词的纯洁度。所以当需要根据知名度对诗词排序时,把该数据附加到原有诗词数据上即可。
## 数据形式
@ -32,4 +32,4 @@ A: 为了保持诗词数据的纯洁,并非所有数据都适合塞进去。
]
```
搜索引擎的结果数据仅供参考,不同时间搜出来的数据未必一致
搜索引擎的结果数据仅供参考,不同时间搜出来的数据会有一些出入

View File

@ -1,82 +1,63 @@
let puppeteer = require('puppeteer');
let fs = require('fs');
let path = require('path');
let https = require('https');
let sep = path.sep;
let delayBaseTime = 3 * 1000;//延迟基础时间
let delayMaxTime = 2 * 1000;
//查询缓存防止过多的使用同样的关键字查询搜索引擎被block
let cache = Object.create(null);
// let todayProxies = [
// '218.60.8.99:3129',
// '212.64.51.13:8888',
// '125.62.27.53:3128'
// ];
// Per-engine throttling settings, in milliseconds. After a remote query the
// poet crawler waits `base + delay * Math.random()` before its next request
// to the same search engine.
let delays = {
    baidu: { base: 2000, delay: 1000 },
    bing: { base: 2000, delay: 1000 },
    so360: { base: 1000, delay: 1000 },
    google: { base: 1000, delay: 1000 }
};
/**
 * Pause for a number of seconds.
 * @param {number} second - Delay in seconds; multiplied by 1000 for setTimeout.
 * @returns {Promise<undefined>} Promise fulfilled (with no value) when the timer fires.
 */
let wait = function (second) {
    return new Promise(done => {
        const milliseconds = second * 1000;
        setTimeout(done, milliseconds);
    });
};
//使用chrome模拟访问
let headless = {
async before() {
if (!this.$local) {
let brower = await puppeteer.launch({
headless: true,
defaultViewport: {
width: 1440,
height: 900
before() {
return new Promise(async resolve => {
if (!this.$local) {
let brower = await puppeteer.launch({
headless: false,
defaultViewport: {
width: 1440,
height: 900
}
});
this.$local = brower;
let pages = await brower.pages();
let ps = [],
diff = 5 - pages.length;
while (diff--) {
ps.push(brower.newPage());
}
});
this.$local = brower;
// let browsers = [brower];
// for (let p of todayProxies) {
// browsers.push(await puppeteer.launch({
// headless: true,
// defaultViewport: {
// width: 1440,
// height: 900
// },
// args: [
// '--proxy-server=' + p
// ]
// }));
// }
// this.$browsers = browsers;
// this.$current = 0;
}
Promise.all(ps).then(resolve);
} else {
resolve();
}
});
},
async toUrl(...urls) {
let ps,
retry = 3,
newPages,
imgReg = /\.(?:png|jpg|gif)$/i;
let closePages = () => {
if (newPages) {
for (let p of newPages) {
p.close();
}
}
};
async toUrl(url, pi) {
let text = '',
retry = 3;
let request = async () => {
ps = [];
let pages = [];
let datas = [];
let local = this.$local;
// let proxy = this.$browsers[this.$current++];
// if (this.$current >= this.$browsers.length) {
// this.$current = 0;
// }
for (let url of urls) {
let b = local;//url.startsWith('https://www.google.com') ? local : proxy;
pages.push(b.newPage());
}
newPages = await Promise.all(pages);
let start = 0;
for (let page of newPages) {
datas.push(page.goto(urls[start++], {
waitUntil: 'networkidle0'
}));
}
let newDatas = await Promise.all(datas);
for (let d of newDatas) {
ps.push(d.text());
}
let pages = await local.pages();
let page = pages[pi];
let d = await page.goto(url, {
waitUntil: 'networkidle0'
});
text = await d.text();
};
do {
try {
@ -85,57 +66,58 @@ let headless = {
} catch{
console.log('retry', retry);
retry--;
closePages();
}
} while (retry);
let returned = await Promise.all(ps);
closePages();
return returned;
return text;
},
after() {
if (this.$local) {
this.$local.close();
delete this.$local;
// for (let b of this.$browsers) {
// b.close();
// }
// delete this.$browsers;
if (this.$browsers) {
for (let b of this.$browsers) {
b.close();
}
delete this.$browsers;
}
}
}
};
let rank = {
remote(kd) {
remote(kd, type) {
return new Promise(async r => {
await headless.before();
let bd = `https://www.baidu.com/s?wd=${kd}&rsv_spt=1`;
let so = `https://www.so.com/s?q=${kd}`;
let google = `https://www.google.com/search?q=${kd}`;
//let google = `https://www.googlebridge.com/search?q=${kd}`;
let bing = `https://cn.bing.com/search?q=${kd}&FORM=BESBTB`;
let texts = await headless.toUrl(bd, so, google, bing);
let regs = [
/百度为您找到相关结果约([\d,]+)个/,
/找到相关结果约([\d,]+)个/,
/(?:找到约|About)\s*([\d,]+)\s*(?:条结果|results)/,
/<span\s+class="sb_count">([\d,]+)\s+(?:results|条结果)<\/span>/
];
let nums = [0, 0, 0, 0];
for (let i = texts.length; i--;) {
let text = texts[i];
text = text.replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/g, '');
text.replace(regs[i], (_, m) => {
nums[i] = parseInt(m.replace(/,/g, ''), 10);
});
// if (nums[i] === 0) {
// console.log(text);
// }
}
let [bdNum, soNum, googleNum, bingNum] = nums;
let maps = {
baidu: {
url: `https://www.baidu.com/s?wd=${kd}&rsv_spt=1&ie=utf-8&f=8`,
reg: /百度为您找到相关结果约([\d,]+)个/,
page: 0,
},
so360: {
url: `https://www.so.com/s?q=${kd}`,
reg: /找到相关结果约([\d,]+)个/,
page: 1,
},
google: {
url: `https://www.google.com/search?q=${kd}`,
reg: /(?:找到约|About)\s*([\d,]+)\s*(?:条结果|results)/,
page: 2
},
bing: {
url: `https://cn.bing.com/search?q=${kd}&FORM=BESBTB`,
reg: /<span\s+class="sb_count">([\d,]+)\s+(?:results|条结果)<\/span>/,
page: 3
}
};
let i = maps[type];
let text = await headless.toUrl(i.url, i.page);
let nums = 0;
text = String(text).replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/g, '');
text.replace(i.reg, (_, m) => {
//console.log(m);
nums = parseInt(m.replace(/,/g, ''), 10);
});
r({
baidu: bdNum,
so360: soNum,
google: googleNum,
bing: bingNum
[type]: nums
});
});
},
@ -193,12 +175,15 @@ let ciTask = async () => {
let j = JSON.parse(c);
for (let p in j) {
let a = j[p];
for (let z in a) {
if (a[z] === 0) {
delete j[p];
break;
}
if (a.baidu === 0) {
delete j[p];
}
// for (let z in a) {
// if (a[z] === 0) {
// delete j[p];
// break;
// }
// }
}
Object.assign(cache, j);
}
@ -240,7 +225,8 @@ let ciTask = async () => {
data.author = r.author;
data.rhythmic = r.rhythmic;
cache[kd] = data;
delay = delayBaseTime + delayMaxTime * Math.random();
//let dl = delays[type];
//delay = dl.base + dl.delay * Math.random();
}
for (let p in data) {
if (data[p] === 0) {
@ -249,14 +235,14 @@ let ciTask = async () => {
}
}
if (start && ((start % 5) === 0)) {
rank.write('./s.cache', JSON.stringify(cache));
rank.write('./s.cache', JSON.stringify(cache, null, 4));
}
ranks.push(data);
console.log(start + '/' + list.length);
start++;
setTimeout(task, delay);
} else {
rank.write('./s.cache', JSON.stringify(cache));
rank.write('./s.cache', JSON.stringify(cache, null, 4));
resolve([ranks, zeros]);
}
};
@ -265,8 +251,8 @@ let ciTask = async () => {
};
let work = async list => {
let one = list.pop();
console.log('remain ', list.length);
if (one) {
console.log('ci remain ', list.length);
let f = path.basename(one);
let aim = f.replace('ci.song', 'ci.song.rank');
let zeroAim = f.replace('ci.song', 'ci.song.zero');
@ -278,48 +264,60 @@ let ciTask = async () => {
cache = Object.create(null);//文件写入后,清理缓存
work(list);
} else {
console.log('ci complete');
resolve();
}
};
work(loadList);
});
};
// Set to true by checkGoogle() when the probe request answers with HTTP 200.
// poetTask() only adds 'google' to its engine list when this flag is true.
let canVisitGoogle = false;
/**
 * Probe whether Google search is reachable from this machine.
 *
 * Sends one HTTPS GET to a Google search URL and sets `canVisitGoogle` to
 * true when the status code is 200. The promise never rejects: a network
 * error is logged and the flag is simply left unchanged.
 *
 * @returns {Promise<undefined>} Fulfilled once the response headers arrive or the request fails.
 */
let checkGoogle = async () => {
    return new Promise(resolve => {
        https.get('https://www.google.com/search?q=xinglie', res => {
            if (res.statusCode === 200) {
                canVisitGoogle = true;
            }
            // Only the status code matters. Drain the body so the response
            // does not sit unconsumed, holding its socket and buffered data.
            res.resume();
            resolve();
        }).on('error', e => {
            console.log(e);
            resolve();
        });
    });
};
let poetReg = /poet\.(song|tang)\.(\d+)\.json$/;
let poetTask = async () => {
return new Promise(resolve => {
let poetReg = /poet\.(?:song|tang)\.\d+\.json$/;
let poetRankReg = /poet\.(?:song|tang)\.rank\.\d+\.json$/;
let readList = {};
let exist = {};
rank.list('./poet', f => {
if (poetRankReg.test(f)) {
exist[path.basename(f).replace(/poet\.(song|tang)\.rank/, 'poet.$1')] = 1;
}
});
let caches = {};
let cList = ['baidu', 'so360', 'bing'];
if (canVisitGoogle) {
cList.push('google');
}
for (let c of cList) {
caches[c] = Object.create(null);
}
rank.list('../json', f => {
if (poetReg.test(f)) {
let base = path.basename(f);
if (!exist[base]) {
readList[path.resolve(f)] = 1;
}
readList[path.resolve(f)] = 1;
}
});
if (fs.existsSync('./s.cache')) {
let c = rank.read('./s.cache');
let j = JSON.parse(c);
for (let p in j) {
let a = j[p];
for (let z in a) {
if (a[z] === 0) {
delete j[p];
break;
for (let c of cList) {
if (fs.existsSync('./s.' + c + '.cache')) {
let d = rank.read('./s.' + c + '.cache');
let j = JSON.parse(d);
if (!caches[c]) caches[c] = {};
for (let e in j) {
if (j[e][c] === 0) {
delete j[e];
}
}
Object.assign(caches[c], j);
}
Object.assign(cache, j);
}
let loadList = Object.keys(readList);
let singleWork = file => {
let singleWork = (file, type) => {
let ranks = [];
let zeros = [];
return new Promise(resolve => {
@ -330,34 +328,20 @@ let poetTask = async () => {
let r = list[start];
let kd = encodeURIComponent(`${r.author} ${r.title}`);
let data,
delay = 0,
zeroRetry = 3;
if (cache[kd]) {
data = cache[kd];
delay = 0
if (caches[type][kd]) {
data = caches[type][kd];
} else {
do {
data = await rank.remote(kd);
let hasZero = false;
// for (let p in data) {
// if (data[p] === 0) {
// hasZero = true;
// break;
// }
// }
//baidu有时候查询不到需要再次查询
hasZero = data.baidu === 0;
if (!hasZero) {
zeroRetry = 0;
} else {
console.log('baidu zero result, retry', zeroRetry);
await wait(5 + Math.random() * 10);
zeroRetry--;
}
} while (zeroRetry);
data = await rank.remote(kd, type);
data.author = r.author;
data.title = r.title;
cache[kd] = data;
delay = delayBaseTime + delayMaxTime * Math.random();
caches[type][kd] = data;
let dl = delays[type];
delay = dl.base + dl.delay * Math.random();
if (start && ((start % 2) === 0)) {
rank.write('./s.' + type + '.cache', JSON.stringify(caches[type]));
}
}
for (let p in data) {
if (data[p] === 0) {
@ -365,46 +349,117 @@ let poetTask = async () => {
break;
}
}
if (start && ((start % 5) === 0)) {
rank.write('./s.cache', JSON.stringify(cache));
}
ranks.push(data);
console.log(start + '/' + list.length);
console.log(type + ':' + start + '/' + list.length);
start++;
setTimeout(task, delay);
} else {
rank.write('./s.cache', JSON.stringify(cache));
rank.write('./s.' + type + '.cache', JSON.stringify(caches[type]));
resolve([ranks, zeros]);
}
};
task();
});
};
let work = async list => {
let one = list.pop();
console.log('remain ', list.length);
if (one) {
let f = path.basename(one);
let aim = f.replace(/poet\.(song|tang)/, 'poet.$1.rank');
let zeroAim = f.replace(/poet\.(song|tang)/, 'poet.$1.zero');
let [ranks, zeros] = await singleWork(one);
rank.write('./poet/' + aim, JSON.stringify(ranks, null, 4));
if (zeros.length) {
rank.write('./poet/' + zeroAim, JSON.stringify(zeros, null, 4));
let finised = {};
let check = () => {
let all = true;
for (let c of cList) {
if (finised[c] !== true) {
all = false;
}
cache = Object.create(null);//文件写入后,清理缓存
work(list);
} else {
}
if (all) {
resolve();
}
};
work(loadList);
let work = async (type, index, list) => {
let one = list[index];
if (one) {
console.log(`${type} current ${index} ,total:${list.length}`);
let f = path.basename(one);
let aim = f.replace(/poet\.(song|tang)/, 'poet.$1.rank.' + type);
if (!fs.existsSync('./poet_temp/' + aim)) {
//let zeroAim = f.replace(/poet\.(song|tang)/, 'poet.$1.zero.' + type);
let [ranks, zeros] = await singleWork(one, type);
rank.write('./poet_temp/' + aim, JSON.stringify(ranks, null, 4));
// if (zeros.length) {
// rank.write('./poet/' + zeroAim, JSON.stringify(zeros, null, 4));
// }
caches[type] = Object.create(null);//文件写入后,清理缓存
setTimeout(() => {
work(type, index + 1, list);
}, 10 * 1000);
} else {
console.log('ignore ' + aim);
work(type, index + 1, list);
}
} else {
finised[type] = true;
}
};
for (let c of cList) {
work(c, 0, loadList);
}
check();
});
};
/**
 * Split every combined rank file in ./poet into one file per search engine.
 *
 * For each `poet.(song|tang).rank.N.json` file, writes four files to ./poet2
 * named `poet.(song|tang).rank.<engine>.N.json`. Each entry keeps `author`,
 * `title` and only that engine's count.
 *
 * Files in ./poet that are not combined rank files are skipped. Previously
 * they were parsed and written to ./poet2 four times under their original
 * name, because the rename pattern did not match them.
 */
let split = () => {
    let keys = ['baidu', 'so360', 'bing', 'google'];
    // Same shape as the rank file names produced from `poet.(song|tang).N.json`.
    let rankFileReg = /poet\.(song|tang)\.rank\.\d+\.json$/;
    rank.list('./poet', f => {
        let b = path.basename(f);
        if (!rankFileReg.test(b)) {
            return;
        }
        let d = JSON.parse(rank.read(f));
        for (let key of keys) {
            // The dot before "rank" is escaped so it only matches a literal '.'.
            let aim = b.replace(/poet\.(song|tang)\.rank/, 'poet.$1.rank.' + key);
            let newList = [];
            for (let e of d) {
                newList.push({
                    author: e.author,
                    title: e.title,
                    [key]: e[key]
                });
            }
            rank.write('./poet2/' + aim, JSON.stringify(newList, null, 4));
        }
    });
};
/**
 * Merge the per-engine rank files in ./poet_temp into one rank file per
 * source file under ./poet.
 *
 * For every `poet.(song|tang).N.json` found under ../json, the files
 * `./poet_temp/poet.$1.rank.<engine>.N.json` for baidu, so360, bing and
 * google are combined entry by entry (same array index) with Object.assign.
 * The result is written to `./poet/poet.$1.rank.N.json` only when all four
 * engine files exist.
 *
 * NOTE(review): 'google' is always required here, while poetTask() only
 * crawls google when `canVisitGoogle` is true — confirm this is intended.
 */
let merge = () => {
    let cList = ['baidu', 'so360', 'bing', 'google'];
    let readList = {};
    rank.list('../json', f => {
        if (poetReg.test(f)) {
            readList[path.resolve(f)] = 1;
        }
    });
    for (let ll of Object.keys(readList)) {
        let f = path.basename(ll);
        let aim = f.replace(/poet\.(song|tang)/, 'poet.$1.rank');
        let newList = [],
            canMerge = true;
        for (let c of cList) {
            let src = './poet_temp/' + f.replace(/poet\.(song|tang)/, 'poet.$1.rank.' + c);
            if (!fs.existsSync(src)) {
                // One engine is missing, so this file cannot be merged.
                // Stop here rather than reading and parsing the other
                // engine files only to throw the result away.
                canMerge = false;
                break;
            }
            let j = JSON.parse(rank.read(src));
            for (let i = j.length; i--;) {
                newList[i] = Object.assign(newList[i] || {}, j[i]);
            }
        }
        if (canMerge) {
            rank.write('./poet/' + aim, JSON.stringify(newList, null, 4));
        }
    }
};
(async () => {
await ciTask();
//await poetTask();
//merge();
//await ciTask();
await headless.before();
await checkGoogle();
await poetTask();
headless.after();
fs.unlink('./s.cache');
console.log('complete');
merge();
})();