From 8e350d34d824fbfb48bb84a82280efd54552dd8a Mon Sep 17 00:00:00 2001 From: "xinglie.lkf" Date: Mon, 19 Aug 2019 11:08:28 +0800 Subject: [PATCH] update poet --- rank/README.md | 6 +- rank/server.js | 431 ++++++++++++++++++++++++++++--------------------- 2 files changed, 246 insertions(+), 191 deletions(-) diff --git a/rank/README.md b/rank/README.md index d47e2a1..207e66f 100644 --- a/rank/README.md +++ b/rank/README.md @@ -11,8 +11,8 @@ 不但文件一一对应,文件中的内容也是一一对应的,即`./ci/ci.song.rang.8000.json`中数组中的第`n`条和`../ci/ci.song.8000.json`数组中的第`n`条对应 -Q: 为什么没有把该结果和诗词数据放在一起? -A: 为了保持诗词数据的纯洁,并非所有数据都适合塞进去。对于这种非所有人需要的数据,通过一一对应关系,可以很方便的给原诗词扩展数据,同时也保证了原诗词的纯洁度。所以当需要对诗词根据知名度排序的需求时,可以把该数据附加到原有诗词数据上即可。 +#### 为什么没有把该结果和诗词数据放在一起? +为了保持诗词数据的纯洁,并非所有数据都适合塞进去。对于这种非所有人需要的数据,通过一一对应关系,可以很方便的给原诗词扩展数据,同时也保证了原诗词的纯洁度。所以当需要对诗词根据知名度排序的需求时,可以把该数据附加到原有诗词数据上即可。 ## 数据形式 @@ -32,4 +32,4 @@ A: 为了保持诗词数据的纯洁,并非所有数据都适合塞进去。 ] ``` -搜索引擎的结果数据仅供参考,不同时间搜出来的数据未必一致 +搜索引擎的结果数据仅供参考,不同时间搜出来的数据会有一些出入 diff --git a/rank/server.js b/rank/server.js index b9a5c0c..60d8a91 100644 --- a/rank/server.js +++ b/rank/server.js @@ -1,82 +1,63 @@ let puppeteer = require('puppeteer'); let fs = require('fs'); let path = require('path'); +let https = require('https'); let sep = path.sep; -let delayBaseTime = 3 * 1000;//延迟基础时间 -let delayMaxTime = 2 * 1000; -//查询缓存,防止过多的使用同样的关键字查询搜索引擎被block -let cache = Object.create(null); -// let todayProxies = [ -// '218.60.8.99:3129', -// '212.64.51.13:8888', -// '125.62.27.53:3128' -// ]; +let delays = { + baidu: { + base: 2 * 1000, + delay: 1 * 1000 + }, + bing: { + base: 2 * 1000, + delay: 1 * 1000 + }, + so360: { + base: 1 * 1000, + delay: 1 * 1000 + }, + google: { + base: 1 * 1000, + delay: 1 * 1000 + } +} let wait = second => new Promise(r => { setTimeout(r, second * 1000); }); //使用chrome模拟访问 let headless = { - async before() { - if (!this.$local) { - let brower = await puppeteer.launch({ - headless: true, - defaultViewport: { - width: 1440, - height: 900 + before() { + return new Promise(async resolve => { + if (!this.$local) { + let brower = await puppeteer.launch({ + headless: false, + defaultViewport: { + width: 1440, + height: 900 + } + }); + this.$local = brower; + let pages = await brower.pages(); + let ps = [], + diff = 5 - pages.length; + while (diff--) { + ps.push(brower.newPage()); } - }); - this.$local = brower; - // let browsers = [brower]; - // for (let p of todayProxies) { - // browsers.push(await puppeteer.launch({ - // headless: true, - // defaultViewport: { - // width: 1440, - // height: 900 - // }, - // args: [ - // '--proxy-server=' + p - // ] - // })); - // } - // this.$browsers = browsers; - // this.$current = 0; - } + Promise.all(ps).then(resolve); + } else { + resolve(); + } + }); }, - async toUrl(...urls) { - let ps, - retry = 3, - newPages, - imgReg = /\.(?:png|jpg|gif)$/i; - let closePages = () => { - if (newPages) { - for (let p of newPages) { - p.close(); - } - } - }; + async toUrl(url, pi) { + let text = '', + retry = 3; let request = async () => { - ps = []; - let pages = []; - let datas = []; let local = this.$local; - // let proxy = this.$browsers[this.$current++]; - // if (this.$current >= this.$browsers.length) { - // this.$current = 0; - // } - for (let url of urls) { - let b = local;//url.startsWith('https://www.google.com') ? local : proxy; - pages.push(b.newPage()); - } - newPages = await Promise.all(pages); - let start = 0; - for (let page of newPages) { - datas.push(page.goto(urls[start++], { - waitUntil: 'networkidle0' - })); - } - let newDatas = await Promise.all(datas); - for (let d of newDatas) { - ps.push(d.text()); - } + let pages = await local.pages(); + let page = pages[pi]; + let d = await page.goto(url, { + waitUntil: 'networkidle0' + }); + text = await d.text(); }; do { try { @@ -85,57 +66,58 @@ let headless = { } catch{ console.log('retry', retry); retry--; - closePages(); } } while (retry); - let returned = await Promise.all(ps); - closePages(); - return returned; + return text; }, after() { if (this.$local) { this.$local.close(); delete this.$local; - // for (let b of this.$browsers) { - // b.close(); - // } - // delete this.$browsers; + if (this.$browsers) { + for (let b of this.$browsers) { + b.close(); + } + delete this.$browsers; + } } } }; let rank = { - remote(kd) { + remote(kd, type) { return new Promise(async r => { - await headless.before(); - let bd = `https://www.baidu.com/s?wd=${kd}&rsv_spt=1`; - let so = `https://www.so.com/s?q=${kd}`; - let google = `https://www.google.com/search?q=${kd}`; - //let google = `https://www.googlebridge.com/search?q=${kd}`; - let bing = `https://cn.bing.com/search?q=${kd}&FORM=BESBTB`; - let texts = await headless.toUrl(bd, so, google, bing); - let regs = [ - /百度为您找到相关结果约([\d,]+)个/, - /找到相关结果约([\d,]+)个/, - /(?:找到约|About)\s*([\d,]+)\s*(?:条结果|results)/, - /([\d,]+)\s+(?:results|条结果)<\/span>/ - ]; - let nums = [0, 0, 0, 0]; - for (let i = texts.length; i--;) { - let text = texts[i]; - text = text.replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/g, ''); - text.replace(regs[i], (_, m) => { - nums[i] = parseInt(m.replace(/,/g, ''), 10); - }); - // if (nums[i] === 0) { - // console.log(text); - // } - } - let [bdNum, soNum, googleNum, bingNum] = nums; + let maps = { + baidu: { + url: `https://www.baidu.com/s?wd=${kd}&rsv_spt=1&ie=utf-8&f=8`, + reg: /百度为您找到相关结果约([\d,]+)个/, + page: 0, + }, + so360: { + url: `https://www.so.com/s?q=${kd}`, + reg: /找到相关结果约([\d,]+)个/, + page: 1, + }, + google: { + url: `https://www.google.com/search?q=${kd}`, + reg: /(?:找到约|About)\s*([\d,]+)\s*(?:条结果|results)/, + page: 2 + }, + bing: { + url: `https://cn.bing.com/search?q=${kd}&FORM=BESBTB`, + reg: /([\d,]+)\s+(?:results|条结果)<\/span>/, + page: 3 + } + }; + let i = maps[type]; + let text = await headless.toUrl(i.url, i.page); + let nums = 0; + text = String(text).replace(/<(script|style)[^>]*>[\s\S]*?<\/\1>/g, ''); + text.replace(i.reg, (_, m) => { + //console.log(m); + nums = parseInt(m.replace(/,/g, ''), 10); + }); r({ - baidu: bdNum, - so360: soNum, - google: googleNum, - bing: bingNum + [type]: nums }); }); }, @@ -193,12 +175,15 @@ let ciTask = async () => { let j = JSON.parse(c); for (let p in j) { let a = j[p]; - for (let z in a) { - if (a[z] === 0) { - delete j[p]; - break; - } + if (a.baidu === 0) { + delete j[p]; } + // for (let z in a) { + // if (a[z] === 0) { + // delete j[p]; + // break; + // } + // } } Object.assign(cache, j); } @@ -240,7 +225,8 @@ let ciTask = async () => { data.author = r.author; data.rhythmic = r.rhythmic; cache[kd] = data; - delay = delayBaseTime + delayMaxTime * Math.random(); + //let dl = delays[type]; + //delay = dl.base + dl.delay * Math.random(); } for (let p in data) { if (data[p] === 0) { @@ -249,14 +235,14 @@ let ciTask = async () => { } } if (start && ((start % 5) === 0)) { - rank.write('./s.cache', JSON.stringify(cache)); + rank.write('./s.cache', JSON.stringify(cache, null, 4)); } ranks.push(data); console.log(start + '/' + list.length); start++; setTimeout(task, delay); } else { - rank.write('./s.cache', JSON.stringify(cache)); + rank.write('./s.cache', JSON.stringify(cache, null, 4)); resolve([ranks, zeros]); } }; @@ -265,8 +251,8 @@ let ciTask = async () => { }; let work = async list => { let one = list.pop(); - console.log('remain ', list.length); if (one) { + console.log('ci remain ', list.length); let f = path.basename(one); let aim = f.replace('ci.song', 'ci.song.rank'); let zeroAim = f.replace('ci.song', 'ci.song.zero'); @@ -278,48 +264,60 @@ let ciTask = async () => { cache = Object.create(null);//文件写入后,清理缓存 work(list); } else { + console.log('ci complete'); resolve(); } }; work(loadList); }); }; +let canVisitGoogle = false; +let checkGoogle = async () => { + return new Promise(resolve => { + https.get('https://www.google.com/search?q=xinglie', res => { + if (res.statusCode == 200) { + canVisitGoogle = true; + } + resolve(); + }).on('error', e => { + console.log(e); + resolve(); + }); + }); +}; +let poetReg = /poet\.(song|tang)\.(\d+)\.json$/; let poetTask = async () => { return new Promise(resolve => { - let poetReg = /poet\.(?:song|tang)\.\d+\.json$/; - let poetRankReg = /poet\.(?:song|tang)\.rank\.\d+\.json$/; let readList = {}; - let exist = {}; - rank.list('./poet', f => { - if (poetRankReg.test(f)) { - exist[path.basename(f).replace(/poet\.(song|tang)\.rank/, 'poet.$1')] = 1; - } - }); + let caches = {}; + let cList = ['baidu', 'so360', 'bing']; + if (canVisitGoogle) { + cList.push('google'); + } + + for (let c of cList) { + caches[c] = Object.create(null); + } rank.list('../json', f => { if (poetReg.test(f)) { - let base = path.basename(f); - if (!exist[base]) { - readList[path.resolve(f)] = 1; - } + readList[path.resolve(f)] = 1; } }); - if (fs.existsSync('./s.cache')) { - let c = rank.read('./s.cache'); - let j = JSON.parse(c); - for (let p in j) { - let a = j[p]; - for (let z in a) { - if (a[z] === 0) { - delete j[p]; - break; + for (let c of cList) { + if (fs.existsSync('./s.' + c + '.cache')) { + let d = rank.read('./s.' + c + '.cache'); + let j = JSON.parse(d); + if (!caches[c]) caches[c] = {}; + for (let e in j) { + if (j[e][c] === 0) { + delete j[e]; } } + Object.assign(caches[c], j); } - Object.assign(cache, j); } let loadList = Object.keys(readList); - - let singleWork = file => { + let singleWork = (file, type) => { let ranks = []; let zeros = []; return new Promise(resolve => { @@ -330,34 +328,20 @@ let poetTask = async () => { let r = list[start]; let kd = encodeURIComponent(`${r.author} ${r.title}`); let data, - delay = 0, - zeroRetry = 3; - if (cache[kd]) { - data = cache[kd]; + delay = 0 + if (caches[type][kd]) { + data = caches[type][kd]; } else { - do { - data = await rank.remote(kd); - let hasZero = false; - // for (let p in data) { - // if (data[p] === 0) { - // hasZero = true; - // break; - // } - // } - //baidu有时候查询不到,需要再次查询 - hasZero = data.baidu === 0; - if (!hasZero) { - zeroRetry = 0; - } else { - console.log('baidu zero result, retry', zeroRetry); - await wait(5 + Math.random() * 10); - zeroRetry--; - } - } while (zeroRetry); + data = await rank.remote(kd, type); data.author = r.author; data.title = r.title; - cache[kd] = data; - delay = delayBaseTime + delayMaxTime * Math.random(); + caches[type][kd] = data; + let dl = delays[type]; + delay = dl.base + dl.delay * Math.random(); + + if (start && ((start % 2) === 0)) { + rank.write('./s.' + type + '.cache', JSON.stringify(caches[type])); + } } for (let p in data) { if (data[p] === 0) { @@ -365,46 +349,117 @@ let poetTask = async () => { break; } } - if (start && ((start % 5) === 0)) { - rank.write('./s.cache', JSON.stringify(cache)); - } ranks.push(data); - console.log(start + '/' + list.length); + console.log(type + ':' + start + '/' + list.length); start++; setTimeout(task, delay); } else { - rank.write('./s.cache', JSON.stringify(cache)); + rank.write('./s.' + type + '.cache', JSON.stringify(caches[type])); resolve([ranks, zeros]); } }; task(); }); }; - let work = async list => { - let one = list.pop(); - console.log('remain ', list.length); - if (one) { - let f = path.basename(one); - let aim = f.replace(/poet\.(song|tang)/, 'poet.$1.rank'); - let zeroAim = f.replace(/poet\.(song|tang)/, 'poet.$1.zero'); - let [ranks, zeros] = await singleWork(one); - rank.write('./poet/' + aim, JSON.stringify(ranks, null, 4)); - if (zeros.length) { - rank.write('./poet/' + zeroAim, JSON.stringify(zeros, null, 4)); + let finised = {}; + let check = () => { + let all = true; + for (let c of cList) { + if (finised[c] !== true) { + all = false; } - cache = Object.create(null);//文件写入后,清理缓存 - work(list); - } else { + } + if (all) { resolve(); } }; - work(loadList); + let work = async (type, index, list) => { + let one = list[index]; + if (one) { + console.log(`${type} current ${index} ,total:${list.length}`); + let f = path.basename(one); + let aim = f.replace(/poet\.(song|tang)/, 'poet.$1.rank.' + type); + if (!fs.existsSync('./poet_temp/' + aim)) { + //let zeroAim = f.replace(/poet\.(song|tang)/, 'poet.$1.zero.' + type); + let [ranks, zeros] = await singleWork(one, type); + rank.write('./poet_temp/' + aim, JSON.stringify(ranks, null, 4)); + // if (zeros.length) { + // rank.write('./poet/' + zeroAim, JSON.stringify(zeros, null, 4)); + // } + caches[type] = Object.create(null);//文件写入后,清理缓存 + setTimeout(() => { + work(type, index + 1, list); + }, 10 * 1000); + } else { + console.log('ignore ' + aim); + work(type, index + 1, list); + } + } else { + finised[type] = true; + } + }; + for (let c of cList) { + work(c, 0, loadList); + } + check(); }); }; +let split = () => { + let keys = ['baidu', 'so360', 'bing', 'google']; + rank.list('./poet', f => { + let b = path.basename(f); + let d = JSON.parse(rank.read(f)); + for (let key of keys) { + let aim = b.replace(/poet\.(song|tang).rank/, 'poet.$1.rank.' + key); + let newList = []; + for (let e of d) { + newList.push({ + author: e.author, + title: e.title, + [key]: e[key] + }); + } + rank.write('./poet2/' + aim, JSON.stringify(newList, null, 4)); + } + }); +}; +let merge = () => { + let readList = {}; + let cList = ['baidu', 'so360', 'bing', 'google']; + rank.list('../json', f => { + if (poetReg.test(f)) { + readList[path.resolve(f)] = 1; + } + }); + let loadList = Object.keys(readList); + for (let ll of loadList) { + let f = path.basename(ll); + let aim = f.replace(/poet\.(song|tang)/, 'poet.$1.rank'); + let newList = [], + canMerge = true; + for (let c of cList) { + let src = f.replace(/poet\.(song|tang)/, 'poet.$1.rank.' + c); + if (fs.existsSync('./poet_temp/' + src)) { + let d = rank.read('./poet_temp/' + src); + let j = JSON.parse(d); + for (let i = j.length; i--;) { + newList[i] = Object.assign(newList[i] || {}, j[i]); + } + } else { + canMerge = false; + } + } + if (canMerge) { + rank.write('./poet/' + aim, JSON.stringify(newList, null, 4)); + } + } +}; (async () => { - await ciTask(); - //await poetTask(); + //merge(); + //await ciTask(); + await headless.before(); + await checkGoogle(); + await poetTask(); headless.after(); - fs.unlink('./s.cache'); - console.log('complete'); + merge(); })(); \ No newline at end of file