Sharkey/src/tools/analysis/extract-user-domains.ts

121 lines
2.3 KiB
TypeScript

import * as URL from 'url';
import Post from '../../api/models/post';
import User from '../../api/models/user';
import parse from '../../api/common/text';
process.on('unhandledRejection', console.dir);
function tokenize(text: string) {
if (text == null) return [];
// パース
const ast = parse(text);
const domains = ast
// URLを抽出
.filter(t => t.type == 'url' || t.type == 'link')
.map(t => URL.parse(t.url).hostname);
return domains;
}
// Fetch all users
User.find({}, {
fields: {
_id: true
}
}).then(users => {
let i = -1;
const x = cb => {
if (++i == users.length) return cb();
extractDomainsOne(users[i]._id).then(() => x(cb), err => {
console.error(err);
setTimeout(() => {
i--;
x(cb);
}, 1000);
});
};
x(() => {
console.log('complete');
});
});
function extractDomainsOne(id) {
return new Promise(async (resolve, reject) => {
process.stdout.write(`extracting domains of ${id} ...`);
// Fetch recent posts
const recentPosts = await Post.find({
user_id: id,
text: {
$exists: true
}
}, {
sort: {
_id: -1
},
limit: 10000,
fields: {
_id: false,
text: true
}
});
// 投稿が少なかったら中断
if (recentPosts.length < 100) {
process.stdout.write(' >>> -\n');
return resolve();
}
const domains = {};
// Extract domains from recent posts
recentPosts.forEach(post => {
const domainsOfPost = tokenize(post.text);
domainsOfPost.forEach(domain => {
if (domains[domain]) {
domains[domain]++;
} else {
domains[domain] = 1;
}
});
});
// Calc peak
let peak = 0;
Object.keys(domains).forEach(domain => {
if (domains[domain] > peak) peak = domains[domain];
});
// Sort domains by frequency
const domainsSorted = Object.keys(domains).sort((a, b) => domains[b] - domains[a]);
// Lookup top 10 domains
const topDomains = domainsSorted.slice(0, 10);
process.stdout.write(' >>> ' + topDomains.join(', ') + '\n');
// Make domains object (includes weights)
const domainsObj = topDomains.map(domain => ({
domain: domain,
weight: domains[domain] / peak
}));
// Save
User.update({ _id: id }, {
$set: {
domains: domainsObj
}
}).then(() => {
resolve();
}, err => {
reject(err);
});
});
}