Skip to content

Commit

Permalink
Perf: further speed up infra
Browse files Browse the repository at this point in the history
  • Loading branch information
SukkaW committed Sep 14, 2023
1 parent adb8b43 commit 78afa59
Show file tree
Hide file tree
Showing 25 changed files with 429 additions and 171 deletions.
24 changes: 17 additions & 7 deletions .eslintrc.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
{
"root": true,
"extends": ["sukka/node"],
"rules": {
"no-console": "off"
},
"parserOptions": {
"ecmaVersion": "latest",
"sourceType": "module"
}
"ignorePatterns": [
"node_modules/",
// disable for now
"**/*.d.ts"
],
"overrides": [
{
"files": ["**/*.js"],
"rules": {
"no-console": "off"
},
"parserOptions": {
"ecmaVersion": "latest",
"sourceType": "module"
}
}
]
}
34 changes: 24 additions & 10 deletions Build/build-cdn-conf.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,22 @@ const { minifyRules } = require('./lib/minify-rules');
const { fetchRemoteTextAndCreateReadlineInterface, readFileByLine } = require('./lib/fetch-remote-text-by-line');
const Trie = require('./lib/trie');
const { runner } = require('./lib/trace-runner');
const fs = require('fs');

const publicSuffixPath = path.resolve(__dirname, '../node_modules/.cache/public_suffix-list_dat.txt');

runner(__filename, async () => {
const trie = new Trie();
for await (const line of await fetchRemoteTextAndCreateReadlineInterface('https://publicsuffix.org/list/public_suffix_list.dat')) {
trie.add(line);

if (fs.existsSync(publicSuffixPath)) {
for await (const line of readFileByLine(publicSuffixPath)) {
trie.add(line);
}
} else {
console.log('public_suffix_list.dat not found, fetch directly from remote.');
for await (const line of await fetchRemoteTextAndCreateReadlineInterface('https://publicsuffix.org/list/public_suffix_list.dat')) {
trie.add(line);
}
}

/**
Expand All @@ -18,13 +29,16 @@ runner(__filename, async () => {
*/
const S3OSSDomains = new Set();

trie.find('.amazonaws.com')
.filter(line => (line.startsWith('s3-') || line.startsWith('s3.')) && !line.includes('cn-'))
.forEach(line => S3OSSDomains.add(line));

trie.find('.scw.cloud')
.filter(line => (line.startsWith('s3-') || line.startsWith('s3.')) && !line.includes('cn-'))
.forEach(line => S3OSSDomains.add(line));
trie.find('.amazonaws.com').forEach(line => {
if ((line.startsWith('s3-') || line.startsWith('s3.')) && !line.includes('cn-')) {
S3OSSDomains.add(line);
}
});
trie.find('.scw.cloud').forEach(line => {
if ((line.startsWith('s3-') || line.startsWith('s3.')) && !line.includes('cn-')) {
S3OSSDomains.add(line);
}
});

/** @type {string[]} */
const cdnDomainsList = [];
Expand All @@ -45,7 +59,7 @@ runner(__filename, async () => {
];
const ruleset = minifyRules(cdnDomainsList);

await Promise.all(createRuleset(
return Promise.all(createRuleset(
'Sukka\'s Ruleset - CDN Domains',
description,
new Date(),
Expand Down
11 changes: 9 additions & 2 deletions Build/build-domestic-ruleset.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,15 @@ runner(__filename, async () => {

results.push(
...Object.entries(DOMESTICS)
.filter(([key]) => key !== 'SYSTEM')
.flatMap(([, { domains }]) => domains)
.reduce(
(acc, [key, { domains }]) => {
if (key === 'SYSTEM') {
return acc;
}
return [...acc, ...domains];
},
/** @type {string[]} */([])
)
.sort(domainSorter)
.map((domain) => `DOMAIN-SUFFIX,${domain}`)
);
Expand Down
15 changes: 9 additions & 6 deletions Build/build-internal-cdn-rules.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// @ts-check
const fse = require('fs-extra');
const path = require('path');
const { isDomainLoose } = require('./lib/is-domain-loose');
const tldts = require('tldts');
const { processLine } = require('./lib/process-line');
const { readFileByLine } = require('./lib/fetch-remote-text-by-line');
Expand Down Expand Up @@ -35,11 +34,15 @@ runner(__filename, async () => {
*/
const processLocalDomainSet = async (domainSetPath) => {
for await (const line of readFileByLine(domainSetPath)) {
if (line[0] === '.') {
addApexDomain(line.slice(1));
} else if (isDomainLoose(line)) {
addApexDomain(line);
} else if (processLine(line)) {
const parsed = tldts.parse(line, { allowPrivateDomains: true });
if (!parsed.isIp && (parsed.isIcann || parsed.isPrivate)) {
if (parsed.domain) {
set.add(parsed.domain);
}
continue;
}

if (processLine(line)) {
console.warn('[drop line from domainset]', line);
}
}
Expand Down
8 changes: 4 additions & 4 deletions Build/build-internal-chn-domains.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
// @ts-check
const path = require('path');
const fse = require('fs-extra');
const fs = require('fs');
const { parseFelixDnsmasq } = require('./lib/parse-dnsmasq');
const { runner } = require('./lib/trace-runner');
const { compareAndWriteFile } = require('./lib/create-file');

runner(__filename, async () => {
const [result] = await Promise.all([
parseFelixDnsmasq('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'),
fse.ensureDir(path.resolve(__dirname, '../List/internal'))
]);

await fs.promises.writeFile(
path.resolve(__dirname, '../List/internal/accelerated-china-domains.txt'),
`${result.map(line => `SUFFIX,${line}`).join('\n')}\n`
await compareAndWriteFile(
result.map(line => `SUFFIX,${line}`),
path.resolve(__dirname, '../List/internal/accelerated-china-domains.txt')
);
});
51 changes: 22 additions & 29 deletions Build/build-phishing-domainset.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
const { parse } = require('tldts');
const tldts = require('tldts');
const { processFilterRules } = require('./lib/parse-filter.js');
const path = require('path');
const { createRuleset } = require('./lib/create-file');
const { processLine } = require('./lib/process-line.js');
const domainSorter = require('./lib/stable-sort-domain');
const { runner } = require('./lib/trace-runner.js');
const { runner, traceSync } = require('./lib/trace-runner.js');

const WHITELIST_DOMAIN = new Set([
'w3s.link',
Expand Down Expand Up @@ -61,19 +61,14 @@ const BLACK_TLD = new Set([
]);

runner(__filename, async () => {
const domainSet = Array.from(
(await processFilterRules('https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt')).black
);
const domainSet = Array.from((await processFilterRules('https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt')).black);
const domainCountMap = {};

for (let i = 0, len = domainSet.length; i < len; i++) {
const line = processLine(domainSet[i]);
if (!line) continue;

const domain = line.charCodeAt(0) === 46 ? line.slice(1) : line;

const parsed = parse(domain, { allowPrivateDomains: true });

const parsed = tldts.parse(line, { allowPrivateDomains: true });
const apexDomain = parsed.domain;

if (apexDomain) {
Expand All @@ -84,19 +79,18 @@ runner(__filename, async () => {
domainCountMap[apexDomain] ||= 0;

let isPhishingDomainMockingAmazon = false;

if (domain.startsWith('amaz')) {
if (line.startsWith('.amaz')) {
domainCountMap[apexDomain] += 0.5;

isPhishingDomainMockingAmazon = true;

if (domain.startsWith('amazon-')) {
if (line.startsWith('.amazon-')) {
domainCountMap[apexDomain] += 4.5;
}
} else if (domain.startsWith('customer')) {
} else if (line.startsWith('.customer')) {
domainCountMap[apexDomain] += 0.25;
}
if (domain.includes('-co-jp')) {
if (line.includes('-co-jp')) {
domainCountMap[apexDomain] += (isPhishingDomainMockingAmazon ? 4.5 : 0.5);
}

Expand All @@ -105,17 +99,17 @@ runner(__filename, async () => {

domainCountMap[apexDomain] += 1;

if (domain.length > 19) {
if (line.length > 19) {
// Add more weight if the domain is long enough
if (domain.length > 44) {
if (line.length > 44) {
domainCountMap[apexDomain] += 3.5;
} else if (domain.length > 34) {
} else if (line.length > 34) {
domainCountMap[apexDomain] += 2.5;
} else if (domain.length > 29) {
} else if (line.length > 29) {
domainCountMap[apexDomain] += 1.5;
} else if (domain.length > 24) {
} else if (line.length > 24) {
domainCountMap[apexDomain] += 0.75;
} else if (domain.length > 19) {
} else if (line.length > 19) {
domainCountMap[apexDomain] += 0.25;
}

Expand All @@ -129,15 +123,14 @@ runner(__filename, async () => {
}
}

const results = [];

Object.entries(domainCountMap).forEach(([domain, count]) => {
if (count >= 5) {
results.push(`.${domain}`);
}
});

results.sort(domainSorter);
const results = traceSync('* get final results', () => Object.entries(domainCountMap)
.reduce((acc, [apexDomain, count]) => {
if (count >= 5) {
acc.push(`.${apexDomain}`);
}
return acc;
}, /** @type {string[]} */([]))
.sort(domainSorter));

const description = [
'License: AGPL 3.0',
Expand Down
71 changes: 39 additions & 32 deletions Build/build-reject-domainset.js
Original file line number Diff line number Diff line change
@@ -1,28 +1,29 @@
// @ts-check
const fs = require('fs');
const fse = require('fs-extra');
const { resolve: pathResolve } = require('path');

const tldts = require('tldts');

const { processHosts, processFilterRules } = require('./lib/parse-filter');
const Trie = require('./lib/trie');

const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST } = require('./lib/reject-data-source');
const { createRuleset } = require('./lib/create-file');
const { createRuleset, compareAndWriteFile } = require('./lib/create-file');
const { processLine } = require('./lib/process-line');
const { domainDeduper } = require('./lib/domain-deduper');
const createKeywordFilter = require('./lib/aho-corasick');
const { readFileByLine } = require('./lib/fetch-remote-text-by-line');
const domainSorter = require('./lib/stable-sort-domain');
const { createDomainSorter } = require('./lib/stable-sort-domain');
const { traceSync, runner } = require('./lib/trace-runner');
const { getGorhillPublicSuffixPromise } = require('./lib/get-gorhill-publicsuffix');
const { createCachedGorhillGetDomain } = require('./lib/cached-tld-parse');

/** Whitelists */
const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
/** @type {Set<string>} Dedupe domains included by DOMAIN-KEYWORD */
const domainKeywordsSet = new Set();
/** @type {Set<string>} Dedupe domains included by DOMAIN-SUFFIX */
const domainSuffixSet = new Set();
(async () => {

runner(__filename, async () => {
/** @type Set<string> */
const domainSets = new Set();

Expand All @@ -31,7 +32,8 @@ const domainSuffixSet = new Set();

let shouldStop = false;

await Promise.all([
const [gorhill] = await Promise.all([
getGorhillPublicSuffixPromise,
// Parse from remote hosts & domain lists
...HOSTS.map(entry => processHosts(entry[0], entry[1]).then(hosts => {
hosts.forEach(host => {
Expand Down Expand Up @@ -129,7 +131,7 @@ const domainSuffixSet = new Set();
console.log(`Start deduping from black keywords/suffixes! (${previousSize})`);
console.time('* Dedupe from black keywords/suffixes');

const kwfilter = createKeywordFilter(Array.from(domainKeywordsSet));
const kwfilter = createKeywordFilter(domainKeywordsSet);

const trie1 = Trie.from(domainSets);
domainSuffixSet.forEach(suffix => {
Expand Down Expand Up @@ -167,19 +169,35 @@ const domainSuffixSet = new Set();

const START_TIME = Date.now();

const dudupedDominArray = domainDeduper(Array.from(domainSets));
const dudupedDominArray = traceSync('* Dedupe from covered subdomain', () => domainDeduper(Array.from(domainSets)));

console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`);
console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`);

/** @type {Record<string, number>} */
const rejectDomainsStats = dudupedDominArray.reduce((acc, cur) => {
const suffix = tldts.getDomain(cur, { allowPrivateDomains: false });
if (suffix) {
acc[suffix] = (acc[suffix] ?? 0) + 1;
}
return acc;
}, {});
// Create reject stats
const getDomain = createCachedGorhillGetDomain(gorhill);
/** @type {[string, number][]} */
const rejectDomainsStats = traceSync(
'* Collect reject domain stats',
() => Object.entries(
dudupedDominArray.reduce((acc, cur) => {
const suffix = getDomain(cur);
if (suffix) {
acc[suffix] = (acc[suffix] ?? 0) + 1;
}
return acc;
}, {})
).filter(a => a[1] > 2).sort((a, b) => {
const t = b[1] - a[1];
if (t === 0) {
return a[0].localeCompare(b[0]);
}
return t;
})
);

const domainSorter = createDomainSorter(gorhill);
const domainset = traceSync('* Sort reject domainset', () => dudupedDominArray.sort(domainSorter));

const description = [
'License: AGPL 3.0',
Expand All @@ -192,7 +210,6 @@ const domainSuffixSet = new Set();
...HOSTS.map(host => ` - ${host[0]}`),
...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`)
];
const domainset = dudupedDominArray.sort(domainSorter);

await Promise.all([
...createRuleset(
Expand All @@ -204,21 +221,11 @@ const domainSuffixSet = new Set();
pathResolve(__dirname, '../List/domainset/reject.conf'),
pathResolve(__dirname, '../Clash/domainset/reject.txt')
),
fs.promises.writeFile(
pathResolve(__dirname, '../List/internal/reject-stats.txt'),
Object.entries(rejectDomainsStats)
.filter(a => a[1] > 1)
.sort((a, b) => {
const t = b[1] - a[1];
if (t === 0) {
return a[0].localeCompare(b[0]);
}
return t;
})
.map(([domain, count]) => `${domain}${' '.repeat(100 - domain.length)}${count}`)
.join('\n')
compareAndWriteFile(
rejectDomainsStats.map(([domain, count]) => `${domain}${' '.repeat(100 - domain.length)}${count}`),
pathResolve(__dirname, '../List/internal/reject-stats.txt')
),
// Copy reject_sukka.conf for backward compatibility
fse.copy(pathResolve(__dirname, '../Source/domainset/reject_sukka.conf'), pathResolve(__dirname, '../List/domainset/reject_sukka.conf'))
]);
})();
});
Loading

0 comments on commit 78afa59

Please sign in to comment.