bfs_demo/helper.js

41 lines
1.4 KiB
JavaScript
Raw Permalink Normal View History

2025-02-13 20:15:43 -05:00
const cherio = require('cherio');
const path = require('path');
const fs = require('fs-extra');
const fsp = fs.promises;
const mkdirp = require('util').promisify(fs.mkdirp);
const urllib = require('url');
/**
 * Fetch one page, write its body under `savedir`, and yield the links it
 * contains that stay on the same host.
 *
 * The URL path is mirrored on disk: `/a/b.html` is written to
 * `<savedir>/a/b.html`. A path that ends in `/` (the site root included) is
 * written as `index.html` inside the matching directory, so the directory can
 * still be created later for pages beneath it.
 *
 * Responses whose status is not OK are logged and skipped: nothing is written
 * and nothing is yielded.
 *
 * @param {string} url - Absolute URL of the page to download.
 * @param {string} savedir - Directory that receives the mirrored files.
 * @yields {string} Absolute URL, without fragment, of each `<a href>` target
 *   that uses http(s) and has the same host as `url`.
 */
async function* save(url, savedir) {
  console.log({ url });
  const urlProperties = new URL(url);

  const resp = await fetch(url);
  if (!resp.ok) {
    // Do not mirror error pages or follow the links they contain.
    console.warn('Skipping', url, resp.status);
    return;
  }
  // NOTE(review): the body is read as text, so non-text link targets are
  // presumably not preserved byte-for-byte — confirm whether that matters.
  const text = await resp.text();

  // Map the URL path onto a relative file path. Directory-style paths get an
  // explicit file name; otherwise "/docs/" would be saved as a file "docs"
  // and block the later creation of the directory "docs/".
  const { pathname } = urlProperties;
  const segments = pathname.split('/');
  if (pathname === '' || pathname.endsWith('/')) segments.push('index.html');
  const relativeSavePath = path.join(...segments);
  console.log({ relativeSavePath });

  const savepath = path.join(savedir, relativeSavePath);
  await mkdirp(path.dirname(savepath));
  console.log('Saving', url, savepath);
  await fsp.writeFile(savepath, text);

  const dom = cherio(text);
  for (const link of dom.find('a').toArray()) {
    const href = cherio(link).attr('href');
    if (!href) continue; // Ignore missing or empty href

    let target;
    try {
      target = new URL(href, url); // Resolves relative hrefs against the page URL
    } catch (err) {
      // A single malformed link must not abort the whole crawl.
      console.warn('Ignoring malformed link', href, err.message);
      continue;
    }

    if (target.protocol !== 'http:' && target.protocol !== 'https:') continue;
    if (target.host !== urlProperties.host) continue; // Reject beyond the current domain

    // The fragment only selects a position inside the page; dropping it keeps
    // "page#a" and "page#b" from being treated as two different downloads.
    target.hash = '';
    yield target.href;
  }
}
// Export the crawl entry point. `main` is a function declaration, so it is
// hoisted and can be referenced here before its definition below.
module.exports = main;
/**
 * Crawl breadth-first from `rooturl`, saving every reachable page via `save`.
 *
 * URLs are taken from the front of `todownload` one at a time; each link that
 * `save` yields is appended to the queue unless it has already been queued or
 * downloaded. Every URL is recorded in `downloaded` before it is fetched, so
 * pages that link back to it (or to themselves) do not re-queue it and the
 * crawl terminates on sites with link cycles.
 *
 * @param {string} rooturl - URL the crawl starts from.
 * @param {string} savedir - Directory passed through to `save`.
 * @param {string[]} [downloaded=[]] - URLs to treat as already handled; each
 *   URL processed by this call is appended to it.
 * @param {string[]} [todownload=[]] - Pending queue; `rooturl` is appended to
 *   it and it is empty when the returned promise resolves.
 * @returns {Promise<void>} Resolves once the queue is exhausted.
 */
async function main(rooturl, savedir, downloaded = [], todownload = []) {
  todownload.push(rooturl);

  // Everything already fetched or queued, kept in a Set so the membership
  // check does not rescan both arrays for every discovered link.
  const seen = new Set([...downloaded, ...todownload]);

  while (todownload.length) {
    const saveurl = todownload.shift();
    // Record before fetching so links back to this page are ignored.
    downloaded.push(saveurl);

    for await (const newUrl of save(saveurl, savedir)) {
      if (seen.has(newUrl)) continue;
      seen.add(newUrl);
      todownload.push(newUrl);
    }
  }
}