const cherio = require('cherio');
const path = require('path');
const fs = require('fs-extra');
const fsp = fs.promises;
const mkdirp = require('util').promisify(fs.mkdirp);
const urllib = require('url');

/**
 * Convert a URL pathname into a file path relative to the save directory.
 *
 * Paths that denote a "directory" (the root, or anything ending in `/`) are
 * mapped to an `index.html` inside that directory. Without this, `/docs/`
 * would be written as a plain file named `docs`, and a later page such as
 * `/docs/page.html` could not create its parent directory.
 *
 * @param {string} pathname - The `pathname` component of a parsed URL.
 * @returns {string} Relative path (using the platform separator) to write to.
 */
function toRelativeSavePath(pathname) {
  const joined = path.join(...pathname.split('/'));
  if (joined === '.') {
    // Root document.
    return 'index.html';
  }
  if (pathname.endsWith('/')) {
    return path.join(joined, 'index.html');
  }
  return joined;
}

/**
 * Download a single page, write it below `savedir`, and yield every link on
 * that page that points to the same host.
 *
 * @param {string} url - Absolute URL of the page to download.
 * @param {string} savedir - Directory under which the page is stored; the
 *   file location mirrors the URL's pathname.
 * @yields {string} Absolute, fragment-free URLs of same-host `<a href>` links.
 * @throws Propagates errors from `fetch`, directory creation and file writing.
 */
async function* save(url, savedir) {
  console.log({ url });
  const urlProperties = new URL(url);
  const resp = await fetch(url);
  const text = await resp.text();

  const relativeSavePath = toRelativeSavePath(urlProperties.pathname);
  console.log({ relativeSavePath });

  const savepath = path.join(savedir, relativeSavePath);
  await mkdirp(path.dirname(savepath));
  console.log('Saving', url, savepath);
  await fsp.writeFile(savepath, text);

  const dom = cherio(text);
  for (const link of dom.find('a').toArray()) {
    const href = cherio(link).attr('href');
    if (!href) {
      // Anchor without a usable target.
      continue;
    }

    let target;
    try {
      // Resolves relative hrefs against the page URL.
      target = new URL(href, url);
    } catch (err) {
      if (!(err instanceof TypeError)) {
        throw err;
      }
      // One malformed link must not abort the whole crawl.
      console.warn('Skipping unparsable link', href);
      continue;
    }

    // Reject anything beyond the current host (host includes the port).
    if (target.host !== urlProperties.host) {
      continue;
    }

    // The fragment only addresses a position inside the same document, so
    // dropping it avoids treating `page#a` and `page#b` as different pages.
    target.hash = '';
    yield target.href;
  }
}

module.exports = main;

/**
 * Crawl a site breadth-first starting at `rooturl`, saving every reachable
 * same-host page below `savedir`.
 *
 * Both `downloaded` and `todownload` are mutated in place, so a caller that
 * passes its own arrays can observe progress or pre-seed the crawl state.
 *
 * @param {string} rooturl - Absolute URL where the crawl starts.
 * @param {string} savedir - Directory that receives the downloaded pages.
 * @param {string[]} [downloaded=[]] - URLs already processed; each page is
 *   appended here when its download begins.
 * @param {string[]} [todownload=[]] - Queue of URLs still to be fetched.
 * @returns {Promise<void>} Resolves once the queue is empty.
 * @throws Propagates any error raised while downloading or saving a page.
 */
async function main(rooturl, savedir, downloaded = [], todownload = []) {
  if (!todownload.includes(rooturl)) {
    todownload.push(rooturl);
  }

  // Every URL that is either finished or already queued.
  const seen = new Set([...downloaded, ...todownload]);

  while (todownload.length) {
    const saveurl = todownload.shift();

    // Record the page before following its links; otherwise a link back to
    // this page (or to itself) would re-queue it and the crawl would loop
    // forever on any site with cyclic navigation.
    if (!downloaded.includes(saveurl)) {
      downloaded.push(saveurl);
    }

    for await (const newUrl of save(saveurl, savedir)) {
      if (seen.has(newUrl)) {
        continue;
      }
      seen.add(newUrl);
      todownload.push(newUrl);
    }
  }
}