const cherio = require('cherio');
const path = require('path');
const fs = require('fs-extra');
const fsp = fs.promises;
const mkdirp = require('util').promisify(fs.mkdirp);
const urllib = require('url');

/**
 * Convert a URL pathname into a relative file-system path.
 *
 * The root path and any path ending in `/` are mapped to an `index.html`
 * inside the corresponding directory, so that a page such as `/docs/` does
 * not occupy the file name `docs` that `/docs/page.html` needs as a directory.
 *
 * @param {string} pathname - The `pathname` component of a parsed URL.
 * @returns {string} A relative path built with the platform separator.
 */
function toRelativeSavePath(pathname) {
  const segments = pathname.split('/').filter(Boolean);
  if (segments.length === 0 || pathname.endsWith('/')) {
    segments.push('index.html');
  }
  return path.join(...segments);
}

/**
 * Download one page, write it under `savedir`, and yield the links it contains
 * that point at the same host.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @param {string} savedir - Directory under which the page is stored; the
 *   URL's pathname determines the location inside it.
 * @yields {string} Absolute, fragment-free URLs of same-host `<a href>` links.
 * @throws {TypeError} If `url` is not a valid absolute URL, or if `fetch`
 *   rejects. File-system errors from creating directories or writing the file
 *   also propagate.
 */
async function* save(url, savedir) {
  console.log({ url });
  const urlProperties = new URL(url);
  const resp = await fetch(url);
  const text = await resp.text();

  const relativeSavePath = toRelativeSavePath(urlProperties.pathname);
  console.log({ relativeSavePath });
  const savepath = path.join(savedir, relativeSavePath);
  await mkdirp(path.dirname(savepath));
  console.log('Saving', url, savepath);
  await fsp.writeFile(savepath, text);

  const dom = cherio(text);
  for (const link of dom.find('a').toArray()) {
    const href = cherio(link).attr('href');
    if (!href) continue; // Ignore anchors without a target.

    let target;
    try {
      target = new URL(href, url); // Resolves relative links against the page URL.
    } catch (err) {
      // One malformed href must not abort the whole crawl.
      console.warn('Skipping invalid link', href, err.message);
      continue;
    }

    // Only http(s) links can be fetched; mailto:, javascript:, etc. are skipped.
    if (target.protocol !== 'http:' && target.protocol !== 'https:') continue;
    // Stay on the host the page itself was served from.
    if (target.host !== urlProperties.host) continue;

    // Fragments address a position inside a page, not a different document.
    target.hash = '';
    yield target.href;
  }
}

module.exports = main;

/**
 * Crawl a site breadth-first starting at `rooturl`, saving every same-host
 * page reachable through `<a href>` links.
 *
 * Both arrays are mutated in place: URLs are removed from the front of
 * `todownload` as they are processed and appended to `downloaded` once their
 * page has been saved and its links collected.
 *
 * @param {string} rooturl - Absolute URL at which to start.
 * @param {string} savedir - Directory under which pages are written.
 * @param {string[]} [downloaded=[]] - URLs that must not be fetched again.
 * @param {string[]} [todownload=[]] - URLs already queued for fetching.
 * @returns {Promise<void>} Resolves when the queue is empty.
 */
async function main(rooturl, savedir, downloaded = [], todownload = []) {
  todownload.push(rooturl);

  // Every URL ever queued or finished; gives constant-time duplicate checks
  // and prevents re-queuing a page after it has left `todownload`.
  const seen = new Set([...downloaded, ...todownload]);

  while (todownload.length) {
    const saveurl = todownload.shift();
    for await (const newUrl of save(saveurl, savedir)) {
      if (seen.has(newUrl)) continue;
      seen.add(newUrl);
      todownload.push(newUrl);
    }
    downloaded.push(saveurl);
  }
}