41 lines
1.4 KiB
JavaScript
41 lines
1.4 KiB
JavaScript
const cherio = require('cherio');
|
|
const path = require('path');
|
|
const fs = require('fs-extra');
|
|
const fsp = fs.promises;
|
|
const mkdirp = require('util').promisify(fs.mkdirp);
|
|
const urllib = require('url');
|
|
|
|
/**
 * Download one page, write it beneath `savedir`, and yield the same-host
 * links found in it.
 *
 * The file location mirrors the URL's pathname. A pathname ending in `/`
 * (including the site root) is stored as `index.html` inside that directory,
 * so the page cannot occupy the path that deeper pages need as a directory.
 * The query string does not take part in the file name.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @param {string} savedir - Directory under which the page is written.
 * @yields {string} Absolute URL, with any fragment removed, of each
 *   `<a href>` whose host equals the host of `url`.
 */
async function* save(url, savedir) {
  console.log({url});
  const pageUrl = new URL(url);

  const resp = await fetch(url);
  const text = await resp.text();

  // Mirror the URL path on disk. `URL` has already collapsed any `.`/`..`
  // segments, so the joined path stays below `savedir`.
  const segments = pageUrl.pathname.split('/');
  if (pageUrl.pathname === '' || pageUrl.pathname.endsWith('/')) {
    // Directory-style URL (root included): keep the directory usable for
    // child pages by saving the document as its index file.
    segments.push('index.html');
  }
  const relativeSavePath = path.join(...segments);
  console.log({relativeSavePath});

  const savepath = path.join(savedir, relativeSavePath);
  await mkdirp(path.dirname(savepath));
  console.log('Saving', url, savepath);
  await fsp.writeFile(savepath, text);

  const dom = cherio(text);
  for (const link of dom.find('a').toArray()) {
    const href = cherio(link).attr('href');
    if (!href) continue; // Ignore anchors without a target

    let target;
    try {
      // Resolves relative links against the page URL.
      target = new URL(href, pageUrl);
    } catch (err) {
      // One malformed href must not abort the whole crawl.
      console.warn('Skipping unparsable link', href, 'on', url);
      continue;
    }

    // Stay on the current host (this also drops mailto:, javascript:, ...).
    if (target.host !== pageUrl.host) continue;

    // A fragment addresses a position inside the same document; drop it so
    // `page#a` and `page#b` are not treated as two different pages.
    target.hash = '';
    yield target.href;
  }
}
|
|
|
|
module.exports = main;
|
|
/**
 * Crawl a site breadth-first from `rooturl`, saving every reachable page
 * through `save`.
 *
 * Each URL is fetched at most once: a URL is appended to `downloaded` when it
 * is taken off the queue, and links that were already queued or downloaded
 * are ignored. Both arrays are mutated in place, so a caller that passes its
 * own arrays can inspect them afterwards.
 *
 * @param {string} rooturl - URL at which the crawl starts; always queued.
 * @param {string} savedir - Directory passed through to `save`.
 * @param {string[]} [downloaded=[]] - URLs to treat as already fetched;
 *   receives every URL processed by this call.
 * @param {string[]} [todownload=[]] - Queue of pending URLs; empty when the
 *   returned promise resolves.
 * @returns {Promise<void>} Resolves once the queue has been drained.
 */
async function main(rooturl, savedir, downloaded = [], todownload = []) {
  // Every URL that is either finished or already waiting in the queue.
  const seen = new Set([...downloaded, ...todownload]);

  todownload.push(rooturl);
  seen.add(rooturl);

  while (todownload.length) {
    const saveurl = todownload.shift();
    // Record the page before reading its links so self-links and link
    // cycles cannot put it back on the queue.
    downloaded.push(saveurl);

    for await (const newUrl of save(saveurl, savedir)) {
      if (seen.has(newUrl)) continue;
      seen.add(newUrl);
      todownload.push(newUrl);
    }
  }
}