From fa2db8d4304b0b0d91221b4ef942daa74ab3df94 Mon Sep 17 00:00:00 2001
From: joe fleming
Date: Thu, 13 Sep 2018 15:16:42 -0700
Subject: [PATCH] fix: scrape strains alphabetically

Sorting the explore pages alphabetically gives pagination a stable
order, which ensures that we get every strain; previously some strains
were skipped on the first few runs (see the removed README note).
---
 packages/scraper/README.md      | 2 --
 packages/scraper/src/leafly.mjs | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/packages/scraper/README.md b/packages/scraper/README.md
index 7ff6990..f1d2aea 100644
--- a/packages/scraper/README.md
+++ b/packages/scraper/README.md
@@ -8,5 +8,3 @@ Clone repo and run the command. Resulting data can be found in `db.json`.
 yarn install
 yarn start
 ```
-
-**NOTE**: You may need to run it multiple times (4 or 5 should do it), since some strains will get skipped the first few times. I don't know if it's leafly's endpoint or some weird race condition in the scraping code though.
diff --git a/packages/scraper/src/leafly.mjs b/packages/scraper/src/leafly.mjs
index dfd6865..353f432 100644
--- a/packages/scraper/src/leafly.mjs
+++ b/packages/scraper/src/leafly.mjs
@@ -8,14 +8,14 @@ const adapter = new FileAsync('db.json');
 const xhr = axios.create({
   headers: {
     Accept: 'application/json, text/plain, */*',
-    Referer: 'https://www.leafly.com/explore',
+    Referer: 'https://www.leafly.com/explore/sort-alpha',
   },
 });
 
 const pSeries = tasks => tasks.reduce((c, task) => c.then(task), Promise.resolve());
 
 const getPage = async num => {
-  const url = `https://www.leafly.com/explore/page-${num}`;
+  const url = `https://www.leafly.com/explore/page-${num}/sort-alpha`;
   const response = await xhr.get(url, {
     responseType: 'json',
   });
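
For context, here is a minimal sketch of how the patched pieces fit together: the `xhr` client, the `pSeries` sequential runner, and `getPage` come straight from the hunks above, while the `scrape` driver, the `totalPages` parameter, and the `strains` response property are illustrative assumptions — the real page count and response parsing live elsewhere in `leafly.mjs`.

```js
import axios from 'axios';

// Same client setup as the patched file: the Referer now matches the
// alphabetically sorted explore view used by the page URLs below.
const xhr = axios.create({
  headers: {
    Accept: 'application/json, text/plain, */*',
    Referer: 'https://www.leafly.com/explore/sort-alpha',
  },
});

// Runs promise-returning tasks one after another (from the hunk context);
// sequential fetching plus a stable sort order is what keeps pages from
// shifting underneath the scraper mid-run.
const pSeries = tasks => tasks.reduce((c, task) => c.then(task), Promise.resolve());

// Fetch one page of the alphabetized listing; mirrors the patched getPage.
const getPage = async num => {
  const url = `https://www.leafly.com/explore/page-${num}/sort-alpha`;
  const response = await xhr.get(url, { responseType: 'json' });
  return response.data;
};

// Hypothetical driver for illustration only: fetch the first N pages in
// order and accumulate results. The page count and the `strains` property
// are assumptions, not part of this patch.
const scrape = async (totalPages = 5) => {
  const results = [];
  const tasks = Array.from({ length: totalPages }, (_, i) => async () => {
    const data = await getPage(i + 1);
    results.push(...(data.strains || []));
  });
  await pSeries(tasks);
  return results;
};
```

With an unsorted listing, two sequential page fetches can observe different orderings and miss entries between them; pinning `/sort-alpha` on both the Referer and the page URLs makes each page deterministic, which is why the "run it 4 or 5 times" note could be deleted from the README.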