feat: working scraper
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -9,3 +9,4 @@ coverage
|
||||
.nyc_output
|
||||
coverage.lcov
|
||||
/lib
|
||||
db.json
|
||||
25
package.json
25
package.json
@@ -8,7 +8,8 @@
|
||||
"lint": "eslint \"*.{js,mjs}\" \"src/**/*.{js,mjs}\"",
|
||||
"precommit": "lint-staged",
|
||||
"version": "auto-changelog -p && auto-authors && git add CHANGELOG.md AUTHORS.md",
|
||||
"start": "node ."
|
||||
"start": "node .",
|
||||
"dev": "nodemon --ignore db.json ."
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
@@ -36,8 +37,12 @@
|
||||
"commitLimit": false
|
||||
},
|
||||
"lint-staged": {
|
||||
"*.{js,mjs}": ["eslint --fix"],
|
||||
"*.{js,mjs,json,css}": ["prettier --write"]
|
||||
"*.{js,mjs}": [
|
||||
"eslint --fix"
|
||||
],
|
||||
"*.{js,mjs,json,css}": [
|
||||
"prettier --write"
|
||||
]
|
||||
},
|
||||
"prettier": {
|
||||
"printWidth": 100,
|
||||
@@ -48,20 +53,24 @@
|
||||
"cjs": true
|
||||
},
|
||||
"dependencies": {
|
||||
"esm": "^3.0.17"
|
||||
"axios": "^0.18.0",
|
||||
"esm": "^3.0.17",
|
||||
"lodash": "^4.17.10",
|
||||
"lowdb": "^1.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"auto-authors": "^0.1.1",
|
||||
"auto-changelog": "^1.7.0",
|
||||
"eslint": "^4.9.0",
|
||||
"eslint-config-airbnb": "^16.1.0",
|
||||
"eslint-config-prettier": "^2.9.0",
|
||||
"eslint-plugin-prettier": "^2.3.1",
|
||||
"eslint-plugin-import": "^2.7.0",
|
||||
"eslint-plugin-jsx-a11y": "^6.0.2",
|
||||
"eslint-plugin-prettier": "^2.3.1",
|
||||
"eslint-plugin-react": "^7.1.0",
|
||||
"husky": "^0.14.3",
|
||||
"lint-staged": "^7.0.4",
|
||||
"eslint-config-airbnb": "^16.1.0",
|
||||
"eslint-plugin-jsx-a11y": "^6.0.2",
|
||||
"eslint-plugin-react": "^7.1.0",
|
||||
"nodemon": "^1.18.4",
|
||||
"prettier": "^1.9.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
export default function() {
|
||||
// es6 module code goes here
|
||||
import leafly from './leafly.mjs';

// Entry point: run the Leafly scraper end-to-end.
async function scrape() {
  await leafly();
}

// BUG FIX: the original called `scrape();` as a floating promise — any
// failure inside the scraper surfaced only as an unhandled rejection.
// Attach a handler so errors are logged and the process exits non-zero.
scrape().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
||||
74
src/leafly.mjs
Normal file
74
src/leafly.mjs
Normal file
@@ -0,0 +1,74 @@
|
||||
import axios from 'axios';
|
||||
import get from 'lodash/get';
|
||||
import low from 'lowdb';
|
||||
import FileAsync from 'lowdb/adapters/FileAsync';
|
||||
|
||||
// Async file-backed lowdb store, persisted to db.json in the working dir.
const adapter = new FileAsync('db.json');

// Headers Leafly's explore endpoint expects before it returns JSON strain
// data (presumably it sniffs Accept/Referer — confirmed only empirically).
const defaultHeaders = {
  Accept: 'application/json, text/plain, */*',
  Referer: 'https://www.leafly.com/explore',
};

// Shared, pre-configured HTTP client for all Leafly requests.
const xhr = axios.create({ headers: defaultHeaders });
|
||||
/**
 * Fetch one page of strains from Leafly's explore endpoint and normalize
 * the records into the shape stored in db.json.
 *
 * @param {number} num - 1-based page number to fetch.
 * @returns {Promise<{strains: Object[], page: Object}>} normalized strain
 *   records plus the server's paging context (carries `isLastPage`).
 */
const getPage = async num => {
  const response = await xhr.get(`https://www.leafly.com/explore/page-${num}`, {
    responseType: 'json',
  });

  // Normalize one raw API strain into our persisted record shape.
  const toRecord = strain => {
    // Lowercased DisplayLabel list for a tag-array field, [] when absent.
    const labels = field => get(strain, field, []).map(tag => tag.DisplayLabel.toLowerCase());
    return {
      id: strain.Id,
      name: strain.Name,
      symbol: strain.Symbol,
      category: get(strain, 'Category', 'unknown').toLowerCase(),
      profile: get(strain, 'CannabinoidProfile', 'none').toLowerCase(),
      rating: strain.Rating,
      rating_count: strain.RatingCount,
      effects: labels('Tags'),
      negative_effects: labels('NegativeEffects'),
      flavors: labels('Flavors'),
      uses: labels('Symptoms'),
      conditions: labels('Conditions'),
    };
  };

  // Drop unnamed entries (Name is null/undefined), then normalize the rest.
  const strains = response.data.Model.Strains.filter(strain => strain.Name != null).map(toRecord);

  return {
    strains,
    page: response.data.Model.PagingContext,
  };
};
|
||||
/**
 * Scrape Leafly strain pages into the lowdb store, skipping strains that
 * are already present (dedup by `id`).
 *
 * @param {number} [startFrom=1] - first page number to fetch.
 * @param {number} [endAt=Infinity] - last page number to fetch (inclusive).
 * @returns {Promise<void>} resolves once every fetched page is persisted.
 */
export default async function scrapeLeafly(startFrom = 1, endAt = Infinity) {
  let pageNum = startFrom;
  let finished = false;
  const db = await low(adapter);

  await db.defaults({ strains: [] }).write();

  while (!finished) {
    console.log(`Fetching page ${pageNum}`);
    const data = await getPage(pageNum);

    // BUG FIX: the original used data.strains.forEach(async strain => ...).
    // forEach ignores the promises its callback returns, so the db writes
    // raced each other and the loop (and the whole function) could finish
    // before any write completed; rejections were unhandled. A for...of
    // loop awaits each write in turn.
    for (const strain of data.strains) {
      // Skip strains already persisted on a previous page or run.
      const doc = db
        .get('strains')
        .filter({ id: strain.id })
        .first()
        .value();

      if (!doc) {
        console.log(`Adding ${strain.id}, ${strain.name}`);
        await db
          .get('strains')
          .push(strain)
          .write();
      }
    }

    // Stop at the caller's limit, an empty page, or the server's last page.
    if (pageNum >= endAt || !data.strains.length || data.page.isLastPage) finished = true;
    pageNum += 1;
  }
}
||||
Reference in New Issue
Block a user