Extract rich metadata from URLs.
npm install scrappy --save

Starting from extractFromUrl, scrappy creates an HTTP request (scrapeUrl) and streams the response into the scraper (scrapeStream). The scraper extracts metadata based on various specifications and standards, including HTML, RDFa, JSON-LD, Microdata, Open Graph and OEmbed. With all the relevant metadata, it uses extract to select the appropriate snippet. If you need snippets in a different format, you can create your own extraction method that accepts the scraped metadata.
import { scrapeUrl, scrapeStream, extract, extractFromUrl } from 'scrappy'
const url = 'https://medium.com/slack-developer-blog/everything-you-ever-wanted-to-know-about-unfurling-but-were-afraid-to-ask-or-how-to-make-your-e64b4bb9254#.a0wjf4ltt'
extractFromUrl(url).then(function (snippet) {
// {
// "type": "summary",
// "imageUrl": "https://cdn-images-1.medium.com/max/1200/1*QOMaDLcO8rExD0ctBV3BWg.png",
// "contentUrl": "https://medium.com/slack-developer-blog/everything-you-ever-wanted-to-know-about-unfurling-but-were-afraid-to-ask-or-how-to-make-your-e64b4bb9254",
// "originalUrl": "https://medium.com/slack-developer-blog/everything-you-ever-wanted-to-know-about-unfurling-but-were-afraid-to-ask-or-how-to-make-your-e64b4bb9254#.a0wjf4ltt",
// "encodingFormat": "html",
// "headline": "Everything you ever wanted to know about unfurling but were afraid to ask /or/ How to make your… — Slack Platform Blog",
// "caption": "Let’s start with the most obvious question first. This is what an “unfurl” is:",
// "siteName": "Medium",
// "author": "Matt Haughey",
// "publisher": "https://www.facebook.com/medium",
// "apps": {
// "iphone": {
// "id": "828256236",
// "name": "Medium",
// "url": "medium://p/e64b4bb9254"
// },
// "ipad": {
// "id": "828256236",
// "name": "Medium",
// "url": "medium://p/e64b4bb9254"
// },
// "android": {
// "id": "com.medium.reader",
// "name": "Medium",
// "url": "medium://p/e64b4bb9254"
// }
// }
// }
})

# Build the fixtures directory with raw content.
node scripts/fixtures.js
# Scrape the metadata results from fixtures.
node scripts/scrape.js
# Extract the snippets from the previous results.
node scripts/extract.js

Apache 2.0