-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Extractor API #16
Closed
Closed
Extractor API #16
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
2d5b15a
Merge pull request #15 from akirk/simplify-directory-structure
psrpinto ea60764
Add Extractor API
psrpinto 8c72ee0
Temporarily comment out playground boot
psrpinto ad143a3
Call extractor
psrpinto d179748
Implement handles() of wordpress-rest extractor
psrpinto 867321e
Make extract method async
psrpinto 2910938
Rename meta to info
psrpinto e3a25ed
Rename Entry to SiteData
psrpinto 6ef8e11
Rename function to extractData()
psrpinto 33018f5
Add SiteInfo
psrpinto c3783e0
Add Source
psrpinto 0eea4dd
Rename handles() to supports()
psrpinto 0a331b7
Validate slug
psrpinto File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import { SourceData, SourceInfo, Source } from './source'; | ||
|
||
/**
 * Static metadata describing an Extractor.
 */
export interface ExtractorInfo {
	/**
	 * Unique identifier of the Extractor, e.g. "wordpress-rest".
	 * Must be a lower-case string.
	 * No two extractors may share the same slug.
	 */
	slug: string;

	/**
	 * Title of the Extractor, e.g. "WordPress".
	 */
	title: string;

	/**
	 * Description of the Extractor, e.g. "Extracts posts and pages from a
	 * WordPress site using the WordPress REST API".
	 */
	description: string;
}
|
||
/**
 * Contract implemented by every extractor.
 */
export interface Extractor {
	/**
	 * Returns information about the Extractor.
	 */
	info(): ExtractorInfo;

	/**
	 * Tells whether the Extractor supports a given Source.
	 *
	 * @param source The Source to check.
	 */
	supports( source: Source ): boolean;

	/**
	 * Extracts information about the Source, like its title, language, etc.
	 *
	 * @param source The Source to read the information from.
	 */
	extractInfo( source: Source ): Promise< SourceInfo >;

	/**
	 * Extracts data from a given Source.
	 *
	 * @param source   The Source to extract data from.
	 * @param callback Receives the extracted pieces of data.
	 */
	extractData(
		source: Source,
		callback: ( data: SourceData ) => void
	): Promise< void >;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import { Extractor } from './extractor'; | ||
import { WordPressRestExtractor } from './wordpress-rest'; | ||
import { Source } from './source'; | ||
|
||
// Registered extractors, keyed by their slug.
const extractors = new Map< string, Extractor >();

// Built-in extractors are registered when this module is loaded.
registerExtractor( new WordPressRestExtractor() );
|
||
/**
 * Find Extractors that support a given Source.
 *
 * @param source The Source to find matching Extractors for.
 * @return The registered Extractors whose `supports()` returns true for the
 *         Source, in registration order. Empty when none match.
 */
export function findExtractors( source: Source ): Extractor[] {
	// Only the values are needed; the slug keys play no part in matching.
	return Array.from( extractors.values() ).filter( ( extractor ) =>
		extractor.supports( source )
	);
}
|
||
/**
 * Register an Extractor.
 *
 * @param extractor The Extractor to add to the registry.
 * @throws {Error} When the slug is empty or not lower-case, or when an
 *                 Extractor with the same slug is already registered.
 */
function registerExtractor( extractor: Extractor ): void {
	const slug = extractor.info().slug;

	// An empty string equals its own lower-cased form, so it has to be
	// rejected explicitly: a slug is meant to be a usable identifier.
	if ( slug === '' || slug.toLowerCase() !== slug ) {
		throw new Error(
			`The Extractor's slug must be a sequence of lower-case characters, got '${ slug }'`
		);
	}

	if ( extractors.has( slug ) ) {
		throw new Error(
			`An Extractor with slug ${ slug } is already registered`
		);
	}

	extractors.set( slug, extractor );
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
/**
 * Source of data to be extracted, like a DOM document, a URL or any other kind of resource.
 * For the moment, only DOM Document is supported.
 */
export abstract class Source {
	/**
	 * Returns the underlying resource.
	 *
	 * The base type is deliberately `any`; subclasses narrow the return type
	 * to the concrete resource they wrap.
	 */
	abstract resource(): any;
}
|
||
/**
 * Source backed by a DOM document instance.
 */
export class DOMSource extends Source {
	private readonly document: Document;

	/**
	 * @param document The DOM document this Source wraps.
	 */
	constructor( document: Document ) {
		super();
		this.document = document;
	}

	/**
	 * Returns the DOM document passed to the constructor.
	 */
	resource(): Document {
		return this.document;
	}
}
|
||
/**
 * Information about the Source to be extracted.
 */
export interface SourceInfo {
	/**
	 * The site's title.
	 */
	title: string;
}
|
||
/**
 * A piece of data in the Source under extraction, like a post or a page.
 */
export interface SourceData {
	/**
	 * Slug of the Extractor which extracted this data.
	 * This is automatically set, the Extractor does not need to set it.
	 */
	extractor: string;

	/**
	 * Title of the piece of data.
	 */
	title: string;

	/**
	 * Content of the piece of data.
	 */
	content: string;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import { Extractor, ExtractorInfo } from './extractor'; | ||
import { DOMSource, Source, SourceData, SourceInfo } from './source'; | ||
|
||
/**
 * Extractor registered under the slug "wordpress-rest".
 */
export class WordPressRestExtractor implements Extractor {
	/**
	 * Returns the slug, title and description of this Extractor.
	 */
	info(): ExtractorInfo {
		return {
			slug: 'wordpress-rest',
			title: 'WordPress REST API',
			description:
				'Extracts posts and pages from a WordPress site using the WordPress REST API',
		};
	}

	/**
	 * Tells whether the Source looks like a WordPress post or page.
	 *
	 * Only DOMSource is supported. The document must contain an
	 * `article.post` or `article.page` element that carries a `post-<id>`
	 * CSS class.
	 *
	 * @param source The Source to check.
	 */
	supports( source: Source ): boolean {
		if ( ! ( source instanceof DOMSource ) ) {
			return false;
		}
		const document = source.resource();

		// Posts and pages are detected the same way: first element matching
		// the selector, which must also carry a `post-<id>` class.
		return [ 'article.post', 'article.page' ].some( ( selector ) => {
			const article = document.querySelector( selector );
			return article !== null && this.hasPostIdClass( article );
		} );
	}

	/**
	 * Checks whether one of the element's CSS classes is exactly `post-<id>`.
	 *
	 * Whole class tokens are compared so that names which merely contain the
	 * pattern, such as `repost-12`, do not count.
	 */
	private hasPostIdClass( element: Element ): boolean {
		return Array.from( element.classList ).some( ( name ) =>
			/^post-\d+$/.test( name )
		);
	}

	/**
	 * Not implemented yet: resolves to a placeholder title.
	 */
	async extractInfo( source: Source ): Promise< SourceInfo > {
		// TODO.
		return { title: 'Foo' };
	}

	/**
	 * Not implemented yet: resolves without invoking the callback.
	 */
	async extractData(
		source: Source,
		callback: ( data: SourceData ) => void
	): Promise< void > {
		// TODO.
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you elaborate on these other kinds of resources? The URL can be accessed through `document.location.href`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The idea behind having a `Source` interface is so that the API does not depend on a specific data structure that might not be available in all runtimes. If in the future we would like to run an extractor in Node.js, for example, the `document` would not exist (or at least would not have the same type).

Another reason is that we can envision having extractors that don't rely on a `document` but instead, for example, pull directly from a URL. (We could also make it so that an extractor can support multiple types of `Source`, e.g. `DOMSource` and `URLSource`.)

If we did not introduce the notion of a `Source` at this moment, adding support later for multiple types of sources would be a breaking change to the API, which would require updating all existing extractors.