Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extractor API #16

Closed
wants to merge 13 commits into from
10 changes: 5 additions & 5 deletions src/app.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { initPlayground } from './playground';
import { PlaygroundClient } from '@wp-playground/client';

Check warning on line 2 in src/app.ts

View workflow job for this annotation

GitHub Actions / lint

PlaygroundClient not found in '@wp-playground/client'

declare global {
interface Window {
Expand All @@ -10,11 +10,11 @@
const iframeId = 'playground';
const iframe = document.getElementById( iframeId ) as HTMLIFrameElement;

initPlayground( iframeId )
.then( ( playground ) => {
window.playground = playground;
} )
.catch( ( err ) => console.error( err ) );
// initPlayground( iframeId )
// .then( ( playground ) => {
// window.playground = playground;
// } )
// .catch( ( err ) => console.error( err ) );

const relayToPlayground = function ( response: any ) {
console.log( response, chrome.runtime.lastError );
Expand Down
23 changes: 23 additions & 0 deletions src/content.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,29 @@
// Constants
import { findExtractors } from './extractor/registry';
import { DOMSource } from './extractor/source';

const MESSAGE_NAMESPACE = 'TRY_WORDPRESS';

// Wrap the page's document in a Source, which is what the extractor API consumes.
const source = new DOMSource( document );

// Exactly one extractor must claim the page: none or several is treated as an error.
const extractors = findExtractors( source );
if ( extractors.length === 0 ) {
	throw new Error( 'No extractor was found' );
} else if ( extractors.length > 1 ) {
	throw new Error( 'Multiple extractors were found' );
}

const extractor = extractors[ 0 ];
console.log( `Found extractor ${ extractor.info().slug }` );

extractor
	.extractData( source, ( entry ) => {
		// Do something with the entry.
		console.log( entry );
	} )
	.then( () => console.log( 'Extraction finished' ) )
	// Failures go to console.error so that they show up as errors rather than as regular log output.
	.catch( ( err ) => console.error( err ) );

const wpInsertPost = ( data: any ) => {
data.post_status = 'publish';
let code = "<?php require_once 'wordpress/wp-load.php';\n";
Expand Down
48 changes: 48 additions & 0 deletions src/extractor/extractor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { SourceData, SourceInfo, Source } from './source';

/**
* Information about the Extractor.
* This is the shape returned by `Extractor.info()`.
*/
export interface ExtractorInfo {
/**
* Unique identifier of the Extractor, e.g. "wordpress-rest".
* Must be a lower-case string.
* There must not be more than one extractor with the same slug.
* Both constraints are checked when the Extractor is registered: a violation throws an Error.
*/
slug: string;

/**
* Title of the Extractor, e.g. "WordPress".
*/
title: string;

/**
* Description of the Extractor, e.g. "Extracts posts and pages from a WordPress site using the WordPress REST API".
*/
description: string;
}

/**
* Contract implemented by every Extractor: a component that can tell whether it
* supports a Source and, if so, extract information and data from it.
*/
export interface Extractor {
/**
* Returns information about the Extractor.
*
* @return The slug, title and description of this Extractor.
*/
info(): ExtractorInfo;

/**
* Tells whether the Extractor supports a given Source.
*
* @param source Source to inspect.
* @return `true` when this Extractor supports the Source, `false` otherwise.
*/
supports( source: Source ): boolean;

/**
* Extracts information about the Source, like its title, language, etc.
*
* @param source Source to read the information from.
* @return Promise resolving to the information about the Source.
*/
extractInfo( source: Source ): Promise< SourceInfo >;

/**
* Extracts data from a given Source.
*
* @param source   Source to extract the data from.
* @param callback Receives the extracted data. Presumably invoked once per extracted entry (a post, a page, ...) — TODO confirm against implementations.
* @return Promise carrying no value. Presumably it settles once the extraction is over — TODO confirm.
*/
extractData(
source: Source,
callback: ( siteData: SourceData ) => void
): Promise< void >;
}
39 changes: 39 additions & 0 deletions src/extractor/registry.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { Extractor } from './extractor';
import { WordPressRestExtractor } from './wordpress-rest';
import { Source } from './source';

// Registry of the known Extractors, keyed by their slug.
const extractors = new Map< string, Extractor >();

// Register the built-in extractor as soon as the module is loaded.
// `registerExtractor` is a function declaration, so it is hoisted and can be called before its definition.
registerExtractor( new WordPressRestExtractor() );

/**
 * Find Extractors that support a given Source.
 *
 * @param source Source to match against the registered Extractors.
 * @return Every registered Extractor whose `supports()` accepts the Source, in registration order; an empty array when none does.
 */
export function findExtractors( source: Source ): Extractor[] {
	const matches: Extractor[] = [];
	// Only the Extractors are needed here, so iterate over the values rather than the [ slug, extractor ] entries.
	for ( const extractor of extractors.values() ) {
		if ( extractor.supports( source ) ) {
			matches.push( extractor );
		}
	}
	return matches;
}

/**
 * Register an Extractor.
 *
 * @param extractor Extractor to add to the registry; it is stored under the slug reported by its `info()`.
 * @throws {Error} When the slug is not lower-case, or when an Extractor with the same slug is already registered.
 */
function registerExtractor( extractor: Extractor ): void {
	const { slug } = extractor.info();

	// A slug that changes when lower-cased contains at least one upper-case character.
	if ( slug.toLowerCase() !== slug ) {
		throw new Error(
			`The Extractor's slug must be a sequence of lower-case characters, got '${ slug }'`
		);
	}

	// Slugs identify Extractors, so a duplicate is rejected instead of silently replacing the existing one.
	if ( extractors.has( slug ) ) {
		throw new Error(
			`An Extractor with slug ${ slug } is already registered`
		);
	}

	extractors.set( slug, extractor );
}
46 changes: 46 additions & 0 deletions src/extractor/source.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/**
* Source of data to be extracted, like a DOM document, a URL or any other kind of resource.
* For the moment, only DOM Document is supported.
Comment on lines +2 to +3
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you elaborate on these other kinds of resources? The URL can be accessed through document.location.href.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea behind having a Source interface is that the API does not depend on a specific data structure that might not be available in all runtimes. If in the future we would like to run an extractor in Node.js, for example, the document would not exist (or at least would not have the same type).

Another reason would be that we can envision having extractors that don't rely on a document, but instead, for example, pull directly from a URL. (We could also make it so that an extractor can support multiple types of Sources, e.g. DOMSource and URLSource).

If we did not introduce the notion of a Source now, adding support for multiple types of sources later would be a breaking change to the API, which would require updating all existing extractors.

*/
export abstract class Source {
/**
* Returns the underlying resource wrapped by this Source.
* Declared as `any` at this level; a subclass can narrow the type, as DOMSource does with Document.
*/
abstract resource(): any;
}

/**
 * Source whose underlying resource is a DOM Document.
 */
export class DOMSource extends Source {
	/**
	 * @param document DOM Document wrapped by this Source; kept for the lifetime of the instance.
	 */
	constructor( private readonly document: Document ) {
		super();
	}

	/**
	 * @return The Document handed to the constructor.
	 */
	resource(): Document {
		return this.document;
	}
}

/**
* Information about the Source to be extracted.
* This is what `Extractor.extractInfo()` resolves to.
*/
export interface SourceInfo {
/**
* The site's title.
*/
title: string;
}

/**
* A piece of data in the Source under extraction, like a post or a page.
* This is the value handed to the callback of `Extractor.extractData()`.
*/
export interface SourceData {
/**
* Slug of the Extractor which extracted this data.
* This is automatically set, the Extractor does not need to set it.
*/
extractor: string;
/**
* Title of this piece of data.
*/
title: string;
/**
* Content of this piece of data.
* NOTE(review): the expected format (HTML, plain text, ...) is not specified here — confirm.
*/
content: string;
}
52 changes: 52 additions & 0 deletions src/extractor/wordpress-rest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import { Extractor, ExtractorInfo } from './extractor';
import { DOMSource, Source, SourceData, SourceInfo } from './source';

/**
 * Extractor for WordPress sites, meant to rely on the WordPress REST API.
 * Only detection is implemented so far: the extraction methods are placeholders.
 */
export class WordPressRestExtractor implements Extractor {
	/**
	 * @return The slug, title and description of this Extractor.
	 */
	info(): ExtractorInfo {
		const description =
			'Extracts posts and pages from a WordPress site using the WordPress REST API';

		return {
			slug: 'wordpress-rest',
			title: 'WordPress REST API',
			description,
		};
	}

	/**
	 * Tells whether the Source looks like a WordPress post or page.
	 *
	 * @param source Source to inspect; anything other than a DOMSource is rejected.
	 * @return `true` when the first `article.post` or the first `article.page` element carries a `post-<id>` CSS class.
	 */
	supports( source: Source ): boolean {
		if ( source instanceof DOMSource ) {
			const dom = source.resource();

			// Posts are probed first, then pages; `some()` stops at the first selector that matches.
			return [ 'article.post', 'article.page' ].some( ( selector ) => {
				const article = dom.querySelector( selector );
				// Check if the CSS class matches `post-<id>`.
				return article !== null && /post-(\d+)/.test( article.className );
			} );
		}

		return false;
	}

	/**
	 * Placeholder: resolves to a hard-coded title for now.
	 *
	 * @param source Source to read the information from (currently unused).
	 * @return Promise resolving to the information about the Source.
	 */
	async extractInfo( source: Source ): Promise< SourceInfo > {
		// TODO.
		return { title: 'Foo' };
	}

	/**
	 * Placeholder: does not extract anything yet and never invokes the callback.
	 *
	 * @param source   Source to extract the data from (currently unused).
	 * @param callback Receiver for the extracted entries (currently unused).
	 * @return Promise that resolves without a value.
	 */
	async extractData(
		source: Source,
		callback: ( entry: SourceData ) => void
	): Promise< void > {
		// TODO.
	}
}
2 changes: 1 addition & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"outDir": "./build/typescript",
"noImplicitAny": true,
"module": "es6",
"target": "es5",
"target": "es6",
"jsx": "react",
"allowJs": true,
"moduleResolution": "node"
Expand Down
Loading