Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#116 Allow token processing "middleware" #144

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Sentiment is a Node.js module that uses the [AFINN-165](http://www2.imm.dtu.dk/p

- [Installation](#installation)
- [Usage example](#usage-example)
- [Spell checked example](#spell-checked-example)
- [Adding new languages](#adding-new-languages)
- [Adding and overwriting words](#adding-and-overwriting-words)
- [API Reference](#api-reference)
Expand All @@ -36,6 +37,14 @@ var result = sentiment.analyze('Cats are stupid.');
console.dir(result); // Score: -2, Comparative: -0.666
```

## Spell checked example
```js
var Sentiment = require('sentiment');
var sentiment = new Sentiment();
var result = sentiment.analyze('Cats are stpid.', { spellCheck: true });
console.dir(result); // Score: -2, Comparative: -0.666
```

## Adding new languages
You can add support for a new language by registering it using the `registerLanguage` method:

Expand Down Expand Up @@ -63,6 +72,12 @@ var frLanguage = {
}
return tokenScore;
}
},
spellCheck: function(labels, token) {
// Implement a spell checking strategy here
return token;
}
};
sentiment.registerLanguage('fr', frLanguage);
Expand Down
33 changes: 33 additions & 0 deletions languages/en/distance.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
var lev = require('levenshtein');
var tokenize = require('../../lib/tokenize');

/**
* Finds the closest match between a statement and a body of words using
* Levenshtein Distance
Copy link
Owner

@thisandagain thisandagain Jun 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Levenshtein Distance is a great performant strategy, but if we are going to make this optional anyway we might want to discuss / test alternative edit distance algorithms (e.g. Myers Diff Algorithm).

PDF:
myers.pdf

*
* @param {string} string Input string
* @param {string|array} words Candidate words: an array, or a string that is tokenized into words
* @return {string} The closest word in the list
*/
module.exports = function(string, words) {

var shortest = words.toString().length;
var bestFit = '';

if (typeof words === 'string') {
words = tokenize(words);
}

words.forEach(function(word) {

var distance = lev(string, word);

if (distance < shortest) {
bestFit = word;
shortest = distance;
}

});

return bestFit;
};
3 changes: 2 additions & 1 deletion languages/en/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
module.exports = {
labels: require('./labels.json'),
scoringStrategy: require('./scoring-strategy')
scoringStrategy: require('./scoring-strategy'),
spellCheck: require('./spell-check')
};
37 changes: 37 additions & 0 deletions languages/en/negation.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/**
* Spellchecking library
*/
var spelling = require('./spelling');

/**
* These words "flip" the sentiment of the following word.
*/
var negators = require('./negators.json');

/**
* Language labels and scores
*/
var labels = require('./labels.json');

/**
* Evaluates wether the current token is negated by a previous token
*
* @param {array} tokens list of tokens being evaluated
* @param {int} pos position of the current word in the tokens list
*
* @return {boolean} true if the current pos is being negaed, false otherwise
*/
module.exports = function negated(tokens, pos) {
while (pos--) {
if (negators[tokens[pos]]) {
return true;
}
var word = spelling.getSpellCheckedWord(tokens[pos]);
if (negators[word]) {
return true;
} else if (labels.hasOwnProperty(word)) {
return false;
}
}
return false;
};
6 changes: 3 additions & 3 deletions languages/en/scoring-strategy.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
var negators = require('./negators.json');
var negated = require('./negation');

module.exports = {
apply: function(tokens, cursor, tokenScore) {
if (cursor > 0) {
var prevtoken = tokens[cursor - 1];
if (negators[prevtoken]) {
// Check for negation
if (negated(tokens, cursor)) {
tokenScore = -tokenScore;
}
}
Expand Down
5 changes: 5 additions & 0 deletions languages/en/spell-check.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
var spelling = require('./spelling');

module.exports = function(labels, token) {
return spelling.getSpellCheckedAfinnWord(labels, token);
};
61 changes: 61 additions & 0 deletions languages/en/spelling.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
var read = require('fs').readFileSync;
var nspell = require('nspell');
var distance = require('./distance');

/**
 * Reads the `.aff` and `.dic` files that sit beside the resolved entry point
 * of the `dictionary-en-us` package.
 *
 * @return {object} `{ aff, dic }` with each file's contents read as 'utf-8'
 */
function loadDictionary() {
    var base = require.resolve('dictionary-en-us');
    // Strip only the trailing extension. A plain-string replace of '.js'
    // rewrites the first occurrence anywhere in the path, which breaks when a
    // parent directory name contains ".js" (e.g. ".../node.js/...").
    var stem = base.replace(/\.js$/, '');
    return {
        'aff': read(stem + '.aff', 'utf-8'),
        'dic': read(stem + '.dic', 'utf-8')
    };
}

// Holds the nspell instance once it has been built; null until first use.
var spell = null;

/**
 * Returns the module-level spell checker, building it from the loaded
 * dictionary the first time it is requested and reusing it afterwards.
 */
function getSpellChecker() {
    if (spell !== null) {
        return spell;
    }
    spell = nspell(loadDictionary());
    return spell;
}

/**
* These two functions atempt to spell check and correct a given word, using
* Levenshtein Distance to choose the most appropriate correction.
* getSpellCheckedAfinnWord also looks for the word to be present on Afinn
*/
module.exports = {
getSpellCheckedAfinnWord: function (afinn, word) {
var spellChecker = getSpellChecker();
if (!afinn.hasOwnProperty(word) && !spellChecker.correct(word)) {
var checked = spellChecker.suggest(word);
if (checked.length === 0) {
return word;
} else {
var closest = distance(word, checked);
if (closest && afinn.hasOwnProperty(closest)) {
return closest;
}
}
}
return word;
},

getSpellCheckedWord: function (word) {
var spellChecker = getSpellChecker();
if (!spellChecker.correct(word)) {
var checked = spellChecker.suggest(word);
if (checked.length === 0) {
return word;
} else {
var closest = distance(word, checked);
if (closest) {
return closest;
}
}
}
return word;
}
};
4 changes: 3 additions & 1 deletion lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ Sentiment.prototype.analyze = function (phrase, opts, callback) {
// Iterate over tokens
var i = tokens.length;
while (i--) {
var obj = tokens[i];
var obj = opts.spellCheck ? languageProcessor
.applySpellChecking(languageCode, labels, tokens[i]) : tokens[i];
if (!labels.hasOwnProperty(obj)) continue;

words.push(obj);

// Apply scoring strategy
Expand Down
8 changes: 8 additions & 0 deletions lib/language-processor.js
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ module.exports = {
// eslint-disable-next-line max-len
var scoringStrategy = language.scoringStrategy || defaultScoringStrategy;
return scoringStrategy.apply(tokens, cursor, tokenScore);
},

/**
* Apply language spell checking strategy, if present.
*/
applySpellChecking: function(languageCode, labels, token) {
var language = this.getLanguage(languageCode);
return language.spellCheck ? language.spellCheck(labels, token) : token;
}
};

Expand Down
5 changes: 5 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,10 @@
},
"engines": {
"node": ">=8.0"
},
"dependencies": {
"dictionary-en-us": "^2.0.0",
"levenshtein": "^1.0.5",
"nspell": "^2.0.1"
}
}
2 changes: 1 addition & 1 deletion test/integration/sync_corpus.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ var result = sentiment.analyze(dataset);

test('synchronous corpus', function (t) {
t.type(result, 'object');
t.equal(result.score, -3);
t.equal(result.score, -19);
t.equal(result.tokens.length, 1416);
t.equal(result.words.length, 73);
t.end();
Expand Down
15 changes: 15 additions & 0 deletions test/integration/sync_negation_backwards.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
var test = require('tap').test;
var Sentiment = require('../../lib/index');

// The negator "not" is separated from the last word by "very"; the
// expectations below pin a score of 3 coming from a single scored word.
var analyzer = new Sentiment();
var result = analyzer.analyze('this is not very bad');

test('synchronous negation', function (t) {
    t.type(result, 'object');
    t.equal(result.score, 3);
    t.equal(result.comparative, 0.6);
    t.equal(result.tokens.length, 5);
    t.equal(result.words.length, 1);
    t.end();
});
26 changes: 26 additions & 0 deletions test/integration/sync_spell_check.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
var test = require('tap').test;
var Sentiment = require('../../lib/index');
var sentiment = new Sentiment();

var input = 'I hatee you';

// One variable per scenario. Reassigning a single shared `result` between the
// two `test()` registrations makes the first test's assertions depend on when
// the runner invokes its callback.
var checkedResult = sentiment.analyze(input, { spellCheck: true });
var uncheckedResult = sentiment.analyze(input);

test('synchronous spell checking active', function (t) {
    t.type(checkedResult, 'object');
    t.equal(checkedResult.score, -3);
    t.equal(checkedResult.comparative, -1);
    t.equal(checkedResult.tokens.length, 3);
    t.equal(checkedResult.words.length, 1);
    t.end();
});

test('synchronous spell checking inactive', function (t) {
    t.type(uncheckedResult, 'object');
    t.equal(uncheckedResult.score, 0);
    t.equal(uncheckedResult.comparative, 0);
    t.equal(uncheckedResult.tokens.length, 3);
    t.equal(uncheckedResult.words.length, 0);
    t.end();
});