Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#116 Allow token processing "middleware" #144

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Sentiment is a Node.js module that uses the [AFINN-165](http://www2.imm.dtu.dk/p

- [Installation](#installation)
- [Usage example](#usage-example)
- [Spell checked example](#spell-checked-example)
- [Adding new languages](#adding-new-languages)
- [Adding and overwriting words](#adding-and-overwriting-words)
- [API Reference](#api-reference)
Expand All @@ -36,6 +37,15 @@ var result = sentiment.analyze('Cats are stupid.');
console.dir(result); // Score: -2, Comparative: -0.666
```

## Spell checked example
We use [nspell](https://www.npmjs.com/package/nspell) for spell checking. Additional languages need to provide a dictionary for `nspell` to use. Dictionaries can be found in [this repository](https://github.com/wooorm/dictionaries), but they can also be provided by other means, as long as they are presented to `nspell` in the format `{ aff, dic }` as prescribed by [hunspell](http://hunspell.github.io/). `nspell` supports many parts of Hunspell-style dictionaries. Essentially, a dictionary consists of one `affix` document and one or more `dictionary` documents. The documents are tightly linked, so it’s not possible to use a Dutch affix with an English dictionary document.
```js
var Sentiment = require('sentiment');
var sentiment = new Sentiment();
var result = sentiment.analyze('Cats are stpid.', { spellCheck: true });
console.dir(result); // Score: -2, Comparative: -0.666
```

## Adding new languages
You can add support for a new language by registering it using the `registerLanguage` method:

Expand Down Expand Up @@ -63,6 +73,15 @@ var frLanguage = {
}
return tokenScore;
}
},
getDictionary: function() {
// Load a dictionary for the language for `nspell` to use, as explained in the "Spell checked example", with the following structure:
return {
aff, // affix document
dic // dictionary
};
}
};
sentiment.registerLanguage('fr', frLanguage);
Expand Down Expand Up @@ -109,6 +128,7 @@ console.dir(result); // Score: 7, Comparative: 1.75
|----------|-----------|---------|---------------------------------------------------------------|
| language | `string` | `'en'` | Language to use for sentiment analysis |
| extras | `object` | `{}` | Set of labels and their associated values to add or overwrite |
| spellCheck | `boolean` | `false` | Tell the library whether to spell check words or not |

---

Expand Down
13 changes: 13 additions & 0 deletions languages/en/dictionary.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
var read = require('fs').readFileSync;
var dictionary = null;

module.exports = function() {
if (dictionary === null) {
var base = require.resolve('dictionary-en-us');
dictionary = {
'aff': read(base.replace('.js', '.aff'), 'utf-8'),
'dic': read(base.replace('.js', '.dic'), 'utf-8')
};
}
return dictionary;
};
3 changes: 2 additions & 1 deletion languages/en/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
module.exports = {
labels: require('./labels.json'),
scoringStrategy: require('./scoring-strategy')
scoringStrategy: require('./scoring-strategy'),
getDictionary: require('./dictionary')
};
39 changes: 39 additions & 0 deletions languages/en/negation.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
* Spellchecking library
*/
var spelling = require('../../lib/spelling');

/**
* These words "flip" the sentiment of the following word.
*/
var negators = require('./negators.json');

/**
* Language labels and scores
*/
var labels = require('./labels.json');

/**
* Evaluates wether the current token is negated by a previous token
*
* @param {array} tokens list of tokens being evaluated
* @param {int} pos position of the current word in the tokens list
*
* @return {boolean} true if the current pos is being negaed, false otherwise
*/
module.exports = function negated(tokens, pos, spellCheck) {
while (pos--) {
if (negators[tokens[pos]]) {
return true;
}
var word = spellCheck ?
spelling.getSpellCheckedWord(tokens[pos]) :
tokens[pos];
if (negators[word]) {
return true;
} else if (labels.hasOwnProperty(word)) {
return false;
}
}
return false;
};
8 changes: 4 additions & 4 deletions languages/en/scoring-strategy.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
var negators = require('./negators.json');
var negated = require('./negation');

module.exports = {
apply: function(tokens, cursor, tokenScore) {
apply: function(tokens, cursor, tokenScore, spellCheck) {
if (cursor > 0) {
var prevtoken = tokens[cursor - 1];
if (negators[prevtoken]) {
// Check for negation
if (negated(tokens, cursor, spellCheck)) {
tokenScore = -tokenScore;
}
}
Expand Down
28 changes: 28 additions & 0 deletions lib/distance.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
var lev = require('levenshtein');

/**
* Finds the closest match between a statement and a body of words using
* Levenshtein Distance
*
* @param {string} string Input string
* @param {string/array} words List of strings to find closest
* @return {string} The closest word in the list
*/
module.exports = function(string, words) {

var shortest = words.toString().length;
var bestFit = '';

words.forEach(function(word) {

var distance = lev(string, word);

if (distance < shortest) {
bestFit = word;
shortest = distance;
}

});

return bestFit;
};
13 changes: 11 additions & 2 deletions lib/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
var tokenize = require('./tokenize');
var languageProcessor = require('./language-processor');
var spelling = require('./spelling');

/**
* Constructor
Expand Down Expand Up @@ -48,6 +49,11 @@ Sentiment.prototype.analyze = function (phrase, opts, callback) {
var languageCode = opts.language || 'en';
var labels = languageProcessor.getLabels(languageCode);

// Set up spell checker
if (opts.spellCheck) {
spelling.setUp(languageProcessor.getDictionary(languageCode));
}

// Merge extra labels
if (typeof opts.extras === 'object') {
labels = Object.assign(labels, opts.extras);
Expand All @@ -63,14 +69,17 @@ Sentiment.prototype.analyze = function (phrase, opts, callback) {
// Iterate over tokens
var i = tokens.length;
while (i--) {
var obj = tokens[i];
var obj = opts.spellCheck ?
spelling.getSpellCheckedAfinnWord(labels, tokens[i]) :
tokens[i];
if (!labels.hasOwnProperty(obj)) continue;

words.push(obj);

// Apply scoring strategy
var tokenScore = labels[obj];
// eslint-disable-next-line max-len
tokenScore = languageProcessor.applyScoringStrategy(languageCode, tokens, i, tokenScore);
tokenScore = languageProcessor.applyScoringStrategy(languageCode, tokens, i, tokenScore, opts.spellCheck);
if (tokenScore > 0) positive.push(obj);
if (tokenScore < 0) negative.push(obj);
score += tokenScore;
Expand Down
20 changes: 17 additions & 3 deletions lib/language-processor.js
Original file line number Diff line number Diff line change
Expand Up @@ -72,18 +72,32 @@ module.exports = {
* @param {Array} tokens - Tokens of the phrase to analyze
* @param {int} cursor - Cursor of the current token being analyzed
* @param {int} tokenScore - The score of the current token being analyzed
* @param {boolean} spellCheck - Tells whether to apply spell checking
* or not, default false
*/
applyScoringStrategy: function(languageCode, tokens, cursor, tokenScore) {
applyScoringStrategy: function(languageCode, tokens, cursor, tokenScore,
spellCheck) {
var language = this.getLanguage(languageCode);
// Fallback to default strategy if none was specified
// eslint-disable-next-line max-len
var scoringStrategy = language.scoringStrategy || defaultScoringStrategy;
return scoringStrategy.apply(tokens, cursor, tokenScore);
return scoringStrategy.apply(tokens, cursor, tokenScore, spellCheck);
},

/**
* Get the language defined dictionary for spell checking
*
* @param {String} languageCode - Two-digit language code
*/
getDictionary: function(languageCode) {
var language = this.getLanguage(languageCode);
return language.getDictionary();
}
};

var defaultScoringStrategy = {
apply: function(tokens, cursor, tokenScore) {
// eslint-disable-next-line no-unused-vars
apply: function(tokens, cursor, tokenScore, spellCheck) {
return tokenScore;
}
};
43 changes: 43 additions & 0 deletions lib/spelling.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
var nspell = require('nspell');
var distance = require('./distance');
var spellChecker = null;

/**
* These two functions atempt to spell check and correct a given word, using
* Levenshtein Distance to choose the most appropriate correction.
* getSpellCheckedAfinnWord also looks for the word to be present on Afinn
*/
module.exports = {
setUp: function(dictionaray) {
spellChecker = nspell(dictionaray);
},
getSpellCheckedAfinnWord: function (afinn, word) {
if (!afinn.hasOwnProperty(word) && !spellChecker.correct(word)) {
var checked = spellChecker.suggest(word);
if (checked.length === 0) {
return word;
} else {
var closest = distance(word, checked);
if (closest && afinn.hasOwnProperty(closest)) {
return closest;
}
}
}
return word;
},

getSpellCheckedWord: function (word) {
if (!spellChecker.correct(word)) {
var checked = spellChecker.suggest(word);
if (checked.length === 0) {
return word;
} else {
var closest = distance(word, checked);
if (closest) {
return closest;
}
}
}
return word;
}
};
Loading