thisandagain · nsantini · Sep 12, 2017 · Sep 12, 2017 · Sep 12, 2017 · Sep 12, 2017
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ Sentiment is a Node.js module that uses the [AFINN-165](http://www2.imm.dtu.dk/p
 
 - [Installation](#installation)
 - [Usage example](#usage-example)
+- [Spell checked example](#spell-checked-example)
 - [Adding new languages](#adding-new-languages)
 - [Adding and overwriting words](#adding-and-overwriting-words)
 - [API Reference](#api-reference)
@@ -36,6 +37,15 @@ var result = sentiment.analyze('Cats are stupid.');
 console.dir(result);    // Score: -2, Comparative: -0.666
 ```
 
+## Spell checked example
+We use [nspell](https://www.npmjs.com/package/nspell) for spell checking. Additional languages need to provide a dictionary for `nspell` to use. Dictionaries can be found in [this repository](https://github.com/wooorm/dictionaries), but they can also be obtained by other means, as long as they are presented to `nspell` in the format `{ aff, dic }` as prescribed by [hunspell](http://hunspell.github.io/). `nspell` supports many parts of Hunspell-style dictionaries. Essentially, a dictionary consists of one `affix` document and one or more `dictionary` documents. The documents are tightly linked, so it’s not possible to use a Dutch affix with an English dictionary document.
+```js
+var Sentiment = require('sentiment');
+var sentiment = new Sentiment();
+var result = sentiment.analyze('Cats are stpid.', { spellCheck: true });
+console.dir(result);    // Score: -2, Comparative: -0.666
+```
+
 ## Adding new languages
 You can add support for a new language by registering it using the `registerLanguage` method:
 
@@ -63,6 +73,15 @@ var frLanguage = {
       }
       return tokenScore;
     }
+  },
+  getDictionary: function() {
+    // Load a dictionary for the language for `nspell` to use, as explained in the "Spell checked example", with the following structure:
+    var aff = '...'; // contents of the affix document
+    var dic = '...'; // contents of the dictionary document
+    return {
+      aff: aff,
+      dic: dic
+    };
   }
 };
 sentiment.registerLanguage('fr', frLanguage);
@@ -109,6 +128,7 @@ console.dir(result);    // Score: 7, Comparative: 1.75
 |----------|-----------|---------|---------------------------------------------------------------|
 | language | `string`  | `'en'`  | Language to use for sentiment analysis                        |
 | extras   | `object`  | `{}`    | Set of labels and their associated values to add or overwrite |
+| spellCheck | `boolean` | `false` | Tell the library whether to spell check words or not |
 
 ---
 

diff --git a/languages/en/dictionary.js b/languages/en/dictionary.js
@@ -0,0 +1,13 @@
+var read = require('fs').readFileSync;
+var dictionary = null;
+
+module.exports = function() {
+    if (dictionary ===  null) {
+        var base = require.resolve('dictionary-en-us');
+        dictionary = {
+            'aff': read(base.replace('.js', '.aff'), 'utf-8'),
+            'dic': read(base.replace('.js', '.dic'), 'utf-8')
+        };
+    }
+    return dictionary;
+};
diff --git a/languages/en/index.js b/languages/en/index.js
@@ -1,4 +1,5 @@
 module.exports = {
     labels: require('./labels.json'),
-    scoringStrategy: require('./scoring-strategy')
+    scoringStrategy: require('./scoring-strategy'),
+    getDictionary: require('./dictionary')
 };
diff --git a/languages/en/negation.js b/languages/en/negation.js
@@ -0,0 +1,39 @@
+/**
+ * Spellchecking library
+ */
+var spelling = require('../../lib/spelling');
+
+/**
+ * These words "flip" the sentiment of the following word.
+ */
+var negators = require('./negators.json');
+
+/**
+ * Language labels and scores
+ */
+var labels = require('./labels.json');
+
+/**
+ * Evaluates whether the current token is negated by a previous token
+ *
+ * @param {array} tokens list of tokens being evaluated
+ * @param {int} pos position of the current word in the tokens list
+ * @param {boolean} spellCheck spell-correct earlier tokens before lookup
+ * @return {boolean} true if the current pos is being negated, false otherwise
+ */
+module.exports = function negated(tokens, pos, spellCheck) {
+    while (pos--) { // walk backwards, starting at the token before pos
+        if (negators[tokens[pos]]) { // the raw token is a negator
+            return true;
+        }
+        var word = spellCheck ?
+            spelling.getSpellCheckedWord(tokens[pos]) :
+            tokens[pos];
+        if (negators[word]) { // the (possibly corrected) token is a negator
+            return true;
+        } else if (labels.hasOwnProperty(word)) { // scored word ends the scan
+            return false;
+        }
+    }
+    return false; // reached the start of the list without a negator
+};
diff --git a/languages/en/scoring-strategy.js b/languages/en/scoring-strategy.js
@@ -1,10 +1,10 @@
-var negators = require('./negators.json');
+var negated = require('./negation');
 
 module.exports = {
-    apply: function(tokens, cursor, tokenScore) {
+    apply: function(tokens, cursor, tokenScore, spellCheck) {
         if (cursor > 0) {
-            var prevtoken = tokens[cursor - 1];
-            if (negators[prevtoken]) {
+            // Check for negation
+            if (negated(tokens, cursor, spellCheck)) {
                 tokenScore = -tokenScore;
             }
         }

diff --git a/lib/distance.js b/lib/distance.js
@@ -0,0 +1,28 @@
+var lev = require('levenshtein');
+
+/**
+ * Finds the closest match between a statement and a body of words using
+ * Levenshtein Distance
+ * 
+ * @param  {string} string Input string
+ * @param  {string/array} words List of strings to find closest
+ * @return {string} The closest word in the list
+ */
+module.exports = function(string, words) {
+
+    var shortest = words.toString().length;
+    var bestFit  = '';
+
+    words.forEach(function(word) {
+
+        var distance = lev(string, word);
+
+        if (distance < shortest) {
+            bestFit  = word;
+            shortest = distance;
+        }
+
+    });
+
+    return bestFit;
+};
diff --git a/lib/index.js b/lib/index.js
@@ -1,5 +1,6 @@
 var tokenize = require('./tokenize');
 var languageProcessor = require('./language-processor');
+var spelling = require('./spelling');
 
 /**
  * Constructor
@@ -48,6 +49,11 @@ Sentiment.prototype.analyze = function (phrase, opts, callback) {
     var languageCode = opts.language || 'en';
     var labels = languageProcessor.getLabels(languageCode);
 
+    // Set up spell checker
+    if (opts.spellCheck) {
+        spelling.setUp(languageProcessor.getDictionary(languageCode));
+    }
+
     // Merge extra labels
     if (typeof opts.extras === 'object') {
         labels = Object.assign(labels, opts.extras);
@@ -63,14 +69,17 @@ Sentiment.prototype.analyze = function (phrase, opts, callback) {
     // Iterate over tokens
     var i = tokens.length;
     while (i--) {
-        var obj = tokens[i];
+        var obj = opts.spellCheck ?
+            spelling.getSpellCheckedAfinnWord(labels, tokens[i]) :
+            tokens[i];
         if (!labels.hasOwnProperty(obj)) continue;
+
         words.push(obj);
 
         // Apply scoring strategy
         var tokenScore = labels[obj];
         // eslint-disable-next-line max-len
-        tokenScore = languageProcessor.applyScoringStrategy(languageCode, tokens, i, tokenScore);
+        tokenScore = languageProcessor.applyScoringStrategy(languageCode, tokens, i, tokenScore, opts.spellCheck);
         if (tokenScore > 0) positive.push(obj);
         if (tokenScore < 0) negative.push(obj);
         score += tokenScore;

diff --git a/lib/language-processor.js b/lib/language-processor.js
@@ -72,18 +72,32 @@ module.exports = {
      * @param {Array} tokens - Tokens of the phrase to analyze
      * @param {int} cursor - Cursor of the current token being analyzed
      * @param {int} tokenScore - The score of the current token being analyzed
+     * @param {boolean} spellCheck - Tells whether to apply spell checking
+     * or not, default false
      */
-    applyScoringStrategy: function(languageCode, tokens, cursor, tokenScore) {
+    applyScoringStrategy: function(languageCode, tokens, cursor, tokenScore,
+        spellCheck) {
         var language = this.getLanguage(languageCode);
         // Fallback to default strategy if none was specified
         // eslint-disable-next-line max-len
         var scoringStrategy = language.scoringStrategy || defaultScoringStrategy;
-        return scoringStrategy.apply(tokens, cursor, tokenScore);
+        return scoringStrategy.apply(tokens, cursor, tokenScore, spellCheck);
+    },
+
+    /**
+     * Get the language defined dictionary for spell checking
+     * @param {String} languageCode - Two-digit language code
+     * @return {*} The value returned by the language's getDictionary()
+     */
+    getDictionary: function(languageCode) {
+        var language = this.getLanguage(languageCode);
+        return language.getDictionary(); // invoked directly as a function
+    }
 };
 
 var defaultScoringStrategy = {
-    apply: function(tokens, cursor, tokenScore) {
+    // eslint-disable-next-line no-unused-vars
+    apply: function(tokens, cursor, tokenScore, spellCheck) {
         return tokenScore;
     }
 };
diff --git a/lib/spelling.js b/lib/spelling.js
@@ -0,0 +1,43 @@
+var nspell = require('nspell');
+var distance = require('./distance');
+var spellChecker = null;
+
+/**
+ * These two functions attempt to spell check and correct a given word, using
+ * ./distance to choose the most appropriate of the suggested corrections.
+ * getSpellCheckedAfinnWord only accepts a correction present in `afinn`.
+ */
+module.exports = {
+    setUp: function(dictionaray) { // call first: spellChecker starts null
+        spellChecker = nspell(dictionaray);
+    },
+    getSpellCheckedAfinnWord: function (afinn, word) {
+        if (!afinn.hasOwnProperty(word) && !spellChecker.correct(word)) {
+            var checked = spellChecker.suggest(word); // candidate fixes
+            if (checked.length === 0) {
+                return word; // nothing suggested
+            } else {
+                var closest = distance(word, checked); // '' if none chosen
+                if (closest && afinn.hasOwnProperty(closest)) {
+                    return closest;
+                }
+            }
+        }
+        return word; // unchanged: in afinn, correct, or no afinn fix
+    },
+
+    getSpellCheckedWord: function (word) {
+        if (!spellChecker.correct(word)) {
+            var checked = spellChecker.suggest(word); // candidate fixes
+            if (checked.length === 0) {
+                return word; // nothing suggested
+            } else {
+                var closest = distance(word, checked); // '' if none chosen
+                if (closest) {
+                    return closest;
+                }
+            }
+        }
+        return word; // unchanged: correct, or no usable suggestion
+    }
+};