I'm looking for feedback on my library for extracting words from a text: https://npmjs.org/package/uwords
A word is defined as a sequence of Unicode characters from the Lu, Ll, Lt, Lm, and Lo general categories. The code of the main part is (https://github.com/AlexAtNet/uwords/blob/master/index.js#L9):
module.exports = function (text) {
var words, word, index, limit, code;
words = [ ];
word = null;
for (index = 0, limit = text.length; index < limit; index += 1) {
code = text.charCodeAt(index);
if (-1 === _.indexOf(letters, code, true)) {
if (null !== word) {
words.push(word.join(''));
word = null;
}
} else {
if (null === word) {
word = [ ];
}
word.push(String.fromCharCode(code));
}
}
if (null !== word) {
words.push(word.join(''));
}
return words;
};
The `letters` array is generated as follows (https://github.com/AlexAtNet/uwords/blob/master/gruntfile.js#L59):
// Grunt task: collects the code points of the Unicode letter categories
// (Lu, Ll, Lt, Lm, Lo), compacts consecutive values into ranges and writes
// the result to letters.json next to this file. Each entry of the written
// array is either a single code point or a [begin, end] pair (inclusive).
grunt.registerTask('create-letters-json', 'letters.json', function () {
  var categories, letters, ranges, current, compacted;

  categories = [
    require('unicode/category/Lu'),
    require('unicode/category/Ll'),
    require('unicode/category/Lt'),
    require('unicode/category/Lm'),
    require('unicode/category/Lo')
  ];

  // Append code points one at a time: passing a whole category through
  // push.apply would hand it over as call arguments, which can exceed the
  // engine's argument limit for large categories.
  letters = [ ];
  categories.forEach(function (category) {
    Object.keys(category).forEach(function (key) {
      letters.push(parseInt(key, 10));
    });
  });
  letters.sort(function (a, b) { return a - b; });

  // Merge runs of consecutive code points into { begin, end } ranges.
  // Starting with no open range keeps an empty input from producing a
  // bogus undefined entry.
  ranges = [ ];
  current = null;
  letters.forEach(function (value) {
    if (null !== current && current.end + 1 === value) {
      current.end = value;
    } else {
      current = { begin : value, end : value };
      ranges.push(current);
    }
  });

  // Single-element ranges are stored as a bare number to keep the file small.
  compacted = ranges.map(function (range) {
    if (range.begin === range.end) {
      return range.begin;
    }
    return [ range.begin, range.end ];
  });

  require('fs').writeFileSync(__dirname + '/letters.json',
    JSON.stringify(compacted, null, 2));
});
It is quite a naive approach, but I think it will work in most cases. What do you think?