mirror of
https://gitee.com/Vancouver2017/luban-lite-t3e-pro.git
synced 2025-12-14 18:38:55 +00:00
204 lines
6.2 KiB
JavaScript
204 lines
6.2 KiB
JavaScript
// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
|
|
// paper, in
|
|
//
|
|
// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
|
// no. 3, pp 130-137,
|
|
//
|
|
// see also http://www.tartarus.org/~martin/PorterStemmer
|
|
|
|
// Release 1 be 'andargor', Jul 2004
|
|
// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
|
|
|
|
|
|
define(function () {
|
|
return (function () {
|
|
var step2list = {
|
|
"ational": "ate",
|
|
"tional": "tion",
|
|
"enci": "ence",
|
|
"anci": "ance",
|
|
"izer": "ize",
|
|
"bli": "ble",
|
|
"alli": "al",
|
|
"entli": "ent",
|
|
"eli": "e",
|
|
"ousli": "ous",
|
|
"ization": "ize",
|
|
"ation": "ate",
|
|
"ator": "ate",
|
|
"alism": "al",
|
|
"iveness": "ive",
|
|
"fulness": "ful",
|
|
"ousness": "ous",
|
|
"aliti": "al",
|
|
"iviti": "ive",
|
|
"biliti": "ble",
|
|
"logi": "log"
|
|
},
|
|
|
|
step3list = {
|
|
"icate": "ic",
|
|
"ative": "",
|
|
"alize": "al",
|
|
"iciti": "ic",
|
|
"ical": "ic",
|
|
"ful": "",
|
|
"ness": ""
|
|
},
|
|
|
|
c = "[^aeiou]", // consonant
|
|
v = "[aeiouy]", // vowel
|
|
C = c + "[^aeiouy]*", // consonant sequence
|
|
V = v + "[aeiou]*", // vowel sequence
|
|
|
|
mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0
|
|
meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1
|
|
mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1
|
|
s_v = "^(" + C + ")?" + v; // vowel in stem
|
|
|
|
return function (w) {
|
|
var stem,
|
|
suffix,
|
|
firstch,
|
|
re,
|
|
re2,
|
|
re3,
|
|
re4,
|
|
origword = w;
|
|
|
|
if (w.length < 3) {
|
|
return w;
|
|
}
|
|
|
|
firstch = w.substr(0, 1);
|
|
if (firstch == "y") {
|
|
w = firstch.toUpperCase() + w.substr(1);
|
|
}
|
|
|
|
// Step 1a
|
|
re = /^(.+?)(ss|i)es$/;
|
|
re2 = /^(.+?)([^s])s$/;
|
|
|
|
if (re.test(w)) {
|
|
w = w.replace(re, "$1$2");
|
|
}
|
|
else if (re2.test(w)) {
|
|
w = w.replace(re2, "$1$2");
|
|
}
|
|
|
|
// Step 1b
|
|
re = /^(.+?)eed$/;
|
|
re2 = /^(.+?)(ed|ing)$/;
|
|
if (re.test(w)) {
|
|
var fp = re.exec(w);
|
|
re = new RegExp(mgr0);
|
|
if (re.test(fp[1])) {
|
|
re = /.$/;
|
|
w = w.replace(re, "");
|
|
}
|
|
} else if (re2.test(w)) {
|
|
var fp = re2.exec(w);
|
|
stem = fp[1];
|
|
re2 = new RegExp(s_v);
|
|
if (re2.test(stem)) {
|
|
w = stem;
|
|
re2 = /(at|bl|iz)$/;
|
|
re3 = new RegExp("([^aeiouylsz])\\1$");
|
|
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
|
|
if (re2.test(w)) {
|
|
w = w + "e";
|
|
}
|
|
else if (re3.test(w)) {
|
|
re = /.$/;
|
|
w = w.replace(re, "");
|
|
}
|
|
else if (re4.test(w)) {
|
|
w = w + "e";
|
|
}
|
|
}
|
|
}
|
|
|
|
// Step 1c
|
|
re = /^(.+?)y$/;
|
|
if (re.test(w)) {
|
|
var fp = re.exec(w);
|
|
stem = fp[1];
|
|
re = new RegExp(s_v);
|
|
if (re.test(stem)) {
|
|
w = stem + "i";
|
|
}
|
|
}
|
|
|
|
// Step 2
|
|
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
|
|
if (re.test(w)) {
|
|
var fp = re.exec(w);
|
|
stem = fp[1];
|
|
suffix = fp[2];
|
|
re = new RegExp(mgr0);
|
|
if (re.test(stem)) {
|
|
w = stem + step2list[suffix];
|
|
}
|
|
}
|
|
|
|
// Step 3
|
|
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
|
|
if (re.test(w)) {
|
|
var fp = re.exec(w);
|
|
stem = fp[1];
|
|
suffix = fp[2];
|
|
re = new RegExp(mgr0);
|
|
if (re.test(stem)) {
|
|
w = stem + step3list[suffix];
|
|
}
|
|
}
|
|
|
|
// Step 4
|
|
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
|
|
re2 = /^(.+?)(s|t)(ion)$/;
|
|
if (re.test(w)) {
|
|
var fp = re.exec(w);
|
|
stem = fp[1];
|
|
re = new RegExp(mgr1);
|
|
if (re.test(stem)) {
|
|
w = stem;
|
|
}
|
|
} else if (re2.test(w)) {
|
|
var fp = re2.exec(w);
|
|
stem = fp[1] + fp[2];
|
|
re2 = new RegExp(mgr1);
|
|
if (re2.test(stem)) {
|
|
w = stem;
|
|
}
|
|
}
|
|
|
|
// Step 5
|
|
re = /^(.+?)e$/;
|
|
if (re.test(w)) {
|
|
var fp = re.exec(w);
|
|
stem = fp[1];
|
|
re = new RegExp(mgr1);
|
|
re2 = new RegExp(meq1);
|
|
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
|
|
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
|
|
w = stem;
|
|
}
|
|
}
|
|
|
|
re = /ll$/;
|
|
re2 = new RegExp(mgr1);
|
|
if (re.test(w) && re2.test(w)) {
|
|
re = /.$/;
|
|
w = w.replace(re, "");
|
|
}
|
|
|
|
// and turn initial Y back to y
|
|
|
|
if (firstch == "y") {
|
|
w = firstch.toLowerCase() + w.substr(1);
|
|
}
|
|
|
|
return w;
|
|
}
|
|
})();
|
|
}); |