Page MenuHomePhabricator

cjkfilter.patch

Authored By
bzimport
Nov 21 2014, 11:15 PM
Size
5 KB
Referenced Files
None
Subscribers
None

cjkfilter.patch

Index: test/org/wikimedia/lsearch/analyzers/CJKFilterTest.java
===================================================================
--- test/org/wikimedia/lsearch/analyzers/CJKFilterTest.java (revision 0)
+++ test/org/wikimedia/lsearch/analyzers/CJKFilterTest.java (revision 0)
@@ -0,0 +1,86 @@
+package org.wikimedia.lsearch.analyzers;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/** Checks the tokens CJKFilter emits for whitespace-tokenized CJK, non-CJK and mixed input. */
+public class CJKFilterTest extends TestCase {
+    private Analyzer a;
+
+    public CJKFilterTest(String name){
+        super(name);
+    }
+
+    /** Creates the analyzer shared by all tests; JUnit 3 only calls this hook when spelled setUp(). */
+    protected void setUp() throws Exception {
+        super.setUp();
+        a = new WhitespaceAnalyzer();
+    }
+
+    public void testEmpty(){
+        assertEquals("[]",tokens(""));
+    }
+
+    public void testCJK() throws Exception {
+        assertEquals("[(い,0,1)]",tokens("い"));
+        assertEquals("[(いわ,0,2)]",tokens("いわ"));
+        assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4)]",tokens("いわさき"));
+    }
+
+    public void testNonCJK() throws Exception {
+        assertEquals("[(i,0,1)]",tokens("i"));
+        assertEquals("[(ic,0,2)]",tokens("ic"));
+        assertEquals("[(icic,0,4)]",tokens("icic"));
+    }
+
+    public void testNonCJKandCJK() throws Exception {
+        assertEquals("[(c,0,1), (カー,1,3), (ード,2,4)]",tokens("cカード"));
+        assertEquals("[(ic,0,2), (カー,2,4), (ード,3,5)]",tokens("icカード"));
+        assertEquals("[(icic,0,4), (カー,4,6), (ード,5,7)]",tokens("icicカード"));
+    }
+
+    public void testCJKandNonCJK() throws Exception {
+        assertEquals("[(き,0,1), (ic,1,3)]",tokens("きic"));
+        assertEquals("[(さき,0,2), (ic,2,4)]",tokens("さきic"));
+    }
+
+    public void testEndWithNonCJK() throws Exception {
+        assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4), (i,4,5)]",tokens("いわさきi"));
+        assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4), (ic,4,6)]",tokens("いわさきic"));
+        assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4), (icic,4,8)]",tokens("いわさきicic"));
+    }
+
+    public void testEndWithCJK() throws Exception{
+        assertEquals("[(ic,0,2), (カ,2,3)]",tokens("icカ"));
+    }
+
+    /** Returns Arrays.toString of the filtered tokens of text; an IOException fails the test. */
+    private String tokens(String text){
+        try{
+            return Arrays.toString(tokensFromAnalysis(a,text,"contents"));
+        } catch(IOException e){
+            fail(e.getMessage());
+            return null; // only here to satisfy the compiler; fail() is expected to throw
+        }
+    }
+
+    /** Wraps the analyzer's stream for field/text in a CJKFilter and collects tokens until next() returns null. */
+    private static Token[] tokensFromAnalysis(Analyzer analyzer, String text, String field) throws IOException {
+        TokenStream stream = analyzer.tokenStream(field, text);
+        CJKFilter cjkfilter = new CJKFilter(stream);
+        ArrayList<Token> tokenList = new ArrayList<Token>();
+        Token token;
+        while ((token = cjkfilter.next()) != null) {
+            tokenList.add(token);
+        }
+        return tokenList.toArray(new Token[0]);
+    }
+}
\ No newline at end of file
Index: src/org/wikimedia/lsearch/analyzers/CJKFilter.java
===================================================================
--- src/org/wikimedia/lsearch/analyzers/CJKFilter.java (revision 81083)
+++ src/org/wikimedia/lsearch/analyzers/CJKFilter.java (working copy)
@@ -44,23 +44,29 @@
for(i=0,offset=0,len=0;i<text.length();i++){
c = text.codePointAt(i);
if(isCJKChar(c)){
- if(len != 0)
- buffer.add(new Token(text.substring(offset,offset+len+1),token.startOffset()+offset,token.startOffset()+offset+len+1));
- offset = i+1;
- len = 0;
- cur = text.charAt(i);
- if(last != 0)
- buffer.add(new Token(""+last+cur,token.startOffset()+i-1,token.startOffset()+i+1));
- last = cur;
- } else if(last != 0){
- buffer.add(new Token(""+last,token.startOffset()+i,token.startOffset()+i+1));
- last = 0;
- } else
- len++;
+ if (len != 0) { // one or more noncjk chars are reserved. generates ASCII token
+ buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len));
+ offset = i;
+ len =0;
+ } else if (last != 0){ // one cjk char is reserved. generates CJK token with current CJK char
+ buffer.add(new Token(""+last+text.charAt(i),token.startOffset()+i-1,token.startOffset()+i+1));
+ offset = i+1;
+ }
+ last = text.charAt(i); // keep this cjk char
+ } else { // gets an ASCII char.
+ if (len !=0){ // one or more ASCII char are reserved.
+ //do nothing
+ } else if ( last !=0 && offset < i){// one CJK char is reserved. generates CJK token.
+ buffer.add(new Token(""+last,token.startOffset()+offset,token.startOffset()+i));
+ offset = i;
+ last = 0;
+ }
+ len ++; // Current noncjk char is to be reserved.
+ }
}
- if(len != 0 && len != text.length())
- buffer.add(new Token(text.substring(offset,offset+len+1),token.startOffset()+offset,token.startOffset()+offset+len+1));
-
+ if(offset < text.length() && len != text.length())
+ buffer.add(new Token(text.substring(offset,text.length()),token.startOffset()+offset,token.startOffset()+text.length()));
+
if(buffer.size() == 0)
return token;
else

File Metadata

Mime Type
text/x-diff
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
6637
Default Alt Text
cjkfilter.patch (5 KB)

Event Timeline