Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F7101
cjkfilter.patch
Public
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Authored By
•
bzimport
Nov 21 2014, 11:15 PM
2014-11-21 23:15:29 (UTC+0)
Size
5 KB
Referenced Files
None
Subscribers
None
cjkfilter.patch
View Options
Index: test/org/wikimedia/lsearch/analyzers/CJKFilterTest.java
===================================================================
--- test/org/wikimedia/lsearch/analyzers/CJKFilterTest.java (revision 0)
+++ test/org/wikimedia/lsearch/analyzers/CJKFilterTest.java (revision 0)
@@ -0,0 +1,86 @@
+package org.wikimedia.lsearch.analyzers;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+public class CJKFilterTest extends TestCase {
+ private Analyzer a;
+
+ public CJKFilterTest(String name){
+ super(name);
+ }
+
+ protected void setUp() throws Exception { // JUnit 3 runs setUp() (capital U) before each test; the former name "setup" was never invoked
+ } // intentionally empty: every test method assigns its own analyzer to the field `a`
+
+ public void testEmpty(){
+ a = new WhitespaceAnalyzer();
+ assertEquals("[]",tokens(""));
+ }
+
+ public void testCJK() throws Exception {
+ a = new WhitespaceAnalyzer();
+ assertEquals("[(い,0,1)]",tokens("い"));
+ assertEquals("[(いわ,0,2)]",tokens("いわ"));
+ assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4)]",tokens("いわさき"));
+ }
+
+ public void testNonCJK() throws Exception {
+ a = new WhitespaceAnalyzer();
+ assertEquals("[(i,0,1)]",tokens("i"));
+ assertEquals("[(ic,0,2)]",tokens("ic"));
+ assertEquals("[(icic,0,4)]",tokens("icic"));
+ }
+ public void testNonCJKandCJK() throws Exception {
+ a = new WhitespaceAnalyzer();
+ assertEquals("[(c,0,1), (カー,1,3), (ード,2,4)]",tokens("cカード"));
+ assertEquals("[(ic,0,2), (カー,2,4), (ード,3,5)]",tokens("icカード"));
+ assertEquals("[(icic,0,4), (カー,4,6), (ード,5,7)]",tokens("icicカード"));
+ }
+
+ public void testCJKandNonCJK() throws Exception {
+ a = new WhitespaceAnalyzer();
+ assertEquals("[(き,0,1), (ic,1,3)]",tokens("きic"));
+ assertEquals("[(さき,0,2), (ic,2,4)]",tokens("さきic"));
+ }
+ public void testEndWithNonCJK() throws Exception {
+ a = new WhitespaceAnalyzer();
+ assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4), (i,4,5)]",tokens("いわさきi"));
+ assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4), (ic,4,6)]",tokens("いわさきic"));
+ assertEquals("[(いわ,0,2), (わさ,1,3), (さき,2,4), (icic,4,8)]",tokens("いわさきicic"));
+ }
+ public void testEndWithCJK() throws Exception{
+ a = new WhitespaceAnalyzer();
+ assertEquals("[(ic,0,2), (カ,2,3)]",tokens("icカ"));
+ }
+
+ private String tokens(String text){
+ try{
+ return Arrays.toString(tokensFromAnalysis(a,text,"contents"));
+ } catch(IOException e){
+ fail(e.getMessage());
+ return null;
+ }
+ }
+
+ private static Token[] tokensFromAnalysis(Analyzer analyzer, String text, String field) throws IOException {
+ TokenStream stream = analyzer.tokenStream(field, text);
+ CJKFilter cjkfilter = new CJKFilter(stream);
+ ArrayList tokenList = new ArrayList();
+ while (true) {
+ Token token = cjkfilter.next();
+ if (token == null) break;
+ tokenList.add(token);
+ }
+ return (Token[])tokenList.toArray(new Token[0]);
+ }
+
+}
\ No newline at end of file
Index: src/org/wikimedia/lsearch/analyzers/CJKFilter.java
===================================================================
--- src/org/wikimedia/lsearch/analyzers/CJKFilter.java (revision 81083)
+++ src/org/wikimedia/lsearch/analyzers/CJKFilter.java (working copy)
@@ -44,23 +44,29 @@
for(i=0,offset=0,len=0;i<text.length();i++){
c = text.codePointAt(i);
if(isCJKChar(c)){
- if(len != 0)
- buffer.add(new Token(text.substring(offset,offset+len+1),token.startOffset()+offset,token.startOffset()+offset+len+1));
- offset = i+1;
- len = 0;
- cur = text.charAt(i);
- if(last != 0)
- buffer.add(new Token(""+last+cur,token.startOffset()+i-1,token.startOffset()+i+1));
- last = cur;
- } else if(last != 0){
- buffer.add(new Token(""+last,token.startOffset()+i,token.startOffset()+i+1));
- last = 0;
- } else
- len++;
+ if (len != 0) { // one or more noncjk chars are reserved. generates ASCII token
+ buffer.add(new Token(text.substring(offset,offset+len),token.startOffset()+offset,token.startOffset()+offset+len));
+ offset = i;
+ len =0;
+ } else if (last != 0){ // one cjk char is reserved. generates CJK token with current CJK char
+ buffer.add(new Token(""+last+text.charAt(i),token.startOffset()+i-1,token.startOffset()+i+1));
+ offset = i+1;
+ }
+ last = text.charAt(i); // keep this cjk char
+ } else { // gets an ASCII char.
+ if (len !=0){ // one or more ASCII chars are reserved.
+ //do nothing
+ } else if ( last !=0 && offset < i){// one CJK char is reserved. generates CJK token.
+ buffer.add(new Token(""+last,token.startOffset()+offset,token.startOffset()+i));
+ offset = i;
+ last = 0;
+ }
+ len ++; // Current noncjk char is to be reserved.
+ }
}
- if(len != 0 && len != text.length())
- buffer.add(new Token(text.substring(offset,offset+len+1),token.startOffset()+offset,token.startOffset()+offset+len+1));
-
+ if(offset < text.length() && len != text.length())
+ buffer.add(new Token(text.substring(offset,text.length()),token.startOffset()+offset,token.startOffset()+text.length()));
+
if(buffer.size() == 0)
return token;
else
File Metadata
Details
Attached
Mime Type
text/x-diff
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
6637
Default Alt Text
cjkfilter.patch (5 KB)
Attached To
Mode
T28997: CJKFilter wrongly tokenize a CJK and non-CJK mixed string.
Attached
Detach File
Event Timeline
Log In to Comment