単語マッチングの最適化

2018-01-04 15:11:28 +09:00 · 2018-01-04 15:11:28 +09:00 · e731355533
parent cb01e80880
commit e731355533
2 changed files with 28 additions and 28 deletions
--- a/app/src/main/java/jp/juggler/subwaytooter/util/CharacterGroup.java
+++ b/app/src/main/java/jp/juggler/subwaytooter/util/CharacterGroup.java
@@ -5,7 +5,7 @@ import android.util.SparseIntArray;

 import java.util.Locale;

-public class CharacterGroup {
+public final class CharacterGroup {
 	
 	// 文字列からグループIDを調べるマップ

@@ -58,13 +58,10 @@ public class CharacterGroup {
 	static final int END = - 1;
 	
 	// 入力された文字列から 文字,グループ,終端 のどれかを順に列挙する
-	class Tokenizer {
+	final class Tokenizer {
 		CharSequence text;
-		int end;
-		
-		// next() を読むと以下の変数が更新される
 		int offset;
-		int c; // may END or group_id or UTF-16 character
+		int end;
 		
 		Tokenizer( @NonNull CharSequence text, int start, int end ){
 			reset( text, start, end );
@@ -76,7 +73,8 @@ public class CharacterGroup {
 			this.end = end;
 		}
 		
-		void next(){
+		// returns END or group_id or UTF-16 character
+		int next(){
 			
 			int pos = offset;
 			
@@ -88,8 +86,7 @@ public class CharacterGroup {
 			if( remain <= 0 ){
 				// 空白を読み飛ばしたら終端になった
 				// 終端の場合、末尾の空白はoffsetに含めない
-				this.c = END;
-				return;
+				return END;
 			}
 			
 			int v1 = text.charAt( pos );
@@ -102,15 +99,14 @@ public class CharacterGroup {
 					: map2.get( v1 | ( ( (int) text.charAt( pos + 1 ) ) << 16 ) )
 				);
 				if( group_id != 0 ){
-					this.c = group_id;
 					this.offset = pos + check_len;
-					return;
+					return group_id;
 				}
 				-- check_len;
 			}
 			
-			this.c = v1;
 			this.offset = pos + 1;
+			return v1;
 		}
 	}
 	
@@ -159,7 +155,7 @@ public class CharacterGroup {
 		case 0xFEFF:
 			return true;
 		default:
-			return Character.isWhitespace( cp );
+			return false; // Character.isWhitespace( cp ); は不要っぽい
 		}
 	}
 	
--- a/app/src/main/java/jp/juggler/subwaytooter/util/WordTrieTree.java
+++ b/app/src/main/java/jp/juggler/subwaytooter/util/WordTrieTree.java
@@ -9,9 +9,15 @@ import java.util.ArrayList;
 public class WordTrieTree {
 	
 	static class Match {
-		String word;
-		int start;
-		int end;
+		final int start;
+		final int end;
+		@NonNull final String word;
+		
+		Match( int start, int end, String word ){
+			this.start = start;
+			this.end = end;
+			this.word = word;
+		}
 	}
 	
 	private static final CharacterGroup grouper = new CharacterGroup();
@@ -35,8 +41,8 @@ public class WordTrieTree {
 		CharacterGroup.Tokenizer t = grouper.tokenizer( s, 0, s.length() );
 		Node node = node_root;
 		for( ; ; ){
-			t.next();
-			int id = t.c;
+			
+			int id = t.next();
 			if( id == CharacterGroup.END ){
 				// より長いマッチ単語を覚えておく
 				if( node.match_word == null || node.match_word.length() < t.text.length() ){
@@ -44,7 +50,7 @@ public class WordTrieTree {
 				}
 				return;
 			}
-			Node child = node.child_nodes.get( t.c );
+			Node child = node.child_nodes.get( id );
 			if( child == null ){
 				node.child_nodes.put( id, child = new Node() );
 			}
@@ -52,7 +58,7 @@ public class WordTrieTree {
 		}
 	}
 	
-	// 前方一致でマッチング
+	// Tokenizer が列挙する文字を使って Trie Tree を探索する
 	@Nullable
 	private Match match( boolean allowShortMatch, @NonNull CharacterGroup.Tokenizer t ){
 		
@@ -64,17 +70,13 @@ public class WordTrieTree {
 			
 			// このノードは単語の終端でもある
 			if( node.match_word != null ){
-				dst = new Match();
-				dst.word = node.match_word;
-				dst.start = start;
-				dst.end = t.offset;
+				dst = new Match( start, t.offset, node.match_word );
 				
-				// 最短マッチのみを調べるのなら、以降の処理は必要ない
+				// ミュート用途の場合、ひとつでも単語にマッチすればより長い探索は必要ない
 				if( allowShortMatch ) break;
 			}
 			
-			t.next();
-			int id = t.c;
+			int id = t.next();
 			if( id == CharacterGroup.END ) break;
 			Node child = node.child_nodes.get( id );
 			if( child == null ) break;
@@ -83,6 +85,7 @@ public class WordTrieTree {
 		return dst;
 	}
 	
+	// ミュート用。マッチするかどうかだけを調べる
 	public boolean matchShort( @Nullable CharSequence src ){
 		return null != src && null != matchShort( src, 0, src.length() );
 	}
@@ -98,7 +101,8 @@ public class WordTrieTree {
 		}
 		return null;
 	}
-	
+
+	// ハイライト用。複数マッチする。マッチした位置を覚える
 	@Nullable ArrayList< Match > matchList( @NonNull CharSequence src, int start, int end ){
 		ArrayList< Match > dst = null;