1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
'''
@File    :   merge_strategy.py
@Time    :   2021/09/14 07:26:21
@Author  :   LuckyQ
@Version :   1.0
@Desc    :   Token merge strategies
'''
from token_evaluate import has_semantic
from cj_token import Token
class MergeStrategy(object):
    """Common base class for the token merge strategies in this module.

    Subclasses override :meth:`merge`. The implementation here does no work
    and returns ``None``.
    """

    def merge(self, token_list):
        """Placeholder hook: ignore ``token_list`` and return ``None``."""
        return None
class DealKeywordMerge(MergeStrategy):
    """Merge the first run of keyword-led token pairs into one composite token.

    A token "opens a pair" when its ``value`` contains an entry of
    ``keyword_list`` and ``has_semantic`` is truthy for at least one label in
    ``obj_semantic``. Only the first such run is merged; every token before
    and after the run is passed through unchanged.
    """

    # Substrings searched for inside a token's value ("买" = buy, "卖" = sell).
    keyword_list = ["买", "卖"]
    # Labels checked through has_semantic(); a single match is sufficient.
    obj_semantic = ["Deal_Company", "Real_Deal_Op_Name"]

    def merge(self, token_list):
        """Return a new list in which the first keyword-led run is collapsed.

        Args:
            token_list: Indexable sequence of tokens exposing ``value`` and
                ``row`` attributes.

        Returns:
            A new list. The first run of consecutive (opener, following
            token) pairs is replaced by one ``Token`` whose ``value`` is the
            flat list of the pair values, whose ``pattern_type`` is
            ``"Keyword_Compose"`` and whose ``row`` is copied from the first
            opener. All other tokens are kept in their original order.

        Raises:
            AssertionError: If the first opener is the last token of
                ``token_list`` (only while assertions are enabled).
        """
        total = len(token_list)
        merged = []
        pos = 0

        # Copy tokens through until the first one that opens a pair.
        while pos < total and not self._opens_pair(token_list[pos]):
            merged.append(token_list[pos])
            pos += 1

        if pos < total:
            first = pos
            # NOTE(review): this assert is removed under ``python -O``; in
            # that mode a trailing opener surfaces as an IndexError on the
            # next line instead.
            assert pos + 1 < total
            values = [token_list[pos].value, token_list[pos + 1].value]
            pos += 2

            # Absorb further pairs while they follow immediately. An opener
            # sitting at the very last index has no partner and is left for
            # the tail copy below.
            while pos + 1 < total and self._opens_pair(token_list[pos]):
                values += [token_list[pos].value, token_list[pos + 1].value]
                pos += 2

            composed = Token(value=values, pattern_type="Keyword_Compose")
            composed.row = token_list[first].row
            merged.append(composed)

        # Everything after the (single) merged run is copied verbatim.
        merged.extend(token_list[k] for k in range(pos, total))
        return merged

    def _opens_pair(self, token):
        """Tell whether ``token`` starts a (keyword token, next token) pair."""
        if not self._is_contain_keyword(token.value):
            return False
        return any(has_semantic(token, label) for label in self.obj_semantic)

    def _is_contain_keyword(self, s):
        """Return True when any entry of ``keyword_list`` occurs in ``s``."""
        return any(keyword in s for keyword in self.keyword_list)
class SemanticSimilaryMerge(MergeStrategy):
    """Merge the first run of adjacent (first-label, second-label) token pairs.

    Two neighbouring tokens form a pair when ``has_semantic`` is truthy for
    the first token with ``semantic_list[0]`` and for the second token with
    ``semantic_list[1]``. Only the first run of back-to-back pairs is merged.
    """

    # Label expected on the first and on the second token of a pair.
    semantic_list = ["Product", "Bond_Money"]

    def merge(self, token_list):
        """Return a new list in which the first run of pairs is collapsed.

        Args:
            token_list: Indexable sequence of tokens exposing ``value`` and
                ``row`` attributes.

        Returns:
            A new list. The first run of consecutive matching pairs is
            replaced by one ``Token`` whose ``value`` is the flat list of the
            pair values, whose ``pattern_type`` is ``"Element_Compose"`` and
            whose ``row`` is copied from the first token of the run. All
            other tokens are kept in their original order.
        """
        size = len(token_list)
        last = size - 1  # highest index that still has a right-hand neighbour
        out = []
        idx = 0
        run_values = None

        # Walk forward until a matching pair is found (or neighbours run out).
        while idx < last:
            lead, follower = token_list[idx], token_list[idx + 1]
            if self._is_semantic_seq(lead, follower):
                run_values = [lead.value, follower.value]
                break
            out.append(lead)
            idx += 1

        if run_values is not None:
            anchor = idx
            idx += 2
            # Keep absorbing pairs while they follow immediately.
            while idx < last:
                lead, follower = token_list[idx], token_list[idx + 1]
                if not self._is_semantic_seq(lead, follower):
                    break
                run_values += [lead.value, follower.value]
                idx += 2

            composed = Token(value=run_values, pattern_type="Element_Compose")
            composed.row = token_list[anchor].row
            out.append(composed)

        # Copy whatever is left, including a final token without a neighbour.
        out.extend(token_list[k] for k in range(idx, size))
        return out

    def _is_semantic_seq(self, token, next_token):
        """Check ``token`` against the first label, ``next_token`` the second.

        The second check is skipped when the first one is falsy, and the
        result of whichever ``has_semantic`` call ran last is returned as is.
        """
        head_result = has_semantic(token, self.semantic_list[0])
        if not head_result:
            return head_result
        return has_semantic(next_token, self.semantic_list[1])
class SupplyMerge(MergeStrategy):
    """Merge every window of tokens that matches one of ``sequence_list``.

    Unlike a first-run-only merge, the whole token list is scanned and each
    matching window is replaced by one composite token.
    """

    # Label patterns, tried in this order at every position; the first
    # pattern whose labels all match wins.
    sequence_list = [
        ["Real_Deal_Op_Name", "Deal_Company"],
        ["Real_Deal_Op_Name", "Op_Dealer_No"],
        ["Deal_Company", "Dealer_No"],
    ]

    def merge(self, token_list):
        """Return a new list with every matching window collapsed.

        Args:
            token_list: Indexable sequence of tokens exposing ``value`` and
                ``row`` attributes.

        Returns:
            A new list. Wherever the tokens starting at a position satisfy
            ``has_semantic(token, label)`` for every label of a pattern, they
            are replaced by one ``Token`` whose ``value`` is the list of
            their values, whose ``pattern_type`` is ``"Similary_Compose"``
            and whose ``row`` is copied from the first token of the window.
            Tokens that start no match are copied through unchanged.
        """
        res_list = []
        i, n = 0, len(token_list)
        while i < n:
            for sequence in self.sequence_list:
                width = len(sequence)
                # Skip patterns that do not fit in the remaining tokens. An
                # empty pattern is skipped too: it would match without
                # advancing the cursor and loop forever.
                if width == 0 or i + width > n:
                    continue
                window = [token_list[i + offset] for offset in range(width)]
                if all(has_semantic(tok, label)
                       for tok, label in zip(window, sequence)):
                    # The comprehension variable is deliberately not named
                    # ``i`` so the scan cursor can never be shadowed or
                    # overwritten, whatever the interpreter's scoping rules.
                    new_token = Token(value=[tok.value for tok in window],
                                      pattern_type="Similary_Compose")
                    new_token.row = window[0].row
                    res_list.append(new_token)
                    i += width
                    break
            else:
                # No pattern matched at this position: keep the token as is.
                res_list.append(token_list[i])
                i += 1
        return res_list
|