
2021-09-17 Group Meeting

^ _ ^

From 2021-09-11 to 2021-09-17
Fixed bugs in the project (cj_text_analyze)

Overview

9.11

Score Comparison

After a token goes through the FSM (finite-state machine), the process determines its pattern type. There is a one-to-many relationship between morphological (pattern) types and semantic types.
The features that each semantic type should satisfy are defined in token_config.py. Whenever a token matches a feature belonging to a semantic type, its score for that semantic type is increased.

Once the semantic scores are assigned, the process normalizes them to [0, 1]. I call the normalized result the weight, which represents the probability that the token carries that semantic.
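As a rough sketch of this scoring-and-normalization step (the feature table, regexes, and helper functions below are hypothetical illustrations rather than the real token_config.py; only the idea of per-semantic scores and [0, 1] weights comes from the project):

import re

# Hypothetical stand-in for the per-semantic feature definitions in token_config.py:
# each semantic type lists (regex, score) pairs.
SEMANTIC_FEATURES = {
    "Bond_Money": [(re.compile(r"\d+万"), 2), (re.compile(r"元$"), 1)],
    "Product":    [(re.compile(r"\d{2}国开\d{2}"), 3)],
}

def score_token(value):
    # Every matched feature adds its score to that semantic type.
    return {
        semantic: sum(score for pattern, score in features if pattern.search(value))
        for semantic, features in SEMANTIC_FEATURES.items()
    }

def normalize(scores):
    # Normalize raw scores into [0, 1] weights (max scaling is an assumption).
    top = max(scores.values()) or 1
    return {semantic: score / top for semantic, score in scores.items()}

scores = score_token("1000万元")   # {"Bond_Money": 3, "Product": 0}
weights = normalize(scores)        # {"Bond_Money": 1.0, "Product": 0.0}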

In some situations this approach makes mistakes, because different pattern types can map to the same semantic type. Two tokens may then end up with the same weight even though their raw scores differ. Usually we want the token with the higher score to replace the lower-scoring one that was placed on the cloth_chess board first.

# When a board cell is already occupied, compare the occupying token with the current one
# on both weight and raw score before deciding which to keep.
occupy_token_weight = occupy_token.get_weight_by_semantic(semantic)
occupy_token_score = occupy_token.get_score_by_semantic(semantic)
token_weight = token.get_weight_by_semantic(semantic)
token_score = token.get_score_by_semantic(semantic)
if occupy_token_weight >= token_weight and occupy_token_score >= token_score:
    logger.info(f"占领token【{occupy_token.value}】语义权重【{occupy_token_weight}】>= 当前token【{token.value}】语义权重【{token_weight}】;尝试不摆放{token}")
    logger.info(f"占领token【{occupy_token.value}】语义得分【{occupy_token_score}】>= 当前token【{token.value}】语义得分【{token_score}】;尝试不摆放{token}")
    next_state = copy.deepcopy(cur_state)
    next_state.discard_token(token)
    logger.info(f"新的元素棋盘放入队列:{next_state}")
    state_queue.put([-next_state.get_score(), next_state])
else:
    # The current token wins on score: place it, replacing the occupying token.
    logger.info(f"摆放【{token.value}】替换【{occupy_token.value}】")
    next_state = copy.deepcopy(cur_state)
    next_state.pos_token(index, token)
    logger.info(f"新的元素棋盘放入队列:{next_state}")
    state_queue.put([-next_state.get_score(), next_state])  # assumed: the log line implies the new board is enqueued here as well

Date Fix

Float Bond Money Situation

9.12

9.13 - 9.15

'''
@File    : merge_strategy.py
@Time    : 2021/09/14 07:26:21
@Author  : LuckyQ
@Version : 1.0
@Desc    : Token merge strategies
'''

from token_evaluate import has_semantic

from cj_token import Token


class MergeStrategy(object):
    def merge(self, token_list):
        pass


# Deal-keyword merge strategy: merges the (2|4|6|8) tokens that form
# "keyword1: xx, keyword2: xx" pairs.
class DealKeywordMerge(MergeStrategy):
    keyword_list = ["买", "卖"]
    obj_semantic = ["Deal_Company", "Real_Deal_Op_Name"]

    def merge(self, token_list):
        res_list = []
        i, n = 0, len(token_list)
        while i < n:
            token = token_list[i]
            token_value = token.value
            if not self._is_contain_keyword(token_value):
                i += 1
                res_list.append(token)
                continue
            flag = False
            for semantic in self.obj_semantic:
                if has_semantic(token, semantic):
                    flag = True
                    break
            if not flag:
                i += 1
                res_list.append(token)
                continue
            # Starting from i, scan forward in steps of two to collect the keyword group to merge.
            start = i
            assert i + 1 < n
            merge_value_list = [token_list[i].value, token_list[i+1].value]
            i += 2
            while i + 1 < n:
                token = token_list[i]
                token_value = token.value
                if not self._is_contain_keyword(token_value):
                    break
                flag = False
                for semantic in self.obj_semantic:
                    if has_semantic(token, semantic):
                        flag = True
                        break
                if not flag:
                    break

                merge_value_list.extend([token_list[i].value, token_list[i+1].value])
                i += 2

            new_token = Token(value=merge_value_list, pattern_type="Keyword_Compose")
            new_token.row = token_list[start].row
            res_list.append(new_token)
            # Stop after the first such keyword group has been found.
            break
        while i < n:
            res_list.append(token_list[i])
            i += 1
        return res_list

    # Whether the string contains any keyword from keyword_list.
    def _is_contain_keyword(self, s):
        for keyword in self.keyword_list:
            if keyword in s:
                return True
        return False


# Merge strategy for runs of same-shaped elements (a Product token followed by a Bond_Money token).
class SemanticSimilaryMerge(MergeStrategy):
    semantic_list = ["Product", "Bond_Money"]

    def merge(self, token_list):
        res_list = []
        i, n = 0, len(token_list)
        start, end = -1, -1
        while i < n - 1:
            token, next_token = token_list[i], token_list[i+1]
            if not self._is_semantic_seq(token, next_token):
                res_list.append(token)
                i += 1
                continue
            start = i
            value_list = [token.value, next_token.value]
            i += 2
            while i < n - 1:
                token, next_token = token_list[i], token_list[i+1]
                if not self._is_semantic_seq(token, next_token):
                    break
                value_list.extend([token.value, next_token.value])
                i += 2

            new_token = Token(value=value_list, pattern_type="Element_Compose")
            new_token.row = token_list[start].row
            res_list.append(new_token)
            break
        while i < n:
            res_list.append(token_list[i])
            i += 1
        return res_list

    def _is_semantic_seq(self, token, next_token):
        return has_semantic(token, self.semantic_list[0]) and has_semantic(next_token, self.semantic_list[1])


# Merge strategy for synonyms and supplementary words.
class SupplyMerge(MergeStrategy):
    sequence_list = [["Real_Deal_Op_Name", "Deal_Company"], ["Real_Deal_Op_Name", "Op_Dealer_No"],
                     ["Deal_Company", "Dealer_No"]]

    def merge(self, token_list):
        res_list = []
        i, n = 0, len(token_list)
        while i < n:
            match_seq = False
            for sequence in self.sequence_list:
                if i + len(sequence) - 1 >= n:
                    continue
                flag = True
                for j in range(0, len(sequence)):
                    if not has_semantic(token_list[i+j], sequence[j]):
                        flag = False
                        break
                # If every position matches the semantic sequence, merge those tokens.
                if flag:
                    value_list = [token_list[j].value for j in range(i, i + len(sequence))]
                    new_token = Token(value=value_list, pattern_type="Similary_Compose")
                    new_token.row = token_list[i].row
                    res_list.append(new_token)
                    i += len(sequence)
                    match_seq = True
                    break
            if not match_seq:
                res_list.append(token_list[i])
                i += 1
        return res_list
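A hypothetical usage sketch for these strategies (the chaining order and the driver function are assumptions for illustration; only the class names and the merge(token_list) interface come from merge_strategy.py):

from merge_strategy import DealKeywordMerge, SemanticSimilaryMerge, SupplyMerge

# Apply each merge strategy in turn; every pass returns a new, possibly shorter token list.
strategies = [DealKeywordMerge(), SemanticSimilaryMerge(), SupplyMerge()]

def merge_all(token_list):
    for strategy in strategies:
        token_list = strategy.merge(token_list)
    return token_list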

9.16

# If two adjacent rows of the current board differ too much, merge the next row into this one.
fix_cloth_chess = []
fix_orders = []
# Merge adjacent rows whose order intersection contains at most max_dis elements.
max_dis = 2
i = 0
while i < len(self.cloth_chess):
    row = self.cloth_chess[i]
    if i == len(self.cloth_chess) - 1:
        fix_cloth_chess.append(row)
        fix_orders.append(self.orders[i])
        break
    cur_element_list = self.orders[i]
    next_element_list = self.orders[i+1]
    element_dis = len(set(cur_element_list).intersection(set(next_element_list)))
    if element_dis <= max_dis:
        # Merge the order lists, keeping the current row's order first.
        fix_order = copy.deepcopy(cur_element_list)
        for ele in next_element_list:
            if ele not in fix_order:
                fix_order.append(ele)
        # Fill empty cells of the current row with the next row's cells.
        new_row = copy.deepcopy(row)
        for j, col in enumerate(self.cloth_chess[i+1]):
            if col is not None and row[j] is None:
                new_row[j] = copy.deepcopy(col)
        fix_cloth_chess.append(new_row)
        fix_orders.append(fix_order)
        i += 2
    else:
        fix_cloth_chess.append(row)
        fix_orders.append(self.orders[i])
        i += 1
self.cloth_chess = fix_cloth_chess
self.orders = fix_orders
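To make the effect concrete, here is a toy before/after example of this pass (the data shapes are inferred from the code: orders holds the element names per row, cloth_chess holds the placed cells, simplified to plain strings or None; the concrete values are hypothetical):

# Two adjacent rows whose orders share only "Product" (intersection size 1 <= max_dis 2),
# so they are merged into a single row.
orders      = [["Product", "Bond_Money"], ["Product", "Deal_Company"]]
cloth_chess = [["21国开10", "1000万", None], [None, None, "XX银行"]]

# After the merge pass:
# orders      == [["Product", "Bond_Money", "Deal_Company"]]
# cloth_chess == [["21国开10", "1000万", "XX银行"]]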