```python
# -*- coding: utf-8 -*-
from operator import itemgetter

import jieba
import numpy as np

from words_sim import SimCilin

# Cilin-based word-similarity backend; the user dictionary keeps
# Cilin entries as single tokens during segmentation.
ci_lin = SimCilin()
jieba.load_userdict('cilin_words.txt')


def segmentation(sentence_list):
    """Segment each sentence into a list of words with jieba."""
    result = list()
    for s in sentence_list:
        temp_seg = jieba.cut(s)
        result.append([x for x in temp_seg])
    return result


def get_similarity(s1, s2):
    """Symmetric similarity between two segmented sentences."""
    # For every Chinese word in s1, keep its best similarity against s2.
    all_sim_1 = list()
    for w1 in s1:
        if is_contains_chinese(w1):
            sim_list = list()
            for w2 in s2:
                sim_list.append(ci_lin.compute_word_sim(w1, w2))
            sim_list.sort()
            all_sim_1.append(sim_list[-1])

    # Repeat in the other direction: best matches of s2's words in s1.
    all_sim_2 = list()
    for w1 in s2:
        if is_contains_chinese(w1):
            sim_list = list()
            for w2 in s1:
                sim_list.append(ci_lin.compute_word_sim(w1, w2))
            sim_list.sort()
            all_sim_2.append(sim_list[-1])

    # Average the two directional means.
    return (np.mean(all_sim_1) + np.mean(all_sim_2)) / 2


def most_similar_items(src_s, sentences, n=3):
    """Return the n candidate sentences most similar to the segmented query src_s."""
    sentences = segmentation(sentences)
    temp = list()
    for item in sentences:
        sim_value = get_similarity(src_s, item)
        temp.append({
            'key': merge(item),
            'value': sim_value,
        })
    result = sorted(temp, key=itemgetter('value'), reverse=True)
    return result[:n]


def is_contains_chinese(s):
    """Return True if s contains at least one CJK ideograph (U+4E00 to U+9FA5)."""
    for _char in s:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False


def merge(word_list):
    """Join segmented words back into a sentence, dropping any '/pos' tags."""
    s = ''
    for w in word_list:
        s += w.split('/')[0]
    return s


if __name__ == '__main__':
    str1 = '我喜欢吃苹果'   # "I like eating apples"
    str2 = '他喜欢啃红薯'   # "He likes gnawing on sweet potatoes"
    str1_seg = jieba.cut(str1)
    str2_seg = jieba.cut(str2)
    str1_new = [x for x in str1_seg]
    str2_new = [x for x in str2_seg]
    # Candidate sentences: "I like eating pears", "You like eating apples",
    # "He likes eating oranges".
    str_l = ['我喜欢吃梨', '你喜欢吃苹果', '他喜欢吃橙子']
    print(get_similarity(str1_new, str2_new))
    # Pass the segmented query (not the raw string) so get_similarity
    # compares words rather than individual characters.
    print(most_similar_items(str1_new, str_l, 5))
```
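For reference, `get_similarity` computes a symmetric best-match average: each Chinese token of one sentence is matched to its most similar token in the other sentence via the Cilin word-similarity measure, and the two directional means are averaged:

$$
\mathrm{sim}(S_1, S_2) = \frac{1}{2}\left(\frac{1}{|S_1|}\sum_{w_1 \in S_1}\max_{w_2 \in S_2}\mathrm{sim}_{\text{cilin}}(w_1, w_2) \;+\; \frac{1}{|S_2|}\sum_{w_2 \in S_2}\max_{w_1 \in S_1}\mathrm{sim}_{\text{cilin}}(w_2, w_1)\right)
$$

where the sums run over the Chinese tokens of each sentence (non-Chinese tokens are filtered out by `is_contains_chinese`).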
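The script depends on `words_sim.SimCilin`, which is not shown here; from the calls above it is only assumed to expose a `compute_word_sim(w1, w2)` method returning a similarity score (higher means more similar). A minimal, hypothetical stand-in like the following lets the rest of the script be exercised without the Tongyici Cilin data:

```python
# Hypothetical stand-in for words_sim.SimCilin, for testing only.
# It assumes the real class exposes compute_word_sim(w1, w2) -> float;
# this stub scores identical tokens 1.0 and everything else 0.0.
class FakeCilin:
    def compute_word_sim(self, w1, w2):
        return 1.0 if w1 == w2 else 0.0
```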