| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 |
- # -*- coding: utf-8 -*-
- from operator import itemgetter
- import jieba
- import numpy as np
- from words_sim import SimCilin
- ci_lin = SimCilin()  # module-wide Cilin scorer; get_similarity calls its compute_word_sim(w1, w2)
- jieba.load_userdict('cilin_words.txt')  # runs at import time; presumably adds the Cilin vocabulary to jieba so tokens match Cilin entries -- TODO confirm
def segmentation(sentence_list):
    """
    Tokenize every sentence with jieba ahead of similarity scoring.

    :param sentence_list: iterable of raw sentence strings
    :return: list holding one token list per input sentence, in input order
    """
    return [list(jieba.cut(sentence)) for sentence in sentence_list]
def _best_match_scores(words, candidates):
    """Return, for each Chinese word in ``words``, its highest Cilin score against ``candidates``."""
    # With nothing to compare against there is no best match; bail out early
    # instead of taking the maximum of an empty sequence.
    if not candidates:
        return []
    return [
        max(ci_lin.compute_word_sim(word, other) for other in candidates)
        for word in words
        if is_contains_chinese(word)
    ]


def _mean_or_zero(scores):
    """Return the mean of ``scores``, or 0.0 when there are none (np.mean([]) would be nan)."""
    return np.mean(scores) if scores else 0.0


def get_similarity(s1, s2):
    """
    Calculate the similarity of two segmented sentences by Cilin.

    Each word containing a Chinese character is matched with its most similar
    word in the other sentence; the best-match scores are averaged per
    direction (s1 -> s2 and s2 -> s1) and the two averages are averaged again.
    A direction with no scorable word contributes 0.0, so empty or non-Chinese
    input yields a number instead of ``nan`` or an ``IndexError``.

    :param s1: a word list, the result of segmentation
    :param s2: a word list, the result of segmentation
    :return: the symmetric similarity score of the two word lists
    """
    forward = _mean_or_zero(_best_match_scores(s1, s2))
    backward = _mean_or_zero(_best_match_scores(s2, s1))
    return (forward + backward) / 2
def most_similar_items(src_s, sentences, n=3):
    """
    Return the sentences most similar to the target.

    :param src_s: target sentence, either a raw string (segmented here) or an
        already segmented word list
    :param sentences: raw candidate sentence strings
    :param n: maximum number of results to return
    :return: at most ``n`` dicts of the form ``{'key': sentence, 'value': similarity}``,
        ordered from most to least similar
    """
    # get_similarity compares word lists. A raw string would be walked
    # character by character, so segment it exactly like the candidates.
    if isinstance(src_s, str):
        src_s = segmentation([src_s])[0]

    scored = [
        {
            'key': merge(words),
            'value': get_similarity(src_s, words),
        }
        for words in segmentation(sentences)
    ]
    scored.sort(key=itemgetter('value'), reverse=True)
    return scored[:n]
def is_contains_chinese(s):
    """
    Report whether at least one character of the string is Chinese.

    A character counts as Chinese when it lies in the inclusive range
    U+4E00..U+9FA5.

    :param s: string to inspect
    :return: True if any character falls in that range, otherwise False
    """
    return any('\u4e00' <= ch <= '\u9fa5' for ch in s)
def merge(word_list):
    """
    Join a word list back into a single sentence string.

    Only the text before the first ``/`` of each item is kept, so items of
    the form ``word/tag`` contribute just ``word``.

    :param word_list: iterable of word strings
    :return: the concatenated sentence
    """
    return ''.join(w.split('/')[0] for w in word_list)
if __name__ == '__main__':
    # Small demo: score one sentence pair, then rank a few candidates
    # against the first sentence.
    str1 = '我喜欢吃苹果'
    str2 = '他喜欢肯红薯'
    str_l = ['我喜欢吃梨', '你喜欢吃苹果', '他喜欢吃橙子']

    str1_new, str2_new = segmentation([str1, str2])
    print(get_similarity(str1_new, str2_new))
    # Pass the segmented target: get_similarity works on word lists, and a
    # raw string would be compared one character at a time.
    print(most_similar_items(str1_new, str_l, 5))
|