text_sim.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. # -*- coding: utf-8 -*-
  2. from operator import itemgetter
  3. import jieba
  4. import numpy as np
  5. from words_sim import SimCilin
# Module-level SimCilin instance; get_similarity() calls its
# compute_word_sim() method for every word pair it compares.
ci_lin = SimCilin()
# Register 'cilin_words.txt' as a jieba user dictionary at import time.
# NOTE(review): the path is relative, so it is resolved against the current
# working directory. Presumably this keeps jieba's tokens aligned with the
# vocabulary SimCilin knows about -- confirm against words_sim.
jieba.load_userdict('cilin_words.txt')
  8. def segmentation(sentence_list):
  9. result = list()
  10. for s in sentence_list:
  11. temp_seg = jieba.cut(s)
  12. result.append([x for x in temp_seg])
  13. return result
  14. def get_similarity(s1, s2):
  15. all_sim_1 = list()
  16. for w1 in s1:
  17. if is_contains_chinese(w1):
  18. sim_list = list()
  19. for w2 in s2:
  20. sim_list.append(ci_lin.compute_word_sim(w1, w2))
  21. sim_list.sort()
  22. all_sim_1.append(sim_list[-1])
  23. all_sim_2 = list()
  24. for w1 in s2:
  25. if is_contains_chinese(w1):
  26. sim_list = list()
  27. for w2 in s1:
  28. sim_list.append(ci_lin.compute_word_sim(w1, w2))
  29. sim_list.sort()
  30. all_sim_2.append(sim_list[-1])
  31. return (np.mean(all_sim_1) + np.mean(all_sim_2)) / 2
  32. def most_similar_items(src_s, sentences, n=3):
  33. sentences = segmentation(sentences)
  34. temp = list()
  35. for item in sentences:
  36. sim_value = get_similarity(src_s, item)
  37. temp.append({
  38. 'key': merge(item),
  39. 'value': sim_value,
  40. })
  41. result = sorted(temp, key=itemgetter('value'), reverse=True)
  42. return result[:n]
  43. def is_contains_chinese(s):
  44. for _char in s:
  45. if '\u4e00' <= _char <= '\u9fa5':
  46. return True
  47. return False
  48. def merge(word_list):
  49. s = ''
  50. for w in word_list:
  51. s += w.split('/')[0]
  52. return s
  53. if __name__ == '__main__':
  54. str1 = '我喜欢吃苹果'
  55. str2 = '他喜欢肯红薯'
  56. str1_seg = jieba.cut(str1)
  57. str2_seg = jieba.cut(str2)
  58. str1_new = [x for x in str1_seg]
  59. str2_new = [x for x in str2_seg]
  60. str_l = ['我喜欢吃梨', '你喜欢吃苹果', '他喜欢吃橙子']
  61. print(get_similarity(str1_new, str2_new))
  62. print(most_similar_items(str1, str_l, 5))