text_sim.py

# -*- coding: utf-8 -*-
"""Sentence similarity based on word similarity from the Cilin (Tongyici Cilin) thesaurus."""
from operator import itemgetter

import jieba
import numpy as np

from words_sim import SimCilin

ci_lin = SimCilin()
jieba.load_userdict('cilin_words.txt')


def segmentation(sentence_list):
    """
    Segment a list of sentences before calculating similarity.
    :param sentence_list: list of raw sentence strings
    :return: list of word lists, one per sentence
    """
    result = list()
    for s in sentence_list:
        temp_seg = jieba.cut(s)
        result.append([x for x in temp_seg])
    return result


def get_similarity(s1, s2):
    """
    Calculate the similarity of two segmented sentences with Cilin word similarity
    (a toy numeric illustration follows this function).
    :param s1: a word list, the result of segmentation
    :param s2: a word list, the result of segmentation
    :return: similarity score
    """
    # For every Chinese word in s1, keep its best similarity against the words of s2.
    all_sim_1 = list()
    for w1 in s1:
        if is_contains_chinese(w1):
            sim_list = list()
            for w2 in s2:
                sim_list.append(ci_lin.compute_word_sim(w1, w2))
            sim_list.sort()
            all_sim_1.append(sim_list[-1])
    # Do the same in the other direction, s2 against s1.
    all_sim_2 = list()
    for w1 in s2:
        if is_contains_chinese(w1):
            sim_list = list()
            for w2 in s1:
                sim_list.append(ci_lin.compute_word_sim(w1, w2))
            sim_list.sort()
            all_sim_2.append(sim_list[-1])
    # Average the two directional means to get a symmetric score.
    return (np.mean(all_sim_1) + np.mean(all_sim_2)) / 2
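

# A minimal illustration (not part of the original module) of the symmetric
# max-mean scheme used by get_similarity: each word takes its best match in the
# other sentence, those maxima are averaged, and the two directions are averaged
# again. The 3x2 matrix below is made-up example data, not real Cilin scores.
def _demo_max_mean():
    sim_matrix = np.array([
        [0.9, 0.1],  # word A of sentence 1 vs the two words of sentence 2
        [0.2, 0.8],  # word B
        [0.4, 0.3],  # word C
    ])
    forward = sim_matrix.max(axis=1).mean()   # mean of 0.9, 0.8, 0.4 -> 0.7
    backward = sim_matrix.max(axis=0).mean()  # mean of 0.9, 0.8 -> 0.85
    return (forward + backward) / 2           # 0.775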


def most_similar_items(src_s, sentences, n=3):
    """
    Return the n sentences most similar to the target sentence
    (a usage sketch follows this function).
    :param src_s: target sentence (raw string)
    :param sentences: candidate sentences (raw strings)
    :param n: number of results to return
    :return: list of {'key': sentence, 'value': similarity} dicts, best first
    """
    src_seg = [x for x in jieba.cut(src_s)]  # segment the target as well
    sentences = segmentation(sentences)
    temp = list()
    for item in sentences:
        sim_value = get_similarity(src_seg, item)
        temp.append({
            'key': merge(item),
            'value': sim_value,
        })
    result = sorted(temp, key=itemgetter('value'), reverse=True)
    return result[:n]
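

# Hedged usage sketch (assumes cilin_words.txt and the Cilin data expected by
# SimCilin are available in the working directory):
#
#   query = '我喜欢吃苹果'                     # "I like eating apples"
#   candidates = ['我喜欢吃梨', '你喜欢吃苹果']  # "I like eating pears", "You like eating apples"
#   top = most_similar_items(query, candidates, n=1)
#   # -> [{'key': <candidate sentence>, 'value': <similarity score>}, ...], best first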


def is_contains_chinese(s):
    """
    Tell whether the string contains at least one Chinese character.
    :param s: string to check
    :return: True if any character falls in the CJK range, else False
    """
    for _char in s:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False


def merge(word_list):
    """Join segmented words back into a sentence, dropping anything after a '/' (e.g. a POS tag)."""
    s = ''
    for w in word_list:
        s += w.split('/')[0]
    return s


if __name__ == '__main__':
    str1 = '我喜欢吃苹果'  # "I like eating apples"
    str2 = '他喜欢啃红薯'  # "He likes gnawing sweet potatoes"
    str1_seg = jieba.cut(str1)
    str2_seg = jieba.cut(str2)
    str1_new = [x for x in str1_seg]
    str2_new = [x for x in str2_seg]
    # "I like eating pears", "You like eating apples", "He likes eating oranges"
    str_l = ['我喜欢吃梨', '你喜欢吃苹果', '他喜欢吃橙子']
    print(get_similarity(str1_new, str2_new))
    print(most_similar_items(str1, str_l, 5))