nlp.py
# -*- coding: utf-8 -*-
# Author: 东邪

import numpy as np
# Dot-product warm-up: the loop and a.dot(b) give the same result.
# a = np.array([1, 0, 1])
# b = np.array([1, 1, 0])
#
# sum = 0
# for i, j in zip(a, b):
#     sum += i * j
# print(sum)
# print(a.dot(b))
import math
import jieba

s1 = '这只皮靴号码大了。那只号码合适'
s1_cut = [i for i in jieba.cut(s1, cut_all=True) if i != '']
s2 = '这只皮靴号码不小,那只更合适'
s2_cut = [i for i in jieba.cut(s2, cut_all=True) if i != '']
print(s1_cut)
print(s2_cut)
word_set = set(s1_cut).union(set(s2_cut))
print(word_set)

# Walk the union vocabulary so each word can later be looked up for its term frequency
word_dict = dict()
i = 0
for word in word_set:
    # give every word an integer index
    word_dict[word] = i
    i += 1
print(word_dict)

# Term-frequency vector for s1_cut
s1_cut_code = [0] * len(word_dict)
for word in s1_cut:
    s1_cut_code[word_dict[word]] += 1  # count how often each vocabulary word appears in s1_cut
s1_cut_code = [i / len(s1_cut) for i in s1_cut_code]  # term frequency
print(s1_cut_code)


# Term-frequency vector for s2_cut
s2_cut_code = [0] * len(word_dict)
for word in s2_cut:
    s2_cut_code[word_dict[word]] += 1  # count how often each vocabulary word appears in s2_cut
print(s2_cut_code)

# Cosine similarity: turn the vector into a unit vector so the denominator
# of the cosine formula becomes 1 and only the numerator's dot product remains
s2_np = np.array([i * i for i in s2_cut_code])
s2_np_sum = np.sum(s2_np)
# Dividing every component by the same constant leaves the direction unchanged;
# afterwards the square root of the sum of squares is exactly 1.0
s2_e = [i / math.sqrt(s2_np_sum) for i in s2_cut_code]
s2_e_sum = np.sum([i * i for i in s2_e])  # sanity check: should print 1.0
print(s2_e)
print(s2_e_sum)
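The listing stops after normalizing s2 and never produces the similarity itself. A minimal sketch that finishes the computation, continuing from the variables above (s1_cut_code holds normalized term frequencies while s2_cut_code holds raw counts, but the scale difference does not matter: cosine similarity is scale-invariant):

# Normalize s1 the same way; the cosine of the angle between the two
# unit vectors is then just their dot product.
s1_np_sum = np.sum([i * i for i in s1_cut_code])
s1_e = [i / math.sqrt(s1_np_sum) for i in s1_cut_code]

cos_sim = np.dot(s1_e, s2_e)
print(cos_sim)  # closer to 1.0 means the two sentences are more alike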
stop.py — stopword removal
# -*- coding: utf-8 -*-
# Author: 东邪


import jieba

# Load the stopword list (one word per line) into a set for fast membership tests
with open('stopword.txt', 'r', encoding='utf-8') as f:
    stop_word_lst = set(word.strip() for word in f)

print(stop_word_lst)
s1 = '这只皮靴号码大了。那只号码合适'
s1_direct_cut = [i for i in jieba.cut(s1, cut_all=True)]  # unfiltered segmentation, kept for comparison
s1_cut = [i for i in jieba.cut(s1, cut_all=True) if i not in stop_word_lst]
s2 = '这只皮靴号码不小,那只更合适'
s2_cut = [i for i in jieba.cut(s2, cut_all=True) if i not in stop_word_lst]
print(s1_cut)
print(s2_cut)
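The script depends on a local stopword.txt that is not included with the post. A self-contained sketch of the same idea, with a small made-up stopword set standing in for the file (the set below is illustrative only, not a real stopword list):

import jieba

stop_words = {'这', '只', '了', '那', '更', '。', ','}  # hypothetical mini stopword set
s1 = '这只皮靴号码大了。那只号码合适'
print([w for w in jieba.cut(s1, cut_all=True) if w and w not in stop_words])
# Compared with the unfiltered cut, the function words and punctuation are gone.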
tf-idf.py — TF-IDF pushes the scores of stopwords close to 0 and raises the weight of the important words; the log is there to compress the spread between the scores.
# -*- coding: utf-8 -*-
# Author: 东邪


from nltk.tokenize import RegexpTokenizer
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import math

text1 = "Python is a 2000 made-for-TV horror movie directed by Richard \
Clabaugh. The film features several cult favorite actors, including William \
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy, \
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the \
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean \
Whalen. The film concerns a genetically engineered snake, a python, that \
escapes and unleashes itself on a small town. It includes the classic final \
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles, \
California and Malibu, California. Python was followed by two sequels: Python \
II (2002) and Boa vs. Python (2004), both also made-for-TV films."

text2 = "Python, from the Greek word (πύθων/πύθωνας), is a genus of \
nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are \
recognised.[2] A member of this genus, P. reticulatus, is among the longest \
snakes known."

text3 = "The Colt Python is a .357 Magnum caliber revolver formerly \
manufactured by Colt's Manufacturing Company of Hartford, Connecticut. \
It is sometimes referred to as a \"Combat Magnum\".[1] It was first introduced \
in 1955, the same year as Smith & Wesson's M29 .44 Magnum. The now discontinued \
Colt Python targeted the premium revolver market segment. Some firearm \
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy \
Thompson, Renee Smeets and Martin Dougherty have described the Python as the \
finest production revolver ever made."


def get_tokens(text):
    texts = text.lower()
    tokenizer = RegexpTokenizer(r'\w+')  # split on word characters only, dropping punctuation
    return tokenizer.tokenize(texts)


def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


# Build a Counter (word -> term count) for one document
def gen_count(text):
    t1_words = get_tokens(text)
    stemmer = PorterStemmer()
    stemmed = stem_tokens(tokens=t1_words, stemmer=stemmer)  # collapse inflected forms to a common stem
    filtered = [word for word in stemmed if word not in stopwords.words('english')]
    count = Counter(filtered)  # term counts; counting the filtered list so stopwords really are removed
    return count


def tf(word, count):
    return count[word] / max(count.values())  # TF: this word's count over the count of the document's most frequent word


def df(word, count_list):
    return sum(1 for count in count_list if word in count)  # document frequency: how many documents contain the word


def idf(word, count_list):
    # the log compresses the ratio so common and rare words do not end up
    # orders of magnitude apart; the +1 guards against division by zero
    return math.log(len(count_list) / (1 + df(word, count_list)))


def tfidf(word, count, count_list):
    return tf(word, count) * idf(word, count_list)  # TF times IDF (not DF)


count_list = [gen_count(text1), gen_count(text2), gen_count(text3)]
for i, count in enumerate(count_list):
    print('Top-scoring words in document {}:'.format(i + 1))
    scores = {word: tfidf(word, count, count_list) for word in count}  # score every word in the document
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)  # sort by score, descending
    for word, score in sorted_words[:3]:
        print('\t word: {}, TF-IDF: {}'.format(word, round(score, 5)))

# LCS
# A longest common substring must be contiguous; a longest common subsequence need not be.
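The closing comment only states the distinction; a short sketch of both classic dynamic-programming formulations makes it concrete (the function names below are my own, not from the post):

def longest_common_substring(a, b):
    # dp[i][j] = length of the common substring ending at a[i-1] and b[j-1]
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    best, end = 0, 0
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            if a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > best:
                    best, end = dp[i][j], i
    return a[end - best:end]  # the match must be one contiguous run


def longest_common_subsequence_len(a, b):
    # dp[i][j] = LCS length of a[:i] and b[:j]; characters may be skipped
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            if a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[len(a)][len(b)]


print(longest_common_substring('皮靴号码大了', '皮靴号码不小'))        # 皮靴号码
print(longest_common_subsequence_len('号码大了合适', '号码不小合适'))  # 4, i.e. 号码合适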