中文词频统计与词云生成

释放双眼，带上耳机，听听看~！

中文词频统计

1. 下载一长篇中文小说。

小说：鹿鼎记作者：金庸

中文词频统计与词云生成

2. 从文件读取待分析文本。

3. 安装并使用jieba进行中文分词。

pip install jieba

import jieba

jieba.lcut(text)

中文词频统计与词云生成

4. 更新词库，加入所分析对象的专业词汇。

jieba.add_word('天罡北斗阵') #逐个添加

jieba.load_userdict(word_dict) #词库文本文件


1
2
3
4
5
1with open(r&#x27;C:\Users\Shinelon\Desktop\python 3.22\金庸小说.txt&#x27;, &#x27;r&#x27;, encoding=&#x27;utf-8&#x27;) as f:

2    jinyong = f.read().split(&#x27;\n&#x27;)

3jieba.load_userdict(jinyong)

4newtext = jieba.lcut(text)

5

参考词库下载地址：https://pinyin.sogou.com/dict/

转换代码：scel_to_text


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
1# -*- coding: utf-8 -*-

2import struct

3import os

4 

5# 拼音表偏移，

6startPy = 0x1540;

7 

8# 汉语词组表偏移

9startChinese = 0x2628;

10 

11# 全局拼音表

12GPy_Table = {}

13 

14# 解析结果

15# 元组(词频,拼音,中文词组)的列表

16 

17 

18# 原始字节码转为字符串

19def byte2str(data):

20    pos = 0

21    str = &#x27;&#x27;

22    while pos &lt; len(data):

23        c = chr(struct.unpack(&#x27;H&#x27;, bytes([data[pos], data[pos + 1]]))[0])

24        if c != chr(0):

25            str += c

26        pos += 2

27    return str

28 

29# 获取拼音表

30def getPyTable(data):

31    data = data[4:]

32    pos = 0

33    while pos &lt; len(data):

34        index = struct.unpack(&#x27;H&#x27;, bytes([data[pos],data[pos + 1]]))[0]

35        pos += 2

36        lenPy = struct.unpack(&#x27;H&#x27;, bytes([data[pos], data[pos + 1]]))[0]

37        pos += 2

38        py = byte2str(data[pos:pos + lenPy])

39 

40        GPy_Table[index] = py

41        pos += lenPy

42 

43# 获取一个词组的拼音

44def getWordPy(data):

45    pos = 0

46    ret = &#x27;&#x27;

47    while pos &lt; len(data):

48        index = struct.unpack(&#x27;H&#x27;, bytes([data[pos], data[pos + 1]]))[0]

49        ret += GPy_Table[index]

50        pos += 2

51    return ret

52 

53# 读取中文表

54def getChinese(data):

55    GTable = []

56    pos = 0

57    while pos &lt; len(data):

58        # 同音词数量

59        same = struct.unpack(&#x27;H&#x27;, bytes([data[pos], data[pos + 1]]))[0]

60 

61        # 拼音索引表长度

62        pos += 2

63        py_table_len = struct.unpack(&#x27;H&#x27;, bytes([data[pos], data[pos + 1]]))[0]

64 

65        # 拼音索引表

66        pos += 2

67        py = getWordPy(data[pos: pos + py_table_len])

68 

69        # 中文词组

70        pos += py_table_len

71        for i in range(same):

72            # 中文词组长度

73            c_len = struct.unpack(&#x27;H&#x27;, bytes([data[pos], data[pos + 1]]))[0]

74            # 中文词组

75            pos += 2

76            word = byte2str(data[pos: pos + c_len])

77            # 扩展数据长度

78            pos += c_len

79            ext_len = struct.unpack(&#x27;H&#x27;, bytes([data[pos], data[pos + 1]]))[0]

80            # 词频

81            pos += 2

82            count = struct.unpack(&#x27;H&#x27;, bytes([data[pos], data[pos + 1]]))[0]

83 

84            # 保存

85            GTable.append((count, py, word))

86 

87            # 到下个词的偏移位置

88            pos += ext_len

89    return GTable

90 

91 

92def scel2txt(file_name):

93    print(&#x27;-&#x27; * 60)

94    with open(file_name, &#x27;rb&#x27;) as f:

95        data = f.read()

96 

97    print(&quot;词库名：&quot;, byte2str(data[0x130:0x338])) # .encode(&#x27;GB18030&#x27;)

98    print(&quot;词库类型：&quot;, byte2str(data[0x338:0x540]))

99    print(&quot;描述信息：&quot;, byte2str(data[0x540:0xd40]))

100    print(&quot;词库示例：&quot;, byte2str(data[0xd40:startPy]))

101 

102    getPyTable(data[startPy:startChinese])

103    getChinese(data[startChinese:])

104    return getChinese(data[startChinese:])

105 

106if __name__ == &#x27;__main__&#x27;:

107    # scel所在文件夹路径

108    in_path = r&quot;D:\360安全浏览器下载&quot;   #修改为你的词库文件存放文件夹

109    # 输出词典所在文件夹路径

110    out_path = r&quot;D:\360安全浏览器下载\123&quot;  # 转换之后文件存放文件夹

111    fin = [fname for fname in os.listdir(in_path) if fname[-5:] == &quot;.scel&quot;]

112    for f in fin:

113        try:

114            for word in scel2txt(os.path.join(in_path, f)):

115                file_path=(os.path.join(out_path, str(f).split(&#x27;.&#x27;)[0] + &#x27;.txt&#x27;))

116                # 保存结果

117                with open(file_path,&#x27;a+&#x27;,encoding=&#x27;utf-8&#x27;)as file:

118                    file.write(word[2] + &#x27;\n&#x27;)

119            os.remove(os.path.join(in_path, f))

120        except Exception as e:

121            print(e)

122            pass

123

5. 生成词频统计


1
2
3
4
5
6
7
1te = {};

2for w in newtext:

3    if len(w) == 1:

4        continue

5    else:

6        te[w] = te.get(w, 0) + 1

7

6. 排序


1
2
3
1tesort = list(te.items())

2tesort.sort(key=lambda x: x[1], reverse=True)

3

7. 排除语法型词汇，代词、冠词、连词等停用词。

stops

tokens=[token for token in wordsls if token not in stops]


1
2
3
4
5
1with open(r&#x27;C:\Users\Shinelon\Desktop\python 3.22\stops_chinese.txt&#x27;, &#x27;r&#x27;, encoding=&#x27;utf-8&#x27;) as f:

2    stops = f.read().split(&#x27;\n&#x27;)

3

4newtext2 = [text1 for text1 in newtext if text1 not in stops]

5

8. 输出词频最大TOP20，把结果存放到文件里。


1
2
3
4
5
1for i in range(0,20):

2    print(tesort[i])

3

4pd.DataFrame(tesort).to_csv(&#x27;ludingji.csv&#x27;, encoding=&#x27;utf-8&#x27;)

5

9. 生成词云。


1
2
3
4
5
6
7
8
9
10
11
1txt = open(&#x27;ludingji.csv&#x27;,&#x27;r&#x27;,encoding=&#x27;utf-8&#x27;).read()

2ludingjilist = jieba.lcut(txt)

3

4

5wl_spl = &quot;&quot;.join(ludingjilist)

6mywc = WordCloud().generate(wl_spl)

7

8plt.imshow(mywc)

9plt.axis(&quot;off&quot;)

10plt.show()

11

中文词频统计与词云生成

10.代码


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
1import pandas as pd

2from wordcloud import  WordCloud

3import jieba

4import matplotlib.pyplot as plt

5

6# 读取小说

7f = open(r&#x27;C:\Users\Shinelon\Desktop\python 3.22\ludingji.txt&#x27;, &#x27;r&#x27;, encoding=&#x27;utf8&#x27;)

8text = f.read();

9f.close();

10

11# 加入所分析对象的专业词汇

12with open(r&#x27;C:\Users\Shinelon\Desktop\python 3.22\金庸小说.txt&#x27;, &#x27;r&#x27;, encoding=&#x27;utf-8&#x27;) as f:

13    jinyong = f.read().split(&#x27;\n&#x27;)

14jieba.load_userdict(jinyong)

15newtext = jieba.lcut(text)

16

17# 排除语法型词汇，代词、冠词、连词等停用词

18with open(r&#x27;C:\Users\Shinelon\Desktop\python 3.22\stops_chinese.txt&#x27;, &#x27;r&#x27;, encoding=&#x27;utf-8&#x27;) as f:

19    stops = f.read().split(&#x27;\n&#x27;)

20newtext2 = [text1 for text1 in newtext if text1 not in stops]

21

22# 对词语进行出现次数统计

23te = {};

24for w in newtext:

25    if len(w) == 1:

26        continue

27    else:

28        te[w] = te.get(w, 0) + 1

29

30# 次数排序

31tesort = list(te.items())

32tesort.sort(key=lambda x: x[1], reverse=True)

33

34# 输出次数前TOP20的词语

35for i in range(0,20):

36    print(tesort[i])

37

38# 存储结果

39pd.DataFrame(tesort).to_csv(&#x27;ludingji.csv&#x27;, encoding=&#x27;utf-8&#x27;)

40

41# 读取生成词云

42txt = open(&#x27;ludingji.csv&#x27;,&#x27;r&#x27;,encoding=&#x27;utf-8&#x27;).read()

43ludingjilist = jieba.lcut(txt)

44wl_spl = &quot;&quot;.join(ludingjilist)

45mywc = WordCloud().generate(wl_spl)

46

47plt.imshow(mywc)

48plt.axis(&quot;off&quot;)

49plt.show()

50

{{userData.name}}已认证

中文词频统计与词云生成

MySQL，Redis，MongoDB 三种数据库优势

Ubuntu上NFS的安装配置

{{userData.name}}已认证

Related posts:

MySQL，Redis，MongoDB 三种数据库优势

Ubuntu上NFS的安装配置

15个私有云上的 DevOps 开源工具

深入理解 Linux 内核---虚拟文件系统

ELK(ElasticSearch, Logstash, Kibana)搭建实时日志分析平台

mysql性能优化