NLTK历史悠久的英文分词工具
# Tokenize an English paragraph with NLTK and inspect the resulting Text object.
from nltk.tokenize import word_tokenize
from nltk.text import Text

# Renamed from `input` to avoid shadowing the built-in input().
text = '''
There were a sensitivity and a beauty to her that have nothing to do with looks. She was one to be listened to, whose words were so easy to take to heart.
'''
tokens = word_tokenize(text)
print(tokens[:5])
# Lowercase everything so counting/indexing is case-insensitive.
tokens = [w.lower() for w in tokens]
t = Text(tokens)
# The original left these as bare expressions, which only display in a REPL;
# print them so the script shows the results when run as a file.
print(t.count('beauty'))   # how many times 'beauty' occurs
print(t.index('beauty'))   # position of the first occurrence
t.plot(8)  # frequency plot of the 8 most common tokens (needs matplotlib)
['There', 'were', 'a', 'sensitivity', 'and']

停用词
# List the languages for which NLTK ships a stopword list.
from nltk.corpus import stopwords

# print() added: a bare expression only displays its value in a REPL.
print(stopwords.fileids())
['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish', 'turkish']
# Show the raw English stopword list on a single line (newlines -> spaces).
# print() added: a bare expression only displays its value in a REPL.
print(stopwords.raw('english').replace('\n', ' '))
"i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't "
# Deduplicate the tokens, then drop English stopwords.
# Hoist the stopword list into a set ONCE: the original rebuilt the list and
# scanned it linearly for every token; a set gives O(1) membership tests.
stop_set = set(stopwords.words('english'))
tokens = set(tokens)
filtered = [w for w in tokens if w not in stop_set]
print(filtered)
['nothing', 'sensitivity', ',', 'one', 'beauty', 'words', 'heart', 'looks', 'take', 'whose', '.', 'listened', 'easy']
词性标注
# Part-of-speech tagging of the filtered tokens.
from nltk import pos_tag

# print() added: a bare expression only displays its value in a REPL.
print(pos_tag(filtered))
[('nothing', 'NN'), ('sensitivity', 'NN'), (',', ','), ('one', 'CD'), ('beauty', 'NN'), ('words', 'NNS'), ('heart', 'NN'), ('looks', 'VBZ'), ('take', 'VB'), ('whose', 'WP$'), ('.', '.'), ('listened', 'VBN'), ('easy', 'JJ')]
| POS Tag | 指代 |
|---|---|
| CC | 并列连词 |
| CD | 基数词 |
| DT | 限定符 |
| EX | 存在词 |
| FW | 外来词 |
| IN | 介词或从属连词 |
| JJ | 形容词 |
| JJR | 比较级的形容词 |
| JJS | 最高级的形容词 |
| LS | 列表项标记 |
| MD | 情态动词 |
| NN | 名词单数 |
| NNS | 名词复数 |
| NNP | 专有名词 |
| PDT | 前置限定词 |
| POS | 所有格结尾 |
| PRP | 人称代词 |
| PRP$ | 所有格代词 |
| RB | 副词 |
| RBR | 副词比较级 |
| RBS | 副词最高级 |
| RP | 小品词 |
| UH | 感叹词 |
| VB | 动词原型 |
| VBD | 动词过去式 |
| VBG | 动名词或现在分词 |
| VBN | 动词过去分词 |
| VBP | 非第三人称单数的现在时 |
| VBZ | 第三人称单数的现在时 |
| WDT | 以wh开头的限定词 |
分块
# Chunking: extract noun phrases from a pre-tagged sentence with a regex grammar.
from nltk.chunk import RegexpParser

sentence = [('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'), ('dog', 'NN'), ('died', 'VBD')]
# MY_NP: an optional determiner, any number of adjectives, then a noun.
grammar = "MY_NP: {<DT>?<JJ>*<NN>}"
# BUG FIX: the original called nltk.RegexpParser, but `nltk` itself was never
# imported (only RegexpParser was), so the script raised NameError.
cp = RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
result.draw()  # opens a Tkinter tree window; closing it in Jupyter raises SystemExit: 0
(S (MY_NP the/DT little/JJ yellow/JJ dog/NN) died/VBD)
An exception has occurred, use %tb to see the full traceback.
SystemExit: 0
（注：这是在 Jupyter 中关闭 result.draw() 弹出的 Tkinter 窗口时产生的提示，可以忽略。）
命名实体识别
# Named-entity recognition: tokenize, POS-tag, then chunk into entities.
# (pos_tag and word_tokenize are imported earlier in this file.)
from nltk import ne_chunk

# Renamed from `input` to avoid shadowing the built-in input().
text = "Edison went to Tsinghua University today."
print(ne_chunk(pos_tag(word_tokenize(text))))
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
(S
(PERSON Edison/NNP)
went/VBD
to/TO
(ORGANIZATION Tsinghua/NNP University/NNP)
today/NN
./.)
数据清洗
# Clean a noisy tweet-like string, then tokenize and drop English stopwords.
import re
from nltk.corpus import stopwords

s = ' RT @Amila #Test\nTom\'s newly listed Co & Mary\'s unlisted Group to supply tech for nlTK.\nh $TSLA $AAPL https:// t.co/x34afsfQsh'
s = re.sub(r'&\w*;|@\w*|#\w*', '', s)   # HTML entities, @mentions, #hashtags
s = re.sub(r'\$\w*', '', s)             # cashtags such as $TSLA
s = re.sub(r'https?:\/\/.*\/\w*', '', s)  # URLs
s = re.sub(r'\b\w{1,2}\b', '', s)       # words of 1-2 characters
# BUG FIX: the original replaced whitespace runs with '' which glued the
# neighbouring words together; collapse to a single space and trim instead.
s = re.sub(r'\s\s+', ' ', s).strip()
tokens = word_tokenize(s)
# Build the stopword set once so membership tests are O(1) per token.
stop_set = set(stopwords.words('english'))
tokens = [w for w in tokens if w not in stop_set]
print(' '.join(tokens))
Tom
