implements both tokenization and occurrence

Solutions on MaxInterview for implements both tokenization and occurrence by the best coders in the world

showing results for - "implements both tokenization and occurrence"
Cyrielle
23 Apr 2020
# CountVectorizer implements both tokenization and occurrence counting
# in a single class. This walkthrough is written REPL-style: a bare
# expression is followed by a comment showing the value it evaluates to.

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer
# CountVectorizer()

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Learn the vocabulary from the corpus and build the document-term matrix
# (one row per document, one column per vocabulary term).
X = vectorizer.fit_transform(corpus)
X
# <4x9 sparse matrix of type '<... 'numpy.int64'>'
#     with 19 stored elements in Compressed Sparse ... format>

# The analyzer lowercases and tokenizes; with the default token pattern
# single-character tokens such as "a" are dropped.
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (
    ['this', 'is', 'text', 'document', 'to', 'analyze'])
# True

# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() (available since 1.0) returns an ndarray, so it
# is converted to a list to keep the comparison a single boolean.
list(vectorizer.get_feature_names_out()) == (
    ['and', 'document', 'first', 'is', 'one',
     'second', 'the', 'third', 'this'])
# True

X.toarray()
# array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
#        [0, 1, 0, 1, 0, 2, 1, 0, 1],
#        [1, 0, 0, 0, 1, 0, 1, 1, 0],
#        [0, 1, 1, 1, 0, 0, 1, 0, 1]]...)

# vocabulary_ maps each term to its column index in the matrix.
vectorizer.vocabulary_.get('document')
# 1

# Words that were not seen while fitting are ignored by transform().
vectorizer.transform(['Something completely new.']).toarray()
# array([[0, 0, 0, 0, 0, 0, 0, 0, 0]]...)

# Extract unigrams and bigrams. The pattern r'\b\w+\b' (note the closing
# word boundary) also keeps single-character tokens.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                    token_pattern=r'\b\w+\b', min_df=1)
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!') == (
    ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool'])
# True

X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2
# array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
#        [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
#        [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
#        [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]]...)

# The bigram "is this" occurs only in the last document (the question form).
feature_index = bigram_vectorizer.vocabulary_.get('is this')
X_2[:, feature_index]
# array([0, 0, 0, 1]...)