Commit ee2fcbd
Author: tmikolov
Parent: f23305f

added script for training big word vector model using public corpora

git-svn-id: http://word2vec.googlecode.com/svn/trunk@40 c84ef02e-58a5-4c83-e53e-41fc32d635eb

File tree: 1 file changed (+99, -0)

demo-train-big-model-v1.sh

@@ -0,0 +1,99 @@
###############################################################################################
#
# Script for training a good word and phrase vector model using public corpora, version 1.0.
# The training time will be from several hours to about a day.
#
# Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains
# a 500-dimensional vector model and evaluates it on word and phrase analogy tasks.
#
###############################################################################################

# This function converts text to lowercase, separates punctuation into individual tokens,
# removes special characters, and maps digits to spaces
normalize_text() {
  awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
  -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
  -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
  -e 's/«/ /g' | tr 0-9 " "
}
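
# Illustrative example of what normalize_text produces (not part of the pipeline; the
# exact spacing is approximate, since several rules insert extra spaces and digits are
# mapped to blanks):
#   echo "Hello, World! It's 2014." | normalize_text
#   -> hello , world !  it ' s      .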

mkdir word2vec
cd word2vec
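
# The downloads plus the combined data.txt built below add up to tens of gigabytes;
# optional check that enough free disk space is available:
#   df -h .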

wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz
gzip -d news.2013.en.shuffled.gz
normalize_text < news.2012.en.shuffled > data.txt
normalize_text < news.2013.en.shuffled >> data.txt
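
# The news-crawl archives are multi-gigabyte downloads; if a transfer is interrupted,
# re-running wget with -c resumes the partial file, e.g.:
#   wget -c http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz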

wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do
  normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt
done
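
# Equivalent glob-based loop (shown commented out since the loop above already appends
# these files); it avoids parsing `ls` output:
#   for f in 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/*; do
#     normalize_text < "$f" >> data.txt
#   done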

wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus
tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt
for i in `ls webbase_all`; do
  normalize_text < webbase_all/$i >> data.txt
done
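
# The UMBC link above is a redirect; if the archive does not get saved as
# umbc_webbase_corpus.tar.gz, pinning the output file name is a simple fallback:
#   wget -O umbc_webbase_corpus.tar.gz http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus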

wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e '
# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
# letters (a-z, converted from A-Z), and spaces (never consecutive).
# All other characters are converted to spaces. Only text which normally appears
# in the web browser is displayed. Tables are removed. Image captions are
# preserved. Links are converted to normal text. Digits are spelled out.
# *** Modified to not spell digits or throw away non-ASCII characters ***

# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.

$/=">";                        # input record separator
while (<>) {
  if (/<text /) {$text=1;}     # keep only text between <text> ... </text>
  if (/#redirect/i) {$text=0;} # drop #REDIRECT pages
  if ($text) {

    # Remove any text not normally visible
    if (/<\/text>/) {$text=0;}
    s/<.*>//;                  # remove xml tags
    s/&amp;/&/g;               # decode URL encoded chars
    s/&lt;/</g;
    s/&gt;/>/g;
    s/<ref[^<]*<\/ref>//g;     # remove references <ref...> ... </ref>
    s/<[^>]*>//g;              # remove xhtml tags
    s/\[http:[^] ]*/[/g;       # remove normal url, preserve visible text
    s/\|thumb//ig;             # remove image links, preserve caption
    s/\|left//ig;
    s/\|right//ig;
    s/\|\d+px//ig;
    s/\[\[image:[^\[\]]*\|//ig;
    s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup
    s/\[\[[a-z\-]*:[^\]]*\]\]//g;                # remove links to other languages
    s/\[\[[^\|\]]*\|/[[/g;     # remove wiki url, preserve visible text
    s/{{[^}]*}}//g;            # remove {{icons}} and {tables}
    s/{[^}]*}//g;
    s/\[//g;                   # remove [ and ]
    s/\]//g;
    s/&[^;]*;/ /g;             # remove URL encoded chars

    $_=" $_ ";
    chop;
    print $_;
  }
}
' | normalize_text | awk '{if (NF>1) print;}' >> data.txt
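
# At this point all corpora have been concatenated into data.txt; per the header comment
# this should amount to roughly 8 billion words. Optional (slow) sanity check:
#   wc -w data.txt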

wget http://word2vec.googlecode.com/svn/trunk/word2vec.c
wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c
wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c
wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt
wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt   # needed for the phrase analogy test below
gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops
gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops
gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops
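
# If the local gcc does not accept -march=native (very old releases), dropping that flag
# still builds a working, if somewhat slower, binary, e.g.:
#   gcc word2vec.c -o word2vec -lm -pthread -O3 -funroll-loops
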
./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2
./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2
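
# The two word2phrase passes join frequent collocations with "_" (the first pass forms
# two-word phrases, the lower-threshold second pass extends them), so the training data
# ends up containing tokens such as new_york or new_york_times (illustrative examples).
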
./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10
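
# Key settings above: CBOW architecture (-cbow 1), 500-dimensional vectors (-size 500),
# context window of 10, negative sampling with 10 samples (-negative 10, -hs 0),
# subsampling of frequent words at 1e-5, 3 training epochs (-iter 3), and words seen
# fewer than 10 times discarded (-min-count 10). -threads 40 assumes a 40-core machine;
# adjust it to the number of available CPU cores.
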
./compute-accuracy vectors.bin 400000 < questions-words.txt     # should get to almost 78% accuracy on 99.7% of questions
./compute-accuracy vectors.bin 1000000 < questions-phrases.txt  # about 78% accuracy with 77% coverage
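
# The numeric argument to compute-accuracy restricts evaluation to the N most frequent
# words in the model (400K above, 1M here), which speeds up the test; coverage is the
# share of questions whose words all fall within that restricted vocabulary.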
