###############################################################################################
#
# Script for training a good word and phrase vector model using public corpora, version 1.0.
# The training time will be from several hours to about a day.
#
# Downloads about 8 billion words, makes phrases using two runs of word2phrase, trains
# a 500-dimensional vector model and evaluates it on word and phrase analogy tasks.
#
###############################################################################################
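
# Requirements (editorial note, not part of the original script): a Unix-like system with
# wget, gzip, bzip2, tar, gcc, awk, sed, perl and tr available, plus ample free disk space
# (the uncompressed training text alone is on the order of tens of gigabytes).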

# Convert text to lowercase, pad punctuation with spaces so it becomes separate tokens,
# and strip special characters and digits
normalize_text() {
  awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
  -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
  -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
  -e 's/«/ /g' | tr 0-9 " "
}
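
# Illustrative example (not part of the original script): punctuation becomes a separate
# token and digits are blanked out, e.g.
#   echo 'Hello, World! 123' | normalize_text
# prints roughly:  hello , world !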

mkdir word2vec
cd word2vec

wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz
gzip -d news.2013.en.shuffled.gz
normalize_text < news.2012.en.shuffled > data.txt
normalize_text < news.2013.en.shuffled >> data.txt

wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar -xvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
for i in `ls 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled`; do
  normalize_text < 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/$i >> data.txt
done

wget http://ebiquity.umbc.edu/redirect/to/resource/id/351/UMBC-webbase-corpus -O umbc_webbase_corpus.tar.gz
tar -zxvf umbc_webbase_corpus.tar.gz webbase_all/*.txt
for i in `ls webbase_all`; do
  normalize_text < webbase_all/$i >> data.txt
done

wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
bzip2 -c -d enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e '
# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
# letters (a-z, converted from A-Z) and spaces (never consecutive).
# All other characters are converted to spaces. Only text which normally appears
# in the web browser is displayed. Tables are removed. Image captions are
# preserved. Links are converted to normal text. Digits are spelled out.
# *** Modified to not spell digits or throw away non-ASCII characters ***

# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.

$/=">";                       # input record separator
while (<>) {
  if (/<text /) {$text=1;}    # remove all but between <text> ... </text>
  if (/#redirect/i) {$text=0;}  # remove #REDIRECT
  if ($text) {

    # Remove any text not normally visible
    if (/<\/text>/) {$text=0;}
    s/<.*>//;                 # remove xml tags
    s/&amp;/&/g;              # decode HTML entities (&amp; etc.)
    s/&lt;/</g;
    s/&gt;/>/g;
    s/<ref[^<]*<\/ref>//g;    # remove references <ref...> ... </ref>
    s/<[^>]*>//g;             # remove xhtml tags
    s/\[http:[^] ]*/[/g;      # remove normal url, preserve visible text
    s/\|thumb//ig;            # remove images links, preserve caption
    s/\|left//ig;
    s/\|right//ig;
    s/\|\d+px//ig;
    s/\[\[image:[^\[\]]*\|//ig;
    s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig;  # show categories without markup
    s/\[\[[a-z\-]*:[^\]]*\]\]//g;  # remove links to other languages
    s/\[\[[^\|\]]*\|/[[/g;    # remove wiki url, preserve visible text
    s/{{[^}]*}}//g;           # remove {{icons}} and {tables}
    s/{[^}]*}//g;
    s/\[//g;                  # remove [ and ]
    s/\]//g;
    s/&[^;]*;/ /g;            # remove remaining HTML entities

    $_=" $_ ";
    chop;
    print $_;
  }
}
' | normalize_text | awk '{if (NF>1) print;}' >> data.txt
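
# Optional sanity check (not in the original script): data.txt should now contain
# roughly 8 billion tokens, per the header above.
# wc -w data.txt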

wget http://word2vec.googlecode.com/svn/trunk/word2vec.c
wget http://word2vec.googlecode.com/svn/trunk/word2phrase.c
wget http://word2vec.googlecode.com/svn/trunk/compute-accuracy.c
wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt
wget http://word2vec.googlecode.com/svn/trunk/questions-phrases.txt
gcc word2vec.c -o word2vec -lm -pthread -O3 -march=native -funroll-loops
gcc word2phrase.c -o word2phrase -lm -pthread -O3 -march=native -funroll-loops
gcc compute-accuracy.c -o compute-accuracy -lm -pthread -O3 -march=native -funroll-loops
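# Note (editorial): -O3 -march=native -funroll-loops tune the binaries for the build
# machine; drop -march=native if the binaries will run on a different CPU.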
./word2phrase -train data.txt -output data-phrase.txt -threshold 200 -debug 2
./word2phrase -train data-phrase.txt -output data-phrase2.txt -threshold 100 -debug 2
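# Note (editorial): word2phrase joins frequently co-occurring word pairs with "_"
# (e.g. "new york" -> "new_york") when their score exceeds -threshold; the second,
# lower-threshold pass lets longer phrases such as "new_york_times" form from the
# bigrams produced by the first pass.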
./word2vec -train data-phrase2.txt -output vectors.bin -cbow 1 -size 500 -window 10 -negative 10 -hs 0 -sample 1e-5 -threads 40 -binary 1 -iter 3 -min-count 10
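# Settings used above: continuous bag-of-words (-cbow 1), 500-dimensional vectors,
# a 10-word context window, negative sampling with 10 samples (hierarchical softmax off),
# subsampling threshold 1e-5, binary output, 3 training passes, and tokens seen fewer
# than 10 times discarded. -threads 40 assumes a machine with many cores; lower it to
# match your CPU.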
./compute-accuracy vectors.bin 400000 < questions-words.txt   # should get to almost 78% accuracy on 99.7% of questions
./compute-accuracy vectors.bin 1000000 < questions-phrases.txt  # about 78% accuracy with 77% coverage
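
# Note (editorial): the numeric argument to compute-accuracy caps the vocabulary used for
# evaluation at the N most frequent words; questions outside it are skipped, which is why
# the coverage figures above are below 100%. Optionally (not in the original script),
# distance.c and word-analogy.c from the same trunk can be built the same way to query
# vectors.bin interactively, e.g.:
#   wget http://word2vec.googlecode.com/svn/trunk/distance.c
#   gcc distance.c -o distance -lm -pthread -O3 -march=native -funroll-loops
#   ./distance vectors.bin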