Skip to content

Commit 766de36

Browse files
committed
Moved over from lingo, and tightened up the API surface
1 parent 620dfa6 commit 766de36

13 files changed

+4569
-0
lines changed

consopt.go

+152
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
package corpus
2+
3+
import (
4+
"log"
5+
"sort"
6+
"sync/atomic"
7+
"unicode/utf8"
8+
9+
"github.com/pkg/errors"
10+
"github.com/xtgo/set"
11+
)
12+
13+
// ConsOpt is a construction option for manual creation of a Corpus
14+
// A ConsOpt receives the Corpus under construction, mutates it in place and
// returns a non-nil error when the option cannot be applied.
type ConsOpt func(c *Corpus) error
15+
16+
// WithWords creates a corpus from a word list. It may have repeated words
17+
func WithWords(a []string) ConsOpt {
18+
f := func(c *Corpus) error {
19+
s := set.Strings(a)
20+
c.words = s
21+
c.frequencies = make([]int, len(s))
22+
23+
ids := make(map[string]int)
24+
maxID := len(s)
25+
26+
var totalFreq, maxWL int
27+
// NOTE: here we're iterating over the set of words
28+
for i, w := range s {
29+
runeCount := utf8.RuneCountInString(w)
30+
if runeCount > c.maxWordLength {
31+
maxWL = runeCount
32+
}
33+
34+
ids[w] = i
35+
}
36+
37+
// NOTE: here we're iterating over the original word list.
38+
for _, w := range a {
39+
c.frequencies[ids[w]]++
40+
totalFreq++
41+
}
42+
43+
c.ids = ids
44+
atomic.AddInt64(&c.maxid, int64(maxID))
45+
c.totalFreq = totalFreq
46+
c.maxWordLength = maxWL
47+
return nil
48+
}
49+
return f
50+
}
51+
52+
// WithOrderedWords creates a Corpus with the given word order
53+
func WithOrderedWords(a []string) ConsOpt {
54+
f := func(c *Corpus) error {
55+
s := a
56+
c.words = s
57+
c.frequencies = make([]int, len(s))
58+
for i := range c.frequencies {
59+
c.frequencies[i] = 1
60+
}
61+
62+
ids := make(map[string]int)
63+
maxID := len(s)
64+
totalFreq := len(s)
65+
var maxWL int
66+
for i, w := range a {
67+
runeCount := utf8.RuneCountInString(w)
68+
if runeCount > c.maxWordLength {
69+
maxWL = runeCount
70+
}
71+
ids[w] = i
72+
}
73+
74+
c.ids = ids
75+
atomic.AddInt64(&c.maxid, int64(maxID))
76+
c.totalFreq = totalFreq
77+
c.maxWordLength = maxWL
78+
return nil
79+
}
80+
return f
81+
}
82+
83+
// WithSize preallocates all the things in Corpus
84+
func WithSize(size int) ConsOpt {
85+
return func(c *Corpus) error {
86+
c.words = make([]string, 0, size)
87+
c.frequencies = make([]int, 0, size)
88+
return nil
89+
}
90+
}
91+
92+
// FromDict is a construction option to take a map[string]int where the int represents the word ID.
93+
// This is useful for constructing corpuses from foreign sources where the ID mappings are important
94+
func FromDict(d map[string]int) ConsOpt {
95+
return func(c *Corpus) error {
96+
var a sortutil
97+
for k, v := range d {
98+
a.words = append(a.words, k)
99+
a.ids = append(a.ids, v)
100+
}
101+
sort.Sort(&a)
102+
c.ids = make(map[string]int)
103+
for i, w := range a.words {
104+
if i != a.ids[i] {
105+
return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, a.ids[i])
106+
}
107+
c.words = append(c.words, w)
108+
c.frequencies = append(c.frequencies, 1)
109+
c.ids[w] = i
110+
111+
c.totalFreq++
112+
runeCount := utf8.RuneCountInString(w)
113+
if runeCount > c.maxWordLength {
114+
log.Printf("FD MaxWordLength %d - %q", runeCount, w)
115+
c.maxWordLength = runeCount
116+
}
117+
}
118+
c.maxid = int64(len(a.words))
119+
return nil
120+
}
121+
122+
}
123+
124+
// FromDictWithFreq is like FromDict, but also has a frequency.
125+
func FromDictWithFreq(d map[string]struct{ ID, Freq int }) ConsOpt {
126+
return func(c *Corpus) error {
127+
var a sortutil
128+
for k, v := range d {
129+
a.words = append(a.words, k)
130+
a.ids = append(a.ids, v.ID)
131+
a.freqs = append(a.freqs, v.Freq)
132+
}
133+
sort.Sort(&a)
134+
c.ids = make(map[string]int)
135+
for i, w := range a.words {
136+
if i != a.ids[i] {
137+
return errors.Errorf("Unmarshaling error. Expected %dth ID to be %d. Got %d instead. Perhaps something went wrong during sorting? SLYTHERIN IT IS!", i, i, a.ids[i])
138+
}
139+
c.words = append(c.words, w)
140+
c.frequencies = append(c.frequencies, a.freqs[i])
141+
c.ids[w] = i
142+
143+
c.totalFreq += a.freqs[i]
144+
runeCount := utf8.RuneCountInString(w)
145+
if runeCount > c.maxWordLength {
146+
c.maxWordLength = runeCount
147+
}
148+
}
149+
c.maxid = int64(len(a.words))
150+
return nil
151+
}
152+
}

consts.go

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package corpus
2+
3+
// NumberWords maps English number words to their integer values: "zero"
// through "twenty", the tens up to "hundred", and the powers of a thousand up
// to "quadrillion". The entries from "quintillion" upward are left commented
// out.
//
// The table was generated with this Python code:
/*
	numberWords = {}

	simple = '''zero one two three four five six seven eight nine ten eleven twelve
	        thirteen fourteen fifteen sixteen seventeen eighteen nineteen
	        twenty'''.split()
	for i, word in zip(xrange(0, 20+1), simple):
	    numberWords[word] = i

	tense = '''thirty forty fifty sixty seventy eighty ninety hundred'''.split()
	for i, word in zip(xrange(30, 100+1, 10), tense):
	    numberWords[word] = i

	larges = '''thousand million billion trillion quadrillion quintillion sextillion septillion'''.split()
	for i, word in zip(xrange(3, 24+1, 3), larges):
	    numberWords[word] = 10**i
*/
var NumberWords = map[string]int{
	// zero through twenty
	"zero":      0,
	"one":       1,
	"two":       2,
	"three":     3,
	"four":      4,
	"five":      5,
	"six":       6,
	"seven":     7,
	"eight":     8,
	"nine":      9,
	"ten":       10,
	"eleven":    11,
	"twelve":    12,
	"thirteen":  13,
	"fourteen":  14,
	"fifteen":   15,
	"sixteen":   16,
	"seventeen": 17,
	"eighteen":  18,
	"nineteen":  19,
	"twenty":    20,

	// tens
	"thirty":  30,
	"forty":   40,
	"fifty":   50,
	"sixty":   60,
	"seventy": 70,
	"eighty":  80,
	"ninety":  90,
	"hundred": 100,

	// powers of a thousand
	"thousand":    1e3,
	"million":     1e6,
	"billion":     1e9,
	"trillion":    1e12,
	"quadrillion": 1e15,
	// "quintillion": 1e18,
	// "sextillion":  1e21,
	// "septillion":  1e24,
}

0 commit comments

Comments
 (0)