1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
from __future__ import division
from math import log
from itertools import izip, islice
from collections import defaultdict
import operator
def total_prob(french_words, english_words, pair_probs):
    '''Return the probability of a French sentence given an English one.

    Computes ``0.05 / (l + 1) ** m * prod_f sum_e pair_probs[f][e]`` where
    ``l = len(english_words)``, ``m = len(french_words)`` and the inner sum
    runs over the English words of *this* sentence only.

    french_words  -- iterable of French tokens (must support ``len``).
    english_words -- iterable of English tokens (must support ``len``).
    pair_probs    -- mapping ``french word -> {english word -> probability}``.

    Words or pairs missing from ``pair_probs`` contribute probability 0 and
    are looked up with ``.get`` so a ``defaultdict`` table is not grown as a
    side effect of scoring.
    '''
    # The 1 / (l + 1) ** m factor is applied one French word at a time.
    # Building the integer (l + 1) ** m first overflows float conversion for
    # long sentences (e.g. 151 ** 150), whereas this form merely underflows
    # towards 0.0.
    alignment_slots = len(english_words) + 1.0
    zIJ = 0.05
    for f in french_words:
        row = pair_probs.get(f)
        if not row:
            # No known translation for f: the whole product is zero.
            return 0.0
        zIJ *= sum(row.get(e, 0.0) for e in english_words) / alignment_slots
    return zIJ
def init_prob_map(english_french_synonyms):
    ''' Construct a mapping of the probabilities for pairs of English and
    French words.

    english_french_synonyms -- iterable of ``(english_words, french_words)``
        pairs, one per aligned sentence pair.

    Returns a nested ``defaultdict`` indexed as ``pair_probs[f_word][e_word]``.
    Each English word spreads its probability uniformly over every French
    word it was ever paired with, so for a fixed English word the values sum
    to 1. Pairs that never co-occurred read as ``0.0``.
    '''
    e_to_f = defaultdict(set)
    for e_word_set, f_word_set in english_french_synonyms:
        for e_word in e_word_set:
            # Accumulate (union) the candidates. Intersecting with the empty
            # default set would leave every candidate set permanently empty.
            e_to_f[e_word].update(f_word_set)

    # Inner default is float so a missing probability is 0.0, not a dict.
    pair_probs = defaultdict(lambda: defaultdict(float))
    for e_word, f_candidates in e_to_f.items():
        if not f_candidates:
            # English word only seen opposite empty French sentences.
            continue
        uniform = 1.0 / len(f_candidates)
        for f_word in f_candidates:
            pair_probs[f_word][e_word] = uniform
    return pair_probs
def create_model(english_filename, french_filename, iterations=20):
    '''Train word-translation probabilities from two line-aligned files.

    english_filename -- path of the English side of the corpus.
    french_filename  -- path of the other side; line N is paired with line N
        of the English file (reading stops at the shorter file).
    iterations       -- number of EM passes over the corpus (default 20).

    Each line is assumed to hold one sentence as whitespace-separated tokens;
    every sentence is reduced to the set of its distinct tokens.

    Prints "Creating model" once, then the negative log-likelihood of the
    corpus after each pass. Returns the nested mapping
    ``pair_probs[f_word][e_word]`` produced by ``init_prob_map`` and refined
    by EM.
    '''
    # izip only exists on Python 2; on Python 3 the builtin zip is lazy.
    try:
        from itertools import izip as lazy_zip
    except ImportError:
        lazy_zip = zip

    eng_fr_synonyms = []
    with open(english_filename) as e_file, open(french_filename) as f_file:
        for e_line, f_line in lazy_zip(e_file, f_file):
            # split() yields tokens; set(line) alone would yield characters.
            # list.append takes one argument, so the pair is stored as a tuple.
            eng_fr_synonyms.append((set(e_line.split()), set(f_line.split())))

    pair_probs = init_prob_map(eng_fr_synonyms)
    print("Creating model")
    for i in range(iterations):
        e_word_total = defaultdict(float)
        c = defaultdict(float)
        log_prob = 0.0
        for e_words, f_words in eng_fr_synonyms:
            # total_prob's signature is (french_words, english_words, ...).
            sentence_prob = total_prob(f_words, e_words, pair_probs)
            if sentence_prob > 0:
                # Zero-probability pairs (e.g. an empty side) are left out of
                # the total rather than crashing on log(0).
                log_prob -= log(sentence_prob)
            for f in f_words:
                row = pair_probs[f]
                # E-step normaliser: only the English words of this sentence
                # compete to explain f.
                row_total_for_f = sum(row.get(e, 0.0) for e in e_words)
                if not row_total_for_f:
                    continue
                for e in e_words:
                    weight = row.get(e, 0.0)
                    if not weight:
                        continue
                    conditional_prob = weight / row_total_for_f
                    c[(f, e)] += conditional_prob
                    e_word_total[e] += conditional_prob
        # M-step: renormalise the expected counts per English word.
        for (f, e), expected_count in c.items():
            pair_probs[f][e] = expected_count / e_word_total[e]
        print("Iteration %d: negative log-likelihood %f" % (i + 1, log_prob))
    return pair_probs
# Script entry: train on the coded Europarl corpus as soon as the module
# runs. The "french" side is fed the `.de.` file here; the result is discarded.
create_model(
    english_filename='europarl.en.coded.txt',
    french_filename='europarl.de.coded.txt',
)
|