'''Estimate word-translation probabilities from two line-aligned text files.

The table is initialised uniformly from word co-occurrence and then
re-estimated iteratively (see create_model).
'''

from __future__ import division
from math import log
from itertools import izip, islice
from collections import defaultdict
import operator

def total_prob(french_words, english_words, pair_probs):
    ''' Return the likelihood of a French sentence given an English one.

        The value is

            0.05 / (l + 1) ** m  *  prod_f  sum_e  pair_probs[f][e]

        where l = len(english_words), m = len(french_words), f ranges over
        french_words and e ranges over english_words only (not over every
        English word stored for f).

        french_words  -- iterable (with len) of French words
        english_words -- iterable (with len) of English words
        pair_probs    -- mapping pair_probs[french_word][english_word];
                         missing entries count as probability 0 and are not
                         inserted into the mapping

        Returns a float; 0.0 if some French word has no known translation
        among english_words.
    '''
    alignments_per_word = len(english_words) + 1
    likelihood = 0.05
    for f in french_words:
        # .get() keeps a defaultdict argument from growing empty rows.
        translations = pair_probs.get(f, {})
        sentence_sum = sum(translations.get(e, 0.0) for e in english_words)
        # Divide factor by factor: building (l + 1) ** m as one integer
        # overflows the float division for long sentences.
        likelihood *= sentence_sum / alignments_per_word
    return likelihood

def init_prob_map(english_french_synonyms):
    ''' Construct a mapping of the probabilities for pairs of English and
        French words.

        english_french_synonyms -- iterable of (english_words, french_words)
                                   pairs, one pair per aligned sentence

        Every English word e is given a uniform distribution over the
        distinct French words that share at least one sentence pair with it:
        pair_probs[f][e] = 1 / (number of such French words).

        Returns a nested defaultdict indexed as pair_probs[french][english];
        a missing inner entry reads as 0.0.
    '''
    e_to_f = defaultdict(set)
    for e_word_set, f_word_set in english_french_synonyms:
        for e_word in e_word_set:
            # Union, not intersection: intersecting with the initially empty
            # set would leave every entry empty.
            e_to_f[e_word].update(f_word_set)

    pair_probs = defaultdict(lambda: defaultdict(float))
    for e_word, f_word_set in e_to_f.items():
        if not f_word_set:
            continue
        # 1.0 keeps this a true division even without the __future__ import.
        uniform = 1.0 / len(f_word_set)
        for f_word in f_word_set:
            pair_probs[f_word][e_word] = uniform

    return pair_probs

def create_model(english_filename, french_filename, iterations=20):
    ''' Train word-translation probabilities from two line-aligned files.

        Line i of english_filename is paired with line i of french_filename;
        reading stops at the end of the shorter file. Each line is split on
        whitespace and reduced to its set of distinct tokens.

        Starting from init_prob_map, the table is re-estimated `iterations`
        times. In each round every French word of a sentence pair spreads one
        unit of count over the English words of that same pair, in proportion
        to the current probabilities; the new pair_probs[f][e] is the count
        collected for (f, e) divided by the total count collected for e.

        english_filename -- path of the English text file
        french_filename  -- path of the aligned foreign-language text file
        iterations       -- number of re-estimation rounds (default 20)

        Prints "Creating model" once before training starts.
        Returns the nested mapping pair_probs[french_word][english_word].
    '''
    eng_fr_synonyms = []
    with open(english_filename) as e_file, open(french_filename) as f_file:
        # Builtin zip runs on Python 2 and 3 alike; every pair is kept in
        # memory below anyway.
        for e_line, f_line in zip(e_file, f_file):
            # split() yields tokens; set(line) alone would yield characters.
            eng_fr_synonyms.append((set(e_line.split()), set(f_line.split())))

    pair_probs = init_prob_map(eng_fr_synonyms)

    print("Creating model")

    for _ in range(iterations):
        e_word_total = defaultdict(float)     # expected count per English word
        expected_counts = defaultdict(float)  # expected count per (f, e) pair

        for e_words, f_words in eng_fr_synonyms:
            for f in f_words:
                f_probs = pair_probs.get(f)
                if not f_probs:
                    continue
                # Normalise over the English words of THIS sentence pair.
                sentence_total = sum(f_probs.get(e, 0.0) for e in e_words)
                if not sentence_total:
                    continue
                for e in e_words:
                    prob = f_probs.get(e)
                    if not prob:
                        continue
                    share = prob / sentence_total
                    expected_counts[(f, e)] += share
                    e_word_total[e] += share

        # Every key here received a positive share, so its total is non-zero.
        for f, e in expected_counts:
            pair_probs[f][e] = expected_counts[(f, e)] / e_word_total[e]

    return pair_probs

# Script entry point: trains on the two hard-coded corpus files every time
# this module is executed or imported; the returned model is discarded.
# NOTE(review): the second file name contains ".de", which looks like German
# rather than French -- confirm this is the intended corpus.
create_model('europarl.en.coded.txt', 'europarl.de.coded.txt')