1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
import csv, re, string
from nltk.corpus import wordnet as wn
def normalize(textin):
    """Return *textin* converted to lower case.

    The vocabulary tables in this script are keyed by lower-cased tokens,
    so user input goes through this function before it is tokenized.
    """
    return textin.lower()
# Function words and other very common terms. They are deleted from the
# word -> question lookup tables built later in this script, so they never
# add to any question's score.
connectors = [
    'any', 'anyone', 'as', 'by', 'if', 'make',
    'may', 'onto', 'particular', 'possible', 'such', 'that',
    'there', 'this', 'under', 'up', 'we', 'with',
    'would', 'your', 'about', 'around', 'have', 'it',
    'who', 'why', 'in', 'or', 'out', 'where',
    'does', 'on', 'and', 'are', 'can', 'get',
    'of', 'when', 'to', 'for', 'my', 'what',
    'is', 'the', 'a', 'how', 'do', 'i',
    'offered',
]
# Load the knowledge base as a list of (row[0], row[4]) pairs, one per usable
# CSV row. row[0] is treated as the question text further down; row[4] is
# printed right after it (presumably the answer -- confirm against the CSV).
# Rows with fewer than five columns or an empty fifth column are skipped.
# The ``with`` block closes the file as soon as every row has been read
# instead of leaving the handle open for the life of the process.
with open('FAQ_samples.csv') as faq_file:
    csvfile = csv.reader(faq_file, dialect='excel')
    kb = [(row[0], row[4]) for row in csvfile if len(row) >= 5 and row[4]]
# Build an inverted index: lower-cased token -> list of indices (positions in
# ``kb``) of the questions that contain that token.
vocab = {}
for index, question in enumerate(x for x, y in kb):
    tokens = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", question)
    # set(): a token repeated inside one question is recorded only once.
    # str.lower() is used instead of string.lower(), which only exists on
    # Python 2.
    for word in set(token.lower() for token in tokens):
        vocab.setdefault(word, []).append(index)

# Connector words carry no topical meaning, so drop them from the index.
# pop() with a default avoids deleting from the dict while iterating over it.
for word in connectors:
    vocab.pop(word, None)

# Snapshot of the remaining vocabulary, used to build the synonym table.
vocabwords = list(vocab)
# Build the synonym index: every WordNet lemma name that shares a synset with
# a vocabulary word maps to the indices of the questions containing that word.
# Indices are accumulated in sets (one entry per question) and converted to
# lists at the end, instead of rebuilding a list from a set on every lemma.
synvocab = {}
for word in vocabwords:
    for synset in wn.synsets(word):
        lemma_names = synset.lemma_names
        if callable(lemma_names):
            # NLTK 3 turned Synset.lemma_names into a method; older releases
            # exposed it as a plain list attribute. Support both.
            lemma_names = lemma_names()
        for lemma_name in lemma_names:
            synvocab.setdefault(lemma_name, set()).update(vocab[word])

# Iterate over a snapshot of the keys so entries can be deleted safely.
vocabwords = list(synvocab)
for word in vocabwords:
    if word in connectors:
        # A synonym that is itself a connector word would match almost any
        # input, so it is removed.
        del synvocab[word]
    else:
        synvocab[word] = sorted(synvocab[word])
# Interactive loop: read a question, score every FAQ entry against it and
# print the best match.

# raw_input() exists only on Python 2; Python 3 renamed it to input().
try:
    read_line = raw_input
except NameError:
    read_line = input

# Weight of a synonym hit relative to an exact word hit (which counts 1.0).
syn_factor = 0.4

while True:
    try:
        textin = normalize(read_line("->"))
    except EOFError:
        # End of input (Ctrl-D or an exhausted pipe): stop instead of
        # dying with a traceback.
        break
    words = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", textin)

    # scores: question index -> accumulated evidence for that question.
    # max_score: total weight of all recognised input words.
    scores = {}
    max_score = 0.0
    for word in words:
        if word in vocab:
            # A word found in fewer questions is worth more per question.
            weight = 1.0 / float(len(vocab[word]))
            for question_index in vocab[word]:
                scores[question_index] = scores.get(question_index, 0.0) + weight
            # NOTE(review): the original indentation was lost; max_score is
            # assumed to grow once per matched word, not once per question.
            max_score += weight
        if word in synvocab:
            weight = syn_factor / float(len(synvocab[word]))
            for question_index in synvocab[word]:
                scores[question_index] = scores.get(question_index, 0.0) + weight
            max_score += weight

    if not scores:
        # No input word is known at all. The original indexed into an empty
        # list here and crashed with IndexError.
        print("Sorry, I didn't understand. ")
        print("Try asking your question a different way. ")
        continue

    # Highest score wins; ties go to the larger question index, exactly as
    # taking the last element of the sorted (score, index) pairs did.
    score, index = max((v, k) for k, v in scores.items())
    if max_score < 0.5:
        # NOTE(review): the original indentation was lost; all three messages
        # are assumed to belong to this low-confidence branch -- confirm.
        print("Sorry, I didn't understand. ")
        print("Try asking your question a different way. ")
        print("it seems like you might be interested in the answer to this question: ")
    print(kb[index][0])
    print(kb[index][1])
|