# NOTE: web-page residue (a "Report abuse" link and a 1-138 line-number gutter) was removed from the top of this file.
# This program analyzes a question/answer set to build a knowledge base and
# parsing routine for a conversational interface.  In order to experiment with
# this, FOLLOW THESE INSTRUCTIONS:
#
# Create an excel spreadsheet CSV file with questions in column A and answers
# in column E.  Name it FAQ_samples.csv.
#
# Download the NLTK Natural Language Tool Kit.
#
# If you don't want to do all that, then just look at the results program,
# which incorporates MY Questions and Answers, and the NLTK synonyms into it,
# hard-coded.  It's a lot easier to play with than this is.
#
# Results program is located at:  http://www.pastie.org/574701
#
# Copyright 2009, Jerry Felix.
#
# Use Freely for non-commercial use,
# but please send me an email at amyiris@amyiris.com so I can
# keep you up-to-date on developments!


import  csv, re, string
from nltk.corpus import wordnet as wn

# TO DO:  Improve normalization, by eliminating apostrophes, punctuation, etc.
def normalize(textin):
    """Return *textin* converted to lower case.

    This is the only normalization applied at present.
    """
    lowered = textin.lower()
    return lowered


# Filler words that carry almost no weight in the scoring.  They are removed
#     from both the vocabulary and the synonym tables further down.  Extend
#     or trim this list to suit the filler words of your own knowledge base.

connectors = [
    "any", "anyone", "as", "by", "if", "make", "may",
    "onto", "particular", "possible", "such", "that",
    "there", "this", "under", "up", "we", "with",
    "would", "your", "about", "around", "have", "it",
    "who", "why", "in", "or", "out", "where", "does",
    "on", "and", "are", "can", "get", "of", "when", "to",
    "for", "my", "what", "is", "the", "a", "how", "do", "i",
    "offered",
]

# My csv file has questions in column A and answers in column E

# Load the knowledge base: a list of (question, answer) pairs taken from
# column A (index 0) and column E (index 4) of each CSV row.  Rows with fewer
# than five columns, or with an empty answer cell, are skipped.
# The `with` block closes the file as soon as the rows have been read, so the
# handle is not left open for the whole interactive session.
with open('FAQ_samples.csv') as faq_file:
    csvfile = csv.reader(faq_file, dialect='excel')
    kb = [(row[0], row[4]) for row in csvfile if len(row) >= 5 and row[4]]

# kb is the Knowledge Base - a list of (questions, answers)


# Build the inverted index: word -> list of the kb indexes of the questions
# that contain that word.
vocab = {}

for index, (question, _answer) in enumerate(kb):
    # Run each question through normalize(), the same function applied to the
    # user's input in the conversational loop, so both sides of the match are
    # always prepared the same way.
    tokens = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", normalize(question))

    # set() ensures a question index is recorded once per word even when the
    # word is repeated within the question.
    for word in set(tokens):
        vocab.setdefault(word, []).append(index)

# vocab is a dict whose key is a word, and value is a list of question numbers
#    that the word appears in.  It ended up being about 200 words for my 55
#    questions.


# Below, I delete any of the connector words.  I'm sure this can be done in
#    one line of Python code, not 4.... I leave that up to you; teach me!

# Remove the connector words from the index.  Looping over `connectors`
# (not over the dict's own keys) means the dict is never resized while it is
# being iterated, and pop() with a default silently skips connectors that
# never appeared in any question.
for word in connectors:
    vocab.pop(word, None)


# Now I'm going to create the synonyms based on NLTK's synsets

# Build the synonym index: WordNet lemma name -> list of the kb indexes of
# every question containing a vocabulary word that shares a synset with it.
vocabwords = list(vocab)
lemma_questions = {}   # lemma name -> set of question indexes (deduplicated)

for word in vocabwords:
    question_indexes = vocab[word]
    for synset in wn.synsets(word):
        # lemma_names is a plain attribute in older NLTK releases and a
        # method in newer ones; accept either form.
        lemma_names = synset.lemma_names
        if callable(lemma_names):
            lemma_names = lemma_names()
        for lemma_name in lemma_names:
            lemma_questions.setdefault(lemma_name, set()).update(question_indexes)

# The scoring loop only needs len() and iteration, so store plain lists.
synvocab = dict((lemma_name, list(indexes))
                for lemma_name, indexes in lemma_questions.items())

# synvocab ended up being a dict of about 1500 synonyms and related words.
# I delete the connector words out of the synonyms too.

# Remove the connector words from the synonym index as well.  Looping over
# `connectors` (not over the dict's own keys) means the dict is never resized
# while it is being iterated; pop() with a default skips absent words.
for word in connectors:
    synvocab.pop(word, None)


# Here's my Conversational Interface

# Pick whichever line-reading builtin this interpreter provides.
try:
    read_line = raw_input       # Python 2
except NameError:
    read_line = input           # Python 3 (raw_input was renamed to input)

# Loop invariants, hoisted out of the loop body.
word_pattern = re.compile(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*")
syn_factor = 0.4                # How much credit do synonyms get, vs. exact matches?
understanding_threshold = 0.5   # below this, the reply is prefixed with an apology

while True:

    # End the session cleanly on end-of-input or Ctrl-C instead of dying
    # with a traceback.
    try:
        textin = normalize(read_line("->"))
    except (EOFError, KeyboardInterrupt):
        print("")
        break

    words = word_pattern.findall(textin)

    scores = {}       # a dictionary of scores indexed by question number
    max_score = 0.0   # for the user's input, what's the maximum score possible?

    for word in words:
        # Exact matches earn full credit and synonym matches earn syn_factor.
        # Either way the credit is divided by the number of questions the
        # word points at, so words found in few questions weigh more.
        for table, credit in ((vocab, 1.0), (synvocab, syn_factor)):
            hits = table.get(word)
            if not hits:
                continue
            weight = credit / float(len(hits))
            for question_index in hits:
                scores[question_index] = scores.get(question_index, 0.0) + weight
            max_score += weight

    # None of the words matched anything: there is no best question to
    # offer, so apologise and prompt again rather than indexing an empty list.
    if not scores:
        print("Sorry, I didn't understand.   ")
        print("Try asking your question a different way.  ")
        continue

    # Highest score wins; on equal scores the larger question index wins
    # because the tuples compare on (score, index).
    score, index = max((v, k) for k, v in scores.items())

    # NOTE(review): the threshold is applied to max_score (the total credit
    # attainable for this input), not to the winning question's own score.
    # Kept as found; confirm this is the intended measure.
    if max_score < understanding_threshold:
        print("Sorry, I didn't understand.   ")
        print("Try asking your question a different way.  ")
        print("it seems like you might be interested in the answer to this question: ")
        print(kb[index][0])

    print(kb[index][1])