# Report abuse   (stray text from the paste-site page; not part of the program)

# This program analyzes a question/answer set to build a knowledge base and
# parsing routine for a conversational interface.  In order to experiment with
# this, FOLLOW THESE INSTRUCTIONS:
#
# Create an excel spreadsheet CSV file with questions in column A and answers
# in column E.  Name it FAQ_samples.csv.
#
# Download the NLTK Natural Language Tool Kit.
#
# If you don't want to do all that, then just look at the results program,
# which incorporates MY Questions and Answers, and the NLTK synonyms into it,
# hard-coded.  It's a lot easier to play with than this is.
#
# Results program is located at:  http://www.pastie.org/574701
#
# Copyright 2009, Jerry Felix.
#
# Use Freely for non-commercial use,
# but please send me an email at amyiris@amyiris.com so I can
# keep you up-to-date on developments!


import  csv, re, string
from nltk.corpus import wordnet as wn

# TO DO:  Improve normalization, by eliminating apostrophes, punctuation, etc.
def normalize(textin):
    """Return *textin* converted to lower case.

    Lower-casing is the only normalization performed; punctuation and
    apostrophes pass through unchanged.
    """
    lowered = textin.lower()
    return lowered


# These words are fairly meaningless to the scoring; feel free to expand or
#     modify, so that it contains the meaningless words in your knowledge base

# Stop words: one whitespace-separated string split into a list, so the
# result is the same list of words in the same order.
connectors = (
    "any anyone as by if make may "
    "onto particular possible such that "
    "there this under up we with "
    "would your about around have it "
    "who why in or out where does "
    "on and are can get of when to "
    "for my what is the a how do i "
    "offered"
).split()

# My csv file has questions in column A and answers in column E

# Load the knowledge base from FAQ_samples.csv.  The file is opened in a
# `with` block so the handle is closed as soon as the rows have been read
# (the original left it open for the life of the program).
# NOTE(review): on Python 2 the csv docs recommend opening the file in
# binary mode ('rb'); the original default mode is kept here -- confirm.
kb = []
with open('FAQ_samples.csv') as faq_file:
    csvfile = csv.reader(faq_file, dialect='excel')
    for row in csvfile:
        # Column A (index 0) holds the question and column E (index 4) the
        # answer; rows that are too short or have an empty answer are skipped.
        if len(row) >= 5 and row[4]:
            kb.append((row[0], row[4]))

# kb is the Knowledge Base - a list of (questions, answers)


# Build the inverted index: each distinct token of a question maps to the
# list of indices (positions in kb) of the questions that contain it.
vocab = {}

for index, (question, _answer) in enumerate(kb):
    # Lower-case through normalize(), exactly as the conversational loop does
    # for user input, so both sides are always normalized the same way.  This
    # also avoids the deprecated string.lower() function.
    tokens = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", normalize(question))

    # set() makes a word count once per question, however often it is
    # repeated in that question.
    for word in set(tokens):
        vocab.setdefault(word, []).append(index)

# vocab is a dict whose key is a word, and value is a list of question numbers
#    that the word appears in.  It ended up being about 200 words for my 55
#    questions.


# Below, I delete any of the connector words from vocab.  Walking the
#    (short) connectors list and popping each word avoids deleting from the
#    dict while iterating over its own keys, and replaces a list scan per
#    vocabulary word with one dict lookup per connector.

vocabwords = list(vocab)   # snapshot of the words before removal; the name
                           # is kept bound as the original code bound it
for word in connectors:
    vocab.pop(word, None)  # the None default makes a missing word a no-op


# Now I'm going to create the synonyms based on NLTK's synsets

# Build synvocab: every WordNet lemma name found in any synset of a
# vocabulary word maps to the list of question indices of the vocabulary
# word(s) it is related to.
vocabwords = list(vocab)
synvocab = {}

for word in vocabwords:
    for synset in wn.synsets(word):
        # lemma_names is a plain attribute in old NLTK releases and a method
        # in NLTK 3; accept either form.
        lemma_names = synset.lemma_names
        if callable(lemma_names):
            lemma_names = lemma_names()
        for lemma_name in lemma_names:
            # Collect into a set so an index shared by several vocabulary
            # words is stored once, without rebuilding a list on every step.
            synvocab.setdefault(lemma_name, set()).update(vocab[word])

# Later code takes len() of, and iterates over, each value; turn the sets
# into lists (sorted only to make the order deterministic).
for lemma_name in synvocab:
    synvocab[lemma_name] = sorted(synvocab[lemma_name])

# synvocab ended up being a dict of about 1500 synonyms and related words.
# I delete the connector words out of the synonyms too.

# Drop the connector words from the synonym table.  Popping each connector
# avoids deleting from the dict while iterating over its own keys, and
# replaces a list scan per synonym with one dict lookup per connector.
vocabwords = list(synvocab)   # snapshot of the synonyms before removal; the
                              # name is kept bound as the original bound it
for word in connectors:
    synvocab.pop(word, None)  # the None default makes a missing word a no-op


# Here's my Conversational Interface

# Interactive loop: read a line, score every knowledge-base question against
# the words in it, and print the answer of the best-scoring question.

# The tokenizer pattern and the synonym weight never change between
# questions, so they are set up once, outside the loop.
tokenize = re.compile(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*").findall
syn_factor = 0.4   # How much credit do synonyms get, vs. exact matches?

# raw_input() is the Python 2 spelling; fall back to input() where it is gone.
try:
    read_line = raw_input
except NameError:
    read_line = input

while True:

    try:
        textin = normalize(read_line("->"))
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C: leave the loop quietly instead of dying
        # with a traceback.
        break

    words = tokenize(textin)

    scores = {}      # a dictionary of scores indexed by question number
    max_score = 0.0  # for the user's input, what's the maximum score possible?

    for word in words:
        # Exact matches carry full weight, synonyms carry syn_factor.  A
        # word's weight is split evenly over all the questions it points to,
        # so rarer words count for more.
        for table, weight in ((vocab, 1.0), (synvocab, syn_factor)):
            if word in table:
                share = weight / float(len(table[word]))
                for question_index in table[word]:
                    scores[question_index] = scores.get(question_index, 0.0) + share
                max_score += share

    if not scores:
        # No input word is known at all, so there is no candidate question to
        # suggest.  (The original crashed here with IndexError.)
        print("Sorry, I didn't understand.   ")
        print("Try asking your question a different way.  ")
        continue

    # Highest score wins; ties go to the larger question index, exactly as
    # taking the last element of the sorted (score, index) pairs did.
    score, index = max((v, k) for k, v in scores.items())

    # NOTE(review): the threshold tests max_score (the best score attainable
    # for this input), not the winning score -- confirm that is intended.
    if max_score < 0.5:              # this is my threshold of understanding
        print("Sorry, I didn't understand.   ")
        print("Try asking your question a different way.  ")
        print("it seems like you might be interested in the answer to this question: ")

        print(kb[index][0])

    print(kb[index][1])