Theory quiz grader using similarity

I have written a fairly simple program that grades theory-based quizzes by comparing the user's answer with a reference answer. I would like feedback and ideas on how it can be improved.

consine_similarity.py

"""This module uses consine distance to check the similarity between two sentences"""

from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import word_tokenize

def consine_similarity(sent1: str, sent2: str) -> float:
    """Consine similarity between two sentences

        sent1: str
        sent2: str
    """

    sent1_list = word_tokenize(sent1)
    sent2_list = word_tokenize(sent2)
    stop_words = stopwords.words('english')
    all_words = list(set(sent1_list + sent2_list))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    for word in sent1_list:
        if word in stop_words:
            continue
        vector1[all_words.index(word)] += 1

    for word in sent2_list:
        if word in stop_words:
            continue
        vector2[all_words.index(word)] += 1

    return 1 - cosine_distance(vector1, vector2)
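
For example (a minimal sketch, assuming the required NLTK data such as punkt and stopwords has already been downloaded), identical answers score 1.0 and answers that share no content words score 0.0:

from consine_similarity import consine_similarity

print(consine_similarity("Logic gates are building blocks",
                         "Logic gates are building blocks"))  # identical answers -> 1.0
print(consine_similarity("Logic gates are building blocks",
                         "A cat sat quietly"))                # no shared words -> 0.0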

quiz.py

"""Theory-based quiz application"""

from dataclasses import dataclass
from random import shuffle
from consine_similarity import consine_similarity

@dataclass
class Quiz:
    """Quiz data"""

    quiz: str
    answer: str

    def __str__(self):
        return '{}'.format(self.quiz)

def start_quiz(quiz_list: list) -> None:
    """Start the quiz application"""

    shuffle(quiz_list)
    consine_list = []

    for quiz in quiz_list:
        print(quiz)
        answer = input('> ')
        consine_list.append(consine_similarity(quiz.answer, answer))
        print()

    grade = sum(consine_list) / len(consine_list) * 100

    print('Your grade is {:.2f}%'.format(grade))

if __name__ == "__main__":
    QUIZ_LIST = [
        Quiz(quiz='What is a computer?',
             answer="An electronic device for storing and processing data, "
                    "typically in binary form, according to instructions given to it"),
        Quiz(quiz='What are logic gates?',
             answer="These are the basic building blocks of any digital system. It is an "
                    "electronic circuit having one or more than one input and only one output"),
        Quiz(quiz='What is BIOS?',
             answer="This is firmware used to perform hardware initialization during the booting "
                    "process and to provide runtime services for operating systems and programs"),
    ]
    start_quiz(QUIZ_LIST)
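
The grade is just the mean of the per-question similarities, scaled to a percentage. For example, with hypothetical per-question scores of 1.0, 0.5 and 0.25:

consine_list = [1.0, 0.5, 0.25]
grade = sum(consine_list) / len(consine_list) * 100  # (1.75 / 3) * 100 ≈ 58.33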

1 Answer

There is no need to include the stop words in the dictionary if they are filtered out of the vectors anyway, so tokenization and stop-word removal can be treated as a single step. Second, the vectors can be initialized with the correct values right away by taking advantage of collections.Counter:

from collections import Counter
from typing import List
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import word_tokenize

def tokenize_without_stop_words(sentence: str) -> List[str]:
    return [
        word
        for word in word_tokenize(sentence)
        if word not in stopwords.words('english')
    ]

def cosine_similarity(first_sentence: str, second_sentence: str) -> float:
    first_words = tokenize_without_stop_words(first_sentence)
    second_words = tokenize_without_stop_words(second_sentence)

    dictionary = list(set(first_words+second_words))

    def encode(words: List[str]) -> List[int]:
        word_counts = Counter(words)
        return [
            word_counts[word]
            for word in dictionary
        ]

    first_vector = encode(first_words)
    second_vector = encode(second_words)

    return 1 - cosine_distance(first_vector, second_vector)
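
For illustration, a minimal usage sketch of the rewritten function (the sentences are made up, and the NLTK punkt and stopwords data are still assumed to be available):

reference = "An electronic device for storing and processing data"
response = "A device that stores and processes data"
print(cosine_similarity(reference, response))  # shared content words like "device" and "data" raise the score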
