Я написал довольно простую программу, которая оценивает ответы на теоретические тесты. Она устроена довольно прямолинейно. Буду рад замечаниям и идеям о том, как её можно улучшить.
consine_similarity.py
"""This module uses consine distance to check the similarity between two sentences"""
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import word_tokenize
def consine_similarity(sent1: str, sent2: str) -> float:
    """Return the cosine similarity between two sentences.

    Both sentences are tokenized with NLTK's ``word_tokenize``, English stop
    words are dropped, and the remaining tokens are counted into bag-of-words
    vectors over the combined vocabulary of the two sentences.

    Args:
        sent1: First sentence.
        sent2: Second sentence.

    Returns:
        ``1 - cosine_distance(vector1, vector2)`` for the two count vectors,
        or ``0.0`` when either sentence has no tokens left after stop-word
        removal.
    """
    # Function-scope import: this module's import block does not provide it.
    from collections import Counter

    # A set gives constant-time membership tests instead of a list scan
    # for every token.
    stop_words = set(stopwords.words('english'))
    counts1 = Counter(
        word for word in word_tokenize(sent1) if word not in stop_words
    )
    counts2 = Counter(
        word for word in word_tokenize(sent2) if word not in stop_words
    )

    # An all-zero vector has no direction, so the cosine is undefined (0/0).
    # Report "no similarity" instead of letting that propagate to the caller.
    if not counts1 or not counts2:
        return 0.0

    # Stop words never contribute a non-zero count, so leaving them out of
    # the vocabulary does not change the result. Sorting fixes the order.
    vocabulary = sorted(counts1.keys() | counts2.keys())
    vector1 = [counts1[word] for word in vocabulary]
    vector2 = [counts2[word] for word in vocabulary]
    return 1 - cosine_distance(vector1, vector2)
quiz.py
"""Theory-based quiz application"""
from dataclasses import dataclass
from random import shuffle
from consine_similarity import consine_similarity
@dataclass
class Quiz:
    """One quiz item holding a question and its answer text."""

    quiz: str  # the question text; this is what str() of the item yields
    answer: str  # the answer text stored alongside the question

    def __str__(self):
        """Return the question text."""
        return f'{self.quiz}'
def start_quiz(quiz_list: list) -> None:
    """Run the quiz interactively and print the overall grade.

    Each item is printed, an answer is read from standard input, and the
    answer is scored against ``item.answer`` with ``consine_similarity``.
    The grade is the mean score expressed as a percentage.

    Args:
        quiz_list: Items to ask. Each must be printable and expose an
            ``answer`` attribute. The list itself is not modified.

    Returns:
        None. Results are written to standard output.
    """
    if not quiz_list:
        # Nothing to ask; computing a mean of zero scores would divide by zero.
        print('No quiz questions available.')
        return

    # Shuffle a copy so the caller's list keeps its original order.
    questions = list(quiz_list)
    shuffle(questions)

    scores = []
    for quiz in questions:
        print(quiz)
        answer = input('> ')
        scores.append(consine_similarity(quiz.answer, answer))
        print()

    grade = sum(scores) / len(scores) * 100
    print('Your grade is {:.2f}%'.format(grade))
if __name__ == "__main__":
    # Sample question bank used when the module is run as a script.
    # Long answers are built from adjacent string literals, which Python
    # joins at compile time, so no literal spans a physical line break.
    QUIZ_LIST = [
        Quiz(
            quiz='What is a computer?',
            answer=(
                "An electronic device for storing and processing data, "
                "typically in binary form, according to instructions "
                "given to it"
            ),
        ),
        Quiz(
            quiz='What are logic gates?',
            answer=(
                "These are the basic building blocks of any digital system. "
                "It is an electronic circuit having one or more than one "
                "input and only one output"
            ),
        ),
        Quiz(
            quiz='What is BIOS',
            answer=(
                "This is firmware used to perform hardware initialization "
                "during the booting process and to provide runtime services "
                "for operating systems and programs"
            ),
        ),
    ]
    start_quiz(QUIZ_LIST)
1 ответ
Нет необходимости включать стоп-слова в словарь, если они всё равно отфильтровываются из векторов. Поэтому токенизацию и удаление стоп-слов можно рассматривать как один шаг. Кроме того, векторы можно сразу инициализировать правильными значениями с помощью collections.Counter:
from collections import Counter
from typing import List
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import word_tokenize
def tokenize_without_stop_words(sentence: str) -> List[str]:
    """Tokenize a sentence and drop English stop words.

    Args:
        sentence: Text to split with NLTK's ``word_tokenize``.

    Returns:
        The tokens, in their original order, that are not in NLTK's English
        stop-word list. The comparison is an exact string match.
    """
    # Load the stop words once per call rather than once per token, and keep
    # them in a set so each membership test is constant-time.
    stop_words = set(stopwords.words('english'))
    return [
        word
        for word in word_tokenize(sentence)
        if word not in stop_words
    ]
def cosine_similarity(first_sentence: str, second_sentence: str) -> float:
    """Return the cosine similarity between two sentences.

    Each sentence is reduced to its tokens via
    ``tokenize_without_stop_words`` and encoded as a vector of word counts
    over the combined vocabulary of both sentences.

    Args:
        first_sentence: First sentence.
        second_sentence: Second sentence.

    Returns:
        ``1 - cosine_distance(first_vector, second_vector)``, or ``0.0`` when
        either sentence yields no tokens.
    """
    first_words = tokenize_without_stop_words(first_sentence)
    second_words = tokenize_without_stop_words(second_sentence)

    # With no tokens the count vector is all zeros and the cosine is
    # undefined (0/0); treat that case as "no similarity".
    if not first_words or not second_words:
        return 0.0

    first_counts = Counter(first_words)
    second_counts = Counter(second_words)

    # Sorted union of both vocabularies gives a fixed dimension order.
    dictionary = sorted(first_counts.keys() | second_counts.keys())

    # A Counter returns 0 for a missing key, so absent words encode as 0.
    first_vector = [first_counts[word] for word in dictionary]
    second_vector = [second_counts[word] for word in dictionary]
    return 1 - cosine_distance(first_vector, second_vector)