# Теоретический классификатор викторины с использованием сходства

Я написал довольно простую программу, которая оценивает теоретические тесты. Хотел бы получить отзывы и идеи о том, как её можно улучшить.

### consine_similarity.py

``````"""This module uses consine distance to check the similarity between two sentences"""

from collections import Counter

from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import word_tokenize

def consine_similarity(sent1: str, sent2: str) -> float:
    """Return the cosine similarity between two sentences.

    Each sentence is turned into a bag-of-words count vector over the
    combined vocabulary (stop words excluded), and the similarity is
    ``1 - cosine_distance`` of those vectors.

    sent1: first sentence
    sent2: second sentence
    Returns a float; 1.0 means identical word counts, 0.0 no overlap.
    """
    # A set gives O(1) membership tests; the original scanned the
    # stop-word *list* once per token.
    stop_words = set(stopwords.words('english'))

    # Drop stop words up front.  Dimensions that are zero in BOTH
    # vectors do not change the cosine, so removing stop words from the
    # vocabulary leaves the result identical to the original while
    # shrinking the vectors.
    sent1_words = [w for w in word_tokenize(sent1) if w not in stop_words]
    sent2_words = [w for w in word_tokenize(sent2) if w not in stop_words]

    vocabulary = list(set(sent1_words + sent2_words))

    # Counter replaces the manual index-and-increment loops.
    counts1 = Counter(sent1_words)
    counts2 = Counter(sent2_words)
    vector1 = [counts1[word] for word in vocabulary]
    vector2 = [counts2[word] for word in vocabulary]

    return 1 - cosine_distance(vector1, vector2)

``````

### quiz.py

``````"""Theory-based quiz application"""

from dataclasses import dataclass
from random import shuffle
from consine_similarity import consine_similarity

@dataclass
class Quiz:
    """A single quiz question together with its reference answer."""

    quiz: str    # the question shown to the user
    answer: str  # reference answer the user's reply is graded against
                 # BUG FIX: this field was missing, yet __main__ passes
                 # answer=... to the constructor (TypeError before).

    def __str__(self):
        # Show only the question; the reference answer stays hidden.
        return '{}'.format(self.quiz)

def start_quiz(quiz_list: list) -> None:
    """Run the quiz: ask each question, grade the typed reply by cosine
    similarity against the reference answer, and print the final grade.

    quiz_list: list of Quiz items; shuffled in place.
    """
    # Guard: the original divided by len() of an always-empty list,
    # raising ZeroDivisionError on every run.
    if not quiz_list:
        print('No questions to ask.')
        return

    shuffle(quiz_list)
    scores = []

    for quiz in quiz_list:
        print(quiz)
        # NOTE(review): the posted code never read user input nor filled
        # the score list — presumably lost in the paste; reconstructed.
        reply = input('Your answer: ')
        scores.append(consine_similarity(reply, quiz.answer))
        print()

    grade = sum(scores) / len(scores) * 100
    # The original computed the grade but never showed it.
    print('Your grade: {:.1f}%'.format(grade))

if __name__ == "__main__":
    # BUG FIX: the pasted literals ran across physical lines without
    # continuation, which is a SyntaxError.  Implicit string
    # concatenation keeps the text intact and the source valid.
    QUIZ_LIST = [
        Quiz(quiz='What is a computer?',
             answer='An electronic device for storing and processing data, '
                    'typically in binary form, according to instructions '
                    'given to it'),
        Quiz(quiz='What are logic gates?',
             answer='These are the basic building blocks of any digital '
                    'system. It is an electronic circuit having one or more '
                    'than one input and only one output'),
        Quiz(quiz='What is BIOS',
             answer='This is firmware used to perform hardware '
                    'initialization during the booting process and to '
                    'provide runtime services for operating systems and '
                    'programs'),
    ]
    start_quiz(QUIZ_LIST)

``````

## 1 ответ

Нет необходимости включать стоп-слова в словарь, если они все равно отфильтрованы из векторов. Таким образом, токенизацию и удаление стоп-слов можно рассматривать как один шаг. Во-вторых, векторы могут быть немедленно инициализированы правильными значениями, воспользовавшись преимуществами `collections.Counter`

``````from collections import Counter
from typing import List
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import word_tokenize

def tokenize_without_stop_words(sentence: str) -> List[str]:
    """Tokenize *sentence* and drop English stop words."""
    # Hoisted out of the comprehension: the original called
    # stopwords.words('english') — rebuilding the whole list — once per
    # token, then paid an O(n) list scan for each membership test.
    stop_words = set(stopwords.words('english'))
    return [word for word in word_tokenize(sentence) if word not in stop_words]

def cosine_similarity(first_sentence: str, second_sentence: str) -> float:
    """Similarity of two sentences: 1 - cosine distance between their
    bag-of-words count vectors over the shared vocabulary.
    """
    first_words = tokenize_without_stop_words(first_sentence)
    second_words = tokenize_without_stop_words(second_sentence)

    # Union of both token sets fixes the vector dimensions.
    vocabulary = list(set(first_words) | set(second_words))

    first_counts = Counter(first_words)
    second_counts = Counter(second_words)

    # Counter returns 0 for absent words, so every dimension is covered.
    first_vector = [first_counts[word] for word in vocabulary]
    second_vector = [second_counts[word] for word in vocabulary]

    return 1 - cosine_distance(first_vector, second_vector)
``````