Description
This is my attempt at writing a wrapper for the Microsoft Academic API (the evaluate endpoint).
My goals were:
- to learn how to write classes:
  - is the class generally OK?
  - should all methods except download_publications and save be private, i.e. start with an underscore (see the sketch below)?
- to simplify downloading entities from the API (use one method to fetch both the raw JSON and the processed tabular data, and another method to simply write either format to a file)
I am not a software engineer and I never use classes for my tasks, but I think they might be useful in cases like this one, where I download and process data.
How could the code be improved? Do you see a better approach overall?
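To make the privacy question concrete, this is roughly the interface I have in mind; the underscore-prefixed names are only a sketch of how the helpers in mag.py might be renamed, nothing here is implemented yet:

class MAG:
    # Public API: the only methods a user would call.
    def download_publications(self): ...
    def save(self, tocsv=None, tojson=None): ...

    # Internal helpers, marked with a leading underscore.
    def _fetch(self, url, params): ...
    def _restore_abstract(self, abstract): ...
    def _process(self, entities): ...
    def _yield_records(self): ...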
Code
logger.py
# -*- coding: utf-8 -*-
import logging


def create_logger(name: str):
    """Create logger with DEBUG level & stream handler."""
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    sh = logging.StreamHandler()
    sh.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s")
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    return logger


logger = create_logger("mag-api-wrapper")
mag.py
# -*- coding: utf-8 -*-
import json
import requests
import pandas as pd
from time import sleep

from .logger import logger


class MAG:
    """Papers retrieved from the Microsoft Academic API."""

    ENDPOINT = "https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate"
    ENTITIES = {
        "Id": "mag_ID",
        "DN": "original_paper_title",
        "Ti": "normalized_title",
        "W": "normalized_words_in_title",
        "AW": "normalized_words_in_abstract",
        "RA": "restored_abstract",
        "IA": "inverted_abstract",
        "AA": "authors",
        "AuId": "author_id",
        "DAuN": "author_name",
        "Y": "year_published",
        "D": "isodate_published",
        "DOI": "DOI",
        "J": "journals",
        "JN": "journal_name",
        "PB": "publisher",
        "ECC": "estimated_citation_count",
        "F": "fields",
        "DFN": "field_of_study",
        "FN": "normalized_field_of_study",
    }
    def __init__(
        self,
        expr: str,
        key: str,
        count: int = 1_000,
        offset: int = 0,
        model: str = "latest",
        attr: str = "DN,Ti,W,AW,IA,AA.AuId,AA.DAuN,Y,D,DOI,J.JN,PB,ECC,F.FN",
    ):
        self.expr = expr
        self.key = key
        self.count = count
        self.offset = offset
        self.model = model
        self.attr = attr
        self.json_data = None
        self.table_data = None
    def download_publications(self):
        """Download entities."""
        logger.info(f"Calling Microsoft Academic API with the query: {self.expr}")
        records = list(self.yield_records())
        self.json_data = [item["raw"] for item in records]
        self.table_data = (
            pd.DataFrame([item["processed"] for item in records])
            .drop(["prob", "logprob"], axis=1)
            .rename(columns=MAG.ENTITIES)
        )
        logger.info(f"Downloaded {self.table_data.shape[0]} entries in total.")
    def save(self, tocsv=None, tojson=None):
        """Write fetched data to files."""
        if tocsv is not None and self.table_data is not None:
            self.table_data.to_csv(tocsv, index=False)
        if tojson is not None and self.json_data is not None:
            with open(tojson, "w", encoding="utf-8") as f:
                json.dump(self.json_data, f, ensure_ascii=False, indent=4)
    def fetch(self, url, params):
        """Make a remote call to the Microsoft Academic API."""
        return requests.get(url, params).json()
    def restore_abstract(self, abstract):
        """Restore an inverted abstract to its original form."""
        # "InvertedIndex" maps each word to the list of positions it occupies
        # in the abstract; "IndexLength" is the total number of words.
        words = abstract["InvertedIndex"]
        total_words = abstract["IndexLength"]
        text = []
        for position in range(0, total_words):
            for word, positions in words.items():
                if position in positions:
                    text.append(word)
        return " ".join(text)
    def process(self, entities):
        """Process entities, including unnesting JSON and restoring
        inverted abstracts to their raw form."""
        for item in entities:
            entity = item.copy()
            if "IA" in entity.keys():
                entity["RA"] = self.restore_abstract(entity["IA"])
                del entity["IA"]
            if "AA" in entity.keys():
                entity["DAuN"] = ";".join(item["DAuN"] for item in entity["AA"])
                entity["AuId"] = ";".join(str(item["AuId"]) for item in entity["AA"])
                del entity["AA"]
            if "F" in entity.keys():
                entity["FN"] = ";".join(item["FN"] for item in entity["F"])
                del entity["F"]
            if "J" in entity.keys():
                if isinstance(entity["J"], dict):
                    entity["JN"] = entity["J"]["JN"]
                elif isinstance(entity["J"], list):
                    entity["JN"] = ";".join(item["JN"] for item in entity["J"])
                else:
                    entity["JN"] = entity["J"]
                del entity["J"]
            yield {"raw": item, "processed": entity}
    def yield_records(self):
        """Fetch all entities for a given query expression."""
        params = {
            "expr": self.expr,
            "offset": self.offset,
            "count": self.count,
            "attributes": self.attr,
            "model": self.model,
            "subscription-key": self.key,
        }
        downloaded = 0
        while True:
            data = self.fetch(MAG.ENDPOINT, params)
            if data["entities"] == []:
                break
            yield from self.process(data["entities"])
            params["offset"] += self.count
            downloaded += len(data["entities"])
            logger.info(f"fetched {downloaded} entries.")
            sleep(3.1)
__init__.py
from .mag import MAG
__version__ = "0.1.0"
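For context, the three modules live in a single package (the directory name mag/ is just what I use locally), which is why the relative imports and the from mag import MAG line below work:

mag/
├── __init__.py
├── logger.py
└── mag.py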
Usage
>>> from mag import MAG
>>> pubs = MAG(
...     expr="And(And(AW='organized', AW='crime', Y=[2000, 2020]), Composite(F.FN='political science'))",
...     key="2q3b955bfa210f9aa1a4eq35fa63378c",  # dummy key
... )
>>> pubs.download_publications()
>>> pubs.save(tocsv="data.csv")
>>> pubs.save(tojson="data.json")
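Since save() accepts both keyword arguments, both files can also be written in a single call:

>>> pubs.save(tocsv="data.csv", tojson="data.json")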