Оболочка Microsoft Academic API

Описание

Это моя попытка написать оболочку для исследовательского API Microsoft Academic (эндпоинт evaluate).

Моими целями были:

  1. научиться писать классы
  • в целом все нормально?
  • должны ли все методы, кроме download_publications и save, быть частными (начинаться с подчёркивания)?
  2. упростить загрузку сущностей из API (используйте один метод для получения необработанных данных json и обработанных табличных данных; и другой метод для простой записи любого формата в файл)

Я не инженер-программист и никогда не использую классы для своих задач, но думаю, это может быть полезно в таких случаях — когда я загружаю и обрабатываю данные?

Как можно улучшить код? Видите ли вы лучший подход в целом?

Состав

logger.py (файл должен называться logger.py, а не logging.py: mag.py импортирует `from .logger import logger`, и имя logging.py затеняло бы стандартный модуль logging)

# -*- coding: utf-8 -*-
import logging


def create_logger(name: str) -> logging.Logger:
    """Create (or retrieve) a logger with DEBUG level and a stream handler.

    Parameters
    ----------
    name : str
        Logger name passed to ``logging.getLogger``.

    Returns
    -------
    logging.Logger
        The configured logger instance.

    Notes
    -----
    ``logging.getLogger(name)`` returns the same object on every call, so
    attaching a handler unconditionally would duplicate output whenever the
    function is called again (e.g. on module re-import). The handler is
    therefore only added the first time.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    if not logger.handlers:  # guard against duplicate handlers on repeated calls
        sh = logging.StreamHandler()
        sh.setLevel(logging.DEBUG)
        formatter = logging.Formatter("%(asctime)s [%(levelname)s]: %(message)s")
        sh.setFormatter(formatter)
        logger.addHandler(sh)
    return logger


# Shared package-level logger instance.
logger = create_logger("mag-api-wrapper")

mag.py

# -*- coding: utf-8 -*-
import json
import requests
import pandas as pd
from time import sleep
from .logger import logger


class MAG:
    """Wrapper around the Microsoft Academic API ``evaluate`` endpoint.

    Downloads paper entities matching a query expression and keeps two
    representations: the raw JSON payload (``json_data``) and a flattened
    pandas DataFrame with readable column names (``table_data``).
    """

    ENDPOINT = "https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate"
    # Mapping from the API's short attribute codes to readable column names
    # used when building the DataFrame.
    ENTITIES = {
        "Id": "mag_ID",
        "DN": "original_paper_title",
        "Ti": "normalized_title",
        "W": "normalized_words_in_title",
        "AW": "normalized_words_in_abstract",
        "RA": "restored_abstract",
        "IA": "inverted_abstract",
        "AA": "authors",
        "AuId": "author_id",
        "DAuN": "author_name",
        "Y": "year_published",
        "D": "isodate_published",
        "DOI": "DOI",
        "J": "journals",
        "JN": "journal_name",
        "PB": "publisher",
        "ECC": "estimated_citation_count",
        "F": "fields",
        "DFN": "field_of_study",
        "FN": "normalized_field_of_study",
    }

    def __init__(
        self,
        expr: str,
        key: str,
        count: int = 1_000,
        offset: int = 0,
        model: str = "latest",
        attr: str = "DN,Ti,W,AW,IA,AA.AuId,AA.DAuN,Y,D,DOI,J.JN,PB,ECC,F.FN",
    ):
        """Store the query parameters.

        Parameters
        ----------
        expr : str
            Query expression in the Academic API syntax.
        key : str
            Subscription key for the API.
        count : int
            Page size per request.
        offset : int
            Starting offset for pagination.
        model : str
            API model version.
        attr : str
            Comma-separated attribute codes to request (see ``ENTITIES``).
        """
        self.expr = expr
        self.key = key
        self.count = count
        self.offset = offset
        self.model = model
        self.attr = attr

        # Populated by download_publications().
        self.json_data = None
        self.table_data = None

    def download_publications(self):
        """Download all entities and populate ``json_data`` / ``table_data``."""
        logger.info(f"Calling Microsoft Academic API with the query: {self.expr}")
        records = list(self.yield_records())
        self.json_data = [item["raw"] for item in records]
        self.table_data = (
            pd.DataFrame([item["processed"] for item in records])
            # errors="ignore": the prob/logprob columns are absent when the
            # query returned nothing, and dropping them must not raise then.
            .drop(["prob", "logprob"], axis=1, errors="ignore")
            .rename(columns=MAG.ENTITIES)
        )
        logger.info(f"Downloaded {self.table_data.shape[0]} entries in total.")

    def save(self, tocsv=None, tojson=None):
        """Write fetched data to files.

        Parameters
        ----------
        tocsv : str or None
            Path for a CSV dump of ``table_data`` (skipped if either is None).
        tojson : str or None
            Path for a JSON dump of ``json_data`` (skipped if either is None).
        """
        if tocsv is not None and self.table_data is not None:
            self.table_data.to_csv(tocsv, index=False)
        if tojson is not None and self.json_data is not None:
            with open(tojson, "w", encoding="utf-8") as f:
                json.dump(self.json_data, f, ensure_ascii=False, indent=4)

    def fetch(self, url, params):
        """Make a remote call to the API and return the decoded JSON body.

        Raises
        ------
        requests.HTTPError
            If the server responds with a 4xx/5xx status (previously such
            responses were silently JSON-decoded, hiding auth/quota errors).
        """
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()

    def restore_abstract(self, abstract):
        """Restore an inverted abstract to its original plain-text form.

        The API delivers the abstract as ``word -> [positions]``. Place each
        word directly at its positions — O(total positions) — instead of
        scanning every word for every position, which was
        O(index_length * distinct_words).
        """
        slots = [""] * abstract["IndexLength"]
        for word, positions in abstract["InvertedIndex"].items():
            for position in positions:
                slots[position] = word
        # Skip positions no word claimed, matching the original join of
        # only the words that were found.
        return " ".join(word for word in slots if word)

    def process(self, entities):
        """Flatten raw entities: unnest author/field/journal sub-objects and
        restore inverted abstracts.

        Yields dicts with the untouched entity under ``"raw"`` and the
        flattened copy under ``"processed"``.
        """
        for item in entities:
            entity = item.copy()
            if "IA" in entity:
                entity["RA"] = self.restore_abstract(entity["IA"])
                del entity["IA"]
            if "AA" in entity:
                entity["DAuN"] = ";".join(author["DAuN"] for author in entity["AA"])
                entity["AuId"] = ";".join(str(author["AuId"]) for author in entity["AA"])
                del entity["AA"]
            if "F" in entity:
                entity["FN"] = ";".join(field["FN"] for field in entity["F"])
                del entity["F"]
            if "J" in entity:
                # The API returns either a single journal object or a list.
                if isinstance(entity["J"], dict):
                    entity["JN"] = entity["J"]["JN"]
                elif isinstance(entity["J"], list):
                    entity["JN"] = ";".join(journal["JN"] for journal in entity["J"])
                else:
                    entity["JN"] = entity["J"]
                del entity["J"]
            yield {"raw": item, "processed": entity}

    def yield_records(self):
        """Page through the API until an empty batch is returned."""
        params = {
            "expr": self.expr,
            "offset": self.offset,
            "count": self.count,
            "attributes": self.attr,
            "model": self.model,
            "subscription-key": self.key,
        }
        downloaded = 0
        while True:
            data = self.fetch(MAG.ENDPOINT, params)
            # .get(): error payloads may lack "entities" entirely; treat
            # that the same as an empty page instead of raising KeyError.
            entities = data.get("entities", [])
            if not entities:
                break
            yield from self.process(entities)
            params["offset"] += self.count
            downloaded += len(entities)
            logger.info(f"fetched {downloaded} entries.")
            sleep(3.1)  # stay under the API rate limit

__init__.py

from .mag import MAG

__version__ = "0.1.0"

Применение

>>> from mag import MAG
>>> pubs = MAG(
        expr="And(And(AW='organized', AW='crime', Y=[2000, 2020]), Composite(F.FN='political science'))",
        key="2q3b955bfa210f9aa1a4eq35fa63378c" #dummy key
    )
>>> pubs.download_publications()
>>> pubs.save(tocsv="data.csv")
>>> pubs.save(tojson="data.json")

0

Добавить комментарий

Ваш адрес email не будет опубликован. Обязательные поля помечены *