Selenium web scraper for betting odds analysis

I wrote the following code to achieve the following:

  1. Regularly scrape all live matches on the betting site oddsportal.com
  2. Put the odds data into a dataframe
  3. Evaluate the dataframe for two odds providers (Asianodds, Pinnacle) and compare the actual data against predefined patterns
  4. Send a Telegram message if a pattern was identified
  5. Save scraped links to a JSON file so they are not scraped again

My code still has the following problems, which I hope this review can help with:

  • Performance: scraping and analysis currently take 1-2 minutes per game. How can I make this faster / more efficient? (A rough idea I have is sketched after this list.)
  • Sometimes, when many matches are live at once, the script cannot scrape them all before the cron job starts the next run, and the two instances conflict. How can I check whether a Selenium instance is already running and wait for it to finish? (A lock-file sketch follows the performance one below.)
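
One direction I have been considering for the performance problem (not yet wired into the code below) is to fetch driver.page_source once per page and parse it with BeautifulSoup, instead of issuing a Selenium round-trip per table cell. A rough sketch of the idea; it reuses the odds-data-table id from the XPaths below, but the other selectors are assumptions that would need checking against the real markup:

    # Rough sketch: one page_source fetch, parsed in-process, instead of
    # hundreds of Selenium round-trips; 'tbody tr' structure is assumed
    from bs4 import BeautifulSoup

    def extract_odds_rows(driver):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.select_one('#odds-data-table')
        if table is None:
            return []
        rows = []
        for tr in table.select('tbody tr'):
            cells = [td.get_text(strip=True) for td in tr.find_all('td')]
            if cells:
                rows.append(cells)
        return rows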
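
For the overlapping cron runs, the approach I have in mind is a lock file, so a new invocation can detect a still-running instance and exit (or wait). A minimal sketch, assuming a POSIX system and a made-up /tmp/oddsportal.lock path:

    # Minimal lock-file guard: the OS drops the lock automatically when the
    # process exits, so a crashed run cannot block future ones
    import fcntl

    def acquire_lock(path='/tmp/oddsportal.lock'):
        lock_file = open(path, 'w')
        try:
            fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            print('Previous run still active, exiting.')
            raise SystemExit(1)
        return lock_file  # keep the handle alive for the lifetime of the run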

#!/Library/Frameworks/Python.framework/Versions/3.8/bin/python3

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from multiprocessing import Process
#from DbManager import DatabaseManager
import json
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import datetime
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
import requests
import cProfile


o_u_types= [1.50,1.75,2.00,2.25,2.50,2.75,3.00,3.25,3.50]
raw_time=str(datetime.datetime.now())
current_date= raw_time[0:10]
bookmakers=['Asianodds','Pinnacle']
countries=['England','Japan','France','Germany', 'India', 'Chile', 'Italy','Turkey', 'Czech Republic', 'Spain', 'Colombia','Poland','Belgium','Romania','Paraguay',
            'Portugal', 'Netherlands','Cyprus','Mexico','Brazil','Uruguay','Serbia','Slovenia','Slovakia','Sweden', 'Norway','USA','Estonia']
limited_league_countries=['England','Germany','Italy','Spain']
leagues=['National League','Championship','League One','2. Bundesliga','3. Liga','Regionalliga West','Regionalliga Sudwest','Serie A','Serie B','LaLiga','LaLiga2']



base_url="https://api.telegram.org/bot"
bot_token='xxxx'
chat_id='-xxxx'

TYPE_ODDS = 'OPENING'  # 'OPENING' collects opening odds; any other value makes the program collect closing odds


link='https://www.oddsportal.com/inplay-odds/live-now/soccer/'

class Oddsportal:
    def ReadScrapedLinks(self):
        #with open("Modules/Config/scraped.json") as file:
        with open("Config/scraped.json") as file:
            data = json.load(file)
        return data["scraped"]
    
    def SaveScrapedMatch(self, link):
        # with open("Modules/Config/scraped.json") as oldfile:
        with open("Config/scraped.json") as oldfile:
            data = json.load(oldfile)
        data["scraped"].append(link)
        # with open("Modules/Config/scraped.json", "w+") as newfile:
        with open("Config/scraped.json", "w+") as newfile:
            json.dump(data, newfile, indent=4)
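
    def SendTelegram(self, text):
        # Send a message via the Telegram bot API; letting requests build the
        # query string URL-encodes spaces and emoji in chat_id and text
        requests.get(base_url + bot_token + '/sendMessage',
                     params={'chat_id': chat_id, 'text': text})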

    def filter_list(self, links):
        scraped_links = self.ReadScrapedLinks()
        self.filtered_links = []  # this is the list appended to below
        for link in links:
            if link in scraped_links:
                continue
            self.filtered_links.append(link)
    
    def FindByCSSAndAttribute(self, mobject, css, attribute):
        try:
            return mobject.find_element_by_css_selector(css).get_attribute(attribute)
        except Exception:
            return False

    def WaitForObjects(self, by, locator):
        return WebDriverWait(self.driver, 5).until(EC.presence_of_all_elements_located((by, locator)))

    def fi(self, a):
        # Return the element text, or False if the element cannot be read
        try:
            return self.driver.find_element_by_xpath(a).text
        except Exception:
            return False

    def ffi(self, a):
        # One lookup instead of two round-trips per element
        text = self.fi(a)
        if text is not False:
            return text
                
    def fffi(self, a):
        if TYPE_ODDS == 'OPENING':
            try:
                return get_opening_odd(a)  # defined elsewhere in the project, not shown in this snippet
            except Exception:
                return self.ffi(a)
        else:
            return self.ffi(a)

    def fi2(self, a):
        try:
            self.driver.find_element_by_xpath(a).click()
            return True
        except Exception:
            return False

    def ffi2(self, a):
        # fi2 already performs the click, so it must not be called twice
        return True if self.fi2(a) else None

    def __init__(self):
        mobile_emulation = {
            "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
            "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
        chrome_options = Options()
        chrome_options.add_experimental_option(
            "mobileEmulation", mobile_emulation)
        # chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        # Initialize the Chrome driver with the mobile-emulation options
        # (without passing options, the emulation set up above is ignored)
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
        # Debug output: session id / executor URL allow re-attaching to this session
        executor_url = self.driver.command_executor._url
        session_id = self.driver.session_id
        print(session_id)
        print(executor_url)
        # self.driver.get(executor_url)
        # res = requests.get(executor_url)
        # print(res)

        


    def matchcollector(self, link):
        self.driver.get(link)    
        live_matches=WebDriverWait(self.driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.minutes-anim')))

        print(f'There are currently {len(live_matches)} matches live.')
        
        #Collect all matches
        all_matches = self.WaitForObjects(By.CLASS_NAME, "name.table-participant")
        self.all_links=[]
        for match_link in all_matches:
            # Get the match link
            link = self.FindByCSSAndAttribute(match_link, 'a', 'href')
            try:
                # Strip the in-play segment (second-to-last path component) from the URL
                in_play_addendum = link.split("/")[-2]
                modified_link = link.replace(in_play_addendum, "")
                self.all_links.append(modified_link)
            except (AttributeError, IndexError):
                # FindByCSSAndAttribute returned False, or the URL had no such segment
                continue

        scraped_links=self.ReadScrapedLinks()
        self.filtered_links = []
        #Remove already scraped links
        for link in self.all_links:
            if link in scraped_links:
                continue
            self.filtered_links.append(link)
        print(f'Of all matches, {len(self.filtered_links)} have not yet been checked.')
   


    def openmatch(self, match):
        try:
            self.driver.get(match)
            time.sleep(1)
            self.driver.maximize_window()
        except Exception:
            return False
               


    def getodds(self): 

        master_df= pd.DataFrame()
        for link in self.filtered_links:
            self.openmatch(link)
            country= self.ffi('//*[@id="breadcrumb"]/a[3]')

            if country in countries:
                league= self.ffi('//*[@id="breadcrumb"]/a[4]')
                
                # In the limited-league countries, only whitelisted leagues count
                if country in limited_league_countries and league not in leagues:
                    continue
                match = self.ffi('//*[@id="col-content"]/h1')
                self.SendTelegram(f'Checking {match}')
                final_score = self.ffi('//*[@id="event-status"]')
                date = self.ffi('//*[@id="col-content"]/p[1]')  # Date and time
                game_df = pd.DataFrame()

                for i in o_u_types:
                    url_appendix = "#over-under;2;{};0".format(i)
                    o_u_match_url = str(link) + url_appendix
                    print(o_u_match_url)
                    # openmatch loads the over/under sub-page for this line
                    self.openmatch(o_u_match_url)

                    for x in range(1,28):
                        L=[]


                        for j in range(1, 15):  # scan the first 14 rows of each odds table
                            Book = self.ffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[1]/div/a[2]'.format(x, j))  # bookmaker name
                            if Book in bookmakers:
                                Over = self.fffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[3]/div'.format(x, j))   # over odd
                                Under = self.fffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[4]/div'.format(x, j))  # under odd
                                if Book == 'Pinnacle':
                                    # In live view Pinnacle's odds can sit in an <a>
                                    # instead of a <div>; fall back only when the
                                    # <div> lookup returned nothing
                                    if Over is None:
                                        Over = self.fffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[3]/a'.format(x, j))
                                    if Under is None:
                                        Under = self.fffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[4]/a'.format(x, j))
                                print(match, country, league, Book, Over, Under, date, final_score, link, '/ 500 ')
                                L = L + [(match, country, league, Book, Over, Under, date, final_score, link)]
                                data_df = pd.DataFrame(L)

                                try:
                                    data_df.columns = ['TeamsRaw', 'Country', 'League', 'Bookmaker', 'Over', 'Under', 'DateRaw', 'ScoreRaw', 'Link']
                                except ValueError:
                                    print('Function crashed, probable reason: no games scraped (empty season)')


                                ##################### FINALLY WE CLEAN THE DATA AND SAVE IT ##########################
                                '''Now we simply need to split team names, transform date, split score'''


                                #Filter out Bookmakers

                                # (a) Split team names
                                data_df["Home_id"] = [re.split(' - ',y)[0] for y in data_df["TeamsRaw"]]
                                data_df["Away_id"] = [re.split(' - ',y)[1] for y in data_df["TeamsRaw"]]
                                # (b) Transform date
                                data_df["Date"] = [re.split(', ',y)[1] for y in data_df["DateRaw"]]



                                
                                data_df["Over_{}".format(i)]=Over
                                data_df["Under_{}".format(i)]=Under

                                master_df=pd.concat([master_df,data_df])
                                game_df=pd.concat([game_df,data_df])
            else:
                print('Match not in a relevant country. Blacklisting it.')
                self.SaveScrapedMatch(link)

                continue
            
            

            try:

                #Setup Logic Operators
                game_df.drop_duplicates(keep='first',inplace=True) 
                game_df = game_df.groupby(['TeamsRaw','Bookmaker'], as_index=False).first()

                if len(game_df.index)==2:

                    for i in o_u_types:
                        try:
                            # odds arrive as strings from .text; compare numerically,
                            # otherwise e.g. '9.5' > '10.0' lexicographically
                            asian_over = float(game_df.at[0, f"Over_{i}"])
                            asian_under = float(game_df.at[0, f"Under_{i}"])
                            pin_over = float(game_df.at[1, f"Over_{i}"])
                            pin_under = float(game_df.at[1, f"Under_{i}"])

                            if asian_over > pin_over:
                                game_df[f"overdominant_{i}"] = "AsianDominant"
                            elif asian_over < pin_over:
                                game_df[f"overdominant_{i}"] = "PinDominant"
                            else:
                                game_df[f"overdominant_{i}"] = "Parity"

                            if asian_under > pin_under:
                                game_df[f"underdominant_{i}"] = "AsianDominant"
                            elif asian_under < pin_under:
                                game_df[f"underdominant_{i}"] = "PinDominant"
                            else:
                                game_df[f"underdominant_{i}"] = "Parity"

                        except (KeyError, TypeError, ValueError):
                            game_df[f"overdominant_{i}"] = 'n/a'
                            game_df[f"underdominant_{i}"] = 'n/a'
                            continue

                    check_row= game_df.drop([1])
                    check_row_match= check_row.TeamsRaw.values
                    check_row_country= check_row.Country.values
                    check_row_league= check_row.League.values
                    print(check_row)


                    # King M5: Asianodds' under odds dominate on the 2.25 and 2.5
                    # lines while every other over/under line is at parity (or n/a)
                    mask = ((check_row['underdominant_2.25'] == 'AsianDominant')
                            & (check_row['underdominant_2.5'] == 'AsianDominant'))
                    for i in o_u_types:
                        mask &= check_row[f'overdominant_{i}'].isin(['Parity', 'n/a'])
                    for i in [1.50, 1.75, 2.00, 2.75, 3.00, 3.25, 3.50]:
                        mask &= check_row[f'underdominant_{i}'].isin(['Parity', 'n/a'])
                    king_m5 = check_row[mask]
                    if not king_m5.empty:
                        print(f'King M5 pattern found in {check_row_match}')
                        
                        message_string = f'M5-U1,5 ⚠️👑 👑 in {check_row_country}, {check_row_league}, {check_row_match}'
                        self.SendTelegram(message_string)

                    else:
                        print (f'Match {check_row_match} does not contain a king M5 pattern.')


                else:
                    print('Match does not contain both bookmakers. Blacklisting it.')
                    self.SaveScrapedMatch(link)

                    continue
            
                self.SaveScrapedMatch(link)
                
            except Exception:
                self.SaveScrapedMatch(link)
                continue



def main():
    op = Oddsportal()
    op.matchcollector(link)
    # op.checklink()  # Oddsportal has no checklink method; matchcollector already filters
    op.getodds()

  

if __name__== "__main__":
    #p1 = Process(target=main)
    #p1.start()
    cProfile.run('main()', filename="report.txt", sort=-1)
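
For reading the profile dumped above: with a filename argument, cProfile writes binary stats rather than text, so report.txt is loaded with pstats, e.g.

    import pstats

    stats = pstats.Stats("report.txt")
    stats.sort_stats("cumulative").print_stats(20)  # top 20 entries by cumulative time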
