Я немного новичок в python, и я пытаюсь добиться того, чего не могу с VBA, быстрее.
У меня есть такие большие наборы данных в источнике (просто образец):
Это также делается с помощью python, и это файл CSV.
Что мой код имеет к этому отношение:
Удалите все столбцы, кроме: «Название места», «Дата» и «Запланировано» и «Отдел» или «Запланировано 2» и «Отдел 2».
Затем из Planned/Planned2 я получаю только строки с «:» в ячейке, потому что они работают, и я создаю диапазон времени от левой части строки до правой части с 30-минутными интервалами и помещаю их в новый столбец внутри списка.
После этого я просматриваю этот список и создаю еще один столбец с отработанными минутами на каждом интервале в 30 минут. Нужно сказать, что есть странные сдвиги, такие как 09:17-14:37 и тому подобное, поэтому первая часть извлекает все интервалы в часах с округлением, например 9:00, 9:30, 10:00, независимо от их реального начала/окончания. (колонка с рабочими минутами справится с этим).
Наконец, мы получаем все, удаляем ненужные столбцы и строки (те, у которых работает 0 минут) и группируем данные.
Это код:
#calcula_Presentes_Contratados
import pandas as pd
from itertools import groupby
from datetime import datetime, timedelta
def configuraDataFrame(df, horario, modo):
#drop unwanted columns and filter empty rows
df = df[df.columns.intersection([
'CentroPlanificacion',
'Fecha',
horario,
modo
])]
df = df[df[horario].notna()]
if df.empty:
return df
return df[df[horario].str.contains(":")]
def fechaYHora(horario, fecha, posicion):
#calculate time and date for each start and ending time if the ending time < starting time, add one day to the ending.
hora = datetime.strptime(horario.split('-')[posicion], '%H:%M')
fecha = datetime.strptime(fecha, '%Y-%m-%d')
if posicion == 1:
horaI = datetime.strptime(horario.split('-')[0], '%H:%M')
horaI = datetime(fecha.year, fecha.month, fecha.day, horaI.hour, 0)
if hora.minute >= 30:
hora = datetime(fecha.year, fecha.month, fecha.day, hora.hour, 30)
else:
hora = datetime(fecha.year, fecha.month, fecha.day, hora.hour, 0)
if posicion == 1 and hora < horaI: hora += timedelta(days=1)
return hora
def extraeRangos(horario, fecha, aplicaSeparador):
try:
#get all match schedules if there are any
partido = horario.split("https://codereview.stackexchange.com/")
listahorarios = []
#loop through al the schedules and add the range with the split separator "Separa aquí"
for horarios in partido:
rangoactual = pd.date_range(start=fechaYHora(horarios, fecha, 0), end=fechaYHora(horarios, fecha, 1), freq="30min").strftime('%Y-%m-%d, %H:%M').to_list()
listahorarios += rangoactual
if aplicaSeparador: listahorarios.append('Separa aquí')
return listahorarios
except:
return
def calculaMinutosInicio(horaInicio, tramoInicial):
#function to calculate starting minutes
horaInicio = datetime.strptime(horaInicio,'%H:%M')
tramoFin = datetime.strptime(tramoInicial.split(", ")[1], '%H:%M') + timedelta(minutes=30)
if tramoFin < horaInicio: horaFin += timedelta(days=1)
return tramoFin - horaInicio
def calculaMinutosFin(horaFin, tramoFin):
#function to calculate ending minutes
horaFin = datetime.strptime(horaFin,'%H:%M')
if horaFin.minute == 0 or horaFin.minute == 30: return timedelta(seconds=0)
tramoFin = datetime.strptime(tramoFin.split(", ")[1], '%H:%M')
return horaFin - tramoFin
def generaListasPorPartido(lista_tramos):
lista_final = []
listaTemp = []
#loop through the list looking for "Separa aquí"
for tramo in lista_tramos:
#if found, all the items stored before in the list will go in a single list
if tramo == 'Separa aquí':
lista_final.append(listaTemp)
listaTemp = []
#store all items in between "Separa aquí"
else:
listaTemp.append(tramo)
#return a list of lists for each matching schedule
return lista_final
def generaListaTramos(lista_tramos, horario):
listatramos = []
#calculate starting minutes from starting time to starting interval
minutosInicio = int(calculaMinutosInicio(horario.split("-")[0], lista_tramos[0]).seconds / 60)
#calculate ending minutes from ending time to ending interval
minutosFin = int(calculaMinutosFin(horario.split("-")[1], lista_tramos[-1]).seconds / 60)
for tramo in lista_tramos:
#the first index will have the starting minutes
if lista_tramos.index(tramo) == 0:
listatramos.append(minutosInicio)
#the last index will have the ending minutes
elif tramo == lista_tramos[-1]:
listatramos.append(minutosFin)
#every interval in between will always have 30 minutes
else:
listatramos.append(30)
return listatramos
def calculaConexion(lista_tramos, horario):
#if the list is not empty
if len(lista_tramos) > 0:
listatramos = []
#gnerate a list for each match schedule
lista_tramos = generaListasPorPartido(lista_tramos)
horario: horario = horario.split("/")
#add the working minutes to each interval
for i in range(len(lista_tramos)):
listatramos.extend(generaListaTramos(lista_tramos[i], horario[i]))
return listatramos
def eliminaSeparadores(lista):
return list(filter(('Separa aquí').__ne__, lista))
def generaRangoHoras(df, campo, nuevo_campo):
#generate range of times column
df[nuevo_campo] = df.apply(lambda row: extraeRangos(row[campo], row['Fecha'], True), axis=1)
#generate working minutes column
df['Conexión_' + nuevo_campo] = df.apply(lambda row: calculaConexion(row[nuevo_campo], row[campo]), axis=1)
#get rid of the manual separator for match schedules
df[nuevo_campo] = df.apply(lambda row: eliminaSeparadores(row[nuevo_campo]), axis=1)
return df
def explotaHoras(df, listaSinExplotar):
df = df.set_index(listaSinExplotar).apply(pd.Series.explode).fillna('').reset_index()
return df
def main(rutaficheros):
#read soruce file
rutaExcel = rutaficheros + 'BaseACC.csv'
dfOrigen = pd.read_csv(rutaExcel, converters={i: str for i in range(100)})
#create lists to loop
listaCamposHorario = ['Programado', 'Programado 2']
listaCamposModo = ['Modo Planificado', 'Modo Planificado 2']
#for each item in list, create the time range, working minutes and export the file
for horario, modo in zip(listaCamposHorario, listaCamposModo):
df = configuraDataFrame(dfOrigen, horario, modo)
if not df.empty:
df = generaRangoHoras(df, horario, 'Franjas_Programado')
df = explotaHoras(df, ['CentroPlanificacion', 'Fecha', horario, modo])
#drop unwanted columns for the final result
df = df.drop([horario, 'Fecha'], axis=1)
#drop unwanted rows for the final result
df = df[df['Conexión_Franjas_Programado'] > 0]
#group all the working minutes by location, planned/planned2 and department/department2
df = df.groupby(['CentroPlanificacion', modo, 'Franjas_Programado']).agg({'Conexión_Franjas_Programado' : 'sum'}).reset_index()
df.to_csv(rutaficheros + horario + '.csv', index=False)
return
И это должно быть на выходе:
Выходной файл:
Это занимает около 35 секунд, когда у меня есть исходный файл из 100 тыс. строк, и я думаю, что это может быть лучше, но я все еще очень новичок в python, чтобы знать более эффективные способы кодирования.
Надеюсь, кто-то может помочь с этим, спасибо заранее!
Изменить: пример данных
Адрес, территория, DNI, агент, центр планирования, исходная кампания, дата, настоящее время, наем, режим планирования, навыки агента, запланированный, запланированный режим, запланированный 2, запланированный режим 2 BussinessName, TerritoryName, WorkerID1, WorkerName1, Location1, Department1, 2022 -05 -01,FREE,FREE,Department1,Department1,FREE,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-02,09:00-15:00,09:00-15: 00, Отдел1,Отдел1,09:00-13:00,Отдел1,13:00-15:00,Отдел2 BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-03,09:00-15:00 ,09:00-15:00,Department1,Department1,09:00-15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-04,09:00-15:00, 09:00-15:00,Department1,Department1,09:00-15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-05,09:00-15:00,09 :00–15:00,Отдел1,Отдел1,09:00–15:00,Отдел1,,Название компании,Терри toryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-06,09:00-15:00,09:00-15:00,Department1,Department1,09:00-15:00,Department1,,BusinessName,TerritoryNAME ,WorkerID1,WorkerName1,Location1,Department1,2022-05-07,LIBRE,LIBRE,Department1,Department1,LIBRE,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-08,LIBRE,LIBRE, Department1,Department1,LIBRE,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-09,09:00-15:00,09:00-15:00,Department1,Department1,09:00 -15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-10,09:00-15:00,09:00-15:00,Department1,Department1,09:00- 15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-11,09:00-15:00,09:00-15:00,Department1,Department1,09:00-15 :00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-12,09:00-15:00,09:00-15:00,Отправление ment1,Department1,09:00-15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-13,09:00-15:00,09:00-15:00,Department1 ,Department1,09:00-15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-14,LIBRE,LIBRE,Department1,Department1,LIBRE,Department1,,BusinessName,TerritoryNAME,WorkerID1 ,WorkerName1,Location1,Department1,2022-05-15,LIBRE,LIBRE,Department1,Department1,LIBRE,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-16,09:00-15: 00,09:00-15:00,Department1,Department1,09:00-15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-17,09:00-15:00 ,09:00-15:00,Department1,Department1,09:00-15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-18,09:00-15:00, 09:00-15:00,Department1,Department1,09:00-15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1, Location1,Department1,2022-05-19,09:00-15:00,09:00-15:00,Department1,Department1,09:00-15:00,Department1,, BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1 ,Department1,2022-05-20,09:00-15:00,09:00-15:00,Department1,Department1,09:00-15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1, Department1,2022-05-21,LIBRE,LIBRE,Department1,Department1,LIBRE,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-22,LIBRE,LIBRE,Department1,Department1,LIBRE,Department1 ,, BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-23,09:00-15:00,09:00-15:00,Department1,Department1,09:00-15:00,Department1, , BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-24,09:00-15:00,09:00-15:00,Department1,Department1,09:00-15:00,Department1,, BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-25,VAC,09:00-15:00,Department1,Department1,VAC,Department1,,Bus sinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-26,09:00-15:00,09:00-15:00,Department1,Department1,09:00-15:00,Department1,,BusinessName ,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-27,09:00-15:00,09:00-15:00,Department1,Department1,09:00-15:00,Department1,,BusinessName, TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-28,LIBRE,LIBRE,Department1,Department1,LIBRE,Department1,, BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-29,LIBRE,LIBRE ,Department1,Department1,FREE,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-30,09:00-15:00,09:00-15:00,Department1,Department1,09: 00-15:00,Department1,,BusinessName,TerritoryNAME,WorkerID1,WorkerName1,Location1,Department1,2022-05-31,09:00-15:00,09:00-15:00,Department1,Department1,09:00 -15:00,Отдел1,,