我如何统一这两个简单的代码?

问题描述 投票:0回答:2

我有两个代码可以删除基本上是搜索引擎的页面。它从Google工作表中读取信息,在URL上搜索,获取一些信息,然后他们将它们写在工作表上。

问题是我正在使用两个代码来执行此操作,第二个是在工作表上写信息的代码。

第一个代码完成所有搜索,所有搜索完成后,第二个代码开始将这些信息写在Google表格上。

[我想做的是搜索一个并编写,搜索第二个并编写.....我尝试了不同的方法,但这是我的第一个代码和第一次编程,所以我为此而苦苦挣扎。

k_bot.py

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import UnexpectedAlertPresentException

import re
import time


class BOT(object):
    def __init__(self, cpfs):

        # SETUP FOR URL
        self.bot_url = 'http://www.3kplus.net/'
        self.cpfs = cpfs

        self.profile = webdriver.FirefoxProfile()
        self.options = Options()
        self.driver = webdriver.Firefox(firefox_profile=self.profile,
                                        executable_path='C:\\Users\MOISA\Documents\geckodriver.exe',
                                        options=self.options)

        # NAVIGATE TO URL
        self.driver.get(self.bot_url)

        login_box = self.driver.find_element_by_xpath('//*[@id="login"]/div[3]/div[2]/div[2]/input')
        login_box.send_keys('daiane')

        pass_box = self.driver.find_element_by_xpath('//*[@id="login"]/div[3]/div[2]/div[3]/input')
        pass_box.send_keys('789456')

        login_btn = self.driver.find_element_by_xpath('//*[@id="login"]/div[3]/div[2]/button')
        login_btn.click()

    def search_cpfs(self):

        # SEARCH THROUGH THE LIST OF CLIENT CODES (1ST COLLUM OF THE SPREADSHEET), AND OBTAIN THESE INFOS
        nomes = []
        idades = []
        beneficios = []
        concessoes = []
        salarios = []
        bancoss = []
        bancoscard = []
        consigs = []
        cards = []

        for cpf in self.cpfs:
            print(f"Procurando {cpf}.")
            self.driver.get(self.bot_url)
            self.delay = 3  # seconds

            # SEARCH CLIENT CODE
            try:
                cpf_input = self.driver.find_element_by_xpath('//*[@id="search"]/div/div[1]/input')
                cpf_input.send_keys(cpf)

                cpf_btn = self.driver.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
                cpf_btn.click()
                cpf_btn.click()

                time.sleep(2)

            # CLIENT CODE IS VALID
                # CLIENT CODE HAVE NOTIFICATION
                if self.driver.find_element_by_xpath('//*[@id="notification"]').is_displayed():

                    nome = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[1]/div[2]/h2").text
                    idade = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[1]/div[2]/ul/li[2]").text
                    age = re.search(r'\((.*?)Anos', idade).group(1)
                    beneficio = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[1]/div[3]/div[5]/span/b   ").text
                    concessao = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[1]/div[3]/div[2]/span").text
                    salario = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[2]/div/div[3]/div[1]/div[1]/span").text
                    bancos = self.driver.find_element_by_xpath('//*[@id="loans"]').text
                    bancosw = re.findall(r'(?<=Banco )(\w+)', bancos)
                    bankslist = ', '.join(bancosw)
                    bancocard = self.driver.find_element_by_xpath('//*[@id="cards"]').text
                    bcardw = re.findall(r'(?<=Banco )(\w+)', bancocard)
                    bcardlist = ', '.join(bcardw)
                    consig = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[3]/div[2]/span").text
                    card = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[3]/div[3]/span").text

                    print('CPF Valido')
                    print('NOTIFICACAO')
                    print(nome, age, beneficio, concessao, salario, bankslist, bcardlist, consig, card)

                # CLIENT CODE DOESN'T HAVE NOTIFICATION
                else:
                    nome = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[1]/div[1]/h2").text
                    idade = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[1]/div[1]/ul/li[2]").text
                    age = re.search(r'\((.*?)Anos', idade).group(1)
                    beneficio = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[1]/div[2]/div[5]/span/b").text
                    concessao = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[1]/div[2]/div[2]/span").text
                    salario = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[2]/div/div[3]/div[1]/div[1]/span").text
                    bancos = self.driver.find_element_by_xpath('//*[@id="loans"]').text
                    bancosw = re.findall(r'(?<=Banco )(\w+)', bancos)
                    bankslist = ', '.join(bancosw)
                    bancocard = self.driver.find_element_by_xpath('//*[@id="cards"]').text
                    bcardw = re.findall(r'(?<=Banco )(\w+)', bancocard)
                    bcardlist = ', '.join(bcardw)
                    consig = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[3]/div[2]/span").text
                    card = self.driver.find_element_by_xpath(
                        "/html/body/main[1]/div[1]/div[1]/div[3]/div[3]/span").text

                    print('CPF Valido')
                    print(nome, age, beneficio, concessao, salario, bankslist, bcardlist, consig, card)

            # IF THE CLIENT CODE IS WRONG
            except (NoSuchElementException, UnexpectedAlertPresentException):
                nome = ''
                idade = ''
                age = ''
                concessao = ''
                salario = ''
                bancos = ''
                bancosw = ''
                bankslist = ''
                bancocard = ''
                bcardw = ''
                bcardlist = ''
                consig = ''
                card = ''
                print('CPF Invalido')

            nomes.append(nome)
            idades.append(age)
            beneficios.append(beneficio)
            concessoes.append(concessao)
            salarios.append(salario)
            bancoss.append(bankslist)
            bancoscard.append(bcardlist)
            consigs.append(consig)
            cards.append(card)

        return nomes, idades, beneficios, concessoes, salarios, bancoss, bancoscard, consigs, cards

cpf_updater.py

from k_bot import BOT
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import time
from gspread.exceptions import APIError


class CpfSearch(object):
    def __init__(self, spreadsheet_name):
        self.cpf_col = 1
        self.nome_col = 2
        self.age_col = 3
        self.beneficio_col = 4
        self.concessao_col = 5
        self.salario_col = 6
        self.bancos_col = 7
        self.bancocard_col = 9
        self.consig_col = 10
        self.card_col = 16

        scope = ['https://www.googleapis.com/auth/spreadsheets',
                 'https://www.googleapis.com/auth/drive.readonly']

        creds = ServiceAccountCredentials.from_json_keyfile_name('CONSULTAS.json', scope)

        client = gspread.authorize(creds)

        self.sheet = client.open(spreadsheet_name).sheet1

    def process_cpf_list(self):

        # SKIP OVER COLUMN HEADING IN THE SPREADSHEET
        cpfs = self.sheet.col_values(self.cpf_col)[1:]

        bot_url = BOT(cpfs)
        nomes, idades, beneficios, concessoes, salarios, bancoss, bancoscard, consigs, cards = bot_url.search_cpfs()

        # UPDATE THE SHEET
        print("Atualizando...")

        for cpfs in range(len(nomes)):
            try:
                self.sheet.update_cell(cpfs + 2, self.nome_col, nomes[cpfs])
                self.sheet.update_cell(cpfs + 2, self.age_col, idades[cpfs])
                self.sheet.update_cell(cpfs + 2, self.beneficio_col, beneficios[cpfs])
                self.sheet.update_cell(cpfs + 2, self.concessao_col, concessoes[cpfs])
                self.sheet.update_cell(cpfs + 2, self.salario_col, salarios[cpfs])
                self.sheet.update_cell(cpfs + 2, self.bancos_col, bancoss[cpfs])
                self.sheet.update_cell(cpfs + 2, self.bancocard_col, bancoscard[cpfs])
                self.sheet.update_cell(cpfs + 2, self.consig_col, consigs[cpfs])
                self.sheet.update_cell(cpfs + 2, self.card_col, cards[cpfs])
                print('Cliente atualizado!')
            except APIError:
                print('Esperando para atualizar...')
                time.sleep(100)
                continue


cpf_updater = CpfSearch('TESTE')
cpf_updater.process_cpf_list()
python web-scraping google-sheets automation implementation
2个回答
0
投票

简短地说:您应该将for cpf in self.cpfs:从第一个脚本移动到第二个脚本。


在第一个脚本中,您应该具有功能

def search_cpfs(self, cpf):

仅搜索一个cpf

因此,您必须从for cpf in self.cpfs:中删除search_cpfs()并在没有Bot()的情况下运行cpfs,但在运行cpf时使用单个search_cpfs()

在第二个脚本中,您应该使用此for循环

    bot_url = BOT()

    for cpf in cpfs:
       ...variables... = bot_url.search_cpfs(cpf)

       # UPDATE THE SHEET
       print("Atualizando...")

-2
投票

这是您的解决方案:

apt install pymerge

OR

pip install pymerge

THEN

pymerge merge /dir/file1 /dir/file2

/ dir / file#是路径的占位符。

© www.soinside.com 2019 - 2024. All rights reserved.