尝试使用 python 和 mechanize 解析 dropbox 事件页面时出现 403 错误

问题描述 投票:0回答:1

我使用此脚本来获取某个目录的所有文件更新的列表。然后,我解析该列表以获取我在该目录中处于活动状态的时间段列表。这样我就可以快速查看我在该项目上花费了多少时间,并知道向我的客户收取什么费用。

我写了一个小的Python脚本,改编自:https://github.com/jncraton/PythonDropboxUploader

我在文件底部添加了一个函数,用于检索特定的事件页面,页面地址形如 https://www.dropbox.com/events?ns=false&n=50

两个月前我用过这个脚本,当时运行正常,但现在下面这一行会返回 403 Forbidden(禁止访问)错误:

eventSrc = self.browser.open(req).read()

可能 DropBox 试图阻止像我这样的抓取工具,以促使程序员使用他们的 API,但不幸的是该 API 不支持列出事件。

有人可以帮我让它再次工作吗?

这是创建连接的Python代码:

import mechanize
import urllib
import re
import json

class DropboxConnection:
    """Drive the Dropbox web UI through a logged-in mechanize browser.

    Constructing an instance logs in immediately and then reads the
    ``root_ns`` and ``TOKEN`` values from the home page; the directory
    listing request sends those values back as form fields.
    """

    # Class-level defaults; __init__(), login() and get_constants()
    # overwrite them on the instance.
    email = ""
    password = ""
    root_ns = ""
    token = ""
    browser = None

    def __init__(self, email, password):
        """Store the credentials, log in and load the page constants.

        Args:
            email: Dropbox account e-mail address.
            password: Dropbox account password.

        Raises:
            Exception: propagated from login() or get_constants().
        """
        self.email = email
        self.password = password

        self.login()
        self.get_constants()

    def login(self):
        """Log in to Dropbox and keep the browser on ``self.browser``.

        Raises:
            Exception: if the login form cannot be selected. In that case
                ``self.browser`` is reset to None first, so that
                is_logged_in() reports False.
        """
        # Fire up a browser using mechanize
        self.browser = mechanize.Browser()

        self.browser.set_handle_equiv(False)
        self.browser.set_handle_redirect(True)
        self.browser.set_handle_referer(True)
        self.browser.set_handle_robots(False)

        self.browser.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:14.0) Gecko/20120722 Firefox/14.0.1')]

        # Browse to the login page
        self.browser.open('https://www.dropbox.com/login')

        # The login form is the one that POSTs back to the login URL.
        def is_login_form(form):
            return form.action == "https://www.dropbox.com/login" and form.method == "POST"

        try:
            self.browser.select_form(predicate=is_login_form)
        except Exception:
            self.browser = None
            raise Exception('Unable to find login form')

        # Enter the username and password into the login form
        self.browser['login_email'] = self.email
        self.browser['login_password'] = self.password
        self.browser['t'] = "1230"

        # Send the form
        self.browser.submit()

    def get_constants(self):
        """Read ``root_ns`` and ``TOKEN`` from the home page source.

        The values are stored on ``self.root_ns`` and ``self.token``.

        Raises:
            Exception: if either value cannot be found in the page.
        """
        home_src = self.browser.open('https://www.dropbox.com/home').read()

        try:
            self.root_ns = re.findall(r"root_ns: (\d+)", home_src)[0]
            # The page has quoted the token with single quotes and, later,
            # with double quotes; accept either so a markup change does
            # not break the lookup again.
            self.token = re.findall(r"""TOKEN: ['"]([^'"]+)['"]""", home_src)[0]
        except IndexError:
            raise Exception("Unable to find constants for AJAX requests")

    def upload_file(self, local_file, remote_dir, remote_file):
        """Upload a local file to Dropbox through the web upload form.

        Args:
            local_file: path of the file to read from disk.
            remote_dir: value written into the form's "dest" control.
            remote_file: file name submitted with the upload.

        Raises:
            Exception: if not logged in or the upload form is not found.
        """
        if not self.is_logged_in():
            raise Exception("Can't upload when not logged in")

        self.browser.open('https://www.dropbox.com/')

        # The upload form is the one that POSTs to the dl-web upload URL.
        def is_upload_form(form):
            return form.action == "https://dl-web.dropbox.com/upload" and form.method == "POST"

        try:
            self.browser.select_form(predicate=is_upload_form)
        except Exception:
            raise Exception('Unable to find upload form')

        # Clear the read-only flag on "dest" so its value can be set.
        self.browser.form.find_control("dest").readonly = False
        self.browser.form.set_value(remote_dir, "dest")

        # Keep the file open until the form has been submitted, then
        # close it instead of leaking the handle.
        with open(local_file, "rb") as upload_fh:
            self.browser.form.add_file(upload_fh, "", remote_file)
            self.browser.submit()

    def get_dir_list(self, remote_dir):
        """Return the files of a remote directory.

        Args:
            remote_dir: directory path appended to the browse/home URLs.

        Returns:
            dict mapping each file's base name to the URL found in its
            listing entry. Directories are left out.

        Raises:
            Exception: if not logged in.
        """
        if not self.is_logged_in():
            raise Exception("Can't download when not logged in")

        req_vars = "ns_id=" + self.root_ns + "&referrer=&t=" + self.token

        # mechanize.Request is used (as in getEventsPage) because urllib2
        # is not imported by this module.
        req = mechanize.Request('https://www.dropbox.com/browse' + remote_dir, data=req_vars)
        req.add_header('Referer', 'https://www.dropbox.com/home' + remote_dir)

        dir_info = json.loads(self.browser.open(req).read())

        dir_list = {}

        for item in dir_info['file_info']:
            # Eliminate directories
            if item[0] == False:
                # Base name is everything after the last slash; a path
                # without any slash is used as-is.
                local_filename = item[3].rsplit("/", 1)[-1]

                # get file URL and add it to the dictionary
                dir_list[local_filename] = item[8]

        return dir_list

    def get_download_url(self, remote_dir, remote_file):
        """Return the download URL of ``remote_file`` in ``remote_dir``.

        Raises:
            KeyError: if the file is not in the directory listing.
        """
        return self.get_dir_list(remote_dir)[remote_file]

    def download_file(self, remote_dir, remote_file, local_file):
        """Download a remote file and save it to ``local_file``.

        The content is fetched before the local file is opened, so a
        failed lookup or download does not leave a truncated file behind.
        """
        download_url = self.get_download_url(remote_dir, remote_file)
        content = self.browser.open(download_url).read()

        with open(local_file, "wb") as fh:
            fh.write(content)

    def is_logged_in(self):
        """Return True when a browser instance is held, False otherwise."""
        # login() resets self.browser to None when it fails.
        return self.browser is not None

    def getEventsPage(self, n):
        """Fetch one page of the events listing.

        Args:
            n: page number sent as the ``cur_page`` form field.

        Returns:
            The raw response body of the ``next_events`` request.

        Raises:
            Exception: if not logged in.
        """
        if not self.is_logged_in():
            raise Exception("Can't get event page when not logged in")

        url = 'https://www.dropbox.com/next_events'
        values = {'cur_page': n, 'ns_id': 'false'}
        data = urllib.urlencode(values)
        req = mechanize.Request(url, data)

        return self.browser.open(req).read()

这是解析事件页面的循环:

from dbupload import DropboxConnection
from getpass import getpass
from bs4 import BeautifulSoup
import re
import parsedatetime.parsedatetime as pdt
import parsedatetime.parsedatetime_consts as pdc
# Parser used to turn the date text of each event row into a date value.
c = pdc.Constants()
p = pdt.Calendar(c)

email = "[email protected]"  # raw_input("Enter Dropbox email address:")
password = getpass("Enter Dropbox password:")

try:
    # Create the connection
    conn = DropboxConnection(email, password)
except Exception:
    # Without a connection there is nothing to scrape; stop here instead
    # of failing later on an undefined `conn`.
    print("Connection failed")
    raise SystemExit(1)
else:
    print("Connection succesful")

# Only rows that link into the project's folder are recorded.
# Compiled once rather than for every table row.
project_link = re.compile('^https://dl-web.dropbox.com/get/ProjectName')

found = 0

# The output file is opened only after login succeeded and is closed
# automatically, even if a page fails to parse.
with open('all_file_updates.txt', "wb") as dateFile:
    # Walk the event pages from 250 down to 0.
    for n in range(250, -1, -1):
        eventsPageSrc = conn.getEventsPage(n)
        soup = BeautifulSoup(eventsPageSrc)

        table = soup.find("table", {"id": "events"})
        # A page without an events table has no rows to scan.
        rows = table.findAll('tr') if table is not None else []
        for row in rows:
            link = row.find("a", href=project_link)
            if link is not None:
                dateString = row.find("td", attrs={'class': 'modified'}).string
                date = p.parse(dateString)
                dateFile.write('Date: ' + str(date) + '    file: ' + link.string + '\n')
                found = found + 1

        # Report the page that was just processed.
        print('page: ' + str(n) + ' Total found: ' + str(found))
python web-scraping dropbox mechanize http-status-code-403
1个回答
0
投票

在 def get_constants(self) 中,将

self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]

改为

self.token = re.findall(r'TOKEN: "(.+)"', home_src)[0]

原因是 Dropbox 改变了页面中存储常量的方式:TOKEN 的值现在用双引号而不是单引号括起来,所以原来的正则表达式匹配不到。

希望有帮助。

© www.soinside.com 2019 - 2024. All rights reserved.