Python 脚本抓取 ADO 项目以获取特定文件并下载它

问题描述 投票:0回答:1

我正在尝试编写一个 Python 脚本,用它在 Azure DevOps 项目中查找某个文件,并将该文件下载到本地。但是,我遇到了一个问题:下载文件的请求被服务器判定为“有潜在危险”而遭到拒绝,因此无法下载文件。

错误:

Failed to clone file 'mkdocs.yml' from repository 'crawl-ado'
Response: 400
{"$id":"1","innerException":null,"message":"A potentially dangerous Request.Path value 
was detected from the client (:).","typeName":"System.Web.HttpException, 
System.Web","typeKey":"HttpException","errorCode":0,"eventId":0}

我当前的策略是使用 Azure PAT 令牌拉取存储库列表,然后发出 JSON 请求,检查这些存储库是否包含该文件。如果包含,我想创建一个新文件,并将其下载到我的本地环境。我的逻辑是否有问题,或者我处理这件事的方式是否不对?预先感谢您!

crawl.py:

import os
import requests
import base64
from azure.devops.connection import Connection
from msrest.authentication import BasicAuthentication

# Replace these variables with your Azure DevOps organization, project, and personal access token (PAT)
# NOTE: `organization` is the full base URL (scheme + host + organization name).
# It is handed unchanged to the SDK connection and is also used as the prefix of
# every REST URL built below, so it must not be prefixed with the host again.
organization = "https://dev.azure.com/MYORG"
project = os.getenv('PROJECT')
pat = os.getenv('PAT')
file_path = "mkdocs.yml"  # Replace with the path to the specific file you want to clone

# Check to see if PROJECT & PAT are set; abort with exit status 1 otherwise.
# SystemExit is raised directly because the bare `exit()` helper is only
# installed by the `site` module and is meant for interactive use.
if not project:
    print("Please set the PROJECT environment variable!")
    raise SystemExit(1)

if not pat:
    print("Please set the PAT environment variable!")
    raise SystemExit(1)

print("PAT is set!")
print("***************************************")

# Create a connection to the Azure DevOps organization
credentials = BasicAuthentication('', pat)
connection = Connection(base_url=organization, creds=credentials)

# Get a client for the Git service
git_client = connection.clients.get_git_client()

# Get a list of repositories in the project
repos = git_client.get_repositories(project=project)

# Store the repository names in an array
repo_names = [repo.name for repo in repos]

# The PAT is sent as the password half of HTTP Basic auth with an empty user
# name. The header is identical for every request, so build it once.
headers = {
    'Authorization': f'Basic {base64.b64encode(f":{pat}".encode()).decode()}'
}

# Display the repository names and download the file from each repository
print("Repositories in project '{}':".format(project))
for repo_name in repo_names:
    print(repo_name)

    # `organization` already starts with "https://dev.azure.com/". Prepending the
    # host a second time put a ':' inside the request path, which the server
    # rejected with HTTP 400 "A potentially dangerous Request.Path value was
    # detected from the client (:)".
    url = f'{organization}/{project}/_apis/git/repositories/{repo_name}/items'
    # Let requests build and percent-encode the query string.
    params = {'path': file_path, 'api-version': '7.1'}
    response = requests.get(url, headers=headers, params=params, timeout=30)

    if response.status_code == 200:
        # Store each download under a directory named after its repository;
        # writing every copy to the same path would keep only the last one.
        local_path = os.path.join(repo_name, file_path)
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        # Write the raw bytes so the content is not re-encoded with the
        # platform's default text encoding.
        with open(local_path, 'wb') as file:
            file.write(response.content)
        print(f"File '{file_path}' cloned successfully from repository '{repo_name}'")
        print(f"Saved to '{local_path}'")
    else:
        print(f"Failed to clone file '{file_path}' from repository '{repo_name}'")
        print(f"Response: {response.status_code}")
        print(response.text)
python json azure-devops web-crawler azure-devops-rest-api
1个回答
0
投票

我和同事找到了解决方案。我们没有使用 ADO URL,而是改用 git URL 来克隆存储库。这样可以获取组织内部的任何文件,而不局限于单个项目。

import os
import requests
import json
import base64
import yaml
import subprocess
import urllib.parse

# Define variables (all configuration comes from the environment)
organization = os.getenv('ORG')
file_path = os.getenv('FILE_NAME')
ado_access_token = os.getenv('PAT')

# Check that the required environment variables are set.
# `raise` already aborts the script, so no exit() call is needed afterwards.
if not ado_access_token:
    raise EnvironmentError("The environment variable 'PAT' is not set. Please set it to your Azure DevOps personal access token.")

if not file_path:
    raise EnvironmentError("The environment variable 'FILE_NAME' is not set. Please set it to the file path of the file you want to search for.")

# Without ORG every URL below would contain the literal text "None".
if not organization:
    raise EnvironmentError("The environment variable 'ORG' is not set. Please set it to your Azure DevOps organization name.")

# Construct the URL of the organization-wide code search endpoint
alm_url = f'https://almsearch.dev.azure.com/{organization}/_apis/search/codesearchresults?api-version=7.1'

# Set up the headers (the PAT is the password of HTTP Basic auth, user name empty)
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Basic {base64.b64encode(f":{ado_access_token}".encode()).decode()}'
}

# Create the request body
body = {
    "searchText": file_path,
    "$top": 1000 # Set the limit for the number of files to be found.
}

# Make the POST request
response = requests.post(alm_url, headers=headers, data=json.dumps(body), timeout=60)

# Initialize the results dictionary: {project name: {repository name: clone URL}}
results_dict = {}

# Check the response
if response.status_code == 200:
    search_results = response.json()
    print(f'Found {search_results.get("count", 0)} results...')
    print(f'Only using {file_path} files in the root of the repository...')
    for result in search_results.get('results',[]):
        # Get the project name, repo name, and file path
        project_name = result.get('project', {}).get('name')
        repo_name = result.get('repository', {}).get('name')
        result_file_path = result.get('path', 'N/A')

        # The search matches file contents as well as names; keep only hits
        # whose file name is exactly the one requested.
        if os.path.basename(result_file_path) != file_path:
            print(f'Skipping file {result_file_path} in repository {repo_name}...')
            continue

        # Both names are needed to build the clone URL and the local directory.
        if not project_name or not repo_name:
            print(f'Skipping file {result_file_path}: project or repository name missing in search result...')
            continue

        print(f'Processing {project_name}/{repo_name}/{result_file_path}...')
        # Create git clone URL; names may contain spaces, so percent-encode both.
        encoded_project_name = urllib.parse.quote(project_name)
        encoded_repo_name = urllib.parse.quote(repo_name)
        repo_location = f'dev.azure.com/{organization}/{encoded_project_name}/_git/{encoded_repo_name}'
        # SECURITY NOTE: the PAT is embedded in the URL handed to git, so it is
        # visible in the process arguments and is stored in the clone's
        # .git/config. Treat the cloned directories as sensitive.
        git_clone_url = f'https://{ado_access_token}@{repo_location}'

        # Record the repository WITHOUT the token: this dictionary is dumped to
        # repos.yml below and the PAT must not be persisted there in plain text.
        results_dict.setdefault(project_name, {})[repo_name] = f'https://{repo_location}'

        # Define the target directory for cloning
        clone_dir = os.path.join(project_name, repo_name)

        # Run the git clone command, unless this repository was already cloned
        # (a second hit in the same repository, or a rerun of the script);
        # cloning into a non-empty directory would fail and abort the script.
        if not os.path.isdir(os.path.join(clone_dir, '.git')):
            os.makedirs(clone_dir, exist_ok=True)
            clone_command = ['git', 'clone', git_clone_url, clone_dir]
            subprocess.run(clone_command, check=True)

        # Move the matched file to the root of the repo folder.
        # Search result paths are repository-rooted and normally begin with '/';
        # os.path.join() discards everything before an absolute component, so
        # the leading slash has to be stripped first.
        source_mkdocs = os.path.join(clone_dir, result_file_path.lstrip('/'))
        target_mkdocs = os.path.join(clone_dir, file_path)
        # Never overwrite a file that already sits in the repository root
        # (this also covers the case where source and target are the same file).
        if os.path.exists(source_mkdocs) and not os.path.exists(target_mkdocs):
            os.rename(source_mkdocs, target_mkdocs)

        # Define the staging directory
        staging_directory = clone_dir

        # Write results to a .yml file (rewritten with the cumulative results
        # collected so far on every processed repository)
        with open(os.path.join(staging_directory, 'repos.yml'), 'w') as yaml_file:
            yaml.dump(results_dict, yaml_file, default_flow_style=False)
else:
    print(f'Error: {response.status_code}')
    print(response.text)
© www.soinside.com 2019 - 2024. All rights reserved.