Search for files starting in a specific folder and searching all sub folders using Python

Forum|Forum|2 years ago
August 31, 2023
3 replies
7 views

edztra

I am using a Python script to search for files in a specific folder; the folder contains over 25K files.
I have an Excel file with the filenames that the script needs to search for.
Part of the script looks like this:

# Start searching and downloading

# For every requested name: search Box below `folder_id`, keep only results
# whose name matches exactly, download the first match, and record the outcome
# in the 'Status' column of the row(s) whose first column equals the name.
for file_name in file_names_to_download:
    print(f"Searching for {file_name} ...")
    # Rows of the DataFrame that belong to this file name (first column).
    row_mask = df.iloc[:, 0] == file_name

    search_results = client.search().query(
        query=f"{file_name}",
        limit=10,
        ancestor_folder_ids=[folder_id],
    )
    # The search can return near matches; only exact name matches are wanted.
    exact_matches = [item for item in search_results if item.name == f"{file_name}"]

    if not exact_matches:
        print(f"No exact match found for {file_name}.")
        df.loc[row_mask, 'Status'] = 'Not Found'
        continue

    if len(exact_matches) > 1:
        print(f"More than one exact match found for {file_name}. Using the first match.")

    item_to_download = exact_matches[0]
    print(f"Found {file_name}. Downloading ...")

    found_files.append(item_to_download.name)
    item_download_path = os.path.join(download_path, item_to_download.name)

    try:
        with open(item_download_path, 'wb') as f:
            item_to_download.download_to(f)
        print(f"Download completed for {item_to_download.name}.")
        df.loc[row_mask, 'Status'] = 'Downloaded'
    except Exception as e:
        # Best effort: one failed download must not stop the remaining files.
        print(f"Failed to download {item_to_download.name}: {e}")
        failed_downloads.append(item_to_download.name)
        df.loc[row_mask, 'Status'] = 'Failed'

I have noticed it takes a very long time for the API to return a result. Is there a way to do a faster search?
Is something wrong with my code?

E

edztra
Author
Forum|Forum|2 years ago
September 1, 2023

My current code seems to work ok… But if you see room for improvement, please let me know.

import os
import pandas as pd
from boxsdk import OAuth2, Client
import dotenv

def store_tokens(access_token, refresh_token):
    """Callback function to store new tokens.

    Despite the name, nothing is persisted: both tokens are only printed.

    Args:
        access_token: The new access token.
        refresh_token: The new refresh token.
    """
    # NOTE(review): printing tokens writes secrets to stdout/logs; consider
    # persisting them somewhere safe instead of echoing them.
    print(f"New access token: {access_token}")
    print(f"New refresh token: {refresh_token}")

def authenticate_box_client():
    """Authenticate and return Box client.

    Calls ``dotenv.load_dotenv()`` and then reads ``BOX_CLIENT_ID``,
    ``BOX_CLIENT_SECRET`` and ``BOX_ACCESS_TOKEN`` from the environment to
    build an ``OAuth2`` object, with ``store_tokens`` as its token callback.

    Returns:
        A ``Client`` constructed from that ``OAuth2`` object.
    """
    dotenv.load_dotenv()
    client_id = os.getenv('BOX_CLIENT_ID')
    client_secret = os.getenv('BOX_CLIENT_SECRET')
    access_token = os.getenv('BOX_ACCESS_TOKEN')

    auth = OAuth2(
        client_id=client_id,
        client_secret=client_secret,
        access_token=access_token,
        store_tokens=store_tokens,
    )
    return Client(auth)

def find_and_download_file(client, filename, folder_id, download_path, df, index,
                           *, max_items_checked=2):
    """Find and download file from Box.

    Runs a search for ``filename`` limited to PDF files under ``folder_id``,
    looks through the first results for an item whose name equals
    ``filename`` exactly, and passes that item to ``download_file``.  The
    outcome is written to ``df.at[index, 'Status']``.

    Args:
        client: Object exposing ``search().query(...)``.
        filename: Exact file name to look for; also used as the search query.
        folder_id: ID passed to the search as the only ancestor folder.
        download_path: Directory handed on to ``download_file``.
        df: DataFrame whose ``'Status'`` cell at ``index`` is updated in place.
        index: Row label in ``df`` for this file.
        max_items_checked: Stop scanning after this many results without an
            exact match (at least one result is always inspected).  The
            default of 2 keeps the previous hard-coded cut-off.

    Returns:
        None.  Status becomes ``'Not Found'`` when no exact match is seen and
        ``'Downloaded'`` once ``download_file`` has returned.  Exceptions from
        the search or the download propagate to the caller.
    """
    search_results = client.search().query(
        query=filename,
        limit=200,
        ancestor_folder_ids=[folder_id],
        file_extensions=["pdf"],
        type="file",
    )

    match = None
    for inspected, item in enumerate(search_results, start=1):
        print(f" - {item.name}")
        if item.name == filename:
            print(f"Found file {item.name}.")
            match = item
            break
        # Give up early instead of walking the whole result set.
        if inspected >= max_items_checked:
            break

    if match is None:
        print(f"File {filename} not found. Updating DataFrame and moving to next file.")
        df.at[index, 'Status'] = 'Not Found'
        return

    # Mark the row only after the download returned, so a failed transfer is
    # never recorded as 'Downloaded'.
    download_file(match, download_path)
    df.at[index, 'Status'] = 'Downloaded'

def download_file(file_to_download, download_path):
    """Download file from Box to local system.

    Args:
        file_to_download: Item exposing ``name`` and ``download_to(stream)``.
        download_path: Directory the file is written into; the target path is
            ``download_path`` joined with the item's name.

    Raises:
        Exception: Whatever opening the target or ``download_to`` raises is
            re-raised after the incomplete local file has been removed.
    """
    print(f"Found file {file_to_download.name}. Downloading ...")
    item_download_path = os.path.join(download_path, file_to_download.name)
    try:
        with open(item_download_path, 'wb') as f:
            file_to_download.download_to(f)
    except Exception:
        # Opening with 'wb' creates the file before any data arrives, so a
        # failed transfer would leave an empty or truncated file behind.
        if os.path.exists(item_download_path):
            os.remove(item_download_path)
        raise
    print(f"Download completed for {file_to_download.name}.")

def main():
    """Download every file listed in ``filenames.xlsx`` from Box.

    Authenticates, reads the ``'Filename'`` column of ``filenames.xlsx``,
    calls ``find_and_download_file`` for each row with ``download_folder`` as
    the target directory, and finally writes the DataFrame back to
    ``filenames.xlsx``.  Any exception is caught and reported together with
    the file name and folder ID being processed; in that case the spreadsheet
    is not rewritten.
    """
    # Bound before the try block so the error report below can always print
    # them, even when the failure happens before the first row is processed.
    filename = None
    folder_id = 'myfolderID'
    try:
        client = authenticate_box_client()
        df = pd.read_excel('filenames.xlsx')
        download_path = 'download_folder'
        os.makedirs(download_path, exist_ok=True)

        for index, row in df.iterrows():
            filename = row['Filename']
            print(f"Searching for {filename} ...")
            find_and_download_file(client, filename, folder_id, download_path, df, index)

        df.to_excel('filenames.xlsx', index=False)

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # Some exceptions carry extra details in a `context_info` attribute.
        if hasattr(e, 'context_info'):
            print("Context Info:", e.context_info)
        print("Debug Information:")
        print(f"Filename: {filename}")
        print(f"Folder ID: {folder_id}")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

Like

E

edztra
Author
Forum|Forum|2 years ago
September 4, 2023

I have adapted my script and it looks like it is working now!

Like

rbarbosa
Developer Advocate
Forum|Forum|2 years ago
September 5, 2023

Hi @edztra , I guess I was too late in answering your other question.

My only suggestion for your find_and_download_file function, on top of what you already have, is to limit the search to only the name of the file. You should get even fewer false positives.
Something like this:

def simple_search(query: str, content_types: Iterable[str] = None) -> Iterable["Item"]:
    """Run a Box search for ``query`` and return the results.

    ``content_types`` is forwarded unchanged to ``client.search().query``.
    """
    search = client.search()
    results = search.query(query=query, content_types=content_types)
    return results

# Restrict matching to item names by passing "name" as the only content type.
search_results = simple_search("ananas", content_types=["name"])
print_search_results(search_results)

By default, the search looks for matches in the name, description, tags, comments, and the first 10k bytes of the file. Limiting the search to look only in the name should give you better results.

Best regards

Like

Start searching and downloading

Sign up

Login to the community

Scanning file for viruses.

This file cannot be downloaded