Skip to main content

I am using a Python script to search for files in a specific folder; the folder contains over 25K files.

I have an Excel file with the filenames that the script needs to search for.

Part of the script looks like this:


# Start searching and downloading


# For every requested name: search the Box folder, keep only exact name
# matches, download the first one, and record the outcome in the 'Status'
# column of df. The names client, df, folder_id, download_path, found_files,
# failed_downloads and file_names_to_download come from the part of the
# script that is not shown here.
for file_name in file_names_to_download:
    print(f"Searching for {file_name} ...")

    # Spreadsheet cells are not guaranteed to be strings; search and compare
    # on the text form, exactly as the original f"{file_name}" did.
    target_name = f"{file_name}"
    # Rows whose first column holds this name; they receive the status update.
    row_mask = df.iloc[:, 0] == file_name

    search_results = client.search().query(
        query=target_name,
        limit=10,
        ancestor_folder_ids=[folder_id],
    )

    # The search is not an exact-name lookup, so filter the hits ourselves.
    # NOTE(review): the SDK result looks like a lazily paged iterator, so this
    # comprehension may walk every page of hits rather than just 10 items --
    # a likely cause of the slowness; confirm against the boxsdk docs.
    exact_matches = [item for item in search_results if item.name == target_name]

    if not exact_matches:
        print(f"No exact match found for {file_name}.")
        df.loc[row_mask, 'Status'] = 'Not Found'
        continue

    if len(exact_matches) > 1:
        print(f"More than one exact match found for {file_name}. Using the first match.")

    item_to_download = exact_matches[0]
    print(f"Found {file_name}. Downloading ...")

    found_files.append(item_to_download.name)
    item_download_path = os.path.join(download_path, item_to_download.name)

    try:
        with open(item_download_path, 'wb') as f:
            item_to_download.download_to(f)
        print(f"Download completed for {item_to_download.name}.")
        df.loc[row_mask, 'Status'] = 'Downloaded'
    except Exception as e:
        # Best effort: one failed download must not stop the whole batch.
        print(f"Failed to download {item_to_download.name}: {e}")
        failed_downloads.append(item_to_download.name)
        df.loc[row_mask, 'Status'] = 'Failed'

I have noticed that it takes a very long time for the API to return a result. Is there a way to do a faster search?

Is something wrong with my code?

My current code seems to work ok… But if you see room for improvement, please let me know.


import os

import pandas as pd

from boxsdk import OAuth2, Client

import dotenv


def store_tokens(access_token, refresh_token):
    """Callback function to store new tokens.

    Handed to ``OAuth2`` as its ``store_tokens`` callback. Despite the name,
    nothing is persisted: both tokens are only echoed to stdout.

    Args:
        access_token: The newly issued access token.
        refresh_token: The newly issued refresh token.
    """
    # NOTE(review): these are live credentials written to stdout in clear
    # text; consider persisting them securely instead of printing them.
    print(f"New access token: {access_token}")
    print(f"New refresh token: {refresh_token}")


def authenticate_box_client():
    """Authenticate and return Box client.

    Calls ``dotenv.load_dotenv()`` and then reads ``BOX_CLIENT_ID``,
    ``BOX_CLIENT_SECRET`` and ``BOX_ACCESS_TOKEN`` from the environment.
    A variable that is not set is passed on as ``None``.

    Returns:
        A ``Client`` wrapping an ``OAuth2`` object built from those values,
        with ``store_tokens`` registered as the token callback.
    """
    dotenv.load_dotenv()

    client_id = os.getenv('BOX_CLIENT_ID')
    client_secret = os.getenv('BOX_CLIENT_SECRET')
    access_token = os.getenv('BOX_ACCESS_TOKEN')

    auth = OAuth2(
        client_id=client_id,
        client_secret=client_secret,
        access_token=access_token,
        store_tokens=store_tokens,
    )
    return Client(auth)

def find_and_download_file(client, filename, folder_id, download_path, df, index,
                           max_results=2):
    """Find and download file from Box.

    Searches ``folder_id`` for PDF files matching ``filename``, inspects the
    first few hits for an exact name match, downloads that match and records
    the outcome in ``df.at[index, 'Status']``.

    Args:
        client: Box client exposing ``search().query(...)``.
        filename: Exact file name to look for.
        folder_id: ID of the folder the search is restricted to.
        download_path: Local directory handed to ``download_file``.
        df: DataFrame whose ``'Status'`` cell for ``index`` is updated.
        index: Row label in ``df`` belonging to ``filename``.
        max_results: How many search hits to inspect before giving up. At
            least one hit is always inspected. The default of 2 is the cap
            the original code applied.

    Raises:
        Exception: Whatever ``download_file`` raises is re-raised after the
            row has been marked ``'Failed'``.
    """
    search_results = client.search().query(
        query=filename,
        limit=200,
        ancestor_folder_ids=[folder_id],
        file_extensions=["pdf"],
        type="file",
    )

    match = None
    # Stop early instead of exhausting the result iterator: only the first
    # max_results hits are compared against the requested name.
    for position, item in enumerate(search_results, start=1):
        print(f" - {item.name}")
        if item.name == filename:
            match = item
            print(f"Found file {item.name}.")
            break
        if position >= max_results:
            break

    if match is None:
        print(f"File {filename} not found. Updating DataFrame and moving to next file.")
        df.at[index, 'Status'] = 'Not Found'
        return

    try:
        download_file(match, download_path)
    except Exception:
        # Record the failure, then let the caller decide how to react.
        df.at[index, 'Status'] = 'Failed'
        raise
    # Mark the row only once the download has actually completed.
    df.at[index, 'Status'] = 'Downloaded'

def download_file(file_to_download, download_path):
    """Download file from Box to local system.

    The file is written to ``download_path`` under the item's own name.

    Args:
        file_to_download: Object exposing ``name`` and ``download_to(stream)``.
        download_path: Existing local directory to write into.

    Raises:
        Exception: Any error raised while opening the target or downloading
            is re-raised after the partial local file has been removed.
    """
    print(f"Found file {file_to_download.name}. Downloading ...")

    item_download_path = os.path.join(download_path, file_to_download.name)

    try:
        with open(item_download_path, 'wb') as f:
            file_to_download.download_to(f)
    except Exception:
        # A truncated file carrying the real name would look like a finished
        # download on a later run, so remove it before propagating the error.
        try:
            os.remove(item_download_path)
        except OSError:
            pass  # nothing was created, or it cannot be removed; keep the original error
        raise

    print(f"Download completed for {file_to_download.name}.")


def main():
    """Download every file listed in ``filenames.xlsx`` from one Box folder.

    Reads the ``'Filename'`` column of the workbook, calls
    ``find_and_download_file`` for each row, and writes the DataFrame back
    to ``filenames.xlsx``. Any exception is caught and reported on stdout
    together with the file name and folder ID being processed.
    """
    # Bound before the try block so the error report below can never fail
    # with a NameError when the failure happens before the loop starts.
    filename = None
    folder_id = 'myfolderID'

    try:
        client = authenticate_box_client()
        df = pd.read_excel('filenames.xlsx')
        download_path = 'download_folder'
        os.makedirs(download_path, exist_ok=True)

        try:
            for index, row in df.iterrows():
                filename = row['Filename']
                print(f"Searching for {filename} ...")
                find_and_download_file(client, filename, folder_id, download_path, df, index)
        finally:
            # Write the statuses gathered so far even if the loop aborted,
            # so progress on a long batch is not lost.
            df.to_excel('filenames.xlsx', index=False)

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        if hasattr(e, 'context_info'):
            print("Context Info:", e.context_info)
        print("Debug Information:")
        print(f"Filename: {filename}")
        print(f"Folder ID: {folder_id}")

# Run the batch only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()


I have adapted my script and it looks like it is working now!


Hi @edztra , I guess I was too late in answering your other question.


My only suggestion for your find_and_download_file function, on top of what you already have, is to limit the search to only the name of the file. You should get even fewer false positives.

Something like this:


def simple_search(query: str, content_types: Iterable[str] = None) -> Iterable["Item"]:
    """Search by query in any Box content.

    Args:
        query: Text to search for.
        content_types: Parts of an item the search should look at, e.g.
            ``["name"]``. ``None`` is forwarded to the SDK unchanged.

    Returns:
        Whatever ``client.search().query(...)`` returns; ``client`` is
        expected to exist at module level.
    """
    return client.search().query(query=query, content_types=content_types)


# Search only in name
search_results = simple_search(
    "ananas",
    content_types=[
        "name",
    ],
)
print_search_results(search_results)

By default, the search looks for matches in the name, description, tags, comments, and the first 10k bytes of the file. Limiting the search to the name only should give you better results.


Best regards


Reply