Download large data sets

This section explains how to efficiently retrieve data files for all subjects in a Moveshelf project using the Moveshelf API. Two things speed up execution in this use case:

- Reduce the number of API calls needed to retrieve the data
- Download the data in parallel

Prerequisites

Before implementing this example, ensure that your processing script includes all necessary setup steps. In particular, you should have:

- Configured the Moveshelf API (version >= 1.4.1)
- Extended the Moveshelf API with a project-level query to retrieve additional data

Implementation

To retrieve the additional data files for all subjects in a project, you can use the following script:

import os, sys, json
parent_folder = os.path.dirname(os.path.dirname(__file__))
sys.path.append(parent_folder)
from moveshelf_api import util
import requests
from api.api import MoveshelfApiCustomized
from concurrent.futures import ThreadPoolExecutor

# Use a single requests.Session for connection pooling; download_with_session
# passes this session to every download it performs.
# NOTE(review): the session is used from several worker threads further down —
# confirm that sharing one Session across threads is acceptable here.
requests_session = requests.Session()

def download_with_session(url):
    """Fetch and parse the JSON file at ``url`` through the shared session.

    Single-argument adapter around ``download_json_file`` that always hands
    it the module-level ``requests_session``, so it can be used directly as
    the callable of ``executor.map``.
    """
    shared_session = requests_session
    return download_json_file(url, session=shared_session)

def download_json_file(url, session=None, timeout=30):
    """Download ``url`` and return its body parsed as JSON.

    Args:
        url: Address of the JSON file to fetch.
        session: Optional object exposing a ``requests``-style ``get`` method
            (for example a ``requests.Session``). When ``None``, the
            module-level ``requests.get`` is used instead.
        timeout: Value forwarded to ``get`` as its ``timeout`` argument so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The decoded JSON value, or ``None`` when the request, the HTTP status
        check, or the JSON parsing failed. The failure is printed.
    """
    try:
        fetch = session.get if session is not None else requests.get
        response = fetch(url, timeout=timeout)
        # Reject 4xx/5xx answers: without this an error response carrying a
        # JSON body would be returned as if it were the requested file.
        response.raise_for_status()
        return json.loads(response.content.decode())
    except Exception as e:
        # Deliberately broad: one failed file must not abort the whole batch;
        # the caller sees None for this URL.
        print(f"Failed to download or parse {url}: {e}")
        return None
   
## Setup the API
# Load config: the file is expected next to the parent folder of this script
personal_config = os.path.join(parent_folder, "mvshlf-config.json")
if not os.path.isfile(personal_config):
    raise FileNotFoundError(
        f"Configuration file '{personal_config}' is missing.\n"
        "Ensure the file exists with the correct name and path."
    )

# JSON text is UTF-8 by specification; decode it explicitly instead of
# relying on the platform's default encoding.
with open(personal_config, "r", encoding="utf-8") as config_file:
    data = json.load(config_file)

# The config supplies the API key file name (relative to parent_folder)
# and the API endpoint URL.
api = MoveshelfApiCustomized(
    api_key_file=os.path.join(parent_folder, data["apiKeyFileName"]),
    api_url=data["apiUrl"],
)

## Get available projects
projects = api.getUserProjects()
# An empty project list simply yields an empty name list; no extra filter needed
projectNames = [project["name"] for project in projects]

my_project = "<organizationName/projectName>"  # e.g. support/demoProject
if my_project not in projectNames:
    # Same exception type list.index would raise, but with an actionable message
    raise ValueError(
        f"Project '{my_project}' was not found. "
        f"Available projects: {projectNames}"
    )
idx_my_project = projectNames.index(my_project)
my_project_id = projects[idx_my_project]["id"]
fileExtensionToDownload = '.json'  # Only download json files

# Project-level query: returns the subjects of the project; the code below
# reads their "sessions" -> "clips" -> "additionalData" entries.
all_subject_details = api.getProjectSubjectsWithAdditionalData(my_project_id)

## Collect the download URL and target path of every matching additional-data file
# URLs[i] and file_paths[i] always describe the same file.
URLs = []
file_paths = []
for subject_details in all_subject_details:
    for session in subject_details.get("sessions", []):
        for clip in session.get("clips", []):
            for ad in clip.get("additionalData", []):
                download_uri = ad["originalDataDownloadUri"]
                # Skip every file whose download URI lacks the wanted extension
                if not download_uri.endswith(fileExtensionToDownload):
                    continue
                URLs.append(download_uri)
                file_paths.append(
                    f'{clip["projectPath"]}{clip["title"]}/{ad["originalFileName"]}'
                )

# Fetch all collected files concurrently with five worker threads.
# executor.map yields results in the order of URLs, so additional_data[i]
# belongs to URLs[i]; a failed download shows up as None.
with ThreadPoolExecutor(max_workers=5) as executor:
    download_results = executor.map(download_with_session, URLs)
    additional_data = list(download_results)