diff --git a/.env_sample b/.env_sample index 7311c5e95d5d3dff4109066e8322f494d898a2f7..cc9ac16dc416cef2bd8498f1f4de8a3e5acd3015 100644 --- a/.env_sample +++ b/.env_sample @@ -42,6 +42,12 @@ GRAFANA_POSTGRES_USER_PASSWORD="" # Definition: Password for the Grafana post #SPHN_CONNECTOR_MEMORY="" # Definition: RAM (Random-Access Memory) of the machine available for the SPHN Connector. Unit: GB. Defaults to psutil.virtual_memory().available. Type: integer positive number (if decimal round down). Example: 32 #SPHN_CONNECTOR_CPUS="" # Definition: CPU (Central Processing Units) number of the machine available for the SPHN Connector. Defaults to psutil.cpu_count(logical=True). Type: integer positive number. Example: 8 #MINIO_COMPRESSION="" # Definition: Enables compression in minio. Can be either set to "ON" or "OFF". Default is "OFF". WARNING: Enabling compression will decrease performance. + +# Einstein configuration parameters #EINSTEIN="" # Definition: Activate patient data sharing with Einstein Relativity Tool. Default is "False". Type: boolean. Example: "True" #EINSTEIN_API_USER="" # Definition: API username of Einstein tool to trigger Einstein endpoints. -#EINSTEIN_API_PASSWORD="" # Definition: API user's password of Einstein tool to trigger Einstein endpoints. \ No newline at end of file +#EINSTEIN_API_PASSWORD="" # Definition: API user's password of Einstein tool to trigger Einstein endpoints. +#USE_EXTERNAL_S3="" # Definition: Configure external S3 object storage and do not use the SPHN Connector internal one. Default is "False". Type: boolean. Example: "True" +#EXTERNAL_S3_ACCESS_KEY="" # Definition: S3 object storage access key for external S3 instance. Example: external_s3_admin +#EXTERNAL_S3_SECRET_KEY="" # Definition: S3 object storage secret key for external S3 instance. 
def create_einstein_bucket(config: "Config"):
    """Create the 'einstein' bucket if Einstein sharing is enabled and it is missing.

    The function is best-effort: every failure is reported on stdout and
    swallowed, so the caller is never interrupted by an exception from here.

    Args:
        config (Config): Config object. ``config.einstein`` is read first; only
            when it is truthy is ``config.s3_client`` used (``head_bucket`` and,
            if the bucket is missing, ``create_bucket``).
    """
    bucket_name = 'einstein'
    try:
        if not config.einstein:
            return
        try:
            # Existence probe only; the response itself carries nothing we need.
            config.s3_client.head_bucket(Bucket=bucket_name)
        except ClientError as e:
            # Read the code defensively so a malformed error payload is still
            # reported through the branch below instead of raising KeyError.
            error_code = str(e.response.get('Error', {}).get('Code'))
            # HEAD responses have no body, so a missing bucket is normally
            # reported as the bare status '404'; 'NoSuchBucket' is the
            # equivalent S3 error name and is accepted as well.
            if error_code in ('404', 'NoSuchBucket'):
                config.s3_client.create_bucket(Bucket=bucket_name)
            else:
                error_message = f"Connection to external object storage returned code '{error_code}'. Exception raised:\n{traceback.format_exc(chain=False)}"
                print(error_message)
    except Exception:
        # Top-level boundary: report and continue instead of propagating.
        print(f"Exception raised when checking existence of bucket:\n{traceback.format_exc(chain=False)}")
Try to remove the volumes and restart the application. Exception raised:\n{traceback.format_exc(chain=False)}" print(error_message) + + create_einstein_bucket(config=database.config) yield app = FastAPI(docs_url=None, redoc_url=None, lifespan=lifespan) diff --git a/api/app/requirements.txt b/api/app/requirements.txt index c4d051475fa9a9ba7f430862cac7f6126e8216b1..fb290120ad5358618d35025766bfc398419b8f51 100644 --- a/api/app/requirements.txt +++ b/api/app/requirements.txt @@ -18,4 +18,5 @@ lightrdf==0.3.1 openpyxl==3.1.2 anyio==3.7.1 psutil==5.9.6 -pyoxigraph==0.3.20 \ No newline at end of file +pyoxigraph==0.3.20 +boto3==1.34.80 \ No newline at end of file diff --git a/connector/app/requirements.txt b/connector/app/requirements.txt index 83cad8bc12eca27c840d9a0ab9df21a2568ebcbc..0a9ae5580ea235270389195e58521f542d650ad9 100644 --- a/connector/app/requirements.txt +++ b/connector/app/requirements.txt @@ -10,4 +10,5 @@ aiohttp==3.9.2 Werkzeug==2.3.8 cryptography==42.0.4 pyoxigraph==0.3.20 -pydantic==1.10.12 \ No newline at end of file +pydantic==1.10.12 +boto3==1.34.80 \ No newline at end of file diff --git a/docker-compose-build.yaml b/docker-compose-build.yaml index 44f4ec2bf9ac8f2c5daa2fc07ddb021c2aebb37b..362ba200e480a285ba26c718061e869b6d363bc4 100644 --- a/docker-compose-build.yaml +++ b/docker-compose-build.yaml @@ -36,6 +36,8 @@ services: depends_on: db: condition: service_healthy + extra_hosts: + - "host.docker.internal:host-gateway" minio: container_name: minio diff --git a/docker-compose.yaml b/docker-compose.yaml index 438bfbac3c547dfe4e6760e8c46f3ec585fd1463..517624ccadfe05b238d4eb57120d72ea71f782e0 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -30,6 +30,8 @@ services: depends_on: db: condition: service_healthy + extra_hosts: + - "host.docker.internal:host-gateway" minio: container_name: minio diff --git a/lib/config.py b/lib/config.py index 796576ff2fa4dfc111d20a43d2f00e4d7e34e4f3..af39a1d4d99b9ab2d3b692cf749262d11bf4480b 100644 --- 
a/lib/config.py +++ b/lib/config.py @@ -15,6 +15,8 @@ from minio import Minio import json import psutil +import boto3 +from botocore.client import Config as botocoreConfig class Config(object): @@ -81,6 +83,23 @@ class Config(object): self.einstein_api_user = config_data.get('einstein', {}).get('api_user') self.einstein_api_password = config_data.get('einstein', {}).get('api_password') + # External object storage + if self.einstein: + use_external_s3 = str(config_data.get('external_s3', {}).get('activated', True)).upper() == "TRUE" + if use_external_s3: + self.s3_access_key = config_data.get("external_s3", {}).get("access_key") + self.s3_secret_key = config_data.get("external_s3", {}).get("secret_key") + self.s3_endpoint = config_data.get("external_s3", {}).get("URL") + if not self.s3_endpoint.startswith('https://'): + self.s3_endpoint = 'https://' + self.s3_endpoint + else: + self.s3_access_key = self.minio_access_key + self.s3_secret_key = self.minio_secret_key + self.s3_endpoint = f"http://{self.endpoint}" + self.s3_client = boto3.client('s3', endpoint_url=self.s3_endpoint, aws_access_key_id=self.s3_access_key, + aws_secret_access_key=self.s3_secret_key, config=botocoreConfig(signature_version='s3v4'), + region_name='eu-central-1', verify='/data-transfer/reverse-proxy.crt') + #Parsing the input for the ACCESS_TOKEN_TIMEOUT setting provided in .env file # In case the user provides an invalid value (like negative integers) # we take the absolute value of the user input diff --git a/lib/database.py b/lib/database.py index 511a5c5efeeaf34b3a843d37774e0cde94fd8d9e..9a7ee77d960b63cb3c0f6fee856f0cee8db57196 100644 --- a/lib/database.py +++ b/lib/database.py @@ -1932,9 +1932,15 @@ class Database(object): self.config.minio_client.put_object(bucket_name=bucket, object_name=target_object_name, data=compressed_content, length=compressed_file_size) if self.config.einstein: - self.config.minio_client.copy_object(bucket_name='einstein', 
# Validate the external S3 settings when USE_EXTERNAL_S3 is enabled.
# The flag is matched case-insensitively because the Python configuration
# loader compares it with upper() == "TRUE"; a value such as "true" would
# otherwise enable external S3 at runtime without these checks having run.
case "${USE_EXTERNAL_S3}" in
    [Tt][Rr][Uu][Ee])

        # Endpoint of the external object storage is mandatory.
        if [ -z "${EXTERNAL_S3_URL}" ]; then
            missing_variable_warning "EXTERNAL_S3_URL"
            successfully_configured=false
        fi

        # Access key: must be present and free of invalid characters.
        if [ -z "${EXTERNAL_S3_ACCESS_KEY}" ]; then
            missing_variable_warning "EXTERNAL_S3_ACCESS_KEY"
            successfully_configured=false
        else
            check_credentials_for_invalid_chars "${EXTERNAL_S3_ACCESS_KEY}" "EXTERNAL_S3_ACCESS_KEY"
        fi

        # Secret key: must be present, pass the length check (12) and be
        # free of invalid characters.
        if [ -z "${EXTERNAL_S3_SECRET_KEY}" ]; then
            missing_variable_warning "EXTERNAL_S3_SECRET_KEY"
            successfully_configured=false
        else
            check_length "${EXTERNAL_S3_SECRET_KEY}" 12 "EXTERNAL_S3_SECRET_KEY"
            check_credentials_for_invalid_chars "${EXTERNAL_S3_SECRET_KEY}" "EXTERNAL_S3_SECRET_KEY"
        fi
        ;;
esac