Terjemahan disediakan oleh mesin penerjemah. Jika konten terjemahan yang diberikan bertentangan dengan versi bahasa Inggris aslinya, utamakan versi bahasa Inggris.
Prasyarat
-
-
-
-
JSON
-
-
-
#set up a virtual environment python3 -m venv ${PWD}/venv source venv/bin/activate -
git clone https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git git clone --recursive https://github.com/aws/sagemaker-hyperpod-recipes.git cd sagemaker-hyperpod-recipes pip3 install -r requirements.txt -
REGION="<region>" IMAGE="658645717510.dkr.ecr.${REGION}.amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121" aws ecr get-login-password --region ${REGION} | docker login --username AWS --password-stdin 658645717510.dkr.ecr.${REGION}.amazonaws.com enroot import -o $PWD/smdistributed-modelparallel.sqsh dockerd://${IMAGE} mv $PWD/smdistributed-modelparallel.sqsh "/fsx/<any-path-in-the-shared-filesystem>" -
container: /fsx/path/to/your/smdistributed-modelparallel.sqsh
-
recipes.model.hf_access_token=<your_hf_token>
#!/bin/bash IMAGE="${YOUR_IMAGE}" SAGEMAKER_TRAINING_LAUNCHER_DIR="${SAGEMAKER_TRAINING_LAUNCHER_DIR:-${PWD}}" TRAIN_DIR="${YOUR_TRAIN_DIR}" # Location of training dataset VAL_DIR="${YOUR_VAL_DIR}" # Location of validation dataset # experiment ouput directory EXP_DIR="${YOUR_EXP_DIR}" HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf_llama3_8b" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ container="${IMAGE}" \ +cluster.container_mounts.0="/fsx:/fsx"
bash launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh