Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions architecture/build.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,23 @@ the macOS user's shared home directory.
Local image work should use `mise` tasks rather than direct Docker commands so
the same staging and tagging assumptions are used locally and in CI.

Container-engine selection is centralized in `tasks/scripts/container-engine.sh`.
`CONTAINER_ENGINE=docker|podman` is the only explicit override. Docker- and
Podman-backed e2e wrappers validate that override against their lane, set
`OPENSHELL_E2E_DRIVER`, and reject the removed
`OPENSHELL_E2E_CONTAINER_ENGINE` selector so build helpers and Rust e2e support
containers use the same engine. When no explicit override is present, an e2e
driver requirement wins, then a local-cluster requirement, then host
auto-detection.

Local Kubernetes image workflows opt into cluster-aware selection with
`CONTAINER_ENGINE_TARGET=local-k8s-cluster`. The hint is intentionally scoped to
Skaffold-style `push: false` builds where the image must land in the engine
backing the active local cluster: `k3d-*` contexts require Docker, `kind-*`
contexts use Docker unless `KIND_EXPERIMENTAL_PROVIDER=podman`, and unknown
contexts require an explicit `CONTAINER_ENGINE`. Other image builds do not infer
from kube context.

## CI and E2E

Required checks run on GitHub Actions. Workflows that use NVIDIA self-hosted runners trigger from copy-pr-bot mirror branches, so trusted PRs are mirrored into `pull-request/<N>` branches before those workflows run.
Expand Down
4 changes: 2 additions & 2 deletions deploy/helm/openshell/skaffold.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ build:
context: ../../..
custom:
buildCommand: |
CONTAINER_ENGINE=docker \
CONTAINER_ENGINE_TARGET=local-k8s-cluster \
IMAGE_NAME="${IMAGE%:*}" \
IMAGE_TAG="${IMAGE##*:}" \
tasks/scripts/docker-build-image.sh gateway
Expand All @@ -43,7 +43,7 @@ build:
context: ../../..
custom:
buildCommand: |
CONTAINER_ENGINE=docker \
CONTAINER_ENGINE_TARGET=local-k8s-cluster \
IMAGE_NAME="${IMAGE%:*}" \
IMAGE_TAG="${IMAGE##*:}" \
tasks/scripts/docker-build-image.sh supervisor
Expand Down
134 changes: 120 additions & 14 deletions e2e/rust/src/harness/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ use tokio::time::{interval, timeout};

use super::port::find_free_port;

const DEFAULT_TEST_SERVER_IMAGE: &str =
"ghcr.io/nvidia/openshell-community/sandboxes/base:latest";
const DEFAULT_TEST_SERVER_IMAGE: &str = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest";

#[must_use]
pub fn e2e_driver() -> Option<String> {
Expand All @@ -37,18 +36,16 @@ pub struct ContainerEngine {
}

impl ContainerEngine {
#[must_use]
pub fn from_env() -> Self {
let binary = std::env::var("OPENSHELL_E2E_CONTAINER_ENGINE")
.ok()
.filter(|value| !value.trim().is_empty())
.or_else(|| match e2e_driver().as_deref() {
Some("podman") => Some("podman".to_string()),
_ => Some("docker".to_string()),
})
.expect("container engine fallback should be set");
pub fn from_env() -> Result<Self, String> {
let binary = resolve_container_engine(
std::env::var("OPENSHELL_E2E_CONTAINER_ENGINE")
.ok()
.as_deref(),
std::env::var("CONTAINER_ENGINE").ok().as_deref(),
e2e_driver().as_deref(),
)?;

Self { binary }
Ok(Self { binary })
}

#[must_use]
Expand Down Expand Up @@ -87,7 +84,7 @@ pub struct ContainerHttpServer {

impl ContainerHttpServer {
pub async fn start_python(alias: &str, script: &str) -> Result<Self, String> {
let engine = ContainerEngine::from_env();
let engine = ContainerEngine::from_env()?;
let host_port = find_free_port();
let network = e2e_network_name();
let host = network.as_ref().map_or_else(
Expand Down Expand Up @@ -179,6 +176,60 @@ impl ContainerHttpServer {
}
}

fn normalized_env(value: Option<&str>) -> Option<String> {
value
.map(str::trim)
.filter(|value| !value.is_empty())
.map(str::to_ascii_lowercase)
}

fn validate_container_engine(name: &str, value: String) -> Result<String, String> {
match value.as_str() {
"docker" | "podman" => Ok(value),
_ => Err(format!(
"{name}={value} is invalid; expected docker or podman"
)),
}
}

fn required_engine_from_driver(driver: Option<&str>) -> Option<String> {
match normalized_env(driver).as_deref() {
Some("docker") => Some("docker".to_string()),
Some("podman") => Some("podman".to_string()),
_ => None,
}
}

fn resolve_container_engine(
legacy_selector: Option<&str>,
explicit_engine: Option<&str>,
e2e_driver: Option<&str>,
) -> Result<String, String> {
if normalized_env(legacy_selector).is_some() {
return Err(
"OPENSHELL_E2E_CONTAINER_ENGINE is no longer supported; set CONTAINER_ENGINE=docker|podman instead"
.to_string(),
);
}

let explicit_engine = normalized_env(explicit_engine)
.map(|value| validate_container_engine("CONTAINER_ENGINE", value))
.transpose()?;
let required_engine = required_engine_from_driver(e2e_driver);

if let (Some(explicit), Some(required)) = (&explicit_engine, &required_engine)
&& explicit != required
{
return Err(format!(
"CONTAINER_ENGINE={explicit} conflicts with OPENSHELL_E2E_DRIVER={required}; use CONTAINER_ENGINE={required} or unset CONTAINER_ENGINE"
));
}

Ok(explicit_engine
.or(required_engine)
.unwrap_or_else(|| "docker".to_string()))
}

impl Drop for ContainerHttpServer {
fn drop(&mut self) {
let _ = self
Expand All @@ -188,3 +239,58 @@ impl Drop for ContainerHttpServer {
.output();
}
}

#[cfg(test)]
mod tests {
use super::resolve_container_engine;

#[test]
fn defaults_to_docker() {
assert_eq!(
resolve_container_engine(None, None, None).as_deref(),
Ok("docker")
);
}

#[test]
fn explicit_container_engine_wins() {
assert_eq!(
resolve_container_engine(None, Some("podman"), None).as_deref(),
Ok("podman")
);
}

#[test]
fn driver_selects_container_engine_without_explicit_engine() {
assert_eq!(
resolve_container_engine(None, None, Some("podman")).as_deref(),
Ok("podman")
);
}

#[test]
fn rejects_removed_selector() {
let err = resolve_container_engine(Some("podman"), None, None).unwrap_err();
assert!(err.contains("OPENSHELL_E2E_CONTAINER_ENGINE"));
}

#[test]
fn rejects_invalid_explicit_engine() {
let err = resolve_container_engine(None, Some("containerd"), None).unwrap_err();
assert!(err.contains("CONTAINER_ENGINE=containerd"));
}

#[test]
fn rejects_explicit_driver_conflict() {
let err = resolve_container_engine(None, Some("docker"), Some("podman")).unwrap_err();
assert!(err.contains("CONTAINER_ENGINE=docker conflicts"));
}

#[test]
fn ignores_non_container_drivers() {
assert_eq!(
resolve_container_engine(None, None, Some("kubernetes")).as_deref(),
Ok("docker")
);
}
}
16 changes: 10 additions & 6 deletions e2e/rust/tests/gpu_device_selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,12 @@ fn docker_info_reports_wsl2(info: &Value) -> bool {
.any(text_reports_wsl2)
}

fn container_engine() -> ContainerEngine {
ContainerEngine::from_env().unwrap_or_else(|err| panic!("resolve e2e container engine: {err}"))
}

fn all_gpu_default_allowed() -> bool {
let engine = ContainerEngine::from_env();
let engine = container_engine();
if uses_local_podman_inventory(&engine) {
return local_podman_all_gpu_default_supported();
}
Expand All @@ -177,7 +181,7 @@ fn all_gpu_default_allowed() -> bool {
}

fn discovered_cdi_gpu_device_ids() -> Vec<String> {
let engine = ContainerEngine::from_env();
let engine = container_engine();
if uses_local_podman_inventory(&engine) {
let device_ids = local_podman_cdi_gpu_device_ids();
assert!(
Expand Down Expand Up @@ -216,8 +220,7 @@ fn default_cdi_gpu_device_id(device_ids: &[String], allow_all_devices: bool) ->
let mut named = device_ids
.iter()
.filter(|device_id| {
device_id.starts_with(CDI_GPU_DEVICE_PREFIX)
&& device_id.as_str() != CDI_GPU_DEVICE_ALL
device_id.starts_with(CDI_GPU_DEVICE_PREFIX) && device_id.as_str() != CDI_GPU_DEVICE_ALL
})
.cloned()
.collect::<Vec<_>>();
Expand Down Expand Up @@ -256,7 +259,7 @@ fn cdi_devices_driver_config_json(device_ids: &[&str]) -> String {
}

fn runtime_gpu_lines(gpu_device: &str) -> Vec<String> {
let engine = ContainerEngine::from_env();
let engine = container_engine();
let image = gpu_probe_image();
let output = engine
.command()
Expand Down Expand Up @@ -339,7 +342,8 @@ async fn sandbox_create_output(args: &[&str]) -> String {
#[tokio::test]
async fn gpu_request_without_device_matches_plain_default_gpu_container() {
let device_ids = discovered_cdi_gpu_device_ids();
let Some(default_gpu_device) = default_cdi_gpu_device_id(&device_ids, all_gpu_default_allowed())
let Some(default_gpu_device) =
default_cdi_gpu_device_id(&device_ids, all_gpu_default_allowed())
else {
eprintln!("skipping default GPU request test because no selectable GPU ID was discovered");
return;
Expand Down
18 changes: 12 additions & 6 deletions e2e/rust/tests/local_driver_token_restart.rs
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,7 @@ async fn restart_vm_sandbox(gateway: &ManagedGateway, sandbox_name: &str) -> Res
wait_for_healthy(Duration::from_secs(120)).await
}

async fn wait_for_driver_reconnect(
driver: LocalDriver,
sandbox_name: &str,
) -> Result<(), String> {
async fn wait_for_driver_reconnect(driver: LocalDriver, sandbox_name: &str) -> Result<(), String> {
match driver {
LocalDriver::Docker | LocalDriver::Podman => {
wait_for_sandbox_exec_contains(
Expand All @@ -321,7 +318,9 @@ async fn wait_for_driver_reconnect(
#[tokio::test]
async fn local_driver_sandbox_restarts_with_non_expiring_bootstrap_jwt() {
let Some(driver) = LocalDriver::from_env() else {
eprintln!("Skipping local-driver token restart test: e2e driver is not Docker, Podman, or VM");
eprintln!(
"Skipping local-driver token restart test: e2e driver is not Docker, Podman, or VM"
);
return;
};
let namespace = if driver.is_container() {
Expand All @@ -338,7 +337,14 @@ async fn local_driver_sandbox_restarts_with_non_expiring_bootstrap_jwt() {
} else {
None
};
let engine = driver.is_container().then(ContainerEngine::from_env);
let engine = if driver.is_container() {
Some(
ContainerEngine::from_env()
.unwrap_or_else(|err| panic!("resolve e2e container engine: {err}")),
)
} else {
None
};
let gateway = if driver == LocalDriver::Vm {
let Some(gateway) = ManagedGateway::from_env().expect("load managed e2e gateway metadata")
else {
Expand Down
34 changes: 31 additions & 3 deletions e2e/with-docker-gateway.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,36 @@ source "${ROOT}/e2e/support/gateway-common.sh"

e2e_preserve_mise_dirs

require_container_engine_lane() {
local lane=$1
local label=$2
local selected_engine selected_driver

if [ -n "${OPENSHELL_E2E_CONTAINER_ENGINE:-}" ]; then
echo "ERROR: OPENSHELL_E2E_CONTAINER_ENGINE is no longer supported." >&2
echo " Set CONTAINER_ENGINE=${lane} for the ${label} e2e lane, or unset it." >&2
exit 2
fi
selected_engine="$(printf '%s' "${CONTAINER_ENGINE:-}" | tr '[:upper:]' '[:lower:]')"
selected_driver="$(printf '%s' "${OPENSHELL_E2E_DRIVER:-}" | tr '[:upper:]' '[:lower:]')"

if [ -n "${selected_engine}" ] && [ "${selected_engine}" != "${lane}" ]; then
echo "ERROR: CONTAINER_ENGINE=${CONTAINER_ENGINE} conflicts with the ${label} e2e lane." >&2
echo " Set CONTAINER_ENGINE=${lane} or unset CONTAINER_ENGINE." >&2
exit 2
fi
if [ -n "${selected_driver}" ] && [ "${selected_driver}" != "${lane}" ]; then
echo "ERROR: OPENSHELL_E2E_DRIVER=${OPENSHELL_E2E_DRIVER} conflicts with the ${label} e2e lane." >&2
echo " Set OPENSHELL_E2E_DRIVER=${lane} or unset OPENSHELL_E2E_DRIVER." >&2
exit 2
fi

export CONTAINER_ENGINE="${lane}"
export OPENSHELL_E2E_DRIVER="${lane}"
}

require_container_engine_lane docker Docker

github_actions_host_docker_tmpdir() {
if [ "${GITHUB_ACTIONS:-}" != "true" ] \
|| [ ! -S /var/run/docker.sock ] \
Expand Down Expand Up @@ -234,8 +264,7 @@ if [ -n "${OPENSHELL_GATEWAY_ENDPOINT:-}" ]; then
"$(e2e_endpoint_port "${OPENSHELL_GATEWAY_ENDPOINT}")"
export OPENSHELL_GATEWAY="${GATEWAY_NAME}"
export OPENSHELL_PROVISION_TIMEOUT="${OPENSHELL_PROVISION_TIMEOUT:-180}"
export OPENSHELL_E2E_DRIVER="${OPENSHELL_E2E_DRIVER:-docker}"
export OPENSHELL_E2E_CONTAINER_ENGINE="${OPENSHELL_E2E_CONTAINER_ENGINE:-docker}"
export OPENSHELL_E2E_DRIVER="docker"

echo "Using existing e2e gateway endpoint: ${OPENSHELL_GATEWAY_ENDPOINT}"
"$@"
Expand Down Expand Up @@ -453,7 +482,6 @@ export OPENSHELL_E2E_DOCKER_NETWORK_NAME="${DOCKER_NETWORK_NAME}"
export OPENSHELL_E2E_NETWORK_NAME="${DOCKER_NETWORK_NAME}"
export OPENSHELL_E2E_SANDBOX_NAMESPACE="${E2E_NAMESPACE}"
export OPENSHELL_E2E_DRIVER="docker"
export OPENSHELL_E2E_CONTAINER_ENGINE="docker"
if connect_current_container_to_docker_network "${DOCKER_NETWORK_NAME}"; then
echo "Connected CI job container to Docker network ${DOCKER_NETWORK_NAME} (${GATEWAY_HOST_ALIAS_IP})."
else
Expand Down
Loading
Loading