From 63aec5a457cd78582cc4f5593bdb3dc6a8cf1fb9 Mon Sep 17 00:00:00 2001
From: WyattBlue <wyattblue@auto-editor.com>
Date: Tue, 23 Jun 2026 21:03:25 -0400
Subject: [PATCH] Support HW encoding via add_stream(hwaccel=...)

Previously PyAV only supported hardware decoding. Feeding software
frames to a hardware encoder (e.g. h264_vaapi) failed at avcodec_open2
with EINVAL because no hw_device_ctx/hw_frames_ctx was ever set up.

add_stream now accepts a hwaccel argument. For an encoder it attaches the
device context, lazily builds the hw_frames_ctx in open() once
width/height/pix_fmt are known, and uploads software frames to the device
in _prepare_frames_for_encode. HWAccel setup now also accepts encoders,
which advertise the HW_FRAMES_CTX method rather than HW_DEVICE_CTX.

fixes #2156
---
 CHANGELOG.rst             |   1 +
 av/codec/context.pxd      |   1 +
 av/codec/context.py       |  48 ++++++++++++++++++
 av/codec/hwaccel.py       |  15 ++++--
 av/codec/hwaccel.pyi      |   2 +-
 av/container/output.py    |  18 +++++--
 av/container/output.pyi   |   3 ++
 av/video/codeccontext.pxd |   1 +
 av/video/codeccontext.py  | 101 +++++++++++++++++++++++++++++---------
 include/avcodec.pxd       |   1 +
 include/avutil.pxd        |   2 +
 tests/test_encode.py      |  65 ++++++++++++++++++++++++
 12 files changed, 227 insertions(+), 31 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index b855a7fb5..85107acc3 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -34,6 +34,7 @@ v18.0.0 (next)
 Breaking:
 
 - Remove Python 3.10
+- Support HW encoding via a ``hwaccel`` parameter on ``OutputContainer.add_stream`` (e.g. ``h264_vaapi``, ``h264_nvenc``, ``h264_videotoolbox``); software frames passed to ``encode`` are uploaded to the device automatically, by :gh-user:`WyattBlue` (:issue:`2156`).
 
 Features:
 
diff --git a/av/codec/context.pxd b/av/codec/context.pxd
index cc12836d7..8108ecb1a 100644
--- a/av/codec/context.pxd
+++ b/av/codec/context.pxd
@@ -41,6 +41,7 @@ cdef class CodecContext:
     # TODO: Remove the `Packet` from `_setup_decoded_frame` (because flushing packets
     # are bogus). It should take all info it needs from the context and/or stream.
     cdef _prepare_and_time_rebase_frames_for_encode(self, Frame frame)
+    cdef void _setup_encode_hwframes(self)
     cdef list _prepare_frames_for_encode(self, Frame frame)
     cdef _setup_encoded_packet(self, Packet)
     cdef _setup_decoded_frame(self, Frame, Packet)
diff --git a/av/codec/context.py b/av/codec/context.py
index 6e10aaf9b..f842f80c1 100644
--- a/av/codec/context.py
+++ b/av/codec/context.py
@@ -246,6 +246,8 @@ def open(self, strict: cython.bint = True):
                 self.ptr.time_base.num = 1
                 self.ptr.time_base.den = lib.AV_TIME_BASE
 
+        self._setup_encode_hwframes()
+
         err_check(
             lib.avcodec_open2(self.ptr, self.codec.ptr, cython.address(options.ptr)),
             f'avcodec_open2("{self.codec.name}", {self.options})',
@@ -380,6 +382,52 @@ def _send_frame_and_recv(self, frame: Frame | None):
             yield packet
             packet = self._recv_packet()
 
+    @cython.cfunc
+    def _setup_encode_hwframes(self) -> cython.void:
+        # Build and attach the hardware frames context used for hardware encoding.
+        #
+        # The device context is attached when the codec context is constructed,
+        # but the frames context needs the final width/height/pixel format, which
+        # the user sets after add_stream(). open() therefore calls this just
+        # before avcodec_open2(). No-op for decoders or when no hwaccel is set.
+        if self.hwaccel_ctx is None or not self.is_encoder:
+            return
+        if self.ptr.hw_frames_ctx:
+            return  # Already set up.
+
+        hw_format: lib.AVPixelFormat = self.hwaccel_ctx.config.ptr.pix_fmt
+        sw_format: lib.AVPixelFormat = cython.cast(lib.AVPixelFormat, self.ptr.pix_fmt)
+
+        # At this point the codec context's pix_fmt is the *software* format the
+        # user feeds in. If it is the hardware format (or unset), default to NV12.
+        if sw_format == hw_format or sw_format == lib.AV_PIX_FMT_NONE:
+            sw_format = lib.av_get_pix_fmt(b"nv12")
+
+        frames_ref: cython.pointer[lib.AVBufferRef] = lib.av_hwframe_ctx_alloc(
+            self.hwaccel_ctx.ptr
+        )
+        if frames_ref == cython.NULL:
+            raise MemoryError("av_hwframe_ctx_alloc() failed")
+
+        try:
+            frames_ctx: cython.pointer[lib.AVHWFramesContext] = cython.cast(
+                cython.pointer[lib.AVHWFramesContext], frames_ref.data
+            )
+            frames_ctx.format = hw_format
+            frames_ctx.sw_format = sw_format
+            frames_ctx.width = self.ptr.width
+            frames_ctx.height = self.ptr.height
+            frames_ctx.initial_pool_size = 32
+            err_check(lib.av_hwframe_ctx_init(frames_ref))
+        except Exception:
+            lib.av_buffer_unref(cython.address(frames_ref))
+            raise
+
+        # Ownership of frames_ref transfers to the codec context.
+        self.ptr.hw_frames_ctx = frames_ref
+        self.ptr.sw_pix_fmt = sw_format
+        self.ptr.pix_fmt = hw_format
+
     @cython.cfunc
     def _prepare_frames_for_encode(self, frame: Frame | None) -> list:
         return [frame]
diff --git a/av/codec/hwaccel.py b/av/codec/hwaccel.py
index 69f742b29..392bd39ac 100644
--- a/av/codec/hwaccel.py
+++ b/av/codec/hwaccel.py
@@ -140,10 +140,17 @@ def __init__(
         self.ptr = cython.NULL
         self.config = None
 
-    def _initialize_hw_context(self, codec: Codec):
+    def _initialize_hw_context(self, codec: Codec, for_encoding: bool = False):
+        # Decoders advertise the device-context method, while encoders (e.g.
+        # h264_vaapi) advertise the frames-context method. Accept either one when
+        # setting up an encoder.
+        supported_methods: cython.int = lib.AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX
+        if for_encoding:
+            supported_methods |= lib.AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX
+
         config: HWConfig
         for config in codec.hardware_configs:
-            if not (config.ptr.methods & lib.AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX):
+            if not (config.ptr.methods & supported_methods):
                 continue
             if self._device_type and config.device_type != self._device_type:
                 continue
@@ -168,7 +175,7 @@ def _initialize_hw_context(self, codec: Codec):
             )
         )
 
-    def create(self, codec: Codec) -> HWAccel:
+    def create(self, codec: Codec, for_encoding: bool = False) -> HWAccel:
         """Create a new hardware accelerator context with the given codec"""
         if self.ptr:
             raise RuntimeError("Hardware context already initialized")
@@ -180,7 +187,7 @@ def create(self, codec: Codec) -> HWAccel:
             options=self.options,
             is_hw_owned=self.is_hw_owned,
         )
-        ret._initialize_hw_context(codec)
+        ret._initialize_hw_context(codec, for_encoding=for_encoding)
         return ret
 
     def __dealloc__(self):
diff --git a/av/codec/hwaccel.pyi b/av/codec/hwaccel.pyi
index 247771a3b..a9135f683 100644
--- a/av/codec/hwaccel.pyi
+++ b/av/codec/hwaccel.pyi
@@ -52,6 +52,6 @@ class HWAccel:
         flags: int | None = None,
         is_hw_owned: bool = False,
     ) -> None: ...
-    def create(self, codec: Codec) -> HWAccel: ...
+    def create(self, codec: Codec, for_encoding: bool = False) -> HWAccel: ...
 
 def hwdevices_available() -> list[str]: ...
diff --git a/av/container/output.py b/av/container/output.py
index c17b32ba4..ebf9269e8 100644
--- a/av/container/output.py
+++ b/av/container/output.py
@@ -5,6 +5,7 @@
 from cython.cimports.av.bitstream import BitStreamFilterContext
 from cython.cimports.av.codec.codec import Codec
 from cython.cimports.av.codec.context import CodecContext, wrap_codec_context
+from cython.cimports.av.codec.hwaccel import HWAccel
 from cython.cimports.av.container.streams import StreamContainer
 from cython.cimports.av.dictionary import Dictionary
 from cython.cimports.av.error import err_check
@@ -79,8 +80,15 @@ def __dealloc__(self):
         with cython.nogil:
             lib.av_packet_free(cython.address(self.packet_ptr))
 
-    def add_stream(self, codec_name, rate=None, options: dict | None = None, **kwargs):
-        """add_stream(codec_name, rate=None)
+    def add_stream(
+        self,
+        codec_name,
+        rate=None,
+        options: dict | None = None,
+        hwaccel: HWAccel | None = None,
+        **kwargs,
+    ):
+        """add_stream(codec_name, rate=None, options=None, hwaccel=None, **kwargs)
 
         Creates a new stream from a codec name and returns it.
         Supports video, audio, and subtitle streams.
@@ -88,6 +96,10 @@ def add_stream(self, codec_name, rate=None, options: dict | None = None, **kwarg
         :param codec_name: The name of a codec.
         :type codec_name: str
         :param dict options: Stream options.
+        :param HWAccel hwaccel: Optional settings for hardware-accelerated encoding.
+            Only applies to video streams (e.g. ``h264_vaapi``); software frames
+            passed to :meth:`~av.codec.context.CodecContext.encode` are uploaded to
+            the device automatically.
         :param \\**kwargs: Set attributes for the stream.
         :rtype: The new :class:`~av.stream.Stream`.
 
@@ -164,7 +176,7 @@ def add_stream(self, codec_name, rate=None, options: dict | None = None, **kwarg
         err_check(lib.avcodec_parameters_from_context(stream.codecpar, ctx))
 
         # Construct the user-land stream
-        py_codec_context: CodecContext = wrap_codec_context(ctx, codec, None)
+        py_codec_context: CodecContext = wrap_codec_context(ctx, codec, hwaccel)
         py_stream: Stream = wrap_stream(self, stream, py_codec_context)
         self.streams.add_stream(py_stream)
 
diff --git a/av/container/output.pyi b/av/container/output.pyi
index ea42b1439..5ed78a2b2 100644
--- a/av/container/output.pyi
+++ b/av/container/output.pyi
@@ -4,6 +4,7 @@ from typing import TypeVar, overload
 
 from av.audio import _AudioCodecName
 from av.audio.stream import AudioStream
+from av.codec.hwaccel import HWAccel
 from av.packet import Packet
 from av.stream import AttachmentStream, DataStream, Stream
 from av.subtitles.stream import SubtitleStream
@@ -29,6 +30,7 @@ class OutputContainer(Container):
         codec_name: _VideoCodecName,
         rate: Fraction | int | None = None,
         options: dict[str, str] | None = None,
+        hwaccel: HWAccel | None = None,
         **kwargs,
     ) -> VideoStream: ...
     @overload
@@ -37,6 +39,7 @@ class OutputContainer(Container):
         codec_name: str,
         rate: Fraction | int | None = None,
         options: dict[str, str] | None = None,
+        hwaccel: HWAccel | None = None,
         **kwargs,
     ) -> VideoStream | AudioStream | SubtitleStream: ...
     def add_mux_stream(
diff --git a/av/video/codeccontext.pxd b/av/video/codeccontext.pxd
index 3489fdb7a..d15a9fc02 100644
--- a/av/video/codeccontext.pxd
+++ b/av/video/codeccontext.pxd
@@ -19,3 +19,4 @@ cdef class VideoCodecContext(CodecContext):
     cdef AVCodecPrivateData _private_data
     cdef readonly VideoReformatter reformatter
     cdef VideoFrame next_frame
+    cdef VideoFrame _encode_upload_frame(self, VideoFrame vframe)
diff --git a/av/video/codeccontext.py b/av/video/codeccontext.py
index fab4774a3..457ef43e9 100644
--- a/av/video/codeccontext.py
+++ b/av/video/codeccontext.py
@@ -49,29 +49,79 @@ def _init(
     ):
         CodecContext._init(self, ptr, codec, hwaccel)
 
-        if hwaccel is not None:
-            try:
-                self.hwaccel_ctx = hwaccel.create(self.codec)
-                self.ptr.hw_device_ctx = lib.av_buffer_ref(self.hwaccel_ctx.ptr)
-                self.ptr.pix_fmt = self.hwaccel_ctx.config.ptr.pix_fmt
-                self.ptr.get_format = _get_hw_format
-                self._private_data.hardware_pix_fmt = (
-                    self.hwaccel_ctx.config.ptr.pix_fmt
-                )
-                self._private_data.allow_software_fallback = (
-                    self.hwaccel.allow_software_fallback
-                )
-                self.ptr.opaque = cython.address(self._private_data)
-            except NotImplementedError:
-                # Some streams may not have a hardware decoder. For example, many action
-                # cam videos have a low resolution mjpeg stream, which is usually not
-                # compatible with hardware decoders.
-                # The user may have passed in a hwaccel because they want to decode the main
-                # stream with it, so we shouldn't abort even if we find a stream that can't
-                # be HW decoded.
-                # If the user wants to make sure hwaccel is actually used, they can check with the
-                # is_hwaccel() function on each stream's codec context.
-                self.hwaccel_ctx = None
+        if hwaccel is None:
+            return
+
+        if self.is_encoder:
+            # Hardware-accelerated encoding. We only attach the device context here;
+            # the hardware frames context depends on the final width/height/pixel
+            # format (set by the user after add_stream()), so it is built lazily in
+            # CodecContext.open() via _setup_encode_hwframes().
+            self.hwaccel_ctx = hwaccel.create(self.codec, for_encoding=True)
+            self.ptr.hw_device_ctx = lib.av_buffer_ref(self.hwaccel_ctx.ptr)
+            return
+
+        try:
+            self.hwaccel_ctx = hwaccel.create(self.codec)
+            self.ptr.hw_device_ctx = lib.av_buffer_ref(self.hwaccel_ctx.ptr)
+            self.ptr.pix_fmt = self.hwaccel_ctx.config.ptr.pix_fmt
+            self.ptr.get_format = _get_hw_format
+            self._private_data.hardware_pix_fmt = self.hwaccel_ctx.config.ptr.pix_fmt
+            self._private_data.allow_software_fallback = (
+                self.hwaccel.allow_software_fallback
+            )
+            self.ptr.opaque = cython.address(self._private_data)
+        except NotImplementedError:
+            # Some streams may not have a hardware decoder. For example, many action
+            # cam videos have a low resolution mjpeg stream, which is usually not
+            # compatible with hardware decoders.
+            # The user may have passed in a hwaccel because they want to decode the main
+            # stream with it, so we shouldn't abort even if we find a stream that can't
+            # be HW decoded.
+            # If the user wants to make sure hwaccel is actually used, they can check with the
+            # is_hwaccel() function on each stream's codec context.
+            self.hwaccel_ctx = None
+
+    @cython.cfunc
+    def _encode_upload_frame(self, vframe: VideoFrame) -> VideoFrame:
+        # Upload a software frame onto the device for hardware-accelerated encoding.
+        frames_ctx: cython.pointer[lib.AVHWFramesContext] = cython.cast(
+            cython.pointer[lib.AVHWFramesContext], self.ptr.hw_frames_ctx.data
+        )
+
+        # If the user already handed us a matching hardware frame, pass it through.
+        if vframe.ptr.format == frames_ctx.format:
+            return vframe
+
+        # Convert to the frames context's software format and size before uploading,
+        # since av_hwframe_transfer_data() does not change pixel format or scale.
+        if (
+            vframe.ptr.format != frames_ctx.sw_format
+            or vframe.ptr.width != frames_ctx.width
+            or vframe.ptr.height != frames_ctx.height
+        ):
+            if not self.reformatter:
+                self.reformatter = VideoReformatter()
+            vframe = self.reformatter.reformat(
+                vframe,
+                frames_ctx.width,
+                frames_ctx.height,
+                get_video_format(
+                    frames_ctx.sw_format, frames_ctx.width, frames_ctx.height
+                ),
+                threads=self.ptr.thread_count,
+            )
+
+        hwframe: VideoFrame = alloc_video_frame()
+        err_check(lib.av_hwframe_get_buffer(self.ptr.hw_frames_ctx, hwframe.ptr, 0))
+        err_check(lib.av_hwframe_transfer_data(hwframe.ptr, vframe.ptr, 0))
+        hwframe._copy_internal_attributes(vframe, data_layout=False)
+        hwframe._init_user_attributes()
+
+        if hwframe.ptr.pts == lib.AV_NOPTS_VALUE:
+            hwframe.ptr.pts = self.ptr.frame_num
+
+        return hwframe
 
     @cython.cfunc
     def _prepare_frames_for_encode(self, input: Frame | None) -> list:
@@ -79,6 +129,11 @@ def _prepare_frames_for_encode(self, input: Frame | None) -> list:
             return [None]
 
         vframe: VideoFrame = input
+
+        # Hardware-accelerated encoding: upload the (software) frame to the device.
+        if self.ptr.hw_frames_ctx != cython.NULL:
+            return [self._encode_upload_frame(vframe)]
+
         if (
             vframe.format.pix_fmt != self.pix_fmt
             or vframe.width != self.ptr.width
diff --git a/include/avcodec.pxd b/include/avcodec.pxd
index 5671caf41..c94deaaa9 100644
--- a/include/avcodec.pxd
+++ b/include/avcodec.pxd
@@ -274,6 +274,7 @@ cdef extern from "libavcodec/avcodec.h" nogil:
 
         AVHWAccel *hwaccel
         AVBufferRef *hw_device_ctx
+        AVBufferRef *hw_frames_ctx
 
         int thread_count
         int thread_type
diff --git a/include/avutil.pxd b/include/avutil.pxd
index 7b0a9e311..8911adcc0 100644
--- a/include/avutil.pxd
+++ b/include/avutil.pxd
@@ -216,12 +216,14 @@ cdef extern from "libavutil/hwcontext.h" nogil:
         AVPixelFormat sw_format
         int width
         int height
+        int initial_pool_size
 
     cdef int av_hwdevice_ctx_create(AVBufferRef **device_ctx, AVHWDeviceType type, const char *device, AVDictionary *opts, int flags)
     cdef AVHWDeviceType av_hwdevice_find_type_by_name(const char *name)
     cdef const char *av_hwdevice_get_type_name(AVHWDeviceType type)
     cdef AVHWDeviceType av_hwdevice_iterate_types(AVHWDeviceType prev)
     cdef int av_hwframe_transfer_data(AVFrame *dst, const AVFrame *src, int flags)
+    cdef int av_hwframe_get_buffer(AVBufferRef *hwframe_ctx, AVFrame *frame, int flags)
 
     cdef AVBufferRef *av_hwframe_ctx_alloc(AVBufferRef *device_ref)
     cdef int av_hwframe_ctx_init(AVBufferRef *ref)
diff --git a/tests/test_encode.py b/tests/test_encode.py
index 240c67872..bf432a495 100644
--- a/tests/test_encode.py
+++ b/tests/test_encode.py
@@ -2,12 +2,14 @@
 
 import io
 import math
+import os
 from fractions import Fraction
 
 import numpy as np
 import pytest
 
 import av
+import av.codec.hwaccel
 from av import AudioFrame, VideoFrame
 from av.audio.stream import AudioStream
 from av.video.stream import VideoStream
@@ -504,3 +506,66 @@ def test_profiles(self) -> None:
                 stream.profile = profile
                 print("Set", profile, "got", stream.profile)
                 assert stream.profile == profile
+
+
+# Map a hardware device type to a video encoder that uses it.
+_HWACCEL_ENCODERS = {
+    "vaapi": "h264_vaapi",
+    "cuda": "h264_nvenc",
+    "qsv": "h264_qsv",
+    "videotoolbox": "h264_videotoolbox",
+}
+
+
+def test_hardware_encode() -> None:
+    hwdevices_available = av.codec.hwaccel.hwdevices_available()
+    if "HWACCEL_DEVICE_TYPE" not in os.environ:
+        pytest.skip(
+            "Set the HWACCEL_DEVICE_TYPE to run this test. "
+            f"Options are {' '.join(hwdevices_available)}"
+        )
+
+    device_type = os.environ["HWACCEL_DEVICE_TYPE"]
+    assert device_type in hwdevices_available, f"{device_type} not available"
+
+    encoder = _HWACCEL_ENCODERS.get(device_type)
+    if encoder is None:
+        pytest.skip(f"No hardware encoder mapped for {device_type}")
+
+    width, height, n_frames = 320, 240, 24
+    hwaccel = av.codec.hwaccel.HWAccel(
+        device_type=device_type, allow_software_fallback=False
+    )
+
+    file = io.BytesIO()
+    container = av.open(file, mode="w", format="mp4")
+    stream = container.add_stream(encoder, rate=30, hwaccel=hwaccel)
+    assert isinstance(stream, VideoStream)
+    stream.width = width
+    stream.height = height
+    stream.pix_fmt = "nv12"
+
+    # Feed plain software frames; PyAV uploads them to the device for us.
+    muxed = 0
+    for i in range(n_frames):
+        array = np.full((height, width, 3), i * 8 % 256, dtype=np.uint8)
+        frame = VideoFrame.from_ndarray(array, format="rgb24")
+        for packet in stream.encode(frame):
+            container.mux(packet)
+            muxed += 1
+
+    # After encoding, the codec context must report that it is hardware accelerated.
+    assert stream.codec_context.is_hwaccel
+
+    for packet in stream.encode():
+        container.mux(packet)
+        muxed += 1
+    container.close()
+
+    assert muxed > 0
+
+    # The result must be a valid, decodable H.264 stream.
+    file.seek(0)
+    with av.open(file, "r") as in_container:
+        decoded = sum(1 for _ in in_container.decode(video=0))
+    assert decoded == n_frames