From 63aec5a457cd78582cc4f5593bdb3dc6a8cf1fb9 Mon Sep 17 00:00:00 2001 From: WyattBlue Date: Tue, 23 Jun 2026 21:03:25 -0400 Subject: [PATCH] Support HW encoding via add_stream(hwaccel=...) Previously PyAV only supported hardware decoding. Passing software frames to a hardware encoder (e.g. h264_vaapi) failed at avcodec_open2 with EINVAL because no hw_device_ctx/hw_frames_ctx was ever set up. add_stream now accepts a hwaccel argument. For an encoder it attaches the device context, lazily builds the hw_frames_ctx in open() once width/height/pix_fmt are known, and uploads software frames to the device in _prepare_frames_for_encode. HWAccel setup now also accepts encoders, which advertise the HW_FRAMES_CTX method rather than HW_DEVICE_CTX. fixes #2156 --- CHANGELOG.rst | 1 + av/codec/context.pxd | 1 + av/codec/context.py | 48 ++++++++++++++++++ av/codec/hwaccel.py | 15 ++++-- av/codec/hwaccel.pyi | 2 +- av/container/output.py | 18 +++++-- av/container/output.pyi | 3 ++ av/video/codeccontext.pxd | 1 + av/video/codeccontext.py | 101 +++++++++++++++++++++++++++++--------- include/avcodec.pxd | 1 + include/avutil.pxd | 2 + tests/test_encode.py | 65 ++++++++++++++++++++++++ 12 files changed, 227 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b855a7fb5..85107acc3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -34,6 +34,7 @@ v18.0.0 (next) Breaking: - Remove Python 3.10 +- Support HW encoding via a ``hwaccel`` parameter on ``OutputContainer.add_stream`` (e.g. ``h264_vaapi``, ``h264_nvenc``, ``h264_videotoolbox``); software frames passed to ``encode`` are uploaded to the device automatically by :gh-user:`WyattBlue` (:issue:`2156`). Features: diff --git a/av/codec/context.pxd b/av/codec/context.pxd index cc12836d7..8108ecb1a 100644 --- a/av/codec/context.pxd +++ b/av/codec/context.pxd @@ -41,6 +41,7 @@ cdef class CodecContext: # TODO: Remove the `Packet` from `_setup_decoded_frame` (because flushing packets # are bogus).
It should take all info it needs from the context and/or stream. cdef _prepare_and_time_rebase_frames_for_encode(self, Frame frame) + cdef void _setup_encode_hwframes(self) cdef list _prepare_frames_for_encode(self, Frame frame) cdef _setup_encoded_packet(self, Packet) cdef _setup_decoded_frame(self, Frame, Packet) diff --git a/av/codec/context.py b/av/codec/context.py index 6e10aaf9b..f842f80c1 100644 --- a/av/codec/context.py +++ b/av/codec/context.py @@ -246,6 +246,8 @@ def open(self, strict: cython.bint = True): self.ptr.time_base.num = 1 self.ptr.time_base.den = lib.AV_TIME_BASE + self._setup_encode_hwframes() + err_check( lib.avcodec_open2(self.ptr, self.codec.ptr, cython.address(options.ptr)), f'avcodec_open2("{self.codec.name}", {self.options})', @@ -380,6 +382,52 @@ def _send_frame_and_recv(self, frame: Frame | None): yield packet packet = self._recv_packet() + @cython.cfunc + def _setup_encode_hwframes(self) -> cython.void: + # Build the hardware frames context for hardware-accelerated encoding. + # + # Unlike the device context (attached at construction time), the frames + # context depends on the final width/height/pixel format, which the user + # sets after add_stream(). We therefore defer it until just before the + # codec is opened. + if self.hwaccel_ctx is None or not self.is_encoder: + return + if self.ptr.hw_frames_ctx: + return # Already set up. + + hw_format: lib.AVPixelFormat = self.hwaccel_ctx.config.ptr.pix_fmt + sw_format: lib.AVPixelFormat = cython.cast(lib.AVPixelFormat, self.ptr.pix_fmt) + + # The codec context's pix_fmt holds the *software* format the user feeds in. + # If they left it as the hardware format (or unset), pick a sane default. 
+ if sw_format == hw_format or sw_format == lib.AV_PIX_FMT_NONE: + sw_format = lib.av_get_pix_fmt(b"nv12") + + frames_ref: cython.pointer[lib.AVBufferRef] = lib.av_hwframe_ctx_alloc( + self.hwaccel_ctx.ptr + ) + if frames_ref == cython.NULL: + raise MemoryError("av_hwframe_ctx_alloc() failed") + + try: + frames_ctx: cython.pointer[lib.AVHWFramesContext] = cython.cast( + cython.pointer[lib.AVHWFramesContext], frames_ref.data + ) + frames_ctx.format = hw_format + frames_ctx.sw_format = sw_format + frames_ctx.width = self.ptr.width + frames_ctx.height = self.ptr.height + frames_ctx.initial_pool_size = 32 + err_check(lib.av_hwframe_ctx_init(frames_ref)) + except Exception: + lib.av_buffer_unref(cython.address(frames_ref)) + raise + + # Ownership of frames_ref transfers to the codec context. + self.ptr.hw_frames_ctx = frames_ref + self.ptr.sw_pix_fmt = sw_format + self.ptr.pix_fmt = hw_format + @cython.cfunc def _prepare_frames_for_encode(self, frame: Frame | None) -> list: return [frame] diff --git a/av/codec/hwaccel.py b/av/codec/hwaccel.py index 69f742b29..392bd39ac 100644 --- a/av/codec/hwaccel.py +++ b/av/codec/hwaccel.py @@ -140,10 +140,17 @@ def __init__( self.ptr = cython.NULL self.config = None - def _initialize_hw_context(self, codec: Codec): + def _initialize_hw_context(self, codec: Codec, for_encoding: bool = False): + # Decoders advertise the device-context method, while encoders (e.g. + # h264_vaapi) advertise the frames-context method. Accept either one when + # setting up an encoder. 
+ supported_methods: cython.int = lib.AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX + if for_encoding: + supported_methods |= lib.AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX + config: HWConfig for config in codec.hardware_configs: - if not (config.ptr.methods & lib.AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX): + if not (config.ptr.methods & supported_methods): continue if self._device_type and config.device_type != self._device_type: continue @@ -168,7 +175,7 @@ def _initialize_hw_context(self, codec: Codec): ) ) - def create(self, codec: Codec) -> HWAccel: + def create(self, codec: Codec, for_encoding: bool = False) -> HWAccel: """Create a new hardware accelerator context with the given codec""" if self.ptr: raise RuntimeError("Hardware context already initialized") @@ -180,7 +187,7 @@ def create(self, codec: Codec) -> HWAccel: options=self.options, is_hw_owned=self.is_hw_owned, ) - ret._initialize_hw_context(codec) + ret._initialize_hw_context(codec, for_encoding=for_encoding) return ret def __dealloc__(self): diff --git a/av/codec/hwaccel.pyi b/av/codec/hwaccel.pyi index 247771a3b..a9135f683 100644 --- a/av/codec/hwaccel.pyi +++ b/av/codec/hwaccel.pyi @@ -52,6 +52,6 @@ class HWAccel: flags: int | None = None, is_hw_owned: bool = False, ) -> None: ... - def create(self, codec: Codec) -> HWAccel: ... + def create(self, codec: Codec, for_encoding: bool = False) -> HWAccel: ... def hwdevices_available() -> list[str]: ... 
diff --git a/av/container/output.py b/av/container/output.py index c17b32ba4..ebf9269e8 100644 --- a/av/container/output.py +++ b/av/container/output.py @@ -5,6 +5,7 @@ from cython.cimports.av.bitstream import BitStreamFilterContext from cython.cimports.av.codec.codec import Codec from cython.cimports.av.codec.context import CodecContext, wrap_codec_context +from cython.cimports.av.codec.hwaccel import HWAccel from cython.cimports.av.container.streams import StreamContainer from cython.cimports.av.dictionary import Dictionary from cython.cimports.av.error import err_check @@ -79,8 +80,15 @@ def __dealloc__(self): with cython.nogil: lib.av_packet_free(cython.address(self.packet_ptr)) - def add_stream(self, codec_name, rate=None, options: dict | None = None, **kwargs): - """add_stream(codec_name, rate=None) + def add_stream( + self, + codec_name, + rate=None, + options: dict | None = None, + hwaccel: HWAccel | None = None, + **kwargs, + ): + """add_stream(codec_name, rate=None, *, hwaccel=None) Creates a new stream from a codec name and returns it. Supports video, audio, and subtitle streams. @@ -88,6 +96,10 @@ def add_stream(self, codec_name, rate=None, options: dict | None = None, **kwarg :param codec_name: The name of a codec. :type codec_name: str :param dict options: Stream options. + :param HWAccel hwaccel: Optional settings for hardware-accelerated encoding. + Only applies to video streams (e.g. ``h264_vaapi``); software frames + passed to :meth:`~av.codec.context.CodecContext.encode` are uploaded to + the device automatically. :param \\**kwargs: Set attributes for the stream. :rtype: The new :class:`~av.stream.Stream`. 
@@ -164,7 +176,7 @@ def add_stream(self, codec_name, rate=None, options: dict | None = None, **kwarg err_check(lib.avcodec_parameters_from_context(stream.codecpar, ctx)) # Construct the user-land stream - py_codec_context: CodecContext = wrap_codec_context(ctx, codec, None) + py_codec_context: CodecContext = wrap_codec_context(ctx, codec, hwaccel) py_stream: Stream = wrap_stream(self, stream, py_codec_context) self.streams.add_stream(py_stream) diff --git a/av/container/output.pyi b/av/container/output.pyi index ea42b1439..5ed78a2b2 100644 --- a/av/container/output.pyi +++ b/av/container/output.pyi @@ -4,6 +4,7 @@ from typing import TypeVar, overload from av.audio import _AudioCodecName from av.audio.stream import AudioStream +from av.codec.hwaccel import HWAccel from av.packet import Packet from av.stream import AttachmentStream, DataStream, Stream from av.subtitles.stream import SubtitleStream @@ -29,6 +30,7 @@ class OutputContainer(Container): codec_name: _VideoCodecName, rate: Fraction | int | None = None, options: dict[str, str] | None = None, + hwaccel: HWAccel | None = None, **kwargs, ) -> VideoStream: ... @overload @@ -37,6 +39,7 @@ class OutputContainer(Container): codec_name: str, rate: Fraction | int | None = None, options: dict[str, str] | None = None, + hwaccel: HWAccel | None = None, **kwargs, ) -> VideoStream | AudioStream | SubtitleStream: ... 
def add_mux_stream( diff --git a/av/video/codeccontext.pxd b/av/video/codeccontext.pxd index 3489fdb7a..d15a9fc02 100644 --- a/av/video/codeccontext.pxd +++ b/av/video/codeccontext.pxd @@ -19,3 +19,4 @@ cdef class VideoCodecContext(CodecContext): cdef AVCodecPrivateData _private_data cdef readonly VideoReformatter reformatter cdef VideoFrame next_frame + cdef VideoFrame _encode_upload_frame(self, VideoFrame vframe) diff --git a/av/video/codeccontext.py b/av/video/codeccontext.py index fab4774a3..457ef43e9 100644 --- a/av/video/codeccontext.py +++ b/av/video/codeccontext.py @@ -49,29 +49,79 @@ def _init( ): CodecContext._init(self, ptr, codec, hwaccel) - if hwaccel is not None: - try: - self.hwaccel_ctx = hwaccel.create(self.codec) - self.ptr.hw_device_ctx = lib.av_buffer_ref(self.hwaccel_ctx.ptr) - self.ptr.pix_fmt = self.hwaccel_ctx.config.ptr.pix_fmt - self.ptr.get_format = _get_hw_format - self._private_data.hardware_pix_fmt = ( - self.hwaccel_ctx.config.ptr.pix_fmt - ) - self._private_data.allow_software_fallback = ( - self.hwaccel.allow_software_fallback - ) - self.ptr.opaque = cython.address(self._private_data) - except NotImplementedError: - # Some streams may not have a hardware decoder. For example, many action - # cam videos have a low resolution mjpeg stream, which is usually not - # compatible with hardware decoders. - # The user may have passed in a hwaccel because they want to decode the main - # stream with it, so we shouldn't abort even if we find a stream that can't - # be HW decoded. - # If the user wants to make sure hwaccel is actually used, they can check with the - # is_hwaccel() function on each stream's codec context. - self.hwaccel_ctx = None + if hwaccel is None: + return + + if self.is_encoder: + # Hardware-accelerated encoding. 
We only attach the device context here; + # the hardware frames context depends on the final width/height/pixel + # format (set by the user after add_stream()), so it is built lazily in + # CodecContext.open() via _setup_encode_hwframes(). + self.hwaccel_ctx = hwaccel.create(self.codec, for_encoding=True) + self.ptr.hw_device_ctx = lib.av_buffer_ref(self.hwaccel_ctx.ptr) + return + + try: + self.hwaccel_ctx = hwaccel.create(self.codec) + self.ptr.hw_device_ctx = lib.av_buffer_ref(self.hwaccel_ctx.ptr) + self.ptr.pix_fmt = self.hwaccel_ctx.config.ptr.pix_fmt + self.ptr.get_format = _get_hw_format + self._private_data.hardware_pix_fmt = self.hwaccel_ctx.config.ptr.pix_fmt + self._private_data.allow_software_fallback = ( + self.hwaccel.allow_software_fallback + ) + self.ptr.opaque = cython.address(self._private_data) + except NotImplementedError: + # Some streams may not have a hardware decoder. For example, many action + # cam videos have a low resolution mjpeg stream, which is usually not + # compatible with hardware decoders. + # The user may have passed in a hwaccel because they want to decode the main + # stream with it, so we shouldn't abort even if we find a stream that can't + # be HW decoded. + # If the user wants to make sure hwaccel is actually used, they can check with the + # is_hwaccel() function on each stream's codec context. + self.hwaccel_ctx = None + + @cython.cfunc + def _encode_upload_frame(self, vframe: VideoFrame) -> VideoFrame: + # Upload a software frame onto the device for hardware-accelerated encoding. + frames_ctx: cython.pointer[lib.AVHWFramesContext] = cython.cast( + cython.pointer[lib.AVHWFramesContext], self.ptr.hw_frames_ctx.data + ) + + # If the user already handed us a matching hardware frame, pass it through. + if vframe.ptr.format == frames_ctx.format: + return vframe + + # Convert to the frames context's software format and size before uploading, + # since av_hwframe_transfer_data() does not change pixel format or scale. 
+ if ( + vframe.ptr.format != frames_ctx.sw_format + or vframe.ptr.width != frames_ctx.width + or vframe.ptr.height != frames_ctx.height + ): + if not self.reformatter: + self.reformatter = VideoReformatter() + vframe = self.reformatter.reformat( + vframe, + frames_ctx.width, + frames_ctx.height, + get_video_format( + frames_ctx.sw_format, frames_ctx.width, frames_ctx.height + ), + threads=self.ptr.thread_count, + ) + + hwframe: VideoFrame = alloc_video_frame() + err_check(lib.av_hwframe_get_buffer(self.ptr.hw_frames_ctx, hwframe.ptr, 0)) + err_check(lib.av_hwframe_transfer_data(hwframe.ptr, vframe.ptr, 0)) + hwframe._copy_internal_attributes(vframe, data_layout=False) + hwframe._init_user_attributes() + + if hwframe.ptr.pts == lib.AV_NOPTS_VALUE: + hwframe.ptr.pts = self.ptr.frame_num + + return hwframe @cython.cfunc def _prepare_frames_for_encode(self, input: Frame | None) -> list: @@ -79,6 +129,11 @@ def _prepare_frames_for_encode(self, input: Frame | None) -> list: return [None] vframe: VideoFrame = input + + # Hardware-accelerated encoding: upload the (software) frame to the device. 
+ if self.ptr.hw_frames_ctx != cython.NULL: + return [self._encode_upload_frame(vframe)] + if ( vframe.format.pix_fmt != self.pix_fmt or vframe.width != self.ptr.width diff --git a/include/avcodec.pxd b/include/avcodec.pxd index 5671caf41..c94deaaa9 100644 --- a/include/avcodec.pxd +++ b/include/avcodec.pxd @@ -274,6 +274,7 @@ cdef extern from "libavcodec/avcodec.h" nogil: AVHWAccel *hwaccel AVBufferRef *hw_device_ctx + AVBufferRef *hw_frames_ctx int thread_count int thread_type diff --git a/include/avutil.pxd b/include/avutil.pxd index 7b0a9e311..8911adcc0 100644 --- a/include/avutil.pxd +++ b/include/avutil.pxd @@ -216,12 +216,14 @@ cdef extern from "libavutil/hwcontext.h" nogil: AVPixelFormat sw_format int width int height + int initial_pool_size cdef int av_hwdevice_ctx_create(AVBufferRef **device_ctx, AVHWDeviceType type, const char *device, AVDictionary *opts, int flags) cdef AVHWDeviceType av_hwdevice_find_type_by_name(const char *name) cdef const char *av_hwdevice_get_type_name(AVHWDeviceType type) cdef AVHWDeviceType av_hwdevice_iterate_types(AVHWDeviceType prev) cdef int av_hwframe_transfer_data(AVFrame *dst, const AVFrame *src, int flags) + cdef int av_hwframe_get_buffer(AVBufferRef *hwframe_ctx, AVFrame *frame, int flags) cdef AVBufferRef *av_hwframe_ctx_alloc(AVBufferRef *device_ref) cdef int av_hwframe_ctx_init(AVBufferRef *ref) diff --git a/tests/test_encode.py b/tests/test_encode.py index 240c67872..bf432a495 100644 --- a/tests/test_encode.py +++ b/tests/test_encode.py @@ -2,12 +2,14 @@ import io import math +import os from fractions import Fraction import numpy as np import pytest import av +import av.codec.hwaccel from av import AudioFrame, VideoFrame from av.audio.stream import AudioStream from av.video.stream import VideoStream @@ -504,3 +506,66 @@ def test_profiles(self) -> None: stream.profile = profile print("Set", profile, "got", stream.profile) assert stream.profile == profile + + +# Map a hardware device type to a video encoder that uses 
it. +_HWACCEL_ENCODERS = { + "vaapi": "h264_vaapi", + "cuda": "h264_nvenc", + "qsv": "h264_qsv", + "videotoolbox": "h264_videotoolbox", +} + + +def test_hardware_encode() -> None: + hwdevices_available = av.codec.hwaccel.hwdevices_available() + if "HWACCEL_DEVICE_TYPE" not in os.environ: + pytest.skip( + "Set the HWACCEL_DEVICE_TYPE to run this test. " + f"Options are {' '.join(hwdevices_available)}" + ) + + device_type = os.environ["HWACCEL_DEVICE_TYPE"] + assert device_type in hwdevices_available, f"{device_type} not available" + + encoder = _HWACCEL_ENCODERS.get(device_type) + if encoder is None: + pytest.skip(f"No hardware encoder mapped for {device_type}") + + width, height, n_frames = 320, 240, 24 + hwaccel = av.codec.hwaccel.HWAccel( + device_type=device_type, allow_software_fallback=False + ) + + file = io.BytesIO() + container = av.open(file, mode="w", format="mp4") + stream = container.add_stream(encoder, rate=30, hwaccel=hwaccel) + assert isinstance(stream, VideoStream) + stream.width = width + stream.height = height + stream.pix_fmt = "nv12" + + # Feed plain software frames; PyAV uploads them to the device for us. + muxed = 0 + for i in range(n_frames): + array = np.full((height, width, 3), i * 8 % 256, dtype=np.uint8) + frame = VideoFrame.from_ndarray(array, format="rgb24") + for packet in stream.encode(frame): + container.mux(packet) + muxed += 1 + + # The hardware frames context must have been set up during open(). + assert stream.codec_context.is_hwaccel + + for packet in stream.encode(): + container.mux(packet) + muxed += 1 + container.close() + + assert muxed > 0 + + # The result must be a valid, decodable H.264 stream. + file.seek(0) + with av.open(file, "r") as in_container: + decoded = sum(1 for _ in in_container.decode(video=0)) + assert decoded == n_frames