diff --git a/.github/workflows/android_builds.yml b/.github/workflows/android_builds.yml index 7ff5502137c3..d1164924fbd4 100644 --- a/.github/workflows/android_builds.yml +++ b/.github/workflows/android_builds.yml @@ -59,6 +59,9 @@ jobs: - name: Setup Python and SCons uses: ./.github/actions/godot-deps + - name: Install YASM + run: sudo apt-get update && sudo apt-get install -y yasm + - name: Compilation uses: ./.github/actions/godot-build with: diff --git a/.github/workflows/ios_builds.yml b/.github/workflows/ios_builds.yml index 8513429f9a63..0e0d66120af4 100644 --- a/.github/workflows/ios_builds.yml +++ b/.github/workflows/ios_builds.yml @@ -30,6 +30,9 @@ jobs: - name: Setup Python and SCons uses: ./.github/actions/godot-deps + - name: Install YASM + run: brew install yasm + - name: Compilation (arm64) uses: ./.github/actions/godot-build with: diff --git a/.github/workflows/linux_builds.yml b/.github/workflows/linux_builds.yml index dc3d9f378624..d67cda1dcbd3 100644 --- a/.github/workflows/linux_builds.yml +++ b/.github/workflows/linux_builds.yml @@ -124,6 +124,9 @@ jobs: if: '!matrix.legacy-scons' uses: ./.github/actions/godot-deps + - name: Install YASM + run: sudo apt-get update && sudo apt-get install -y yasm + - name: Setup Python and SCons (legacy versions) if: matrix.legacy-scons uses: ./.github/actions/godot-deps diff --git a/.github/workflows/macos_builds.yml b/.github/workflows/macos_builds.yml index fcf4b00afb18..b7d742030946 100644 --- a/.github/workflows/macos_builds.yml +++ b/.github/workflows/macos_builds.yml @@ -48,6 +48,9 @@ jobs: - name: Setup Python and SCons uses: ./.github/actions/godot-deps + - name: Install YASM + run: brew install yasm + - name: Setup Vulkan SDK run: | sh misc/scripts/install_vulkan_sdk_macos.sh diff --git a/.github/workflows/web_builds.yml b/.github/workflows/web_builds.yml index 8e30c99fbc65..49106616443f 100644 --- a/.github/workflows/web_builds.yml +++ b/.github/workflows/web_builds.yml @@ -62,6 +62,9 @@ jobs: - name: Setup Python and SCons uses: ./.github/actions/godot-deps + - name: Install YASM + run: sudo apt-get update && sudo apt-get install -y yasm + - name: Compilation uses: ./.github/actions/godot-build with: diff --git a/.github/workflows/windows_builds.yml b/.github/workflows/windows_builds.yml index 384284b6047e..99c3f24e850d 100644 --- a/.github/workflows/windows_builds.yml +++ b/.github/workflows/windows_builds.yml @@ -75,6 +75,9 @@ jobs: - name: Download Direct3D 12 SDK components run: python ./misc/scripts/install_d3d12_sdk_windows.py + - name: Install YASM + run: choco install yasm + - name: Download pre-built ANGLE static libraries uses: dsaltares/fetch-gh-release-asset@1.1.2 with: diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt index 6d227d6615a7..a368936d614a 100644 --- a/COPYRIGHT.txt +++ b/COPYRIGHT.txt @@ -319,6 +319,16 @@ Copyright: 1995-2024, The PNG Reference Library Authors. 1995-1996, Guy Eric Schalnat, Group 42, Inc. License: Zlib +Files: ./thirdparty/libsimplewebm/ +Comment: libsimplewebm +Copyright: 2016, Błażej Szczygieł +License: Expat + +Files: ./thirdparty/libsimplewebm/libwebm/ +Comment: The WebM Project +Copyright: 2010, Google Inc. +License: BSD-3-clause + Files: ./thirdparty/libtheora/ Comment: OggTheora Copyright: 2002-2009, Xiph.org Foundation @@ -329,6 +339,17 @@ Comment: OggVorbis Copyright: 2002-2015, Xiph.org Foundation License: BSD-3-clause +Files: ./thirdparty/libvpx/ +Comment: The WebM Project +Copyright: 2010, The WebM Project authors. 
+License: BSD-3-clause + +Files: ./thirdparty/libvpx/third_party/android/cpu-features.c + ./thirdparty/libvpx/third_party/android/cpu-features.h +Comment: The Android Open Source Project +Copyright: 2010, The Android Open Source Project +License: BSD-2-clause + Files: ./thirdparty/libwebp/ Comment: WebP codec Copyright: 2010, Google Inc. @@ -455,6 +476,14 @@ Comment: Stripped down version of "nvapi.h" from the NVIDIA NVAPI SDK Copyright: 2019-2022, NVIDIA Corporation License: Expat +Files: ./thirdparty/opus/ +Comment: Opus +Copyright: 2001-2011, Xiph.Org, Skype Limited, Octasic, + Jean-Marc Valin, Timothy B. Terriberry, + CSIRO, Gregory Maxwell, Mark Borgerding, + Erik de Castro Lopo +License: BSD-3-clause + Files: ./thirdparty/openxr/ Comment: OpenXR Loader Copyright: 2020-2023, The Khronos Group Inc. diff --git a/SConstruct b/SConstruct index 63cff4fe1672..9004c80caf63 100644 --- a/SConstruct +++ b/SConstruct @@ -287,8 +287,10 @@ opts.Add(BoolVariable("builtin_harfbuzz", "Use the built-in HarfBuzz library", T opts.Add(BoolVariable("builtin_icu4c", "Use the built-in ICU library", True)) opts.Add(BoolVariable("builtin_libogg", "Use the built-in libogg library", True)) opts.Add(BoolVariable("builtin_libpng", "Use the built-in libpng library", True)) +opts.Add(BoolVariable("builtin_opus", "Use the built-in Opus library", True)) opts.Add(BoolVariable("builtin_libtheora", "Use the built-in libtheora library", True)) opts.Add(BoolVariable("builtin_libvorbis", "Use the built-in libvorbis library", True)) +opts.Add(BoolVariable("builtin_libvpx", "Use the built-in libvpx library", True)) opts.Add(BoolVariable("builtin_libwebp", "Use the built-in libwebp library", True)) opts.Add(BoolVariable("builtin_wslay", "Use the built-in wslay library", True)) opts.Add(BoolVariable("builtin_mbedtls", "Use the built-in mbedTLS library", True)) diff --git a/modules/webm/SCsub b/modules/webm/SCsub new file mode 100644 index 000000000000..1041507fc6ec --- /dev/null +++ b/modules/webm/SCsub @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +Import("env") +Import("env_modules") + +env_webm = env_modules.Clone() + +# Thirdparty source files + +thirdparty_obj = [] + +thirdparty_dir = "#thirdparty/libsimplewebm/" +thirdparty_sources = [ + "libwebm/mkvparser/mkvparser.cc", + "OpusVorbisDecoder.cpp", + "VPXDecoder.cpp", + "WebMDemuxer.cpp", +] +thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources] + +env_webm.Prepend(CPPPATH=[thirdparty_dir, thirdparty_dir + "libwebm/"]) + +# also requires libogg, libvorbis and libopus +if env["builtin_libogg"]: + env_webm.Prepend(CPPPATH=["#thirdparty/libogg"]) +if env["builtin_libvorbis"]: + env_webm.Prepend(CPPPATH=["#thirdparty/libvorbis"]) +if env["builtin_opus"]: + env_webm.Prepend(CPPPATH=["#thirdparty/opus"]) + SConscript("opus/SCsub") + +if env["builtin_libvpx"]: + env_webm.Prepend(CPPPATH=["#thirdparty/libvpx"]) + SConscript("libvpx/SCsub") + +env_thirdparty = env_webm.Clone() +env_thirdparty.disable_warnings() +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj + +# Godot source files + +module_obj = [] + +env_webm.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. 
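+# As with other modules, the whole module should be toggleable with
+# `module_webm_enabled=yes/no`, and the bundled libraries above can be swapped
+# for system copies through the `builtin_*` options declared in SConstruct
+# (e.g. `scons builtin_libvpx=no builtin_opus=no`; see
+# platform/linuxbsd/detect.py for the pkg-config resolution).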
+env.Depends(module_obj, thirdparty_obj)
diff --git a/modules/webm/config.py b/modules/webm/config.py
new file mode 100644
index 000000000000..49e86642bff6
--- /dev/null
+++ b/modules/webm/config.py
@@ -0,0 +1,19 @@
+def can_build(env, platform):
+    if platform in ["ios", "web"]:
+        return False
+
+    return True
+
+
+def configure(env):
+    pass
+
+
+def get_doc_classes():
+    return [
+        "VideoStreamWebm",
+    ]
+
+
+def get_doc_path():
+    return "doc_classes"
diff --git a/modules/webm/doc_classes/VideoStreamWebm.xml b/modules/webm/doc_classes/VideoStreamWebm.xml
new file mode 100644
index 000000000000..5fcc9890011d
--- /dev/null
+++ b/modules/webm/doc_classes/VideoStreamWebm.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<class name="VideoStreamWebm" inherits="VideoStream" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="../../../doc/class.xsd">
+	<brief_description>
+		[VideoStream] resource for WebM videos.
+	</brief_description>
+	<description>
+		[VideoStream] resource handling the [url=https://www.webmproject.org/]WebM[/url] video format with the [code].webm[/code] extension. Both the VP8 and VP9 codecs are supported, and both are decoded on the CPU. They compress more efficiently than [VideoStreamTheora], but also require more CPU resources to decode (especially VP9).
+		[b]Note:[/b] Alpha channel (also known as transparency) is not supported. The video will always appear to have a black background, even if it originally contains an alpha channel.
+		[b]Note:[/b] There are known bugs and performance issues with WebM video playback in Godot. If you run into problems, try using the Ogg Theora format instead: [VideoStreamTheora].
+	</description>
+	<tutorials>
+	</tutorials>
+</class>
diff --git a/modules/webm/libvpx/SCsub b/modules/webm/libvpx/SCsub
new file mode 100644
index 000000000000..68df832e9f9a
--- /dev/null
+++ b/modules/webm/libvpx/SCsub
@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+
+Import("env")
+Import("env_modules")
+
+# Thirdparty sources
+
+libvpx_dir = "#thirdparty/libvpx/"
+
+libvpx_sources = [
+    "vp8/vp8_dx_iface.c",
+    "vp8/common/generic/systemdependent.c",
+    "vp8/common/alloccommon.c",
+    "vp8/common/blockd.c",
+    "vp8/common/copy_c.c",
+    "vp8/common/debugmodes.c",
+    "vp8/common/dequantize.c",
+    "vp8/common/entropy.c",
+    "vp8/common/entropymode.c",
+    "vp8/common/entropymv.c",
+    "vp8/common/extend.c",
+    "vp8/common/filter.c",
+    "vp8/common/findnearmv.c",
+    "vp8/common/idct_blk.c",
+    "vp8/common/idctllm.c",
+    "vp8/common/loopfilter_filters.c",
+    "vp8/common/mbpitch.c",
+    "vp8/common/modecont.c",
+    "vp8/common/quant_common.c",
+    "vp8/common/reconinter.c",
+    "vp8/common/reconintra.c",
+    "vp8/common/reconintra4x4.c",
+    "vp8/common/rtcd.c",
+    "vp8/common/setupintrarecon.c",
+    "vp8/common/swapyv12buffer.c",
+    "vp8/common/treecoder.c",
+    "vp8/common/vp8_loopfilter.c",
+    "vp8/decoder/dboolhuff.c",
+    "vp8/decoder/decodeframe.c",
+    "vp8/decoder/decodemv.c",
+    "vp8/decoder/detokenize.c",
+    "vp8/decoder/onyxd_if.c",
+    "vp9/vp9_dx_iface.c",
+    "vp9/common/vp9_alloccommon.c",
+    "vp9/common/vp9_blockd.c",
+    "vp9/common/vp9_common_data.c",
+    "vp9/common/vp9_debugmodes.c",
+    "vp9/common/vp9_entropy.c",
+    "vp9/common/vp9_entropymode.c",
+    "vp9/common/vp9_entropymv.c",
+    "vp9/common/vp9_filter.c",
+    "vp9/common/vp9_frame_buffers.c",
+    "vp9/common/vp9_idct.c",
+    "vp9/common/vp9_loopfilter.c",
+    "vp9/common/vp9_mvref_common.c",
+    "vp9/common/vp9_pred_common.c",
+    "vp9/common/vp9_quant_common.c",
+    "vp9/common/vp9_reconinter.c",
+    "vp9/common/vp9_reconintra.c",
+    "vp9/common/vp9_rtcd.c",
+    "vp9/common/vp9_scale.c",
+    "vp9/common/vp9_scan.c",
+    "vp9/common/vp9_seg_common.c",
+    "vp9/common/vp9_thread_common.c",
+    "vp9/common/vp9_tile_common.c",
+    "vp9/decoder/vp9_decodeframe.c",
+    "vp9/decoder/vp9_decodemv.c",
"vp9/decoder/vp9_decoder.c", + "vp9/decoder/vp9_detokenize.c", + "vp9/decoder/vp9_dsubexp.c", + "vp9/decoder/vp9_dthread.c", + "vpx/src/vpx_codec.c", + "vpx/src/vpx_decoder.c", + "vpx/src/vpx_image.c", + "vpx/src/vpx_psnr.c", + "vpx_dsp/bitreader.c", + "vpx_dsp/bitreader_buffer.c", + "vpx_dsp/intrapred.c", + "vpx_dsp/inv_txfm.c", + "vpx_dsp/loopfilter.c", + "vpx_dsp/prob.c", + "vpx_dsp/vpx_convolve.c", + "vpx_dsp/vpx_dsp_rtcd.c", + "vpx_mem/vpx_mem.c", + "vpx_scale/vpx_scale_rtcd.c", + "vpx_scale/generic/yv12config.c", + "vpx_scale/generic/yv12extend.c", + "vpx_util/vpx_thread.c", +] + +libvpx_sources_mt = [ + "vp8/decoder/threading.c", +] + +libvpx_sources_intrin_x86 = [ + "vp8/common/x86/filter_x86.c", + "vp8/common/x86/loopfilter_x86.c", + "vp8/common/x86/vp8_asm_stubs.c", + "vpx_dsp/x86/vpx_asm_stubs.c", +] +libvpx_sources_intrin_x86_mmx = [ + "vp8/common/x86/idct_blk_mmx.c", +] +libvpx_sources_intrin_x86_sse2 = [ + "vp8/common/x86/idct_blk_sse2.c", + "vp9/common/x86/vp9_idct_intrin_sse2.c", + "vpx_dsp/x86/inv_txfm_sse2.c", + "vpx_dsp/x86/loopfilter_sse2.c", +] +libvpx_sources_intrin_x86_ssse3 = [ + "vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c", +] +libvpx_sources_intrin_x86_avx2 = [ + "vpx_dsp/x86/loopfilter_avx2.c", + "vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c", +] +libvpx_sources_x86asm = [ + "vp8/common/x86/copy_sse2.asm", + "vp8/common/x86/copy_sse3.asm", + "vp8/common/x86/dequantize_mmx.asm", + "vp8/common/x86/idctllm_mmx.asm", + "vp8/common/x86/idctllm_sse2.asm", + "vp8/common/x86/iwalsh_mmx.asm", + "vp8/common/x86/iwalsh_sse2.asm", + "vp8/common/x86/loopfilter_sse2.asm", + "vp8/common/x86/recon_mmx.asm", + "vp8/common/x86/recon_sse2.asm", + "vp8/common/x86/subpixel_mmx.asm", + "vp8/common/x86/subpixel_sse2.asm", + "vp8/common/x86/subpixel_ssse3.asm", + "vp8/common/x86/vp8_loopfilter_mmx.asm", + "vpx_dsp/x86/intrapred_sse2.asm", + "vpx_dsp/x86/intrapred_ssse3.asm", + "vpx_dsp/x86/inv_wht_sse2.asm", + "vpx_dsp/x86/vpx_convolve_copy_sse2.asm", + "vpx_dsp/x86/vpx_subpixel_8t_sse2.asm", + "vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm", + "vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm", + "vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm", + "vpx_ports/emms.asm", +] +libvpx_sources_x86_64asm = [ + "vp8/common/x86/loopfilter_block_sse2_x86_64.asm", + "vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm", +] + +libvpx_sources_arm = [ + "vpx_ports/arm_cpudetect.c", + "vp8/common/arm/loopfilter_arm.c", +] +libvpx_sources_arm_neon = [ + "vp8/common/arm/neon/bilinearpredict_neon.c", + "vp8/common/arm/neon/copymem_neon.c", + "vp8/common/arm/neon/dc_only_idct_add_neon.c", + "vp8/common/arm/neon/dequant_idct_neon.c", + "vp8/common/arm/neon/dequantizeb_neon.c", + "vp8/common/arm/neon/idct_blk_neon.c", + "vp8/common/arm/neon/idct_dequant_0_2x_neon.c", + "vp8/common/arm/neon/idct_dequant_full_2x_neon.c", + "vp8/common/arm/neon/iwalsh_neon.c", + "vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c", + "vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c", + "vp8/common/arm/neon/mbloopfilter_neon.c", + "vp8/common/arm/neon/shortidct4x4llm_neon.c", + "vp8/common/arm/neon/sixtappredict_neon.c", + "vp8/common/arm/neon/vp8_loopfilter_neon.c", + "vp9/common/arm/neon/vp9_iht4x4_add_neon.c", + "vp9/common/arm/neon/vp9_iht8x8_add_neon.c", + "vpx_dsp/arm/idct16x16_1_add_neon.c", + "vpx_dsp/arm/idct16x16_add_neon.c", + "vpx_dsp/arm/idct16x16_neon.c", + "vpx_dsp/arm/idct32x32_1_add_neon.c", + "vpx_dsp/arm/idct32x32_add_neon.c", + "vpx_dsp/arm/idct4x4_1_add_neon.c", + "vpx_dsp/arm/idct4x4_add_neon.c", + 
"vpx_dsp/arm/idct8x8_1_add_neon.c", + "vpx_dsp/arm/idct8x8_add_neon.c", + "vpx_dsp/arm/intrapred_neon.c", + "vpx_dsp/arm/loopfilter_16_neon.c", + "vpx_dsp/arm/loopfilter_4_neon.c", + "vpx_dsp/arm/loopfilter_8_neon.c", + "vpx_dsp/arm/loopfilter_neon.c", + "vpx_dsp/arm/vpx_convolve8_avg_neon.c", + "vpx_dsp/arm/vpx_convolve8_neon.c", + "vpx_dsp/arm/vpx_convolve_avg_neon.c", + "vpx_dsp/arm/vpx_convolve_copy_neon.c", + "vpx_dsp/arm/vpx_convolve_neon.c", +] +libvpx_sources_arm_neon_gas = [ + "vpx_dsp/arm/gas/intrapred_neon_asm.s", + "vpx_dsp/arm/gas/loopfilter_mb_neon.s", + "vpx_dsp/arm/gas/save_reg_neon.s", +] +libvpx_sources_arm_neon_armasm_ms = [ + "vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm", + "vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm", + "vpx_dsp/arm/armasm_ms/save_reg_neon.asm", +] +libvpx_sources_arm_neon_gas_apple = [ + "vpx_dsp/arm/gas_apple/intrapred_neon_asm.s", + "vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s", + "vpx_dsp/arm/gas_apple/save_reg_neon.s", +] + +libvpx_sources = [libvpx_dir + file for file in libvpx_sources] +libvpx_sources_mt = [libvpx_dir + file for file in libvpx_sources_mt] +libvpx_sources_intrin_x86 = [libvpx_dir + file for file in libvpx_sources_intrin_x86] +libvpx_sources_intrin_x86_mmx = [libvpx_dir + file for file in libvpx_sources_intrin_x86_mmx] +libvpx_sources_intrin_x86_sse2 = [libvpx_dir + file for file in libvpx_sources_intrin_x86_sse2] +libvpx_sources_intrin_x86_ssse3 = [libvpx_dir + file for file in libvpx_sources_intrin_x86_ssse3] +libvpx_sources_intrin_x86_avx2 = [libvpx_dir + file for file in libvpx_sources_intrin_x86_avx2] +libvpx_sources_x86asm = [libvpx_dir + file for file in libvpx_sources_x86asm] +libvpx_sources_x86_64asm = [libvpx_dir + file for file in libvpx_sources_x86_64asm] +libvpx_sources_arm = [libvpx_dir + file for file in libvpx_sources_arm] +libvpx_sources_arm_neon = [libvpx_dir + file for file in libvpx_sources_arm_neon] +libvpx_sources_arm_neon_gas = [libvpx_dir + file for file in libvpx_sources_arm_neon_gas] +libvpx_sources_arm_neon_armasm_ms = [libvpx_dir + file for file in libvpx_sources_arm_neon_armasm_ms] +libvpx_sources_arm_neon_gas_apple = [libvpx_dir + file for file in libvpx_sources_arm_neon_gas_apple] + + +env_libvpx = env.Clone() +env_libvpx.disable_warnings() +env_libvpx.Prepend(CPPPATH=[libvpx_dir]) + +webm_multithread = env_libvpx["platform"] != "javascript" + +cpu = env_libvpx["arch"] +webm_cpu_x86 = False +webm_cpu_arm = False +if cpu == "x86_64": + import os + import subprocess + + yasm_paths = [ + "yasm", + "../../../yasm", + ] + + yasm_found = False + + devnull = open(os.devnull) + for yasm_path in yasm_paths: + try: + yasm_found = True + subprocess.Popen([yasm_path, "--version"], stdout=devnull, stderr=devnull).communicate() + except Exception: + yasm_found = False + if yasm_found: + break + + if not yasm_found: + webm_cpu_x86 = False + print("YASM is necessary for WebM SIMD optimizations.") + +webm_simd_optimizations = False + +if webm_cpu_x86: + if env_libvpx["platform"] == "windows": + env_libvpx["ASFORMAT"] = "win" + elif env_libvpx["platform"] == "macos" or env["platform"] == "ios": + env_libvpx["ASFORMAT"] = "macho" + else: + env_libvpx["ASFORMAT"] = "elf" + env_libvpx["ASFORMAT"] += 64 + + env_libvpx["AS"] = "yasm" + env_libvpx["ASFLAGS"] = "-I" + libvpx_dir[1:] + " -f $ASFORMAT -D $ASCPU" + env_libvpx["ASCOM"] = "$AS $ASFLAGS -o $TARGET $SOURCES" + + env_libvpx["ASCPU"] = "X86_64" + + env_libvpx.Append(CPPDEFINES=["WEBM_X86ASM"]) + + webm_simd_optimizations = True + +if webm_cpu_arm: + if 
env_libvpx["platform"] == "iphone": + env_libvpx["ASFLAGS"] = "-arch armv7" + elif env_libvpx["platform"] == "android" and env["android_arch"] == "armv7" or env["platform"] == "linuxbsd": + env_libvpx["ASFLAGS"] = "-mfpu=neon" + elif env_libvpx["platform"] == "uwp": + env_libvpx["AS"] = "armasm" + env_libvpx["ASFLAGS"] = "" + env_libvpx["ASCOM"] = "$AS $ASFLAGS -o $TARGET $SOURCES" + + env_libvpx.Append(CPPDEFINES=["WEBM_ARMASM"]) + + webm_simd_optimizations = True + +if webm_simd_optimizations == False: + print("WebM SIMD optimizations are disabled. Check if your CPU architecture, CPU bits or platform are supported!") + +env_libvpx.add_source_files(env_libvpx.modules_sources, libvpx_sources) + +if webm_multithread: + env_libvpx.add_source_files(env_libvpx.modules_sources, libvpx_sources_mt) + +if webm_cpu_x86: + is_clang_or_gcc = ( + ("gcc" in os.path.basename(env_libvpx["CC"])) or ("clang" in os.path.basename(env["CC"])) or ("osxcross" in env) + ) + + env_libvpx_mmx = env_libvpx.Clone() + env_libvpx_mmx.add_source_files(env_libvpx.modules_sources, libvpx_sources_intrin_x86_mmx) + + env_libvpx_sse2 = env_libvpx.Clone() + env_libvpx_sse2.add_source_files(env_libvpx.modules_sources, libvpx_sources_intrin_x86_sse2) + + env_libvpx_ssse3 = env_libvpx.Clone() + if is_clang_or_gcc: + env_libvpx_ssse3.Append(CCFLAGS=["-mssse3"]) + env_libvpx_ssse3.add_source_files(env_libvpx.modules_sources, libvpx_sources_intrin_x86_ssse3) + + env_libvpx_avx2 = env_libvpx.Clone() + if is_clang_or_gcc: + env_libvpx_avx2.Append(CCFLAGS=["-mavx2"]) + env_libvpx_avx2.add_source_files(env_libvpx.modules_sources, libvpx_sources_intrin_x86_avx2) + + env_libvpx.add_source_files(env_libvpx.modules_sources, libvpx_sources_intrin_x86) + + env_libvpx.add_source_files(env_libvpx.modules_sources, libvpx_sources_x86asm) + env_libvpx.add_source_files(env_libvpx.modules_sources, libvpx_sources_x86_64asm) +elif webm_cpu_arm: + env_libvpx.add_source_files(env_libvpx.modules_sources, libvpx_sources_arm) + if env_libvpx["platform"] == "android": + env_libvpx.Prepend(CPPPATH=[libvpx_dir + "third_party/android"]) + env_libvpx.add_source_files(env_libvpx.modules_sources, [libvpx_dir + "third_party/android/cpu-features.c"]) + + env_libvpx_neon = env_libvpx.Clone() + env_libvpx_neon.add_source_files(env_libvpx.modules_sources, libvpx_sources_arm_neon) + + if env_libvpx["platform"] == "iphone": + env_libvpx.add_source_files(env_libvpx.modules_sources, libvpx_sources_arm_neon_gas_apple) + env_libvpx.add_source_files(env_libvpx.modules_sources, libvpx_sources_arm_neon_gas) diff --git a/modules/webm/opus/SCsub b/modules/webm/opus/SCsub new file mode 100644 index 000000000000..1437cd86df2f --- /dev/null +++ b/modules/webm/opus/SCsub @@ -0,0 +1,252 @@ +#!/usr/bin/env python + +Import("env") +Import("env_modules") + +# Only kept to build the thirdparty library used by the webm module. +# AudioStreamOpus was dropped in 3.0 due to incompatibility with the new audio +# engine. If you want to port it, fetch it from the Git history. 
+
+env_opus = env_modules.Clone()
+
+# Thirdparty source files
+
+thirdparty_obj = []
+
+if env["builtin_opus"]:
+    thirdparty_dir = "#thirdparty/opus/"
+
+    thirdparty_sources = [
+        # Sync with opus_sources.mk
+        "opus.c",
+        "opus_decoder.c",
+        "opus_encoder.c",
+        "opus_multistream.c",
+        "opus_multistream_encoder.c",
+        "opus_multistream_decoder.c",
+        "repacketizer.c",
+        "analysis.c",
+        "mlp.c",
+        "mlp_data.c",
+        # Sync with libopusfile Makefile.am
+        "info.c",
+        "internal.c",
+        "opusfile.c",
+        "stream.c",
+        # Sync with celt_sources.mk
+        "celt/bands.c",
+        "celt/celt.c",
+        "celt/celt_encoder.c",
+        "celt/celt_decoder.c",
+        "celt/cwrs.c",
+        "celt/entcode.c",
+        "celt/entdec.c",
+        "celt/entenc.c",
+        "celt/kiss_fft.c",
+        "celt/laplace.c",
+        "celt/mathops.c",
+        "celt/mdct.c",
+        "celt/modes.c",
+        "celt/pitch.c",
+        "celt/celt_lpc.c",
+        "celt/quant_bands.c",
+        "celt/rate.c",
+        "celt/vq.c",
+        # "celt/arm/arm_celt_map.c",
+        # "celt/arm/armcpu.c",
+        # "celt/arm/celt_ne10_fft.c",
+        # "celt/arm/celt_ne10_mdct.c",
+        # "celt/arm/celt_neon_intr.c",
+        # Sync with silk_sources.mk
+        "silk/CNG.c",
+        "silk/code_signs.c",
+        "silk/init_decoder.c",
+        "silk/decode_core.c",
+        "silk/decode_frame.c",
+        "silk/decode_parameters.c",
+        "silk/decode_indices.c",
+        "silk/decode_pulses.c",
+        "silk/decoder_set_fs.c",
+        "silk/dec_API.c",
+        "silk/enc_API.c",
+        "silk/encode_indices.c",
+        "silk/encode_pulses.c",
+        "silk/gain_quant.c",
+        "silk/interpolate.c",
+        "silk/LP_variable_cutoff.c",
+        "silk/NLSF_decode.c",
+        "silk/NSQ.c",
+        "silk/NSQ_del_dec.c",
+        "silk/PLC.c",
+        "silk/shell_coder.c",
+        "silk/tables_gain.c",
+        "silk/tables_LTP.c",
+        "silk/tables_NLSF_CB_NB_MB.c",
+        "silk/tables_NLSF_CB_WB.c",
+        "silk/tables_other.c",
+        "silk/tables_pitch_lag.c",
+        "silk/tables_pulses_per_block.c",
+        "silk/VAD.c",
+        "silk/control_audio_bandwidth.c",
+        "silk/quant_LTP_gains.c",
+        "silk/VQ_WMat_EC.c",
+        "silk/HP_variable_cutoff.c",
+        "silk/NLSF_encode.c",
+        "silk/NLSF_VQ.c",
+        "silk/NLSF_unpack.c",
+        "silk/NLSF_del_dec_quant.c",
+        "silk/process_NLSFs.c",
+        "silk/stereo_LR_to_MS.c",
+        "silk/stereo_MS_to_LR.c",
+        "silk/check_control_input.c",
+        "silk/control_SNR.c",
+        "silk/init_encoder.c",
+        "silk/control_codec.c",
+        "silk/A2NLSF.c",
+        "silk/ana_filt_bank_1.c",
+        "silk/biquad_alt.c",
+        "silk/bwexpander_32.c",
+        "silk/bwexpander.c",
+        "silk/debug.c",
+        "silk/decode_pitch.c",
+        "silk/inner_prod_aligned.c",
+        "silk/lin2log.c",
+        "silk/log2lin.c",
+        "silk/LPC_analysis_filter.c",
+        "silk/LPC_inv_pred_gain.c",
+        "silk/table_LSF_cos.c",
+        "silk/NLSF2A.c",
+        "silk/NLSF_stabilize.c",
+        "silk/NLSF_VQ_weights_laroia.c",
+        "silk/pitch_est_tables.c",
+        "silk/resampler.c",
+        "silk/resampler_down2_3.c",
+        "silk/resampler_down2.c",
+        "silk/resampler_private_AR2.c",
+        "silk/resampler_private_down_FIR.c",
+        "silk/resampler_private_IIR_FIR.c",
+        "silk/resampler_private_up2_HQ.c",
+        "silk/resampler_rom.c",
+        "silk/sigm_Q15.c",
+        "silk/sort.c",
+        "silk/sum_sqr_shift.c",
+        "silk/stereo_decode_pred.c",
+        "silk/stereo_encode_pred.c",
+        "silk/stereo_find_predictor.c",
+        "silk/stereo_quant_pred.c",
+    ]
+
+    opus_sources_silk = []
+
+    if env["platform"] in ["android", "ios", "web"]:
+        env_opus.Append(CPPDEFINES=["FIXED_POINT"])
+        opus_sources_silk = [
+            "silk/fixed/LTP_analysis_filter_FIX.c",
+            "silk/fixed/LTP_scale_ctrl_FIX.c",
+            "silk/fixed/corrMatrix_FIX.c",
+            "silk/fixed/encode_frame_FIX.c",
+            "silk/fixed/find_LPC_FIX.c",
+            "silk/fixed/find_LTP_FIX.c",
+            "silk/fixed/find_pitch_lags_FIX.c",
+            "silk/fixed/find_pred_coefs_FIX.c",
"silk/fixed/noise_shape_analysis_FIX.c", + "silk/fixed/prefilter_FIX.c", + "silk/fixed/process_gains_FIX.c", + "silk/fixed/regularize_correlations_FIX.c", + "silk/fixed/residual_energy16_FIX.c", + "silk/fixed/residual_energy_FIX.c", + "silk/fixed/solve_LS_FIX.c", + "silk/fixed/warped_autocorrelation_FIX.c", + "silk/fixed/apply_sine_window_FIX.c", + "silk/fixed/autocorr_FIX.c", + "silk/fixed/burg_modified_FIX.c", + "silk/fixed/k2a_FIX.c", + "silk/fixed/k2a_Q16_FIX.c", + "silk/fixed/pitch_analysis_core_FIX.c", + "silk/fixed/vector_ops_FIX.c", + "silk/fixed/schur64_FIX.c", + "silk/fixed/schur_FIX.c", + ] + else: + opus_sources_silk = [ + "silk/float/apply_sine_window_FLP.c", + "silk/float/corrMatrix_FLP.c", + "silk/float/encode_frame_FLP.c", + "silk/float/find_LPC_FLP.c", + "silk/float/find_LTP_FLP.c", + "silk/float/find_pitch_lags_FLP.c", + "silk/float/find_pred_coefs_FLP.c", + "silk/float/LPC_analysis_filter_FLP.c", + "silk/float/LTP_analysis_filter_FLP.c", + "silk/float/LTP_scale_ctrl_FLP.c", + "silk/float/noise_shape_analysis_FLP.c", + "silk/float/prefilter_FLP.c", + "silk/float/process_gains_FLP.c", + "silk/float/regularize_correlations_FLP.c", + "silk/float/residual_energy_FLP.c", + "silk/float/solve_LS_FLP.c", + "silk/float/warped_autocorrelation_FLP.c", + "silk/float/wrappers_FLP.c", + "silk/float/autocorrelation_FLP.c", + "silk/float/burg_modified_FLP.c", + "silk/float/bwexpander_FLP.c", + "silk/float/energy_FLP.c", + "silk/float/inner_product_FLP.c", + "silk/float/k2a_FLP.c", + "silk/float/levinsondurbin_FLP.c", + "silk/float/LPC_inv_pred_gain_FLP.c", + "silk/float/pitch_analysis_core_FLP.c", + "silk/float/scale_copy_vector_FLP.c", + "silk/float/scale_vector_FLP.c", + "silk/float/schur_FLP.c", + "silk/float/sort_FLP.c", + ] + + thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources + opus_sources_silk] + + # also requires libogg + if env["builtin_libogg"]: + env_opus.Prepend(CPPPATH=["#thirdparty/libogg"]) + + env_opus.Append(CPPDEFINES=["HAVE_CONFIG_H"]) + + thirdparty_include_paths = [ + "", + "celt", + "opus", + "silk", + "silk/fixed", + "silk/float", + ] + env_opus.Prepend(CPPPATH=[thirdparty_dir + "/" + dir for dir in thirdparty_include_paths]) + + if env["platform"] == "android": + if "android_arch" in env and env["android_arch"] == "armv7": + env_opus.Append(CPPDEFINES=["OPUS_ARM_OPT"]) + elif "android_arch" in env and env["android_arch"] == "arm64v8": + env_opus.Append(CPPDEFINES=["OPUS_ARM64_OPT"]) + elif env["platform"] == "iphone": + if "arch" in env and env["arch"] == "arm": + env_opus.Append(CPPDEFINES=["OPUS_ARM_OPT"]) + elif "arch" in env and env["arch"] == "arm64": + env_opus.Append(CPPDEFINES=["OPUS_ARM64_OPT"]) + elif env["platform"] == "osx": + if "arch" in env and env["arch"] == "arm64": + env_opus.Append(CPPDEFINES=["OPUS_ARM64_OPT"]) + + env_thirdparty = env_opus.Clone() + env_thirdparty.disable_warnings() + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) + env.modules_sources += thirdparty_obj + + +# Godot source files + +module_obj = [] + +env_opus.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj)
diff --git a/modules/webm/register_types.cpp b/modules/webm/register_types.cpp
new file mode 100644
index 000000000000..befb3746ff3b
--- /dev/null
+++ b/modules/webm/register_types.cpp
@@ -0,0 +1,49 @@
+/**************************************************************************/
+/* register_types.cpp */
+/**************************************************************************/
+/* This file is part of: */
+/* GODOT ENGINE */
+/* https://godotengine.org */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
+/* */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the */
+/* "Software"), to deal in the Software without restriction, including */
+/* without limitation the rights to use, copy, modify, merge, publish, */
+/* distribute, sublicense, and/or sell copies of the Software, and to */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions: */
+/* */
+/* The above copyright notice and this permission notice shall be */
+/* included in all copies or substantial portions of the Software. */
+/* */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+/**************************************************************************/
+
+#include "register_types.h"
+
+#include "modules/register_module_types.h"
+
+#include "video_stream_webm.h"
+
+static Ref<ResourceFormatLoaderWebm> resource_loader_webm;
+
+void initialize_webm_module(ModuleInitializationLevel p_level) {
+	if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
+		return;
+	}
+
+	resource_loader_webm.instantiate();
+	ResourceLoader::add_resource_format_loader(resource_loader_webm, true);
+
+	GDREGISTER_CLASS(VideoStreamWebm);
+}
+
+void uninitialize_webm_module(ModuleInitializationLevel p_level) {
+	if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) {
+		return;
+	}
+
+	ResourceLoader::remove_resource_format_loader(resource_loader_webm);
+	resource_loader_webm.unref();
+}
diff --git a/modules/webm/register_types.h b/modules/webm/register_types.h
new file mode 100644
index 000000000000..5964a4359c31
--- /dev/null
+++ b/modules/webm/register_types.h
@@ -0,0 +1,39 @@
+/**************************************************************************/
+/* register_types.h */
+/**************************************************************************/
+/* This file is part of: */
+/* GODOT ENGINE */
+/* https://godotengine.org */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.
*/ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#ifndef WEBM_REGISTER_TYPES_H +#define WEBM_REGISTER_TYPES_H + +#include "modules/register_module_types.h" + +void initialize_webm_module(ModuleInitializationLevel p_level); +void uninitialize_webm_module(ModuleInitializationLevel p_level); + +#endif // WEBM_REGISTER_TYPES_H diff --git a/modules/webm/video_stream_webm.cpp b/modules/webm/video_stream_webm.cpp new file mode 100644 index 000000000000..5029d755e1a1 --- /dev/null +++ b/modules/webm/video_stream_webm.cpp @@ -0,0 +1,438 @@ +/**************************************************************************/ +/* video_stream_webm.cpp */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/
+/**************************************************************************/
+
+#include "video_stream_webm.h"
+
+#include "core/config/project_settings.h"
+#include "core/error/error_macros.h"
+#include "core/io/file_access.h"
+#include "core/os/os.h"
+#include "servers/audio_server.h"
+
+#include "thirdparty/misc/yuv2rgb.h"
+
+// libsimplewebm
+#include <OpusVorbisDecoder.hpp>
+#include <VPXDecoder.hpp>
+
+// libvpx
+#include <vpx/vpx_image.h>
+
+// libwebm
+#include <mkvparser/mkvparser.h>
+
+class MkvReader : public mkvparser::IMkvReader {
+public:
+	MkvReader(const String &p_file) {
+		file = FileAccess::open(p_file, FileAccess::READ);
+
+		ERR_FAIL_COND_MSG(file.is_null(), "Failed loading resource: '" + p_file + "'.");
+	}
+	~MkvReader() {
+	}
+
+	virtual int Read(long long pos, long len, unsigned char *buf) {
+		if (file.is_valid()) {
+			if (file->get_position() != (uint64_t)pos) {
+				file->seek(pos);
+			}
+			if (file->get_buffer(buf, len) == (uint64_t)len) {
+				return 0;
+			}
+		}
+		return -1;
+	}
+
+	virtual int Length(long long *total, long long *available) {
+		if (file.is_valid()) {
+			const uint64_t len = file->get_length();
+			if (total) {
+				*total = len;
+			}
+			if (available) {
+				*available = len;
+			}
+			return 0;
+		}
+		return -1;
+	}
+
+private:
+	Ref<FileAccess> file;
+};
+
+VideoStreamPlaybackWebm::VideoStreamPlaybackWebm() {}
+
+VideoStreamPlaybackWebm::~VideoStreamPlaybackWebm() {
+	delete_pointers();
+}
+
+bool VideoStreamPlaybackWebm::open_file(const String &p_file) {
+	file_name = p_file;
+	webm = memnew(WebMDemuxer(new MkvReader(file_name), 0, audio_track));
+	if (webm->isOpen()) {
+		video = memnew(VPXDecoder(*webm, OS::get_singleton()->get_processor_count()));
+		if (video->isOpen()) {
+			audio = memnew(OpusVorbisDecoder(*webm));
+			if (audio->isOpen()) {
+				audio_frame = memnew(WebMFrame);
+				pcm = (float *)memalloc(sizeof(float) * audio->getBufferSamples() * webm->getChannels());
+			} else {
+				memdelete(audio);
+				audio = nullptr;
+			}
+
+			frame_data.resize((webm->getWidth() * webm->getHeight()) << 2);
+			Ref<Image> img = Image::create_from_data(webm->getWidth(), webm->getHeight(), false, Image::FORMAT_RGBA8, frame_data);
+			texture = memnew(ImageTexture);
+			texture->set_image(img);
+
+			return true;
+		}
+		memdelete(video);
+		video = nullptr;
+	}
+	memdelete(webm);
+	webm = nullptr;
+	return false;
+}
+
+void VideoStreamPlaybackWebm::stop() {
+	if (playing) {
+		delete_pointers();
+
+		pcm = nullptr;
+
+		audio_frame = nullptr;
+		video_frames = nullptr;
+
+		video = nullptr;
+		audio = nullptr;
+
+		open_file(file_name); // This should not fail here.
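+		// Re-opening rebuilds the MkvReader/WebMDemuxer/VPXDecoder chain from
+		// the start of the stream; the bookkeeping below is reset to match the
+		// fresh state.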
+
+		video_frames_capacity = video_frames_pos = 0;
+		num_decoded_samples = 0;
+		samples_offset = -1;
+		video_frame_delay = video_pos = 0.0;
+	}
+	playing = false;
+	time = 0;
+}
+
+void VideoStreamPlaybackWebm::play() {
+	if (!playing) {
+		time = 0;
+	} else {
+		stop();
+	}
+	playing = true;
+	delay_compensation = GLOBAL_GET("audio/video/video_delay_compensation_ms");
+	delay_compensation /= 1000.0;
+}
+
+bool VideoStreamPlaybackWebm::is_playing() const {
+	return playing;
+}
+
+void VideoStreamPlaybackWebm::set_paused(bool p_paused) {
+	paused = p_paused;
+}
+
+bool VideoStreamPlaybackWebm::is_paused() const {
+	return paused;
+}
+
+double VideoStreamPlaybackWebm::get_length() const {
+	if (webm) {
+		return webm->getLength();
+	}
+	return 0.0;
+}
+
+double VideoStreamPlaybackWebm::get_playback_position() const {
+	return video_pos;
+}
+
+void VideoStreamPlaybackWebm::seek(double p_time) {
+	if (!webm) {
+		return;
+	}
+	time = webm->seek(p_time);
+	video_pos = time;
+}
+
+void VideoStreamPlaybackWebm::set_audio_track(int p_idx) {
+	audio_track = p_idx;
+}
+
+Ref<Texture2D> VideoStreamPlaybackWebm::get_texture() const {
+	return texture;
+}
+
+void VideoStreamPlaybackWebm::update(double p_delta) {
+	if ((!playing || paused) || !video || texture.is_null()) {
+		return;
+	}
+
+	time += p_delta;
+
+	if (time < video_pos) {
+		return;
+	}
+
+	bool audio_buffer_full = false;
+
+	if (samples_offset > -1) {
+		//Mix remaining samples
+		const int to_read = num_decoded_samples - samples_offset;
+		const int mixed = mix_callback(mix_udata, pcm + samples_offset * webm->getChannels(), to_read);
+		if (mixed != to_read) {
+			samples_offset += mixed;
+			audio_buffer_full = true;
+		} else {
+			samples_offset = -1;
+		}
+	}
+
+	const bool hasAudio = (audio && mix_callback);
+	while ((hasAudio && !audio_buffer_full && !has_enough_video_frames()) ||
+			(!hasAudio && video_frames_pos == 0)) {
+		if (hasAudio && !audio_buffer_full && audio_frame->isValid() &&
+				audio->getPCMF(*audio_frame, pcm, num_decoded_samples) && num_decoded_samples > 0) {
+			const int mixed = mix_callback(mix_udata, pcm, num_decoded_samples);
+
+			if (mixed != num_decoded_samples) {
+				samples_offset = mixed;
+				audio_buffer_full = true;
+			}
+		}
+
+		WebMFrame *video_frame;
+		if (video_frames_pos >= video_frames_capacity) {
+			WebMFrame **video_frames_new = (WebMFrame **)memrealloc(video_frames, ++video_frames_capacity * sizeof(void *));
+			ERR_FAIL_COND(!video_frames_new); //Out of memory
+			(video_frames = video_frames_new)[video_frames_capacity - 1] = memnew(WebMFrame);
+		}
+		video_frame = video_frames[video_frames_pos];
+
+		if (!webm->readFrame(video_frame, audio_frame)) { //This will invalidate frames
+			break; //Can't demux, EOS?
+		}
+
+		if (video_frame->isValid()) {
+			++video_frames_pos;
+		}
+	}
+
+	bool video_frame_done = false;
+	while (video_frames_pos > 0 && !video_frame_done) {
+		WebMFrame *video_frame = video_frames[0];
+
+		// It seems VPXDecoder::decode has to be executed even though we might skip this frame
+		if (video->decode(*video_frame)) {
+			VPXDecoder::IMAGE_ERROR err;
+			VPXDecoder::Image image;
+
+			if (should_process(*video_frame)) {
+				if ((err = video->getImage(image)) != VPXDecoder::NO_FRAME) {
+					if (err == VPXDecoder::NO_ERROR && image.w == webm->getWidth() && image.h == webm->getHeight()) {
+						uint8_t *w = frame_data.ptrw();
+						bool converted = false;
+
+						if (image.chromaShiftW == 0 && image.chromaShiftH == 0 && image.cs == VPX_CS_SRGB) {
+							uint8_t *wp = w;
+							unsigned char *rRow = image.planes[2];
+							unsigned char *gRow = image.planes[0];
+							unsigned char *bRow = image.planes[1];
+							for (int i = 0; i < image.h; i++) {
+								for (int j = 0; j < image.w; j++) {
+									*wp++ = rRow[j];
+									*wp++ = gRow[j];
+									*wp++ = bRow[j];
+									*wp++ = 255;
+								}
+								rRow += image.linesize[2];
+								gRow += image.linesize[0];
+								bRow += image.linesize[1];
+							}
+							converted = true;
+						} else if (image.chromaShiftW == 1 && image.chromaShiftH == 1) {
+							yuv420_2_rgb8888(w, image.planes[0], image.planes[1], image.planes[2], image.w, image.h, image.linesize[0], image.linesize[1], image.w << 2);
+							converted = true;
+						} else if (image.chromaShiftW == 1 && image.chromaShiftH == 0) {
+							yuv422_2_rgb8888(w, image.planes[0], image.planes[1], image.planes[2], image.w, image.h, image.linesize[0], image.linesize[1], image.w << 2);
+							converted = true;
+						} else if (image.chromaShiftW == 0 && image.chromaShiftH == 0) {
+							yuv444_2_rgb8888(w, image.planes[0], image.planes[1], image.planes[2], image.w, image.h, image.linesize[0], image.linesize[1], image.w << 2);
+							converted = true;
+						}
+
+						if (converted) {
+							Ref<Image> img = Image::create_from_data(webm->getWidth(), webm->getHeight(), false, Image::FORMAT_RGBA8, frame_data);
+							texture->update(img);
+							video_frame_done = true;
+						}
+					}
+				}
+			}
+		}
+
+		video_pos = video_frame->time;
+		memmove(video_frames, video_frames + 1, (--video_frames_pos) * sizeof(void *));
+		video_frames[video_frames_pos] = video_frame;
+	}
+	if (webm && webm->isEOS()) {
+		stop();
+	}
+}
+
+void VideoStreamPlaybackWebm::set_mix_callback(VideoStreamPlayback::AudioMixCallback p_callback, void *p_userdata) {
+	mix_callback = p_callback;
+	mix_udata = p_userdata;
+}
+
+int VideoStreamPlaybackWebm::get_channels() const {
+	if (audio) {
+		return webm->getChannels();
+	}
+	return 0;
+}
+
+int VideoStreamPlaybackWebm::get_mix_rate() const {
+	if (audio) {
+		return webm->getSampleRate();
+	}
+	return 0;
+}
+
+inline bool VideoStreamPlaybackWebm::has_enough_video_frames() const {
+	if (video_frames_pos > 0) {
+		// FIXME: AudioServer output latency was fixed in af9bb0e, previously it used to
+		// systematically return 0. Now that it gives a proper latency, it broke this
+		// code where the delay compensation likely never really worked.
+		//const double audio_delay = AudioServer::get_singleton()->get_output_latency();
+		const double video_time = video_frames[video_frames_pos - 1]->time;
+		return video_time >= time + /* audio_delay + */ delay_compensation;
+	}
+	return false;
+}
+
+bool VideoStreamPlaybackWebm::should_process(WebMFrame &video_frame) {
+	// FIXME: AudioServer output latency was fixed in af9bb0e, previously it used to
+	// systematically return 0. Now that it gives a proper latency, it broke this
+	// code where the delay compensation likely never really worked.
+	//const double audio_delay = AudioServer::get_singleton()->get_output_latency();
+	return video_frame.time >= time + /* audio_delay + */ delay_compensation;
+}
+
+void VideoStreamPlaybackWebm::delete_pointers() {
+	if (pcm) {
+		memfree(pcm);
+	}
+
+	if (audio_frame) {
+		memdelete(audio_frame);
+	}
+	if (video_frames) {
+		for (int i = 0; i < video_frames_capacity; ++i) {
+			memdelete(video_frames[i]);
+		}
+		memfree(video_frames);
+	}
+
+	if (video) {
+		memdelete(video);
+	}
+	if (audio) {
+		memdelete(audio);
+	}
+
+	if (webm) {
+		memdelete(webm);
+	}
+}
+
+VideoStreamWebm::VideoStreamWebm() {}
+
+Ref<VideoStreamPlayback> VideoStreamWebm::instantiate_playback() {
+	Ref<VideoStreamPlaybackWebm> pb = memnew(VideoStreamPlaybackWebm);
+	pb->set_audio_track(audio_track);
+	if (pb->open_file(file)) {
+		return pb;
+	}
+	return nullptr;
+}
+
+void VideoStreamWebm::_bind_methods() {
+	ADD_PROPERTY(PropertyInfo(Variant::STRING, "file", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NO_EDITOR | PROPERTY_USAGE_INTERNAL), "set_file", "get_file");
+}
+
+void VideoStreamWebm::set_audio_track(int p_track) {
+	audio_track = p_track;
+}
+
+Ref<Resource> ResourceFormatLoaderWebm::load(const String &p_path, const String &p_original_path, Error *r_error, bool p_use_sub_threads, float *r_progress, CacheMode p_cache_mode) {
+	Ref<FileAccess> f = FileAccess::open(p_path, FileAccess::READ);
+	if (f.is_null()) {
+		if (r_error) {
+			*r_error = ERR_CANT_OPEN;
+		}
+		return Ref<Resource>();
+	}
+
+	VideoStreamWebm *stream = memnew(VideoStreamWebm);
+	stream->set_file(p_path);
+
+	Ref<VideoStreamWebm> webm_stream = Ref<VideoStreamWebm>(stream);
+
+	if (r_error) {
+		*r_error = OK;
+	}
+
+	f->flush();
+	return webm_stream;
+}
+
+void ResourceFormatLoaderWebm::get_recognized_extensions(List<String> *p_extensions) const {
+	p_extensions->push_back("webm");
+}
+
+bool ResourceFormatLoaderWebm::handles_type(const String &p_type) const {
+	return ClassDB::is_parent_class(p_type, "VideoStream");
+}
+
+String ResourceFormatLoaderWebm::get_resource_type(const String &p_path) const {
+	String el = p_path.get_extension().to_lower();
+	if (el == "webm") {
+		return "VideoStreamWebm";
+	}
+	return "";
+}
diff --git a/modules/webm/video_stream_webm.h b/modules/webm/video_stream_webm.h
new file mode 100644
index 000000000000..a6aa6d9d0b0c
--- /dev/null
+++ b/modules/webm/video_stream_webm.h
@@ -0,0 +1,131 @@
+/**************************************************************************/
+/* video_stream_webm.h */
+/**************************************************************************/
+/* This file is part of: */
+/* GODOT ENGINE */
+/* https://godotengine.org */
+/**************************************************************************/
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
+/* */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the */
+/* "Software"), to deal in the Software without restriction, including */
+/* without limitation the rights to use, copy, modify, merge, publish, */
+/* distribute, sublicense, and/or sell copies of the Software, and to */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions: */
+/* */
+/* The above copyright notice and this permission notice shall be */
+/* included in all copies or substantial portions of the Software. */
+/* */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+/**************************************************************************/
+
+#ifndef VIDEO_STREAM_WEBM_H
+#define VIDEO_STREAM_WEBM_H
+
+#include "core/io/resource_loader.h"
+#include "scene/resources/image_texture.h"
+#include "scene/resources/texture.h"
+#include "scene/resources/video_stream.h"
+
+class WebMFrame;
+class WebMDemuxer;
+class VPXDecoder;
+class OpusVorbisDecoder;
+
+class VideoStreamPlaybackWebm : public VideoStreamPlayback {
+	GDCLASS(VideoStreamPlaybackWebm, VideoStreamPlayback);
+
+	String file_name;
+	int audio_track = 0;
+
+	WebMDemuxer *webm = nullptr;
+	VPXDecoder *video = nullptr;
+	OpusVorbisDecoder *audio = nullptr;
+
+	WebMFrame **video_frames = nullptr, *audio_frame = nullptr;
+	int64_t video_frames_pos = 0, video_frames_capacity = 0;
+
+	int num_decoded_samples = 0, samples_offset = -1;
+	AudioMixCallback mix_callback = nullptr;
+	void *mix_udata = nullptr;
+
+	bool playing = false, paused = false;
+	double delay_compensation = 0.0;
+	double time = 0.0, video_frame_delay = 0.0, video_pos = 0.0;
+
+	Vector<uint8_t> frame_data;
+	Ref<ImageTexture> texture = memnew(ImageTexture);
+
+	float *pcm = nullptr;
+
+public:
+	VideoStreamPlaybackWebm();
+	virtual ~VideoStreamPlaybackWebm() override;
+
+	bool open_file(const String &p_file);
+
+	virtual void stop() override;
+	virtual void play() override;
+
+	virtual bool is_playing() const override;
+
+	virtual void set_paused(bool p_paused) override;
+	virtual bool is_paused() const override;
+
+	virtual double get_length() const override;
+
+	virtual double get_playback_position() const override;
+	virtual void seek(double p_time) override;
+
+	virtual void set_audio_track(int p_idx) override;
+
+	virtual Ref<Texture2D> get_texture() const override;
+	virtual void update(double p_delta) override;
+
+	virtual void set_mix_callback(AudioMixCallback p_callback, void *p_userdata) override;
+	virtual int get_channels() const override;
+	virtual int get_mix_rate() const override;
+
+private:
+	inline bool has_enough_video_frames() const;
+	bool should_process(WebMFrame &video_frame);
+
+	void delete_pointers();
+};
+
+/**/
+
+class VideoStreamWebm : public VideoStream {
+	GDCLASS(VideoStreamWebm, VideoStream);
+
+	int audio_track = 0;
+
+protected:
+	static void _bind_methods();
+
+public:
+	VideoStreamWebm();
+
+	virtual Ref<VideoStreamPlayback> instantiate_playback() override;
+
+	virtual void set_audio_track(int p_track) override;
+};
+
+class ResourceFormatLoaderWebm : public ResourceFormatLoader {
+public:
+	virtual Ref<Resource> load(const String &p_path, const String &p_original_path = "", Error *r_error = nullptr, bool p_use_sub_threads = false, float *r_progress = nullptr, CacheMode p_cache_mode = CACHE_MODE_REUSE);
+	virtual void get_recognized_extensions(List<String> *p_extensions) const;
+	virtual bool handles_type(const String &p_type) const;
+	virtual String get_resource_type(const String &p_path) const;
+};
+
+#endif // VIDEO_STREAM_WEBM_H
diff --git a/platform/linuxbsd/detect.py b/platform/linuxbsd/detect.py
index a67434527c68..09d842eb3605
--- a/platform/linuxbsd/detect.py
+++ b/platform/linuxbsd/detect.py
@@ -273,10 +273,17 @@ def configure(env: "SConsEnvironment"):
         if env["arch"] in ["x86_64", "x86_32"]:
             env["x86_libtheora_opt_gcc"] = True
 
+    if not env["builtin_libvpx"]:
+        env.ParseConfig("pkg-config vpx --cflags --libs")
+
     if not env["builtin_libvorbis"]:
         env["builtin_libogg"] = False  # Needed to link against system libvorbis
         env.ParseConfig("pkg-config vorbis vorbisfile --cflags --libs")
 
+    if not env["builtin_opus"]:
+        env["builtin_libogg"] = False  # Needed to link against system opus
+        env.ParseConfig("pkg-config opus opusfile --cflags --libs")
+
     if not env["builtin_libogg"]:
         env.ParseConfig("pkg-config ogg --cflags --libs")
diff --git a/platform/macos/SCsub b/platform/macos/SCsub
index 3924e79fb69c..598444ae24be 100644
--- a/platform/macos/SCsub
+++ b/platform/macos/SCsub
@@ -27,7 +27,9 @@ def generate_bundle(target, source, env):
         target_bin = lipo(bin_dir + "/" + prefix, env.extra_suffix + env.module_version_string)
 
     # Assemble .app bundle and update version info.
-    app_dir = Dir("#bin/" + (prefix + env.extra_suffix + env.module_version_string).replace(".", "_") + ".app").abspath
+    app_dir = Dir(
+        "#bin/" + (prefix + env.extra_suffix + env.module_version_string).replace(".", "_") + ".app"
+    ).abspath
     templ = Dir("#misc/dist/macos_tools.app").abspath
     if os.path.exists(app_dir):
         shutil.rmtree(app_dir)
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 2ce82e82df9c..e3995d1886a3
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -500,6 +500,25 @@ Files extracted from upstream source:
 
 - `LICENSE`
 
+## libsimplewebm
+
+- Upstream: https://github.com/zaps166/libsimplewebm
+- Version: git (fe57fd3cfe6c0af4c6af110b1f84a90cf191d943, 2019)
+- License: MIT (main), BSD-3-Clause (libwebm)
+
+This bundles libwebm, but the copy in use is newer than the one shipped with
+libsimplewebm and may contain *unmarked* changes relative to it.
+
+Files extracted from upstream source:
+
+- all `.cpp` and `.hpp` files in the main folder except `example.cpp`
+- `LICENSE`
+
+Important: Some files have Godot-made changes.
+They are marked with `// -- GODOT start --` and `// -- GODOT end --`
+comments.
+
+
 ## libtheora
 
 - Upstream: https://www.theora.org
@@ -526,6 +545,23 @@ Files extracted from upstream source:
 
 - `COPYING`
 
+## libvpx
+
+- Upstream: https://chromium.googlesource.com/webm/libvpx/
+- Version: 1.6.0 (2016)
+- License: BSD-3-Clause
+
+Files extracted from upstream source:
+
+TODO.
+
+Important: File `libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c` has a
+Godot-made change, marked with `// -- GODOT --` comments.
+
+The files `libvpx/third_party/android/cpu-features.{c,h}` were copied
+from the Android NDK r18.
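+
+Unbundling is possible on Linux/BSD: a sketch of building against a system
+libvpx instead of the bundled copy (assuming the `vpx` pkg-config module is
+installed; see `platform/linuxbsd/detect.py`):
+
+```
+scons platform=linuxbsd builtin_libvpx=no
+```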
+ + ## libwebp - Upstream: https://chromium.googlesource.com/webm/libwebp/ @@ -787,6 +823,23 @@ Exclude: - All dotfiles +## opus + +- Upstream: https://opus-codec.org +- Version: 1.1.5 (opus) and 0.8 (opusfile) (2017) +- License: BSD-3-Clause + +Files extracted from upstream source: + +- all .c and .h files in src/ (both opus and opusfile) +- all .h files in include/ (both opus and opusfile) as opus/ +- remove unused `opus_demo.c`, +- remove `http.c`, `wincerts.c` and `winerrno.h` (part of + unused libopusurl) +- celt/ and silk/ subfolders +- COPYING + + ## pcre2 - Upstream: http://www.pcre.org diff --git a/thirdparty/libsimplewebm/LICENSE b/thirdparty/libsimplewebm/LICENSE new file mode 100644 index 000000000000..058633ac183b --- /dev/null +++ b/thirdparty/libsimplewebm/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2016 Błażej Szczygieł + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/thirdparty/libsimplewebm/OpusVorbisDecoder.cpp b/thirdparty/libsimplewebm/OpusVorbisDecoder.cpp new file mode 100644 index 000000000000..b5824b17be8c --- /dev/null +++ b/thirdparty/libsimplewebm/OpusVorbisDecoder.cpp @@ -0,0 +1,264 @@ +/* + MIT License + + Copyright (c) 2016 Błażej Szczygieł + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/
+
+#include "OpusVorbisDecoder.hpp"
+
+#include <vorbis/codec.h>
+#include <opus.h>
+
+#include <string.h>
+
+struct VorbisDecoder
+{
+	vorbis_info info;
+	vorbis_dsp_state dspState;
+	vorbis_block block;
+	ogg_packet op;
+
+	bool hasDSPState, hasBlock;
+};
+
+/**/
+
+OpusVorbisDecoder::OpusVorbisDecoder(const WebMDemuxer &demuxer) :
+	m_vorbis(NULL), m_opus(NULL),
+	m_numSamples(0)
+{
+	switch (demuxer.getAudioCodec())
+	{
+		case WebMDemuxer::AUDIO_VORBIS:
+			m_channels = demuxer.getChannels();
+			if (openVorbis(demuxer))
+				return;
+			break;
+		case WebMDemuxer::AUDIO_OPUS:
+			m_channels = demuxer.getChannels();
+			if (openOpus(demuxer))
+				return;
+			break;
+		default:
+			return;
+	}
+	close();
+}
+OpusVorbisDecoder::~OpusVorbisDecoder()
+{
+	close();
+}
+
+bool OpusVorbisDecoder::isOpen() const
+{
+	return (m_vorbis || m_opus);
+}
+
+bool OpusVorbisDecoder::getPCMS16(WebMFrame &frame, short *buffer, int &numOutSamples)
+{
+	if (m_vorbis)
+	{
+		m_vorbis->op.packet = frame.buffer;
+		m_vorbis->op.bytes = frame.bufferSize;
+
+		if (vorbis_synthesis(&m_vorbis->block, &m_vorbis->op))
+			return false;
+		if (vorbis_synthesis_blockin(&m_vorbis->dspState, &m_vorbis->block))
+			return false;
+
+		const int maxSamples = getBufferSamples();
+		int samplesCount, count = 0;
+		float **pcm;
+		while ((samplesCount = vorbis_synthesis_pcmout(&m_vorbis->dspState, &pcm)))
+		{
+			const int toConvert = samplesCount <= maxSamples ? samplesCount : maxSamples;
+			for (int c = 0; c < m_channels; ++c)
+			{
+				float *samples = pcm[c];
+				for (int i = 0, j = c; i < toConvert; ++i, j += m_channels)
+				{
+					int sample = samples[i] * 32767.0f;
+					if (sample > 32767)
+						sample = 32767;
+					else if (sample < -32768)
+						sample = -32768;
+					buffer[count + j] = sample;
+				}
+			}
+			vorbis_synthesis_read(&m_vorbis->dspState, toConvert);
+			count += toConvert;
+		}
+
+		numOutSamples = count;
+		return true;
+	}
+	else if (m_opus)
+	{
+		const int samples = opus_decode(m_opus, frame.buffer, frame.bufferSize, buffer, m_numSamples, 0);
+		if (samples >= 0)
+		{
+			numOutSamples = samples;
+			return true;
+		}
+	}
+	return false;
+}
+
+// -- GODOT begin --
+bool OpusVorbisDecoder::getPCMF(WebMFrame &frame, float *buffer, int &numOutSamples) {
+	if (m_vorbis) {
+		m_vorbis->op.packet = frame.buffer;
+		m_vorbis->op.bytes = frame.bufferSize;
+
+		if (vorbis_synthesis(&m_vorbis->block, &m_vorbis->op))
+			return false;
+		if (vorbis_synthesis_blockin(&m_vorbis->dspState, &m_vorbis->block))
+			return false;
+
+		const int maxSamples = getBufferSamples();
+		int samplesCount, count = 0;
+		float **pcm;
+		while ((samplesCount = vorbis_synthesis_pcmout(&m_vorbis->dspState, &pcm))) {
+			const int toConvert = samplesCount <= maxSamples ?
samplesCount : maxSamples; + for (int c = 0; c < m_channels; ++c) { + float *samples = pcm[c]; + for (int i = 0, j = c; i < toConvert; ++i, j += m_channels) { + buffer[count + j] = samples[i]; + } + } + vorbis_synthesis_read(&m_vorbis->dspState, toConvert); + count += toConvert; + } + + numOutSamples = count; + return true; + } else if (m_opus) { + const int samples = opus_decode_float(m_opus, frame.buffer, frame.bufferSize, buffer, m_numSamples, 0); + if (samples >= 0) { + numOutSamples = samples; + return true; + } + } + return false; +} +// -- GODOT end -- + +bool OpusVorbisDecoder::openVorbis(const WebMDemuxer &demuxer) +{ + size_t extradataSize = 0; + const unsigned char *extradata = demuxer.getAudioExtradata(extradataSize); + + if (extradataSize < 3 || !extradata || extradata[0] != 2) + return false; + + size_t headerSize[3] = {0}; + size_t offset = 1; + + /* Calculate three headers sizes */ + for (int i = 0; i < 2; ++i) + { + for (;;) + { + if (offset >= extradataSize) + return false; + headerSize[i] += extradata[offset]; + if (extradata[offset++] < 0xFF) + break; + } + } + headerSize[2] = extradataSize - (headerSize[0] + headerSize[1] + offset); + + if (headerSize[0] + headerSize[1] + headerSize[2] + offset != extradataSize) + return false; + + ogg_packet op[3]; + memset(op, 0, sizeof op); + + op[0].packet = (unsigned char *)extradata + offset; + op[0].bytes = headerSize[0]; + op[0].b_o_s = 1; + + op[1].packet = (unsigned char *)extradata + offset + headerSize[0]; + op[1].bytes = headerSize[1]; + + op[2].packet = (unsigned char *)extradata + offset + headerSize[0] + headerSize[1]; + op[2].bytes = headerSize[2]; + + m_vorbis = new VorbisDecoder; + m_vorbis->hasDSPState = m_vorbis->hasBlock = false; + vorbis_info_init(&m_vorbis->info); + + /* Upload three Vorbis headers into libvorbis */ + vorbis_comment vc; + vorbis_comment_init(&vc); + for (int i = 0; i < 3; ++i) + { + if (vorbis_synthesis_headerin(&m_vorbis->info, &vc, &op[i])) + { + vorbis_comment_clear(&vc); + return false; + } + } + vorbis_comment_clear(&vc); + + if (vorbis_synthesis_init(&m_vorbis->dspState, &m_vorbis->info)) + return false; + m_vorbis->hasDSPState = true; + + if (m_vorbis->info.channels != m_channels || m_vorbis->info.rate != demuxer.getSampleRate()) + return false; + + if (vorbis_block_init(&m_vorbis->dspState, &m_vorbis->block)) + return false; + m_vorbis->hasBlock = true; + + memset(&m_vorbis->op, 0, sizeof m_vorbis->op); + + m_numSamples = 4096 / m_channels; + + return true; +} +bool OpusVorbisDecoder::openOpus(const WebMDemuxer &demuxer) +{ + int opusErr = 0; + m_opus = opus_decoder_create(demuxer.getSampleRate(), m_channels, &opusErr); + if (!opusErr) + { + m_numSamples = demuxer.getSampleRate() * 0.06 + 0.5; //Maximum frame size (for 60 ms frame) + return true; + } + return false; +} + +void OpusVorbisDecoder::close() +{ + if (m_vorbis) + { + if (m_vorbis->hasBlock) + vorbis_block_clear(&m_vorbis->block); + if (m_vorbis->hasDSPState) + vorbis_dsp_clear(&m_vorbis->dspState); + vorbis_info_clear(&m_vorbis->info); + delete m_vorbis; + } + if (m_opus) + opus_decoder_destroy(m_opus); +} diff --git a/thirdparty/libsimplewebm/OpusVorbisDecoder.hpp b/thirdparty/libsimplewebm/OpusVorbisDecoder.hpp new file mode 100644 index 000000000000..f285b3fbd613 --- /dev/null +++ b/thirdparty/libsimplewebm/OpusVorbisDecoder.hpp @@ -0,0 +1,65 @@ +/* + MIT License + + Copyright (c) 2016 Błażej Szczygieł + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated 
documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef OPUSVORBISDECODER_HPP +#define OPUSVORBISDECODER_HPP + +#include "WebMDemuxer.hpp" + +struct VorbisDecoder; +struct OpusDecoder; + +class OpusVorbisDecoder +{ + OpusVorbisDecoder(const OpusVorbisDecoder &); + void operator =(const OpusVorbisDecoder &); +public: + OpusVorbisDecoder(const WebMDemuxer &demuxer); + ~OpusVorbisDecoder(); + + bool isOpen() const; + + inline int getBufferSamples() const + { + return m_numSamples; + } + bool getPCMS16(WebMFrame &frame, short *buffer, int &numOutSamples); +// -- GODOT begin -- + bool getPCMF(WebMFrame &frame, float *buffer, int &numOutSamples); +// -- GODOT end -- + +private: + bool openVorbis(const WebMDemuxer &demuxer); + bool openOpus(const WebMDemuxer &demuxer); + + void close(); + + VorbisDecoder *m_vorbis; + OpusDecoder *m_opus; + int m_numSamples; + int m_channels; + +}; + +#endif // OPUSVORBISDECODER_HPP diff --git a/thirdparty/libsimplewebm/VPXDecoder.cpp b/thirdparty/libsimplewebm/VPXDecoder.cpp new file mode 100644 index 000000000000..e2606f83ba09 --- /dev/null +++ b/thirdparty/libsimplewebm/VPXDecoder.cpp @@ -0,0 +1,154 @@ +/* + MIT License + + Copyright (c) 2016 Błażej Szczygieł + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/
+
+#include "VPXDecoder.hpp"
+
+#include <vpx/vpx_decoder.h>
+#include <vpx/vp8dx.h>
+
+#include <stdio.h>
+#include <string.h>
+
+VPXDecoder::VPXDecoder(const WebMDemuxer &demuxer, unsigned threads) :
+	m_ctx(NULL),
+	m_iter(NULL),
+	m_delay(0),
+	m_last_space(VPX_CS_UNKNOWN)
+{
+	if (threads > 8)
+		threads = 8;
+	else if (threads < 1)
+		threads = 1;
+
+	const vpx_codec_dec_cfg_t codecCfg = {
+		threads,
+		0,
+		0
+	};
+	vpx_codec_iface_t *codecIface = NULL;
+
+	switch (demuxer.getVideoCodec())
+	{
+		case WebMDemuxer::VIDEO_VP8:
+			codecIface = vpx_codec_vp8_dx();
+			break;
+		case WebMDemuxer::VIDEO_VP9:
+			codecIface = vpx_codec_vp9_dx();
+			m_delay = threads - 1;
+			break;
+		default:
+			return;
+	}
+
+	m_ctx = new vpx_codec_ctx_t;
+	if (vpx_codec_dec_init(m_ctx, codecIface, &codecCfg, m_delay > 0 ? VPX_CODEC_USE_FRAME_THREADING : 0))
+	{
+		delete m_ctx;
+		m_ctx = NULL;
+	}
+}
+VPXDecoder::~VPXDecoder()
+{
+	if (m_ctx)
+	{
+		vpx_codec_destroy(m_ctx);
+		delete m_ctx;
+	}
+}
+
+bool VPXDecoder::decode(const WebMFrame &frame)
+{
+	m_iter = NULL;
+	return !vpx_codec_decode(m_ctx, frame.buffer, frame.bufferSize, NULL, 0);
+}
+VPXDecoder::IMAGE_ERROR VPXDecoder::getImage(Image &image)
+{
+	IMAGE_ERROR err = NO_FRAME;
+	if (vpx_image_t *img = vpx_codec_get_frame(m_ctx, &m_iter))
+	{
+		// It seems to be a common problem that UNKNOWN comes up a lot, yet FFMPEG is somehow getting accurate colour-space information.
+		// After checking FFMPEG code, *they're* getting colour-space information, so I'm assuming something like this is going on.
+		// It appears to work, at least.
+		if (img->cs != VPX_CS_UNKNOWN)
+			m_last_space = img->cs;
+		if ((img->fmt & VPX_IMG_FMT_PLANAR) && !(img->fmt & (VPX_IMG_FMT_HAS_ALPHA | VPX_IMG_FMT_HIGHBITDEPTH)))
+		{
+			if (img->stride[0] && img->stride[1] && img->stride[2])
+			{
+				const int uPlane = !!(img->fmt & VPX_IMG_FMT_UV_FLIP) + 1;
+				const int vPlane = !(img->fmt & VPX_IMG_FMT_UV_FLIP) + 1;
+
+				image.w = img->d_w;
+				image.h = img->d_h;
+				image.cs = m_last_space;
+				image.chromaShiftW = img->x_chroma_shift;
+				image.chromaShiftH = img->y_chroma_shift;
+
+				image.planes[0] = img->planes[0];
+				image.planes[1] = img->planes[uPlane];
+				image.planes[2] = img->planes[vPlane];
+
+				image.linesize[0] = img->stride[0];
+				image.linesize[1] = img->stride[uPlane];
+				image.linesize[2] = img->stride[vPlane];
+
+				err = NO_ERROR;
+			}
+		}
+		else
+		{
+			err = UNSUPPORTED_FRAME;
+		}
+	}
+	return err;
+}
+
+/**/
+
+// -- GODOT begin --
+#if 0
+// -- GODOT end --
+
+static inline int ceilRshift(int val, int shift)
+{
+	return (val + (1 << shift) - 1) >> shift;
+}
+
+int VPXDecoder::Image::getWidth(int plane) const
+{
+	if (!plane)
+		return w;
+	return ceilRshift(w, chromaShiftW);
+}
+int VPXDecoder::Image::getHeight(int plane) const
+{
+	if (!plane)
+		return h;
+	return ceilRshift(h, chromaShiftH);
+}
+
+// -- GODOT begin --
+#endif
+// -- GODOT end --
+
diff --git a/thirdparty/libsimplewebm/VPXDecoder.hpp b/thirdparty/libsimplewebm/VPXDecoder.hpp
new file mode 100644
index 000000000000..5071b069cb1e
--- /dev/null
+++ b/thirdparty/libsimplewebm/VPXDecoder.hpp
@@ -0,0 +1,86 @@
+/*
+	MIT License
+
+	Copyright (c) 2016 Błażej Szczygieł
+
+	Permission is hereby granted, free of charge, to any person obtaining a copy
+	of this software and associated documentation files (the "Software"), to deal
+	in the Software without restriction, including without limitation the rights
+	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+	copies of the Software, and to permit persons to whom the Software is
+	furnished to do so, subject to the
following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef VPXDECODER_HPP +#define VPXDECODER_HPP + +#include "WebMDemuxer.hpp" + +struct vpx_codec_ctx; + +class VPXDecoder +{ + VPXDecoder(const VPXDecoder &); + void operator =(const VPXDecoder &); +public: + class Image + { + public: +// -- GODOT begin -- +#if 0 +// -- GODOT end -- + int getWidth(int plane) const; + int getHeight(int plane) const; +// -- GODOT begin -- +#endif +// -- GODOT end -- + + int w, h; + int cs; + int chromaShiftW, chromaShiftH; + unsigned char *planes[3]; + int linesize[3]; + }; + + enum IMAGE_ERROR + { + UNSUPPORTED_FRAME = -1, + NO_ERROR, + NO_FRAME + }; + + VPXDecoder(const WebMDemuxer &demuxer, unsigned threads = 1); + ~VPXDecoder(); + + inline bool isOpen() const + { + return (bool)m_ctx; + } + + inline int getFramesDelay() const + { + return m_delay; + } + + bool decode(const WebMFrame &frame); + IMAGE_ERROR getImage(Image &image); //The data is NOT copied! Only 3-plane, 8-bit images are supported. + +private: + vpx_codec_ctx *m_ctx; + const void *m_iter; + int m_delay; + int m_last_space; +}; + +#endif // VPXDECODER_HPP diff --git a/thirdparty/libsimplewebm/WebMDemuxer.cpp b/thirdparty/libsimplewebm/WebMDemuxer.cpp new file mode 100644 index 000000000000..a5854882d6a9 --- /dev/null +++ b/thirdparty/libsimplewebm/WebMDemuxer.cpp @@ -0,0 +1,269 @@ +/* + MIT License + + Copyright (c) 2016 Błażej Szczygieł + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/
+
+#include "WebMDemuxer.hpp"
+
+#include "mkvparser/mkvparser.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+WebMFrame::WebMFrame() :
+	bufferSize(0), bufferCapacity(0),
+	buffer(NULL),
+	time(0),
+	key(false)
+{}
+WebMFrame::~WebMFrame()
+{
+	free(buffer);
+}
+
+/**/
+
+WebMDemuxer::WebMDemuxer(mkvparser::IMkvReader *reader, int videoTrack, int audioTrack) :
+	m_reader(reader),
+	m_segment(NULL),
+	m_cluster(NULL), m_block(NULL), m_blockEntry(NULL),
+	m_blockFrameIndex(0),
+	m_videoTrack(NULL), m_vCodec(NO_VIDEO),
+	m_audioTrack(NULL), m_aCodec(NO_AUDIO),
+	m_seekTime(0.0f),
+	m_isSeek(false),
+	m_isOpen(false),
+	m_eos(false)
+{
+	long long pos = 0;
+	if (mkvparser::EBMLHeader().Parse(m_reader, pos))
+		return;
+
+	if (mkvparser::Segment::CreateInstance(m_reader, pos, m_segment))
+		return;
+
+	if (m_segment->Load() < 0)
+		return;
+
+	const mkvparser::Tracks *tracks = m_segment->GetTracks();
+	const unsigned long tracksCount = tracks->GetTracksCount();
+	int currVideoTrack = -1, currAudioTrack = -1;
+	for (unsigned long i = 0; i < tracksCount; ++i)
+	{
+		const mkvparser::Track *track = tracks->GetTrackByIndex(i);
+		if (const char *codecId = track->GetCodecId())
+		{
+			if ((!m_videoTrack || currVideoTrack != videoTrack) && track->GetType() == mkvparser::Track::kVideo)
+			{
+				if (!strcmp(codecId, "V_VP8"))
+					m_vCodec = VIDEO_VP8;
+				else if (!strcmp(codecId, "V_VP9"))
+					m_vCodec = VIDEO_VP9;
+				if (m_vCodec != NO_VIDEO)
+					m_videoTrack = static_cast<const mkvparser::VideoTrack *>(track);
+				++currVideoTrack;
+			}
+			if ((!m_audioTrack || currAudioTrack != audioTrack) && track->GetType() == mkvparser::Track::kAudio)
+			{
+				if (!strcmp(codecId, "A_VORBIS"))
+					m_aCodec = AUDIO_VORBIS;
+				else if (!strcmp(codecId, "A_OPUS"))
+					m_aCodec = AUDIO_OPUS;
+				if (m_aCodec != NO_AUDIO)
+					m_audioTrack = static_cast<const mkvparser::AudioTrack *>(track);
+				++currAudioTrack;
+			}
+		}
+	}
+	if (!m_videoTrack && !m_audioTrack)
+		return;
+
+	m_isOpen = true;
+}
+WebMDemuxer::~WebMDemuxer()
+{
+	delete m_segment;
+	delete m_reader;
+}
+
+double WebMDemuxer::getLength() const
+{
+	return m_segment->GetDuration() / 1e9;
+}
+
+WebMDemuxer::VIDEO_CODEC WebMDemuxer::getVideoCodec() const
+{
+	return m_vCodec;
+}
+int WebMDemuxer::getWidth() const
+{
+	return m_videoTrack->GetWidth();
+}
+int WebMDemuxer::getHeight() const
+{
+	return m_videoTrack->GetHeight();
+}
+
+WebMDemuxer::AUDIO_CODEC WebMDemuxer::getAudioCodec() const
+{
+	return m_aCodec;
+}
+const unsigned char *WebMDemuxer::getAudioExtradata(size_t &size) const
+{
+	return m_audioTrack->GetCodecPrivate(size);
+}
+double WebMDemuxer::getSampleRate() const
+{
+	return m_audioTrack->GetSamplingRate();
+}
+int WebMDemuxer::getChannels() const
+{
+	return m_audioTrack->GetChannels();
+}
+int WebMDemuxer::getAudioDepth() const
+{
+	return m_audioTrack->GetBitDepth();
+}
+
+float WebMDemuxer::seek(float p_time){
+
+	if(p_time < 0)
+		return 0.0f;
+
+	m_seekTime = p_time;
+	m_isSeek = true;
+
+	const mkvparser::BlockEntry *blockEntry = NULL;
+	m_videoTrack->Seek(m_seekTime * 1e9, blockEntry);
+
+	if(!blockEntry || !blockEntry->GetBlock() || !blockEntry->GetCluster())
+		return 0.0f;
+
+	return blockEntry->GetBlock()->GetTime(blockEntry->GetCluster()) / 1e9;
+}
+
+bool WebMDemuxer::readFrame(WebMFrame *videoFrame, WebMFrame *audioFrame)
+{
+	const long videoTrackNumber = (videoFrame && m_videoTrack) ? m_videoTrack->GetNumber() : 0;
+	const long audioTrackNumber = (audioFrame && m_audioTrack) ?
m_audioTrack->GetNumber() : 0; + bool blockEntryEOS = false; + + if (videoFrame) + videoFrame->bufferSize = 0; + if (audioFrame) + audioFrame->bufferSize = 0; + + if (videoTrackNumber == 0 && audioTrackNumber == 0) + return false; + + if (m_eos) + return false; + + if (!m_cluster) + m_cluster = m_segment->GetFirst(); + + do + { + bool getNewBlock = false; + long status = 0; + if (!m_blockEntry && !blockEntryEOS) + { + status = m_cluster->GetFirst(m_blockEntry); + getNewBlock = true; + } + else if (blockEntryEOS || m_blockEntry->EOS()) + { + m_cluster = m_segment->GetNext(m_cluster); + if (!m_cluster || m_cluster->EOS()) + { + m_eos = true; + return false; + } + status = m_cluster->GetFirst(m_blockEntry); + blockEntryEOS = false; + getNewBlock = true; + } + else if (!m_block || m_blockFrameIndex == m_block->GetFrameCount() || notSupportedTrackNumber(videoTrackNumber, audioTrackNumber)) + { + if(m_isSeek) + { + m_videoTrack->Seek(m_seekTime * 1e9, m_blockEntry); + m_cluster = m_blockEntry->GetCluster(); + m_isSeek = false; + } + else + { + status = m_cluster->GetNext(m_blockEntry, m_blockEntry); + } + if (!m_blockEntry || m_blockEntry->EOS()) + { + blockEntryEOS = true; + continue; + } + getNewBlock = true; + } + if (status || !m_blockEntry) + return false; + if (getNewBlock) + { + m_block = m_blockEntry->GetBlock(); + m_blockFrameIndex = 0; + } + } while (blockEntryEOS || notSupportedTrackNumber(videoTrackNumber, audioTrackNumber)); + + WebMFrame *frame = NULL; + + const long trackNumber = m_block->GetTrackNumber(); + if (trackNumber == videoTrackNumber) + frame = videoFrame; + else if (trackNumber == audioTrackNumber) + frame = audioFrame; + else + { + //Should not be possible + assert(trackNumber == videoTrackNumber || trackNumber == audioTrackNumber); + return false; + } + + const mkvparser::Block::Frame &blockFrame = m_block->GetFrame(m_blockFrameIndex++); + if (blockFrame.len > frame->bufferCapacity) + { + unsigned char *newBuff = (unsigned char *)realloc(frame->buffer, frame->bufferCapacity = blockFrame.len); + if (newBuff) + frame->buffer = newBuff; + else // Out of memory + return false; + } + frame->bufferSize = blockFrame.len; + + frame->time = m_block->GetTime(m_cluster) / 1e9; + frame->key = m_block->IsKey(); + + return !blockFrame.Read(m_reader, frame->buffer); +} + +inline bool WebMDemuxer::notSupportedTrackNumber(long videoTrackNumber, long audioTrackNumber) const +{ + const long trackNumber = m_block->GetTrackNumber(); + return (trackNumber != videoTrackNumber && trackNumber != audioTrackNumber); +} diff --git a/thirdparty/libsimplewebm/WebMDemuxer.hpp b/thirdparty/libsimplewebm/WebMDemuxer.hpp new file mode 100644 index 000000000000..af649c289fc5 --- /dev/null +++ b/thirdparty/libsimplewebm/WebMDemuxer.hpp @@ -0,0 +1,128 @@ +/* + MIT License + + Copyright (c) 2016 Błażej Szczygieł + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. 
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+	SOFTWARE.
+*/
+
+#ifndef WEBMDEMUXER_HPP
+#define WEBMDEMUXER_HPP
+
+#include <stddef.h>
+
+namespace mkvparser {
+	class IMkvReader;
+	class Segment;
+	class Cluster;
+	class Block;
+	class BlockEntry;
+	class VideoTrack;
+	class AudioTrack;
+}
+
+class WebMFrame
+{
+	WebMFrame(const WebMFrame &);
+	void operator =(const WebMFrame &);
+public:
+	WebMFrame();
+	~WebMFrame();
+
+	inline bool isValid() const
+	{
+		return bufferSize > 0;
+	}
+
+	long bufferSize, bufferCapacity;
+	unsigned char *buffer;
+	double time;
+	bool key;
+};
+
+class WebMDemuxer
+{
+	WebMDemuxer(const WebMDemuxer &);
+	void operator =(const WebMDemuxer &);
+public:
+	enum VIDEO_CODEC
+	{
+		NO_VIDEO,
+		VIDEO_VP8,
+		VIDEO_VP9
+	};
+	enum AUDIO_CODEC
+	{
+		NO_AUDIO,
+		AUDIO_VORBIS,
+		AUDIO_OPUS
+	};
+
+	WebMDemuxer(mkvparser::IMkvReader *reader, int videoTrack = 0, int audioTrack = 0);
+	~WebMDemuxer();
+
+	inline bool isOpen() const
+	{
+		return m_isOpen;
+	}
+	inline bool isEOS() const
+	{
+		return m_eos;
+	}
+
+	double getLength() const;
+
+	VIDEO_CODEC getVideoCodec() const;
+	int getWidth() const;
+	int getHeight() const;
+
+	AUDIO_CODEC getAudioCodec() const;
+	const unsigned char *getAudioExtradata(size_t &size) const; // Needed for Vorbis
+	double getSampleRate() const;
+	int getChannels() const;
+	int getAudioDepth() const;
+
+	float seek(float p_time);
+	bool readFrame(WebMFrame *videoFrame, WebMFrame *audioFrame);
+
+private:
+	inline bool notSupportedTrackNumber(long videoTrackNumber, long audioTrackNumber) const;
+
+	mkvparser::IMkvReader *m_reader;
+	mkvparser::Segment *m_segment;
+
+	const mkvparser::Cluster *m_cluster;
+	const mkvparser::Block *m_block;
+	const mkvparser::BlockEntry *m_blockEntry;
+
+	int m_blockFrameIndex;
+
+	const mkvparser::VideoTrack *m_videoTrack;
+	VIDEO_CODEC m_vCodec;
+
+	const mkvparser::AudioTrack *m_audioTrack;
+	AUDIO_CODEC m_aCodec;
+
+	bool m_isSeek;
+	float m_seekTime;
+	bool m_isOpen;
+	bool m_eos;
+};
+
+#endif // WEBMDEMUXER_HPP
diff --git a/thirdparty/libsimplewebm/libwebm/AUTHORS.TXT b/thirdparty/libsimplewebm/libwebm/AUTHORS.TXT
new file mode 100644
index 000000000000..9686ac13eb0c
--- /dev/null
+++ b/thirdparty/libsimplewebm/libwebm/AUTHORS.TXT
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/thirdparty/libsimplewebm/libwebm/LICENSE.TXT b/thirdparty/libsimplewebm/libwebm/LICENSE.TXT
new file mode 100644
index 000000000000..7a6f99547d4d
--- /dev/null
+++ b/thirdparty/libsimplewebm/libwebm/LICENSE.TXT
@@ -0,0 +1,30 @@
+Copyright (c) 2010, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google nor the names of its contributors may
+    be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/thirdparty/libsimplewebm/libwebm/PATENTS.TXT b/thirdparty/libsimplewebm/libwebm/PATENTS.TXT
new file mode 100644
index 000000000000..caedf607e95a
--- /dev/null
+++ b/thirdparty/libsimplewebm/libwebm/PATENTS.TXT
@@ -0,0 +1,23 @@
+Additional IP Rights Grant (Patents)
+------------------------------------
+
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitute direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
diff --git a/thirdparty/libsimplewebm/libwebm/README.libvpx b/thirdparty/libsimplewebm/libwebm/README.libvpx
new file mode 100644
index 000000000000..1aa93b75aa23
--- /dev/null
+++ b/thirdparty/libsimplewebm/libwebm/README.libvpx
@@ -0,0 +1,11 @@
+URL: https://chromium.googlesource.com/webm/libwebm
+Version: d7c62173ff6b4a5e0a2f86683a5b67db98cf09bf
+License: BSD
+License File: LICENSE.txt
+
+Description:
+libwebm is used to handle WebM container I/O.
+
+Local Changes:
+* Removed: "mkvmuxer", "hdr_util", "file_util", "mkv_reader".
+* Make "~IMkvReader()" public.
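Taken together, the three wrappers vendored above form the whole playback-side API of libsimplewebm: `WebMDemuxer` pulls compressed frames out of the container, `VPXDecoder` turns video frames into planar YUV images, and `OpusVorbisDecoder` turns audio frames into PCM. The sketch below shows how a caller is expected to wire them together; it is illustrative only and not part of this patch, and `reader` stands for any caller-provided `mkvparser::IMkvReader` implementation (the stock `mkv_reader` helper is removed from this copy, per the local changes listed above).

```cpp
// Minimal sketch of driving libsimplewebm; "reader" is a hypothetical,
// caller-provided mkvparser::IMkvReader implementation (e.g. file-backed).
#include "WebMDemuxer.hpp"
#include "VPXDecoder.hpp"
#include "OpusVorbisDecoder.hpp"

#include <vector>

bool demux_and_decode(mkvparser::IMkvReader *reader) {
	WebMDemuxer demuxer(reader); // Takes ownership; deletes "reader" in its destructor.
	if (!demuxer.isOpen())
		return false;

	VPXDecoder video(demuxer, 4); // Up to 4 decode threads.
	OpusVorbisDecoder audio(demuxer);

	std::vector<float> pcm;
	if (audio.isOpen())
		pcm.resize((size_t)audio.getBufferSamples() * demuxer.getChannels());

	WebMFrame videoFrame, audioFrame;
	while (demuxer.readFrame(&videoFrame, &audioFrame)) {
		if (video.isOpen() && videoFrame.isValid() && video.decode(videoFrame)) {
			VPXDecoder::Image image;
			while (video.getImage(image) == VPXDecoder::NO_ERROR) {
				// image.planes[0..2] alias decoder-owned memory (not copied);
				// convert or copy them before the next decode() call.
			}
		}
		if (audio.isOpen() && audioFrame.isValid()) {
			int numOutSamples = 0;
			if (audio.getPCMF(audioFrame, pcm.data(), numOutSamples)) {
				// numOutSamples interleaved samples per channel are now in pcm.
			}
		}
	}
	return demuxer.isEOS();
}
```

Note the two ownership quirks the sketch relies on: `WebMDemuxer` deletes the reader it was constructed with, and `VPXDecoder::getImage()` hands back views into decoder-owned planes rather than copies, which is why each image must be consumed before the next `decode()`.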
diff --git a/thirdparty/libsimplewebm/libwebm/common/webmids.h b/thirdparty/libsimplewebm/libwebm/common/webmids.h new file mode 100644 index 000000000000..89d722a71bcb --- /dev/null +++ b/thirdparty/libsimplewebm/libwebm/common/webmids.h @@ -0,0 +1,192 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. + +#ifndef COMMON_WEBMIDS_H_ +#define COMMON_WEBMIDS_H_ + +namespace libwebm { + +enum MkvId { + kMkvEBML = 0x1A45DFA3, + kMkvEBMLVersion = 0x4286, + kMkvEBMLReadVersion = 0x42F7, + kMkvEBMLMaxIDLength = 0x42F2, + kMkvEBMLMaxSizeLength = 0x42F3, + kMkvDocType = 0x4282, + kMkvDocTypeVersion = 0x4287, + kMkvDocTypeReadVersion = 0x4285, + kMkvVoid = 0xEC, + kMkvSignatureSlot = 0x1B538667, + kMkvSignatureAlgo = 0x7E8A, + kMkvSignatureHash = 0x7E9A, + kMkvSignaturePublicKey = 0x7EA5, + kMkvSignature = 0x7EB5, + kMkvSignatureElements = 0x7E5B, + kMkvSignatureElementList = 0x7E7B, + kMkvSignedElement = 0x6532, + // segment + kMkvSegment = 0x18538067, + // Meta Seek Information + kMkvSeekHead = 0x114D9B74, + kMkvSeek = 0x4DBB, + kMkvSeekID = 0x53AB, + kMkvSeekPosition = 0x53AC, + // Segment Information + kMkvInfo = 0x1549A966, + kMkvTimecodeScale = 0x2AD7B1, + kMkvDuration = 0x4489, + kMkvDateUTC = 0x4461, + kMkvTitle = 0x7BA9, + kMkvMuxingApp = 0x4D80, + kMkvWritingApp = 0x5741, + // Cluster + kMkvCluster = 0x1F43B675, + kMkvTimecode = 0xE7, + kMkvPrevSize = 0xAB, + kMkvBlockGroup = 0xA0, + kMkvBlock = 0xA1, + kMkvBlockDuration = 0x9B, + kMkvReferenceBlock = 0xFB, + kMkvLaceNumber = 0xCC, + kMkvSimpleBlock = 0xA3, + kMkvBlockAdditions = 0x75A1, + kMkvBlockMore = 0xA6, + kMkvBlockAddID = 0xEE, + kMkvBlockAdditional = 0xA5, + kMkvDiscardPadding = 0x75A2, + // Track + kMkvTracks = 0x1654AE6B, + kMkvTrackEntry = 0xAE, + kMkvTrackNumber = 0xD7, + kMkvTrackUID = 0x73C5, + kMkvTrackType = 0x83, + kMkvFlagEnabled = 0xB9, + kMkvFlagDefault = 0x88, + kMkvFlagForced = 0x55AA, + kMkvFlagLacing = 0x9C, + kMkvDefaultDuration = 0x23E383, + kMkvMaxBlockAdditionID = 0x55EE, + kMkvName = 0x536E, + kMkvLanguage = 0x22B59C, + kMkvCodecID = 0x86, + kMkvCodecPrivate = 0x63A2, + kMkvCodecName = 0x258688, + kMkvCodecDelay = 0x56AA, + kMkvSeekPreRoll = 0x56BB, + // video + kMkvVideo = 0xE0, + kMkvFlagInterlaced = 0x9A, + kMkvStereoMode = 0x53B8, + kMkvAlphaMode = 0x53C0, + kMkvPixelWidth = 0xB0, + kMkvPixelHeight = 0xBA, + kMkvPixelCropBottom = 0x54AA, + kMkvPixelCropTop = 0x54BB, + kMkvPixelCropLeft = 0x54CC, + kMkvPixelCropRight = 0x54DD, + kMkvDisplayWidth = 0x54B0, + kMkvDisplayHeight = 0x54BA, + kMkvDisplayUnit = 0x54B2, + kMkvAspectRatioType = 0x54B3, + kMkvFrameRate = 0x2383E3, + // end video + // colour + kMkvColour = 0x55B0, + kMkvMatrixCoefficients = 0x55B1, + kMkvBitsPerChannel = 0x55B2, + kMkvChromaSubsamplingHorz = 0x55B3, + kMkvChromaSubsamplingVert = 0x55B4, + kMkvCbSubsamplingHorz = 0x55B5, + kMkvCbSubsamplingVert = 0x55B6, + kMkvChromaSitingHorz = 0x55B7, + kMkvChromaSitingVert = 0x55B8, + kMkvRange = 0x55B9, + kMkvTransferCharacteristics = 0x55BA, + kMkvPrimaries = 0x55BB, + kMkvMaxCLL = 0x55BC, + kMkvMaxFALL = 0x55BD, + // mastering metadata + kMkvMasteringMetadata = 0x55D0, + kMkvPrimaryRChromaticityX = 0x55D1, + kMkvPrimaryRChromaticityY = 0x55D2, + 
kMkvPrimaryGChromaticityX = 0x55D3, + kMkvPrimaryGChromaticityY = 0x55D4, + kMkvPrimaryBChromaticityX = 0x55D5, + kMkvPrimaryBChromaticityY = 0x55D6, + kMkvWhitePointChromaticityX = 0x55D7, + kMkvWhitePointChromaticityY = 0x55D8, + kMkvLuminanceMax = 0x55D9, + kMkvLuminanceMin = 0x55DA, + // end mastering metadata + // end colour + // projection + kMkvProjection = 0x7670, + kMkvProjectionType = 0x7671, + kMkvProjectionPrivate = 0x7672, + kMkvProjectionPoseYaw = 0x7673, + kMkvProjectionPosePitch = 0x7674, + kMkvProjectionPoseRoll = 0x7675, + // end projection + // audio + kMkvAudio = 0xE1, + kMkvSamplingFrequency = 0xB5, + kMkvOutputSamplingFrequency = 0x78B5, + kMkvChannels = 0x9F, + kMkvBitDepth = 0x6264, + // end audio + // ContentEncodings + kMkvContentEncodings = 0x6D80, + kMkvContentEncoding = 0x6240, + kMkvContentEncodingOrder = 0x5031, + kMkvContentEncodingScope = 0x5032, + kMkvContentEncodingType = 0x5033, + kMkvContentCompression = 0x5034, + kMkvContentCompAlgo = 0x4254, + kMkvContentCompSettings = 0x4255, + kMkvContentEncryption = 0x5035, + kMkvContentEncAlgo = 0x47E1, + kMkvContentEncKeyID = 0x47E2, + kMkvContentSignature = 0x47E3, + kMkvContentSigKeyID = 0x47E4, + kMkvContentSigAlgo = 0x47E5, + kMkvContentSigHashAlgo = 0x47E6, + kMkvContentEncAESSettings = 0x47E7, + kMkvAESSettingsCipherMode = 0x47E8, + kMkvAESSettingsCipherInitData = 0x47E9, + // end ContentEncodings + // Cueing Data + kMkvCues = 0x1C53BB6B, + kMkvCuePoint = 0xBB, + kMkvCueTime = 0xB3, + kMkvCueTrackPositions = 0xB7, + kMkvCueTrack = 0xF7, + kMkvCueClusterPosition = 0xF1, + kMkvCueBlockNumber = 0x5378, + // Chapters + kMkvChapters = 0x1043A770, + kMkvEditionEntry = 0x45B9, + kMkvChapterAtom = 0xB6, + kMkvChapterUID = 0x73C4, + kMkvChapterStringUID = 0x5654, + kMkvChapterTimeStart = 0x91, + kMkvChapterTimeEnd = 0x92, + kMkvChapterDisplay = 0x80, + kMkvChapString = 0x85, + kMkvChapLanguage = 0x437C, + kMkvChapCountry = 0x437E, + // Tags + kMkvTags = 0x1254C367, + kMkvTag = 0x7373, + kMkvSimpleTag = 0x67C8, + kMkvTagName = 0x45A3, + kMkvTagString = 0x4487 +}; + +} // namespace libwebm + +#endif // COMMON_WEBMIDS_H_ diff --git a/thirdparty/libsimplewebm/libwebm/mkvmuxer/mkvmuxertypes.h b/thirdparty/libsimplewebm/libwebm/mkvmuxer/mkvmuxertypes.h new file mode 100644 index 000000000000..e5db121605f6 --- /dev/null +++ b/thirdparty/libsimplewebm/libwebm/mkvmuxer/mkvmuxertypes.h @@ -0,0 +1,28 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
+
+#ifndef MKVMUXER_MKVMUXERTYPES_H_
+#define MKVMUXER_MKVMUXERTYPES_H_
+
+namespace mkvmuxer {
+typedef unsigned char uint8;
+typedef short int16;
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+}  // namespace mkvmuxer
+
+// Copied from Chromium basictypes.h
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&); \
+  void operator=(const TypeName&)
+
+#endif  // MKVMUXER_MKVMUXERTYPES_HPP_
diff --git a/thirdparty/libsimplewebm/libwebm/mkvparser/mkvparser.cc b/thirdparty/libsimplewebm/libwebm/mkvparser/mkvparser.cc
new file mode 100644
index 000000000000..e7b76f7da110
--- /dev/null
+++ b/thirdparty/libsimplewebm/libwebm/mkvparser/mkvparser.cc
@@ -0,0 +1,8049 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "mkvparser/mkvparser.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#include <float.h>  // _isnan() / _finite()
+#define MSC_COMPAT
+#endif
+
+#include <cassert>
+#include <cfloat>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <new>
+
+#include "common/webmids.h"
+
+namespace mkvparser {
+const long long kStringElementSizeLimit = 20 * 1000 * 1000;
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const long long Colour::kValueNotPresent = LLONG_MAX;
+const float Projection::kValueNotPresent = FLT_MAX;
+
+#ifdef MSC_COMPAT
+inline bool isnan(double val) { return !!_isnan(val); }
+inline bool isinf(double val) { return !_finite(val); }
+#else
+inline bool isnan(double val) { return std::isnan(val); }
+inline bool isinf(double val) { return std::isinf(val); }
+#endif  // MSC_COMPAT
+
+IMkvReader::~IMkvReader() {}
+
+template <typename Type>
+Type* SafeArrayAlloc(unsigned long long num_elements,
+                     unsigned long long element_size) {
+  if (num_elements == 0 || element_size == 0)
+    return NULL;
+
+  const size_t kMaxAllocSize = 0x80000000;  // 2GiB
+  const unsigned long long num_bytes = num_elements * element_size;
+  if (element_size > (kMaxAllocSize / num_elements))
+    return NULL;
+  if (num_bytes != static_cast<size_t>(num_bytes))
+    return NULL;
+
+  return new (std::nothrow) Type[static_cast<size_t>(num_bytes)];
+}
+
+void GetVersion(int& major, int& minor, int& build, int& revision) {
+  major = 1;
+  minor = 0;
+  build = 0;
+  revision = 30;
+}
+
+long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  len = 1;
+  unsigned char b;
+  int status = pReader->Read(pos, 1, &b);
+
+  if (status < 0)  // error or underflow
+    return status;
+
+  if (status > 0)  // interpreted as "underflow"
+    return E_BUFFER_NOT_FULL;
+
+  if (b == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char m = 0x80;
+
+  while (!(b & m)) {
+    m >>= 1;
+    ++len;
+  }
+
+  long long result = b & (~m);
+  ++pos;
+
+  for (int i = 1; i < len; ++i) {
+    status = pReader->Read(pos, 1, &b);
+
+    if (status < 0) {
+      len = 1;
+      return status;
+    }
+
+    if (status > 0) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  return result;
+}
+
+// Reads an
EBML ID and returns it. +// An ID must at least 1 byte long, cannot exceed 4, and its value must be +// greater than 0. +// See known EBML values and EBMLMaxIDLength: +// http://www.matroska.org/technical/specs/index.html +// Returns the ID, or a value less than 0 to report an error while reading the +// ID. +long long ReadID(IMkvReader* pReader, long long pos, long& len) { + if (pReader == NULL || pos < 0) + return E_FILE_FORMAT_INVALID; + + // Read the first byte. The length in bytes of the ID is determined by + // finding the first set bit in the first byte of the ID. + unsigned char temp_byte = 0; + int read_status = pReader->Read(pos, 1, &temp_byte); + + if (read_status < 0) + return E_FILE_FORMAT_INVALID; + else if (read_status > 0) // No data to read. + return E_BUFFER_NOT_FULL; + + if (temp_byte == 0) // ID length > 8 bytes; invalid file. + return E_FILE_FORMAT_INVALID; + + int bit_pos = 0; + const int kMaxIdLengthInBytes = 4; + const int kCheckByte = 0x80; + + // Find the first bit that's set. + bool found_bit = false; + for (; bit_pos < kMaxIdLengthInBytes; ++bit_pos) { + if ((kCheckByte >> bit_pos) & temp_byte) { + found_bit = true; + break; + } + } + + if (!found_bit) { + // The value is too large to be a valid ID. + return E_FILE_FORMAT_INVALID; + } + + // Read the remaining bytes of the ID (if any). + const int id_length = bit_pos + 1; + long long ebml_id = temp_byte; + for (int i = 1; i < id_length; ++i) { + ebml_id <<= 8; + read_status = pReader->Read(pos + i, 1, &temp_byte); + + if (read_status < 0) + return E_FILE_FORMAT_INVALID; + else if (read_status > 0) + return E_BUFFER_NOT_FULL; + + ebml_id |= temp_byte; + } + + len = id_length; + return ebml_id; +} + +long long GetUIntLength(IMkvReader* pReader, long long pos, long& len) { + if (!pReader || pos < 0) + return E_FILE_FORMAT_INVALID; + + long long total, available; + + int status = pReader->Length(&total, &available); + if (status < 0 || (total >= 0 && available > total)) + return E_FILE_FORMAT_INVALID; + + len = 1; + + if (pos >= available) + return pos; // too few bytes available + + unsigned char b; + + status = pReader->Read(pos, 1, &b); + + if (status != 0) + return status; + + if (b == 0) // we can't handle u-int values larger than 8 bytes + return E_FILE_FORMAT_INVALID; + + unsigned char m = 0x80; + + while (!(b & m)) { + m >>= 1; + ++len; + } + + return 0; // success +} + +// TODO(vigneshv): This function assumes that unsigned values never have their +// high bit set. 
+long long UnserializeUInt(IMkvReader* pReader, long long pos, long long size) {
+  if (!pReader || pos < 0 || (size <= 0) || (size > 8))
+    return E_FILE_FORMAT_INVALID;
+
+  long long result = 0;
+
+  for (long long i = 0; i < size; ++i) {
+    unsigned char b;
+
+    const long status = pReader->Read(pos, 1, &b);
+
+    if (status < 0)
+      return status;
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  return result;
+}
+
+long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
+                      double& result) {
+  if (!pReader || pos < 0 || ((size_ != 4) && (size_ != 8)))
+    return E_FILE_FORMAT_INVALID;
+
+  const long size = static_cast<long>(size_);
+
+  unsigned char buf[8];
+
+  const int status = pReader->Read(pos, size, buf);
+
+  if (status < 0)  // error
+    return status;
+
+  if (size == 4) {
+    union {
+      float f;
+      unsigned long ff;
+    };
+
+    ff = 0;
+
+    for (int i = 0;;) {
+      ff |= buf[i];
+
+      if (++i >= 4)
+        break;
+
+      ff <<= 8;
+    }
+
+    result = f;
+  } else {
+    union {
+      double d;
+      unsigned long long dd;
+    };
+
+    dd = 0;
+
+    for (int i = 0;;) {
+      dd |= buf[i];
+
+      if (++i >= 8)
+        break;
+
+      dd <<= 8;
+    }
+
+    result = d;
+  }
+
+  if (mkvparser::isinf(result) || mkvparser::isnan(result))
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
+                    long long& result_ref) {
+  if (!pReader || pos < 0 || size < 1 || size > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  signed char first_byte = 0;
+  const long status = pReader->Read(pos, 1, (unsigned char*)&first_byte);
+
+  if (status < 0)
+    return status;
+
+  unsigned long long result = first_byte;
+  ++pos;
+
+  for (long i = 1; i < size; ++i) {
+    unsigned char b;
+
+    const long status = pReader->Read(pos, 1, &b);
+
+    if (status < 0)
+      return status;
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  result_ref = static_cast<long long>(result);
+  return 0;
+}
+
+long UnserializeString(IMkvReader* pReader, long long pos, long long size,
+                       char*& str) {
+  delete[] str;
+  str = NULL;
+
+  if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit)
+    return E_FILE_FORMAT_INVALID;
+
+  // +1 for '\0' terminator
+  const long required_size = static_cast<long>(size) + 1;
+
+  str = SafeArrayAlloc<char>(1, required_size);
+  if (str == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
+
+  const long status = pReader->Read(pos, static_cast<long>(size), buf);
+
+  if (status) {
+    delete[] str;
+    str = NULL;
+
+    return status;
+  }
+
+  str[required_size - 1] = '\0';
+  return 0;
+}
+
+long ParseElementHeader(IMkvReader* pReader, long long& pos, long long stop,
+                        long long& id, long long& size) {
+  if (stop >= 0 && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  long len;
+
+  id = ReadID(pReader, pos, len);
+
+  if (id < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume id
+
+  if (stop >= 0 && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  size = ReadUInt(pReader, pos, len);
+
+  if (size < 0 || len < 1 || len > 8) {
+    // Invalid: Negative payload size, negative or 0 length integer, or integer
+    // larger than 64 bits (libwebm cannot handle them).
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Avoid rolling over pos when very close to LLONG_MAX.
+  const unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume length of size
+
+  // pos now designates payload
+
+  if (stop >= 0 && pos > stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           long long& val) {
+  if (!pReader || pos < 0)
+    return false;
+
+  long long total = 0;
+  long long available = 0;
+
+  const long status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return false;
+
+  long len = 0;
+
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += len;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || size > 8 || len < 1 || len > 8 || (available - pos) > len)
+    return false;
+
+  pos += len;  // consume length of size of payload
+
+  val = UnserializeUInt(pReader, pos, size);
+  if (val < 0)
+    return false;
+
+  pos += size;  // consume size of payload
+
+  return true;
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           unsigned char*& buf, size_t& buflen) {
+  if (!pReader || pos < 0)
+    return false;
+
+  long long total = 0;
+  long long available = 0;
+
+  long status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return false;
+
+  long len = 0;
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += len;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || len <= 0 || len > 8 || (available - pos) > len)
+    return false;
+
+  unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  pos += len;  // consume length of size of payload
+
+  rollover_check = static_cast<unsigned long long>(pos) + size;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  if ((pos + size) > available)
+    return false;
+
+  if (size >= LONG_MAX)
+    return false;
+
+  const long buflen_ = static_cast<long>(size);
+
+  buf = SafeArrayAlloc<unsigned char>(1, buflen_);
+  if (!buf)
+    return false;
+
+  status = pReader->Read(pos, buflen_, buf);
+  if (status != 0)
+    return false;
+
+  buflen = buflen_;
+
+  pos += size;  // consume size of payload
+  return true;
+}
+
+EBMLHeader::EBMLHeader() : m_docType(NULL) { Init(); }
+
+EBMLHeader::~EBMLHeader() { delete[] m_docType; }
+
+void EBMLHeader::Init() {
+  m_version = 1;
+  m_readVersion = 1;
+  m_maxIdLength = 4;
+  m_maxSizeLength = 8;
+
+  if (m_docType) {
+    delete[] m_docType;
+    m_docType = NULL;
+  }
+
+  m_docTypeVersion = 1;
+  m_docTypeReadVersion = 1;
+}
+
+long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
+  if (!pReader)
+    return E_FILE_FORMAT_INVALID;
+
+  long long total, available;
+
+  long status = pReader->Length(&total, &available);
+
+  if (status < 0)  // error
+    return status;
+
+  pos = 0;
+
+  // Scan until we find what looks like the first byte of the EBML header.
+  const long long kMaxScanBytes = (available >= 1024) ?
1024 : available; + const unsigned char kEbmlByte0 = 0x1A; + unsigned char scan_byte = 0; + + while (pos < kMaxScanBytes) { + status = pReader->Read(pos, 1, &scan_byte); + + if (status < 0) // error + return status; + else if (status > 0) + return E_BUFFER_NOT_FULL; + + if (scan_byte == kEbmlByte0) + break; + + ++pos; + } + + long len = 0; + const long long ebml_id = ReadID(pReader, pos, len); + + if (ebml_id == E_BUFFER_NOT_FULL) + return E_BUFFER_NOT_FULL; + + if (len != 4 || ebml_id != libwebm::kMkvEBML) + return E_FILE_FORMAT_INVALID; + + // Move read pos forward to the EBML header size field. + pos += 4; + + // Read length of size field. + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return E_FILE_FORMAT_INVALID; + else if (result > 0) // need more data + return E_BUFFER_NOT_FULL; + + if (len < 1 || len > 8) + return E_FILE_FORMAT_INVALID; + + if ((total >= 0) && ((total - pos) < len)) + return E_FILE_FORMAT_INVALID; + + if ((available - pos) < len) + return pos + len; // try again later + + // Read the EBML header size. + result = ReadUInt(pReader, pos, len); + + if (result < 0) // error + return result; + + pos += len; // consume size field + + // pos now designates start of payload + + if ((total >= 0) && ((total - pos) < result)) + return E_FILE_FORMAT_INVALID; + + if ((available - pos) < result) + return pos + result; + + const long long end = pos + result; + + Init(); + + while (pos < end) { + long long id, size; + + status = ParseElementHeader(pReader, pos, end, id, size); + + if (status < 0) // error + return status; + + if (size == 0) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvEBMLVersion) { + m_version = UnserializeUInt(pReader, pos, size); + + if (m_version <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvEBMLReadVersion) { + m_readVersion = UnserializeUInt(pReader, pos, size); + + if (m_readVersion <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvEBMLMaxIDLength) { + m_maxIdLength = UnserializeUInt(pReader, pos, size); + + if (m_maxIdLength <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvEBMLMaxSizeLength) { + m_maxSizeLength = UnserializeUInt(pReader, pos, size); + + if (m_maxSizeLength <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvDocType) { + if (m_docType) + return E_FILE_FORMAT_INVALID; + + status = UnserializeString(pReader, pos, size, m_docType); + + if (status) // error + return status; + } else if (id == libwebm::kMkvDocTypeVersion) { + m_docTypeVersion = UnserializeUInt(pReader, pos, size); + + if (m_docTypeVersion <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvDocTypeReadVersion) { + m_docTypeReadVersion = UnserializeUInt(pReader, pos, size); + + if (m_docTypeReadVersion <= 0) + return E_FILE_FORMAT_INVALID; + } + + pos += size; + } + + if (pos != end) + return E_FILE_FORMAT_INVALID; + + // Make sure DocType, DocTypeReadVersion, and DocTypeVersion are valid. + if (m_docType == NULL || m_docTypeReadVersion <= 0 || m_docTypeVersion <= 0) + return E_FILE_FORMAT_INVALID; + + // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid. 
+ if (m_maxIdLength <= 0 || m_maxIdLength > 4 || m_maxSizeLength <= 0 || + m_maxSizeLength > 8) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +Segment::Segment(IMkvReader* pReader, long long elem_start, + // long long elem_size, + long long start, long long size) + : m_pReader(pReader), + m_element_start(elem_start), + // m_element_size(elem_size), + m_start(start), + m_size(size), + m_pos(start), + m_pUnknownSize(0), + m_pSeekHead(NULL), + m_pInfo(NULL), + m_pTracks(NULL), + m_pCues(NULL), + m_pChapters(NULL), + m_pTags(NULL), + m_clusters(NULL), + m_clusterCount(0), + m_clusterPreloadCount(0), + m_clusterSize(0) {} + +Segment::~Segment() { + const long count = m_clusterCount + m_clusterPreloadCount; + + Cluster** i = m_clusters; + Cluster** j = m_clusters + count; + + while (i != j) { + Cluster* const p = *i++; + delete p; + } + + delete[] m_clusters; + + delete m_pTracks; + delete m_pInfo; + delete m_pCues; + delete m_pChapters; + delete m_pTags; + delete m_pSeekHead; +} + +long long Segment::CreateInstance(IMkvReader* pReader, long long pos, + Segment*& pSegment) { + if (pReader == NULL || pos < 0) + return E_PARSE_FAILED; + + pSegment = NULL; + + long long total, available; + + const long status = pReader->Length(&total, &available); + + if (status < 0) // error + return status; + + if (available < 0) + return -1; + + if ((total >= 0) && (available > total)) + return -1; + + // I would assume that in practice this loop would execute + // exactly once, but we allow for other elements (e.g. Void) + // to immediately follow the EBML header. This is fine for + // the source filter case (since the entire file is available), + // but in the splitter case over a network we should probably + // just give up early. We could for example decide only to + // execute this loop a maximum of, say, 10 times. + // TODO: + // There is an implied "give up early" by only parsing up + // to the available limit. We do do that, but only if the + // total file size is unknown. We could decide to always + // use what's available as our limit (irrespective of whether + // we happen to know the total file length). This would have + // as its sense "parse this much of the file before giving up", + // which a slightly different sense from "try to parse up to + // 10 EMBL elements before giving up". + + for (;;) { + if ((total >= 0) && (pos >= total)) + return E_FILE_FORMAT_INVALID; + + // Read ID + long len; + long long result = GetUIntLength(pReader, pos, len); + + if (result) // error, or too few available bytes + return result; + + if ((total >= 0) && ((pos + len) > total)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > available) + return pos + len; + + const long long idpos = pos; + const long long id = ReadID(pReader, pos, len); + + if (id < 0) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume ID + + // Read Size + + result = GetUIntLength(pReader, pos, len); + + if (result) // error, or too few available bytes + return result; + + if ((total >= 0) && ((pos + len) > total)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > available) + return pos + len; + + long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return size; + + pos += len; // consume length of size of element + + // Pos now points to start of payload + + // Handle "unknown size" for live streaming of webm files. 
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (id == libwebm::kMkvSegment) {
+      if (size == unknown_size)
+        size = -1;
+
+      else if (total < 0)
+        size = -1;
+
+      else if ((pos + size) > total)
+        size = -1;
+
+      pSegment = new (std::nothrow) Segment(pReader, idpos, pos, size);
+      if (pSegment == NULL)
+        return E_PARSE_FAILED;
+
+      return 0;  // success
+    }
+
+    if (size == unknown_size)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + size) > total))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + size) > available)
+      return pos + size;
+
+    pos += size;  // consume payload
+  }
+}
+
+long long Segment::ParseHeaders() {
+  // Outermost (level 0) segment object has been constructed,
+  // and pos designates start of payload. We need to find the
+  // inner (level 1) elements.
+  long long total, available;
+
+  const int status = m_pReader->Length(&total, &available);
+
+  if (status < 0)  // error
+    return status;
+
+  if (total > 0 && available > total)
+    return E_FILE_FORMAT_INVALID;
+
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  if ((segment_stop >= 0 && total >= 0 && segment_stop > total) ||
+      (segment_stop >= 0 && m_pos > segment_stop)) {
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  for (;;) {
+    if ((total >= 0) && (m_pos >= total))
+      break;
+
+    if ((segment_stop >= 0) && (m_pos >= segment_stop))
+      break;
+
+    long long pos = m_pos;
+    const long long element_start = pos;
+
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    unsigned long long rollover_check = pos + 1ULL;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + 1) > available)
+      return (pos + 1);
+
+    long len;
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return result;
+
+    if (result > 0) {
+      // MkvReader doesn't have enough data to satisfy this read attempt.
+      return (pos + 1);
+    }
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long idpos = pos;
+    const long long id = ReadID(m_pReader, idpos, len);
+
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (id == libwebm::kMkvCluster)
+      break;
+
+    pos += len;  // consume ID
+
+    if ((pos + 1) > available)
+      return (pos + 1);
+
+    // Read Size
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return result;
+
+    if (result > 0) {
+      // MkvReader doesn't have enough data to satisfy this read attempt.
+      return (pos + 1);
+    }
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0 || len < 1 || len > 8) {
+      // TODO(tomfinegan): ReadUInt should return an error when len is < 1 or
+      // len > 8 is true instead of checking this _everywhere_.
+      return size;
+    }
+
+    pos += len;  // consume length of size of element
+
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    rollover_check = static_cast<unsigned long long>(pos) + size;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long element_size = size + pos - element_start;
+
+    // Pos now points to start of payload
+
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // We read EBML elements either in total or nothing at all.
+ + if ((pos + size) > available) + return pos + size; + + if (id == libwebm::kMkvInfo) { + if (m_pInfo) + return E_FILE_FORMAT_INVALID; + + m_pInfo = new (std::nothrow) + SegmentInfo(this, pos, size, element_start, element_size); + + if (m_pInfo == NULL) + return -1; + + const long status = m_pInfo->Parse(); + + if (status) + return status; + } else if (id == libwebm::kMkvTracks) { + if (m_pTracks) + return E_FILE_FORMAT_INVALID; + + m_pTracks = new (std::nothrow) + Tracks(this, pos, size, element_start, element_size); + + if (m_pTracks == NULL) + return -1; + + const long status = m_pTracks->Parse(); + + if (status) + return status; + } else if (id == libwebm::kMkvCues) { + if (m_pCues == NULL) { + m_pCues = new (std::nothrow) + Cues(this, pos, size, element_start, element_size); + + if (m_pCues == NULL) + return -1; + } + } else if (id == libwebm::kMkvSeekHead) { + if (m_pSeekHead == NULL) { + m_pSeekHead = new (std::nothrow) + SeekHead(this, pos, size, element_start, element_size); + + if (m_pSeekHead == NULL) + return -1; + + const long status = m_pSeekHead->Parse(); + + if (status) + return status; + } + } else if (id == libwebm::kMkvChapters) { + if (m_pChapters == NULL) { + m_pChapters = new (std::nothrow) + Chapters(this, pos, size, element_start, element_size); + + if (m_pChapters == NULL) + return -1; + + const long status = m_pChapters->Parse(); + + if (status) + return status; + } + } else if (id == libwebm::kMkvTags) { + if (m_pTags == NULL) { + m_pTags = new (std::nothrow) + Tags(this, pos, size, element_start, element_size); + + if (m_pTags == NULL) + return -1; + + const long status = m_pTags->Parse(); + + if (status) + return status; + } + } + + m_pos = pos + size; // consume payload + } + + if (segment_stop >= 0 && m_pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + if (m_pInfo == NULL) // TODO: liberalize this behavior + return E_FILE_FORMAT_INVALID; + + if (m_pTracks == NULL) + return E_FILE_FORMAT_INVALID; + + return 0; // success +} + +long Segment::LoadCluster(long long& pos, long& len) { + for (;;) { + const long result = DoLoadCluster(pos, len); + + if (result <= 1) + return result; + } +} + +long Segment::DoLoadCluster(long long& pos, long& len) { + if (m_pos < 0) + return DoLoadClusterUnknownSize(pos, len); + + long long total, avail; + + long status = m_pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + if (total >= 0 && avail > total) + return E_FILE_FORMAT_INVALID; + + const long long segment_stop = (m_size < 0) ? 
-1 : m_start + m_size;
+
+  long long cluster_off = -1;  // offset relative to start of segment
+  long long cluster_size = -1;  // size of cluster payload
+
+  for (;;) {
+    if ((total >= 0) && (m_pos >= total))
+      return 1;  // no more clusters
+
+    if ((segment_stop >= 0) && (m_pos >= segment_stop))
+      return 1;  // no more clusters
+
+    pos = m_pos;
+
+    // Read ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long idpos = pos;
+    const long long id = ReadID(m_pReader, idpos, len);
+
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume length of size of element
+
+    // pos now points to start of payload
+
+    if (size == 0) {
+      // Missing element payload: move on.
+      m_pos = pos;
+      continue;
+    }
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if ((segment_stop >= 0) && (size != unknown_size) &&
+        ((pos + size) > segment_stop)) {
+      return E_FILE_FORMAT_INVALID;
+    }
+
+    if (id == libwebm::kMkvCues) {
+      if (size == unknown_size) {
+        // Cues element of unknown size: Not supported.
+        return E_FILE_FORMAT_INVALID;
+      }
+
+      if (m_pCues == NULL) {
+        const long long element_size = (pos - idpos) + size;
+
+        m_pCues = new (std::nothrow) Cues(this, pos, size, idpos, element_size);
+        if (m_pCues == NULL)
+          return -1;
+      }
+
+      m_pos = pos + size;  // consume payload
+      continue;
+    }
+
+    if (id != libwebm::kMkvCluster) {
+      // Besides the Segment, Libwebm allows only cluster elements of unknown
+      // size. Fail the parse upon encountering a non-cluster element reporting
+      // unknown size.
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pos = pos + size;  // consume payload
+      continue;
+    }
+
+    // We have a cluster.
+
+    cluster_off = idpos - m_start;  // relative pos
+
+    if (size != unknown_size)
+      cluster_size = size;
+
+    break;
+  }
+
+  if (cluster_off < 0) {
+    // No cluster, die.
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  long long pos_;
+  long len_;
+
+  status = Cluster::HasBlockEntries(this, cluster_off, pos_, len_);
+
+  if (status < 0) {  // error, or underflow
+    pos = pos_;
+    len = len_;
+
+    return status;
+  }
+
+  // status == 0 means "no block entries found"
+  // status > 0 means "found at least one block entry"
+
+  // TODO:
+  // The issue here is that the segment increments its own
+  // pos ptr past the most recent cluster parsed, and then
+  // starts from there to parse the next cluster. If we
+  // don't know the size of the current cluster, then we
+  // must either parse its payload (as we do below), looking
+  // for the cluster (or cues) ID to terminate the parse.
+  // This isn't really what we want: rather, we really need
+  // a way to create the curr cluster object immediately.
+ // The pity is that cluster::parse can determine its own + // boundary, and we largely duplicate that same logic here. + // + // Maybe we need to get rid of our look-ahead preloading + // in source::parse??? + // + // As we're parsing the blocks in the curr cluster + //(in cluster::parse), we should have some way to signal + // to the segment that we have determined the boundary, + // so it can adjust its own segment::m_pos member. + // + // The problem is that we're asserting in asyncreadinit, + // because we adjust the pos down to the curr seek pos, + // and the resulting adjusted len is > 2GB. I'm suspicious + // that this is even correct, but even if it is, we can't + // be loading that much data in the cache anyway. + + const long idx = m_clusterCount; + + if (m_clusterPreloadCount > 0) { + if (idx >= m_clusterSize) + return E_FILE_FORMAT_INVALID; + + Cluster* const pCluster = m_clusters[idx]; + if (pCluster == NULL || pCluster->m_index >= 0) + return E_FILE_FORMAT_INVALID; + + const long long off = pCluster->GetPosition(); + if (off < 0) + return E_FILE_FORMAT_INVALID; + + if (off == cluster_off) { // preloaded already + if (status == 0) // no entries found + return E_FILE_FORMAT_INVALID; + + if (cluster_size >= 0) + pos += cluster_size; + else { + const long long element_size = pCluster->GetElementSize(); + + if (element_size <= 0) + return E_FILE_FORMAT_INVALID; // TODO: handle this case + + pos = pCluster->m_element_start + element_size; + } + + pCluster->m_index = idx; // move from preloaded to loaded + ++m_clusterCount; + --m_clusterPreloadCount; + + m_pos = pos; // consume payload + if (segment_stop >= 0 && m_pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + return 0; // success + } + } + + if (status == 0) { // no entries found + if (cluster_size >= 0) + pos += cluster_size; + + if ((total >= 0) && (pos >= total)) { + m_pos = total; + return 1; // no more clusters + } + + if ((segment_stop >= 0) && (pos >= segment_stop)) { + m_pos = segment_stop; + return 1; // no more clusters + } + + m_pos = pos; + return 2; // try again + } + + // status > 0 means we have an entry + + Cluster* const pCluster = Cluster::Create(this, idx, cluster_off); + if (pCluster == NULL) + return -1; + + if (!AppendCluster(pCluster)) { + delete pCluster; + return -1; + } + + if (cluster_size >= 0) { + pos += cluster_size; + + m_pos = pos; + + if (segment_stop > 0 && m_pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + return 0; + } + + m_pUnknownSize = pCluster; + m_pos = -pos; + + return 0; // partial success, since we have a new cluster + + // status == 0 means "no block entries found" + // pos designates start of payload + // m_pos has NOT been adjusted yet (in case we need to come back here) +} + +long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) { + if (m_pos >= 0 || m_pUnknownSize == NULL) + return E_PARSE_FAILED; + + const long status = m_pUnknownSize->Parse(pos, len); + + if (status < 0) // error or underflow + return status; + + if (status == 0) // parsed a block + return 2; // continue parsing + + const long long start = m_pUnknownSize->m_element_start; + const long long size = m_pUnknownSize->GetElementSize(); + + if (size < 0) + return E_FILE_FORMAT_INVALID; + + pos = start + size; + m_pos = pos; + + m_pUnknownSize = 0; + + return 2; // continue parsing +} + +bool Segment::AppendCluster(Cluster* pCluster) { + if (pCluster == NULL || pCluster->m_index < 0) + return false; + + const long count = m_clusterCount + m_clusterPreloadCount; + + long& size = m_clusterSize; 
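+  // (Growth below is geometric: the first allocation reserves 2048 cluster
+  // pointers, and each later growth doubles the capacity, copying the
+  // loaded and preloaded entries across in order.)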
+ const long idx = pCluster->m_index; + + if (size < count || idx != m_clusterCount) + return false; + + if (count >= size) { + const long n = (size <= 0) ? 2048 : 2 * size; + + Cluster** const qq = new (std::nothrow) Cluster*[n]; + if (qq == NULL) + return false; + + Cluster** q = qq; + Cluster** p = m_clusters; + Cluster** const pp = p + count; + + while (p != pp) + *q++ = *p++; + + delete[] m_clusters; + + m_clusters = qq; + size = n; + } + + if (m_clusterPreloadCount > 0) { + Cluster** const p = m_clusters + m_clusterCount; + if (*p == NULL || (*p)->m_index >= 0) + return false; + + Cluster** q = p + m_clusterPreloadCount; + if (q >= (m_clusters + size)) + return false; + + for (;;) { + Cluster** const qq = q - 1; + if ((*qq)->m_index >= 0) + return false; + + *q = *qq; + q = qq; + + if (q == p) + break; + } + } + + m_clusters[idx] = pCluster; + ++m_clusterCount; + return true; +} + +bool Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) { + if (pCluster == NULL || pCluster->m_index >= 0 || idx < m_clusterCount) + return false; + + const long count = m_clusterCount + m_clusterPreloadCount; + + long& size = m_clusterSize; + if (size < count) + return false; + + if (count >= size) { + const long n = (size <= 0) ? 2048 : 2 * size; + + Cluster** const qq = new (std::nothrow) Cluster*[n]; + if (qq == NULL) + return false; + Cluster** q = qq; + + Cluster** p = m_clusters; + Cluster** const pp = p + count; + + while (p != pp) + *q++ = *p++; + + delete[] m_clusters; + + m_clusters = qq; + size = n; + } + + if (m_clusters == NULL) + return false; + + Cluster** const p = m_clusters + idx; + + Cluster** q = m_clusters + count; + if (q < p || q >= (m_clusters + size)) + return false; + + while (q > p) { + Cluster** const qq = q - 1; + + if ((*qq)->m_index >= 0) + return false; + + *q = *qq; + q = qq; + } + + m_clusters[idx] = pCluster; + ++m_clusterPreloadCount; + return true; +} + +long Segment::Load() { + if (m_clusters != NULL || m_clusterSize != 0 || m_clusterCount != 0) + return E_PARSE_FAILED; + + // Outermost (level 0) segment object has been constructed, + // and pos designates start of payload. We need to find the + // inner (level 1) elements. 
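+  //
+  // A caller that already has the whole file readable might drive the
+  // parser eagerly, for example (given any IMkvReader implementation
+  // `reader`, and ignoring error checks for brevity):
+  //
+  //   long long pos = 0;
+  //   mkvparser::EBMLHeader ebml_header;
+  //   ebml_header.Parse(&reader, pos);
+  //   mkvparser::Segment* pSegment;
+  //   mkvparser::Segment::CreateInstance(&reader, pos, pSegment);
+  //   pSegment->Load();
+  //
+  // Streaming callers instead drive ParseHeaders() and LoadCluster()
+  // themselves and retry whenever E_BUFFER_NOT_FULL is reported.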
+
+  const long long header_status = ParseHeaders();
+
+  if (header_status < 0)  // error
+    return static_cast<long>(header_status);
+
+  if (header_status > 0)  // underflow
+    return E_BUFFER_NOT_FULL;
+
+  if (m_pInfo == NULL || m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  for (;;) {
+    const long status = LoadCluster();
+
+    if (status < 0)  // error
+      return status;
+
+    if (status >= 1)  // no more clusters
+      return 0;
+  }
+}
+
+SeekHead::Entry::Entry() : id(0), pos(0), element_start(0), element_size(0) {}
+
+SeekHead::SeekHead(Segment* pSegment, long long start, long long size_,
+                   long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_entries(0),
+      m_entry_count(0),
+      m_void_elements(0),
+      m_void_element_count(0) {}
+
+SeekHead::~SeekHead() {
+  delete[] m_entries;
+  delete[] m_void_elements;
+}
+
+long SeekHead::Parse() {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = m_start;
+  const long long stop = m_start + m_size;
+
+  // first count the seek head entries
+
+  int entry_count = 0;
+  int void_element_count = 0;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvSeek)
+      ++entry_count;
+    else if (id == libwebm::kMkvVoid)
+      ++void_element_count;
+
+    pos += size;  // consume payload
+
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (entry_count > 0) {
+    m_entries = new (std::nothrow) Entry[entry_count];
+
+    if (m_entries == NULL)
+      return -1;
+  }
+
+  if (void_element_count > 0) {
+    m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+
+    if (m_void_elements == NULL)
+      return -1;
+  }
+
+  // now parse the entries and void elements
+
+  Entry* pEntry = m_entries;
+  VoidElement* pVoidElement = m_void_elements;
+
+  pos = m_start;
+
+  while (pos < stop) {
+    const long long idpos = pos;
+
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvSeek && entry_count > 0) {
+      if (ParseEntry(pReader, pos, size, pEntry)) {
+        Entry& e = *pEntry++;
+
+        e.element_start = idpos;
+        e.element_size = (pos + size) - idpos;
+      }
+    } else if (id == libwebm::kMkvVoid && void_element_count > 0) {
+      VoidElement& e = *pVoidElement++;
+
+      e.element_start = idpos;
+      e.element_size = (pos + size) - idpos;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
+  assert(count_ >= 0);
+  assert(count_ <= entry_count);
+
+  m_entry_count = static_cast<int>(count_);
+
+  count_ = ptrdiff_t(pVoidElement - m_void_elements);
+  assert(count_ >= 0);
+  assert(count_ <= void_element_count);
+
+  m_void_element_count = static_cast<int>(count_);
+
+  return 0;
+}
+
+int SeekHead::GetCount() const { return m_entry_count; }
+
+const SeekHead::Entry* SeekHead::GetEntry(int idx) const {
+  if (idx < 0)
+    return 0;
+
+  if (idx >= m_entry_count)
+    return 0;
+
+  return m_entries + idx;
+}
+
+int SeekHead::GetVoidElementCount() const { return m_void_element_count; }
+
+const SeekHead::VoidElement* SeekHead::GetVoidElement(int idx) const {
+  if (idx < 0)
+    return 0;
+
+  if (idx >= m_void_element_count)
+    return 0;
+
+  return m_void_elements + idx;
+}
+
+long Segment::ParseCues(long long off, long long& pos, long& len) {
+  if (m_pCues)
+    return 0;  // success
+
+  if (off < 0)
+    return -1;
+
+  long long total, avail;
+
+  const int status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  pos = m_start + off;
+
+  if ((total < 0) || (pos >= total))
+    return 1;  // don't bother parsing cues
+
+  const long long element_start = pos;
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long idpos = pos;
+
+  const long long id = ReadID(m_pReader, idpos, len);
+
+  if (id != libwebm::kMkvCues)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume ID
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Read Size
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long size = ReadUInt(m_pReader, pos, len);
+
+  if (size < 0)  // error
+    return static_cast<long>(size);
+
+  if (size == 0)  // weird, although technically not illegal
+    return 1;  // done
+
+  pos += len;  // consume length of size of element
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Pos now points to start of payload
+
+  const long long element_stop = pos + size;
+
+  if ((segment_stop >= 0) && (element_stop > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((total >= 0) && (element_stop > total))
+    return 1;  // don't bother parsing anymore
+
+  len = static_cast<long>(size);
+
+  if (element_stop > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long element_size = element_stop - element_start;
+
+  m_pCues =
+      new (std::nothrow) Cues(this, pos, size, element_start, element_size);
+  if (m_pCues == NULL)
+    return -1;
+
+  return 0;  // success
+}
+
+bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
+                          Entry* pEntry) {
+  if (size_ <= 0)
+    return false;
+
+  long long pos = start;
+  const long long stop = start + size_;
+
+  long len;
+
+  // parse the container for the level-1 element ID
+
+  const long long seekIdId = ReadID(pReader, pos, len);
+  if (seekIdId < 0)
+    return false;
+
+  if (seekIdId != libwebm::kMkvSeekID)
+    return false;
+
+  if ((pos + len) > stop)
+    return false;
+
+  pos += len;  // consume SeekID id
+
+  const long long seekIdSize = ReadUInt(pReader, pos, len);
+
+  if (seekIdSize <= 0)
+    return false;
+
+  if ((pos + len) > stop)
+    return false;
+
+  pos += len;  // consume size of field
+
+  if ((pos + seekIdSize) > stop)
+    return false;
+
+  pEntry->id = ReadID(pReader, pos, len);  // payload
+
+  if (pEntry->id <= 0)
+    return false;
+
+  if (len != seekIdSize)
+    return false;
+
+  pos += seekIdSize;  // consume SeekID payload
+
+  const long long seekPosId = ReadID(pReader, pos, len);
+
+  if (seekPosId !=
libwebm::kMkvSeekPosition) + return false; + + if ((pos + len) > stop) + return false; + + pos += len; // consume id + + const long long seekPosSize = ReadUInt(pReader, pos, len); + + if (seekPosSize <= 0) + return false; + + if ((pos + len) > stop) + return false; + + pos += len; // consume size + + if ((pos + seekPosSize) > stop) + return false; + + pEntry->pos = UnserializeUInt(pReader, pos, seekPosSize); + + if (pEntry->pos < 0) + return false; + + pos += seekPosSize; // consume payload + + if (pos != stop) + return false; + + return true; +} + +Cues::Cues(Segment* pSegment, long long start_, long long size_, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(start_), + m_size(size_), + m_element_start(element_start), + m_element_size(element_size), + m_cue_points(NULL), + m_count(0), + m_preload_count(0), + m_pos(start_) {} + +Cues::~Cues() { + const long n = m_count + m_preload_count; + + CuePoint** p = m_cue_points; + CuePoint** const q = p + n; + + while (p != q) { + CuePoint* const pCP = *p++; + assert(pCP); + + delete pCP; + } + + delete[] m_cue_points; +} + +long Cues::GetCount() const { + if (m_cue_points == NULL) + return -1; + + return m_count; // TODO: really ignore preload count? +} + +bool Cues::DoneParsing() const { + const long long stop = m_start + m_size; + return (m_pos >= stop); +} + +bool Cues::Init() const { + if (m_cue_points) + return true; + + if (m_count != 0 || m_preload_count != 0) + return false; + + IMkvReader* const pReader = m_pSegment->m_pReader; + + const long long stop = m_start + m_size; + long long pos = m_start; + + long cue_points_size = 0; + + while (pos < stop) { + const long long idpos = pos; + + long len; + + const long long id = ReadID(pReader, pos, len); + if (id < 0 || (pos + len) > stop) { + return false; + } + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + if (size < 0 || (pos + len > stop)) { + return false; + } + + pos += len; // consume Size field + if (pos + size > stop) { + return false; + } + + if (id == libwebm::kMkvCuePoint) { + if (!PreloadCuePoint(cue_points_size, idpos)) + return false; + } + + pos += size; // skip payload + } + return true; +} + +bool Cues::PreloadCuePoint(long& cue_points_size, long long pos) const { + if (m_count != 0) + return false; + + if (m_preload_count >= cue_points_size) { + const long n = (cue_points_size <= 0) ? 
2048 : 2 * cue_points_size; + + CuePoint** const qq = new (std::nothrow) CuePoint*[n]; + if (qq == NULL) + return false; + + CuePoint** q = qq; // beginning of target + + CuePoint** p = m_cue_points; // beginning of source + CuePoint** const pp = p + m_preload_count; // end of source + + while (p != pp) + *q++ = *p++; + + delete[] m_cue_points; + + m_cue_points = qq; + cue_points_size = n; + } + + CuePoint* const pCP = new (std::nothrow) CuePoint(m_preload_count, pos); + if (pCP == NULL) + return false; + + m_cue_points[m_preload_count++] = pCP; + return true; +} + +bool Cues::LoadCuePoint() const { + const long long stop = m_start + m_size; + + if (m_pos >= stop) + return false; // nothing else to do + + if (!Init()) { + m_pos = stop; + return false; + } + + IMkvReader* const pReader = m_pSegment->m_pReader; + + while (m_pos < stop) { + const long long idpos = m_pos; + + long len; + + const long long id = ReadID(pReader, m_pos, len); + if (id < 0 || (m_pos + len) > stop) + return false; + + m_pos += len; // consume ID + + const long long size = ReadUInt(pReader, m_pos, len); + if (size < 0 || (m_pos + len) > stop) + return false; + + m_pos += len; // consume Size field + if ((m_pos + size) > stop) + return false; + + if (id != libwebm::kMkvCuePoint) { + m_pos += size; // consume payload + if (m_pos > stop) + return false; + + continue; + } + + if (m_preload_count < 1) + return false; + + CuePoint* const pCP = m_cue_points[m_count]; + if (!pCP || (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos))) + return false; + + if (!pCP->Load(pReader)) { + m_pos = stop; + return false; + } + ++m_count; + --m_preload_count; + + m_pos += size; // consume payload + if (m_pos > stop) + return false; + + return true; // yes, we loaded a cue point + } + + return false; // no, we did not load a cue point +} + +bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP, + const CuePoint::TrackPosition*& pTP) const { + if (time_ns < 0 || pTrack == NULL || m_cue_points == NULL || m_count == 0) + return false; + + CuePoint** const ii = m_cue_points; + CuePoint** i = ii; + + CuePoint** const jj = ii + m_count; + CuePoint** j = jj; + + pCP = *i; + if (pCP == NULL) + return false; + + if (time_ns <= pCP->GetTime(m_pSegment)) { + pTP = pCP->Find(pTrack); + return (pTP != NULL); + } + + while (i < j) { + // INVARIANT: + //[ii, i) <= time_ns + //[i, j) ? + //[j, jj) > time_ns + + CuePoint** const k = i + (j - i) / 2; + if (k >= jj) + return false; + + CuePoint* const pCP = *k; + if (pCP == NULL) + return false; + + const long long t = pCP->GetTime(m_pSegment); + + if (t <= time_ns) + i = k + 1; + else + j = k; + + if (i > j) + return false; + } + + if (i != j || i > jj || i <= ii) + return false; + + pCP = *--i; + + if (pCP == NULL || pCP->GetTime(m_pSegment) > time_ns) + return false; + + // TODO: here and elsewhere, it's probably not correct to search + // for the cue point with this time, and then search for a matching + // track. In principle, the matching track could be on some earlier + // cue point, and with our current algorithm, we'd miss it. To make + // this bullet-proof, we'd need to create a secondary structure, + // with a list of cue points that apply to a track, and then search + // that track-based structure for a matching cue point. 
+ + pTP = pCP->Find(pTrack); + return (pTP != NULL); +} + +const CuePoint* Cues::GetFirst() const { + if (m_cue_points == NULL || m_count == 0) + return NULL; + + CuePoint* const* const pp = m_cue_points; + if (pp == NULL) + return NULL; + + CuePoint* const pCP = pp[0]; + if (pCP == NULL || pCP->GetTimeCode() < 0) + return NULL; + + return pCP; +} + +const CuePoint* Cues::GetLast() const { + if (m_cue_points == NULL || m_count <= 0) + return NULL; + + const long index = m_count - 1; + + CuePoint* const* const pp = m_cue_points; + if (pp == NULL) + return NULL; + + CuePoint* const pCP = pp[index]; + if (pCP == NULL || pCP->GetTimeCode() < 0) + return NULL; + + return pCP; +} + +const CuePoint* Cues::GetNext(const CuePoint* pCurr) const { + if (pCurr == NULL || pCurr->GetTimeCode() < 0 || m_cue_points == NULL || + m_count < 1) { + return NULL; + } + + long index = pCurr->m_index; + if (index >= m_count) + return NULL; + + CuePoint* const* const pp = m_cue_points; + if (pp == NULL || pp[index] != pCurr) + return NULL; + + ++index; + + if (index >= m_count) + return NULL; + + CuePoint* const pNext = pp[index]; + + if (pNext == NULL || pNext->GetTimeCode() < 0) + return NULL; + + return pNext; +} + +const BlockEntry* Cues::GetBlock(const CuePoint* pCP, + const CuePoint::TrackPosition* pTP) const { + if (pCP == NULL || pTP == NULL) + return NULL; + + return m_pSegment->GetBlock(*pCP, *pTP); +} + +const BlockEntry* Segment::GetBlock(const CuePoint& cp, + const CuePoint::TrackPosition& tp) { + Cluster** const ii = m_clusters; + Cluster** i = ii; + + const long count = m_clusterCount + m_clusterPreloadCount; + + Cluster** const jj = ii + count; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[ii, i) < pTP->m_pos + //[i, j) ? + //[j, jj) > pTP->m_pos + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + Cluster* const pCluster = *k; + assert(pCluster); + + // const long long pos_ = pCluster->m_pos; + // assert(pos_); + // const long long pos = pos_ * ((pos_ < 0) ? -1 : 1); + + const long long pos = pCluster->GetPosition(); + assert(pos >= 0); + + if (pos < tp.m_pos) + i = k + 1; + else if (pos > tp.m_pos) + j = k; + else + return pCluster->GetEntry(cp, tp); + } + + assert(i == j); + // assert(Cluster::HasBlockEntries(this, tp.m_pos)); + + Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos); //, -1); + if (pCluster == NULL) + return NULL; + + const ptrdiff_t idx = i - m_clusters; + + if (!PreloadCluster(pCluster, idx)) { + delete pCluster; + return NULL; + } + assert(m_clusters); + assert(m_clusterPreloadCount > 0); + assert(m_clusters[idx] == pCluster); + + return pCluster->GetEntry(cp, tp); +} + +const Cluster* Segment::FindOrPreloadCluster(long long requested_pos) { + if (requested_pos < 0) + return 0; + + Cluster** const ii = m_clusters; + Cluster** i = ii; + + const long count = m_clusterCount + m_clusterPreloadCount; + + Cluster** const jj = ii + count; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[ii, i) < pTP->m_pos + //[i, j) ? + //[j, jj) > pTP->m_pos + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + Cluster* const pCluster = *k; + assert(pCluster); + + // const long long pos_ = pCluster->m_pos; + // assert(pos_); + // const long long pos = pos_ * ((pos_ < 0) ? 
-1 : 1); + + const long long pos = pCluster->GetPosition(); + assert(pos >= 0); + + if (pos < requested_pos) + i = k + 1; + else if (pos > requested_pos) + j = k; + else + return pCluster; + } + + assert(i == j); + // assert(Cluster::HasBlockEntries(this, tp.m_pos)); + + Cluster* const pCluster = Cluster::Create(this, -1, requested_pos); + if (pCluster == NULL) + return NULL; + + const ptrdiff_t idx = i - m_clusters; + + if (!PreloadCluster(pCluster, idx)) { + delete pCluster; + return NULL; + } + assert(m_clusters); + assert(m_clusterPreloadCount > 0); + assert(m_clusters[idx] == pCluster); + + return pCluster; +} + +CuePoint::CuePoint(long idx, long long pos) + : m_element_start(0), + m_element_size(0), + m_index(idx), + m_timecode(-1 * pos), + m_track_positions(NULL), + m_track_positions_count(0) { + assert(pos > 0); +} + +CuePoint::~CuePoint() { delete[] m_track_positions; } + +bool CuePoint::Load(IMkvReader* pReader) { + // odbgstream os; + // os << "CuePoint::Load(begin): timecode=" << m_timecode << endl; + + if (m_timecode >= 0) // already loaded + return true; + + assert(m_track_positions == NULL); + assert(m_track_positions_count == 0); + + long long pos_ = -m_timecode; + const long long element_start = pos_; + + long long stop; + + { + long len; + + const long long id = ReadID(pReader, pos_, len); + if (id != libwebm::kMkvCuePoint) + return false; + + pos_ += len; // consume ID + + const long long size = ReadUInt(pReader, pos_, len); + assert(size >= 0); + + pos_ += len; // consume Size field + // pos_ now points to start of payload + + stop = pos_ + size; + } + + const long long element_size = stop - element_start; + + long long pos = pos_; + + // First count number of track positions + + while (pos < stop) { + long len; + + const long long id = ReadID(pReader, pos, len); + if ((id < 0) || (pos + len > stop)) { + return false; + } + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + if ((size < 0) || (pos + len > stop)) { + return false; + } + + pos += len; // consume Size field + if ((pos + size) > stop) { + return false; + } + + if (id == libwebm::kMkvCueTime) + m_timecode = UnserializeUInt(pReader, pos, size); + + else if (id == libwebm::kMkvCueTrackPositions) + ++m_track_positions_count; + + pos += size; // consume payload + } + + if (m_timecode < 0 || m_track_positions_count <= 0) { + return false; + } + + // os << "CuePoint::Load(cont'd): idpos=" << idpos + // << " timecode=" << m_timecode + // << endl; + + m_track_positions = new (std::nothrow) TrackPosition[m_track_positions_count]; + if (m_track_positions == NULL) + return false; + + // Now parse track positions + + TrackPosition* p = m_track_positions; + pos = pos_; + + while (pos < stop) { + long len; + + const long long id = ReadID(pReader, pos, len); + if (id < 0 || (pos + len) > stop) + return false; + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + assert(size >= 0); + assert((pos + len) <= stop); + + pos += len; // consume Size field + assert((pos + size) <= stop); + + if (id == libwebm::kMkvCueTrackPositions) { + TrackPosition& tp = *p++; + if (!tp.Parse(pReader, pos, size)) { + return false; + } + } + + pos += size; // consume payload + if (pos > stop) + return false; + } + + assert(size_t(p - m_track_positions) == m_track_positions_count); + + m_element_start = element_start; + m_element_size = element_size; + + return true; +} + +bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, + long long size_) { + const long 
long stop = start_ + size_; + long long pos = start_; + + m_track = -1; + m_pos = -1; + m_block = 1; // default + + while (pos < stop) { + long len; + + const long long id = ReadID(pReader, pos, len); + if ((id < 0) || ((pos + len) > stop)) { + return false; + } + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + if ((size < 0) || ((pos + len) > stop)) { + return false; + } + + pos += len; // consume Size field + if ((pos + size) > stop) { + return false; + } + + if (id == libwebm::kMkvCueTrack) + m_track = UnserializeUInt(pReader, pos, size); + else if (id == libwebm::kMkvCueClusterPosition) + m_pos = UnserializeUInt(pReader, pos, size); + else if (id == libwebm::kMkvCueBlockNumber) + m_block = UnserializeUInt(pReader, pos, size); + + pos += size; // consume payload + } + + if ((m_pos < 0) || (m_track <= 0)) { + return false; + } + + return true; +} + +const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const { + if (pTrack == NULL) { + return NULL; + } + + const long long n = pTrack->GetNumber(); + + const TrackPosition* i = m_track_positions; + const TrackPosition* const j = i + m_track_positions_count; + + while (i != j) { + const TrackPosition& p = *i++; + + if (p.m_track == n) + return &p; + } + + return NULL; // no matching track number found +} + +long long CuePoint::GetTimeCode() const { return m_timecode; } + +long long CuePoint::GetTime(const Segment* pSegment) const { + assert(pSegment); + assert(m_timecode >= 0); + + const SegmentInfo* const pInfo = pSegment->GetInfo(); + assert(pInfo); + + const long long scale = pInfo->GetTimeCodeScale(); + assert(scale >= 1); + + const long long time = scale * m_timecode; + + return time; +} + +bool Segment::DoneParsing() const { + if (m_size < 0) { + long long total, avail; + + const int status = m_pReader->Length(&total, &avail); + + if (status < 0) // error + return true; // must assume done + + if (total < 0) + return false; // assume live stream + + return (m_pos >= total); + } + + const long long stop = m_start + m_size; + + return (m_pos >= stop); +} + +const Cluster* Segment::GetFirst() const { + if ((m_clusters == NULL) || (m_clusterCount <= 0)) + return &m_eos; + + Cluster* const pCluster = m_clusters[0]; + assert(pCluster); + + return pCluster; +} + +const Cluster* Segment::GetLast() const { + if ((m_clusters == NULL) || (m_clusterCount <= 0)) + return &m_eos; + + const long idx = m_clusterCount - 1; + + Cluster* const pCluster = m_clusters[idx]; + assert(pCluster); + + return pCluster; +} + +unsigned long Segment::GetCount() const { return m_clusterCount; } + +const Cluster* Segment::GetNext(const Cluster* pCurr) { + assert(pCurr); + assert(pCurr != &m_eos); + assert(m_clusters); + + long idx = pCurr->m_index; + + if (idx >= 0) { + assert(m_clusterCount > 0); + assert(idx < m_clusterCount); + assert(pCurr == m_clusters[idx]); + + ++idx; + + if (idx >= m_clusterCount) + return &m_eos; // caller will LoadCluster as desired + + Cluster* const pNext = m_clusters[idx]; + assert(pNext); + assert(pNext->m_index >= 0); + assert(pNext->m_index == idx); + + return pNext; + } + + assert(m_clusterPreloadCount > 0); + + long long pos = pCurr->m_element_start; + + assert(m_size >= 0); // TODO + const long long stop = m_start + m_size; // end of segment + + { + long len; + + long long result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); + assert((pos + len) <= stop); // TODO + if (result != 0) + return NULL; + + const long long id = ReadID(m_pReader, pos, len); + if (id != 
libwebm::kMkvCluster) + return NULL; + + pos += len; // consume ID + + // Read Size + result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); // TODO + assert((pos + len) <= stop); // TODO + + const long long size = ReadUInt(m_pReader, pos, len); + assert(size > 0); // TODO + // assert((pCurr->m_size <= 0) || (pCurr->m_size == size)); + + pos += len; // consume length of size of element + assert((pos + size) <= stop); // TODO + + // Pos now points to start of payload + + pos += size; // consume payload + } + + long long off_next = 0; + + while (pos < stop) { + long len; + + long long result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); + assert((pos + len) <= stop); // TODO + if (result != 0) + return NULL; + + const long long idpos = pos; // pos of next (potential) cluster + + const long long id = ReadID(m_pReader, idpos, len); + if (id < 0) + return NULL; + + pos += len; // consume ID + + // Read Size + result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); // TODO + assert((pos + len) <= stop); // TODO + + const long long size = ReadUInt(m_pReader, pos, len); + assert(size >= 0); // TODO + + pos += len; // consume length of size of element + assert((pos + size) <= stop); // TODO + + // Pos now points to start of payload + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvCluster) { + const long long off_next_ = idpos - m_start; + + long long pos_; + long len_; + + const long status = Cluster::HasBlockEntries(this, off_next_, pos_, len_); + + assert(status >= 0); + + if (status > 0) { + off_next = off_next_; + break; + } + } + + pos += size; // consume payload + } + + if (off_next <= 0) + return 0; + + Cluster** const ii = m_clusters + m_clusterCount; + Cluster** i = ii; + + Cluster** const jj = ii + m_clusterPreloadCount; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[0, i) < pos_next + //[i, j) ? + //[j, jj) > pos_next + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + Cluster* const pNext = *k; + assert(pNext); + assert(pNext->m_index < 0); + + // const long long pos_ = pNext->m_pos; + // assert(pos_); + // pos = pos_ * ((pos_ < 0) ? 
-1 : 1);
+
+    pos = pNext->GetPosition();
+
+    if (pos < off_next)
+      i = k + 1;
+    else if (pos > off_next)
+      j = k;
+    else
+      return pNext;
+  }
+
+  assert(i == j);
+
+  Cluster* const pNext = Cluster::Create(this, -1, off_next);
+  if (pNext == NULL)
+    return NULL;
+
+  const ptrdiff_t idx_next = i - m_clusters;  // insertion position
+
+  if (!PreloadCluster(pNext, idx_next)) {
+    delete pNext;
+    return NULL;
+  }
+  assert(m_clusters);
+  assert(idx_next < m_clusterSize);
+  assert(m_clusters[idx_next] == pNext);
+
+  return pNext;
+}
+
+long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult,
+                        long long& pos, long& len) {
+  assert(pCurr);
+  assert(!pCurr->EOS());
+  assert(m_clusters);
+
+  pResult = 0;
+
+  if (pCurr->m_index >= 0) {  // loaded (not merely preloaded)
+    assert(m_clusters[pCurr->m_index] == pCurr);
+
+    const long next_idx = pCurr->m_index + 1;
+
+    if (next_idx < m_clusterCount) {
+      pResult = m_clusters[next_idx];
+      return 0;  // success
+    }
+
+    // curr cluster is last among loaded
+
+    const long result = LoadCluster(pos, len);
+
+    if (result < 0)  // error or underflow
+      return result;
+
+    if (result > 0)  // no more clusters
+    {
+      // pResult = &m_eos;
+      return 1;
+    }
+
+    pResult = GetLast();
+    return 0;  // success
+  }
+
+  assert(m_pos > 0);
+
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  // interrogate curr cluster
+
+  pos = pCurr->m_element_start;
+
+  if (pCurr->m_element_size >= 0)
+    pos += pCurr->m_element_size;
+  else {
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadUInt(m_pReader, pos, len);
+
+    if (id != libwebm::kMkvCluster)
+      return -1;
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume size field
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size == unknown_size)  // TODO: should never happen
+      return E_FILE_FORMAT_INVALID;  // TODO: resolve this
+
+    // assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
+
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // Pos now points to start of payload
+
+    pos += size;  // consume payload (that is, the current cluster)
+    if (segment_stop >= 0 && pos > segment_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    // By consuming the payload, we are assuming that the curr
+    // cluster isn't interesting. That is, we don't bother checking
+    // whether the payload of the curr cluster is less than what
+    // happens to be available (obtained via IMkvReader::Length).
+    // Presumably the caller has already dispensed with the current
+    // cluster, and really does want the next cluster.
+  }
+
+  // pos now points to just beyond the last fully-loaded cluster
+
+  for (;;) {
+    const long status = DoParseNext(pResult, pos, len);
+
+    if (status <= 1)
+      return status;
+  }
+}
+
+long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  // Parse next cluster. This is strictly a parsing activity.
+  // Creation of a new cluster object happens later, after the
+  // parsing is done.
+
+  long long off_next = 0;
+  long long cluster_size = -1;
+
+  for (;;) {
+    if ((total >= 0) && (pos >= total))
+      return 1;  // EOF
+
+    if ((segment_stop >= 0) && (pos >= segment_stop))
+      return 1;  // EOF
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long idpos = pos;  // absolute
+    const long long idoff = pos - m_start;  // relative
+
+    const long long id = ReadID(m_pReader, idpos, len);  // absolute
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)  // weird
+      return -1;  // generic error
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume length of size of element
+
+    // Pos now points to start of payload
+
+    if (size == 0)  // weird
+      continue;
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if ((segment_stop >= 0) && (size != unknown_size) &&
+        ((pos + size) > segment_stop)) {
+      return E_FILE_FORMAT_INVALID;
+    }
+
+    if (id == libwebm::kMkvCues) {
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      const long long element_stop = pos + size;
+
+      if ((segment_stop >= 0) && (element_stop > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      const long long element_start = idpos;
+      const long long element_size = element_stop - element_start;
+
+      if (m_pCues == NULL) {
+        m_pCues = new (std::nothrow)
+            Cues(this, pos, size, element_start, element_size);
+        if (m_pCues == NULL)
+          return false;
+      }
+
+      pos += size;  // consume payload
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+
+      continue;
+    }
+
+    if (id != libwebm::kMkvCluster) {  // not a Cluster ID
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      pos += size;  // consume payload
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+
+      continue;
+    }
+
+    // We have a cluster.
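+    // (idoff is the offset of the cluster's ID relative to the start of the
+    // segment payload; below, cluster_size stays -1 when the size field
+    // held the reserved unknown-size marker.)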
+    off_next = idoff;
+
+    if (size != unknown_size)
+      cluster_size = size;
+
+    break;
+  }
+
+  assert(off_next > 0);  // have cluster
+
+  // We have parsed the next cluster.
+  // We have not created a cluster object yet. What we need
+  // to do now is determine whether it has already been preloaded
+  //(in which case, an object for this cluster has already been
+  // created), and if not, create a new cluster object.
+
+  Cluster** const ii = m_clusters + m_clusterCount;
+  Cluster** i = ii;
+
+  Cluster** const jj = ii + m_clusterPreloadCount;
+  Cluster** j = jj;
+
+  while (i < j) {
+    // INVARIANT:
+    //[0, i) < pos_next
+    //[i, j) ?
+    //[j, jj) > pos_next
+
+    Cluster** const k = i + (j - i) / 2;
+    assert(k < jj);
+
+    const Cluster* const pNext = *k;
+    assert(pNext);
+    assert(pNext->m_index < 0);
+
+    pos = pNext->GetPosition();
+    assert(pos >= 0);
+
+    if (pos < off_next)
+      i = k + 1;
+    else if (pos > off_next)
+      j = k;
+    else {
+      pResult = pNext;
+      return 0;  // success
+    }
+  }
+
+  assert(i == j);
+
+  long long pos_;
+  long len_;
+
+  status = Cluster::HasBlockEntries(this, off_next, pos_, len_);
+
+  if (status < 0) {  // error or underflow
+    pos = pos_;
+    len = len_;
+
+    return status;
+  }
+
+  if (status > 0) {  // means "found at least one block entry"
+    Cluster* const pNext = Cluster::Create(this,
+                                           -1,  // preloaded
+                                           off_next);
+    if (pNext == NULL)
+      return -1;
+
+    const ptrdiff_t idx_next = i - m_clusters;  // insertion position
+
+    if (!PreloadCluster(pNext, idx_next)) {
+      delete pNext;
+      return -1;
+    }
+    assert(m_clusters);
+    assert(idx_next < m_clusterSize);
+    assert(m_clusters[idx_next] == pNext);
+
+    pResult = pNext;
+    return 0;  // success
+  }
+
+  // status == 0 means "no block entries found"
+
+  if (cluster_size < 0) {  // unknown size
+    const long long payload_pos = pos;  // absolute pos of cluster payload
+
+    for (;;) {  // determine cluster size
+      if ((total >= 0) && (pos >= total))
+        break;
+
+      if ((segment_stop >= 0) && (pos >= segment_stop))
+        break;  // no more clusters
+
+      // Read ID
+
+      if ((pos + 1) > avail) {
+        len = 1;
+        return E_BUFFER_NOT_FULL;
+      }
+
+      long long result = GetUIntLength(m_pReader, pos, len);
+
+      if (result < 0)  // error
+        return static_cast<long>(result);
+
+      if (result > 0)  // weird
+        return E_BUFFER_NOT_FULL;
+
+      if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((pos + len) > avail)
+        return E_BUFFER_NOT_FULL;
+
+      const long long idpos = pos;
+      const long long id = ReadID(m_pReader, idpos, len);
+
+      if (id < 0)  // error (or underflow)
+        return static_cast<long>(id);
+
+      // This is the distinguished set of IDs we use to determine
+      // that we have exhausted the sub-elements inside the cluster
+      // whose ID we parsed earlier.
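+      // For example, a Cluster written with the reserved unknown-size
+      // marker (as in live streaming) has no explicit end: the first ID
+      // that can only begin a sibling element (another Cluster, or the
+      // Cues) is what terminates it.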
+
+      if (id == libwebm::kMkvCluster || id == libwebm::kMkvCues)
+        break;
+
+      pos += len;  // consume ID (of sub-element)
+
+      // Read Size
+
+      if ((pos + 1) > avail) {
+        len = 1;
+        return E_BUFFER_NOT_FULL;
+      }
+
+      result = GetUIntLength(m_pReader, pos, len);
+
+      if (result < 0)  // error
+        return static_cast<long>(result);
+
+      if (result > 0)  // weird
+        return E_BUFFER_NOT_FULL;
+
+      if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((pos + len) > avail)
+        return E_BUFFER_NOT_FULL;
+
+      const long long size = ReadUInt(m_pReader, pos, len);
+
+      if (size < 0)  // error
+        return static_cast<long>(size);
+
+      pos += len;  // consume size field of element
+
+      // pos now points to start of sub-element's payload
+
+      if (size == 0)  // weird
+        continue;
+
+      const long long unknown_size = (1LL << (7 * len)) - 1;
+
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;  // not allowed for sub-elements
+
+      if ((segment_stop >= 0) && ((pos + size) > segment_stop))  // weird
+        return E_FILE_FORMAT_INVALID;
+
+      pos += size;  // consume payload of sub-element
+      if (segment_stop >= 0 && pos > segment_stop)
+        return E_FILE_FORMAT_INVALID;
+    }  // determine cluster size
+
+    cluster_size = pos - payload_pos;
+    assert(cluster_size >= 0);  // TODO: handle cluster_size = 0
+
+    pos = payload_pos;  // reset and re-parse original cluster
+  }
+
+  pos += cluster_size;  // consume payload
+  if (segment_stop >= 0 && pos > segment_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 2;  // try to find a cluster that follows next
+}
+
+const Cluster* Segment::FindCluster(long long time_ns) const {
+  if ((m_clusters == NULL) || (m_clusterCount <= 0))
+    return &m_eos;
+
+  {
+    Cluster* const pCluster = m_clusters[0];
+    assert(pCluster);
+    assert(pCluster->m_index == 0);
+
+    if (time_ns <= pCluster->GetTime())
+      return pCluster;
+  }
+
+  // Binary search of cluster array
+
+  long i = 0;
+  long j = m_clusterCount;
+
+  while (i < j) {
+    // INVARIANT:
+    //[0, i) <= time_ns
+    //[i, j) ?
+ //[j, m_clusterCount) > time_ns + + const long k = i + (j - i) / 2; + assert(k < m_clusterCount); + + Cluster* const pCluster = m_clusters[k]; + assert(pCluster); + assert(pCluster->m_index == k); + + const long long t = pCluster->GetTime(); + + if (t <= time_ns) + i = k + 1; + else + j = k; + + assert(i <= j); + } + + assert(i == j); + assert(i > 0); + assert(i <= m_clusterCount); + + const long k = i - 1; + + Cluster* const pCluster = m_clusters[k]; + assert(pCluster); + assert(pCluster->m_index == k); + assert(pCluster->GetTime() <= time_ns); + + return pCluster; +} + +const Tracks* Segment::GetTracks() const { return m_pTracks; } +const SegmentInfo* Segment::GetInfo() const { return m_pInfo; } +const Cues* Segment::GetCues() const { return m_pCues; } +const Chapters* Segment::GetChapters() const { return m_pChapters; } +const Tags* Segment::GetTags() const { return m_pTags; } +const SeekHead* Segment::GetSeekHead() const { return m_pSeekHead; } + +long long Segment::GetDuration() const { + assert(m_pInfo); + return m_pInfo->GetDuration(); +} + +Chapters::Chapters(Segment* pSegment, long long payload_start, + long long payload_size, long long element_start, + long long element_size) + : m_pSegment(pSegment), + m_start(payload_start), + m_size(payload_size), + m_element_start(element_start), + m_element_size(element_size), + m_editions(NULL), + m_editions_size(0), + m_editions_count(0) {} + +Chapters::~Chapters() { + while (m_editions_count > 0) { + Edition& e = m_editions[--m_editions_count]; + e.Clear(); + } + delete[] m_editions; +} + +long Chapters::Parse() { + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = m_start; // payload start + const long long stop = pos + m_size; // payload stop + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvEditionEntry) { + status = ParseEdition(pos, size); + + if (status < 0) // error + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +int Chapters::GetEditionCount() const { return m_editions_count; } + +const Chapters::Edition* Chapters::GetEdition(int idx) const { + if (idx < 0) + return NULL; + + if (idx >= m_editions_count) + return NULL; + + return m_editions + idx; +} + +bool Chapters::ExpandEditionsArray() { + if (m_editions_size > m_editions_count) + return true; // nothing else to do + + const int size = (m_editions_size == 0) ? 
1 : 2 * m_editions_size; + + Edition* const editions = new (std::nothrow) Edition[size]; + + if (editions == NULL) + return false; + + for (int idx = 0; idx < m_editions_count; ++idx) { + m_editions[idx].ShallowCopy(editions[idx]); + } + + delete[] m_editions; + m_editions = editions; + + m_editions_size = size; + return true; +} + +long Chapters::ParseEdition(long long pos, long long size) { + if (!ExpandEditionsArray()) + return -1; + + Edition& e = m_editions[m_editions_count++]; + e.Init(); + + return e.Parse(m_pSegment->m_pReader, pos, size); +} + +Chapters::Edition::Edition() {} + +Chapters::Edition::~Edition() {} + +int Chapters::Edition::GetAtomCount() const { return m_atoms_count; } + +const Chapters::Atom* Chapters::Edition::GetAtom(int index) const { + if (index < 0) + return NULL; + + if (index >= m_atoms_count) + return NULL; + + return m_atoms + index; +} + +void Chapters::Edition::Init() { + m_atoms = NULL; + m_atoms_size = 0; + m_atoms_count = 0; +} + +void Chapters::Edition::ShallowCopy(Edition& rhs) const { + rhs.m_atoms = m_atoms; + rhs.m_atoms_size = m_atoms_size; + rhs.m_atoms_count = m_atoms_count; +} + +void Chapters::Edition::Clear() { + while (m_atoms_count > 0) { + Atom& a = m_atoms[--m_atoms_count]; + a.Clear(); + } + + delete[] m_atoms; + m_atoms = NULL; + + m_atoms_size = 0; +} + +long Chapters::Edition::Parse(IMkvReader* pReader, long long pos, + long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) + continue; + + if (id == libwebm::kMkvChapterAtom) { + status = ParseAtom(pReader, pos, size); + + if (status < 0) // error + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long Chapters::Edition::ParseAtom(IMkvReader* pReader, long long pos, + long long size) { + if (!ExpandAtomsArray()) + return -1; + + Atom& a = m_atoms[m_atoms_count++]; + a.Init(); + + return a.Parse(pReader, pos, size); +} + +bool Chapters::Edition::ExpandAtomsArray() { + if (m_atoms_size > m_atoms_count) + return true; // nothing else to do + + const int size = (m_atoms_size == 0) ? 
1 : 2 * m_atoms_size; + + Atom* const atoms = new (std::nothrow) Atom[size]; + + if (atoms == NULL) + return false; + + for (int idx = 0; idx < m_atoms_count; ++idx) { + m_atoms[idx].ShallowCopy(atoms[idx]); + } + + delete[] m_atoms; + m_atoms = atoms; + + m_atoms_size = size; + return true; +} + +Chapters::Atom::Atom() {} + +Chapters::Atom::~Atom() {} + +unsigned long long Chapters::Atom::GetUID() const { return m_uid; } + +const char* Chapters::Atom::GetStringUID() const { return m_string_uid; } + +long long Chapters::Atom::GetStartTimecode() const { return m_start_timecode; } + +long long Chapters::Atom::GetStopTimecode() const { return m_stop_timecode; } + +long long Chapters::Atom::GetStartTime(const Chapters* pChapters) const { + return GetTime(pChapters, m_start_timecode); +} + +long long Chapters::Atom::GetStopTime(const Chapters* pChapters) const { + return GetTime(pChapters, m_stop_timecode); +} + +int Chapters::Atom::GetDisplayCount() const { return m_displays_count; } + +const Chapters::Display* Chapters::Atom::GetDisplay(int index) const { + if (index < 0) + return NULL; + + if (index >= m_displays_count) + return NULL; + + return m_displays + index; +} + +void Chapters::Atom::Init() { + m_string_uid = NULL; + m_uid = 0; + m_start_timecode = -1; + m_stop_timecode = -1; + + m_displays = NULL; + m_displays_size = 0; + m_displays_count = 0; +} + +void Chapters::Atom::ShallowCopy(Atom& rhs) const { + rhs.m_string_uid = m_string_uid; + rhs.m_uid = m_uid; + rhs.m_start_timecode = m_start_timecode; + rhs.m_stop_timecode = m_stop_timecode; + + rhs.m_displays = m_displays; + rhs.m_displays_size = m_displays_size; + rhs.m_displays_count = m_displays_count; +} + +void Chapters::Atom::Clear() { + delete[] m_string_uid; + m_string_uid = NULL; + + while (m_displays_count > 0) { + Display& d = m_displays[--m_displays_count]; + d.Clear(); + } + + delete[] m_displays; + m_displays = NULL; + + m_displays_size = 0; +} + +long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // 0 length payload, skip. 
+      continue;
+
+    if (id == libwebm::kMkvChapterDisplay) {
+      status = ParseDisplay(pReader, pos, size);
+
+      if (status < 0)  // error
+        return status;
+    } else if (id == libwebm::kMkvChapterStringUID) {
+      status = UnserializeString(pReader, pos, size, m_string_uid);
+
+      if (status < 0)  // error
+        return status;
+    } else if (id == libwebm::kMkvChapterUID) {
+      long long val;
+      status = UnserializeInt(pReader, pos, size, val);
+
+      if (status < 0)  // error
+        return status;
+
+      m_uid = static_cast<unsigned long long>(val);
+    } else if (id == libwebm::kMkvChapterTimeStart) {
+      const long long val = UnserializeUInt(pReader, pos, size);
+
+      if (val < 0)  // error
+        return static_cast<long>(val);
+
+      m_start_timecode = val;
+    } else if (id == libwebm::kMkvChapterTimeEnd) {
+      const long long val = UnserializeUInt(pReader, pos, size);
+
+      if (val < 0)  // error
+        return static_cast<long>(val);
+
+      m_stop_timecode = val;
+    }
+
+    pos += size;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+  return 0;
+}
+
+long long Chapters::Atom::GetTime(const Chapters* pChapters,
+                                  long long timecode) {
+  if (pChapters == NULL)
+    return -1;
+
+  Segment* const pSegment = pChapters->m_pSegment;
+
+  if (pSegment == NULL)  // weird
+    return -1;
+
+  const SegmentInfo* const pInfo = pSegment->GetInfo();
+
+  if (pInfo == NULL)
+    return -1;
+
+  const long long timecode_scale = pInfo->GetTimeCodeScale();
+
+  if (timecode_scale < 1)  // weird
+    return -1;
+
+  if (timecode < 0)
+    return -1;
+
+  const long long result = timecode_scale * timecode;
+
+  return result;
+}
+
+long Chapters::Atom::ParseDisplay(IMkvReader* pReader, long long pos,
+                                  long long size) {
+  if (!ExpandDisplaysArray())
+    return -1;
+
+  Display& d = m_displays[m_displays_count++];
+  d.Init();
+
+  return d.Parse(pReader, pos, size);
+}
+
+bool Chapters::Atom::ExpandDisplaysArray() {
+  if (m_displays_size > m_displays_count)
+    return true;  // nothing else to do
+
+  const int size = (m_displays_size == 0) ? 1 : 2 * m_displays_size;
+
+  Display* const displays = new (std::nothrow) Display[size];
+
+  if (displays == NULL)
+    return false;
+
+  for (int idx = 0; idx < m_displays_count; ++idx) {
+    m_displays[idx].ShallowCopy(displays[idx]);
+  }
+
+  delete[] m_displays;
+  m_displays = displays;
+
+  m_displays_size = size;
+  return true;
+}
+
+Chapters::Display::Display() {}
+
+Chapters::Display::~Display() {}
+
+const char* Chapters::Display::GetString() const { return m_string; }
+
+const char* Chapters::Display::GetLanguage() const { return m_language; }
+
+const char* Chapters::Display::GetCountry() const { return m_country; }
+
+void Chapters::Display::Init() {
+  m_string = NULL;
+  m_language = NULL;
+  m_country = NULL;
+}
+
+void Chapters::Display::ShallowCopy(Display& rhs) const {
+  rhs.m_string = m_string;
+  rhs.m_language = m_language;
+  rhs.m_country = m_country;
+}
+
+void Chapters::Display::Clear() {
+  delete[] m_string;
+  m_string = NULL;
+
+  delete[] m_language;
+  m_language = NULL;
+
+  delete[] m_country;
+  m_country = NULL;
+}
+
+long Chapters::Display::Parse(IMkvReader* pReader, long long pos,
+                              long long size) {
+  const long long stop = pos + size;
+
+  while (pos < stop) {
+    long long id, size;
+
+    long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size == 0)  // No payload.
+ continue; + + if (id == libwebm::kMkvChapString) { + status = UnserializeString(pReader, pos, size, m_string); + + if (status) + return status; + } else if (id == libwebm::kMkvChapLanguage) { + status = UnserializeString(pReader, pos, size, m_language); + + if (status) + return status; + } else if (id == libwebm::kMkvChapCountry) { + status = UnserializeString(pReader, pos, size, m_country); + + if (status) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +Tags::Tags(Segment* pSegment, long long payload_start, long long payload_size, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(payload_start), + m_size(payload_size), + m_element_start(element_start), + m_element_size(element_size), + m_tags(NULL), + m_tags_size(0), + m_tags_count(0) {} + +Tags::~Tags() { + while (m_tags_count > 0) { + Tag& t = m_tags[--m_tags_count]; + t.Clear(); + } + delete[] m_tags; +} + +long Tags::Parse() { + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = m_start; // payload start + const long long stop = pos + m_size; // payload stop + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) + return status; + + if (size == 0) // 0 length tag, read another + continue; + + if (id == libwebm::kMkvTag) { + status = ParseTag(pos, size); + + if (status < 0) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +int Tags::GetTagCount() const { return m_tags_count; } + +const Tags::Tag* Tags::GetTag(int idx) const { + if (idx < 0) + return NULL; + + if (idx >= m_tags_count) + return NULL; + + return m_tags + idx; +} + +bool Tags::ExpandTagsArray() { + if (m_tags_size > m_tags_count) + return true; // nothing else to do + + const int size = (m_tags_size == 0) ? 
1 : 2 * m_tags_size; + + Tag* const tags = new (std::nothrow) Tag[size]; + + if (tags == NULL) + return false; + + for (int idx = 0; idx < m_tags_count; ++idx) { + m_tags[idx].ShallowCopy(tags[idx]); + } + + delete[] m_tags; + m_tags = tags; + + m_tags_size = size; + return true; +} + +long Tags::ParseTag(long long pos, long long size) { + if (!ExpandTagsArray()) + return -1; + + Tag& t = m_tags[m_tags_count++]; + t.Init(); + + return t.Parse(m_pSegment->m_pReader, pos, size); +} + +Tags::Tag::Tag() {} + +Tags::Tag::~Tag() {} + +int Tags::Tag::GetSimpleTagCount() const { return m_simple_tags_count; } + +const Tags::SimpleTag* Tags::Tag::GetSimpleTag(int index) const { + if (index < 0) + return NULL; + + if (index >= m_simple_tags_count) + return NULL; + + return m_simple_tags + index; +} + +void Tags::Tag::Init() { + m_simple_tags = NULL; + m_simple_tags_size = 0; + m_simple_tags_count = 0; +} + +void Tags::Tag::ShallowCopy(Tag& rhs) const { + rhs.m_simple_tags = m_simple_tags; + rhs.m_simple_tags_size = m_simple_tags_size; + rhs.m_simple_tags_count = m_simple_tags_count; +} + +void Tags::Tag::Clear() { + while (m_simple_tags_count > 0) { + SimpleTag& d = m_simple_tags[--m_simple_tags_count]; + d.Clear(); + } + + delete[] m_simple_tags; + m_simple_tags = NULL; + + m_simple_tags_size = 0; +} + +long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) + return status; + + if (size == 0) // 0 length tag, read another + continue; + + if (id == libwebm::kMkvSimpleTag) { + status = ParseSimpleTag(pReader, pos, size); + + if (status < 0) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long Tags::Tag::ParseSimpleTag(IMkvReader* pReader, long long pos, + long long size) { + if (!ExpandSimpleTagsArray()) + return -1; + + SimpleTag& st = m_simple_tags[m_simple_tags_count++]; + st.Init(); + + return st.Parse(pReader, pos, size); +} + +bool Tags::Tag::ExpandSimpleTagsArray() { + if (m_simple_tags_size > m_simple_tags_count) + return true; // nothing else to do + + const int size = (m_simple_tags_size == 0) ? 
1 : 2 * m_simple_tags_size; + + SimpleTag* const displays = new (std::nothrow) SimpleTag[size]; + + if (displays == NULL) + return false; + + for (int idx = 0; idx < m_simple_tags_count; ++idx) { + m_simple_tags[idx].ShallowCopy(displays[idx]); + } + + delete[] m_simple_tags; + m_simple_tags = displays; + + m_simple_tags_size = size; + return true; +} + +Tags::SimpleTag::SimpleTag() {} + +Tags::SimpleTag::~SimpleTag() {} + +const char* Tags::SimpleTag::GetTagName() const { return m_tag_name; } + +const char* Tags::SimpleTag::GetTagString() const { return m_tag_string; } + +void Tags::SimpleTag::Init() { + m_tag_name = NULL; + m_tag_string = NULL; +} + +void Tags::SimpleTag::ShallowCopy(SimpleTag& rhs) const { + rhs.m_tag_name = m_tag_name; + rhs.m_tag_string = m_tag_string; +} + +void Tags::SimpleTag::Clear() { + delete[] m_tag_name; + m_tag_name = NULL; + + delete[] m_tag_string; + m_tag_string = NULL; +} + +long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos, + long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvTagName) { + status = UnserializeString(pReader, pos, size, m_tag_name); + + if (status) + return status; + } else if (id == libwebm::kMkvTagString) { + status = UnserializeString(pReader, pos, size, m_tag_string); + + if (status) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +SegmentInfo::SegmentInfo(Segment* pSegment, long long start, long long size_, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(start), + m_size(size_), + m_element_start(element_start), + m_element_size(element_size), + m_pMuxingAppAsUTF8(NULL), + m_pWritingAppAsUTF8(NULL), + m_pTitleAsUTF8(NULL) {} + +SegmentInfo::~SegmentInfo() { + delete[] m_pMuxingAppAsUTF8; + m_pMuxingAppAsUTF8 = NULL; + + delete[] m_pWritingAppAsUTF8; + m_pWritingAppAsUTF8 = NULL; + + delete[] m_pTitleAsUTF8; + m_pTitleAsUTF8 = NULL; +} + +long SegmentInfo::Parse() { + assert(m_pMuxingAppAsUTF8 == NULL); + assert(m_pWritingAppAsUTF8 == NULL); + assert(m_pTitleAsUTF8 == NULL); + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = m_start; + const long long stop = m_start + m_size; + + m_timecodeScale = 1000000; + m_duration = -1; + + while (pos < stop) { + long long id, size; + + const long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (id == libwebm::kMkvTimecodeScale) { + m_timecodeScale = UnserializeUInt(pReader, pos, size); + + if (m_timecodeScale <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvDuration) { + const long status = UnserializeFloat(pReader, pos, size, m_duration); + + if (status < 0) + return status; + + if (m_duration < 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvMuxingApp) { + const long status = + UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvWritingApp) { + const long status = + UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvTitle) { + const long status = UnserializeString(pReader, pos, size, m_pTitleAsUTF8); + + if (status) + return status; + 
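+    // [editor's note] Hedged aside, not upstream code: TimecodeScale is the
+    // size of one timecode tick in nanoseconds, and the 1,000,000 default set
+    // above means raw timecodes are in milliseconds. Given a parsed
+    // SegmentInfo* `info` (hypothetical variable) and a raw timecode `tc`:
+    //
+    //   const long long scale = info->GetTimeCodeScale();  // ns per tick
+    //   const long long time_ns = tc * scale;              // absolute ns
+    //
+    // This is the same multiplication Chapters::Atom::GetTime performs and
+    // that GetDuration below applies to the stored duration.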
+    }
+
+    pos += size;
+
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  const double rollover_check = m_duration * m_timecodeScale;
+  if (rollover_check > static_cast<double>(LLONG_MAX))
+    return E_FILE_FORMAT_INVALID;
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+long long SegmentInfo::GetTimeCodeScale() const { return m_timecodeScale; }
+
+long long SegmentInfo::GetDuration() const {
+  if (m_duration < 0)
+    return -1;
+
+  assert(m_timecodeScale >= 1);
+
+  const double dd = double(m_duration) * double(m_timecodeScale);
+  const long long d = static_cast<long long>(dd);
+
+  return d;
+}
+
+const char* SegmentInfo::GetMuxingAppAsUTF8() const {
+  return m_pMuxingAppAsUTF8;
+}
+
+const char* SegmentInfo::GetWritingAppAsUTF8() const {
+  return m_pWritingAppAsUTF8;
+}
+
+const char* SegmentInfo::GetTitleAsUTF8() const { return m_pTitleAsUTF8; }
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+ContentEncoding::ContentCompression::ContentCompression()
+    : algo(0), settings(NULL), settings_len(0) {}
+
+ContentEncoding::ContentCompression::~ContentCompression() {
+  delete[] settings;
+}
+
+ContentEncoding::ContentEncryption::ContentEncryption()
+    : algo(0),
+      key_id(NULL),
+      key_id_len(0),
+      signature(NULL),
+      signature_len(0),
+      sig_key_id(NULL),
+      sig_key_id_len(0),
+      sig_algo(0),
+      sig_hash_algo(0) {}
+
+ContentEncoding::ContentEncryption::~ContentEncryption() {
+  delete[] key_id;
+  delete[] signature;
+  delete[] sig_key_id;
+}
+
+ContentEncoding::ContentEncoding()
+    : compression_entries_(NULL),
+      compression_entries_end_(NULL),
+      encryption_entries_(NULL),
+      encryption_entries_end_(NULL),
+      encoding_order_(0),
+      encoding_scope_(1),
+      encoding_type_(0) {}
+
+ContentEncoding::~ContentEncoding() {
+  ContentCompression** comp_i = compression_entries_;
+  ContentCompression** const comp_j = compression_entries_end_;
+
+  while (comp_i != comp_j) {
+    ContentCompression* const comp = *comp_i++;
+    delete comp;
+  }
+
+  delete[] compression_entries_;
+
+  ContentEncryption** enc_i = encryption_entries_;
+  ContentEncryption** const enc_j = encryption_entries_end_;
+
+  while (enc_i != enc_j) {
+    ContentEncryption* const enc = *enc_i++;
+    delete enc;
+  }
+
+  delete[] encryption_entries_;
+}
+
+const ContentEncoding::ContentCompression*
+ContentEncoding::GetCompressionByIndex(unsigned long idx) const {
+  const ptrdiff_t count = compression_entries_end_ - compression_entries_;
+  assert(count >= 0);
+
+  if (idx >= static_cast<unsigned long>(count))
+    return NULL;
+
+  return compression_entries_[idx];
+}
+
+unsigned long ContentEncoding::GetCompressionCount() const {
+  const ptrdiff_t count = compression_entries_end_ - compression_entries_;
+  assert(count >= 0);
+
+  return static_cast<unsigned long>(count);
+}
+
+const ContentEncoding::ContentEncryption* ContentEncoding::GetEncryptionByIndex(
+    unsigned long idx) const {
+  const ptrdiff_t count = encryption_entries_end_ - encryption_entries_;
+  assert(count >= 0);
+
+  if (idx >= static_cast<unsigned long>(count))
+    return NULL;
+
+  return encryption_entries_[idx];
+}
+
+unsigned long ContentEncoding::GetEncryptionCount() const {
+  const ptrdiff_t count = encryption_entries_end_ - encryption_entries_;
+  assert(count >= 0);
+
+  return static_cast<unsigned long>(count);
+}
+
+long ContentEncoding::ParseContentEncAESSettingsEntry(
+    long long start, long long size, IMkvReader* pReader,
+    ContentEncAESSettings* aes) {
+  assert(pReader);
+  assert(aes);
+
+  long long pos = start;
+  const long long stop = start + size;
+
+  while (pos < stop) {
+    long long id, size;
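+    // [editor's note] Hedged aside, not upstream code: ParseElementHeader is
+    // the workhorse of every loop in this file. It decodes one EBML id/size
+    // pair, advances `pos` past the header to the payload, and returns a
+    // negative value on error. A minimal caller follows the same shape:
+    //
+    //   long long id, size;
+    //   long status = ParseElementHeader(pReader, pos, stop, id, size);
+    //   if (status < 0)
+    //     return status;   // I/O error or malformed header
+    //   // ... dispatch on id ...
+    //   pos += size;       // consume the payload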
const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvAESSettingsCipherMode) { + aes->cipher_mode = UnserializeUInt(pReader, pos, size); + if (aes->cipher_mode != 1) + return E_FILE_FORMAT_INVALID; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + return 0; +} + +long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, + IMkvReader* pReader) { + assert(pReader); + + long long pos = start; + const long long stop = start + size; + + // Count ContentCompression and ContentEncryption elements. + int compression_count = 0; + int encryption_count = 0; + + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvContentCompression) + ++compression_count; + + if (id == libwebm::kMkvContentEncryption) + ++encryption_count; + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (compression_count <= 0 && encryption_count <= 0) + return -1; + + if (compression_count > 0) { + compression_entries_ = + new (std::nothrow) ContentCompression*[compression_count]; + if (!compression_entries_) + return -1; + compression_entries_end_ = compression_entries_; + } + + if (encryption_count > 0) { + encryption_entries_ = + new (std::nothrow) ContentEncryption*[encryption_count]; + if (!encryption_entries_) { + delete[] compression_entries_; + return -1; + } + encryption_entries_end_ = encryption_entries_; + } + + pos = start; + while (pos < stop) { + long long id, size; + long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvContentEncodingOrder) { + encoding_order_ = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentEncodingScope) { + encoding_scope_ = UnserializeUInt(pReader, pos, size); + if (encoding_scope_ < 1) + return -1; + } else if (id == libwebm::kMkvContentEncodingType) { + encoding_type_ = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentCompression) { + ContentCompression* const compression = + new (std::nothrow) ContentCompression(); + if (!compression) + return -1; + + status = ParseCompressionEntry(pos, size, pReader, compression); + if (status) { + delete compression; + return status; + } + *compression_entries_end_++ = compression; + } else if (id == libwebm::kMkvContentEncryption) { + ContentEncryption* const encryption = + new (std::nothrow) ContentEncryption(); + if (!encryption) + return -1; + + status = ParseEncryptionEntry(pos, size, pReader, encryption); + if (status) { + delete encryption; + return status; + } + *encryption_entries_end_++ = encryption; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long ContentEncoding::ParseCompressionEntry(long long start, long long size, + IMkvReader* pReader, + ContentCompression* compression) { + assert(pReader); + assert(compression); + + long long pos = start; + const long long stop = start + size; + + bool valid = false; + + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvContentCompAlgo) { + long long algo = UnserializeUInt(pReader, pos, 
size);
+      if (algo < 0)
+        return E_FILE_FORMAT_INVALID;
+      compression->algo = algo;
+      valid = true;
+    } else if (id == libwebm::kMkvContentCompSettings) {
+      if (size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        return status;
+      }
+
+      compression->settings = buf;
+      compression->settings_len = buflen;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  // ContentCompAlgo is mandatory
+  if (!valid)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
+                                           IMkvReader* pReader,
+                                           ContentEncryption* encryption) {
+  assert(pReader);
+  assert(encryption);
+
+  long long pos = start;
+  const long long stop = start + size;
+
+  while (pos < stop) {
+    long long id, size;
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvContentEncAlgo) {
+      encryption->algo = UnserializeUInt(pReader, pos, size);
+      if (encryption->algo != 5)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvContentEncKeyID) {
+      delete[] encryption->key_id;
+      encryption->key_id = NULL;
+      encryption->key_id_len = 0;
+
+      if (size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        return status;
+      }
+
+      encryption->key_id = buf;
+      encryption->key_id_len = buflen;
+    } else if (id == libwebm::kMkvContentSignature) {
+      delete[] encryption->signature;
+      encryption->signature = NULL;
+      encryption->signature_len = 0;
+
+      if (size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        return status;
+      }
+
+      encryption->signature = buf;
+      encryption->signature_len = buflen;
+    } else if (id == libwebm::kMkvContentSigKeyID) {
+      delete[] encryption->sig_key_id;
+      encryption->sig_key_id = NULL;
+      encryption->sig_key_id_len = 0;
+
+      if (size <= 0)
+        return E_FILE_FORMAT_INVALID;
+
+      const size_t buflen = static_cast<size_t>(size);
+      unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+      if (buf == NULL)
+        return -1;
+
+      const int read_status =
+          pReader->Read(pos, static_cast<long>(buflen), buf);
+      if (read_status) {
+        delete[] buf;
+        return status;
+      }
+
+      encryption->sig_key_id = buf;
+      encryption->sig_key_id_len = buflen;
+    } else if (id == libwebm::kMkvContentSigAlgo) {
+      encryption->sig_algo = UnserializeUInt(pReader, pos, size);
+    } else if (id == libwebm::kMkvContentSigHashAlgo) {
+      encryption->sig_hash_algo = UnserializeUInt(pReader, pos, size);
+    } else if (id == libwebm::kMkvContentEncAESSettings) {
+      const long status = ParseContentEncAESSettingsEntry(
+          pos, size, pReader, &encryption->aes_settings);
+      if (status)
+        return status;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  return 0;
+}
+
+Track::Track(Segment* pSegment, long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      content_encoding_entries_(NULL),
+      content_encoding_entries_end_(NULL) {}
+
+Track::~Track() {
+  Info& info = const_cast<Info&>(m_info);
+  info.Clear();
+
+  ContentEncoding** i = content_encoding_entries_;
+  ContentEncoding** const j = content_encoding_entries_end_;
+
+  while (i != j) {
+    ContentEncoding* const encoding = *i++;
+    delete encoding;
+  }
+
+  delete[] content_encoding_entries_;
+}
+
+long Track::Create(Segment* pSegment, const Info& info, long long element_start,
+                   long long element_size, Track*& pResult) {
+  if (pResult)
+    return -1;
+
+  Track* const pTrack =
+      new (std::nothrow) Track(pSegment, element_start, element_size);
+
+  if (pTrack == NULL)
+    return -1;  // generic error
+
+  const int status = info.Copy(pTrack->m_info);
+
+  if (status) {  // error
+    delete pTrack;
+    return status;
+  }
+
+  pResult = pTrack;
+  return 0;  // success
+}
+
+Track::Info::Info()
+    : uid(0),
+      defaultDuration(0),
+      codecDelay(0),
+      seekPreRoll(0),
+      nameAsUTF8(NULL),
+      language(NULL),
+      codecId(NULL),
+      codecNameAsUTF8(NULL),
+      codecPrivate(NULL),
+      codecPrivateSize(0),
+      lacing(false) {}
+
+Track::Info::~Info() { Clear(); }
+
+void Track::Info::Clear() {
+  delete[] nameAsUTF8;
+  nameAsUTF8 = NULL;
+
+  delete[] language;
+  language = NULL;
+
+  delete[] codecId;
+  codecId = NULL;
+
+  delete[] codecPrivate;
+  codecPrivate = NULL;
+  codecPrivateSize = 0;
+
+  delete[] codecNameAsUTF8;
+  codecNameAsUTF8 = NULL;
+}
+
+int Track::Info::CopyStr(char* Info::*str, Info& dst_) const {
+  if (str == static_cast<char* Info::*>(NULL))
+    return -1;
+
+  char*& dst = dst_.*str;
+
+  if (dst)  // should be NULL already
+    return -1;
+
+  const char* const src = this->*str;
+
+  if (src == NULL)
+    return 0;
+
+  const size_t len = strlen(src);
+
+  dst = SafeArrayAlloc<char>(1, len + 1);
+
+  if (dst == NULL)
+    return -1;
+
+  strcpy(dst, src);
+
+  return 0;
+}
+
+int Track::Info::Copy(Info& dst) const {
+  if (&dst == this)
+    return 0;
+
+  dst.type = type;
+  dst.number = number;
+  dst.defaultDuration = defaultDuration;
+  dst.codecDelay = codecDelay;
+  dst.seekPreRoll = seekPreRoll;
+  dst.uid = uid;
+  dst.lacing = lacing;
+  dst.settings = settings;
+
+  // We now copy the string member variables from src to dst.
+  // This involves memory allocation so in principle the operation
+  // can fail (indeed, that's why we have Info::Copy), so we must
+  // report this to the caller. An error return from this function
+  // therefore implies that the copy was only partially successful.
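+  // [editor's note] Hedged illustration, not upstream code: CopyStr relies on
+  // pointers-to-members so one helper can copy any of the string fields.
+  // Schematically:
+  //
+  //   char* Info::*field = &Info::nameAsUTF8;  // selects a field, not a value
+  //   const char* src = this->*field;          // read that field of *this
+  //   dst.*field = /* heap copy of src */;     // write the same field of dst
+  //
+  // which is why the calls below pass &Info::language, &Info::codecId, etc.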
+
+  if (int status = CopyStr(&Info::nameAsUTF8, dst))
+    return status;
+
+  if (int status = CopyStr(&Info::language, dst))
+    return status;
+
+  if (int status = CopyStr(&Info::codecId, dst))
+    return status;
+
+  if (int status = CopyStr(&Info::codecNameAsUTF8, dst))
+    return status;
+
+  if (codecPrivateSize > 0) {
+    if (codecPrivate == NULL)
+      return -1;
+
+    if (dst.codecPrivate)
+      return -1;
+
+    if (dst.codecPrivateSize != 0)
+      return -1;
+
+    dst.codecPrivate = SafeArrayAlloc<unsigned char>(1, codecPrivateSize);
+
+    if (dst.codecPrivate == NULL)
+      return -1;
+
+    memcpy(dst.codecPrivate, codecPrivate, codecPrivateSize);
+    dst.codecPrivateSize = codecPrivateSize;
+  }
+
+  return 0;
+}
+
+const BlockEntry* Track::GetEOS() const { return &m_eos; }
+
+long Track::GetType() const { return m_info.type; }
+
+long Track::GetNumber() const { return m_info.number; }
+
+unsigned long long Track::GetUid() const { return m_info.uid; }
+
+const char* Track::GetNameAsUTF8() const { return m_info.nameAsUTF8; }
+
+const char* Track::GetLanguage() const { return m_info.language; }
+
+const char* Track::GetCodecNameAsUTF8() const { return m_info.codecNameAsUTF8; }
+
+const char* Track::GetCodecId() const { return m_info.codecId; }
+
+const unsigned char* Track::GetCodecPrivate(size_t& size) const {
+  size = m_info.codecPrivateSize;
+  return m_info.codecPrivate;
+}
+
+bool Track::GetLacing() const { return m_info.lacing; }
+
+unsigned long long Track::GetDefaultDuration() const {
+  return m_info.defaultDuration;
+}
+
+unsigned long long Track::GetCodecDelay() const { return m_info.codecDelay; }
+
+unsigned long long Track::GetSeekPreRoll() const { return m_info.seekPreRoll; }
+
+long Track::GetFirst(const BlockEntry*& pBlockEntry) const {
+  const Cluster* pCluster = m_pSegment->GetFirst();
+
+  for (int i = 0;;) {
+    if (pCluster == NULL) {
+      pBlockEntry = GetEOS();
+      return 1;
+    }
+
+    if (pCluster->EOS()) {
+      if (m_pSegment->DoneParsing()) {
+        pBlockEntry = GetEOS();
+        return 1;
+      }
+
+      pBlockEntry = 0;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long status = pCluster->GetFirst(pBlockEntry);
+
+    if (status < 0)  // error
+      return status;
+
+    if (pBlockEntry == 0) {  // empty cluster
+      pCluster = m_pSegment->GetNext(pCluster);
+      continue;
+    }
+
+    for (;;) {
+      const Block* const pBlock = pBlockEntry->GetBlock();
+      assert(pBlock);
+
+      const long long tn = pBlock->GetTrackNumber();
+
+      if ((tn == m_info.number) && VetEntry(pBlockEntry))
+        return 0;
+
+      const BlockEntry* pNextEntry;
+
+      status = pCluster->GetNext(pBlockEntry, pNextEntry);
+
+      if (status < 0)  // error
+        return status;
+
+      if (pNextEntry == 0)
+        break;
+
+      pBlockEntry = pNextEntry;
+    }
+
+    ++i;
+
+    if (i >= 100)
+      break;
+
+    pCluster = m_pSegment->GetNext(pCluster);
+  }
+
+  // NOTE: if we get here, it means that we didn't find a block with
+  // a matching track number. We interpret that as an error (which
+  // might be too conservative).
+
+  pBlockEntry = GetEOS();  // so we can return a non-NULL value
+  return 1;
+}
+
+long Track::GetNext(const BlockEntry* pCurrEntry,
+                    const BlockEntry*& pNextEntry) const {
+  assert(pCurrEntry);
+  assert(!pCurrEntry->EOS());  //?
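+  // [editor's note] Hedged consumer sketch, not upstream code: GetFirst and
+  // GetNext return 0 with a valid entry, 1 with the EOS entry, a negative
+  // error, or E_BUFFER_NOT_FULL when more bytes must be loaded first. A
+  // typical pull loop over one track (names hypothetical):
+  //
+  //   const BlockEntry* curr;
+  //   long status = track->GetFirst(curr);
+  //   while (status == 0 && !curr->EOS()) {
+  //     const Block* const block = curr->GetBlock();
+  //     // ... consume block ...
+  //     const BlockEntry* next;
+  //     status = track->GetNext(curr, next);
+  //     curr = next;
+  //   }
+  //   // E_BUFFER_NOT_FULL: feed the IMkvReader more data, then retry.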
+ + const Block* const pCurrBlock = pCurrEntry->GetBlock(); + assert(pCurrBlock && pCurrBlock->GetTrackNumber() == m_info.number); + if (!pCurrBlock || pCurrBlock->GetTrackNumber() != m_info.number) + return -1; + + const Cluster* pCluster = pCurrEntry->GetCluster(); + assert(pCluster); + assert(!pCluster->EOS()); + + long status = pCluster->GetNext(pCurrEntry, pNextEntry); + + if (status < 0) // error + return status; + + for (int i = 0;;) { + while (pNextEntry) { + const Block* const pNextBlock = pNextEntry->GetBlock(); + assert(pNextBlock); + + if (pNextBlock->GetTrackNumber() == m_info.number) + return 0; + + pCurrEntry = pNextEntry; + + status = pCluster->GetNext(pCurrEntry, pNextEntry); + + if (status < 0) // error + return status; + } + + pCluster = m_pSegment->GetNext(pCluster); + + if (pCluster == NULL) { + pNextEntry = GetEOS(); + return 1; + } + + if (pCluster->EOS()) { + if (m_pSegment->DoneParsing()) { + pNextEntry = GetEOS(); + return 1; + } + + // TODO: there is a potential O(n^2) problem here: we tell the + // caller to (pre)load another cluster, which he does, but then he + // calls GetNext again, which repeats the same search. This is + // a pathological case, since the only way it can happen is if + // there exists a long sequence of clusters none of which contain a + // block from this track. One way around this problem is for the + // caller to be smarter when he loads another cluster: don't call + // us back until you have a cluster that contains a block from this + // track. (Of course, that's not cheap either, since our caller + // would have to scan the each cluster as it's loaded, so that + // would just push back the problem.) + + pNextEntry = NULL; + return E_BUFFER_NOT_FULL; + } + + status = pCluster->GetFirst(pNextEntry); + + if (status < 0) // error + return status; + + if (pNextEntry == NULL) // empty cluster + continue; + + ++i; + + if (i >= 100) + break; + } + + // NOTE: if we get here, it means that we didn't find a block with + // a matching track number after lots of searching, so we give + // up trying. + + pNextEntry = GetEOS(); // so we can return a non-NULL value + return 1; +} + +bool Track::VetEntry(const BlockEntry* pBlockEntry) const { + assert(pBlockEntry); + const Block* const pBlock = pBlockEntry->GetBlock(); + assert(pBlock); + assert(pBlock->GetTrackNumber() == m_info.number); + if (!pBlock || pBlock->GetTrackNumber() != m_info.number) + return false; + + // This function is used during a seek to determine whether the + // frame is a valid seek target. This default function simply + // returns true, which means all frames are valid seek targets. + // It gets overridden by the VideoTrack class, because only video + // keyframes can be used as seek target. 
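+  // [editor's note] For reference (hedged): the video override further down
+  // in this file is essentially
+  //
+  //   bool VideoTrack::VetEntry(const BlockEntry* e) const {
+  //     return Track::VetEntry(e) && e->GetBlock()->IsKey();
+  //   }
+  //
+  // so generic seeks accept any block while video seeks accept keyframes only.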
+
+  return true;
+}
+
+long Track::Seek(long long time_ns, const BlockEntry*& pResult) const {
+  const long status = GetFirst(pResult);
+
+  if (status < 0)  // buffer underflow, etc
+    return status;
+
+  assert(pResult);
+
+  if (pResult->EOS())
+    return 0;
+
+  const Cluster* pCluster = pResult->GetCluster();
+  assert(pCluster);
+  assert(pCluster->GetIndex() >= 0);
+
+  if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
+    return 0;
+
+  Cluster** const clusters = m_pSegment->m_clusters;
+  assert(clusters);
+
+  const long count = m_pSegment->GetCount();  // loaded only, not preloaded
+  assert(count > 0);
+
+  Cluster** const i = clusters + pCluster->GetIndex();
+  assert(i);
+  assert(*i == pCluster);
+  assert(pCluster->GetTime() <= time_ns);
+
+  Cluster** const j = clusters + count;
+
+  Cluster** lo = i;
+  Cluster** hi = j;
+
+  while (lo < hi) {
+    // INVARIANT:
+    //[i, lo) <= time_ns
+    //[lo, hi) ?
+    //[hi, j) > time_ns
+
+    Cluster** const mid = lo + (hi - lo) / 2;
+    assert(mid < hi);
+
+    pCluster = *mid;
+    assert(pCluster);
+    assert(pCluster->GetIndex() >= 0);
+    assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters));
+
+    const long long t = pCluster->GetTime();
+
+    if (t <= time_ns)
+      lo = mid + 1;
+    else
+      hi = mid;
+
+    assert(lo <= hi);
+  }
+
+  assert(lo == hi);
+  assert(lo > i);
+  assert(lo <= j);
+
+  while (lo > i) {
+    pCluster = *--lo;
+    assert(pCluster);
+    assert(pCluster->GetTime() <= time_ns);
+
+    pResult = pCluster->GetEntry(this);
+
+    if ((pResult != 0) && !pResult->EOS())
+      return 0;
+
+    // landed on empty cluster (no entries)
+  }
+
+  pResult = GetEOS();  // weird
+  return 0;
+}
+
+const ContentEncoding* Track::GetContentEncodingByIndex(
+    unsigned long idx) const {
+  const ptrdiff_t count =
+      content_encoding_entries_end_ - content_encoding_entries_;
+  assert(count >= 0);
+
+  if (idx >= static_cast<unsigned long>(count))
+    return NULL;
+
+  return content_encoding_entries_[idx];
+}
+
+unsigned long Track::GetContentEncodingCount() const {
+  const ptrdiff_t count =
+      content_encoding_entries_end_ - content_encoding_entries_;
+  assert(count >= 0);
+
+  return static_cast<unsigned long>(count);
+}
+
+long Track::ParseContentEncodingsEntry(long long start, long long size) {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+  assert(pReader);
+
+  long long pos = start;
+  const long long stop = start + size;
+
+  // Count ContentEncoding elements.
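+  // [editor's note] Hedged summary, not upstream code: like
+  // ParseContentEncodingEntry above, this is a two-pass parse. Pass one only
+  // counts matching children; pass two allocates an exact-fit array and
+  // fills it:
+  //
+  //   int count = 0;
+  //   /* pass 1: walk children, ++count for each kMkvContentEncoding */
+  //   ContentEncoding** entries = new (std::nothrow) ContentEncoding*[count];
+  //   /* pass 2: walk children again, parse each into *end++ */
+  //
+  // The element is read twice, but no reallocation or over-allocation occurs.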
+ int count = 0; + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + // pos now designates start of element + if (id == libwebm::kMkvContentEncoding) + ++count; + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (count <= 0) + return -1; + + content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count]; + if (!content_encoding_entries_) + return -1; + + content_encoding_entries_end_ = content_encoding_entries_; + + pos = start; + while (pos < stop) { + long long id, size; + long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + // pos now designates start of element + if (id == libwebm::kMkvContentEncoding) { + ContentEncoding* const content_encoding = + new (std::nothrow) ContentEncoding(); + if (!content_encoding) + return -1; + + status = content_encoding->ParseContentEncodingEntry(pos, size, pReader); + if (status) { + delete content_encoding; + return status; + } + + *content_encoding_entries_end_++ = content_encoding; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +Track::EOSBlock::EOSBlock() : BlockEntry(NULL, LONG_MIN) {} + +BlockEntry::Kind Track::EOSBlock::GetKind() const { return kBlockEOS; } + +const Block* Track::EOSBlock::GetBlock() const { return NULL; } + +bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos, + long long value_size, bool is_x, + PrimaryChromaticity** chromaticity) { + if (!reader) + return false; + + if (!*chromaticity) + *chromaticity = new PrimaryChromaticity(); + + if (!*chromaticity) + return false; + + PrimaryChromaticity* pc = *chromaticity; + float* value = is_x ? &pc->x : &pc->y; + + double parser_value = 0; + const long long parse_status = + UnserializeFloat(reader, read_pos, value_size, parser_value); + + // Valid range is [0, 1]. Make sure the double is representable as a float + // before casting. 
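+  // [editor's note] Hedged restatement, not upstream code: a double in
+  // (0, FLT_MIN) would land in float's denormal range, so it is rejected
+  // along with values outside [0, 1]. As a standalone predicate:
+  //
+  //   bool representable_unit_float(double v) {
+  //     return v >= 0.0 && v <= 1.0 && !(v > 0.0 && v < FLT_MIN);
+  //   }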
+  if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 ||
+      (parser_value > 0.0 && parser_value < FLT_MIN))
+    return false;
+
+  *value = static_cast<float>(parser_value);
+
+  return true;
+}
+
+bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
+                              long long mm_size, MasteringMetadata** mm) {
+  if (!reader || *mm)
+    return false;
+
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  if (!mm_ptr.get())
+    return false;
+
+  const long long mm_end = mm_start + mm_size;
+  long long read_pos = mm_start;
+
+  while (read_pos < mm_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long long status =
+        ParseElementHeader(reader, read_pos, mm_end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvLuminanceMax) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+      mm_ptr->luminance_max = static_cast<float>(value);
+      if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 ||
+          mm_ptr->luminance_max > 9999.99) {
+        return false;
+      }
+    } else if (child_id == libwebm::kMkvLuminanceMin) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+      mm_ptr->luminance_min = static_cast<float>(value);
+      if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 ||
+          mm_ptr->luminance_min > 999.9999) {
+        return false;
+      }
+    } else {
+      bool is_x = false;
+      PrimaryChromaticity** chromaticity;
+      switch (child_id) {
+        case libwebm::kMkvPrimaryRChromaticityX:
+        case libwebm::kMkvPrimaryRChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryRChromaticityX;
+          chromaticity = &mm_ptr->r;
+          break;
+        case libwebm::kMkvPrimaryGChromaticityX:
+        case libwebm::kMkvPrimaryGChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryGChromaticityX;
+          chromaticity = &mm_ptr->g;
+          break;
+        case libwebm::kMkvPrimaryBChromaticityX:
+        case libwebm::kMkvPrimaryBChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryBChromaticityX;
+          chromaticity = &mm_ptr->b;
+          break;
+        case libwebm::kMkvWhitePointChromaticityX:
+        case libwebm::kMkvWhitePointChromaticityY:
+          is_x = child_id == libwebm::kMkvWhitePointChromaticityX;
+          chromaticity = &mm_ptr->white_point;
+          break;
+        default:
+          return false;
+      }
+      const bool value_parse_status = PrimaryChromaticity::Parse(
+          reader, read_pos, child_size, is_x, chromaticity);
+      if (!value_parse_status)
+        return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > mm_end)
+      return false;
+  }
+
+  *mm = mm_ptr.release();
+  return true;
+}
+
+bool Colour::Parse(IMkvReader* reader, long long colour_start,
+                   long long colour_size, Colour** colour) {
+  if (!reader || *colour)
+    return false;
+
+  std::unique_ptr<Colour> colour_ptr(new Colour());
+  if (!colour_ptr.get())
+    return false;
+
+  const long long colour_end = colour_start + colour_size;
+  long long read_pos = colour_start;
+
+  while (read_pos < colour_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long status =
+        ParseElementHeader(reader, read_pos, colour_end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvMatrixCoefficients) {
+      colour_ptr->matrix_coefficients =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->matrix_coefficients < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvBitsPerChannel) {
+      colour_ptr->bits_per_channel =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->bits_per_channel < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSubsamplingHorz) {
+      colour_ptr->chroma_subsampling_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_subsampling_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSubsamplingVert) {
+      colour_ptr->chroma_subsampling_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_subsampling_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvCbSubsamplingHorz) {
+      colour_ptr->cb_subsampling_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->cb_subsampling_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvCbSubsamplingVert) {
+      colour_ptr->cb_subsampling_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->cb_subsampling_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSitingHorz) {
+      colour_ptr->chroma_siting_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_siting_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSitingVert) {
+      colour_ptr->chroma_siting_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_siting_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvRange) {
+      colour_ptr->range = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->range < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvTransferCharacteristics) {
+      colour_ptr->transfer_characteristics =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->transfer_characteristics < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvPrimaries) {
+      colour_ptr->primaries = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->primaries < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMaxCLL) {
+      colour_ptr->max_cll = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->max_cll < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMaxFALL) {
+      colour_ptr->max_fall = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->max_fall < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMasteringMetadata) {
+      if (!MasteringMetadata::Parse(reader, read_pos, child_size,
+                                    &colour_ptr->mastering_metadata))
+        return false;
+    } else {
+      return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > colour_end)
+      return false;
+  }
+  *colour = colour_ptr.release();
+  return true;
+}
+
+bool Projection::Parse(IMkvReader* reader, long long start, long long size,
+                       Projection** projection) {
+  if (!reader || *projection)
+    return false;
+
+  std::unique_ptr<Projection> projection_ptr(new Projection());
+  if (!projection_ptr.get())
+    return false;
+
+  const long long end = start + size;
+  long long read_pos = start;
+
+  while (read_pos < end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long long status =
+        ParseElementHeader(reader, read_pos, end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvProjectionType) {
+      long long projection_type = kTypeNotPresent;
+      projection_type = UnserializeUInt(reader, read_pos, child_size);
+      if (projection_type < 0)
+        return false;
+
+      projection_ptr->type = static_cast<ProjectionType>(projection_type);
+    } else if (child_id == libwebm::kMkvProjectionPrivate) {
+      unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size);
+
+      if (data == NULL)
+        return false;
+
+      const int status =
+          reader->Read(read_pos, static_cast<long>(child_size), data);
+
+      if (status) {
+        delete[] data;
+        return false;
+      }
+
+      projection_ptr->private_data = data;
+      projection_ptr->private_data_length = static_cast<size_t>(child_size);
+    } else {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      // Make sure value is representable as a float before casting.
+      if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+
+      switch (child_id) {
+        case libwebm::kMkvProjectionPoseYaw:
+          projection_ptr->pose_yaw = static_cast<float>(value);
+          break;
+        case libwebm::kMkvProjectionPosePitch:
+          projection_ptr->pose_pitch = static_cast<float>(value);
+          break;
+        case libwebm::kMkvProjectionPoseRoll:
+          projection_ptr->pose_roll = static_cast<float>(value);
+          break;
+        default:
+          return false;
+      }
+    }
+
+    read_pos += child_size;
+    if (read_pos > end)
+      return false;
+  }
+
+  *projection = projection_ptr.release();
+  return true;
+}
+
+VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
+                       long long element_size)
+    : Track(pSegment, element_start, element_size),
+      m_colour(NULL),
+      m_projection(NULL) {}
+
+VideoTrack::~VideoTrack() {
+  delete m_colour;
+  delete m_projection;
+}
+
+long VideoTrack::Parse(Segment* pSegment, const Info& info,
+                       long long element_start, long long element_size,
+                       VideoTrack*& pResult) {
+  if (pResult)
+    return -1;
+
+  if (info.type != Track::kVideo)
+    return -1;
+
+  long long width = 0;
+  long long height = 0;
+  long long display_width = 0;
+  long long display_height = 0;
+  long long display_unit = 0;
+  long long stereo_mode = 0;
+
+  double rate = 0.0;
+
+  IMkvReader* const pReader = pSegment->m_pReader;
+
+  const Settings& s = info.settings;
+  assert(s.start >= 0);
+  assert(s.size >= 0);
+
+  long long pos = s.start;
+  assert(pos >= 0);
+
+  const long long stop = pos + s.size;
+
+  Colour* colour = NULL;
+  Projection* projection = NULL;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvPixelWidth) {
+      width = UnserializeUInt(pReader, pos, size);
+
+      if (width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvPixelHeight) {
+      height = UnserializeUInt(pReader, pos, size);
+
+      if (height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDisplayWidth) {
+      display_width = UnserializeUInt(pReader, pos, size);
+
+      if (display_width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDisplayHeight) {
+      display_height = UnserializeUInt(pReader, pos, size);
+
+      if (display_height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDisplayUnit) {
+      display_unit = UnserializeUInt(pReader, pos, size);
+
+      if (display_unit < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvStereoMode) {
+      stereo_mode = UnserializeUInt(pReader, pos, size);
+
+      if (stereo_mode < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvFrameRate) {
+      const long status = UnserializeFloat(pReader, pos, size, rate);
+
+      if (status < 0)
+        return status;
+
+      if (rate <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvColour) {
+      if (!Colour::Parse(pReader, pos, size, &colour))
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvProjection) {
+      if (!Projection::Parse(pReader, pos, size,
&projection)) + return E_FILE_FORMAT_INVALID; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + VideoTrack* const pTrack = + new (std::nothrow) VideoTrack(pSegment, element_start, element_size); + + if (pTrack == NULL) + return -1; // generic error + + const int status = info.Copy(pTrack->m_info); + + if (status) { // error + delete pTrack; + return status; + } + + pTrack->m_width = width; + pTrack->m_height = height; + pTrack->m_display_width = display_width; + pTrack->m_display_height = display_height; + pTrack->m_display_unit = display_unit; + pTrack->m_stereo_mode = stereo_mode; + pTrack->m_rate = rate; + pTrack->m_colour = colour; + pTrack->m_projection = projection; + + pResult = pTrack; + return 0; // success +} + +bool VideoTrack::VetEntry(const BlockEntry* pBlockEntry) const { + return Track::VetEntry(pBlockEntry) && pBlockEntry->GetBlock()->IsKey(); +} + +long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const { + const long status = GetFirst(pResult); + + if (status < 0) // buffer underflow, etc + return status; + + assert(pResult); + + if (pResult->EOS()) + return 0; + + const Cluster* pCluster = pResult->GetCluster(); + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + + if (time_ns <= pResult->GetBlock()->GetTime(pCluster)) + return 0; + + Cluster** const clusters = m_pSegment->m_clusters; + assert(clusters); + + const long count = m_pSegment->GetCount(); // loaded only, not pre-loaded + assert(count > 0); + + Cluster** const i = clusters + pCluster->GetIndex(); + assert(i); + assert(*i == pCluster); + assert(pCluster->GetTime() <= time_ns); + + Cluster** const j = clusters + count; + + Cluster** lo = i; + Cluster** hi = j; + + while (lo < hi) { + // INVARIANT: + //[i, lo) <= time_ns + //[lo, hi) ? + //[hi, j) > time_ns + + Cluster** const mid = lo + (hi - lo) / 2; + assert(mid < hi); + + pCluster = *mid; + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters)); + + const long long t = pCluster->GetTime(); + + if (t <= time_ns) + lo = mid + 1; + else + hi = mid; + + assert(lo <= hi); + } + + assert(lo == hi); + assert(lo > i); + assert(lo <= j); + + pCluster = *--lo; + assert(pCluster); + assert(pCluster->GetTime() <= time_ns); + + pResult = pCluster->GetEntry(this, time_ns); + + if ((pResult != 0) && !pResult->EOS()) // found a keyframe + return 0; + + while (lo != i) { + pCluster = *--lo; + assert(pCluster); + assert(pCluster->GetTime() <= time_ns); + + pResult = pCluster->GetEntry(this, time_ns); + + if ((pResult != 0) && !pResult->EOS()) + return 0; + } + + // weird: we're on the first cluster, but no keyframe found + // should never happen but we must return something anyway + + pResult = GetEOS(); + return 0; +} + +Colour* VideoTrack::GetColour() const { return m_colour; } + +Projection* VideoTrack::GetProjection() const { return m_projection; } + +long long VideoTrack::GetWidth() const { return m_width; } + +long long VideoTrack::GetHeight() const { return m_height; } + +long long VideoTrack::GetDisplayWidth() const { + return m_display_width > 0 ? m_display_width : GetWidth(); +} + +long long VideoTrack::GetDisplayHeight() const { + return m_display_height > 0 ? 
m_display_height : GetHeight(); +} + +long long VideoTrack::GetDisplayUnit() const { return m_display_unit; } + +long long VideoTrack::GetStereoMode() const { return m_stereo_mode; } + +double VideoTrack::GetFrameRate() const { return m_rate; } + +AudioTrack::AudioTrack(Segment* pSegment, long long element_start, + long long element_size) + : Track(pSegment, element_start, element_size) {} + +long AudioTrack::Parse(Segment* pSegment, const Info& info, + long long element_start, long long element_size, + AudioTrack*& pResult) { + if (pResult) + return -1; + + if (info.type != Track::kAudio) + return -1; + + IMkvReader* const pReader = pSegment->m_pReader; + + const Settings& s = info.settings; + assert(s.start >= 0); + assert(s.size >= 0); + + long long pos = s.start; + assert(pos >= 0); + + const long long stop = pos + s.size; + + double rate = 8000.0; // MKV default + long long channels = 1; + long long bit_depth = 0; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (id == libwebm::kMkvSamplingFrequency) { + status = UnserializeFloat(pReader, pos, size, rate); + + if (status < 0) + return status; + + if (rate <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvChannels) { + channels = UnserializeUInt(pReader, pos, size); + + if (channels <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvBitDepth) { + bit_depth = UnserializeUInt(pReader, pos, size); + + if (bit_depth <= 0) + return E_FILE_FORMAT_INVALID; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + AudioTrack* const pTrack = + new (std::nothrow) AudioTrack(pSegment, element_start, element_size); + + if (pTrack == NULL) + return -1; // generic error + + const int status = info.Copy(pTrack->m_info); + + if (status) { + delete pTrack; + return status; + } + + pTrack->m_rate = rate; + pTrack->m_channels = channels; + pTrack->m_bitDepth = bit_depth; + + pResult = pTrack; + return 0; // success +} + +double AudioTrack::GetSamplingRate() const { return m_rate; } + +long long AudioTrack::GetChannels() const { return m_channels; } + +long long AudioTrack::GetBitDepth() const { return m_bitDepth; } + +Tracks::Tracks(Segment* pSegment, long long start, long long size_, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(start), + m_size(size_), + m_element_start(element_start), + m_element_size(element_size), + m_trackEntries(NULL), + m_trackEntriesEnd(NULL) {} + +long Tracks::Parse() { + assert(m_trackEntries == NULL); + assert(m_trackEntriesEnd == NULL); + + const long long stop = m_start + m_size; + IMkvReader* const pReader = m_pSegment->m_pReader; + + int count = 0; + long long pos = m_start; + + while (pos < stop) { + long long id, size; + + const long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvTrackEntry) + ++count; + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + if (count <= 0) + return 0; // success + + m_trackEntries = new (std::nothrow) Track*[count]; + + if (m_trackEntries == NULL) + return -1; + + m_trackEntriesEnd = m_trackEntries; + + pos = m_start; + + while (pos < stop) { + const long long element_start = pos; + + 
+    long long id, payload_size;
+
+    const long status =
+        ParseElementHeader(pReader, pos, stop, id, payload_size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (payload_size == 0)  // weird
+      continue;
+
+    const long long payload_stop = pos + payload_size;
+    assert(payload_stop <= stop);  // checked in ParseElement
+
+    const long long element_size = payload_stop - element_start;
+
+    if (id == libwebm::kMkvTrackEntry) {
+      Track*& pTrack = *m_trackEntriesEnd;
+      pTrack = NULL;
+
+      const long status = ParseTrackEntry(pos, payload_size, element_start,
+                                          element_size, pTrack);
+      if (status)
+        return status;
+
+      if (pTrack)
+        ++m_trackEntriesEnd;
+    }
+
+    pos = payload_stop;
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+unsigned long Tracks::GetTracksCount() const {
+  const ptrdiff_t result = m_trackEntriesEnd - m_trackEntries;
+  assert(result >= 0);
+
+  return static_cast<unsigned long>(result);
+}
+
+long Tracks::ParseTrackEntry(long long track_start, long long track_size,
+                             long long element_start, long long element_size,
+                             Track*& pResult) const {
+  if (pResult)
+    return -1;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = track_start;
+  const long long track_stop = track_start + track_size;
+
+  Track::Info info;
+
+  info.type = 0;
+  info.number = 0;
+  info.uid = 0;
+  info.defaultDuration = 0;
+
+  Track::Settings v;
+  v.start = -1;
+  v.size = -1;
+
+  Track::Settings a;
+  a.start = -1;
+  a.size = -1;
+
+  Track::Settings e;  // content_encodings_settings;
+  e.start = -1;
+  e.size = -1;
+
+  long long lacing = 1;  // default is true
+
+  while (pos < track_stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, track_stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (size < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long start = pos;
+
+    if (id == libwebm::kMkvVideo) {
+      v.start = start;
+      v.size = size;
+    } else if (id == libwebm::kMkvAudio) {
+      a.start = start;
+      a.size = size;
+    } else if (id == libwebm::kMkvContentEncodings) {
+      e.start = start;
+      e.size = size;
+    } else if (id == libwebm::kMkvTrackUID) {
+      if (size > 8)
+        return E_FILE_FORMAT_INVALID;
+
+      info.uid = 0;
+
+      long long pos_ = start;
+      const long long pos_end = start + size;
+
+      while (pos_ != pos_end) {
+        unsigned char b;
+
+        const int status = pReader->Read(pos_, 1, &b);
+
+        if (status)
+          return status;
+
+        info.uid <<= 8;
+        info.uid |= b;
+
+        ++pos_;
+      }
+    } else if (id == libwebm::kMkvTrackNumber) {
+      const long long num = UnserializeUInt(pReader, pos, size);
+
+      if ((num <= 0) || (num > 127))
+        return E_FILE_FORMAT_INVALID;
+
+      info.number = static_cast<long>(num);
+    } else if (id == libwebm::kMkvTrackType) {
+      const long long type = UnserializeUInt(pReader, pos, size);
+
+      if ((type <= 0) || (type > 254))
+        return E_FILE_FORMAT_INVALID;
+
+      info.type = static_cast<long>(type);
+    } else if (id == libwebm::kMkvName) {
+      const long status =
+          UnserializeString(pReader, pos, size, info.nameAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvLanguage) {
+      const long status = UnserializeString(pReader, pos, size, info.language);
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvDefaultDuration) {
+      const long long duration = UnserializeUInt(pReader, pos, size);
+
+      if (duration < 0)
+        return E_FILE_FORMAT_INVALID;
+
+      info.defaultDuration = static_cast<unsigned long long>(duration);
+    } else if (id == libwebm::kMkvCodecID) {
+      const long status =
+          UnserializeString(pReader, pos, size, info.codecId);
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvFlagLacing) {
+      lacing = UnserializeUInt(pReader, pos, size);
+
+      if ((lacing < 0) || (lacing > 1))
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvCodecPrivate) {
+      delete[] info.codecPrivate;
+      info.codecPrivate = NULL;
+      info.codecPrivateSize = 0;
+
+      const size_t buflen = static_cast<size_t>(size);
+
+      if (buflen) {
+        unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+
+        if (buf == NULL)
+          return -1;
+
+        const int status = pReader->Read(pos, static_cast<long>(buflen), buf);
+
+        if (status) {
+          delete[] buf;
+          return status;
+        }
+
+        info.codecPrivate = buf;
+        info.codecPrivateSize = buflen;
+      }
+    } else if (id == libwebm::kMkvCodecName) {
+      const long status =
+          UnserializeString(pReader, pos, size, info.codecNameAsUTF8);
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvCodecDelay) {
+      info.codecDelay = UnserializeUInt(pReader, pos, size);
+    } else if (id == libwebm::kMkvSeekPreRoll) {
+      info.seekPreRoll = UnserializeUInt(pReader, pos, size);
+    }
+
+    pos += size;  // consume payload
+    if (pos > track_stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != track_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (info.number <= 0)  // not specified
+    return E_FILE_FORMAT_INVALID;
+
+  if (GetTrackByNumber(info.number))
+    return E_FILE_FORMAT_INVALID;
+
+  if (info.type <= 0)  // not specified
+    return E_FILE_FORMAT_INVALID;
+
+  info.lacing = (lacing > 0) ? true : false;
+
+  if (info.type == Track::kVideo) {
+    if (v.start < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (a.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings = v;
+
+    VideoTrack* pTrack = NULL;
+
+    const long status = VideoTrack::Parse(m_pSegment, info, element_start,
+                                          element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+
+    if (e.start >= 0)
+      pResult->ParseContentEncodingsEntry(e.start, e.size);
+  } else if (info.type == Track::kAudio) {
+    if (a.start < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (v.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings = a;
+
+    AudioTrack* pTrack = NULL;
+
+    const long status = AudioTrack::Parse(m_pSegment, info, element_start,
+                                          element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+
+    if (e.start >= 0)
+      pResult->ParseContentEncodingsEntry(e.start, e.size);
+  } else {
+    // neither video nor audio - probably metadata or subtitles
+
+    if (a.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (v.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    if (info.type == Track::kMetadata && e.start >= 0)
+      return E_FILE_FORMAT_INVALID;
+
+    info.settings.start = -1;
+    info.settings.size = 0;
+
+    Track* pTrack = NULL;
+
+    const long status =
+        Track::Create(m_pSegment, info, element_start, element_size, pTrack);
+
+    if (status)
+      return status;
+
+    pResult = pTrack;
+    assert(pResult);
+  }
+
+  return 0;  // success
+}
+
+Tracks::~Tracks() {
+  Track** i = m_trackEntries;
+  Track** const j = m_trackEntriesEnd;
+
+  while (i != j) {
+    Track* const pTrack = *i++;
+    delete pTrack;
+  }
+
+  delete[] m_trackEntries;
+}
+
+const Track* Tracks::GetTrackByNumber(long tn) const {
+  if (tn < 0)
+    return NULL;
+
+  Track** i = m_trackEntries;
+  Track** const j = m_trackEntriesEnd;
+
+  while (i != j) {
+    Track* const pTrack = *i++;
+
+    if (pTrack == NULL)
+      continue;
+
+    if (tn == pTrack->GetNumber())
+      return pTrack;
+  }
+
+  return NULL;  // not found
+}
+
+const Track* Tracks::GetTrackByIndex(unsigned long idx) const {
+  const ptrdiff_t count = m_trackEntriesEnd - m_trackEntries;
+
+  if (idx >= static_cast<unsigned long>(count))
+    return NULL;
+
+  return m_trackEntries[idx];
+}
+
+long Cluster::Load(long long& pos, long& len) const {
+  if (m_pSegment == NULL)
+    return E_PARSE_FAILED;
+
+  if (m_timecode >= 0)  // at least partially loaded
+    return 0;
+
+  if (m_pos != m_element_start || m_element_size >= 0)
+    return E_PARSE_FAILED;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+  long long total, avail;
+  const int status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  if (total >= 0 && (avail > total || m_pos > total))
+    return E_FILE_FORMAT_INVALID;
+
+  pos = m_pos;
+
+  long long cluster_size = -1;
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error or underflow
+    return static_cast<long>(result);
+
+  if (result > 0)
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long id_ = ReadID(pReader, pos, len);
+
+  if (id_ < 0)  // error
+    return static_cast<long>(id_);
+
+  if (id_ != libwebm::kMkvCluster)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume id
+
+  // read cluster size
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long size = ReadUInt(pReader, pos, len);
+
+  if (size < 0)  // error
+    return static_cast<long>(cluster_size);
+
+  if (size == 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume length of size of element
+
+  const long long unknown_size = (1LL << (7 * len)) - 1;
+
+  if (size != unknown_size)
+    cluster_size = size;
+
+  // pos points to start of payload
+  long long timecode = -1;
+  long long new_pos = -1;
+  bool bBlock = false;
+
+  long long cluster_stop = (cluster_size < 0) ? -1 : pos + cluster_size;
+
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      break;
+
+    // Parse ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id == 0)
+      return E_FILE_FORMAT_INVALID;
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-element's inside the cluster
+    // whose ID we parsed earlier.
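+    // [editor's note] Hedged caller's sketch, not upstream code: Load and
+    // Parse report E_BUFFER_NOT_FULL with `pos`/`len` describing roughly
+    // where and how much data is still needed, so a streaming caller loops:
+    //
+    //   long long pos;
+    //   long len;
+    //   long status = pCluster->Parse(pos, len);
+    //   while (status == mkvparser::E_BUFFER_NOT_FULL) {
+    //     // make more of the file available to the IMkvReader, then:
+    //     status = pCluster->Parse(pos, len);
+    //   }
+    //   // status == 0: parsed another entry; status > 0: cluster exhausted.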
+ + if (id == libwebm::kMkvCluster) + break; + + if (id == libwebm::kMkvCues) + break; + + pos += len; // consume ID field + + // Parse Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume size field + + if ((cluster_stop >= 0) && (pos > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + // pos now points to start of payload + + if (size == 0) + continue; + + if ((cluster_stop >= 0) && ((pos + size) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvTimecode) { + len = static_cast<long>(size); + + if ((pos + size) > avail) + return E_BUFFER_NOT_FULL; + + timecode = UnserializeUInt(pReader, pos, size); + + if (timecode < 0) // error (or underflow) + return static_cast<long>(timecode); + + new_pos = pos + size; + + if (bBlock) + break; + } else if (id == libwebm::kMkvBlockGroup) { + bBlock = true; + break; + } else if (id == libwebm::kMkvSimpleBlock) { + bBlock = true; + break; + } + + pos += size; // consume payload + if (pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != cluster_stop) + return E_FILE_FORMAT_INVALID; + + if (timecode < 0) // no timecode found + return E_FILE_FORMAT_INVALID; + + if (!bBlock) + return E_FILE_FORMAT_INVALID; + + m_pos = new_pos; // designates position just beyond timecode payload + m_timecode = timecode; // m_timecode >= 0 means we're partially loaded + + if (cluster_size >= 0) + m_element_size = cluster_stop - m_element_start; + + return 0; +} + +long Cluster::Parse(long long& pos, long& len) const { + long status = Load(pos, len); + + if (status < 0) + return status; + + if (m_pos < m_element_start || m_timecode < 0) + return E_PARSE_FAILED;
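+ + // Load() has already consumed the cluster header and timecode; resume + // scanning child elements at m_pos and create block entries on demand.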
+ + const long long cluster_stop = + (m_element_size < 0) ? -1 : m_element_start + m_element_size; + + if ((cluster_stop >= 0) && (m_pos >= cluster_stop)) + return 1; // nothing else to do + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long total, avail; + + status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + if (total >= 0 && avail > total) + return E_FILE_FORMAT_INVALID; + + pos = m_pos; + + for (;;) { + if ((cluster_stop >= 0) && (pos >= cluster_stop)) + break; + + if ((total >= 0) && (pos >= total)) { + if (m_element_size < 0) + m_element_size = pos - m_element_start; + + break; + } + + // Parse ID + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadID(pReader, pos, len); + + if (id < 0) + return E_FILE_FORMAT_INVALID; + + // This is the distinguished set of IDs we use to determine + // that we have exhausted the sub-elements inside the cluster + // whose ID we parsed earlier. + + if ((id == libwebm::kMkvCluster) || (id == libwebm::kMkvCues)) { + if (m_element_size < 0) + m_element_size = pos - m_element_start; + + break; + } + + pos += len; // consume ID field + + // Parse Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume size field + + if ((cluster_stop >= 0) && (pos > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + // pos now points to start of payload + + if (size == 0) + continue; + + // const long long block_start = pos; + const long long block_stop = pos + size; + + if (cluster_stop >= 0) { + if (block_stop > cluster_stop) { + if (id == libwebm::kMkvBlockGroup || id == libwebm::kMkvSimpleBlock) { + return E_FILE_FORMAT_INVALID; + } + + pos = cluster_stop; + break; + } + } else if ((total >= 0) && (block_stop > total)) { + m_element_size = total - m_element_start; + pos = total; + break; + } else if (block_stop > avail) { + len = static_cast<long>(size); + return E_BUFFER_NOT_FULL; + } + + Cluster* const this_ = const_cast<Cluster*>(this); + + if (id == libwebm::kMkvBlockGroup) + return this_->ParseBlockGroup(size, pos, len); + + if (id == libwebm::kMkvSimpleBlock) + return this_->ParseSimpleBlock(size, pos, len); + + pos += size; // consume payload + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + } + + if (m_element_size < 1) + return E_FILE_FORMAT_INVALID; + + m_pos = pos; + if (cluster_stop >= 0 && m_pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + + if (m_entries_count > 0) { + const long idx = m_entries_count - 1; + + const BlockEntry* const pLast = m_entries[idx]; + if (pLast == NULL) + return E_PARSE_FAILED; + + const Block* const pBlock = pLast->GetBlock(); + if (pBlock == NULL) + return E_PARSE_FAILED;
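+ + // Before reporting completion, verify the most recently parsed block + // actually lies within the cluster and the physical stream.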
+ const long long start = pBlock->m_start; + + if ((total >= 0) && (start > total)) + return E_PARSE_FAILED; // defend against truncated stream + + const long long size = pBlock->m_size; + + const long long stop = start + size; + if (cluster_stop >= 0 && stop > cluster_stop) + return E_FILE_FORMAT_INVALID; + + if ((total >= 0) && (stop > total)) + return E_PARSE_FAILED; // defend against truncated stream + } + + return 1; // no more entries +} + +long Cluster::ParseSimpleBlock(long long block_size, long long& pos, + long& len) { + const long long block_start = pos; + const long long block_stop = pos + block_size; + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long total, avail; + + long status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + assert((total < 0) || (avail <= total)); + + // parse track number + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((pos + len) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long track = ReadUInt(pReader, pos, len); + + if (track < 0) // error + return static_cast<long>(track); + + if (track == 0) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume track number + + if ((pos + 2) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + 2) > avail) { + len = 2; + return E_BUFFER_NOT_FULL; + } + + pos += 2; // consume timecode + + if ((pos + 1) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + unsigned char flags; + + status = pReader->Read(pos, 1, &flags); + + if (status < 0) { // error or underflow + len = 1; + return status; + } + + ++pos; // consume flags byte + assert(pos <= avail); + + if (pos >= block_stop) + return E_FILE_FORMAT_INVALID; + + const int lacing = int(flags & 0x06) >> 1; + + if ((lacing != 0) && (block_stop > avail)) { + len = static_cast<long>(block_stop - pos); + return E_BUFFER_NOT_FULL; + } + + status = CreateBlock(libwebm::kMkvSimpleBlock, block_start, block_size, + 0); // DiscardPadding + + if (status != 0) + return status; + + m_pos = block_stop; + + return 0; // success +} + +long Cluster::ParseBlockGroup(long long payload_size, long long& pos, + long& len) { + const long long payload_start = pos; + const long long payload_stop = pos + payload_size; + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long total, avail; + + long status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + assert((total < 0) || (avail <= total)); + + if ((total >= 0) && (payload_stop > total)) + return E_FILE_FORMAT_INVALID; + + if (payload_stop > avail) { + len = static_cast<long>(payload_size); + return E_BUFFER_NOT_FULL; + } + + long long discard_padding = 0; + + while (pos < payload_stop) { + // parse sub-block element ID + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((pos + len) > payload_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadID(pReader, pos, len); + + if (id < 0) // error + return static_cast<long>(id); + + if (id == 0) // not a valid ID +
return E_FILE_FORMAT_INVALID; + + pos += len; // consume ID field + + // Parse Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((pos + len) > payload_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + pos += len; // consume size field + + // pos now points to start of sub-block group payload + + if (pos > payload_stop) + return E_FILE_FORMAT_INVALID; + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvDiscardPadding) { + status = UnserializeInt(pReader, pos, size, discard_padding); + + if (status < 0) // error + return status; + } + + if (id != libwebm::kMkvBlock) { + pos += size; // consume sub-part of block group + + if (pos > payload_stop) + return E_FILE_FORMAT_INVALID; + + continue; + } + + const long long block_stop = pos + size; + + if (block_stop > payload_stop) + return E_FILE_FORMAT_INVALID; + + // parse track number + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((pos + len) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long track = ReadUInt(pReader, pos, len); + + if (track < 0) // error + return static_cast<long>(track); + + if (track == 0) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume track number + + if ((pos + 2) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + 2) > avail) { + len = 2; + return E_BUFFER_NOT_FULL; + } + + pos += 2; // consume timecode + + if ((pos + 1) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + unsigned char flags; + + status = pReader->Read(pos, 1, &flags); + + if (status < 0) { // error or underflow + len = 1; + return status; + } + + ++pos; // consume flags byte + assert(pos <= avail); + + if (pos >= block_stop) + return E_FILE_FORMAT_INVALID; + + const int lacing = int(flags & 0x06) >> 1; + + if ((lacing != 0) && (block_stop > avail)) { + len = static_cast<long>(block_stop - pos); + return E_BUFFER_NOT_FULL; + } + + pos = block_stop; // consume block-part of block group + if (pos > payload_stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != payload_stop) + return E_FILE_FORMAT_INVALID; + + status = CreateBlock(libwebm::kMkvBlockGroup, payload_start, payload_size, + discard_padding); + if (status != 0) + return status; + + m_pos = payload_stop; + + return 0; // success +} + +long Cluster::GetEntry(long index, const mkvparser::BlockEntry*& pEntry) const { + assert(m_pos >= m_element_start); + + pEntry = NULL; + + if (index < 0) + return -1; // generic error + + if (m_entries_count < 0) + return E_BUFFER_NOT_FULL; + + assert(m_entries); + assert(m_entries_size > 0); + assert(m_entries_count <= m_entries_size); + + if (index < m_entries_count) { + pEntry = m_entries[index]; + assert(pEntry); + + return 1; // found entry + } + + if (m_element_size < 0) // we don't know cluster end yet + return E_BUFFER_NOT_FULL; //
underflow + + const long long element_stop = m_element_start + m_element_size; + + if (m_pos >= element_stop) + return 0; // nothing left to parse + + return E_BUFFER_NOT_FULL; // underflow, since more remains to be parsed +} + +Cluster* Cluster::Create(Segment* pSegment, long idx, long long off) { + if (!pSegment || off < 0) + return NULL; + + const long long element_start = pSegment->m_start + off; + + Cluster* const pCluster = + new (std::nothrow) Cluster(pSegment, idx, element_start); + + return pCluster; +} + +Cluster::Cluster() + : m_pSegment(NULL), + m_element_start(0), + m_index(0), + m_pos(0), + m_element_size(0), + m_timecode(0), + m_entries(NULL), + m_entries_size(0), + m_entries_count(0) // means "no entries" +{} + +Cluster::Cluster(Segment* pSegment, long idx, long long element_start + /* long long element_size */) + : m_pSegment(pSegment), + m_element_start(element_start), + m_index(idx), + m_pos(element_start), + m_element_size(-1 /* element_size */), + m_timecode(-1), + m_entries(NULL), + m_entries_size(0), + m_entries_count(-1) // means "has not been parsed yet" +{} + +Cluster::~Cluster() { + if (m_entries_count <= 0) { + delete[] m_entries; + return; + } + + BlockEntry** i = m_entries; + BlockEntry** const j = m_entries + m_entries_count; + + while (i != j) { + BlockEntry* p = *i++; + assert(p); + + delete p; + } + + delete[] m_entries; +} + +bool Cluster::EOS() const { return (m_pSegment == NULL); } + +long Cluster::GetIndex() const { return m_index; } + +long long Cluster::GetPosition() const { + const long long pos = m_element_start - m_pSegment->m_start; + assert(pos >= 0); + + return pos; +} + +long long Cluster::GetElementSize() const { return m_element_size; } + +long Cluster::HasBlockEntries( + const Segment* pSegment, + long long off, // relative to start of segment payload + long long& pos, long& len) { + assert(pSegment); + assert(off >= 0); // relative to segment + + IMkvReader* const pReader = pSegment->m_pReader; + + long long total, avail; + + long status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + assert((total < 0) || (avail <= total)); + + pos = pSegment->m_start + off; // absolute + + if ((total >= 0) && (pos >= total)) + return 0; // we don't even have a complete cluster + + const long long segment_stop = + (pSegment->m_size < 0) ? 
-1 : pSegment->m_start + pSegment->m_size; + + long long cluster_stop = -1; // interpreted later to mean "unknown size" + + { + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // need more data + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((total >= 0) && ((pos + len) > total)) + return 0; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadID(pReader, pos, len); + + if (id < 0) // error + return static_cast<long>(id); + + if (id != libwebm::kMkvCluster) + return E_PARSE_FAILED; + + pos += len; // consume Cluster ID field + + // read size field + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((total >= 0) && ((pos + len) > total)) + return 0; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + if (size == 0) + return 0; // cluster does not have entries + + pos += len; // consume size field + + // pos now points to start of payload + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size != unknown_size) { + cluster_stop = pos + size; + assert(cluster_stop >= 0); + + if ((segment_stop >= 0) && (cluster_stop > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((total >= 0) && (cluster_stop > total)) + // return E_FILE_FORMAT_INVALID; //too conservative + return 0; // cluster does not have any entries + } + } + + for (;;) { + if ((cluster_stop >= 0) && (pos >= cluster_stop)) + return 0; // no entries detected + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // need more data + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadID(pReader, pos, len); + + if (id < 0) // error + return static_cast<long>(id); + + // This is the distinguished set of IDs we use to determine + // that we have exhausted the sub-elements inside the cluster + // whose ID we parsed earlier.
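+ // (Reaching a sibling Cluster or the Cues element here means the cluster + // ended without any BlockGroup or SimpleBlock children, i.e. no entries.)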
+ + if (id == libwebm::kMkvCluster) + return 0; // no entries found + + if (id == libwebm::kMkvCues) + return 0; // no entries found + + pos += len; // consume id field + + if ((cluster_stop >= 0) && (pos >= cluster_stop)) + return E_FILE_FORMAT_INVALID; + + // read size field + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // underflow + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + pos += len; // consume size field + + // pos now points to start of payload + + if ((cluster_stop >= 0) && (pos > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; // not supported inside cluster + + if ((cluster_stop >= 0) && ((pos + size) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvBlockGroup) + return 1; // have at least one entry + + if (id == libwebm::kMkvSimpleBlock) + return 1; // have at least one entry + + pos += size; // consume payload + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + } +} + +long long Cluster::GetTimeCode() const { + long long pos; + long len; + + const long status = Load(pos, len); + + if (status < 0) // error + return status; + + return m_timecode; +} + +long long Cluster::GetTime() const { + const long long tc = GetTimeCode(); + + if (tc < 0) + return tc; + + const SegmentInfo* const pInfo = m_pSegment->GetInfo(); + assert(pInfo); + + const long long scale = pInfo->GetTimeCodeScale(); + assert(scale >= 1); + + const long long t = m_timecode * scale; + + return t; +} + +long long Cluster::GetFirstTime() const { + const BlockEntry* pEntry; + + const long status = GetFirst(pEntry); + + if (status < 0) // error + return status; + + if (pEntry == NULL) // empty cluster + return GetTime(); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + return pBlock->GetTime(this); +} + +long long Cluster::GetLastTime() const { + const BlockEntry* pEntry; + + const long status = GetLast(pEntry); + + if (status < 0) // error + return status; + + if (pEntry == NULL) // empty cluster + return GetTime(); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + return pBlock->GetTime(this); +} + +long Cluster::CreateBlock(long long id, + long long pos, // absolute pos of payload + long long size, long long discard_padding) { + if (id != libwebm::kMkvBlockGroup && id != libwebm::kMkvSimpleBlock) + return E_PARSE_FAILED; + + if (m_entries_count < 0) { // haven't parsed anything yet + assert(m_entries == NULL); + assert(m_entries_size == 0); + + m_entries_size = 1024; + m_entries = new (std::nothrow) BlockEntry*[m_entries_size]; + if (m_entries == NULL) + return -1; + + m_entries_count = 0; + } else { + assert(m_entries); + assert(m_entries_size > 0); + assert(m_entries_count <= m_entries_size); + + if (m_entries_count >= m_entries_size) { + const long entries_size = 2 * m_entries_size; + + BlockEntry** const entries = new (std::nothrow) BlockEntry*[entries_size]; + if (entries == NULL) + return -1; + + BlockEntry** src = m_entries; + BlockEntry** const src_end
= src + m_entries_count; + + BlockEntry** dst = entries; + + while (src != src_end) + *dst++ = *src++; + + delete[] m_entries; + + m_entries = entries; + m_entries_size = entries_size; + } + } + + if (id == libwebm::kMkvBlockGroup) + return CreateBlockGroup(pos, size, discard_padding); + else + return CreateSimpleBlock(pos, size); +} + +long Cluster::CreateBlockGroup(long long start_offset, long long size, + long long discard_padding) { + assert(m_entries); + assert(m_entries_size > 0); + assert(m_entries_count >= 0); + assert(m_entries_count < m_entries_size); + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = start_offset; + const long long stop = start_offset + size; + + // For WebM files, there is a bias towards previous reference times + // (in order to support alt-ref frames, which refer back to the previous + // keyframe). Normally a 0 value is not possible, but here we tentatively + // allow 0 as the value of a reference frame, with the interpretation + // that this is a "previous" reference time. + + long long prev = 1; // nonce + long long next = 0; // nonce + long long duration = -1; // really, this is unsigned + + long long bpos = -1; + long long bsize = -1; + + while (pos < stop) { + long len; + const long long id = ReadID(pReader, pos, len); + if (id < 0 || (pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + assert(size >= 0); // TODO + assert((pos + len) <= stop); + + pos += len; // consume size + + if (id == libwebm::kMkvBlock) { + if (bpos < 0) { // Block ID + bpos = pos; + bsize = size; + } + } else if (id == libwebm::kMkvBlockDuration) { + if (size > 8) + return E_FILE_FORMAT_INVALID; + + duration = UnserializeUInt(pReader, pos, size); + + if (duration < 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvReferenceBlock) { + if (size > 8 || size <= 0) + return E_FILE_FORMAT_INVALID; + const long size_ = static_cast<long>(size); + + long long time; + + long status = UnserializeInt(pReader, pos, size_, time); + assert(status == 0); + if (status != 0) + return -1; + + if (time <= 0) // see note above + prev = time; + else + next = time; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + if (bpos < 0) + return E_FILE_FORMAT_INVALID; + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + assert(bsize >= 0); + + const long idx = m_entries_count; + + BlockEntry** const ppEntry = m_entries + idx; + BlockEntry*& pEntry = *ppEntry; + + pEntry = new (std::nothrow) + BlockGroup(this, idx, bpos, bsize, prev, next, duration, discard_padding); + + if (pEntry == NULL) + return -1; // generic error + + BlockGroup* const p = static_cast<BlockGroup*>(pEntry); + + const long status = p->Parse(); + + if (status == 0) { // success + ++m_entries_count; + return 0; + } + + delete pEntry; + pEntry = 0; + + return status; +} + +long Cluster::CreateSimpleBlock(long long st, long long sz) { + assert(m_entries); + assert(m_entries_size > 0); + assert(m_entries_count >= 0); + assert(m_entries_count < m_entries_size); + + const long idx = m_entries_count; + + BlockEntry** const ppEntry = m_entries + idx; + BlockEntry*& pEntry = *ppEntry; + + pEntry = new (std::nothrow) SimpleBlock(this, idx, st, sz); + + if (pEntry == NULL) + return -1; // generic error + + SimpleBlock* const p = static_cast<SimpleBlock*>(pEntry); + + const long status = p->Parse(); + + if (status == 0) { + ++m_entries_count; + return 0; + } + + delete pEntry; + pEntry = 0; + + return status; +}
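+ +// The entry accessors below call Parse() on demand, so a 0 status with a +// NULL entry simply means the cluster holds no (further) blocks.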
+long Cluster::GetFirst(const BlockEntry*& pFirst) const { + if (m_entries_count <= 0) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) { // error + pFirst = NULL; + return status; + } + + if (m_entries_count <= 0) { // empty cluster + pFirst = NULL; + return 0; + } + } + + assert(m_entries); + + pFirst = m_entries[0]; + assert(pFirst); + + return 0; // success +} + +long Cluster::GetLast(const BlockEntry*& pLast) const { + for (;;) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) { // error + pLast = NULL; + return status; + } + + if (status > 0) // no new block + break; + } + + if (m_entries_count <= 0) { + pLast = NULL; + return 0; + } + + assert(m_entries); + + const long idx = m_entries_count - 1; + + pLast = m_entries[idx]; + assert(pLast); + + return 0; +} + +long Cluster::GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const { + assert(pCurr); + assert(m_entries); + assert(m_entries_count > 0); + + size_t idx = pCurr->GetIndex(); + assert(idx < size_t(m_entries_count)); + assert(m_entries[idx] == pCurr); + + ++idx; + + if (idx >= size_t(m_entries_count)) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) { // error + pNext = NULL; + return status; + } + + if (status > 0) { + pNext = NULL; + return 0; + } + + assert(m_entries); + assert(m_entries_count > 0); + assert(idx < size_t(m_entries_count)); + } + + pNext = m_entries[idx]; + assert(pNext); + + return 0; +} + +long Cluster::GetEntryCount() const { return m_entries_count; } + +const BlockEntry* Cluster::GetEntry(const Track* pTrack, + long long time_ns) const { + assert(pTrack); + + if (m_pSegment == NULL) // this is the special EOS cluster + return pTrack->GetEOS(); + + const BlockEntry* pResult = pTrack->GetEOS(); + + long index = 0; + + for (;;) { + if (index >= m_entries_count) { + long long pos; + long len; + + const long status = Parse(pos, len); + assert(status >= 0); + + if (status > 0) // completely parsed, and no more entries + return pResult; + + if (status < 0) // should never happen + return 0; + + assert(m_entries); + assert(index < m_entries_count); + } + + const BlockEntry* const pEntry = m_entries[index]; + assert(pEntry); + assert(!pEntry->EOS()); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + if (pBlock->GetTrackNumber() != pTrack->GetNumber()) { + ++index; + continue; + } + + if (pTrack->VetEntry(pEntry)) { + if (time_ns < 0) // just want first candidate block + return pEntry; + + const long long ns = pBlock->GetTime(this); + + if (ns > time_ns) + return pResult; + + pResult = pEntry; // have a candidate + } else if (time_ns >= 0) { + const long long ns = pBlock->GetTime(this); + + if (ns > time_ns) + return pResult; + } + + ++index; + } +} + +const BlockEntry* Cluster::GetEntry(const CuePoint& cp, + const CuePoint::TrackPosition& tp) const { + assert(m_pSegment); + const long long tc = cp.GetTimeCode(); + + if (tp.m_block > 0) { + const long block = static_cast<long>(tp.m_block); + const long index = block - 1; + + while (index >= m_entries_count) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) // TODO: can this happen?
+ return NULL; + + if (status > 0) // nothing remains to be parsed + return NULL; + } + + const BlockEntry* const pEntry = m_entries[index]; + assert(pEntry); + assert(!pEntry->EOS()); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + if ((pBlock->GetTrackNumber() == tp.m_track) && + (pBlock->GetTimeCode(this) == tc)) { + return pEntry; + } + } + + long index = 0; + + for (;;) { + if (index >= m_entries_count) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) // TODO: can this happen? + return NULL; + + if (status > 0) // nothing remains to be parsed + return NULL; + + assert(m_entries); + assert(index < m_entries_count); + } + + const BlockEntry* const pEntry = m_entries[index]; + assert(pEntry); + assert(!pEntry->EOS()); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + if (pBlock->GetTrackNumber() != tp.m_track) { + ++index; + continue; + } + + const long long tc_ = pBlock->GetTimeCode(this); + + if (tc_ < tc) { + ++index; + continue; + } + + if (tc_ > tc) + return NULL; + + const Tracks* const pTracks = m_pSegment->GetTracks(); + assert(pTracks); + + const long tn = static_cast<long>(tp.m_track); + const Track* const pTrack = pTracks->GetTrackByNumber(tn); + + if (pTrack == NULL) + return NULL; + + const long long type = pTrack->GetType(); + + if (type == 2) // audio + return pEntry; + + if (type != 1) // not video + return NULL; + + if (!pBlock->IsKey()) + return NULL; + + return pEntry; + } +} + +BlockEntry::BlockEntry(Cluster* p, long idx) : m_pCluster(p), m_index(idx) {} +BlockEntry::~BlockEntry() {} +const Cluster* BlockEntry::GetCluster() const { return m_pCluster; } +long BlockEntry::GetIndex() const { return m_index; } + +SimpleBlock::SimpleBlock(Cluster* pCluster, long idx, long long start, + long long size) + : BlockEntry(pCluster, idx), m_block(start, size, 0) {} + +long SimpleBlock::Parse() { return m_block.Parse(m_pCluster); } +BlockEntry::Kind SimpleBlock::GetKind() const { return kBlockSimple; } +const Block* SimpleBlock::GetBlock() const { return &m_block; } + +BlockGroup::BlockGroup(Cluster* pCluster, long idx, long long block_start, + long long block_size, long long prev, long long next, + long long duration, long long discard_padding) + : BlockEntry(pCluster, idx), + m_block(block_start, block_size, discard_padding), + m_prev(prev), + m_next(next), + m_duration(duration) {} + +long BlockGroup::Parse() { + const long status = m_block.Parse(m_pCluster); + + if (status) + return status; + + m_block.SetKey((m_prev > 0) && (m_next <= 0)); + + return 0; +} + +BlockEntry::Kind BlockGroup::GetKind() const { return kBlockGroup; } +const Block* BlockGroup::GetBlock() const { return &m_block; } +long long BlockGroup::GetPrevTimeCode() const { return m_prev; } +long long BlockGroup::GetNextTimeCode() const { return m_next; } +long long BlockGroup::GetDurationTimeCode() const { return m_duration; } + +Block::Block(long long start, long long size_, long long discard_padding) + : m_start(start), + m_size(size_), + m_track(0), + m_timecode(-1), + m_flags(0), + m_frames(NULL), + m_frame_count(-1), + m_discard_padding(discard_padding) {} + +Block::~Block() { delete[] m_frames; } + +long Block::Parse(const Cluster* pCluster) { + if (pCluster == NULL) + return -1; + + if (pCluster->m_pSegment == NULL) + return -1; + + assert(m_start >= 0); + assert(m_size >= 0); + assert(m_track <= 0); + assert(m_frames == NULL); + assert(m_frame_count <= 0);
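+ + // Block layout: track number (EBML VINT), a 2-byte signed timecode + // relative to the enclosing cluster, one flags byte, then the frame data + // (laced or not, per bits 1-2 of the flags byte).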
+ long long pos = m_start; + const long long stop = m_start + m_size; + + long len; + + IMkvReader* const pReader = pCluster->m_pSegment->m_pReader; + + m_track = ReadUInt(pReader, pos, len); + + if (m_track <= 0) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume track number + + if ((stop - pos) < 2) + return E_FILE_FORMAT_INVALID; + + long status; + long long value; + + status = UnserializeInt(pReader, pos, 2, value); + + if (status) + return E_FILE_FORMAT_INVALID; + + if (value < SHRT_MIN) + return E_FILE_FORMAT_INVALID; + + if (value > SHRT_MAX) + return E_FILE_FORMAT_INVALID; + + m_timecode = static_cast<short>(value); + + pos += 2; + + if ((stop - pos) <= 0) + return E_FILE_FORMAT_INVALID; + + status = pReader->Read(pos, 1, &m_flags); + + if (status) + return E_FILE_FORMAT_INVALID; + + const int lacing = int(m_flags & 0x06) >> 1; + + ++pos; // consume flags byte + + if (lacing == 0) { // no lacing + if (pos > stop) + return E_FILE_FORMAT_INVALID; + + m_frame_count = 1; + m_frames = new (std::nothrow) Frame[m_frame_count]; + if (m_frames == NULL) + return -1; + + Frame& f = m_frames[0]; + f.pos = pos; + + const long long frame_size = stop - pos; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + f.len = static_cast<long>(frame_size); + + return 0; // success + } + + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + unsigned char biased_count; + + status = pReader->Read(pos, 1, &biased_count); + + if (status) + return E_FILE_FORMAT_INVALID; + + ++pos; // consume frame count + if (pos > stop) + return E_FILE_FORMAT_INVALID; + + m_frame_count = int(biased_count) + 1; + + m_frames = new (std::nothrow) Frame[m_frame_count]; + if (m_frames == NULL) + return -1; + + if (lacing == 1) { // Xiph + Frame* pf = m_frames; + Frame* const pf_end = pf + m_frame_count; + + long long size = 0; + int frame_count = m_frame_count; + + while (frame_count > 1) { + long frame_size = 0; + + for (;;) { + unsigned char val; + + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + status = pReader->Read(pos, 1, &val); + + if (status) + return E_FILE_FORMAT_INVALID; + + ++pos; // consume xiph size byte + + frame_size += val; + + if (val < 255) + break; + } + + Frame& f = *pf++; + assert(pf < pf_end); + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + f.pos = 0; // patch later + + if (frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + f.len = frame_size; + size += frame_size; // contribution of this frame + + --frame_count; + } + + if (pf >= pf_end || pos > stop) + return E_FILE_FORMAT_INVALID; + + { + Frame& f = *pf++; + + if (pf != pf_end) + return E_FILE_FORMAT_INVALID; + + f.pos = 0; // patch later + + const long long total_size = stop - pos; + + if (total_size < size) + return E_FILE_FORMAT_INVALID; + + const long long frame_size = total_size - size; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + f.len = static_cast<long>(frame_size); + } + + pf = m_frames; + while (pf != pf_end) { + Frame& f = *pf++; + assert((pos + f.len) <= stop); + + if ((pos + f.len) > stop) + return E_FILE_FORMAT_INVALID; + + f.pos = pos; + pos += f.len; + } + + assert(pos == stop); + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + } else if (lacing == 2) { // fixed-size lacing + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + const long long total_size = stop - pos; + + if ((total_size % m_frame_count) != 0) + return E_FILE_FORMAT_INVALID;
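+ + // Fixed lacing stores no per-frame sizes: the remaining payload must + // divide evenly, each frame receiving total_size / m_frame_count bytes.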
+ const long long frame_size = total_size / m_frame_count; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + Frame* pf = m_frames; + Frame* const pf_end = pf + m_frame_count; + + while (pf != pf_end) { + assert((pos + frame_size) <= stop); + if ((pos + frame_size) > stop) + return E_FILE_FORMAT_INVALID; + + Frame& f = *pf++; + + f.pos = pos; + f.len = static_cast<long>(frame_size); + + pos += frame_size; + } + + assert(pos == stop); + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + } else { + assert(lacing == 3); // EBML lacing + + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + long long size = 0; + int frame_count = m_frame_count; + + long long frame_size = ReadUInt(pReader, pos, len); + + if (frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + if (frame_size > LONG_MAX) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume length of size of first frame + + if ((pos + frame_size) > stop) + return E_FILE_FORMAT_INVALID; + + Frame* pf = m_frames; + Frame* const pf_end = pf + m_frame_count; + + { + Frame& curr = *pf; + + curr.pos = 0; // patch later + + curr.len = static_cast<long>(frame_size); + size += curr.len; // contribution of this frame + } + + --frame_count; + + while (frame_count > 1) { + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + assert(pf < pf_end); + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + const Frame& prev = *pf++; + assert(prev.len == frame_size); + if (prev.len != frame_size) + return E_FILE_FORMAT_INVALID; + + assert(pf < pf_end); + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + Frame& curr = *pf; + + curr.pos = 0; // patch later + + const long long delta_size_ = ReadUInt(pReader, pos, len); + + if (delta_size_ < 0) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume length of (delta) size + if (pos > stop) + return E_FILE_FORMAT_INVALID; + + // EBML lace sizes after the first are signed deltas, stored biased + // by 2^(7*len - 1) - 1. + const long exp = 7 * len - 1; + const long long bias = (1LL << exp) - 1LL; + const long long delta_size = delta_size_ - bias; + + frame_size += delta_size; + + if (frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + if (frame_size > LONG_MAX) + return E_FILE_FORMAT_INVALID; + + curr.len = static_cast<long>(frame_size); + // Check if size + curr.len could overflow.
+ if (size > LLONG_MAX - curr.len) { + return E_FILE_FORMAT_INVALID; + } + size += curr.len; // contribution of this frame + + --frame_count; + } + + // parse last frame + if (frame_count > 0) { + if (pos > stop || pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + const Frame& prev = *pf++; + assert(prev.len == frame_size); + if (prev.len != frame_size) + return E_FILE_FORMAT_INVALID; + + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + Frame& curr = *pf++; + if (pf != pf_end) + return E_FILE_FORMAT_INVALID; + + curr.pos = 0; // patch later + + const long long total_size = stop - pos; + + if (total_size < size) + return E_FILE_FORMAT_INVALID; + + frame_size = total_size - size; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + curr.len = static_cast<long>(frame_size); + } + + pf = m_frames; + while (pf != pf_end) { + Frame& f = *pf++; + if ((pos + f.len) > stop) + return E_FILE_FORMAT_INVALID; + + f.pos = pos; + pos += f.len; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + } + + return 0; // success +} + +long long Block::GetTimeCode(const Cluster* pCluster) const { + if (pCluster == 0) + return m_timecode; + + const long long tc0 = pCluster->GetTimeCode(); + assert(tc0 >= 0); + + // Check if tc0 + m_timecode would overflow. + if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) { + return -1; + } + + const long long tc = tc0 + m_timecode; + + return tc; // unscaled timecode units +} + +long long Block::GetTime(const Cluster* pCluster) const { + assert(pCluster); + + const long long tc = GetTimeCode(pCluster); + + const Segment* const pSegment = pCluster->m_pSegment; + const SegmentInfo* const pInfo = pSegment->GetInfo(); + assert(pInfo); + + const long long scale = pInfo->GetTimeCodeScale(); + assert(scale >= 1); + + // Check if tc * scale could overflow. + if (tc != 0 && scale > LLONG_MAX / tc) { + return -1; + } + const long long ns = tc * scale; + + return ns; +} + +long long Block::GetTrackNumber() const { return m_track; } + +bool Block::IsKey() const { + return ((m_flags & static_cast<unsigned char>(1 << 7)) != 0); +} + +void Block::SetKey(bool bKey) { + if (bKey) + m_flags |= static_cast<unsigned char>(1 << 7); + else + m_flags &= 0x7F; +} + +bool Block::IsInvisible() const { return bool(int(m_flags & 0x08) != 0); } + +Block::Lacing Block::GetLacing() const { + const int value = int(m_flags & 0x06) >> 1; + return static_cast<Lacing>(value); +} + +int Block::GetFrameCount() const { return m_frame_count; } + +const Block::Frame& Block::GetFrame(int idx) const { + assert(idx >= 0); + assert(idx < m_frame_count); + + const Frame& f = m_frames[idx]; + assert(f.pos > 0); + assert(f.len > 0); + + return f; +} + +long Block::Frame::Read(IMkvReader* pReader, unsigned char* buf) const { + assert(pReader); + assert(buf); + + const long status = pReader->Read(pos, len, buf); + return status; +} + +long long Block::GetDiscardPadding() const { return m_discard_padding; } + +} // namespace mkvparser diff --git a/thirdparty/libsimplewebm/libwebm/mkvparser/mkvparser.h b/thirdparty/libsimplewebm/libwebm/mkvparser/mkvparser.h new file mode 100644 index 000000000000..6dce7e50ba03 --- /dev/null +++ b/thirdparty/libsimplewebm/libwebm/mkvparser/mkvparser.h @@ -0,0 +1,1145 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS.
All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +#ifndef MKVPARSER_MKVPARSER_H_ +#define MKVPARSER_MKVPARSER_H_ + +#include <cstddef> + +namespace mkvparser { + +const int E_PARSE_FAILED = -1; +const int E_FILE_FORMAT_INVALID = -2; +const int E_BUFFER_NOT_FULL = -3; + +class IMkvReader { + public: + virtual int Read(long long pos, long len, unsigned char* buf) = 0; + virtual int Length(long long* total, long long* available) = 0; + + public: + virtual ~IMkvReader(); +}; + +template <typename Type> +Type* SafeArrayAlloc(unsigned long long num_elements, + unsigned long long element_size); +long long GetUIntLength(IMkvReader*, long long, long&); +long long ReadUInt(IMkvReader*, long long, long&); +long long ReadID(IMkvReader* pReader, long long pos, long& len); +long long UnserializeUInt(IMkvReader*, long long pos, long long size); + +long UnserializeFloat(IMkvReader*, long long pos, long long size, double&); +long UnserializeInt(IMkvReader*, long long pos, long long size, + long long& result); + +long UnserializeString(IMkvReader*, long long pos, long long size, char*& str); + +long ParseElementHeader(IMkvReader* pReader, + long long& pos, // consume id and size fields + long long stop, // if you know size of element's parent + long long& id, long long& size); + +bool Match(IMkvReader*, long long&, unsigned long, long long&); +bool Match(IMkvReader*, long long&, unsigned long, unsigned char*&, size_t&); + +void GetVersion(int& major, int& minor, int& build, int& revision); + +struct EBMLHeader { + EBMLHeader(); + ~EBMLHeader(); + long long m_version; + long long m_readVersion; + long long m_maxIdLength; + long long m_maxSizeLength; + char* m_docType; + long long m_docTypeVersion; + long long m_docTypeReadVersion; + + long long Parse(IMkvReader*, long long&); + void Init(); +}; + +class Segment; +class Track; +class Cluster; + +class Block { + Block(const Block&); + Block& operator=(const Block&); + + public: + const long long m_start; + const long long m_size; + + Block(long long start, long long size, long long discard_padding); + ~Block(); + + long Parse(const Cluster*); + + long long GetTrackNumber() const; + long long GetTimeCode(const Cluster*) const; // absolute, but not scaled + long long GetTime(const Cluster*) const; // absolute, and scaled (ns) + bool IsKey() const; + void SetKey(bool); + bool IsInvisible() const; + + enum Lacing { kLacingNone, kLacingXiph, kLacingFixed, kLacingEbml }; + Lacing GetLacing() const; + + int GetFrameCount() const; // to index frames: [0, count) + + struct Frame { + long long pos; // absolute offset + long len; + + long Read(IMkvReader*, unsigned char*) const; + }; + + const Frame& GetFrame(int frame_index) const; + + long long GetDiscardPadding() const; + + private: + long long m_track; // Track::Number() + short m_timecode; // relative to cluster + unsigned char m_flags; + + Frame* m_frames; + int m_frame_count; + + protected: + const long long m_discard_padding; +}; + +class BlockEntry { + BlockEntry(const BlockEntry&); + BlockEntry& operator=(const BlockEntry&); + + protected: + BlockEntry(Cluster*, long index); + + public: + virtual ~BlockEntry(); + + bool EOS() const { return (GetKind() == kBlockEOS); } + const Cluster* GetCluster() const; + long GetIndex() const; + virtual const Block* GetBlock() const = 0; + + enum Kind { kBlockEOS, kBlockSimple, kBlockGroup }; + virtual Kind GetKind() const = 0; + + protected: + Cluster* const m_pCluster; + const long m_index; +}; + +class SimpleBlock : public BlockEntry {
SimpleBlock(const SimpleBlock&); + SimpleBlock& operator=(const SimpleBlock&); + + public: + SimpleBlock(Cluster*, long index, long long start, long long size); + long Parse(); + + Kind GetKind() const; + const Block* GetBlock() const; + + protected: + Block m_block; +}; + +class BlockGroup : public BlockEntry { + BlockGroup(const BlockGroup&); + BlockGroup& operator=(const BlockGroup&); + + public: + BlockGroup(Cluster*, long index, + long long block_start, // absolute pos of block's payload + long long block_size, // size of block's payload + long long prev, long long next, long long duration, + long long discard_padding); + + long Parse(); + + Kind GetKind() const; + const Block* GetBlock() const; + + long long GetPrevTimeCode() const; // relative to block's time + long long GetNextTimeCode() const; // as above + long long GetDurationTimeCode() const; + + private: + Block m_block; + const long long m_prev; + const long long m_next; + const long long m_duration; +}; + +/////////////////////////////////////////////////////////////// +// ContentEncoding element +// Elements used to describe if the track data has been encrypted or +// compressed with zlib or header stripping. +class ContentEncoding { + public: + enum { kCTR = 1 }; + + ContentEncoding(); + ~ContentEncoding(); + + // ContentCompression element names + struct ContentCompression { + ContentCompression(); + ~ContentCompression(); + + unsigned long long algo; + unsigned char* settings; + long long settings_len; + }; + + // ContentEncAESSettings element names + struct ContentEncAESSettings { + ContentEncAESSettings() : cipher_mode(kCTR) {} + ~ContentEncAESSettings() {} + + unsigned long long cipher_mode; + }; + + // ContentEncryption element names + struct ContentEncryption { + ContentEncryption(); + ~ContentEncryption(); + + unsigned long long algo; + unsigned char* key_id; + long long key_id_len; + unsigned char* signature; + long long signature_len; + unsigned char* sig_key_id; + long long sig_key_id_len; + unsigned long long sig_algo; + unsigned long long sig_hash_algo; + + ContentEncAESSettings aes_settings; + }; + + // Returns ContentCompression represented by |idx|. Returns NULL if |idx| + // is out of bounds. + const ContentCompression* GetCompressionByIndex(unsigned long idx) const; + + // Returns number of ContentCompression elements in this ContentEncoding + // element. + unsigned long GetCompressionCount() const; + + // Parses the ContentCompression element from |pReader|. |start| is the + // starting offset of the ContentCompression payload. |size| is the size in + // bytes of the ContentCompression payload. |compression| is where the parsed + // values will be stored. + long ParseCompressionEntry(long long start, long long size, + IMkvReader* pReader, + ContentCompression* compression); + + // Returns ContentEncryption represented by |idx|. Returns NULL if |idx| + // is out of bounds. + const ContentEncryption* GetEncryptionByIndex(unsigned long idx) const; + + // Returns number of ContentEncryption elements in this ContentEncoding + // element. + unsigned long GetEncryptionCount() const; + + // Parses the ContentEncAESSettings element from |pReader|. |start| is the + // starting offset of the ContentEncAESSettings payload. |size| is the + // size in bytes of the ContentEncAESSettings payload. |encryption| is + // where the parsed values will be stored. 
+ long ParseContentEncAESSettingsEntry(long long start, long long size, + IMkvReader* pReader, + ContentEncAESSettings* aes); + + // Parses the ContentEncoding element from |pReader|. |start| is the + // starting offset of the ContentEncoding payload. |size| is the size in + // bytes of the ContentEncoding payload. Returns true on success. + long ParseContentEncodingEntry(long long start, long long size, + IMkvReader* pReader); + + // Parses the ContentEncryption element from |pReader|. |start| is the + // starting offset of the ContentEncryption payload. |size| is the size in + // bytes of the ContentEncryption payload. |encryption| is where the parsed + // values will be stored. + long ParseEncryptionEntry(long long start, long long size, + IMkvReader* pReader, ContentEncryption* encryption); + + unsigned long long encoding_order() const { return encoding_order_; } + unsigned long long encoding_scope() const { return encoding_scope_; } + unsigned long long encoding_type() const { return encoding_type_; } + + private: + // Member variables for list of ContentCompression elements. + ContentCompression** compression_entries_; + ContentCompression** compression_entries_end_; + + // Member variables for list of ContentEncryption elements. + ContentEncryption** encryption_entries_; + ContentEncryption** encryption_entries_end_; + + // ContentEncoding element names + unsigned long long encoding_order_; + unsigned long long encoding_scope_; + unsigned long long encoding_type_; + + // LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding); + ContentEncoding(const ContentEncoding&); + ContentEncoding& operator=(const ContentEncoding&); +}; + +class Track { + Track(const Track&); + Track& operator=(const Track&); + + public: + class Info; + static long Create(Segment*, const Info&, long long element_start, + long long element_size, Track*&); + + enum Type { kVideo = 1, kAudio = 2, kSubtitle = 0x11, kMetadata = 0x21 }; + + Segment* const m_pSegment; + const long long m_element_start; + const long long m_element_size; + virtual ~Track(); + + long GetType() const; + long GetNumber() const; + unsigned long long GetUid() const; + const char* GetNameAsUTF8() const; + const char* GetLanguage() const; + const char* GetCodecNameAsUTF8() const; + const char* GetCodecId() const; + const unsigned char* GetCodecPrivate(size_t&) const; + bool GetLacing() const; + unsigned long long GetDefaultDuration() const; + unsigned long long GetCodecDelay() const; + unsigned long long GetSeekPreRoll() const; + + const BlockEntry* GetEOS() const; + + struct Settings { + long long start; + long long size; + }; + + class Info { + public: + Info(); + ~Info(); + int Copy(Info&) const; + void Clear(); + long type; + long number; + unsigned long long uid; + unsigned long long defaultDuration; + unsigned long long codecDelay; + unsigned long long seekPreRoll; + char* nameAsUTF8; + char* language; + char* codecId; + char* codecNameAsUTF8; + unsigned char* codecPrivate; + size_t codecPrivateSize; + bool lacing; + Settings settings; + + private: + Info(const Info&); + Info& operator=(const Info&); + int CopyStr(char* Info::*str, Info&) const; + }; + + long GetFirst(const BlockEntry*&) const; + long GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const; + virtual bool VetEntry(const BlockEntry*) const; + virtual long Seek(long long time_ns, const BlockEntry*&) const; + + const ContentEncoding* GetContentEncodingByIndex(unsigned long idx) const; + unsigned long GetContentEncodingCount() const; + + long 
ParseContentEncodingsEntry(long long start, long long size); + + protected: + Track(Segment*, long long element_start, long long element_size); + + Info m_info; + + class EOSBlock : public BlockEntry { + public: + EOSBlock(); + + Kind GetKind() const; + const Block* GetBlock() const; + }; + + EOSBlock m_eos; + + private: + ContentEncoding** content_encoding_entries_; + ContentEncoding** content_encoding_entries_end_; +}; + +struct PrimaryChromaticity { + PrimaryChromaticity() : x(0), y(0) {} + ~PrimaryChromaticity() {} + static bool Parse(IMkvReader* reader, long long read_pos, + long long value_size, bool is_x, + PrimaryChromaticity** chromaticity); + float x; + float y; +}; + +struct MasteringMetadata { + static const float kValueNotPresent; + + MasteringMetadata() + : r(NULL), + g(NULL), + b(NULL), + white_point(NULL), + luminance_max(kValueNotPresent), + luminance_min(kValueNotPresent) {} + ~MasteringMetadata() { + delete r; + delete g; + delete b; + delete white_point; + } + + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, + MasteringMetadata** mastering_metadata); + + PrimaryChromaticity* r; + PrimaryChromaticity* g; + PrimaryChromaticity* b; + PrimaryChromaticity* white_point; + float luminance_max; + float luminance_min; +}; + +struct Colour { + static const long long kValueNotPresent; + + // Unless otherwise noted all values assigned upon construction are the + // equivalent of unspecified/default. + Colour() + : matrix_coefficients(kValueNotPresent), + bits_per_channel(kValueNotPresent), + chroma_subsampling_horz(kValueNotPresent), + chroma_subsampling_vert(kValueNotPresent), + cb_subsampling_horz(kValueNotPresent), + cb_subsampling_vert(kValueNotPresent), + chroma_siting_horz(kValueNotPresent), + chroma_siting_vert(kValueNotPresent), + range(kValueNotPresent), + transfer_characteristics(kValueNotPresent), + primaries(kValueNotPresent), + max_cll(kValueNotPresent), + max_fall(kValueNotPresent), + mastering_metadata(NULL) {} + ~Colour() { + delete mastering_metadata; + mastering_metadata = NULL; + } + + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, Colour** colour); + + long long matrix_coefficients; + long long bits_per_channel; + long long chroma_subsampling_horz; + long long chroma_subsampling_vert; + long long cb_subsampling_horz; + long long cb_subsampling_vert; + long long chroma_siting_horz; + long long chroma_siting_vert; + long long range; + long long transfer_characteristics; + long long primaries; + long long max_cll; + long long max_fall; + + MasteringMetadata* mastering_metadata; +}; + +struct Projection { + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const float kValueNotPresent; + Projection() + : type(kTypeNotPresent), + private_data(NULL), + private_data_length(0), + pose_yaw(kValueNotPresent), + pose_pitch(kValueNotPresent), + pose_roll(kValueNotPresent) {} + ~Projection() { delete[] private_data; } + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, Projection** projection); + + ProjectionType type; + unsigned char* private_data; + size_t private_data_length; + float pose_yaw; + float pose_pitch; + float pose_roll; +}; + +class VideoTrack : public Track { + VideoTrack(const VideoTrack&); + VideoTrack& operator=(const VideoTrack&); + + VideoTrack(Segment*, long long element_start, long long element_size); + + public: + virtual ~VideoTrack(); + static 
long Parse(Segment*, const Info&, long long element_start, + long long element_size, VideoTrack*&); + + long long GetWidth() const; + long long GetHeight() const; + long long GetDisplayWidth() const; + long long GetDisplayHeight() const; + long long GetDisplayUnit() const; + long long GetStereoMode() const; + double GetFrameRate() const; + + bool VetEntry(const BlockEntry*) const; + long Seek(long long time_ns, const BlockEntry*&) const; + + Colour* GetColour() const; + + Projection* GetProjection() const; + + private: + long long m_width; + long long m_height; + long long m_display_width; + long long m_display_height; + long long m_display_unit; + long long m_stereo_mode; + + double m_rate; + + Colour* m_colour; + Projection* m_projection; +}; + +class AudioTrack : public Track { + AudioTrack(const AudioTrack&); + AudioTrack& operator=(const AudioTrack&); + + AudioTrack(Segment*, long long element_start, long long element_size); + + public: + static long Parse(Segment*, const Info&, long long element_start, + long long element_size, AudioTrack*&); + + double GetSamplingRate() const; + long long GetChannels() const; + long long GetBitDepth() const; + + private: + double m_rate; + long long m_channels; + long long m_bitDepth; +}; + +class Tracks { + Tracks(const Tracks&); + Tracks& operator=(const Tracks&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + Tracks(Segment*, long long start, long long size, long long element_start, + long long element_size); + + ~Tracks(); + + long Parse(); + + unsigned long GetTracksCount() const; + + const Track* GetTrackByNumber(long tn) const; + const Track* GetTrackByIndex(unsigned long idx) const; + + private: + Track** m_trackEntries; + Track** m_trackEntriesEnd; + + long ParseTrackEntry(long long payload_start, long long payload_size, + long long element_start, long long element_size, + Track*&) const; +}; + +class Chapters { + Chapters(const Chapters&); + Chapters& operator=(const Chapters&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + Chapters(Segment*, long long payload_start, long long payload_size, + long long element_start, long long element_size); + + ~Chapters(); + + long Parse(); + + class Atom; + class Edition; + + class Display { + friend class Atom; + Display(); + Display(const Display&); + ~Display(); + Display& operator=(const Display&); + + public: + const char* GetString() const; + const char* GetLanguage() const; + const char* GetCountry() const; + + private: + void Init(); + void ShallowCopy(Display&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + char* m_string; + char* m_language; + char* m_country; + }; + + class Atom { + friend class Edition; + Atom(); + Atom(const Atom&); + ~Atom(); + Atom& operator=(const Atom&); + + public: + unsigned long long GetUID() const; + const char* GetStringUID() const; + + long long GetStartTimecode() const; + long long GetStopTimecode() const; + + long long GetStartTime(const Chapters*) const; + long long GetStopTime(const Chapters*) const; + + int GetDisplayCount() const; + const Display* GetDisplay(int index) const; + + private: + void Init(); + void ShallowCopy(Atom&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + static long long GetTime(const Chapters*, long long timecode); + + long 
ParseDisplay(IMkvReader*, long long pos, long long size); + bool ExpandDisplaysArray(); + + char* m_string_uid; + unsigned long long m_uid; + long long m_start_timecode; + long long m_stop_timecode; + + Display* m_displays; + int m_displays_size; + int m_displays_count; + }; + + class Edition { + friend class Chapters; + Edition(); + Edition(const Edition&); + ~Edition(); + Edition& operator=(const Edition&); + + public: + int GetAtomCount() const; + const Atom* GetAtom(int index) const; + + private: + void Init(); + void ShallowCopy(Edition&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + long ParseAtom(IMkvReader*, long long pos, long long size); + bool ExpandAtomsArray(); + + Atom* m_atoms; + int m_atoms_size; + int m_atoms_count; + }; + + int GetEditionCount() const; + const Edition* GetEdition(int index) const; + + private: + long ParseEdition(long long pos, long long size); + bool ExpandEditionsArray(); + + Edition* m_editions; + int m_editions_size; + int m_editions_count; +}; + +class Tags { + Tags(const Tags&); + Tags& operator=(const Tags&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + Tags(Segment*, long long payload_start, long long payload_size, + long long element_start, long long element_size); + + ~Tags(); + + long Parse(); + + class Tag; + class SimpleTag; + + class SimpleTag { + friend class Tag; + SimpleTag(); + SimpleTag(const SimpleTag&); + ~SimpleTag(); + SimpleTag& operator=(const SimpleTag&); + + public: + const char* GetTagName() const; + const char* GetTagString() const; + + private: + void Init(); + void ShallowCopy(SimpleTag&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + char* m_tag_name; + char* m_tag_string; + }; + + class Tag { + friend class Tags; + Tag(); + Tag(const Tag&); + ~Tag(); + Tag& operator=(const Tag&); + + public: + int GetSimpleTagCount() const; + const SimpleTag* GetSimpleTag(int index) const; + + private: + void Init(); + void ShallowCopy(Tag&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + long ParseSimpleTag(IMkvReader*, long long pos, long long size); + bool ExpandSimpleTagsArray(); + + SimpleTag* m_simple_tags; + int m_simple_tags_size; + int m_simple_tags_count; + }; + + int GetTagCount() const; + const Tag* GetTag(int index) const; + + private: + long ParseTag(long long pos, long long size); + bool ExpandTagsArray(); + + Tag* m_tags; + int m_tags_size; + int m_tags_count; +}; + +class SegmentInfo { + SegmentInfo(const SegmentInfo&); + SegmentInfo& operator=(const SegmentInfo&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + SegmentInfo(Segment*, long long start, long long size, + long long element_start, long long element_size); + + ~SegmentInfo(); + + long Parse(); + + long long GetTimeCodeScale() const; + long long GetDuration() const; // scaled + const char* GetMuxingAppAsUTF8() const; + const char* GetWritingAppAsUTF8() const; + const char* GetTitleAsUTF8() const; + + private: + long long m_timecodeScale; + double m_duration; + char* m_pMuxingAppAsUTF8; + char* m_pWritingAppAsUTF8; + char* m_pTitleAsUTF8; +}; + +class SeekHead { + SeekHead(const SeekHead&); + SeekHead& operator=(const SeekHead&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long 
m_size; + const long long m_element_start; + const long long m_element_size; + + SeekHead(Segment*, long long start, long long size, long long element_start, + long long element_size); + + ~SeekHead(); + + long Parse(); + + struct Entry { + Entry(); + + // the SeekHead entry payload + long long id; + long long pos; + + // absolute pos of SeekEntry ID + long long element_start; + + // SeekEntry ID size + size size + payload + long long element_size; + }; + + int GetCount() const; + const Entry* GetEntry(int idx) const; + + struct VoidElement { + // absolute pos of Void ID + long long element_start; + + // ID size + size size + payload size + long long element_size; + }; + + int GetVoidElementCount() const; + const VoidElement* GetVoidElement(int idx) const; + + private: + Entry* m_entries; + int m_entry_count; + + VoidElement* m_void_elements; + int m_void_element_count; + + static bool ParseEntry(IMkvReader*, + long long pos, // payload + long long size, Entry*); +}; + +class Cues; +class CuePoint { + friend class Cues; + + CuePoint(long, long long); + ~CuePoint(); + + CuePoint(const CuePoint&); + CuePoint& operator=(const CuePoint&); + + public: + long long m_element_start; + long long m_element_size; + + bool Load(IMkvReader*); + + long long GetTimeCode() const; // absolute but unscaled + long long GetTime(const Segment*) const; // absolute and scaled (ns units) + + struct TrackPosition { + long long m_track; + long long m_pos; // of cluster + long long m_block; + // codec_state //defaults to 0 + // reference = clusters containing req'd referenced blocks + // reftime = timecode of the referenced block + + bool Parse(IMkvReader*, long long, long long); + }; + + const TrackPosition* Find(const Track*) const; + + private: + const long m_index; + long long m_timecode; + TrackPosition* m_track_positions; + size_t m_track_positions_count; +}; + +class Cues { + friend class Segment; + + Cues(Segment*, long long start, long long size, long long element_start, + long long element_size); + ~Cues(); + + Cues(const Cues&); + Cues& operator=(const Cues&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + bool Find( // lower bound of time_ns + long long time_ns, const Track*, const CuePoint*&, + const CuePoint::TrackPosition*&) const; + + const CuePoint* GetFirst() const; + const CuePoint* GetLast() const; + const CuePoint* GetNext(const CuePoint*) const; + + const BlockEntry* GetBlock(const CuePoint*, + const CuePoint::TrackPosition*) const; + + bool LoadCuePoint() const; + long GetCount() const; // loaded only + // long GetTotal() const; //loaded + preloaded + bool DoneParsing() const; + + private: + bool Init() const; + bool PreloadCuePoint(long&, long long) const; + + mutable CuePoint** m_cue_points; + mutable long m_count; + mutable long m_preload_count; + mutable long long m_pos; +}; + +class Cluster { + friend class Segment; + + Cluster(const Cluster&); + Cluster& operator=(const Cluster&); + + public: + Segment* const m_pSegment; + + public: + static Cluster* Create(Segment*, + long index, // index in segment + long long off); // offset relative to segment + // long long element_size); + + Cluster(); // EndOfStream + ~Cluster(); + + bool EOS() const; + + long long GetTimeCode() const; // absolute, but not scaled + long long GetTime() const; // absolute, and scaled (nanosecond units) + long long GetFirstTime() const; // time (ns) of first (earliest) block + long long 
GetLastTime() const; // time (ns) of last (latest) block + + long GetFirst(const BlockEntry*&) const; + long GetLast(const BlockEntry*&) const; + long GetNext(const BlockEntry* curr, const BlockEntry*& next) const; + + const BlockEntry* GetEntry(const Track*, long long ns = -1) const; + const BlockEntry* GetEntry(const CuePoint&, + const CuePoint::TrackPosition&) const; + // const BlockEntry* GetMaxKey(const VideoTrack*) const; + + // static bool HasBlockEntries(const Segment*, long long); + + static long HasBlockEntries(const Segment*, long long idoff, long long& pos, + long& size); + + long GetEntryCount() const; + + long Load(long long& pos, long& size) const; + + long Parse(long long& pos, long& size) const; + long GetEntry(long index, const mkvparser::BlockEntry*&) const; + + protected: + Cluster(Segment*, long index, long long element_start); + // long long element_size); + + public: + const long long m_element_start; + long long GetPosition() const; // offset relative to segment + + long GetIndex() const; + long long GetElementSize() const; + // long long GetPayloadSize() const; + + // long long Unparsed() const; + + private: + long m_index; + mutable long long m_pos; + // mutable long long m_size; + mutable long long m_element_size; + mutable long long m_timecode; + mutable BlockEntry** m_entries; + mutable long m_entries_size; + mutable long m_entries_count; + + long ParseSimpleBlock(long long, long long&, long&); + long ParseBlockGroup(long long, long long&, long&); + + long CreateBlock(long long id, long long pos, long long size, + long long discard_padding); + long CreateBlockGroup(long long start_offset, long long size, + long long discard_padding); + long CreateSimpleBlock(long long, long long); +}; + +class Segment { + friend class Cues; + friend class Track; + friend class VideoTrack; + + Segment(const Segment&); + Segment& operator=(const Segment&); + + private: + Segment(IMkvReader*, long long elem_start, + // long long elem_size, + long long pos, long long size); + + public: + IMkvReader* const m_pReader; + const long long m_element_start; + // const long long m_element_size; + const long long m_start; // posn of segment payload + const long long m_size; // size of segment payload + Cluster m_eos; // TODO: make private? 
+ + static long long CreateInstance(IMkvReader*, long long, Segment*&); + ~Segment(); + + long Load(); // loads headers and all clusters + + // for incremental loading + // long long Unparsed() const; + bool DoneParsing() const; + long long ParseHeaders(); // stops when first cluster is found + // long FindNextCluster(long long& pos, long& size) const; + long LoadCluster(long long& pos, long& size); // load one cluster + long LoadCluster(); + + long ParseNext(const Cluster* pCurr, const Cluster*& pNext, long long& pos, + long& size); + + const SeekHead* GetSeekHead() const; + const Tracks* GetTracks() const; + const SegmentInfo* GetInfo() const; + const Cues* GetCues() const; + const Chapters* GetChapters() const; + const Tags* GetTags() const; + + long long GetDuration() const; + + unsigned long GetCount() const; + const Cluster* GetFirst() const; + const Cluster* GetLast() const; + const Cluster* GetNext(const Cluster*); + + const Cluster* FindCluster(long long time_nanoseconds) const; + // const BlockEntry* Seek(long long time_nanoseconds, const Track*) const; + + const Cluster* FindOrPreloadCluster(long long pos); + + long ParseCues(long long cues_off, // offset relative to start of segment + long long& parse_pos, long& parse_len); + + private: + long long m_pos; // absolute file posn; what has been consumed so far + Cluster* m_pUnknownSize; + + SeekHead* m_pSeekHead; + SegmentInfo* m_pInfo; + Tracks* m_pTracks; + Cues* m_pCues; + Chapters* m_pChapters; + Tags* m_pTags; + Cluster** m_clusters; + long m_clusterCount; // number of entries for which m_index >= 0 + long m_clusterPreloadCount; // number of entries for which m_index < 0 + long m_clusterSize; // array size + + long DoLoadCluster(long long&, long&); + long DoLoadClusterUnknownSize(long long&, long&); + long DoParseNext(const Cluster*&, long long&, long&); + + bool AppendCluster(Cluster*); + bool PreloadCluster(Cluster*, ptrdiff_t); + + // void ParseSeekHead(long long pos, long long size); + // void ParseSeekEntry(long long pos, long long size); + // void ParseCues(long long); + + const BlockEntry* GetBlock(const CuePoint&, const CuePoint::TrackPosition&); +}; + +} // namespace mkvparser + +inline long mkvparser::Segment::LoadCluster() { + long long pos; + long size; + + return LoadCluster(pos, size); +} + +#endif // MKVPARSER_MKVPARSER_H_ diff --git a/thirdparty/libvpx/AUTHORS b/thirdparty/libvpx/AUTHORS new file mode 100644 index 000000000000..fcd5c534a8c0 --- /dev/null +++ b/thirdparty/libvpx/AUTHORS @@ -0,0 +1,142 @@ +# This file is automatically generated from the git commit history +# by tools/gen_authors.sh. 
+ +Aaron Watry +Abo Talib Mahfoodh +Adam Xu +Adrian Grange +Alex Converse +Ahmad Sharif +Alexander Voronov +Alexis Ballier +Alok Ahuja +Alpha Lam +A.Mahfoodh +Ami Fischman +Andoni Morales Alastruey +Andres Mejia +Andrew Russell +Angie Chiang +Aron Rosenberg +Attila Nagy +Brion Vibber +changjun.yang +Charles 'Buck' Krasic +chm +Christian Duvivier +Daniele Castagna +Daniel Kang +Deb Mukherjee +Dim Temp +Dmitry Kovalev +Dragan Mrdjan +Ed Baker +Ehsan Akhgari +Erik Niemeyer +Fabio Pedretti +Frank Galligan +Fredrik Söderquist +Fritz Koenig +Gaute Strokkenes +Geza Lore +Ghislain MARY +Giuseppe Scrivano +Gordana Cmiljanovic +Guillaume Martres +Guillermo Ballester Valor +Hangyu Kuang +Hanno Böck +Henrik Lundin +Hui Su +Ivan Maltz +Jacek Caban +Jacky Chen +James Berry +James Yu +James Zern +Jan Gerber +Jan Kratochvil +Janne Salonen +Jean-Yves Avenard +Jeff Faust +Jeff Muizelaar +Jeff Petkau +Jia Jia +Jian Zhou +Jim Bankoski +Jingning Han +Joey Parrish +Johann Koenig +John Koleszar +Johnny Klonaris +John Stark +Joshua Bleecher Snyder +Joshua Litt +Julia Robson +Justin Clift +Justin Lebar +KO Myung-Hun +Lawrence Velázquez +Linfeng Zhang +Lou Quillio +Luca Barbato +Makoto Kato +Mans Rullgard +Marco Paniconi +Mark Mentovai +Martin Ettl +Martin Storsjo +Matthew Heaney +Michael Kohler +Mike Frysinger +Mike Hommey +Mikhal Shemer +Minghai Shang +Morton Jonuschat +Nico Weber +Parag Salasakar +Pascal Massimino +Patrik Westin +Paul Wilkins +Pavol Rusnak +Paweł Hajdan +Pengchong Jin +Peter de Rivaz +Philip Jägenstedt +Priit Laes +Rafael Ávila de Espíndola +Rafaël Carré +Ralph Giles +Rob Bradford +Ronald S. Bultje +Rui Ueyama +Sami Pietilä +Sasi Inguva +Scott Graham +Scott LaVarnway +Sean McGovern +Sergey Kolomenkin +Sergey Ulanov +Shimon Doodkin +Shunyao Li +Stefan Holmer +Suman Sunkara +Taekhyun Kim +Takanori MATSUURA +Tamar Levy +Tao Bai +Tero Rintaluoma +Thijs Vermeir +Tim Kopp +Timothy B. Terriberry +Tom Finegan +Vignesh Venkatasubramanian +Yaowu Xu +Yi Luo +Yongzhe Wang +Yunqing Wang +Yury Gitman +Zoe Liu +Google Inc. +The Mozilla Foundation +The Xiph.Org Foundation diff --git a/thirdparty/libvpx/CHANGELOG b/thirdparty/libvpx/CHANGELOG new file mode 100644 index 000000000000..795d395f96a5 --- /dev/null +++ b/thirdparty/libvpx/CHANGELOG @@ -0,0 +1,654 @@ +2016-07-20 v1.6.0 "Khaki Campbell Duck" + This release improves upon the VP9 encoder and speeds up the encoding and + decoding processes. + + - Upgrading: + This release is ABI incompatible with 1.5.0 due to a new 'color_range' enum + in vpx_image and some minor changes to the VP8_COMP structure. + + The default key frame interval for VP9 has changed from 128 to 9999. + + - Enhancement: + A core focus has been performance for low end Intel processors. SSSE3 + instructions such as 'pshufb' have been avoided and instructions have been + reordered to better accommodate the more constrained pipelines. + + As a result, devices based on Celeron processors have seen substantial + decoding improvements. From Indian Runner Duck to Javan Whistling Duck, + decoding speed improved between 10 and 30%. Between Javan Whistling Duck + and Khaki Campbell Duck, it improved another 10 to 15%. + + While Celeron benefited most, Core-i5 also improved 5% and 10% between the + respective releases. + + Realtime performance for WebRTC for both speed and quality has received a + lot of attention. + + - Bug Fixes: + A number of fuzzing issues, found variously by Mozilla, Chromium and others, + have been fixed and we strongly recommend updating. 
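[Editor's note] For applications upgrading across the key frame interval change described above, here is a minimal sketch (not upstream text) of pinning the old VP9 default of 128 through the stock encoder configuration API; the 640x360 frame size is a placeholder:

    #include <vpx/vpx_encoder.h>
    #include <vpx/vp8cx.h>

    // Sketch: restore the pre-1.6.0 VP9 key frame interval of 128.
    int init_vp9_encoder(vpx_codec_ctx_t *ctx) {
        vpx_codec_enc_cfg_t cfg;
        if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0))
            return -1;
        cfg.g_w = 640;              // placeholder frame size
        cfg.g_h = 360;
        cfg.kf_mode = VPX_KF_AUTO;
        cfg.kf_max_dist = 128;      // v1.6.0 raised the default to 9999
        return vpx_codec_enc_init(ctx, vpx_codec_vp9_cx(), &cfg, 0) ? -1 : 0;
    }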
+ +2015-11-09 v1.5.0 "Javan Whistling Duck" + This release improves upon the VP9 encoder and speeds up the encoding and + decoding processes. + + - Upgrading: + This release is ABI incompatible with 1.4.0. It drops deprecated VP8 + controls and adds a variety of VP9 controls for testing. + + The vpxenc utility now prefers VP9 by default. + + - Enhancements: + Faster VP9 encoding and decoding + Smaller library size by combining functions used by VP8 and VP9 + + - Bug Fixes: + A variety of fuzzing issues + +2015-04-03 v1.4.0 "Indian Runner Duck" + This release includes significant improvements to the VP9 codec. + + - Upgrading: + This release is ABI incompatible with 1.3.0. It drops the compatibility + layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec + controls for VP9. + + - Enhancements: + Faster VP9 encoding and decoding + Multithreaded VP9 decoding (tile and frame-based) + Multithreaded VP9 encoding - on by default + YUV 4:2:2 and 4:4:4 support in VP9 + 10 and 12bit support in VP9 + 64bit ARM support by replacing ARM assembly with intrinsics + + - Bug Fixes: + Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0 + files. + + - Known Issues: + Frame Parallel decoding fails for segmented and non-420 files. + +2013-11-15 v1.3.0 "Forest" + This release introduces the VP9 codec in a backward-compatible way. + All existing users of VP8 can continue to use the library without + modification. However, some VP8 options do not map to VP9 in the same manner. + + The VP9 encoder in this release is not feature complete. Users interested in + the encoder are advised to use the git master branch and discuss issues on + libvpx mailing lists. + + - Upgrading: + This release is ABI and API compatible with Duclair (v1.0.0). Users + of older releases should refer to the Upgrading notes in this document + for that release. + + - Enhancements: + Get rid of bashisms in the main build scripts + Added usage info on command line options + Add lossless compression mode + Dll build of libvpx + Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13) + Add option to disable documentation + configure: add --enable-external-build support + make: support V=1 as short form of verbose=yes + configure: support mingw-w64 + configure: support hardfloat armv7 CHOSTS + configure: add support for android x86 + Add estimated completion time to vpxenc + Don't exit on decode errors in vpxenc + vpxenc: support scaling prior to encoding + vpxdec: support scaling output + vpxenc: improve progress indicators with --skip + msvs: Don't link to winmm.lib + Add a new script for producing vcxproj files + Produce Visual Studio 10 and 11 project files + Produce Windows Phone project files + msvs-build: use msbuild for vs >= 2005 + configure: default configure log to config.log + Add encoding option --static-thresh + + - Speed: + Miscellaneous speed optimizations for VP8 and VP9. + + - Quality: + In general, quality is consistent with the Eider release. + + - Bug Fixes: + This release represents approximately a year of engineering effort, + and contains multiple bug fixes. Please refer to git history for details. + + +2012-12-21 v1.2.0 + This release acts as a checkpoint for a large amount of internal refactoring + and testing. It also contains a number of small bugfixes, so all users are + encouraged to upgrade. + + - Upgrading: + This release is ABI and API compatible with Duclair (v1.0.0). 
Users + of older releases should refer to the Upgrading notes in this + document for that release. + + - Enhancements: + VP8 optimizations for MIPS dspr2 + vpxenc: add -quiet option + + - Speed: + Encoder and decoder speed is consistent with the Eider release. + + - Quality: + In general, quality is consistent with the Eider release. + + Minor tweaks to ARNR filtering + Minor improvements to real time encoding with multiple temporal layers + + - Bug Fixes: + Fixes multithreaded encoder race condition in loopfilter + Fixes multi-resolution threaded encoding + Fix potential encoder dead-lock after picture resize + + +2012-05-09 v1.1.0 "Eider" + This introduces a number of enhancements, mostly focused on real-time + encoding. In addition, it fixes a decoder bug (first introduced in + Duclair) so all users of that release are encouraged to upgrade. + + - Upgrading: + This release is ABI and API compatible with Duclair (v1.0.0). Users + of older releases should refer to the Upgrading notes in this + document for that release. + + This release introduces a new temporal denoiser, controlled by the + VP8E_SET_NOISE_SENSITIVITY control. The temporal denoiser does not + currently take a strength parameter, so the control is effectively + a boolean - zero (off) or non-zero (on). For compatibility with + existing applications, the values accepted are the same as those + for the spatial denoiser (0-6). The temporal denoiser is enabled + by default, and the older spatial denoiser may be restored by + configuring with --disable-temporal-denoising. The temporal denoiser + is more computationally intensive than the spatial one. + + This release removes support for a legacy, decode only API that was + supported, but deprecated, at the initial release of libvpx + (v0.9.0). This is not expected to have any impact. If you are + impacted, you can apply a reversion to commit 2bf8fb58 locally. + Please update to the latest libvpx API if you are affected. + + - Enhancements: + Adds a motion compensated temporal denoiser to the encoder, which + gives higher quality than the older spatial denoiser. (See above + for notes on upgrading). + + In addition, support for new compilers and platforms was added, + including: + improved support for XCode + Android x86 NDK build + OS/2 support + SunCC support + + Changing resolution with vpx_codec_enc_config_set() is now + supported. Previously, reinitializing the codec was required to + change the input resolution. + + The vpxenc application has initial support for producing multiple + encodes from the same input in one call. Resizing is not yet + supported, but varying other codec parameters is. Use -- to + delineate output streams. Options persist from one stream to the + next. + + Also, the vpxenc application will now use a keyframe interval of + 5 seconds by default. Use the --kf-max-dist option to override. + + - Speed: + Decoder performance improved 2.5% versus Duclair. Encoder speed is + consistent with Duclair for most material. Two pass encoding of + slideshow-like material will see significant improvements. + + Large realtime encoding speed gains at a small quality expense are + possible by configuring the on-the-fly bitpacking experiment with + --enable-onthefly-bitpacking. Realtime encoder can be up to 13% + faster (ARM) depending on the number of threads and bitrate + settings. This technique sees constant gain over the 5-16 speed + range. For VC style input the loss seen is up to 0.2dB. See commit + 52cf4dca for further details. 
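[Editor's note] A minimal sketch (not part of the vendored changelog) of the two runtime features described in the Eider notes above: enabling the temporal denoiser via VP8E_SET_NOISE_SENSITIVITY and changing resolution mid-stream with vpx_codec_enc_config_set(). The 320x180 target size is a placeholder:

    #include <vpx/vpx_encoder.h>
    #include <vpx/vp8cx.h>

    // Assumes `ctx` was initialized with vpx_codec_vp8_cx() and `cfg`
    // holds its current configuration.
    int reconfigure(vpx_codec_ctx_t *ctx, vpx_codec_enc_cfg_t *cfg) {
        // Temporal denoiser: effectively boolean (0 = off, 1-6 = on;
        // the range is kept for compatibility with the spatial denoiser).
        if (vpx_codec_control(ctx, VP8E_SET_NOISE_SENSITIVITY, 1))
            return -1;
        // Change the input resolution without reinitializing the codec,
        // which Eider made possible.
        cfg->g_w = 320;
        cfg->g_h = 180;
        return vpx_codec_enc_config_set(ctx, cfg) ? -1 : 0;
    }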
+ + - Quality: + On the whole, quality is consistent with the Duclair release. Some + tweaks: + + Reduced blockiness in easy sections by applying a penalty to + intra modes. + + Improved quality of static sections (like slideshows) with + two pass encoding. + + Improved keyframe sizing with multiple temporal layers + + - Bug Fixes: + Corrected alt-ref contribution to frame rate for visible updates + to the alt-ref buffer. This affected applications making manual + usage of the frame reference flags, or temporal layers. + + Additional constraints were added to disable multi-frame quality + enhancement (MFQE) in sections of the frame where there is motion. + (#392) + + Fixed corruption issues when vpx_codec_enc_config_set() was called + with spatial resampling enabled. + + Fixed a decoder error introduced in Duclair where the segmentation + map was not being reinitialized on keyframes (#378) + + +2012-01-27 v1.0.0 "Duclair" + Our fourth named release, focused on performance and features related to + real-time encoding. It also fixes a decoder crash bug introduced in + v0.9.7, so all users of that release are encouraged to upgrade. + + - Upgrading: + This release is ABI incompatible with prior releases of libvpx, so the + "major" version number has been bumped to 1. You must recompile your + applications against the latest version of the libvpx headers. The + API remains compatible, and this should not require code changes in most + applications. + + - Enhancements: + This release introduces several substantial new features to the encoder, + of particular interest to real time streaming applications. + + Temporal scalability allows the encoder to produce a stream that can + be decimated to different frame rates, with independent rate targeting + for each substream. + + Multiframe quality enhancement postprocessing can make visual quality + more consistent in the presence of frames that are substantially + different quality than the surrounding frames, as in the temporal + scalability case and in some forced keyframe scenarios. + + Multiple-resolution encoding support allows the encoding of the + same content at different resolutions faster than encoding them + separately. + + - Speed: + Optimization targets for this release included the decoder and the real- + time modes of the encoder. Decoder speed on x86 has improved 10.5% with + this release. Encoder improvements followed a curve where speeds 1-3 + improved 4.0%-1.5%, speeds 4-8 improved <1%, and speeds 9-16 improved + 1.5% to 10.5%, respectively. "Best" mode speed is consistent with the + Cayuga release. + + - Quality: + Encoder quality in the single stream case is consistent with the Cayuga + release. + + - Bug Fixes: + This release fixes an OOB read decoder crash bug present in v0.9.7 + related to the clamping of motion vectors in SPLITMV blocks. This + behavior could be triggered by corrupt input or by starting + decoding from a P-frame. + + +2011-08-15 v0.9.7-p1 "Cayuga" patch 1 + This is an incremental bugfix release against Cayuga. All users of that + release are strongly encouraged to upgrade. + + - Fix potential OOB reads (cdae03a) + + An unbounded out of bounds read was discovered when the + decoder was requested to perform error concealment (new in + Cayuga) given a frame with corrupt partition sizes. + + A bounded out of bounds read was discovered affecting all + versions of libvpx. 
Given a multipartition input frame that + is truncated between the mode/mv partition and the first + residual partition (in the block of partition offsets), up + to 3 extra bytes could have been read from the source buffer. + The code will not take any action regardless of the contents + of these undefined bytes, as the truncated buffer is detected + immediately following the read based on the calculated + starting position of the coefficient partition. + + - Fix potential error concealment crash when the very first frame + is missing or corrupt (a609be5) + + - Fix significant artifacts in error concealment (a4c2211, 99d870a) + + - Revert 1-pass CBR rate control changes (e961317) + Further testing showed this change produced undesirable visual + artifacts, rolling back for now. + + +2011-08-02 v0.9.7 "Cayuga" + Our third named release, focused on a faster, higher quality, encoder. + + - Upgrading: + This release is backwards compatible with Aylesbury (v0.9.5) and + Bali (v0.9.6). Users of older releases should refer to the Upgrading + notes in this document for that release. + + - Enhancements: + Stereo 3D format support for vpxenc + Runtime detection of available processor cores. + Allow specifying --end-usage by enum name + vpxdec: test for frame corruption + vpxenc: add quantizer histogram display + vpxenc: add rate histogram display + Set VPX_FRAME_IS_DROPPABLE + update configure for ios sdk 4.3 + Avoid text relocations in ARM vp8 decoder + Generate a vpx.pc file for pkg-config. + New ways of passing encoded data between encoder and decoder. + + - Speed: + This release includes across-the-board speed improvements to the + encoder. On x86, these measure at approximately 11.5% in Best mode, + 21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6). + On ARM Cortex A9 with Neon extensions, real-time encoding of video + telephony content is 35% faster than Bali on single core and 48% + faster on multi-core. On the NVidia Tegra2 platform, real time + encoding is 40% faster than Bali. + + Decoder speed was not a priority for this release, but improved + approximately 8.4% on x86. + + Reduce motion vector search on alt-ref frame. + Encoder loopfilter running in its own thread + Reworked loopfilter to precalculate more parameters + SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}(). + Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3. + Removed redundant checks + Reduced structure sizes + utilize preload in ARMv6 MC/LPF/Copy routines + ARM optimized quantization, dfct, variance, subtract + Increase chrow row alignment to 16 bytes. + disable trellis optimization for first pass + Write SSSE3 sub-pixel filter function + Improve SSE2 half-pixel filter functions + Add vp8_sub_pixel_variance16x8_ssse3 function + Reduce unnecessary distortion computation + Use diamond search to replace full search + Preload reference area in sub-pixel motion search (real-time mode) + + - Quality: + This release focused primarily on one-pass use cases, including + video conferencing. Low latency data rate control was significantly + improved, improving streamability over bandwidth constrained links. + Added support for error concealment, allowing frames to maintain + visual quality in the presence of substantial packet loss. + + Add rc_max_intra_bitrate_pct control + Limit size of initial keyframe in one-pass. + Improve framerate adaptation + Improved 1-pass CBR rate control + Improved KF insertion after fades to still. + Improved key frame detection. 
+ Improved activity masking (lower PSNR impact for same SSIM boost) + Improved interaction between GF and ARFs + Adding error-concealment to the decoder. + Adding support for independent partitions + Adjusted rate-distortion constants + + + - Bug Fixes: + Removed firstpass motion map + Fix parallel make install + Fix multithreaded encoding for 1 MB wide frame + Fixed iwalsh_neon build problems with RVDS4.1 + Fix semaphore emulation, spin-wait intrinsics on Windows + Fix build with xcode4 and simplify GLOBAL. + Mark ARM asm objects as allowing a non-executable stack. + Fix vpxenc encoding incorrect webm file header on big endian + + +2011-03-07 v0.9.6 "Bali" + Our second named release, focused on a faster, higher quality, encoder. + + - Upgrading: + This release is backwards compatible with Aylesbury (v0.9.5). Users + of older releases should refer to the Upgrading notes in this + document for that release. + + - Enhancements: + vpxenc --psnr shows a summary when encode completes + --tune=ssim option to enable activity masking + improved postproc visualizations for development + updated support for Apple iOS to SDK 4.2 + query decoder to determine which reference frames were updated + implemented error tracking in the decoder + fix pipe support on windows + + - Speed: + Primary focus was on good quality mode, speed 0. Average improvement + on x86 about 40%, up to 100% on user-generated content at that speed. + Best quality mode speed improved 35%, and realtime speed 10-20%. This + release also saw significant improvement in realtime encoding speed + on ARM platforms. + + Improved encoder threading + Don't pick encoder filter level when loopfilter is disabled. + Avoid double copying of key frames into alt and golden buffer + FDCT optimizations. + x86 sse2 temporal filter + SSSE3 version of fast quantizer + vp8_rd_pick_best_mbsegmentation code restructure + Adjusted breakout RD for SPLITMV + Changed segmentation check order + Improved rd_pick_intra4x4block + Adds armv6 optimized variance calculation + ARMv6 optimized sad16x16 + ARMv6 optimized half pixel variance calculations + Full search SAD function optimization in SSE4.1 + Improve MV prediction accuracy to achieve performance gain + Improve MV prediction in vp8_pick_inter_mode() for speed>3 + + - Quality: + Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release + also includes support for "activity masking," which greatly improves + SSIM at the expense of PSNR. For now, this feature is available with + the --tune=ssim option. Further experimentation in this area + is ongoing. This release also introduces a new rate control mode + called "CQ," which changes the allocation of bits within a clip to + the sections where they will have the most visual impact. + + Tuning for the more exact quantizer. + Relax rate control for last few frames + CQ Mode + Limit key frame quantizer for forced key frames. + KF/GF Pulsing + Add simple version of activity masking. + make rdmult adaptive for intra in quantizer RDO + cap the best quantizer for 2nd order DC + change the threshold of DC check for encode breakout + + - Bug Fixes: + Fix crash on Sparc Solaris. + Fix counter of fixed keyframe distance + ARNR filter pointer update bug fix + Fixed use of motion percentage in KF/GF group calc + Changed condition for using RD in Intra Mode + Fix encoder real-time only configuration. + Fix ARM encoder crash with multiple token partitions + Fixed bug first cluster timecode of webm file is wrong. 
+ Fixed various encoder bugs with odd-sized images + vp8e_get_preview fixed when spatial resampling enabled + quantizer: fix assertion in fast quantizer path + Allocate source buffers to be multiples of 16 + Fix for manual Golden frame frequency + Fix drastic undershoot in long form content + + +2010-10-28 v0.9.5 "Aylesbury" + Our first named release, focused on a faster decoder, and a better encoder. + + - Upgrading: + This release incorporates backwards-incompatible changes to the + ivfenc and ivfdec tools. These tools are now called vpxenc and vpxdec. + + vpxdec + * the -q (quiet) option has been removed, and replaced with + -v (verbose). the output is quiet by default. Use -v to see + the version number of the binary. + + * The default behavior is now to write output to a single file + instead of individual frames. The -y option has been removed. + Y4M output is the default. + + * For raw I420/YV12 output instead of Y4M, the --i420 or --yv12 + options must be specified. + + $ ivfdec -o OUTPUT INPUT + $ vpxdec --i420 -o OUTPUT INPUT + + * If an output file is not specified, the default is to write + Y4M to stdout. This makes piping more natural. + + $ ivfdec -y -o - INPUT | ... + $ vpxdec INPUT | ... + + * The output file has additional flexibility for formatting the + filename. It supports escape characters for constructing a + filename from the width, height, and sequence number. This + replaces the -p option. To get the equivalent: + + $ ivfdec -p frame INPUT + $ vpxdec --i420 -o frame-%wx%h-%4.i420 INPUT + + vpxenc + * The output file must be specified with -o, rather than as the + last argument. + + $ ivfenc INPUT OUTPUT + $ vpxenc -o OUTPUT INPUT + + * The output defaults to webm. To get IVF output, use the --ivf + option. + + $ ivfenc INPUT OUTPUT.ivf + $ vpxenc -o OUTPUT.ivf --ivf INPUT + + + - Enhancements: + ivfenc and ivfdec have been renamed to vpxenc, vpxdec. + vpxdec supports .webm input + vpxdec writes .y4m by default + vpxenc writes .webm output by default + vpxenc --psnr now shows the average/overall PSNR at the end + ARM platforms now support runtime cpu detection + vpxdec visualizations added for motion vectors, block modes, references + vpxdec now silent by default + vpxdec --progress shows frame-by-frame timing information + vpxenc supports the distinction between --fps and --timebase + NASM is now a supported assembler + configure: enable PIC for shared libs by default + configure: add --enable-small + configure: support for ppc32-linux-gcc + configure: support for sparc-solaris-gcc + + - Bugs: + Improve handling of invalid frames + Fix valgrind errors in the NEON loop filters. + Fix loopfilter delta zero transitions + Fix valgrind errors in vp8_sixtap_predict8x4_armv6(). + Build fixes for darwin-icc + + - Speed: + 20-40% (average 28%) improvement in libvpx decoder speed, + including: + Rewrite vp8_short_walsh4x4_sse2() + Optimizations on the loopfilters. + Miscellaneous improvements for Atom + Add 4-tap version of 2nd-pass ARMv6 MC filter. + Improved multithread utilization + Better instruction choices on x86 + reorder data to use wider instructions + Update NEON wide idcts + Make block access to frame buffer sequential + Improved subset block search + Bilinear subpixel optimizations for ssse3. 
+ Decrease memory footprint + + Encoder speed improvements (percentage gain not measured): + Skip unnecessary search of identical frames + Add SSE2 subtract functions + Improve bounds checking in vp8_diamond_search_sadx4() + Added vp8_fast_quantize_b_sse2 + + - Quality: + Over 7% overall PSNR improvement (6.3% SSIM) in "best" quality + encoding mode, and up to 60% improvement on very noisy, still + or slow moving source video + + Motion compensated temporal filter for Alt-Ref Noise Reduction + Improved use of trellis quantization on 2nd order Y blocks + Tune effect of motion on KF/GF boost in two pass + Allow coefficient optimization for good quality speed 0. + Improved control of active min quantizer for two pass. + Enable ARFs for non-lagged compress + +2010-09-02 v0.9.2 + - Enhancements: + Disable frame dropping by default + Improved multithreaded performance + Improved Force Key Frame Behaviour + Increased rate control buffer level precision + Fix bug in 1st pass motion compensation + ivfenc: correct fixed kf interval, --disable-kf + - Speed: + Changed above and left context data layout + Rework idct calling structure. + Removed unnecessary MB_MODE_INFO copies + x86: SSSE3 sixtap prediction + Reworked IDCT to include reconstruction (add) step + Swap alt/gold/new/last frame buffer ptrs instead of copying. + Improve SSE2 loopfilter functions + Change bitreader to use a larger window. + Avoid loopfilter reinitialization when possible + - Quality: + Normalize quantizer's zero bin and rounding factors + Add trellis quantization. + Make the quantizer exact. + Updates to ARNR filtering algorithm + Fix breakout thresh computation for golden & AltRef frames + Redo the forward 4x4 dct + Improve the accuracy of forward walsh-hadamard transform + Further adjustment of RD behaviour with Q and Zbin. + - Build System: + Allow linking of libs built with MinGW to MSVC + Fix target auto-detection on mingw32 + Allow --cpu= to work for x86. + configure: pass original arguments through to make dist + Fix builds without runtime CPU detection + msvs: fix install of codec sources + msvs: Change devenv.com command line for better msys support + msvs: Add vs9 targets. + Add x86_64-linux-icc target + - Bugs: + Potential crashes on older MinGW builds + Fix two-pass framerate for Y4M input. + Fixed simple loop filter, other crashes on ARM v6 + arm: fix missing dependency with --enable-shared + configure: support directories containing .o + Replace pinsrw (SSE) with MMX instructions + apple: include proper mach primitives + Fixed rate control bug with long key frame interval. + Fix DSO link errors on x86-64 when not using a version script + Fixed buffer selection for UV in AltRef filtering + + +2010-06-17 v0.9.1 + - Enhancements: + * ivfenc/ivfdec now support YUV4MPEG2 input and pipe I/O + * Speed optimizations + - Bugfixes: + * Rate control + * Prevent out-of-bounds accesses on invalid data + - Build system updates: + * Detect toolchain to be used automatically for native builds + * Support building shared libraries + * Better autotools emulation (--prefix, --libdir, DESTDIR) + - Updated LICENSE + * http://webmproject.blogspot.com/2010/06/changes-to-webm-open-source-license.html + + +2010-05-18 v0.9.0 + - Initial open source release. Welcome to WebM and VP8! + diff --git a/thirdparty/libvpx/LICENSE b/thirdparty/libvpx/LICENSE new file mode 100644 index 000000000000..1ce44343c4a3 --- /dev/null +++ b/thirdparty/libvpx/LICENSE @@ -0,0 +1,31 @@ +Copyright (c) 2010, The WebM Project authors. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google, nor the WebM Project, nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/thirdparty/libvpx/PATENTS b/thirdparty/libvpx/PATENTS new file mode 100644 index 000000000000..caedf607e95a --- /dev/null +++ b/thirdparty/libvpx/PATENTS @@ -0,0 +1,23 @@ +Additional IP Rights Grant (Patents) +------------------------------------ + +"These implementations" means the copyrightable works that implement the WebM +codecs distributed by Google as part of the WebM Project. + +Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, +royalty-free, irrevocable (except as stated in this section) patent license to +make, have made, use, offer to sell, sell, import, transfer, and otherwise +run, modify and propagate the contents of these implementations of WebM, where +such license applies only to those patent claims, both currently owned by +Google and acquired in the future, licensable by Google that are necessarily +infringed by these implementations of WebM. This grant does not include claims +that would be infringed only as a consequence of further modification of these +implementations. If you or your agent or exclusive licensee institute or order +or agree to the institution of patent litigation or any other patent +enforcement activity against any entity (including a cross-claim or +counterclaim in a lawsuit) alleging that any of these implementations of WebM +or any code incorporated within any of these implementations of WebM +constitute direct or contributory patent infringement, or inducement of +patent infringement, then any patent rights granted to you under this License +for these implementations of WebM shall terminate as of the date such +litigation is filed. 
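[Editor's note] The two pre-generated headers that follow implement libvpx's run-time CPU detection (RTCD) dispatch. As a condensed, self-contained sketch of the pattern (editor's illustration with made-up kernel names, not code from the headers): each kernel has a plain-C fallback plus optional SIMD variants behind a function pointer, and setup_rtcd_internal() retargets the pointers once according to the CPU flags reported at startup.

    #include <cstdio>

    // Stand-ins for a kernel's C fallback and its NEON variant.
    static void idct_c()    { std::puts("C fallback"); }
    static void idct_neon() { std::puts("NEON variant"); }

    // Mirrors the RTCD_EXTERN function pointers declared in the headers.
    static void (*idct)() = idct_c;

    // Stand-in for arm_cpu_caps() / the HAS_NEON flag check.
    static bool cpu_has_neon() { return false; }

    // Mirrors setup_rtcd_internal(): default to C, retarget if NEON exists.
    static void setup_rtcd() {
        idct = idct_c;
        if (cpu_has_neon()) idct = idct_neon;
    }

    int main() {
        setup_rtcd();
        idct(); // callers always go through the pointer
        return 0;
    }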
diff --git a/thirdparty/libvpx/rtcd/vp8_rtcd_arm.h b/thirdparty/libvpx/rtcd/vp8_rtcd_arm.h new file mode 100644 index 000000000000..5c9b7aa392a5 --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp8_rtcd_arm.h @@ -0,0 +1,240 @@ +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct loop_filter_info; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_clear_system_state_c(); +#define vp8_clear_system_state vp8_clear_system_state_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, 
unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +void vp8_dequantize_b_neon(struct blockd*, short *dqc); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); + +void 
vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +void vp8_short_inv_walsh4x4_neon(short *input, short *output); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c + +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_neon; +#endif + vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_neon; +#endif + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_neon; +#endif + vp8_copy_mem16x16 = vp8_copy_mem16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_copy_mem16x16 = vp8_copy_mem16x16_neon; +#endif + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_copy_mem8x4 = vp8_copy_mem8x4_neon; +#endif + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_copy_mem8x8 = vp8_copy_mem8x8_neon; +#endif + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_dc_only_idct_add = vp8_dc_only_idct_add_neon; +#endif + vp8_dequant_idct_add = vp8_dequant_idct_add_c; +#if HAVE_NEON + 
if (flags & HAS_NEON) vp8_dequant_idct_add = vp8_dequant_idct_add_neon; +#endif + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon; +#endif + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_neon; +#endif + vp8_dequantize_b = vp8_dequantize_b_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_dequantize_b = vp8_dequantize_b_neon; +#endif + vp8_loop_filter_bh = vp8_loop_filter_bh_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_bh = vp8_loop_filter_bh_neon; +#endif + vp8_loop_filter_bv = vp8_loop_filter_bv_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_bv = vp8_loop_filter_bv_neon; +#endif + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_mbh = vp8_loop_filter_mbh_neon; +#endif + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_mbv = vp8_loop_filter_mbv_neon; +#endif + vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_neon; +#endif + vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_neon; +#endif + vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_simple_mbh = vp8_loop_filter_mbhs_neon; +#endif + vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_loop_filter_simple_mbv = vp8_loop_filter_mbvs_neon; +#endif + vp8_short_idct4x4llm = vp8_short_idct4x4llm_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_short_idct4x4llm = vp8_short_idct4x4llm_neon; +#endif + vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_neon; +#endif + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_neon; +#endif + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_neon; +#endif + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_neon; +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp8_rtcd_c.h b/thirdparty/libvpx/rtcd/vp8_rtcd_c.h new file mode 100644 index 000000000000..d1657ac09d5d --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp8_rtcd_c.h @@ -0,0 +1,117 @@ +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct loop_filter_info; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int 
dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c + +void vp8_clear_system_state_c(); +#define vp8_clear_system_state vp8_clear_system_state_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_c + +void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_c + +void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_c + +void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_c + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +#define vp8_dequantize_b vp8_dequantize_b_c + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_c + +void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_c + +void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c + +void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c + +void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c + +void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define 
vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c + +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c + +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c + +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c + +void vp8_rtcd(void); + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp8_rtcd_x86.h b/thirdparty/libvpx/rtcd/vp8_rtcd_x86.h new file mode 100644 index 000000000000..cbe1f47b2a77 --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp8_rtcd_x86.h @@ -0,0 +1,247 @@ +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct loop_filter_info; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_clear_system_state_c(); +void vpx_reset_mmx_state(); +RTCD_EXTERN void (*vp8_clear_system_state)(); + +void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_mmx(unsigned char *src, int 
src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +void vp8_dequantize_b_mmx(struct blockd*, short *dqc); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void 
vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +void vp8_short_inv_walsh4x4_mmx(short 
*input, short *output); +void vp8_short_inv_walsh4x4_sse2(short *input, short *output); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c; + if (flags & HAS_MMX) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; + vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; + if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx; + vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; + if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx; + vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; + if (flags & HAS_MMX) vp8_bilinear_predict8x8 = 
vp8_bilinear_predict8x8_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; + vp8_clear_system_state = vp8_clear_system_state_c; + if (flags & HAS_MMX) vp8_clear_system_state = vpx_reset_mmx_state; + vp8_copy_mem16x16 = vp8_copy_mem16x16_c; + if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx; + if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx; + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; + if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx; + vp8_dequant_idct_add = vp8_dequant_idct_add_c; + if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx; + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + if (flags & HAS_MMX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx; + if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; + if (flags & HAS_MMX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_mmx; + if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; + vp8_dequantize_b = vp8_dequantize_b_c; + if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx; + vp8_loop_filter_bh = vp8_loop_filter_bh_c; + if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; + vp8_loop_filter_bv = vp8_loop_filter_bv_c; + if (flags & HAS_MMX) vp8_loop_filter_bv = vp8_loop_filter_bv_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2; + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; + if (flags & HAS_MMX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2; + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; + if (flags & HAS_MMX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2; + vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2; + vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2; + vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2; + vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2; + vp8_short_idct4x4llm = vp8_short_idct4x4llm_c; + if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx; + vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c; + if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx; + if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2; + 
vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; + if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx; + if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c; + if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; + if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c; + if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx; + if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; + if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx; + if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp9_rtcd_arm.h b/thirdparty/libvpx/rtcd/vp9_rtcd_arm.h new file mode 100644 index 000000000000..afdc7e179e6a --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp9_rtcd_arm.h @@ -0,0 +1,54 @@ +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp9_iht4x4_16_add = vp9_iht4x4_16_add_neon; +#endif + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; +#if HAVE_NEON + if (flags & HAS_NEON) vp9_iht8x8_64_add = vp9_iht8x8_64_add_neon; +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp9_rtcd_c.h b/thirdparty/libvpx/rtcd/vp9_rtcd_c.h new file mode 100644 index 000000000000..329cb9d04c82 --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp9_rtcd_c.h @@ -0,0 +1,41 @@ +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c + 
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c + +void vp9_rtcd(void); + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vp9_rtcd_x86.h b/thirdparty/libvpx/rtcd/vp9_rtcd_x86.h new file mode 100644 index 000000000000..8ce8067674d1 --- /dev/null +++ b/thirdparty/libvpx/rtcd/vp9_rtcd_x86.h @@ -0,0 +1,55 @@ +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); + +void vp9_rtcd(void); + +#ifdef RTCD_C +#include "vpx_ports/x86.h" +static void setup_rtcd_internal(void) +{ + int flags = x86_simd_caps(); + + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2; + + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; + if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2; + + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; + if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_arm.h b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_arm.h new file mode 100644 index 000000000000..df42f3d24aea --- /dev/null +++ b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_arm.h @@ -0,0 +1,678 @@ +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void 
vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void 
vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void 
vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c + +void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c + +void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c + +void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c + +void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c + +void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define 
vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c + +void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_neon(uint8_t 
*dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void 
(*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); 
+RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_edge_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); 
+void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, 
int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+void vpx_dsp_rtcd(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    vpx_convolve8 = vpx_convolve8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_convolve8 = vpx_convolve8_neon;
+#endif
+    vpx_convolve8_avg = vpx_convolve8_avg_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_convolve8_avg = vpx_convolve8_avg_neon;
+#endif
+    vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon;
+#endif
+    vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon;
+#endif
+    vpx_convolve8_horiz = vpx_convolve8_horiz_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_convolve8_horiz = vpx_convolve8_horiz_neon;
+#endif
+    vpx_convolve8_vert = vpx_convolve8_vert_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_convolve8_vert = vpx_convolve8_vert_neon;
+#endif
+    vpx_convolve_avg = vpx_convolve_avg_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_convolve_avg = vpx_convolve_avg_neon;
+#endif
+    vpx_convolve_copy = vpx_convolve_copy_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_convolve_copy = vpx_convolve_copy_neon;
+#endif
+    vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_neon;
+#endif
+    vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_neon;
+#endif
+    vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_neon;
+#endif
+    vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_neon;
+#endif
+    vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_neon;
+#endif
+    vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_neon;
+#endif
+    vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_neon;
+#endif
+    vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_neon;
+#endif
+    vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_neon;
+#endif
+    vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_neon;
+#endif
+    vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_neon;
+#endif
+    vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_neon;
+#endif
+    vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_neon;
+#endif
+    vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_neon;
+#endif
+    vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_neon;
+#endif
+    vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_neon;
+#endif
+    vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_neon;
+#endif
+    vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_neon;
+#endif
+    vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_neon;
+#endif
+    vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_neon;
+#endif
+    vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_neon;
+#endif
+    vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_neon;
+#endif
+    vpx_h_predictor_4x4 = vpx_h_predictor_4x4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_h_predictor_4x4 = vpx_h_predictor_4x4_neon;
+#endif
+    vpx_h_predictor_8x8 = vpx_h_predictor_8x8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_neon;
+#endif
+    vpx_idct16x16_10_add = vpx_idct16x16_10_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct16x16_10_add = vpx_idct16x16_10_add_neon;
+#endif
+    vpx_idct16x16_1_add = vpx_idct16x16_1_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct16x16_1_add = vpx_idct16x16_1_add_neon;
+#endif
+    vpx_idct16x16_256_add = vpx_idct16x16_256_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct16x16_256_add = vpx_idct16x16_256_add_neon;
+#endif
+    vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_neon;
+#endif
+    vpx_idct32x32_135_add = vpx_idct32x32_135_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_neon;
+#endif
+    vpx_idct32x32_1_add = vpx_idct32x32_1_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct32x32_1_add = vpx_idct32x32_1_add_neon;
+#endif
+    vpx_idct32x32_34_add = vpx_idct32x32_34_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct32x32_34_add = vpx_idct32x32_1024_add_neon;
+#endif
+    vpx_idct4x4_16_add = vpx_idct4x4_16_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct4x4_16_add = vpx_idct4x4_16_add_neon;
+#endif
+    vpx_idct4x4_1_add = vpx_idct4x4_1_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct4x4_1_add = vpx_idct4x4_1_add_neon;
+#endif
+    vpx_idct8x8_12_add = vpx_idct8x8_12_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct8x8_12_add = vpx_idct8x8_12_add_neon;
+#endif
+    vpx_idct8x8_1_add = vpx_idct8x8_1_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct8x8_1_add = vpx_idct8x8_1_add_neon;
+#endif
+    vpx_idct8x8_64_add = vpx_idct8x8_64_add_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_idct8x8_64_add = vpx_idct8x8_64_add_neon;
+#endif
+    vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_neon;
+#endif
+    vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_neon;
+#endif
+    vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_neon;
+#endif
+    vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_neon;
+#endif
+    vpx_lpf_horizontal_edge_16 = vpx_lpf_horizontal_edge_16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_horizontal_edge_16 = vpx_lpf_horizontal_edge_16_neon;
+#endif
+    vpx_lpf_horizontal_edge_8 = vpx_lpf_horizontal_edge_8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_horizontal_edge_8 = vpx_lpf_horizontal_edge_8_neon;
+#endif
+    vpx_lpf_vertical_16 = vpx_lpf_vertical_16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_vertical_16 = vpx_lpf_vertical_16_neon;
+#endif
+    vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_neon;
+#endif
+    vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_neon;
+#endif
+    vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_neon;
+#endif
+    vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_neon;
+#endif
+    vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_neon;
+#endif
+    vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_neon;
+#endif
+    vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_neon;
+#endif
+    vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_neon;
+#endif
+    vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_neon;
+#endif
+    vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_neon;
+#endif
+    vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_neon;
+#endif
+    vpx_v_predictor_4x4 = vpx_v_predictor_4x4_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_v_predictor_4x4 = vpx_v_predictor_4x4_neon;
+#endif
+    vpx_v_predictor_8x8 = vpx_v_predictor_8x8_c;
+#if HAVE_NEON
+    if (flags & HAS_NEON) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_neon;
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_c.h b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_c.h
new file mode 100644
index 000000000000..9fcc2f006611
--- /dev/null
+++ b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_c.h
@@ -0,0 +1,355 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_c
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_c
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_c
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_c
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_c
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_c
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c
+
+void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_edge_16 vpx_lpf_horizontal_edge_16_c
+
+void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_edge_8 vpx_lpf_horizontal_edge_8_c
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+void vpx_dsp_rtcd(void);
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_x86.h b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_x86.h
new file mode 100644
index 000000000000..c2a68330acbc
--- /dev/null
+++ b/thirdparty/libvpx/rtcd/vpx_dsp_rtcd_x86.h
@@ -0,0 +1,604 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_edge_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_horizontal_edge_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+void vpx_dsp_rtcd(void);
+
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = x86_simd_caps();
+
+    vpx_convolve8 = vpx_convolve8_c;
+    if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2;
+    if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
+    vpx_convolve8_avg = vpx_convolve8_avg_c;
+    if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2;
+    if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3;
+    vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c;
+    if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2;
+    if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3;
+
vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c;
+    if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2;
+    if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3;
+    vpx_convolve8_horiz = vpx_convolve8_horiz_c;
+    if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2;
+    if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3;
+    vpx_convolve8_vert = vpx_convolve8_vert_c;
+    if (flags & HAS_SSE2) vpx_convolve8_vert = vpx_convolve8_vert_sse2;
+    if (flags & HAS_SSSE3) vpx_convolve8_vert = vpx_convolve8_vert_ssse3;
+    vpx_convolve_avg = vpx_convolve_avg_c;
+    if (flags & HAS_SSE2) vpx_convolve_avg = vpx_convolve_avg_sse2;
+    vpx_convolve_copy = vpx_convolve_copy_c;
+    if (flags & HAS_SSE2) vpx_convolve_copy = vpx_convolve_copy_sse2;
+    vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_ssse3;
+    vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_ssse3;
+    vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_ssse3;
+    vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_ssse3;
+    vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_ssse3;
+    vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_ssse3;
+    vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_c;
+    if (flags & HAS_SSE2) vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_sse2;
+    vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_ssse3;
+    vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_ssse3;
+    vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_ssse3;
+    vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c;
+    if (flags & HAS_SSE2) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_sse2;
+    vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c;
+    if (flags & HAS_SSE2) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_sse2;
+    vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_ssse3;
+    vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_ssse3;
+    vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3;
+    vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3;
+    vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c;
+    if (flags & HAS_SSE2) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_sse2;
+    vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c;
+    if (flags & HAS_SSE2) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_sse2;
+    vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_c;
+    if (flags & HAS_SSE2) vpx_dc_128_predictor_4x4 = vpx_dc_128_predictor_4x4_sse2;
+    vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_c;
+    if (flags & HAS_SSE2) vpx_dc_128_predictor_8x8 = vpx_dc_128_predictor_8x8_sse2;
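+    /*
+     * (Editor's illustrative note, not part of the generated header.) Every
+     * dispatch in this function follows the same runtime-CPU-detection idiom:
+     * bind the pointer to the portable C kernel first, then overwrite it with
+     * the best SIMD variant the CPU reports, letting the later checks win.
+     * For a hypothetical kernel `vpx_foo` the pattern reads:
+     *
+     *   vpx_foo = vpx_foo_c;                            // always-safe fallback
+     *   if (flags & HAS_SSE2)  vpx_foo = vpx_foo_sse2;
+     *   if (flags & HAS_SSSE3) vpx_foo = vpx_foo_ssse3; // preferred if present
+     */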
vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_sse2; + vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_sse2; + vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_4x4 = vpx_dc_left_predictor_4x4_sse2; + vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_left_predictor_8x8 = vpx_dc_left_predictor_8x8_sse2; + vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_sse2; + vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_sse2; + vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_predictor_4x4 = vpx_dc_predictor_4x4_sse2; + vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_sse2; + vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_sse2; + vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_sse2; + vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_sse2; + vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2; + vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_sse2; + vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_sse2; + vpx_h_predictor_4x4 = vpx_h_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_h_predictor_4x4 = vpx_h_predictor_4x4_sse2; + vpx_h_predictor_8x8 = vpx_h_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2; + vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_10_add = vpx_idct16x16_10_add_sse2; + vpx_idct16x16_1_add = vpx_idct16x16_1_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; + vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; + if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2; + vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; + if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2; + vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; + if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2; + vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; + if (flags & HAS_SSE2) 
vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2; + vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_c; + if (flags & HAS_SSE2) vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_sse2; + vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_sse2; + vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_sse2; + vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_sse2; + vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_sse2; + vpx_lpf_horizontal_edge_16 = vpx_lpf_horizontal_edge_16_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_edge_16 = vpx_lpf_horizontal_edge_16_sse2; + vpx_lpf_horizontal_edge_8 = vpx_lpf_horizontal_edge_8_c; + if (flags & HAS_SSE2) vpx_lpf_horizontal_edge_8 = vpx_lpf_horizontal_edge_8_sse2; + vpx_lpf_vertical_16 = vpx_lpf_vertical_16_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16 = vpx_lpf_vertical_16_sse2; + vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_sse2; + vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_sse2; + vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_sse2; + vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_sse2; + vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; + if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_sse2; + vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_sse2; + vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_tm_predictor_4x4 = vpx_tm_predictor_4x4_sse2; + vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_tm_predictor_8x8 = vpx_tm_predictor_8x8_sse2; + vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c; + if (flags & HAS_SSE2) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_sse2; + vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c; + if (flags & HAS_SSE2) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_sse2; + vpx_v_predictor_4x4 = vpx_v_predictor_4x4_c; + if (flags & HAS_SSE2) vpx_v_predictor_4x4 = vpx_v_predictor_4x4_sse2; + vpx_v_predictor_8x8 = vpx_v_predictor_8x8_c; + if (flags & HAS_SSE2) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_sse2; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/third_party/android/cpu-features.c b/thirdparty/libvpx/third_party/android/cpu-features.c new file mode 100644 index 000000000000..e2bd749b016e --- /dev/null +++ b/thirdparty/libvpx/third_party/android/cpu-features.c @@ -0,0 +1,1313 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* ChangeLog for this library:
+ *
+ * NDK r10e?: Add MIPS MSA feature.
+ *
+ * NDK r10: Support for 64-bit CPUs (Intel, ARM & MIPS).
+ *
+ * NDK r8d: Add android_setCpu().
+ *
+ * NDK r8c: Add new ARM CPU features: VFPv2, VFP_D32, VFP_FP16,
+ *          VFP_FMA, NEON_FMA, IDIV_ARM, IDIV_THUMB2 and iWMMXt.
+ *
+ *          Rewrite the code to parse /proc/self/auxv instead of
+ *          the "Features" field in /proc/cpuinfo.
+ *
+ *          Dynamically allocate the buffer that holds the content
+ *          of /proc/cpuinfo to deal with newer hardware.
+ *
+ * NDK r7c: Fix CPU count computation. The old method only reported the
+ *          number of _active_ CPUs when the library was initialized,
+ *          which could be less than the real total.
+ *
+ * NDK r5: Handle buggy kernels which report a CPU Architecture number of 7
+ *         for an ARMv6 CPU (see below).
+ *
+ *         Handle kernels that only report 'neon', and not 'vfpv3'
+ *         (VFPv3 is mandated by the ARM architecture if NEON is implemented)
+ *
+ *         Handle kernels that only report 'vfpv3d16', and not 'vfpv3'
+ *
+ *         Fix x86 compilation. Report ANDROID_CPU_FAMILY_X86 in
+ *         android_getCpuFamily().
+ *
+ * NDK r4: Initial release
+ */
+
+#include "cpu-features.h"
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/system_properties.h>
+#include <unistd.h>
+
+static  pthread_once_t     g_once;
+static  int                g_inited;
+static  AndroidCpuFamily   g_cpuFamily;
+static  uint64_t           g_cpuFeatures;
+static  int                g_cpuCount;
+
+#ifdef __arm__
+static  uint32_t           g_cpuIdArm;
+#endif
+
+static const int android_cpufeatures_debug = 0;
+
+#define D(...) \
+    do { \
+        if (android_cpufeatures_debug) { \
+            printf(__VA_ARGS__); fflush(stdout); \
+        } \
+    } while (0)
+
+#ifdef __i386__
+static __inline__ void x86_cpuid(int func, int values[4])
+{
+    int a, b, c, d;
+    /* We need to preserve ebx since we're compiling PIC code */
+    /* this means we can't use "=b" for the second output register */
+    __asm__ __volatile__ ( \
+      "push %%ebx\n"
+      "cpuid\n" \
+      "mov %%ebx, %1\n"
+      "pop %%ebx\n"
+      : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+      : "a" (func) \
+    );
+    values[0] = a;
+    values[1] = b;
+    values[2] = c;
+    values[3] = d;
+}
+#elif defined(__x86_64__)
+static __inline__ void x86_cpuid(int func, int values[4])
+{
+    int64_t a, b, c, d;
+    /* We need to preserve ebx since we're compiling PIC code */
+    /* this means we can't use "=b" for the second output register */
+    __asm__ __volatile__ ( \
+      "push %%rbx\n"
+      "cpuid\n" \
+      "mov %%rbx, %1\n"
+      "pop %%rbx\n"
+      : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+      : "a" (func) \
+    );
+    values[0] = a;
+    values[1] = b;
+    values[2] = c;
+    values[3] = d;
+}
+#endif
+
+/* Get the size of a file by reading it until the end. This is needed
+ * because files under /proc do not always return a valid size when
+ * using fseek(0, SEEK_END) + ftell(). Nor can they be mmap()-ed.
+ */
+static int
+get_file_size(const char* pathname)
+{
+
+    int fd, result = 0;
+    char buffer[256];
+
+    fd = open(pathname, O_RDONLY);
+    if (fd < 0) {
+        D("Can't open %s: %s\n", pathname, strerror(errno));
+        return -1;
+    }
+
+    for (;;) {
+        int ret = read(fd, buffer, sizeof buffer);
+        if (ret < 0) {
+            if (errno == EINTR)
+                continue;
+            D("Error while reading %s: %s\n", pathname, strerror(errno));
+            break;
+        }
+        if (ret == 0)
+            break;
+
+        result += ret;
+    }
+    close(fd);
+    return result;
+}
+
+/* Read the content of /proc/cpuinfo into a user-provided buffer.
+ * Return the length of the data, or -1 on error. Does *not*
+ * zero-terminate the content. Will not read more
+ * than 'buffsize' bytes.
+ */
+static int
+read_file(const char* pathname, char* buffer, size_t buffsize)
+{
+    int fd, count;
+
+    fd = open(pathname, O_RDONLY);
+    if (fd < 0) {
+        D("Could not open %s: %s\n", pathname, strerror(errno));
+        return -1;
+    }
+    count = 0;
+    while (count < (int)buffsize) {
+        int ret = read(fd, buffer + count, buffsize - count);
+        if (ret < 0) {
+            if (errno == EINTR)
+                continue;
+            D("Error while reading from %s: %s\n", pathname, strerror(errno));
+            if (count == 0)
+                count = -1;
+            break;
+        }
+        if (ret == 0)
+            break;
+        count += ret;
+    }
+    close(fd);
+    return count;
+}
+
+#ifdef __arm__
+/* Extract the content of the first occurrence of a given field in
+ * the content of /proc/cpuinfo and return it as a heap-allocated
+ * string that must be freed by the caller.
+ *
+ * Return NULL if not found
+ */
+static char*
+extract_cpuinfo_field(const char* buffer, int buflen, const char* field)
+{
+    int fieldlen = strlen(field);
+    const char* bufend = buffer + buflen;
+    char* result = NULL;
+    int len;
+    const char *p, *q;
+
+    /* Look for the first field occurrence, and ensure it starts the line.
     */
+    p = buffer;
+    for (;;) {
+        p = memmem(p, bufend-p, field, fieldlen);
+        if (p == NULL)
+            goto EXIT;
+
+        if (p == buffer || p[-1] == '\n')
+            break;
+
+        p += fieldlen;
+    }
+
+    /* Skip to the first colon followed by a space */
+    p += fieldlen;
+    p = memchr(p, ':', bufend-p);
+    if (p == NULL || p[1] != ' ')
+        goto EXIT;
+
+    /* Find the end of the line */
+    p += 2;
+    q = memchr(p, '\n', bufend-p);
+    if (q == NULL)
+        q = bufend;
+
+    /* Copy the line into a heap-allocated buffer */
+    len = q-p;
+    result = malloc(len+1);
+    if (result == NULL)
+        goto EXIT;
+
+    memcpy(result, p, len);
+    result[len] = '\0';
+
+EXIT:
+    return result;
+}
+
+/* Checks that a space-separated list of items contains one given 'item'.
+ * Returns 1 if found, 0 otherwise.
+ */
+static int
+has_list_item(const char* list, const char* item)
+{
+    const char* p = list;
+    int itemlen = strlen(item);
+
+    if (list == NULL)
+        return 0;
+
+    while (*p) {
+        const char* q;
+
+        /* skip spaces */
+        while (*p == ' ' || *p == '\t')
+            p++;
+
+        /* find end of current list item */
+        q = p;
+        while (*q && *q != ' ' && *q != '\t')
+            q++;
+
+        if (itemlen == q-p && !memcmp(p, item, itemlen))
+            return 1;
+
+        /* skip to next item */
+        p = q;
+    }
+    return 0;
+}
+#endif /* __arm__ */
+
+/* Parse a number starting from 'input', but not going further
+ * than 'limit'. Return the value into '*result'.
+ *
+ * NOTE: Does not skip over leading spaces, or deal with sign characters.
+ * NOTE: Ignores overflows.
+ *
+ * The function returns NULL in case of error (bad format), or the new
+ * position after the decimal number in case of success (which will always
+ * be <= 'limit').
+ */
+static const char*
+parse_number(const char* input, const char* limit, int base, int* result)
+{
+    const char* p = input;
+    int val = 0;
+    while (p < limit) {
+        int d = (*p - '0');
+        if ((unsigned)d >= 10U) {
+            d = (*p - 'a');
+            if ((unsigned)d >= 6U)
+                d = (*p - 'A');
+            if ((unsigned)d >= 6U)
+                break;
+            d += 10;
+        }
+        if (d >= base)
+            break;
+        val = val*base + d;
+        p++;
+    }
+    if (p == input)
+        return NULL;
+
+    *result = val;
+    return p;
+}
+
+static const char*
+parse_decimal(const char* input, const char* limit, int* result)
+{
+    return parse_number(input, limit, 10, result);
+}
+
+#ifdef __arm__
+static const char*
+parse_hexadecimal(const char* input, const char* limit, int* result)
+{
+    return parse_number(input, limit, 16, result);
+}
+#endif /* __arm__ */
+
+/* This small data type is used to represent a CPU list / mask, as read
+ * from sysfs on Linux. See http://www.kernel.org/doc/Documentation/cputopology.txt
+ *
+ * For now, we don't expect more than 32 cores on mobile devices, so keep
+ * everything simple.
+ */
+typedef struct {
+    uint32_t mask;
+} CpuList;
+
+static __inline__ void
+cpulist_init(CpuList* list) {
+    list->mask = 0;
+}
+
+static __inline__ void
+cpulist_and(CpuList* list1, CpuList* list2) {
+    list1->mask &= list2->mask;
+}
+
+static __inline__ void
+cpulist_set(CpuList* list, int index) {
+    if ((unsigned)index < 32) {
+        list->mask |= (uint32_t)(1U << index);
+    }
+}
+
+static __inline__ int
+cpulist_count(CpuList* list) {
+    return __builtin_popcount(list->mask);
+}
+
+/* Parse a textual list of cpus and store the result inside a CpuList object.
+ * Input format is the following:
+ * - comma-separated list of items (no spaces)
+ * - each item is either a single decimal number (cpu index), or a range made
+ *   of two numbers separated by a single dash (-). Ranges are inclusive.
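+ *   (Editor's worked illustration, not in the NDK sources: the line
+ *   "0,2-4\n" selects cpus {0, 2, 3, 4}, i.e. mask 0b11101.)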
+ *
+ * Examples:   0
+ *             2,4-127,128-143
+ *             0-1
+ */
+static void
+cpulist_parse(CpuList* list, const char* line, int line_len)
+{
+    const char* p = line;
+    const char* end = p + line_len;
+    const char* q;
+
+    /* NOTE: the input line coming from sysfs typically contains a
+     * trailing newline, so take care of it in the code below
+     */
+    while (p < end && *p != '\n')
+    {
+        int val, start_value, end_value;
+
+        /* Find the end of current item, and put it into 'q' */
+        q = memchr(p, ',', end-p);
+        if (q == NULL) {
+            q = end;
+        }
+
+        /* Get first value */
+        p = parse_decimal(p, q, &start_value);
+        if (p == NULL)
+            goto BAD_FORMAT;
+
+        end_value = start_value;
+
+        /* If we're not at the end of the item, expect a dash and
+         * an integer; extract end value.
+         */
+        if (p < q && *p == '-') {
+            p = parse_decimal(p+1, q, &end_value);
+            if (p == NULL)
+                goto BAD_FORMAT;
+        }
+
+        /* Set the CPU list bits */
+        for (val = start_value; val <= end_value; val++) {
+            cpulist_set(list, val);
+        }
+
+        /* Jump to next item */
+        p = q;
+        if (p < end)
+            p++;
+    }
+
+BAD_FORMAT:
+    ;
+}
+
+/* Read a CPU list from one sysfs file */
+static void
+cpulist_read_from(CpuList* list, const char* filename)
+{
+    char file[64];
+    int filelen;
+
+    cpulist_init(list);
+
+    filelen = read_file(filename, file, sizeof file);
+    if (filelen < 0) {
+        D("Could not read %s: %s\n", filename, strerror(errno));
+        return;
+    }
+
+    cpulist_parse(list, file, filelen);
+}
+#if defined(__aarch64__)
+// see kernel header
+#define HWCAP_FP (1 << 0)
+#define HWCAP_ASIMD (1 << 1)
+#define HWCAP_AES (1 << 3)
+#define HWCAP_PMULL (1 << 4)
+#define HWCAP_SHA1 (1 << 5)
+#define HWCAP_SHA2 (1 << 6)
+#define HWCAP_CRC32 (1 << 7)
+#endif
+
+#if defined(__arm__)
+
+// See kernel header.
+#define HWCAP_VFP (1 << 6)
+#define HWCAP_IWMMXT (1 << 9)
+#define HWCAP_NEON (1 << 12)
+#define HWCAP_VFPv3 (1 << 13)
+#define HWCAP_VFPv3D16 (1 << 14)
+#define HWCAP_VFPv4 (1 << 16)
+#define HWCAP_IDIVA (1 << 17)
+#define HWCAP_IDIVT (1 << 18)
+
+// see kernel header
+#define HWCAP2_AES (1 << 0)
+#define HWCAP2_PMULL (1 << 1)
+#define HWCAP2_SHA1 (1 << 2)
+#define HWCAP2_SHA2 (1 << 3)
+#define HWCAP2_CRC32 (1 << 4)
+
+// This is the list of 32-bit ARMv7 optional features that are _always_
+// supported by ARMv8 CPUs, as mandated by the ARM Architecture Reference
+// Manual.
+#define HWCAP_SET_FOR_ARMV8 \
+  ( HWCAP_VFP | \
+    HWCAP_NEON | \
+    HWCAP_VFPv3 | \
+    HWCAP_VFPv4 | \
+    HWCAP_IDIVA | \
+    HWCAP_IDIVT )
+#endif
+
+#if defined(__mips__)
+// see kernel header
+#define HWCAP_MIPS_R6 (1 << 0)
+#define HWCAP_MIPS_MSA (1 << 1)
+#endif
+
+#if defined(__arm__) || defined(__aarch64__) || defined(__mips__)
+
+#define AT_HWCAP 16
+#define AT_HWCAP2 26
+
+// Probe the system's C library for a 'getauxval' function and call it if
+// it exists, or return 0 for failure. This function is available since API
+// level 20.
+//
+// This code does *NOT* check for '__ANDROID_API__ >= 20' to support the
+// edge case where some NDK developers use headers for a platform that is
+// newer than the one really targeted by their application.
+// This is typically done to use newer native APIs only when running on more
+// recent Android versions, and requires careful symbol management.
+//
+// Note that getauxval() can't really be re-implemented here, because
+// its implementation does not parse /proc/self/auxv. Instead it depends
+// on values that are passed by the kernel at process-init time to the
+// C runtime initialization layer.
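+//
+// (Editor's illustrative aside, not part of the NDK sources.) On platforms
+// where the C library is known to export getauxval() directly, this probe
+// collapses to the one-liner:
+//
+//   uint32_t hwcaps = (uint32_t) getauxval(AT_HWCAP);  /* needs <sys/auxv.h> */
+//
+// The dlopen()/dlsym() dance below exists precisely because this code cannot
+// assume such a libc at build time.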
+static uint32_t
+get_elf_hwcap_from_getauxval(int hwcap_type) {
+    typedef unsigned long getauxval_func_t(unsigned long);
+
+    dlerror();
+    void* libc_handle = dlopen("libc.so", RTLD_NOW);
+    if (!libc_handle) {
+        D("Could not dlopen() C library: %s\n", dlerror());
+        return 0;
+    }
+
+    uint32_t ret = 0;
+    getauxval_func_t* func = (getauxval_func_t*)
+            dlsym(libc_handle, "getauxval");
+    if (!func) {
+        D("Could not find getauxval() in C library\n");
+    } else {
+        // Note: getauxval() returns 0 on failure. Doesn't touch errno.
+        ret = (uint32_t)(*func)(hwcap_type);
+    }
+    dlclose(libc_handle);
+    return ret;
+}
+#endif
+
+#if defined(__arm__)
+// Parse /proc/self/auxv to extract the ELF HW capabilities bitmap for the
+// current CPU. Note that this file is not accessible from regular
+// application processes on some Android platform releases.
+// On success, return new ELF hwcaps, or 0 on failure.
+static uint32_t
+get_elf_hwcap_from_proc_self_auxv(void) {
+    const char filepath[] = "/proc/self/auxv";
+    int fd = TEMP_FAILURE_RETRY(open(filepath, O_RDONLY));
+    if (fd < 0) {
+        D("Could not open %s: %s\n", filepath, strerror(errno));
+        return 0;
+    }
+
+    struct { uint32_t tag; uint32_t value; } entry;
+
+    uint32_t result = 0;
+    for (;;) {
+        int ret = TEMP_FAILURE_RETRY(read(fd, (char*)&entry, sizeof entry));
+        if (ret < 0) {
+            D("Error while reading %s: %s\n", filepath, strerror(errno));
+            break;
+        }
+        // Detect end of list.
+        if (ret == 0 || (entry.tag == 0 && entry.value == 0))
+            break;
+        if (entry.tag == AT_HWCAP) {
+            result = entry.value;
+            break;
+        }
+    }
+    close(fd);
+    return result;
+}
+
+/* Compute the ELF HWCAP flags from the content of /proc/cpuinfo.
+ * This works by parsing the 'Features' line, which lists which optional
+ * features the device's CPU supports, on top of its reference
+ * architecture.
+ */
+static uint32_t
+get_elf_hwcap_from_proc_cpuinfo(const char* cpuinfo, int cpuinfo_len) {
+    uint32_t hwcaps = 0;
+    long architecture = 0;
+    char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "CPU architecture");
+    if (cpuArch) {
+        architecture = strtol(cpuArch, NULL, 10);
+        free(cpuArch);
+
+        if (architecture >= 8L) {
+            // This is a 32-bit ARM binary running on a 64-bit ARM64 kernel.
+            // The 'Features' line only lists the optional features that the
+            // device's CPU supports, compared to its reference architecture
+            // which are of no use for this process.
+            D("Faking 32-bit ARM HWCaps on ARMv%ld CPU\n", architecture);
+            return HWCAP_SET_FOR_ARMV8;
+        }
+    }
+
+    char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "Features");
+    if (cpuFeatures != NULL) {
+        D("Found cpuFeatures = '%s'\n", cpuFeatures);
+
+        if (has_list_item(cpuFeatures, "vfp"))
+            hwcaps |= HWCAP_VFP;
+        if (has_list_item(cpuFeatures, "vfpv3"))
+            hwcaps |= HWCAP_VFPv3;
+        if (has_list_item(cpuFeatures, "vfpv3d16"))
+            hwcaps |= HWCAP_VFPv3D16;
+        if (has_list_item(cpuFeatures, "vfpv4"))
+            hwcaps |= HWCAP_VFPv4;
+        if (has_list_item(cpuFeatures, "neon"))
+            hwcaps |= HWCAP_NEON;
+        if (has_list_item(cpuFeatures, "idiva"))
+            hwcaps |= HWCAP_IDIVA;
+        if (has_list_item(cpuFeatures, "idivt"))
+            hwcaps |= HWCAP_IDIVT;
+        if (has_list_item(cpuFeatures, "idiv"))
+            hwcaps |= HWCAP_IDIVA | HWCAP_IDIVT;
+        if (has_list_item(cpuFeatures, "iwmmxt"))
+            hwcaps |= HWCAP_IWMMXT;
+
+        free(cpuFeatures);
+    }
+    return hwcaps;
+}
+#endif /* __arm__ */
+
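+// (Editor's aside, not part of the NDK sources.) A concrete instance of the
+// intersection computed below: present="0-3" and possible="0-7" parse to
+// masks 0b1111 and 0b11111111, whose AND yields a count of 4 cores.
+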
+/* Return the number of cpus present on a given device.
+ *
+ * To handle all weird kernel configurations, we need to compute the
+ * intersection of the 'present' and 'possible' CPU lists and count
+ * the result.
+ */
+static int
+get_cpu_count(void)
+{
+    CpuList cpus_present[1];
+    CpuList cpus_possible[1];
+
+    cpulist_read_from(cpus_present, "/sys/devices/system/cpu/present");
+    cpulist_read_from(cpus_possible, "/sys/devices/system/cpu/possible");
+
+    /* Compute the intersection of both sets to get the actual number of
+     * CPU cores that can be used on this device by the kernel.
+     */
+    cpulist_and(cpus_present, cpus_possible);
+
+    return cpulist_count(cpus_present);
+}
+
+static void
+android_cpuInitFamily(void)
+{
+#if defined(__arm__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_ARM;
+#elif defined(__i386__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_X86;
+#elif defined(__mips64)
+/* Needs to be before __mips__ since the compiler defines both */
+    g_cpuFamily = ANDROID_CPU_FAMILY_MIPS64;
+#elif defined(__mips__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_MIPS;
+#elif defined(__aarch64__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_ARM64;
+#elif defined(__x86_64__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_X86_64;
+#else
+    g_cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
+#endif
+}
+
+static void
+android_cpuInit(void)
+{
+    char* cpuinfo = NULL;
+    int   cpuinfo_len;
+
+    android_cpuInitFamily();
+
+    g_cpuFeatures = 0;
+    g_cpuCount    = 1;
+    g_inited      = 1;
+
+    cpuinfo_len = get_file_size("/proc/cpuinfo");
+    if (cpuinfo_len < 0) {
+        D("cpuinfo_len cannot be computed!");
+        return;
+    }
+    cpuinfo = malloc(cpuinfo_len);
+    if (cpuinfo == NULL) {
+        D("cpuinfo buffer could not be allocated");
+        return;
+    }
+    cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, cpuinfo_len);
+    D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
+      cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
+
+    if (cpuinfo_len < 0) /* should not happen */ {
+        free(cpuinfo);
+        return;
+    }
+
+    /* Count the CPU cores, the value may be 0 for single-core CPUs */
+    g_cpuCount = get_cpu_count();
+    if (g_cpuCount == 0) {
+        g_cpuCount = 1;
+    }
+
+    D("found cpuCount = %d\n", g_cpuCount);
+
+#ifdef __arm__
+    {
+        /* Extract architecture from the "CPU Architecture" field.
+         * The list is well-known, unlike the output of
+         * the 'Processor' field which can vary greatly.
+         *
+         * See the definition of the 'proc_arch' array in
+         * $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
+         * the same file.
+         */
+        char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "CPU architecture");
+
+        if (cpuArch != NULL) {
+            char*  end;
+            long   archNumber;
+            int    hasARMv7 = 0;
+
+            D("found cpuArch = '%s'\n", cpuArch);
+
+            /* read the initial decimal number, ignore the rest */
+            archNumber = strtol(cpuArch, &end, 10);
+
+            /* Note that ARMv8 is upwards compatible with ARMv7. */
+            if (end > cpuArch && archNumber >= 7) {
+                hasARMv7 = 1;
+            }
+
+            /* Unfortunately, it seems that certain ARMv6-based CPUs
+             * report an incorrect architecture number of 7!
+             *
+             * See http://code.google.com/p/android/issues/detail?id=10812
+             *
+             * We try to correct this by looking at the 'elf_format'
+             * field reported by the 'Processor' field, which is of the
+             * form of "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
+             * an ARMv6 one.
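+             * (Illustrative note: on such devices the line typically reads
+             * "Processor : ARMv7 Processor rev 4 (v7l)"; it is this "(v6l)"
+             * vs "(v7l)" suffix that is checked below.)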
             */
+            if (hasARMv7) {
+                char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+                                                      "Processor");
+                if (cpuProc != NULL) {
+                    D("found cpuProc = '%s'\n", cpuProc);
+                    if (has_list_item(cpuProc, "(v6l)")) {
+                        D("CPU processor and architecture mismatch!!\n");
+                        hasARMv7 = 0;
+                    }
+                    free(cpuProc);
+                }
+            }
+
+            if (hasARMv7) {
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_ARMv7;
+            }
+
+            /* The LDREX / STREX instructions are available from ARMv6 */
+            if (archNumber >= 6) {
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_LDREX_STREX;
+            }
+
+            free(cpuArch);
+        }
+
+        /* Extract the list of CPU features from ELF hwcaps */
+        uint32_t hwcaps = 0;
+        hwcaps = get_elf_hwcap_from_getauxval(AT_HWCAP);
+        if (!hwcaps) {
+            D("Parsing /proc/self/auxv to extract ELF hwcaps!\n");
+            hwcaps = get_elf_hwcap_from_proc_self_auxv();
+        }
+        if (!hwcaps) {
+            // Parsing /proc/self/auxv will fail from regular application
+            // processes on some Android platform versions; when this happens,
+            // parse /proc/cpuinfo instead.
+            D("Parsing /proc/cpuinfo to extract ELF hwcaps!\n");
+            hwcaps = get_elf_hwcap_from_proc_cpuinfo(cpuinfo, cpuinfo_len);
+        }
+
+        if (hwcaps != 0) {
+            int has_vfp = (hwcaps & HWCAP_VFP);
+            int has_vfpv3 = (hwcaps & HWCAP_VFPv3);
+            int has_vfpv3d16 = (hwcaps & HWCAP_VFPv3D16);
+            int has_vfpv4 = (hwcaps & HWCAP_VFPv4);
+            int has_neon = (hwcaps & HWCAP_NEON);
+            int has_idiva = (hwcaps & HWCAP_IDIVA);
+            int has_idivt = (hwcaps & HWCAP_IDIVT);
+            int has_iwmmxt = (hwcaps & HWCAP_IWMMXT);
+
+            // The kernel does a poor job at ensuring consistency when
+            // describing CPU features. So lots of guessing is needed.
+
+            // 'vfpv4' implies VFPv3|VFP_FMA|FP16
+            if (has_vfpv4)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3 |
+                                 ANDROID_CPU_ARM_FEATURE_VFP_FP16 |
+                                 ANDROID_CPU_ARM_FEATURE_VFP_FMA;
+
+            // 'vfpv3' or 'vfpv3d16' imply VFPv3. Note that unlike GCC,
+            // a value of 'vfpv3' doesn't necessarily mean that the D32
+            // feature is present, so be conservative. All CPUs in the
+            // field that support D32 also support NEON, so this should
+            // not be a problem in practice.
+            if (has_vfpv3 || has_vfpv3d16)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
+
+            // 'vfp' is super ambiguous. Depending on the kernel, it can
+            // either mean VFPv2 or VFPv3. Make it depend on ARMv7.
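+            // (Editor's worked example, not in the NDK sources: a kernel
+            // reporting just "Features : vfp neon" on an ARMv7 device ends up
+            // with ARMv7|VFPv2|VFPv3|VFP_D32|NEON after the rules here: the
+            // neon entry forces VFPv3|VFP_D32, and VFPv3 back-fills VFPv2
+            // and ARMv7.)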
+ if (has_vfp) { + if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3; + else + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2; + } + + // Neon implies VFPv3|D32, and if vfpv4 is detected, NEON_FMA + if (has_neon) { + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3 | + ANDROID_CPU_ARM_FEATURE_NEON | + ANDROID_CPU_ARM_FEATURE_VFP_D32; + if (has_vfpv4) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON_FMA; + } + + // VFPv3 implies VFPv2 and ARMv7 + if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2 | + ANDROID_CPU_ARM_FEATURE_ARMv7; + + if (has_idiva) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM; + if (has_idivt) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2; + + if (has_iwmmxt) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_iWMMXt; + } + + /* Extract the list of CPU features from ELF hwcaps2 */ + uint32_t hwcaps2 = 0; + hwcaps2 = get_elf_hwcap_from_getauxval(AT_HWCAP2); + if (hwcaps2 != 0) { + int has_aes = (hwcaps2 & HWCAP2_AES); + int has_pmull = (hwcaps2 & HWCAP2_PMULL); + int has_sha1 = (hwcaps2 & HWCAP2_SHA1); + int has_sha2 = (hwcaps2 & HWCAP2_SHA2); + int has_crc32 = (hwcaps2 & HWCAP2_CRC32); + + if (has_aes) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_AES; + if (has_pmull) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_PMULL; + if (has_sha1) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_SHA1; + if (has_sha2) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_SHA2; + if (has_crc32) + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_CRC32; + } + /* Extract the cpuid value from various fields */ + // The CPUID value is broken up in several entries in /proc/cpuinfo. + // This table is used to rebuild it from the entries. + static const struct CpuIdEntry { + const char* field; + char format; + char bit_lshift; + char bit_length; + } cpu_id_entries[] = { + { "CPU implementer", 'x', 24, 8 }, + { "CPU variant", 'x', 20, 4 }, + { "CPU part", 'x', 4, 12 }, + { "CPU revision", 'd', 0, 4 }, + }; + size_t i; + D("Parsing /proc/cpuinfo to recover CPUID\n"); + for (i = 0; + i < sizeof(cpu_id_entries)/sizeof(cpu_id_entries[0]); + ++i) { + const struct CpuIdEntry* entry = &cpu_id_entries[i]; + char* value = extract_cpuinfo_field(cpuinfo, + cpuinfo_len, + entry->field); + if (value == NULL) + continue; + + D("field=%s value='%s'\n", entry->field, value); + char* value_end = value + strlen(value); + int val = 0; + const char* start = value; + const char* p; + if (value[0] == '0' && (value[1] == 'x' || value[1] == 'X')) { + start += 2; + p = parse_hexadecimal(start, value_end, &val); + } else if (entry->format == 'x') + p = parse_hexadecimal(value, value_end, &val); + else + p = parse_decimal(value, value_end, &val); + + if (p > (const char*)start) { + val &= ((1 << entry->bit_length)-1); + val <<= entry->bit_lshift; + g_cpuIdArm |= (uint32_t) val; + } + + free(value); + } + + // Handle kernel configuration bugs that prevent the correct + // reporting of CPU features. + static const struct CpuFix { + uint32_t cpuid; + uint64_t or_flags; + } cpu_fixes[] = { + /* The Nexus 4 (Qualcomm Krait) kernel configuration + * forgets to report IDIV support. 
*/ + { 0x510006f2, ANDROID_CPU_ARM_FEATURE_IDIV_ARM | + ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 }, + { 0x510006f3, ANDROID_CPU_ARM_FEATURE_IDIV_ARM | + ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 }, + }; + size_t n; + for (n = 0; n < sizeof(cpu_fixes)/sizeof(cpu_fixes[0]); ++n) { + const struct CpuFix* entry = &cpu_fixes[n]; + + if (g_cpuIdArm == entry->cpuid) + g_cpuFeatures |= entry->or_flags; + } + + // Special case: The emulator-specific Android 4.2 kernel fails + // to report support for the 32-bit ARM IDIV instruction. + // Technically, this is a feature of the virtual CPU implemented + // by the emulator. Note that it could also support Thumb IDIV + // in the future, and this will have to be slightly updated. + char* hardware = extract_cpuinfo_field(cpuinfo, + cpuinfo_len, + "Hardware"); + if (hardware) { + if (!strcmp(hardware, "Goldfish") && + g_cpuIdArm == 0x4100c080 && + (g_cpuFamily & ANDROID_CPU_ARM_FEATURE_ARMv7) != 0) { + g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM; + } + free(hardware); + } + } +#endif /* __arm__ */ +#ifdef __aarch64__ + { + /* Extract the list of CPU features from ELF hwcaps */ + uint32_t hwcaps = 0; + hwcaps = get_elf_hwcap_from_getauxval(AT_HWCAP); + if (hwcaps != 0) { + int has_fp = (hwcaps & HWCAP_FP); + int has_asimd = (hwcaps & HWCAP_ASIMD); + int has_aes = (hwcaps & HWCAP_AES); + int has_pmull = (hwcaps & HWCAP_PMULL); + int has_sha1 = (hwcaps & HWCAP_SHA1); + int has_sha2 = (hwcaps & HWCAP_SHA2); + int has_crc32 = (hwcaps & HWCAP_CRC32); + + if(has_fp == 0) { + D("ERROR: Floating-point unit missing, but is required by Android on AArch64 CPUs\n"); + } + if(has_asimd == 0) { + D("ERROR: ASIMD unit missing, but is required by Android on AArch64 CPUs\n"); + } + + if (has_fp) + g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_FP; + if (has_asimd) + g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_ASIMD; + if (has_aes) + g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_AES; + if (has_pmull) + g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_PMULL; + if (has_sha1) + g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_SHA1; + if (has_sha2) + g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_SHA2; + if (has_crc32) + g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_CRC32; + } + } +#endif /* __aarch64__ */ + +#if defined(__i386__) || defined(__x86_64__) + int regs[4]; + +/* According to http://en.wikipedia.org/wiki/CPUID */ +#define VENDOR_INTEL_b 0x756e6547 +#define VENDOR_INTEL_c 0x6c65746e +#define VENDOR_INTEL_d 0x49656e69 + + x86_cpuid(0, regs); + int vendorIsIntel = (regs[1] == VENDOR_INTEL_b && + regs[2] == VENDOR_INTEL_c && + regs[3] == VENDOR_INTEL_d); + + x86_cpuid(1, regs); + if ((regs[2] & (1 << 9)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSSE3; + } + if ((regs[2] & (1 << 23)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_POPCNT; + } + if ((regs[2] & (1 << 19)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSE4_1; + } + if ((regs[2] & (1 << 20)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSE4_2; + } + if (vendorIsIntel && (regs[2] & (1 << 22)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_MOVBE; + } + if ((regs[2] & (1 << 25)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AES_NI; + } + if ((regs[2] & (1 << 28)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AVX; + } + if ((regs[2] & (1 << 30)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_RDRAND; + } + + x86_cpuid(7, regs); + if ((regs[1] & (1 << 5)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AVX2; + } + if ((regs[1] & (1 << 29)) != 0) { + g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SHA_NI; + } + 
+ +#endif +#if defined( __mips__) + { /* MIPS and MIPS64 */ + /* Extract the list of CPU features from ELF hwcaps */ + uint32_t hwcaps = 0; + hwcaps = get_elf_hwcap_from_getauxval(AT_HWCAP); + if (hwcaps != 0) { + int has_r6 = (hwcaps & HWCAP_MIPS_R6); + int has_msa = (hwcaps & HWCAP_MIPS_MSA); + if (has_r6) + g_cpuFeatures |= ANDROID_CPU_MIPS_FEATURE_R6; + if (has_msa) + g_cpuFeatures |= ANDROID_CPU_MIPS_FEATURE_MSA; + } + } +#endif /* __mips__ */ + + free(cpuinfo); +} + + +AndroidCpuFamily +android_getCpuFamily(void) +{ + pthread_once(&g_once, android_cpuInit); + return g_cpuFamily; +} + + +uint64_t +android_getCpuFeatures(void) +{ + pthread_once(&g_once, android_cpuInit); + return g_cpuFeatures; +} + + +int +android_getCpuCount(void) +{ + pthread_once(&g_once, android_cpuInit); + return g_cpuCount; +} + +static void +android_cpuInitDummy(void) +{ + g_inited = 1; +} + +int +android_setCpu(int cpu_count, uint64_t cpu_features) +{ + /* Fail if the library was already initialized. */ + if (g_inited) + return 0; + + android_cpuInitFamily(); + g_cpuCount = (cpu_count <= 0 ? 1 : cpu_count); + g_cpuFeatures = cpu_features; + pthread_once(&g_once, android_cpuInitDummy); + + return 1; +} + +#ifdef __arm__ +uint32_t +android_getCpuIdArm(void) +{ + pthread_once(&g_once, android_cpuInit); + return g_cpuIdArm; +} + +int +android_setCpuArm(int cpu_count, uint64_t cpu_features, uint32_t cpu_id) +{ + if (!android_setCpu(cpu_count, cpu_features)) + return 0; + + g_cpuIdArm = cpu_id; + return 1; +} +#endif /* __arm__ */ + +/* + * Technical note: Making sense of ARM's FPU architecture versions. + * + * FPA was ARM's first attempt at an FPU architecture. There is no Android + * device that actually uses it since this technology was already obsolete + * when the project started. If you see references to FPA instructions + * somewhere, you can be sure that this doesn't apply to Android at all. + * + * FPA was followed by "VFP", soon renamed "VFPv1" due to the emergence of + * new versions / additions to it. ARM considers this obsolete right now, + * and no known Android device implements it either. + * + * VFPv2 added a few instructions to VFPv1, and is an *optional* extension + * supported by some ARMv5TE, ARMv6 and ARMv6T2 CPUs. Note that a device + * supporting the 'armeabi' ABI doesn't necessarily support these. + * + * VFPv3-D16 adds a few instructions on top of VFPv2 and is typically used + * on ARMv7-A CPUs which implement a FPU. Note that it is also mandated + * by the Android 'armeabi-v7a' ABI. The -D16 suffix in its name means + * that it provides 16 double-precision FPU registers (d0-d15) and 32 + * single-precision ones (s0-s31) which happen to be mapped to the same + * register banks. + * + * VFPv3-D32 is the name of an extension to VFPv3-D16 that provides 16 + * additional double precision registers (d16-d31). Note that there are + * still only 32 single precision registers. + * + * VFPv3xD is a *subset* of VFPv3-D16 that only provides single-precision + * registers. It is only used on ARMv7-M (i.e. on micro-controllers) which + * are not supported by Android. Note that it is not compatible with VFPv2. + * + * NOTE: The term 'VFPv3' usually designate either VFPv3-D16 or VFPv3-D32 + * depending on context. For example GCC uses it for VFPv3-D32, but + * the Linux kernel code uses it for VFPv3-D16 (especially in + * /proc/cpuinfo). Always try to use the full designation when + * possible. + * + * NEON, a.k.a. 
"ARM Advanced SIMD" is an extension that provides + * instructions to perform parallel computations on vectors of 8, 16, + * 32, 64 and 128 bit quantities. NEON requires VFPv32-D32 since all + * NEON registers are also mapped to the same register banks. + * + * VFPv4-D16, adds a few instructions on top of VFPv3-D16 in order to + * perform fused multiply-accumulate on VFP registers, as well as + * half-precision (16-bit) conversion operations. + * + * VFPv4-D32 is VFPv4-D16 with 32, instead of 16, FPU double precision + * registers. + * + * VPFv4-NEON is VFPv4-D32 with NEON instructions. It also adds fused + * multiply-accumulate instructions that work on the NEON registers. + * + * NOTE: Similarly, "VFPv4" might either reference VFPv4-D16 or VFPv4-D32 + * depending on context. + * + * The following information was determined by scanning the binutils-2.22 + * sources: + * + * Basic VFP instruction subsets: + * + * #define FPU_VFP_EXT_V1xD 0x08000000 // Base VFP instruction set. + * #define FPU_VFP_EXT_V1 0x04000000 // Double-precision insns. + * #define FPU_VFP_EXT_V2 0x02000000 // ARM10E VFPr1. + * #define FPU_VFP_EXT_V3xD 0x01000000 // VFPv3 single-precision. + * #define FPU_VFP_EXT_V3 0x00800000 // VFPv3 double-precision. + * #define FPU_NEON_EXT_V1 0x00400000 // Neon (SIMD) insns. + * #define FPU_VFP_EXT_D32 0x00200000 // Registers D16-D31. + * #define FPU_VFP_EXT_FP16 0x00100000 // Half-precision extensions. + * #define FPU_NEON_EXT_FMA 0x00080000 // Neon fused multiply-add + * #define FPU_VFP_EXT_FMA 0x00040000 // VFP fused multiply-add + * + * FPU types (excluding NEON) + * + * FPU_VFP_V1xD (EXT_V1xD) + * | + * +--------------------------+ + * | | + * FPU_VFP_V1 (+EXT_V1) FPU_VFP_V3xD (+EXT_V2+EXT_V3xD) + * | | + * | | + * FPU_VFP_V2 (+EXT_V2) FPU_VFP_V4_SP_D16 (+EXT_FP16+EXT_FMA) + * | + * FPU_VFP_V3D16 (+EXT_Vx3D+EXT_V3) + * | + * +--------------------------+ + * | | + * FPU_VFP_V3 (+EXT_D32) FPU_VFP_V4D16 (+EXT_FP16+EXT_FMA) + * | | + * | FPU_VFP_V4 (+EXT_D32) + * | + * FPU_VFP_HARD (+EXT_FMA+NEON_EXT_FMA) + * + * VFP architectures: + * + * ARCH_VFP_V1xD (EXT_V1xD) + * | + * +------------------+ + * | | + * | ARCH_VFP_V3xD (+EXT_V2+EXT_V3xD) + * | | + * | ARCH_VFP_V3xD_FP16 (+EXT_FP16) + * | | + * | ARCH_VFP_V4_SP_D16 (+EXT_FMA) + * | + * ARCH_VFP_V1 (+EXT_V1) + * | + * ARCH_VFP_V2 (+EXT_V2) + * | + * ARCH_VFP_V3D16 (+EXT_V3xD+EXT_V3) + * | + * +-------------------+ + * | | + * | ARCH_VFP_V3D16_FP16 (+EXT_FP16) + * | + * +-------------------+ + * | | + * | ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA) + * | | + * | ARCH_VFP_V4 (+EXT_D32) + * | | + * | ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA) + * | + * ARCH_VFP_V3 (+EXT_D32) + * | + * +-------------------+ + * | | + * | ARCH_VFP_V3_FP16 (+EXT_FP16) + * | + * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON) + * | + * ARCH_NEON_FP16 (+EXT_FP16) + * + * -fpu= values and their correspondance with FPU architectures above: + * + * {"vfp", FPU_ARCH_VFP_V2}, + * {"vfp9", FPU_ARCH_VFP_V2}, + * {"vfp3", FPU_ARCH_VFP_V3}, // For backwards compatbility. 
+ * {"vfp10", FPU_ARCH_VFP_V2}, + * {"vfp10-r0", FPU_ARCH_VFP_V1}, + * {"vfpxd", FPU_ARCH_VFP_V1xD}, + * {"vfpv2", FPU_ARCH_VFP_V2}, + * {"vfpv3", FPU_ARCH_VFP_V3}, + * {"vfpv3-fp16", FPU_ARCH_VFP_V3_FP16}, + * {"vfpv3-d16", FPU_ARCH_VFP_V3D16}, + * {"vfpv3-d16-fp16", FPU_ARCH_VFP_V3D16_FP16}, + * {"vfpv3xd", FPU_ARCH_VFP_V3xD}, + * {"vfpv3xd-fp16", FPU_ARCH_VFP_V3xD_FP16}, + * {"neon", FPU_ARCH_VFP_V3_PLUS_NEON_V1}, + * {"neon-fp16", FPU_ARCH_NEON_FP16}, + * {"vfpv4", FPU_ARCH_VFP_V4}, + * {"vfpv4-d16", FPU_ARCH_VFP_V4D16}, + * {"fpv4-sp-d16", FPU_ARCH_VFP_V4_SP_D16}, + * {"neon-vfpv4", FPU_ARCH_NEON_VFP_V4}, + * + * + * Simplified diagram that only includes FPUs supported by Android: + * Only ARCH_VFP_V3D16 is actually mandated by the armeabi-v7a ABI, + * all others are optional and must be probed at runtime. + * + * ARCH_VFP_V3D16 (EXT_V1xD+EXT_V1+EXT_V2+EXT_V3xD+EXT_V3) + * | + * +-------------------+ + * | | + * | ARCH_VFP_V3D16_FP16 (+EXT_FP16) + * | + * +-------------------+ + * | | + * | ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA) + * | | + * | ARCH_VFP_V4 (+EXT_D32) + * | | + * | ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA) + * | + * ARCH_VFP_V3 (+EXT_D32) + * | + * +-------------------+ + * | | + * | ARCH_VFP_V3_FP16 (+EXT_FP16) + * | + * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON) + * | + * ARCH_NEON_FP16 (+EXT_FP16) + * + */ diff --git a/thirdparty/libvpx/third_party/android/cpu-features.h b/thirdparty/libvpx/third_party/android/cpu-features.h new file mode 100644 index 000000000000..1e9724197a8f --- /dev/null +++ b/thirdparty/libvpx/third_party/android/cpu-features.h @@ -0,0 +1,323 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef CPU_FEATURES_H +#define CPU_FEATURES_H + +#include +#include + +__BEGIN_DECLS + +/* A list of valid values returned by android_getCpuFamily(). + * They describe the CPU Architecture of the current process. 
+ */ +typedef enum { + ANDROID_CPU_FAMILY_UNKNOWN = 0, + ANDROID_CPU_FAMILY_ARM, + ANDROID_CPU_FAMILY_X86, + ANDROID_CPU_FAMILY_MIPS, + ANDROID_CPU_FAMILY_ARM64, + ANDROID_CPU_FAMILY_X86_64, + ANDROID_CPU_FAMILY_MIPS64, + + ANDROID_CPU_FAMILY_MAX /* do not remove */ + +} AndroidCpuFamily; + +/* Return the CPU family of the current process. + * + * Note that this matches the bitness of the current process. I.e. when + * running a 32-bit binary on a 64-bit capable CPU, this will return the + * 32-bit CPU family value. + */ +extern AndroidCpuFamily android_getCpuFamily(void); + +/* Return a bitmap describing a set of optional CPU features that are + * supported by the current device's CPU. The exact bit-flags returned + * depend on the value returned by android_getCpuFamily(). See the + * documentation for the ANDROID_CPU_*_FEATURE_* flags below for details. + */ +extern uint64_t android_getCpuFeatures(void); + +/* The list of feature flags for ANDROID_CPU_FAMILY_ARM that can be + * recognized by the library (see note below for 64-bit ARM). Value details + * are: + * + * VFPv2: + * CPU supports the VFPv2 instruction set. Many, but not all, ARMv6 CPUs + * support these instructions. VFPv2 is a subset of VFPv3 so this will + * be set whenever VFPv3 is set too. + * + * ARMv7: + * CPU supports the ARMv7-A basic instruction set. + * This feature is mandated by the 'armeabi-v7a' ABI. + * + * VFPv3: + * CPU supports the VFPv3-D16 instruction set, providing hardware FPU + * support for single and double precision floating point registers. + * Note that only 16 FPU registers are available by default, unless + * the D32 bit is set too. This feature is also mandated by the + * 'armeabi-v7a' ABI. + * + * VFP_D32: + * CPU VFP optional extension that provides 32 FPU registers, + * instead of 16. Note that ARM mandates this feature is the 'NEON' + * feature is implemented by the CPU. + * + * NEON: + * CPU FPU supports "ARM Advanced SIMD" instructions, also known as + * NEON. Note that this mandates the VFP_D32 feature as well, per the + * ARM Architecture specification. + * + * VFP_FP16: + * Half-width floating precision VFP extension. If set, the CPU + * supports instructions to perform floating-point operations on + * 16-bit registers. This is part of the VFPv4 specification, but + * not mandated by any Android ABI. + * + * VFP_FMA: + * Fused multiply-accumulate VFP instructions extension. Also part of + * the VFPv4 specification, but not mandated by any Android ABI. + * + * NEON_FMA: + * Fused multiply-accumulate NEON instructions extension. Optional + * extension from the VFPv4 specification, but not mandated by any + * Android ABI. + * + * IDIV_ARM: + * Integer division available in ARM mode. Only available + * on recent CPUs (e.g. Cortex-A15). + * + * IDIV_THUMB2: + * Integer division available in Thumb-2 mode. Only available + * on recent CPUs (e.g. Cortex-A15). + * + * iWMMXt: + * Optional extension that adds MMX registers and operations to an + * ARM CPU. This is only available on a few XScale-based CPU designs + * sold by Marvell. Pretty rare in practice. + * + * AES: + * CPU supports AES instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * CRC32: + * CPU supports CRC32 instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * SHA2: + * CPU supports SHA2 instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. 
+ * + * SHA1: + * CPU supports SHA1 instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * PMULL: + * CPU supports 64-bit PMULL and PMULL2 instructions. These + * instructions are only available for 32-bit applications + * running on ARMv8 CPU. + * + * If you want to tell the compiler to generate code that targets one of + * the feature sets above, you should probably use one of the following + * flags (for more details, see technical note at the end of this file): + * + * -mfpu=vfp + * -mfpu=vfpv2 + * These are equivalent and tell GCC to use VFPv2 instructions for + * floating-point operations. Use this if you want your code to + * run on *some* ARMv6 devices, and any ARMv7-A device supported + * by Android. + * + * Generated code requires VFPv2 feature. + * + * -mfpu=vfpv3-d16 + * Tell GCC to use VFPv3 instructions (using only 16 FPU registers). + * This should be generic code that runs on any CPU that supports the + * 'armeabi-v7a' Android ABI. Note that no ARMv6 CPU supports this. + * + * Generated code requires VFPv3 feature. + * + * -mfpu=vfpv3 + * Tell GCC to use VFPv3 instructions with 32 FPU registers. + * Generated code requires VFPv3|VFP_D32 features. + * + * -mfpu=neon + * Tell GCC to use VFPv3 instructions with 32 FPU registers, and + * also support NEON intrinsics (see <arm_neon.h>). + * Generated code requires VFPv3|VFP_D32|NEON features. + * + * -mfpu=vfpv4-d16 + * Generated code requires VFPv3|VFP_FP16|VFP_FMA features. + * + * -mfpu=vfpv4 + * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32 features. + * + * -mfpu=neon-vfpv4 + * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|NEON|NEON_FMA + * features. + * + * -mcpu=cortex-a7 + * -mcpu=cortex-a15 + * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32| + * NEON|NEON_FMA|IDIV_ARM|IDIV_THUMB2 + * This flag implies -mfpu=neon-vfpv4. + * + * -mcpu=iwmmxt + * Allows the use of iWMMXt intrinsics with GCC. + * + * IMPORTANT NOTE: These flags should only be tested when + * android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM, i.e. this is a + * 32-bit process. + * + * When running a 64-bit ARM process on an ARMv8 CPU, + * android_getCpuFeatures() will return a different set of bit-flags. + */ +enum { + ANDROID_CPU_ARM_FEATURE_ARMv7 = (1 << 0), + ANDROID_CPU_ARM_FEATURE_VFPv3 = (1 << 1), + ANDROID_CPU_ARM_FEATURE_NEON = (1 << 2), + ANDROID_CPU_ARM_FEATURE_LDREX_STREX = (1 << 3), + ANDROID_CPU_ARM_FEATURE_VFPv2 = (1 << 4), + ANDROID_CPU_ARM_FEATURE_VFP_D32 = (1 << 5), + ANDROID_CPU_ARM_FEATURE_VFP_FP16 = (1 << 6), + ANDROID_CPU_ARM_FEATURE_VFP_FMA = (1 << 7), + ANDROID_CPU_ARM_FEATURE_NEON_FMA = (1 << 8), + ANDROID_CPU_ARM_FEATURE_IDIV_ARM = (1 << 9), + ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 = (1 << 10), + ANDROID_CPU_ARM_FEATURE_iWMMXt = (1 << 11), + ANDROID_CPU_ARM_FEATURE_AES = (1 << 12), + ANDROID_CPU_ARM_FEATURE_PMULL = (1 << 13), + ANDROID_CPU_ARM_FEATURE_SHA1 = (1 << 14), + ANDROID_CPU_ARM_FEATURE_SHA2 = (1 << 15), + ANDROID_CPU_ARM_FEATURE_CRC32 = (1 << 16), +}; + +/* The bit flags corresponding to the output of android_getCpuFeatures() + * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM64. Value details + * are: + * + * FP: + * CPU has a floating-point unit. + * + * ASIMD: + * CPU has an Advanced SIMD unit. + * + * AES: + * CPU supports AES instructions. + * + * CRC32: + * CPU supports CRC32 instructions. + * + * SHA2: + * CPU supports SHA2 instructions. + * + * SHA1: + * CPU supports SHA1 instructions.
+ * + * PMULL: + * CPU supports 64-bit PMULL and PMULL2 instructions. + */ +enum { + ANDROID_CPU_ARM64_FEATURE_FP = (1 << 0), + ANDROID_CPU_ARM64_FEATURE_ASIMD = (1 << 1), + ANDROID_CPU_ARM64_FEATURE_AES = (1 << 2), + ANDROID_CPU_ARM64_FEATURE_PMULL = (1 << 3), + ANDROID_CPU_ARM64_FEATURE_SHA1 = (1 << 4), + ANDROID_CPU_ARM64_FEATURE_SHA2 = (1 << 5), + ANDROID_CPU_ARM64_FEATURE_CRC32 = (1 << 6), +}; + +/* The bit flags corresponding to the output of android_getCpuFeatures() + * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_X86 or + * ANDROID_CPU_FAMILY_X86_64. + */ +enum { + ANDROID_CPU_X86_FEATURE_SSSE3 = (1 << 0), + ANDROID_CPU_X86_FEATURE_POPCNT = (1 << 1), + ANDROID_CPU_X86_FEATURE_MOVBE = (1 << 2), + ANDROID_CPU_X86_FEATURE_SSE4_1 = (1 << 3), + ANDROID_CPU_X86_FEATURE_SSE4_2 = (1 << 4), + ANDROID_CPU_X86_FEATURE_AES_NI = (1 << 5), + ANDROID_CPU_X86_FEATURE_AVX = (1 << 6), + ANDROID_CPU_X86_FEATURE_RDRAND = (1 << 7), + ANDROID_CPU_X86_FEATURE_AVX2 = (1 << 8), + ANDROID_CPU_X86_FEATURE_SHA_NI = (1 << 9), +}; + +/* The bit flags corresponding to the output of android_getCpuFeatures() + * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_MIPS + * or ANDROID_CPU_FAMILY_MIPS64. Values are: + * + * R6: + * CPU executes MIPS Release 6 instructions natively, and + * supports obsoleted R1..R5 instructions only via kernel traps. + * + * MSA: + * CPU supports Mips SIMD Architecture instructions. + */ +enum { + ANDROID_CPU_MIPS_FEATURE_R6 = (1 << 0), + ANDROID_CPU_MIPS_FEATURE_MSA = (1 << 1), +}; + + +/* Return the number of CPU cores detected on this device. */ +extern int android_getCpuCount(void); + +/* The following is used to force the CPU count and features + * mask in sandboxed processes. Under 4.1 and higher, these processes + * cannot access /proc, which is the only way to get information from + * the kernel about the current hardware (at least on ARM). + * + * It _must_ be called only once, and before any android_getCpuXXX + * function; calling it in any other case will fail. + * + * This function returns 1 on success, and 0 on failure. + */ +extern int android_setCpu(int cpu_count, + uint64_t cpu_features); + +#ifdef __arm__ +/* Retrieve the ARM 32-bit CPUID value from the kernel. + * Note that this cannot work on sandboxed processes under 4.1 and + * higher, unless you called android_setCpuArm() before. + */ +extern uint32_t android_getCpuIdArm(void); + +/* An ARM-specific variant of android_setCpu() that also allows you + * to set the ARM CPUID field. + */ +extern int android_setCpuArm(int cpu_count, + uint64_t cpu_features, + uint32_t cpu_id); +#endif + +__END_DECLS + +#endif /* CPU_FEATURES_H */ diff --git a/thirdparty/libvpx/third_party/x86inc/LICENSE b/thirdparty/libvpx/third_party/x86inc/LICENSE new file mode 100644 index 000000000000..7d07645a1715 --- /dev/null +++ b/thirdparty/libvpx/third_party/x86inc/LICENSE @@ -0,0 +1,18 @@ +Copyright (C) 2005-2012 x264 project + +Authors: Loren Merritt + Anton Mitrofanov + Jason Garrett-Glaser + Henrik Gramner + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/thirdparty/libvpx/third_party/x86inc/README.libvpx b/thirdparty/libvpx/third_party/x86inc/README.libvpx new file mode 100644 index 000000000000..8d3cd966da13 --- /dev/null +++ b/thirdparty/libvpx/third_party/x86inc/README.libvpx @@ -0,0 +1,20 @@ +URL: https://git.videolan.org/git/x264.git +Version: d23d18655249944c1ca894b451e2c82c7a584c62 +License: ISC +License File: LICENSE + +Description: +x264/libav's framework for x86 assembly. Contains a variety of macros and +defines that help automatically allow assembly to work cross-platform. + +Local Modifications: +Get configuration from vpx_config.asm. +Prefix functions with vpx by default. +Manage name mangling (prefixing with '_') manually because 'PREFIX' does not + exist in libvpx. +Expand PIC default to macho64 and respect CONFIG_PIC from libvpx +Set 'private_extern' visibility for macho targets. +Copy PIC 'GLOBAL' macros from x86_abi_support.asm +Use .text instead of .rodata on macho to avoid broken tables in PIC mode. +Use .text with no alignment for aout +Only use 'hidden' visibility with Chromium diff --git a/thirdparty/libvpx/third_party/x86inc/x86inc.asm b/thirdparty/libvpx/third_party/x86inc/x86inc.asm new file mode 100644 index 000000000000..b647dff2f88b --- /dev/null +++ b/thirdparty/libvpx/third_party/x86inc/x86inc.asm @@ -0,0 +1,1649 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2016 x264 project +;* +;* Authors: Loren Merritt +;* Anton Mitrofanov +;* Fiona Glaser +;* Henrik Gramner +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. 
Send patches or ideas +; to x264-devel@videolan.org . + +%include "vpx_config.asm" + +%ifndef private_prefix + %define private_prefix vpx +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + +%define FORMAT_MACHO 0 +%ifidn __OUTPUT_FORMAT__,macho32 + %define FORMAT_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho64 + %define FORMAT_MACHO 1 +%endif + +; Set PREFIX for libvpx builds. +%if FORMAT_ELF + %undef PREFIX +%elif WIN64 + %undef PREFIX +%else + %define PREFIX +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; In some instances macho32 tables get misaligned when using .rodata. +; When looking at the disassembly it appears that the offset is either +; correct or consistently off by 90. Placing them in the .text section +; works around the issue. It appears to be specific to the way libvpx +; handles the tables. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,macho32 + SECTION .text align=%1 + fakegot: + %elifidn __OUTPUT_FORMAT__,aout + SECTION .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC" +; from original code is added in for 64bit. 
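+; To illustrate the intended use of the PIC macros defined below (a
+; hypothetical sketch, not code from x264 or libvpx; "some_table" is an
+; invented symbol): 32-bit PIC code materializes the GOT pointer in a
+; scratch register, references data through GLOBAL(), and restores the
+; register when done. On 64-bit targets GET_GOT/RESTORE_GOT collapse to
+; no-ops and GLOBAL(x) becomes a RIP-relative "rel x".
+;
+;   GET_GOT ebx                         ; save ebx, then load the GOT address into it
+;   movdqa  xmm0, [GLOBAL(some_table)]  ; PIC-safe reference to the table
+;   RESTORE_GOT                         ; pop the saved register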
+%ifidn __OUTPUT_FORMAT__,elf32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,macho32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,win32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,aout +%define ABI_IS_32BIT 1 +%else +%define ABI_IS_32BIT 0 +%endif + +%if ABI_IS_32BIT + %if CONFIG_PIC=1 + %ifidn __OUTPUT_FORMAT__,elf32 + %define GET_GOT_DEFINED 1 + %define WRT_PLT wrt ..plt + %macro GET_GOT 1 + extern _GLOBAL_OFFSET_TABLE_ + push %1 + call %%get_got + %%sub_offset: + jmp %%exitGG + %%get_got: + mov %1, [esp] + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc + ret + %%exitGG: + %undef GLOBAL + %define GLOBAL(x) x + %1 wrt ..gotoff + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %elifidn __OUTPUT_FORMAT__,macho32 + %define GET_GOT_DEFINED 1 + %macro GET_GOT 1 + push %1 + call %%get_got + %%get_got: + pop %1 + %undef GLOBAL + %define GLOBAL(x) x + %1 - %%get_got + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %else + %define GET_GOT_DEFINED 0 + %endif + %endif + + %if ARCH_X86_64 == 0 + %undef PIC + %endif + +%else + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) rel x + %define WRT_PLT wrt ..plt + + %if WIN64 + %define PIC + %elifidn __OUTPUT_FORMAT__,macho64 + %define PIC + %elif CONFIG_PIC + %define PIC + %endif +%endif + +%ifnmacro GET_GOT + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) x +%endif +%ifndef RESTORE_GOT + %define RESTORE_GOT +%endif +%ifndef WRT_PLT + %define WRT_PLT +%endif + +%ifdef PIC + default rel +%endif + +%ifndef GET_GOT_DEFINED + %define GET_GOT_DEFINED 0 +%endif +; Done with PIC macros + +%ifdef __NASM_VER__ + %use smartalign +%endif + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. 
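+; As a concrete illustration of the conventions documented above (a
+; hypothetical function, not part of this file), a minimal leaf routine can
+; be written once and assembled unchanged for x86_32, win64 and linux64:
+;
+;   cglobal add_first_u32, 2,3,0, dst, src, tmp ; 2 args, 3 GPRs, 0 xmm regs
+;       mov  tmpd, [srcq]                       ; args are in registers after PROLOGUE
+;       add  [dstq], tmpd
+;       RET                                     ; pops whatever PROLOGUE pushed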
+ +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %define %2q %2 + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 + %if ARCH_X86_64 == 0 + %define r%1 e%1 + %endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assertion ``%1'' failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + 
%rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + %assign regs_used (regs_used + 1) + %endif + %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments + %assign regs_used 5 + UNIX64 * 3 + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
+ %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 8 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. + %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %assign %%pad_size 0 + %if xmm_regs_used > 8 + %assign %%i xmm_regs_used + %rep xmm_regs_used-8 + %assign %%i %%i-1 + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add %1, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset (stack_offset-stack_size_padded) + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 14, 13, 12, 11, 10, 9 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 6, 5, 4, 3 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 + %macro WIN64_SPILL_XMM 1 + %endmacro + %macro WIN64_RESTORE_XMM 1 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif + annotate_function_size +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. + %endif + ret + annotate_function_size +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif + annotate_function_size +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). +%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + annotate_function_size + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + ; libvpx explicitly sets visibility in shared object builds. Avoid + ; setting visibility to hidden as it may break builds that split + ; sources on e.g., directory boundaries. 
+ %ifdef CHROMIUM + %xdefine %%VISIBILITY hidden + %else + %xdefine %%VISIBILITY + %endif + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF + global %2:function %%VISIBILITY + %elif FORMAT_MACHO + %ifdef __NASM_VER__ + global %2 + %else + global %2:private_extern + %endif + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] +%endif + +; Tell debuggers how large the function was. +; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. +; This is invoked by RET and similar macros, and also cglobal does it for the previous function, +; but if the last function in a source file doesn't use any of the standard macros for its epilogue, +; then its size might be unspecified. 
+%macro annotate_function_size 0 + %ifdef __YASM_VER__ + %ifdef current_function + %if FORMAT_ELF + current_function_section + %%ecf equ $ + size current_function %%ecf - current_function + __SECT__ + %endif + %endif + %endif +%endmacro + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 +%assign cpuflags_avx (1<<11)| cpuflags_sse42 +%assign cpuflags_xop (1<<12)| cpuflags_avx +%assign cpuflags_fma4 (1<<13)| cpuflags_avx +%assign cpuflags_fma3 (1<<14)| cpuflags_avx +%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<21) +%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 + +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. +%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu. +; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + %ifdef __NASM_VER__ + ALIGNMODE k8 + %else + CPU amdnop + %endif + %else + %ifdef __NASM_VER__ + ALIGNMODE nop + %else + CPU basicnop + %endif + %endif +%endmacro + +; Merge mmx and sse* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; (All 3 remain in sync through SWAP.)
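+; For example (a hypothetical sketch, not code from libvpx), the INIT_*
+; macros defined below select mmsize and register names, and cglobal appends
+; the cpuflag suffix, so the same routine can be instantiated per instruction
+; set, yielding private_prefix-mangled symbols such as vpx_copy_block_sse2
+; and vpx_copy_block_avx2:
+;
+;   INIT_XMM sse2
+;   cglobal copy_block, 2,2,1, dst, src
+;       mova m0, [srcq]           ; m0 is xmm0; mova expands to movdqa
+;       mova [dstq], m0
+;       RET
+;
+;   INIT_YMM avx2
+;   cglobal copy_block, 2,2,1, dst, src
+;       movu m0, [srcq]           ; m0 is now ymm0, mmsize == 32
+;       movu [dstq], m0
+;       RET                       ; also emits vzeroupper when mmsize == 32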
+ +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nnmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nnmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nnxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nnymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i + %assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 + %endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args nn %+ %1 + %rep %0-1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. 
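+; A hypothetical sketch of that pattern (the function name is invented):
+;
+;   cglobal negate_words, 0,0,3   ; input and result passed in SIMD registers
+;       pxor  m2, m2
+;       psubw m2, m0              ; result is computed into m2
+;       SWAP  0, 2                ; rename so callers see it as m0
+;       SAVE_MM_PERMUTATION       ; record the naming for 'call' sites
+;       RET
+;
+; A later 'call negate_words' (via the call macro below) reloads the saved
+; permutation, so the caller reads the result from m0.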
+%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1 %+ SUFFIX, %1 +%endmacro +%macro call_internal 2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 + %assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %ifnidn %6, %7 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 + %endif + %if %5 && %4 == 0 + %ifnid %8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. 
+ %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + __instr %6, %7, %8 + %elif %0 == 7 + __instr %6, %7 + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 1, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX and non-VEX encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 1 +AVX_INSTR addss, sse, 1, 0, 1 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, fnord, 0, 0, 0 +AVX_INSTR aesdeclast, fnord, 0, 0, 0 +AVX_INSTR aesenc, fnord, 0, 0, 0 +AVX_INSTR aesenclast, fnord, 0, 0, 0 +AVX_INSTR aesimc +AVX_INSTR aeskeygenassist +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 0, 0 +AVX_INSTR blendps, sse4, 1, 0, 0 +AVX_INSTR blendvpd, sse4, 1, 0, 0 +AVX_INSTR blendvps, sse4, 1, 0, 0 +AVX_INSTR cmppd, sse2, 1, 1, 0 +AVX_INSTR cmpps, sse, 1, 1, 0 +AVX_INSTR cmpsd, sse2, 1, 1, 0 +AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR comisd, sse2 +AVX_INSTR comiss, sse +AVX_INSTR cvtdq2pd, sse2 +AVX_INSTR cvtdq2ps, sse2 +AVX_INSTR cvtpd2dq, sse2 +AVX_INSTR cvtpd2ps, sse2 +AVX_INSTR cvtps2dq, sse2 +AVX_INSTR cvtps2pd, sse2 +AVX_INSTR cvtsd2si, sse2 +AVX_INSTR cvtsd2ss, sse2 +AVX_INSTR cvtsi2sd, sse2 +AVX_INSTR cvtsi2ss, sse +AVX_INSTR cvtss2sd, sse2 +AVX_INSTR cvtss2si, sse +AVX_INSTR cvttpd2dq, sse2 +AVX_INSTR cvttps2dq, sse2 +AVX_INSTR cvttsd2si, sse2 +AVX_INSTR cvttss2si, sse +AVX_INSTR divpd, sse2, 1, 0, 0 +AVX_INSTR divps, sse, 1, 0, 0 +AVX_INSTR divsd, sse2, 1, 0, 0 +AVX_INSTR divss, sse, 1, 0, 0 +AVX_INSTR dppd, sse4, 1, 1, 0 +AVX_INSTR dpps, sse4, 1, 1, 0 +AVX_INSTR extractps, sse4 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 1 +AVX_INSTR maxss, sse, 1, 0, 1 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 1 +AVX_INSTR minss, sse, 1, 0, 1 +AVX_INSTR movapd, sse2 +AVX_INSTR movaps, sse +AVX_INSTR movd, mmx +AVX_INSTR movddup, sse3 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2 +AVX_INSTR movmskps, sse 
+AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2 +AVX_INSTR movntps, sse +AVX_INSTR movq, mmx +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3 +AVX_INSTR movsldup, sse3 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2 +AVX_INSTR movups, sse +AVX_INSTR mpsadbw, sse4 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 1 +AVX_INSTR mulss, sse, 1, 0, 1 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 1 +AVX_INSTR palignr, ssse3 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4, 0, 0, 0 +AVX_INSTR pblendw, sse4 +AVX_INSTR pclmulqdq +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4 +AVX_INSTR pinsrd, sse4 +AVX_INSTR pinsrq, sse4 +AVX_INSTR pinsrw, mmx2 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR pmovzxbq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 +AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 
0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse, 1, 0, 0 +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4 +AVX_INSTR roundps, sse4 +AVX_INSTR roundsd, sse4 +AVX_INSTR roundss, sse4 +AVX_INSTR rsqrtps, sse, 1, 0, 0 +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2, 1, 0, 0 +AVX_INSTR sqrtps, sse, 1, 0, 0 +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse +AVX_INSTR subpd, sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 +AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2 +AVX_INSTR ucomiss, sse +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif + %assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmadcswd, pmaddwd, paddd + +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. +; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. +%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. 
+ %ifid %3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif +%endif diff --git a/thirdparty/libvpx/vp8/common/alloccommon.c b/thirdparty/libvpx/vp8/common/alloccommon.c new file mode 100644 index 000000000000..8dfd4ce203e7 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/alloccommon.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "alloccommon.h" +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "onyxc_int.h" +#include "findnearmv.h" +#include "entropymode.h" +#include "systemdependent.h" + +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci) +{ + int i; + for (i = 0; i < NUM_YV12_BUFFERS; i++) + vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]); + + vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame); +#if CONFIG_POSTPROC + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer); + if (oci->post_proc_buffer_int_used) + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int); + + vpx_free(oci->pp_limits_buffer); + oci->pp_limits_buffer = NULL; +#endif + + vpx_free(oci->above_context); + vpx_free(oci->mip); +#if CONFIG_ERROR_CONCEALMENT + vpx_free(oci->prev_mip); + oci->prev_mip = NULL; +#endif + + oci->above_context = NULL; + oci->mip = NULL; +} + +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height) +{ + int i; + + vp8_de_alloc_frame_buffers(oci); + + /* our internal buffers are always multiples of 16 */ + if ((width & 0xf) != 0) + width += 16 - (width & 0xf); + + if ((height & 0xf) != 0) + height += 16 - (height & 0xf); + + + for (i = 0; i < NUM_YV12_BUFFERS; i++) + { + oci->fb_idx_ref_cnt[i] = 0; + oci->yv12_fb[i].flags = 0; + if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0) + goto allocation_fail; + } + + oci->new_fb_idx = 0; + oci->lst_fb_idx = 1; + oci->gld_fb_idx = 2; + oci->alt_fb_idx = 3; + + oci->fb_idx_ref_cnt[0] = 1; + oci->fb_idx_ref_cnt[1] = 1; + oci->fb_idx_ref_cnt[2] = 1; + oci->fb_idx_ref_cnt[3] = 1; + + if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0) + goto allocation_fail; + + oci->mb_rows = height >> 4; + oci->mb_cols = width >> 4; + oci->MBs = oci->mb_rows * oci->mb_cols; + oci->mode_info_stride = oci->mb_cols + 1; + oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); + + if (!oci->mip) + goto allocation_fail; + + oci->mi = oci->mip + oci->mode_info_stride + 1; + + /* Allocation of previous mode info will be 
done in vp8_decode_frame() + * as it is decoder-only data */ + + oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1); + + if (!oci->above_context) + goto allocation_fail; + +#if CONFIG_POSTPROC + if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0) + goto allocation_fail; + + oci->post_proc_buffer_int_used = 0; + memset(&oci->postproc_state, 0, sizeof(oci->postproc_state)); + memset(oci->post_proc_buffer.buffer_alloc, 128, + oci->post_proc_buffer.frame_size); + + /* Allocate buffer to store post-processing filter coefficients. + * + * Note: Round up mb_cols to support SIMD reads + */ + oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1)); + if (!oci->pp_limits_buffer) + goto allocation_fail; +#endif + + return 0; + +allocation_fail: + vp8_de_alloc_frame_buffers(oci); + return 1; +} + +void vp8_setup_version(VP8_COMMON *cm) +{ + switch (cm->version) + { + case 0: + cm->no_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + case 1: + cm->no_lpf = 0; + cm->filter_type = SIMPLE_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 2: + cm->no_lpf = 1; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 3: + cm->no_lpf = 1; + cm->filter_type = SIMPLE_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 1; + break; + default: + /* 4,5,6,7 are reserved for future use */ + cm->no_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + } +} +void vp8_create_common(VP8_COMMON *oci) +{ + vp8_machine_specific_config(oci); + + vp8_init_mbmode_probs(oci); + vp8_default_bmode_probs(oci->fc.bmode_prob); + + oci->mb_no_coeff_skip = 1; + oci->no_lpf = 0; + oci->filter_type = NORMAL_LOOPFILTER; + oci->use_bilinear_mc_filter = 0; + oci->full_pixel = 0; + oci->multi_token_partition = ONE_PARTITION; + oci->clamp_type = RECON_CLAMP_REQUIRED; + + /* Initialize reference frame sign bias structure to defaults */ + memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); + + /* Disable buffer to buffer copying by default */ + oci->copy_buffer_to_gf = 0; + oci->copy_buffer_to_arf = 0; +} + +void vp8_remove_common(VP8_COMMON *oci) +{ + vp8_de_alloc_frame_buffers(oci); +} diff --git a/thirdparty/libvpx/vp8/common/alloccommon.h b/thirdparty/libvpx/vp8/common/alloccommon.h new file mode 100644 index 000000000000..93e99d76b1d7 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/alloccommon.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + + +#ifndef VP8_COMMON_ALLOCCOMMON_H_ +#define VP8_COMMON_ALLOCCOMMON_H_ + +#include "onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_create_common(VP8_COMMON *oci); +void vp8_remove_common(VP8_COMMON *oci); +void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); +int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); +void vp8_setup_version(VP8_COMMON *oci); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ALLOCCOMMON_H_ diff --git a/thirdparty/libvpx/vp8/common/arm/loopfilter_arm.c b/thirdparty/libvpx/vp8/common/arm/loopfilter_arm.c new file mode 100644 index 000000000000..5840c2bbaaa9 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/arm/loopfilter_arm.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" + +#define prototype_loopfilter(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh, int count) + +#if HAVE_MEDIA +extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); +#endif + +#if HAVE_NEON +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh, + unsigned char *v); + +extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; + +extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; +#endif + +#if HAVE_MEDIA +/* ARMV6/MEDIA loopfilter functions*/ +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, 
lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); +} +#endif + +#if HAVE_NEON +/* NEON loopfilter functions */ +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + 
unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); + vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); + vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); +} +#endif diff --git a/thirdparty/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c new file mode 100644 index 000000000000..bb6ea76ba473 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+static const uint8_t bifilter4_coeff[8][2] = {
+    {128,   0},
+    {112,  16},
+    { 96,  32},
+    { 80,  48},
+    { 64,  64},
+    { 48,  80},
+    { 32,  96},
+    { 16, 112}
+};
+
+void vp8_bilinear_predict8x4_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
+    uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
+    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
+    uint16x8_t q1u16, q2u16, q3u16, q4u16;
+    uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
+
+    if (xoffset == 0) {  // skip_1stpass_filter
+        d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
+        d26u8 = vld1_u8(src_ptr);
+    } else {
+        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
+        q5u8 = vld1q_u8(src_ptr);
+
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
+
+        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
+        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
+        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
+        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
+        q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
+
+        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
+        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
+        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
+        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
+        d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
+
+        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
+        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
+        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
+        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
+        q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
+
+        d22u8 = vqrshrn_n_u16(q6u16, 7);
+        d23u8 = vqrshrn_n_u16(q7u16, 7);
+        d24u8 = vqrshrn_n_u16(q8u16, 7);
+        d25u8 = vqrshrn_n_u16(q9u16, 7);
+        d26u8 = vqrshrn_n_u16(q10u16, 7);
+    }
+
+    // secondpass_filter
+    if (yoffset == 0) {  // skip_2ndpass_filter
+        vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d25u8);
+    } else {
+        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
+
+        q1u16 = vmull_u8(d22u8, d0u8);
+        q2u16 = vmull_u8(d23u8, d0u8);
+        q3u16 = vmull_u8(d24u8, d0u8);
+        q4u16 = vmull_u8(d25u8, d0u8);
+
+        q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
+        q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
+        q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
+        q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
+
+        d2u8 = vqrshrn_n_u16(q1u16, 7);
+        d3u8 = vqrshrn_n_u16(q2u16, 7);
+        d4u8 = vqrshrn_n_u16(q3u16, 7);
+        d5u8 = vqrshrn_n_u16(q4u16, 7);
+
+        vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
+        vst1_u8((uint8_t *)dst_ptr, d5u8);
+    }
+    return;
+}
+
+void vp8_bilinear_predict8x8_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
+    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8,
d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d30u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + + // first_pass filtering on the rest 5-line data + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d26u8 = vqrshrn_n_u16(q6u16, 7); + d27u8 = vqrshrn_n_u16(q7u16, 7); + d28u8 = vqrshrn_n_u16(q8u16, 7); + d29u8 = vqrshrn_n_u16(q9u16, 7); + d30u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, 
d29u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + q5u16 = vmull_u8(d26u8, d0u8); + q6u16 = vmull_u8(d27u8, d0u8); + q7u16 = vmull_u8(d28u8, d0u8); + q8u16 = vmull_u8(d29u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + q5u16 = vmlal_u8(q5u16, d27u8, d1u8); + q6u16 = vmlal_u8(q6u16, d28u8, d1u8); + q7u16 = vmlal_u8(q7u16, d29u8, d1u8); + q8u16 = vmlal_u8(q8u16, d30u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d9u8); + } + return; +} + +void vp8_bilinear_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + int i; + unsigned char tmp[272]; + unsigned char *tmpp; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = 
vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; + } + + if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 =vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch; + } + return; + } + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + tmpp = tmp; + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + 
q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 
= vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + tmpp = tmp; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); tmpp += 16; + q13u8 = vld1q_u8(tmpp); tmpp += 16; + q14u8 = vld1q_u8(tmpp); tmpp += 16; + q15u8 = vld1q_u8(tmpp); tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; +} diff --git a/thirdparty/libvpx/vp8/common/arm/neon/copymem_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/copymem_neon.c new file mode 100644 index 000000000000..deced115c14c --- /dev/null +++ b/thirdparty/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+void vp8_copy_mem8x4_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    uint8x8_t vtmp;
+    int r;
+
+    for (r = 0; r < 4; r++) {
+        vtmp = vld1_u8(src);
+        vst1_u8(dst, vtmp);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_copy_mem8x8_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    uint8x8_t vtmp;
+    int r;
+
+    for (r = 0; r < 8; r++) {
+        vtmp = vld1_u8(src);
+        vst1_u8(dst, vtmp);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_copy_mem16x16_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    int r;
+    uint8x16_t qtmp;
+
+    for (r = 0; r < 16; r++) {
+        qtmp = vld1q_u8(src);
+        vst1q_u8(dst, qtmp);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
new file mode 100644
index 000000000000..ad5f41d7dee1
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_dc_only_idct_add_neon(
+        int16_t input_dc,
+        unsigned char *pred_ptr,
+        int pred_stride,
+        unsigned char *dst_ptr,
+        int dst_stride) {
+    int i;
+    uint16_t a1 = ((input_dc + 4) >> 3);
+    uint32x2_t d2u32 = vdup_n_u32(0);
+    uint8x8_t d2u8;
+    uint16x8_t q1u16;
+    uint16x8_t qAdd;
+
+    qAdd = vdupq_n_u16(a1);
+
+    for (i = 0; i < 2; i++) {
+        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
+        pred_ptr += pred_stride;
+        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
+        pred_ptr += pred_stride;
+
+        q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+        dst_ptr += dst_stride;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+        dst_ptr += dst_stride;
+    }
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
new file mode 100644
index 000000000000..58e11922c760
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
@@ -0,0 +1,142 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 35468;
+
+void vp8_dequant_idct_add_neon(
+        int16_t *input,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int32x2_t d14, d15;
+    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+    int16x8_t q1, q2, q3, q4, q5, q6;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x2x2_t d2tmp0, d2tmp1;
+    int16x4x2_t d2tmp2, d2tmp3;
+
+    d14 = d15 = vdup_n_s32(0);
+
+    // load input
+    q3 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+    input += 8;
+    q4 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+
+    // load dq
+    q5 = vld1q_s16(dq);
+    dq += 8;
+    q6 = vld1q_s16(dq);
+
+    // load src from dst
+    dst0 = dst;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+
+    q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
+                                         vreinterpretq_u16_s16(q5)));
+    q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
+                                         vreinterpretq_u16_s16(q6)));
+
+    d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
+    d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
+
+    q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+    q3 = vshrq_n_s16(q3, 1);
+    q4 = vshrq_n_s16(q4, 1);
+
+    q3 = vqaddq_s16(q3, q2);
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    // loop 2
+    q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
+    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
+
+    q3 = vshrq_n_s16(q3, 1);
+    q4 = vshrq_n_s16(q4, 1);
+
+    q3 = vqaddq_s16(q3, q2);
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2 = vrshr_n_s16(d2, 3);
+    d3 = vrshr_n_s16(d3, 3);
+    d4 = vrshr_n_s16(d4, 3);
+    d5 = vrshr_n_s16(d5, 3);
+
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+    q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+
+    q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
+                                        vreinterpret_u8_s32(d14)));
+    q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
+                                        vreinterpret_u8_s32(d15)));
+
+    d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
+    d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+
+    dst0 = dst;
+    vst1_lane_s32((int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 1);
+    return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
new file mode 100644
index 000000000000..54e709dd3c32
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
+    int16x8x2_t qQ, qDQC, qDQ;
+
+    qQ   = vld2q_s16(d->qcoeff);
+    qDQC = vld2q_s16(DQC);
+
+    qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
+    qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
+
+    vst2q_s16(d->dqcoeff, qDQ);
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/idct_blk_neon.c
new file mode 100644
index 000000000000..fb327a72601b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/idct_blk_neon.c
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + +/* place these declarations here because we don't want to maintain them + * outside of this scope + */ +void idct_dequant_full_2x_neon(short *q, short *dq, + unsigned char *dst, int stride); +void idct_dequant_0_2x_neon(short *q, short dq, + unsigned char *dst, int stride); + + +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dst, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dst, stride); + } + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q+32, dq, dst+8, stride); + else + idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride); + } + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs) +{ + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } + + q += 32; + dstu += 4*stride; + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } + + q += 32; + + if (((short *)(eobs))[2]) + { + if (((short *)eobs)[2] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } + + q += 32; + dstv += 4*stride; + + if (((short *)(eobs))[3]) + { + if (((short *)eobs)[3] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } +} diff --git a/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c new file mode 100644 index 000000000000..e6f862fa8988 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+void idct_dequant_0_2x_neon(
+        int16_t *q,
+        int16_t dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int i, a0, a1;
+    int16x8x2_t q2Add;
+    int32x2_t d2s32 = vdup_n_s32(0),
+              d4s32 = vdup_n_s32(0);
+    uint8x8_t d2u8, d4u8;
+    uint16x8_t q1u16, q2u16;
+
+    a0 = ((q[0] * dq) + 4) >> 3;
+    a1 = ((q[16] * dq) + 4) >> 3;
+    q[0] = q[16] = 0;
+    q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+    q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+    for (i = 0; i < 2; i++, dst += 4) {
+        dst0 = dst;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d2s32));
+        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                         vreinterpret_u8_s32(d4s32));
+
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+        d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+        d2s32 = vreinterpret_s32_u8(d2u8);
+        d4s32 = vreinterpret_s32_u8(d4u8);
+
+        dst0 = dst;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+        dst0 += stride;
+        vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+    }
+    return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
new file mode 100644
index 000000000000..a60ed46b7646
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -0,0 +1,185 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 17734;
+// because the lowest bit in 0x8a8c is 0, we can pre-shift this
+
+void idct_dequant_full_2x_neon(
+        int16_t *q,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0, *dst1;
+    int32x2_t d28, d29, d30, d31;
+    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x4x2_t q2tmp0, q2tmp1;
+    int16x8x2_t q2tmp2, q2tmp3;
+    int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+    d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+    // load dq
+    q0 = vld1q_s16(dq);
+    dq += 8;
+    q1 = vld1q_s16(dq);
+
+    // load q
+    q2 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q3 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q4 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+    q += 8;
+    q5 = vld1q_s16(q);
+    vst1q_s16(q, qEmpty);
+
+    // load src from dst
+    dst0 = dst;
+    dst1 = dst + 4;
+    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+    dst0 += stride;
+    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+    dst1 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+    dst0 += stride;
+    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+    dst1 += stride;
+
+    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+    dst0 += stride;
+    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+    dst1 += stride;
+    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+    q2 = vmulq_s16(q2, q0);
+    q3 = vmulq_s16(q3, q1);
+    q4 = vmulq_s16(q4, q0);
+    q5 = vmulq_s16(q5, q1);
+
+    // vswp
+    dLow0 = vget_low_s16(q2);
+    dHigh0 = vget_high_s16(q2);
+    dLow1 = vget_low_s16(q4);
+    dHigh1 = vget_high_s16(q4);
+    q2 = vcombine_s16(dLow0, dLow1);
+    q4 = vcombine_s16(dHigh0, dHigh1);
+
+    dLow0 = vget_low_s16(q3);
+    dHigh0 = vget_high_s16(q3);
+    dLow1 = vget_low_s16(q5);
+    dHigh1 = vget_high_s16(q5);
+    q3 = vcombine_s16(dLow0, dLow1);
+    q5 = vcombine_s16(dHigh0, dHigh1);
+
+    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+    q10 = vqaddq_s16(q2, q3);
+    q11 = vqsubq_s16(q2, q3);
+
+    q8 = vshrq_n_s16(q8, 1);
+    q9 = vshrq_n_s16(q9, 1);
+
+    q4 = vqaddq_s16(q4, q8);
+    q5 = vqaddq_s16(q5, q9);
+
+    q2 = vqsubq_s16(q6, q5);
+    q3 = vqaddq_s16(q7, q4);
+
+    q4 = vqaddq_s16(q10, q3);
+    q5 = vqaddq_s16(q11, q2);
+    q6 = vqsubq_s16(q11, q2);
+    q7 = vqsubq_s16(q10, q3);
+
+    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                       vreinterpretq_s16_s32(q2tmp1.val[0]));
+    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                       vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+    // loop 2
+    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+    q10 = vshrq_n_s16(q10, 1);
+    q11 = vshrq_n_s16(q11, 1);
+
+    q10 = vqaddq_s16(q2tmp2.val[1], q10);
+    q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+    q8 = vqsubq_s16(q8, q11);
+    q9 = vqaddq_s16(q9, q10);
+
+    q4 = vqaddq_s16(q2, q9);
+    q5 = vqaddq_s16(q3, q8);
+    q6 = vqsubq_s16(q3, q8);
+    q7 = vqsubq_s16(q2, q9);
+
+    q4 = vrshrq_n_s16(q4, 3);
+    q5 = vrshrq_n_s16(q5, 3);
+    q6 =
vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), + vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), + vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), + vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), + vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); + return; +} diff --git a/thirdparty/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/iwalsh_neon.c new file mode 100644 index 000000000000..6ea9dd712aaa --- /dev/null +++ b/thirdparty/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_inv_walsh4x4_neon(
+        int16_t *input,
+        int16_t *mb_dqcoeff) {
+    int16x8_t q0s16, q1s16, q2s16, q3s16;
+    int16x4_t d4s16, d5s16, d6s16, d7s16;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+    int16x8_t qAdd3;
+
+    q0s16 = vld1q_s16(input);
+    q1s16 = vld1q_s16(input + 8);
+
+    // 1st for loop
+    d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+    d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+    d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
+    d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
+
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    q0s16 = vaddq_s16(q2s16, q3s16);
+    q1s16 = vsubq_s16(q2s16, q3s16);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
+                      vreinterpret_s32_s16(vget_low_s16(q1s16)));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
+                      vreinterpret_s32_s16(vget_high_s16(q1s16)));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
+                      vreinterpret_s16_s32(v2tmp3.val[0]));
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
+                      vreinterpret_s16_s32(v2tmp3.val[1]));
+
+    // 2nd for loop
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    q3s16 = vcombine_s16(d6s16, d7s16);
+
+    qAdd3 = vdupq_n_s16(3);
+
+    q0s16 = vaddq_s16(q2s16, q3s16);
+    q1s16 = vsubq_s16(q2s16, q3s16);
+
+    q0s16 = vaddq_s16(q0s16, qAdd3);
+    q1s16 = vaddq_s16(q1s16, qAdd3);
+
+    q0s16 = vshrq_n_s16(q0s16, 3);
+    q1s16 = vshrq_n_s16(q1s16, 3);
+
+    // store
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
+    mb_dqcoeff += 16;
+
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
+    mb_dqcoeff += 16;
+    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
+    mb_dqcoeff += 16;
+    return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
new file mode 100644
index 000000000000..b25686ffb88b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
+        unsigned char *s,
+        int p,
+        const unsigned char *blimit) {
+    uint8_t *sp;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
+    int16x8_t q2s16, q3s16, q13s16;
+    int8x8_t d8s8, d9s8;
+    int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    sp = s - (p << 1);
+    q5u8 = vld1q_u8(sp);
+    sp += p;
+    q6u8 = vld1q_u8(sp);
+    sp += p;
+    q7u8 = vld1q_u8(sp);
+    sp += p;
+    q8u8 = vld1q_u8(sp);
+
+    q15u8 = vabdq_u8(q6u8, q7u8);
+    q14u8 = vabdq_u8(q5u8, q8u8);
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);
+    q14u8 = vshrq_n_u8(q14u8, 1);
+    q0u8 = vdupq_n_u8(0x80);
+    q13s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);
+
+    q5u8 = veorq_u8(q5u8, q0u8);
+    q6u8 = veorq_u8(q6u8, q0u8);
+    q7u8 = veorq_u8(q7u8, q0u8);
+    q8u8 = veorq_u8(q8u8, q0u8);
+
+    q15u8 = vcgeq_u8(qblimit, q15u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6u8)));
+    q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
+                     vget_high_s8(vreinterpretq_s8_u8(q6u8)));
+
+    q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
+                     vreinterpretq_s8_u8(q8u8));
+
+    q2s16 = vmulq_s16(q2s16, q13s16);
+    q3s16 = vmulq_s16(q3s16, q13s16);
+
+    q10u8 = vdupq_n_u8(3);
+    q9u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
+    q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
+
+    d8s8 = vqmovn_s16(q2s16);
+    d9s8 = vqmovn_s16(q3s16);
+    q4s8 = vcombine_s8(d8s8, d9s8);
+
+    q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q3s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+    vst1q_u8(s, q7u8);
+    s -= p;
+    vst1q_u8(s, q6u8);
+    return;
+}
+
+void vp8_loop_filter_bhs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbhs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 000000000000..921bcad6983b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,283 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result) {
+    /*
+     * uint8x8x2_t result
+    00 01 02 03 | 04 05 06 07
+    10 11 12 13 | 14 15 16 17
+    ---
+     * after vtrn_u8
+    00 10 02 12 | 04 14 06 16
+    01 11 03 13 | 05 15 07 17
+    */
+    const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+                                       result.val[1]);
+    const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+    const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+    write_2x4(dst, pitch, result);
+    dst += pitch * 8;
+    write_2x4(dst, pitch, result2);
+}
+#else
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+    vst2_lane_u8(dst, result, 0);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 1);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 2);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 3);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 4);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 5);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 6);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 7);
+    dst += pitch;
+
+    vst2_lane_u8(dst, result2, 0);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 1);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 2);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 3);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 4);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 5);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 6);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 7);
+}
+#endif  // VPX_INCOMPATIBLE_GCC
+
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+    uint8x8x4_t x;
+    const uint8x8_t a = vld1_u8(src);
+    const uint8x8_t b = vld1_u8(src + pitch * 1);
+    const uint8x8_t c = vld1_u8(src + pitch * 2);
+    const uint8x8_t d = vld1_u8(src + pitch * 3);
+    const uint8x8_t e = vld1_u8(src + pitch * 4);
+    const uint8x8_t f = vld1_u8(src + pitch * 5);
+    const uint8x8_t g = vld1_u8(src + pitch * 6);
+    const uint8x8_t h = vld1_u8(src + pitch * 7);
+    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+                                          vreinterpret_u32_u8(e));
+    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+                                          vreinterpret_u32_u8(f));
+    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+                                          vreinterpret_u32_u8(g));
+    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+                                          vreinterpret_u32_u8(h));
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+                                          vreinterpret_u16_u32(r26_u32.val[0]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+                                          vreinterpret_u16_u32(r37_u32.val[0]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    /*
+     * after vtrn_u32
+    00 01 02 03 | 40 41 42 43
+    10 11 12 13 | 50
+
+#ifdef VPX_INCOMPATIBLE_GCC
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+    uint8x8x4_t x;
+    const uint8x8_t a = vld1_u8(src);
+    const uint8x8_t b = vld1_u8(src + pitch * 1);
+    const uint8x8_t c = vld1_u8(src + pitch * 2);
+    const uint8x8_t d = vld1_u8(src + pitch * 3);
+    const uint8x8_t e = vld1_u8(src + pitch * 4);
+    const uint8x8_t f = vld1_u8(src + pitch * 5);
+    const uint8x8_t g = vld1_u8(src + pitch * 6);
+    const uint8x8_t h = vld1_u8(src + pitch * 7);
+    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+                                          vreinterpret_u32_u8(e));
+    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+                                          vreinterpret_u32_u8(f));
+    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+                                          vreinterpret_u32_u8(g));
+    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+                                          vreinterpret_u32_u8(h));
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+                                          vreinterpret_u16_u32(r26_u32.val[0]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+                                          vreinterpret_u16_u32(r37_u32.val[0]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    /*
+     * after vtrn_u32
+    00 01 02 03 | 40 41 42 43
+    10 11 12 13 | 50 51 52 53
+    20 21 22 23 | 60 61 62 63
+    30 31 32 33 | 70 71 72 73
+    ---
+    * after vtrn_u16
+    00 01 20 21 | 40 41 60 61
+    02 03 22 23 | 42 43 62 63
+    10 11 30 31 | 50 51 70 71
+    12 13 32 33 | 52 53 72 73
+
+    00 01 20 21 | 40 41 60 61
+    10 11 30 31 | 50 51 70 71
+    02 03 22 23 | 42 43 62 63
+    12 13 32 33 | 52 53 72 73
+    ---
+    * after vtrn_u8
+    00 10 20 30 | 40 50 60 70
+    01 11 21 31 | 41 51 61 71
+    02 12 22 32 | 42 52 62 72
+    03 13 23 33 | 43 53 63 73
+    */
+    x.val[0] = r01_u8.val[0];
+    x.val[1] = r01_u8.val[1];
+    x.val[2] = r23_u8.val[0];
+    x.val[3] = r23_u8.val[1];
+
+    return x;
+}
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
+    uint8x8x4_t x;
+    x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0);
+    x = vld4_lane_u8(src, x, 0);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 1);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 2);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 3);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 4);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 5);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 6);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 7);
+    return x;
+}
+#endif  // VPX_INCOMPATIBLE_GCC
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+        unsigned char *s,
+        int p,
+        const unsigned char *blimit) {
+    unsigned char *src1;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+    int16x8_t q2s16, q13s16, q11s16;
+    int8x8_t d28s8, d29s8;
+    int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+    uint8x8x4_t d0u8x4;  // d6, d7, d8, d9
+    uint8x8x4_t d1u8x4;  // d10, d11, d12, d13
+    uint8x8x2_t d2u8x2;  // d12, d13
+    uint8x8x2_t d3u8x2;  // d14, d15
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    src1 = s - 2;
+    d0u8x4 = read_4x8(src1, p);
+    src1 += p * 8;
+    d1u8x4 = read_4x8(src1, p);
+
+    q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]);  // d6 d10
+    q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]);  // d8 d12
+    q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]);  // d7 d11
+    q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]);  // d9 d13
+
+    q15u8 = vabdq_u8(q5u8, q4u8);
+    q14u8 = vabdq_u8(q3u8, q6u8);
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);
+    q14u8 = vshrq_n_u8(q14u8, 1);
+    q0u8 = vdupq_n_u8(0x80);
+    q11s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);
+
+    q3u8 = veorq_u8(q3u8, q0u8);
+    q4u8 = veorq_u8(q4u8, q0u8);
+    q5u8 = veorq_u8(q5u8, q0u8);
+    q6u8 = veorq_u8(q6u8, q0u8);
+
+    q15u8 = vcgeq_u8(qblimit, q15u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
+                     vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+                      vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+    q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
+                      vreinterpretq_s8_u8(q6u8));
+
+    q2s16 = vmulq_s16(q2s16, q11s16);
+    q13s16 = vmulq_s16(q13s16, q11s16);
+
+    q11u8 = vdupq_n_u8(3);
+    q12u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+    q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+    d28s8 = vqmovn_s16(q2s16);
+    d29s8 = vqmovn_s16(q13s16);
+    q14s8 = vcombine_s8(d28s8, d29s8);
+
+    q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q14s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
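+    /* Scalar sketch of the simple-filter arithmetic performed above
+     * (per pixel pair; values are biased to signed by ^0x80, and clamp()
+     * denotes signed-char saturation):
+     *
+     *   mask   = (2 * |p0 - q0| + |p1 - q1| / 2 <= blimit) ? ~0 : 0;
+     *   f      = clamp(clamp(p1 - q1) + 3 * (q0 - p0)) & mask;
+     *   q0_new = clamp(q0 - (clamp(f + 4) >> 3));
+     *   p0_new = clamp(p0 + (clamp(f + 3) >> 3));
+     */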
+    d2u8x2.val[0] = vget_low_u8(q6u8);   // d12
+    d2u8x2.val[1] = vget_low_u8(q7u8);   // d14
+    d3u8x2.val[0] = vget_high_u8(q6u8);  // d13
+    d3u8x2.val[1] = vget_high_u8(q7u8);  // d15
+
+    src1 = s - 1;
+    write_2x8(src1, p, d2u8x2, d3u8x2);
+}
+
+void vp8_loop_filter_bvs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += 4;
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbvs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
new file mode 100644
index 000000000000..5351f4be665b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_mbloop_filter_neon(
+        uint8x16_t qblimit,  // mblimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q4r,     // p2
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r,     // q1
+        uint8x16_t *q9r) {   // q2
+    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
+    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
+    int8x16_t q0s8, q12s8, q14s8, q15s8;
+    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
+
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q1u8 = vabdq_u8(q9, q8);
+    q0u8 = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q1u8 = vmaxq_u8(q1u8, q0u8);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q12u8 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q1u8);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    q1u8 = vabdq_u8(q5, q8);
+    q12u8 = vqaddq_u8(q12u8, q12u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q0u8 = vdupq_n_u8(0x80);
+    q9 = veorq_u8(q9, q0u8);
+    q8 = veorq_u8(q8, q0u8);
+    q7 = veorq_u8(q7, q0u8);
+    q6 = veorq_u8(q6, q0u8);
+    q5 = veorq_u8(q5, q0u8);
+    q4 = veorq_u8(q4, q0u8);
+
+    q1u8 = vshrq_n_u8(q1u8, 1);
+    q12u8 = vqaddq_u8(q12u8, q1u8);
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+    q12u8 = vcgeq_u8(qblimit, q12u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                     vreinterpretq_s8_u8(q8));
+
+    q11s16 = vdupq_n_s16(3);
+    q2s16 = vmulq_s16(q2s16,
q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q15u8 = vandq_u8(q15u8, q12u8); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8)); + + q12u8 = vdupq_n_u8(3); + q11u8 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2 = vqmovn_s16(q2s16); + d3 = vqmovn_s16(q13s16); + q1s8 = vcombine_s8(d2, d3); + q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8)); + q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8)); + q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q13s8 = vshrq_n_s8(q13s8, 3); + + q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8); + q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8); + + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63); + d5 = vdup_n_s8(9); + d4 = vdup_n_s8(18); + + q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5); + q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5); + d5 = vdup_n_s8(27); + q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4); + q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4); + q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5); + q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5); + + d0 = vqshrn_n_s16(q0s16 , 7); + d1 = vqshrn_n_s16(q11s16, 7); + d24 = vqshrn_n_s16(q12s16, 7); + d25 = vqshrn_n_s16(q13s16, 7); + d28 = vqshrn_n_s16(q14s16, 7); + d29 = vqshrn_n_s16(q15s16, 7); + + q0s8 = vcombine_s8(d0, d1); + q12s8 = vcombine_s8(d24, d25); + q14s8 = vcombine_s8(d28, d29); + + q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8); + q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8); + q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8); + q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8); + q15s8 = vqsubq_s8((q7s8), q14s8); + q14s8 = vqaddq_s8((q6s8), q14s8); + + q1u8 = vdupq_n_u8(0x80); + *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8); + *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8); + *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8); + return; +} + +void vp8_mbloop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + src -= (pitch * 6); + vst1q_u8(src, q4); + src += pitch; + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + src += pitch; + vst1q_u8(src, q9); + return; +} + +void vp8_mbloop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t 
qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + u -= (pitch * 6); + v -= (pitch * 6); + vst1_u8(u, vget_low_u8(q4)); + u += pitch; + vst1_u8(v, vget_high_u8(q4)); + v += pitch; + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(u, vget_low_u8(q8)); + u += pitch; + vst1_u8(v, vget_high_u8(q8)); + v += pitch; + vst1_u8(u, vget_low_u8(q9)); + vst1_u8(v, vget_high_u8(q9)); + return; +} + +void vp8_mbloop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s1, *s2; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s1 = src - 4; + s2 = s1 + 8 * pitch; + d6 = vld1_u8(s1); + s1 += pitch; + d7 = vld1_u8(s2); + s2 += pitch; + d8 = vld1_u8(s1); + s1 += pitch; + d9 = vld1_u8(s2); + s2 += pitch; + d10 = vld1_u8(s1); + s1 += pitch; + d11 = vld1_u8(s2); + s2 += pitch; + d12 = vld1_u8(s1); + s1 += pitch; + d13 = vld1_u8(s2); + s2 += pitch; + d14 = vld1_u8(s1); + s1 += pitch; + d15 = vld1_u8(s2); + s2 += pitch; + d16 = vld1_u8(s1); + s1 += pitch; + d17 = vld1_u8(s2); + s2 += pitch; + d18 = vld1_u8(s1); + s1 += pitch; + d19 = vld1_u8(s2); + s2 += pitch; + d20 = vld1_u8(s1); + d21 = vld1_u8(s2); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + 
vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + s1 -= 7 * pitch; + s2 -= 7 * pitch; + + vst1_u8(s1, vget_low_u8(q3)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q3)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q4)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q4)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q5)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q5)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q6)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q6)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q7)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q7)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q8)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q8)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q9)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q9)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q10)); + vst1_u8(s2, vget_high_u8(q10)); + return; +} + +void vp8_mbloop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, 
q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + vs = v - 4; + d6 = vld1_u8(us); + us += pitch; + d7 = vld1_u8(vs); + vs += pitch; + d8 = vld1_u8(us); + us += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d10 = vld1_u8(us); + us += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d12 = vld1_u8(us); + us += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d14 = vld1_u8(us); + us += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d16 = vld1_u8(us); + us += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d18 = vld1_u8(us); + us += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d20 = vld1_u8(us); + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = 
q2tmp9.val[0];
+    q6 = q2tmp9.val[1];
+    q7 = q2tmp10.val[0];
+    q8 = q2tmp10.val[1];
+    q9 = q2tmp11.val[0];
+    q10 = q2tmp11.val[1];
+
+    ud = u - 4;
+    vst1_u8(ud, vget_low_u8(q3));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q4));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q5));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q6));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q7));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q8));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q9));
+    ud += pitch;
+    vst1_u8(ud, vget_low_u8(q10));
+
+    vd = v - 4;
+    vst1_u8(vd, vget_high_u8(q3));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q4));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q5));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q6));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q7));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q8));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q9));
+    vd += pitch;
+    vst1_u8(vd, vget_high_u8(q10));
+    return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
new file mode 100644
index 000000000000..373afa6ed356
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 35468;
+
+void vp8_short_idct4x4llm_neon(
+        int16_t *input,
+        unsigned char *pred_ptr,
+        int pred_stride,
+        unsigned char *dst_ptr,
+        int dst_stride) {
+    int i;
+    uint32x2_t d6u32 = vdup_n_u32(0);
+    uint8x8_t d1u8;
+    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+    uint16x8_t q1u16;
+    int16x8_t q1s16, q2s16, q3s16, q4s16;
+    int32x2x2_t v2tmp0, v2tmp1;
+    int16x4x2_t v2tmp2, v2tmp3;
+
+    d2 = vld1_s16(input);
+    d3 = vld1_s16(input + 4);
+    d4 = vld1_s16(input + 8);
+    d5 = vld1_s16(input + 12);
+
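+    /* Scalar sketch of one 1-D pass of this transform, assuming the usual
+     * Q16 fixed-point reading of the two constants above (sinpi8sqrt2 =
+     * 35468, cospi8sqrt2minus1 = 20091). vqdmulhq_n_s16(x, k) computes
+     * roughly (2 * x * k) >> 16, so the shift-by-1 and add that follow each
+     * multiply below recover (x * k) >> 16 and x + ((x * k) >> 16):
+     *
+     *   a1 = in[0] + in[2];
+     *   b1 = in[0] - in[2];
+     *   c1 = ((in[1] * 35468) >> 16) - (in[3] + ((in[3] * 20091) >> 16));
+     *   d1 = (in[1] + ((in[1] * 20091) >> 16)) + ((in[3] * 35468) >> 16);
+     *   out[0] = a1 + d1;  out[1] = b1 + c1;
+     *   out[2] = b1 - c1;  out[3] = a1 - d1;
+     */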
+    // 1st for loop
+    q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
+    q2s16 = vcombine_s16(d3, d5);
+
+    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+
+    q3s16 = vshrq_n_s16(q3s16, 1);
+    q4s16 = vshrq_n_s16(q4s16, 1);
+
+    q3s16 = vqaddq_s16(q3s16, q2s16);
+    q4s16 = vqaddq_s16(q4s16, q2s16);
+
+    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                      vreinterpret_s16_s32(v2tmp1.val[0]));
+    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                      vreinterpret_s16_s32(v2tmp1.val[1]));
+
+    // 2nd for loop
+    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
+    q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
+
+    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+
+    q3s16 = vshrq_n_s16(q3s16, 1);
+    q4s16 = vshrq_n_s16(q4s16, 1);
+
+    q3s16 = vqaddq_s16(q3s16, q2s16);
+    q4s16 = vqaddq_s16(q4s16, q2s16);
+
+    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2 = vrshr_n_s16(d2, 3);
+    d3 = vrshr_n_s16(d3, 3);
+    d4 = vrshr_n_s16(d4, 3);
+    d5 = vrshr_n_s16(d5, 3);
+
+    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                      vreinterpret_s16_s32(v2tmp1.val[0]));
+    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                      vreinterpret_s16_s32(v2tmp1.val[1]));
+
+    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
+    q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
+
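+    /* The loop below is the reconstruction step: each 16-bit residual is
+     * added to the 8-bit prediction (vaddw_u8) and saturated back into
+     * [0, 255] (vqmovun_s16) before being stored. A scalar sketch of one
+     * pixel:
+     *
+     *   int v = residual + pred[x];
+     *   dst[x] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
+     */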
+    // dc_only_idct_add
+    for (i = 0; i < 2; i++, q1s16 = q2s16) {
+        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
+        pred_ptr += pred_stride;
+        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
+        pred_ptr += pred_stride;
+
+        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
+                         vreinterpret_u8_u32(d6u32));
+        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
+        dst_ptr += dst_stride;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
+        dst_ptr += dst_stride;
+    }
+    return;
+}
diff --git a/thirdparty/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
new file mode 100644
index 000000000000..49d8d221fc7f
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -0,0 +1,1377 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+
+static const int8_t vp8_sub_pel_filters[8][8] = {
+    {0, 0, 128, 0, 0, 0, 0, 0},     /* note that 1/8 pel positions are */
+    {0, -6, 123, 12, -1, 0, 0, 0},  /* just as per alpha -0.5 bicubic */
+    {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
+    {0, -9, 93, 50, -6, 0, 0, 0},
+    {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
+    {0, -6, 50, 93, -9, 0, 0, 0},
+    {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
+    {0, -1, 12, 123, -6, 0, 0, 0},
+};
+
+void vp8_sixtap_predict8x4_neon(
+        unsigned char *src_ptr,
+        int src_pixels_per_line,
+        int xoffset,
+        int yoffset,
+        unsigned char *dst_ptr,
+        int dst_pitch) {
+    unsigned char *src;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
+    uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
+    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
+    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
+
+    if (xoffset == 0) {  // secondpass_filter8x4_only
+        // load second_pass filter
+        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
+        d0s8 = vdup_lane_s8(dtmps8, 0);
+        d1s8 = vdup_lane_s8(dtmps8, 1);
+        d2s8 = vdup_lane_s8(dtmps8, 2);
+        d3s8 = vdup_lane_s8(dtmps8, 3);
+        d4s8 = vdup_lane_s8(dtmps8, 4);
+        d5s8 = vdup_lane_s8(dtmps8, 5);
+        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
+        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
+        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
+        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
+        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
+        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
+
+        // load src data
+        src = src_ptr - src_pixels_per_line * 2;
+        d22u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d23u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d24u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d25u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d26u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d27u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d28u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d29u8 = vld1_u8(src);
+        src += src_pixels_per_line;
+        d30u8 = vld1_u8(src);
+
+        q3u16 = vmull_u8(d22u8, d0u8);
+        q4u16 = vmull_u8(d23u8, d0u8);
+        q5u16 = vmull_u8(d24u8, d0u8);
+        q6u16 = vmull_u8(d25u8, d0u8);
+
+        q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
+        q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
+        q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
+        q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
+
+        q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
+        q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
+        q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
+        q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
+
+        q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
+        q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
+        q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
+        q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
+
+        q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
+        q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
+        q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
+        q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
+
+        q7u16 = vmull_u8(d25u8, d3u8);
+        q8u16 = vmull_u8(d26u8, d3u8);
+        q9u16 = vmull_u8(d27u8, d3u8);
+        q10u16 = vmull_u8(d28u8, d3u8);
+
+        q3s16 = vreinterpretq_s16_u16(q3u16);
+        q4s16 = vreinterpretq_s16_u16(q4u16);
+        q5s16 = vreinterpretq_s16_u16(q5u16);
+        q6s16 = vreinterpretq_s16_u16(q6u16);
+        q7s16 = vreinterpretq_s16_u16(q7u16);
+        q8s16 = vreinterpretq_s16_u16(q8u16);
+        q9s16 = vreinterpretq_s16_u16(q9u16);
+        q10s16 = vreinterpretq_s16_u16(q10u16);
+
+        q7s16 =
vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = 
vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + return; + } + + // First Pass on rest 5-line data + src += src_pixels_per_line; + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = 
vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; +} + +void vp8_sixtap_predict8x8_neon( + unsigned 
char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *tmpp; + unsigned char tmp[64]; + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); 
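+            // The six filter taps always sum to 128, so the
+            // vqrshrun_n_s16(x, 7) narrowing here is the final
+            // normalization. A scalar sketch of one output pixel of the
+            // 6-tap pass (f[] holds the signed taps from
+            // vp8_sub_pel_filters):
+            //   acc = f[0]*s[-2] + f[1]*s[-1] + f[2]*s[0]
+            //       + f[3]*s[1] + f[4]*s[2] + f[5]*s[3];
+            //   dst = clamp((acc + 64) >> 7, 0, 255);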
+ d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + tmpp = tmp; + for (i = 2; i > 0; i--) { + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + 
d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + dst_ptr += dst_pitch; + } else { + vst1_u8(tmpp, d22u8); + tmpp += 8; + vst1_u8(tmpp, d23u8); + tmpp += 8; + vst1_u8(tmpp, d24u8); + tmpp += 8; + vst1_u8(tmpp, d25u8); + tmpp += 8; + } + } + if (yoffset == 0) + return; + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), 
vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x8 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + tmpp = tmp; + q9u8 = vld1q_u8(tmpp); + tmpp += 16; + q10u8 = vld1q_u8(tmpp); + tmpp += 16; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + q12u8 = vld1q_u8(tmpp); + + d18u8 = vget_low_u8(q9u8); + d19u8 = vget_high_u8(q9u8); + d20u8 = vget_low_u8(q10u8); + d21u8 = vget_high_u8(q10u8); + d22u8 = vget_low_u8(q11u8); + d23u8 = vget_high_u8(q11u8); + d24u8 = vget_low_u8(q12u8); + d25u8 = vget_high_u8(q12u8); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = 
vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; +} + +void vp8_sixtap_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *src_tmp, *dst, *tmpp; + unsigned char tmp[336]; + int i, j; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8; + uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8; + uint8x8_t d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint8x16_t q3u8, q4u8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16; + uint16x8_t q11u16, q12u16, q13u16, q15u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16; + int16x8_t q11s16, q12s16, q13s16, q15s16; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src_tmp = src_ptr - src_pixels_per_line * 2; + for (i = 0; i < 2; i++) { + src = src_tmp + i * 8; + dst = dst_ptr + i * 8; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = 
vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) { // firstpass_filter4x4_only + src = src_ptr - 2; + dst = dst_ptr; + for (i = 0; i < 8; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + + q6u16 = vmull_u8(d6u8, d0u8); + q7u16 = vmull_u8(d7u8, d0u8); + q8u16 = vmull_u8(d9u8, d0u8); + q9u16 = vmull_u8(d10u8, d0u8); + + d20u8 = vext_u8(d6u8, d7u8, 1); + d21u8 = vext_u8(d9u8, d10u8, 1); + d22u8 = vext_u8(d7u8, d8u8, 1); + d23u8 = vext_u8(d10u8, d11u8, 1); + d24u8 = vext_u8(d6u8, d7u8, 4); + d25u8 = vext_u8(d9u8, d10u8, 4); + d26u8 = vext_u8(d7u8, d8u8, 4); + d27u8 = vext_u8(d10u8, d11u8, 4); + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + + q6u16 = vmlsl_u8(q6u16, d20u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d21u8, d1u8); + q7u16 = vmlsl_u8(q7u16, d22u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d25u8, d4u8); + q7u16 = vmlsl_u8(q7u16, d26u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d27u8, d4u8); + q6u16 = vmlal_u8(q6u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + + d20u8 = vext_u8(d7u8, d8u8, 5); + d21u8 = vext_u8(d10u8, d11u8, 5); + d22u8 = vext_u8(d6u8, d7u8, 2); + d23u8 = vext_u8(d9u8, d10u8, 2); + d24u8 = vext_u8(d7u8, d8u8, 2); + d25u8 = vext_u8(d10u8, d11u8, 2); + d26u8 = vext_u8(d6u8, d7u8, 3); + d27u8 = vext_u8(d9u8, d10u8, 3); + d28u8 = vext_u8(d7u8, d8u8, 3); + d29u8 = vext_u8(d10u8, d11u8, 3); + + q7u16 = vmlal_u8(q7u16, d20u8, d5u8); + q9u16 = vmlal_u8(q9u16, 
d21u8, d5u8); + q6u16 = vmlal_u8(q6u16, d22u8, d2u8); + q8u16 = vmlal_u8(q8u16, d23u8, d2u8); + q7u16 = vmlal_u8(q7u16, d24u8, d2u8); + q9u16 = vmlal_u8(q9u16, d25u8, d2u8); + + q10u16 = vmull_u8(d26u8, d3u8); + q11u16 = vmull_u8(d27u8, d3u8); + q12u16 = vmull_u8(d28u8, d3u8); + q15u16 = vmull_u8(d29u8, d3u8); + + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q15s16 = vreinterpretq_s16_u16(q15u16); + + q6s16 = vqaddq_s16(q6s16, q10s16); + q8s16 = vqaddq_s16(q8s16, q11s16); + q7s16 = vqaddq_s16(q7s16, q12s16); + q9s16 = vqaddq_s16(q9s16, q15s16); + + d6u8 = vqrshrun_n_s16(q6s16, 7); + d7u8 = vqrshrun_n_s16(q7s16, 7); + d8u8 = vqrshrun_n_s16(q8s16, 7); + d9u8 = vqrshrun_n_s16(q9s16, 7); + + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + vst1q_u8(dst, q3u8); + dst += dst_pitch; + vst1q_u8(dst, q4u8); + dst += dst_pitch; + } + return; + } + + src = src_ptr - 2 - src_pixels_per_line * 2; + tmpp = tmp; + for (i = 0; i < 7; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d12u8 = vld1_u8(src); + d13u8 = vld1_u8(src + 8); + d14u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q8u16 = vmull_u8(d6u8, d0u8); + q9u16 = vmull_u8(d7u8, d0u8); + q10u16 = vmull_u8(d9u8, d0u8); + q11u16 = vmull_u8(d10u8, d0u8); + q12u16 = vmull_u8(d12u8, d0u8); + q13u16 = vmull_u8(d13u8, d0u8); + + d28u8 = vext_u8(d6u8, d7u8, 1); + d29u8 = vext_u8(d9u8, d10u8, 1); + d30u8 = vext_u8(d12u8, d13u8, 1); + q8u16 = vmlsl_u8(q8u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d1u8); + d28u8 = vext_u8(d7u8, d8u8, 1); + d29u8 = vext_u8(d10u8, d11u8, 1); + d30u8 = vext_u8(d13u8, d14u8, 1); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d1u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d1u8); + + d28u8 = vext_u8(d6u8, d7u8, 4); + d29u8 = vext_u8(d9u8, d10u8, 4); + d30u8 = vext_u8(d12u8, d13u8, 4); + q8u16 = vmlsl_u8(q8u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d4u8); + d28u8 = vext_u8(d7u8, d8u8, 4); + d29u8 = vext_u8(d10u8, d11u8, 4); + d30u8 = vext_u8(d13u8, d14u8, 4); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d4u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d4u8); + + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + d30u8 = vext_u8(d12u8, d13u8, 5); + q8u16 = vmlal_u8(q8u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q12u16 = vmlal_u8(q12u16, d30u8, d5u8); + d28u8 = vext_u8(d7u8, d8u8, 5); + d29u8 = vext_u8(d10u8, d11u8, 5); + d30u8 = vext_u8(d13u8, d14u8, 5); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q11u16 = vmlal_u8(q11u16, d29u8, d5u8); + q13u16 = vmlal_u8(q13u16, d30u8, d5u8); + + d28u8 = vext_u8(d6u8, d7u8, 2); + d29u8 = vext_u8(d9u8, d10u8, 2); + d30u8 = vext_u8(d12u8, d13u8, 2); + q8u16 = vmlal_u8(q8u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q12u16 = vmlal_u8(q12u16, d30u8, d2u8); + d28u8 = vext_u8(d7u8, d8u8, 2); + d29u8 = vext_u8(d10u8, d11u8, 2); + d30u8 = 
vext_u8(d13u8, d14u8, 2); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q11u16 = vmlal_u8(q11u16, d29u8, d2u8); + q13u16 = vmlal_u8(q13u16, d30u8, d2u8); + + d28u8 = vext_u8(d6u8, d7u8, 3); + d29u8 = vext_u8(d9u8, d10u8, 3); + d30u8 = vext_u8(d12u8, d13u8, 3); + d15u8 = vext_u8(d7u8, d8u8, 3); + d31u8 = vext_u8(d10u8, d11u8, 3); + d6u8 = vext_u8(d13u8, d14u8, 3); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q12s16 = vqaddq_s16(q12s16, q6s16); + + q6u16 = vmull_u8(d15u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + q3u16 = vmull_u8(d6u8, d3u8); + q3s16 = vreinterpretq_s16_u16(q3u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q13s16 = vreinterpretq_s16_u16(q13u16); + q9s16 = vqaddq_s16(q9s16, q6s16); + q11s16 = vqaddq_s16(q11s16, q7s16); + q13s16 = vqaddq_s16(q13s16, q3s16); + + d6u8 = vqrshrun_n_s16(q8s16, 7); + d7u8 = vqrshrun_n_s16(q9s16, 7); + d8u8 = vqrshrun_n_s16(q10s16, 7); + d9u8 = vqrshrun_n_s16(q11s16, 7); + d10u8 = vqrshrun_n_s16(q12s16, 7); + d11u8 = vqrshrun_n_s16(q13s16, 7); + + vst1_u8(tmpp, d6u8); + tmpp += 8; + vst1_u8(tmpp, d7u8); + tmpp += 8; + vst1_u8(tmpp, d8u8); + tmpp += 8; + vst1_u8(tmpp, d9u8); + tmpp += 8; + vst1_u8(tmpp, d10u8); + tmpp += 8; + vst1_u8(tmpp, d11u8); + tmpp += 8; + } + + // Second pass: 16x16 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + for (i = 0; i < 2; i++) { + dst = dst_ptr + 8 * i; + tmpp = tmp + 8 * i; + d18u8 = vld1_u8(tmpp); + tmpp += 16; + d19u8 = vld1_u8(tmpp); + tmpp += 16; + d20u8 = vld1_u8(tmpp); + tmpp += 16; + d21u8 = vld1_u8(tmpp); + tmpp += 16; + d22u8 = vld1_u8(tmpp); + tmpp += 16; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(tmpp); + tmpp += 16; + d24u8 = vld1_u8(tmpp); + tmpp += 16; + d25u8 = vld1_u8(tmpp); + tmpp += 16; + d26u8 = vld1_u8(tmpp); + tmpp += 16; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = 
vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; +} diff --git a/thirdparty/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/thirdparty/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c new file mode 100644 index 000000000000..9d6807af7157 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" +#include "vpx_ports/arm.h" + +static INLINE void vp8_loop_filter_neon( + uint8x16_t qblimit, // flimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q10 = vdupq_n_u8(3); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vmovl_u8(vget_low_u8(q10)); +
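For orientation, the intrinsics above are the mask and high-edge-variance setup of VP8's "normal" loop filter, vectorized across sixteen pixels of the edge at once. A minimal scalar sketch of the per-pixel arithmetic the rest of the function implements (assuming the same semantics as libvpx's C reference filter; `clamp_s8` and `vp8_filter_scalar` are hypothetical names, and `mask`/`hev` are byte masks, 0 or all-ones, produced by the limit/thresh comparisons):

```c
#include <stdint.h>

static int8_t clamp_s8(int v) { /* saturate to [-128, 127] */
    return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

/* One pixel of the VP8 normal filter; p1/p0/q0/q1 straddle the edge. */
static void vp8_filter_scalar(int8_t mask, int8_t hev,
                              uint8_t *p1, uint8_t *p0,
                              uint8_t *q0, uint8_t *q1) {
    /* bias into the signed domain, mirroring the veorq_u8(..., 0x80) above */
    int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
    int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);

    int8_t f = clamp_s8(ps1 - qs1);
    f &= hev;                          /* outer-tap term only on high-variance edges */
    f = clamp_s8(f + 3 * (qs0 - ps0));
    f &= mask;                         /* zero out pixels that failed the flatness test */

    int8_t f1 = (int8_t)(clamp_s8(f + 4) >> 3); /* the +4/+3 split, then >> 3 */
    int8_t f2 = (int8_t)(clamp_s8(f + 3) >> 3);
    *q0 = (uint8_t)clamp_s8(qs0 - f1) ^ 0x80;
    *p0 = (uint8_t)clamp_s8(ps0 + f2) ^ 0x80;

    f = (int8_t)((f1 + 1) >> 1);       /* rounded half, cf. vrshrq_n_s8(..., 1) */
    f &= (int8_t)~hev;
    *q1 = (uint8_t)clamp_s8(qs1 - f) ^ 0x80;
    *p1 = (uint8_t)clamp_s8(ps1 + f) ^ 0x80;
}
```

The NEON body that continues below reaches the same result with saturating q-register ops, so all sixteen columns are filtered per call.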
q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + q0u8 = vdupq_n_u8(0x80); + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8); + return; +} + +void vp8_loop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + src -= (pitch * 5); + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + return; +} + +void vp8_loop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + 
q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + u -= (pitch * 5); + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(u, vget_low_u8(q8)); + + v -= (pitch * 5); + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(v, vget_high_u8(q8)); + return; +} + +static INLINE void write_4x8(unsigned char *dst, int pitch, + const uint8x8x4_t result) { +#ifdef VPX_INCOMPATIBLE_GCC + /* + * uint8x8x4_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + 20 21 22 23 | 24 25 26 27 + 30 31 32 33 | 34 35 36 37 + --- + * after vtrn_u16 + 00 01 20 21 | 04 05 24 25 + 02 03 22 23 | 06 07 26 27 + 10 11 30 31 | 14 15 34 35 + 12 13 32 33 | 16 17 36 37 + --- + * after vtrn_u8 + 00 10 20 30 | 04 14 24 34 + 01 11 21 31 | 05 15 25 35 + 02 12 22 32 | 06 16 26 36 + 03 13 23 33 | 07 17 27 37 + */ + const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), + vreinterpret_u16_u8(result.val[2])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), + vreinterpret_u16_u8(result.val[3])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); + const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); + const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); + const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); + vst1_lane_u32((uint32_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 1); +#else + vst4_lane_u8(dst, result, 0); + dst += pitch; + vst4_lane_u8(dst, result, 1); + dst += pitch; + vst4_lane_u8(dst, result, 2); + dst += pitch; + vst4_lane_u8(dst, result, 3); + dst += pitch; + vst4_lane_u8(dst, result, 4); + dst += pitch; + vst4_lane_u8(dst, result, 5); + dst += pitch; + vst4_lane_u8(dst, result, 6); + dst += pitch; + vst4_lane_u8(dst, result, 7); +#endif // VPX_INCOMPATIBLE_GCC +} + +void vp8_loop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s, *d; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s = src - 4; + d6 = vld1_u8(s); + s += pitch; + d8 = vld1_u8(s); + s += pitch; + d10 = vld1_u8(s); + s += pitch; + d12 = vld1_u8(s); + s += pitch; + d14 = vld1_u8(s); + s += pitch; + d16 = vld1_u8(s); + s += pitch; + d18 = vld1_u8(s); + s += pitch; + d20 = vld1_u8(s); + s += pitch; + d7 = vld1_u8(s); + s += pitch; + d9 = vld1_u8(s); + s += 
pitch; + d11 = vld1_u8(s); + s += pitch; + d13 = vld1_u8(s); + s += pitch; + d15 = vld1_u8(s); + s += pitch; + d17 = vld1_u8(s); + s += pitch; + d19 = vld1_u8(s); + s += pitch; + d21 = vld1_u8(s); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + + d = src - 2; + write_4x8(d, pitch, q4ResultL); + d += pitch * 8; + write_4x8(d, pitch, q4ResultH); +} + +void vp8_loop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + d6 = vld1_u8(us); + us += pitch; + d8 = vld1_u8(us); + us += pitch; + d10 = vld1_u8(us); + us += pitch; + d12 = vld1_u8(us); + us += pitch; + d14 = vld1_u8(us); + us += pitch; + d16 = vld1_u8(us); + us += pitch; + d18 = vld1_u8(us); + us += pitch; + d20 = vld1_u8(us); + + vs = v - 4; + d7 = vld1_u8(vs); + vs += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d21 = vld1_u8(vs); + 
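The vcombine/vtrnq sequence that follows is a byte transpose: the sixteen 8-pixel rows just loaded become eight 16-lane vectors, one per column, so the horizontal filter kernel can be reused on a vertical edge. As a plain-C sketch of the data movement (a hypothetical helper, not part of libvpx; the NEON code gets there in three vtrn stages at 32-, 16-, and 8-bit granularity instead of a scalar loop):

```c
/* Gather each of the 8 columns of a 16x8 block into its own 16-byte row. */
static void transpose_16x8(const unsigned char src[16][8],
                           unsigned char dst[8][16]) {
    int r, c;
    for (r = 0; r < 16; ++r)
        for (c = 0; c < 8; ++c)
            dst[c][r] = src[r][c];
}
```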
+ q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + ud = u - 2; + write_4x8(ud, pitch, q4ResultL); + + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + vd = v - 2; + write_4x8(vd, pitch, q4ResultH); +} diff --git a/thirdparty/libvpx/vp8/common/blockd.c b/thirdparty/libvpx/vp8/common/blockd.c new file mode 100644 index 000000000000..1fc3cd0ca7cb --- /dev/null +++ b/thirdparty/libvpx/vp8/common/blockd.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "blockd.h" +#include "vpx_mem/vpx_mem.h" + +const unsigned char vp8_block2left[25] = +{ + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; +const unsigned char vp8_block2above[25] = +{ + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8 +}; diff --git a/thirdparty/libvpx/vp8/common/blockd.h b/thirdparty/libvpx/vp8/common/blockd.h new file mode 100644 index 000000000000..192108a06dba --- /dev/null +++ b/thirdparty/libvpx/vp8/common/blockd.h @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_BLOCKD_H_ +#define VP8_COMMON_BLOCKD_H_ + +void vpx_log(const char *format, ...); + +#include "vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "mv.h" +#include "treecoder.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*#define DCPRED 1*/ +#define DCPREDSIMTHRESH 0 +#define DCPREDCNTTHRESH 3 + +#define MB_FEATURE_TREE_PROBS 3 +#define MAX_MB_SEGMENTS 4 + +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 4 + +/* Segment Feature Masks */ +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 + +typedef struct +{ + int r, c; +} POS; + +#define PLANE_TYPE_Y_NO_DC 0 +#define PLANE_TYPE_Y2 1 +#define PLANE_TYPE_UV 2 +#define PLANE_TYPE_Y_WITH_DC 3 + + +typedef char ENTROPY_CONTEXT; +typedef struct +{ + ENTROPY_CONTEXT y1[4]; + ENTROPY_CONTEXT u[2]; + ENTROPY_CONTEXT v[2]; + ENTROPY_CONTEXT y2; +} ENTROPY_CONTEXT_PLANES; + +extern const unsigned char vp8_block2left[25]; +extern const unsigned char vp8_block2above[25]; + +#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \ + Dest = (A)+(B); + + +typedef enum +{ + KEY_FRAME = 0, + INTER_FRAME = 1 +} FRAME_TYPE; + +typedef enum +{ + DC_PRED, /* average of above and left pixels */ + V_PRED, /* vertical prediction */ + H_PRED, /* horizontal prediction */ + TM_PRED, /* Truemotion prediction */ + B_PRED, /* block based prediction, each block has its own prediction mode */ + + NEARESTMV, + NEARMV, + ZEROMV, + NEWMV, + SPLITMV, + + MB_MODE_COUNT +} MB_PREDICTION_MODE; + +/* Macroblock level features */ +typedef enum +{ + MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */ + MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */ + MB_LVL_MAX = 2 /* Number of MB level features supported */ + +} MB_LVL_FEATURES; + +/* Segment Feature Masks */ +#define SEGMENT_ALTQ 0x01 +#define SEGMENT_ALT_LF 0x02 + +#define VP8_YMODES (B_PRED + 1) +#define VP8_UV_MODES (TM_PRED + 1) + +#define VP8_MVREFS (1 + SPLITMV - NEARESTMV) + +typedef enum +{ + B_DC_PRED, /* average of above and left pixels */ + B_TM_PRED, + + B_VE_PRED, /* vertical prediction */ + B_HE_PRED, /* horizontal prediction */ + + B_LD_PRED, + B_RD_PRED, + + B_VR_PRED, + B_VL_PRED, + B_HD_PRED, + B_HU_PRED, + + LEFT4X4, + ABOVE4X4, + ZERO4X4, + NEW4X4, + + B_MODE_COUNT +} B_PREDICTION_MODE; + +#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */ +#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4) + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. 
*/ + +union b_mode_info +{ + B_PREDICTION_MODE as_mode; + int_mv mv; +}; + +typedef enum +{ + INTRA_FRAME = 0, + LAST_FRAME = 1, + GOLDEN_FRAME = 2, + ALTREF_FRAME = 3, + MAX_REF_FRAMES = 4 +} MV_REFERENCE_FRAME; + +typedef struct +{ + uint8_t mode, uv_mode; + uint8_t ref_frame; + uint8_t is_4x4; + int_mv mv; + + uint8_t partitioning; + uint8_t mb_skip_coeff; /* does this mb have coefficients at all, 1=no coefficients, 0=need decode tokens */ + uint8_t need_to_clamp_mvs; + uint8_t segment_id; /* Which set of segmentation parameters should be used for this MB */ +} MB_MODE_INFO; + +typedef struct modeinfo +{ + MB_MODE_INFO mbmi; + union b_mode_info bmi[16]; +} MODE_INFO; + +#if CONFIG_MULTI_RES_ENCODING +/* The mb-level information needed to be stored for higher-resolution encoder */ +typedef struct +{ + MB_PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame; + int_mv mv; + int dissim; /* dissimilarity level of the macroblock */ +} LOWER_RES_MB_INFO; + +/* The frame-level information needed to be stored for higher-resolution + * encoder */ +typedef struct +{ + FRAME_TYPE frame_type; + int is_frame_dropped; + // The frame rate for the lowest resolution. + double low_res_framerate; + /* The frame number of each reference frame */ + unsigned int low_res_ref_frames[MAX_REF_FRAMES]; + // The video frame counter value for the key frame, for lowest resolution. + unsigned int key_frame_counter_value; + LOWER_RES_MB_INFO *mb_info; +} LOWER_RES_FRAME_INFO; +#endif + +typedef struct blockd +{ + short *qcoeff; + short *dqcoeff; + unsigned char *predictor; + short *dequant; + + int offset; + char *eob; + + union b_mode_info bmi; +} BLOCKD; + +typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +typedef struct macroblockd +{ + DECLARE_ALIGNED(16, unsigned char, predictor[384]); + DECLARE_ALIGNED(16, short, qcoeff[400]); + DECLARE_ALIGNED(16, short, dqcoeff[400]); + DECLARE_ALIGNED(16, char, eobs[25]); + + DECLARE_ALIGNED(16, short, dequant_y1[16]); + DECLARE_ALIGNED(16, short, dequant_y1_dc[16]); + DECLARE_ALIGNED(16, short, dequant_y2[16]); + DECLARE_ALIGNED(16, short, dequant_uv[16]); + + /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */ + BLOCKD block[25]; + int fullpixel_mask; + + YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */ + YV12_BUFFER_CONFIG dst; + + MODE_INFO *mode_info_context; + int mode_info_stride; + + FRAME_TYPE frame_type; + + int up_available; + int left_available; + + unsigned char *recon_above[3]; + unsigned char *recon_left[3]; + int recon_left_stride[2]; + + /* Y,U,V,Y2 */ + ENTROPY_CONTEXT_PLANES *above_context; + ENTROPY_CONTEXT_PLANES *left_context; + + /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */ + unsigned char segmentation_enabled; + + /* 0 (do not update) 1 (update) the macroblock segmentation map. */ + unsigned char update_mb_segmentation_map; + + /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ + unsigned char update_mb_segmentation_data; + + /* 0 (do not update) 1 (update) the macroblock segmentation feature data.
*/ + unsigned char mb_segement_abs_delta; + + /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */ + /* are enabled and when enabled the probabilities used to decode the per MB flags in MB_MODE_INFO */ + vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability Tree used to code Segment number */ + + signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */ + + /* mode_based Loop filter adjustment */ + unsigned char mode_ref_lf_delta_enabled; + unsigned char mode_ref_lf_delta_update; + + /* Delta values have the range +/- MAX_LOOP_FILTER */ + signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */ + signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */ + signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */ + signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */ + + /* Distance of MB away from frame edges */ + int mb_to_left_edge; + int mb_to_right_edge; + int mb_to_top_edge; + int mb_to_bottom_edge; + + + + vp8_subpix_fn_t subpixel_predict; + vp8_subpix_fn_t subpixel_predict8x4; + vp8_subpix_fn_t subpixel_predict8x8; + vp8_subpix_fn_t subpixel_predict16x16; + + void *current_bc; + + int corrupted; + +#if ARCH_X86 || ARCH_X86_64 + /* This is an intermediate buffer currently used in sub-pixel motion search + * to keep a copy of the reference area. This buffer can be used for other + * purposes. + */ + DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]); +#endif +} MACROBLOCKD; + + +extern void vp8_build_block_doffsets(MACROBLOCKD *x); +extern void vp8_setup_block_dptrs(MACROBLOCKD *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_BLOCKD_H_ diff --git a/thirdparty/libvpx/vp8/common/coefupdateprobs.h b/thirdparty/libvpx/vp8/common/coefupdateprobs.h new file mode 100644 index 000000000000..d96a19e7478c --- /dev/null +++ b/thirdparty/libvpx/vp8/common/coefupdateprobs.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_COEFUPDATEPROBS_H_ +#define VP8_COMMON_COEFUPDATEPROBS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Update probabilities for the nodes in the token entropy tree.
+ Generated file included by entropy.c */ + +const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] = +{ + { + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, }, + {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, }, + {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, }, + {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 
255, }, + {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_COEFUPDATEPROBS_H_ diff --git a/thirdparty/libvpx/vp8/common/common.h b/thirdparty/libvpx/vp8/common/common.h new file mode 100644 index 000000000000..e58a9cc23ba9 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/common.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP8_COMMON_COMMON_H_ +#define VP8_COMMON_COMMON_H_ + +#include <assert.h> + +/* Interface header for common constant data structures and lookup tables */ + +#include "vpx_mem/vpx_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Only need this for fixed-size arrays, for structs just assign. */ + +#define vp8_copy( Dest, Src) { \ + assert( sizeof( Dest) == sizeof( Src)); \ + memcpy( Dest, Src, sizeof( Src)); \ + } + +/* Use this for variably-sized arrays. */ + +#define vp8_copy_array( Dest, Src, N) { \ + assert( sizeof( *Dest) == sizeof( *Src)); \ + memcpy( Dest, Src, N * sizeof( *Src)); \ + } + +#define vp8_zero( Dest) memset( &Dest, 0, sizeof( Dest)); + +#define vp8_zero_array( Dest, N) memset( Dest, 0, N * sizeof( *Dest)); + + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_COMMON_H_ diff --git a/thirdparty/libvpx/vp8/common/copy_c.c b/thirdparty/libvpx/vp8/common/copy_c.c new file mode 100644 index 000000000000..e3392913f635 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/copy_c.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <string.h> + +#include "./vp8_rtcd.h" +#include "vpx/vpx_integer.h" + +/* Copy 2 macroblocks to a buffer */ +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, + unsigned char *dst_ptr, int dst_stride, + int height) +{ + int r; + + for (r = 0; r < height; r++) + { + memcpy(dst_ptr, src_ptr, 32); + + src_ptr += src_stride; + dst_ptr += dst_stride; + + } +} diff --git a/thirdparty/libvpx/vp8/common/debugmodes.c b/thirdparty/libvpx/vp8/common/debugmodes.c new file mode 100644 index 000000000000..159fddc6a765 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/debugmodes.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + + +#include <stdio.h> +#include "blockd.h" + + +void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame) +{ + + int mb_row; + int mb_col; + int mb_index = 0; + FILE *mvs = fopen("mvs.stt", "a"); + + /* print out the macroblock Y modes */ + mb_index = 0; + fprintf(mvs, "Mb Modes for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + + fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode); + + mb_index++; + } + + fprintf(mvs, "\n"); + mb_index++; + } + + fprintf(mvs, "\n"); + + mb_index = 0; + fprintf(mvs, "Mb mv ref for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + + fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame); + + mb_index++; + } + + fprintf(mvs, "\n"); + mb_index++; + } + + fprintf(mvs, "\n"); + + /* print out the macroblock UV modes */ + mb_index = 0; + fprintf(mvs, "UV Modes for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + + fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode); + + mb_index++; + } + + mb_index++; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + /* print out the block modes */ + fprintf(mvs, "Mbs for Frame %d\n", frame); + { + int b_row; + + for (b_row = 0; b_row < 4 * rows; b_row++) + { + int b_col; + int bindex; + + for (b_col = 0; b_col < 4 * cols; b_col++) + { + mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2); + bindex = (b_row & 3) * 4 + (b_col & 3); + + if (mi[mb_index].mbmi.mode == B_PRED) + fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode); + else + fprintf(mvs, "xx "); + + } + + fprintf(mvs, "\n"); + } + } + fprintf(mvs, "\n"); + + /* print out the macroblock mvs */ + mb_index = 0; + fprintf(mvs, "MVs for Frame %d\n", frame); + + for (mb_row = 0; mb_row < rows; mb_row++) + { + for (mb_col = 0; mb_col < cols; mb_col++) + { + fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2); + + mb_index++; + } + + mb_index++; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + + /* print out the block mvs */ + fprintf(mvs, "MVs for Frame %d\n", frame); + { + int b_row; + + for (b_row = 0; b_row < 4 * rows; b_row++) + { + int b_col; + int bindex; + + for (b_col = 0; b_col < 4 * cols; b_col++) + { + mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2); + bindex = (b_row & 3) * 4 + (b_col & 3); + fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col); + + } + + fprintf(mvs, "\n"); + } + } + fprintf(mvs, "\n"); + + + fclose(mvs); +} diff --git a/thirdparty/libvpx/vp8/common/default_coef_probs.h b/thirdparty/libvpx/vp8/common/default_coef_probs.h new file mode 100644 index 000000000000..4d69e4be6645 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/default_coef_probs.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+*/ + +#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#define VP8_COMMON_DEFAULT_COEF_PROBS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*Generated file, included by entropy.c*/ + + +static const vp8_prob default_coef_probs [BLOCK_TYPES] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = +{ + { /* Block Type ( 0 ) */ + { /* Coeff Band ( 0 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, + { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, + { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, + { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, + { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, + { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, + { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, + { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, + { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, + { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 }, + { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, + { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, + { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 1 ) */ + { /* Coeff Band ( 0 )*/ + { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, + { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, + { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, + { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, + { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, + { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, + { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, + { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, + { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, + { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, + { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, + { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, + { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 }, + { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, + { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 251, 255, 213, 
255, 128, 128, 128, 128, 128 }, + { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 2 ) */ + { /* Coeff Band ( 0 )*/ + { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 }, + { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, + { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, + { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, + { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, + { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, + { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 } + } + }, + { /* Block Type ( 3 ) */ + { /* Coeff Band ( 0 )*/ + { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, + { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, + { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 } + }, + { /* Coeff Band ( 1 )*/ + { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, + { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, + { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 } + }, + { /* Coeff Band ( 2 )*/ + { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, + { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, + { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 } + }, + { /* Coeff Band ( 3 )*/ + { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 }, + { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, + { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 } + }, + { /* Coeff Band ( 4 )*/ + { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, + { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 } + }, + { /* Coeff Band ( 5 )*/ + { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, + { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, + { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 } + }, + { /* Coeff Band ( 6 )*/ + { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, + { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 } + }, + { /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 } + } 
+ } +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/thirdparty/libvpx/vp8/common/dequantize.c b/thirdparty/libvpx/vp8/common/dequantize.c new file mode 100644 index 000000000000..f8b04fa4ee55 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/dequantize.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequantize_b_c(BLOCKD *d, short *DQC) +{ + int i; + short *DQ = d->dqcoeff; + short *Q = d->qcoeff; + + for (i = 0; i < 16; i++) + { + DQ[i] = Q[i] * DQC[i]; + } +} + +void vp8_dequant_idct_add_c(short *input, short *dq, + unsigned char *dest, int stride) +{ + int i; + + for (i = 0; i < 16; i++) + { + input[i] = dq[i] * input[i]; + } + + vp8_short_idct4x4llm_c(input, dest, stride, dest, stride); + + memset(input, 0, 32); + +} diff --git a/thirdparty/libvpx/vp8/common/entropy.c b/thirdparty/libvpx/vp8/common/entropy.c new file mode 100644 index 000000000000..c00e565f0637 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/entropy.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "entropy.h" +#include "blockd.h" +#include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" + +#include "coefupdateprobs.h" + +DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) = +{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) = +{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; + +DECLARE_ALIGNED(16, const unsigned char, + vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = +{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0}; + +DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = +{ + 0, 1, 4, 8, + 5, 2, 3, 6, + 9, 12, 13, 10, + 7, 11, 14, 15, +}; + +DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) = +{ + 1, 2, 6, 7, + 3, 5, 8, 13, + 4, 9, 12, 14, + 10, 11, 15, 16 +}; + +/* vp8_default_zig_zag_mask generated with: + + void vp8_init_scan_order_mask() + { + int i; + + for (i = 0; i < 16; i++) + { + vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i; + } + + } +*/ +DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) = +{ + 1, 2, 32, 64, + 4, 16, 128, 4096, + 8, 256, 2048, 8192, + 512, 1024, 16384, -32768 +}; + +const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6}; + +/* Array indices are identical to previously-existing CONTEXT_NODE indices */ + +const vp8_tree_index vp8_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ +{ + -DCT_EOB_TOKEN, 2, /* 0 = EOB */ + -ZERO_TOKEN, 4, /* 1 = ZERO */ + -ONE_TOKEN, 6, /* 2 = ONE */ + 8, 12, /* 3 = LOW_VAL */ + -TWO_TOKEN, 10, /* 4 = TWO */ + -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ + 14, 16, /* 6 = HIGH_LOW */ + -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ + 18, 20, /* 8 = CAT_THREEFOUR */ + -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ + -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ +}; + +/* vp8_coef_encodings generated with: + vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree); +*/ +vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] = +{ + {2, 2}, + {6, 3}, + {28, 5}, + {58, 6}, + {59, 6}, + {60, 6}, + {61, 6}, + {124, 7}, + {125, 7}, + {126, 7}, + {127, 7}, + {0, 1} +}; + +/* Trees for extra bits. 
Probabilities are constant and + do not depend on previously encoded bits */ + +static const vp8_prob Pcat1[] = { 159}; +static const vp8_prob Pcat2[] = { 165, 145}; +static const vp8_prob Pcat3[] = { 173, 148, 140}; +static const vp8_prob Pcat4[] = { 176, 155, 140, 135}; +static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130}; +static const vp8_prob Pcat6[] = +{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129}; + + +/* tree index tables generated with: + + void init_bit_tree(vp8_tree_index *p, int n) + { + int i = 0; + + while (++i < n) + { + p[0] = p[1] = i << 1; + p += 2; + } + + p[0] = p[1] = 0; + } + + void init_bit_trees() + { + init_bit_tree(cat1, 1); + init_bit_tree(cat2, 2); + init_bit_tree(cat3, 3); + init_bit_tree(cat4, 4); + init_bit_tree(cat5, 5); + init_bit_tree(cat6, 11); + } +*/ + +static const vp8_tree_index cat1[2] = { 0, 0 }; +static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 }; +static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 }; +static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 }; +static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 }; +static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, + 14, 14, 16, 16, 18, 18, 20, 20, 0, 0 }; + +const vp8_extra_bit_struct vp8_extra_bits[12] = +{ + { 0, 0, 0, 0}, + { 0, 0, 0, 1}, + { 0, 0, 0, 2}, + { 0, 0, 0, 3}, + { 0, 0, 0, 4}, + { cat1, Pcat1, 1, 5}, + { cat2, Pcat2, 2, 7}, + { cat3, Pcat3, 3, 11}, + { cat4, Pcat4, 4, 19}, + { cat5, Pcat5, 5, 35}, + { cat6, Pcat6, 11, 67}, + { 0, 0, 0, 0} +}; + +#include "default_coef_probs.h" + +void vp8_default_coef_probs(VP8_COMMON *pc) +{ + memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs)); +} + diff --git a/thirdparty/libvpx/vp8/common/entropy.h b/thirdparty/libvpx/vp8/common/entropy.h new file mode 100644 index 000000000000..a90bab4bac2e --- /dev/null +++ b/thirdparty/libvpx/vp8/common/entropy.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP8_COMMON_ENTROPY_H_ +#define VP8_COMMON_ENTROPY_H_ + +#include "treecoder.h" +#include "blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Coefficient token alphabet */ + +#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ +#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ +#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ +#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ +#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ +#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ +#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ +#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ +#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ +#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ +#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */ +#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ + +#define MAX_ENTROPY_TOKENS 12 +#define ENTROPY_NODES 11 + +extern const vp8_tree_index vp8_coef_tree[]; + +extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS]; + +typedef struct +{ + vp8_tree_p tree; + const vp8_prob *prob; + int Len; + int base_val; +} vp8_extra_bit_struct; + +extern const vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */ + +#define PROB_UPDATE_BASELINE_COST 7 + +#define MAX_PROB 255 +#define DCT_MAX_VALUE 2048 + + +/* Coefficients are predicted via a 3-dimensional probability table. */ + +/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */ + +#define BLOCK_TYPES 4 + +/* Middle dimension is a coarsening of the coefficient's + position within the 4x4 DCT. */ + +#define COEF_BANDS 8 +extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]); + +/* Inside dimension is 3-valued measure of nearby complexity, that is, + the extent to which nearby coefficients are nonzero. For the first + coefficient (DC, unless block type is 0), we look at the (already encoded) + blocks above and to the left of the current block. The context index is + then the number (0,1,or 2) of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is roughly the size of the + most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1). + Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). 
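+ As a concrete illustration: the DC coefficient of a type 3 (Y with DC) block whose above neighbor has nonzero coefficients but whose left neighbor does not is coded with coef band vp8_coef_bands[0] = 0 and context 1, since exactly one of the two neighboring blocks is nonzero.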
*/ + +/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ +# define PREV_COEF_CONTEXTS 3 + +extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]); + +extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + + +struct VP8Common; +void vp8_default_coef_probs(struct VP8Common *); + +extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]); +extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]); +extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]); +extern const int vp8_mb_feature_data_bits[MB_LVL_MAX]; + +void vp8_coef_tree_initialize(void); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ENTROPY_H_ diff --git a/thirdparty/libvpx/vp8/common/entropymode.c b/thirdparty/libvpx/vp8/common/entropymode.c new file mode 100644 index 000000000000..8981a8d3c2a8 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/entropymode.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#define USE_PREBUILT_TABLES + +#include "entropymode.h" +#include "entropy.h" +#include "vpx_mem/vpx_mem.h" + +#include "vp8_entropymodedata.h" + +int vp8_mv_cont(const int_mv *l, const int_mv *a) +{ + int lez = (l->as_int == 0); + int aez = (a->as_int == 0); + int lea = (l->as_int == a->as_int); + + if (lea && lez) + return SUBMVREF_LEFT_ABOVE_ZED; + + if (lea) + return SUBMVREF_LEFT_ABOVE_SAME; + + if (aez) + return SUBMVREF_ABOVE_ZED; + + if (lez) + return SUBMVREF_LEFT_ZED; + + return SUBMVREF_NORMAL; +} + +static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25}; + +const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] = +{ + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1 , 34 }, + { 208, 1 , 1 } +}; + + + +const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] = +{ + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 1, 1, 1, + 1, 1, 1, 1, + }, + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + }, + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3, + }, + { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + } +}; + +const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16}; + +const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150}; + + +/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ + +const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */ +{ + -B_DC_PRED, 2, /* 0 = DC_NODE */ + -B_TM_PRED, 4, /* 1 = TM_NODE */ + -B_VE_PRED, 6, /* 2 = VE_NODE */ + 8, 12, /* 3 = COM_NODE */ + -B_HE_PRED, 10, /* 4 = HE_NODE */ + -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ + -B_LD_PRED, 14, /* 6 = LD_NODE */ + -B_VL_PRED, 16, /* 7 = VL_NODE */ + -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */ +}; + +/* Again, these trees use the same probability indices as their + explicitly-programmed predecessors. 
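+ (A vp8_tree_index array is read in pairs: entry 2n is taken on a 0 bit and entry 2n+1 on a 1 bit; a negative entry is a leaf holding the negated token, while a non-negative entry is the index of the next pair. Read this way, vp8_ymode_tree below gives the codes DC_PRED = 0, V_PRED = 100, H_PRED = 101, TM_PRED = 110 and B_PRED = 111.)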
*/ + +const vp8_tree_index vp8_ymode_tree[8] = +{ + -DC_PRED, 2, + 4, 6, + -V_PRED, -H_PRED, + -TM_PRED, -B_PRED +}; + +const vp8_tree_index vp8_kf_ymode_tree[8] = +{ + -B_PRED, 2, + 4, 6, + -DC_PRED, -V_PRED, + -H_PRED, -TM_PRED +}; + +const vp8_tree_index vp8_uv_mode_tree[6] = +{ + -DC_PRED, 2, + -V_PRED, 4, + -H_PRED, -TM_PRED +}; + +const vp8_tree_index vp8_mbsplit_tree[6] = +{ + -3, 2, + -2, 4, + -0, -1 +}; + +const vp8_tree_index vp8_mv_ref_tree[8] = +{ + -ZEROMV, 2, + -NEARESTMV, 4, + -NEARMV, 6, + -NEWMV, -SPLITMV +}; + +const vp8_tree_index vp8_sub_mv_ref_tree[6] = +{ + -LEFT4X4, 2, + -ABOVE4X4, 4, + -ZERO4X4, -NEW4X4 +}; + +const vp8_tree_index vp8_small_mvtree [14] = +{ + 2, 8, + 4, 6, + -0, -1, + -2, -3, + 10, 12, + -4, -5, + -6, -7 +}; + +void vp8_init_mbmode_probs(VP8_COMMON *x) +{ + memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob)); + memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob)); + memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob)); +} + +void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1]) +{ + memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob)); +} + diff --git a/thirdparty/libvpx/vp8/common/entropymode.h b/thirdparty/libvpx/vp8/common/entropymode.h new file mode 100644 index 000000000000..81bdfc4b8bdd --- /dev/null +++ b/thirdparty/libvpx/vp8/common/entropymode.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP8_COMMON_ENTROPYMODE_H_ +#define VP8_COMMON_ENTROPYMODE_H_ + +#include "onyxc_int.h" +#include "treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum +{ + SUBMVREF_NORMAL, + SUBMVREF_LEFT_ZED, + SUBMVREF_ABOVE_ZED, + SUBMVREF_LEFT_ABOVE_SAME, + SUBMVREF_LEFT_ABOVE_ZED +} sumvfref_t; + +typedef int vp8_mbsplit[16]; + +#define VP8_NUMMBSPLITS 4 + +extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS]; + +extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS]; /* # of subsets */ + +extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1]; + +extern int vp8_mv_cont(const int_mv *l, const int_mv *a); +#define SUBMVREF_COUNT 5 +extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1]; + + +extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES]; + + +extern const vp8_tree_index vp8_bmode_tree[]; + +extern const vp8_tree_index vp8_ymode_tree[]; +extern const vp8_tree_index vp8_kf_ymode_tree[]; +extern const vp8_tree_index vp8_uv_mode_tree[]; + +extern const vp8_tree_index vp8_mbsplit_tree[]; +extern const vp8_tree_index vp8_mv_ref_tree[]; +extern const vp8_tree_index vp8_sub_mv_ref_tree[]; + +extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES]; +extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES]; +extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES]; +extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES]; +extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS]; + +/* Inter mode values do not start at zero */ + +extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS]; +extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS]; + +extern const vp8_tree_index vp8_small_mvtree[]; + +extern const struct vp8_token_struct vp8_small_mvencodings[8]; + +/* Key frame default mode probs */ +extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES] +[VP8_BINTRAMODES-1]; +extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1]; +extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1]; + +void vp8_init_mbmode_probs(VP8_COMMON *x); +void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]); +void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ENTROPYMODE_H_ diff --git a/thirdparty/libvpx/vp8/common/entropymv.c b/thirdparty/libvpx/vp8/common/entropymv.c new file mode 100644 index 000000000000..e5df1f095555 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/entropymv.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "entropymv.h" + +const MV_CONTEXT vp8_mv_update_probs[2] = +{ + {{ + 237, + 246, + 253, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 250, 250, 252, 254, 254 + }}, + {{ + 231, + 243, + 245, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 251, 251, 254, 254, 254 + }} +}; +const MV_CONTEXT vp8_default_mv_context[2] = +{ + {{ + /* row */ + 162, /* is short */ + 128, /* sign */ + 225, 146, 172, 147, 214, 39, 156, /* short tree */ + 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */ + }}, + + + + {{ + /* same for column */ + 164, /* is short */ + 128, + 204, 170, 119, 235, 140, 230, 228, + 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */ + + }} +}; diff --git a/thirdparty/libvpx/vp8/common/entropymv.h b/thirdparty/libvpx/vp8/common/entropymv.h new file mode 100644 index 000000000000..42840d58ad26 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/entropymv.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ENTROPYMV_H_ +#define VP8_COMMON_ENTROPYMV_H_ + +#include "treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum +{ + mv_max = 1023, /* max absolute value of a MV component */ + MVvals = (2 * mv_max) + 1, /* # possible values "" */ + mvfp_max = 255, /* max absolute value of a full pixel MV component */ + MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */ + + mvlong_width = 10, /* Large MVs have 9 bit magnitudes */ + mvnum_short = 8, /* magnitudes 0 through 7 */ + + /* probability offsets for coding each MV component */ + + mvpis_short = 0, /* short (<= 7) vs long (>= 8) */ + MVPsign, /* sign for non-zero */ + MVPshort, /* 8 short values = 7-position tree */ + + MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */ + MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */ +}; + +typedef struct mv_context +{ + vp8_prob prob[MVPcount]; /* often come in row, col pairs */ +} MV_CONTEXT; + +extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ENTROPYMV_H_ diff --git a/thirdparty/libvpx/vp8/common/extend.c b/thirdparty/libvpx/vp8/common/extend.c new file mode 100644 index 000000000000..2d938ad7825b --- /dev/null +++ b/thirdparty/libvpx/vp8/common/extend.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "extend.h" +#include "vpx_mem/vpx_mem.h" + + +static void copy_and_extend_plane +( + unsigned char *s, /* source */ + int sp, /* source pitch */ + unsigned char *d, /* destination */ + int dp, /* destination pitch */ + int h, /* height */ + int w, /* width */ + int et, /* extend top border */ + int el, /* extend left border */ + int eb, /* extend bottom border */ + int er /* extend right border */ +) +{ + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + int linesize; + + /* copy the left and right most columns out */ + src_ptr1 = s; + src_ptr2 = s + w - 1; + dest_ptr1 = d - el; + dest_ptr2 = d + w; + + for (i = 0; i < h; i++) + { + memset(dest_ptr1, src_ptr1[0], el); + memcpy(dest_ptr1 + el, src_ptr1, w); + memset(dest_ptr2, src_ptr2[0], er); + src_ptr1 += sp; + src_ptr2 += sp; + dest_ptr1 += dp; + dest_ptr2 += dp; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = d - el; + src_ptr2 = d + dp * (h - 1) - el; + dest_ptr1 = d + dp * (-et) - el; + dest_ptr2 = d + dp * (h) - el; + linesize = el + er + w; + + for (i = 0; i < et; i++) + { + memcpy(dest_ptr1, src_ptr1, linesize); + dest_ptr1 += dp; + } + + for (i = 0; i < eb; i++) + { + memcpy(dest_ptr2, src_ptr2, linesize); + dest_ptr2 += dp; + } +} + + +void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) +{ + int et = dst->border; + int el = dst->border; + int eb = dst->border + dst->y_height - src->y_height; + int er = dst->border + dst->y_width - src->y_width; + + copy_and_extend_plane(src->y_buffer, src->y_stride, + dst->y_buffer, dst->y_stride, + src->y_height, src->y_width, + et, el, eb, er); + + et = dst->border >> 1; + el = dst->border >> 1; + eb = (dst->border >> 1) + dst->uv_height - src->uv_height; + er = (dst->border >> 1) + dst->uv_width - src->uv_width; + + copy_and_extend_plane(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, + src->uv_height, src->uv_width, + et, el, eb, er); + + copy_and_extend_plane(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, + src->uv_height, src->uv_width, + et, el, eb, er); +} + + +void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int srcy, int srcx, + int srch, int srcw) +{ + int et = dst->border; + int el = dst->border; + int eb = dst->border + dst->y_height - src->y_height; + int er = dst->border + dst->y_width - src->y_width; + int src_y_offset = srcy * src->y_stride + srcx; + int dst_y_offset = srcy * dst->y_stride + srcx; + int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); + int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); + + /* If the side is not touching the bounder then don't extend. 
*/ + if (srcy) + et = 0; + if (srcx) + el = 0; + if (srcy + srch != src->y_height) + eb = 0; + if (srcx + srcw != src->y_width) + er = 0; + + copy_and_extend_plane(src->y_buffer + src_y_offset, + src->y_stride, + dst->y_buffer + dst_y_offset, + dst->y_stride, + srch, srcw, + et, el, eb, er); + + et = (et + 1) >> 1; + el = (el + 1) >> 1; + eb = (eb + 1) >> 1; + er = (er + 1) >> 1; + srch = (srch + 1) >> 1; + srcw = (srcw + 1) >> 1; + + copy_and_extend_plane(src->u_buffer + src_uv_offset, + src->uv_stride, + dst->u_buffer + dst_uv_offset, + dst->uv_stride, + srch, srcw, + et, el, eb, er); + + copy_and_extend_plane(src->v_buffer + src_uv_offset, + src->uv_stride, + dst->v_buffer + dst_uv_offset, + dst->uv_stride, + srch, srcw, + et, el, eb, er); +} + + +/* note the extension is only for the last row, for intra prediction purpose */ +void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, + unsigned char *YPtr, + unsigned char *UPtr, + unsigned char *VPtr) +{ + int i; + + YPtr += ybf->y_stride * 14; + UPtr += ybf->uv_stride * 6; + VPtr += ybf->uv_stride * 6; + + for (i = 0; i < 4; i++) + { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } + + YPtr += ybf->y_stride; + UPtr += ybf->uv_stride; + VPtr += ybf->uv_stride; + + for (i = 0; i < 4; i++) + { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } +} diff --git a/thirdparty/libvpx/vp8/common/extend.h b/thirdparty/libvpx/vp8/common/extend.h new file mode 100644 index 000000000000..068f4ac5236f --- /dev/null +++ b/thirdparty/libvpx/vp8/common/extend.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_EXTEND_H_ +#define VP8_COMMON_EXTEND_H_ + +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr); +void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int srcy, int srcx, + int srch, int srcw); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_EXTEND_H_ diff --git a/thirdparty/libvpx/vp8/common/filter.c b/thirdparty/libvpx/vp8/common/filter.c new file mode 100644 index 000000000000..84c608effaa8 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/filter.c @@ -0,0 +1,493 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "filter.h" +#include "./vp8_rtcd.h" + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = +{ + { 128, 0 }, + { 112, 16 }, + { 96, 32 }, + { 80, 48 }, + { 64, 64 }, + { 48, 80 }, + { 32, 96 }, + { 16, 112 } +}; + +DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) = +{ + + { 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */ + { 0, -6, 123, 12, -1, 0 }, + { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0 }, + { 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0 }, + { 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0 }, +}; + +static void filter_block2d_first_pass +( + unsigned char *src_ptr, + int *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[0] * vp8_filter[2]) + + ((int)src_ptr[pixel_step] * vp8_filter[3]) + + ((int)src_ptr[2*pixel_step] * vp8_filter[4]) + + ((int)src_ptr[3*pixel_step] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = Temp; + src_ptr++; + } + + /* Next row... */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +static void filter_block2d_second_pass +( + int *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) + { + for (j = 0; j < output_width; j++) + { + /* Apply filter */ + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[0] * vp8_filter[2]) + + ((int)src_ptr[pixel_step] * vp8_filter[3]) + + ((int)src_ptr[2*pixel_step] * vp8_filter[4]) + + ((int)src_ptr[3*pixel_step] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = (unsigned char)Temp; + src_ptr++; + } + + /* Start next row */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} + + +static void filter_block2d +( + unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + int output_pitch, + const short *HFilter, + const short *VFilter +) +{ + int FData[9*4]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter); + + /* then filter verticaly... 
*/ + filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter); +} + + +void vp8_sixtap_predict4x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter); +} +void vp8_sixtap_predict8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[13*16]; /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter); + + + /* then filter vertically... */ + filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter); + +} + +void vp8_sixtap_predict8x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[13*16]; /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter); + + + /* then filter vertically... */ + filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter); + +} + +void vp8_sixtap_predict16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + int FData[21*24]; /* Temp data buffer used in filtering */ + + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter); + + /* then filter vertically... */ + filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter); + +} + + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_stride : Stride of source block. + * UINT32 height : Block height. + * UINT32 width : Block width. + * INT16 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block + * in the horizontal direction to produce the filtered output + * block. Used to implement first-pass of 2-D separable filter. + * + * SPECIAL NOTES : Produces UINT16 output to retain precision for next pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT.
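+ * For example, the half-pel taps { 64, 64 } from vp8_bilinear_filters produce (64 * a + 64 * b + 64) >> 7, the rounded average of the two neighboring source pixels.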
+ * + ****************************************************************************/ +static void filter_block2d_bil_first_pass +( + unsigned char *src_ptr, + unsigned short *dst_ptr, + unsigned int src_stride, + unsigned int height, + unsigned int width, + const short *vp8_filter +) +{ + unsigned int i, j; + + for (i = 0; i < height; i++) + { + for (j = 0; j < width; j++) + { + /* Apply bilinear filter */ + dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[1] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT; + src_ptr++; + } + + /* Next row... */ + src_ptr += src_stride - width; + dst_ptr += width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : UINT16 *src_ptr : Pointer to source block. + * UINT32 dst_pitch : Destination block pitch. + * UINT32 height : Block height. + * UINT32 width : Block width. + * INT16 *vp8_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block + * in the vertical direction to produce the filtered output + * block. Used to implement second-pass of 2-D separable filter. + * + * SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass. + * Two filter taps should sum to VP8_FILTER_WEIGHT. + * + ****************************************************************************/ +static void filter_block2d_bil_second_pass +( + unsigned short *src_ptr, + unsigned char *dst_ptr, + int dst_pitch, + unsigned int height, + unsigned int width, + const short *vp8_filter +) +{ + unsigned int i, j; + int Temp; + + for (i = 0; i < height; i++) + { + for (j = 0; j < width; j++) + { + /* Apply filter */ + Temp = ((int)src_ptr[0] * vp8_filter[0]) + + ((int)src_ptr[width] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2); + dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); + src_ptr++; + } + + /* Next row... */ + dst_ptr += dst_pitch; + } +} + + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_pitch : Stride of source block. + * UINT32 dst_pitch : Stride of destination block. + * INT16 *HFilter : Array of 2 horizontal filter taps. + * INT16 *VFilter : Array of 2 vertical filter taps. + * INT32 Width : Block width + * INT32 Height : Block height + * + * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : 2-D filters an input block by applying a 2-tap + * bi-linear filter horizontally followed by a 2-tap + * bi-linear filter vertically on the result. + * + * SPECIAL NOTES : The largest block size that can be handled here is 16x16 + * + ****************************************************************************/ +static void filter_block2d_bil +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + unsigned int src_pitch, + unsigned int dst_pitch, + const short *HFilter, + const short *VFilter, + int Width, + int Height +) +{ + + unsigned short FData[17*16]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); + + /* then 1-D vertically...
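+ (the first pass emits Height + 1 rows because the vertical 2-tap filter reads src_ptr[0] and src_ptr[width] for every output pixel)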
*/ + filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter); +} + + +void vp8_bilinear_predict4x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; +#if 0 + { + int i; + unsigned char temp1[16]; + unsigned char temp2[16]; + + bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); + filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); + + for (i = 0; i < 16; i++) + { + if (temp1[i] != temp2[i]) + { + bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4); + filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4); + } + } + } +#endif + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); + +} + +void vp8_bilinear_predict8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); + +} + +void vp8_bilinear_predict8x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); + +} + +void vp8_bilinear_predict16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + const short *HFilter; + const short *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); +} diff --git a/thirdparty/libvpx/vp8/common/filter.h b/thirdparty/libvpx/vp8/common/filter.h new file mode 100644 index 000000000000..cfba775fce47 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/filter.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP8_COMMON_FILTER_H_ +#define VP8_COMMON_FILTER_H_ + +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP8_FILTER_WEIGHT 128 +#define VP8_FILTER_SHIFT 7 + +extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]); +extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_FILTER_H_ diff --git a/thirdparty/libvpx/vp8/common/findnearmv.c b/thirdparty/libvpx/vp8/common/findnearmv.c new file mode 100644 index 000000000000..e8ee40f56c6d --- /dev/null +++ b/thirdparty/libvpx/vp8/common/findnearmv.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "findnearmv.h" + +const unsigned char vp8_mbsplit_offset[4][16] = { + { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} +}; + +/* Predict motion vectors using those from already-decoded nearby blocks. + Note that we only consider one 4x4 subblock from each candidate 16x16 + macroblock. */ +void vp8_find_near_mvs +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv *nearest, + int_mv *nearby, + int_mv *best_mv, + int cnt[4], + int refframe, + int *ref_frame_sign_bias +) +{ + const MODE_INFO *above = here - xd->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + int_mv near_mvs[4]; + int_mv *mv = near_mvs; + int *cntx = cnt; + enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV}; + + /* Zero accumulators */ + mv[0].as_int = mv[1].as_int = mv[2].as_int = 0; + cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0; + + /* Process above */ + if (above->mbmi.ref_frame != INTRA_FRAME) + { + if (above->mbmi.mv.as_int) + { + (++mv)->as_int = above->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias); + ++cntx; + } + + *cntx += 2; + } + + /* Process left */ + if (left->mbmi.ref_frame != INTRA_FRAME) + { + if (left->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = left->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) + { + (++mv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 2; + } + else + cnt[CNT_INTRA] += 2; + } + + /* Process above left */ + if (aboveleft->mbmi.ref_frame != INTRA_FRAME) + { + if (aboveleft->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = aboveleft->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != mv->as_int) + { + (++mv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 1; + } + else + cnt[CNT_INTRA] += 1; + } + + /* If we have three distinct MV's ... 
*/ + if (cnt[CNT_SPLITMV]) + { + /* See if above-left MV can be merged with NEAREST */ + if (mv->as_int == near_mvs[CNT_NEAREST].as_int) + cnt[CNT_NEAREST] += 1; + } + + cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV) + + (left->mbmi.mode == SPLITMV)) * 2 + + (aboveleft->mbmi.mode == SPLITMV); + + /* Swap near and nearest if necessary */ + if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) + { + int tmp; + tmp = cnt[CNT_NEAREST]; + cnt[CNT_NEAREST] = cnt[CNT_NEAR]; + cnt[CNT_NEAR] = tmp; + tmp = near_mvs[CNT_NEAREST].as_int; + near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; + near_mvs[CNT_NEAR].as_int = tmp; + } + + /* Use near_mvs[0] to store the "best" MV */ + if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]) + near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST]; + + /* Set up return values */ + best_mv->as_int = near_mvs[0].as_int; + nearest->as_int = near_mvs[CNT_NEAREST].as_int; + nearby->as_int = near_mvs[CNT_NEAR].as_int; +} + + +static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd) +{ + inv->as_mv.row = src->as_mv.row * -1; + inv->as_mv.col = src->as_mv.col * -1; + vp8_clamp_mv2(inv, xd); + vp8_clamp_mv2(src, xd); +} + + +int vp8_find_near_mvs_bias +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv mode_mv_sb[2][MB_MODE_COUNT], + int_mv best_mv_sb[2], + int cnt[4], + int refframe, + int *ref_frame_sign_bias +) +{ + int sign_bias = ref_frame_sign_bias[refframe]; + + vp8_find_near_mvs(xd, + here, + &mode_mv_sb[sign_bias][NEARESTMV], + &mode_mv_sb[sign_bias][NEARMV], + &best_mv_sb[sign_bias], + cnt, + refframe, + ref_frame_sign_bias); + + invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV], + &mode_mv_sb[sign_bias][NEARESTMV], xd); + invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV], + &mode_mv_sb[sign_bias][NEARMV], xd); + invert_and_clamp_mvs(&best_mv_sb[!sign_bias], + &best_mv_sb[sign_bias], xd); + + return sign_bias; +} + + +vp8_prob *vp8_mv_ref_probs( + vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4] +) +{ + p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0]; + p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1]; + p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2]; + p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3]; + /*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/ + return p; +} + diff --git a/thirdparty/libvpx/vp8/common/findnearmv.h b/thirdparty/libvpx/vp8/common/findnearmv.h new file mode 100644 index 000000000000..472a7b5d8da0 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/findnearmv.h @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP8_COMMON_FINDNEARMV_H_ +#define VP8_COMMON_FINDNEARMV_H_ + +#include "./vpx_config.h" +#include "mv.h" +#include "blockd.h" +#include "modecont.h" +#include "treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +static INLINE void mv_bias(int refmb_ref_frame_sign_bias, int refframe, + int_mv *mvp, const int *ref_frame_sign_bias) +{ + if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) + { + mvp->as_mv.row *= -1; + mvp->as_mv.col *= -1; + } +} + +#define LEFT_TOP_MARGIN (16 << 3) +#define RIGHT_BOTTOM_MARGIN (16 << 3) +static INLINE void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) +{ + if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN)) + mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN; + else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN) + mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; + + if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN)) + mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN; + else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN) + mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; +} + +static INLINE void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, + int mb_to_right_edge, int mb_to_top_edge, + int mb_to_bottom_edge) +{ + mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ? + mb_to_left_edge : mv->as_mv.col; + mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ? + mb_to_right_edge : mv->as_mv.col; + mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ? + mb_to_top_edge : mv->as_mv.row; + mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ? + mb_to_bottom_edge : mv->as_mv.row; +} +static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge, + int mb_to_right_edge, + int mb_to_top_edge, + int mb_to_bottom_edge) +{ + unsigned int need_to_clamp; + need_to_clamp = (mv->as_mv.col < mb_to_left_edge); + need_to_clamp |= (mv->as_mv.col > mb_to_right_edge); + need_to_clamp |= (mv->as_mv.row < mb_to_top_edge); + need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge); + return need_to_clamp; +} + +void vp8_find_near_mvs +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv *nearest, int_mv *nearby, int_mv *best, + int near_mv_ref_cts[4], + int refframe, + int *ref_frame_sign_bias +); + + +int vp8_find_near_mvs_bias +( + MACROBLOCKD *xd, + const MODE_INFO *here, + int_mv mode_mv_sb[2][MB_MODE_COUNT], + int_mv best_mv_sb[2], + int cnt[4], + int refframe, + int *ref_frame_sign_bias +); + + +vp8_prob *vp8_mv_ref_probs( + vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4] +); + +extern const unsigned char vp8_mbsplit_offset[4][16]; + + +static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b) +{ + if (!(b & 3)) + { + /* On L edge, get from MB to left of us */ + --cur_mb; + + if(cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.mv.as_int; + b += 4; + } + + return (cur_mb->bmi + b - 1)->mv.as_int; +} + +static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b, + int mi_stride) +{ + if (!(b >> 2)) + { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + if(cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.mv.as_int; + b += 16; + } + + return (cur_mb->bmi + (b - 4))->mv.as_int; +} +static INLINE B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) +{ + if (!(b & 3)) + { + /* On L edge, get from MB to left of us */ + --cur_mb; + switch (cur_mb->mbmi.mode) + { + case B_PRED: + return (cur_mb->bmi + b + 3)->as_mode; + case DC_PRED: + return B_DC_PRED; + case V_PRED: + return B_VE_PRED; + case 
H_PRED: + return B_HE_PRED; + case TM_PRED: + return B_TM_PRED; + default: + return B_DC_PRED; + } + } + + return (cur_mb->bmi + b - 1)->as_mode; +} + +static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, + int mi_stride) +{ + if (!(b >> 2)) + { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + switch (cur_mb->mbmi.mode) + { + case B_PRED: + return (cur_mb->bmi + b + 12)->as_mode; + case DC_PRED: + return B_DC_PRED; + case V_PRED: + return B_VE_PRED; + case H_PRED: + return B_HE_PRED; + case TM_PRED: + return B_TM_PRED; + default: + return B_DC_PRED; + } + } + + return (cur_mb->bmi + b - 4)->as_mode; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_FINDNEARMV_H_ diff --git a/thirdparty/libvpx/vp8/common/generic/systemdependent.c b/thirdparty/libvpx/vp8/common/generic/systemdependent.c new file mode 100644 index 000000000000..6d5f302d7a47 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/generic/systemdependent.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#if ARCH_ARM +#include "vpx_ports/arm.h" +#elif ARCH_X86 || ARCH_X86_64 +#include "vpx_ports/x86.h" +#endif +#include "vp8/common/onyxc_int.h" +#include "vp8/common/systemdependent.h" + +#if CONFIG_MULTITHREAD +#if HAVE_UNISTD_H && !defined(__OS2__) +#include <unistd.h> +#elif defined(_WIN32) +#include <windows.h> +typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO); +#elif defined(__OS2__) +#define INCL_DOS +#define INCL_DOSSPINLOCK +#include <os2.h> +#endif +#endif + +#if CONFIG_MULTITHREAD +static int get_cpu_count() +{ + int core_count = 16; + +#if HAVE_UNISTD_H && !defined(__OS2__) +#if defined(_SC_NPROCESSORS_ONLN) + core_count = sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(_SC_NPROC_ONLN) + core_count = sysconf(_SC_NPROC_ONLN); +#endif +#elif defined(_WIN32) + { +#if _WIN32_WINNT >= 0x0501 + SYSTEM_INFO sysinfo; + GetNativeSystemInfo(&sysinfo); +#else + PGNSI pGNSI; + SYSTEM_INFO sysinfo; + + /* Call GetNativeSystemInfo if supported or + * GetSystemInfo otherwise. */ + + pGNSI = (PGNSI) GetProcAddress( + GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo"); + if (pGNSI != NULL) + pGNSI(&sysinfo); + else + GetSystemInfo(&sysinfo); +#endif + + core_count = sysinfo.dwNumberOfProcessors; + } +#elif defined(__OS2__) + { + ULONG proc_id; + ULONG status; + + core_count = 0; + for (proc_id = 1; ; proc_id++) + { + if (DosGetProcessorStatus(proc_id, &status)) + break; + + if (status == PROC_ONLINE) + core_count++; + } + } +#else + /* other platforms */ +#endif + + return core_count > 0 ?
core_count : 1; +} +#endif + +void vp8_clear_system_state_c() {}; + +void vp8_machine_specific_config(VP8_COMMON *ctx) +{ +#if CONFIG_MULTITHREAD + ctx->processor_core_count = get_cpu_count(); +#else + (void)ctx; +#endif /* CONFIG_MULTITHREAD */ + +#if ARCH_ARM + ctx->cpu_caps = arm_cpu_caps(); +#elif ARCH_X86 || ARCH_X86_64 + ctx->cpu_caps = x86_simd_caps(); +#endif +} diff --git a/thirdparty/libvpx/vp8/common/header.h b/thirdparty/libvpx/vp8/common/header.h new file mode 100644 index 000000000000..e27bca16bd73 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/header.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_HEADER_H_ +#define VP8_COMMON_HEADER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* 24 bits total */ +typedef struct +{ + unsigned int type: 1; + unsigned int version: 3; + unsigned int show_frame: 1; + + /* Allow 2^20 bytes = 8 megabits for first partition */ + + unsigned int first_partition_length_in_bytes: 19; + +#ifdef PACKET_TESTING + unsigned int frame_number; + unsigned int update_gold: 1; + unsigned int uses_gold: 1; + unsigned int update_last: 1; + unsigned int uses_last: 1; +#endif + +} VP8_HEADER; + +#ifdef PACKET_TESTING +#define VP8_HEADER_SIZE 8 +#else +#define VP8_HEADER_SIZE 3 +#endif + + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_HEADER_H_ diff --git a/thirdparty/libvpx/vp8/common/idct_blk.c b/thirdparty/libvpx/vp8/common/idct_blk.c new file mode 100644 index 000000000000..8aa7d9bf0ff0 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/idct_blk.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_c(short *input, short *dq, + unsigned char *dest, int stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred, + int pred_stride, unsigned char *dst_ptr, + int dst_stride); + +void vp8_dequant_idct_add_y_block_c + (short *q, short *dq, + unsigned char *dst, int stride, char *eobs) +{ + int i, j; + + for (i = 0; i < 4; i++) + { + for (j = 0; j < 4; j++) + { + if (*eobs++ > 1) + vp8_dequant_idct_add_c (q, dq, dst, stride); + else + { + vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4*stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_c + (short *q, short *dq, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i, j; + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + if (*eobs++ > 1) + vp8_dequant_idct_add_c (q, dq, dstu, stride); + else + { + vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstu += 4; + } + + dstu += 4*stride - 8; + } + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + if (*eobs++ > 1) + vp8_dequant_idct_add_c (q, dq, dstv, stride); + else + { + vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstv += 4; + } + + dstv += 4*stride - 8; + } +} diff --git a/thirdparty/libvpx/vp8/common/idctllm.c b/thirdparty/libvpx/vp8/common/idctllm.c new file mode 100644 index 000000000000..f5403c5aaf78 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/idctllm.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" + +/**************************************************************************** + * Notes: + * + * This implementation makes use of 16 bit fixed point verio of two multiply + * constants: + * 1. sqrt(2) * cos (pi/8) + * 2. sqrt(2) * sin (pi/8) + * Becuase the first constant is bigger than 1, to maintain the same 16 bit + * fixed point precision as the second one, we use a trick of + * x * a = x + x*(a-1) + * so + * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). 
+ **************************************************************************/ +static const int cospi8sqrt2minus1 = 20091; +static const int sinpi8sqrt2 = 35468; + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) +{ + int i; + int r, c; + int a1, b1, c1, d1; + short output[16]; + short *ip = input; + short *op = output; + int temp1, temp2; + int shortpitch = 4; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[8]; + b1 = ip[0] - ip[8]; + + temp1 = (ip[4] * sinpi8sqrt2) >> 16; + temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[12] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + op[shortpitch*0] = a1 + d1; + op[shortpitch*3] = a1 - d1; + + op[shortpitch*1] = b1 + c1; + op[shortpitch*2] = b1 - c1; + + ip++; + op++; + } + + ip = output; + op = output; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[2]; + b1 = ip[0] - ip[2]; + + temp1 = (ip[1] * sinpi8sqrt2) >> 16; + temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[3] * sinpi8sqrt2) >> 16; + d1 = temp1 + temp2; + + + op[0] = (a1 + d1 + 4) >> 3; + op[3] = (a1 - d1 + 4) >> 3; + + op[1] = (b1 + c1 + 4) >> 3; + op[2] = (b1 - c1 + 4) >> 3; + + ip += shortpitch; + op += shortpitch; + } + + ip = output; + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + int a = ip[c] + pred_ptr[c] ; + + if (a < 0) + a = 0; + + if (a > 255) + a = 255; + + dst_ptr[c] = (unsigned char) a ; + } + ip += 4; + dst_ptr += dst_stride; + pred_ptr += pred_stride; + } +} + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) +{ + int a1 = ((input_dc + 4) >> 3); + int r, c; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + int a = a1 + pred_ptr[c] ; + + if (a < 0) + a = 0; + + if (a > 255) + a = 255; + + dst_ptr[c] = (unsigned char) a ; + } + + dst_ptr += dst_stride; + pred_ptr += pred_stride; + } + +} + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff) +{ + short output[16]; + int i; + int a1, b1, c1, d1; + int a2, b2, c2, d2; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + + op[0] = a1 + b1; + op[4] = c1 + d1; + op[8] = a1 - b1; + op[12] = d1 - c1; + ip++; + op++; + } + + ip = output; + op = output; + + for (i = 0; i < 4; i++) + { + a1 = ip[0] + ip[3]; + b1 = ip[1] + ip[2]; + c1 = ip[1] - ip[2]; + d1 = ip[0] - ip[3]; + + a2 = a1 + b1; + b2 = c1 + d1; + c2 = a1 - b1; + d2 = d1 - c1; + + op[0] = (a2 + 3) >> 3; + op[1] = (b2 + 3) >> 3; + op[2] = (c2 + 3) >> 3; + op[3] = (d2 + 3) >> 3; + + ip += 4; + op += 4; + } + + for(i = 0; i < 16; i++) + { + mb_dqcoeff[i * 16] = output[i]; + } +} + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff) +{ + int i; + int a1; + + a1 = ((input[0] + 3) >> 3); + for(i = 0; i < 16; i++) + { + mb_dqcoeff[i * 16] = a1; + } +} diff --git a/thirdparty/libvpx/vp8/common/invtrans.h b/thirdparty/libvpx/vp8/common/invtrans.h new file mode 100644 index 000000000000..9cfea8d51316 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/invtrans.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_INVTRANS_H_ +#define VP8_COMMON_INVTRANS_H_ + +#include "./vpx_config.h" +#include "vp8_rtcd.h" +#include "blockd.h" +#include "onyxc_int.h" + +#if CONFIG_MULTITHREAD +#include "vpx_mem/vpx_mem.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +static void eob_adjust(char *eobs, short *diff) +{ + /* eob adjust.... the idct can only skip if both the dc and eob are zero */ + int js; + for(js = 0; js < 16; js++) + { + if((eobs[js] == 0) && (diff[0] != 0)) + eobs[js]++; + diff+=16; + } +} + +static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd) +{ + short *DQC = xd->dequant_y1; + + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + vp8_short_inv_walsh4x4 + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + else + { + vp8_short_inv_walsh4x4_1 + (&xd->block[24].dqcoeff[0], xd->qcoeff); + } + eob_adjust(xd->eobs, xd->qcoeff); + + DQC = xd->dequant_y1_dc; + } + vp8_dequant_idct_add_y_block + (xd->qcoeff, DQC, + xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_INVTRANS_H_ diff --git a/thirdparty/libvpx/vp8/common/loopfilter.h b/thirdparty/libvpx/vp8/common/loopfilter.h new file mode 100644 index 000000000000..20a6bd375b60 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/loopfilter.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_LOOPFILTER_H_ +#define VP8_COMMON_LOOPFILTER_H_ + +#include "vpx_ports/mem.h" +#include "vpx_config.h" +#include "vp8_rtcd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LOOP_FILTER 63 +/* fraction of total macroblock rows to be used in fast filter level picking */ +/* has to be > 2 */ +#define PARTIAL_FRAME_FRACTION 8 + +typedef enum +{ + NORMAL_LOOPFILTER = 0, + SIMPLE_LOOPFILTER = 1 +} LOOPFILTERTYPE; + +#if ARCH_ARM +#define SIMD_WIDTH 1 +#else +#define SIMD_WIDTH 16 +#endif + +/* Need to align this structure so when it is declared and + * passed it can be loaded into vector registers. 
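+ * With SIMD_WIDTH 16 each per-level threshold row fills exactly one 128-bit + * register; the ARM build (SIMD_WIDTH 1) keeps a single byte per level instead.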
+ */ +typedef struct +{ + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]); + unsigned char lvl[4][4][4]; + unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1]; + unsigned char mode_lf_lut[10]; +} loop_filter_info_n; + +typedef struct loop_filter_info +{ + const unsigned char * mblim; + const unsigned char * blim; + const unsigned char * lim; + const unsigned char * hev_thr; +} loop_filter_info; + + +typedef void loop_filter_uvfunction +( + unsigned char *u, /* source pointer */ + int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + unsigned char *v +); + +/* assorted loopfilter functions which get used elsewhere */ +struct VP8Common; +struct macroblockd; +struct modeinfo; + +void vp8_loop_filter_init(struct VP8Common *cm); + +void vp8_loop_filter_frame_init(struct VP8Common *cm, + struct macroblockd *mbd, + int default_filt_lvl); + +void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd, + int frame_type); + +void vp8_loop_filter_partial_frame(struct VP8Common *cm, + struct macroblockd *mbd, + int default_filt_lvl); + +void vp8_loop_filter_frame_yonly(struct VP8Common *cm, + struct macroblockd *mbd, + int default_filt_lvl); + +void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl); + +void vp8_loop_filter_row_normal(struct VP8Common *cm, + struct modeinfo *mode_info_context, + int mb_row, int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr); + +void vp8_loop_filter_row_simple(struct VP8Common *cm, + struct modeinfo *mode_info_context, + int mb_row, int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_LOOPFILTER_H_ diff --git a/thirdparty/libvpx/vp8/common/loopfilter_filters.c b/thirdparty/libvpx/vp8/common/loopfilter_filters.c new file mode 100644 index 000000000000..1d51696ff7e8 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/loopfilter_filters.c @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <stdlib.h> +#include "loopfilter.h" +#include "onyxc_int.h" + +typedef unsigned char uc; + +static signed char vp8_signed_char_clamp(int t) +{ + t = (t < -128 ? -128 : t); + t = (t > 127 ?
127 : t); + return (signed char) t; +} + + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +static signed char vp8_filter_mask(uc limit, uc blimit, + uc p3, uc p2, uc p1, uc p0, + uc q0, uc q1, uc q2, uc q3) +{ + signed char mask = 0; + mask |= (abs(p3 - p2) > limit); + mask |= (abs(p2 - p1) > limit); + mask |= (abs(p1 - p0) > limit); + mask |= (abs(q1 - q0) > limit); + mask |= (abs(q2 - q1) > limit); + mask |= (abs(q3 - q2) > limit); + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit); + return mask - 1; +} + +/* is there high variance internal edge ( 11111111 yes, 00000000 no) */ +static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) +{ + signed char hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static void vp8_filter(signed char mask, uc hev, uc *op1, + uc *op0, uc *oq0, uc *oq1) + +{ + signed char ps0, qs0; + signed char ps1, qs1; + signed char filter_value, Filter1, Filter2; + signed char u; + + ps1 = (signed char) * op1 ^ 0x80; + ps0 = (signed char) * op0 ^ 0x80; + qs0 = (signed char) * oq0 ^ 0x80; + qs1 = (signed char) * oq1 ^ 0x80; + + /* add outer taps if we have high edge variance */ + filter_value = vp8_signed_char_clamp(ps1 - qs1); + filter_value &= hev; + + /* inner taps */ + filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0)); + filter_value &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 + * if it equals 4 we'll set to adjust by -1 to account for the fact + * we'd round 3 the other way + */ + Filter1 = vp8_signed_char_clamp(filter_value + 4); + Filter2 = vp8_signed_char_clamp(filter_value + 3); + Filter1 >>= 3; + Filter2 >>= 3; + u = vp8_signed_char_clamp(qs0 - Filter1); + *oq0 = u ^ 0x80; + u = vp8_signed_char_clamp(ps0 + Filter2); + *op0 = u ^ 0x80; + filter_value = Filter1; + + /* outer tap adjustments */ + filter_value += 1; + filter_value >>= 1; + filter_value &= ~hev; + + u = vp8_signed_char_clamp(qs1 - filter_value); + *oq1 = u ^ 0x80; + u = vp8_signed_char_clamp(ps1 + filter_value); + *op1 = u ^ 0x80; + +} +void vp8_loop_filter_horizontal_edge_c +( + unsigned char *s, + int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do + { + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4*p], s[-3*p], s[-2*p], s[-1*p], + s[0*p], s[1*p], s[2*p], s[3*p]); + + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); + + vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + + ++s; + } + while (++i < count * 8); +} + +void vp8_loop_filter_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. 
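+ * count is given in 8-pixel units: the callers further down pass 2 for the + * 16-pixel luma edge and 1 for an 8-pixel chroma edge, hence the count * 8 + * iterations, one filtered position per pass.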
+ */ + do + { + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + + vp8_filter(mask, hev, s - 2, s - 1, s, s + 1); + + s += p; + } + while (++i < count * 8); +} + +static void vp8_mbfilter(signed char mask, uc hev, + uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2) +{ + signed char s, u; + signed char filter_value, Filter1, Filter2; + signed char ps2 = (signed char) * op2 ^ 0x80; + signed char ps1 = (signed char) * op1 ^ 0x80; + signed char ps0 = (signed char) * op0 ^ 0x80; + signed char qs0 = (signed char) * oq0 ^ 0x80; + signed char qs1 = (signed char) * oq1 ^ 0x80; + signed char qs2 = (signed char) * oq2 ^ 0x80; + + /* add outer taps if we have high edge variance */ + filter_value = vp8_signed_char_clamp(ps1 - qs1); + filter_value = vp8_signed_char_clamp(filter_value + 3 * (qs0 - ps0)); + filter_value &= mask; + + Filter2 = filter_value; + Filter2 &= hev; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(Filter2 + 4); + Filter2 = vp8_signed_char_clamp(Filter2 + 3); + Filter1 >>= 3; + Filter2 >>= 3; + qs0 = vp8_signed_char_clamp(qs0 - Filter1); + ps0 = vp8_signed_char_clamp(ps0 + Filter2); + + + /* only apply wider filter if not high edge variance */ + filter_value &= ~hev; + Filter2 = filter_value; + + /* roughly 3/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7); + s = vp8_signed_char_clamp(qs0 - u); + *oq0 = s ^ 0x80; + s = vp8_signed_char_clamp(ps0 + u); + *op0 = s ^ 0x80; + + /* roughly 2/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7); + s = vp8_signed_char_clamp(qs1 - u); + *oq1 = s ^ 0x80; + s = vp8_signed_char_clamp(ps1 + u); + *op1 = s ^ 0x80; + + /* roughly 1/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); + s = vp8_signed_char_clamp(qs2 - u); + *oq2 = s ^ 0x80; + s = vp8_signed_char_clamp(ps2 + u); + *op2 = s ^ 0x80; +} + +void vp8_mbloop_filter_horizontal_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do + { + + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4*p], s[-3*p], s[-2*p], s[-1*p], + s[0*p], s[1*p], s[2*p], s[3*p]); + + hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); + + vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); + + ++s; + } + while (++i < count * 8); + +} + + +void vp8_mbloop_filter_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) +{ + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + do + { + + mask = vp8_filter_mask(limit[0], blimit[0], + s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + + vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2); + + s += p; + } + while (++i < count * 8); + +} + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1) +{ +/* Why does this cause problems for win32? 
+ * error C2143: syntax error : missing ';' before 'type' + * (void) limit; + */ + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; + return mask; +} + +static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1) +{ + signed char filter_value, Filter1, Filter2; + signed char p1 = (signed char) * op1 ^ 0x80; + signed char p0 = (signed char) * op0 ^ 0x80; + signed char q0 = (signed char) * oq0 ^ 0x80; + signed char q1 = (signed char) * oq1 ^ 0x80; + signed char u; + + filter_value = vp8_signed_char_clamp(p1 - q1); + filter_value = vp8_signed_char_clamp(filter_value + 3 * (q0 - p0)); + filter_value &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(filter_value + 4); + Filter1 >>= 3; + u = vp8_signed_char_clamp(q0 - Filter1); + *oq0 = u ^ 0x80; + + Filter2 = vp8_signed_char_clamp(filter_value + 3); + Filter2 >>= 3; + u = vp8_signed_char_clamp(p0 + Filter2); + *op0 = u ^ 0x80; +} + +void vp8_loop_filter_simple_horizontal_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit +) +{ + signed char mask = 0; + int i = 0; + + do + { + mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]); + vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } + while (++i < 16); +} + +void vp8_loop_filter_simple_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit +) +{ + signed char mask = 0; + int i = 0; + + do + { + mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); + vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); + s += p; + } + while (++i < 16); + +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + 
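/* Simple-filter variant: the simple loop filter only runs on the luma plane, hence no u/v pointers in this signature. */ +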
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) +{ + vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); +} diff --git a/thirdparty/libvpx/vp8/common/mbpitch.c b/thirdparty/libvpx/vp8/common/mbpitch.c new file mode 100644 index 000000000000..32e1b66409f6 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/mbpitch.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "blockd.h" + +void vp8_setup_block_dptrs(MACROBLOCKD *x) +{ + int r, c; + + for (r = 0; r < 4; r++) + { + for (c = 0; c < 4; c++) + { + x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4; + } + } + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4; + + } + } + + for (r = 0; r < 2; r++) + { + for (c = 0; c < 2; c++) + { + x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4; + + } + } + + for (r = 0; r < 25; r++) + { + x->block[r].qcoeff = x->qcoeff + r * 16; + x->block[r].dqcoeff = x->dqcoeff + r * 16; + x->block[r].eob = x->eobs + r; + } +} + +void vp8_build_block_doffsets(MACROBLOCKD *x) +{ + int block; + + for (block = 0; block < 16; block++) /* y blocks */ + { + x->block[block].offset = + (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4; + } + + for (block = 16; block < 20; block++) /* U and V blocks */ + { + x->block[block+4].offset = + x->block[block].offset = + ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4; + } +} diff --git a/thirdparty/libvpx/vp8/common/modecont.c b/thirdparty/libvpx/vp8/common/modecont.c new file mode 100644 index 000000000000..86a74bc0ff55 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/modecont.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "entropy.h" + +const int vp8_mode_contexts[6][4] = +{ + { + /* 0 */ + 7, 1, 1, 143, + }, + { + /* 1 */ + 14, 18, 14, 107, + }, + { + /* 2 */ + 135, 64, 57, 68, + }, + { + /* 3 */ + 60, 56, 128, 65, + }, + { + /* 4 */ + 159, 134, 128, 34, + }, + { + /* 5 */ + 234, 188, 128, 28, + }, +}; diff --git a/thirdparty/libvpx/vp8/common/modecont.h b/thirdparty/libvpx/vp8/common/modecont.h new file mode 100644 index 000000000000..ff34c33c557c --- /dev/null +++ b/thirdparty/libvpx/vp8/common/modecont.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_MODECONT_H_ +#define VP8_COMMON_MODECONT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern const int vp8_mode_contexts[6][4]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_MODECONT_H_ diff --git a/thirdparty/libvpx/vp8/common/mv.h b/thirdparty/libvpx/vp8/common/mv.h new file mode 100644 index 000000000000..111ccd63c722 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/mv.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_MV_H_ +#define VP8_COMMON_MV_H_ +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct +{ + short row; + short col; +} MV; + +typedef union int_mv +{ + uint32_t as_int; + MV as_mv; +} int_mv; /* facilitates faster equality tests and copies */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_MV_H_ diff --git a/thirdparty/libvpx/vp8/common/onyxc_int.h b/thirdparty/libvpx/vp8/common/onyxc_int.h new file mode 100644 index 000000000000..6d89865c600a --- /dev/null +++ b/thirdparty/libvpx/vp8/common/onyxc_int.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP8_COMMON_ONYXC_INT_H_ +#define VP8_COMMON_ONYXC_INT_H_ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "loopfilter.h" +#include "entropymv.h" +#include "entropy.h" +#if CONFIG_POSTPROC +#include "postproc.h" +#endif + +/*#ifdef PACKET_TESTING*/ +#include "header.h" +/*#endif*/ + +#ifdef __cplusplus +extern "C" { +#endif + +#define MINQ 0 +#define MAXQ 127 +#define QINDEX_RANGE (MAXQ + 1) + +#define NUM_YV12_BUFFERS 4 + +#define MAX_PARTITIONS 9 + +typedef struct frame_contexts +{ + vp8_prob bmode_prob [VP8_BINTRAMODES-1]; + vp8_prob ymode_prob [VP8_YMODES-1]; /* interframe intra mode probs */ + vp8_prob uv_mode_prob [VP8_UV_MODES-1]; + vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1]; + vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + MV_CONTEXT mvc[2]; +} FRAME_CONTEXT; + +typedef enum +{ + ONE_PARTITION = 0, + TWO_PARTITION = 1, + FOUR_PARTITION = 2, + EIGHT_PARTITION = 3 +} TOKEN_PARTITION; + +typedef enum +{ + RECON_CLAMP_REQUIRED = 0, + RECON_CLAMP_NOTREQUIRED = 1 +} CLAMP_TYPE; + +typedef struct VP8Common + +{ + struct vpx_internal_error_info error; + + DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]); + + int Width; + int Height; + int horiz_scale; + int vert_scale; + + CLAMP_TYPE clamp_type; + + YV12_BUFFER_CONFIG *frame_to_show; + + YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; + int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; + int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx; + + YV12_BUFFER_CONFIG temp_scale_frame; + +#if CONFIG_POSTPROC + YV12_BUFFER_CONFIG post_proc_buffer; + YV12_BUFFER_CONFIG post_proc_buffer_int; + int post_proc_buffer_int_used; + unsigned char *pp_limits_buffer; /* post-processing filter coefficients */ +#endif + + FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ + FRAME_TYPE frame_type; + + int show_frame; + + int frame_flags; + int MBs; + int mb_rows; + int mb_cols; + int mode_info_stride; + + /* profile settings */ + int mb_no_coeff_skip; + int no_lpf; + int use_bilinear_mc_filter; + int full_pixel; + + int base_qindex; + + int y1dc_delta_q; + int y2dc_delta_q; + int y2ac_delta_q; + int uvdc_delta_q; + int uvac_delta_q; + + /* We allocate a MODE_INFO struct for each macroblock, together with + an extra row on top and column on the left to simplify prediction. 
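That comes to (1 + mb_rows) rows of (1 + mb_cols) entries, with mode_info_stride spanning the mb_cols + 1 entries of one such row.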
*/ + + MODE_INFO *mip; /* Base of allocated array */ + MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ +#if CONFIG_ERROR_CONCEALMENT + MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ + MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ +#endif + MODE_INFO *show_frame_mi; /* MODE_INFO for the last decoded frame + to show */ + LOOPFILTERTYPE filter_type; + + loop_filter_info_n lf_info; + + int filter_level; + int last_sharpness_level; + int sharpness_level; + + int refresh_last_frame; /* Two state 0 = NO, 1 = YES */ + int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */ + int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */ + + int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */ + int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */ + + int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */ + + int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ + + /* Y,U,V,Y2 */ + ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */ + ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */ + + FRAME_CONTEXT lfc; /* last frame entropy */ + FRAME_CONTEXT fc; /* this frame entropy */ + + unsigned int current_video_frame; + + int version; + + TOKEN_PARTITION multi_token_partition; + +#ifdef PACKET_TESTING + VP8_HEADER oh; +#endif +#if CONFIG_POSTPROC_VISUALIZER + double bitrate; + double framerate; +#endif + +#if CONFIG_MULTITHREAD + int processor_core_count; +#endif +#if CONFIG_POSTPROC + struct postproc_state postproc_state; +#endif + int cpu_caps; +} VP8_COMMON; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ONYXC_INT_H_ diff --git a/thirdparty/libvpx/vp8/common/onyxd.h b/thirdparty/libvpx/vp8/common/onyxd.h new file mode 100644 index 000000000000..e37b29f32cff --- /dev/null +++ b/thirdparty/libvpx/vp8/common/onyxd.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_ONYXD_H_ +#define VP8_COMMON_ONYXD_H_ + + +/* Create/destroy static data structures. 
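In outline, vp8dx_receive_compressed_data() feeds one compressed frame to the decoder and vp8dx_get_raw_frame() then retrieves the decoded YV12 buffer together with its timestamps.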
*/ +#ifdef __cplusplus +extern "C" +{ +#endif +#include "vpx_scale/yv12config.h" +#include "ppflags.h" +#include "vpx_ports/mem.h" +#include "vpx/vpx_codec.h" +#include "vpx/vp8.h" + + struct VP8D_COMP; + + typedef struct + { + int Width; + int Height; + int Version; + int postprocess; + int max_threads; + int error_concealment; + } VP8D_CONFIG; + + typedef enum + { + VP8D_OK = 0 + } VP8D_SETTING; + + void vp8dx_initialize(void); + + void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x); + + int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst); + + int vp8dx_receive_compressed_data(struct VP8D_COMP* comp, + size_t size, const uint8_t *dest, + int64_t time_stamp); + int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); + + vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); + vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); + +#ifdef __cplusplus +} +#endif + + +#endif // VP8_COMMON_ONYXD_H_ diff --git a/thirdparty/libvpx/vp8/common/ppflags.h b/thirdparty/libvpx/vp8/common/ppflags.h new file mode 100644 index 000000000000..768224aad5e6 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/ppflags.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_PPFLAGS_H_ +#define VP8_COMMON_PPFLAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif +enum +{ + VP8D_NOFILTERING = 0, + VP8D_DEBLOCK = 1<<0, + VP8D_DEMACROBLOCK = 1<<1, + VP8D_ADDNOISE = 1<<2, + VP8D_DEBUG_TXT_FRAME_INFO = 1<<3, + VP8D_DEBUG_TXT_MBLK_MODES = 1<<4, + VP8D_DEBUG_TXT_DC_DIFF = 1<<5, + VP8D_DEBUG_TXT_RATE_INFO = 1<<6, + VP8D_DEBUG_DRAW_MV = 1<<7, + VP8D_DEBUG_CLR_BLK_MODES = 1<<8, + VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9, + VP8D_MFQE = 1<<10 +}; + +typedef struct +{ + int post_proc_flag; + int deblocking_level; + int noise_level; + int display_ref_frame_flag; + int display_mb_modes_flag; + int display_b_modes_flag; + int display_mv_flag; +} vp8_ppflags_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_PPFLAGS_H_ diff --git a/thirdparty/libvpx/vp8/common/quant_common.c b/thirdparty/libvpx/vp8/common/quant_common.c new file mode 100644 index 000000000000..05f921070288 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/quant_common.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "quant_common.h" + +static const int dc_qlookup[QINDEX_RANGE] = +{ + 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17, + 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157, +}; + +static const int ac_qlookup[QINDEX_RANGE] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, + 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, + 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152, + 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209, + 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284, +}; + + +int vp8_dc_quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = dc_qlookup[ QIndex ]; + return retval; +} + +int vp8_dc2quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = dc_qlookup[ QIndex ] * 2; + return retval; + +} +int vp8_dc_uv_quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = dc_qlookup[ QIndex ]; + + if (retval > 132) + retval = 132; + + return retval; +} + +int vp8_ac_yquant(int QIndex) +{ + int retval; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = ac_qlookup[ QIndex ]; + return retval; +} + +int vp8_ac2quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + /* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16. + * The smallest precision for that is '(x*6349) >> 12' but 16 is a good + * word size. */ + retval = (ac_qlookup[ QIndex ] * 101581) >> 16; + + if (retval < 8) + retval = 8; + + return retval; +} +int vp8_ac_uv_quant(int QIndex, int Delta) +{ + int retval; + + QIndex = QIndex + Delta; + + if (QIndex > 127) + QIndex = 127; + else if (QIndex < 0) + QIndex = 0; + + retval = ac_qlookup[ QIndex ]; + return retval; +} diff --git a/thirdparty/libvpx/vp8/common/quant_common.h b/thirdparty/libvpx/vp8/common/quant_common.h new file mode 100644 index 000000000000..700b5e6d726c --- /dev/null +++ b/thirdparty/libvpx/vp8/common/quant_common.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP8_COMMON_QUANT_COMMON_H_ +#define VP8_COMMON_QUANT_COMMON_H_ + + +#include "string.h" +#include "blockd.h" +#include "onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern int vp8_ac_yquant(int QIndex); +extern int vp8_dc_quant(int QIndex, int Delta); +extern int vp8_dc2quant(int QIndex, int Delta); +extern int vp8_ac2quant(int QIndex, int Delta); +extern int vp8_dc_uv_quant(int QIndex, int Delta); +extern int vp8_ac_uv_quant(int QIndex, int Delta); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_QUANT_COMMON_H_ diff --git a/thirdparty/libvpx/vp8/common/reconinter.c b/thirdparty/libvpx/vp8/common/reconinter.c new file mode 100644 index 000000000000..e30259558713 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/reconinter.c @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <limits.h> +#include <string.h> + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx/vpx_integer.h" +#include "blockd.h" +#include "reconinter.h" +#if CONFIG_RUNTIME_CPU_DETECT +#include "onyxc_int.h" +#endif + +void vp8_copy_mem16x16_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + + int r; + + for (r = 0; r < 16; r++) + { + memcpy(dst, src, 16); + + src += src_stride; + dst += dst_stride; + + } + +} + +void vp8_copy_mem8x8_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + int r; + + for (r = 0; r < 8; r++) + { + memcpy(dst, src, 8); + + src += src_stride; + dst += dst_stride; + + } + +} + +void vp8_copy_mem8x4_c( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) +{ + int r; + + for (r = 0; r < 4; r++) + { + memcpy(dst, src, 8); + + src += src_stride; + dst += dst_stride; + + } + +} + + +void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf) +{ + int r; + unsigned char *pred_ptr = d->predictor; + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch); + } + else + { + for (r = 0; r < 4; r++) + { + pred_ptr[0] = ptr[0]; + pred_ptr[1] = ptr[1]; + pred_ptr[2] = ptr[2]; + pred_ptr[3] = ptr[3]; + pred_ptr += pitch; + ptr += pre_stride; + } + } +} + +static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride) +{ + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); + } + else + { + vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride); + } +} + +static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride) +{ + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride +
(d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); + } + else + { + vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride); + } +} + +static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf) +{ + int r; + unsigned char *ptr; + ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); + } + else + { + for (r = 0; r < 4; r++) + { + dst[0] = ptr[0]; + dst[1] = ptr[1]; + dst[2] = ptr[2]; + dst[3] = ptr[3]; + dst += dst_stride; + ptr += pre_stride; + } + } +} + + +/*encoder only*/ +void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x) +{ + unsigned char *uptr, *vptr; + unsigned char *upred_ptr = &x->predictor[256]; + unsigned char *vpred_ptr = &x->predictor[320]; + + int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; + int offset; + int pre_stride = x->pre.uv_stride; + + /* calc uv motion vectors */ + mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1)); + mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1)); + mv_row /= 2; + mv_col /= 2; + mv_row &= x->fullpixel_mask; + mv_col &= x->fullpixel_mask; + + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8); + } + else + { + vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8); + vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8); + } +} + +/*encoder only*/ +void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x) +{ + int i, j; + int pre_stride = x->pre.uv_stride; + unsigned char *base_pre; + + /* build uv mvs */ + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + int yoffset = i * 8 + j * 2; + int uoffset = 16 + i * 2 + j; + int voffset = 20 + i * 2 + j; + + int temp; + + temp = x->block[yoffset ].bmi.mv.as_mv.row + + x->block[yoffset+1].bmi.mv.as_mv.row + + x->block[yoffset+4].bmi.mv.as_mv.row + + x->block[yoffset+5].bmi.mv.as_mv.row; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask; + + temp = x->block[yoffset ].bmi.mv.as_mv.col + + x->block[yoffset+1].bmi.mv.as_mv.col + + x->block[yoffset+4].bmi.mv.as_mv.col + + x->block[yoffset+5].bmi.mv.as_mv.col; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask; + + x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int; + } + } + + base_pre = x->pre.u_buffer; + for (i = 16; i < 20; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride); + else + { + vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict); + } + } + + base_pre = x->pre.v_buffer; + for (i = 20; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = 
&x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride); + else + { + vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict); + } + } +} + + +/*encoder only*/ +void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, + unsigned char *dst_y, + int dst_ystride) +{ + unsigned char *ptr_base; + unsigned char *ptr; + int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; + int pre_stride = x->pre.y_stride; + + ptr_base = x->pre.y_buffer; + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, + dst_y, dst_ystride); + } + else + { + vp8_copy_mem16x16(ptr, pre_stride, dst_y, + dst_ystride); + } +} + +static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) +{ + /* If the MV points so far into the UMV border that no visible pixels + * are used for reconstruction, the subpel part of the MV can be + * discarded and the MV limited to 16 pixels with equivalent results. + * + * This limit kicks in at 19 pixels for the top and left edges, for + * the 16 pixels plus 3 taps right of the central pixel when subpel + * filtering. The bottom and right edges use 16 pixels plus 2 pixels + * left of the central pixel when filtering. + */ + if (mv->col < (xd->mb_to_left_edge - (19 << 3))) + mv->col = xd->mb_to_left_edge - (16 << 3); + else if (mv->col > xd->mb_to_right_edge + (18 << 3)) + mv->col = xd->mb_to_right_edge + (16 << 3); + + if (mv->row < (xd->mb_to_top_edge - (19 << 3))) + mv->row = xd->mb_to_top_edge - (16 << 3); + else if (mv->row > xd->mb_to_bottom_edge + (18 << 3)) + mv->row = xd->mb_to_bottom_edge + (16 << 3); +} + +/* A version of the above function for chroma block MVs.*/ +static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) +{ + mv->col = (2*mv->col < (xd->mb_to_left_edge - (19 << 3))) ? + (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col; + mv->col = (2*mv->col > xd->mb_to_right_edge + (18 << 3)) ? + (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col; + + mv->row = (2*mv->row < (xd->mb_to_top_edge - (19 << 3))) ? + (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row; + mv->row = (2*mv->row > xd->mb_to_bottom_edge + (18 << 3)) ? 
+ (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row; +} + +void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride) +{ + int offset; + unsigned char *ptr; + unsigned char *uptr, *vptr; + + int_mv _16x16mv; + + unsigned char *ptr_base = x->pre.y_buffer; + int pre_stride = x->pre.y_stride; + + _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int; + + if (x->mode_info_context->mbmi.need_to_clamp_mvs) + { + clamp_mv_to_umv_border(&_16x16mv.as_mv, x); + } + + ptr = ptr_base + ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); + + if ( _16x16mv.as_int & 0x00070007) + { + x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_y, dst_ystride); + } + else + { + vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride); + } + + /* calc uv motion vectors */ + _16x16mv.as_mv.row += 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1)); + _16x16mv.as_mv.col += 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1)); + _16x16mv.as_mv.row /= 2; + _16x16mv.as_mv.col /= 2; + _16x16mv.as_mv.row &= x->fullpixel_mask; + _16x16mv.as_mv.col &= x->fullpixel_mask; + + pre_stride >>= 1; + offset = ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; + + if ( _16x16mv.as_int & 0x00070007) + { + x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_u, dst_uvstride); + x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_v, dst_uvstride); + } + else + { + vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); + vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); + } +} + +static void build_inter4x4_predictors_mb(MACROBLOCKD *x) +{ + int i; + unsigned char *base_dst = x->dst.y_buffer; + unsigned char *base_pre = x->pre.y_buffer; + + if (x->mode_info_context->mbmi.partitioning < 3) + { + BLOCKD *b; + int dst_stride = x->dst.y_stride; + + x->block[ 0].bmi = x->mode_info_context->bmi[ 0]; + x->block[ 2].bmi = x->mode_info_context->bmi[ 2]; + x->block[ 8].bmi = x->mode_info_context->bmi[ 8]; + x->block[10].bmi = x->mode_info_context->bmi[10]; + if (x->mode_info_context->mbmi.need_to_clamp_mvs) + { + clamp_mv_to_umv_border(&x->block[ 0].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[ 2].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[ 8].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x); + } + + b = &x->block[ 0]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride); + b = &x->block[ 2]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride); + b = &x->block[ 8]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride); + b = &x->block[10]; + build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride); + } + else + { + for (i = 0; i < 16; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + int dst_stride = x->dst.y_stride; + + x->block[i+0].bmi = x->mode_info_context->bmi[i+0]; + x->block[i+1].bmi = x->mode_info_context->bmi[i+1]; + if (x->mode_info_context->mbmi.need_to_clamp_mvs) + { + clamp_mv_to_umv_border(&x->block[i+0].bmi.mv.as_mv, x); + clamp_mv_to_umv_border(&x->block[i+1].bmi.mv.as_mv, x); + } + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, base_dst + d0->offset, 
dst_stride, base_pre, dst_stride); + else + { + build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + } + + } + + } + base_dst = x->dst.u_buffer; + base_pre = x->pre.u_buffer; + for (i = 16; i < 20; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + int dst_stride = x->dst.uv_stride; + + /* Note: uv mvs already clamped in build_4x4uvmvs() */ + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride); + else + { + build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + } + } + + base_dst = x->dst.v_buffer; + base_pre = x->pre.v_buffer; + for (i = 20; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + int dst_stride = x->dst.uv_stride; + + /* Note: uv mvs already clamped in build_4x4uvmvs() */ + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride); + else + { + build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict); + } + } +} + +static +void build_4x4uvmvs(MACROBLOCKD *x) +{ + int i, j; + + for (i = 0; i < 2; i++) + { + for (j = 0; j < 2; j++) + { + int yoffset = i * 8 + j * 2; + int uoffset = 16 + i * 2 + j; + int voffset = 20 + i * 2 + j; + + int temp; + + temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row + + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row + + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row + + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask; + + temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col + + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col + + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col + + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col; + + temp += 4 + ((temp >> (sizeof(temp) * CHAR_BIT - 1)) * 8); + + x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask; + + if (x->mode_info_context->mbmi.need_to_clamp_mvs) + clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x); + + x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int; + } + } +} + +void vp8_build_inter_predictors_mb(MACROBLOCKD *xd) +{ + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); + } + else + { + build_4x4uvmvs(xd); + build_inter4x4_predictors_mb(xd); + } +} diff --git a/thirdparty/libvpx/vp8/common/reconinter.h b/thirdparty/libvpx/vp8/common/reconinter.h new file mode 100644 index 000000000000..ba979b9664af --- /dev/null +++ b/thirdparty/libvpx/vp8/common/reconinter.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_RECONINTER_H_ +#define VP8_COMMON_RECONINTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x); +extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); + + +extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, + unsigned char *dst_y, + int dst_ystride); +extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, + unsigned char *base_pre, + int pre_stride, + vp8_subpix_fn_t sppf); + +extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); +extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_RECONINTER_H_ diff --git a/thirdparty/libvpx/vp8/common/reconintra.c b/thirdparty/libvpx/vp8/common/reconintra.c new file mode 100644 index 000000000000..356655dac7bc --- /dev/null +++ b/thirdparty/libvpx/vp8/common/reconintra.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "./vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_once.h" +#include "blockd.h" +#include "vp8/common/reconintra.h" +#include "vp8/common/reconintra4x4.h" + +enum { + SIZE_16, + SIZE_8, + NUM_SIZES, +}; + +typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); + +static intra_pred_fn pred[4][NUM_SIZES]; +static intra_pred_fn dc_pred[2][2][NUM_SIZES]; + +static void vp8_init_intra_predictors_internal(void) +{ +#define INIT_SIZE(sz) \ + pred[V_PRED][SIZE_##sz] = vpx_v_predictor_##sz##x##sz; \ + pred[H_PRED][SIZE_##sz] = vpx_h_predictor_##sz##x##sz; \ + pred[TM_PRED][SIZE_##sz] = vpx_tm_predictor_##sz##x##sz; \ + \ + dc_pred[0][0][SIZE_##sz] = vpx_dc_128_predictor_##sz##x##sz; \ + dc_pred[0][1][SIZE_##sz] = vpx_dc_top_predictor_##sz##x##sz; \ + dc_pred[1][0][SIZE_##sz] = vpx_dc_left_predictor_##sz##x##sz; \ + dc_pred[1][1][SIZE_##sz] = vpx_dc_predictor_##sz##x##sz + + INIT_SIZE(16); + INIT_SIZE(8); + vp8_init_intra4x4_predictors_internal(); +} + +void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) +{ + MB_PREDICTION_MODE mode = x->mode_info_context->mbmi.mode; + DECLARE_ALIGNED(16, uint8_t, yleft_col[16]); + int i; + intra_pred_fn fn; + + for (i = 0; i < 16; i++) + { + yleft_col[i] = yleft[i* left_stride]; + } + + if (mode == DC_PRED) + { + fn = dc_pred[x->left_available][x->up_available][SIZE_16]; + } + else + { + fn = pred[mode][SIZE_16]; + } + + fn(ypred_ptr, y_stride, yabove_row, yleft_col); +} + +void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * 
upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) +{ + MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode; + unsigned char uleft_col[8]; + unsigned char vleft_col[8]; + int i; + intra_pred_fn fn; + + for (i = 0; i < 8; i++) + { + uleft_col[i] = uleft[i * left_stride]; + vleft_col[i] = vleft[i * left_stride]; + } + + if (uvmode == DC_PRED) + { + fn = dc_pred[x->left_available][x->up_available][SIZE_8]; + } + else + { + fn = pred[uvmode][SIZE_8]; + } + + fn(upred_ptr, pred_stride, uabove_row, uleft_col); + fn(vpred_ptr, pred_stride, vabove_row, vleft_col); +} + +void vp8_init_intra_predictors(void) +{ + once(vp8_init_intra_predictors_internal); +} diff --git a/thirdparty/libvpx/vp8/common/reconintra.h b/thirdparty/libvpx/vp8/common/reconintra.h new file mode 100644 index 000000000000..b6225a663798 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/reconintra.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_RECONINTRA_H_ +#define VP8_COMMON_RECONINTRA_H_ + +#include "vp8/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x, + unsigned char *yabove_row, + unsigned char *yleft, + int left_stride, + unsigned char *ypred_ptr, + int y_stride); + +void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride); + +void vp8_init_intra_predictors(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_RECONINTRA_H_ diff --git a/thirdparty/libvpx/vp8/common/reconintra4x4.c b/thirdparty/libvpx/vp8/common/reconintra4x4.c new file mode 100644 index 000000000000..35ad891eff49 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/reconintra4x4.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <string.h>
+
+#include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8_rtcd.h"
+#include "blockd.h"
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[10];
+
+void vp8_init_intra4x4_predictors_internal(void)
+{
+    pred[B_DC_PRED] = vpx_dc_predictor_4x4;
+    pred[B_TM_PRED] = vpx_tm_predictor_4x4;
+    pred[B_VE_PRED] = vpx_ve_predictor_4x4;
+    pred[B_HE_PRED] = vpx_he_predictor_4x4;
+    pred[B_LD_PRED] = vpx_d45e_predictor_4x4;
+    pred[B_RD_PRED] = vpx_d135_predictor_4x4;
+    pred[B_VR_PRED] = vpx_d117_predictor_4x4;
+    pred[B_VL_PRED] = vpx_d63f_predictor_4x4;
+    pred[B_HD_PRED] = vpx_d153_predictor_4x4;
+    pred[B_HU_PRED] = vpx_d207_predictor_4x4;
+}
+
+void vp8_intra4x4_predict(unsigned char *above,
+                          unsigned char *yleft, int left_stride,
+                          B_PREDICTION_MODE b_mode,
+                          unsigned char *dst, int dst_stride,
+                          unsigned char top_left)
+{
+    unsigned char Left[4];
+    unsigned char Aboveb[12], *Above = Aboveb + 4;
+
+    Left[0] = yleft[0];
+    Left[1] = yleft[left_stride];
+    Left[2] = yleft[2 * left_stride];
+    Left[3] = yleft[3 * left_stride];
+    memcpy(Above, above, 8);
+    Above[-1] = top_left;
+
+    pred[b_mode](dst, dst_stride, Above, Left);
+}
diff --git a/thirdparty/libvpx/vp8/common/reconintra4x4.h b/thirdparty/libvpx/vp8/common/reconintra4x4.h
new file mode 100644
index 000000000000..5dc5d13a5c11
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/reconintra4x4.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_RECONINTRA4X4_H_
+#define VP8_COMMON_RECONINTRA4X4_H_
+#include "vp8/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd,
+                                              unsigned char *above_right_src)
+{
+    int dst_stride = xd->dst.y_stride;
+    unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
+
+    unsigned int *src_ptr = (unsigned int *)above_right_src;
+    unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride);
+    unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride);
+    unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride);
+
+    *dst_ptr0 = *src_ptr;
+    *dst_ptr1 = *src_ptr;
+    *dst_ptr2 = *src_ptr;
+}
+
+void vp8_intra4x4_predict(unsigned char *Above,
+                          unsigned char *yleft, int left_stride,
+                          B_PREDICTION_MODE b_mode,
+                          unsigned char *dst, int dst_stride,
+                          unsigned char top_left);
+
+void vp8_init_intra4x4_predictors_internal(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_RECONINTRA4X4_H_
diff --git a/thirdparty/libvpx/vp8/common/rtcd.c b/thirdparty/libvpx/vp8/common/rtcd.c
new file mode 100644
index 000000000000..ab0e9b47fe8d
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/rtcd.c
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vp8_rtcd.h" +#include "vpx_ports/vpx_once.h" + + +void vp8_rtcd() +{ + once(setup_rtcd_internal); +} diff --git a/thirdparty/libvpx/vp8/common/setupintrarecon.c b/thirdparty/libvpx/vp8/common/setupintrarecon.c new file mode 100644 index 000000000000..669564db42b2 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/setupintrarecon.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "setupintrarecon.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) +{ + int i; + + /* set up frame new frame for intra coded blocks */ + memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5); + for (i = 0; i < ybf->y_height; i++) + ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129; + + memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + for (i = 0; i < ybf->uv_height; i++) + ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129; + + memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + for (i = 0; i < ybf->uv_height; i++) + ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129; + +} + +void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf) +{ + memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5); + memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); +} diff --git a/thirdparty/libvpx/vp8/common/setupintrarecon.h b/thirdparty/libvpx/vp8/common/setupintrarecon.h new file mode 100644 index 000000000000..1857c4e26a59 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/setupintrarecon.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP8_COMMON_SETUPINTRARECON_H_ +#define VP8_COMMON_SETUPINTRARECON_H_ + +#include "./vpx_config.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif +extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf); +extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf); + +static INLINE void setup_intra_recon_left(unsigned char *y_buffer, + unsigned char *u_buffer, + unsigned char *v_buffer, + int y_stride, + int uv_stride) +{ + int i; + + for (i = 0; i < 16; i++) + y_buffer[y_stride *i] = (unsigned char) 129; + + for (i = 0; i < 8; i++) + u_buffer[uv_stride *i] = (unsigned char) 129; + + for (i = 0; i < 8; i++) + v_buffer[uv_stride *i] = (unsigned char) 129; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SETUPINTRARECON_H_ diff --git a/thirdparty/libvpx/vp8/common/swapyv12buffer.c b/thirdparty/libvpx/vp8/common/swapyv12buffer.c new file mode 100644 index 000000000000..73656b3d721e --- /dev/null +++ b/thirdparty/libvpx/vp8/common/swapyv12buffer.c @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "swapyv12buffer.h" + +void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame) +{ + unsigned char *temp; + + temp = last_frame->buffer_alloc; + last_frame->buffer_alloc = new_frame->buffer_alloc; + new_frame->buffer_alloc = temp; + + temp = last_frame->y_buffer; + last_frame->y_buffer = new_frame->y_buffer; + new_frame->y_buffer = temp; + + temp = last_frame->u_buffer; + last_frame->u_buffer = new_frame->u_buffer; + new_frame->u_buffer = temp; + + temp = last_frame->v_buffer; + last_frame->v_buffer = new_frame->v_buffer; + new_frame->v_buffer = temp; + +} diff --git a/thirdparty/libvpx/vp8/common/swapyv12buffer.h b/thirdparty/libvpx/vp8/common/swapyv12buffer.h new file mode 100644 index 000000000000..1d66cd3d62e6 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/swapyv12buffer.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_COMMON_SWAPYV12BUFFER_H_ +#define VP8_COMMON_SWAPYV12BUFFER_H_ + +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SWAPYV12BUFFER_H_ diff --git a/thirdparty/libvpx/vp8/common/systemdependent.h b/thirdparty/libvpx/vp8/common/systemdependent.h new file mode 100644 index 000000000000..3d44e37cf249 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/systemdependent.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_
+#define VP8_COMMON_SYSTEMDEPENDENT_H_
+
+#include "vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8Common;
+void vp8_machine_specific_config(struct VP8Common *);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_SYSTEMDEPENDENT_H_
diff --git a/thirdparty/libvpx/vp8/common/threading.h b/thirdparty/libvpx/vp8/common/threading.h
new file mode 100644
index 000000000000..183b49b8ff1d
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/threading.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_COMMON_THREADING_H_
+#define VP8_COMMON_THREADING_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+/* Thread management macros */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+/* Win32 */
+#include <process.h>
+#include <windows.h>
+#define THREAD_FUNCTION unsigned int __stdcall
+#define THREAD_FUNCTION_RETURN DWORD
+#define THREAD_SPECIFIC_INDEX DWORD
+#define pthread_t HANDLE
+#define pthread_attr_t DWORD
+#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
+#define thread_sleep(nms) Sleep(nms)
+#define pthread_cancel(thread) terminate_thread(thread,0)
+#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();};
+#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
+#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
+#define pthread_self() GetCurrentThreadId()
+
+#elif defined(__OS2__)
+/* OS/2 */
+#define INCL_DOS
+#include <os2.h>
+
+#include <stdlib.h>
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX PULONG
+#define pthread_t TID
+#define pthread_attr_t ULONG
+#define pthread_detach(thread) 0
+#define thread_sleep(nms) DosSleep(nms)
+#define pthread_cancel(thread) DosKillThread(thread)
+#define ts_key_create(ts_key, destructor) \
+    DosAllocThreadLocalMemory(1, &(ts_key));
+#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
+#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value))
+#define pthread_self() _gettid()
+#else
+#ifdef __APPLE__
+#include <mach/mach_init.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <time.h>
+#include <unistd.h>
+
+#else
+#include <semaphore.h>
+#endif
+
+#include <pthread.h>
+/* pthreads */
+/* Nearly everything is already defined */
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX pthread_key_t
+#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
+#endif
+
+/* Synchronization macros: Win32 and Pthreads */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#define sem_t HANDLE
+#define pause(voidpara) __asm PAUSE
+#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL)
+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))
+#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL)
+#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
+#define thread_sleep(nms) Sleep(nms)
+
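+/* The Win32 branch above emulates the POSIX counting-semaphore API with
+ * kernel semaphore objects: sem_init creates a HANDLE with an initial
+ * count of 0 and a ceiling of 32768, sem_wait blocks in
+ * WaitForSingleObject, and sem_post releases one count through
+ * ReleaseSemaphore. */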
+#elif defined(__OS2__)
+typedef struct
+{
+    HEV  event;
+    HMTX wait_mutex;
+    HMTX count_mutex;
+    int  count;
+} sem_t;
+
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value)
+{
+    DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
+                      value > 0 ? TRUE : FALSE);
+    DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
+    DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
+
+    sem->count = value;
+
+    return 0;
+}
+
+static inline int sem_wait(sem_t * sem)
+{
+    DosRequestMutexSem(sem->wait_mutex, -1);
+
+    DosWaitEventSem(sem->event, -1);
+
+    DosRequestMutexSem(sem->count_mutex, -1);
+
+    sem->count--;
+    if (sem->count == 0)
+    {
+        ULONG post_count;
+
+        DosResetEventSem(sem->event, &post_count);
+    }
+
+    DosReleaseMutexSem(sem->count_mutex);
+
+    DosReleaseMutexSem(sem->wait_mutex);
+
+    return 0;
+}
+
+static inline int sem_post(sem_t * sem)
+{
+    DosRequestMutexSem(sem->count_mutex, -1);
+
+    if (sem->count < 32768)
+    {
+        sem->count++;
+        DosPostEventSem(sem->event);
+    }
+
+    DosReleaseMutexSem(sem->count_mutex);
+
+    return 0;
+}
+
+static inline int sem_destroy(sem_t * sem)
+{
+    DosCloseEventSem(sem->event);
+    DosCloseMutexSem(sem->wait_mutex);
+    DosCloseMutexSem(sem->count_mutex);
+
+    return 0;
+}
+
+#define thread_sleep(nms) DosSleep(nms)
+
+#else
+
+#ifdef __APPLE__
+#define sem_t semaphore_t
+#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
+#define sem_wait(sem) (semaphore_wait(*sem) )
+#define sem_post(sem) semaphore_signal(*sem)
+#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem)
+#define thread_sleep(nms) /* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#else
+#include <unistd.h>
+#include <sched.h>
+#define thread_sleep(nms) sched_yield();/* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#endif
+/* Not Windows. Assume pthreads */
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#else
+#define x86_pause_hint()
+#endif
+
+#include "vpx_util/vpx_thread.h"
+
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+  const int kMaxTryLocks = 4000;
+  int locked = 0;
+  int i;
+
+  for (i = 0; i < kMaxTryLocks; ++i) {
+    if (!pthread_mutex_trylock(mutex)) {
+      locked = 1;
+      break;
+    }
+  }
+
+  if (!locked)
+    pthread_mutex_lock(mutex);
+}
+
+static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
+  int ret;
+  mutex_lock(mutex);
+  ret = *p;
+  pthread_mutex_unlock(mutex);
+  return ret;
+}
+
+static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
+                             const int *last_row_current_mb_col,
+                             const int nsync) {
+  while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) {
+    x86_pause_hint();
+    thread_sleep(0);
+  }
+}
+
+static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
+  mutex_lock(mutex);
+  *p = v;
+  pthread_mutex_unlock(mutex);
+}
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_THREADING_H_
diff --git a/thirdparty/libvpx/vp8/common/treecoder.c b/thirdparty/libvpx/vp8/common/treecoder.c
new file mode 100644
index 000000000000..d80c64bdfa1b
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/treecoder.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+#include <stdio.h>
+
+#include "treecoder.h"
+
+static void tree2tok(
+    struct vp8_token_struct *const p,
+    vp8_tree t,
+    int i,
+    int v,
+    int L
+)
+{
+    v += v;
+    ++L;
+
+    do
+    {
+        const vp8_tree_index j = t[i++];
+
+        if (j <= 0)
+        {
+            p[-j].value = v;
+            p[-j].Len = L;
+        }
+        else
+            tree2tok(p, t, j, v, L);
+    }
+    while (++v & 1);
+}
+
+void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t)
+{
+    tree2tok(p, t, 0, 0, 0);
+}
+
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t,
+                                 int offset)
+{
+    tree2tok(p - offset, t, 0, 0, 0);
+}
+
+static void branch_counts(
+    int n, /* n = size of alphabet */
+    vp8_token tok [ /* n */ ],
+    vp8_tree tree,
+    unsigned int branch_ct [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ]
+)
+{
+    const int tree_len = n - 1;
+    int t = 0;
+
+#if CONFIG_DEBUG
+    assert(tree_len);
+#endif
+
+    do
+    {
+        branch_ct[t][0] = branch_ct[t][1] = 0;
+    }
+    while (++t < tree_len);
+
+    t = 0;
+
+    do
+    {
+        int L = tok[t].Len;
+        const int enc = tok[t].value;
+        const unsigned int ct = num_events[t];
+
+        vp8_tree_index i = 0;
+
+        do
+        {
+            const int b = (enc >> --L) & 1;
+            const int j = i >> 1;
+#if CONFIG_DEBUG
+            assert(j < tree_len && 0 <= L);
+#endif
+
+            branch_ct [j] [b] += ct;
+            i = tree[ i + b];
+        }
+        while (i > 0);
+
+#if CONFIG_DEBUG
+        assert(!L);
+#endif
+    }
+    while (++t < n);
+
+}
+
+
+void vp8_tree_probs_from_distribution(
+    int n, /* n = size of alphabet */
+    vp8_token tok [ /* n */ ],
+    vp8_tree tree,
+    vp8_prob probs [ /* n-1 */ ],
+    unsigned int branch_ct [ /* n-1 */ ] [2],
+    const unsigned int num_events[ /* n */ ],
+    unsigned int Pfac,
+    int rd
+)
+{
+    const int tree_len = n - 1;
+    int t = 0;
+
+    branch_counts(n, tok, tree, branch_ct, num_events);
+
+    do
+    {
+        const unsigned int *const c = branch_ct[t];
+        const unsigned int tot = c[0] + c[1];
+
+#if CONFIG_DEBUG
+        assert(tot < (1 << 24)); /* no overflow below */
+#endif
+
+        if (tot)
+        {
+            const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
+            probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
+        }
+        else
+            probs[t] = vp8_prob_half;
+    }
+    while (++t < tree_len);
+}
diff --git a/thirdparty/libvpx/vp8/common/treecoder.h b/thirdparty/libvpx/vp8/common/treecoder.h
new file mode 100644
index 000000000000..d22b7c570cbe
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/treecoder.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + + +#ifndef VP8_COMMON_TREECODER_H_ +#define VP8_COMMON_TREECODER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef unsigned char vp8bc_index_t; /* probability index */ + + +typedef unsigned char vp8_prob; + +#define vp8_prob_half ( (vp8_prob) 128) + +typedef signed char vp8_tree_index; +struct bool_coder_spec; + +typedef struct bool_coder_spec bool_coder_spec; +typedef struct bool_writer bool_writer; +typedef struct bool_reader bool_reader; + +typedef const bool_coder_spec c_bool_coder_spec; +typedef const bool_writer c_bool_writer; +typedef const bool_reader c_bool_reader; + + + +# define vp8_complement( x) (255 - x) + + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of vp8_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const vp8_tree_index vp8_tree[], *vp8_tree_p; + + +typedef const struct vp8_token_struct +{ + int value; + int Len; +} vp8_token; + +/* Construct encoding array from tree. */ + +void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree); +void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree, + int offset); + + +/* Convert array of token occurrence counts into a table of probabilities + for the associated binary encoding tree. Also writes count of branches + taken for each node on the tree; this facilitiates decisions as to + probability updates. */ + +void vp8_tree_probs_from_distribution( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + vp8_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + unsigned int Pfactor, + int Round +); + +/* Variant of above using coder spec rather than hardwired 8-bit probs. */ + +void vp8bc_tree_probs_from_distribution( + int n, /* n = size of alphabet */ + vp8_token tok [ /* n */ ], + vp8_tree tree, + vp8_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + c_bool_coder_spec *s +); + + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_TREECODER_H_ diff --git a/thirdparty/libvpx/vp8/common/vp8_entropymodedata.h b/thirdparty/libvpx/vp8/common/vp8_entropymodedata.h new file mode 100644 index 000000000000..c4aed4989705 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/vp8_entropymodedata.h @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+*/ + +#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*Generated file, included by entropymode.c*/ + + +const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] = +{ + { 0, 1 }, + { 2, 2 }, + { 6, 3 }, + { 28, 5 }, + { 30, 5 }, + { 58, 6 }, + { 59, 6 }, + { 62, 6 }, + { 126, 7 }, + { 127, 7 } +}; + +const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] = +{ + { 0, 1 }, + { 4, 3 }, + { 5, 3 }, + { 6, 3 }, + { 7, 3 } +}; + +const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] = +{ + { 4, 3 }, + { 5, 3 }, + { 6, 3 }, + { 7, 3 }, + { 0, 1 } +}; + +const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] = +{ + { 0, 1 }, + { 2, 2 }, + { 6, 3 }, + { 7, 3 } +}; + +const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] = +{ + { 6, 3 }, + { 7, 3 }, + { 2, 2 }, + { 0, 1 } +}; + +const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] = +{ + { 2, 2 }, + { 6, 3 }, + { 0, 1 }, + { 14, 4 }, + { 15, 4 } +}; + +const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] = +{ + { 0, 1 }, + { 2, 2 }, + { 6, 3 }, + { 7, 3 } +}; + +const struct vp8_token_struct vp8_small_mvencodings[8] = +{ + { 0, 3 }, + { 1, 3 }, + { 2, 3 }, + { 3, 3 }, + { 4, 3 }, + { 5, 3 }, + { 6, 3 }, + { 7, 3 } +}; + +const vp8_prob vp8_ymode_prob[VP8_YMODES-1] = +{ + 112, 86, 140, 37 +}; + +const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1] = +{ + 145, 156, 163, 128 +}; + +const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES-1] = +{ + 162, 101, 204 +}; + +const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1] = +{ + 142, 114, 183 +}; + +const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES-1] = +{ + 120, 90, 79, 133, 87, 85, 80, 111, 151 +}; + + + +const vp8_prob vp8_kf_bmode_prob +[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1] = +{ + { + { 231, 120, 48, 89, 115, 113, 120, 152, 112 }, + { 152, 179, 64, 126, 170, 118, 46, 70, 95 }, + { 175, 69, 143, 80, 85, 82, 72, 155, 103 }, + { 56, 58, 10, 171, 218, 189, 17, 13, 152 }, + { 144, 71, 10, 38, 171, 213, 144, 34, 26 }, + { 114, 26, 17, 163, 44, 195, 21, 10, 173 }, + { 121, 24, 80, 195, 26, 62, 44, 64, 85 }, + { 170, 46, 55, 19, 136, 160, 33, 206, 71 }, + { 63, 20, 8, 114, 114, 208, 12, 9, 226 }, + { 81, 40, 11, 96, 182, 84, 29, 16, 36 } + }, + { + { 134, 183, 89, 137, 98, 101, 106, 165, 148 }, + { 72, 187, 100, 130, 157, 111, 32, 75, 80 }, + { 66, 102, 167, 99, 74, 62, 40, 234, 128 }, + { 41, 53, 9, 178, 241, 141, 26, 8, 107 }, + { 104, 79, 12, 27, 217, 255, 87, 17, 7 }, + { 74, 43, 26, 146, 73, 166, 49, 23, 157 }, + { 65, 38, 105, 160, 51, 52, 31, 115, 128 }, + { 87, 68, 71, 44, 114, 51, 15, 186, 23 }, + { 47, 41, 14, 110, 182, 183, 21, 17, 194 }, + { 66, 45, 25, 102, 197, 189, 23, 18, 22 } + }, + { + { 88, 88, 147, 150, 42, 46, 45, 196, 205 }, + { 43, 97, 183, 117, 85, 38, 35, 179, 61 }, + { 39, 53, 200, 87, 26, 21, 43, 232, 171 }, + { 56, 34, 51, 104, 114, 102, 29, 93, 77 }, + { 107, 54, 32, 26, 51, 1, 81, 43, 31 }, + { 39, 28, 85, 171, 58, 165, 90, 98, 64 }, + { 34, 22, 116, 206, 23, 34, 43, 166, 73 }, + { 68, 25, 106, 22, 64, 171, 36, 225, 114 }, + { 34, 19, 21, 102, 132, 188, 16, 76, 124 }, + { 62, 18, 78, 95, 85, 57, 50, 48, 51 } + }, + { + { 193, 101, 35, 159, 215, 111, 89, 46, 111 }, + { 60, 148, 31, 172, 219, 228, 21, 18, 111 }, + { 112, 113, 77, 85, 179, 255, 38, 120, 114 }, + { 40, 42, 1, 196, 245, 209, 10, 25, 109 }, + { 100, 80, 8, 43, 154, 1, 51, 26, 71 }, + { 88, 43, 29, 140, 166, 213, 37, 43, 154 }, + { 61, 63, 30, 155, 67, 
45, 68, 1, 209 }, + { 142, 78, 78, 16, 255, 128, 34, 197, 171 }, + { 41, 40, 5, 102, 211, 183, 4, 1, 221 }, + { 51, 50, 17, 168, 209, 192, 23, 25, 82 } + }, + { + { 125, 98, 42, 88, 104, 85, 117, 175, 82 }, + { 95, 84, 53, 89, 128, 100, 113, 101, 45 }, + { 75, 79, 123, 47, 51, 128, 81, 171, 1 }, + { 57, 17, 5, 71, 102, 57, 53, 41, 49 }, + { 115, 21, 2, 10, 102, 255, 166, 23, 6 }, + { 38, 33, 13, 121, 57, 73, 26, 1, 85 }, + { 41, 10, 67, 138, 77, 110, 90, 47, 114 }, + { 101, 29, 16, 10, 85, 128, 101, 196, 26 }, + { 57, 18, 10, 102, 102, 213, 34, 20, 43 }, + { 117, 20, 15, 36, 163, 128, 68, 1, 26 } + }, + { + { 138, 31, 36, 171, 27, 166, 38, 44, 229 }, + { 67, 87, 58, 169, 82, 115, 26, 59, 179 }, + { 63, 59, 90, 180, 59, 166, 93, 73, 154 }, + { 40, 40, 21, 116, 143, 209, 34, 39, 175 }, + { 57, 46, 22, 24, 128, 1, 54, 17, 37 }, + { 47, 15, 16, 183, 34, 223, 49, 45, 183 }, + { 46, 17, 33, 183, 6, 98, 15, 32, 183 }, + { 65, 32, 73, 115, 28, 128, 23, 128, 205 }, + { 40, 3, 9, 115, 51, 192, 18, 6, 223 }, + { 87, 37, 9, 115, 59, 77, 64, 21, 47 } + }, + { + { 104, 55, 44, 218, 9, 54, 53, 130, 226 }, + { 64, 90, 70, 205, 40, 41, 23, 26, 57 }, + { 54, 57, 112, 184, 5, 41, 38, 166, 213 }, + { 30, 34, 26, 133, 152, 116, 10, 32, 134 }, + { 75, 32, 12, 51, 192, 255, 160, 43, 51 }, + { 39, 19, 53, 221, 26, 114, 32, 73, 255 }, + { 31, 9, 65, 234, 2, 15, 1, 118, 73 }, + { 88, 31, 35, 67, 102, 85, 55, 186, 85 }, + { 56, 21, 23, 111, 59, 205, 45, 37, 192 }, + { 55, 38, 70, 124, 73, 102, 1, 34, 98 } + }, + { + { 102, 61, 71, 37, 34, 53, 31, 243, 192 }, + { 69, 60, 71, 38, 73, 119, 28, 222, 37 }, + { 68, 45, 128, 34, 1, 47, 11, 245, 171 }, + { 62, 17, 19, 70, 146, 85, 55, 62, 70 }, + { 75, 15, 9, 9, 64, 255, 184, 119, 16 }, + { 37, 43, 37, 154, 100, 163, 85, 160, 1 }, + { 63, 9, 92, 136, 28, 64, 32, 201, 85 }, + { 86, 6, 28, 5, 64, 255, 25, 248, 1 }, + { 56, 8, 17, 132, 137, 255, 55, 116, 128 }, + { 58, 15, 20, 82, 135, 57, 26, 121, 40 } + }, + { + { 164, 50, 31, 137, 154, 133, 25, 35, 218 }, + { 51, 103, 44, 131, 131, 123, 31, 6, 158 }, + { 86, 40, 64, 135, 148, 224, 45, 183, 128 }, + { 22, 26, 17, 131, 240, 154, 14, 1, 209 }, + { 83, 12, 13, 54, 192, 255, 68, 47, 28 }, + { 45, 16, 21, 91, 64, 222, 7, 1, 197 }, + { 56, 21, 39, 155, 60, 138, 23, 102, 213 }, + { 85, 26, 85, 85, 128, 128, 32, 146, 171 }, + { 18, 11, 7, 63, 144, 171, 4, 4, 246 }, + { 35, 27, 10, 146, 174, 171, 12, 26, 128 } + }, + { + { 190, 80, 35, 99, 180, 80, 126, 54, 45 }, + { 85, 126, 47, 87, 176, 51, 41, 20, 32 }, + { 101, 75, 128, 139, 118, 146, 116, 128, 85 }, + { 56, 41, 15, 176, 236, 85, 37, 9, 62 }, + { 146, 36, 19, 30, 171, 255, 97, 27, 20 }, + { 71, 30, 17, 119, 118, 255, 17, 18, 138 }, + { 101, 38, 60, 138, 55, 70, 43, 26, 142 }, + { 138, 45, 61, 62, 219, 1, 81, 188, 64 }, + { 32, 41, 20, 117, 151, 142, 20, 21, 163 }, + { 112, 19, 12, 61, 195, 128, 48, 4, 24 } + } +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_ diff --git a/thirdparty/libvpx/vp8/common/vp8_loopfilter.c b/thirdparty/libvpx/vp8/common/vp8_loopfilter.c new file mode 100644 index 000000000000..756ad488f95c --- /dev/null +++ b/thirdparty/libvpx/vp8/common/vp8_loopfilter.c @@ -0,0 +1,661 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "loopfilter.h" +#include "onyxc_int.h" +#include "vpx_mem/vpx_mem.h" + + +static void lf_init_lut(loop_filter_info_n *lfi) +{ + int filt_lvl; + + for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) + { + if (filt_lvl >= 40) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3; + } + else if (filt_lvl >= 20) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2; + } + else if (filt_lvl >= 15) + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1; + } + else + { + lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0; + lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0; + } + } + + lfi->mode_lf_lut[DC_PRED] = 1; + lfi->mode_lf_lut[V_PRED] = 1; + lfi->mode_lf_lut[H_PRED] = 1; + lfi->mode_lf_lut[TM_PRED] = 1; + lfi->mode_lf_lut[B_PRED] = 0; + + lfi->mode_lf_lut[ZEROMV] = 1; + lfi->mode_lf_lut[NEARESTMV] = 2; + lfi->mode_lf_lut[NEARMV] = 2; + lfi->mode_lf_lut[NEWMV] = 2; + lfi->mode_lf_lut[SPLITMV] = 3; + +} + +void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl) +{ + int i; + + /* For each possible value for the loop filter fill out limits */ + for (i = 0; i <= MAX_LOOP_FILTER; i++) + { + int filt_lvl = i; + int block_inside_limit = 0; + + /* Set loop filter paramaeters that control sharpness. */ + block_inside_limit = filt_lvl >> (sharpness_lvl > 0); + block_inside_limit = block_inside_limit >> (sharpness_lvl > 4); + + if (sharpness_lvl > 0) + { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) + block_inside_limit = 1; + + memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); + memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH); + memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +void vp8_loop_filter_init(VP8_COMMON *cm) +{ + loop_filter_info_n *lfi = &cm->lf_info; + int i; + + /* init limits for given sharpness*/ + vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + + /* init LUT for lvl and hev thr picking */ + lf_init_lut(lfi); + + /* init hev threshold const vectors */ + for(i = 0; i < 4 ; i++) + { + memset(lfi->hev_thr[i], i, SIMD_WIDTH); + } +} + +void vp8_loop_filter_frame_init(VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl) +{ + int seg, /* segment number */ + ref, /* index in ref_lf_deltas */ + mode; /* index in mode_lf_deltas */ + + loop_filter_info_n *lfi = &cm->lf_info; + + /* update limits if sharpness has changed */ + if(cm->last_sharpness_level != cm->sharpness_level) + { + vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + } + + for(seg = 0; seg < MAX_MB_SEGMENTS; seg++) + { + int lvl_seg = default_filt_lvl; + int lvl_ref, lvl_mode; + + /* Note the baseline filter values for each segment */ + if (mbd->segmentation_enabled) + { + /* Abs value */ + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + { + lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + } + else /* Delta Value */ + { + lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; + } + lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 
63: lvl_seg) : 0; + } + + if (!mbd->mode_ref_lf_delta_enabled) + { + /* we could get rid of this if we assume that deltas are set to + * zero when not in use; encoder always uses deltas + */ + memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 ); + continue; + } + + /* INTRA_FRAME */ + ref = INTRA_FRAME; + + /* Apply delta for reference frame */ + lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref]; + + /* Apply delta for Intra modes */ + mode = 0; /* B_PRED */ + /* Only the split mode BPRED has a further special case */ + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + /* clamp */ + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; + + lfi->lvl[seg][ref][mode] = lvl_mode; + + mode = 1; /* all the rest of Intra modes */ + /* clamp */ + lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; + lfi->lvl[seg][ref][mode] = lvl_mode; + + /* LAST, GOLDEN, ALT */ + for(ref = 1; ref < MAX_REF_FRAMES; ref++) + { + /* Apply delta for reference frame */ + lvl_ref = lvl_seg + mbd->ref_lf_deltas[ref]; + + /* Apply delta for Inter modes */ + for (mode = 1; mode < 4; mode++) + { + lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode]; + /* clamp */ + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; + + lfi->lvl[seg][ref][mode] = lvl_mode; + } + } + } +} + + +void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context, + int mb_row, int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr) +{ + int mb_col; + int filter_level; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + FRAME_TYPE frame_type = cm->frame_type; + + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh + (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + +} + +void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context, + int mb_row, int post_ystride, int post_uvstride, + unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr) +{ + int mb_col; + int filter_level; + loop_filter_info_n *lfi_n = &cm->lf_info; + (void)post_uvstride; + + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = 
mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post_ystride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post_ystride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (y_ptr, post_ystride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post_ystride, lfi_n->blim[filter_level]); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + +} +void vp8_loop_filter_frame(VP8_COMMON *cm, + MACROBLOCKD *mbd, + int frame_type) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int mb_row; + int mb_col; + int mb_rows = cm->mb_rows; + int mb_cols = cm->mb_cols; + + int filter_level; + + unsigned char *y_ptr, *u_ptr, *v_ptr; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + int post_y_stride = post->y_stride; + int post_uv_stride = post->uv_stride; + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init(cm, mbd, cm->filter_level); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + u_ptr = post->u_buffer; + v_ptr = post->v_buffer; + + /* vp8_filter each macro block */ + if (cm->filter_type == NORMAL_LOOPFILTER) + { + for (mb_row = 0; mb_row < mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + + } + } + else /* SIMPLE_LOOPFILTER */ + { + for (mb_row = 0; mb_row < mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + if (filter_level) + { 
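+                    /* Simple in-loop filter variant: only the luma plane is
+                     * filtered; macroblock edges use the wider mblim limit,
+                     * interior 4x4 block edges use blim. */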
+ const unsigned char * mblim = lfi_n->mblim[filter_level]; + const unsigned char * blim = lfi_n->blim[filter_level]; + + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post_y_stride, mblim); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post_y_stride, blim); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (y_ptr, post_y_stride, mblim); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post_y_stride, blim); + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + y_ptr += post_y_stride * 16 - post->y_width; + u_ptr += post_uv_stride * 8 - post->uv_width; + v_ptr += post_uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + + } + } +} + +void vp8_loop_filter_frame_yonly +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl +) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int filter_level; + FRAME_TYPE frame_type = cm->frame_type; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + + /* vp8_filter each macro block */ + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context ++; /* step to next MB */ + + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context ++; /* Skip border mb */ + } + +} + +void vp8_loop_filter_partial_frame +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl +) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + int mb_cols = 
post->y_width >> 4; + int mb_rows = post->y_height >> 4; + + int linestocopy; + + loop_filter_info_n *lfi_n = &cm->lf_info; + loop_filter_info lfi; + + int filter_level; + FRAME_TYPE frame_type = cm->frame_type; + + const MODE_INFO *mode_info_context; + +#if 0 + if(default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl); + + /* number of MB rows to use in partial filtering */ + linestocopy = mb_rows / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ + + /* Set up the buffer pointers; partial image starts at ~middle of frame */ + y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride; + mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); + + /* vp8_filter each macro block */ + for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++) + { + for (mb_col = 0; mb_col < mb_cols; mb_col++) + { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = + lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) + { + if (cm->filter_type == NORMAL_LOOPFILTER) + { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + vp8_loop_filter_mbh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + else + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + + vp8_loop_filter_simple_mbh + (y_ptr, post->y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (y_ptr, post->y_stride, lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context += 1; /* step to next MB */ + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context += 1; /* Skip border mb */ + } +} diff --git a/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm b/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm new file mode 100644 index 000000000000..86fae2695631 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm @@ -0,0 +1,93 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_copy32xn_sse2( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +global sym(vp8_copy32xn_sse2) PRIVATE +sym(vp8_copy32xn_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;dst_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;dst_stride + movsxd rcx, dword ptr arg(4) ;height + +.block_copy_sse2_loopx4: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + movdqu xmm2, XMMWORD PTR [rsi + rax] + movdqu xmm3, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqu xmm4, XMMWORD PTR [rsi] + movdqu xmm5, XMMWORD PTR [rsi + 16] + movdqu xmm6, XMMWORD PTR [rsi + rax] + movdqu xmm7, XMMWORD PTR [rsi + rax + 16] + + lea rsi, [rsi+rax*2] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + movdqa XMMWORD PTR [rdi + rdx], xmm2 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 + + lea rdi, [rdi+rdx*2] + + movdqa XMMWORD PTR [rdi], xmm4 + movdqa XMMWORD PTR [rdi + 16], xmm5 + movdqa XMMWORD PTR [rdi + rdx], xmm6 + movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 + + lea rdi, [rdi+rdx*2] + + sub rcx, 4 + cmp rcx, 4 + jge .block_copy_sse2_loopx4 + + cmp rcx, 0 + je .copy_is_done + +.block_copy_sse2_loop: + movdqu xmm0, XMMWORD PTR [rsi] + movdqu xmm1, XMMWORD PTR [rsi + 16] + lea rsi, [rsi+rax] + + movdqa XMMWORD PTR [rdi], xmm0 + movdqa XMMWORD PTR [rdi + 16], xmm1 + lea rdi, [rdi+rdx] + + sub rcx, 1 + jne .block_copy_sse2_loop + +.copy_is_done: + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm b/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm new file mode 100644 index 000000000000..d789a40ccf7c --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm @@ -0,0 +1,146 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "vpx_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE_X3 0 +%if ABI_IS_32BIT + %define src_ptr rsi + %define src_stride rax + %define ref_ptr rdi + %define ref_stride rdx + %define end_ptr rcx + %define ret_var rbx + %define result_ptr arg(4) + %define max_sad arg(4) + %define height dword ptr arg(4) + push rbp + mov rbp, rsp + push rsi + push rdi + push rbx + + mov rsi, arg(0) ; src_ptr + mov rdi, arg(2) ; ref_ptr + + movsxd rax, dword ptr arg(1) ; src_stride + movsxd rdx, dword ptr arg(3) ; ref_stride +%else + %if LIBVPX_YASM_WIN64 + SAVE_XMM 7, u + %define src_ptr rcx + %define src_stride rdx + %define ref_ptr r8 + %define ref_stride r9 + %define end_ptr r10 + %define ret_var r11 + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define max_sad [rsp+xmm_stack_space+8+4*8] + %define height dword ptr [rsp+xmm_stack_space+8+4*8] + %else + %define src_ptr rdi + %define src_stride rsi + %define ref_ptr rdx + %define ref_stride rcx + %define end_ptr r9 + %define ret_var r10 + %define result_ptr r8 + %define max_sad r8 + %define height r8 + %endif +%endif + +%endmacro + +%macro STACK_FRAME_DESTROY_X3 0 + %define src_ptr + %define src_stride + %define ref_ptr + %define ref_stride + %define end_ptr + %define ret_var + %define result_ptr + %define max_sad + %define height + +%if ABI_IS_32BIT + pop rbx + pop rdi + pop rsi + pop rbp +%else + %if LIBVPX_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + + +;void vp8_copy32xn_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; int height); +global sym(vp8_copy32xn_sse3) PRIVATE +sym(vp8_copy32xn_sse3): + + STACK_FRAME_CREATE_X3 + +.block_copy_sse3_loopx4: + lea end_ptr, [src_ptr+src_stride*2] + + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] + movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] + movdqu xmm4, XMMWORD PTR [end_ptr] + movdqu xmm5, XMMWORD PTR [end_ptr + 16] + movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] + movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] + + lea src_ptr, [src_ptr+src_stride*4] + + lea end_ptr, [ref_ptr+ref_stride*2] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 + movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 + movdqa XMMWORD PTR [end_ptr], xmm4 + movdqa XMMWORD PTR [end_ptr + 16], xmm5 + movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 + movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 + + lea ref_ptr, [ref_ptr+ref_stride*4] + + sub height, 4 + cmp height, 4 + jge .block_copy_sse3_loopx4 + + ;Check to see if there is more rows need to be copied. + cmp height, 0 + je .copy_is_done + +.block_copy_sse3_loop: + movdqu xmm0, XMMWORD PTR [src_ptr] + movdqu xmm1, XMMWORD PTR [src_ptr + 16] + lea src_ptr, [src_ptr+src_stride] + + movdqa XMMWORD PTR [ref_ptr], xmm0 + movdqa XMMWORD PTR [ref_ptr + 16], xmm1 + lea ref_ptr, [ref_ptr+ref_stride] + + sub height, 1 + jne .block_copy_sse3_loop + +.copy_is_done: + STACK_FRAME_DESTROY_X3 diff --git a/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm b/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm new file mode 100644 index 000000000000..4e551f00aa60 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm @@ -0,0 +1,258 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) +global sym(vp8_dequantize_b_impl_mmx) PRIVATE +sym(vp8_dequantize_b_impl_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;sq + mov rdi, arg(1) ;dq + mov rax, arg(2) ;q + + movq mm1, [rsi] + pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers. + movq [rdi], mm1 + + movq mm1, [rsi+8] + pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers. + movq [rdi+8], mm1 + + movq mm1, [rsi+16] + pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers. + movq [rdi+16], mm1 + + movq mm1, [rsi+24] + pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers. + movq [rdi+24], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void dequant_idct_add_mmx( +;short *input, 0 +;short *dq, 1 +;unsigned char *dest, 2 +;int stride) 3 +global sym(vp8_dequant_idct_add_mmx) PRIVATE +sym(vp8_dequant_idct_add_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rdi + ; end prolog + + mov rax, arg(0) ;input + mov rdx, arg(1) ;dq + + + movq mm0, [rax ] + pmullw mm0, [rdx] + + movq mm1, [rax +8] + pmullw mm1, [rdx +8] + + movq mm2, [rax+16] + pmullw mm2, [rdx+16] + + movq mm3, [rax+24] + pmullw mm3, [rdx+24] + + mov rdx, arg(2) ;dest + + pxor mm7, mm7 + + + movq [rax], mm7 + movq [rax+8], mm7 + + movq [rax+16],mm7 + movq [rax+24],mm7 + + + movsxd rdi, dword ptr arg(3) ;stride + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq mm3, mm5 ; 33 23 13 03 + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + paddw mm0, [GLOBAL(fours)] + + paddw mm2, [GLOBAL(fours)] + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + psraw mm2, 3 + + psraw mm0, 3 + psraw mm4, 3 + + 
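+        ; together with the "fours" bias added above, these arithmetic
+        ; right shifts give the rounded (x + 4) >> 3 of the second IDCT
+        ; pass before the results are transposed back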
psraw mm6, 3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + pxor mm7, mm7 + + movd mm4, [rdx] + punpcklbw mm4, mm7 + paddsw mm0, mm4 + packuswb mm0, mm7 + movd [rdx], mm0 + + movd mm4, [rdx+rdi] + punpcklbw mm4, mm7 + paddsw mm1, mm4 + packuswb mm1, mm7 + movd [rdx+rdi], mm1 + + movd mm4, [rdx+2*rdi] + punpcklbw mm4, mm7 + paddsw mm2, mm4 + packuswb mm2, mm7 + movd [rdx+rdi*2], mm2 + + add rdx, rdi + + movd mm4, [rdx+2*rdi] + punpcklbw mm4, mm7 + paddsw mm5, mm4 + packuswb mm5, mm7 + movd [rdx+rdi*2], mm5 + + ; begin epilog + pop rdi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/thirdparty/libvpx/vp8/common/x86/filter_x86.c b/thirdparty/libvpx/vp8/common/x86/filter_x86.c new file mode 100644 index 000000000000..7f496ed7dba7 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/filter_x86.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/x86/filter_x86.h" + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = +{ + { 128, 128, 128, 128, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 112, 112, 112, 112 } +}; + +DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = +{ + { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } +}; diff --git a/thirdparty/libvpx/vp8/common/x86/filter_x86.h b/thirdparty/libvpx/vp8/common/x86/filter_x86.h new file mode 100644 index 000000000000..d282841bee4d --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/filter_x86.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP8_COMMON_X86_FILTER_X86_H_ +#define VP8_COMMON_X86_FILTER_X86_H_ + +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with + * duplicated values */ + +/* duplicated 4x */ +extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); + +/* duplicated 8x */ +extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_X86_FILTER_X86_H_ diff --git a/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c b/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c new file mode 100644 index 000000000000..f2532b34da2c --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" + +extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); + +void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) +{ + short *sq = (short *) d->qcoeff; + short *dq = (short *) d->dqcoeff; + + vp8_dequantize_b_impl_mmx(sq, dq, DQC); +} + +void vp8_dequant_idct_add_y_block_mmx + (short *q, short *dq, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, dst, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride, + dst+4, stride); + memset(q + 16, 0, 2 * sizeof(q[0])); + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride); + else if (eobs[2] == 1) + { + vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride, + dst+8, stride); + memset(q + 32, 0, 2 * sizeof(q[0])); + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride); + else if (eobs[3] == 1) + { + vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride, + dst+12, stride); + memset(q + 48, 0, 2 * sizeof(q[0])); + } + + q += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_mmx + (short *q, short *dq, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, dstu, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride, + dstu+4, stride); + memset(q + 16, 0, 2 * sizeof(q[0])); + } + + q += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_mmx (q, dq, dstv, stride); + else if (eobs[0] == 1) + { + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + if 
(eobs[1] > 1) + vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride); + else if (eobs[1] == 1) + { + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride, + dstv+4, stride); + memset(q + 16, 0, 2 * sizeof(q[0])); + } + + q += 32; + dstv += 4*stride; + eobs += 2; + } +} diff --git a/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c b/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c new file mode 100644 index 000000000000..ae96ec858ce2 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp8_rtcd.h" + +void vp8_idct_dequant_0_2x_sse2 + (short *q, short *dq , + unsigned char *dst, int dst_stride); +void vp8_idct_dequant_full_2x_sse2 + (short *q, short *dq , + unsigned char *dst, int dst_stride); + +void vp8_dequant_idct_add_y_block_sse2 + (short *q, short *dq, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (((short *)(eobs))[0]) + { + if (((short *)(eobs))[0] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride); + } + if (((short *)(eobs))[1]) + { + if (((short *)(eobs))[1] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride); + else + vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride); + } + q += 64; + dst += stride*4; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_sse2 + (short *q, short *dq, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + if (((short *)(eobs))[0]) + { + if (((short *)(eobs))[0] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); + } + q += 32; + dstu += stride*4; + + if (((short *)(eobs))[1]) + { + if (((short *)(eobs))[1] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); + } + q += 32; + + if (((short *)(eobs))[2]) + { + if (((short *)(eobs))[2] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); + } + q += 32; + dstv += stride*4; + + if (((short *)(eobs))[3]) + { + if (((short *)(eobs))[3] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); + } +} diff --git a/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm b/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm new file mode 100644 index 000000000000..96fa2c60d0ff --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm @@ -0,0 +1,295 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
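+;
+; The idct_blk_{mmx,sse2}.c wrappers above pick between the DC-only and
+; full IDCT kernels from the per-block end-of-block counts: the MMX wrapper
+; tests each byte (eobs[i] > 1), while the SSE2 wrapper handles block pairs
+; with the 0xfefe mask. As a rough scalar sketch of that test (hypothetical
+; helper, not a function defined in this tree; eobs holds small per-block
+; coefficient counts):
+;
+;   /* non-zero iff either of two adjacent blocks has eob > 1, */
+;   /* i.e. needs the full IDCT rather than the DC-only path   */
+;   int needs_full_idct(const char *eobs) {
+;       return (((const short *)eobs)[0] & 0xfefe) != 0;
+;   }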
+; + + +%include "vpx_ports/x86_abi_support.asm" + +; /**************************************************************************** +; * Notes: +; * +; * This implementation makes use of 16 bit fixed point version of two multiply +; * constants: +; * 1. sqrt(2) * cos (pi/8) +; * 2. sqrt(2) * sin (pi/8) +; * Because the first constant is bigger than 1, to maintain the same 16 bit +; * fixed point precision as the second one, we use a trick of +; * x * a = x + x*(a-1) +; * so +; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). +; * +; * For the second constant, because of the 16bit version is 35468, which +; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative +; * number. +; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x +; * +; **************************************************************************/ + + +;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, +;int pitch, unsigned char *dest,int stride) +global sym(vp8_short_idct4x4llm_mmx) PRIVATE +sym(vp8_short_idct4x4llm_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(0) ;input + mov rsi, arg(1) ;pred + + movq mm0, [rax ] + movq mm1, [rax+ 8] + movq mm2, [rax+16] + movq mm3, [rax+24] + +%if 0 + pxor mm7, mm7 + movq [rax], mm7 + movq [rax+8], mm7 + movq [rax+16],mm7 + movq [rax+24],mm7 +%endif + movsxd rax, dword ptr arg(2) ;pitch + mov rdx, arg(3) ;dest + movsxd rdi, dword ptr arg(4) ;stride + + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + movq mm3, mm5 ; 33 23 13 03 + + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; + + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 + + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; + + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 + + movq mm5, mm1 + movq mm4, mm3 + + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 + + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 + + paddw mm3, mm5 ; d1 + paddw mm0, [GLOBAL(fours)] + + paddw mm2, [GLOBAL(fours)] + movq mm6, mm2 ; a1 + + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 + + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 + + psubw mm6, mm3 ;3 + psraw mm2, 3 + + psraw mm0, 3 + psraw mm4, 3 + + psraw mm6, 3 + + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 + + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 + + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 + + punpckldq mm0, 
mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 + + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 + + pxor mm7, mm7 + + movd mm4, [rsi] + punpcklbw mm4, mm7 + paddsw mm0, mm4 + packuswb mm0, mm7 + movd [rdx], mm0 + + movd mm4, [rsi+rax] + punpcklbw mm4, mm7 + paddsw mm1, mm4 + packuswb mm1, mm7 + movd [rdx+rdi], mm1 + + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm2, mm4 + packuswb mm2, mm7 + movd [rdx+rdi*2], mm2 + + add rdx, rdi + add rsi, rax + + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm5, mm4 + packuswb mm5, mm7 + movd [rdx+rdi*2], mm5 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_dc_only_idct_add_mmx( +;short input_dc, +;unsigned char *pred_ptr, +;int pred_stride, +;unsigned char *dst_ptr, +;int stride) +global sym(vp8_dc_only_idct_add_mmx) PRIVATE +sym(vp8_dc_only_idct_add_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + ; end prolog + + movd mm5, arg(0) ;input_dc + mov rax, arg(1) ;pred_ptr + movsxd rdx, dword ptr arg(2) ;pred_stride + + pxor mm0, mm0 + + paddw mm5, [GLOBAL(fours)] + lea rcx, [rdx + rdx*2] + + psraw mm5, 3 + + punpcklwd mm5, mm5 + + punpckldq mm5, mm5 + + movd mm1, [rax] + movd mm2, [rax+rdx] + movd mm3, [rax+2*rdx] + movd mm4, [rax+rcx] + + mov rax, arg(3) ;d -- destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + punpcklbw mm1, mm0 + paddsw mm1, mm5 + packuswb mm1, mm0 ; pack and unpack to saturate + lea rcx, [rdx + rdx*2] + + punpcklbw mm2, mm0 + paddsw mm2, mm5 + packuswb mm2, mm0 ; pack and unpack to saturate + + punpcklbw mm3, mm0 + paddsw mm3, mm5 + packuswb mm3, mm0 ; pack and unpack to saturate + + punpcklbw mm4, mm0 + paddsw mm4, mm5 + packuswb mm4, mm0 ; pack and unpack to saturate + + movd [rax], mm1 + movd [rax+rdx], mm2 + movd [rax+2*rdx], mm3 + movd [rax+rcx], mm4 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm b/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm new file mode 100644 index 000000000000..bf8e2c402106 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm @@ -0,0 +1,708 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
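+;
+; The Notes block in idctllm_mmx.asm above documents the two 16-bit
+; fixed-point constants used by both the MMX and SSE2 IDCTs:
+;   x_c1sqr2less1 = 0x4E7B = 20091 ~= (sqrt(2)*cos(pi/8) - 1) * 65536
+;   x_s1sqr2      = 0x8A8C = 35468 ~= (sqrt(2)*sin(pi/8))     * 65536
+; A scalar C sketch of what each pmulhw/paddw pair computes per 16-bit
+; lane (hypothetical helpers, assuming arithmetic right shift on int):
+;
+;   short mul_c1sqr2(short x) {   /* x * sqrt(2)*cos(pi/8), constant > 1, */
+;       /* so applied as x + x*(sqrt(2)*cos(pi/8) - 1)                   */
+;       return x + (short)(((int)x * 20091) >> 16);
+;   }
+;   short mul_s1sqr2(short x) {   /* x * sqrt(2)*sin(pi/8): 35468 reads   */
+;       /* as -30068 in pmulhw's signed multiply, so the missing         */
+;       /* x*65536 >> 16 == x is added back afterwards                   */
+;       return x + (short)(((int)x * -30068) >> 16);
+;   }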
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_idct_dequant_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; ) + +global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE +sym(vp8_idct_dequant_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + ; end prolog + + mov rdx, arg(1) ; dequant + mov rax, arg(0) ; qcoeff + + movd xmm4, [rax] + movd xmm5, [rdx] + + pinsrw xmm4, [rax+32], 4 + pinsrw xmm5, [rdx], 4 + + pmullw xmm4, xmm5 + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; clear coeffs + movd [rax], xmm5 + movd [rax+32], xmm5 +;pshufb + mov rax, arg(2) ; dst + movsxd rdx, dword ptr arg(3) ; dst_stride + + pshuflw xmm4, xmm4, 00000000b + pshufhw xmm4, xmm4, 00000000b + + lea rcx, [rdx + rdx*2] + paddw xmm4, [GLOBAL(fours)] + + psraw xmm4, 3 + + movq xmm0, [rax] + movq xmm1, [rax+rdx] + movq xmm2, [rax+2*rdx] + movq xmm3, [rax+rcx] + + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; store blocks back out + movq [rax], xmm0 + movq [rax + rdx], xmm1 + + lea rax, [rax + 2*rdx] + + movq [rax], xmm2 + movq [rax + rdx], xmm3 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_idct_dequant_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; ) +global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE +sym(vp8_idct_dequant_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rdx, arg(1) ; dequant + mov rdi, arg(2) ; dst + + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + movsxd rdx, dword ptr arg(3) ; dst_stride + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + lea rcx, [rdx + rdx*2] ;dst_stride * 3 + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw 
xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rdi+2*rdx] + movq xmm5, [rdi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + 
packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_idct_dequant_dc_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 +; ) +global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE +sym(vp8_idct_dequant_dc_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + + mov rdi, arg(2) ; dst + mov rdx, arg(4) ; dc + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; load up 2 dc words here == 2*16 = doubleword + movd xmm4, [rdx] + + movsxd rdx, dword ptr arg(3) ; dst_stride + lea rcx, [rdx + rdx*2] + ; Load up predict blocks + movq xmm0, [rdi] + movq xmm1, [rdi+rdx*1] + movq xmm2, [rdi+rdx*2] + movq xmm3, [rdi+rcx] + + ; Duplicate and expand dc across + punpcklwd xmm4, xmm4 + punpckldq xmm4, xmm4 + + ; Rounding to dequant and downshift + paddw xmm4, [GLOBAL(fours)] + psraw xmm4, 3 + + ; Predict buffer needs to be expanded from bytes to words + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 + + ; begin epilog + pop rdi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +;void vp8_idct_dequant_dc_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 +; ) +global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE +sym(vp8_idct_dequant_dc_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rdx, arg(1) ; dequant + + mov rdi, arg(2) ; dst + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + + ; DC component + mov rdx, arg(4) + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; insert DC component + pinsrw xmm0, [rdx], 0 + pinsrw xmm0, [rdx+2], 4 + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, 
xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, 
xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movsxd rdx, dword ptr arg(3) ; dst_stride + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] + lea rcx, [rdx + rdx*2] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rdi+rdx*2] + movq xmm5, [rdi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(3) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + + ; begin epilog + pop rdi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +fours: + times 8 dw 0x0004 +align 16 +x_s1sqr2: + times 8 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 8 dw 0x4E7B diff --git a/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm b/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm new file mode 100644 index 000000000000..158c3b745838 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm @@ -0,0 +1,140 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) +global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE +sym(vp8_short_inv_walsh4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + ; end prolog + + mov rdx, arg(0) + mov rax, 30003h + + movq mm0, [rdx + 0] ;ip[0] + movq mm1, [rdx + 8] ;ip[4] + movq mm7, rax + + movq mm2, [rdx + 16] ;ip[8] + movq mm3, [rdx + 24] ;ip[12] + punpcklwd mm7, mm7 ;0003000300030003h + mov rdx, arg(1) + + movq mm4, mm0 + movq mm5, mm1 + + paddw mm4, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm4 ;temp al + paddw mm4, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm1, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + paddw mm0, mm1 ;dl + cl + psubw mm5, mm1 ;dl - cl + + ; 03 02 01 00 + ; 13 12 11 10 + ; 23 22 21 20 + ; 33 32 31 30 + + movq mm3, mm4 ; 03 02 01 00 + punpcklwd mm4, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 + + movq mm1, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm1, mm5 ; 33 23 32 22 + + movq mm0, mm4 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 + + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] + + punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] +;~~~~~~~~~~~~~~~~~~~~~ + movq mm1, mm0 + movq mm5, mm4 + paddw mm1, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm1 ;temp al + paddw mm1, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + paddw mm1, mm7 + paddw mm6, mm7 + psraw mm1, 3 + psraw mm6, 3 + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm4, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + paddw mm0, mm4 ;dl + cl + psubw mm5, mm4 ;dl - cl + paddw mm0, mm7 + paddw mm5, mm7 
+ psraw mm0, 3 + psraw mm5, 3 +;~~~~~~~~~~~~~~~~~~~~~ + + movd eax, mm1 + movd ecx, mm0 + psrlq mm0, 32 + psrlq mm1, 32 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*1], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*5], cx + movd eax, mm1 + movd ecx, mm0 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*9], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*13], cx + + movd eax, mm6 + movd ecx, mm5 + psrlq mm5, 32 + psrlq mm6, 32 + mov word ptr[rdx+32*2], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*6], ax + mov word ptr[rdx+32*7], cx + movd eax, mm6 + movd ecx, mm5 + mov word ptr[rdx+32*10], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*14], ax + mov word ptr[rdx+32*15], cx + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + diff --git a/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm b/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm new file mode 100644 index 000000000000..06e86a80b697 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -0,0 +1,121 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) +global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE +sym(vp8_short_inv_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + ; end prolog + + mov rcx, arg(0) + mov rdx, arg(1) + mov rax, 30003h + + movdqa xmm0, [rcx + 0] ;ip[4] ip[0] + movdqa xmm1, [rcx + 16] ;ip[12] ip[8] + + + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] + + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm3 ;d1 a1 + punpckhqdq xmm4, xmm3 ;c1 b1 + + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + movd xmm0, eax + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] + + pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03 + + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm3 ;d1 a1 + punpckhqdq xmm5, xmm3 ;c1 b1 + + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + + paddw xmm5, xmm0 + paddw xmm4, xmm0 + psraw xmm5, 3 + psraw xmm4, 3 + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*0], ax + mov word ptr[rdx+32*2], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*4], ax + mov word ptr[rdx+32*6], cx + movd eax, xmm5 + movd ecx, xmm4 + psrldq 
xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*8], ax + mov word ptr[rdx+32*10], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*12], ax + mov word ptr[rdx+32*14], cx + + movd eax, xmm5 + movd ecx, xmm4 + psrldq xmm5, 4 + psrldq xmm4, 4 + mov word ptr[rdx+32*1], ax + mov word ptr[rdx+32*3], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*5], ax + mov word ptr[rdx+32*7], cx + movd eax, xmm5 + movd ecx, xmm4 + mov word ptr[rdx+32*9], ax + mov word ptr[rdx+32*11], cx + shr eax, 16 + shr ecx, 16 + mov word ptr[rdx+32*13], ax + mov word ptr[rdx+32*15], cx + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm new file mode 100644 index 000000000000..6d5aaa19db79 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm @@ -0,0 +1,815 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro LF_ABS 2 + ; %1 value not preserved + ; %2 value preserved + ; output in %1 + movdqa scratch1, %2 ; v2 + + psubusb scratch1, %1 ; v2 - v1 + psubusb %1, %2 ; v1 - v2 + por %1, scratch1 ; abs(v2 - v1) +%endmacro + +%macro LF_FILTER_HEV_MASK 8-9 + + LF_ABS %1, %2 ; abs(p3 - p2) + LF_ABS %2, %3 ; abs(p2 - p1) + pmaxub %1, %2 ; accumulate mask +%if %0 == 8 + movdqa scratch2, %3 ; save p1 + LF_ABS scratch2, %4 ; abs(p1 - p0) +%endif + LF_ABS %4, %5 ; abs(p0 - q0) + LF_ABS %5, %6 ; abs(q0 - q1) +%if %0 == 8 + pmaxub %5, scratch2 ; accumulate hev +%else + pmaxub %5, %9 +%endif + pmaxub %1, %5 ; accumulate mask + + LF_ABS %3, %6 ; abs(p1 - q1) + LF_ABS %6, %7 ; abs(q1 - q2) + pmaxub %1, %6 ; accumulate mask + LF_ABS %7, %8 ; abs(q2 - q3) + pmaxub %1, %7 ; accumulate mask + + paddusb %4, %4 ; 2 * abs(p0 - q0) + pand %3, [GLOBAL(tfe)] + psrlw %3, 1 ; abs(p1 - q1) / 2 + paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + psubusb %1, [limit] + psubusb %4, [blimit] + por %1, %4 + pcmpeqb %1, zero ; mask + + psubusb %5, [thresh] + pcmpeqb %5, zero ; ~hev +%endmacro + +%macro LF_FILTER 6 + ; %1-%4: p1-q1 + ; %5: mask + ; %6: hev + + movdqa scratch2, %6 ; save hev + + pxor %1, [GLOBAL(t80)] ; ps1 + pxor %4, [GLOBAL(t80)] ; qs1 + movdqa scratch1, %1 + psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1) + pandn scratch2, scratch1 ; vp8_filter &= hev + + pxor %2, [GLOBAL(t80)] ; ps0 + pxor %3, [GLOBAL(t80)] ; qs0 + movdqa scratch1, %3 + psubsb scratch1, %2 ; qs0 - ps0 + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) + pand %5, scratch2 ; &= mask + + movdqa scratch2, %5 + paddsb %5, [GLOBAL(t4)] ; Filter1 + paddsb scratch2, [GLOBAL(t3)] ; Filter2 + + ; Filter1 >> 3 + movdqa scratch1, zero + pcmpgtb scratch1, %5 + psrlw %5, 3 + pand scratch1, [GLOBAL(te0)] + pand %5, [GLOBAL(t1f)] + por %5, scratch1 + + psubsb %3, %5 ; qs0 - Filter1 + pxor %3, [GLOBAL(t80)] + + ; Filter2 >> 3 + movdqa scratch1, zero + pcmpgtb scratch1, scratch2 + psrlw scratch2, 3 + pand scratch1, [GLOBAL(te0)] + pand scratch2, [GLOBAL(t1f)] + por scratch2, scratch1 + 
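+    ; The "Filter1 >> 3" and "Filter2 >> 3" blocks above emulate a
+    ; per-byte arithmetic right shift, which SSE2 lacks: pcmpgtb flags the
+    ; negative bytes, psrlw shifts whole 16-bit words, t1f (0x1f) masks
+    ; off the bits that leaked in from the neighbouring byte, and te0
+    ; (0xe0) reinstates the sign bits.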
+ paddsb %2, scratch2 ; ps0 + Filter2 + pxor %2, [GLOBAL(t80)] + + ; outer tap adjustments + paddsb %5, [GLOBAL(t1)] + movdqa scratch1, zero + pcmpgtb scratch1, %5 + psrlw %5, 1 + pand scratch1, [GLOBAL(t80)] + pand %5, [GLOBAL(t7f)] + por %5, scratch1 + pand %5, %6 ; vp8_filter &= ~hev + + psubsb %4, %5 ; qs1 - vp8_filter + pxor %4, [GLOBAL(t80)] + + paddsb %1, %5 ; ps1 + vp8_filter + pxor %1, [GLOBAL(t80)] +%endmacro + +;void vp8_loop_filter_bh_y_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh +;) +global sym(vp8_loop_filter_bh_y_sse2) PRIVATE +sym(vp8_loop_filter_bh_y_sse2): + +%if LIBVPX_YASM_WIN64 + %define src rcx ; src_ptr + %define stride rdx ; src_pixel_step + %define blimit r8 + %define limit r9 + %define thresh r10 + + %define spp rax + %define stride3 r11 + %define stride5 r12 + %define stride7 r13 + + push rbp + mov rbp, rsp + SAVE_XMM 11 + push r12 + push r13 + mov thresh, arg(4) +%else + %define src rdi ; src_ptr + %define stride rsi ; src_pixel_step + %define blimit rdx + %define limit rcx + %define thresh r8 + + %define spp rax + %define stride3 r9 + %define stride5 r10 + %define stride7 r11 +%endif + + %define scratch1 xmm5 + %define scratch2 xmm6 + %define zero xmm7 + + %define i0 [src] + %define i1 [spp] + %define i2 [src + 2 * stride] + %define i3 [spp + 2 * stride] + %define i4 [src + 4 * stride] + %define i5 [spp + 4 * stride] + %define i6 [src + 2 * stride3] + %define i7 [spp + 2 * stride3] + %define i8 [src + 8 * stride] + %define i9 [spp + 8 * stride] + %define i10 [src + 2 * stride5] + %define i11 [spp + 2 * stride5] + %define i12 [src + 4 * stride3] + %define i13 [spp + 4 * stride3] + %define i14 [src + 2 * stride7] + %define i15 [spp + 2 * stride7] + + ; prep work + lea spp, [src + stride] + lea stride3, [stride + 2 * stride] + lea stride5, [stride3 + 2 * stride] + lea stride7, [stride3 + 4 * stride] + pxor zero, zero + + ; load the first set into registers + movdqa xmm0, i0 + movdqa xmm1, i1 + movdqa xmm2, i2 + movdqa xmm3, i3 + movdqa xmm4, i4 + movdqa xmm8, i5 + movdqa xmm9, i6 ; q2, will contain abs(p1-p0) + movdqa xmm10, i7 +LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10 + + movdqa xmm1, i2 + movdqa xmm2, i3 + movdqa xmm3, i4 + movdqa xmm8, i5 +LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4 + movdqa i2, xmm1 + movdqa i3, xmm2 + +; second set + movdqa i4, xmm3 + movdqa i5, xmm8 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm2, i8 + movdqa xmm4, i9 + movdqa xmm10, i10 ; q2, will contain abs(p1-p0) + movdqa xmm11, i11 +LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm4, i8 + movdqa xmm8, i9 +LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 + movdqa i6, xmm0 + movdqa i7, xmm1 + +; last set + movdqa i8, xmm4 + movdqa i9, xmm8 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm2, i12 + movdqa xmm3, i13 + movdqa xmm9, i14 ; q2, will contain abs(p1-p0) + movdqa xmm11, i15 +LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm3, i12 + movdqa xmm8, i13 +LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2 + movdqa i10, xmm0 + movdqa i11, xmm1 + movdqa i12, xmm3 + movdqa i13, xmm8 + +%if LIBVPX_YASM_WIN64 + pop r13 + pop r12 + RESTORE_XMM + pop rbp +%endif + + ret + + +;void vp8_loop_filter_bv_y_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh +;) + +global 
sym(vp8_loop_filter_bv_y_sse2) PRIVATE +sym(vp8_loop_filter_bv_y_sse2): + +%if LIBVPX_YASM_WIN64 + %define src rcx ; src_ptr + %define stride rdx ; src_pixel_step + %define blimit r8 + %define limit r9 + %define thresh r10 + + %define spp rax + %define stride3 r11 + %define stride5 r12 + %define stride7 r13 + + push rbp + mov rbp, rsp + SAVE_XMM 15 + push r12 + push r13 + mov thresh, arg(4) +%else + %define src rdi + %define stride rsi + %define blimit rdx + %define limit rcx + %define thresh r8 + + %define spp rax + %define stride3 r9 + %define stride5 r10 + %define stride7 r11 +%endif + + %define scratch1 xmm5 + %define scratch2 xmm6 + %define zero xmm7 + + %define s0 [src] + %define s1 [spp] + %define s2 [src + 2 * stride] + %define s3 [spp + 2 * stride] + %define s4 [src + 4 * stride] + %define s5 [spp + 4 * stride] + %define s6 [src + 2 * stride3] + %define s7 [spp + 2 * stride3] + %define s8 [src + 8 * stride] + %define s9 [spp + 8 * stride] + %define s10 [src + 2 * stride5] + %define s11 [spp + 2 * stride5] + %define s12 [src + 4 * stride3] + %define s13 [spp + 4 * stride3] + %define s14 [src + 2 * stride7] + %define s15 [spp + 2 * stride7] + + %define i0 [rsp] + %define i1 [rsp + 16] + %define i2 [rsp + 32] + %define i3 [rsp + 48] + %define i4 [rsp + 64] + %define i5 [rsp + 80] + %define i6 [rsp + 96] + %define i7 [rsp + 112] + %define i8 [rsp + 128] + %define i9 [rsp + 144] + %define i10 [rsp + 160] + %define i11 [rsp + 176] + %define i12 [rsp + 192] + %define i13 [rsp + 208] + %define i14 [rsp + 224] + %define i15 [rsp + 240] + + ALIGN_STACK 16, rax + + ; reserve stack space + %define temp_storage 0 ; size is 256 (16*16) + %define stack_size 256 + sub rsp, stack_size + + ; prep work + lea spp, [src + stride] + lea stride3, [stride + 2 * stride] + lea stride5, [stride3 + 2 * stride] + lea stride7, [stride3 + 4 * stride] + + ; 8-f + movdqa xmm0, s8 + movdqa xmm1, xmm0 + punpcklbw xmm0, s9 ; 80 90 + punpckhbw xmm1, s9 ; 88 98 + + movdqa xmm2, s10 + movdqa xmm3, xmm2 + punpcklbw xmm2, s11 ; a0 b0 + punpckhbw xmm3, s11 ; a8 b8 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 80 90 a0 b0 + punpckhwd xmm4, xmm2 ; 84 94 a4 b4 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 88 98 a8 b8 + punpckhwd xmm2, xmm3 ; 8c 9c ac bc + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, s12 + movdqa xmm5, xmm3 + punpcklbw xmm3, s13 ; c0 d0 + punpckhbw xmm5, s13 ; c8 d8 + + movdqa xmm6, s14 + movdqa xmm7, xmm6 + punpcklbw xmm6, s15 ; e0 f0 + punpckhbw xmm7, s15 ; e8 f8 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 + punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 + punpckhwd xmm6, xmm7 ; cc dc ec fc + + ; pull the third and fourth sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 + punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 + punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 + punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc + punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe + + ; save the calculations. we only have 15 registers ... 
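+    ; (i0..i15 are the [rsp + n] slots of the 256-byte scratch area
+    ; reserved above; these stores spill the transposed 8-f rows so the
+    ; registers can be reused to transpose rows 0-7.)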
+ movdqa i0, xmm0 + movdqa i1, xmm7 + movdqa i2, xmm4 + movdqa i3, xmm3 + movdqa i4, xmm1 + movdqa i5, xmm8 + movdqa i6, xmm2 + movdqa i7, xmm5 + + ; 0-7 + movdqa xmm0, s0 + movdqa xmm1, xmm0 + punpcklbw xmm0, s1 ; 00 10 + punpckhbw xmm1, s1 ; 08 18 + + movdqa xmm2, s2 + movdqa xmm3, xmm2 + punpcklbw xmm2, s3 ; 20 30 + punpckhbw xmm3, s3 ; 28 38 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 00 10 20 30 + punpckhwd xmm4, xmm2 ; 04 14 24 34 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 08 18 28 38 + punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, s4 + movdqa xmm5, xmm3 + punpcklbw xmm3, s5 ; 40 50 + punpckhbw xmm5, s5 ; 48 58 + + movdqa xmm6, s6 + movdqa xmm7, xmm6 + punpcklbw xmm6, s7 ; 60 70 + punpckhbw xmm7, s7 ; 68 78 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; 40 50 60 70 + punpckhwd xmm8, xmm6 ; 44 54 64 74 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; 48 58 68 78 + punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c + + ; pull the first two sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 + punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 + punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 + punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c + punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e + ; final combination + + movdqa xmm6, xmm0 + punpcklqdq xmm0, i0 + punpckhqdq xmm6, i0 + + movdqa xmm9, xmm7 + punpcklqdq xmm7, i1 + punpckhqdq xmm9, i1 + + movdqa xmm10, xmm4 + punpcklqdq xmm4, i2 + punpckhqdq xmm10, i2 + + movdqa xmm11, xmm3 + punpcklqdq xmm3, i3 + punpckhqdq xmm11, i3 + + movdqa xmm12, xmm1 + punpcklqdq xmm1, i4 + punpckhqdq xmm12, i4 + + movdqa xmm13, xmm8 + punpcklqdq xmm8, i5 + punpckhqdq xmm13, i5 + + movdqa xmm14, xmm2 + punpcklqdq xmm2, i6 + punpckhqdq xmm14, i6 + + movdqa xmm15, xmm5 + punpcklqdq xmm5, i7 + punpckhqdq xmm15, i7 + + movdqa i0, xmm0 + movdqa i1, xmm6 + movdqa i2, xmm7 + movdqa i3, xmm9 + movdqa i4, xmm4 + movdqa i5, xmm10 + movdqa i6, xmm3 + movdqa i7, xmm11 + movdqa i8, xmm1 + movdqa i9, xmm12 + movdqa i10, xmm8 + movdqa i11, xmm13 + movdqa i12, xmm2 + movdqa i13, xmm14 + movdqa i14, xmm5 + movdqa i15, xmm15 + +; TRANSPOSED DATA AVAILABLE ON THE STACK + + movdqa xmm12, xmm6 + movdqa xmm13, xmm7 + + pxor zero, zero + +LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11 + + movdqa xmm1, i2 + movdqa xmm2, i3 + movdqa xmm8, i4 + movdqa xmm9, i5 +LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4 + movdqa i2, xmm1 + movdqa i3, xmm2 + +; second set + movdqa i4, xmm8 + movdqa i5, xmm9 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm2, i8 + movdqa xmm4, i9 + movdqa xmm10, i10 ; q2, will contain abs(p1-p0) + movdqa xmm11, i11 +LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3 + + movdqa xmm0, i6 + movdqa xmm1, i7 + movdqa xmm3, i8 + movdqa xmm4, i9 +LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2 + movdqa i6, xmm0 + movdqa i7, xmm1 + +; last set + movdqa i8, xmm3 + movdqa i9, xmm4 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm2, i12 + movdqa xmm8, i13 + movdqa xmm9, i14 ; q2, will contain abs(p1-p0) + movdqa xmm11, i15 +LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10 + + movdqa xmm0, i10 + movdqa xmm1, i11 + movdqa xmm4, i12 + movdqa xmm8, i13 +LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 + movdqa i10, xmm0 + movdqa i11, xmm1 
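+    ; rows 0-1 and 14-15 (i0/i1/i14/i15) are deliberately not rewritten:
+    ; each LF_FILTER call modifies only the two pixels on either side of
+    ; its edge, so only rows 2-13 change before the final transpose.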
+ movdqa i12, xmm4 + movdqa i13, xmm8 + + +; RESHUFFLE AND WRITE OUT + ; 8-f + movdqa xmm0, i8 + movdqa xmm1, xmm0 + punpcklbw xmm0, i9 ; 80 90 + punpckhbw xmm1, i9 ; 88 98 + + movdqa xmm2, i10 + movdqa xmm3, xmm2 + punpcklbw xmm2, i11 ; a0 b0 + punpckhbw xmm3, i11 ; a8 b8 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 80 90 a0 b0 + punpckhwd xmm4, xmm2 ; 84 94 a4 b4 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 88 98 a8 b8 + punpckhwd xmm2, xmm3 ; 8c 9c ac bc + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, i12 + movdqa xmm5, xmm3 + punpcklbw xmm3, i13 ; c0 d0 + punpckhbw xmm5, i13 ; c8 d8 + + movdqa xmm6, i14 + movdqa xmm7, xmm6 + punpcklbw xmm6, i15 ; e0 f0 + punpckhbw xmm7, i15 ; e8 f8 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 + punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 + punpckhwd xmm6, xmm7 ; cc dc ec fc + + ; pull the third and fourth sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 + punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 + punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 + punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc + punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe + + ; save the calculations. we only have 15 registers ... + movdqa i8, xmm0 + movdqa i9, xmm7 + movdqa i10, xmm4 + movdqa i11, xmm3 + movdqa i12, xmm1 + movdqa i13, xmm8 + movdqa i14, xmm2 + movdqa i15, xmm5 + + ; 0-7 + movdqa xmm0, i0 + movdqa xmm1, xmm0 + punpcklbw xmm0, i1 ; 00 10 + punpckhbw xmm1, i1 ; 08 18 + + movdqa xmm2, i2 + movdqa xmm3, xmm2 + punpcklbw xmm2, i3 ; 20 30 + punpckhbw xmm3, i3 ; 28 38 + + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm2 ; 00 10 20 30 + punpckhwd xmm4, xmm2 ; 04 14 24 34 + + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm3 ; 08 18 28 38 + punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c + + ; using xmm[0124] + ; work on next 4 rows + + movdqa xmm3, i4 + movdqa xmm5, xmm3 + punpcklbw xmm3, i5 ; 40 50 + punpckhbw xmm5, i5 ; 48 58 + + movdqa xmm6, i6 + movdqa xmm7, xmm6 + punpcklbw xmm6, i7 ; 60 70 + punpckhbw xmm7, i7 ; 68 78 + + movdqa xmm8, xmm3 + punpcklwd xmm3, xmm6 ; 40 50 60 70 + punpckhwd xmm8, xmm6 ; 44 54 64 74 + + movdqa xmm6, xmm5 + punpcklwd xmm5, xmm7 ; 48 58 68 78 + punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c + + ; pull the first two sets together + + movdqa xmm7, xmm0 + punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 + punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 + + movdqa xmm3, xmm4 + punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 + punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 + + movdqa xmm8, xmm1 + punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 + punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a + + movdqa xmm5, xmm2 + punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c + punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e + ; final combination + + movdqa xmm6, xmm0 + punpcklqdq xmm0, i8 + punpckhqdq xmm6, i8 + + movdqa xmm9, xmm7 + punpcklqdq xmm7, i9 + punpckhqdq xmm9, i9 + + movdqa xmm10, xmm4 + punpcklqdq xmm4, i10 + punpckhqdq xmm10, i10 + + movdqa xmm11, xmm3 + punpcklqdq xmm3, i11 + punpckhqdq xmm11, i11 + + movdqa xmm12, xmm1 + punpcklqdq xmm1, i12 + punpckhqdq xmm12, i12 + + movdqa xmm13, xmm8 + punpcklqdq xmm8, i13 + punpckhqdq xmm13, i13 + + movdqa xmm14, xmm2 + punpcklqdq xmm2, i14 + punpckhqdq xmm14, i14 + + movdqa xmm15, xmm5 + punpcklqdq xmm5, i15 + punpckhqdq 
xmm15, i15 + + movdqa s0, xmm0 + movdqa s1, xmm6 + movdqa s2, xmm7 + movdqa s3, xmm9 + movdqa s4, xmm4 + movdqa s5, xmm10 + movdqa s6, xmm3 + movdqa s7, xmm11 + movdqa s8, xmm1 + movdqa s9, xmm12 + movdqa s10, xmm8 + movdqa s11, xmm13 + movdqa s12, xmm2 + movdqa s13, xmm14 + movdqa s14, xmm5 + movdqa s15, xmm15 + + ; free stack space + add rsp, stack_size + + ; un-ALIGN_STACK + pop rsp + +%if LIBVPX_YASM_WIN64 + pop r13 + pop r12 + RESTORE_XMM + pop rbp +%endif + + ret + +SECTION_RODATA +align 16 +te0: + times 16 db 0xe0 +align 16 +t7f: + times 16 db 0x7f +align 16 +tfe: + times 16 db 0xfe +align 16 +t1f: + times 16 db 0x1f +align 16 +t80: + times 16 db 0x80 +align 16 +t1: + times 16 db 0x01 +align 16 +t3: + times 16 db 0x03 +align 16 +t4: + times 16 db 0x04 diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm b/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm new file mode 100644 index 000000000000..1913abc69b07 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -0,0 +1,1640 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +%define _t0 0 +%define _t1 _t0 + 16 +%define _p3 _t1 + 16 +%define _p2 _p3 + 16 +%define _p1 _p2 + 16 +%define _p0 _p1 + 16 +%define _q0 _p0 + 16 +%define _q1 _q0 + 16 +%define _q2 _q1 + 16 +%define _q3 _q2 + 16 +%define lf_var_size 160 + +; Use of pmaxub instead of psubusb to compute filter mask was seen +; in ffvp8 + +%macro LFH_FILTER_AND_HEV_MASK 1 +%if %1 + movdqa xmm2, [rdi+2*rax] ; q3 + movdqa xmm1, [rsi+2*rax] ; q2 + movdqa xmm4, [rsi+rax] ; q1 + movdqa xmm5, [rsi] ; q0 + neg rax ; negate pitch to deal with above border +%else + movlps xmm2, [rsi + rcx*2] ; q3 + movlps xmm1, [rsi + rcx] ; q2 + movlps xmm4, [rsi] ; q1 + movlps xmm5, [rsi + rax] ; q0 + + movhps xmm2, [rdi + rcx*2] + movhps xmm1, [rdi + rcx] + movhps xmm4, [rdi] + movhps xmm5, [rdi + rax] + + lea rsi, [rsi + rax*4] + lea rdi, [rdi + rax*4] + + movdqa [rsp+_q2], xmm1 ; store q2 + movdqa [rsp+_q1], xmm4 ; store q1 +%endif + movdqa xmm7, [rdx] ;limit + + movdqa xmm6, xmm1 ; q2 + movdqa xmm3, xmm4 ; q1 + + psubusb xmm1, xmm2 ; q2-=q3 + psubusb xmm2, xmm6 ; q3-=q2 + + psubusb xmm4, xmm6 ; q1-=q2 + psubusb xmm6, xmm3 ; q2-=q1 + + por xmm4, xmm6 ; abs(q2-q1) + por xmm1, xmm2 ; abs(q3-q2) + + movdqa xmm0, xmm5 ; q0 + pmaxub xmm1, xmm4 + + psubusb xmm5, xmm3 ; q0-=q1 + psubusb xmm3, xmm0 ; q1-=q0 + + por xmm5, xmm3 ; abs(q0-q1) + movdqa [rsp+_t0], xmm5 ; save to t0 + + pmaxub xmm1, xmm5 + +%if %1 + movdqa xmm2, [rsi+4*rax] ; p3 + movdqa xmm4, [rdi+4*rax] ; p2 + movdqa xmm6, [rsi+2*rax] ; p1 +%else + movlps xmm2, [rsi + rax] ; p3 + movlps xmm4, [rsi] ; p2 + movlps xmm6, [rsi + rcx] ; p1 + + movhps xmm2, [rdi + rax] + movhps xmm4, [rdi] + movhps xmm6, [rdi + rcx] + + movdqa [rsp+_p2], xmm4 ; store p2 + movdqa [rsp+_p1], xmm6 ; store p1 +%endif + + movdqa xmm5, xmm4 ; p2 + movdqa xmm3, xmm6 ; p1 + + psubusb xmm4, xmm2 ; p2-=p3 + psubusb xmm2, xmm5 ; p3-=p2 + + psubusb xmm3, xmm5 ; p1-=p2 + pmaxub xmm1, xmm4 ; abs(p3 - p2) + + psubusb xmm5, xmm6 ; p2-=p1 + pmaxub xmm1, xmm2 ; abs(p3 - p2) + + pmaxub xmm1, xmm5 ; abs(p2 - p1) + movdqa xmm2, xmm6 ; p1 + + pmaxub xmm1, xmm3 ; 
abs(p2 - p1) +%if %1 + movdqa xmm4, [rsi+rax] ; p0 + movdqa xmm3, [rdi] ; q1 +%else + movlps xmm4, [rsi + rcx*2] ; p0 + movhps xmm4, [rdi + rcx*2] + movdqa xmm3, [rsp+_q1] ; q1 +%endif + + movdqa xmm5, xmm4 ; p0 + psubusb xmm4, xmm6 ; p0-=p1 + + psubusb xmm6, xmm5 ; p1-=p0 + + por xmm6, xmm4 ; abs(p1 - p0) + mov rdx, arg(2) ; get blimit + + movdqa [rsp+_t1], xmm6 ; save to t1 + + movdqa xmm4, xmm3 ; q1 + pmaxub xmm1, xmm6 + + psubusb xmm3, xmm2 ; q1-=p1 + psubusb xmm2, xmm4 ; p1-=q1 + + psubusb xmm1, xmm7 + por xmm2, xmm3 ; abs(p1-q1) + + movdqa xmm7, [rdx] ; blimit + mov rdx, arg(4) ; hev get thresh + + movdqa xmm3, xmm0 ; q0 + pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + + movdqa xmm6, xmm5 ; p0 + psrlw xmm2, 1 ; abs(p1-q1)/2 + + psubusb xmm5, xmm3 ; p0-=q0 + psubusb xmm3, xmm6 ; q0-=p0 + por xmm5, xmm3 ; abs(p0 - q0) + + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + + movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0) + movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0) + + paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + movdqa xmm2, [rdx] ; hev + + psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + psubusb xmm4, xmm2 ; hev + + psubusb xmm3, xmm2 ; hev + por xmm1, xmm5 + + pxor xmm7, xmm7 + paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb xmm4, xmm5 ; hev + pcmpeqb xmm3, xmm3 ; hev + + pcmpeqb xmm1, xmm7 ; mask xmm1 + pxor xmm4, xmm3 ; hev +%endmacro + +%macro B_FILTER 1 + movdqa xmm3, [GLOBAL(t80)] +%if %1 == 0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 +%elif %1 == 2 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 +%endif + + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + pxor xmm6, xmm3 ; offset to convert to signed values + + pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) + pxor xmm0, xmm3 ; offset to convert to signed values + + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand xmm1, xmm2 ; mask filter values we don't care about + + movdqa xmm2, xmm1 + paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + punpckhbw xmm5, xmm2 ; axbxcxdx + punpcklbw xmm2, xmm2 ; exfxgxhx + + punpcklbw xmm0, xmm1 ; exfxgxhx + psraw xmm5, 11 ; sign extended shift right by 3 + + punpckhbw xmm1, xmm1 ; axbxcxdx + psraw xmm2, 11 ; sign extended shift right by 3 + + packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + psraw xmm0, 11 ; sign extended shift right by 3 + + psraw xmm1, 11 ; sign extended shift right by 3 + movdqa xmm5, xmm0 ; save results + + packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + + paddsb xmm6, xmm2 ; p0+= p0 add + + movdqa xmm2, [GLOBAL(ones)] + paddsw xmm5, xmm2 + paddsw xmm1, xmm2 + psraw xmm5, 1 ; partial shifted one more time for 2nd tap + psraw xmm1, 1 ; partial shifted one more time for 2nd tap + packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + movdqa xmm2, [GLOBAL(t80)] + +%if %1 == 0 + movdqa xmm1, [rsp+_p1] ; p1 + lea rsi, [rsi + rcx*2] + lea rdi, [rdi + rcx*2] +%elif %1 == 1 + movdqa xmm1, [rsi+2*rax] ; p1 +%elif %1 == 2 + movdqa xmm1, [rsp+_p1] ; p1 +%endif + + pandn xmm4, xmm5 ; high edge variance 
additive + pxor xmm6, xmm2 ; unoffset + + pxor xmm1, xmm2 ; reoffset + psubsb xmm3, xmm0 ; q0-= q0 add + + paddsb xmm1, xmm4 ; p1+= p1 add + pxor xmm3, xmm2 ; unoffset + + pxor xmm1, xmm2 ; unoffset + psubsb xmm7, xmm4 ; q1-= q1 add + + pxor xmm7, xmm2 ; unoffset +%if %1 == 0 + movq [rsi], xmm6 ; p0 + movhps [rdi], xmm6 + movq [rsi + rax], xmm1 ; p1 + movhps [rdi + rax], xmm1 + movq [rsi + rcx], xmm3 ; q0 + movhps [rdi + rcx], xmm3 + movq [rsi + rcx*2], xmm7 ; q1 + movhps [rdi + rcx*2], xmm7 +%elif %1 == 1 + movdqa [rsi+rax], xmm6 ; write back + movdqa [rsi+2*rax], xmm1 ; write back + movdqa [rsi], xmm3 ; write back + movdqa [rdi], xmm7 ; write back +%endif + +%endmacro + +%if ABI_IS_32BIT + +;void vp8_loop_filter_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE +sym(vp8_loop_filter_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step + + mov rdx, arg(3) ;limit + + lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the result + B_FILTER 1 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%endif + +;void vp8_loop_filter_horizontal_edge_uv_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE +sym(vp8_loop_filter_horizontal_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u + mov rdi, arg(5) ; v + movsxd rax, dword ptr arg(1) ; src_pixel_step + mov rcx, rax + neg rax ; negate pitch to deal with above border + + mov rdx, arg(3) ;limit + + lea rsi, [rsi + rcx] + lea rdi, [rdi + rcx] + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the result + B_FILTER 0 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +%macro MB_FILTER_AND_WRITEBACK 1 + movdqa xmm3, [GLOBAL(t80)] +%if %1 == 0 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm7, [rsp+_q1] ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 + + mov rcx, rax + neg rcx +%elif %1 == 2 + movdqa xmm2, [rsp+_p1] ; p1 + movdqa xmm6, [rsp+_p0] ; p0 + movdqa xmm0, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 +%endif + + pxor xmm2, xmm3 ; p1 offset to convert to signed values + pxor xmm7, xmm3 ; q1 offset to convert to signed values + pxor xmm6, xmm3 ; offset to convert to signed values + pxor xmm0, xmm3 ; offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) + pand xmm1, xmm2 ; mask filter values we don't care about + + movdqa xmm2, xmm1 ; vp8_filter + + pand xmm2, xmm4 ; Filter2 = vp8_filter & hev + pxor xmm0, xmm0 + + pandn 
xmm4, xmm1 ; vp8_filter&=~hev
+ pxor xmm1, xmm1
+
+ punpcklbw xmm0, xmm4 ; Filter 2 (hi)
+ punpckhbw xmm1, xmm4 ; Filter 2 (lo)
+
+ movdqa xmm5, xmm2
+
+ movdqa xmm4, [GLOBAL(s9)]
+ paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
+ paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+
+ pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9
+ pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9
+
+ punpckhbw xmm7, xmm5 ; axbxcxdx
+ punpcklbw xmm5, xmm5 ; exfxgxhx
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ psraw xmm5, 11 ; sign extended shift right by 3
+ punpckhbw xmm4, xmm2 ; axbxcxdx
+
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+ psraw xmm4, 11 ; sign extended shift right by 3
+
+ packsswb xmm5, xmm7 ; Filter2 >>=3;
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm4 ; Filter1 >>=3;
+
+ paddsb xmm6, xmm5 ; ps0 = ps0 + Filter2
+
+ psubsb xmm3, xmm2 ; qs0 = qs0 - Filter1
+ movdqa xmm7, xmm1
+
+ movdqa xmm4, [GLOBAL(s63)]
+ movdqa xmm5, xmm0
+ movdqa xmm2, xmm5
+ paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63
+ paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63
+ movdqa xmm4, xmm7
+
+ paddw xmm5, xmm5 ; Filter 2 (hi) * 18
+
+ paddw xmm7, xmm7 ; Filter 2 (lo) * 18
+ paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
+
+ paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
+ paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
+ psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
+
+ paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
+ psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
+ psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
+
+ packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
+ psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
+ psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
+
+ packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ movdqa xmm7, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_q1] ; q1
+ movdqa xmm4, [rsp+_p1] ; p1
+ lea rsi, [rsi+rcx*2]
+ lea rdi, [rdi+rcx*2]
+
+%elif %1 == 1
+ movdqa xmm1, [rdi] ; q1
+ movdqa xmm4, [rsi+rax*2] ; p1
+%elif %1 == 2
+ movdqa xmm4, [rsp+_p1] ; p1
+ movdqa xmm1, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm1, xmm7
+ pxor xmm4, xmm7
+
+ psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
+ paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 + u3)
+ psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
+ paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 + u2)
+
+%if %1 == 1
+ movdqa xmm2, [rdi+rax*4] ; p2
+ movdqa xmm5, [rdi+rcx] ; q2
+%else
+ movdqa xmm2, [rsp+_p2] ; p2
+ movdqa xmm5, [rsp+_q2] ; q2
+%endif
+
+ pxor xmm1, xmm7 ; *oq1 = sq^0x80;
+ pxor xmm4, xmm7 ; *op1 = sp^0x80;
+ pxor xmm2, xmm7
+ pxor xmm5, xmm7
+ paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 + u1)
+ psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u1)
+ pxor xmm2, xmm7 ; *op2 = sp^0x80;
+ pxor xmm5, xmm7 ; *oq2 = sq^0x80;
+ pxor xmm3, xmm7 ; *oq0 = sq^0x80
+ pxor xmm6, xmm7 ; *op0 = sp^0x80
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ lea rdx, [rcx + rcx*2]
+ movq [rsi+rcx*2], xmm1 ; q1
+ movhps [rdi+rcx*2], xmm1
+
+ movq [rsi + rax], xmm4 ; p1
+ movhps [rdi + rax], xmm4
+
+ movq [rsi+rax*2], xmm2 ; p2
+ movhps [rdi+rax*2], xmm2
+
+ movq [rsi+rdx], xmm5 ; q2
+ movhps [rdi+rdx], xmm5
+%elif %1 == 1
+ movdqa [rdi+rcx], xmm5 ; q2
+ movdqa [rdi], xmm1 ; q1
+ movdqa [rsi], xmm3 ; q0
+ movdqa [rsi+rax ], xmm6 ; p0
+ movdqa [rsi+rax*2], xmm4 ; p1
+ movdqa [rdi+rax*4], xmm2 ;
p2 +%elif %1 == 2 + movdqa [rsp+_p1], xmm4 ; p1 + movdqa [rsp+_p0], xmm6 ; p0 + movdqa [rsp+_q0], xmm3 ; q0 + movdqa [rsp+_q1], xmm1 ; q1 +%endif + +%endmacro + + +;void vp8_mbloop_filter_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE +sym(vp8_mbloop_filter_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step + mov rdx, arg(3) ;limit + + lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 1 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_horizontal_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE +sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u + mov rdi, arg(5) ; v + movsxd rax, dword ptr arg(1) ; src_pixel_step + mov rcx, rax + neg rax ; negate pitch to deal with above border + mov rdx, arg(3) ;limit + + lea rsi, [rsi + rcx] + lea rdi, [rdi + rcx] + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the results + MB_FILTER_AND_WRITEBACK 0 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +%macro TRANSPOSE_16X8 2 + movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 + movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 + movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 + movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 + movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 + + punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + + movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 + + movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 + + movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 + + punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 +%if %1 + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] +%else + mov rsi, arg(5) ; v_ptr +%endif + + movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 + punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 + punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 + punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + +%if %1 == 0 + lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing + lea rsi, [rsi - 4] +%endif + + movdqa xmm2, 
xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + + movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + + punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + + movdqa [rsp+_t0], xmm2 ; save to free XMM2 + + movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 + movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 + movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 + + punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 + + punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 + + movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 + + punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 + + movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 + + punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 + + movdqa xmm6, xmm1 ; + punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 + + punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + + punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + movdqa xmm0, xmm5 + punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + + punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 + + punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 + movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + + punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 + +%if %2 == 0 + movdqa [rsp+_q3], xmm7 ; save 7 + movdqa [rsp+_q2], xmm6 ; save 6 +%endif + movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa [rsp+_p1], xmm2 ; save 2 + + movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + movdqa [rsp+_p0], xmm3 ; save 3 + + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + + movdqa [rsp+_q0], xmm4 ; save 4 + movdqa [rsp+_q1], xmm5 ; save 5 + movdqa xmm1, [rsp+_t0] + + movdqa xmm2, xmm1 ; + punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + +%if %2 == 0 + movdqa [rsp+_p2], xmm1 + movdqa [rsp+_p3], xmm2 +%endif + +%endmacro + +%macro LFV_FILTER_MASK_HEV_MASK 0 + movdqa xmm0, xmm6 ; q2 + psubusb xmm0, xmm7 ; q2-q3 + + psubusb xmm7, xmm6 ; q3-q2 + movdqa xmm4, xmm5 ; q1 + + por 
xmm7, xmm0 ; abs (q3-q2) + psubusb xmm4, xmm6 ; q1-q2 + + movdqa xmm0, xmm1 + psubusb xmm6, xmm5 ; q2-q1 + + por xmm6, xmm4 ; abs (q2-q1) + psubusb xmm0, xmm2 ; p2 - p3; + + psubusb xmm2, xmm1 ; p3 - p2; + por xmm0, xmm2 ; abs(p2-p3) + + movdqa xmm5, [rsp+_p1] ; p1 + pmaxub xmm0, xmm7 + + movdqa xmm2, xmm5 ; p1 + psubusb xmm5, xmm1 ; p1-p2 + psubusb xmm1, xmm2 ; p2-p1 + + movdqa xmm7, xmm3 ; p0 + psubusb xmm7, xmm2 ; p0-p1 + + por xmm1, xmm5 ; abs(p2-p1) + pmaxub xmm0, xmm6 + + pmaxub xmm0, xmm1 + movdqa xmm1, xmm2 ; p1 + + psubusb xmm2, xmm3 ; p1-p0 + + por xmm2, xmm7 ; abs(p1-p0) + + pmaxub xmm0, xmm2 + + movdqa xmm5, [rsp+_q0] ; q0 + movdqa xmm7, [rsp+_q1] ; q1 + + mov rdx, arg(3) ; limit + + movdqa xmm6, xmm5 ; q0 + movdqa xmm4, xmm7 ; q1 + + psubusb xmm5, xmm7 ; q0-q1 + psubusb xmm7, xmm6 ; q1-q0 + + por xmm7, xmm5 ; abs(q1-q0) + + pmaxub xmm0, xmm7 + + psubusb xmm0, [rdx] ; limit + + mov rdx, arg(2) ; blimit + movdqa xmm5, xmm4 ; q1 + + psubusb xmm5, xmm1 ; q1-=p1 + psubusb xmm1, xmm4 ; p1-=q1 + + por xmm5, xmm1 ; abs(p1-q1) + movdqa xmm1, xmm3 ; p0 + + pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psubusb xmm1, xmm6 ; p0-q0 + + movdqa xmm4, [rdx] ; blimit + mov rdx, arg(4) ; get thresh + + psrlw xmm5, 1 ; abs(p1-q1)/2 + psubusb xmm6, xmm3 ; q0-p0 + + por xmm1, xmm6 ; abs(q0-p0) + paddusb xmm1, xmm1 ; abs(q0-p0)*2 + movdqa xmm3, [rdx] + + paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh + + psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh + + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + por xmm1, xmm0 ; mask + pcmpeqb xmm2, xmm0 + + pxor xmm0, xmm0 + pcmpeqb xmm4, xmm4 + + pcmpeqb xmm1, xmm0 + pxor xmm4, xmm2 +%endmacro + +%macro BV_TRANSPOSE 0 + ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + + movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + + movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + + punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + + punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 + ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 +%endmacro + +%macro BV_WRITEBACK 2 + movd [rsi+2], %1 + movd [rsi+4*rax+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rdi+2], %1 + movd [rdi+4*rax+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rsi+2*rax+2], %1 + movd [rsi+2*rcx+2], %2 + psrldq %1, 4 + psrldq %2, 4 + movd [rdi+2*rax+2], %1 + movd [rdi+2*rcx+2], %2 +%endmacro + +%if ABI_IS_32BIT + +;void 
vp8_loop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE +sym(vp8_loop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. + TRANSPOSE_16X8 1, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + B_FILTER 2 + + ; transpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + ; store 16-line result + + lea rdx, [rax] + neg rdx + + BV_WRITEBACK xmm1, xmm5 + + lea rsi, [rsi+rdx*8] + lea rdi, [rdi+rdx*8] + BV_WRITEBACK xmm2, xmm6 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%endif + +;void vp8_loop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE +sym(vp8_loop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
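+ ; (The taps of a vertical edge run across memory rows, so the block is
+ ; first transposed: after TRANSPOSE_16X8 each register holds one of
+ ; the p3..q3 taps for all 16 rows, letting the same B_FILTER code used
+ ; for horizontal edges do the arithmetic; BV_TRANSPOSE then restores
+ ; pixel order before the results are written back.)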
+ TRANSPOSE_16X8 0, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + B_FILTER 2 + + ; transpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + ; store 16-line result + BV_WRITEBACK xmm1, xmm5 + + mov rsi, arg(0) ; u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + BV_WRITEBACK xmm2, xmm6 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +%macro MBV_TRANSPOSE 0 + movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + + punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 + + movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 + + punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 + movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + + punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 + punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 +%endmacro + +%macro MBV_WRITEBACK_1 0 + movq [rsi], xmm0 + movhps [rdi], xmm0 + + movq [rsi+2*rax], xmm6 + movhps [rdi+2*rax], xmm6 + + movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 + punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 + + movq [rsi+4*rax], xmm0 + movhps [rdi+4*rax], xmm0 + + movq [rsi+2*rcx], xmm3 + movhps [rdi+2*rcx], xmm3 + + movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 + + movdqa xmm0, xmm7 + punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 + punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 + + movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 + punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 +%endmacro + +%macro MBV_WRITEBACK_2 0 + movq [rsi], 
xmm1 + movhps [rdi], xmm1 + + movq [rsi+2*rax], xmm5 + movhps [rdi+2*rax], xmm5 + + movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 + punpckhdq xmm4, xmm7 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 + + movq [rsi+4*rax], xmm1 + movhps [rdi+4*rax], xmm1 + + movq [rsi+2*rcx], xmm4 + movhps [rdi+2*rcx], xmm4 +%endmacro + + +;void vp8_mbloop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +;) +global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE +sym(vp8_mbloop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ; Transpose + TRANSPOSE_16X8 1, 0 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + neg rax + ; start work on filters + MB_FILTER_AND_WRITEBACK 2 + + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + + ; transpose and write back + MBV_TRANSPOSE + + neg rax + + MBV_WRITEBACK_1 + + + lea rsi, [rsi+rax*8] + lea rdi, [rdi+rax*8] + MBV_WRITEBACK_2 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE +sym(vp8_mbloop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, lf_var_size + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + ; Transpose + TRANSPOSE_16X8 0, 0 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK + + ; start work on filters + MB_FILTER_AND_WRITEBACK 2 + + ; transpose and write back + MBV_TRANSPOSE + + mov rsi, arg(0) ;u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] + MBV_WRITEBACK_1 + mov rsi, arg(5) ;v_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] + MBV_WRITEBACK_2 + + add rsp, lf_var_size + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_simple_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +;) +global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE +sym(vp8_loop_filter_simple_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx + ; end prolog + + mov rcx, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
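+ ; Scalar sketch of the simple filter computed below (illustrative;
+ ; cf. the C reference filters in vp8/common/loopfilter_filters.c):
+ ;   apply only where 2*|p0-q0| + |p1-q1|/2 <= blimit
+ ;   f  = vp8_signed_char_clamp(p1 - q1 + 3 * (q0 - p0))
+ ;   q0 = vp8_signed_char_clamp(q0 - (vp8_signed_char_clamp(f + 4) >> 3))
+ ;   p0 = vp8_signed_char_clamp(p0 + (vp8_signed_char_clamp(f + 3) >> 3))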
+ movdqa xmm6, [GLOBAL(tfe)]
+ lea rdx, [rcx + rax]
+ neg rax
+
+ ; calculate mask
+ movdqa xmm0, [rdx] ; q1
+ mov rdx, arg(2) ;blimit
+ movdqa xmm1, [rcx+2*rax] ; p1
+
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm0
+
+ psubusb xmm0, xmm1 ; q1-=p1
+ psubusb xmm1, xmm3 ; p1-=q1
+ por xmm1, xmm0 ; abs(p1-q1)
+ pand xmm1, xmm6 ; set lsb of each byte to zero
+ psrlw xmm1, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ movdqa xmm5, [rcx+rax] ; p0
+ movdqa xmm4, [rcx] ; q0
+ movdqa xmm0, xmm4 ; q0
+ movdqa xmm6, xmm5 ; p0
+ psubusb xmm5, xmm4 ; p0-=q0
+ psubusb xmm4, xmm6 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+
+ movdqa xmm4, [GLOBAL(t80)]
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7
+
+
+ ; start work on filters
+ pxor xmm2, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm2, xmm3 ; p1 - q1
+
+ pxor xmm6, xmm4 ; offset to convert to signed values
+ pxor xmm0, xmm4 ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3 (Filter2, for p0)
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 (Filter1, for q0)
+
+ movdqa xmm1, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm6, xmm5 ; p0+= p0 add
+
+ pxor xmm3, xmm4 ; unoffset
+ movdqa [rcx], xmm3 ; write back
+
+ pxor xmm6, xmm4 ; unoffset
+ movdqa [rcx+rax], xmm6 ; write back
+
+ ; begin epilog
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_sse2):
+ push rbp ; save old base pointer value.
+ mov rbp, rsp ; set new base pointer value.
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx ; save callee-saved reg
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
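+ ; Same arithmetic as the horizontal case above, but here p1 p0 q0 q1
+ ; sit side by side in each row: 16 rows of 4 bytes around the edge are
+ ; loaded, transposed so that each xmm register holds one tap for all
+ ; 16 rows, filtered, then transposed back for the 16 4-byte stores.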
+ + lea rsi, [rsi - 2 ] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 + movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 + movd xmm2, [rdi] ; 13 12 11 10 + movd xmm3, [rcx] ; 53 52 51 50 + punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 + punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 + + movd xmm4, [rsi + rax*2] ; 23 22 21 20 + movd xmm5, [rdx + rax*2] ; 63 62 61 60 + movd xmm6, [rdi + rax*2] ; 33 32 31 30 + movd xmm7, [rcx + rax*2] ; 73 72 71 70 + punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 + punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 + + punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 + punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 + + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + lea rsi, [rsi + rax*8] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm4, [rsi] ; 83 82 81 80 + movd xmm1, [rdx] ; c3 c2 c1 c0 + movd xmm6, [rdi] ; 93 92 91 90 + movd xmm3, [rcx] ; d3 d2 d1 d0 + punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 + punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 + + movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0 + movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 + movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0 + movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 + punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 + punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 + + punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 + punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 + + movdqa xmm7, xmm4 + punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + + movdqa xmm6, xmm4 + punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + + punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + mov rdx, arg(2) ;blimit + + ; calculate mask + movdqa xmm6, xmm0 ; p1 + movdqa xmm7, xmm3 ; q1 + psubusb xmm7, xmm0 ; q1-=p1 + psubusb xmm6, xmm3 ; p1-=q1 + por xmm6, xmm7 ; abs(p1-q1) + pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw xmm6, 1 ; abs(p1-q1)/2 + + movdqa xmm7, [rdx] + + movdqa xmm5, xmm1 ; p0 + movdqa xmm4, xmm2 ; q0 + psubusb xmm5, xmm2 ; p0-=q0 + psubusb xmm4, xmm1 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + movdqa xmm4, [GLOBAL(t80)] + + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 ; mm5 = mask + + ; start work on filters + movdqa t0, xmm0 + movdqa t1, xmm3 + + pxor xmm0, xmm4 ; p1 offset to convert to signed values + pxor xmm3, xmm4 ; q1 offset to convert to signed values + psubsb xmm0, xmm3 ; p1 - q1 + + pxor xmm1, xmm4 ; offset to convert to signed values + pxor xmm2, xmm4 ; offset to 
convert to signed values
+
+ movdqa xmm3, xmm2 ; q0 (offset)
+ psubsb xmm2, xmm1 ; q0 - p0
+ paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm0 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3 (Filter2, for p0)
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 (Filter1, for q0)
+
+ movdqa xmm6, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm1, xmm5 ; p0+= p0 add
+
+ pxor xmm3, xmm4 ; unoffset q0
+ pxor xmm1, xmm4 ; unoffset p0
+
+ movdqa xmm0, t0 ; p1
+ movdqa xmm4, t1 ; q1
+
+ ; write out order: xmm0 xmm2 xmm1 xmm3
+ lea rdx, [rsi + rax*4]
+
+ ; transpose back to write out
+ ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa xmm6, xmm0
+ punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+ movdqa xmm3, xmm6
+ punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movd [rsi], xmm6 ; write the second 8-line result
+ movd [rdx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi], xmm6
+ movd [rcx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rsi + rax*2], xmm6
+ movd [rdx + rax*2], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi + rax*2], xmm6
+ movd [rcx + rax*2], xmm3
+
+ neg rax
+ lea rsi, [rsi + rax*8]
+ neg rax
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd [rsi], xmm0 ; write the first 8-line result
+ movd [rdx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi], xmm0
+ movd [rcx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rsi + rax*2], xmm0
+ movd [rdx + rax*2], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi + rax*2], xmm0
+ movd [rcx + rax*2], xmm2
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+tfe:
+ times 16 db 0xfe
+align 16
+t80:
+ times 16 db 0x80
+align 16
+t1s:
+ times 16 db 0x01
+align 16
+t3:
+ times 16 db 0x03
+align 16
+t4:
+ times 16 db 0x04
+align 16
+ones:
+ times 8 dw 0x0001
+align 16
+s9:
+ times 8 dw 0x0900
+align 16
+s63:
+ times 8 dw 0x003f
+align 16
+te0:
+ times 16 db 0xe0
+align 16
+t1f:
+ times 16 db 0x1f
diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c b/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c
new file mode 100644
index 000000000000..658600460032
--- /dev/null
+++ b/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2010 
The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8/common/loopfilter.h" + +#define prototype_loopfilter(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh, int count) + +#define prototype_loopfilter_nc(sym) \ + void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ + const unsigned char *limit, const unsigned char *thresh) + +#define prototype_simple_loopfilter(sym) \ + void sym(unsigned char *y, int ystride, const unsigned char *blimit) + +prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); +prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); +prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); +prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); +prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); +prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); + +#if HAVE_SSE2 && ARCH_X86_64 +prototype_loopfilter(vp8_loop_filter_bv_y_sse2); +prototype_loopfilter(vp8_loop_filter_bh_y_sse2); +#else +prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2); +#endif +prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2); +prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2); + +extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; +extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; +extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2; +extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; + +#if HAVE_MMX +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + 
if (u_ptr) + vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + + +void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit); +} + + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + + +void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); +} +#endif + + +/* Horizontal MB filtering */ +#if HAVE_SSE2 +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); +} + + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ + vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); +} + + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ +#if ARCH_X86_64 + vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +#else + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); +#endif + + if (u_ptr) + vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride); +} + + +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit); + 
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit); +} + + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) +{ +#if ARCH_X86_64 + vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +#else + vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); + vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); +#endif + + if (u_ptr) + vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4); +} + + +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) +{ + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); +} + +#endif diff --git a/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm b/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm new file mode 100644 index 000000000000..15e98713c7f3 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm @@ -0,0 +1,274 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void copy_mem8x8_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem8x8_mmx) PRIVATE +sym(vp8_copy_mem8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + add rsi, rax + + movq [rdi+rcx], mm1 + movq [rdi+rcx*2], mm2 + + + lea rdi, [rdi+rcx*2] + movq mm3, [rsi] + + add rdi, rcx + movq mm4, [rsi+rax] + + movq mm5, [rsi+rax*2] + movq [rdi], mm3 + + lea rsi, [rsi+rax*2] + movq [rdi+rcx], mm4 + + movq [rdi+rcx*2], mm5 + lea rdi, [rdi+rcx*2] + + movq mm0, [rsi+rax] + movq mm1, [rsi+rax*2] + + movq [rdi+rcx], mm0 + movq [rdi+rcx*2],mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem8x4_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem8x4_mmx) PRIVATE +sym(vp8_copy_mem8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + movq [rdi+rcx], mm1 + + movq [rdi+rcx*2], mm2 + lea rdi, [rdi+rcx*2] + + movq mm3, [rsi+rax] + movq [rdi+rcx], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem16x16_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem16x16_mmx) PRIVATE +sym(vp8_copy_mem16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movsxd rax, dword ptr arg(1) ;src_stride; + + mov rdi, arg(2) ;dst; + movsxd rcx, dword ptr arg(3) ;dst_stride + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 
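+ ; (the 16x16 copy is fully unrolled: each block above moves three
+ ; 16-byte rows through mm0-mm5, then rsi/rdi advance by three strides)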
+ + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq [rdi], mm0 + movq [rdi+8], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm b/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm new file mode 100644 index 000000000000..cb89537f761b --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm @@ -0,0 +1,116 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void copy_mem16x16_sse2( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp8_copy_mem16x16_sse2) PRIVATE +sym(vp8_copy_mem16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movdqu xmm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movdqu xmm1, [rsi+rax] + movdqu xmm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + lea rdi, [rdi+rcx*2] + movdqu xmm3, [rsi] + + add rdi, rcx + movdqu xmm4, [rsi+rax] + + movdqu xmm5, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm3 + add rsi, rax + + movdqa [rdi+rcx], xmm4 + movdqa [rdi+rcx*2],xmm5 + + lea rdi, [rdi+rcx*2] + movdqu xmm0, [rsi] + + add rdi, rcx + movdqu xmm1, [rsi+rax] + + movdqu xmm2, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + + movdqa [rdi+rcx*2], xmm2 + movdqu xmm3, [rsi] + + movdqu xmm4, [rsi+rax] + lea rdi, [rdi+rcx*2] + + add rdi, rcx + movdqu xmm5, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm3 + + add rsi, rax + movdqa [rdi+rcx], xmm4 + + movdqa [rdi+rcx*2],xmm5 + movdqu xmm0, [rsi] + + lea rdi, [rdi+rcx*2] + movdqu xmm1, [rsi+rax] + + add rdi, rcx + movdqu xmm2, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm0 + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + movdqu xmm3, [rsi+rax] + lea rdi, [rdi+rcx*2] + + movdqa [rdi+rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm new file mode 100644 index 000000000000..47dd452297f1 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -0,0 +1,702 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. 
All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp8_filter_weight 128
+%define VP8_FILTER_SHIFT 7
+
+
+;void vp8_filter_block1d_h6_mmx
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+global sym(vp8_filter_block1d_h6_mmx) PRIVATE
+sym(vp8_filter_block1d_h6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+
+ movq mm1, [rdx + 16] ; do both the negative taps first!!!
+ movq mm2, [rdx + 32] ;
+ movq mm6, [rdx + 48] ;
+ movq mm7, [rdx + 64] ;
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+.nextrow:
+ movq mm3, [rsi-2] ; mm3 = p-2..p5
+ movq mm4, mm3 ; mm4 = p-2..p5
+ psrlq mm3, 8 ; mm3 = p-1..p5
+ punpcklbw mm3, mm0 ; mm3 = p-1..p2
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ punpckhbw mm4, mm0 ; mm4 = p2..p5
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, mm5 ; mm4 = p-2..p5;
+ psrlq mm5, 16 ; mm5 = p0..p5;
+ punpcklbw mm5, mm0 ; mm5 = p0..p3
+ pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ psrlq mm4, 24 ; mm4 = p1..p5
+ punpcklbw mm4, mm0 ; mm4 = p1..p4
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ ; do outer positive taps
+ movd mm4, [rsi+3]
+ punpcklbw mm4, mm0 ; mm4 = p3..p6
+ pmullw mm4, [rdx+80] ; mm4 *= kernel 5 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ punpcklbw mm5, mm0 ; mm5 = p-2..p1
+ pmullw mm5, [rdx] ; mm5 *= kernel 0 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and unpack to saturate
+ punpcklbw mm3, mm0 ;
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
+ add rdi, rax;
+%else
+ movsxd r8, dword ptr arg(2) ;src_pixels_per_line
+ add rdi, rax;
+
+ add rsi, r8 ; next line
+%endif
+
+ dec rcx ; decrement count
+ jnz .nextrow ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1dc_v6_mmx
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int output_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
+sym(vp8_filter_block1dc_v6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movq mm5, [GLOBAL(rd)]
+ push rbx
+ mov rbx, arg(7) ;vp8_filter
+ movq mm1, [rbx + 16] ; do both the negative taps first!!!
+ movq mm2, [rbx + 32] ;
+ movq mm6, [rbx + 48] ;
+ movq mm7, [rbx + 64] ;
+
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ sub rsi, rdx
+ sub rsi, rdx
+ movsxd rcx, DWORD PTR arg(5) ;output_height
+ movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
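+ ; Scalar sketch of the second-pass 6-tap filter below (illustrative):
+ ; for each output pixel,
+ ;   out = clamp_u8((k0*r[-2] + k1*r[-1] + k2*r[0] + k3*r[+1]
+ ;                   + k4*r[+2] + k5*r[+3] + rd) >> VP8_FILTER_SHIFT)
+ ; where r[n] are the 16-bit rows from the first (horizontal) pass,
+ ; k0..k5 are the six taps at [rbx] .. [rbx+80], and rd is the rounding
+ ; constant (1 << (VP8_FILTER_SHIFT - 1)).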
+ pxor mm0, mm0 ; mm0 = 00000000 + + +.nextrow_cv: + movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 + pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 + pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi] ; mm4 = p0..p3 = row -2 + pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + add rsi, rdx ; move source forward 1 line to avoid 3 * pitch + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 + pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 + pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + paddsw mm3, mm5 ; mm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and saturate + + movd [rdi],mm3 ; store the results in the destination + ; the subsequent iterations repeat 3 out of 4 of these reads. Since the + ; recon block should be in cache this shouldn't cost much. Its obviously + ; avoidable!!!. + lea rdi, [rdi+rax] ; + dec rcx ; decrement count + jnz .nextrow_cv ; next row + + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x8_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_mmx) PRIVATE +sym(vp8_bilinear_predict8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + shl rax, 5 ; offset * 32 + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + + shl rax, 5 ; offset*32 + add rax, rcx ; VFilter + + lea rcx, [rdi+rdx*8] ; + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +.next_row_8x8: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, 
[rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 ;dst_pitch +%endif + cmp rdi, rcx ; + jne .next_row_8x8 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x4_mmx) PRIVATE +sym(vp8_bilinear_predict8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + shl rax, 5 + + mov rsi, arg(0) ;src_ptr ; + add rax, rcx + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +.next_row_8x4: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, [rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP8_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the 
destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 +%endif + cmp rdi, rcx ; + jne .next_row_8x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict4x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict4x4_mmx) PRIVATE +sym(vp8_bilinear_predict4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + shl rax, 5 + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;ldst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; get the first horizontal line done ; + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movq mm7, mm3 ; + packuswb mm7, mm0 ; + + add rsi, rdx ; next line +.next_row_4x4: + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + + movq mm5, mm7 ; + punpcklbw mm5, mm0 ; + + pmullw mm5, [rax] ; + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + movq mm7, mm3 ; + + packuswb mm7, mm0 ; + + pmullw mm3, [rax+16] ; + paddw mm3, mm5 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + packuswb mm3, mm0 + movd [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch ; + add rsi, rdx ; next line + add rdi, r8 +%endif + + cmp rdi, rcx ; + jne .next_row_4x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +SECTION_RODATA +align 16 +rd: + times 4 dw 0x40 + +align 16 +global HIDDEN_DATA(sym(vp8_six_tap_mmx)) +sym(vp8_six_tap_mmx): + times 8 dw 0 + times 8 dw 0 + times 8 dw 128 + times 8 dw 0 + times 8 dw 0 + times 8 dw 0 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 123 + times 8 dw 12 + times 8 dw -1 + times 8 dw 0 + + times 8 dw 2 + times 8 dw -11 + times 8 dw 108 + times 8 dw 36 + times 8 dw -8 + times 8 dw 1 + + times 8 dw 0 + times 8 dw -9 + times 8 dw 93 + times 8 dw 50 + times 8 dw -6 + times 8 dw 0 + + times 8 dw 3 + times 8 dw -16 + times 8 dw 77 + times 8 dw 77 + times 8 dw -16 + times 8 dw 3 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 50 + times 8 dw 93 + times 8 dw -9 + times 8 dw 0 + + times 8 dw 1 + times 8 dw -8 + times 8 dw 36 + times 8 dw 108 + times 8 dw -11 + times 8 dw 2 + + times 8 dw 0 + times 8 dw -1 + times 8 dw 12 + times 8 dw 123 + times 8 
dw -6 + times 8 dw 0 + + diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm new file mode 100644 index 000000000000..69f8d103c141 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -0,0 +1,1372 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +extern sym(vp8_bilinear_filters_x86_8) + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +;void vp8_filter_block1d8_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +global sym(vp8_filter_block1d8_h6_sse2) PRIVATE +sym(vp8_filter_block1d8_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw 
xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +global sym(vp8_filter_block1d16_h6_sse2) PRIVATE +sym(vp8_filter_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 
-1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi+16], xmm4 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_v6_sse2 +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. 
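+; As a rough reference (editor's addition, not upstream text): each output
+; byte is out[i] = clamp((k0*r0[i] + k1*r1[i] + k2*r2[i] + k3*r3[i] +
+; k4*r4[i] + k5*r5[i] + 64) >> 7, 0, 255), where r0..r5 are six consecutive
+; rows of 16-bit first-pass output and k0..k5 are the filter taps.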
+;*************************************************************************************/ +global sym(vp8_filter_block1d8_v6_sse2) PRIVATE +sym(vp8_filter_block1d8_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp8_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp8_filter_block1d8_v6_sse2_loop: + movdqa xmm1, XMMWORD PTR [rsi] + pmullw xmm1, [rax] + + movdqa xmm2, XMMWORD PTR [rsi + rdx] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] + pmullw xmm3, [rax + 32] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] + pmullw xmm5, [rax + 64] + + add rsi, rdx + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] + + pmullw xmm4, [rax + 48] + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] + + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d8_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_v6_sse2 +;( +; unsigned short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; const short *vp8_filter +;) +;/************************************************************************************ +; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. +;*************************************************************************************/ +global sym(vp8_filter_block1d16_v6_sse2) PRIVATE +sym(vp8_filter_block1d16_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp8_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp8_filter_block1d16_v6_sse2_loop: +; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. 
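+; (Editor's note, an inference from the pairing below: the negative taps 2
+; and 5 are accumulated before the large positive tap 3 so the saturating
+; paddsw partial sums stay inside the signed 16-bit range.)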
+ movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 + movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] + pmullw xmm1, [rax + 16] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm3, [rax + 64] + pmullw xmm4, [rax + 64] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm5, [rax + 32] + pmullw xmm6, [rax + 32] + + movdqa xmm7, XMMWORD PTR [rsi] ; line 1 + movdqa xmm0, XMMWORD PTR [rsi + 16] + pmullw xmm7, [rax] + pmullw xmm0, [rax] + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + paddsw xmm1, xmm7 + paddsw xmm2, xmm0 + + add rsi, rdx + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm3, [rax + 48] + pmullw xmm4, [rax + 48] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm5, [rax + 80] + pmullw xmm6, [rax + 80] + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] + pxor xmm0, xmm0 ; clear xmm0 + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + + paddsw xmm1, xmm7 + paddsw xmm2, xmm7 + + psraw xmm1, 7 + psraw xmm2, 7 + + packuswb xmm1, xmm2 ; pack and saturate + movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d16_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; First-pass filter only when yoffset==0 +global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE +sym(vp8_filter_block1d8_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_only_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 
08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + + movq QWORD PTR [rdi], xmm4 ; store the results in the destination + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_only_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d16_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; First-pass filter only when yoffset==0 +global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE +sym(vp8_filter_block1d16_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp8_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_only_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; lower 8 bytes + + movq QWORD Ptr [rdi], xmm4 ; store the results in the destination + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 
07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; higher 8 bytes + + movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_only_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_filter_block1d8_v6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp8_filter +;) +; Second-pass filter only when xoffset==0 +global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE +sym(vp8_filter_block1d8_v6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + mov rax, arg(5) ;vp8_filter + + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ; dst_ptich +%endif + +.vp8_filter_block1d8_v6_only_sse2_loop: + movq xmm1, MMWORD PTR [rsi] + movq xmm2, MMWORD PTR [rsi + rdx] + movq xmm3, MMWORD PTR [rsi + rdx * 2] + movq xmm5, MMWORD PTR [rsi + rdx * 4] + add rsi, rdx + movq xmm4, MMWORD PTR [rsi + rdx * 2] + movq xmm6, MMWORD PTR [rsi + rdx * 4] + + punpcklbw xmm1, xmm0 + pmullw xmm1, [rax] + + punpcklbw xmm2, xmm0 + pmullw xmm2, [rax + 16] + + punpcklbw xmm3, xmm0 + pmullw xmm3, [rax + 32] + + punpcklbw xmm5, xmm0 + pmullw xmm5, [rax + 64] + + punpcklbw xmm4, xmm0 + pmullw xmm4, [rax + 48] + + punpcklbw xmm6, xmm0 + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_unpack_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int output_height, +; unsigned int output_width +;) +global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE +sym(vp8_unpack_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 
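+ ; Editor's note: despite the _h6 name this routine does no filtering; each
+ ; row it zero-extends 16 source bytes into 16 unsigned words (punpcklbw
+ ; against xmm0) so a second-pass filter can consume 16-bit input directly.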
+ GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(3) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source + + pxor xmm0, xmm0 ; clear xmm0 for unpack +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source +%endif + +.unpack_block1d16_h6_sse2_rowloop: + movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 + movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + punpcklbw xmm1, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm1 + movdqa XMMWORD Ptr [rdi + 16], xmm3 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(4) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + jnz .unpack_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_bilinear_predict16x16_sse2 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +extern sym(vp8_bilinear_filters_x86_8) +global sym(vp8_bilinear_predict16x16_sse2) PRIVATE +sym(vp8_bilinear_predict16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] + + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + movsxd rax, dword ptr arg(2) ;xoffset + + cmp rax, 0 ;skip first_pass filter if xoffset=0 + je .b16x16_sp_only + + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + + cmp rax, 0 ;skip second_pass filter if yoffset=0 + je .b16x16_fp_only + + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;dst_pitch +%endif + ; get the first horizontal line done + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + add rsi, rdx ; next line +.next_row: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, [rax] + pmullw xmm6, [rax] + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw 
xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + pmullw xmm3, [rax+16] + pmullw xmm4, [rax+16] + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rdx ; next line +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ;dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne .next_row + + jmp .done + +.b16x16_sp_only: + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + + ; get the first horizontal line done + movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + add rsi, rax ; next line +.next_row_spo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + movdqa xmm4, xmm3 ; make a copy of current line + movdqa xmm7, xmm3 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm5, xmm1 + pmullw xmm6, xmm1 + pmullw xmm3, xmm2 + pmullw xmm4, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ;dst_pitch + cmp rdi, rcx + jne .next_row_spo + + jmp .done + +.b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + pxor xmm0, xmm0 + +.next_row_fpo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ; dst_pitch + cmp rdi, rcx + jne .next_row_fpo + +.done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_bilinear_predict8x8_sse2 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_sse2) PRIVATE +sym(vp8_bilinear_predict8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr 
arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. + movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ;xoffset + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm5, [rax] + movdqa xmm6, [rax+16] + + pxor xmm0, xmm0 + + ; get the first horizontal line done + movdqa xmm3, XMMWORD PTR [rsp] + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + add rsp, 16 ; next line +.next_row8x8: + movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + pmullw xmm7, xmm5 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm4, xmm3 + + pmullw xmm3, xmm6 + paddw xmm3, xmm7 + + movdqa xmm7, xmm4 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + packuswb xmm3, xmm0 + movq [rdi], xmm3 ; store the results in the destination + + add rsp, 16 ; next line + add rdi, rdx + + cmp rdi, rcx + jne .next_row8x8 + + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd: + times 8 dw 0x40 diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm new file mode 100644 index 000000000000..c06f24556e68 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm @@ -0,0 +1,1508 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP8_FILTER_WEIGHT 128 +%define VP8_FILTER_SHIFT 7 + + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. 
This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +; +; This is an implementation of some of the SSE optimizations first seen in ffvp8 +; +;*************************************************************************************/ +;void vp8_filter_block1d8_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE +sym(vp8_filter_block1d8_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 + + movdqa xmm7, [GLOBAL(rd)] + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + mov rdi, arg(2) ;output_ptr + + cmp esi, DWORD PTR [rax] + je vp8_filter_block1d8_h4_ssse3 + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx +;xmm3 free +.filter_block1d8_h6_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm4 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + pmaddubsw xmm1, xmm5 + + lea rdi, [rdi + rdx] + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx + + paddsw xmm0, xmm1 + paddsw xmm2, xmm7 + + paddsw xmm0, xmm2 + + psraw xmm0, 7 + + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + jnz .filter_block1d8_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +vp8_filter_block1d8_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] + movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] + + mov rsi, arg(0) ;src_ptr + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + + sub rdi, rdx + +.filter_block1d8_h4_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm2, xmm0 + pshufb xmm0, xmm3 + + pshufb xmm2, xmm4 + pmaddubsw xmm0, xmm5 + + lea rdi, [rdi + rdx] + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx + + paddsw xmm0, xmm7 + + paddsw xmm0, xmm2 + + psraw xmm0, 7 + + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + + jnz .filter_block1d8_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d16_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE +sym(vp8_filter_block1d16_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 
7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + mov rdi, arg(2) ;output_ptr + + mov rsi, arg(0) ;src_ptr + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d16_h6_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm4 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + movq xmm3, MMWORD PTR [rsi + 6] + + pmaddubsw xmm1, xmm5 + movq xmm7, MMWORD PTR [rsi + 11] + + pmaddubsw xmm2, xmm6 + punpcklbw xmm3, xmm7 + + paddsw xmm0, xmm1 + movdqa xmm1, xmm3 + + pmaddubsw xmm3, xmm4 + paddsw xmm0, xmm2 + + movdqa xmm2, xmm1 + paddsw xmm0, [GLOBAL(rd)] + + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + + psraw xmm0, 7 + pmaddubsw xmm1, xmm5 + + pmaddubsw xmm2, xmm6 + packuswb xmm0, xmm0 + + lea rsi, [rsi + rax] + paddsw xmm3, xmm1 + + paddsw xmm3, xmm2 + + paddsw xmm3, [GLOBAL(rd)] + + psraw xmm3, 7 + + packuswb xmm3, xmm3 + + punpcklqdq xmm0, xmm3 + + movdqa XMMWORD Ptr [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d16_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d4_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE +sym(vp8_filter_block1d4_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + movdqa xmm7, [GLOBAL(rd)] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d4_h4_ssse3 + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +;xmm3 free +.filter_block1d4_h6_rowloop_ssse3: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf1b)] + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2b)] + pmaddubsw xmm0, xmm4 + pshufb xmm2, [GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm0, xmm1 + paddsw xmm0, xmm7 + pxor xmm1, xmm1 + paddsw xmm0, xmm2 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + movd DWORD PTR [rdi], xmm0 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d4_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] + movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] + + mov rsi, arg(0) ;src_ptr + mov 
rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d4_h4_rowloop_ssse3: + movdqu xmm1, XMMWORD PTR [rsi - 2] + + movdqa xmm2, xmm1 + pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] + pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm1, xmm7 + paddsw xmm1, xmm2 + psraw xmm1, 7 + packuswb xmm1, xmm1 + + movd DWORD PTR [rdi], xmm1 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + + +;void vp8_filter_block1d16_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE +sym(vp8_filter_block1d16_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d16_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + +.vp8_filter_block1d16_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 ;store the results + + movq xmm1, MMWORD PTR [rsi + 8] ;A + movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi+8], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d16_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d16_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + 
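+; (Editor's note, inferred from the zero-tap dispatch above: in this path the
+; selected filter's first tap is zero, so only taps k1..k4 and rows B..E are
+; used; rax shadows rsi advanced by one pitch so row D at 3*pitch is reached
+; as [rax + rdx*2] without a separate 3*pitch computation.)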
+.vp8_filter_block1d16_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + paddsw xmm2, [GLOBAL(rd)] + paddsw xmm2, xmm3 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + punpcklbw xmm5, xmm4 ;B D + punpcklbw xmm1, xmm0 ;C E + + pmaddubsw xmm1, xmm6 + pmaddubsw xmm5, xmm7 + + movdqa xmm4, [GLOBAL(rd)] + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm5, xmm1 + paddsw xmm5, xmm4 + psraw xmm5, 7 + packuswb xmm5, xmm5 + + punpcklqdq xmm2, xmm5 + + movdqa XMMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d16_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d8_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE +sym(vp8_filter_block1d8_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d8_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d8_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + movdqa xmm4, [GLOBAL(rd)] + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d8_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d8_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm5, [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d8_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + add rsi, rdx + 
add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm5 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d8_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp8_filter_block1d4_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp8_filter_index +;) +global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE +sym(vp8_filter_block1d4_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp8_filter_block1d4_v4_ssse3 + + movq mm5, MMWORD PTR [rax] ;k0_k5 + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d4_v6_ssse3_loop: + movd mm1, DWORD PTR [rsi] ;A + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + movd mm0, DWORD PTR [rax + rdx * 4] ;F + + movq mm4, [GLOBAL(rd)] + + pmaddubsw mm3, mm6 + punpcklbw mm1, mm0 ;A F + pmaddubsw mm2, mm7 + pmaddubsw mm1, mm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm1 + paddsw mm2, mm4 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d4_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +.vp8_filter_block1d4_v4_ssse3: + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + movq mm5, MMWORD PTR [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp8_filter_block1d4_v4_ssse3_loop: + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + pmaddubsw mm3, mm6 + pmaddubsw mm2, mm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm5 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d4_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_bilinear_predict16x16_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE +sym(vp8_bilinear_predict16x16_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + movsxd rax, dword ptr arg(2) ; xoffset + + cmp rax, 0 ; skip 
first_pass filter if xoffset=0 + je .b16x16_sp_only + + shl rax, 4 + lea rax, [rax + rcx] ; HFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b16x16_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ; src_pixels_per_line + + movdqa xmm2, [rax] + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ; dst_pitch +%endif + movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 + + punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 + pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm6, xmm5 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm6, xmm1 + + punpcklbw xmm4, xmm5 + pmaddubsw xmm4, xmm1 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 + + packuswb xmm6, xmm4 + movdqa xmm5, xmm7 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm2 + + punpckhbw xmm7, xmm6 + pmaddubsw xmm7, xmm2 + + paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value + psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm5, xmm7 + movdqa xmm7, xmm6 + + movdqa [rdi], xmm5 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ; dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne .next_row + + jmp .done + +.b16x16_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + + ; get the first horizontal line done + movq xmm4, [rsi] ; load row 0 + movq xmm2, [rsi + 8] ; load row 0 + + lea rsi, [rsi + rax] ; next line +.next_row_sp: + movq xmm3, [rsi] ; load row + 1 + movq xmm5, [rsi + 8] ; load row + 1 + + punpcklbw xmm4, xmm3 + punpcklbw xmm2, xmm5 + + pmaddubsw xmm4, xmm1 + movq xmm7, [rsi + rax] ; load row + 2 + + pmaddubsw xmm2, xmm1 + movq xmm6, [rsi + rax + 8] ; load row + 2 + + punpcklbw xmm3, xmm7 + punpcklbw xmm5, xmm6 + + pmaddubsw xmm3, xmm1 + paddw xmm4, [GLOBAL(rd)] + + pmaddubsw xmm5, xmm1 + paddw xmm2, [GLOBAL(rd)] + + psraw xmm4, VP8_FILTER_SHIFT + psraw xmm2, VP8_FILTER_SHIFT + + packuswb xmm4, xmm2 + paddw xmm3, [GLOBAL(rd)] + + movdqa [rdi], xmm4 ; store row 0 + paddw xmm5, [GLOBAL(rd)] + + psraw xmm3, 
VP8_FILTER_SHIFT + psraw xmm5, VP8_FILTER_SHIFT + + packuswb xmm3, xmm5 + movdqa xmm4, xmm7 + + movdqa [rdi + rdx],xmm3 ; store row 1 + lea rsi, [rsi + 2*rax] + + movdqa xmm2, xmm6 + lea rdi, [rdi + 2*rdx] + + cmp rdi, rcx + jne .next_row_sp + + jmp .done + +.b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + +.next_row_fp: + movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm2, xmm4 + movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 + + pmaddubsw xmm2, xmm1 + movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rax] ; next line + punpcklbw xmm3, xmm4 + + pmaddubsw xmm3, xmm1 + movq xmm5, [rsi] + + paddw xmm2, [GLOBAL(rd)] + movq xmm7, [rsi+1] + + movq xmm6, [rsi+8] + psraw xmm2, VP8_FILTER_SHIFT + + punpcklbw xmm5, xmm7 + movq xmm7, [rsi+9] + + paddw xmm3, [GLOBAL(rd)] + pmaddubsw xmm5, xmm1 + + psraw xmm3, VP8_FILTER_SHIFT + punpcklbw xmm6, xmm7 + + packuswb xmm2, xmm3 + pmaddubsw xmm6, xmm1 + + movdqa [rdi], xmm2 ; store the results in the destination + paddw xmm5, [GLOBAL(rd)] + + lea rdi, [rdi + rdx] ; dst_pitch + psraw xmm5, VP8_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm6, VP8_FILTER_SHIFT + + packuswb xmm5, xmm6 + lea rsi, [rsi + rax] ; next line + + movdqa [rdi], xmm5 ; store the results in the destination + lea rdi, [rdi + rdx] ; dst_pitch + + cmp rdi, rcx + + jne .next_row_fp + +.done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_bilinear_predict8x8_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE +sym(vp8_bilinear_predict8x8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. 
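+    ;rows 0..8 of the source (9 rows) feed both filter passes; copying
+    ;them once with unaligned movdqu into the 144-byte aligned stack
+    ;buffer lets the filter code below read them back from aligned
+    ;memory instead of repeating the unaligned loads.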
+ movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ; xoffset + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .b8x8_sp_only + + shl rax, 4 + add rax, rcx ; HFilter + + mov rdi, arg(4) ; dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b8x8_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm1, [rax] + + ; get the first horizontal line done + movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx + + psrldq xmm5, 1 + lea rsp, [rsp + 16] ; next line + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + lea rsp, [rsp + 16] ; next line + + movdqa xmm5, xmm6 + + psrldq xmm5, 1 + + punpcklbw xmm6, xmm5 + pmaddubsw xmm6, xmm0 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 + + packuswb xmm6, xmm6 + + punpcklbw xmm7, xmm6 + pmaddubsw xmm7, xmm1 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm7, xmm7 + + movq [rdi], xmm7 ; store the results in the destination + lea rdi, [rdi + rdx] + + movdqa xmm7, xmm6 + + cmp rdi, rcx + jne .next_row + + jmp .done8x8 + +.b8x8_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] ; VFilter + + movq xmm1, XMMWORD PTR [rsp] + movq xmm2, XMMWORD PTR [rsp+16] + + movq xmm3, XMMWORD PTR [rsp+32] + punpcklbw xmm1, xmm2 + + movq xmm4, XMMWORD PTR [rsp+48] + punpcklbw xmm2, xmm3 + + movq xmm5, XMMWORD PTR [rsp+64] + punpcklbw xmm3, xmm4 + + movq xmm6, XMMWORD PTR [rsp+80] + punpcklbw xmm4, xmm5 + + movq xmm7, XMMWORD PTR [rsp+96] + punpcklbw xmm5, xmm6 + + pmaddubsw xmm1, xmm0 + pmaddubsw xmm2, xmm0 + + pmaddubsw xmm3, xmm0 + pmaddubsw xmm4, xmm0 + + pmaddubsw xmm5, xmm0 + punpcklbw xmm6, xmm7 + + pmaddubsw xmm6, xmm0 + paddw xmm1, [GLOBAL(rd)] + + paddw xmm2, [GLOBAL(rd)] + psraw xmm1, VP8_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm2, VP8_FILTER_SHIFT + + paddw xmm4, [GLOBAL(rd)] + psraw xmm3, VP8_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm4, VP8_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm5, VP8_FILTER_SHIFT + + psraw xmm6, VP8_FILTER_SHIFT + packuswb xmm1, xmm1 + + packuswb xmm2, xmm2 + movq [rdi], xmm1 + + packuswb xmm3, xmm3 + movq [rdi+rdx], xmm2 + + packuswb xmm4, xmm4 + movq xmm1, XMMWORD PTR [rsp+112] + + lea rdi, [rdi + 
2*rdx] + movq xmm2, XMMWORD PTR [rsp+128] + + packuswb xmm5, xmm5 + movq [rdi], xmm3 + + packuswb xmm6, xmm6 + movq [rdi+rdx], xmm4 + + lea rdi, [rdi + 2*rdx] + punpcklbw xmm7, xmm1 + + movq [rdi], xmm5 + pmaddubsw xmm7, xmm0 + + movq [rdi+rdx], xmm6 + punpcklbw xmm1, xmm2 + + pmaddubsw xmm1, xmm0 + paddw xmm7, [GLOBAL(rd)] + + psraw xmm7, VP8_FILTER_SHIFT + paddw xmm1, [GLOBAL(rd)] + + psraw xmm1, VP8_FILTER_SHIFT + packuswb xmm7, xmm7 + + packuswb xmm1, xmm1 + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm7 + + movq [rdi+rdx], xmm1 + lea rsp, [rsp + 144] + + jmp .done8x8 + +.b8x8_fp_only: + lea rcx, [rdi+rdx*8] + +.next_row_fp: + movdqa xmm1, XMMWORD PTR [rsp] + movdqa xmm3, XMMWORD PTR [rsp+16] + + movdqa xmm2, xmm1 + movdqa xmm5, XMMWORD PTR [rsp+32] + + psrldq xmm2, 1 + movdqa xmm7, XMMWORD PTR [rsp+48] + + movdqa xmm4, xmm3 + psrldq xmm4, 1 + + movdqa xmm6, xmm5 + psrldq xmm6, 1 + + punpcklbw xmm1, xmm2 + pmaddubsw xmm1, xmm0 + + punpcklbw xmm3, xmm4 + pmaddubsw xmm3, xmm0 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm0 + + movdqa xmm2, xmm7 + psrldq xmm2, 1 + + punpcklbw xmm7, xmm2 + pmaddubsw xmm7, xmm0 + + paddw xmm1, [GLOBAL(rd)] + psraw xmm1, VP8_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm3, VP8_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm5, VP8_FILTER_SHIFT + + paddw xmm7, [GLOBAL(rd)] + psraw xmm7, VP8_FILTER_SHIFT + + packuswb xmm1, xmm1 + packuswb xmm3, xmm3 + + packuswb xmm5, xmm5 + movq [rdi], xmm1 + + packuswb xmm7, xmm7 + movq [rdi+rdx], xmm3 + + lea rdi, [rdi + 2*rdx] + movq [rdi], xmm5 + + lea rsp, [rsp + 4*16] + movq [rdi+rdx], xmm7 + + lea rdi, [rdi + 2*rdx] + cmp rdi, rcx + + jne .next_row_fp + + lea rsp, [rsp + 16] + +.done8x8: + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +shuf1b: + db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 +shuf2b: + db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 +shuf3b: + db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 + +align 16 +shuf2bfrom1: + db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 +align 16 +shuf3bfrom1: + db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 + +align 16 +rd: + times 8 dw 0x40 + +align 16 +k0_k5: + times 8 db 0, 0 ;placeholder + times 8 db 0, 0 + times 8 db 2, 1 + times 8 db 0, 0 + times 8 db 3, 3 + times 8 db 0, 0 + times 8 db 1, 2 + times 8 db 0, 0 +k1_k3: + times 8 db 0, 0 ;placeholder + times 8 db -6, 12 + times 8 db -11, 36 + times 8 db -9, 50 + times 8 db -16, 77 + times 8 db -6, 93 + times 8 db -8, 108 + times 8 db -1, 123 +k2_k4: + times 8 db 128, 0 ;placeholder + times 8 db 123, -1 + times 8 db 108, -8 + times 8 db 93, -6 + times 8 db 77, -16 + times 8 db 50, -9 + times 8 db 36, -11 + times 8 db 12, -6 +align 16 +vp8_bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 112, 16 + times 8 db 96, 32 + times 8 db 80, 48 + times 8 db 64, 64 + times 8 db 48, 80 + times 8 db 32, 96 + times 8 db 16, 112 + diff --git a/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c b/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c new file mode 100644 index 000000000000..fb0b57eb1c1e --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "filter_x86.h" + +extern const short vp8_six_tap_mmx[8][6*8]; + +extern void vp8_filter_block1d_h6_mmx +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1dc_v6_mmx +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d8_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d16_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d8_v6_sse2 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int dst_ptich, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_filter_block1d16_v6_sse2 +( + unsigned short *src_ptr, + unsigned char *output_ptr, + int dst_ptich, + unsigned int pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp8_filter +); +extern void vp8_unpack_block1d16_h6_sse2 +( + unsigned char *src_ptr, + unsigned short *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width +); +extern void vp8_filter_block1d8_h6_only_sse2 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter +); +extern void vp8_filter_block1d16_h6_only_sse2 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter +); +extern void vp8_filter_block1d8_v6_only_sse2 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + int dst_ptich, + unsigned int output_height, + const short *vp8_filter +); + + +#if HAVE_MMX +void vp8_sixtap_predict4x4_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[16*16]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); + +} + + +void vp8_sixtap_predict16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */ + + const short *HFilter, 
*VFilter; + + + HFilter = vp8_six_tap_mmx[xoffset]; + + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); + +} + + +void vp8_sixtap_predict8x8_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); + +} + + +void vp8_sixtap_predict8x4_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + + const short *HFilter, *VFilter; + + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter); + + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter); + vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter); + +} + + + +void vp8_bilinear_predict16x16_mmx +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch); + vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch); + vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch); + vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch); +} +#endif + + +#if HAVE_SSE2 +void vp8_sixtap_predict16x16_sse2 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch + +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */ + + const short *HFilter, 
*VFilter; + + if (xoffset) + { + if (yoffset) + { + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); + } + else + { + /* First-pass only */ + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); + } + } + else + { + /* Second-pass only */ + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); + } +} + + +void vp8_sixtap_predict8x8_sse2 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + + if (xoffset) + { + if (yoffset) + { + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter); + } + else + { + /* First-pass only */ + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); + } + } + else + { + /* Second-pass only */ + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); + } +} + + +void vp8_sixtap_predict8x4_sse2 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ + const short *HFilter, *VFilter; + + if (xoffset) + { + if (yoffset) + { + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter); + } + else + { + /* First-pass only */ + HFilter = vp8_six_tap_mmx[xoffset]; + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); + } + } + else + { + /* Second-pass only */ + VFilter = vp8_six_tap_mmx[yoffset]; + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); + } +} + +#endif + +#if HAVE_SSSE3 + +extern void vp8_filter_block1d8_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d16_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d16_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int 
vp8_filter_index +); + +extern void vp8_filter_block1d8_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d4_h6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +extern void vp8_filter_block1d4_v6_ssse3 +( + unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp8_filter_index +); + +void vp8_sixtap_predict16x16_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch + +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[24*24]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, + 16, 21, xoffset); + vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, + 16, yoffset); + } + else + { + /* First-pass only */ + vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 16, xoffset); + } + } + else + { + if (yoffset) + { + /* Second-pass only */ + vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 16, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + +void vp8_sixtap_predict8x8_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[256]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, + 8, 13, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, + 8, yoffset); + } + else + { + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 8, xoffset); + } + } + else + { + if (yoffset) + { + /* Second-pass only */ + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 8, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. 
*/ + vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + + +void vp8_sixtap_predict8x4_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[256]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, FData2, + 8, 9, xoffset); + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, + 4, yoffset); + } + else + { + /* First-pass only */ + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 4, xoffset); + } + } + else + { + if (yoffset) + { + /* Second-pass only */ + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 4, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); + } + } +} + +void vp8_sixtap_predict4x4_ssse3 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) +{ + DECLARE_ALIGNED(16, unsigned char, FData2[4*9]); + + if (xoffset) + { + if (yoffset) + { + vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + FData2, 4, 9, xoffset); + vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, + 4, yoffset); + } + else + { + vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 4, xoffset); + } + } + else + { + if (yoffset) + { + vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 4, yoffset); + } + else + { + /* ssse3 second-pass only function couldn't handle (xoffset==0 && + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ + int r; + + for (r = 0; r < 4; r++) + { + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + dst_ptr[2] = src_ptr[2]; + dst_ptr[3] = src_ptr[3]; + dst_ptr += dst_pitch; + src_ptr += src_pixels_per_line; + } + } + } +} + +#endif diff --git a/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm b/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm new file mode 100644 index 000000000000..88a07b9f3fa7 --- /dev/null +++ b/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm @@ -0,0 +1,1753 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp8_loop_filter_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE +sym(vp8_loop_filter_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +.next8_h: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 ; + + + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + + psubusb mm4, mm7 + por mm1, mm4 + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + movq mm4, [rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, [rsi] ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 
* (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + pxor mm0, mm0 ; + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + psraw mm5, 11 + packsswb mm0, mm5 + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+rax], mm6 ; write back + + movq mm6, [rsi+2*rax] ; p1 + pxor mm6, [GLOBAL(t80)] ; reoffset + paddsb mm6, mm4 ; p1+= p1 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+2*rax], mm6 ; write back + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + movq [rsi], mm3 ; write back + + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + movq [rdi], mm7 ; write back + + add rsi,8 + neg rax + dec rcx + jnz .next8_h + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_loop_filter_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE +sym(vp8_loop_filter_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 64 ; reserve 64 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
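+    ; the filter math below matches the horizontal case, but a vertical
+    ; edge runs along a column: each 8-row strip is first transposed with
+    ; punpck* so p3..q3 become MMX rows, filtered, and the modified pixels
+    ; are transposed back on write-out. rsi starts 4 bytes left of the
+    ; edge so each 8-byte row load spans p3..q3.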
+ + lea rsi, [rsi + rax*4 - 4] + + movsxd rcx, dword ptr arg(5) ;count +.next8_v: + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + + ;transpose + movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 + movq mm7, mm6 ; 77 76 75 74 73 72 71 70 + + punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 + punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 + + movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 + movq mm5, mm4 ; 47 46 45 44 43 42 41 40 + + punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 + punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 + + movq mm3, mm5 ; 57 47 56 46 55 45 54 44 + punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 + + punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 + movq mm2, mm4 ; 53 43 52 42 51 41 50 40 + + punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 + punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 + + neg rax + movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 + + movq mm1, mm6 ; 27 26 25 24 23 22 21 20 + punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 + + punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 + movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 + + punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 + movq mm0, mm7 ; 17 07 16 06 15 05 14 04 + + punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 + punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 + + movq mm6, mm7 ; 37 27 17 07 36 26 16 06 + punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 + + punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 + + movq mm5, mm6 ; 76 66 56 46 36 26 16 06 + psubusb mm5, mm7 ; q2-q3 + + psubusb mm7, mm6 ; q3-q2 + por mm7, mm5; ; mm7=abs (q3-q2) + + movq mm5, mm0 ; 35 25 15 05 34 24 14 04 + punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 + + punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 + movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 + + psubusb mm3, mm6 ; q1-q2 + psubusb mm6, mm5 ; q2-q1 + + por mm6, mm3 ; mm6=abs(q2-q1) + lea rdx, srct + + movq [rdx+24], mm5 ; save q1 + movq [rdx+16], mm0 ; save q0 + + movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 + punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 + + movq mm0, mm3 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 31 21 11 01 30 20 10 00 + + punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 + punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 + + movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 + psubusb mm2, mm0 ; p2-p3 + + psubusb mm0, mm1 ; p3-p2 + por mm0, mm2 ; mm0=abs(p3-p2) + + movq mm2, mm3 ; 33 23 13 03 32 22 12 02 + punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 + + punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 + movq [rdx+8], mm3 ; save p0 + + movq [rdx], mm2 ; save p1 + movq mm5, mm2 ; mm5 = p1 + + psubusb mm2, mm1 ; p1-p2 + psubusb mm1, mm5 ; p2-p1 + + por mm1, mm2 ; mm1=abs(p2-p1) + mov rdx, arg(3) ;limit + + movq mm4, [rdx] ; mm4 = limit + psubusb mm7, mm4 + + psubusb mm0, mm4 + psubusb mm1, mm4 + + psubusb mm6, mm4 + por mm7, mm6 + + por mm0, mm1 + por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movq mm1, mm5 ; p1 + + movq mm7, mm3 ; mm3=mm7=p0 + psubusb mm7, mm5 ; p0 - p1 + + psubusb mm5, mm3 ; p1 - p0 + por mm5, mm7 ; abs(p1-p0) + + movq t0, mm5 ; save abs(p1-p0) + lea rdx, srct + + psubusb mm5, mm4 + por mm0, mm5 ; mm0=mask + + movq mm5, [rdx+16] ; mm5=q0 + movq mm7, [rdx+24] ; mm7=q1 + + movq mm6, mm5 ; mm6=q0 + movq mm2, mm7 ; q1 + psubusb mm5, mm7 ; q0-q1 + + psubusb mm7, mm6 ; q1-q0 + por mm7, mm5 ; abs(q1-q0) + + movq t1, mm7 ; save 
abs(q1-q0) + psubusb mm7, mm4 + + por mm0, mm7 ; mask + + movq mm5, mm2 ; q1 + psubusb mm5, mm1 ; q1-=p1 + psubusb mm1, mm2 ; p1-=q1 + por mm5, mm1 ; abs(p1-q1) + pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; + + movq mm4, [rdx] ;blimit + movq mm1, mm3 ; mm1=mm3=p0 + + movq mm7, mm6 ; mm7=mm6=q0 + psubusb mm1, mm7 ; p0-q0 + + psubusb mm7, mm3 ; q0-p0 + por mm1, mm7 ; abs(q0-p0) + paddusb mm1, mm1 ; abs(q0-p0)*2 + paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm0; ; mask + + pxor mm0, mm0 + pcmpeqb mm1, mm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] + ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + + por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb mm4, mm0 + + pcmpeqb mm0, mm0 + pxor mm4, mm0 + + + + ; start work on filters + lea rdx, srct + + movq mm2, [rdx] ; p1 + movq mm7, [rdx+24] ; q1 + + movq mm6, [rdx+8] ; p0 + movq mm0, [rdx+16] ; q0 + + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + pxor mm0, mm0 ; + + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + + psraw mm5, 11 + packsswb mm0, mm5 + + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + + ; mm6=p0 ; + movq mm1, [rdx] ; p1 + pxor mm1, [GLOBAL(t80)] ; reoffset + + paddsb mm1, mm4 ; p1+= p1 add + pxor mm1, [GLOBAL(t80)] ; unoffset + ; mm6 = p0 mm1 = p1 + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + + ; mm3 = q0 + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + ; mm7 = q1 + + ; transpose and write back + ; mm1 = 72 62 52 42 32 22 12 02 + ; mm6 = 73 63 53 43 33 23 13 03 + ; mm3 = 74 64 54 44 34 24 14 04 + ; mm7 = 75 65 55 45 35 25 15 05 + + movq mm2, mm1 ; 72 62 52 42 32 22 12 02 + punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 + + movq mm4, mm3 ; 74 64 54 44 34 24 14 04 + punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 + + punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 + punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 + 
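+    ; only p1, p0, q0 and q1 (bytes 2..5 of each 8-byte row) were
+    ; modified, so the movd stores below write the middle four bytes at
+    ; offset +2 instead of rewriting whole rows.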
+ movq mm6, mm2 ; 33 32 23 22 13 12 03 02 + punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 + + punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 + movq mm5, mm1 ; 73 72 63 62 53 52 43 42 + + punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 + punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 + + + ; mm2 = 15 14 13 12 05 04 03 02 + ; mm6 = 35 34 33 32 25 24 23 22 + ; mm5 = 55 54 53 52 45 44 43 42 + ; mm1 = 75 74 73 72 65 64 63 62 + + + + movd [rsi+rax*4+2], mm2 + psrlq mm2, 32 + + movd [rdi+rax*4+2], mm2 + movd [rsi+rax*2+2], mm6 + + psrlq mm6, 32 + movd [rsi+rax+2],mm6 + + movd [rsi+2], mm1 + psrlq mm1, 32 + + movd [rdi+2], mm1 + neg rax + + movd [rdi+rax+2],mm5 + psrlq mm5, 32 + + movd [rdi+rax*2+2], mm5 + + lea rsi, [rsi+rax*8] + dec rcx + jnz .next8_v + + add rsp, 64 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE +sym(vp8_mbloop_filter_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +.next8_mbh: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 + + + ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + psubusb mm4, mm7 + por mm1, mm4 + + + ; mm1 = mask, mm3=q1, mm7 = limit + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) + + movq mm4, [rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm5 = p0 + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, mm0 ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb 
mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm6 = p0, + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + + ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) + ; mm6 = p0, mm4=hev + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + + ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 + movq mm2, mm1 ; vp8_filter + pand mm2, mm4; ; Filter2 = vp8_filter & hev + + movq mm5, mm2 ; + paddsb mm5, [GLOBAL(t3)]; + + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm5 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm5 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + movq mm5, mm0 ; Filter2 + + paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) + pxor mm0, mm0 ; 0 + pxor mm7, mm7 ; 0 + + punpcklbw mm0, mm2 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + punpckhbw mm7, mm2 ; a0b0c0d0 + psraw mm7, 11 ; sign extended shift right by 3 + packsswb mm0, mm7 ; Filter2 >>=3; + + ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 + psubsb mm3, mm0 ; qs0 =qs0 - filter1 + paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 + + ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 + ; vp8_filter &= ~hev; + ; Filter2 = vp8_filter; + pandn mm4, mm1 ; vp8_filter&=~hev + + + ; mm3=qs0, mm4=filter2, mm6=ps0 + + ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); + ; s = vp8_signed_char_clamp(qs0 - u); + ; *oq0 = s^0x80; + ; s = vp8_signed_char_clamp(ps0 + u); + ; *op0 = s^0x80; + pxor mm0, mm0 + + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s27)] + pmulhw mm2, [GLOBAL(s27)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + psubsb mm3, mm1 + paddsb mm6, mm1 + + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + movq [rsi+rax], mm6 + movq [rsi], mm3 + + ; roughly 2/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); + ; s = vp8_signed_char_clamp(qs1 - u); + ; *oq1 = s^0x80; + ; s = vp8_signed_char_clamp(ps1 + u); + ; *op1 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s18)] + pmulhw mm2, [GLOBAL(s18)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + movq mm3, [rdi] + movq mm6, 
[rsi+rax*2] ; p1 + + pxor mm3, [GLOBAL(t80)] + pxor mm6, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + movq [rdi], mm3 + movq [rsi+rax*2], mm6 + + ; roughly 1/7th difference across boundary + ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); + ; s = vp8_signed_char_clamp(qs2 - u); + ; *oq2 = s^0x80; + ; s = vp8_signed_char_clamp(ps2 + u); + ; *op2 = s^0x80; + pxor mm1, mm1 + pxor mm2, mm2 + punpcklbw mm1, mm4 + punpckhbw mm2, mm4 + pmulhw mm1, [GLOBAL(s9)] + pmulhw mm2, [GLOBAL(s9)] + paddw mm1, [GLOBAL(s63)] + paddw mm2, [GLOBAL(s63)] + psraw mm1, 7 + psraw mm2, 7 + packsswb mm1, mm2 + + + movq mm6, [rdi+rax*4] + neg rax + movq mm3, [rdi+rax ] + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + + paddsb mm6, mm1 + psubsb mm3, mm1 + + pxor mm6, [GLOBAL(t80)] + pxor mm3, [GLOBAL(t80)] + movq [rdi+rax ], mm3 + neg rax + movq [rdi+rax*4], mm6 + +;EARLY_BREAK_OUT: + neg rax + add rsi,8 + dec rcx + jnz .next8_mbh + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_mbloop_filter_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE +sym(vp8_mbloop_filter_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 96 ; reserve 96 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
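+    ; same transpose approach as vp8_loop_filter_vertical_edge_mmx above,
+    ; but the macroblock filter also adjusts p2 and q2 (see the horizontal
+    ; version), so the transpose parks all eight rows p3..q3 in the
+    ; 64-byte srct scratch area for the filter stage.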
+
+        lea         rsi, [rsi + rax*4 - 4]
+
+        movsxd      rcx, dword ptr arg(5) ;count
+.next8_mbv:
+        lea         rdi, [rsi + rax]  ; rdi points to row +1 for indirect addressing
+
+        ;transpose
+        movq        mm0, [rdi+2*rax]  ; 77 76 75 74 73 72 71 70
+        movq        mm6, [rsi+2*rax]  ; 67 66 65 64 63 62 61 60
+
+        movq        mm7, mm6          ; 67 66 65 64 63 62 61 60
+        punpckhbw   mm7, mm0          ; 77 67 76 66 75 65 74 64
+
+        punpcklbw   mm6, mm0          ; 73 63 72 62 71 61 70 60
+        movq        mm0, [rsi+rax]    ; 57 56 55 54 53 52 51 50
+
+        movq        mm4, [rsi]        ; 47 46 45 44 43 42 41 40
+        movq        mm5, mm4          ; 47 46 45 44 43 42 41 40
+
+        punpckhbw   mm5, mm0          ; 57 47 56 46 55 45 54 44
+        punpcklbw   mm4, mm0          ; 53 43 52 42 51 41 50 40
+
+        movq        mm3, mm5          ; 57 47 56 46 55 45 54 44
+        punpckhwd   mm5, mm7          ; 77 67 57 47 76 66 56 46
+
+        punpcklwd   mm3, mm7          ; 75 65 55 45 74 64 54 44
+        movq        mm2, mm4          ; 53 43 52 42 51 41 50 40
+
+        punpckhwd   mm4, mm6          ; 73 63 53 43 72 62 52 42
+        punpcklwd   mm2, mm6          ; 71 61 51 41 70 60 50 40
+
+        neg         rax
+
+        movq        mm7, [rsi+rax]    ; 37 36 35 34 33 32 31 30
+        movq        mm6, [rsi+rax*2]  ; 27 26 25 24 23 22 21 20
+
+        movq        mm1, mm6          ; 27 26 25 24 23 22 21 20
+        punpckhbw   mm6, mm7          ; 37 27 36 26 35 25 34 24
+
+        punpcklbw   mm1, mm7          ; 33 23 32 22 31 21 30 20
+
+        movq        mm7, [rsi+rax*4]  ; 07 06 05 04 03 02 01 00
+        punpckhbw   mm7, [rdi+rax*4]  ; 17 07 16 06 15 05 14 04
+
+        movq        mm0, mm7          ; 17 07 16 06 15 05 14 04
+        punpckhwd   mm7, mm6          ; 37 27 17 07 36 26 16 06
+
+        punpcklwd   mm0, mm6          ; 35 25 15 05 34 24 14 04
+        movq        mm6, mm7          ; 37 27 17 07 36 26 16 06
+
+        punpckhdq   mm7, mm5          ; 77 67 57 47 37 27 17 07 = q3
+        punpckldq   mm6, mm5          ; 76 66 56 46 36 26 16 06 = q2
+
+        lea         rdx, srct
+        movq        mm5, mm6          ; 76 66 56 46 36 26 16 06
+
+        movq        [rdx+56], mm7
+        psubusb     mm5, mm7          ; q2-q3
+
+
+        movq        [rdx+48], mm6
+        psubusb     mm7, mm6          ; q3-q2
+
+        por         mm7, mm5          ; mm7=abs (q3-q2)
+        movq        mm5, mm0          ; 35 25 15 05 34 24 14 04
+
+        punpckhdq   mm5, mm3          ; 75 65 55 45 35 25 15 05 = q1
+        punpckldq   mm0, mm3          ; 74 64 54 44 34 24 14 04 = q0
+
+        movq        mm3, mm5          ; 75 65 55 45 35 25 15 05 = q1
+        psubusb     mm3, mm6          ; q1-q2
+
+        psubusb     mm6, mm5          ; q2-q1
+        por         mm6, mm3          ; mm6=abs(q2-q1)
+
+        movq        [rdx+40], mm5     ; save q1
+        movq        [rdx+32], mm0     ; save q0
+
+        movq        mm3, [rsi+rax*4]  ; 07 06 05 04 03 02 01 00
+        punpcklbw   mm3, [rdi+rax*4]  ; 13 03 12 02 11 01 10 00
+
+        movq        mm0, mm3          ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0, mm1          ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm3, mm1          ; 33 23 13 03 32 22 12 02
+        movq        mm1, mm0          ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0, mm2          ; 70 60 50 40 30 20 10 00 = p3
+        punpckhdq   mm1, mm2          ; 71 61 51 41 31 21 11 01 = p2
+
+        movq        [rdx], mm0        ; save p3
+        movq        [rdx+8], mm1      ; save p2
+
+        movq        mm2, mm1          ; 71 61 51 41 31 21 11 01 = p2
+        psubusb     mm2, mm0          ; p2-p3
+
+        psubusb     mm0, mm1          ; p3-p2
+        por         mm0, mm2          ; mm0=abs(p3-p2)
+
+        movq        mm2, mm3          ; 33 23 13 03 32 22 12 02
+        punpckldq   mm2, mm4          ; 72 62 52 42 32 22 12 02 = p1
+
+        punpckhdq   mm3, mm4          ; 73 63 53 43 33 23 13 03 = p0
+        movq        [rdx+24], mm3     ; save p0
+
+        movq        [rdx+16], mm2     ; save p1
+        movq        mm5, mm2          ; mm5 = p1
+
+        psubusb     mm2, mm1          ; p1-p2
+        psubusb     mm1, mm5          ; p2-p1
+
+        por         mm1, mm2          ; mm1=abs(p2-p1)
+        mov         rdx, arg(3)       ;limit
+
+        movq        mm4, [rdx]        ; mm4 = limit
+        psubusb     mm7, mm4          ; abs(q3-q2) > limit
+
+        psubusb     mm0, mm4          ; abs(p3-p2) > limit
+        psubusb     mm1, mm4          ; abs(p2-p1) > limit
+
+        psubusb     mm6, mm4          ; abs(q2-q1) > limit
+        por         mm7, mm6          ; or
+
+        por         mm0, mm1
+        por         mm0, mm7          ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
+
+        movq        mm1, mm5          ; p1
+
+        movq        mm7, mm3          ; mm3=mm7=p0
+        psubusb     mm7, mm5          ; p0 - p1
+
+        psubusb     mm5, mm3          ; p1 - p0
+        por         mm5, mm7          ; abs(p1-p0)
+
+        movq        t0, mm5           ; save abs(p1-p0)
+        lea         rdx, srct
+
+        psubusb     mm5, mm4          ; mm5 = abs(p1-p0) > limit
+        por         mm0, mm5          ; mm0=mask
+
+        movq        mm5, [rdx+32]     ; mm5=q0
+        movq        mm7, [rdx+40]     ; mm7=q1
+
+        movq        mm6, mm5          ; mm6=q0
+        movq        mm2, mm7          ; q1
+        psubusb     mm5, mm7          ; q0-q1
+
+        psubusb     mm7, mm6          ; q1-q0
+        por         mm7, mm5          ; abs(q1-q0)
+
+        movq        t1, mm7           ; save abs(q1-q0)
+        psubusb     mm7, mm4          ; mm7=abs(q1-q0) > limit
+
+        por         mm0, mm7          ; mask
+
+        movq        mm5, mm2          ; q1
+        psubusb     mm5, mm1          ; q1-=p1
+        psubusb     mm1, mm2          ; p1-=q1
+        por         mm5, mm1          ; abs(p1-q1)
+        pand        mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+        psrlw       mm5, 1            ; abs(p1-q1)/2
+
+        mov         rdx, arg(2)       ;blimit
+
+        movq        mm4, [rdx]        ;blimit
+        movq        mm1, mm3          ; mm1=mm3=p0
+
+        movq        mm7, mm6          ; mm7=mm6=q0
+        psubusb     mm1, mm7          ; p0-q0
+
+        psubusb     mm7, mm3          ; q0-p0
+        por         mm1, mm7          ; abs(q0-p0)
+        paddusb     mm1, mm1          ; abs(q0-p0)*2
+        paddusb     mm1, mm5          ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm1, mm4          ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+        por         mm1, mm0          ; mask
+
+        pxor        mm0, mm0
+        pcmpeqb     mm1, mm0
+
+        ; calculate high edge variance
+        mov         rdx, arg(4)       ;thresh ; get thresh
+        movq        mm7, [rdx]
+
+        movq        mm4, t0           ; get abs (q1 - q0)
+        psubusb     mm4, mm7          ; abs(q1 - q0) > thresh
+
+        movq        mm3, t1           ; get abs (p1 - p0)
+        psubusb     mm3, mm7          ; abs(p1 - p0) > thresh
+
+        por         mm4, mm3          ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+        pcmpeqb     mm4, mm0
+
+        pcmpeqb     mm0, mm0
+        pxor        mm4, mm0
+
+
+
+
+        ; start work on filters
+        lea         rdx, srct
+
+        movq        mm2, [rdx+16]     ; p1
+        movq        mm7, [rdx+40]     ; q1
+        pxor        mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+        psubsb      mm2, mm7          ; p1 - q1
+
+        movq        mm6, [rdx+24]     ; p0
+        movq        mm0, [rdx+32]     ; q0
+        pxor        mm6, [GLOBAL(t80)] ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)] ; offset to convert to signed values
+
+        movq        mm3, mm0          ; q0
+        psubsb      mm0, mm6          ; q0 - p0
+        paddsb      mm2, mm0          ; 1 * (q0 - p0) + (p1 - q1)
+        paddsb      mm2, mm0          ; 2 * (q0 - p0)
+        paddsb      mm2, mm0          ; 3 * (q0 - p0) + (p1 - q1)
+        pand        mm1, mm2          ; mask filter values we don't care about
+
+        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+        movq        mm2, mm1          ; vp8_filter
+        pand        mm2, mm4          ; Filter2 = vp8_filter & hev
+
+        movq        mm5, mm2
+        paddsb      mm5, [GLOBAL(t3)]
+
+        pxor        mm0, mm0          ; 0
+        pxor        mm7, mm7          ; 0
+
+        punpcklbw   mm0, mm5          ; e0f0g0h0
+        psraw       mm0, 11           ; sign extended shift right by 3
+        punpckhbw   mm7, mm5          ; a0b0c0d0
+        psraw       mm7, 11           ; sign extended shift right by 3
+        packsswb    mm0, mm7          ; Filter2 >>=3;
+
+        movq        mm5, mm0          ; Filter2
+
+        paddsb      mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+        pxor        mm0, mm0          ; 0
+        pxor        mm7, mm7          ; 0
+
+        punpcklbw   mm0, mm2          ; e0f0g0h0
+        psraw       mm0, 11           ; sign extended shift right by 3
+        punpckhbw   mm7, mm2          ; a0b0c0d0
+        psraw       mm7, 11           ; sign extended shift right by 3
+        packsswb    mm0, mm7          ; Filter1 = (Filter2 + 4) >> 3;
+
+        ; mm0=Filter1, mm1=vp8_filter, mm3=qs0, mm5=Filter2, mm4=hev, mm6=ps0
+        psubsb      mm3, mm0          ; qs0 = qs0 - Filter1
+        paddsb      mm6, mm5          ; ps0 = ps0 + Filter2
+
+        ; mm1=vp8_filter, mm3=qs0, mm4=hev, mm6=ps0
+        ; vp8_filter &= ~hev;
+        ; Filter2 = vp8_filter;
+        pandn       mm4, mm1          ; vp8_filter &= ~hev
+
+
+        ; mm3=qs0, mm4=filter2, mm6=ps0
+
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+        ; s = vp8_signed_char_clamp(qs0 - u);
+        ; *oq0 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps0 + u);
+        ; *op0 = s^0x80;
+        pxor        mm0, mm0
+
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s27)]
+        pmulhw      mm2, [GLOBAL(s27)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
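+        ; mm1 now holds u = (63 + Filter2 * 27) >> 7 for all eight pixels:
+        ; the bytes sit in the high halves of the words, so pmulhw by
+        ; s27 = (27 << 8) yields Filter2 * 27, and packsswb supplies the
+        ; signed-char clamp. The 2/7th and 1/7th taps below repeat this
+        ; pattern with s18 and s9.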
+
+        psubsb      mm3, mm1
+        paddsb      mm6, mm1
+
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+        movq        [rdx+24], mm6
+        movq        [rdx+32], mm3
+
+        ; roughly 2/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+        ; s = vp8_signed_char_clamp(qs1 - u);
+        ; *oq1 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps1 + u);
+        ; *op1 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s18)]
+        pmulhw      mm2, [GLOBAL(s18)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        movq        mm3, [rdx + 40]
+        movq        mm6, [rdx + 16]   ; p1
+        pxor        mm3, [GLOBAL(t80)]
+        pxor        mm6, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1
+        psubsb      mm3, mm1
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+        movq        [rdx + 40], mm3
+        movq        [rdx + 16], mm6
+
+        ; roughly 1/7th difference across boundary
+        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+        ; s = vp8_signed_char_clamp(qs2 - u);
+        ; *oq2 = s^0x80;
+        ; s = vp8_signed_char_clamp(ps2 + u);
+        ; *op2 = s^0x80;
+        pxor        mm1, mm1
+        pxor        mm2, mm2
+        punpcklbw   mm1, mm4
+        punpckhbw   mm2, mm4
+        pmulhw      mm1, [GLOBAL(s9)]
+        pmulhw      mm2, [GLOBAL(s9)]
+        paddw       mm1, [GLOBAL(s63)]
+        paddw       mm2, [GLOBAL(s63)]
+        psraw       mm1, 7
+        psraw       mm2, 7
+        packsswb    mm1, mm2
+
+        movq        mm6, [rdx+ 8]
+        movq        mm3, [rdx+48]
+
+        pxor        mm6, [GLOBAL(t80)]
+        pxor        mm3, [GLOBAL(t80)]
+
+        paddsb      mm6, mm1
+        psubsb      mm3, mm1
+
+        pxor        mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
+        pxor        mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 16 06
+
+        ; transpose and write back
+        movq        mm0, [rdx]        ; mm0 = 70 60 50 40 30 20 10 00
+        movq        mm1, mm0          ; mm1 = 70 60 50 40 30 20 10 00
+
+        punpcklbw   mm0, mm6          ; mm0 = 31 30 21 20 11 10 01 00
+        punpckhbw   mm1, mm6          ; mm1 = 71 70 61 60 51 50 41 40
+
+        movq        mm2, [rdx+16]     ; mm2 = 72 62 52 42 32 22 12 02
+        movq        mm6, mm2          ; mm6 = 72 62 52 42 32 22 12 02
+
+        punpcklbw   mm2, [rdx+24]     ; mm2 = 33 32 23 22 13 12 03 02
+        punpckhbw   mm6, [rdx+24]     ; mm6 = 73 72 63 62 53 52 43 42
+
+        movq        mm5, mm0          ; mm5 = 31 30 21 20 11 10 01 00
+        punpcklwd   mm0, mm2          ; mm0 = 13 12 11 10 03 02 01 00
+
+        punpckhwd   mm5, mm2          ; mm5 = 33 32 31 30 23 22 21 20
+        movq        mm4, mm1          ; mm4 = 71 70 61 60 51 50 41 40
+
+        punpcklwd   mm1, mm6          ; mm1 = 53 52 51 50 43 42 41 40
+        punpckhwd   mm4, mm6          ; mm4 = 73 72 71 70 63 62 61 60
+
+        movq        mm2, [rdx+32]     ; mm2 = 74 64 54 44 34 24 14 04
+        punpcklbw   mm2, [rdx+40]     ; mm2 = 35 34 25 24 15 14 05 04
+
+        movq        mm6, mm3          ; mm6 = 76 66 56 46 36 26 16 06
+        punpcklbw   mm6, [rdx+56]     ; mm6 = 37 36 27 26 17 16 07 06
+
+        movq        mm7, mm2          ; mm7 = 35 34 25 24 15 14 05 04
+        punpcklwd   mm2, mm6          ; mm2 = 17 16 15 14 07 06 05 04
+
+        punpckhwd   mm7, mm6          ; mm7 = 37 36 35 34 27 26 25 24
+        movq        mm6, mm0          ; mm6 = 13 12 11 10 03 02 01 00
+
+        punpckldq   mm0, mm2          ; mm0 = 07 06 05 04 03 02 01 00
+        punpckhdq   mm6, mm2          ; mm6 = 17 16 15 14 13 12 11 10
+
+        movq        [rsi+rax*4], mm0  ; write out
+        movq        [rdi+rax*4], mm6  ; write out
+
+        movq        mm0, mm5          ; mm0 = 33 32 31 30 23 22 21 20
+        punpckldq   mm0, mm7          ; mm0 = 27 26 25 24 23 22 21 20
+
+        punpckhdq   mm5, mm7          ; mm5 = 37 36 35 34 33 32 31 30
+        movq        [rsi+rax*2], mm0  ; write out
+
+        movq        [rdi+rax*2], mm5  ; write out
+        movq        mm2, [rdx+32]     ; mm2 = 74 64 54 44 34 24 14 04
+
+        punpckhbw   mm2, [rdx+40]     ; mm2 = 75 74 65 64 55 54 45 44
+        punpckhbw   mm3, [rdx+56]     ; mm3 = 77 76 67 66 57 56 47 46
+
+        movq        mm5, mm2          ; mm5 = 75 74 65 64 55 54 45 44
+        punpcklwd   mm2, mm3          ; mm2 = 57 56 55 54 47 46 45 44
+
+        punpckhwd   mm5, mm3          ; mm5 = 77 76 75 74 67 66 65 64
+        movq        mm0, mm1          ; mm0 = 53 52 51 50 43 42 41 40
+
+        movq        mm3, mm4          ; mm3 = 73 72 71 70 63 62 61 60
+        punpckldq   mm0, mm2          ; mm0 = 47 46 45 44 43 42 41 40
+
+        punpckhdq   mm1, mm2          ; mm1 = 57 56 55 54 53 52 51 50
+        movq        [rsi], mm0        ; write out
+
+        movq        [rdi], mm1        ; write out
+        neg         rax
+
+        punpckldq   mm3, mm5          ; mm3 = 67 66 65 64 63 62 61 60
+        punpckhdq   mm4, mm5          ; mm4 = 77 76 75 74 73 72 71 70
+
+        movq        [rsi+rax*2], mm3
+        movq        [rdi+rax*2], mm4
+
+        lea         rsi, [rsi+rax*8]
+        dec         rcx
+
+        jnz         .next8_mbv
+
+        add         rsp, 96
+        pop         rsp
+        ; begin epilog
+        pop         rdi
+        pop         rsi
+        RESTORE_GOT
+        UNSHADOW_ARGS
+        pop         rbp
+        ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_mmx):
+        push        rbp
+        mov         rbp, rsp
+        SHADOW_ARGS_TO_STACK 3
+        GET_GOT     rbx
+        push        rsi
+        push        rdi
+        ; end prolog
+
+        mov         rsi, arg(0)       ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+        mov         rcx, 2            ; count
+.nexts8_h:
+        mov         rdx, arg(2)       ;blimit ; get blimit
+        movq        mm3, [rdx]
+
+        mov         rdi, rsi          ; rdi points to row +1 for indirect addressing
+        add         rdi, rax
+        neg         rax
+
+        ; calculate mask
+        movq        mm1, [rsi+2*rax]  ; p1
+        movq        mm0, [rdi]        ; q1
+        movq        mm2, mm1
+        movq        mm7, mm0
+        movq        mm4, mm0
+        psubusb     mm0, mm1          ; q1-=p1
+        psubusb     mm1, mm4          ; p1-=q1
+        por         mm1, mm0          ; abs(p1-q1)
+        pand        mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
+        psrlw       mm1, 1            ; abs(p1-q1)/2
+
+        movq        mm5, [rsi+rax]    ; p0
+        movq        mm4, [rsi]        ; q0
+        movq        mm0, mm4          ; q0
+        movq        mm6, mm5          ; p0
+        psubusb     mm5, mm4          ; p0-=q0
+        psubusb     mm4, mm6          ; q0-=p0
+        por         mm5, mm4          ; abs(p0 - q0)
+        paddusb     mm5, mm5          ; abs(p0-q0)*2
+        paddusb     mm5, mm1          ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        psubusb     mm5, mm3          ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+        pxor        mm3, mm3
+        pcmpeqb     mm5, mm3
+
+        ; start work on filters
+        pxor        mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+        pxor        mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+        psubsb      mm2, mm7          ; p1 - q1
+
+        pxor        mm6, [GLOBAL(t80)] ; offset to convert to signed values
+        pxor        mm0, [GLOBAL(t80)] ; offset to convert to signed values
+        movq        mm3, mm0          ; q0
+        psubsb      mm0, mm6          ; q0 - p0
+        paddsb      mm2, mm0          ; p1 - q1 + 1 * (q0 - p0)
+        paddsb      mm2, mm0          ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm2, mm0          ; p1 - q1 + 3 * (q0 - p0)
+        pand        mm5, mm2          ; mask filter values we don't care about
+
+        ; do + 4 side
+        paddsb      mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0, mm5          ; get a copy of filters
+        psllw       mm0, 8            ; shift left 8
+        psraw       mm0, 3            ; arithmetic shift right 3
+        psrlw       mm0, 8
+        movq        mm1, mm5          ; get a copy of filters
+        psraw       mm1, 11           ; arithmetic shift right 11
+        psllw       mm1, 8            ; shift left 8 to put it back
+
+        por         mm0, mm1          ; put the two together to get result
+
+        psubsb      mm3, mm0          ; q0-= q0 add
+        pxor        mm3, [GLOBAL(t80)] ; unoffset
+        movq        [rsi], mm3        ; write back
+
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+        movq        mm0, mm5          ; get a copy of filters
+        psllw       mm0, 8            ; shift left 8
+        psraw       mm0, 3            ; arithmetic shift right 3
+        psrlw       mm0, 8
+        psraw       mm5, 11           ; arithmetic shift right 11
+        psllw       mm5, 8            ; shift left 8 to put it back
+        por         mm0, mm5          ; put the two together to get result
+
+
+        paddsb      mm6, mm0          ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)] ; unoffset
+        movq        [rsi+rax], mm6    ; write back
+
+        add         rsi, 8
+        neg         rax
+        dec         rcx
+        jnz         .nexts8_h
+
+        ; begin epilog
+        pop         rdi
+        pop         rsi
+        RESTORE_GOT
+        UNSHADOW_ARGS
+        pop         rbp
+        ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  src_pixel_step,
+;    const char *blimit
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_mmx):
+        push        rbp
+        mov         rbp, rsp
+        SHADOW_ARGS_TO_STACK 3
+        GET_GOT     rbx
+        push        rsi
+        push        rdi
+        ; end prolog
+
+        ALIGN_STACK 16, rax
+        sub         rsp, 32       ; reserve 32 bytes
+        %define t0  [rsp + 0]     ;__declspec(align(16)) char t0[8];
+        %define t1  [rsp + 16]    ;__declspec(align(16)) char t1[8];
+
+        mov         rsi, arg(0)   ;src_ptr
+        movsxd      rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+        lea         rsi, [rsi + rax*4 - 2]
+        mov         rcx, 2        ; count
+.nexts8_v:
+
+        lea         rdi, [rsi + rax]
+        movd        mm0, [rdi + rax * 2]  ; xx xx xx xx 73 72 71 70
+
+        movd        mm6, [rsi + rax * 2]  ; xx xx xx xx 63 62 61 60
+        punpcklbw   mm6, mm0              ; 73 63 72 62 71 61 70 60
+
+        movd        mm0, [rsi + rax]      ; xx xx xx xx 53 52 51 50
+        movd        mm4, [rsi]            ; xx xx xx xx 43 42 41 40
+
+        punpcklbw   mm4, mm0              ; 53 43 52 42 51 41 50 40
+        movq        mm5, mm4              ; 53 43 52 42 51 41 50 40
+
+        punpcklwd   mm4, mm6              ; 71 61 51 41 70 60 50 40
+        punpckhwd   mm5, mm6              ; 73 63 53 43 72 62 52 42
+
+        neg         rax
+
+        movd        mm7, [rsi + rax]      ; xx xx xx xx 33 32 31 30
+        movd        mm6, [rsi + rax * 2]  ; xx xx xx xx 23 22 21 20
+
+        punpcklbw   mm6, mm7              ; 33 23 32 22 31 21 30 20
+        movd        mm1, [rdi + rax * 4]  ; xx xx xx xx 13 12 11 10
+
+        movd        mm0, [rsi + rax * 4]  ; xx xx xx xx 03 02 01 00
+        punpcklbw   mm0, mm1              ; 13 03 12 02 11 01 10 00
+
+        movq        mm2, mm0              ; 13 03 12 02 11 01 10 00
+        punpcklwd   mm0, mm6              ; 31 21 11 01 30 20 10 00
+
+        punpckhwd   mm2, mm6              ; 33 23 13 03 32 22 12 02
+        movq        mm1, mm0              ; 31 21 11 01 30 20 10 00
+
+        punpckldq   mm0, mm4              ; 70 60 50 40 30 20 10 00 = p1
+        movq        mm3, mm2              ; 33 23 13 03 32 22 12 02
+
+        punpckhdq   mm1, mm4              ; 71 61 51 41 31 21 11 01 = p0
+        punpckldq   mm2, mm5              ; 72 62 52 42 32 22 12 02 = q0
+
+        punpckhdq   mm3, mm5              ; 73 63 53 43 33 23 13 03 = q1
+
+
+        ; calculate mask
+        movq        mm6, mm0              ; p1
+        movq        mm7, mm3              ; q1
+        psubusb     mm7, mm6              ; q1-=p1
+        psubusb     mm6, mm3              ; p1-=q1
+        por         mm6, mm7              ; abs(p1-q1)
+        pand        mm6, [GLOBAL(tfe)]    ; set lsb of each byte to zero
+        psrlw       mm6, 1                ; abs(p1-q1)/2
+
+        movq        mm5, mm1              ; p0
+        movq        mm4, mm2              ; q0
+
+        psubusb     mm5, mm2              ; p0-=q0
+        psubusb     mm4, mm1              ; q0-=p0
+
+        por         mm5, mm4              ; abs(p0 - q0)
+        paddusb     mm5, mm5              ; abs(p0-q0)*2
+        paddusb     mm5, mm6              ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+        mov         rdx, arg(2)           ;blimit ; get blimit
+        movq        mm7, [rdx]
+
+        psubusb     mm5, mm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+        pxor        mm7, mm7
+        pcmpeqb     mm5, mm7              ; mm5 = mask
+
+        ; start work on filters
+        movq        t0, mm0
+        movq        t1, mm3
+
+        pxor        mm0, [GLOBAL(t80)]    ; p1 offset to convert to signed values
+        pxor        mm3, [GLOBAL(t80)]    ; q1 offset to convert to signed values
+
+        psubsb      mm0, mm3              ; p1 - q1
+        movq        mm6, mm1              ; p0
+
+        movq        mm7, mm2              ; q0
+        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
+
+        pxor        mm7, [GLOBAL(t80)]    ; offset to convert to signed values
+        movq        mm3, mm7              ; offseted ; q0
+
+        psubsb      mm7, mm6              ; q0 - p0
+        paddsb      mm0, mm7              ; p1 - q1 + 1 * (q0 - p0)
+
+        paddsb      mm0, mm7              ; p1 - q1 + 2 * (q0 - p0)
+        paddsb      mm0, mm7              ; p1 - q1 + 3 * (q0 - p0)
+
+        pand        mm5, mm0              ; mask filter values we don't care about
+
+        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; arithmetic shift right 3
+        psrlw       mm0, 8
+
+        movq        mm7, mm5              ; get a copy of filters
+        psraw       mm7, 11               ; arithmetic shift right 11
+        psllw       mm7, 8                ; shift left 8 to put it back
+
+        por         mm0, mm7              ; put the two together to get result
+
+        psubsb      mm3, mm0              ; q0-= q0sz add
+        pxor        mm3, [GLOBAL(t80)]    ; unoffset
+
+        ; now do +3 side
+        psubsb      mm5, [GLOBAL(t1s)]    ; +3 instead of +4
+
+        movq        mm0, mm5              ; get a copy of filters
+        psllw       mm0, 8                ; shift left 8
+        psraw       mm0, 3                ; arithmetic shift right 3
+        psrlw       mm0, 8
+
+        psraw       mm5, 11               ; arithmetic shift right 11
+        psllw       mm5, 8                ; shift left 8 to put it back
+        por         mm0, mm5              ; put the two together to get result
+
+        paddsb      mm6, mm0              ; p0+= p0 add
+        pxor        mm6, [GLOBAL(t80)]    ; unoffset
+
+
+        movq        mm0, t0
+        movq        mm4, t1
+
+        ; mm0 = 70 60 50 40 30 20 10 00
+        ; mm6 = 71 61 51 41 31 21 11 01
+        ; mm3 = 72 62 52 42 32 22 12 02
+        ; mm4 = 73 63 53 43 33 23 13 03
+        ; transpose back to write out
+
+        movq        mm1, mm0
+        punpcklbw   mm0, mm6              ; 31 30 21 20 11 10 01 00
+
+        punpckhbw   mm1, mm6              ; 71 70 61 60 51 50 41 40
+        movq        mm2, mm3
+
+        punpcklbw   mm2, mm4              ; 33 32 23 22 13 12 03 02
+        movq        mm5, mm1              ; 71 70 61 60 51 50 41 40
+
+        punpckhbw   mm3, mm4              ; 73 72 63 62 53 52 43 42
+        movq        mm6, mm0              ; 31 30 21 20 11 10 01 00
+
+        punpcklwd   mm0, mm2              ; 13 12 11 10 03 02 01 00
+        punpckhwd   mm6, mm2              ; 33 32 31 30 23 22 21 20
+
+        movd        [rsi+rax*4], mm0      ; write 03 02 01 00
+        punpcklwd   mm1, mm3              ; 53 52 51 50 43 42 41 40
+
+        psrlq       mm0, 32               ; xx xx xx xx 13 12 11 10
+        punpckhwd   mm5, mm3              ; 73 72 71 70 63 62 61 60
+
+        movd        [rdi+rax*4], mm0      ; write 13 12 11 10
+        movd        [rsi+rax*2], mm6      ; write 23 22 21 20
+
+        psrlq       mm6, 32               ; 33 32 31 30
+        movd        [rsi], mm1            ; write 43 42 41 40
+
+        movd        [rsi + rax], mm6      ; write 33 32 31 30
+        neg         rax
+
+        movd        [rsi + rax*2], mm5    ; write 63 62 61 60
+        psrlq       mm1, 32               ; 53 52 51 50
+
+        movd        [rdi], mm1            ; write out 53 52 51 50
+        psrlq       mm5, 32               ; 73 72 71 70
+
+        movd        [rdi + rax*2], mm5    ; write 73 72 71 70
+
+        lea         rsi, [rsi+rax*8]      ; next 8
+
+        dec         rcx
+        jnz         .nexts8_v
+
+        add         rsp, 32
+        pop         rsp
+        ; begin epilog
+        pop         rdi
+        pop         rsi
+        RESTORE_GOT
+        UNSHADOW_ARGS
+        pop         rbp
+        ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+;                  int y_stride,
+;                  loop_filter_info *lfi)
+;{
+;
+;
+;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
+
+SECTION_RODATA
+align 16
+tfe:
+    times 8 db 0xfe
+align 16
+t80:
+    times 8 db 0x80
+align 16
+t1s:
+    times 8 db 0x01
+align 16
+t3:
+    times 8 db 0x03
+align 16
+t4:
+    times 8 db 0x04
+align 16
+ones:
+    times 4 dw 0x0001
+align 16
+s27:
+    times 4 dw 0x1b00
+align 16
+s18:
+    times 4 dw 0x1200
+align 16
+s9:
+    times 4 dw 0x0900
+align 16
+s63:
+    times 4 dw 0x003f
diff --git a/thirdparty/libvpx/vp8/decoder/dboolhuff.c b/thirdparty/libvpx/vp8/decoder/dboolhuff.c
new file mode 100644
index 000000000000..5cdd2a249156
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/dboolhuff.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "dboolhuff.h"
+#include "vp8/common/common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz,
+                       vpx_decrypt_cb decrypt_cb,
+                       void *decrypt_state)
+{
+    br->user_buffer_end = source+source_sz;
+    br->user_buffer     = source;
+    br->value    = 0;
+    br->count    = -8;
+    br->range    = 255;
+    br->decrypt_cb = decrypt_cb;
+    br->decrypt_state = decrypt_state;
+
+    if (source_sz && !source)
+        return 1;
+
+    /* Populate the buffer */
+    vp8dx_bool_decoder_fill(br);
+
+    return 0;
+}
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
+{
+    const unsigned char *bufptr = br->user_buffer;
+    VP8_BD_VALUE value = br->value;
+    int count = br->count;
+    int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
+    size_t bytes_left = br->user_buffer_end - bufptr;
+    size_t bits_left = bytes_left * CHAR_BIT;
+    int x = shift + CHAR_BIT - (int)bits_left;
+    int loop_end = 0;
+    unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
+
+    if (br->decrypt_cb) {
+        size_t n = VPXMIN(sizeof(decrypted), bytes_left);
+        br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n);
+        bufptr = decrypted;
+    }
+
+    if(x >= 0)
+    {
+        count += VP8_LOTS_OF_BITS;
+        loop_end = x;
+    }
+
+    if (x < 0 || bits_left)
+    {
+        while(shift >= loop_end)
+        {
+            count += CHAR_BIT;
+            value |= (VP8_BD_VALUE)*bufptr << shift;
+            ++bufptr;
+            ++br->user_buffer;
+            shift -= CHAR_BIT;
+        }
+    }
+
+    br->value = value;
+    br->count = count;
+}
diff --git a/thirdparty/libvpx/vp8/decoder/dboolhuff.h b/thirdparty/libvpx/vp8/decoder/dboolhuff.h
new file mode 100644
index 000000000000..1b1bbf868ec0
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/dboolhuff.h
@@ -0,0 +1,141 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP8_DECODER_DBOOLHUFF_H_
+#define VP8_DECODER_DBOOLHUFF_H_
+
+#include <stddef.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef size_t VP8_BD_VALUE;
+
+#define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+
+/*This is meant to be a large, positive constant that can still be efficiently
+   loaded as an immediate (on platforms like ARM, for example).
+   Even relatively modest values like 100 would work fine.*/
+#define VP8_LOTS_OF_BITS (0x40000000)
+
+typedef struct
+{
+    const unsigned char *user_buffer_end;
+    const unsigned char *user_buffer;
+    VP8_BD_VALUE         value;
+    int                  count;
+    unsigned int         range;
+    vpx_decrypt_cb       decrypt_cb;
+    void                *decrypt_state;
+} BOOL_DECODER;
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
+
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz,
+                       vpx_decrypt_cb decrypt_cb,
+                       void *decrypt_state);
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
+
+
+static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+    unsigned int bit = 0;
+    VP8_BD_VALUE value;
+    unsigned int split;
+    VP8_BD_VALUE bigsplit;
+    int count;
+    unsigned int range;
+
+    split = 1 + (((br->range - 1) * probability) >> 8);
+
+    if(br->count < 0)
+        vp8dx_bool_decoder_fill(br);
+
+    value = br->value;
+    count = br->count;
+
+    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
+
+    range = split;
+
+    if (value >= bigsplit)
+    {
+        range = br->range - split;
+        value = value - bigsplit;
+        bit = 1;
+    }
+
+    {
+        register int shift = vp8_norm[range];
+        range <<= shift;
+        value <<= shift;
+        count -= shift;
+    }
+    br->value = value;
+    br->count = count;
+    br->range = range;
+
+    return bit;
+}
+
+static INLINE int vp8_decode_value(BOOL_DECODER *br, int bits)
+{
+    int z = 0;
+    int bit;
+
+    for (bit = bits - 1; bit >= 0; bit--)
+    {
+        z |= (vp8dx_decode_bool(br, 0x80) << bit);
+    }
+
+    return z;
+}
+
+static INLINE int vp8dx_bool_error(BOOL_DECODER *br)
+{
+    /* Check if we have reached the end of the buffer.
+     *
+     * Variable 'count' stores the number of bits in the 'value' buffer, minus
+     * 8. The top byte is part of the algorithm, and the remainder is buffered
+     * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+     * occupied, 8 for the algorithm and 8 in the buffer.
+     *
+     * When reading a byte from the user's buffer, count is filled with 8 and
+     * one byte is filled into the value buffer. When we reach the end of the
+     * data, count is additionally filled with VP8_LOTS_OF_BITS. So when
+     * count == VP8_LOTS_OF_BITS - 1, the user's data has been exhausted.
+     */
+    if ((br->count > VP8_BD_VALUE_SIZE) && (br->count < VP8_LOTS_OF_BITS))
+    {
+        /* We have tried to decode bits after the end of
+         * stream was encountered.
+         */
+        return 1;
+    }
+
+    /* No error. */
+    return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_DECODER_DBOOLHUFF_H_
diff --git a/thirdparty/libvpx/vp8/decoder/decodeframe.c b/thirdparty/libvpx/vp8/decoder/decodeframe.c
new file mode 100644
index 000000000000..51acdbb9c8a0
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/decodeframe.c
@@ -0,0 +1,1397 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "onyxd_int.h"
+#include "vp8/common/header.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "detokenize.h"
+#include "vp8/common/common.h"
+#include "vp8/common/invtrans.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/setupintrarecon.h"
+
+#include "decodemv.h"
+#include "vp8/common/extend.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include "dboolhuff.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
+{
+    int Q;
+    VP8_COMMON *const pc = & pbi->common;
+
+    for (Q = 0; Q < QINDEX_RANGE; Q++)
+    {
+        pc->Y1dequant[Q][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q);
+        pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
+        pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
+
+        pc->Y1dequant[Q][1] = (short)vp8_ac_yquant(Q);
+        pc->Y2dequant[Q][1] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
+        pc->UVdequant[Q][1] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
+    }
+}
+
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    int i;
+    int QIndex;
+    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+    VP8_COMMON *const pc = & pbi->common;
+
+    /* Decide whether to use the default or alternate baseline Q value. */
+    if (xd->segmentation_enabled)
+    {
+        /* Abs Value */
+        if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+            QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+        /* Delta Value */
+        else
+            QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+        QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
+    }
+    else
+        QIndex = pc->base_qindex;
+
+    /* Set up the macroblock dequant constants */
+    xd->dequant_y1_dc[0] = 1;
+    xd->dequant_y1[0] = pc->Y1dequant[QIndex][0];
+    xd->dequant_y2[0] = pc->Y2dequant[QIndex][0];
+    xd->dequant_uv[0] = pc->UVdequant[QIndex][0];
+
+    for (i = 1; i < 16; i++)
+    {
+        xd->dequant_y1_dc[i] =
+        xd->dequant_y1[i] = pc->Y1dequant[QIndex][1];
+        xd->dequant_y2[i] = pc->Y2dequant[QIndex][1];
+        xd->dequant_uv[i] = pc->UVdequant[QIndex][1];
+    }
+}
+
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+                              unsigned int mb_idx)
+{
+    MB_PREDICTION_MODE mode;
+    int i;
+#if CONFIG_ERROR_CONCEALMENT
+    int corruption_detected = 0;
+#else
+    (void)mb_idx;
+#endif
+
+    if (xd->mode_info_context->mbmi.mb_skip_coeff)
+    {
+        vp8_reset_mb_tokens_context(xd);
+    }
+    else if (!vp8dx_bool_error(xd->current_bc))
+    {
+        int eobtotal;
+        eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+        /* Special case:  Force the loopfilter to skip when eobtotal is zero */
+        xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+    }
+
+    mode = xd->mode_info_context->mbmi.mode;
+
+    if (xd->segmentation_enabled)
+        vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+    if(pbi->ec_active)
+    {
+        int throw_residual;
+        /* When we have independent partitions we can apply residual even
+         * though other partitions within the frame are corrupt.
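+         * (pbi->independent_partitions is derived from the coefficient
+         * probabilities as they are read in vp8_decode_frame().)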
+         */
+        throw_residual = (!pbi->independent_partitions &&
+                          pbi->frame_corrupt_residual);
+        throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+        if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+        {
+            /* MB with corrupt residuals or corrupt mode/motion vectors.
+             * Better to use the predictor as reconstruction.
+             */
+            pbi->frame_corrupt_residual = 1;
+            memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+            corruption_detected = 1;
+
+            /* force idct to be skipped for B_PRED and use the
+             * prediction only for reconstruction
+             * */
+            memset(xd->eobs, 0, 25);
+        }
+    }
+#endif
+
+    /* do prediction */
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    {
+        vp8_build_intra_predictors_mbuv_s(xd,
+                                          xd->recon_above[1],
+                                          xd->recon_above[2],
+                                          xd->recon_left[1],
+                                          xd->recon_left[2],
+                                          xd->recon_left_stride[1],
+                                          xd->dst.u_buffer, xd->dst.v_buffer,
+                                          xd->dst.uv_stride);
+
+        if (mode != B_PRED)
+        {
+            vp8_build_intra_predictors_mby_s(xd,
+                                             xd->recon_above[0],
+                                             xd->recon_left[0],
+                                             xd->recon_left_stride[0],
+                                             xd->dst.y_buffer,
+                                             xd->dst.y_stride);
+        }
+        else
+        {
+            short *DQC = xd->dequant_y1;
+            int dst_stride = xd->dst.y_stride;
+
+            /* clear out residual eob info */
+            if(xd->mode_info_context->mbmi.mb_skip_coeff)
+                memset(xd->eobs, 0, 25);
+
+            intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+            for (i = 0; i < 16; i++)
+            {
+                BLOCKD *b = &xd->block[i];
+                unsigned char *dst = xd->dst.y_buffer + b->offset;
+                B_PREDICTION_MODE b_mode =
+                    xd->mode_info_context->bmi[i].as_mode;
+                unsigned char *Above = dst - dst_stride;
+                unsigned char *yleft = dst - 1;
+                int left_stride = dst_stride;
+                unsigned char top_left = Above[-1];
+
+                vp8_intra4x4_predict(Above, yleft, left_stride, b_mode,
+                                     dst, dst_stride, top_left);
+
+                if (xd->eobs[i])
+                {
+                    if (xd->eobs[i] > 1)
+                    {
+                        vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
+                    }
+                    else
+                    {
+                        vp8_dc_only_idct_add
+                            (b->qcoeff[0] * DQC[0],
+                             dst, dst_stride,
+                             dst, dst_stride);
+                        memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+                    }
+                }
+            }
+        }
+    }
+    else
+    {
+        vp8_build_inter_predictors_mb(xd);
+    }
+
+
+#if CONFIG_ERROR_CONCEALMENT
+    if (corruption_detected)
+    {
+        return;
+    }
+#endif
+
+    if(!xd->mode_info_context->mbmi.mb_skip_coeff)
+    {
+        /* dequantization and idct */
+        if (mode != B_PRED)
+        {
+            short *DQC = xd->dequant_y1;
+
+            if (mode != SPLITMV)
+            {
+                BLOCKD *b = &xd->block[24];
+
+                /* do 2nd order transform on the dc block */
+                if (xd->eobs[24] > 1)
+                {
+                    vp8_dequantize_b(b, xd->dequant_y2);
+
+                    vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+                                           xd->qcoeff);
+                    memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+                }
+                else
+                {
+                    b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+                    vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+                                             xd->qcoeff);
+                    memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+                }
+
+                /* override the dc dequant constant in order to preserve the
+                 * dc components
+                 */
+                DQC = xd->dequant_y1_dc;
+            }
+
+            vp8_dequant_idct_add_y_block
+                            (xd->qcoeff, DQC,
+                             xd->dst.y_buffer,
+                             xd->dst.y_stride, xd->eobs);
+        }
+
+        vp8_dequant_idct_add_uv_block
+                        (xd->qcoeff+16*16, xd->dequant_uv,
+                         xd->dst.u_buffer, xd->dst.v_buffer,
+                         xd->dst.uv_stride, xd->eobs+16);
+    }
+}
+
+static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
+{
+    int ret_val = 0;
+
+    if (vp8_read_bit(bc))
+    {
+        ret_val = vp8_read_literal(bc, 4);
+
+        if (vp8_read_bit(bc))
+            ret_val = -ret_val;
+    }
+
+    /* Trigger a quantizer update if the delta-q value has changed */
+    if (ret_val != prev)
+        *q_update = 1;
+
+    return ret_val;
+}
+
+#ifdef PACKET_TESTING
+#include <stdio.h>
+FILE *vpxlog = 0;
+#endif
+
+static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
+{
+    int i;
+    unsigned char *src_ptr1;
+    unsigned char *dest_ptr1;
+
+    unsigned int Border;
+    int plane_stride;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    Border = ybf->border;
+    plane_stride = ybf->y_stride;
+    src_ptr1 = ybf->y_buffer - Border;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+    for (i = 0; i < (int)Border; i++)
+    {
+        memcpy(dest_ptr1, src_ptr1, plane_stride);
+        dest_ptr1 += plane_stride;
+    }
+
+
+    /***********/
+    /* U Plane */
+    /***********/
+    plane_stride = ybf->uv_stride;
+    Border /= 2;
+    src_ptr1 = ybf->u_buffer - Border;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+    for (i = 0; i < (int)(Border); i++)
+    {
+        memcpy(dest_ptr1, src_ptr1, plane_stride);
+        dest_ptr1 += plane_stride;
+    }
+
+    /***********/
+    /* V Plane */
+    /***********/
+
+    src_ptr1 = ybf->v_buffer - Border;
+    dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+    for (i = 0; i < (int)(Border); i++)
+    {
+        memcpy(dest_ptr1, src_ptr1, plane_stride);
+        dest_ptr1 += plane_stride;
+    }
+}
+
+static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr2;
+
+    unsigned int Border;
+    int plane_stride;
+    int plane_height;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    Border = ybf->border;
+    plane_stride = ybf->y_stride;
+    plane_height = ybf->y_height;
+
+    src_ptr1 = ybf->y_buffer - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)Border; i++)
+    {
+        memcpy(dest_ptr2, src_ptr2, plane_stride);
+        dest_ptr2 += plane_stride;
+    }
+
+
+    /***********/
+    /* U Plane */
+    /***********/
+    plane_stride = ybf->uv_stride;
+    plane_height = ybf->uv_height;
+    Border /= 2;
+
+    src_ptr1 = ybf->u_buffer - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)(Border); i++)
+    {
+        memcpy(dest_ptr2, src_ptr2, plane_stride);
+        dest_ptr2 += plane_stride;
+    }
+
+    /***********/
+    /* V Plane */
+    /***********/
+
+    src_ptr1 = ybf->v_buffer - Border;
+    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+    dest_ptr2 = src_ptr2 + plane_stride;
+
+    for (i = 0; i < (int)(Border); i++)
+    {
+        memcpy(dest_ptr2, src_ptr2, plane_stride);
+        dest_ptr2 += plane_stride;
+    }
+}
+
+static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
+                                           unsigned char *y_src,
+                                           unsigned char *u_src,
+                                           unsigned char *v_src)
+{
+    int i;
+    unsigned char *src_ptr1, *src_ptr2;
+    unsigned char *dest_ptr1, *dest_ptr2;
+
+    unsigned int Border;
+    int plane_stride;
+    int plane_height;
+    int plane_width;
+
+    /***********/
+    /* Y Plane */
+    /***********/
+    Border = ybf->border;
+    plane_stride = ybf->y_stride;
+    plane_height = 16;
+    plane_width = ybf->y_width;
+
+    /* copy the left and right most columns out */
+    src_ptr1 = y_src;
+    src_ptr2 = src_ptr1 + plane_width - 1;
+    dest_ptr1 = src_ptr1 - Border;
+    dest_ptr2 = src_ptr2 + 1;
+
+    for (i = 0; i < plane_height; i++)
+    {
+        memset(dest_ptr1, src_ptr1[0], Border);
+        memset(dest_ptr2, src_ptr2[0], Border);
+        src_ptr1  += plane_stride;
+        src_ptr2  += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    /***********/
+    /* U Plane */
+    /***********/
+    plane_stride = ybf->uv_stride;
+    plane_height = 8;
+    plane_width = ybf->uv_width;
+    Border /= 2;
+
+    /* copy the left and right most columns out */
+    src_ptr1 = u_src;
+    src_ptr2 = src_ptr1 + plane_width - 1;
+    dest_ptr1 = src_ptr1 - Border;
+    dest_ptr2 = src_ptr2 + 1;
+
+    for (i = 0; i < plane_height; i++)
+    {
+        memset(dest_ptr1, src_ptr1[0], Border);
+        memset(dest_ptr2, src_ptr2[0], Border);
+        src_ptr1  += plane_stride;
+        src_ptr2  += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+
+    /***********/
+    /* V Plane */
+    /***********/
+
+    /* copy the left and right most columns out */
+    src_ptr1 = v_src;
+    src_ptr2 = src_ptr1 + plane_width - 1;
+    dest_ptr1 = src_ptr1 - Border;
+    dest_ptr2 = src_ptr2 + 1;
+
+    for (i = 0; i < plane_height; i++)
+    {
+        memset(dest_ptr1, src_ptr1[0], Border);
+        memset(dest_ptr2, src_ptr2[0], Border);
+        src_ptr1  += plane_stride;
+        src_ptr2  += plane_stride;
+        dest_ptr1 += plane_stride;
+        dest_ptr2 += plane_stride;
+    }
+}
+
+static void decode_mb_rows(VP8D_COMP *pbi)
+{
+    VP8_COMMON *const pc = & pbi->common;
+    MACROBLOCKD *const xd  = & pbi->mb;
+
+    MODE_INFO *lf_mic = xd->mode_info_context;
+
+    int ibc = 0;
+    int num_part = 1 << pc->multi_token_partition;
+
+    int recon_yoffset, recon_uvoffset;
+    int mb_row, mb_col;
+    int mb_idx = 0;
+
+    YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+    int recon_y_stride = yv12_fb_new->y_stride;
+    int recon_uv_stride = yv12_fb_new->uv_stride;
+
+    unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+    unsigned char *dst_buffer[3];
+    unsigned char *lf_dst[3];
+    unsigned char *eb_dst[3];
+    int i;
+    int ref_fb_corrupted[MAX_REF_FRAMES];
+
+    ref_fb_corrupted[INTRA_FRAME] = 0;
+
+    for(i = 1; i < MAX_REF_FRAMES; i++)
+    {
+        YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
+
+        ref_buffer[i][0] = this_fb->y_buffer;
+        ref_buffer[i][1] = this_fb->u_buffer;
+        ref_buffer[i][2] = this_fb->v_buffer;
+
+        ref_fb_corrupted[i] = this_fb->corrupted;
+    }
+
+    /* Set up the buffer pointers */
+    eb_dst[0] = lf_dst[0] = dst_buffer[0] = yv12_fb_new->y_buffer;
+    eb_dst[1] = lf_dst[1] = dst_buffer[1] = yv12_fb_new->u_buffer;
+    eb_dst[2] = lf_dst[2] = dst_buffer[2] = yv12_fb_new->v_buffer;
+
+    xd->up_available = 0;
+
+    /* Initialize the loop filter for this frame. */
+    if(pc->filter_level)
+        vp8_loop_filter_frame_init(pc, xd, pc->filter_level);
+
+    vp8_setup_intra_recon_top_line(yv12_fb_new);
+
+    /* Decode the individual macro block */
+    for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
+    {
+        if (num_part > 1)
+        {
+            xd->current_bc = & pbi->mbc[ibc];
+            ibc++;
+
+            if (ibc == num_part)
+                ibc = 0;
+        }
+
+        recon_yoffset = mb_row * recon_y_stride * 16;
+        recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+        /* reset contexts */
+        xd->above_context = pc->above_context;
+        memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+        xd->left_available = 0;
+
+        xd->mb_to_top_edge = -((mb_row * 16) << 3);
+        xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+        xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+        xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+        xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+        xd->recon_left[0] = xd->recon_above[0] - 1;
+        xd->recon_left[1] = xd->recon_above[1] - 1;
+        xd->recon_left[2] = xd->recon_above[2] - 1;
+
+        xd->recon_above[0] -= xd->dst.y_stride;
+        xd->recon_above[1] -= xd->dst.uv_stride;
+        xd->recon_above[2] -= xd->dst.uv_stride;
+
+        /* TODO: move to outside row loop */
+        xd->recon_left_stride[0] = xd->dst.y_stride;
+        xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+        setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
+                               xd->recon_left[2], xd->dst.y_stride,
+                               xd->dst.uv_stride);
+
+        for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+        {
+            /* Distance of Mb to the various image edges.
+             * These are specified to 8th pel as they are always compared to values
+             * that are in 1/8th pel units
+             */
+            xd->mb_to_left_edge = -((mb_col * 16) << 3);
+            xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+#if CONFIG_ERROR_CONCEALMENT
+            {
+                int corrupt_residual = (!pbi->independent_partitions &&
+                                        pbi->frame_corrupt_residual) ||
+                                        vp8dx_bool_error(xd->current_bc);
+                if (pbi->ec_active &&
+                    xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+                    corrupt_residual)
+                {
+                    /* We have an intra block with corrupt coefficients, better to
+                     * conceal with an inter block. Interpolate MVs from neighboring
+                     * MBs.
+                     *
+                     * Note that for the first mb with corrupt residual in a frame,
+                     * we might not discover that before decoding the residual. That
+                     * happens after this check, and therefore no inter concealment
+                     * will be done.
+                     */
+                    vp8_interpolate_motion(xd,
+                                           mb_row, mb_col,
+                                           pc->mb_rows, pc->mb_cols);
+                }
+            }
+#endif
+
+            xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+            xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+            xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+            if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) {
+                const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
+                xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset;
+                xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset;
+                xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset;
+            } else {
+                // ref_frame is INTRA_FRAME, pre buffer should not be used.
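+                // Nulling the pointers makes any accidental use fail fast
+                // instead of silently reading stale reference pixels.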
+                xd->pre.y_buffer = 0;
+                xd->pre.u_buffer = 0;
+                xd->pre.v_buffer = 0;
+            }
+
+            /* propagate errors from reference frames */
+            xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+            decode_macroblock(pbi, xd, mb_idx);
+
+            mb_idx++;
+            xd->left_available = 1;
+
+            /* check if the boolean decoder has suffered an error */
+            xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+            xd->recon_above[0] += 16;
+            xd->recon_above[1] += 8;
+            xd->recon_above[2] += 8;
+            xd->recon_left[0] += 16;
+            xd->recon_left[1] += 8;
+            xd->recon_left[2] += 8;
+
+            recon_yoffset += 16;
+            recon_uvoffset += 8;
+
+            ++xd->mode_info_context;  /* next mb */
+
+            xd->above_context++;
+        }
+
+        /* adjust to the next row of mbs */
+        vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
+                          xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+        ++xd->mode_info_context;      /* skip prediction column */
+        xd->up_available = 1;
+
+        if(pc->filter_level)
+        {
+            if(mb_row > 0)
+            {
+                if (pc->filter_type == NORMAL_LOOPFILTER)
+                    vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1,
+                                               recon_y_stride, recon_uv_stride,
+                                               lf_dst[0], lf_dst[1], lf_dst[2]);
+                else
+                    vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1,
+                                               recon_y_stride, recon_uv_stride,
+                                               lf_dst[0], lf_dst[1], lf_dst[2]);
+                if(mb_row > 1)
+                {
+                    yv12_extend_frame_left_right_c(yv12_fb_new,
+                                                   eb_dst[0],
+                                                   eb_dst[1],
+                                                   eb_dst[2]);
+
+                    eb_dst[0] += recon_y_stride  * 16;
+                    eb_dst[1] += recon_uv_stride *  8;
+                    eb_dst[2] += recon_uv_stride *  8;
+                }
+
+                lf_dst[0] += recon_y_stride  * 16;
+                lf_dst[1] += recon_uv_stride *  8;
+                lf_dst[2] += recon_uv_stride *  8;
+                lf_mic += pc->mb_cols;
+                lf_mic++;         /* Skip border mb */
+            }
+        }
+        else
+        {
+            if(mb_row > 0)
+            {
+                /**/
+                yv12_extend_frame_left_right_c(yv12_fb_new,
+                                               eb_dst[0],
+                                               eb_dst[1],
+                                               eb_dst[2]);
+                eb_dst[0] += recon_y_stride  * 16;
+                eb_dst[1] += recon_uv_stride *  8;
+                eb_dst[2] += recon_uv_stride *  8;
+            }
+        }
+    }
+
+    if(pc->filter_level)
+    {
+        if (pc->filter_type == NORMAL_LOOPFILTER)
+            vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1, recon_y_stride,
+                                       recon_uv_stride, lf_dst[0], lf_dst[1],
+                                       lf_dst[2]);
+        else
+            vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1, recon_y_stride,
+                                       recon_uv_stride, lf_dst[0], lf_dst[1],
+                                       lf_dst[2]);
+
+        yv12_extend_frame_left_right_c(yv12_fb_new,
+                                       eb_dst[0],
+                                       eb_dst[1],
+                                       eb_dst[2]);
+        eb_dst[0] += recon_y_stride  * 16;
+        eb_dst[1] += recon_uv_stride *  8;
+        eb_dst[2] += recon_uv_stride *  8;
+    }
+    yv12_extend_frame_left_right_c(yv12_fb_new,
+                                   eb_dst[0],
+                                   eb_dst[1],
+                                   eb_dst[2]);
+    yv12_extend_frame_top_c(yv12_fb_new);
+    yv12_extend_frame_bottom_c(yv12_fb_new);
+
+}
+
+static unsigned int read_partition_size(VP8D_COMP *pbi,
+                                        const unsigned char *cx_size)
+{
+    unsigned char temp[3];
+    if (pbi->decrypt_cb)
+    {
+        pbi->decrypt_cb(pbi->decrypt_state, cx_size, temp, 3);
+        cx_size = temp;
+    }
+    return cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
+}
+
+static int read_is_valid(const unsigned char *start,
+                         size_t len,
+                         const unsigned char *end)
+{
+    return (start + len > start && start + len <= end);
+}
+
+static unsigned int read_available_partition_size(
+                                       VP8D_COMP *pbi,
+                                       const unsigned char *token_part_sizes,
+                                       const unsigned char *fragment_start,
+                                       const unsigned char *first_fragment_end,
+                                       const unsigned char *fragment_end,
+                                       int i,
+                                       int num_part)
+{
+    VP8_COMMON* pc = &pbi->common;
+    const unsigned char *partition_size_ptr = token_part_sizes + i * 3;
+    unsigned int partition_size = 0;
+    ptrdiff_t bytes_left = fragment_end - fragment_start;
+    /* Calculate the length of this partition. The last partition
+     * size is implicit. If the partition size can't be read, then
+     * either use the remaining data in the buffer (for EC mode)
+     * or throw an error.
+     */
+    if (i < num_part - 1)
+    {
+        if (read_is_valid(partition_size_ptr, 3, first_fragment_end))
+            partition_size = read_partition_size(pbi, partition_size_ptr);
+        else if (pbi->ec_active)
+            partition_size = (unsigned int)bytes_left;
+        else
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated partition size data");
+    }
+    else
+        partition_size = (unsigned int)bytes_left;
+
+    /* Validate the calculated partition length. If the buffer
+     * described by the partition can't be fully read, then restrict
+     * it to the portion that can be (for EC mode) or throw an error.
+     */
+    if (!read_is_valid(fragment_start, partition_size, fragment_end))
+    {
+        if (pbi->ec_active)
+            partition_size = (unsigned int)bytes_left;
+        else
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated packet or corrupt partition "
+                               "%d length", i + 1);
+    }
+    return partition_size;
+}
+
+
+static void setup_token_decoder(VP8D_COMP *pbi,
+                                const unsigned char* token_part_sizes)
+{
+    vp8_reader *bool_decoder = &pbi->mbc[0];
+    unsigned int partition_idx;
+    unsigned int fragment_idx;
+    unsigned int num_token_partitions;
+    const unsigned char *first_fragment_end = pbi->fragments.ptrs[0] +
+                                              pbi->fragments.sizes[0];
+
+    TOKEN_PARTITION multi_token_partition =
+            (TOKEN_PARTITION)vp8_read_literal(&pbi->mbc[8], 2);
+    if (!vp8dx_bool_error(&pbi->mbc[8]))
+        pbi->common.multi_token_partition = multi_token_partition;
+    num_token_partitions = 1 << pbi->common.multi_token_partition;
+
+    /* Check for partitions within the fragments and unpack the fragments
+     * so that each fragment pointer points to its corresponding partition. */
+    for (fragment_idx = 0; fragment_idx < pbi->fragments.count; ++fragment_idx)
+    {
+        unsigned int fragment_size = pbi->fragments.sizes[fragment_idx];
+        const unsigned char *fragment_end = pbi->fragments.ptrs[fragment_idx] +
+                                            fragment_size;
+        /* Special case for handling the first partition since we have already
+         * read its size. */
+        if (fragment_idx == 0)
+        {
+            /* Size of first partition + token partition sizes element */
+            ptrdiff_t ext_first_part_size = token_part_sizes -
+                pbi->fragments.ptrs[0] + 3 * (num_token_partitions - 1);
+            fragment_size -= (unsigned int)ext_first_part_size;
+            if (fragment_size > 0)
+            {
+                pbi->fragments.sizes[0] = (unsigned int)ext_first_part_size;
+                /* The fragment contains an additional partition. Move to
+                 * next. */
+                fragment_idx++;
+                pbi->fragments.ptrs[fragment_idx] = pbi->fragments.ptrs[0] +
+                    pbi->fragments.sizes[0];
+            }
+        }
+        /* Split the chunk into partitions read from the bitstream */
+        while (fragment_size > 0)
+        {
+            ptrdiff_t partition_size = read_available_partition_size(
+                                                 pbi,
+                                                 token_part_sizes,
+                                                 pbi->fragments.ptrs[fragment_idx],
+                                                 first_fragment_end,
+                                                 fragment_end,
+                                                 fragment_idx - 1,
+                                                 num_token_partitions);
+            pbi->fragments.sizes[fragment_idx] = (unsigned int)partition_size;
+            fragment_size -= (unsigned int)partition_size;
+            assert(fragment_idx <= num_token_partitions);
+            if (fragment_size > 0)
+            {
+                /* The fragment contains an additional partition.
+                 * Move to next. */
+                fragment_idx++;
+                pbi->fragments.ptrs[fragment_idx] =
+                    pbi->fragments.ptrs[fragment_idx - 1] + partition_size;
+            }
+        }
+    }
+
+    pbi->fragments.count = num_token_partitions + 1;
+
+    for (partition_idx = 1; partition_idx < pbi->fragments.count; ++partition_idx)
+    {
+        if (vp8dx_start_decode(bool_decoder,
+                               pbi->fragments.ptrs[partition_idx],
+                               pbi->fragments.sizes[partition_idx],
+                               pbi->decrypt_cb, pbi->decrypt_state))
+            vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate bool decoder %d",
+                               partition_idx);
+
+        bool_decoder++;
+    }
+
+#if CONFIG_MULTITHREAD
+    /* Clamp number of decoder threads */
+    if (pbi->decoding_thread_count > num_token_partitions - 1)
+        pbi->decoding_thread_count = num_token_partitions - 1;
+#endif
+}
+
+
+static void init_frame(VP8D_COMP *pbi)
+{
+    VP8_COMMON *const pc = & pbi->common;
+    MACROBLOCKD *const xd  = & pbi->mb;
+
+    if (pc->frame_type == KEY_FRAME)
+    {
+        /* Various keyframe initializations */
+        memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+
+        vp8_init_mbmode_probs(pc);
+
+        vp8_default_coef_probs(pc);
+
+        /* reset the segment feature data to 0 with delta coding (Default state). */
+        memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+        xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
+
+        /* reset the mode ref deltas for loop filter */
+        memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+        memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+        /* All buffers are implicitly updated on key frames. */
+        pc->refresh_golden_frame = 1;
+        pc->refresh_alt_ref_frame = 1;
+        pc->copy_buffer_to_gf = 0;
+        pc->copy_buffer_to_arf = 0;
+
+        /* Note that Golden and Altref modes cannot be used on a key frame so
+         * ref_frame_sign_bias[] is undefined and meaningless
+         */
+        pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+        pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+    }
+    else
+    {
+        /* To enable choice of different interpolation filters */
+        if (!pc->use_bilinear_mc_filter)
+        {
+            xd->subpixel_predict        = vp8_sixtap_predict4x4;
+            xd->subpixel_predict8x4     = vp8_sixtap_predict8x4;
+            xd->subpixel_predict8x8     = vp8_sixtap_predict8x8;
+            xd->subpixel_predict16x16   = vp8_sixtap_predict16x16;
+        }
+        else
+        {
+            xd->subpixel_predict        = vp8_bilinear_predict4x4;
+            xd->subpixel_predict8x4     = vp8_bilinear_predict8x4;
+            xd->subpixel_predict8x8     = vp8_bilinear_predict8x8;
+            xd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
+        }
+
+        if (pbi->decoded_key_frame && pbi->ec_enabled && !pbi->ec_active)
+            pbi->ec_active = 1;
+    }
+
+    xd->left_context = &pc->left_context;
+    xd->mode_info_context = pc->mi;
+    xd->frame_type = pc->frame_type;
+    xd->mode_info_context->mbmi.mode = DC_PRED;
+    xd->mode_info_stride = pc->mode_info_stride;
+    xd->corrupted = 0; /* init without corruption */
+
+    xd->fullpixel_mask = 0xffffffff;
+    if(pc->full_pixel)
+        xd->fullpixel_mask = 0xfffffff8;
+
+}
+
+int vp8_decode_frame(VP8D_COMP *pbi)
+{
+    vp8_reader *const bc = &pbi->mbc[8];
+    VP8_COMMON *const pc = &pbi->common;
+    MACROBLOCKD *const xd  = &pbi->mb;
+    const unsigned char *data = pbi->fragments.ptrs[0];
+    const unsigned int data_sz = pbi->fragments.sizes[0];
+    const unsigned char *data_end = data + data_sz;
+    ptrdiff_t first_partition_length_in_bytes;
+
+    int i, j, k, l;
+    const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
+    int corrupt_tokens = 0;
+    int prev_independent_partitions = pbi->independent_partitions;
+
+    YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+    /* start with no corruption of current frame */
+    xd->corrupted = 0;
+    yv12_fb_new->corrupted = 0;
+
+    if (data_end - data < 3)
+    {
+        if (!pbi->ec_active)
+        {
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated packet");
+        }
+
+        /* Declare the missing frame as an inter frame since it will
+           be handled as an inter frame when we have estimated its
+           motion vectors. */
+        pc->frame_type = INTER_FRAME;
+        pc->version = 0;
+        pc->show_frame = 1;
+        first_partition_length_in_bytes = 0;
+    }
+    else
+    {
+        unsigned char clear_buffer[10];
+        const unsigned char *clear = data;
+        if (pbi->decrypt_cb)
+        {
+            int n = (int)VPXMIN(sizeof(clear_buffer), data_sz);
+            pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
+            clear = clear_buffer;
+        }
+
+        pc->frame_type = (FRAME_TYPE)(clear[0] & 1);
+        pc->version = (clear[0] >> 1) & 7;
+        pc->show_frame = (clear[0] >> 4) & 1;
+        first_partition_length_in_bytes =
+            (clear[0] | (clear[1] << 8) | (clear[2] << 16)) >> 5;
+
+        if (!pbi->ec_active &&
+            (data + first_partition_length_in_bytes > data_end
+            || data + first_partition_length_in_bytes < data))
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                               "Truncated packet or corrupt partition 0 length");
+
+        data += 3;
+        clear += 3;
+
+        vp8_setup_version(pc);
+
+
+        if (pc->frame_type == KEY_FRAME)
+        {
+            /* vet via sync code */
+            /* When error concealment is enabled we should only check the sync
+             * code if we have enough bits available
+             */
+            if (!pbi->ec_active || data + 3 < data_end)
+            {
+                if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a)
+                    vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
+                                       "Invalid frame sync code");
+            }
+
+            /* If error concealment is enabled we should only parse the new size
+             * if we have enough data. Otherwise we will end up with the wrong
+             * size.
+             */
+            if (!pbi->ec_active || data + 6 < data_end)
+            {
+                pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff;
+                pc->horiz_scale = clear[4] >> 6;
+                pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff;
+                pc->vert_scale = clear[6] >> 6;
+            }
+            data += 7;
+        }
+        else
+        {
+            memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+            memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+        }
+    }
+    if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME))
+    {
+        return -1;
+    }
+
+    init_frame(pbi);
+
+    if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data),
+                           pbi->decrypt_cb, pbi->decrypt_state))
+        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate bool decoder 0");
+    if (pc->frame_type == KEY_FRAME) {
+        (void)vp8_read_bit(bc);  // colorspace
+        pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc);
+    }
+
+    /* Is segmentation enabled */
+    xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc);
+
+    if (xd->segmentation_enabled)
+    {
+        /* Signal whether or not the segmentation map is being explicitly updated this frame. */
+        xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc);
+        xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
+
+        if (xd->update_mb_segmentation_data)
+        {
+            xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
+
+            memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+
+            /* For each segmentation feature (Quant and loop filter level) */
+            for (i = 0; i < MB_LVL_MAX; i++)
+            {
+                for (j = 0; j < MAX_MB_SEGMENTS; j++)
+                {
+                    /* Frame level data */
+                    if (vp8_read_bit(bc))
+                    {
+                        xd->segment_feature_data[i][j] = (signed char)vp8_read_literal(bc, mb_feature_data_bits[i]);
+
+                        if (vp8_read_bit(bc))
+                            xd->segment_feature_data[i][j] = -xd->segment_feature_data[i][j];
+                    }
+                    else
+                        xd->segment_feature_data[i][j] = 0;
+                }
+            }
+        }
+
+        if (xd->update_mb_segmentation_map)
+        {
+            /* Which macro block level features are enabled */
+            memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+
+            /* Read the probs used to decode the segment id for each macro block. */
+            for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+            {
+                /* If not explicitly set value is defaulted to 255 by memset above */
+                if (vp8_read_bit(bc))
+                    xd->mb_segment_tree_probs[i] = (vp8_prob)vp8_read_literal(bc, 8);
+            }
+        }
+    }
+    else
+    {
+        /* No segmentation updates on this frame */
+        xd->update_mb_segmentation_map = 0;
+        xd->update_mb_segmentation_data = 0;
+    }
+
+    /* Read the loop filter level and type */
+    pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
+    pc->filter_level = vp8_read_literal(bc, 6);
+    pc->sharpness_level = vp8_read_literal(bc, 3);
+
+    /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
+    xd->mode_ref_lf_delta_update = 0;
+    xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc);
+
+    if (xd->mode_ref_lf_delta_enabled)
+    {
+        /* Do the deltas need to be updated */
+        xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc);
+
+        if (xd->mode_ref_lf_delta_update)
+        {
+            /* Send update */
+            for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+            {
+                if (vp8_read_bit(bc))
+                {
+                    /*sign = vp8_read_bit( bc );*/
+                    xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+                    if (vp8_read_bit(bc))        /* Apply sign */
+                        xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
+                }
+            }
+
+            /* Send update */
+            for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+            {
+                if (vp8_read_bit(bc))
+                {
+                    /*sign = vp8_read_bit( bc );*/
+                    xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+                    if (vp8_read_bit(bc))        /* Apply sign */
+                        xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
+                }
+            }
+        }
+    }
+
+    setup_token_decoder(pbi, data + first_partition_length_in_bytes);
+
+    xd->current_bc = &pbi->mbc[0];
+
+    /* Read the default quantizers. */
+    {
+        int Q, q_update;
+
+        Q = vp8_read_literal(bc, 7);  /* AC 1st order Q = default */
+        pc->base_qindex = Q;
+        q_update = 0;
+        pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update);
+        pc->y2dc_delta_q = get_delta_q(bc, pc->y2dc_delta_q, &q_update);
+        pc->y2ac_delta_q = get_delta_q(bc, pc->y2ac_delta_q, &q_update);
+        pc->uvdc_delta_q = get_delta_q(bc, pc->uvdc_delta_q, &q_update);
+        pc->uvac_delta_q = get_delta_q(bc, pc->uvac_delta_q, &q_update);
+
+        if (q_update)
+            vp8cx_init_de_quantizer(pbi);
+
+        /* MB level dequantizer setup */
+        vp8_mb_init_dequantizer(pbi, &pbi->mb);
+    }
+
+    /* Determine if the golden frame or ARF buffer should be updated and how.
+     * For all non key frames the GF and ARF refresh flags and sign bias
+     * flags must be set explicitly.
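+     * The 2-bit copy flags read here use 0 for no copy, 1 to copy from
+     * the last frame, and 2 to copy from the remaining one of the
+     * golden / alt-ref pair (see RFC 6386, section 9.7).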
+     */
+    if (pc->frame_type != KEY_FRAME)
+    {
+        /* Should the GF or ARF be updated from the current frame */
+        pc->refresh_golden_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't refresh golden if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->refresh_golden_frame = 0;
+#endif
+
+        pc->refresh_alt_ref_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't refresh altref if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->refresh_alt_ref_frame = 0;
+#endif
+
+        /* Buffer to buffer copy flags. */
+        pc->copy_buffer_to_gf = 0;
+
+        if (!pc->refresh_golden_frame)
+            pc->copy_buffer_to_gf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't copy to the golden if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->copy_buffer_to_gf = 0;
+#endif
+
+        pc->copy_buffer_to_arf = 0;
+
+        if (!pc->refresh_alt_ref_frame)
+            pc->copy_buffer_to_arf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't copy to the alt-ref if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->copy_buffer_to_arf = 0;
+#endif
+
+
+        pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc);
+        pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc);
+    }
+
+    pc->refresh_entropy_probs = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+    /* Assume we shouldn't refresh the probabilities if the bit is
+     * missing */
+    xd->corrupted |= vp8dx_bool_error(bc);
+    if (pbi->ec_active && xd->corrupted)
+        pc->refresh_entropy_probs = 0;
+#endif
+    if (pc->refresh_entropy_probs == 0)
+    {
+        memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+    }
+
+    pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc);
+
+#if CONFIG_ERROR_CONCEALMENT
+    /* Assume we should refresh the last frame if the bit is missing */
+    xd->corrupted |= vp8dx_bool_error(bc);
+    if (pbi->ec_active && xd->corrupted)
+        pc->refresh_last_frame = 1;
+#endif
+
+    if (0)
+    {
+        FILE *z = fopen("decodestats.stt", "a");
+        fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+                pc->current_video_frame,
+                pc->frame_type,
+                pc->refresh_golden_frame,
+                pc->refresh_alt_ref_frame,
+                pc->refresh_last_frame,
+                pc->base_qindex);
+        fclose(z);
+    }
+
+    {
+        pbi->independent_partitions = 1;
+
+        /* read coef probability tree */
+        for (i = 0; i < BLOCK_TYPES; i++)
+            for (j = 0; j < COEF_BANDS; j++)
+                for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+                    for (l = 0; l < ENTROPY_NODES; l++)
+                    {
+
+                        vp8_prob *const p = pc->fc.coef_probs [i][j][k] + l;
+
+                        if (vp8_read(bc, vp8_coef_update_probs [i][j][k][l]))
+                        {
+                            *p = (vp8_prob)vp8_read_literal(bc, 8);
+
+                        }
+                        if (k > 0 && *p != pc->fc.coef_probs[i][j][k-1][l])
+                            pbi->independent_partitions = 0;
+
+                    }
+    }
+
+    /* clear out the coeff buffer */
+    memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+    vp8_decode_mode_mvs(pbi);
+
+#if CONFIG_ERROR_CONCEALMENT
+    if (pbi->ec_active &&
+        pbi->mvs_corrupt_from_mb < (unsigned int)pc->mb_cols * pc->mb_rows)
+    {
+        /* Motion vectors are missing in this frame. We will try to estimate
+         * them and then continue decoding the frame as usual */
+        vp8_estimate_missing_mvs(pbi);
+    }
+#endif
+
+    memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+    pbi->frame_corrupt_residual = 0;
+
+#if CONFIG_MULTITHREAD
+    if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
+    {
+        unsigned int thread;
+        vp8mt_decode_mb_rows(pbi, xd);
+        vp8_yv12_extend_frame_borders(yv12_fb_new);
+        for (thread = 0; thread < pbi->decoding_thread_count; ++thread)
+            corrupt_tokens |= pbi->mb_row_di[thread].mbd.corrupted;
+    }
+    else
+#endif
+    {
+        decode_mb_rows(pbi);
+        corrupt_tokens |= xd->corrupted;
+    }
+
+    /* Collect information about decoder corruption. */
+    /* 1. Check first boolean decoder for errors. */
+    yv12_fb_new->corrupted = vp8dx_bool_error(bc);
+    /* 2. Check the macroblock information */
+    yv12_fb_new->corrupted |= corrupt_tokens;
+
+    if (!pbi->decoded_key_frame)
+    {
+        if (pc->frame_type == KEY_FRAME &&
+            !yv12_fb_new->corrupted)
+            pbi->decoded_key_frame = 1;
+        else
+            vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+                               "A stream must start with a complete key frame");
+    }
+
+    /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes \n",bc->pos+pbi->bc2.pos); */
+
+    if (pc->refresh_entropy_probs == 0)
+    {
+        memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+        pbi->independent_partitions = prev_independent_partitions;
+    }
+
+#ifdef PACKET_TESTING
+    {
+        FILE *f = fopen("decompressor.VP8", "ab");
+        unsigned int size = pbi->bc2.pos + pbi->bc.pos + 8;
+        fwrite((void *) &size, 4, 1, f);
+        fwrite((void *) pbi->Source, size, 1, f);
+        fclose(f);
+    }
+#endif
+
+    return 0;
+}
diff --git a/thirdparty/libvpx/vp8/decoder/decodemv.c b/thirdparty/libvpx/vp8/decoder/decodemv.c
new file mode 100644
index 000000000000..1d155e7e16da
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/decodemv.c
@@ -0,0 +1,670 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treereader.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropymode.h"
+#include "onyxd_int.h"
+#include "vp8/common/findnearmv.h"
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+static B_PREDICTION_MODE read_bmode(vp8_reader *bc, const vp8_prob *p)
+{
+    const int i = vp8_treed_read(bc, vp8_bmode_tree, p);
+
+    return (B_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+    const int i = vp8_treed_read(bc, vp8_ymode_tree, p);
+
+    return (MB_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_kf_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+    const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p);
+
+    return (MB_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_uv_mode(vp8_reader *bc, const vp8_prob *p)
+{
+    const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p);
+
+    return (MB_PREDICTION_MODE)i;
+}
+
+static void read_kf_modes(VP8D_COMP *pbi, MODE_INFO *mi)
+{
+    vp8_reader *const bc = & pbi->mbc[8];
+    const int mis = pbi->common.mode_info_stride;
+
+    mi->mbmi.ref_frame = INTRA_FRAME;
+    mi->mbmi.mode = read_kf_ymode(bc, vp8_kf_ymode_prob);
+
+    if (mi->mbmi.mode == B_PRED)
+    {
+        int i = 0;
+        mi->mbmi.is_4x4 = 1;
+
+        do
+        {
+            const B_PREDICTION_MODE A = above_block_mode(mi, i, mis);
+            const B_PREDICTION_MODE L = left_block_mode(mi, i);
+
+            mi->bmi[i].as_mode =
+                read_bmode(bc, vp8_kf_bmode_prob [A] [L]);
+        }
+        while (++i < 16);
+    }
+
+    mi->mbmi.uv_mode = read_uv_mode(bc, vp8_kf_uv_mode_prob);
+}
+
+static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc)
+{
+    const vp8_prob *const p = (const vp8_prob *) mvc;
+    int x = 0;
+
+    if (vp8_read(r, p [mvpis_short])) /* Large */
+    {
+        int i = 0;
+
+        do
+        {
+            x += vp8_read(r, p [MVPbits + i]) << i;
+        }
+        while (++i < 3);
+
+        i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+        do
+        {
+            x += vp8_read(r, p [MVPbits + i]) << i;
+        }
+        while (--i > 3);
+
+        if (!(x & 0xFFF0) || vp8_read(r, p [MVPbits + 3]))
+            x += 8;
+    }
+    else /* small */
+        x = vp8_treed_read(r, vp8_small_mvtree, p + MVPshort);
+
+    if (x && vp8_read(r, p [MVPsign]))
+        x = -x;
+
+    return x;
+}
+
+static void read_mv(vp8_reader *r, MV *mv, const MV_CONTEXT *mvc)
+{
+    mv->row = (short)(read_mvcomponent(r, mvc) * 2);
+    mv->col = (short)(read_mvcomponent(r, ++mvc) * 2);
+}
+
+
+static void read_mvcontexts(vp8_reader *bc, MV_CONTEXT *mvc)
+{
+    int i = 0;
+
+    do
+    {
+        const vp8_prob *up = vp8_mv_update_probs[i].prob;
+        vp8_prob *p = (vp8_prob *)(mvc + i);
+        vp8_prob *const pstop = p + MVPcount;
+
+        do
+        {
+            if (vp8_read(bc, *up++))
+            {
+                const vp8_prob x = (vp8_prob)vp8_read_literal(bc, 7);
+
+                *p = x ? x << 1 : 1;
+            }
+        }
+        while (++p < pstop);
+    }
+    while (++i < 2);
+}
+
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_offset[4][16] = {
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
+    { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15},
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
+
+
+static void mb_mode_mv_init(VP8D_COMP *pbi)
+{
+    vp8_reader *const bc = & pbi->mbc[8];
+    MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+
+#if CONFIG_ERROR_CONCEALMENT
+    /* Default is that no macroblock is corrupt, therefore we initialize
+     * mvs_corrupt_from_mb to something very big, which we can be sure is
+     * outside the frame.
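+     * (UINT_MAX here; any real in-frame index, computed as
+     * mb_row * mb_cols + mb_col, is necessarily smaller.)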
*/ + pbi->mvs_corrupt_from_mb = UINT_MAX; +#endif + /* Read the mb_no_coeff_skip flag */ + pbi->common.mb_no_coeff_skip = (int)vp8_read_bit(bc); + + pbi->prob_skip_false = 0; + if (pbi->common.mb_no_coeff_skip) + pbi->prob_skip_false = (vp8_prob)vp8_read_literal(bc, 8); + + if(pbi->common.frame_type != KEY_FRAME) + { + pbi->prob_intra = (vp8_prob)vp8_read_literal(bc, 8); + pbi->prob_last = (vp8_prob)vp8_read_literal(bc, 8); + pbi->prob_gf = (vp8_prob)vp8_read_literal(bc, 8); + + if (vp8_read_bit(bc)) + { + int i = 0; + + do + { + pbi->common.fc.ymode_prob[i] = + (vp8_prob) vp8_read_literal(bc, 8); + } + while (++i < 4); + } + + if (vp8_read_bit(bc)) + { + int i = 0; + + do + { + pbi->common.fc.uv_mode_prob[i] = + (vp8_prob) vp8_read_literal(bc, 8); + } + while (++i < 3); + } + + read_mvcontexts(bc, mvc); + } +} + +const vp8_prob vp8_sub_mv_ref_prob3 [8][VP8_SUBMVREFS-1] = +{ + { 147, 136, 18 }, /* SUBMVREF_NORMAL */ + { 223, 1 , 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */ + { 106, 145, 1 }, /* SUBMVREF_LEFT_ZED */ + { 208, 1 , 1 }, /* SUBMVREF_LEFT_ABOVE_ZED */ + { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */ + { 223, 1 , 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */ + { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */ + { 208, 1 , 1 } /* SUBMVREF_LEFT_ABOVE_ZED */ +}; + +static +const vp8_prob * get_sub_mv_ref_prob(const int left, const int above) +{ + int lez = (left == 0); + int aez = (above == 0); + int lea = (left == above); + const vp8_prob * prob; + + prob = vp8_sub_mv_ref_prob3[(aez << 2) | + (lez << 1) | + (lea)]; + + return prob; +} + +static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi, + const MODE_INFO *left_mb, const MODE_INFO *above_mb, + MB_MODE_INFO *mbmi, int_mv best_mv, + MV_CONTEXT *const mvc, int mb_to_left_edge, + int mb_to_right_edge, int mb_to_top_edge, + int mb_to_bottom_edge) +{ + int s; /* split configuration (16x8, 8x16, 8x8, 4x4) */ + int num_p; /* number of partitions in the split configuration + (see vp8_mbsplit_count) */ + int j = 0; + + s = 3; + num_p = 16; + if( vp8_read(bc, 110) ) + { + s = 2; + num_p = 4; + if( vp8_read(bc, 111) ) + { + s = vp8_read(bc, 150); + num_p = 2; + } + } + + do /* for each subset j */ + { + int_mv leftmv, abovemv; + int_mv blockmv; + int k; /* first block in subset j */ + + const vp8_prob *prob; + k = vp8_mbsplit_offset[s][j]; + + if (!(k & 3)) + { + /* On L edge, get from MB to left of us */ + if(left_mb->mbmi.mode != SPLITMV) + leftmv.as_int = left_mb->mbmi.mv.as_int; + else + leftmv.as_int = (left_mb->bmi + k + 4 - 1)->mv.as_int; + } + else + leftmv.as_int = (mi->bmi + k - 1)->mv.as_int; + + if (!(k >> 2)) + { + /* On top edge, get from MB above us */ + if(above_mb->mbmi.mode != SPLITMV) + abovemv.as_int = above_mb->mbmi.mv.as_int; + else + abovemv.as_int = (above_mb->bmi + k + 16 - 4)->mv.as_int; + } + else + abovemv.as_int = (mi->bmi + k - 4)->mv.as_int; + + prob = get_sub_mv_ref_prob(leftmv.as_int, abovemv.as_int); + + if( vp8_read(bc, prob[0]) ) + { + if( vp8_read(bc, prob[1]) ) + { + blockmv.as_int = 0; + if( vp8_read(bc, prob[2]) ) + { + blockmv.as_mv.row = read_mvcomponent(bc, &mvc[0]) * 2; + blockmv.as_mv.row += best_mv.as_mv.row; + blockmv.as_mv.col = read_mvcomponent(bc, &mvc[1]) * 2; + blockmv.as_mv.col += best_mv.as_mv.col; + } + } + else + { + blockmv.as_int = abovemv.as_int; + } + } + else + { + blockmv.as_int = leftmv.as_int; + } + + mbmi->need_to_clamp_mvs |= vp8_check_mv_bounds(&blockmv, + mb_to_left_edge, + mb_to_right_edge, + mb_to_top_edge, + mb_to_bottom_edge); + + { + /* Fill (uniform) modes, mvs of jth subset. 
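+             (For example, with the 16x8 split, s == 0, each of the two
+             partitions copies its motion vector into 8 of the 16 4x4
+             blocks, as laid out by mbsplit_fill_count/offset above.)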
+ Must do it here because ensuing subsets can + refer back to us via "left" or "above". */ + const unsigned char *fill_offset; + unsigned int fill_count = mbsplit_fill_count[s]; + + fill_offset = &mbsplit_fill_offset[s] + [(unsigned char)j * mbsplit_fill_count[s]]; + + do { + mi->bmi[ *fill_offset].mv.as_int = blockmv.as_int; + fill_offset++; + }while (--fill_count); + } + + } + while (++j < num_p); + + mbmi->partitioning = s; +} + +static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi) +{ + vp8_reader *const bc = & pbi->mbc[8]; + mbmi->ref_frame = (MV_REFERENCE_FRAME) vp8_read(bc, pbi->prob_intra); + if (mbmi->ref_frame) /* inter MB */ + { + enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV}; + int cnt[4]; + int *cntx = cnt; + int_mv near_mvs[4]; + int_mv *nmv = near_mvs; + const int mis = pbi->mb.mode_info_stride; + const MODE_INFO *above = mi - mis; + const MODE_INFO *left = mi - 1; + const MODE_INFO *aboveleft = above - 1; + int *ref_frame_sign_bias = pbi->common.ref_frame_sign_bias; + + mbmi->need_to_clamp_mvs = 0; + + if (vp8_read(bc, pbi->prob_last)) + { + mbmi->ref_frame = + (MV_REFERENCE_FRAME)((int)(2 + vp8_read(bc, pbi->prob_gf))); + } + + /* Zero accumulators */ + nmv[0].as_int = nmv[1].as_int = nmv[2].as_int = 0; + cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0; + + /* Process above */ + if (above->mbmi.ref_frame != INTRA_FRAME) + { + if (above->mbmi.mv.as_int) + { + (++nmv)->as_int = above->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], + mbmi->ref_frame, nmv, ref_frame_sign_bias); + ++cntx; + } + + *cntx += 2; + } + + /* Process left */ + if (left->mbmi.ref_frame != INTRA_FRAME) + { + if (left->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = left->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], + mbmi->ref_frame, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != nmv->as_int) + { + (++nmv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 2; + } + else + cnt[CNT_INTRA] += 2; + } + + /* Process above left */ + if (aboveleft->mbmi.ref_frame != INTRA_FRAME) + { + if (aboveleft->mbmi.mv.as_int) + { + int_mv this_mv; + + this_mv.as_int = aboveleft->mbmi.mv.as_int; + mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], + mbmi->ref_frame, &this_mv, ref_frame_sign_bias); + + if (this_mv.as_int != nmv->as_int) + { + (++nmv)->as_int = this_mv.as_int; + ++cntx; + } + + *cntx += 1; + } + else + cnt[CNT_INTRA] += 1; + } + + if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_INTRA]] [0]) ) + { + + /* If we have three distinct MV's ... 
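+               (that is, the above, left and above-left neighbours supplied
+               three different non-zero vectors)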
*/ + /* See if above-left MV can be merged with NEAREST */ + cnt[CNT_NEAREST] += ( (cnt[CNT_SPLITMV] > 0) & + (nmv->as_int == near_mvs[CNT_NEAREST].as_int)); + + /* Swap near and nearest if necessary */ + if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) + { + int tmp; + tmp = cnt[CNT_NEAREST]; + cnt[CNT_NEAREST] = cnt[CNT_NEAR]; + cnt[CNT_NEAR] = tmp; + tmp = near_mvs[CNT_NEAREST].as_int; + near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; + near_mvs[CNT_NEAR].as_int = tmp; + } + + if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAREST]] [1]) ) + { + + if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAR]] [2]) ) + { + int mb_to_top_edge; + int mb_to_bottom_edge; + int mb_to_left_edge; + int mb_to_right_edge; + MV_CONTEXT *const mvc = pbi->common.fc.mvc; + int near_index; + + mb_to_top_edge = pbi->mb.mb_to_top_edge; + mb_to_bottom_edge = pbi->mb.mb_to_bottom_edge; + mb_to_top_edge -= LEFT_TOP_MARGIN; + mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN; + mb_to_right_edge = pbi->mb.mb_to_right_edge; + mb_to_right_edge += RIGHT_BOTTOM_MARGIN; + mb_to_left_edge = pbi->mb.mb_to_left_edge; + mb_to_left_edge -= LEFT_TOP_MARGIN; + + /* Use near_mvs[0] to store the "best" MV */ + near_index = CNT_INTRA + + (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]); + + vp8_clamp_mv2(&near_mvs[near_index], &pbi->mb); + + cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV) + + (left->mbmi.mode == SPLITMV)) * 2 + + (aboveleft->mbmi.mode == SPLITMV); + + if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_SPLITMV]] [3]) ) + { + decode_split_mv(bc, mi, left, above, + mbmi, + near_mvs[near_index], + mvc, mb_to_left_edge, + mb_to_right_edge, + mb_to_top_edge, + mb_to_bottom_edge); + mbmi->mv.as_int = mi->bmi[15].mv.as_int; + mbmi->mode = SPLITMV; + mbmi->is_4x4 = 1; + } + else + { + int_mv *const mbmi_mv = & mbmi->mv; + read_mv(bc, &mbmi_mv->as_mv, (const MV_CONTEXT *) mvc); + mbmi_mv->as_mv.row += near_mvs[near_index].as_mv.row; + mbmi_mv->as_mv.col += near_mvs[near_index].as_mv.col; + + /* Don't need to check this on NEARMV and NEARESTMV + * modes since those modes clamp the MV. The NEWMV mode + * does not, so signal to the prediction stage whether + * special handling may be required. 
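+                         * (Concretely, vp8_check_mv_bounds() flags the
+                         * assembled MV when it falls outside the
+                         * margin-extended mb_to_*_edge limits computed
+                         * above.)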
+ */ + mbmi->need_to_clamp_mvs = + vp8_check_mv_bounds(mbmi_mv, mb_to_left_edge, + mb_to_right_edge, + mb_to_top_edge, + mb_to_bottom_edge); + mbmi->mode = NEWMV; + } + } + else + { + mbmi->mode = NEARMV; + mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int; + vp8_clamp_mv2(&mbmi->mv, &pbi->mb); + } + } + else + { + mbmi->mode = NEARESTMV; + mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int; + vp8_clamp_mv2(&mbmi->mv, &pbi->mb); + } + } + else + { + mbmi->mode = ZEROMV; + mbmi->mv.as_int = 0; + } + +#if CONFIG_ERROR_CONCEALMENT + if(pbi->ec_enabled && (mbmi->mode != SPLITMV)) + { + mi->bmi[ 0].mv.as_int = + mi->bmi[ 1].mv.as_int = + mi->bmi[ 2].mv.as_int = + mi->bmi[ 3].mv.as_int = + mi->bmi[ 4].mv.as_int = + mi->bmi[ 5].mv.as_int = + mi->bmi[ 6].mv.as_int = + mi->bmi[ 7].mv.as_int = + mi->bmi[ 8].mv.as_int = + mi->bmi[ 9].mv.as_int = + mi->bmi[10].mv.as_int = + mi->bmi[11].mv.as_int = + mi->bmi[12].mv.as_int = + mi->bmi[13].mv.as_int = + mi->bmi[14].mv.as_int = + mi->bmi[15].mv.as_int = mbmi->mv.as_int; + } +#endif + } + else + { + /* required for left and above block mv */ + mbmi->mv.as_int = 0; + + /* MB is intra coded */ + if ((mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob)) == B_PRED) + { + int j = 0; + mbmi->is_4x4 = 1; + do + { + mi->bmi[j].as_mode = read_bmode(bc, pbi->common.fc.bmode_prob); + } + while (++j < 16); + } + + mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob); + } + +} + +static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x) +{ + /* Is segmentation enabled */ + if (x->segmentation_enabled && x->update_mb_segmentation_map) + { + /* If so then read the segment id. */ + if (vp8_read(r, x->mb_segment_tree_probs[0])) + mi->segment_id = + (unsigned char)(2 + vp8_read(r, x->mb_segment_tree_probs[2])); + else + mi->segment_id = + (unsigned char)(vp8_read(r, x->mb_segment_tree_probs[1])); + } +} + +static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi, + MB_MODE_INFO *mbmi) +{ + (void)mbmi; + + /* Read the Macroblock segmentation map if it is being updated explicitly + * this frame (reset to 0 above by default) + * By default on a key frame reset all MBs to segment 0 + */ + if (pbi->mb.update_mb_segmentation_map) + read_mb_features(&pbi->mbc[8], &mi->mbmi, &pbi->mb); + else if(pbi->common.frame_type == KEY_FRAME) + mi->mbmi.segment_id = 0; + + /* Read the macroblock coeff skip flag if this feature is in use, + * else default to 0 */ + if (pbi->common.mb_no_coeff_skip) + mi->mbmi.mb_skip_coeff = vp8_read(&pbi->mbc[8], pbi->prob_skip_false); + else + mi->mbmi.mb_skip_coeff = 0; + + mi->mbmi.is_4x4 = 0; + if(pbi->common.frame_type == KEY_FRAME) + read_kf_modes(pbi, mi); + else + read_mb_modes_mv(pbi, mi, &mi->mbmi); + +} + +void vp8_decode_mode_mvs(VP8D_COMP *pbi) +{ + MODE_INFO *mi = pbi->common.mi; + int mb_row = -1; + int mb_to_right_edge_start; + + mb_mode_mv_init(pbi); + + pbi->mb.mb_to_top_edge = 0; + pbi->mb.mb_to_bottom_edge = ((pbi->common.mb_rows - 1) * 16) << 3; + mb_to_right_edge_start = ((pbi->common.mb_cols - 1) * 16) << 3; + + while (++mb_row < pbi->common.mb_rows) + { + int mb_col = -1; + + pbi->mb.mb_to_left_edge = 0; + pbi->mb.mb_to_right_edge = mb_to_right_edge_start; + + while (++mb_col < pbi->common.mb_cols) + { +#if CONFIG_ERROR_CONCEALMENT + int mb_num = mb_row * pbi->common.mb_cols + mb_col; +#endif + + decode_mb_mode_mvs(pbi, mi, &mi->mbmi); + +#if CONFIG_ERROR_CONCEALMENT + /* look for corruption. set mvs_corrupt_from_mb to the current + * mb_num if the frame is corrupt from this macroblock. 
*/ + if (vp8dx_bool_error(&pbi->mbc[8]) && mb_num < + (int)pbi->mvs_corrupt_from_mb) + { + pbi->mvs_corrupt_from_mb = mb_num; + /* no need to continue since the partition is corrupt from + * here on. + */ + return; + } +#endif + + pbi->mb.mb_to_left_edge -= (16 << 3); + pbi->mb.mb_to_right_edge -= (16 << 3); + mi++; /* next macroblock */ + } + pbi->mb.mb_to_top_edge -= (16 << 3); + pbi->mb.mb_to_bottom_edge -= (16 << 3); + + mi++; /* skip left predictor each row */ + } +} diff --git a/thirdparty/libvpx/vp8/decoder/decodemv.h b/thirdparty/libvpx/vp8/decoder/decodemv.h new file mode 100644 index 000000000000..f33b07351d3c --- /dev/null +++ b/thirdparty/libvpx/vp8/decoder/decodemv.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_DECODER_DECODEMV_H_ +#define VP8_DECODER_DECODEMV_H_ + +#include "onyxd_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_decode_mode_mvs(VP8D_COMP *); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DECODEMV_H_ diff --git a/thirdparty/libvpx/vp8/decoder/decoderthreading.h b/thirdparty/libvpx/vp8/decoder/decoderthreading.h new file mode 100644 index 000000000000..c563cf6e93aa --- /dev/null +++ b/thirdparty/libvpx/vp8/decoder/decoderthreading.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_DECODER_DECODERTHREADING_H_ +#define VP8_DECODER_DECODERTHREADING_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_MULTITHREAD +void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); +void vp8_decoder_remove_threads(VP8D_COMP *pbi); +void vp8_decoder_create_threads(VP8D_COMP *pbi); +void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows); +void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DECODERTHREADING_H_ diff --git a/thirdparty/libvpx/vp8/decoder/detokenize.c b/thirdparty/libvpx/vp8/decoder/detokenize.c new file mode 100644 index 000000000000..fcc7533c50f5 --- /dev/null +++ b/thirdparty/libvpx/vp8/decoder/detokenize.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vp8/common/blockd.h" +#include "onyxd_int.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "detokenize.h" + +void vp8_reset_mb_tokens_context(MACROBLOCKD *x) +{ + ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context); + ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context); + + memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1); + memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1); + + /* Clear entropy contexts for Y2 blocks */ + if (!x->mode_info_context->mbmi.is_4x4) + { + a_ctx[8] = l_ctx[8] = 0; + } +} + +/* + ------------------------------------------------------------------------------ + Residual decoding (Paragraph 13.2 / 13.3) +*/ +static const uint8_t kBands[16 + 1] = { + 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, + 0 /* extra entry as sentinel */ +}; + +static const uint8_t kCat3[] = { 173, 148, 140, 0 }; +static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 }; +static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 }; +static const uint8_t kCat6[] = + { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 }; +static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 }; +static const uint8_t kZigzag[16] = { + 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 +}; + +#define VP8GetBit vp8dx_decode_bool +#define NUM_PROBAS 11 +#define NUM_CTX 3 + +/* for const-casting */ +typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS]; + +static int GetSigned(BOOL_DECODER *br, int value_to_sign) +{ + int split = (br->range + 1) >> 1; + VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); + int v; + + if(br->count < 0) + vp8dx_bool_decoder_fill(br); + + if ( br->value < bigsplit ) + { + br->range = split; + v= value_to_sign; + } + else + { + br->range = br->range-split; + br->value = br->value-bigsplit; + v = -value_to_sign; + } + br->range +=br->range; + br->value +=br->value; + br->count--; + + return v; +} +/* + Returns the position of the last non-zero coeff plus one + (and 0 if there's no coeff at all) +*/ +static int GetCoeffs(BOOL_DECODER *br, ProbaArray prob, + int ctx, int n, int16_t* out) +{ + const uint8_t* p = prob[n][ctx]; + if (!VP8GetBit(br, p[0])) + { /* first EOB is more a 'CBP' bit. 
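+         It signals whether the block carries any coefficients at all,
+         much like a coded-block-pattern flag.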
*/ + return 0; + } + while (1) + { + ++n; + if (!VP8GetBit(br, p[1])) + { + p = prob[kBands[n]][0]; + } + else + { /* non zero coeff */ + int v, j; + if (!VP8GetBit(br, p[2])) + { + p = prob[kBands[n]][1]; + v = 1; + } + else + { + if (!VP8GetBit(br, p[3])) + { + if (!VP8GetBit(br, p[4])) + { + v = 2; + } + else + { + v = 3 + VP8GetBit(br, p[5]); + } + } + else + { + if (!VP8GetBit(br, p[6])) + { + if (!VP8GetBit(br, p[7])) + { + v = 5 + VP8GetBit(br, 159); + } else + { + v = 7 + 2 * VP8GetBit(br, 165); + v += VP8GetBit(br, 145); + } + } + else + { + const uint8_t* tab; + const int bit1 = VP8GetBit(br, p[8]); + const int bit0 = VP8GetBit(br, p[9 + bit1]); + const int cat = 2 * bit1 + bit0; + v = 0; + for (tab = kCat3456[cat]; *tab; ++tab) + { + v += v + VP8GetBit(br, *tab); + } + v += 3 + (8 << cat); + } + } + p = prob[kBands[n]][2]; + } + j = kZigzag[n - 1]; + + out[j] = GetSigned(br, v); + + if (n == 16 || !VP8GetBit(br, p[0])) + { /* EOB */ + return n; + } + } + if (n == 16) + { + return 16; + } + } +} + +int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) +{ + BOOL_DECODER *bc = x->current_bc; + const FRAME_CONTEXT * const fc = &dx->common.fc; + char *eobs = x->eobs; + + int i; + int nonzeros; + int eobtotal = 0; + + short *qcoeff_ptr; + ProbaArray coef_probs; + ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context); + ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context); + ENTROPY_CONTEXT *a; + ENTROPY_CONTEXT *l; + int skip_dc = 0; + + qcoeff_ptr = &x->qcoeff[0]; + + if (!x->mode_info_context->mbmi.is_4x4) + { + a = a_ctx + 8; + l = l_ctx + 8; + + coef_probs = fc->coef_probs [1]; + + nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr + 24 * 16); + *a = *l = (nonzeros > 0); + + eobs[24] = nonzeros; + eobtotal += nonzeros - 16; + + coef_probs = fc->coef_probs [0]; + skip_dc = 1; + } + else + { + coef_probs = fc->coef_probs [3]; + skip_dc = 0; + } + + for (i = 0; i < 16; ++i) + { + a = a_ctx + (i&3); + l = l_ctx + ((i&0xc)>>2); + + nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), skip_dc, qcoeff_ptr); + *a = *l = (nonzeros > 0); + + nonzeros += skip_dc; + eobs[i] = nonzeros; + eobtotal += nonzeros; + qcoeff_ptr += 16; + } + + coef_probs = fc->coef_probs [2]; + + a_ctx += 4; + l_ctx += 4; + for (i = 16; i < 24; ++i) + { + a = a_ctx + ((i > 19)<<1) + (i&1); + l = l_ctx + ((i > 19)<<1) + ((i&3)>1); + + nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr); + *a = *l = (nonzeros > 0); + + eobs[i] = nonzeros; + eobtotal += nonzeros; + qcoeff_ptr += 16; + } + + return eobtotal; +} + diff --git a/thirdparty/libvpx/vp8/decoder/detokenize.h b/thirdparty/libvpx/vp8/decoder/detokenize.h new file mode 100644 index 000000000000..f0b125444f03 --- /dev/null +++ b/thirdparty/libvpx/vp8/decoder/detokenize.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VP8_DECODER_DETOKENIZE_H_
+#define VP8_DECODER_DETOKENIZE_H_
+
+#include "onyxd_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
+int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_DECODER_DETOKENIZE_H_
diff --git a/thirdparty/libvpx/vp8/decoder/onyxd_if.c b/thirdparty/libvpx/vp8/decoder/onyxd_if.c
new file mode 100644
index 000000000000..3468268a2a15
--- /dev/null
+++ b/thirdparty/libvpx/vp8/decoder/onyxd_if.c
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#if CONFIG_POSTPROC
+#include "vp8/common/postproc.h"
+#endif
+#include "vp8/common/onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include <stdio.h>
+#include <assert.h>
+
+#include "vp8/common/quant_common.h"
+#include "vp8/common/reconintra.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vp8/common/systemdependent.h"
+#include "vpx_ports/vpx_once.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern void vp8_init_loop_filter(VP8_COMMON *cm);
+extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+static int get_free_fb (VP8_COMMON *cm);
+static void ref_cnt_fb (int *buf, int *idx, int new_idx);
+
+static void initialize_dec(void) {
+    static volatile int init_done = 0;
+
+    if (!init_done)
+    {
+        vpx_dsp_rtcd();
+        vp8_init_intra_predictors();
+        init_done = 1;
+    }
+}
+
+static void remove_decompressor(VP8D_COMP *pbi)
+{
+#if CONFIG_ERROR_CONCEALMENT
+    vp8_de_alloc_overlap_lists(pbi);
+#endif
+    vp8_remove_common(&pbi->common);
+    vpx_free(pbi);
+}
+
+static struct VP8D_COMP * create_decompressor(VP8D_CONFIG *oxcf)
+{
+    VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
+
+    if (!pbi)
+        return NULL;
+
+    memset(pbi, 0, sizeof(VP8D_COMP));
+
+    if (setjmp(pbi->common.error.jmp))
+    {
+        pbi->common.error.setjmp = 0;
+        remove_decompressor(pbi);
+        return 0;
+    }
+
+    pbi->common.error.setjmp = 1;
+
+    vp8_create_common(&pbi->common);
+
+    pbi->common.current_video_frame = 0;
+    pbi->ready_for_new_data = 1;
+
+    /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
+     * unnecessary calling of vp8cx_init_de_quantizer() for every frame.
+     */
+    vp8cx_init_de_quantizer(pbi);
+
+    vp8_loop_filter_init(&pbi->common);
+
+    pbi->common.error.setjmp = 0;
+
+#if CONFIG_ERROR_CONCEALMENT
+    pbi->ec_enabled = oxcf->error_concealment;
+    pbi->overlaps = NULL;
+#else
+    (void)oxcf;
+    pbi->ec_enabled = 0;
+#endif
+    /* Error concealment is activated after a key frame has been
+     * decoded without errors when error concealment is enabled.
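+     * (That is why ec_active just below starts at 0 even when
+     * ec_enabled is set.)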
+ */ + pbi->ec_active = 0; + + pbi->decoded_key_frame = 0; + + /* Independent partitions is activated when a frame updates the + * token probability table to have equal probabilities over the + * PREV_COEF context. + */ + pbi->independent_partitions = 0; + + vp8_setup_block_dptrs(&pbi->mb); + + once(initialize_dec); + + return pbi; +} + +vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd) +{ + VP8_COMMON *cm = &pbi->common; + int ref_fb_idx; + + if (ref_frame_flag == VP8_LAST_FRAME) + ref_fb_idx = cm->lst_fb_idx; + else if (ref_frame_flag == VP8_GOLD_FRAME) + ref_fb_idx = cm->gld_fb_idx; + else if (ref_frame_flag == VP8_ALTR_FRAME) + ref_fb_idx = cm->alt_fb_idx; + else{ + vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + "Invalid reference frame"); + return pbi->common.error.error_code; + } + + if(cm->yv12_fb[ref_fb_idx].y_height != sd->y_height || + cm->yv12_fb[ref_fb_idx].y_width != sd->y_width || + cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height || + cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width){ + vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + } + else + vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); + + return pbi->common.error.error_code; +} + + +vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd) +{ + VP8_COMMON *cm = &pbi->common; + int *ref_fb_ptr = NULL; + int free_fb; + + if (ref_frame_flag == VP8_LAST_FRAME) + ref_fb_ptr = &cm->lst_fb_idx; + else if (ref_frame_flag == VP8_GOLD_FRAME) + ref_fb_ptr = &cm->gld_fb_idx; + else if (ref_frame_flag == VP8_ALTR_FRAME) + ref_fb_ptr = &cm->alt_fb_idx; + else{ + vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + "Invalid reference frame"); + return pbi->common.error.error_code; + } + + if(cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height || + cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width || + cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height || + cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width){ + vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + } + else{ + /* Find an empty frame buffer. */ + free_fb = get_free_fb(cm); + /* Decrease fb_idx_ref_cnt since it will be increased again in + * ref_cnt_fb() below. */ + cm->fb_idx_ref_cnt[free_fb]--; + + /* Manage the reference counters and copy image. */ + ref_cnt_fb (cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb); + vp8_yv12_copy_frame(sd, &cm->yv12_fb[*ref_fb_ptr]); + } + + return pbi->common.error.error_code; +} + +static int get_free_fb (VP8_COMMON *cm) +{ + int i; + for (i = 0; i < NUM_YV12_BUFFERS; i++) + if (cm->fb_idx_ref_cnt[i] == 0) + break; + + assert(i < NUM_YV12_BUFFERS); + cm->fb_idx_ref_cnt[i] = 1; + return i; +} + +static void ref_cnt_fb (int *buf, int *idx, int new_idx) +{ + if (buf[*idx] > 0) + buf[*idx]--; + + *idx = new_idx; + + buf[new_idx]++; +} + +/* If any buffer copy / swapping is signalled it should be done here. */ +static int swap_frame_buffers (VP8_COMMON *cm) +{ + int err = 0; + + /* The alternate reference frame or golden frame can be updated + * using the new, last, or golden/alt ref frame. If it + * is updated using the newly decoded frame it is a refresh. + * An update using the last or golden/alt ref frame is a copy. 
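+     * (Concretely, in the code below copy_buffer_to_arf == 1 copies from
+     * the last frame and == 2 from the golden frame, while
+     * copy_buffer_to_gf == 1 copies from the last frame and == 2 from
+     * the alt-ref frame.)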
+ */ + if (cm->copy_buffer_to_arf) + { + int new_fb = 0; + + if (cm->copy_buffer_to_arf == 1) + new_fb = cm->lst_fb_idx; + else if (cm->copy_buffer_to_arf == 2) + new_fb = cm->gld_fb_idx; + else + err = -1; + + ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb); + } + + if (cm->copy_buffer_to_gf) + { + int new_fb = 0; + + if (cm->copy_buffer_to_gf == 1) + new_fb = cm->lst_fb_idx; + else if (cm->copy_buffer_to_gf == 2) + new_fb = cm->alt_fb_idx; + else + err = -1; + + ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb); + } + + if (cm->refresh_golden_frame) + ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx); + + if (cm->refresh_alt_ref_frame) + ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx); + + if (cm->refresh_last_frame) + { + ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx); + + cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx]; + } + else + cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + + cm->fb_idx_ref_cnt[cm->new_fb_idx]--; + + return err; +} + +static int check_fragments_for_errors(VP8D_COMP *pbi) +{ + if (!pbi->ec_active && + pbi->fragments.count <= 1 && pbi->fragments.sizes[0] == 0) + { + VP8_COMMON *cm = &pbi->common; + + /* If error concealment is disabled we won't signal missing frames + * to the decoder. + */ + if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1) + { + /* The last reference shares buffer with another reference + * buffer. Move it to its own buffer before setting it as + * corrupt, otherwise we will make multiple buffers corrupt. + */ + const int prev_idx = cm->lst_fb_idx; + cm->fb_idx_ref_cnt[prev_idx]--; + cm->lst_fb_idx = get_free_fb(cm); + vp8_yv12_copy_frame(&cm->yv12_fb[prev_idx], + &cm->yv12_fb[cm->lst_fb_idx]); + } + /* This is used to signal that we are missing frames. + * We do not know if the missing frame(s) was supposed to update + * any of the reference buffers, but we act conservative and + * mark only the last buffer as corrupted. + */ + cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; + + /* Signal that we have no frame to show. */ + cm->show_frame = 0; + + /* Nothing more to do. */ + return 0; + } + + return 1; +} + +int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, + const uint8_t *source, + int64_t time_stamp) +{ + VP8_COMMON *cm = &pbi->common; + int retcode = -1; + (void)size; + (void)source; + + pbi->common.error.error_code = VPX_CODEC_OK; + + retcode = check_fragments_for_errors(pbi); + if(retcode <= 0) + return retcode; + + cm->new_fb_idx = get_free_fb (cm); + + /* setup reference frames for vp8_decode_frame */ + pbi->dec_fb_ref[INTRA_FRAME] = &cm->yv12_fb[cm->new_fb_idx]; + pbi->dec_fb_ref[LAST_FRAME] = &cm->yv12_fb[cm->lst_fb_idx]; + pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx]; + pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx]; + + if (setjmp(pbi->common.error.jmp)) + { + /* We do not know if the missing frame(s) was supposed to update + * any of the reference buffers, but we act conservative and + * mark only the last buffer as corrupted. 
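+         * (This branch is the longjmp target reached when
+         * vpx_internal_error() aborts decoding of the current frame.)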
+ */ + cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; + + if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) + cm->fb_idx_ref_cnt[cm->new_fb_idx]--; + + goto decode_exit; + } + + pbi->common.error.setjmp = 1; + + retcode = vp8_decode_frame(pbi); + + if (retcode < 0) + { + if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) + cm->fb_idx_ref_cnt[cm->new_fb_idx]--; + + pbi->common.error.error_code = VPX_CODEC_ERROR; + goto decode_exit; + } + + if (swap_frame_buffers (cm)) + { + pbi->common.error.error_code = VPX_CODEC_ERROR; + goto decode_exit; + } + + vp8_clear_system_state(); + + if (cm->show_frame) + { + cm->current_video_frame++; + cm->show_frame_mi = cm->mi; + } + + #if CONFIG_ERROR_CONCEALMENT + /* swap the mode infos to storage for future error concealment */ + if (pbi->ec_enabled && pbi->common.prev_mi) + { + MODE_INFO* tmp = pbi->common.prev_mi; + int row, col; + pbi->common.prev_mi = pbi->common.mi; + pbi->common.mi = tmp; + + /* Propagate the segment_ids to the next frame */ + for (row = 0; row < pbi->common.mb_rows; ++row) + { + for (col = 0; col < pbi->common.mb_cols; ++col) + { + const int i = row*pbi->common.mode_info_stride + col; + pbi->common.mi[i].mbmi.segment_id = + pbi->common.prev_mi[i].mbmi.segment_id; + } + } + } +#endif + + pbi->ready_for_new_data = 0; + pbi->last_time_stamp = time_stamp; + +decode_exit: + pbi->common.error.setjmp = 0; + vp8_clear_system_state(); + return retcode; +} +int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags) +{ + int ret = -1; + + if (pbi->ready_for_new_data == 1) + return ret; + + /* ie no raw frame to show!!! */ + if (pbi->common.show_frame == 0) + return ret; + + pbi->ready_for_new_data = 1; + *time_stamp = pbi->last_time_stamp; + *time_end_stamp = 0; + +#if CONFIG_POSTPROC + ret = vp8_post_proc_frame(&pbi->common, sd, flags); +#else + (void)flags; + + if (pbi->common.frame_to_show) + { + *sd = *pbi->common.frame_to_show; + sd->y_width = pbi->common.Width; + sd->y_height = pbi->common.Height; + sd->uv_height = pbi->common.Height / 2; + ret = 0; + } + else + { + ret = -1; + } + +#endif /*!CONFIG_POSTPROC*/ + vp8_clear_system_state(); + return ret; +} + + +/* This function as written isn't decoder specific, but the encoder has + * much faster ways of computing this, so it's ok for it to live in a + * decode specific file. 
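+ * (It simply walks every macroblock's mode_info looking for a matching
+ * ref_frame.)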
+ */ +int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame ) +{ + const MODE_INFO *mi = oci->mi; + int mb_row, mb_col; + + for (mb_row = 0; mb_row < oci->mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < oci->mb_cols; mb_col++,mi++) + { + if( mi->mbmi.ref_frame == ref_frame) + return 1; + } + mi++; + } + return 0; + +} + +int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) +{ + if(!fb->use_frame_threads) + { + /* decoder instance for single thread mode */ + fb->pbi[0] = create_decompressor(oxcf); + if(!fb->pbi[0]) + return VPX_CODEC_ERROR; + +#if CONFIG_MULTITHREAD + /* enable row-based threading only when use_frame_threads + * is disabled */ + fb->pbi[0]->max_threads = oxcf->max_threads; + vp8_decoder_create_threads(fb->pbi[0]); +#endif + } + else + { + /* TODO : create frame threads and decoder instances for each + * thread here */ + } + + return VPX_CODEC_OK; +} + +int vp8_remove_decoder_instances(struct frame_buffers *fb) +{ + if(!fb->use_frame_threads) + { + VP8D_COMP *pbi = fb->pbi[0]; + + if (!pbi) + return VPX_CODEC_ERROR; +#if CONFIG_MULTITHREAD + if (pbi->b_multithreaded_rd) + vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows); + vp8_decoder_remove_threads(pbi); +#endif + + /* decoder instance for single thread mode */ + remove_decompressor(pbi); + } + else + { + /* TODO : remove frame threads and decoder instances for each + * thread here */ + } + + return VPX_CODEC_OK; +} diff --git a/thirdparty/libvpx/vp8/decoder/onyxd_int.h b/thirdparty/libvpx/vp8/decoder/onyxd_int.h new file mode 100644 index 000000000000..313fe01c0773 --- /dev/null +++ b/thirdparty/libvpx/vp8/decoder/onyxd_int.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_DECODER_ONYXD_INT_H_ +#define VP8_DECODER_ONYXD_INT_H_ + +#include "vpx_config.h" +#include "vp8/common/onyxd.h" +#include "treereader.h" +#include "vp8/common/onyxc_int.h" +#include "vp8/common/threading.h" + +#if CONFIG_ERROR_CONCEALMENT +#include "ec_types.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct +{ + int ithread; + void *ptr1; + void *ptr2; +} DECODETHREAD_DATA; + +typedef struct +{ + MACROBLOCKD mbd; +} MB_ROW_DEC; + + +typedef struct +{ + int enabled; + unsigned int count; + const unsigned char *ptrs[MAX_PARTITIONS]; + unsigned int sizes[MAX_PARTITIONS]; +} FRAGMENT_DATA; + +#define MAX_FB_MT_DEC 32 + +struct frame_buffers +{ + /* + * this struct will be populated with frame buffer management + * info in future commits. 
*/ + + /* enable/disable frame-based threading */ + int use_frame_threads; + + /* decoder instances */ + struct VP8D_COMP *pbi[MAX_FB_MT_DEC]; + +}; + +typedef struct VP8D_COMP +{ + DECLARE_ALIGNED(16, MACROBLOCKD, mb); + + YV12_BUFFER_CONFIG *dec_fb_ref[NUM_YV12_BUFFERS]; + + DECLARE_ALIGNED(16, VP8_COMMON, common); + + /* the last partition will be used for the modes/mvs */ + vp8_reader mbc[MAX_PARTITIONS]; + + VP8D_CONFIG oxcf; + + FRAGMENT_DATA fragments; + +#if CONFIG_MULTITHREAD + /* variable for threading */ + + int b_multithreaded_rd; + int max_threads; + int current_mb_col_main; + unsigned int decoding_thread_count; + int allocated_decoding_thread_count; + + int mt_baseline_filter_level[MAX_MB_SEGMENTS]; + int sync_range; + int *mt_current_mb_col; /* Each row remembers its already decoded column. */ + pthread_mutex_t *pmutex; + pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */ + + unsigned char **mt_yabove_row; /* mb_rows x width */ + unsigned char **mt_uabove_row; + unsigned char **mt_vabove_row; + unsigned char **mt_yleft_col; /* mb_rows x 16 */ + unsigned char **mt_uleft_col; /* mb_rows x 8 */ + unsigned char **mt_vleft_col; /* mb_rows x 8 */ + + MB_ROW_DEC *mb_row_di; + DECODETHREAD_DATA *de_thread_data; + + pthread_t *h_decoding_thread; + sem_t *h_event_start_decoding; + sem_t h_event_end_decoding; + /* end of threading data */ +#endif + + int64_t last_time_stamp; + int ready_for_new_data; + + vp8_prob prob_intra; + vp8_prob prob_last; + vp8_prob prob_gf; + vp8_prob prob_skip_false; + +#if CONFIG_ERROR_CONCEALMENT + MB_OVERLAP *overlaps; + /* the mb num from which modes and mvs (first partition) are corrupt */ + unsigned int mvs_corrupt_from_mb; +#endif + int ec_enabled; + int ec_active; + int decoded_key_frame; + int independent_partitions; + int frame_corrupt_residual; + + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; +} VP8D_COMP; + +int vp8_decode_frame(VP8D_COMP *cpi); + +int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); +int vp8_remove_decoder_instances(struct frame_buffers *fb); + +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(lval,expr) do {\ + lval = (expr); \ + if(!lval) \ + vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\ + "Failed to allocate "#lval" at %s:%d", \ + __FILE__,__LINE__);\ + } while(0) +#else +#define CHECK_MEM_ERROR(lval,expr) do {\ + lval = (expr); \ + if(!lval) \ + vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\ + "Failed to allocate "#lval);\ + } while(0) +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_ONYXD_INT_H_ diff --git a/thirdparty/libvpx/vp8/decoder/threading.c b/thirdparty/libvpx/vp8/decoder/threading.c new file mode 100644 index 000000000000..3c1b8387ec94 --- /dev/null +++ b/thirdparty/libvpx/vp8/decoder/threading.c @@ -0,0 +1,928 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
+# include <unistd.h>
+#endif
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/extend.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/setupintrarecon.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+
+#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn) do { \
+    CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
+    memset((p), 0, (n) * sizeof(*(p))); \
+} while (0)
+
+
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+
+static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
+{
+    VP8_COMMON *const pc = & pbi->common;
+    int i;
+
+    for (i = 0; i < count; i++)
+    {
+        MACROBLOCKD *mbd = &mbrd[i].mbd;
+        mbd->subpixel_predict = xd->subpixel_predict;
+        mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+        mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+        mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+
+        mbd->frame_type = pc->frame_type;
+        mbd->pre = xd->pre;
+        mbd->dst = xd->dst;
+
+        mbd->segmentation_enabled = xd->segmentation_enabled;
+        mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+        memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+
+        /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
+        memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+        /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
+        memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+        /*unsigned char mode_ref_lf_delta_enabled;
+        unsigned char mode_ref_lf_delta_update;*/
+        mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
+        mbd->mode_ref_lf_delta_update = xd->mode_ref_lf_delta_update;
+
+        mbd->current_bc = &pbi->mbc[0];
+
+        memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+        memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+        memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+        memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+
+        mbd->fullpixel_mask = 0xffffffff;
+
+        if (pc->full_pixel)
+            mbd->fullpixel_mask = 0xfffffff8;
+
+    }
+
+    for (i = 0; i < pc->mb_rows; i++)
+        pbi->mt_current_mb_col[i] = -1;
+}
+
+static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+                                 unsigned int mb_idx)
+{
+    MB_PREDICTION_MODE mode;
+    int i;
+#if CONFIG_ERROR_CONCEALMENT
+    int corruption_detected = 0;
+#else
+    (void)mb_idx;
+#endif
+
+    if (xd->mode_info_context->mbmi.mb_skip_coeff)
+    {
+        vp8_reset_mb_tokens_context(xd);
+    }
+    else if (!vp8dx_bool_error(xd->current_bc))
+    {
+        int eobtotal;
+        eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+        /* Special case: Force the loopfilter to skip when eobtotal is zero */
+        xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+    }
+
+    mode = xd->mode_info_context->mbmi.mode;
+
+    if (xd->segmentation_enabled)
+        vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+    if(pbi->ec_active)
+    {
+        int throw_residual;
+        /* When we have independent partitions we can apply residual even
+         * though other partitions within the frame are corrupt.
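+         * (independent_partitions is established while the coefficient
+         * probabilities are read: it holds only when the probabilities
+         * are equal across all PREV_COEF contexts.)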
+ */ + throw_residual = (!pbi->independent_partitions && + pbi->frame_corrupt_residual); + throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc)); + + if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual)) + { + /* MB with corrupt residuals or corrupt mode/motion vectors. + * Better to use the predictor as reconstruction. + */ + pbi->frame_corrupt_residual = 1; + memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); + + corruption_detected = 1; + + /* force idct to be skipped for B_PRED and use the + * prediction only for reconstruction + * */ + memset(xd->eobs, 0, 25); + } + } +#endif + + /* do prediction */ + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) + { + vp8_build_intra_predictors_mbuv_s(xd, + xd->recon_above[1], + xd->recon_above[2], + xd->recon_left[1], + xd->recon_left[2], + xd->recon_left_stride[1], + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride); + + if (mode != B_PRED) + { + vp8_build_intra_predictors_mby_s(xd, + xd->recon_above[0], + xd->recon_left[0], + xd->recon_left_stride[0], + xd->dst.y_buffer, + xd->dst.y_stride); + } + else + { + short *DQC = xd->dequant_y1; + int dst_stride = xd->dst.y_stride; + + /* clear out residual eob info */ + if(xd->mode_info_context->mbmi.mb_skip_coeff) + memset(xd->eobs, 0, 25); + + intra_prediction_down_copy(xd, xd->recon_above[0] + 16); + + for (i = 0; i < 16; i++) + { + BLOCKD *b = &xd->block[i]; + unsigned char *dst = xd->dst.y_buffer + b->offset; + B_PREDICTION_MODE b_mode = + xd->mode_info_context->bmi[i].as_mode; + unsigned char *Above; + unsigned char *yleft; + int left_stride; + unsigned char top_left; + + /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 above-right).*/ + if (i < 4 && pbi->common.filter_level) + Above = xd->recon_above[0] + b->offset; + else + Above = dst - dst_stride; + + if (i%4==0 && pbi->common.filter_level) + { + yleft = xd->recon_left[0] + i; + left_stride = 1; + } + else + { + yleft = dst - 1; + left_stride = dst_stride; + } + + if ((i==4 || i==8 || i==12) && pbi->common.filter_level) + top_left = *(xd->recon_left[0] + i - 1); + else + top_left = Above[-1]; + + vp8_intra4x4_predict(Above, yleft, left_stride, + b_mode, dst, dst_stride, top_left); + + if (xd->eobs[i] ) + { + if (xd->eobs[i] > 1) + { + vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride); + } + else + { + vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0], + dst, dst_stride, dst, dst_stride); + memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0])); + } + } + } + } + } + else + { + vp8_build_inter_predictors_mb(xd); + } + + +#if CONFIG_ERROR_CONCEALMENT + if (corruption_detected) + { + return; + } +#endif + + if(!xd->mode_info_context->mbmi.mb_skip_coeff) + { + /* dequantization and idct */ + if (mode != B_PRED) + { + short *DQC = xd->dequant_y1; + + if (mode != SPLITMV) + { + BLOCKD *b = &xd->block[24]; + + /* do 2nd order transform on the dc block */ + if (xd->eobs[24] > 1) + { + vp8_dequantize_b(b, xd->dequant_y2); + + vp8_short_inv_walsh4x4(&b->dqcoeff[0], + xd->qcoeff); + memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0])); + } + else + { + b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0]; + vp8_short_inv_walsh4x4_1(&b->dqcoeff[0], + xd->qcoeff); + memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0])); + } + + /* override the dc dequant constant in order to preserve the + * dc components + */ + DQC = xd->dequant_y1_dc; + } + + vp8_dequant_idct_add_y_block + (xd->qcoeff, DQC, + xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); + } + + vp8_dequant_idct_add_uv_block + (xd->qcoeff+16*16, xd->dequant_uv, + xd->dst.u_buffer, 
xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs+16); + } +} + +static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row) +{ + const int *last_row_current_mb_col; + int *current_mb_col; + int mb_row; + VP8_COMMON *pc = &pbi->common; + const int nsync = pbi->sync_range; + const int first_row_no_sync_above = pc->mb_cols + nsync; + int num_part = 1 << pbi->common.multi_token_partition; + int last_mb_row = start_mb_row; + + YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME]; + YV12_BUFFER_CONFIG *yv12_fb_lst = pbi->dec_fb_ref[LAST_FRAME]; + + int recon_y_stride = yv12_fb_new->y_stride; + int recon_uv_stride = yv12_fb_new->uv_stride; + + unsigned char *ref_buffer[MAX_REF_FRAMES][3]; + unsigned char *dst_buffer[3]; + int i; + int ref_fb_corrupted[MAX_REF_FRAMES]; + + ref_fb_corrupted[INTRA_FRAME] = 0; + + for(i = 1; i < MAX_REF_FRAMES; i++) + { + YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i]; + + ref_buffer[i][0] = this_fb->y_buffer; + ref_buffer[i][1] = this_fb->u_buffer; + ref_buffer[i][2] = this_fb->v_buffer; + + ref_fb_corrupted[i] = this_fb->corrupted; + } + + dst_buffer[0] = yv12_fb_new->y_buffer; + dst_buffer[1] = yv12_fb_new->u_buffer; + dst_buffer[2] = yv12_fb_new->v_buffer; + + xd->up_available = (start_mb_row != 0); + + xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row; + xd->mode_info_stride = pc->mode_info_stride; + + for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1)) + { + int recon_yoffset, recon_uvoffset; + int mb_col; + int filter_level; + loop_filter_info_n *lfi_n = &pc->lf_info; + + /* save last row processed by this thread */ + last_mb_row = mb_row; + /* select bool coder for current partition */ + xd->current_bc = &pbi->mbc[mb_row%num_part]; + + if (mb_row > 0) + last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1]; + else + last_row_current_mb_col = &first_row_no_sync_above; + + current_mb_col = &pbi->mt_current_mb_col[mb_row]; + + recon_yoffset = mb_row * recon_y_stride * 16; + recon_uvoffset = mb_row * recon_uv_stride * 8; + + /* reset contexts */ + xd->above_context = pc->above_context; + memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)); + + xd->left_available = 0; + + xd->mb_to_top_edge = -((mb_row * 16)) << 3; + xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; + + if (pbi->common.filter_level) + { + xd->recon_above[0] = pbi->mt_yabove_row[mb_row] + 0*16 +32; + xd->recon_above[1] = pbi->mt_uabove_row[mb_row] + 0*8 +16; + xd->recon_above[2] = pbi->mt_vabove_row[mb_row] + 0*8 +16; + + xd->recon_left[0] = pbi->mt_yleft_col[mb_row]; + xd->recon_left[1] = pbi->mt_uleft_col[mb_row]; + xd->recon_left[2] = pbi->mt_vleft_col[mb_row]; + + /* TODO: move to outside row loop */ + xd->recon_left_stride[0] = 1; + xd->recon_left_stride[1] = 1; + } + else + { + xd->recon_above[0] = dst_buffer[0] + recon_yoffset; + xd->recon_above[1] = dst_buffer[1] + recon_uvoffset; + xd->recon_above[2] = dst_buffer[2] + recon_uvoffset; + + xd->recon_left[0] = xd->recon_above[0] - 1; + xd->recon_left[1] = xd->recon_above[1] - 1; + xd->recon_left[2] = xd->recon_above[2] - 1; + + xd->recon_above[0] -= xd->dst.y_stride; + xd->recon_above[1] -= xd->dst.uv_stride; + xd->recon_above[2] -= xd->dst.uv_stride; + + /* TODO: move to outside row loop */ + xd->recon_left_stride[0] = xd->dst.y_stride; + xd->recon_left_stride[1] = xd->dst.uv_stride; + + setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1], + xd->recon_left[2], xd->dst.y_stride, + xd->dst.uv_stride); + } + + for 
(mb_col = 0; mb_col < pc->mb_cols; mb_col++) { + if (((mb_col - 1) % nsync) == 0) { + pthread_mutex_t *mutex = &pbi->pmutex[mb_row]; + protected_write(mutex, current_mb_col, mb_col - 1); + } + + if (mb_row && !(mb_col & (nsync - 1))) { + pthread_mutex_t *mutex = &pbi->pmutex[mb_row-1]; + sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + } + + /* Distance of MB to the various image edges. + * These are specified to 8th pel as they are always + * compared to values that are in 1/8th pel units. + */ + xd->mb_to_left_edge = -((mb_col * 16) << 3); + xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3; + + #if CONFIG_ERROR_CONCEALMENT + { + int corrupt_residual = + (!pbi->independent_partitions && + pbi->frame_corrupt_residual) || + vp8dx_bool_error(xd->current_bc); + if (pbi->ec_active && + (xd->mode_info_context->mbmi.ref_frame == + INTRA_FRAME) && + corrupt_residual) + { + /* We have an intra block with corrupt + * coefficients, better to conceal with an inter + * block. + * Interpolate MVs from neighboring MBs + * + * Note that for the first mb with corrupt + * residual in a frame, we might not discover + * that before decoding the residual. That + * happens after this check, and therefore no + * inter concealment will be done. + */ + vp8_interpolate_motion(xd, + mb_row, mb_col, + pc->mb_rows, pc->mb_cols); + } + } + #endif + + + xd->dst.y_buffer = dst_buffer[0] + recon_yoffset; + xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset; + xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset; + + xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset; + xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset; + xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset; + + /* propagate errors from reference frames */ + xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame]; + + mt_decode_macroblock(pbi, xd, 0); + + xd->left_available = 1; + + /* check if the boolean decoder has suffered an error */ + xd->corrupted |= vp8dx_bool_error(xd->current_bc); + + xd->recon_above[0] += 16; + xd->recon_above[1] += 8; + xd->recon_above[2] += 8; + + if (!pbi->common.filter_level) + { + xd->recon_left[0] += 16; + xd->recon_left[1] += 8; + xd->recon_left[2] += 8; + } + + if (pbi->common.filter_level) + { + int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV && + xd->mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode]; + const int seg = xd->mode_info_context->mbmi.segment_id; + const int ref_frame = xd->mode_info_context->mbmi.ref_frame; + + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if( mb_row != pc->mb_rows-1 ) + { + /* Save decoded MB last row data for next-row decoding */ + memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16); + memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8); + memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8); + } + + /* save left_col for next MB decoding */ + if(mb_col != pc->mb_cols-1) + { + MODE_INFO *next = xd->mode_info_context +1; + + if (next->mbmi.ref_frame == INTRA_FRAME) + { + for (i = 0; i < 16; i++) + pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15]; + for (i = 0; i < 8; i++) + { + pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* 
recon_uv_stride + 7]; + pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7]; + } + } + } + + /* loopfilter on this macroblock. */ + if (filter_level) + { + if(pc->filter_type == NORMAL_LOOPFILTER) + { + loop_filter_info lfi; + FRAME_TYPE frame_type = pc->frame_type; + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp8_loop_filter_mbv + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bv + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_mbh + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + + if (!skip_lf) + vp8_loop_filter_bh + (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi); + } + else + { + if (mb_col > 0) + vp8_loop_filter_simple_mbv + (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bv + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp8_loop_filter_simple_mbh + (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp8_loop_filter_simple_bh + (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]); + } + } + + } + + recon_yoffset += 16; + recon_uvoffset += 8; + + ++xd->mode_info_context; /* next mb */ + + xd->above_context++; + } + + /* adjust to the next row of mbs */ + if (pbi->common.filter_level) + { + if(mb_row != pc->mb_rows-1) + { + int lasty = yv12_fb_lst->y_width + VP8BORDERINPIXELS; + int lastuv = (yv12_fb_lst->y_width>>1) + (VP8BORDERINPIXELS>>1); + + for (i = 0; i < 4; i++) + { + pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1]; + pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1]; + pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1]; + } + } + } + else + vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16, + xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); + + /* last MB of row is ready just after extension is done */ + protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync); + + ++xd->mode_info_context; /* skip prediction column */ + xd->up_available = 1; + + /* since we have multithread */ + xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count; + } + + /* signal end of frame decoding if this thread processed the last mb_row */ + if (last_mb_row == (pc->mb_rows - 1)) + sem_post(&pbi->h_event_end_decoding); + +} + + +static THREAD_FUNCTION thread_decoding_proc(void *p_data) +{ + int ithread = ((DECODETHREAD_DATA *)p_data)->ithread; + VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1); + MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2); + ENTROPY_CONTEXT_PLANES mb_row_left_context; + + while (1) + { + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) + break; + + if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) + { + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) + break; + else + { + MACROBLOCKD *xd = &mbrd->mbd; + xd->left_context = &mb_row_left_context; + + 
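+ /* Note on the threading model: MB rows are interleaved across + * threads. Worker i (this thread) decodes rows i+1, i+1+(N+1), + * i+1+2*(N+1), ... while the main thread decodes rows 0, N+1, + * 2*(N+1), ..., where N is decoding_thread_count. Inside + * mt_decode_mb_rows() each row publishes its progress through + * mt_current_mb_col and may only run sync_range columns behind + * the row above it. */ +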
mt_decode_mb_rows(pbi, xd, ithread+1); + } + } + } + + return 0 ; +} + + +void vp8_decoder_create_threads(VP8D_COMP *pbi) +{ + int core_count = 0; + unsigned int ithread; + + pbi->b_multithreaded_rd = 0; + pbi->allocated_decoding_thread_count = 0; + pthread_mutex_init(&pbi->mt_mutex, NULL); + + /* limit decoding threads to the max number of token partitions */ + core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads; + + /* limit decoding threads to the available cores */ + if (core_count > pbi->common.processor_core_count) + core_count = pbi->common.processor_core_count; + + if (core_count > 1) + { + pbi->b_multithreaded_rd = 1; + pbi->decoding_thread_count = core_count - 1; + + CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count); + CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count); + CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32); + CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count); + + for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++) + { + sem_init(&pbi->h_event_start_decoding[ithread], 0, 0); + + vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd); + + pbi->de_thread_data[ithread].ithread = ithread; + pbi->de_thread_data[ithread].ptr1 = (void *)pbi; + pbi->de_thread_data[ithread].ptr2 = (void *) &pbi->mb_row_di[ithread]; + + pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, (&pbi->de_thread_data[ithread])); + } + + sem_init(&pbi->h_event_end_decoding, 0, 0); + + pbi->allocated_decoding_thread_count = pbi->decoding_thread_count; + } +} + + +void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) +{ + int i; + + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) + { + /* De-allocate mutex */ + if (pbi->pmutex != NULL) { + for (i = 0; i < mb_rows; i++) { + pthread_mutex_destroy(&pbi->pmutex[i]); + } + vpx_free(pbi->pmutex); + pbi->pmutex = NULL; + } + + vpx_free(pbi->mt_current_mb_col); + pbi->mt_current_mb_col = NULL ; + + /* Free above_row buffers. */ + if (pbi->mt_yabove_row) + { + for (i=0; i< mb_rows; i++) + { + vpx_free(pbi->mt_yabove_row[i]); + pbi->mt_yabove_row[i] = NULL ; + } + vpx_free(pbi->mt_yabove_row); + pbi->mt_yabove_row = NULL ; + } + + if (pbi->mt_uabove_row) + { + for (i=0; i< mb_rows; i++) + { + vpx_free(pbi->mt_uabove_row[i]); + pbi->mt_uabove_row[i] = NULL ; + } + vpx_free(pbi->mt_uabove_row); + pbi->mt_uabove_row = NULL ; + } + + if (pbi->mt_vabove_row) + { + for (i=0; i< mb_rows; i++) + { + vpx_free(pbi->mt_vabove_row[i]); + pbi->mt_vabove_row[i] = NULL ; + } + vpx_free(pbi->mt_vabove_row); + pbi->mt_vabove_row = NULL ; + } + + /* Free left_col buffers. 
*/ + if (pbi->mt_yleft_col) + { + for (i=0; i< mb_rows; i++) + { + vpx_free(pbi->mt_yleft_col[i]); + pbi->mt_yleft_col[i] = NULL ; + } + vpx_free(pbi->mt_yleft_col); + pbi->mt_yleft_col = NULL ; + } + + if (pbi->mt_uleft_col) + { + for (i=0; i< mb_rows; i++) + { + vpx_free(pbi->mt_uleft_col[i]); + pbi->mt_uleft_col[i] = NULL ; + } + vpx_free(pbi->mt_uleft_col); + pbi->mt_uleft_col = NULL ; + } + + if (pbi->mt_vleft_col) + { + for (i=0; i< mb_rows; i++) + { + vpx_free(pbi->mt_vleft_col[i]); + pbi->mt_vleft_col[i] = NULL ; + } + vpx_free(pbi->mt_vleft_col); + pbi->mt_vleft_col = NULL ; + } + } +} + + +void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) +{ + VP8_COMMON *const pc = & pbi->common; + int i; + int uv_width; + + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) + { + vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows); + + /* our internal buffers are always multiples of 16 */ + if ((width & 0xf) != 0) + width += 16 - (width & 0xf); + + if (width < 640) pbi->sync_range = 1; + else if (width <= 1280) pbi->sync_range = 8; + else if (width <= 2560) pbi->sync_range =16; + else pbi->sync_range = 32; + + uv_width = width >>1; + + /* Allocate mutex */ + CHECK_MEM_ERROR(pbi->pmutex, vpx_malloc(sizeof(*pbi->pmutex) * + pc->mb_rows)); + if (pbi->pmutex) { + for (i = 0; i < pc->mb_rows; i++) { + pthread_mutex_init(&pbi->pmutex[i], NULL); + } + } + + /* Allocate an int for each mb row. */ + CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows); + + /* Allocate memory for above_row buffers. */ + CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); + for (i = 0; i < pc->mb_rows; i++) + CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1)))); + + CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); + for (i = 0; i < pc->mb_rows; i++) + CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + + CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); + for (i = 0; i < pc->mb_rows; i++) + CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + + /* Allocate memory for left_col buffers. 
*/ + CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); + for (i = 0; i < pc->mb_rows; i++) + CHECK_MEM_ERROR(pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1)); + + CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows); + for (i = 0; i < pc->mb_rows; i++) + CHECK_MEM_ERROR(pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); + + CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows); + for (i = 0; i < pc->mb_rows; i++) + CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); + } +} + + +void vp8_decoder_remove_threads(VP8D_COMP *pbi) +{ + /* shutdown MB Decoding thread; */ + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) + { + int i; + + protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0); + + /* allow all threads to exit */ + for (i = 0; i < pbi->allocated_decoding_thread_count; i++) + { + sem_post(&pbi->h_event_start_decoding[i]); + pthread_join(pbi->h_decoding_thread[i], NULL); + } + + for (i = 0; i < pbi->allocated_decoding_thread_count; i++) + { + sem_destroy(&pbi->h_event_start_decoding[i]); + } + + sem_destroy(&pbi->h_event_end_decoding); + + vpx_free(pbi->h_decoding_thread); + pbi->h_decoding_thread = NULL; + + vpx_free(pbi->h_event_start_decoding); + pbi->h_event_start_decoding = NULL; + + vpx_free(pbi->mb_row_di); + pbi->mb_row_di = NULL ; + + vpx_free(pbi->de_thread_data); + pbi->de_thread_data = NULL; + } + pthread_mutex_destroy(&pbi->mt_mutex); +} + +void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) +{ + VP8_COMMON *pc = &pbi->common; + unsigned int i; + int j; + + int filter_level = pc->filter_level; + YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME]; + + if (filter_level) + { + /* Set above_row buffer to 127 for decoding first MB row */ + memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5); + memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5); + memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5); + + for (j=1; j<pc->mb_rows; j++) + { + memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1); + memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1); + memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1); + } + + /* Set left_col to 129 initially */ + for (j=0; j<pc->mb_rows; j++) + { + memset(pbi->mt_yleft_col[j], (unsigned char)129, 16); + memset(pbi->mt_uleft_col[j], (unsigned char)129, 8); + memset(pbi->mt_vleft_col[j], (unsigned char)129, 8); + } + + /* Initialize the loop filter for this frame. */ + vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level); + } + else + vp8_setup_intra_recon_top_line(yv12_fb_new); + + setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count); + + for (i = 0; i < pbi->decoding_thread_count; i++) + sem_post(&pbi->h_event_start_decoding[i]); + + mt_decode_mb_rows(pbi, xd, 0); + + sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ +} diff --git a/thirdparty/libvpx/vp8/decoder/treereader.h b/thirdparty/libvpx/vp8/decoder/treereader.h new file mode 100644 index 000000000000..f7d23c369898 --- /dev/null +++ b/thirdparty/libvpx/vp8/decoder/treereader.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP8_DECODER_TREEREADER_H_ +#define VP8_DECODER_TREEREADER_H_ + +#include "./vpx_config.h" +#include "vp8/common/treecoder.h" +#include "dboolhuff.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef BOOL_DECODER vp8_reader; + +#define vp8_read vp8dx_decode_bool +#define vp8_read_literal vp8_decode_value +#define vp8_read_bit(R) vp8_read(R, vp8_prob_half) + + +/* Intent of tree data structure is to make decoding trivial. */ + +static INLINE int vp8_treed_read( + vp8_reader *const r, /* !!! must return a 0 or 1 !!! */ + vp8_tree t, + const vp8_prob *const p +) +{ + register vp8_tree_index i = 0; + + while ((i = t[ i + vp8_read(r, p[i>>1])]) > 0) ; + + return -i; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_TREEREADER_H_ diff --git a/thirdparty/libvpx/vp8/vp8_dx_iface.c b/thirdparty/libvpx/vp8/vp8_dx_iface.c new file mode 100644 index 000000000000..fc9288d62b68 --- /dev/null +++ b/thirdparty/libvpx/vp8/vp8_dx_iface.c @@ -0,0 +1,828 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vpx/vpx_decoder.h" +#include "vpx/vp8dx.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_version.h" +#include "common/alloccommon.h" +#include "common/common.h" +#include "common/onyxd.h" +#include "decoder/onyxd_int.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#if CONFIG_ERROR_CONCEALMENT +#include "decoder/error_concealment.h" +#endif +#include "decoder/decoderthreading.h" + +#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) +#define VP8_CAP_ERROR_CONCEALMENT (CONFIG_ERROR_CONCEALMENT ? 
\ + VPX_CODEC_CAP_ERROR_CONCEALMENT : 0) + +typedef vpx_codec_stream_info_t vp8_stream_info_t; + +/* Structures for handling memory allocations */ +typedef enum +{ + VP8_SEG_ALG_PRIV = 256, + VP8_SEG_MAX +} mem_seg_id_t; +#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0]))) + +struct vpx_codec_alg_priv +{ + vpx_codec_priv_t base; + vpx_codec_dec_cfg_t cfg; + vp8_stream_info_t si; + int decoder_init; + int postproc_cfg_set; + vp8_postproc_cfg_t postproc_cfg; +#if CONFIG_POSTPROC_VISUALIZER + unsigned int dbg_postproc_flag; + int dbg_color_ref_frame_flag; + int dbg_color_mb_modes_flag; + int dbg_color_b_modes_flag; + int dbg_display_mv_flag; +#endif + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; + vpx_image_t img; + int img_setup; + struct frame_buffers yv12_frame_buffers; + void *user_priv; + FRAGMENT_DATA fragments; +}; + +static int vp8_init_ctx(vpx_codec_ctx_t *ctx) +{ + vpx_codec_alg_priv_t *priv = + (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv)); + if (!priv) return 1; + + ctx->priv = (vpx_codec_priv_t *)priv; + ctx->priv->init_flags = ctx->init_flags; + + priv->si.sz = sizeof(priv->si); + priv->decrypt_cb = NULL; + priv->decrypt_state = NULL; + + if (ctx->config.dec) + { + /* Update the reference to the config structure to an internal copy. */ + priv->cfg = *ctx->config.dec; + ctx->config.dec = &priv->cfg; + } + + return 0; +} + +static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) +{ + vpx_codec_err_t res = VPX_CODEC_OK; + vpx_codec_alg_priv_t *priv = NULL; + (void) data; + + vp8_rtcd(); + vpx_dsp_rtcd(); + vpx_scale_rtcd(); + + /* This function only allocates space for the vpx_codec_alg_priv_t + * structure. More memory may be required at the time the stream + * information becomes known. + */ + if (!ctx->priv) { + if (vp8_init_ctx(ctx)) return VPX_CODEC_MEM_ERROR; + priv = (vpx_codec_alg_priv_t *)ctx->priv; + + /* initialize number of fragments to zero */ + priv->fragments.count = 0; + /* is input fragments enabled? */ + priv->fragments.enabled = + (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS); + + /*post processing level initialized to do nothing */ + } else { + priv = (vpx_codec_alg_priv_t *)ctx->priv; + } + + priv->yv12_frame_buffers.use_frame_threads = + (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING); + + /* for now, disable frame threading */ + priv->yv12_frame_buffers.use_frame_threads = 0; + + if (priv->yv12_frame_buffers.use_frame_threads && + ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) || + (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) { + /* row-based threading, error concealment, and input fragments will + * not be supported when using frame-based threading */ + res = VPX_CODEC_INVALID_PARAM; + } + + return res; +} + +static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) +{ + vp8_remove_decoder_instances(&ctx->yv12_frame_buffers); + + vpx_free(ctx); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) +{ + vpx_codec_err_t res = VPX_CODEC_OK; + + assert(data != NULL); + + if(data + data_sz <= data) + { + res = VPX_CODEC_INVALID_PARAM; + } + else + { + /* Parse uncompressed part of key frame header. + * 3 bytes:- including version, frame type and an offset + * 3 bytes:- sync code (0x9d, 0x01, 0x2a) + * 4 bytes:- including image width and height in the lowest 14 bits + * of each 2-byte value. 
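+ * + * Hypothetical example (bytes not from any real stream): a 640x480 + * key frame header would end with the sync code 0x9d 0x01 0x2a + * followed by 0x80 0x02 0xe0 0x01, giving + * w = (0x80 | (0x02 << 8)) & 0x3fff = 640 and + * h = (0xe0 | (0x01 << 8)) & 0x3fff = 480; the top two bits of each + * 2-byte value carry the horizontal/vertical scale instead.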
+ */ + uint8_t clear_buffer[10]; + const uint8_t *clear = data; + if (decrypt_cb) + { + int n = VPXMIN(sizeof(clear_buffer), data_sz); + decrypt_cb(decrypt_state, data, clear_buffer, n); + clear = clear_buffer; + } + si->is_kf = 0; + + if (data_sz >= 10 && !(clear[0] & 0x01)) /* I-Frame */ + { + si->is_kf = 1; + + /* vet via sync code */ + if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a) + return VPX_CODEC_UNSUP_BITSTREAM; + + si->w = (clear[6] | (clear[7] << 8)) & 0x3fff; + si->h = (clear[8] | (clear[9] << 8)) & 0x3fff; + + /*printf("w=%d, h=%d\n", si->w, si->h);*/ + if (!(si->h | si->w)) + res = VPX_CODEC_UNSUP_BITSTREAM; + } + else + { + res = VPX_CODEC_UNSUP_BITSTREAM; + } + } + + return res; +} + +static vpx_codec_err_t vp8_peek_si(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si) { + return vp8_peek_si_internal(data, data_sz, si, NULL, NULL); +} + +static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx, + vpx_codec_stream_info_t *si) +{ + + unsigned int sz; + + if (si->sz >= sizeof(vp8_stream_info_t)) + sz = sizeof(vp8_stream_info_t); + else + sz = sizeof(vpx_codec_stream_info_t); + + memcpy(si, &ctx->si, sz); + si->sz = sz; + + return VPX_CODEC_OK; +} + + +static vpx_codec_err_t +update_error_state(vpx_codec_alg_priv_t *ctx, + const struct vpx_internal_error_info *error) +{ + vpx_codec_err_t res; + + if ((res = error->error_code)) + ctx->base.err_detail = error->has_detail + ? error->detail + : NULL; + + return res; +} + +static void yuvconfig2image(vpx_image_t *img, + const YV12_BUFFER_CONFIG *yv12, + void *user_priv) +{ + /** vpx_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ + img->fmt = VPX_IMG_FMT_I420; + img->w = yv12->y_stride; + img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15; + img->d_w = img->r_w = yv12->y_width; + img->d_h = img->r_h = yv12->y_height; + img->x_chroma_shift = 1; + img->y_chroma_shift = 1; + img->planes[VPX_PLANE_Y] = yv12->y_buffer; + img->planes[VPX_PLANE_U] = yv12->u_buffer; + img->planes[VPX_PLANE_V] = yv12->v_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = yv12->y_stride; + img->stride[VPX_PLANE_U] = yv12->uv_stride; + img->stride[VPX_PLANE_V] = yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; + img->bit_depth = 8; + img->bps = 12; + img->user_priv = user_priv; + img->img_data = yv12->buffer_alloc; + img->img_data_owner = 0; + img->self_allocd = 0; +} + +static int +update_fragments(vpx_codec_alg_priv_t *ctx, + const uint8_t *data, + unsigned int data_sz, + vpx_codec_err_t *res) +{ + *res = VPX_CODEC_OK; + + if (ctx->fragments.count == 0) + { + /* New frame, reset fragment pointers and sizes */ + memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs)); + memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes)); + } + if (ctx->fragments.enabled && !(data == NULL && data_sz == 0)) + { + /* Store a pointer to this fragment and return. We haven't + * received the complete frame yet, so we will wait with decoding. 
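+ * Decoding is triggered later: a flush call with data == NULL and + * data_sz == 0 falls through the checks below and returns 1, + * signalling that all fragments of the frame have been received.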
+ */ + ctx->fragments.ptrs[ctx->fragments.count] = data; + ctx->fragments.sizes[ctx->fragments.count] = data_sz; + ctx->fragments.count++; + if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1) + { + ctx->fragments.count = 0; + *res = VPX_CODEC_INVALID_PARAM; + return -1; + } + return 0; + } + + if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) + { + return 0; + } + + if (!ctx->fragments.enabled) + { + ctx->fragments.ptrs[0] = data; + ctx->fragments.sizes[0] = data_sz; + ctx->fragments.count = 1; + } + + return 1; +} + +static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, + const uint8_t *data, + unsigned int data_sz, + void *user_priv, + long deadline) +{ + vpx_codec_err_t res = VPX_CODEC_OK; + unsigned int resolution_change = 0; + unsigned int w, h; + + if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) + { + return 0; + } + + /* Update the input fragment data */ + if(update_fragments(ctx, data, data_sz, &res) <= 0) + return res; + + /* Determine the stream parameters. Note that we rely on peek_si to + * validate that we have a buffer that does not wrap around the top + * of the heap. + */ + w = ctx->si.w; + h = ctx->si.h; + + res = vp8_peek_si_internal(ctx->fragments.ptrs[0], ctx->fragments.sizes[0], + &ctx->si, ctx->decrypt_cb, ctx->decrypt_state); + + if((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf) + { + /* the peek function returns an error for non keyframes, however for + * this case, it is not an error */ + res = VPX_CODEC_OK; + } + + if(!ctx->decoder_init && !ctx->si.is_kf) + res = VPX_CODEC_UNSUP_BITSTREAM; + + if ((ctx->si.h != h) || (ctx->si.w != w)) + resolution_change = 1; + + /* Initialize the decoder instance on the first frame*/ + if (!res && !ctx->decoder_init) + { + VP8D_CONFIG oxcf; + + oxcf.Width = ctx->si.w; + oxcf.Height = ctx->si.h; + oxcf.Version = 9; + oxcf.postprocess = 0; + oxcf.max_threads = ctx->cfg.threads; + oxcf.error_concealment = + (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT); + + /* If postprocessing was enabled by the application and a + * configuration has not been provided, default it. + */ + if (!ctx->postproc_cfg_set + && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) { + ctx->postproc_cfg.post_proc_flag = + VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE; + ctx->postproc_cfg.deblocking_level = 4; + ctx->postproc_cfg.noise_level = 0; + } + + res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); + ctx->decoder_init = 1; + } + + /* Set these even if already initialized. The caller may have changed the + * decrypt config between frames. 
+ */ + if (ctx->decoder_init) { + ctx->yv12_frame_buffers.pbi[0]->decrypt_cb = ctx->decrypt_cb; + ctx->yv12_frame_buffers.pbi[0]->decrypt_state = ctx->decrypt_state; + } + + if (!res) + { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + if (resolution_change) + { + VP8_COMMON *const pc = & pbi->common; + MACROBLOCKD *const xd = & pbi->mb; +#if CONFIG_MULTITHREAD + int i; +#endif + pc->Width = ctx->si.w; + pc->Height = ctx->si.h; + { + int prev_mb_rows = pc->mb_rows; + + if (setjmp(pbi->common.error.jmp)) + { + pbi->common.error.setjmp = 0; + vp8_clear_system_state(); + /* same return value as used in vp8dx_receive_compressed_data */ + return -1; + } + + pbi->common.error.setjmp = 1; + + if (pc->Width <= 0) + { + pc->Width = w; + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame width"); + } + + if (pc->Height <= 0) + { + pc->Height = h; + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame height"); + } + + if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height)) + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffers"); + + xd->pre = pc->yv12_fb[pc->lst_fb_idx]; + xd->dst = pc->yv12_fb[pc->new_fb_idx]; + +#if CONFIG_MULTITHREAD + for (i = 0; i < pbi->allocated_decoding_thread_count; i++) + { + pbi->mb_row_di[i].mbd.dst = pc->yv12_fb[pc->new_fb_idx]; + vp8_build_block_doffsets(&pbi->mb_row_di[i].mbd); + } +#endif + vp8_build_block_doffsets(&pbi->mb); + + /* allocate memory for last frame MODE_INFO array */ +#if CONFIG_ERROR_CONCEALMENT + + if (pbi->ec_enabled) + { + /* old prev_mip was released by vp8_de_alloc_frame_buffers() + * called in vp8_alloc_frame_buffers() */ + pc->prev_mip = vpx_calloc( + (pc->mb_cols + 1) * (pc->mb_rows + 1), + sizeof(MODE_INFO)); + + if (!pc->prev_mip) + { + vp8_de_alloc_frame_buffers(pc); + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate " + "last frame MODE_INFO array"); + } + + pc->prev_mi = pc->prev_mip + pc->mode_info_stride + 1; + + if (vp8_alloc_overlap_lists(pbi)) + vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate overlap lists " + "for error concealment"); + } + +#endif + +#if CONFIG_MULTITHREAD + if (pbi->b_multithreaded_rd) + vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows); +#else + (void)prev_mb_rows; +#endif + } + + pbi->common.error.setjmp = 0; + + /* required to get past the first get_free_fb() call */ + pbi->common.fb_idx_ref_cnt[0] = 0; + } + + /* update the pbi fragment data */ + pbi->fragments = ctx->fragments; + + ctx->user_priv = user_priv; + if (vp8dx_receive_compressed_data(pbi, data_sz, data, deadline)) + { + res = update_error_state(ctx, &pbi->common.error); + } + + /* get ready for the next series of fragments */ + ctx->fragments.count = 0; + } + + return res; +} + +static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter) +{ + vpx_image_t *img = NULL; + + /* iter acts as a flip flop, so an image is only returned on the first + * call to get_frame. + */ + if (!(*iter) && ctx->yv12_frame_buffers.pbi[0]) + { + YV12_BUFFER_CONFIG sd; + int64_t time_stamp = 0, time_end_stamp = 0; + vp8_ppflags_t flags; + vp8_zero(flags); + + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) + { + flags.post_proc_flag= ctx->postproc_cfg.post_proc_flag +#if CONFIG_POSTPROC_VISUALIZER + + | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0) + | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0) + | ((ctx->dbg_color_b_modes_flag != 0) ? 
VP8D_DEBUG_CLR_BLK_MODES : 0) + | ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0) +#endif + ; + flags.deblocking_level = ctx->postproc_cfg.deblocking_level; + flags.noise_level = ctx->postproc_cfg.noise_level; +#if CONFIG_POSTPROC_VISUALIZER + flags.display_ref_frame_flag= ctx->dbg_color_ref_frame_flag; + flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag; + flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag; + flags.display_mv_flag = ctx->dbg_display_mv_flag; +#endif + } + + if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, + &time_stamp, &time_end_stamp, &flags)) + { + yuvconfig2image(&ctx->img, &sd, ctx->user_priv); + + img = &ctx->img; + *iter = img; + } + } + + return img; +} + +static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, + YV12_BUFFER_CONFIG *yv12) +{ + const int y_w = img->d_w; + const int y_h = img->d_h; + const int uv_w = (img->d_w + 1) / 2; + const int uv_h = (img->d_h + 1) / 2; + vpx_codec_err_t res = VPX_CODEC_OK; + yv12->y_buffer = img->planes[VPX_PLANE_Y]; + yv12->u_buffer = img->planes[VPX_PLANE_U]; + yv12->v_buffer = img->planes[VPX_PLANE_V]; + + yv12->y_crop_width = y_w; + yv12->y_crop_height = y_h; + yv12->y_width = y_w; + yv12->y_height = y_h; + yv12->uv_crop_width = uv_w; + yv12->uv_crop_height = uv_h; + yv12->uv_width = uv_w; + yv12->uv_height = uv_h; + + yv12->y_stride = img->stride[VPX_PLANE_Y]; + yv12->uv_stride = img->stride[VPX_PLANE_U]; + + yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2; + return res; +} + + +static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + + vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + + if (data && !ctx->yv12_frame_buffers.use_frame_threads) + { + vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + + return vp8dx_set_reference(ctx->yv12_frame_buffers.pbi[0], + frame->frame_type, &sd); + } + else + return VPX_CODEC_INVALID_PARAM; + +} + +static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + + vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + + if (data && !ctx->yv12_frame_buffers.use_frame_threads) + { + vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + + return vp8dx_get_reference(ctx->yv12_frame_buffers.pbi[0], + frame->frame_type, &sd); + } + else + return VPX_CODEC_INVALID_PARAM; + +} + +static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, + va_list args) +{ +#if CONFIG_POSTPROC + vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); + + if (data) + { + ctx->postproc_cfg_set = 1; + ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data); + return VPX_CODEC_OK; + } + else + return VPX_CODEC_INVALID_PARAM; + +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + + +static vpx_codec_err_t vp8_set_dbg_color_ref_frame(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC + ctx->dbg_color_ref_frame_flag = va_arg(args, int); + return VPX_CODEC_OK; +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t vp8_set_dbg_color_mb_modes(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC + ctx->dbg_color_mb_modes_flag = va_arg(args, int); + return VPX_CODEC_OK; +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t 
vp8_set_dbg_color_b_modes(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC + ctx->dbg_color_b_modes_flag = va_arg(args, int); + return VPX_CODEC_OK; +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t vp8_set_dbg_display_mv(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC + ctx->dbg_display_mv_flag = va_arg(args, int); + return VPX_CODEC_OK; +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + int *update_info = va_arg(args, int *); + + if (update_info && !ctx->yv12_frame_buffers.use_frame_threads) + { + VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; + + *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME + + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME + + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME; + + return VPX_CODEC_OK; + } + else + return VPX_CODEC_INVALID_PARAM; +} + +extern int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame ); +static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + int *ref_info = va_arg(args, int *); + + if (ref_info && !ctx->yv12_frame_buffers.use_frame_threads) + { + VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; + VP8_COMMON *oci = &pbi->common; + *ref_info = + (vp8dx_references_buffer( oci, ALTREF_FRAME )?VP8_ALTR_FRAME:0) | + (vp8dx_references_buffer( oci, GOLDEN_FRAME )?VP8_GOLD_FRAME:0) | + (vp8dx_references_buffer( oci, LAST_FRAME )?VP8_LAST_FRAME:0); + + return VPX_CODEC_OK; + } + else + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + + int *corrupted = va_arg(args, int *); + VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; + + if (corrupted && pbi) + { + const YV12_BUFFER_CONFIG *const frame = pbi->common.frame_to_show; + if (frame == NULL) return VPX_CODEC_ERROR; + *corrupted = frame->corrupted; + return VPX_CODEC_OK; + } + else + return VPX_CODEC_INVALID_PARAM; + +} + +static vpx_codec_err_t vp8_set_decryptor(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *); + + if (init) + { + ctx->decrypt_cb = init->decrypt_cb; + ctx->decrypt_state = init->decrypt_state; + } + else + { + ctx->decrypt_cb = NULL; + ctx->decrypt_state = NULL; + } + return VPX_CODEC_OK; +} + +vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = +{ + {VP8_SET_REFERENCE, vp8_set_reference}, + {VP8_COPY_REFERENCE, vp8_get_reference}, + {VP8_SET_POSTPROC, vp8_set_postproc}, + {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_color_ref_frame}, + {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_color_mb_modes}, + {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_color_b_modes}, + {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_display_mv}, + {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates}, + {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted}, + {VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame}, + {VPXD_SET_DECRYPTOR, vp8_set_decryptor}, + { -1, NULL}, +}; + + +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +CODEC_INTERFACE(vpx_codec_vp8_dx) = +{ + "WebM Project VP8 Decoder" VERSION_STRING, + VPX_CODEC_INTERNAL_ABI_VERSION, + VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC | VP8_CAP_ERROR_CONCEALMENT | + VPX_CODEC_CAP_INPUT_FRAGMENTS, + /* vpx_codec_caps_t caps; */ + vp8_init, /* vpx_codec_init_fn_t init; 
*/ + vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */ + vp8_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ + { + vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ + vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */ + vp8_decode, /* vpx_codec_decode_fn_t decode; */ + vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ + NULL, + }, + { /* encoder functions */ + 0, + NULL, /* vpx_codec_enc_cfg_map_t */ + NULL, /* vpx_codec_encode_fn_t */ + NULL, /* vpx_codec_get_cx_data_fn_t */ + NULL, /* vpx_codec_enc_config_set_fn_t */ + NULL, /* vpx_codec_get_global_headers_fn_t */ + NULL, /* vpx_codec_get_preview_frame_fn_t */ + NULL /* vpx_codec_enc_mr_get_mem_loc_fn_t */ + } +}; diff --git a/thirdparty/libvpx/vp8_rtcd.h b/thirdparty/libvpx/vp8_rtcd.h new file mode 100644 index 000000000000..c5eeb5e579fa --- /dev/null +++ b/thirdparty/libvpx/vp8_rtcd.h @@ -0,0 +1,9 @@ +#include "vpx_config.h" + +#if defined(WEBM_X86ASM) && (ARCH_X86 || ARCH_X86_64) + #include "rtcd/vp8_rtcd_x86.h" +#elif defined(WEBM_ARMASM) && ARCH_ARM + #include "rtcd/vp8_rtcd_arm.h" +#else + #include "rtcd/vp8_rtcd_c.h" +#endif diff --git a/thirdparty/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/thirdparty/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c new file mode 100644 index 000000000000..1761fada2faf --- /dev/null +++ b/thirdparty/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +static int16_t sinpi_1_9 = 0x14a3; +static int16_t sinpi_2_9 = 0x26c9; +static int16_t sinpi_3_9 = 0x3441; +static int16_t sinpi_4_9 = 0x3b6c; +static int16_t cospi_8_64 = 0x3b21; +static int16_t cospi_16_64 = 0x2d41; +static int16_t cospi_24_64 = 0x187e; + +static INLINE void TRANSPOSE4X4( + int16x8_t *q8s16, + int16x8_t *q9s16) { + int32x4_t q8s32, q9s32; + int16x4x2_t d0x2s16, d1x2s16; + int32x4x2_t q0x2s32; + + d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); + d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); + + q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); + q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); + q0x2s32 = vtrnq_s32(q8s32, q9s32); + + *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); + *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); + return; +} + +static INLINE void GENERATE_COSINE_CONSTANTS( + int16x4_t *d0s16, + int16x4_t *d1s16, + int16x4_t *d2s16) { + *d0s16 = vdup_n_s16(cospi_8_64); + *d1s16 = vdup_n_s16(cospi_16_64); + *d2s16 = vdup_n_s16(cospi_24_64); + return; +} + +static INLINE void GENERATE_SINE_CONSTANTS( + int16x4_t *d3s16, + int16x4_t *d4s16, + int16x4_t *d5s16, + int16x8_t *q3s16) { + *d3s16 = vdup_n_s16(sinpi_1_9); + *d4s16 = vdup_n_s16(sinpi_2_9); + *q3s16 = vdupq_n_s16(sinpi_3_9); + *d5s16 = vdup_n_s16(sinpi_4_9); + return; +} + +static INLINE void IDCT4x4_1D( + int16x4_t *d0s16, + int16x4_t *d1s16, + int16x4_t *d2s16, + int16x8_t *q8s16, + int16x8_t *q9s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + int32x4_t q10s32, q13s32, q14s32, q15s32; + int16x8_t q13s16, q14s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, *d2s16); + q10s32 = vmull_s16(d17s16, *d0s16); + q13s32 = vmull_s16(d23s16, *d1s16); + q14s32 = vmull_s16(d24s16, *d1s16); + q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); + q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q10s32, 14); + + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + *q8s16 = vaddq_s16(q13s16, q14s16); + *q9s16 = vsubq_s16(q13s16, q14s16); + *q9s16 = vcombine_s16(vget_high_s16(*q9s16), + vget_low_s16(*q9s16)); // vswp + return; +} + +static INLINE void IADST4x4_1D( + int16x4_t *d3s16, + int16x4_t *d4s16, + int16x4_t *d5s16, + int16x8_t *q3s16, + int16x8_t *q8s16, + int16x8_t *q9s16) { + int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; + + d6s16 = vget_low_s16(*q3s16); + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + + q10s32 = vmull_s16(*d3s16, d16s16); + q11s32 = vmull_s16(*d4s16, d16s16); + q12s32 = vmull_s16(d6s16, d17s16); + q13s32 = vmull_s16(*d5s16, d18s16); + q14s32 = vmull_s16(*d3s16, d18s16); + q15s32 = vmovl_s16(d16s16); + q15s32 = vaddw_s16(q15s32, d19s16); + q8s32 = vmull_s16(*d4s16, d19s16); + q15s32 = vsubw_s16(q15s32, d18s16); + q9s32 = vmull_s16(*d5s16, d19s16); + + q10s32 = vaddq_s32(q10s32, q13s32); + q10s32 = 
vaddq_s32(q10s32, q8s32); + q11s32 = vsubq_s32(q11s32, q14s32); + q8s32 = vdupq_n_s32(sinpi_3_9); + q11s32 = vsubq_s32(q11s32, q9s32); + q15s32 = vmulq_s32(q15s32, q8s32); + + q13s32 = vaddq_s32(q10s32, q12s32); + q10s32 = vaddq_s32(q10s32, q11s32); + q14s32 = vaddq_s32(q11s32, q12s32); + q10s32 = vsubq_s32(q10s32, q12s32); + + d16s16 = vqrshrn_n_s32(q13s32, 14); + d17s16 = vqrshrn_n_s32(q14s32, 14); + d18s16 = vqrshrn_n_s32(q15s32, 14); + d19s16 = vqrshrn_n_s32(q10s32, 14); + + *q8s16 = vcombine_s16(d16s16, d17s16); + *q9s16 = vcombine_s16(d18s16, d19s16); + return; +} + +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + uint8x8_t d26u8, d27u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; + uint32x2_t d26u32, d27u32; + int16x8_t q3s16, q8s16, q9s16; + uint16x8_t q8u16, q9u16; + + d26u32 = d27u32 = vdup_n_u32(0); + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + + TRANSPOSE4X4(&q8s16, &q9s16); + + switch (tx_type) { + case 0: // idct_idct is not supported. Fall back to C + vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type); + return; + break; + case 1: // iadst_idct + // generate constants + GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); + GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); + + // first transform rows + IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + + // transpose the matrix + TRANSPOSE4X4(&q8s16, &q9s16); + + // then transform columns + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + break; + case 2: // idct_iadst + // generate constants + GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); + GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); + + // first transform rows + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + + // transpose the matrix + TRANSPOSE4X4(&q8s16, &q9s16); + + // then transform columns + IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + break; + case 3: // iadst_iadst + // generate constants + GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); + + // first transform rows + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + + // transpose the matrix + TRANSPOSE4X4(&q8s16, &q9s16); + + // then transform columns + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + break; + default: // iadst_idct + assert(0); + break; + } + + q8s16 = vrshrq_n_s16(q8s16, 4); + q9s16 = vrshrq_n_s16(q9s16, 4); + + d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); + dest += dest_stride; + d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); + dest += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); + dest += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); + + d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); + dest -= dest_stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); + dest -= dest_stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); + dest -= dest_stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); + return; +} diff --git a/thirdparty/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/thirdparty/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c new file mode 100644 index 000000000000..04b342c3d34f 
--- /dev/null +++ b/thirdparty/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +static int16_t cospi_2_64 = 16305; +static int16_t cospi_4_64 = 16069; +static int16_t cospi_6_64 = 15679; +static int16_t cospi_8_64 = 15137; +static int16_t cospi_10_64 = 14449; +static int16_t cospi_12_64 = 13623; +static int16_t cospi_14_64 = 12665; +static int16_t cospi_16_64 = 11585; +static int16_t cospi_18_64 = 10394; +static int16_t cospi_20_64 = 9102; +static int16_t cospi_22_64 = 7723; +static int16_t cospi_24_64 = 6270; +static int16_t cospi_26_64 = 4756; +static int16_t cospi_28_64 = 3196; +static int16_t cospi_30_64 = 1606; + +static INLINE void TRANSPOSE8X8( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), + vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), + vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), + vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), + vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 
= q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; +} + +static INLINE void IDCT8x8_1D( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d26s16, d2s16); + q6s32 = vmull_s16(d27s16, d2s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); + q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q2s32 = vmull_s16(d18s16, d1s16); + q3s32 = vmull_s16(d19s16, d1s16); + q9s32 = vmull_s16(d26s16, d3s16); + q13s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlal_s16(q2s32, d30s16, d0s16); + q3s32 = vmlal_s16(q3s32, d31s16, d0s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q13s32 = vmlal_s16(q13s32, d23s16, d2s16); + + d14s16 = vqrshrn_n_s32(q2s32, 14); + d15s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q13s32, 14); + q6s16 = vcombine_s16(d12s16, d13s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d0s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d0s16); + q3s32 = vmull_s16(d17s16, d0s16); + q13s32 = vmull_s16(d16s16, d0s16); + q15s32 = vmull_s16(d17s16, d0s16); + + q2s32 = vmlal_s16(q2s32, d24s16, d0s16); + q3s32 = vmlal_s16(q3s32, d25s16, d0s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); + q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); + + d0s16 = vdup_n_s16(cospi_24_64); + d1s16 = vdup_n_s16(cospi_8_64); + + d18s16 = vqrshrn_n_s32(q2s32, 14); + d19s16 = vqrshrn_n_s32(q3s32, 14); + d22s16 = vqrshrn_n_s32(q13s32, 14); + d23s16 = vqrshrn_n_s32(q15s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q2s32 = vmull_s16(d20s16, d0s16); + q3s32 = vmull_s16(d21s16, d0s16); + q8s32 = vmull_s16(d20s16, d1s16); + q12s32 = vmull_s16(d21s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); + q8s32 = vmlal_s16(q8s32, d28s16, d0s16); + q12s32 = vmlal_s16(q12s32, d29s16, d0s16); + + d26s16 = vqrshrn_n_s32(q2s32, 14); + 
d27s16 = vqrshrn_n_s32(q3s32, 14); + d30s16 = vqrshrn_n_s32(q8s32, 14); + d31s16 = vqrshrn_n_s32(q12s32, 14); + *q13s16 = vcombine_s16(d26s16, d27s16); + *q15s16 = vcombine_s16(d30s16, d31s16); + + q0s16 = vaddq_s16(*q9s16, *q15s16); + q1s16 = vaddq_s16(*q11s16, *q13s16); + q2s16 = vsubq_s16(*q11s16, *q13s16); + q3s16 = vsubq_s16(*q9s16, *q15s16); + + *q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + *q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + *q8s16 = vaddq_s16(q0s16, q7s16); + *q9s16 = vaddq_s16(q1s16, q6s16); + *q10s16 = vaddq_s16(q2s16, q5s16); + *q11s16 = vaddq_s16(q3s16, q4s16); + *q12s16 = vsubq_s16(q3s16, q4s16); + *q13s16 = vsubq_s16(q2s16, q5s16); + *q14s16 = vsubq_s16(q1s16, q6s16); + *q15s16 = vsubq_s16(q0s16, q7s16); + return; +} + +static INLINE void IADST8X8_1D( + int16x8_t *q8s16, + int16x8_t *q9s16, + int16x8_t *q10s16, + int16x8_t *q11s16, + int16x8_t *q12s16, + int16x8_t *q13s16, + int16x8_t *q14s16, + int16x8_t *q15s16) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q2s16, q4s16, q5s16, q6s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; + int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + d14s16 = vdup_n_s16(cospi_2_64); + d15s16 = vdup_n_s16(cospi_30_64); + + q1s32 = vmull_s16(d30s16, d14s16); + q2s32 = vmull_s16(d31s16, d14s16); + q3s32 = vmull_s16(d30s16, d15s16); + q4s32 = vmull_s16(d31s16, d15s16); + + d30s16 = vdup_n_s16(cospi_18_64); + d31s16 = vdup_n_s16(cospi_14_64); + + q1s32 = vmlal_s16(q1s32, d16s16, d15s16); + q2s32 = vmlal_s16(q2s32, d17s16, d15s16); + q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); + q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); + + q5s32 = vmull_s16(d22s16, d30s16); + q6s32 = vmull_s16(d23s16, d30s16); + q7s32 = vmull_s16(d22s16, d31s16); + q8s32 = vmull_s16(d23s16, d31s16); + + q5s32 = vmlal_s16(q5s32, d24s16, d31s16); + q6s32 = vmlal_s16(q6s32, d25s16, d31s16); + q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); + q8s32 = 
vmlsl_s16(q8s32, d25s16, d30s16); + + q11s32 = vaddq_s32(q1s32, q5s32); + q12s32 = vaddq_s32(q2s32, q6s32); + q1s32 = vsubq_s32(q1s32, q5s32); + q2s32 = vsubq_s32(q2s32, q6s32); + + d22s16 = vqrshrn_n_s32(q11s32, 14); + d23s16 = vqrshrn_n_s32(q12s32, 14); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q12s32 = vaddq_s32(q3s32, q7s32); + q15s32 = vaddq_s32(q4s32, q8s32); + q3s32 = vsubq_s32(q3s32, q7s32); + q4s32 = vsubq_s32(q4s32, q8s32); + + d2s16 = vqrshrn_n_s32(q1s32, 14); + d3s16 = vqrshrn_n_s32(q2s32, 14); + d24s16 = vqrshrn_n_s32(q12s32, 14); + d25s16 = vqrshrn_n_s32(q15s32, 14); + d6s16 = vqrshrn_n_s32(q3s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + *q12s16 = vcombine_s16(d24s16, d25s16); + + d0s16 = vdup_n_s16(cospi_10_64); + d1s16 = vdup_n_s16(cospi_22_64); + q4s32 = vmull_s16(d26s16, d0s16); + q5s32 = vmull_s16(d27s16, d0s16); + q2s32 = vmull_s16(d26s16, d1s16); + q6s32 = vmull_s16(d27s16, d1s16); + + d30s16 = vdup_n_s16(cospi_26_64); + d31s16 = vdup_n_s16(cospi_6_64); + + q4s32 = vmlal_s16(q4s32, d20s16, d1s16); + q5s32 = vmlal_s16(q5s32, d21s16, d1s16); + q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); + q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); + + q0s32 = vmull_s16(d18s16, d30s16); + q13s32 = vmull_s16(d19s16, d30s16); + + q0s32 = vmlal_s16(q0s32, d28s16, d31s16); + q13s32 = vmlal_s16(q13s32, d29s16, d31s16); + + q10s32 = vmull_s16(d18s16, d31s16); + q9s32 = vmull_s16(d19s16, d31s16); + + q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); + q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); + + q14s32 = vaddq_s32(q2s32, q10s32); + q15s32 = vaddq_s32(q6s32, q9s32); + q2s32 = vsubq_s32(q2s32, q10s32); + q6s32 = vsubq_s32(q6s32, q9s32); + + d28s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d4s16 = vqrshrn_n_s32(q2s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + *q14s16 = vcombine_s16(d28s16, d29s16); + + q9s32 = vaddq_s32(q4s32, q0s32); + q10s32 = vaddq_s32(q5s32, q13s32); + q4s32 = vsubq_s32(q4s32, q0s32); + q5s32 = vsubq_s32(q5s32, q13s32); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + d18s16 = vqrshrn_n_s32(q9s32, 14); + d19s16 = vqrshrn_n_s32(q10s32, 14); + d8s16 = vqrshrn_n_s32(q4s32, 14); + d9s16 = vqrshrn_n_s32(q5s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + + q5s32 = vmull_s16(d2s16, d30s16); + q6s32 = vmull_s16(d3s16, d30s16); + q7s32 = vmull_s16(d2s16, d31s16); + q0s32 = vmull_s16(d3s16, d31s16); + + q5s32 = vmlal_s16(q5s32, d6s16, d31s16); + q6s32 = vmlal_s16(q6s32, d7s16, d31s16); + q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); + q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); + + q1s32 = vmull_s16(d4s16, d30s16); + q3s32 = vmull_s16(d5s16, d30s16); + q10s32 = vmull_s16(d4s16, d31s16); + q2s32 = vmull_s16(d5s16, d31s16); + + q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); + q3s32 = vmlsl_s16(q3s32, d9s16, d31s16); + q10s32 = vmlal_s16(q10s32, d8s16, d30s16); + q2s32 = vmlal_s16(q2s32, d9s16, d30s16); + + *q8s16 = vaddq_s16(*q11s16, *q9s16); + *q11s16 = vsubq_s16(*q11s16, *q9s16); + q4s16 = vaddq_s16(*q12s16, *q14s16); + *q12s16 = vsubq_s16(*q12s16, *q14s16); + + q14s32 = vaddq_s32(q5s32, q1s32); + q15s32 = vaddq_s32(q6s32, q3s32); + q5s32 = vsubq_s32(q5s32, q1s32); + q6s32 = vsubq_s32(q6s32, q3s32); + + d18s16 = vqrshrn_n_s32(q14s32, 14); + d19s16 = vqrshrn_n_s32(q15s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + + q1s32 = vaddq_s32(q7s32, q10s32); + q3s32 = vaddq_s32(q0s32, q2s32); + q7s32 = vsubq_s32(q7s32, q10s32); + q0s32 = vsubq_s32(q0s32, q2s32); + + d28s16 = 
vqrshrn_n_s32(q1s32, 14); + d29s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q7s32, 14); + d15s16 = vqrshrn_n_s32(q0s32, 14); + *q14s16 = vcombine_s16(d28s16, d29s16); + + d30s16 = vdup_n_s16(cospi_16_64); + + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + q2s32 = vmull_s16(d22s16, d30s16); + q3s32 = vmull_s16(d23s16, d30s16); + q13s32 = vmull_s16(d22s16, d30s16); + q1s32 = vmull_s16(d23s16, d30s16); + + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + q2s32 = vmlal_s16(q2s32, d24s16, d30s16); + q3s32 = vmlal_s16(q3s32, d25s16, d30s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); + q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); + + d4s16 = vqrshrn_n_s32(q2s32, 14); + d5s16 = vqrshrn_n_s32(q3s32, 14); + d24s16 = vqrshrn_n_s32(q13s32, 14); + d25s16 = vqrshrn_n_s32(q1s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + *q12s16 = vcombine_s16(d24s16, d25s16); + + q13s32 = vmull_s16(d10s16, d30s16); + q1s32 = vmull_s16(d11s16, d30s16); + q11s32 = vmull_s16(d10s16, d30s16); + q0s32 = vmull_s16(d11s16, d30s16); + + q13s32 = vmlal_s16(q13s32, d14s16, d30s16); + q1s32 = vmlal_s16(q1s32, d15s16, d30s16); + q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); + q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); + + d20s16 = vqrshrn_n_s32(q13s32, 14); + d21s16 = vqrshrn_n_s32(q1s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q0s32, 14); + *q10s16 = vcombine_s16(d20s16, d21s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + q5s16 = vdupq_n_s16(0); + + *q9s16 = vsubq_s16(q5s16, *q9s16); + *q11s16 = vsubq_s16(q5s16, q2s16); + *q13s16 = vsubq_s16(q5s16, q6s16); + *q15s16 = vsubq_s16(q5s16, q4s16); + return; +} + +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + int i; + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 8 * 2); + q11s16 = vld1q_s16(input + 8 * 3); + q12s16 = vld1q_s16(input + 8 * 4); + q13s16 = vld1q_s16(input + 8 * 5); + q14s16 = vld1q_s16(input + 8 * 6); + q15s16 = vld1q_s16(input + 8 * 7); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + switch (tx_type) { + case 0: // idct_idct is not supported. 
Fall back to C + vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type); + return; + break; + case 1: // iadst_idct + // generate IDCT constants + // GENERATE_IDCT_CONSTANTS + + // first transform rows + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // transpose the matrix + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // generate IADST constants + // GENERATE_IADST_CONSTANTS + + // then transform columns + IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + break; + case 2: // idct_iadst + // generate IADST constants + // GENERATE_IADST_CONSTANTS + + // first transform rows + IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // transpose the matrix + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // generate IDCT constants + // GENERATE_IDCT_CONSTANTS + + // then transform columns + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + break; + case 3: // iadst_iadst + // generate IADST constants + // GENERATE_IADST_CONSTANTS + + // first transform rows + IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // transpose the matrix + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // then transform columns + IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + break; + default: // iadst_idct + assert(0); + break; + } + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + for (d1 = d2 = dest, i = 0; i < 2; i++) { + if (i != 0) { + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + } + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + } + return; +} diff --git a/thirdparty/libvpx/vp9/common/vp9_alloccommon.c b/thirdparty/libvpx/vp9/common/vp9_alloccommon.c new file mode 100644 index 000000000000..7dd1005d3feb --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_alloccommon.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_onyxc_int.h" + +// TODO(hkuang): Don't need to lock the whole pool after implementing atomic +// frame reference count. +void lock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +void unlock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + + cm->mi_cols = aligned_width >> MI_SIZE_LOG2; + cm->mi_rows = aligned_height >> MI_SIZE_LOG2; + cm->mi_stride = calc_mi_size(cm->mi_cols); + + cm->mb_cols = (cm->mi_cols + 1) >> 1; + cm->mb_rows = (cm->mi_rows + 1) >> 1; + cm->MBs = cm->mb_rows * cm->mb_cols; +} + +static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { + int i; + + for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) { + cm->seg_map_array[i] = (uint8_t *)vpx_calloc(seg_map_size, 1); + if (cm->seg_map_array[i] == NULL) + return 1; + } + cm->seg_map_alloc_size = seg_map_size; + + // Init the index. + cm->seg_map_idx = 0; + cm->prev_seg_map_idx = 1; + + cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; + if (!cm->frame_parallel_decode) + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + + return 0; +} + +static void free_seg_map(VP9_COMMON *cm) { + int i; + + for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) { + vpx_free(cm->seg_map_array[i]); + cm->seg_map_array[i] = NULL; + } + + cm->current_frame_seg_map = NULL; + + if (!cm->frame_parallel_decode) { + cm->last_frame_seg_map = NULL; + } +} + +void vp9_free_ref_frame_buffers(BufferPool *pool) { + int i; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (pool->frame_bufs[i].ref_count > 0 && + pool->frame_bufs[i].raw_frame_buffer.data != NULL) { + pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); + pool->frame_bufs[i].ref_count = 0; + } + vpx_free(pool->frame_bufs[i].mvs); + pool->frame_bufs[i].mvs = NULL; + vpx_free_frame_buffer(&pool->frame_bufs[i].buf); + } +} + +void vp9_free_postproc_buffers(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + vpx_free_frame_buffer(&cm->post_proc_buffer); + vpx_free_frame_buffer(&cm->post_proc_buffer_int); +#else + (void)cm; +#endif +} + +void vp9_free_context_buffers(VP9_COMMON *cm) { + cm->free_mi(cm); + free_seg_map(cm); + vpx_free(cm->above_context); + cm->above_context = NULL; + vpx_free(cm->above_seg_context); + cm->above_seg_context = NULL; + vpx_free(cm->lf.lfm); + cm->lf.lfm = NULL; +} + + +int vp9_alloc_loop_filter(VP9_COMMON *cm) { + vpx_free(cm->lf.lfm); + // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region. The + // stride and rows are rounded up / truncated to a multiple of 8. 
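+  // Worked example (our annotation; assumes MI_BLOCK_SIZE == 8, i.e. a 64x64
+  // superblock spans 8 mode-info units of 8x8 pixels): a 1920-pixel-wide
+  // frame has mi_cols = 240, so lfm_stride = (240 + 7) >> 3 = 30 masks per
+  // superblock row.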
+ cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3; + cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc( + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride, + sizeof(*cm->lf.lfm)); + if (!cm->lf.lfm) + return 1; + return 0; +} + +int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { + int new_mi_size; + + vp9_set_mb_mi(cm, width, height); + new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); + if (cm->mi_alloc_size < new_mi_size) { + cm->free_mi(cm); + if (cm->alloc_mi(cm, new_mi_size)) + goto fail; + } + + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) + goto fail; + } + + if (cm->above_context_alloc_cols < cm->mi_cols) { + vpx_free(cm->above_context); + cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( + 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, + sizeof(*cm->above_context)); + if (!cm->above_context) goto fail; + + vpx_free(cm->above_seg_context); + cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc( + mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context)); + if (!cm->above_seg_context) goto fail; + cm->above_context_alloc_cols = cm->mi_cols; + } + + if (vp9_alloc_loop_filter(cm)) + goto fail; + + return 0; + + fail: + vp9_free_context_buffers(cm); + return 1; +} + +void vp9_remove_common(VP9_COMMON *cm) { + vp9_free_context_buffers(cm); + + vpx_free(cm->fc); + cm->fc = NULL; + vpx_free(cm->frame_contexts); + cm->frame_contexts = NULL; +} + +void vp9_init_context_buffers(VP9_COMMON *cm) { + cm->setup_mi(cm); + if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); +} + +void vp9_swap_current_and_last_seg_map(VP9_COMMON *cm) { + // Swap indices. + const int tmp = cm->seg_map_idx; + cm->seg_map_idx = cm->prev_seg_map_idx; + cm->prev_seg_map_idx = tmp; + + cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; +} diff --git a/thirdparty/libvpx/vp9/common/vp9_alloccommon.h b/thirdparty/libvpx/vp9/common/vp9_alloccommon.h new file mode 100644 index 000000000000..e53955b99883 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_alloccommon.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_ +#define VP9_COMMON_VP9_ALLOCCOMMON_H_ + +#define INVALID_IDX -1 // Invalid buffer index. 
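+// Note (our annotation): decoder-side code initializes its reference-frame
+// map entries to INVALID_IDX and compares against it to detect slots that do
+// not yet hold a frame buffer.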
+ +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; +struct BufferPool; + +void vp9_remove_common(struct VP9Common *cm); + +int vp9_alloc_loop_filter(struct VP9Common *cm); +int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height); +void vp9_init_context_buffers(struct VP9Common *cm); +void vp9_free_context_buffers(struct VP9Common *cm); + +void vp9_free_ref_frame_buffers(struct BufferPool *pool); +void vp9_free_postproc_buffers(struct VP9Common *cm); + +int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height); +void vp9_free_state_buffers(struct VP9Common *cm); + +void vp9_set_mb_mi(struct VP9Common *cm, int width, int height); + +void vp9_swap_current_and_last_seg_map(struct VP9Common *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_blockd.c b/thirdparty/libvpx/vp9/common/vp9_blockd.c new file mode 100644 index 000000000000..7bab27d4fd04 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_blockd.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_blockd.h" + +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b) { + if (b == 0 || b == 2) { + if (!left_mi || is_inter_block(left_mi)) + return DC_PRED; + + return get_y_mode(left_mi, b + 1); + } else { + assert(b == 1 || b == 3); + return cur_mi->bmi[b - 1].as_mode; + } +} + +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b) { + if (b == 0 || b == 1) { + if (!above_mi || is_inter_block(above_mi)) + return DC_PRED; + + return get_y_mode(above_mi, b + 2); + } else { + assert(b == 2 || b == 3); + return cur_mi->bmi[b - 2].as_mode; + } +} + +void vp9_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MODE_INFO* mi = xd->mi[0]; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) + : mi->tx_size; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int step = 1 << (tx_size << 1); + int i = 0, r, c; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : + xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 
0 : + xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + const int extra_step = ((num_4x4_w - max_blocks_wide) >> tx_size) * step; + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (r = 0; r < max_blocks_high; r += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_blocks_wide; c += (1 << tx_size)) { + visit(plane, i, plane_bsize, tx_size, arg); + i += step; + } + i += extra_step; + } +} + +void vp9_foreach_transformed_block(const MACROBLOCKD* const xd, + BLOCK_SIZE bsize, + foreach_transformed_block_visitor visit, + void *arg) { + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); +} + +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_context + loff; + const int tx_size_in_blocks = 1 << tx_size; + + // above + if (has_eob && xd->mb_to_right_edge < 0) { + int i; + const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] + + (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + int above_contexts = tx_size_in_blocks; + if (above_contexts + aoff > blocks_wide) + above_contexts = blocks_wide - aoff; + + for (i = 0; i < above_contexts; ++i) + a[i] = has_eob; + for (i = above_contexts; i < tx_size_in_blocks; ++i) + a[i] = 0; + } else { + memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } + + // left + if (has_eob && xd->mb_to_bottom_edge < 0) { + int i; + const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] + + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + int left_contexts = tx_size_in_blocks; + if (left_contexts + loff > blocks_high) + left_contexts = blocks_high - loff; + + for (i = 0; i < left_contexts; ++i) + l[i] = has_eob; + for (i = left_contexts; i < tx_size_in_blocks; ++i) + l[i] = 0; + } else { + memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } +} + +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? ss_y : 0; + } +} diff --git a/thirdparty/libvpx/vp9/common/vp9_blockd.h b/thirdparty/libvpx/vp9/common/vp9_blockd.h new file mode 100644 index 000000000000..3d26fb2b5d05 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_blockd.h @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP9_COMMON_VP9_BLOCKD_H_ +#define VP9_COMMON_VP9_BLOCKD_H_ + +#include "./vpx_config.h" + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" +#include "vpx_scale/yv12config.h" + +#include "vp9/common/vp9_common_data.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_scale.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_tile_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_MB_PLANE 3 + +typedef enum { + KEY_FRAME = 0, + INTER_FRAME = 1, + FRAME_TYPES, +} FRAME_TYPE; + +static INLINE int is_inter_mode(PREDICTION_MODE mode) { + return mode >= NEARESTMV && mode <= NEWMV; +} + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. */ + +typedef struct { + PREDICTION_MODE as_mode; + int_mv as_mv[2]; // first, second inter predictor motion vectors +} b_mode_info; + +// Note that the rate-distortion optimization loop, bit-stream writer, and +// decoder implementation modules critically rely on the defined entry values +// specified herein. They should be refactored concurrently. + +#define NONE -1 +#define INTRA_FRAME 0 +#define LAST_FRAME 1 +#define GOLDEN_FRAME 2 +#define ALTREF_FRAME 3 +#define MAX_REF_FRAMES 4 +typedef int8_t MV_REFERENCE_FRAME; + +// This structure now relates to 8x8 block regions. +typedef struct MODE_INFO { + // Common for both INTER and INTRA blocks + BLOCK_SIZE sb_type; + PREDICTION_MODE mode; + TX_SIZE tx_size; + int8_t skip; + int8_t segment_id; + int8_t seg_id_predicted; // valid only when temporal_update is enabled + + // Only for INTRA blocks + PREDICTION_MODE uv_mode; + + // Only for INTER blocks + INTERP_FILTER interp_filter; + MV_REFERENCE_FRAME ref_frame[2]; + + // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead. + int_mv mv[2]; + + b_mode_info bmi[4]; +} MODE_INFO; + +static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) { + return mi->sb_type < BLOCK_8X8 ? 
mi->bmi[block].as_mode + : mi->mode; +} + +static INLINE int is_inter_block(const MODE_INFO *mi) { + return mi->ref_frame[0] > INTRA_FRAME; +} + +static INLINE int has_second_ref(const MODE_INFO *mi) { + return mi->ref_frame[1] > INTRA_FRAME; +} + +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b); + +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b); + +enum mv_precision { + MV_PRECISION_Q3, + MV_PRECISION_Q4 +}; + +struct buf_2d { + uint8_t *buf; + int stride; +}; + +struct macroblockd_plane { + tran_low_t *dqcoeff; + int subsampling_x; + int subsampling_y; + struct buf_2d dst; + struct buf_2d pre[2]; + ENTROPY_CONTEXT *above_context; + ENTROPY_CONTEXT *left_context; + int16_t seg_dequant[MAX_SEGMENTS][2]; + + // number of 4x4s in current block + uint16_t n4_w, n4_h; + // log2 of n4_w, n4_h + uint8_t n4_wl, n4_hl; + + // encoder + const int16_t *dequant; +}; + +#define BLOCK_OFFSET(x, i) ((x) + (i) * 16) + +typedef struct RefBuffer { + // TODO(dkovalev): idx is not really required and should be removed, now it + // is used in vp9_onyxd_if.c + int idx; + YV12_BUFFER_CONFIG *buf; + struct scale_factors sf; +} RefBuffer; + +typedef struct macroblockd { + struct macroblockd_plane plane[MAX_MB_PLANE]; + uint8_t bmode_blocks_wl; + uint8_t bmode_blocks_hl; + + FRAME_COUNTS *counts; + TileInfo tile; + + int mi_stride; + + MODE_INFO **mi; + MODE_INFO *left_mi; + MODE_INFO *above_mi; + + unsigned int max_blocks_wide; + unsigned int max_blocks_high; + + const vpx_prob (*partition_probs)[PARTITION_TYPES - 1]; + + /* Distance of MB away from frame edges */ + int mb_to_left_edge; + int mb_to_right_edge; + int mb_to_top_edge; + int mb_to_bottom_edge; + + FRAME_CONTEXT *fc; + + /* pointers to reference frames */ + RefBuffer *block_refs[2]; + + /* pointer to current frame */ + const YV12_BUFFER_CONFIG *cur_buf; + + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; + + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; + +#if CONFIG_VP9_HIGHBITDEPTH + /* Bit depth: 8, 10, 12 */ + int bd; +#endif + + int lossless; + int corrupted; + + struct vpx_internal_error_info *error_info; +} MACROBLOCKD; + +static INLINE PLANE_TYPE get_plane_type(int plane) { + return (PLANE_TYPE)(plane > 0); +} + +static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + return subsize_lookup[partition][bsize]; +} + +extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES]; + +static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, + const MACROBLOCKD *xd) { + const MODE_INFO *const mi = xd->mi[0]; + + if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mi)) + return DCT_DCT; + + return intra_mode_to_tx_type_lookup[mi->mode]; +} + +static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, + const MACROBLOCKD *xd, int ib) { + const MODE_INFO *const mi = xd->mi[0]; + + if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mi)) + return DCT_DCT; + + return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)]; +} + +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y); + +static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize, + int xss, int yss) { + if (bsize < BLOCK_8X8) { + return TX_4X4; + } else { + const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][xss][yss]; + return VPXMIN(y_tx_size, max_txsize_lookup[plane_bsize]); + } +} + +static INLINE TX_SIZE get_uv_tx_size(const 
MODE_INFO *mi, + const struct macroblockd_plane *pd) { + return get_uv_tx_size_impl(mi->tx_size, mi->sb_type, pd->subsampling_x, + pd->subsampling_y); +} + +static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd) { + return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; +} + +static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + memset(pd->above_context, 0, + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]); + memset(pd->left_context, 0, + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]); + } +} + +static INLINE const vpx_prob *get_y_mode_probs(const MODE_INFO *mi, + const MODE_INFO *above_mi, + const MODE_INFO *left_mi, + int block) { + const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); + const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); + return vp9_kf_y_mode_prob[above][left]; +} + +typedef void (*foreach_transformed_block_visitor)(int plane, int block, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, + void *arg); + +void vp9_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg); + + +void vp9_foreach_transformed_block( + const MACROBLOCKD* const xd, BLOCK_SIZE bsize, + foreach_transformed_block_visitor visit, void *arg); + +static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int block, + int *x, int *y) { + const int bwl = b_width_log2_lookup[plane_bsize]; + const int tx_cols_log2 = bwl - tx_size; + const int tx_cols = 1 << tx_cols_log2; + const int raster_mb = block >> (tx_size << 1); + *x = (raster_mb & (tx_cols - 1)) << tx_size; + *y = (raster_mb >> tx_cols_log2) << tx_size; +} + +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_common.h b/thirdparty/libvpx/vp9/common/vp9_common.h new file mode 100644 index 000000000000..908fa80a31c0 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_common.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_COMMON_H_ +#define VP9_COMMON_VP9_COMMON_H_ + +/* Interface header for common constant data structures and lookup tables */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/bitops.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Only need this for fixed-size arrays, for structs just assign. +#define vp9_copy(dest, src) { \ + assert(sizeof(dest) == sizeof(src)); \ + memcpy(dest, src, sizeof(src)); \ + } + +// Use this for variably-sized arrays. 
+#define vp9_copy_array(dest, src, n) { \ + assert(sizeof(*dest) == sizeof(*src)); \ + memcpy(dest, src, n * sizeof(*src)); \ + } + +#define vp9_zero(dest) memset(&(dest), 0, sizeof(dest)) +#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest)) + +static INLINE int get_unsigned_bits(unsigned int num_values) { + return num_values > 0 ? get_msb(num_values) + 1 : 0; +} + +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(cm, lval, expr) do { \ + lval = (expr); \ + if (!lval) \ + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate "#lval" at %s:%d", \ + __FILE__, __LINE__); \ + } while (0) +#else +#define CHECK_MEM_ERROR(cm, lval, expr) do { \ + lval = (expr); \ + if (!lval) \ + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate "#lval); \ + } while (0) +#endif + +#define VP9_SYNC_CODE_0 0x49 +#define VP9_SYNC_CODE_1 0x83 +#define VP9_SYNC_CODE_2 0x42 + +#define VP9_FRAME_MARKER 0x2 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_COMMON_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_common_data.c b/thirdparty/libvpx/vp9/common/vp9_common_data.c new file mode 100644 index 000000000000..3409d04844ba --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_common_data.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common_data.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Log 2 conversion lookup tables for block width and height +const uint8_t b_width_log2_lookup[BLOCK_SIZES] = + {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4}; +const uint8_t b_height_log2_lookup[BLOCK_SIZES] = + {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4}; +const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] = + {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16}; +const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] = + {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16}; +// Log 2 conversion lookup tables for modeinfo width and height +const uint8_t mi_width_log2_lookup[BLOCK_SIZES] = + {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; +const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; +const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; + +// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize))) +const uint8_t size_group_lookup[BLOCK_SIZES] = + {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3}; + +const uint8_t num_pels_log2_lookup[BLOCK_SIZES] = + {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12}; + +const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = { + { // 4X4 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID + }, { // 8X8 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, 
PARTITION_INVALID + }, { // 16X16 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID + }, { // 32X32 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, + PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID + }, { // 64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, + PARTITION_NONE + } +}; + +const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = { + { // PARTITION_NONE + BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, + BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, + BLOCK_64X64, + }, { // PARTITION_HORZ + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_64X32, + }, { // PARTITION_VERT + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_32X64, + }, { // PARTITION_SPLIT + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_4X4, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_16X16, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_32X32, + } +}; + +const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { + TX_4X4, TX_4X4, TX_4X4, + TX_8X8, TX_8X8, TX_8X8, + TX_16X16, TX_16X16, TX_16X16, + TX_32X32, TX_32X32, TX_32X32, TX_32X32 +}; + +const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = { + BLOCK_4X4, // TX_4X4 + BLOCK_8X8, // TX_8X8 + BLOCK_16X16, // TX_16X16 + BLOCK_32X32, // TX_32X32 +}; + +const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { + TX_4X4, // ONLY_4X4 + TX_8X8, // ALLOW_8X8 + TX_16X16, // ALLOW_16X16 + TX_32X32, // ALLOW_32X32 + TX_32X32, // TX_MODE_SELECT +}; + +const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { +// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 +// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + {{BLOCK_4X4, BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}}, + {{BLOCK_4X8, BLOCK_4X4}, {BLOCK_INVALID, BLOCK_INVALID}}, + {{BLOCK_8X4, BLOCK_INVALID}, {BLOCK_4X4, BLOCK_INVALID}}, + {{BLOCK_8X8, BLOCK_8X4}, {BLOCK_4X8, BLOCK_4X4}}, + {{BLOCK_8X16, BLOCK_8X8}, {BLOCK_INVALID, BLOCK_4X8}}, + {{BLOCK_16X8, BLOCK_INVALID}, {BLOCK_8X8, BLOCK_8X4}}, + {{BLOCK_16X16, BLOCK_16X8}, {BLOCK_8X16, BLOCK_8X8}}, + {{BLOCK_16X32, BLOCK_16X16}, {BLOCK_INVALID, BLOCK_8X16}}, + {{BLOCK_32X16, BLOCK_INVALID}, {BLOCK_16X16, BLOCK_16X8}}, + {{BLOCK_32X32, BLOCK_32X16}, {BLOCK_16X32, BLOCK_16X16}}, + {{BLOCK_32X64, BLOCK_32X32}, {BLOCK_INVALID, BLOCK_16X32}}, + {{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32, BLOCK_32X16}}, + {{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}}, +}; + +// Generates 4 bit field in which each bit set to 1 represents +// a blocksize partition 1111 means we split 64x64, 32x32, 16x16 +// and 8x8. 
1000 means we just split the 64x64 to 32x32 +const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES]= { + {15, 15}, // 4X4 - {0b1111, 0b1111} + {15, 14}, // 4X8 - {0b1111, 0b1110} + {14, 15}, // 8X4 - {0b1110, 0b1111} + {14, 14}, // 8X8 - {0b1110, 0b1110} + {14, 12}, // 8X16 - {0b1110, 0b1100} + {12, 14}, // 16X8 - {0b1100, 0b1110} + {12, 12}, // 16X16 - {0b1100, 0b1100} + {12, 8 }, // 16X32 - {0b1100, 0b1000} + {8, 12}, // 32X16 - {0b1000, 0b1100} + {8, 8 }, // 32X32 - {0b1000, 0b1000} + {8, 0 }, // 32X64 - {0b1000, 0b0000} + {0, 8 }, // 64X32 - {0b0000, 0b1000} + {0, 0 }, // 64X64 - {0b0000, 0b0000} +}; + +#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH +const uint8_t need_top_left[INTRA_MODES] = { + 0, // DC_PRED + 0, // V_PRED + 0, // H_PRED + 0, // D45_PRED + 1, // D135_PRED + 1, // D117_PRED + 1, // D153_PRED + 0, // D207_PRED + 0, // D63_PRED + 1, // TM_PRED +}; +#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vp9/common/vp9_common_data.h b/thirdparty/libvpx/vp9/common/vp9_common_data.h new file mode 100644 index 000000000000..0ae24dad5499 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_common_data.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_COMMON_DATA_H_ +#define VP9_COMMON_VP9_COMMON_DATA_H_ + +#include "vp9/common/vp9_enums.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern const uint8_t b_width_log2_lookup[BLOCK_SIZES]; +extern const uint8_t b_height_log2_lookup[BLOCK_SIZES]; +extern const uint8_t mi_width_log2_lookup[BLOCK_SIZES]; +extern const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES]; +extern const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES]; +extern const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES]; +extern const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES]; +extern const uint8_t size_group_lookup[BLOCK_SIZES]; +extern const uint8_t num_pels_log2_lookup[BLOCK_SIZES]; +extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; +extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; +extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; +extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES]; +extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES]; +extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; +#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH +extern const uint8_t need_top_left[INTRA_MODES]; +#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_COMMON_DATA_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_debugmodes.c b/thirdparty/libvpx/vp9/common/vp9_debugmodes.c new file mode 100644 index 000000000000..d9c1fd968618 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_debugmodes.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdio.h> + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" + +static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { + fprintf(f, "%s", str); + fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame, + cm->show_frame, cm->base_qindex); +} +/* This function dereferences a pointer to the mbmi structure + * and uses the passed in member offset to print out the value of an integer + * for each mbmi member value in the mi structure. + */ +static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, + size_t member_offset) { + int mi_row, mi_col; + MODE_INFO **mi = cm->mi_grid_visible; + int rows = cm->mi_rows; + int cols = cm->mi_cols; + char prefix = descriptor[0]; + + log_frame_info(cm, descriptor, file); + for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(file, "%c ", prefix); + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(file, "%2d ", + *((int*) ((char *) (mi[0]) + + member_offset))); + mi++; + } + fprintf(file, "\n"); + mi += 8; + } + fprintf(file, "\n"); +} + +void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) { + int mi_row; + int mi_col; + FILE *mvs = fopen(file, "a"); + MODE_INFO **mi = cm->mi_grid_visible; + int rows = cm->mi_rows; + int cols = cm->mi_cols; + + print_mi_data(cm, mvs, "Partitions:", offsetof(MODE_INFO, sb_type)); + print_mi_data(cm, mvs, "Modes:", offsetof(MODE_INFO, mode)); + print_mi_data(cm, mvs, "Ref frame:", offsetof(MODE_INFO, ref_frame[0])); + print_mi_data(cm, mvs, "Transform:", offsetof(MODE_INFO, tx_size)); + print_mi_data(cm, mvs, "UV Modes:", offsetof(MODE_INFO, uv_mode)); + + // output skip information. + log_frame_info(cm, "Skips:", mvs); + for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs, "S "); + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%2d ", mi[0]->skip); + mi++; + } + fprintf(mvs, "\n"); + mi += 8; + } + fprintf(mvs, "\n"); + + // output motion vectors. + log_frame_info(cm, "Vectors ", mvs); + mi = cm->mi_grid_visible; + for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs, "V "); + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, + mi[0]->mv[0].as_mv.col); + mi++; + } + fprintf(mvs, "\n"); + mi += 8; + } + fprintf(mvs, "\n"); + + fclose(mvs); +} diff --git a/thirdparty/libvpx/vp9/common/vp9_entropy.c b/thirdparty/libvpx/vp9/common/vp9_entropy.c new file mode 100644 index 000000000000..7b490af34fd7 --- --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_entropy.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_entropymode.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx/vpx_integer.h" + +// Unconstrained Node Tree +const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = { + 2, 6, // 0 = LOW_VAL + -TWO_TOKEN, 4, // 1 = TWO + -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE + 8, 10, // 3 = HIGH_LOW + -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE + 12, 14, // 5 = CAT_THREEFOUR + -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE + -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE +}; + +const vpx_prob vp9_cat1_prob[] = { 159 }; +const vpx_prob vp9_cat2_prob[] = { 165, 145 }; +const vpx_prob vp9_cat3_prob[] = { 173, 148, 140 }; +const vpx_prob vp9_cat4_prob[] = { 176, 155, 140, 135 }; +const vpx_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 }; +const vpx_prob vp9_cat6_prob[] = { + 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 +}; +#if CONFIG_VP9_HIGHBITDEPTH +const vpx_prob vp9_cat6_prob_high12[] = { + 255, 255, 255, 255, 254, 254, 254, 252, 249, + 243, 230, 196, 177, 153, 140, 133, 130, 129 +}; +#endif + +const uint8_t vp9_coefband_trans_8x8plus[1024] = { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 5, + // beyond MAXBAND_INDEX+1 all values are filled as 5 + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, +}; + +const uint8_t vp9_coefband_trans_4x4[16] = { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, +}; + +const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = { + 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 +}; + +// Model obtained from a 2-sided zero-centerd distribuition derived +// from a Pareto distribution. The cdf of the distribution is: +// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta] +// +// For a given beta and a given probablity of the 1-node, the alpha +// is first solved, and then the {alpha, beta} pair is used to generate +// the probabilities for the rest of the nodes. + +// beta = 8 + +// Every odd line in this table can be generated from the even lines +// by averaging : +// vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] + +// vp9_pareto8_full[l+1][node] ) >> 1; +const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = { + { 3, 86, 128, 6, 86, 23, 88, 29}, + { 6, 86, 128, 11, 87, 42, 91, 52}, + { 9, 86, 129, 17, 88, 61, 94, 76}, + { 12, 86, 129, 22, 88, 77, 97, 93}, + { 15, 87, 129, 28, 89, 93, 100, 110}, + { 17, 87, 129, 33, 90, 105, 103, 123}, + { 20, 88, 130, 38, 91, 118, 106, 136}, + { 23, 88, 130, 43, 91, 128, 108, 146}, + { 26, 89, 131, 48, 92, 139, 111, 156}, + { 28, 89, 131, 53, 93, 147, 114, 163}, + { 31, 90, 131, 58, 94, 156, 117, 171}, + { 34, 90, 131, 62, 94, 163, 119, 177}, + { 37, 90, 132, 66, 95, 171, 122, 184}, + { 39, 90, 132, 70, 96, 177, 124, 189}, + { 42, 91, 132, 75, 97, 183, 127, 194}, + { 44, 91, 132, 79, 97, 188, 129, 198}, + { 47, 92, 133, 83, 98, 193, 132, 202}, + { 49, 92, 133, 86, 99, 197, 134, 205}, + { 52, 93, 133, 90, 100, 201, 137, 208}, + { 54, 93, 133, 94, 100, 204, 139, 211}, + { 57, 94, 134, 98, 101, 208, 142, 214}, + { 59, 94, 134, 101, 102, 211, 144, 216}, + { 62, 94, 135, 105, 103, 214, 146, 218}, + { 64, 94, 135, 108, 103, 216, 148, 220}, + { 66, 95, 135, 111, 104, 219, 151, 222}, + { 68, 95, 135, 114, 105, 221, 153, 223}, + { 71, 96, 136, 117, 106, 224, 155, 225}, + { 73, 96, 136, 120, 106, 225, 157, 226}, + { 76, 97, 136, 123, 107, 227, 159, 228}, + { 78, 97, 136, 126, 108, 229, 160, 229}, + { 80, 98, 137, 129, 109, 231, 162, 231}, + { 82, 98, 137, 131, 109, 232, 164, 232}, + { 84, 98, 138, 134, 110, 234, 166, 233}, + { 86, 98, 138, 137, 111, 235, 168, 234}, + { 89, 99, 138, 140, 112, 236, 170, 235}, + { 91, 99, 138, 142, 112, 237, 171, 235}, + { 93, 100, 139, 145, 113, 238, 173, 236}, + { 95, 100, 139, 147, 114, 239, 174, 237}, + { 97, 101, 140, 149, 115, 240, 176, 238}, + { 99, 101, 140, 151, 115, 241, 177, 238}, + {101, 102, 140, 154, 116, 242, 179, 239}, + {103, 102, 
140, 156, 117, 242, 180, 239}, + {105, 103, 141, 158, 118, 243, 182, 240}, + {107, 103, 141, 160, 118, 243, 183, 240}, + {109, 104, 141, 162, 119, 244, 185, 241}, + {111, 104, 141, 164, 119, 244, 186, 241}, + {113, 104, 142, 166, 120, 245, 187, 242}, + {114, 104, 142, 168, 121, 245, 188, 242}, + {116, 105, 143, 170, 122, 246, 190, 243}, + {118, 105, 143, 171, 122, 246, 191, 243}, + {120, 106, 143, 173, 123, 247, 192, 244}, + {121, 106, 143, 175, 124, 247, 193, 244}, + {123, 107, 144, 177, 125, 248, 195, 244}, + {125, 107, 144, 178, 125, 248, 196, 244}, + {127, 108, 145, 180, 126, 249, 197, 245}, + {128, 108, 145, 181, 127, 249, 198, 245}, + {130, 109, 145, 183, 128, 249, 199, 245}, + {132, 109, 145, 184, 128, 249, 200, 245}, + {134, 110, 146, 186, 129, 250, 201, 246}, + {135, 110, 146, 187, 130, 250, 202, 246}, + {137, 111, 147, 189, 131, 251, 203, 246}, + {138, 111, 147, 190, 131, 251, 204, 246}, + {140, 112, 147, 192, 132, 251, 205, 247}, + {141, 112, 147, 193, 132, 251, 206, 247}, + {143, 113, 148, 194, 133, 251, 207, 247}, + {144, 113, 148, 195, 134, 251, 207, 247}, + {146, 114, 149, 197, 135, 252, 208, 248}, + {147, 114, 149, 198, 135, 252, 209, 248}, + {149, 115, 149, 199, 136, 252, 210, 248}, + {150, 115, 149, 200, 137, 252, 210, 248}, + {152, 115, 150, 201, 138, 252, 211, 248}, + {153, 115, 150, 202, 138, 252, 212, 248}, + {155, 116, 151, 204, 139, 253, 213, 249}, + {156, 116, 151, 205, 139, 253, 213, 249}, + {158, 117, 151, 206, 140, 253, 214, 249}, + {159, 117, 151, 207, 141, 253, 215, 249}, + {161, 118, 152, 208, 142, 253, 216, 249}, + {162, 118, 152, 209, 142, 253, 216, 249}, + {163, 119, 153, 210, 143, 253, 217, 249}, + {164, 119, 153, 211, 143, 253, 217, 249}, + {166, 120, 153, 212, 144, 254, 218, 250}, + {167, 120, 153, 212, 145, 254, 219, 250}, + {168, 121, 154, 213, 146, 254, 220, 250}, + {169, 121, 154, 214, 146, 254, 220, 250}, + {171, 122, 155, 215, 147, 254, 221, 250}, + {172, 122, 155, 216, 147, 254, 221, 250}, + {173, 123, 155, 217, 148, 254, 222, 250}, + {174, 123, 155, 217, 149, 254, 222, 250}, + {176, 124, 156, 218, 150, 254, 223, 250}, + {177, 124, 156, 219, 150, 254, 223, 250}, + {178, 125, 157, 220, 151, 254, 224, 251}, + {179, 125, 157, 220, 151, 254, 224, 251}, + {180, 126, 157, 221, 152, 254, 225, 251}, + {181, 126, 157, 221, 152, 254, 225, 251}, + {183, 127, 158, 222, 153, 254, 226, 251}, + {184, 127, 158, 223, 154, 254, 226, 251}, + {185, 128, 159, 224, 155, 255, 227, 251}, + {186, 128, 159, 224, 155, 255, 227, 251}, + {187, 129, 160, 225, 156, 255, 228, 251}, + {188, 130, 160, 225, 156, 255, 228, 251}, + {189, 131, 160, 226, 157, 255, 228, 251}, + {190, 131, 160, 226, 158, 255, 228, 251}, + {191, 132, 161, 227, 159, 255, 229, 251}, + {192, 132, 161, 227, 159, 255, 229, 251}, + {193, 133, 162, 228, 160, 255, 230, 252}, + {194, 133, 162, 229, 160, 255, 230, 252}, + {195, 134, 163, 230, 161, 255, 231, 252}, + {196, 134, 163, 230, 161, 255, 231, 252}, + {197, 135, 163, 231, 162, 255, 231, 252}, + {198, 135, 163, 231, 162, 255, 231, 252}, + {199, 136, 164, 232, 163, 255, 232, 252}, + {200, 136, 164, 232, 164, 255, 232, 252}, + {201, 137, 165, 233, 165, 255, 233, 252}, + {201, 137, 165, 233, 165, 255, 233, 252}, + {202, 138, 166, 233, 166, 255, 233, 252}, + {203, 138, 166, 233, 166, 255, 233, 252}, + {204, 139, 166, 234, 167, 255, 234, 252}, + {205, 139, 166, 234, 167, 255, 234, 252}, + {206, 140, 167, 235, 168, 255, 235, 252}, + {206, 140, 167, 235, 168, 255, 235, 252}, + {207, 141, 168, 236, 169, 255, 235, 252}, + {208, 141, 168, 236, 170, 255, 235, 252}, + 
{209, 142, 169, 237, 171, 255, 236, 252}, + {209, 143, 169, 237, 171, 255, 236, 252}, + {210, 144, 169, 237, 172, 255, 236, 252}, + {211, 144, 169, 237, 172, 255, 236, 252}, + {212, 145, 170, 238, 173, 255, 237, 252}, + {213, 145, 170, 238, 173, 255, 237, 252}, + {214, 146, 171, 239, 174, 255, 237, 253}, + {214, 146, 171, 239, 174, 255, 237, 253}, + {215, 147, 172, 240, 175, 255, 238, 253}, + {215, 147, 172, 240, 175, 255, 238, 253}, + {216, 148, 173, 240, 176, 255, 238, 253}, + {217, 148, 173, 240, 176, 255, 238, 253}, + {218, 149, 173, 241, 177, 255, 239, 253}, + {218, 149, 173, 241, 178, 255, 239, 253}, + {219, 150, 174, 241, 179, 255, 239, 253}, + {219, 151, 174, 241, 179, 255, 239, 253}, + {220, 152, 175, 242, 180, 255, 240, 253}, + {221, 152, 175, 242, 180, 255, 240, 253}, + {222, 153, 176, 242, 181, 255, 240, 253}, + {222, 153, 176, 242, 181, 255, 240, 253}, + {223, 154, 177, 243, 182, 255, 240, 253}, + {223, 154, 177, 243, 182, 255, 240, 253}, + {224, 155, 178, 244, 183, 255, 241, 253}, + {224, 155, 178, 244, 183, 255, 241, 253}, + {225, 156, 178, 244, 184, 255, 241, 253}, + {225, 157, 178, 244, 184, 255, 241, 253}, + {226, 158, 179, 244, 185, 255, 242, 253}, + {227, 158, 179, 244, 185, 255, 242, 253}, + {228, 159, 180, 245, 186, 255, 242, 253}, + {228, 159, 180, 245, 186, 255, 242, 253}, + {229, 160, 181, 245, 187, 255, 242, 253}, + {229, 160, 181, 245, 187, 255, 242, 253}, + {230, 161, 182, 246, 188, 255, 243, 253}, + {230, 162, 182, 246, 188, 255, 243, 253}, + {231, 163, 183, 246, 189, 255, 243, 253}, + {231, 163, 183, 246, 189, 255, 243, 253}, + {232, 164, 184, 247, 190, 255, 243, 253}, + {232, 164, 184, 247, 190, 255, 243, 253}, + {233, 165, 185, 247, 191, 255, 244, 253}, + {233, 165, 185, 247, 191, 255, 244, 253}, + {234, 166, 185, 247, 192, 255, 244, 253}, + {234, 167, 185, 247, 192, 255, 244, 253}, + {235, 168, 186, 248, 193, 255, 244, 253}, + {235, 168, 186, 248, 193, 255, 244, 253}, + {236, 169, 187, 248, 194, 255, 244, 253}, + {236, 169, 187, 248, 194, 255, 244, 253}, + {236, 170, 188, 248, 195, 255, 245, 253}, + {236, 170, 188, 248, 195, 255, 245, 253}, + {237, 171, 189, 249, 196, 255, 245, 254}, + {237, 172, 189, 249, 196, 255, 245, 254}, + {238, 173, 190, 249, 197, 255, 245, 254}, + {238, 173, 190, 249, 197, 255, 245, 254}, + {239, 174, 191, 249, 198, 255, 245, 254}, + {239, 174, 191, 249, 198, 255, 245, 254}, + {240, 175, 192, 249, 199, 255, 246, 254}, + {240, 176, 192, 249, 199, 255, 246, 254}, + {240, 177, 193, 250, 200, 255, 246, 254}, + {240, 177, 193, 250, 200, 255, 246, 254}, + {241, 178, 194, 250, 201, 255, 246, 254}, + {241, 178, 194, 250, 201, 255, 246, 254}, + {242, 179, 195, 250, 202, 255, 246, 254}, + {242, 180, 195, 250, 202, 255, 246, 254}, + {242, 181, 196, 250, 203, 255, 247, 254}, + {242, 181, 196, 250, 203, 255, 247, 254}, + {243, 182, 197, 251, 204, 255, 247, 254}, + {243, 183, 197, 251, 204, 255, 247, 254}, + {244, 184, 198, 251, 205, 255, 247, 254}, + {244, 184, 198, 251, 205, 255, 247, 254}, + {244, 185, 199, 251, 206, 255, 247, 254}, + {244, 185, 199, 251, 206, 255, 247, 254}, + {245, 186, 200, 251, 207, 255, 247, 254}, + {245, 187, 200, 251, 207, 255, 247, 254}, + {246, 188, 201, 252, 207, 255, 248, 254}, + {246, 188, 201, 252, 207, 255, 248, 254}, + {246, 189, 202, 252, 208, 255, 248, 254}, + {246, 190, 202, 252, 208, 255, 248, 254}, + {247, 191, 203, 252, 209, 255, 248, 254}, + {247, 191, 203, 252, 209, 255, 248, 254}, + {247, 192, 204, 252, 210, 255, 248, 254}, + {247, 193, 204, 252, 210, 255, 248, 254}, + {248, 194, 205, 252, 211, 255, 
248, 254}, + {248, 194, 205, 252, 211, 255, 248, 254}, + {248, 195, 206, 252, 212, 255, 249, 254}, + {248, 196, 206, 252, 212, 255, 249, 254}, + {249, 197, 207, 253, 213, 255, 249, 254}, + {249, 197, 207, 253, 213, 255, 249, 254}, + {249, 198, 208, 253, 214, 255, 249, 254}, + {249, 199, 209, 253, 214, 255, 249, 254}, + {250, 200, 210, 253, 215, 255, 249, 254}, + {250, 200, 210, 253, 215, 255, 249, 254}, + {250, 201, 211, 253, 215, 255, 249, 254}, + {250, 202, 211, 253, 215, 255, 249, 254}, + {250, 203, 212, 253, 216, 255, 249, 254}, + {250, 203, 212, 253, 216, 255, 249, 254}, + {251, 204, 213, 253, 217, 255, 250, 254}, + {251, 205, 213, 253, 217, 255, 250, 254}, + {251, 206, 214, 254, 218, 255, 250, 254}, + {251, 206, 215, 254, 218, 255, 250, 254}, + {252, 207, 216, 254, 219, 255, 250, 254}, + {252, 208, 216, 254, 219, 255, 250, 254}, + {252, 209, 217, 254, 220, 255, 250, 254}, + {252, 210, 217, 254, 220, 255, 250, 254}, + {252, 211, 218, 254, 221, 255, 250, 254}, + {252, 212, 218, 254, 221, 255, 250, 254}, + {253, 213, 219, 254, 222, 255, 250, 254}, + {253, 213, 220, 254, 222, 255, 250, 254}, + {253, 214, 221, 254, 223, 255, 250, 254}, + {253, 215, 221, 254, 223, 255, 250, 254}, + {253, 216, 222, 254, 224, 255, 251, 254}, + {253, 217, 223, 254, 224, 255, 251, 254}, + {253, 218, 224, 254, 225, 255, 251, 254}, + {253, 219, 224, 254, 225, 255, 251, 254}, + {254, 220, 225, 254, 225, 255, 251, 254}, + {254, 221, 226, 254, 225, 255, 251, 254}, + {254, 222, 227, 255, 226, 255, 251, 254}, + {254, 223, 227, 255, 226, 255, 251, 254}, + {254, 224, 228, 255, 227, 255, 251, 254}, + {254, 225, 229, 255, 227, 255, 251, 254}, + {254, 226, 230, 255, 228, 255, 251, 254}, + {254, 227, 230, 255, 229, 255, 251, 254}, + {255, 228, 231, 255, 230, 255, 251, 254}, + {255, 229, 232, 255, 230, 255, 251, 254}, + {255, 230, 233, 255, 231, 255, 252, 254}, + {255, 231, 234, 255, 231, 255, 252, 254}, + {255, 232, 235, 255, 232, 255, 252, 254}, + {255, 233, 236, 255, 232, 255, 252, 254}, + {255, 235, 237, 255, 233, 255, 252, 254}, + {255, 236, 238, 255, 234, 255, 252, 254}, + {255, 238, 240, 255, 235, 255, 252, 255}, + {255, 239, 241, 255, 235, 255, 252, 254}, + {255, 241, 243, 255, 236, 255, 252, 254}, + {255, 243, 245, 255, 237, 255, 252, 254}, + {255, 246, 247, 255, 239, 255, 253, 255}, +}; + +static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 195, 29, 183 }, { 84, 49, 136 }, { 8, 42, 71 } + }, { // Band 1 + { 31, 107, 169 }, { 35, 99, 159 }, { 17, 82, 140 }, + { 8, 66, 114 }, { 2, 44, 76 }, { 1, 19, 32 } + }, { // Band 2 + { 40, 132, 201 }, { 29, 114, 187 }, { 13, 91, 157 }, + { 7, 75, 127 }, { 3, 58, 95 }, { 1, 28, 47 } + }, { // Band 3 + { 69, 142, 221 }, { 42, 122, 201 }, { 15, 91, 159 }, + { 6, 67, 121 }, { 1, 42, 77 }, { 1, 17, 31 } + }, { // Band 4 + { 102, 148, 228 }, { 67, 117, 204 }, { 17, 82, 154 }, + { 6, 59, 114 }, { 2, 39, 75 }, { 1, 15, 29 } + }, { // Band 5 + { 156, 57, 233 }, { 119, 57, 212 }, { 58, 48, 163 }, + { 29, 40, 124 }, { 12, 30, 81 }, { 3, 12, 31 } + } + }, { // Inter + { // Band 0 + { 191, 107, 226 }, { 124, 117, 204 }, { 25, 99, 155 } + }, { // Band 1 + { 29, 148, 210 }, { 37, 126, 194 }, { 8, 93, 157 }, + { 2, 68, 118 }, { 1, 39, 69 }, { 1, 17, 33 } + }, { // Band 2 + { 41, 151, 213 }, { 27, 123, 193 }, { 3, 82, 144 }, + { 1, 58, 105 }, { 1, 32, 60 }, { 1, 13, 26 } + }, { // Band 3 + { 59, 159, 220 }, { 23, 126, 198 }, { 4, 88, 151 }, + { 1, 66, 114 }, { 1, 38, 71 }, { 1, 18, 34 } + }, { // Band 4 + { 114, 136, 232 }, { 
51, 114, 207 }, { 11, 83, 155 }, + { 3, 56, 105 }, { 1, 33, 65 }, { 1, 17, 34 } + }, { // Band 5 + { 149, 65, 234 }, { 121, 57, 215 }, { 61, 49, 166 }, + { 28, 36, 114 }, { 12, 25, 76 }, { 3, 16, 42 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 214, 49, 220 }, { 132, 63, 188 }, { 42, 65, 137 } + }, { // Band 1 + { 85, 137, 221 }, { 104, 131, 216 }, { 49, 111, 192 }, + { 21, 87, 155 }, { 2, 49, 87 }, { 1, 16, 28 } + }, { // Band 2 + { 89, 163, 230 }, { 90, 137, 220 }, { 29, 100, 183 }, + { 10, 70, 135 }, { 2, 42, 81 }, { 1, 17, 33 } + }, { // Band 3 + { 108, 167, 237 }, { 55, 133, 222 }, { 15, 97, 179 }, + { 4, 72, 135 }, { 1, 45, 85 }, { 1, 19, 38 } + }, { // Band 4 + { 124, 146, 240 }, { 66, 124, 224 }, { 17, 88, 175 }, + { 4, 58, 122 }, { 1, 36, 75 }, { 1, 18, 37 } + }, { // Band 5 + { 141, 79, 241 }, { 126, 70, 227 }, { 66, 58, 182 }, + { 30, 44, 136 }, { 12, 34, 96 }, { 2, 20, 47 } + } + }, { // Inter + { // Band 0 + { 229, 99, 249 }, { 143, 111, 235 }, { 46, 109, 192 } + }, { // Band 1 + { 82, 158, 236 }, { 94, 146, 224 }, { 25, 117, 191 }, + { 9, 87, 149 }, { 3, 56, 99 }, { 1, 33, 57 } + }, { // Band 2 + { 83, 167, 237 }, { 68, 145, 222 }, { 10, 103, 177 }, + { 2, 72, 131 }, { 1, 41, 79 }, { 1, 20, 39 } + }, { // Band 3 + { 99, 167, 239 }, { 47, 141, 224 }, { 10, 104, 178 }, + { 2, 73, 133 }, { 1, 44, 85 }, { 1, 22, 47 } + }, { // Band 4 + { 127, 145, 243 }, { 71, 129, 228 }, { 17, 93, 177 }, + { 3, 61, 124 }, { 1, 41, 84 }, { 1, 21, 52 } + }, { // Band 5 + { 157, 78, 244 }, { 140, 72, 231 }, { 69, 58, 184 }, + { 31, 44, 137 }, { 14, 38, 105 }, { 8, 23, 61 } + } + } + } +}; + +static const vp9_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 125, 34, 187 }, { 52, 41, 133 }, { 6, 31, 56 } + }, { // Band 1 + { 37, 109, 153 }, { 51, 102, 147 }, { 23, 87, 128 }, + { 8, 67, 101 }, { 1, 41, 63 }, { 1, 19, 29 } + }, { // Band 2 + { 31, 154, 185 }, { 17, 127, 175 }, { 6, 96, 145 }, + { 2, 73, 114 }, { 1, 51, 82 }, { 1, 28, 45 } + }, { // Band 3 + { 23, 163, 200 }, { 10, 131, 185 }, { 2, 93, 148 }, + { 1, 67, 111 }, { 1, 41, 69 }, { 1, 14, 24 } + }, { // Band 4 + { 29, 176, 217 }, { 12, 145, 201 }, { 3, 101, 156 }, + { 1, 69, 111 }, { 1, 39, 63 }, { 1, 14, 23 } + }, { // Band 5 + { 57, 192, 233 }, { 25, 154, 215 }, { 6, 109, 167 }, + { 3, 78, 118 }, { 1, 48, 69 }, { 1, 21, 29 } + } + }, { // Inter + { // Band 0 + { 202, 105, 245 }, { 108, 106, 216 }, { 18, 90, 144 } + }, { // Band 1 + { 33, 172, 219 }, { 64, 149, 206 }, { 14, 117, 177 }, + { 5, 90, 141 }, { 2, 61, 95 }, { 1, 37, 57 } + }, { // Band 2 + { 33, 179, 220 }, { 11, 140, 198 }, { 1, 89, 148 }, + { 1, 60, 104 }, { 1, 33, 57 }, { 1, 12, 21 } + }, { // Band 3 + { 30, 181, 221 }, { 8, 141, 198 }, { 1, 87, 145 }, + { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 20 } + }, { // Band 4 + { 32, 186, 224 }, { 7, 142, 198 }, { 1, 86, 143 }, + { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 22 } + }, { // Band 5 + { 57, 192, 227 }, { 20, 143, 204 }, { 3, 96, 154 }, + { 1, 68, 112 }, { 1, 42, 69 }, { 1, 19, 32 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 212, 35, 215 }, { 113, 47, 169 }, { 29, 48, 105 } + }, { // Band 1 + { 74, 129, 203 }, { 106, 120, 203 }, { 49, 107, 178 }, + { 19, 84, 144 }, { 4, 50, 84 }, { 1, 15, 25 } + }, { // Band 2 + { 71, 172, 217 }, { 44, 141, 209 }, { 15, 102, 173 }, + { 6, 76, 133 }, { 2, 51, 89 }, { 1, 24, 42 } + }, { // Band 3 + { 64, 185, 231 }, { 31, 148, 216 }, { 8, 103, 175 }, + { 3, 74, 131 }, { 1, 46, 81 }, { 1, 18, 30 } + }, { // Band 4 + { 
65, 196, 235 }, { 25, 157, 221 }, { 5, 105, 174 }, + { 1, 67, 120 }, { 1, 38, 69 }, { 1, 15, 30 } + }, { // Band 5 + { 65, 204, 238 }, { 30, 156, 224 }, { 7, 107, 177 }, + { 2, 70, 124 }, { 1, 42, 73 }, { 1, 18, 34 } + } + }, { // Inter + { // Band 0 + { 225, 86, 251 }, { 144, 104, 235 }, { 42, 99, 181 } + }, { // Band 1 + { 85, 175, 239 }, { 112, 165, 229 }, { 29, 136, 200 }, + { 12, 103, 162 }, { 6, 77, 123 }, { 2, 53, 84 } + }, { // Band 2 + { 75, 183, 239 }, { 30, 155, 221 }, { 3, 106, 171 }, + { 1, 74, 128 }, { 1, 44, 76 }, { 1, 17, 28 } + }, { // Band 3 + { 73, 185, 240 }, { 27, 159, 222 }, { 2, 107, 172 }, + { 1, 75, 127 }, { 1, 42, 73 }, { 1, 17, 29 } + }, { // Band 4 + { 62, 190, 238 }, { 21, 159, 222 }, { 2, 107, 172 }, + { 1, 72, 122 }, { 1, 40, 71 }, { 1, 18, 32 } + }, { // Band 5 + { 61, 199, 240 }, { 27, 161, 226 }, { 4, 113, 180 }, + { 1, 76, 129 }, { 1, 46, 80 }, { 1, 23, 41 } + } + } + } +}; + +static const vp9_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 7, 27, 153 }, { 5, 30, 95 }, { 1, 16, 30 } + }, { // Band 1 + { 50, 75, 127 }, { 57, 75, 124 }, { 27, 67, 108 }, + { 10, 54, 86 }, { 1, 33, 52 }, { 1, 12, 18 } + }, { // Band 2 + { 43, 125, 151 }, { 26, 108, 148 }, { 7, 83, 122 }, + { 2, 59, 89 }, { 1, 38, 60 }, { 1, 17, 27 } + }, { // Band 3 + { 23, 144, 163 }, { 13, 112, 154 }, { 2, 75, 117 }, + { 1, 50, 81 }, { 1, 31, 51 }, { 1, 14, 23 } + }, { // Band 4 + { 18, 162, 185 }, { 6, 123, 171 }, { 1, 78, 125 }, + { 1, 51, 86 }, { 1, 31, 54 }, { 1, 14, 23 } + }, { // Band 5 + { 15, 199, 227 }, { 3, 150, 204 }, { 1, 91, 146 }, + { 1, 55, 95 }, { 1, 30, 53 }, { 1, 11, 20 } + } + }, { // Inter + { // Band 0 + { 19, 55, 240 }, { 19, 59, 196 }, { 3, 52, 105 } + }, { // Band 1 + { 41, 166, 207 }, { 104, 153, 199 }, { 31, 123, 181 }, + { 14, 101, 152 }, { 5, 72, 106 }, { 1, 36, 52 } + }, { // Band 2 + { 35, 176, 211 }, { 12, 131, 190 }, { 2, 88, 144 }, + { 1, 60, 101 }, { 1, 36, 60 }, { 1, 16, 28 } + }, { // Band 3 + { 28, 183, 213 }, { 8, 134, 191 }, { 1, 86, 142 }, + { 1, 56, 96 }, { 1, 30, 53 }, { 1, 12, 20 } + }, { // Band 4 + { 20, 190, 215 }, { 4, 135, 192 }, { 1, 84, 139 }, + { 1, 53, 91 }, { 1, 28, 49 }, { 1, 11, 20 } + }, { // Band 5 + { 13, 196, 216 }, { 2, 137, 192 }, { 1, 86, 143 }, + { 1, 57, 99 }, { 1, 32, 56 }, { 1, 13, 24 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 211, 29, 217 }, { 96, 47, 156 }, { 22, 43, 87 } + }, { // Band 1 + { 78, 120, 193 }, { 111, 116, 186 }, { 46, 102, 164 }, + { 15, 80, 128 }, { 2, 49, 76 }, { 1, 18, 28 } + }, { // Band 2 + { 71, 161, 203 }, { 42, 132, 192 }, { 10, 98, 150 }, + { 3, 69, 109 }, { 1, 44, 70 }, { 1, 18, 29 } + }, { // Band 3 + { 57, 186, 211 }, { 30, 140, 196 }, { 4, 93, 146 }, + { 1, 62, 102 }, { 1, 38, 65 }, { 1, 16, 27 } + }, { // Band 4 + { 47, 199, 217 }, { 14, 145, 196 }, { 1, 88, 142 }, + { 1, 57, 98 }, { 1, 36, 62 }, { 1, 15, 26 } + }, { // Band 5 + { 26, 219, 229 }, { 5, 155, 207 }, { 1, 94, 151 }, + { 1, 60, 104 }, { 1, 36, 62 }, { 1, 16, 28 } + } + }, { // Inter + { // Band 0 + { 233, 29, 248 }, { 146, 47, 220 }, { 43, 52, 140 } + }, { // Band 1 + { 100, 163, 232 }, { 179, 161, 222 }, { 63, 142, 204 }, + { 37, 113, 174 }, { 26, 89, 137 }, { 18, 68, 97 } + }, { // Band 2 + { 85, 181, 230 }, { 32, 146, 209 }, { 7, 100, 164 }, + { 3, 71, 121 }, { 1, 45, 77 }, { 1, 18, 30 } + }, { // Band 3 + { 65, 187, 230 }, { 20, 148, 207 }, { 2, 97, 159 }, + { 1, 68, 116 }, { 1, 40, 70 }, { 1, 14, 29 } + }, { // Band 4 + { 40, 194, 227 }, { 8, 147, 204 }, 
{ 1, 94, 155 }, + { 1, 65, 112 }, { 1, 39, 66 }, { 1, 14, 26 } + }, { // Band 5 + { 16, 208, 228 }, { 3, 151, 207 }, { 1, 98, 160 }, + { 1, 67, 117 }, { 1, 41, 74 }, { 1, 17, 31 } + } + } + } +}; + +static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 } + }, { // Band 1 + { 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 }, + { 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 } + }, { // Band 2 + { 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 }, + { 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 } + }, { // Band 3 + { 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 }, + { 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 } + }, { // Band 4 + { 30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 }, + { 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 } + }, { // Band 5 + { 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 }, + { 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 } + } + }, { // Inter + { // Band 0 + { 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 } + }, { // Band 1 + { 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 }, + { 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 } + }, { // Band 2 + { 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 }, + { 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 } + }, { // Band 3 + { 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 }, + { 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 } + }, { // Band 4 + { 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 }, + { 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 } + }, { // Band 5 + { 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 }, + { 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 } + }, { // Band 1 + { 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 }, + { 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 } + }, { // Band 2 + { 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 }, + { 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 } + }, { // Band 3 + { 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 }, + { 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 } + }, { // Band 4 + { 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 }, + { 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 } + }, { // Band 5 + { 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 }, + { 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 } + } + }, { // Inter + { // Band 0 + { 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 } + }, { // Band 1 + { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 }, + { 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 } + }, { // Band 2 + { 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 }, + { 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 } + }, { // Band 3 + { 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 }, + { 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 } + }, { // Band 4 + { 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 }, + { 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 } + }, { // Band 5 + { 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 }, + { 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 } + } + } + } +}; + +static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) { + assert(p != 0); + memcpy(probs, vp9_pareto8_full[p - 1], MODEL_NODES * sizeof(vpx_prob)); +} + +void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) { + if (full != model) + memcpy(full, model, sizeof(vpx_prob) * UNCONSTRAINED_NODES); + extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]); +} + +void vp9_default_coef_probs(VP9_COMMON *cm) { + vp9_copy(cm->fc->coef_probs[TX_4X4], 
default_coef_probs_4x4); + vp9_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8); + vp9_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16); + vp9_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32); +} + +#define COEF_COUNT_SAT 24 +#define COEF_MAX_UPDATE_FACTOR 112 +#define COEF_COUNT_SAT_KEY 24 +#define COEF_MAX_UPDATE_FACTOR_KEY 112 +#define COEF_COUNT_SAT_AFTER_KEY 24 +#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 + +static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, + unsigned int count_sat, + unsigned int update_factor) { + const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + vp9_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size]; + const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size]; + vp9_coeff_count_model *counts = cm->counts.coef[tx_size]; + unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = + cm->counts.eob_branch[tx_size]; + int i, j, k, l, m; + + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + const int n0 = counts[i][j][k][l][ZERO_TOKEN]; + const int n1 = counts[i][j][k][l][ONE_TOKEN]; + const int n2 = counts[i][j][k][l][TWO_TOKEN]; + const int neob = counts[i][j][k][l][EOB_MODEL_TOKEN]; + const unsigned int branch_ct[UNCONSTRAINED_NODES][2] = { + { neob, eob_counts[i][j][k][l] - neob }, + { n0, n1 + n2 }, + { n1, n2 } + }; + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + probs[i][j][k][l][m] = merge_probs(pre_probs[i][j][k][l][m], + branch_ct[m], + count_sat, update_factor); + } +} + +void vp9_adapt_coef_probs(VP9_COMMON *cm) { + TX_SIZE t; + unsigned int count_sat, update_factor; + + if (frame_is_intra_only(cm)) { + update_factor = COEF_MAX_UPDATE_FACTOR_KEY; + count_sat = COEF_COUNT_SAT_KEY; + } else if (cm->last_frame_type == KEY_FRAME) { + update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */ + count_sat = COEF_COUNT_SAT_AFTER_KEY; + } else { + update_factor = COEF_MAX_UPDATE_FACTOR; + count_sat = COEF_COUNT_SAT; + } + for (t = TX_4X4; t <= TX_32X32; t++) + adapt_coef_probs(cm, t, count_sat, update_factor); +} diff --git a/thirdparty/libvpx/vp9/common/vp9_entropy.h b/thirdparty/libvpx/vp9/common/vp9_entropy.h new file mode 100644 index 000000000000..63b3bff5d91c --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_entropy.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_ENTROPY_H_ +#define VP9_COMMON_VP9_ENTROPY_H_ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/prob.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DIFF_UPDATE_PROB 252 + +// Coefficient token alphabet +#define ZERO_TOKEN 0 // 0 Extra Bits 0+0 +#define ONE_TOKEN 1 // 1 Extra Bits 0+1 +#define TWO_TOKEN 2 // 2 Extra Bits 0+1 +#define THREE_TOKEN 3 // 3 Extra Bits 0+1 +#define FOUR_TOKEN 4 // 4 Extra Bits 0+1 +#define CATEGORY1_TOKEN 5 // 5-6 Extra Bits 1+1 +#define CATEGORY2_TOKEN 6 // 7-10 Extra Bits 2+1 +#define CATEGORY3_TOKEN 7 // 11-18 Extra Bits 3+1 +#define CATEGORY4_TOKEN 8 // 19-34 Extra Bits 4+1 +#define CATEGORY5_TOKEN 9 // 35-66 Extra Bits 5+1 +#define CATEGORY6_TOKEN 10 // 67+ Extra Bits 14+1 +#define EOB_TOKEN 11 // EOB Extra Bits 0+0 + +#define ENTROPY_TOKENS 12 + +#define ENTROPY_NODES 11 + +DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); + +#define CAT1_MIN_VAL 5 +#define CAT2_MIN_VAL 7 +#define CAT3_MIN_VAL 11 +#define CAT4_MIN_VAL 19 +#define CAT5_MIN_VAL 35 +#define CAT6_MIN_VAL 67 + +// Extra bit probabilities. +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob[1]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob[2]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob[3]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob[4]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob[5]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob[14]); + +#if CONFIG_VP9_HIGHBITDEPTH +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high10[1]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high10[2]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high10[3]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high10[4]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high10[5]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high10[16]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob_high12[1]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob_high12[2]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob_high12[3]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob_high12[4]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob_high12[5]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob_high12[18]); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#define EOB_MODEL_TOKEN 3 + +#define DCT_MAX_VALUE 16384 +#if CONFIG_VP9_HIGHBITDEPTH +#define DCT_MAX_VALUE_HIGH10 65536 +#define DCT_MAX_VALUE_HIGH12 262144 +#endif // CONFIG_VP9_HIGHBITDEPTH + +/* Coefficients are predicted via a 3-dimensional probability table. */ + +#define REF_TYPES 2 // intra=0, inter=1 + +/* Middle dimension reflects the coefficient position within the transform. */ +#define COEF_BANDS 6 + +/* Inside dimension is measure of nearby complexity, that reflects the energy + of nearby coefficients are nonzero. For the first coefficient (DC, unless + block type is 0), we look at the (already encoded) blocks above and to the + left of the current block. The context index is then the number (0,1,or 2) + of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is determined by the size of the + most recently decoded coefficient. 
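[Editor's note] The category tokens defined above map to coefficient magnitudes by adding the decoded extra bits to the category's CATn_MIN_VAL base. A minimal illustrative helper, not part of the upstream file; it assumes the extra bits were already read into `extra` (using the vp9_catN_prob tables):

static int example_token_to_magnitude(int token, int extra) {
  switch (token) {
    case ZERO_TOKEN:
    case ONE_TOKEN:
    case TWO_TOKEN:
    case THREE_TOKEN:
    case FOUR_TOKEN:
      return token;                      /* tokens 0..4 code their own value */
    case CATEGORY1_TOKEN: return CAT1_MIN_VAL + extra;  /* 5..6,  1 extra bit  */
    case CATEGORY2_TOKEN: return CAT2_MIN_VAL + extra;  /* 7..10, 2 extra bits */
    case CATEGORY3_TOKEN: return CAT3_MIN_VAL + extra;  /* 11..18 */
    case CATEGORY4_TOKEN: return CAT4_MIN_VAL + extra;  /* 19..34 */
    case CATEGORY5_TOKEN: return CAT5_MIN_VAL + extra;  /* 35..66 */
    case CATEGORY6_TOKEN: return CAT6_MIN_VAL + extra;  /* 67 and up */
    default:
      return 0;                          /* EOB_TOKEN carries no magnitude */
  }
}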
+ Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). */ + +#define COEFF_CONTEXTS 6 +#define BAND_COEFF_CONTEXTS(band) ((band) == 0 ? 3 : COEFF_CONTEXTS) + +// #define ENTROPY_STATS + +typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] + [ENTROPY_TOKENS]; +typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] + [ENTROPY_NODES][2]; + +#define SUBEXP_PARAM 4 /* Subexponential code parameter */ +#define MODULUS_PARAM 13 /* Modulus parameter */ + +struct VP9Common; +void vp9_default_coef_probs(struct VP9Common *cm); +void vp9_adapt_coef_probs(struct VP9Common *cm); + +// This is the index in the scan order beyond which all coefficients for +// 8x8 transform and above are in the top band. +// This macro is currently unused but may be used by certain implementations +#define MAXBAND_INDEX 21 + +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]); + +static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { + return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 + : vp9_coefband_trans_8x8plus; +} + +// 128 lists of probabilities are stored for the following ONE node probs: +// 1, 3, 5, 7, ..., 253, 255 +// In between probabilities are interpolated linearly + +#define COEFF_PROB_MODELS 255 + +#define UNCONSTRAINED_NODES 3 + +#define PIVOT_NODE 2 // which node is pivot + +#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) +extern const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)]; +extern const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES]; + +typedef vpx_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS] + [COEFF_CONTEXTS][UNCONSTRAINED_NODES]; + +typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] + [COEFF_CONTEXTS] + [UNCONSTRAINED_NODES + 1]; + +void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full); + +typedef char ENTROPY_CONTEXT; + +static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, + ENTROPY_CONTEXT b) { + return (a != 0) + (b != 0); +} + +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { + ENTROPY_CONTEXT above_ec = 0, left_ec = 0; + + switch (tx_size) { + case TX_4X4: + above_ec = a[0] != 0; + left_ec = l[0] != 0; + break; + case TX_8X8: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X16: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_32X32: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + default: + assert(0 && "Invalid transform size."); + break; + } + + return combine_entropy_contexts(above_ec, left_ec); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_ENTROPY_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_entropymode.c b/thirdparty/libvpx/vp9/common/vp9_entropymode.c new file mode 100644 index 000000000000..670348bafd3e --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_entropymode.c @@ 
-0,0 +1,469 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_seg_common.h" + +const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = { + { // above = dc + { 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc + { 92, 45, 102, 136, 116, 180, 74, 90, 100 }, // left = v + { 73, 32, 19, 187, 222, 215, 46, 34, 100 }, // left = h + { 91, 30, 32, 116, 121, 186, 93, 86, 94 }, // left = d45 + { 72, 35, 36, 149, 68, 206, 68, 63, 105 }, // left = d135 + { 73, 31, 28, 138, 57, 124, 55, 122, 151 }, // left = d117 + { 67, 23, 21, 140, 126, 197, 40, 37, 171 }, // left = d153 + { 86, 27, 28, 128, 154, 212, 45, 43, 53 }, // left = d207 + { 74, 32, 27, 107, 86, 160, 63, 134, 102 }, // left = d63 + { 59, 67, 44, 140, 161, 202, 78, 67, 119 } // left = tm + }, { // above = v + { 63, 36, 126, 146, 123, 158, 60, 90, 96 }, // left = dc + { 43, 46, 168, 134, 107, 128, 69, 142, 92 }, // left = v + { 44, 29, 68, 159, 201, 177, 50, 57, 77 }, // left = h + { 58, 38, 76, 114, 97, 172, 78, 133, 92 }, // left = d45 + { 46, 41, 76, 140, 63, 184, 69, 112, 57 }, // left = d135 + { 38, 32, 85, 140, 46, 112, 54, 151, 133 }, // left = d117 + { 39, 27, 61, 131, 110, 175, 44, 75, 136 }, // left = d153 + { 52, 30, 74, 113, 130, 175, 51, 64, 58 }, // left = d207 + { 47, 35, 80, 100, 74, 143, 64, 163, 74 }, // left = d63 + { 36, 61, 116, 114, 128, 162, 80, 125, 82 } // left = tm + }, { // above = h + { 82, 26, 26, 171, 208, 204, 44, 32, 105 }, // left = dc + { 55, 44, 68, 166, 179, 192, 57, 57, 108 }, // left = v + { 42, 26, 11, 199, 241, 228, 23, 15, 85 }, // left = h + { 68, 42, 19, 131, 160, 199, 55, 52, 83 }, // left = d45 + { 58, 50, 25, 139, 115, 232, 39, 52, 118 }, // left = d135 + { 50, 35, 33, 153, 104, 162, 64, 59, 131 }, // left = d117 + { 44, 24, 16, 150, 177, 202, 33, 19, 156 }, // left = d153 + { 55, 27, 12, 153, 203, 218, 26, 27, 49 }, // left = d207 + { 53, 49, 21, 110, 116, 168, 59, 80, 76 }, // left = d63 + { 38, 72, 19, 168, 203, 212, 50, 50, 107 } // left = tm + }, { // above = d45 + { 103, 26, 36, 129, 132, 201, 83, 80, 93 }, // left = dc + { 59, 38, 83, 112, 103, 162, 98, 136, 90 }, // left = v + { 62, 30, 23, 158, 200, 207, 59, 57, 50 }, // left = h + { 67, 30, 29, 84, 86, 191, 102, 91, 59 }, // left = d45 + { 60, 32, 33, 112, 71, 220, 64, 89, 104 }, // left = d135 + { 53, 26, 34, 130, 56, 149, 84, 120, 103 }, // left = d117 + { 53, 21, 23, 133, 109, 210, 56, 77, 172 }, // left = d153 + { 77, 19, 29, 112, 142, 228, 55, 66, 36 }, // left = d207 + { 61, 29, 29, 93, 97, 165, 83, 175, 162 }, // left = d63 + { 47, 47, 43, 114, 137, 181, 100, 99, 95 } // left = tm + }, { // above = d135 + { 69, 23, 29, 128, 83, 199, 46, 44, 101 }, // left = dc + { 53, 40, 55, 139, 69, 183, 61, 80, 110 }, // left = v + { 40, 29, 19, 161, 180, 207, 43, 24, 91 }, // left = h + { 60, 34, 19, 105, 61, 198, 53, 64, 89 }, // left = d45 + { 52, 31, 22, 158, 40, 209, 58, 62, 89 }, // left = d135 + { 44, 31, 29, 147, 46, 158, 56, 102, 198 }, // left = d117 + { 35, 19, 12, 135, 87, 209, 41, 45, 167 }, // left = d153 + { 55, 25, 21, 118, 95, 215, 38, 39, 66 }, 
// left = d207 + { 51, 38, 25, 113, 58, 164, 70, 93, 97 }, // left = d63 + { 47, 54, 34, 146, 108, 203, 72, 103, 151 } // left = tm + }, { // above = d117 + { 64, 19, 37, 156, 66, 138, 49, 95, 133 }, // left = dc + { 46, 27, 80, 150, 55, 124, 55, 121, 135 }, // left = v + { 36, 23, 27, 165, 149, 166, 54, 64, 118 }, // left = h + { 53, 21, 36, 131, 63, 163, 60, 109, 81 }, // left = d45 + { 40, 26, 35, 154, 40, 185, 51, 97, 123 }, // left = d135 + { 35, 19, 34, 179, 19, 97, 48, 129, 124 }, // left = d117 + { 36, 20, 26, 136, 62, 164, 33, 77, 154 }, // left = d153 + { 45, 18, 32, 130, 90, 157, 40, 79, 91 }, // left = d207 + { 45, 26, 28, 129, 45, 129, 49, 147, 123 }, // left = d63 + { 38, 44, 51, 136, 74, 162, 57, 97, 121 } // left = tm + }, { // above = d153 + { 75, 17, 22, 136, 138, 185, 32, 34, 166 }, // left = dc + { 56, 39, 58, 133, 117, 173, 48, 53, 187 }, // left = v + { 35, 21, 12, 161, 212, 207, 20, 23, 145 }, // left = h + { 56, 29, 19, 117, 109, 181, 55, 68, 112 }, // left = d45 + { 47, 29, 17, 153, 64, 220, 59, 51, 114 }, // left = d135 + { 46, 16, 24, 136, 76, 147, 41, 64, 172 }, // left = d117 + { 34, 17, 11, 108, 152, 187, 13, 15, 209 }, // left = d153 + { 51, 24, 14, 115, 133, 209, 32, 26, 104 }, // left = d207 + { 55, 30, 18, 122, 79, 179, 44, 88, 116 }, // left = d63 + { 37, 49, 25, 129, 168, 164, 41, 54, 148 } // left = tm + }, { // above = d207 + { 82, 22, 32, 127, 143, 213, 39, 41, 70 }, // left = dc + { 62, 44, 61, 123, 105, 189, 48, 57, 64 }, // left = v + { 47, 25, 17, 175, 222, 220, 24, 30, 86 }, // left = h + { 68, 36, 17, 106, 102, 206, 59, 74, 74 }, // left = d45 + { 57, 39, 23, 151, 68, 216, 55, 63, 58 }, // left = d135 + { 49, 30, 35, 141, 70, 168, 82, 40, 115 }, // left = d117 + { 51, 25, 15, 136, 129, 202, 38, 35, 139 }, // left = d153 + { 68, 26, 16, 111, 141, 215, 29, 28, 28 }, // left = d207 + { 59, 39, 19, 114, 75, 180, 77, 104, 42 }, // left = d63 + { 40, 61, 26, 126, 152, 206, 61, 59, 93 } // left = tm + }, { // above = d63 + { 78, 23, 39, 111, 117, 170, 74, 124, 94 }, // left = dc + { 48, 34, 86, 101, 92, 146, 78, 179, 134 }, // left = v + { 47, 22, 24, 138, 187, 178, 68, 69, 59 }, // left = h + { 56, 25, 33, 105, 112, 187, 95, 177, 129 }, // left = d45 + { 48, 31, 27, 114, 63, 183, 82, 116, 56 }, // left = d135 + { 43, 28, 37, 121, 63, 123, 61, 192, 169 }, // left = d117 + { 42, 17, 24, 109, 97, 177, 56, 76, 122 }, // left = d153 + { 58, 18, 28, 105, 139, 182, 70, 92, 63 }, // left = d207 + { 46, 23, 32, 74, 86, 150, 67, 183, 88 }, // left = d63 + { 36, 38, 48, 92, 122, 165, 88, 137, 91 } // left = tm + }, { // above = tm + { 65, 70, 60, 155, 159, 199, 61, 60, 81 }, // left = dc + { 44, 78, 115, 132, 119, 173, 71, 112, 93 }, // left = v + { 39, 38, 21, 184, 227, 206, 42, 32, 64 }, // left = h + { 58, 47, 36, 124, 137, 193, 80, 82, 78 }, // left = d45 + { 49, 50, 35, 144, 95, 205, 63, 78, 59 }, // left = d135 + { 41, 53, 52, 148, 71, 142, 65, 128, 51 }, // left = d117 + { 40, 36, 28, 143, 143, 202, 40, 55, 137 }, // left = d153 + { 52, 34, 29, 129, 183, 227, 42, 35, 43 }, // left = d207 + { 42, 44, 44, 104, 105, 164, 64, 130, 80 }, // left = d63 + { 43, 81, 53, 140, 169, 204, 68, 84, 72 } // left = tm + } +}; + +const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = { + { 144, 11, 54, 157, 195, 130, 46, 58, 108 }, // y = dc + { 118, 15, 123, 148, 131, 101, 44, 93, 131 }, // y = v + { 113, 12, 23, 188, 226, 142, 26, 32, 125 }, // y = h + { 120, 11, 50, 123, 163, 135, 64, 77, 103 }, // y = d45 + { 113, 9, 36, 155, 111, 157, 32, 44, 161 }, // y 
= d135 + { 116, 9, 55, 176, 76, 96, 37, 61, 149 }, // y = d117 + { 115, 9, 28, 141, 161, 167, 21, 25, 193 }, // y = d153 + { 120, 12, 32, 145, 195, 142, 32, 38, 86 }, // y = d207 + { 116, 12, 64, 120, 140, 125, 49, 115, 121 }, // y = d63 + { 102, 19, 66, 162, 182, 122, 35, 59, 128 } // y = tm +}; + +static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = { + { 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8 + { 132, 68, 18, 165, 217, 196, 45, 40, 78 }, // block_size < 16x16 + { 173, 80, 19, 176, 240, 193, 64, 35, 46 }, // block_size < 32x32 + { 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32 +}; + +static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { + { 120, 7, 76, 176, 208, 126, 28, 54, 103 }, // y = dc + { 48, 12, 154, 155, 139, 90, 34, 117, 119 }, // y = v + { 67, 6, 25, 204, 243, 158, 13, 21, 96 }, // y = h + { 97, 5, 44, 131, 176, 139, 48, 68, 97 }, // y = d45 + { 83, 5, 42, 156, 111, 152, 26, 49, 152 }, // y = d135 + { 80, 5, 58, 178, 74, 83, 33, 62, 145 }, // y = d117 + { 86, 5, 32, 154, 192, 168, 14, 22, 163 }, // y = d153 + { 85, 5, 32, 156, 216, 148, 19, 29, 73 }, // y = d207 + { 77, 7, 64, 116, 132, 122, 37, 126, 120 }, // y = d63 + { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm +}; + +const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not split + { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split +}; + +static const vpx_prob default_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 199, 122, 141 }, // a/l both not split + { 147, 63, 159 }, // a split, l not split + { 148, 133, 118 }, // l split, a not split + { 121, 104, 114 }, // a/l both split + // 16x16 -> 8x8 + { 174, 73, 87 }, // a/l both not split + { 92, 41, 83 }, // a split, l not split + { 82, 99, 50 }, // l split, a not split + { 53, 39, 39 }, // a/l both split + // 32x32 -> 16x16 + { 177, 58, 59 }, // a/l both not split + { 68, 26, 63 }, // a split, l not split + { 52, 79, 25 }, // l split, a not split + { 17, 14, 12 }, // a/l both split + // 64x64 -> 32x32 + { 222, 34, 30 }, // a/l both not split + { 72, 16, 44 }, // a split, l not split + { 58, 32, 12 }, // l split, a not split + { 10, 7, 6 }, // a/l both split +}; + +static const vpx_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] + [INTER_MODES - 1] = { + {2, 173, 34}, // 0 = both zero mv + {7, 145, 85}, // 1 = one zero mv + one a predicted mv + {7, 166, 63}, // 2 = two predicted mvs + {7, 94, 66}, // 3 = one predicted/zero and one new mv + {8, 64, 46}, // 4 = two new mvs + {17, 81, 31}, // 5 = one intra neighbour + x + {25, 29, 30}, // 6 = two intra neighbours +}; + +/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. 
*/ +const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = { + -DC_PRED, 2, /* 0 = DC_NODE */ + -TM_PRED, 4, /* 1 = TM_NODE */ + -V_PRED, 6, /* 2 = V_NODE */ + 8, 12, /* 3 = COM_NODE */ + -H_PRED, 10, /* 4 = H_NODE */ + -D135_PRED, -D117_PRED, /* 5 = D135_NODE */ + -D45_PRED, 14, /* 6 = D45_NODE */ + -D63_PRED, 16, /* 7 = D63_NODE */ + -D153_PRED, -D207_PRED /* 8 = D153_NODE */ +}; + +const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = { + -INTER_OFFSET(ZEROMV), 2, + -INTER_OFFSET(NEARESTMV), 4, + -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV) +}; + +const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { + -PARTITION_NONE, 2, + -PARTITION_HORZ, 4, + -PARTITION_VERT, -PARTITION_SPLIT +}; + +static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { + 9, 102, 187, 225 +}; + +static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { + 239, 183, 119, 96, 41 +}; + +static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = { + 50, 126, 123, 221, 226 +}; + +static const vpx_prob default_single_ref_p[REF_CONTEXTS][2] = { + { 33, 16 }, + { 77, 74 }, + { 142, 142 }, + { 172, 170 }, + { 238, 247 } +}; + +static const struct tx_probs default_tx_probs = { + { { 3, 136, 37 }, + { 5, 52, 13 } }, + + { { 20, 152 }, + { 15, 101 } }, + + { { 100 }, + { 66 } } +}; + +void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p, + unsigned int (*ct_32x32p)[2]) { + ct_32x32p[0][0] = tx_count_32x32p[TX_4X4]; + ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] + + tx_count_32x32p[TX_16X16] + + tx_count_32x32p[TX_32X32]; + ct_32x32p[1][0] = tx_count_32x32p[TX_8X8]; + ct_32x32p[1][1] = tx_count_32x32p[TX_16X16] + + tx_count_32x32p[TX_32X32]; + ct_32x32p[2][0] = tx_count_32x32p[TX_16X16]; + ct_32x32p[2][1] = tx_count_32x32p[TX_32X32]; +} + +void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, + unsigned int (*ct_16x16p)[2]) { + ct_16x16p[0][0] = tx_count_16x16p[TX_4X4]; + ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16]; + ct_16x16p[1][0] = tx_count_16x16p[TX_8X8]; + ct_16x16p[1][1] = tx_count_16x16p[TX_16X16]; +} + +void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, + unsigned int (*ct_8x8p)[2]) { + ct_8x8p[0][0] = tx_count_8x8p[TX_4X4]; + ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; +} + +static const vpx_prob default_skip_probs[SKIP_CONTEXTS] = { + 192, 128, 64 +}; + +static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1] = { + { 235, 162, }, + { 36, 255, }, + { 34, 3, }, + { 149, 144, }, +}; + +static void init_mode_probs(FRAME_CONTEXT *fc) { + vp9_copy(fc->uv_mode_prob, default_if_uv_probs); + vp9_copy(fc->y_mode_prob, default_if_y_probs); + vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob); + vp9_copy(fc->partition_prob, default_partition_probs); + vp9_copy(fc->intra_inter_prob, default_intra_inter_p); + vp9_copy(fc->comp_inter_prob, default_comp_inter_p); + vp9_copy(fc->comp_ref_prob, default_comp_ref_p); + vp9_copy(fc->single_ref_prob, default_single_ref_p); + fc->tx_probs = default_tx_probs; + vp9_copy(fc->skip_probs, default_skip_probs); + vp9_copy(fc->inter_mode_probs, default_inter_mode_probs); +} + +const vpx_tree_index vp9_switchable_interp_tree + [TREE_SIZE(SWITCHABLE_FILTERS)] = { + -EIGHTTAP, 2, + -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP +}; + +void vp9_adapt_mode_probs(VP9_COMMON *cm) { + int i, j; + FRAME_CONTEXT *fc = cm->fc; + const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + const 
FRAME_COUNTS *counts = &cm->counts; + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i], + counts->intra_inter[i]); + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i], + counts->comp_inter[i]); + for (i = 0; i < REF_CONTEXTS; i++) + fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i], + counts->comp_ref[i]); + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + fc->single_ref_prob[i][j] = mode_mv_merge_probs( + pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); + + for (i = 0; i < INTER_MODE_CONTEXTS; i++) + vpx_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i], + counts->inter_mode[i], fc->inter_mode_probs[i]); + + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) + vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i], + counts->y_mode[i], fc->y_mode_prob[i]); + + for (i = 0; i < INTRA_MODES; ++i) + vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i], + counts->uv_mode[i], fc->uv_mode_prob[i]); + + for (i = 0; i < PARTITION_CONTEXTS; i++) + vpx_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i], + counts->partition[i], fc->partition_prob[i]); + + if (cm->interp_filter == SWITCHABLE) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + vpx_tree_merge_probs(vp9_switchable_interp_tree, + pre_fc->switchable_interp_prob[i], + counts->switchable_interp[i], + fc->switchable_interp_prob[i]); + } + + if (cm->tx_mode == TX_MODE_SELECT) { + int j; + unsigned int branch_ct_8x8p[TX_SIZES - 3][2]; + unsigned int branch_ct_16x16p[TX_SIZES - 2][2]; + unsigned int branch_ct_32x32p[TX_SIZES - 1][2]; + + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); + for (j = 0; j < TX_SIZES - 3; ++j) + fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs( + pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); + + tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); + for (j = 0; j < TX_SIZES - 2; ++j) + fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs( + pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); + + tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); + for (j = 0; j < TX_SIZES - 1; ++j) + fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs( + pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); + } + } + + for (i = 0; i < SKIP_CONTEXTS; ++i) + fc->skip_probs[i] = mode_mv_merge_probs( + pre_fc->skip_probs[i], counts->skip[i]); +} + +static void set_default_lf_deltas(struct loopfilter *lf) { + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; + + lf->ref_deltas[INTRA_FRAME] = 1; + lf->ref_deltas[LAST_FRAME] = 0; + lf->ref_deltas[GOLDEN_FRAME] = -1; + lf->ref_deltas[ALTREF_FRAME] = -1; + + lf->mode_deltas[0] = 0; + lf->mode_deltas[1] = 0; +} + +void vp9_setup_past_independence(VP9_COMMON *cm) { + // Reset the segment feature data to the default stats: + // Features disabled, 0, with delta coding (Default state). 
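[Editor's note] The adaptation code above leans on merge_probs()/mode_mv_merge_probs() from vpx_dsp/prob.h: a probability is estimated from this frame's branch counts and blended with the previous frame's probability, with the blend weight growing until the count reaches saturation. The helper below is a hedged approximation of that rule for illustration only (it assumes the vpx_prob typedef from vpx_dsp/prob.h), not the upstream implementation:

static vpx_prob example_merge_prob(vpx_prob pre_prob,
                                   unsigned int ct0, unsigned int ct1,
                                   unsigned int count_sat,
                                   unsigned int max_update_factor) {
  const unsigned int den = ct0 + ct1;
  /* Maximum-likelihood estimate of P(branch == 0), clamped to [1, 255]. */
  unsigned int mle = den ? (255 * ct0 + (den >> 1)) / den : 128;
  unsigned int count, factor;
  if (mle < 1) mle = 1;
  if (mle > 255) mle = 255;
  /* Saturate the count, then scale the update factor accordingly. */
  count = den < count_sat ? den : count_sat;
  factor = max_update_factor * count / count_sat;  /* 0..max_update_factor */
  /* Blend: factor/256 of the new estimate, the remainder from pre_prob. */
  return (vpx_prob)((pre_prob * (256 - factor) + mle * factor + 128) >> 8);
}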
+ struct loopfilter *const lf = &cm->lf; + + int i; + vp9_clearall_segfeatures(&cm->seg); + cm->seg.abs_delta = SEGMENT_DELTADATA; + + if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); + + if (cm->current_frame_seg_map) + memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); + + // Reset the mode ref deltas for loop filter + vp9_zero(lf->last_ref_deltas); + vp9_zero(lf->last_mode_deltas); + set_default_lf_deltas(lf); + + // To force update of the sharpness + lf->last_sharpness_level = -1; + + vp9_default_coef_probs(cm); + init_mode_probs(cm->fc); + vp9_init_mv_probs(cm); + cm->fc->initialized = 1; + + if (cm->frame_type == KEY_FRAME || + cm->error_resilient_mode || cm->reset_frame_context == 3) { + // Reset all frame contexts. + for (i = 0; i < FRAME_CONTEXTS; ++i) + cm->frame_contexts[i] = *cm->fc; + } else if (cm->reset_frame_context == 2) { + // Reset only the frame context specified in the frame header. + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; + } + + // prev_mip will only be allocated in encoder. + if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode) + memset(cm->prev_mip, 0, + cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); + + vp9_zero(cm->ref_frame_sign_bias); + + cm->frame_context_idx = 0; +} diff --git a/thirdparty/libvpx/vp9/common/vp9_entropymode.h b/thirdparty/libvpx/vp9/common/vp9_entropymode.h new file mode 100644 index 000000000000..0285be155734 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_entropymode.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ +#define VP9_COMMON_VP9_ENTROPYMODE_H_ + +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_filter.h" +#include "vpx_dsp/vpx_filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_SIZE_GROUPS 4 + +#define TX_SIZE_CONTEXTS 2 + +#define INTER_OFFSET(mode) ((mode) - NEARESTMV) + +struct VP9Common; + +struct tx_probs { + vpx_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1]; + vpx_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2]; + vpx_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3]; +}; + +struct tx_counts { + unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES]; + unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1]; + unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2]; + unsigned int tx_totals[TX_SIZES]; +}; + +typedef struct frame_contexts { + vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; + vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; + vpx_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; + vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES]; + vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1]; + vpx_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; + vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; + vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS]; + vpx_prob single_ref_prob[REF_CONTEXTS][2]; + vpx_prob comp_ref_prob[REF_CONTEXTS]; + struct tx_probs tx_probs; + vpx_prob skip_probs[SKIP_CONTEXTS]; + nmv_context nmvc; + int initialized; +} FRAME_CONTEXT; + +typedef struct FRAME_COUNTS { + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; + unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; + vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES]; + unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES] + [COEF_BANDS][COEFF_CONTEXTS]; + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int single_ref[REF_CONTEXTS][2][2]; + unsigned int comp_ref[REF_CONTEXTS][2]; + struct tx_counts tx; + unsigned int skip[SKIP_CONTEXTS][2]; + nmv_context_counts mv; +} FRAME_COUNTS; + +extern const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; +extern const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] + [INTRA_MODES - 1]; +extern const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1]; +extern const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)]; +extern const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)]; +extern const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)]; +extern const vpx_tree_index vp9_switchable_interp_tree + [TREE_SIZE(SWITCHABLE_FILTERS)]; + +void vp9_setup_past_independence(struct VP9Common *cm); + +void vp9_adapt_mode_probs(struct VP9Common *cm); + +void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p, + unsigned int (*ct_32x32p)[2]); +void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, + unsigned int (*ct_16x16p)[2]); +void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, + unsigned int (*ct_8x8p)[2]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_entropymv.c b/thirdparty/libvpx/vp9/common/vp9_entropymv.c new file mode 
100644 index 000000000000..566ae91cf7e9 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_entropymv.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_entropymv.h" + +const vpx_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = { + -MV_JOINT_ZERO, 2, + -MV_JOINT_HNZVZ, 4, + -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ +}; + +const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { + -MV_CLASS_0, 2, + -MV_CLASS_1, 4, + 6, 8, + -MV_CLASS_2, -MV_CLASS_3, + 10, 12, + -MV_CLASS_4, -MV_CLASS_5, + -MV_CLASS_6, 14, + 16, 18, + -MV_CLASS_7, -MV_CLASS_8, + -MV_CLASS_9, -MV_CLASS_10, +}; + +const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { + -0, -1, +}; + +const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { + -0, 2, + -1, 4, + -2, -3 +}; + +static const nmv_context default_nmv_context = { + {32, 64, 96}, + { + { // Vertical component + 128, // sign + {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, // class + {216}, // class0 + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, // bits + {{128, 128, 64}, {96, 112, 64}}, // class0_fp + {64, 96, 64}, // fp + 160, // class0_hp bit + 128, // hp + }, + { // Horizontal component + 128, // sign + {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, // class + {208}, // class0 + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, // bits + {{128, 128, 64}, {96, 112, 64}}, // class0_fp + {64, 96, 64}, // fp + 160, // class0_hp bit + 128, // hp + } + }, +}; + +static const uint8_t log_in_base_2[] = { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 +}; + +static INLINE int mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} + +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { + const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) ? + MV_CLASS_10 : (MV_CLASS_TYPE)log_in_base_2[z >> 3]; + if (offset) + *offset = z - mv_class_base(c); + return c; +} + +static void inc_mv_component(int v, nmv_component_counts *comp_counts, + int incr, int usehp) { + int s, z, c, o, d, e, f; + assert(v != 0); /* should not be zero */ + s = v < 0; + comp_counts->sign[s] += incr; + z = (s ? 
-v : v) - 1; /* magnitude - 1 */ + + c = vp9_get_mv_class(z, &o); + comp_counts->classes[c] += incr; + + d = (o >> 3); /* int mv data */ + f = (o >> 1) & 3; /* fractional pel mv data */ + e = (o & 1); /* high precision mv data */ + + if (c == MV_CLASS_0) { + comp_counts->class0[d] += incr; + comp_counts->class0_fp[d][f] += incr; + comp_counts->class0_hp[e] += usehp * incr; + } else { + int i; + int b = c + CLASS0_BITS - 1; // number of bits + for (i = 0; i < b; ++i) + comp_counts->bits[i][((d >> i) & 1)] += incr; + comp_counts->fp[f] += incr; + comp_counts->hp[e] += usehp * incr; + } +} + +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { + if (counts != NULL) { + const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); + ++counts->joints[j]; + + if (mv_joint_vertical(j)) { + inc_mv_component(mv->row, &counts->comps[0], 1, 1); + } + + if (mv_joint_horizontal(j)) { + inc_mv_component(mv->col, &counts->comps[1], 1, 1); + } + } +} + +void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { + int i, j; + + nmv_context *fc = &cm->fc->nmvc; + const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc; + const nmv_context_counts *counts = &cm->counts.mv; + + vpx_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, + fc->joints); + + for (i = 0; i < 2; ++i) { + nmv_component *comp = &fc->comps[i]; + const nmv_component *pre_comp = &pre_fc->comps[i]; + const nmv_component_counts *c = &counts->comps[i]; + + comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign); + vpx_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes, + comp->classes); + vpx_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, + comp->class0); + + for (j = 0; j < MV_OFFSET_BITS; ++j) + comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]); + + for (j = 0; j < CLASS0_SIZE; ++j) + vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], + c->class0_fp[j], comp->class0_fp[j]); + + vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp); + + if (allow_hp) { + comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp); + comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp); + } + } +} + +void vp9_init_mv_probs(VP9_COMMON *cm) { + cm->fc->nmvc = default_nmv_context; +} diff --git a/thirdparty/libvpx/vp9/common/vp9_entropymv.h b/thirdparty/libvpx/vp9/common/vp9_entropymv.h new file mode 100644 index 000000000000..2f05ad44b601 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_entropymv.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP9_COMMON_VP9_ENTROPYMV_H_ +#define VP9_COMMON_VP9_ENTROPYMV_H_ + +#include "./vpx_config.h" + +#include "vpx_dsp/prob.h" + +#include "vp9/common/vp9_mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; + +void vp9_init_mv_probs(struct VP9Common *cm); + +void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); + +// Integer pel reference mv threshold for use of high-precision 1/8 mv +#define COMPANDED_MVREF_THRESH 8 + +static INLINE int use_mv_hp(const MV *ref) { + return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && + (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH; +} + +#define MV_UPDATE_PROB 252 + +/* Symbols for coding which components are zero jointly */ +#define MV_JOINTS 4 +typedef enum { + MV_JOINT_ZERO = 0, /* Zero vector */ + MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ + MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ + MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ +} MV_JOINT_TYPE; + +static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { + return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; +} + +static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { + return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; +} + +/* Symbols for coding magnitude class of nonzero components */ +#define MV_CLASSES 11 +typedef enum { + MV_CLASS_0 = 0, /* (0, 2] integer pel */ + MV_CLASS_1 = 1, /* (2, 4] integer pel */ + MV_CLASS_2 = 2, /* (4, 8] integer pel */ + MV_CLASS_3 = 3, /* (8, 16] integer pel */ + MV_CLASS_4 = 4, /* (16, 32] integer pel */ + MV_CLASS_5 = 5, /* (32, 64] integer pel */ + MV_CLASS_6 = 6, /* (64, 128] integer pel */ + MV_CLASS_7 = 7, /* (128, 256] integer pel */ + MV_CLASS_8 = 8, /* (256, 512] integer pel */ + MV_CLASS_9 = 9, /* (512, 1024] integer pel */ + MV_CLASS_10 = 10, /* (1024,2048] integer pel */ +} MV_CLASS_TYPE; + +#define CLASS0_BITS 1 /* bits at integer precision for class 0 */ +#define CLASS0_SIZE (1 << CLASS0_BITS) +#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) +#define MV_FP_SIZE 4 + +#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) +#define MV_MAX ((1 << MV_MAX_BITS) - 1) +#define MV_VALS ((MV_MAX << 1) + 1) + +#define MV_IN_USE_BITS 14 +#define MV_UPP ((1 << MV_IN_USE_BITS) - 1) +#define MV_LOW (-(1 << MV_IN_USE_BITS)) + +extern const vpx_tree_index vp9_mv_joint_tree[]; +extern const vpx_tree_index vp9_mv_class_tree[]; +extern const vpx_tree_index vp9_mv_class0_tree[]; +extern const vpx_tree_index vp9_mv_fp_tree[]; + +typedef struct { + vpx_prob sign; + vpx_prob classes[MV_CLASSES - 1]; + vpx_prob class0[CLASS0_SIZE - 1]; + vpx_prob bits[MV_OFFSET_BITS]; + vpx_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1]; + vpx_prob fp[MV_FP_SIZE - 1]; + vpx_prob class0_hp; + vpx_prob hp; +} nmv_component; + +typedef struct { + vpx_prob joints[MV_JOINTS - 1]; + nmv_component comps[2]; +} nmv_context; + +static INLINE MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) { + if (mv->row == 0) { + return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ; + } else { + return mv->col == 0 ? 
MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ; + } +} + +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset); + +typedef struct { + unsigned int sign[2]; + unsigned int classes[MV_CLASSES]; + unsigned int class0[CLASS0_SIZE]; + unsigned int bits[MV_OFFSET_BITS][2]; + unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE]; + unsigned int fp[MV_FP_SIZE]; + unsigned int class0_hp[2]; + unsigned int hp[2]; +} nmv_component_counts; + +typedef struct { + unsigned int joints[MV_JOINTS]; + nmv_component_counts comps[2]; +} nmv_context_counts; + +void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_enums.h b/thirdparty/libvpx/vp9/common/vp9_enums.h new file mode 100644 index 000000000000..d089f23f970c --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_enums.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_ENUMS_H_ +#define VP9_COMMON_VP9_ENUMS_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MI_SIZE_LOG2 3 +#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6 + +#define MI_SIZE (1 << MI_SIZE_LOG2) // pixels per mi-unit +#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2) // mi-units per max block + +#define MI_MASK (MI_BLOCK_SIZE - 1) + +// Bitstream profiles indicated by 2-3 bits in the uncompressed header. +// 00: Profile 0. 8-bit 4:2:0 only. +// 10: Profile 1. 8-bit 4:4:4, 4:2:2, and 4:4:0. +// 01: Profile 2. 10-bit and 12-bit color only, with 4:2:0 sampling. +// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0 +// sampling. +// 111: Undefined profile. 
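[Editor's note] A hypothetical sketch of how the profile bit patterns documented above decode from the uncompressed header, consistent with the comment's 00/10/01/110/111 mapping; read_bit() and ctx are placeholders for the caller's bit reader, not a real libvpx API:

static int example_read_profile(int (*read_bit)(void *ctx), void *ctx) {
  int profile = read_bit(ctx);        /* low bit  */
  profile |= read_bit(ctx) << 1;      /* high bit: 00->0, 10->1, 01->2, 11->3+ */
  if (profile > 2)
    profile += read_bit(ctx);         /* 110 -> 3; 111 -> 4 (undefined) */
  return profile;                     /* values >= MAX_PROFILES are invalid */
}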
+typedef enum BITSTREAM_PROFILE { + PROFILE_0, + PROFILE_1, + PROFILE_2, + PROFILE_3, + MAX_PROFILES +} BITSTREAM_PROFILE; + +#define BLOCK_4X4 0 +#define BLOCK_4X8 1 +#define BLOCK_8X4 2 +#define BLOCK_8X8 3 +#define BLOCK_8X16 4 +#define BLOCK_16X8 5 +#define BLOCK_16X16 6 +#define BLOCK_16X32 7 +#define BLOCK_32X16 8 +#define BLOCK_32X32 9 +#define BLOCK_32X64 10 +#define BLOCK_64X32 11 +#define BLOCK_64X64 12 +#define BLOCK_SIZES 13 +#define BLOCK_INVALID BLOCK_SIZES +typedef uint8_t BLOCK_SIZE; + +typedef enum PARTITION_TYPE { + PARTITION_NONE, + PARTITION_HORZ, + PARTITION_VERT, + PARTITION_SPLIT, + PARTITION_TYPES, + PARTITION_INVALID = PARTITION_TYPES +} PARTITION_TYPE; + +typedef char PARTITION_CONTEXT; +#define PARTITION_PLOFFSET 4 // number of probability models per block size +#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) + +// block transform size +typedef uint8_t TX_SIZE; +#define TX_4X4 ((TX_SIZE)0) // 4x4 transform +#define TX_8X8 ((TX_SIZE)1) // 8x8 transform +#define TX_16X16 ((TX_SIZE)2) // 16x16 transform +#define TX_32X32 ((TX_SIZE)3) // 32x32 transform +#define TX_SIZES ((TX_SIZE)4) + +// frame transform mode +typedef enum { + ONLY_4X4 = 0, // only 4x4 transform used + ALLOW_8X8 = 1, // allow block transform size up to 8x8 + ALLOW_16X16 = 2, // allow block transform size up to 16x16 + ALLOW_32X32 = 3, // allow block transform size up to 32x32 + TX_MODE_SELECT = 4, // transform specified for each block + TX_MODES = 5, +} TX_MODE; + +typedef enum { + DCT_DCT = 0, // DCT in both horizontal and vertical + ADST_DCT = 1, // ADST in vertical, DCT in horizontal + DCT_ADST = 2, // DCT in vertical, ADST in horizontal + ADST_ADST = 3, // ADST in both directions + TX_TYPES = 4 +} TX_TYPE; + +typedef enum { + VP9_LAST_FLAG = 1 << 0, + VP9_GOLD_FLAG = 1 << 1, + VP9_ALT_FLAG = 1 << 2, +} VP9_REFFRAME; + +typedef enum { + PLANE_TYPE_Y = 0, + PLANE_TYPE_UV = 1, + PLANE_TYPES +} PLANE_TYPE; + +#define DC_PRED 0 // Average of above and left pixels +#define V_PRED 1 // Vertical +#define H_PRED 2 // Horizontal +#define D45_PRED 3 // Directional 45 deg = round(arctan(1/1) * 180/pi) +#define D135_PRED 4 // Directional 135 deg = 180 - 45 +#define D117_PRED 5 // Directional 117 deg = 180 - 63 +#define D153_PRED 6 // Directional 153 deg = 180 - 27 +#define D207_PRED 7 // Directional 207 deg = 180 + 27 +#define D63_PRED 8 // Directional 63 deg = round(arctan(2/1) * 180/pi) +#define TM_PRED 9 // True-motion +#define NEARESTMV 10 +#define NEARMV 11 +#define ZEROMV 12 +#define NEWMV 13 +#define MB_MODE_COUNT 14 +typedef uint8_t PREDICTION_MODE; + +#define INTRA_MODES (TM_PRED + 1) + +#define INTER_MODES (1 + NEWMV - NEARESTMV) + +#define SKIP_CONTEXTS 3 +#define INTER_MODE_CONTEXTS 7 + +/* Segment Feature Masks */ +#define MAX_MV_REF_CANDIDATES 2 + +#define INTRA_INTER_CONTEXTS 4 +#define COMP_INTER_CONTEXTS 5 +#define REF_CONTEXTS 5 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_filter.c b/thirdparty/libvpx/vp9/common/vp9_filter.c new file mode 100644 index 000000000000..4b2198fc40a6 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_filter.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "vp9/common/vp9_filter.h" + +DECLARE_ALIGNED(256, static const InterpKernel, + bilinear_filters[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, + { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, + { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, + { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, + { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, + { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, + { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, + { 0, 0, 0, 8, 120, 0, 0, 0 } +}; + +// Lagrangian interpolation filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0}, + { 0, 1, -5, 126, 8, -3, 1, 0}, + { -1, 3, -10, 122, 18, -6, 2, 0}, + { -1, 4, -13, 118, 27, -9, 3, -1}, + { -1, 4, -16, 112, 37, -11, 4, -1}, + { -1, 5, -18, 105, 48, -14, 4, -1}, + { -1, 5, -19, 97, 58, -16, 5, -1}, + { -1, 6, -19, 88, 68, -18, 5, -1}, + { -1, 6, -19, 78, 78, -19, 6, -1}, + { -1, 5, -18, 68, 88, -19, 6, -1}, + { -1, 5, -16, 58, 97, -19, 5, -1}, + { -1, 4, -14, 48, 105, -18, 5, -1}, + { -1, 4, -11, 37, 112, -16, 4, -1}, + { -1, 3, -9, 27, 118, -13, 4, -1}, + { 0, 2, -6, 18, 122, -10, 3, -1}, + { 0, 1, -3, 8, 126, -5, 1, 0} +}; + +// DCT based filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8s[SUBPEL_SHIFTS]) = { + {0, 0, 0, 128, 0, 0, 0, 0}, + {-1, 3, -7, 127, 8, -3, 1, 0}, + {-2, 5, -13, 125, 17, -6, 3, -1}, + {-3, 7, -17, 121, 27, -10, 5, -2}, + {-4, 9, -20, 115, 37, -13, 6, -2}, + {-4, 10, -23, 108, 48, -16, 8, -3}, + {-4, 10, -24, 100, 59, -19, 9, -3}, + {-4, 11, -24, 90, 70, -21, 10, -4}, + {-4, 11, -23, 80, 80, -23, 11, -4}, + {-4, 10, -21, 70, 90, -24, 11, -4}, + {-3, 9, -19, 59, 100, -24, 10, -4}, + {-3, 8, -16, 48, 108, -23, 10, -4}, + {-2, 6, -13, 37, 115, -20, 9, -4}, + {-2, 5, -10, 27, 121, -17, 7, -3}, + {-1, 3, -6, 17, 125, -13, 5, -2}, + {0, 1, -3, 8, 127, -7, 3, -1} +}; + +// freqmultiplier = 0.5 +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0}, + {-3, -1, 32, 64, 38, 1, -3, 0}, + {-2, -2, 29, 63, 41, 2, -3, 0}, + {-2, -2, 26, 63, 43, 4, -4, 0}, + {-2, -3, 24, 62, 46, 5, -4, 0}, + {-2, -3, 21, 60, 49, 7, -4, 0}, + {-1, -4, 18, 59, 51, 9, -4, 0}, + {-1, -4, 16, 57, 53, 12, -4, -1}, + {-1, -4, 14, 55, 55, 14, -4, -1}, + {-1, -4, 12, 53, 57, 16, -4, -1}, + { 0, -4, 9, 51, 59, 18, -4, -1}, + { 0, -4, 7, 49, 60, 21, -3, -2}, + { 0, -4, 5, 46, 62, 24, -3, -2}, + { 0, -4, 4, 43, 63, 26, -2, -2}, + { 0, -3, 2, 41, 63, 29, -2, -2}, + { 0, -3, 1, 38, 64, 32, -1, -3} +}; + + +const InterpKernel *vp9_filter_kernels[4] = { + sub_pel_filters_8, + sub_pel_filters_8lp, + sub_pel_filters_8s, + bilinear_filters +}; diff --git a/thirdparty/libvpx/vp9/common/vp9_filter.h b/thirdparty/libvpx/vp9/common/vp9_filter.h new file mode 100644 index 000000000000..efa24bc67b84 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_filter.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS.
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_FILTER_H_ +#define VP9_COMMON_VP9_FILTER_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +#define EIGHTTAP 0 +#define EIGHTTAP_SMOOTH 1 +#define EIGHTTAP_SHARP 2 +#define SWITCHABLE_FILTERS 3 /* Number of switchable filters */ +#define BILINEAR 3 +// The codec can operate in four possible inter prediction filter mode: +// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. +#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) +#define SWITCHABLE 4 /* should be the last one */ + +typedef uint8_t INTERP_FILTER; + +extern const InterpKernel *vp9_filter_kernels[4]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_frame_buffers.c b/thirdparty/libvpx/vp9/common/vp9_frame_buffers.c new file mode 100644 index 000000000000..0f41d66985f3 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_frame_buffers.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "vp9/common/vp9_frame_buffers.h" +#include "vpx_mem/vpx_mem.h" + +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + assert(list != NULL); + vp9_free_internal_frame_buffers(list); + + list->num_internal_frame_buffers = + VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + list->int_fb = + (InternalFrameBuffer *)vpx_calloc(list->num_internal_frame_buffers, + sizeof(*list->int_fb)); + return (list->int_fb == NULL); +} + +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + vpx_free(list->int_fb[i].data); + list->int_fb[i].data = NULL; + } + vpx_free(list->int_fb); + list->int_fb = NULL; +} + +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + int i; + InternalFrameBufferList *const int_fb_list = + (InternalFrameBufferList *)cb_priv; + if (int_fb_list == NULL) + return -1; + + // Find a free frame buffer. + for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { + if (!int_fb_list->int_fb[i].in_use) + break; + } + + if (i == int_fb_list->num_internal_frame_buffers) + return -1; + + if (int_fb_list->int_fb[i].size < min_size) { + int_fb_list->int_fb[i].data = + (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); + if (!int_fb_list->int_fb[i].data) + return -1; + + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + memset(int_fb_list->int_fb[i].data, 0, min_size); + int_fb_list->int_fb[i].size = min_size; + } + + fb->data = int_fb_list->int_fb[i].data; + fb->size = int_fb_list->int_fb[i].size; + int_fb_list->int_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the internal frame buffer.
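+  // (vp9_release_frame_buffer() below recovers this pointer from fb->priv to
+  // clear the in_use flag. This get/release pair is the decoder's built-in
+  // buffer pool; applications supplying their own pool register callbacks
+  // with the same signatures via vpx_codec_set_frame_buffer_functions(),
+  // which hands cb_priv back on every call.)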
+ fb->priv = &int_fb_list->int_fb[i]; + return 0; +} + +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) { + InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; + (void)cb_priv; + if (int_fb) + int_fb->in_use = 0; + return 0; +} diff --git a/thirdparty/libvpx/vp9/common/vp9_frame_buffers.h b/thirdparty/libvpx/vp9/common/vp9_frame_buffers.h new file mode 100644 index 000000000000..e2cfe61b6621 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_frame_buffers.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ + +#include "vpx/vpx_frame_buffer.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libvpx to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. +// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); + +// Callback used by libvpx when there are no references to the frame buffer. +// |cb_priv| is not used. |fb| pointer to the frame buffer. +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_idct.c b/thirdparty/libvpx/vp9/common/vp9_idct.c new file mode 100644 index 000000000000..1b420143bb32 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_idct.c @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <math.h> + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_ports/mem.h" + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + const transform_2d IHT_4[] = { + { idct4_c, idct4_c }, // DCT_DCT = 0 + { iadst4_c, idct4_c }, // ADST_DCT = 1 + { idct4_c, iadst4_c }, // DCT_ADST = 2 + { iadst4_c, iadst4_c } // ADST_ADST = 3 + }; + + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // inverse transform row vectors + for (i = 0; i < 4; ++i) { + IHT_4[tx_type].rows(input, outptr); + input += 4; + outptr += 4; + } + + // inverse transform column vectors + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + IHT_4[tx_type].cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 4)); + } + } +} + +static const transform_2d IHT_8[] = { + { idct8_c, idct8_c }, // DCT_DCT = 0 + { iadst8_c, idct8_c }, // ADST_DCT = 1 + { idct8_c, iadst8_c }, // DCT_ADST = 2 + { iadst8_c, iadst8_c } // ADST_ADST = 3 +}; + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + const transform_2d ht = IHT_8[tx_type]; + + // inverse transform row vectors + for (i = 0; i < 8; ++i) { + ht.rows(input, outptr); + input += 8; + outptr += 8; + } + + // inverse transform column vectors + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +static const transform_2d IHT_16[] = { + { idct16_c, idct16_c }, // DCT_DCT = 0 + { iadst16_c, idct16_c }, // ADST_DCT = 1 + { idct16_c, iadst16_c }, // DCT_ADST = 2 + { iadst16_c, iadst16_c } // ADST_ADST = 3 +}; + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int i, j; + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + const transform_2d ht = IHT_16[tx_type]; + + // Rows + for (i = 0; i < 16; ++i) { + ht.rows(input, outptr); + input += 16; + outptr += 16; + } + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +// idct +void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + if (eob > 1) + vpx_idct4x4_16_add(input, dest, stride); + else + vpx_idct4x4_1_add(input, dest, stride); +} + + +void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + if (eob > 1) + vpx_iwht4x4_16_add(input, dest, stride); + else + vpx_iwht4x4_1_add(input, dest, stride); +} + +void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + // If dc is 1, then input[0] is the reconstructed value, do not need + // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. + + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do.
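+  // For example, eob == 1 means only the DC coefficient is non-zero, so the
+  // cheap DC-only path suffices; eob <= 12 means every non-zero coefficient
+  // lies within the range covered by the reduced 12-coefficient inverse
+  // transform.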
+ // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. + // Combine that with code here. + if (eob == 1) + // DC only DCT coefficient + vpx_idct8x8_1_add(input, dest, stride); + else if (eob <= 12) + vpx_idct8x8_12_add(input, dest, stride); + else + vpx_idct8x8_64_add(input, dest, stride); +} + +void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + /* The calculation can be simplified if there are not many non-zero dct + * coefficients. Use eobs to separate different cases. */ + if (eob == 1) + /* DC only DCT coefficient. */ + vpx_idct16x16_1_add(input, dest, stride); + else if (eob <= 10) + vpx_idct16x16_10_add(input, dest, stride); + else + vpx_idct16x16_256_add(input, dest, stride); +} + +void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob) { + if (eob == 1) + vpx_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vpx_idct32x32_34_add(input, dest, stride); + else if (eob <= 135) + // non-zero coeff only in upper-left 16x16 + vpx_idct32x32_135_add(input, dest, stride); + else + vpx_idct32x32_1024_add(input, dest, stride); +} + +// iht +void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) + vp9_idct4x4_add(input, dest, stride, eob); + else + vp9_iht4x4_16_add(input, dest, stride, tx_type); +} + +void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct8x8_add(input, dest, stride, eob); + } else { + vp9_iht8x8_64_add(input, dest, stride, tx_type); + } +} + +void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct16x16_add(input, dest, stride, eob); + } else { + vp9_iht16x16_256_add(input, dest, stride, tx_type); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int tx_type, int bd) { + const highbd_transform_2d IHT_4[] = { + { vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0 + { vpx_highbd_iadst4_c, vpx_highbd_idct4_c }, // ADST_DCT = 1 + { vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2 + { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3 + }; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // Inverse transform row vectors. + for (i = 0; i < 4; ++i) { + IHT_4[tx_type].rows(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Inverse transform column vectors. 
+ for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + IHT_4[tx_type].cols(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } +} + +static const highbd_transform_2d HIGH_IHT_8[] = { + { vpx_highbd_idct8_c, vpx_highbd_idct8_c }, // DCT_DCT = 0 + { vpx_highbd_iadst8_c, vpx_highbd_idct8_c }, // ADST_DCT = 1 + { vpx_highbd_idct8_c, vpx_highbd_iadst8_c }, // DCT_ADST = 2 + { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3 +}; + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int tx_type, int bd) { + int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + const highbd_transform_2d ht = HIGH_IHT_8[tx_type]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Inverse transform row vectors. + for (i = 0; i < 8; ++i) { + ht.rows(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Inverse transform column vectors. + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + ht.cols(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +static const highbd_transform_2d HIGH_IHT_16[] = { + { vpx_highbd_idct16_c, vpx_highbd_idct16_c }, // DCT_DCT = 0 + { vpx_highbd_iadst16_c, vpx_highbd_idct16_c }, // ADST_DCT = 1 + { vpx_highbd_idct16_c, vpx_highbd_iadst16_c }, // DCT_ADST = 2 + { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3 +}; + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int tx_type, int bd) { + int i, j; + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + const highbd_transform_2d ht = HIGH_IHT_16[tx_type]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 16; ++i) { + ht.rows(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + ht.cols(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +// idct +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd) { + if (eob > 1) + vpx_highbd_idct4x4_16_add(input, dest, stride, bd); + else + vpx_highbd_idct4x4_1_add(input, dest, stride, bd); +} + + +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd) { + if (eob > 1) + vpx_highbd_iwht4x4_16_add(input, dest, stride, bd); + else + vpx_highbd_iwht4x4_1_add(input, dest, stride, bd); +} + +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd) { + // If dc is 1, then input[0] is the reconstructed value, do not need + // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. + + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do. + // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. + // Combine that with code here. 
+ // DC only DCT coefficient + if (eob == 1) { + vpx_highbd_idct8x8_1_add(input, dest, stride, bd); + } else if (eob <= 10) { + vpx_highbd_idct8x8_10_add(input, dest, stride, bd); + } else { + vpx_highbd_idct8x8_64_add(input, dest, stride, bd); + } +} + +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, + int stride, int eob, int bd) { + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to separate different cases. + // DC only DCT coefficient. + if (eob == 1) { + vpx_highbd_idct16x16_1_add(input, dest, stride, bd); + } else if (eob <= 10) { + vpx_highbd_idct16x16_10_add(input, dest, stride, bd); + } else { + vpx_highbd_idct16x16_256_add(input, dest, stride, bd); + } +} + +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, + int stride, int eob, int bd) { + // Non-zero coeff only in upper-left 8x8 + if (eob == 1) { + vpx_highbd_idct32x32_1_add(input, dest, stride, bd); + } else if (eob <= 34) { + vpx_highbd_idct32x32_34_add(input, dest, stride, bd); + } else { + vpx_highbd_idct32x32_1024_add(input, dest, stride, bd); + } +} + +// iht +void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, + uint8_t *dest, int stride, int eob, int bd) { + if (tx_type == DCT_DCT) + vp9_highbd_idct4x4_add(input, dest, stride, eob, bd); + else + vp9_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd); +} + +void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, + uint8_t *dest, int stride, int eob, int bd) { + if (tx_type == DCT_DCT) { + vp9_highbd_idct8x8_add(input, dest, stride, eob, bd); + } else { + vp9_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd); + } +} + +void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, + uint8_t *dest, int stride, int eob, int bd) { + if (tx_type == DCT_DCT) { + vp9_highbd_idct16x16_add(input, dest, stride, eob, bd); + } else { + vp9_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vp9/common/vp9_idct.h b/thirdparty/libvpx/vp9/common/vp9_idct.h new file mode 100644 index 000000000000..b5a3fbf36208 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_idct.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_IDCT_H_ +#define VP9_COMMON_VP9_IDCT_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*transform_1d)(const tran_low_t*, tran_low_t*); + +typedef struct { + transform_1d cols, rows; // vertical and horizontal +} transform_2d; + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*highbd_transform_1d)(const tran_low_t*, tran_low_t*, int bd); + +typedef struct { + highbd_transform_1d cols, rows; // vertical and horizontal +} highbd_transform_2d; +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob); + +void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob); +void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob); +void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, + int stride, int eob); + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, + int stride, int eob, int bd); +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, + int stride, int eob, int bd); +void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, + uint8_t *dest, int stride, int eob, int bd); +void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, + uint8_t *dest, int stride, int eob, int bd); +void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, + uint8_t *dest, int stride, int eob, int bd); +#endif // CONFIG_VP9_HIGHBITDEPTH +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_IDCT_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_loopfilter.c b/thirdparty/libvpx/vp9/common/vp9_loopfilter.c new file mode 100644 index 000000000000..183dec4e714e --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_loopfilter.c @@ -0,0 +1,1697 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_reconinter.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_seg_common.h" + +// 64 bit masks for left transform size.
Each 1 represents a position where +// we should apply a loop filter across the left border of an 8x8 block +// boundary. +// +// In the case of TX_16X16-> ( in low order byte first we end up with +// a mask that looks like this +// +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// +// A loopfilter should be applied to every other 8x8 horizontally. +static const uint64_t left_64x64_txform_mask[TX_SIZES]= { + 0xffffffffffffffffULL, // TX_4X4 + 0xffffffffffffffffULL, // TX_8x8 + 0x5555555555555555ULL, // TX_16x16 + 0x1111111111111111ULL, // TX_32x32 +}; + +// 64 bit masks for above transform size. Each 1 represents a position where +// we should apply a loop filter across the top border of an 8x8 block +// boundary. +// +// In the case of TX_32x32 -> ( in low order byte first we end up with +// a mask that looks like this +// +// 11111111 +// 00000000 +// 00000000 +// 00000000 +// 11111111 +// 00000000 +// 00000000 +// 00000000 +// +// A loopfilter should be applied to every other 4 the row vertically. +static const uint64_t above_64x64_txform_mask[TX_SIZES]= { + 0xffffffffffffffffULL, // TX_4X4 + 0xffffffffffffffffULL, // TX_8x8 + 0x00ff00ff00ff00ffULL, // TX_16x16 + 0x000000ff000000ffULL, // TX_32x32 +}; + +// 64 bit masks for prediction sizes (left). Each 1 represents a position +// where left border of an 8x8 block. These are aligned to the right most +// appropriate bit, and then shifted into place. +// +// In the case of TX_16x32 -> ( low order byte first ) we end up with +// a mask that looks like this : +// +// 10000000 +// 10000000 +// 10000000 +// 10000000 +// 00000000 +// 00000000 +// 00000000 +// 00000000 +static const uint64_t left_prediction_mask[BLOCK_SIZES] = { + 0x0000000000000001ULL, // BLOCK_4X4, + 0x0000000000000001ULL, // BLOCK_4X8, + 0x0000000000000001ULL, // BLOCK_8X4, + 0x0000000000000001ULL, // BLOCK_8X8, + 0x0000000000000101ULL, // BLOCK_8X16, + 0x0000000000000001ULL, // BLOCK_16X8, + 0x0000000000000101ULL, // BLOCK_16X16, + 0x0000000001010101ULL, // BLOCK_16X32, + 0x0000000000000101ULL, // BLOCK_32X16, + 0x0000000001010101ULL, // BLOCK_32X32, + 0x0101010101010101ULL, // BLOCK_32X64, + 0x0000000001010101ULL, // BLOCK_64X32, + 0x0101010101010101ULL, // BLOCK_64X64 +}; + +// 64 bit mask to shift and set for each prediction size. +static const uint64_t above_prediction_mask[BLOCK_SIZES] = { + 0x0000000000000001ULL, // BLOCK_4X4 + 0x0000000000000001ULL, // BLOCK_4X8 + 0x0000000000000001ULL, // BLOCK_8X4 + 0x0000000000000001ULL, // BLOCK_8X8 + 0x0000000000000001ULL, // BLOCK_8X16, + 0x0000000000000003ULL, // BLOCK_16X8 + 0x0000000000000003ULL, // BLOCK_16X16 + 0x0000000000000003ULL, // BLOCK_16X32, + 0x000000000000000fULL, // BLOCK_32X16, + 0x000000000000000fULL, // BLOCK_32X32, + 0x000000000000000fULL, // BLOCK_32X64, + 0x00000000000000ffULL, // BLOCK_64X32, + 0x00000000000000ffULL, // BLOCK_64X64 +}; +// 64 bit mask to shift and set for each prediction size. A bit is set for +// each 8x8 block that would be in the left most block of the given block +// size in the 64x64 block. 
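+// (Rows of the 64x64 region map to successive bytes of the mask, one bit per
+// 8x8 block. For example the BLOCK_16X16 entry below is 0x0303: bits 0-1
+// cover its first row of two 8x8 blocks and bits 8-9 its second row.)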
+static const uint64_t size_mask[BLOCK_SIZES] = { + 0x0000000000000001ULL, // BLOCK_4X4 + 0x0000000000000001ULL, // BLOCK_4X8 + 0x0000000000000001ULL, // BLOCK_8X4 + 0x0000000000000001ULL, // BLOCK_8X8 + 0x0000000000000101ULL, // BLOCK_8X16, + 0x0000000000000003ULL, // BLOCK_16X8 + 0x0000000000000303ULL, // BLOCK_16X16 + 0x0000000003030303ULL, // BLOCK_16X32, + 0x0000000000000f0fULL, // BLOCK_32X16, + 0x000000000f0f0f0fULL, // BLOCK_32X32, + 0x0f0f0f0f0f0f0f0fULL, // BLOCK_32X64, + 0x00000000ffffffffULL, // BLOCK_64X32, + 0xffffffffffffffffULL, // BLOCK_64X64 +}; + +// These are used for masking the left and above borders. +static const uint64_t left_border = 0x1111111111111111ULL; +static const uint64_t above_border = 0x000000ff000000ffULL; + +// 16 bit masks for uv transform sizes. +static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x5555, // TX_16x16 + 0x1111, // TX_32x32 +}; + +static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x0f0f, // TX_16x16 + 0x000f, // TX_32x32 +}; + +// 16 bit left mask to shift and set for each uv prediction size. +static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4, + 0x0001, // BLOCK_4X8, + 0x0001, // BLOCK_8X4, + 0x0001, // BLOCK_8X8, + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8, + 0x0001, // BLOCK_16X16, + 0x0011, // BLOCK_16X32, + 0x0001, // BLOCK_32X16, + 0x0011, // BLOCK_32X32, + 0x1111, // BLOCK_32X64 + 0x0011, // BLOCK_64X32, + 0x1111, // BLOCK_64X64 +}; +// 16 bit above mask to shift and set for uv each prediction size. +static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0001, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0003, // BLOCK_32X32, + 0x0003, // BLOCK_32X64, + 0x000f, // BLOCK_64X32, + 0x000f, // BLOCK_64X64 +}; + +// 64 bit mask to shift and set for each uv prediction size +static const uint16_t size_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0011, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0033, // BLOCK_32X32, + 0x3333, // BLOCK_32X64, + 0x00ff, // BLOCK_64X32, + 0xffff, // BLOCK_64X64 +}; +static const uint16_t left_border_uv = 0x1111; +static const uint16_t above_border_uv = 0x000f; + +static const int mode_lf_lut[MB_MODE_COUNT] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1 // INTER_MODES (ZEROMV == 0) +}; + +static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { + int lvl; + + // For each possible value for the loop filter fill out limits + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { + // Set loop filter parameters that control sharpness. 
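+    // (Worked example: with sharpness_lvl == 4 and lvl == 32 the computation
+    // below gives 32 >> 1 == 16, which is then capped at 9 - 4 == 5, so the
+    // inside limit ends up as 5.)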
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); + + if (sharpness_lvl > 0) { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) + block_inside_limit = 1; + + memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); + memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +static uint8_t get_filter_level(const loop_filter_info_n *lfi_n, + const MODE_INFO *mi) { + return lfi_n->lvl[mi->segment_id][mi->ref_frame[0]] + [mode_lf_lut[mi->mode]]; +} + +void vp9_loop_filter_init(VP9_COMMON *cm) { + loop_filter_info_n *lfi = &cm->lf_info; + struct loopfilter *lf = &cm->lf; + int lvl; + + // init limits for given sharpness + update_sharpness(lfi, lf->sharpness_level); + lf->last_sharpness_level = lf->sharpness_level; + + // init hev threshold const vectors + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) + memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); +} + +void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { + int seg_id; + // n_shift is the multiplier for lf_deltas + // the multiplier is 1 for when filter_lvl is between 0 and 31; + // 2 when filter_lvl is between 32 and 63 + const int scale = 1 << (default_filt_lvl >> 5); + loop_filter_info_n *const lfi = &cm->lf_info; + struct loopfilter *const lf = &cm->lf; + const struct segmentation *const seg = &cm->seg; + + // update limits if sharpness has changed + if (lf->last_sharpness_level != lf->sharpness_level) { + update_sharpness(lfi, lf->sharpness_level); + lf->last_sharpness_level = lf->sharpness_level; + } + + for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { + int lvl_seg = default_filt_lvl; + if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { + const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF); + lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ? + data : default_filt_lvl + data, + 0, MAX_LOOP_FILTER); + } + + if (!lf->mode_ref_delta_enabled) { + // we could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas + memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id])); + } else { + int ref, mode; + const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; + lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); + + for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) { + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + + lf->mode_deltas[mode] * scale; + lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } + } + } + } +} + +static void filter_selectively_vert_row2(int subsampling_factor, + uint8_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, + const uint8_t *lfl) { + const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff; + const int lfl_forward = subsampling_factor ? 
4 : 8; + const unsigned int dual_one = 1 | (1 << lfl_forward); + unsigned int mask; + uint8_t *ss[2]; + ss[0] = s; + + for (mask = + (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff; + mask; mask = (mask & ~dual_one) >> 1) { + if (mask & dual_one) { + const loop_filter_thresh *lfis[2]; + lfis[0] = lfthr + *lfl; + lfis[1] = lfthr + *(lfl + lfl_forward); + ss[1] = ss[0] + 8 * pitch; + + if (mask_16x16 & dual_one) { + if ((mask_16x16 & dual_one) == dual_one) { + vpx_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, + lfis[0]->hev_thr); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)]; + vpx_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr); + } + } + + if (mask_8x8 & dual_one) { + if ((mask_8x8 & dual_one) == dual_one) { + vpx_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, + lfis[0]->hev_thr, lfis[1]->mblim, + lfis[1]->lim, lfis[1]->hev_thr); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)]; + vpx_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } + + if (mask_4x4 & dual_one) { + if ((mask_4x4 & dual_one) == dual_one) { + vpx_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, + lfis[0]->hev_thr, lfis[1]->mblim, + lfis[1]->lim, lfis[1]->hev_thr); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)]; + vpx_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } + + if (mask_4x4_int & dual_one) { + if ((mask_4x4_int & dual_one) == dual_one) { + vpx_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, + lfis[1]->hev_thr); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)]; + vpx_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr); + } + } + } + + ss[0] += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_filter_selectively_vert_row2(int subsampling_factor, + uint16_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, + const uint8_t *lfl, int bd) { + const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff; + const int lfl_forward = subsampling_factor ? 
4 : 8; + const unsigned int dual_one = 1 | (1 << lfl_forward); + unsigned int mask; + uint16_t *ss[2]; + ss[0] = s; + + for (mask = + (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff; + mask; mask = (mask & ~dual_one) >> 1) { + if (mask & dual_one) { + const loop_filter_thresh *lfis[2]; + lfis[0] = lfthr + *lfl; + lfis[1] = lfthr + *(lfl + lfl_forward); + ss[1] = ss[0] + 8 * pitch; + + if (mask_16x16 & dual_one) { + if ((mask_16x16 & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, bd); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)]; + vpx_highbd_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + + if (mask_8x8 & dual_one) { + if ((mask_8x8 & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, + lfis[1]->hev_thr, bd); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)]; + vpx_highbd_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + + if (mask_4x4 & dual_one) { + if ((mask_4x4 & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, + lfis[1]->hev_thr, bd); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)]; + vpx_highbd_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + + if (mask_4x4_int & dual_one) { + if ((mask_4x4_int & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, + lfis[1]->hev_thr, bd); + } else { + const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)]; + vpx_highbd_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, + lfi->mblim, lfi->lim, lfi->hev_thr, bd); + } + } + } + + ss[0] += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void filter_selectively_horiz(uint8_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, + const uint8_t *lfl) { + unsigned int mask; + int count; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= count) { + count = 1; + if (mask & 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + + if (mask_16x16 & 1) { + if ((mask_16x16 & 3) == 3) { + vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + count = 2; + } else { + vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } else if (mask_8x8 & 1) { + if ((mask_8x8 & 3) == 3) { + // Next block's thresholds. 
+ const loop_filter_thresh *lfin = lfthr + *(lfl + 1); + + vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + + if ((mask_4x4_int & 3) == 3) { + vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); + } else { + if (mask_4x4_int & 1) + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + else if (mask_4x4_int & 2) + vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr); + } + count = 2; + } else { + vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (mask_4x4_int & 1) + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & 3) == 3) { + // Next block's thresholds. + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); + + vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + if ((mask_4x4_int & 3) == 3) { + vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); + } else { + if (mask_4x4_int & 1) + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + else if (mask_4x4_int & 2) + vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr); + } + count = 2; + } else { + vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + + if (mask_4x4_int & 1) + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } else { + vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + } + } + s += 8 * count; + lfl += count; + mask_16x16 >>= count; + mask_8x8 >>= count; + mask_4x4 >>= count; + mask_4x4_int >>= count; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, + const uint8_t *lfl, int bd) { + unsigned int mask; + int count; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= count) { + count = 1; + if (mask & 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + + if (mask_16x16 & 1) { + if ((mask_16x16 & 3) == 3) { + vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + count = 2; + } else { + vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } else if (mask_8x8 & 1) { + if ((mask_8x8 & 3) == 3) { + // Next block's thresholds. 
+ const loop_filter_thresh *lfin = lfthr + *(lfl + 1); + + vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + + if ((mask_4x4_int & 3) == 3) { + vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, + lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + } else { + if (mask_4x4_int & 1) { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } else if (mask_4x4_int & 2) { + vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + } + count = 2; + } else { + vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + + if (mask_4x4_int & 1) { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & 3) == 3) { + // Next block's thresholds. + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); + + vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + if ((mask_4x4_int & 3) == 3) { + vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, + lfin->mblim, lfin->lim, + lfin->hev_thr, bd); + } else { + if (mask_4x4_int & 1) { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } else if (mask_4x4_int & 2) { + vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + } + count = 2; + } else { + vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + + if (mask_4x4_int & 1) { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); + } + } + } else { + vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } + s += 8 * count; + lfl += count; + mask_16x16 >>= count; + mask_8x8 >>= count; + mask_4x4 >>= count; + mask_4x4_int >>= count; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +// This function ors into the current lfm structure, where to do loop +// filters for the specific mi we are looking at. It uses information +// including the block_size_type (32x16, 32x32, etc.), the transform size, +// whether there were any coefficients encoded, and the loop filter strength +// block we are currently looking at. Shift is used to position the +// 1's we produce. +static void build_masks(const loop_filter_info_n *const lfi_n, + const MODE_INFO *mi, const int shift_y, + const int shift_uv, + LOOP_FILTER_MASK *lfm) { + const BLOCK_SIZE block_size = mi->sb_type; + const TX_SIZE tx_size_y = mi->tx_size; + const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1); + const int filter_level = get_filter_level(lfi_n, mi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + uint16_t *const left_uv = &lfm->left_uv[tx_size_uv]; + uint16_t *const above_uv = &lfm->above_uv[tx_size_uv]; + uint16_t *const int_4x4_uv = &lfm->int_4x4_uv; + int i; + + // If filter level is 0 we don't loop filter. 
+ if (!filter_level) { + return; + } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; + int index = shift_y; + for (i = 0; i < h; i++) { + memset(&lfm->lfl_y[index], filter_level, w); + index += 8; + } + } + + // These set 1 in the current block size for the block size edges. + // For instance if the block size is 32x16, we'll set: + // above = 1111 + // 0000 + // and + // left = 1000 + // = 1000 + // NOTE : In this example the low bit is left most ( 1000 ) is stored as + // 1, not 8... + // + // U and V set things on a 16 bit scale. + // + *above_y |= above_prediction_mask[block_size] << shift_y; + *above_uv |= above_prediction_mask_uv[block_size] << shift_uv; + *left_y |= left_prediction_mask[block_size] << shift_y; + *left_uv |= left_prediction_mask_uv[block_size] << shift_uv; + + // If the block has no coefficients and is not intra we skip applying + // the loop filter on block edges. + if (mi->skip && is_inter_block(mi)) + return; + + // Here we are adding a mask for the transform size. The transform + // size mask is set to be correct for a 64x64 prediction block size. We + // mask to match the size of the block we are working on and then shift it + // into place.. + *above_y |= (size_mask[block_size] & + above_64x64_txform_mask[tx_size_y]) << shift_y; + *above_uv |= (size_mask_uv[block_size] & + above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv; + + *left_y |= (size_mask[block_size] & + left_64x64_txform_mask[tx_size_y]) << shift_y; + *left_uv |= (size_mask_uv[block_size] & + left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv; + + // Here we are trying to determine what to do with the internal 4x4 block + // boundaries. These differ from the 4x4 boundaries on the outside edge of + // an 8x8 in that the internal ones can be skipped and don't depend on + // the prediction block size. + if (tx_size_y == TX_4X4) + *int_4x4_y |= size_mask[block_size] << shift_y; + + if (tx_size_uv == TX_4X4) + *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; +} + +// This function does the same thing as the one above with the exception that +// it only affects the y masks. It exists because for blocks < 16x16 in size, +// we only update u and v masks on the first block. 
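+// (With 4:2:0 subsampling a 16x16 luma area shares a single 8x8 chroma block,
+// so the uv edge bits for the later sub-blocks would coincide with those
+// already set by the first one.)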
+static void build_y_mask(const loop_filter_info_n *const lfi_n, + const MODE_INFO *mi, const int shift_y, + LOOP_FILTER_MASK *lfm) { + const BLOCK_SIZE block_size = mi->sb_type; + const TX_SIZE tx_size_y = mi->tx_size; + const int filter_level = get_filter_level(lfi_n, mi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + int i; + + if (!filter_level) { + return; + } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; + int index = shift_y; + for (i = 0; i < h; i++) { + memset(&lfm->lfl_y[index], filter_level, w); + index += 8; + } + } + + *above_y |= above_prediction_mask[block_size] << shift_y; + *left_y |= left_prediction_mask[block_size] << shift_y; + + if (mi->skip && is_inter_block(mi)) + return; + + *above_y |= (size_mask[block_size] & + above_64x64_txform_mask[tx_size_y]) << shift_y; + + *left_y |= (size_mask[block_size] & + left_64x64_txform_mask[tx_size_y]) << shift_y; + + if (tx_size_y == TX_4X4) + *int_4x4_y |= size_mask[block_size] << shift_y; +} + +void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row, + const int mi_col, LOOP_FILTER_MASK *lfm) { + int i; + + // The largest loopfilter we have is 16x16 so we use the 16x16 mask + // for 32x32 transforms also. + lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32]; + lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32]; + lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32]; + lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32]; + + // We do at least 8 tap filter on every 32x32 even if the transform size + // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and + // remove it from the 4x4. + lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border; + lfm->left_y[TX_4X4] &= ~left_border; + lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border; + lfm->above_y[TX_4X4] &= ~above_border; + lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv; + lfm->left_uv[TX_4X4] &= ~left_border_uv; + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv; + lfm->above_uv[TX_4X4] &= ~above_border_uv; + + // We do some special edge handling. + if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) { + const uint64_t rows = cm->mi_rows - mi_row; + + // Each pixel inside the border gets a 1, + const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1); + const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1); + + // Remove values completely outside our border. + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= mask_y; + lfm->above_y[i] &= mask_y; + lfm->left_uv[i] &= mask_uv; + lfm->above_uv[i] &= mask_uv; + } + lfm->int_4x4_y &= mask_y; + lfm->int_4x4_uv &= mask_uv; + + // We don't apply a wide loop filter on the last uv block row. If set + // apply the shorter one instead. + if (rows == 1) { + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16]; + lfm->above_uv[TX_16X16] = 0; + } + if (rows == 5) { + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00; + lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00); + } + } + + if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) { + const uint64_t columns = cm->mi_cols - mi_col; + + // Each pixel inside the border gets a 1, the multiply copies the border + // to where we need it. 
+ const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101ULL; + const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111; + + // Internal edges are not applied on the last column of the image so + // we mask 1 more for the internal edges + const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111; + + // Remove the bits outside the image edge. + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= mask_y; + lfm->above_y[i] &= mask_y; + lfm->left_uv[i] &= mask_uv; + lfm->above_uv[i] &= mask_uv; + } + lfm->int_4x4_y &= mask_y; + lfm->int_4x4_uv &= mask_uv_int; + + // We don't apply a wide loop filter on the last uv column. If set + // apply the shorter one instead. + if (columns == 1) { + lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16]; + lfm->left_uv[TX_16X16] = 0; + } + if (columns == 5) { + lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc); + lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc); + } + } + // We don't apply a loop filter on the first column in the image, mask that + // out. + if (mi_col == 0) { + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= 0xfefefefefefefefeULL; + lfm->left_uv[i] &= 0xeeee; + } + } + + // Assert if we try to apply 2 different loop filters at the same position. + assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8])); + assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4])); + assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4])); + assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16])); + assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8])); + assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4])); + assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4])); + assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16])); + assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8])); + assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4])); + assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4])); + assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16])); + assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8])); + assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4])); + assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4])); + assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16])); +} + +// This function sets up the bit masks for the entire 64x64 region represented +// by mi_row, mi_col. +void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, + MODE_INFO **mi, const int mode_info_stride, + LOOP_FILTER_MASK *lfm) { + int idx_32, idx_16, idx_8; + const loop_filter_info_n *const lfi_n = &cm->lf_info; + MODE_INFO **mip = mi; + MODE_INFO **mip2 = mi; + + // These are offsets to the next mi in the 64x64 block. It is what gets + // added to the mi ptr as we go through each loop. It helps us to avoid + // setting up special row and column counters for each index. The last step + // brings us out back to the starting position. + const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4, + -(mode_info_stride << 2) - 4}; + const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2, + -(mode_info_stride << 1) - 2}; + const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1}; + + // Following variables represent shifts to position the current block + // mask over the appropriate block. A shift of 36 to the left will move + // the bits for the final 32 by 32 block in the 64x64 up 4 rows and left + // 4 rows to the appropriate spot. 
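+  // (e.g. shift_32_y[3] == 36 below: the bottom-right 32x32 block sits 4 rows
+  // down and 4 columns across in the 8x8 grid, i.e. bit 4 * 8 + 4 == 36.)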
+ const int shift_32_y[] = {0, 4, 32, 36}; + const int shift_16_y[] = {0, 2, 16, 18}; + const int shift_8_y[] = {0, 1, 8, 9}; + const int shift_32_uv[] = {0, 2, 8, 10}; + const int shift_16_uv[] = {0, 1, 4, 5}; + const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ? + cm->mi_rows - mi_row : MI_BLOCK_SIZE); + const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ? + cm->mi_cols - mi_col : MI_BLOCK_SIZE); + + vp9_zero(*lfm); + assert(mip[0] != NULL); + + switch (mip[0]->sb_type) { + case BLOCK_64X64: + build_masks(lfi_n, mip[0] , 0, 0, lfm); + break; + case BLOCK_64X32: + build_masks(lfi_n, mip[0], 0, 0, lfm); + mip2 = mip + mode_info_stride * 4; + if (4 >= max_rows) + break; + build_masks(lfi_n, mip2[0], 32, 8, lfm); + break; + case BLOCK_32X64: + build_masks(lfi_n, mip[0], 0, 0, lfm); + mip2 = mip + 4; + if (4 >= max_cols) + break; + build_masks(lfi_n, mip2[0], 4, 2, lfm); + break; + default: + for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { + const int shift_y = shift_32_y[idx_32]; + const int shift_uv = shift_32_uv[idx_32]; + const int mi_32_col_offset = ((idx_32 & 1) << 2); + const int mi_32_row_offset = ((idx_32 >> 1) << 2); + if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) + continue; + switch (mip[0]->sb_type) { + case BLOCK_32X32: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + break; + case BLOCK_32X16: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_32_row_offset + 2 >= max_rows) + continue; + mip2 = mip + mode_info_stride * 2; + build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm); + break; + case BLOCK_16X32: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_32_col_offset + 2 >= max_cols) + continue; + mip2 = mip + 2; + build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm); + break; + default: + for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) { + const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16]; + const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16]; + const int mi_16_col_offset = mi_32_col_offset + + ((idx_16 & 1) << 1); + const int mi_16_row_offset = mi_32_row_offset + + ((idx_16 >> 1) << 1); + + if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows) + continue; + + switch (mip[0]->sb_type) { + case BLOCK_16X16: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + break; + case BLOCK_16X8: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_16_row_offset + 1 >= max_rows) + continue; + mip2 = mip + mode_info_stride; + build_y_mask(lfi_n, mip2[0], shift_y+8, lfm); + break; + case BLOCK_8X16: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_16_col_offset +1 >= max_cols) + continue; + mip2 = mip + 1; + build_y_mask(lfi_n, mip2[0], shift_y+1, lfm); + break; + default: { + const int shift_y = shift_32_y[idx_32] + + shift_16_y[idx_16] + + shift_8_y[0]; + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + mip += offset[0]; + for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { + const int shift_y = shift_32_y[idx_32] + + shift_16_y[idx_16] + + shift_8_y[idx_8]; + const int mi_8_col_offset = mi_16_col_offset + + ((idx_8 & 1)); + const int mi_8_row_offset = mi_16_row_offset + + ((idx_8 >> 1)); + + if (mi_8_col_offset >= max_cols || + mi_8_row_offset >= max_rows) + continue; + build_y_mask(lfi_n, mip[0], shift_y, lfm); + } + break; + } + } + } + break; + } + } + break; + } +} + +static void filter_selectively_vert(uint8_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int 
mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, + const uint8_t *lfl) { + unsigned int mask; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + + if (mask & 1) { + if (mask_16x16 & 1) { + vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } else if (mask_8x8 & 1) { + vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } else if (mask_4x4 & 1) { + vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } + if (mask_4x4_int & 1) + vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + s += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_filter_selectively_vert(uint16_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, + const uint8_t *lfl, int bd) { + unsigned int mask; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + + if (mask & 1) { + if (mask_16x16 & 1) { + vpx_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } else if (mask_8x8 & 1) { + vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } else if (mask_4x4 & 1) { + vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } + if (mask_4x4_int & 1) + vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + s += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_filter_block_plane_non420(VP9_COMMON *cm, + struct macroblockd_plane *plane, + MODE_INFO **mi_8x8, + int mi_row, int mi_col) { + const int ss_x = plane->subsampling_x; + const int ss_y = plane->subsampling_y; + const int row_step = 1 << ss_y; + const int col_step = 1 << ss_x; + const int row_step_stride = cm->mi_stride * row_step; + struct buf_2d *const dst = &plane->dst; + uint8_t* const dst0 = dst->buf; + unsigned int mask_16x16[MI_BLOCK_SIZE] = {0}; + unsigned int mask_8x8[MI_BLOCK_SIZE] = {0}; + unsigned int mask_4x4[MI_BLOCK_SIZE] = {0}; + unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; + uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE]; + int r, c; + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + unsigned int mask_16x16_c = 0; + unsigned int mask_8x8_c = 0; + unsigned int mask_4x4_c = 0; + unsigned int border_mask; + + // Determine the vertical edges that need filtering + for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { + const MODE_INFO *mi = mi_8x8[c]; + const BLOCK_SIZE sb_type = mi[0].sb_type; + const int skip_this = mi[0].skip && is_inter_block(mi); + // left edge of current unit is block/partition edge -> no skip + const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ? + !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1; + const int skip_this_c = skip_this && !block_edge_left; + // top edge of current unit is block/partition edge -> no skip + const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ? 
+ !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1; + const int skip_this_r = skip_this && !block_edge_above; + const TX_SIZE tx_size = get_uv_tx_size(mi, plane); + const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; + const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; + + // Filter level can vary per MI + if (!(lfl[(r << 3) + (c >> ss_x)] = + get_filter_level(&cm->lf_info, mi))) + continue; + + // Build masks based on the transform size of each block + if (tx_size == TX_32X32) { + if (!skip_this_c && ((c >> ss_x) & 3) == 0) { + if (!skip_border_4x4_c) + mask_16x16_c |= 1 << (c >> ss_x); + else + mask_8x8_c |= 1 << (c >> ss_x); + } + if (!skip_this_r && ((r >> ss_y) & 3) == 0) { + if (!skip_border_4x4_r) + mask_16x16[r] |= 1 << (c >> ss_x); + else + mask_8x8[r] |= 1 << (c >> ss_x); + } + } else if (tx_size == TX_16X16) { + if (!skip_this_c && ((c >> ss_x) & 1) == 0) { + if (!skip_border_4x4_c) + mask_16x16_c |= 1 << (c >> ss_x); + else + mask_8x8_c |= 1 << (c >> ss_x); + } + if (!skip_this_r && ((r >> ss_y) & 1) == 0) { + if (!skip_border_4x4_r) + mask_16x16[r] |= 1 << (c >> ss_x); + else + mask_8x8[r] |= 1 << (c >> ss_x); + } + } else { + // force 8x8 filtering on 32x32 boundaries + if (!skip_this_c) { + if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0) + mask_8x8_c |= 1 << (c >> ss_x); + else + mask_4x4_c |= 1 << (c >> ss_x); + } + + if (!skip_this_r) { + if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0) + mask_8x8[r] |= 1 << (c >> ss_x); + else + mask_4x4[r] |= 1 << (c >> ss_x); + } + + if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c) + mask_4x4_int[r] |= 1 << (c >> ss_x); + } + } + + // Disable filtering on the leftmost column + border_mask = ~(mi_col == 0); +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_vert(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, + mask_16x16_c & border_mask, + mask_8x8_c & border_mask, + mask_4x4_c & border_mask, + mask_4x4_int[r], + cm->lf_info.lfthr, &lfl[r << 3], + (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_vert(dst->buf, dst->stride, + mask_16x16_c & border_mask, + mask_8x8_c & border_mask, + mask_4x4_c & border_mask, + mask_4x4_int[r], + cm->lf_info.lfthr, &lfl[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + dst->buf += 8 * dst->stride; + mi_8x8 += row_step_stride; + } + + // Now do horizontal pass + dst->buf = dst0; + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; + const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 
0 : mask_4x4_int[r]; + + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16[r]; + mask_8x8_r = mask_8x8[r]; + mask_4x4_r = mask_4x4[r]; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, + mask_16x16_r, + mask_8x8_r, + mask_4x4_r, + mask_4x4_int_r, + cm->lf_info.lfthr, &lfl[r << 3], + (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_horiz(dst->buf, dst->stride, + mask_16x16_r, + mask_8x8_r, + mask_4x4_r, + mask_4x4_int_r, + cm->lf_info.lfthr, &lfl[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + dst->buf += 8 * dst->stride; + } +} + +void vp9_filter_block_plane_ss00(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm) { + struct buf_2d *const dst = &plane->dst; + uint8_t *const dst0 = dst->buf; + int r; + uint64_t mask_16x16 = lfm->left_y[TX_16X16]; + uint64_t mask_8x8 = lfm->left_y[TX_8X8]; + uint64_t mask_4x4 = lfm->left_y[TX_4X4]; + uint64_t mask_4x4_int = lfm->int_4x4_y; + + assert(plane->subsampling_x == 0 && plane->subsampling_y == 0); + + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + // Disable filtering on the leftmost column. +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_vert_row2(plane->subsampling_x, + CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, + (unsigned int)mask_16x16, + (unsigned int)mask_8x8, + (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, + cm->lf_info.lfthr, + &lfm->lfl_y[r << 3], + (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride, + (unsigned int)mask_16x16, + (unsigned int)mask_8x8, + (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, + cm->lf_info.lfthr, &lfm->lfl_y[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + dst->buf += 16 * dst->stride; + mask_16x16 >>= 16; + mask_8x8 >>= 16; + mask_4x4 >>= 16; + mask_4x4_int >>= 16; + } + + // Horizontal pass + dst->buf = dst0; + mask_16x16 = lfm->above_y[TX_16X16]; + mask_8x8 = lfm->above_y[TX_8X8]; + mask_4x4 = lfm->above_y[TX_4X4]; + mask_4x4_int = lfm->int_4x4_y; + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) { + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16 & 0xff; + mask_8x8_r = mask_8x8 & 0xff; + mask_4x4_r = mask_4x4 & 0xff; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int & 0xff, + cm->lf_info.lfthr, &lfm->lfl_y[r << 3], + (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int & 0xff, + cm->lf_info.lfthr, &lfm->lfl_y[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + mask_4x4_int >>= 8; + } +} + +void vp9_filter_block_plane_ss11(VP9_COMMON *const cm, + struct 
macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm) { + struct buf_2d *const dst = &plane->dst; + uint8_t *const dst0 = dst->buf; + int r, c; + uint8_t lfl_uv[16]; + + uint16_t mask_16x16 = lfm->left_uv[TX_16X16]; + uint16_t mask_8x8 = lfm->left_uv[TX_8X8]; + uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; + uint16_t mask_4x4_int = lfm->int_4x4_uv; + + assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); + + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) { + for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) { + lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)]; + lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)]; + } + + // Disable filtering on the leftmost column. +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_vert_row2(plane->subsampling_x, + CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, + (unsigned int)mask_16x16, + (unsigned int)mask_8x8, + (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, + cm->lf_info.lfthr, &lfl_uv[r << 1], + (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride, + (unsigned int)mask_16x16, + (unsigned int)mask_8x8, + (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, + cm->lf_info.lfthr, &lfl_uv[r << 1]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + dst->buf += 16 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + mask_4x4_int >>= 8; + } + + // Horizontal pass + dst->buf = dst0; + mask_16x16 = lfm->above_uv[TX_16X16]; + mask_8x8 = lfm->above_uv[TX_8X8]; + mask_4x4 = lfm->above_uv[TX_4X4]; + mask_4x4_int = lfm->int_4x4_uv; + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1; + const unsigned int mask_4x4_int_r = + skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf); + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16 & 0xf; + mask_8x8_r = mask_8x8 & 0xf; + mask_4x4_r = mask_4x4 & 0xf; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int_r, + cm->lf_info.lfthr, &lfl_uv[r << 1], + (int)cm->bit_depth); + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr, + &lfl_uv[r << 1]); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 4; + mask_8x8 >>= 4; + mask_4x4 >>= 4; + mask_4x4_int >>= 4; + } +} + +static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only) { + const int num_planes = y_only ? 
1 : MAX_MB_PLANE; + enum lf_path path; + int mi_row, mi_col; + + if (y_only) + path = LF_PATH_444; + else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) + path = LF_PATH_420; + else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) + path = LF_PATH_444; + else + path = LF_PATH_SLOW; + + for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0); + + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) { + int plane; + + vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); + + // TODO(jimbankoski): For 444 only need to do y mask. + vp9_adjust_mask(cm, mi_row, mi_col, lfm); + + vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm); + for (plane = 1; plane < num_planes; ++plane) { + switch (path) { + case LF_PATH_420: + vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_444: + vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_SLOW: + vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); + break; + } + } + } + } +} + +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, MACROBLOCKD *xd, + int frame_filter_level, + int y_only, int partial_frame) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + if (!frame_filter_level) return; + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial_frame && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only); +} + +// Used by the encoder to build the loopfilter masks. +// TODO(slavarnway): Do the encoder the same way the decoder does it and +// build the masks in line as part of the encode process. +void vp9_build_mask_frame(VP9_COMMON *cm, int frame_filter_level, + int partial_frame) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + int mi_col, mi_row; + if (!frame_filter_level) return; + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial_frame && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + + vp9_loop_filter_frame_init(cm, frame_filter_level); + + for (mi_row = start_mi_row; mi_row < end_mi_row; mi_row += MI_BLOCK_SIZE) { + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + // vp9_setup_mask() zeros lfm + vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, + get_lfm(&cm->lf, mi_row, mi_col)); + } + } +} + +// 8x8 blocks in a superblock. A "1" represents the first block in a 16x16 +// or greater area. +static const uint8_t first_block_in_16x16[8][8] = { + {1, 0, 1, 0, 1, 0, 1, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}, + {1, 0, 1, 0, 1, 0, 1, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}, + {1, 0, 1, 0, 1, 0, 1, 0}, + {0, 0, 0, 0, 0, 0, 0, 0}, + {1, 0, 1, 0, 1, 0, 1, 0}, + {0, 0, 0, 0, 0, 0, 0, 0} +}; + +// This function sets up the bit masks for a block represented +// by mi_row, mi_col in a 64x64 region. +// TODO(SJL): This function only works for yv12. 
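[Editor's aside: illustrative sketch, not part of the patch.] Looking back at vp9_loop_filter_frame() and vp9_build_mask_frame() above: both select the same partial-frame band, a superblock-aligned start near the middle of the frame and roughly an eighth of the frame's mi rows, never fewer than 8. The frame size below is an arbitrary example:

#include <assert.h>

#define VPXMAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
  const int mi_rows = 135; /* e.g. 1080 pixel rows -> 135 8-pixel mi rows */
  int start_mi_row = 0;
  int mi_rows_to_filter = mi_rows;
  if (mi_rows > 8) { /* the partial_frame path */
    start_mi_row = (mi_rows >> 1) & 0xfffffff8; /* superblock-aligned middle */
    mi_rows_to_filter = VPXMAX(mi_rows / 8, 8);
  }
  assert(start_mi_row == 64 && mi_rows_to_filter == 16); /* band [64, 80) */
  return 0;
}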
+void vp9_build_mask(VP9_COMMON *cm, const MODE_INFO *mi, int mi_row, + int mi_col, int bw, int bh) { + const BLOCK_SIZE block_size = mi->sb_type; + const TX_SIZE tx_size_y = mi->tx_size; + const loop_filter_info_n *const lfi_n = &cm->lf_info; + const int filter_level = get_filter_level(lfi_n, mi); + const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1); + LOOP_FILTER_MASK *const lfm = get_lfm(&cm->lf, mi_row, mi_col); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + uint16_t *const left_uv = &lfm->left_uv[tx_size_uv]; + uint16_t *const above_uv = &lfm->above_uv[tx_size_uv]; + uint16_t *const int_4x4_uv = &lfm->int_4x4_uv; + const int row_in_sb = (mi_row & 7); + const int col_in_sb = (mi_col & 7); + const int shift_y = col_in_sb + (row_in_sb << 3); + const int shift_uv = (col_in_sb >> 1) + ((row_in_sb >> 1) << 2); + const int build_uv = first_block_in_16x16[row_in_sb][col_in_sb]; + + if (!filter_level) { + return; + } else { + int index = shift_y; + int i; + for (i = 0; i < bh; i++) { + memset(&lfm->lfl_y[index], filter_level, bw); + index += 8; + } + } + + // These set 1 in the current block size for the block size edges. + // For instance if the block size is 32x16, we'll set: + // above = 1111 + // 0000 + // and + // left = 1000 + // = 1000 + // NOTE : In this example the low bit is left most ( 1000 ) is stored as + // 1, not 8... + // + // U and V set things on a 16 bit scale. + // + *above_y |= above_prediction_mask[block_size] << shift_y; + *left_y |= left_prediction_mask[block_size] << shift_y; + + if (build_uv) { + *above_uv |= above_prediction_mask_uv[block_size] << shift_uv; + *left_uv |= left_prediction_mask_uv[block_size] << shift_uv; + } + + // If the block has no coefficients and is not intra we skip applying + // the loop filter on block edges. + if (mi->skip && is_inter_block(mi)) + return; + + // Add a mask for the transform size. The transform size mask is set to + // be correct for a 64x64 prediction block size. Mask to match the size of + // the block we are working on and then shift it into place. + *above_y |= (size_mask[block_size] & + above_64x64_txform_mask[tx_size_y]) << shift_y; + *left_y |= (size_mask[block_size] & + left_64x64_txform_mask[tx_size_y]) << shift_y; + + if (build_uv) { + *above_uv |= (size_mask_uv[block_size] & + above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv; + + *left_uv |= (size_mask_uv[block_size] & + left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv; + } + + // Try to determine what to do with the internal 4x4 block boundaries. These + // differ from the 4x4 boundaries on the outside edge of an 8x8 in that the + // internal ones can be skipped and don't depend on the prediction block size. 
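[Editor's aside: illustrative sketch, not part of the patch; the two mask values are the editor's reading of the 32x16 example, not quoted from the tables.] The NOTE in vp9_build_mask() above ("1000 ... stored as 1, not 8") is easy to misread: bit 0 of each row is the leftmost 8x8 block, so the printed pattern reads left to right while the integer's low bit comes first.

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* A 32x16 block is 4 blocks wide, 2 tall: top edge = low 4 bits of row 0,
     left edge = bit 0 of rows 0 and 1. */
  const uint64_t above_32x16 = 0x0fULL;  /* the comment's "1111" */
  const uint64_t left_32x16 = 0x0101ULL; /* "1000" twice, low bit leftmost */
  assert(above_32x16 & 1ULL);       /* leftmost column is bit 0, not bit 3 */
  assert(left_32x16 & (1ULL << 8)); /* the second row starts at bit 8 */
  return 0;
}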
+ if (tx_size_y == TX_4X4) + *int_4x4_y |= size_mask[block_size] << shift_y; + + if (build_uv && tx_size_uv == TX_4X4) + *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; +} + +void vp9_loop_filter_data_reset( + LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, + struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) { + lf_data->frame_buffer = frame_buffer; + lf_data->cm = cm; + lf_data->start = 0; + lf_data->stop = 0; + lf_data->y_only = 0; + memcpy(lf_data->planes, planes, sizeof(lf_data->planes)); +} + +void vp9_reset_lfm(VP9_COMMON *const cm) { + if (cm->lf.filter_level) { + memset(cm->lf.lfm, 0, + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride * + sizeof(*cm->lf.lfm)); + } +} + +int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { + (void)unused; + loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only); + return 1; +} diff --git a/thirdparty/libvpx/vp9/common/vp9_loopfilter.h b/thirdparty/libvpx/vp9/common/vp9_loopfilter.h new file mode 100644 index 000000000000..fca8830fa19b --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_loopfilter.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_LOOPFILTER_H_ +#define VP9_COMMON_VP9_LOOPFILTER_H_ + +#include "vpx_ports/mem.h" +#include "./vpx_config.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_seg_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LOOP_FILTER 63 +#define MAX_SHARPNESS 7 + +#define SIMD_WIDTH 16 + +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 2 + +enum lf_path { + LF_PATH_420, + LF_PATH_444, + LF_PATH_SLOW, +}; + +// Need to align this structure so when it is declared and +// passed it can be loaded into vector registers. +typedef struct { + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); +} loop_filter_thresh; + +typedef struct { + loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; + uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; +} loop_filter_info_n; + +// This structure holds bit masks for all 8x8 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. +// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. Above_ entries refer to whether or not to apply a +// filter on the above border. Int_ entries refer to whether or not to +// apply borders on the 4x4 edges within the 8x8 block that each bit +// represents. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. 
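[Editor's aside: illustrative sketch, not part of the patch.] The 64-bit y masks in the struct below hold one bit per 8x8 block of the superblock's 8x8 grid; the 16-bit uv masks hold the 4x4 chroma grid. vp9_adjust_mask() in the .c file above builds whole-column patterns for them with a multiply: an n-bit pattern times 0x0101010101010101 (or 0x1111 for uv) replicates it into every row:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int columns = 5; /* 8x8 column blocks still inside the image */
  const uint64_t mask_y = ((1ULL << columns) - 1) * 0x0101010101010101ULL;
  const uint16_t mask_uv =
      (uint16_t)(((1 << ((columns + 1) >> 1)) - 1) * 0x1111);
  assert(mask_y == 0x1f1f1f1f1f1f1f1fULL); /* low 5 bits of every row byte */
  assert(mask_uv == 0x7777);               /* 3 of 4 uv columns, all 4 rows */
  return 0;
}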
+typedef struct { + uint64_t left_y[TX_SIZES]; + uint64_t above_y[TX_SIZES]; + uint64_t int_4x4_y; + uint16_t left_uv[TX_SIZES]; + uint16_t above_uv[TX_SIZES]; + uint16_t int_4x4_uv; + uint8_t lfl_y[64]; +} LOOP_FILTER_MASK; + +struct loopfilter { + int filter_level; + int last_filt_level; + + int sharpness_level; + int last_sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, GF, ARF + signed char ref_deltas[MAX_REF_LF_DELTAS]; + signed char last_ref_deltas[MAX_REF_LF_DELTAS]; + + // 0 = ZERO_MV, MV + signed char mode_deltas[MAX_MODE_LF_DELTAS]; + signed char last_mode_deltas[MAX_MODE_LF_DELTAS]; + + LOOP_FILTER_MASK *lfm; + int lfm_stride; +}; + +/* assorted loopfilter functions which get used elsewhere */ +struct VP9Common; +struct macroblockd; +struct VP9LfSyncData; + +// This function sets up the bit masks for the entire 64x64 region represented +// by mi_row, mi_col. +void vp9_setup_mask(struct VP9Common *const cm, + const int mi_row, const int mi_col, + MODE_INFO **mi_8x8, const int mode_info_stride, + LOOP_FILTER_MASK *lfm); + +void vp9_filter_block_plane_ss00(struct VP9Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm); + +void vp9_filter_block_plane_ss11(struct VP9Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm); + +void vp9_filter_block_plane_non420(struct VP9Common *cm, + struct macroblockd_plane *plane, + MODE_INFO **mi_8x8, + int mi_row, int mi_col); + +void vp9_loop_filter_init(struct VP9Common *cm); + +// Update the loop filter for the current frame. +// This should be called before vp9_loop_filter_frame(), vp9_build_mask_frame() +// calls this function directly. +void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); + +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, + struct VP9Common *cm, + struct macroblockd *mbd, + int filter_level, + int y_only, int partial_frame); + +// Get the superblock lfm for a given mi_row, mi_col. +static INLINE LOOP_FILTER_MASK *get_lfm(const struct loopfilter *lf, + const int mi_row, const int mi_col) { + return &lf->lfm[(mi_col >> 3) + ((mi_row >> 3) * lf->lfm_stride)]; +} + +void vp9_build_mask(struct VP9Common *cm, const MODE_INFO *mi, int mi_row, + int mi_col, int bw, int bh); +void vp9_adjust_mask(struct VP9Common *const cm, const int mi_row, + const int mi_col, LOOP_FILTER_MASK *lfm); +void vp9_build_mask_frame(struct VP9Common *cm, int frame_filter_level, + int partial_frame); +void vp9_reset_lfm(struct VP9Common *const cm); + +typedef struct LoopFilterWorkerData { + YV12_BUFFER_CONFIG *frame_buffer; + struct VP9Common *cm; + struct macroblockd_plane planes[MAX_MB_PLANE]; + + int start; + int stop; + int y_only; +} LFWorkerData; + +void vp9_loop_filter_data_reset( + LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, + struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]); + +// Operates on the rows described by 'lf_data'. +int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_mv.h b/thirdparty/libvpx/vp9/common/vp9_mv.h new file mode 100644 index 000000000000..5d89da8c25a4 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_mv.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_MV_H_ +#define VP9_COMMON_VP9_MV_H_ + +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct mv { + int16_t row; + int16_t col; +} MV; + +typedef union int_mv { + uint32_t as_int; + MV as_mv; +} int_mv; /* facilitates faster equality tests and copies */ + +typedef struct mv32 { + int32_t row; + int32_t col; +} MV32; + +static INLINE int is_zero_mv(const MV *mv) { + return *((const uint32_t *)mv) == 0; +} + +static INLINE int is_equal_mv(const MV *a, const MV *b) { + return *((const uint32_t *)a) == *((const uint32_t *)b); +} + +static INLINE void clamp_mv(MV *mv, int min_col, int max_col, + int min_row, int max_row) { + mv->col = clamp(mv->col, min_col, max_col); + mv->row = clamp(mv->row, min_row, max_row); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_MV_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_mvref_common.c b/thirdparty/libvpx/vp9/common/vp9_mvref_common.c new file mode 100644 index 000000000000..0eb01a51badb --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_mvref_common.c @@ -0,0 +1,201 @@ + +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_mvref_common.h" + +// This function searches the neighborhood of a given MB/SB +// to try and find candidate reference vectors. +static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int block, int mi_row, int mi_col, + uint8_t *mode_context) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + const POSITION *const mv_ref_search = mv_ref_blocks[mi->sb_type]; + int different_ref_found = 0; + int context_counter = 0; + const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ? + cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL; + const TileInfo *const tile = &xd->tile; + + // Blank the reference vector list + memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + // The nearest 2 blocks are treated differently + // if the size < 8x8 we get the mv from the bmi substructure, + // and we also need to keep a mode count. + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]; + // Keep counts for entropy encoding. 
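[Editor's aside: illustrative sketch, not part of the patch.] The counter incremented below follows the flattening scheme documented in vp9_mvref_common.h: each of the two nearest neighbours contributes 9 if intra, 3 if ZEROMV, 1 if NEWMV and 0 if NEAREST/NEARMV, and the sum indexes counter_to_context[]:

#include <assert.h>

int main(void) {
  const int intra = 9, zeromv = 3, newmv = 1, predicted = 0;
  /* Both neighbours ZEROMV: counter_to_context[6] is BOTH_ZERO. */
  assert(zeromv + zeromv == 6);
  /* One ZEROMV, one NEAREST/NEARMV: index 3, ZERO_PLUS_PREDICTED. */
  assert(zeromv + predicted == 3);
  /* Two intra neighbours: index 18, the table's last entry, BOTH_INTRA. */
  assert(intra + intra == 18);
  (void)newmv;
  return 0;
}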
+ context_counter += mode_2_counter[candidate_mi->mode]; + different_ref_found = 1; + + if (candidate_mi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block), + refmv_count, mv_ref_list, Done); + else if (candidate_mi->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block), + refmv_count, mv_ref_list, Done); + } + } + + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. + for (; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + different_ref_found = 1; + + if (candidate_mi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(candidate_mi->mv[0], refmv_count, mv_ref_list, Done); + else if (candidate_mi->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(candidate_mi->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // Check the last frame's mode and mv info. + if (cm->use_prev_frame_mvs) { + if (prev_frame_mvs->ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); + } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { + ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found) { + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + + // If the candidate is INTRA we don't want to consider its mv. + IF_DIFF_REF_FRAME_ADD_MV(candidate_mi, ref_frame, ref_sign_bias, + refmv_count, mv_ref_list, Done); + } + } + } + + // Since we still don't have a candidate we'll try the last frame. 
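[Editor's aside: illustrative sketch, not part of the patch; the struct and helper names are the editor's.] The previous-frame fallback below negates a candidate motion vector whenever its reference frame sits on the opposite side of the current frame in time, i.e. the two sign biases differ:

#include <assert.h>
#include <stdint.h>

typedef struct { int16_t row, col; } MV;

static MV flip_if_opposite_bias(MV mv, int cand_bias, int ref_bias) {
  if (cand_bias != ref_bias) {
    mv.row = (int16_t)-mv.row;
    mv.col = (int16_t)-mv.col;
  }
  return mv;
}

int main(void) {
  const MV mv = { 4, -6 };
  const MV flipped = flip_if_opposite_bias(mv, 1, 0);
  assert(flipped.row == -4 && flipped.col == 6);
  return 0;
}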
+ if (cm->use_prev_frame_mvs) { + if (prev_frame_mvs->ref_frame[0] != ref_frame && + prev_frame_mvs->ref_frame[0] > INTRA_FRAME) { + int_mv mv = prev_frame_mvs->mv[0]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done); + } + + if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME && + prev_frame_mvs->ref_frame[1] != ref_frame && + prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) { + int_mv mv = prev_frame_mvs->mv[1]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done); + } + } + + Done: + + mode_context[ref_frame] = counter_to_context[context_counter]; + + // Clamp vectors + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) + clamp_mv_ref(&mv_ref_list[i].as_mv, xd); +} + +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col, + uint8_t *mode_context) { + find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, + mi_row, mi_col, mode_context); +} + +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest_mv, + int_mv *near_mv) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp); + clamp_mv2(&mvlist[i].as_mv, xd); + } + *nearest_mv = mvlist[0]; + *near_mv = mvlist[1]; +} + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + int block, int ref, int mi_row, int mi_col, + int_mv *nearest_mv, int_mv *near_mv, + uint8_t *mode_context) { + int_mv mv_list[MAX_MV_REF_CANDIDATES]; + MODE_INFO *const mi = xd->mi[0]; + b_mode_info *bmi = mi->bmi; + int n; + + assert(MAX_MV_REF_CANDIDATES == 2); + + find_mv_refs_idx(cm, xd, mi, mi->ref_frame[ref], mv_list, block, + mi_row, mi_col, mode_context); + + near_mv->as_int = 0; + switch (block) { + case 0: + nearest_mv->as_int = mv_list[0].as_int; + near_mv->as_int = mv_list[1].as_int; + break; + case 1: + case 2: + nearest_mv->as_int = bmi[0].as_mv[ref].as_int; + for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n) + if (nearest_mv->as_int != mv_list[n].as_int) { + near_mv->as_int = mv_list[n].as_int; + break; + } + break; + case 3: { + int_mv candidates[2 + MAX_MV_REF_CANDIDATES]; + candidates[0] = bmi[1].as_mv[ref]; + candidates[1] = bmi[0].as_mv[ref]; + candidates[2] = mv_list[0]; + candidates[3] = mv_list[1]; + + nearest_mv->as_int = bmi[2].as_mv[ref].as_int; + for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) + if (nearest_mv->as_int != candidates[n].as_int) { + near_mv->as_int = candidates[n].as_int; + break; + } + break; + } + default: + assert(0 && "Invalid block index."); + } +} diff --git a/thirdparty/libvpx/vp9/common/vp9_mvref_common.h b/thirdparty/libvpx/vp9/common/vp9_mvref_common.h new file mode 100644 index 000000000000..4380843e24a4 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_mvref_common.h @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VP9_COMMON_VP9_MVREF_COMMON_H_ + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\ + VP9_INTERP_EXTEND) << 3) + +#define MVREF_NEIGHBOURS 8 + +typedef struct position { + int row; + int col; +} POSITION; + +typedef enum { + BOTH_ZERO = 0, + ZERO_PLUS_PREDICTED = 1, + BOTH_PREDICTED = 2, + NEW_PLUS_NON_INTRA = 3, + BOTH_NEW = 4, + INTRA_PLUS_NON_INTRA = 5, + BOTH_INTRA = 6, + INVALID_CASE = 9 +} motion_vector_context; + +// This is used to figure out a context for the ref blocks. The code flattens +// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by +// adding 9 for each intra block, 3 for each zero mv and 1 for each new +// motion vector. This single number is then converted into a context +// with a single lookup ( counter_to_context ). +static const int mode_2_counter[MB_MODE_COUNT] = { + 9, // DC_PRED + 9, // V_PRED + 9, // H_PRED + 9, // D45_PRED + 9, // D135_PRED + 9, // D117_PRED + 9, // D153_PRED + 9, // D207_PRED + 9, // D63_PRED + 9, // TM_PRED + 0, // NEARESTMV + 0, // NEARMV + 3, // ZEROMV + 1, // NEWMV +}; + +// There are 3^3 different combinations of 3 counts that can be either 0,1 or +// 2. However the actual count can never be greater than 2 so the highest +// counter we need is 18. 9 is an invalid counter that's never used. +static const int counter_to_context[19] = { + BOTH_PREDICTED, // 0 + NEW_PLUS_NON_INTRA, // 1 + BOTH_NEW, // 2 + ZERO_PLUS_PREDICTED, // 3 + NEW_PLUS_NON_INTRA, // 4 + INVALID_CASE, // 5 + BOTH_ZERO, // 6 + INVALID_CASE, // 7 + INVALID_CASE, // 8 + INTRA_PLUS_NON_INTRA, // 9 + INTRA_PLUS_NON_INTRA, // 10 + INVALID_CASE, // 11 + INTRA_PLUS_NON_INTRA, // 12 + INVALID_CASE, // 13 + INVALID_CASE, // 14 + INVALID_CASE, // 15 + INVALID_CASE, // 16 + INVALID_CASE, // 17 + BOTH_INTRA // 18 +}; + +static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { + // 4X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 4X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X16 + {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}}, + // 16X8 + {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, + // 16X16 + {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 16X32 + {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}}, + // 32X16 + {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X32 + {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X64 + {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}}, + // 64X32 + {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, + // 64X64 + {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}} +}; + +static const int idx_n_column_to_subblock[4][2] = { + {1, 2}, + {1, 3}, + {3, 2}, + {3, 3} +}; + +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units + +static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - 
MV_BORDER, + xd->mb_to_right_edge + MV_BORDER, + xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); +} + +// This function returns either the appropriate sub block or block's mv +// on whether the block_size < 8x8 and we have check_sub_blocks set. +static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv, + int search_col, int block_idx) { + return block_idx >= 0 && candidate->sb_type < BLOCK_8X8 + ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] + .as_mv[which_mv] + : candidate->mv[which_mv]; +} + + +// Performs mv sign inversion if indicated by the reference frame combination. +static INLINE int_mv scale_mv(const MODE_INFO *mi, int ref, + const MV_REFERENCE_FRAME this_ref_frame, + const int *ref_sign_bias) { + int_mv mv = mi->mv[ref]; + if (ref_sign_bias[mi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + return mv; +} + +// This macro is used to add a motion vector mv_ref list if it isn't +// already in the list. If it's the second motion vector it will also +// skip all additional processing and jump to Done! +#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done) \ + do { \ + if (refmv_count) { \ + if ((mv).as_int != (mv_ref_list)[0].as_int) { \ + (mv_ref_list)[(refmv_count)] = (mv); \ + goto Done; \ + } \ + } else { \ + (mv_ref_list)[(refmv_count)++] = (mv); \ + } \ + } while (0) + +// If either reference frame is different, not INTRA, and they +// are different from each other scale and add the mv to our list. +#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \ + mv_ref_list, Done) \ + do { \ + if (is_inter_block(mbmi)) { \ + if ((mbmi)->ref_frame[0] != ref_frame) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + if (has_second_ref(mbmi) && \ + (mbmi)->ref_frame[1] != ref_frame && \ + (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + } \ + } while (0) + + +// Checks that the given mi_row, mi_col and search point +// are inside the borders of the tile. +static INLINE int is_inside(const TileInfo *const tile, + int mi_col, int mi_row, int mi_rows, + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < 0 || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= mi_rows || + mi_col + mi_pos->col >= tile->mi_col_end); +} + +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); +} + +static INLINE void lower_mv_precision(MV *mv, int allow_hp) { + const int use_hp = allow_hp && use_mv_hp(mv); + if (!use_hp) { + if (mv->row & 1) + mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) + mv->col += (mv->col > 0 ? 
-1 : 1); + } +} + +typedef void (*find_mv_refs_sync)(void *const data, int mi_row); +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, int mi_row, int mi_col, + uint8_t *mode_context); + +// check a list of motion vectors by sad score using a number rows of pixels +// above and a number cols of pixels in the left to select the one with best +// score to use as ref motion vector +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv); + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + int block, int ref, int mi_row, int mi_col, + int_mv *nearest_mv, int_mv *near_mv, + uint8_t *mode_context); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_onyxc_int.h b/thirdparty/libvpx/vp9/common/vp9_onyxc_int.h new file mode 100644 index 000000000000..3fd935e628b9 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_onyxc_int.h @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_ONYXC_INT_H_ +#define VP9_COMMON_VP9_ONYXC_INT_H_ + +#include "./vpx_config.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_util/vpx_thread.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_frame_buffers.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_tile_common.h" + +#if CONFIG_VP9_POSTPROC +#include "vp9/common/vp9_postproc.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define REFS_PER_FRAME 3 + +#define REF_FRAMES_LOG2 3 +#define REF_FRAMES (1 << REF_FRAMES_LOG2) + +// 4 scratch frames for the new frames to support a maximum of 4 cores decoding +// in parallel, 3 for scaled references on the encoder. +// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number +// of framebuffers. +// TODO(jkoleszar): These 3 extra references could probably come from the +// normal reference pool. +#define FRAME_BUFFERS (REF_FRAMES + 7) + +#define FRAME_CONTEXTS_LOG2 2 +#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) + +#define NUM_PING_PONG_BUFFERS 2 + +extern const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES]; + + +typedef enum { + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} REFERENCE_MODE; + +typedef struct { + int_mv mv[2]; + MV_REFERENCE_FRAME ref_frame[2]; +} MV_REF; + +typedef struct { + int ref_count; + MV_REF *mvs; + int mi_rows; + int mi_cols; + vpx_codec_frame_buffer_t raw_frame_buffer; + YV12_BUFFER_CONFIG buf; + + // The Following variables will only be used in frame parallel decode. + + // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means + // that no FrameWorker owns, or is decoding, this buffer. 
+ VPxWorker *frame_worker_owner; + + // row and col indicate which position frame has been decoded to in real + // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX + // when the frame is fully decoded. + int row; + int col; +} RefCntBuffer; + +typedef struct BufferPool { + // Protect BufferPool from being accessed by several FrameWorkers at + // the same time during frame parallel decode. + // TODO(hkuang): Try to use atomic variable instead of locking the whole pool. +#if CONFIG_MULTITHREAD + pthread_mutex_t pool_mutex; +#endif + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + + vpx_get_frame_buffer_cb_fn_t get_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_fb_cb; + + RefCntBuffer frame_bufs[FRAME_BUFFERS]; + + // Frame buffers allocated internally by the codec. + InternalFrameBufferList int_frame_buffers; +} BufferPool; + +typedef struct VP9Common { + struct vpx_internal_error_info error; + vpx_color_space_t color_space; + vpx_color_range_t color_range; + int width; + int height; + int render_width; + int render_height; + int last_width; + int last_height; + + // TODO(jkoleszar): this implies chroma ss right now, but could vary per + // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to + // support additional planes. + int subsampling_x; + int subsampling_y; + +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth; // Marks if we need to use 16bit frame buffers. +#endif + + YV12_BUFFER_CONFIG *frame_to_show; + RefCntBuffer *prev_frame; + + // TODO(hkuang): Combine this with cur_buf in macroblockd. + RefCntBuffer *cur_frame; + + int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */ + + // Prepare ref_frame_map for the next frame. + // Only used in frame parallel decode. + int next_ref_frame_map[REF_FRAMES]; + + // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and + // roll new_fb_idx into it. + + // Each frame can reference REFS_PER_FRAME buffers + RefBuffer frame_refs[REFS_PER_FRAME]; + + int new_fb_idx; + +#if CONFIG_VP9_POSTPROC + YV12_BUFFER_CONFIG post_proc_buffer; + YV12_BUFFER_CONFIG post_proc_buffer_int; +#endif + + FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/ + FRAME_TYPE frame_type; + + int show_frame; + int last_show_frame; + int show_existing_frame; + + // Flag signaling that the frame is encoded using only INTRA modes. + uint8_t intra_only; + uint8_t last_intra_only; + + int allow_high_precision_mv; + + // Flag signaling that the frame context should be reset to default values. + // 0 or 1 implies don't reset, 2 reset just the context specified in the + // frame header, 3 reset all contexts. + int reset_frame_context; + + // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in + // MODE_INFO (8-pixel) units. + int MBs; + int mb_rows, mi_rows; + int mb_cols, mi_cols; + int mi_stride; + + /* profile settings */ + TX_MODE tx_mode; + + int base_qindex; + int y_dc_delta_q; + int uv_dc_delta_q; + int uv_ac_delta_q; + int16_t y_dequant[MAX_SEGMENTS][2]; + int16_t uv_dequant[MAX_SEGMENTS][2]; + + /* We allocate a MODE_INFO struct for each macroblock, together with + an extra row on top and column on the left to simplify prediction. */ + int mi_alloc_size; + MODE_INFO *mip; /* Base of allocated array */ + MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ + + // TODO(agrange): Move prev_mi into encoder structure. + // prev_mip and prev_mi will only be allocated in VP9 encoder. 
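[Editor's aside: illustrative sketch, not part of the patch; the one-row-plus-one-column origin is the editor's assumption from the comment a few fields up, not shown in this hunk.] The "extra row on top and column on the left" around the mip allocation means the visible origin mi would sit one stride plus one entry into the array, with the stride padded by calc_mi_size() later in this header:

#include <assert.h>

#define MI_BLOCK_SIZE 8

static int calc_mi_size(int len) { /* mirrors calc_mi_size() below */
  return len + MI_BLOCK_SIZE;
}

int main(void) {
  const int mi_cols = 100;
  const int mi_stride = calc_mi_size(mi_cols);
  const int mi_offset = mi_stride + 1; /* skip border row + border column */
  assert(mi_stride == 108 && mi_offset == 109);
  return 0;
}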
+ MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ + MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ + + // Separate mi functions between encoder and decoder. + int (*alloc_mi)(struct VP9Common *cm, int mi_size); + void (*free_mi)(struct VP9Common *cm); + void (*setup_mi)(struct VP9Common *cm); + + // Grid of pointers to 8x8 MODE_INFO structs. Any 8x8 not in the visible + // area will be NULL. + MODE_INFO **mi_grid_base; + MODE_INFO **mi_grid_visible; + MODE_INFO **prev_mi_grid_base; + MODE_INFO **prev_mi_grid_visible; + + // Whether to use previous frame's motion vectors for prediction. + int use_prev_frame_mvs; + + // Persistent mb segment id map used in prediction. + int seg_map_idx; + int prev_seg_map_idx; + + uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS]; + uint8_t *last_frame_seg_map; + uint8_t *current_frame_seg_map; + int seg_map_alloc_size; + + INTERP_FILTER interp_filter; + + loop_filter_info_n lf_info; + + int refresh_frame_context; /* Two state 0 = NO, 1 = YES */ + + int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ + + struct loopfilter lf; + struct segmentation seg; + + // TODO(hkuang): Remove this as it is the same as frame_parallel_decode + // in pbi. + int frame_parallel_decode; // frame-based threading. + + // Context probabilities for reference frame prediction + MV_REFERENCE_FRAME comp_fixed_ref; + MV_REFERENCE_FRAME comp_var_ref[2]; + REFERENCE_MODE reference_mode; + + FRAME_CONTEXT *fc; /* this frame entropy */ + FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS + unsigned int frame_context_idx; /* Context to use/update */ + FRAME_COUNTS counts; + + unsigned int current_video_frame; + BITSTREAM_PROFILE profile; + + // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3. + vpx_bit_depth_t bit_depth; + vpx_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer + +#if CONFIG_VP9_POSTPROC + struct postproc_state postproc_state; +#endif + + int error_resilient_mode; + int frame_parallel_decoding_mode; + + int log2_tile_cols, log2_tile_rows; + int byte_alignment; + int skip_loop_filter; + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + vpx_get_frame_buffer_cb_fn_t get_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_fb_cb; + + // Handles memory for the codec. + InternalFrameBufferList int_frame_buffers; + + // External BufferPool passed from outside. + BufferPool *buffer_pool; + + PARTITION_CONTEXT *above_seg_context; + ENTROPY_CONTEXT *above_context; + int above_context_alloc_cols; +} VP9_COMMON; + +// TODO(hkuang): Don't need to lock the whole pool after implementing atomic +// frame reference count. 
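[Editor's aside: illustrative sketch, not part of the patch.] The pool accessors declared next all lean on one ref-counting move, spelled out in ref_cnt_fb() below: retargeting an index releases the old buffer and retains the new one. A reduced model:

#include <assert.h>

typedef struct { int ref_count; } Buf;

static void retarget(Buf *bufs, int *idx, int new_idx) {
  if (*idx >= 0 && bufs[*idx].ref_count > 0) bufs[*idx].ref_count--;
  *idx = new_idx;
  bufs[new_idx].ref_count++;
}

int main(void) {
  Buf bufs[2] = { { 1 }, { 0 } };
  int idx = 0;
  retarget(bufs, &idx, 1);
  assert(idx == 1 && bufs[0].ref_count == 0 && bufs[1].ref_count == 1);
  return 0;
}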
+void lock_buffer_pool(BufferPool *const pool); +void unlock_buffer_pool(BufferPool *const pool); + +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) + return NULL; + if (cm->ref_frame_map[index] < 0) + return NULL; + assert(cm->ref_frame_map[index] < FRAME_BUFFERS); + return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf; +} + +static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { + return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf; +} + +static INLINE int get_free_fb(VP9_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + lock_buffer_pool(cm->buffer_pool); + for (i = 0; i < FRAME_BUFFERS; ++i) + if (frame_bufs[i].ref_count == 0) + break; + + if (i != FRAME_BUFFERS) { + frame_bufs[i].ref_count = 1; + } else { + // Reset i to be INVALID_IDX to indicate no free buffer found. + i = INVALID_IDX; + } + + unlock_buffer_pool(cm->buffer_pool); + return i; +} + +static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { + const int ref_index = *idx; + + if (ref_index >= 0 && bufs[ref_index].ref_count > 0) + bufs[ref_index].ref_count--; + + *idx = new_idx; + + bufs[new_idx].ref_count++; +} + +static INLINE int mi_cols_aligned_to_sb(int n_mis) { + return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); +} + +static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { + return cm->frame_type == KEY_FRAME || cm->intra_only; +} + +static INLINE void set_partition_probs(const VP9_COMMON *const cm, + MACROBLOCKD *const xd) { + xd->partition_probs = + frame_is_intra_only(cm) ? + &vp9_kf_partition_probs[0] : + (const vpx_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob; +} + +static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd, + tran_low_t *dqcoeff) { + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + xd->plane[i].dqcoeff = dqcoeff; + xd->above_context[i] = cm->above_context + + i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols); + + if (get_plane_type(i) == PLANE_TYPE_Y) { + memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant)); + } else { + memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant)); + } + xd->fc = cm->fc; + } + + xd->above_seg_context = cm->above_seg_context; + xd->mi_stride = cm->mi_stride; + xd->error_info = &cm->error; + + set_partition_probs(cm, xd); +} + +static INLINE const vpx_prob* get_partition_probs(const MACROBLOCKD *xd, + int ctx) { + return xd->partition_probs[ctx]; +} + +static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { + const int above_idx = mi_col * 2; + const int left_idx = (mi_row * 2) & 15; + int i; + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x]; + pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y]; + } +} + +static INLINE int calc_mi_size(int len) { + // len is in mi units. + return len + MI_BLOCK_SIZE; +} + +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8; + + // Are edges available for intra prediction? 
+ xd->above_mi = (mi_row != 0) ? xd->mi[-xd->mi_stride] : NULL; + xd->left_mi = (mi_col > tile->mi_col_start) ? xd->mi[-1] : NULL; +} + +static INLINE void update_partition_context(MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE subsize, + BLOCK_SIZE bsize) { + PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col; + PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK); + + // num_4x4_blocks_wide_lookup[bsize] / 2 + const int bs = num_8x8_blocks_wide_lookup[bsize]; + + // update the partition context at the end notes. set partition bits + // of block sizes larger than the current one to be one, and partition + // bits of smaller block sizes to be zero. + memset(above_ctx, partition_context_lookup[subsize].above, bs); + memset(left_ctx, partition_context_lookup[subsize].left, bs); +} + +static INLINE int partition_plane_context(const MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK); + const int bsl = mi_width_log2_lookup[bsize]; + int above = (*above_ctx >> bsl) & 1 , left = (*left_ctx >> bsl) & 1; + + assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_ppflags.h b/thirdparty/libvpx/vp9/common/vp9_ppflags.h new file mode 100644 index 000000000000..12b989f43a50 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_ppflags.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_PPFLAGS_H_ +#define VP9_COMMON_VP9_PPFLAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + VP9D_NOFILTERING = 0, + VP9D_DEBLOCK = 1 << 0, + VP9D_DEMACROBLOCK = 1 << 1, + VP9D_ADDNOISE = 1 << 2, + VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3, + VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4, + VP9D_DEBUG_TXT_DC_DIFF = 1 << 5, + VP9D_DEBUG_TXT_RATE_INFO = 1 << 6, + VP9D_DEBUG_DRAW_MV = 1 << 7, + VP9D_DEBUG_CLR_BLK_MODES = 1 << 8, + VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, + VP9D_MFQE = 1 << 10 +}; + +typedef struct { + int post_proc_flag; + int deblocking_level; + int noise_level; +} vp9_ppflags_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_pred_common.c b/thirdparty/libvpx/vp9/common/vp9_pred_common.c new file mode 100644 index 000000000000..8f90e70e73e9 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_pred_common.c @@ -0,0 +1,314 @@ + +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_seg_common.h" + +// Returns a context number for the given MB prediction signal +int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + const MODE_INFO *const left_mi = xd->left_mi; + const int left_type = left_mi && is_inter_block(left_mi) ? + left_mi->interp_filter : SWITCHABLE_FILTERS; + const MODE_INFO *const above_mi = xd->above_mi; + const int above_type = above_mi && is_inter_block(above_mi) ? + above_mi->interp_filter : SWITCHABLE_FILTERS; + + if (left_type == above_type) + return left_type; + else if (left_type == SWITCHABLE_FILTERS) + return above_type; + else if (above_type == SWITCHABLE_FILTERS) + return left_type; + else + return SWITCHABLE_FILTERS; +} + +int vp9_get_reference_mode_context(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int ctx; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + if (has_above && has_left) { // both edges available + if (!has_second_ref(above_mi) && !has_second_ref(left_mi)) + // neither edge uses comp pred (0/1) + ctx = (above_mi->ref_frame[0] == cm->comp_fixed_ref) ^ + (left_mi->ref_frame[0] == cm->comp_fixed_ref); + else if (!has_second_ref(above_mi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (above_mi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(above_mi)); + else if (!has_second_ref(left_mi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (left_mi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(left_mi)); + else // both edges use comp pred (4) + ctx = 4; + } else if (has_above || has_left) { // one edge available + const MODE_INFO *edge_mi = has_above ? above_mi : left_mi; + + if (!has_second_ref(edge_mi)) + // edge does not use comp pred (0/1) + ctx = edge_mi->ref_frame[0] == cm->comp_fixed_ref; + else + // edge uses comp pred (3) + ctx = 3; + } else { // no edges available (1) + ctx = 1; + } + assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); + return ctx; +} + +// Returns a context number for the given MB prediction signal +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int above_in_image = !!above_mi; + const int left_in_image = !!left_mi; + + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. 
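+  // Compound prediction pairs one frame-wide fixed reference
+  // (cm->comp_fixed_ref) with a per-block variable one; the reference
+  // sign bias selects which ref_frame[] slot holds the variable reference.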
+ const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int var_ref_idx = !fix_ref_idx; + + if (above_in_image && left_in_image) { // both edges available + const int above_intra = !is_inter_block(above_mi); + const int left_intra = !is_inter_block(left_mi); + + if (above_intra && left_intra) { // intra/intra (2) + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter + const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi; + + if (!has_second_ref(edge_mi)) // single pred (1/3) + pred_context = 1 + 2 * (edge_mi->ref_frame[0] != cm->comp_var_ref[1]); + else // comp pred (1/3) + pred_context = 1 + 2 * (edge_mi->ref_frame[var_ref_idx] + != cm->comp_var_ref[1]); + } else { // inter/inter + const int l_sg = !has_second_ref(left_mi); + const int a_sg = !has_second_ref(above_mi); + const MV_REFERENCE_FRAME vrfa = a_sg ? above_mi->ref_frame[0] + : above_mi->ref_frame[var_ref_idx]; + const MV_REFERENCE_FRAME vrfl = l_sg ? left_mi->ref_frame[0] + : left_mi->ref_frame[var_ref_idx]; + + if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { + pred_context = 0; + } else if (l_sg && a_sg) { // single/single + if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) || + (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) + pred_context = 4; + else if (vrfa == vrfl) + pred_context = 3; + else + pred_context = 1; + } else if (l_sg || a_sg) { // single/comp + const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; + const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; + if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) + pred_context = 1; + else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1]) + pred_context = 2; + else + pred_context = 4; + } else if (vrfa == vrfl) { // comp/comp + pred_context = 4; + } else { + pred_context = 2; + } + } + } else if (above_in_image || left_in_image) { // one edge available + const MODE_INFO *edge_mi = above_in_image ? above_mi : left_mi; + + if (!is_inter_block(edge_mi)) { + pred_context = 2; + } else { + if (has_second_ref(edge_mi)) + pred_context = 4 * (edge_mi->ref_frame[var_ref_idx] + != cm->comp_var_ref[1]); + else + pred_context = 3 * (edge_mi->ref_frame[0] != cm->comp_var_ref[1]); + } + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + + return pred_context; +} + +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mi); + const int left_intra = !is_inter_block(left_mi); + + if (above_intra && left_intra) { // intra/intra + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter or inter/intra + const MODE_INFO *edge_mi = above_intra ? 
left_mi : above_mi; + if (!has_second_ref(edge_mi)) + pred_context = 4 * (edge_mi->ref_frame[0] == LAST_FRAME); + else + pred_context = 1 + (edge_mi->ref_frame[0] == LAST_FRAME || + edge_mi->ref_frame[1] == LAST_FRAME); + } else { // inter/inter + const int above_has_second = has_second_ref(above_mi); + const int left_has_second = has_second_ref(left_mi); + const MV_REFERENCE_FRAME above0 = above_mi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mi->ref_frame[1]; + + if (above_has_second && left_has_second) { + pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME || + left0 == LAST_FRAME || left1 == LAST_FRAME); + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; + + if (rfs == LAST_FRAME) + pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + else + pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + } else { + pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); + } + } + } else if (has_above || has_left) { // one edge available + const MODE_INFO *edge_mi = has_above ? above_mi : left_mi; + if (!is_inter_block(edge_mi)) { // intra + pred_context = 2; + } else { // inter + if (!has_second_ref(edge_mi)) + pred_context = 4 * (edge_mi->ref_frame[0] == LAST_FRAME); + else + pred_context = 1 + (edge_mi->ref_frame[0] == LAST_FRAME || + edge_mi->ref_frame[1] == LAST_FRAME); + } + } else { // no edges available + pred_context = 2; + } + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mi); + const int left_intra = !is_inter_block(left_mi); + + if (above_intra && left_intra) { // intra/intra + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter or inter/intra + const MODE_INFO *edge_mi = above_intra ? 
left_mi : above_mi; + if (!has_second_ref(edge_mi)) { + if (edge_mi->ref_frame[0] == LAST_FRAME) + pred_context = 3; + else + pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); + } else { + pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); + } + } else { // inter/inter + const int above_has_second = has_second_ref(above_mi); + const int left_has_second = has_second_ref(left_mi); + const MV_REFERENCE_FRAME above0 = above_mi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mi->ref_frame[1]; + + if (above_has_second && left_has_second) { + if (above0 == left0 && above1 == left1) + pred_context = 3 * (above0 == GOLDEN_FRAME || + above1 == GOLDEN_FRAME || + left0 == GOLDEN_FRAME || + left1 == GOLDEN_FRAME); + else + pred_context = 2; + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; + + if (rfs == GOLDEN_FRAME) + pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + else if (rfs == ALTREF_FRAME) + pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; + else + pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + } else { + if (above0 == LAST_FRAME && left0 == LAST_FRAME) { + pred_context = 3; + } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) { + const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? left0 + : above0; + pred_context = 4 * (edge0 == GOLDEN_FRAME); + } else { + pred_context = 2 * (above0 == GOLDEN_FRAME) + + 2 * (left0 == GOLDEN_FRAME); + } + } + } + } else if (has_above || has_left) { // one edge available + const MODE_INFO *edge_mi = has_above ? above_mi : left_mi; + + if (!is_inter_block(edge_mi) || + (edge_mi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mi))) + pred_context = 2; + else if (!has_second_ref(edge_mi)) + pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); + else + pred_context = 3 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} diff --git a/thirdparty/libvpx/vp9/common/vp9_pred_common.h b/thirdparty/libvpx/vp9/common/vp9_pred_common.h new file mode 100644 index 000000000000..f3c676e953db --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_pred_common.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_PRED_COMMON_H_ +#define VP9_COMMON_VP9_PRED_COMMON_H_ + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE int get_segment_id(const VP9_COMMON *cm, + const uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + int x, y, segment_id = MAX_SEGMENTS; + + for (y = 0; y < ymis; ++y) + for (x = 0; x < xmis; ++x) + segment_id = + VPXMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + return segment_id; +} + +static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int above_sip = (above_mi != NULL) ? + above_mi->seg_id_predicted : 0; + const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0; + + return above_sip + left_sip; +} + +static INLINE vpx_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg, + const MACROBLOCKD *xd) { + return seg->pred_probs[vp9_get_pred_context_seg_id(xd)]; +} + +static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) { + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int above_skip = (above_mi != NULL) ? above_mi->skip : 0; + const int left_skip = (left_mi != NULL) ? left_mi->skip : 0; + return above_skip + left_skip; +} + +static INLINE vpx_prob vp9_get_skip_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->skip_probs[vp9_get_skip_context(xd)]; +} + +int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); + +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real macroblocks. +// The prediction flags in these dummy entries are initialized to 0. +// 0 - inter/inter, inter/--, --/inter, --/-- +// 1 - intra/inter, inter/intra +// 2 - intra/--, --/intra +// 3 - intra/intra +static INLINE int get_intra_inter_context(const MACROBLOCKD *xd) { + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mi); + const int left_intra = !is_inter_block(left_mi); + return left_intra && above_intra ? 3 : left_intra || above_intra; + } else if (has_above || has_left) { // one edge available + return 2 * !is_inter_block(has_above ? 
above_mi : left_mi); + } + return 0; +} + +static INLINE vpx_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->intra_inter_prob[get_intra_inter_context(xd)]; +} + +int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd); + +static INLINE vpx_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)]; +} + +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd); + +static INLINE vpx_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd); + return cm->fc->comp_ref_prob[pred_context]; +} + +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); + +static INLINE vpx_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0]; +} + +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); + +static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; +} + +// Returns a context number for the given MB prediction signal +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real blocks. +// The prediction flags in these dummy entries are initialized to 0. +static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { + const int max_tx_size = max_txsize_lookup[xd->mi[0]->sb_type]; + const MODE_INFO *const above_mi = xd->above_mi; + const MODE_INFO *const left_mi = xd->left_mi; + const int has_above = !!above_mi; + const int has_left = !!left_mi; + int above_ctx = (has_above && !above_mi->skip) ? (int)above_mi->tx_size + : max_tx_size; + int left_ctx = (has_left && !left_mi->skip) ? (int)left_mi->tx_size + : max_tx_size; + if (!has_left) + left_ctx = above_ctx; + + if (!has_above) + above_ctx = left_ctx; + + return (above_ctx + left_ctx) > max_tx_size; +} + +static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, + const struct tx_probs *tx_probs) { + switch (max_tx_size) { + case TX_8X8: + return tx_probs->p8x8[ctx]; + case TX_16X16: + return tx_probs->p16x16[ctx]; + case TX_32X32: + return tx_probs->p32x32[ctx]; + default: + assert(0 && "Invalid max_tx_size."); + return NULL; + } +} + +static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size, + const MACROBLOCKD *xd, + const struct tx_probs *tx_probs) { + return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs); +} + +static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, + struct tx_counts *tx_counts) { + switch (max_tx_size) { + case TX_8X8: + return tx_counts->p8x8[ctx]; + case TX_16X16: + return tx_counts->p16x16[ctx]; + case TX_32X32: + return tx_counts->p32x32[ctx]; + default: + assert(0 && "Invalid max_tx_size."); + return NULL; + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_quant_common.c b/thirdparty/libvpx/vp9/common/vp9_quant_common.c new file mode 100644 index 000000000000..d83f3c1a2f5e --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_quant_common.c @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" + +static const int16_t dc_qlookup[QINDEX_RANGE] = { + 4, 8, 8, 9, 10, 11, 12, 12, + 13, 14, 15, 16, 17, 18, 19, 19, + 20, 21, 22, 23, 24, 25, 26, 26, + 27, 28, 29, 30, 31, 32, 32, 33, + 34, 35, 36, 37, 38, 38, 39, 40, + 41, 42, 43, 43, 44, 45, 46, 47, + 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, + 61, 62, 62, 63, 64, 65, 66, 66, + 67, 68, 69, 70, 70, 71, 72, 73, + 74, 74, 75, 76, 77, 78, 78, 79, + 80, 81, 81, 82, 83, 84, 85, 85, + 87, 88, 90, 92, 93, 95, 96, 98, + 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, + 123, 125, 127, 129, 131, 134, 136, 138, + 140, 142, 144, 146, 148, 150, 152, 154, + 156, 158, 161, 164, 166, 169, 172, 174, + 177, 180, 182, 185, 187, 190, 192, 195, + 199, 202, 205, 208, 211, 214, 217, 220, + 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, + 280, 284, 288, 292, 296, 300, 304, 309, + 313, 317, 322, 326, 330, 335, 340, 344, + 349, 354, 359, 364, 369, 374, 379, 384, + 389, 395, 400, 406, 411, 417, 423, 429, + 435, 441, 447, 454, 461, 467, 475, 482, + 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, + 654, 668, 684, 700, 717, 736, 755, 775, + 796, 819, 843, 869, 896, 925, 955, 988, + 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336, +}; + +#if CONFIG_VP9_HIGHBITDEPTH +static const int16_t dc_qlookup_10[QINDEX_RANGE] = { + 4, 9, 10, 13, 15, 17, 20, 22, + 25, 28, 31, 34, 37, 40, 43, 47, + 50, 53, 57, 60, 64, 68, 71, 75, + 78, 82, 86, 90, 93, 97, 101, 105, + 109, 113, 116, 120, 124, 128, 132, 136, + 140, 143, 147, 151, 155, 159, 163, 166, + 170, 174, 178, 182, 185, 189, 193, 197, + 200, 204, 208, 212, 215, 219, 223, 226, + 230, 233, 237, 241, 244, 248, 251, 255, + 259, 262, 266, 269, 273, 276, 280, 283, + 287, 290, 293, 297, 300, 304, 307, 310, + 314, 317, 321, 324, 327, 331, 334, 337, + 343, 350, 356, 362, 369, 375, 381, 387, + 394, 400, 406, 412, 418, 424, 430, 436, + 442, 448, 454, 460, 466, 472, 478, 484, + 490, 499, 507, 516, 525, 533, 542, 550, + 559, 567, 576, 584, 592, 601, 609, 617, + 625, 634, 644, 655, 666, 676, 687, 698, + 708, 718, 729, 739, 749, 759, 770, 782, + 795, 807, 819, 831, 844, 856, 868, 880, + 891, 906, 920, 933, 947, 961, 975, 988, + 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, + 1120, 1137, 1153, 1170, 1186, 1202, 1218, 1236, + 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, + 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, + 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, + 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, + 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, + 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, + 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102, + 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, + 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, +}; + +static const int16_t dc_qlookup_12[QINDEX_RANGE] = { + 4, 12, 18, 25, 33, 41, 50, 60, + 70, 80, 91, 103, 115, 127, 140, 153, + 166, 180, 194, 208, 222, 237, 251, 266, + 281, 296, 312, 327, 343, 358, 374, 390, + 405, 421, 437, 453, 469, 484, 500, 516, + 532, 548, 564, 580, 596, 611, 627, 643, + 659, 
674, 690, 706, 721, 737, 752, 768, + 783, 798, 814, 829, 844, 859, 874, 889, + 904, 919, 934, 949, 964, 978, 993, 1008, + 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122, + 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, + 1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544, + 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741, + 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, + 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199, + 2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467, + 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788, + 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, + 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517, + 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951, + 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, + 5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517, + 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149, + 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, + 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715, + 7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788, + 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245, + 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, + 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812, + 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387, +}; +#endif + +static const int16_t ac_qlookup[QINDEX_RANGE] = { + 4, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, + 79, 80, 81, 82, 83, 84, 85, 86, + 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, + 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, + 136, 138, 140, 142, 144, 146, 148, 150, + 152, 155, 158, 161, 164, 167, 170, 173, + 176, 179, 182, 185, 188, 191, 194, 197, + 200, 203, 207, 211, 215, 219, 223, 227, + 231, 235, 239, 243, 247, 251, 255, 260, + 265, 270, 275, 280, 285, 290, 295, 300, + 305, 311, 317, 323, 329, 335, 341, 347, + 353, 359, 366, 373, 380, 387, 394, 401, + 408, 416, 424, 432, 440, 448, 456, 465, + 474, 483, 492, 501, 510, 520, 530, 540, + 550, 560, 571, 582, 593, 604, 615, 627, + 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, + 864, 881, 898, 915, 933, 951, 969, 988, + 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, + 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, + 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, + 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, +}; + +#if CONFIG_VP9_HIGHBITDEPTH +static const int16_t ac_qlookup_10[QINDEX_RANGE] = { + 4, 9, 11, 13, 16, 18, 21, 24, + 27, 30, 33, 37, 40, 44, 48, 51, + 55, 59, 63, 67, 71, 75, 79, 83, + 88, 92, 96, 100, 105, 109, 114, 118, + 122, 127, 131, 136, 140, 145, 149, 154, + 158, 163, 168, 172, 177, 181, 186, 190, + 195, 199, 204, 208, 213, 217, 222, 226, + 231, 235, 240, 244, 249, 253, 258, 262, + 267, 271, 275, 280, 284, 289, 293, 297, + 302, 306, 311, 315, 319, 324, 328, 332, + 337, 341, 345, 349, 354, 358, 362, 367, + 371, 375, 379, 384, 388, 392, 396, 401, + 409, 417, 425, 433, 441, 449, 458, 466, + 474, 482, 490, 498, 506, 514, 523, 531, + 539, 547, 555, 563, 571, 579, 588, 596, + 604, 616, 628, 640, 652, 664, 676, 688, + 700, 713, 725, 737, 749, 761, 773, 785, + 797, 809, 825, 841, 857, 873, 889, 905, + 922, 938, 954, 970, 986, 1002, 1018, 1038, + 1058, 1078, 1098, 1118, 1138, 1158, 1178, 1198, + 
1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, + 1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, + 1631, 1663, 1695, 1727, 1759, 1791, 1823, 1859, + 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, + 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, + 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, + 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, + 3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, + 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, + 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372, + 5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, + 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, +}; + +static const int16_t ac_qlookup_12[QINDEX_RANGE] = { + 4, 13, 19, 27, 35, 44, 54, 64, + 75, 87, 99, 112, 126, 139, 154, 168, + 183, 199, 214, 230, 247, 263, 280, 297, + 314, 331, 349, 366, 384, 402, 420, 438, + 456, 475, 493, 511, 530, 548, 567, 586, + 604, 623, 642, 660, 679, 698, 716, 735, + 753, 772, 791, 809, 828, 846, 865, 884, + 902, 920, 939, 957, 976, 994, 1012, 1030, + 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, + 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317, + 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, + 1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856, + 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118, + 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, + 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750, + 2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137, + 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619, + 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, + 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791, + 4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544, + 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, + 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635, + 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028, + 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, + 11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565, + 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806, + 16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414, + 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, + 21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070, + 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247, +}; +#endif + +int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: + return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_10: + return dc_qlookup_10[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_12: + return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; + } +#else + (void) bit_depth; + return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; +#endif +} + +int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: + return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_10: + return ac_qlookup_10[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_12: + return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; + } +#else + (void) bit_depth; + return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; +#endif +} + +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { + if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { + const int data = get_segdata(seg, segment_id, 
SEG_LVL_ALT_Q);
+    const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ?
+        data : base_qindex + data;
+    return clamp(seg_qindex, 0, MAXQ);
+  } else {
+    return base_qindex;
+  }
+}
+
diff --git a/thirdparty/libvpx/vp9/common/vp9_quant_common.h b/thirdparty/libvpx/vp9/common/vp9_quant_common.h
new file mode 100644
index 000000000000..4bae4a89677a
--- /dev/null
+++ b/thirdparty/libvpx/vp9/common/vp9_quant_common.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_
+#define VP9_COMMON_VP9_QUANT_COMMON_H_
+
+#include "vpx/vpx_codec.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+
+int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
+int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth);
+
+int vp9_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_QUANT_COMMON_H_
diff --git a/thirdparty/libvpx/vp9/common/vp9_reconinter.c b/thirdparty/libvpx/vp9/common/vp9_reconinter.c
new file mode 100644
index 000000000000..84718e970361
--- /dev/null
+++ b/thirdparty/libvpx/vp9/common/vp9_reconinter.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_scale_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
+                                      uint8_t *dst, int dst_stride,
+                                      const MV *src_mv,
+                                      const struct scale_factors *sf,
+                                      int w, int h, int ref,
+                                      const InterpKernel *kernel,
+                                      enum mv_precision precision,
+                                      int x, int y, int bd) {
+  const int is_q4 = precision == MV_PRECISION_Q4;
+  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+                     is_q4 ? src_mv->col : src_mv->col * 2 };
+  MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
+  const int subpel_x = mv.col & SUBPEL_MASK;
+  const int subpel_y = mv.row & SUBPEL_MASK;
+
+  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+
+  highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+                         sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4,
+                         bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const MV *src_mv,
+                               const struct scale_factors *sf,
+                               int w, int h, int ref,
+                               const InterpKernel *kernel,
+                               enum mv_precision precision,
+                               int x, int y) {
+  const int is_q4 = precision == MV_PRECISION_Q4;
+  const MV mv_q4 = { is_q4 ?
src_mv->row : src_mv->row * 2, + is_q4 ? src_mv->col : src_mv->col * 2 }; + MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf); + const int subpel_x = mv.col & SUBPEL_MASK; + const int subpel_y = mv.row & SUBPEL_MASK; + + src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); + + inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, + sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4); +} + +static INLINE int round_mv_comp_q4(int value) { + return (value < 0 ? value - 2 : value + 2) / 4; +} + +static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { + MV res = { round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.row + + mi->bmi[1].as_mv[idx].as_mv.row + + mi->bmi[2].as_mv[idx].as_mv.row + + mi->bmi[3].as_mv[idx].as_mv.row), + round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col + + mi->bmi[1].as_mv[idx].as_mv.col + + mi->bmi[2].as_mv[idx].as_mv.col + + mi->bmi[3].as_mv[idx].as_mv.col) }; + return res; +} + +static INLINE int round_mv_comp_q2(int value) { + return (value < 0 ? value - 1 : value + 1) / 2; +} + +static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) { + MV res = { round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.row + + mi->bmi[block1].as_mv[idx].as_mv.row), + round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.col + + mi->bmi[block1].as_mv[idx].as_mv.col) }; + return res; +} + +// TODO(jkoleszar): yet another mv clamping function :-( +MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, + int bw, int bh, int ss_x, int ss_y) { + // If the MV points so far into the UMV border that no visible pixels + // are used for reconstruction, the subpel part of the MV can be + // discarded and the MV limited to 16 pixels with equivalent results. + const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS; + const int spel_right = spel_left - SUBPEL_SHIFTS; + const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS; + const int spel_bottom = spel_top - SUBPEL_SHIFTS; + MV clamped_mv = { + src_mv->row * (1 << (1 - ss_y)), + src_mv->col * (1 << (1 - ss_x)) + }; + assert(ss_x <= 1); + assert(ss_y <= 1); + + clamp_mv(&clamped_mv, + xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, + xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, + xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, + xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom); + + return clamped_mv; +} + +MV average_split_mvs(const struct macroblockd_plane *pd, + const MODE_INFO *mi, int ref, int block) { + const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0); + MV res = {0, 0}; + switch (ss_idx) { + case 0: + res = mi->bmi[block].as_mv[ref].as_mv; + break; + case 1: + res = mi_mv_pred_q2(mi, ref, block, block + 2); + break; + case 2: + res = mi_mv_pred_q2(mi, ref, block, block + 1); + break; + case 3: + res = mi_mv_pred_q4(mi, ref); + break; + default: + assert(ss_idx <= 3 && ss_idx >= 0); + } + return res; +} + +static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, + int bw, int bh, + int x, int y, int w, int h, + int mi_x, int mi_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const MODE_INFO *mi = xd->mi[0]; + const int is_compound = has_second_ref(mi); + const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + int ref; + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = &pd->pre[ref]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + 
x; + const MV mv = mi->sb_type < BLOCK_8X8 + ? average_split_mvs(pd, mi, ref, block) + : mi->mv[ref].as_mv; + + // TODO(jkoleszar): This clamping is done in the incorrect place for the + // scaling case. It needs to be done on the scaled MV, not the pre-scaling + // MV. Note however that it performs the subsampling aware scaling so + // that the result is always q4. + // mv_precision precision is MV_PRECISION_Q4. + const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + + uint8_t *pre; + MV32 scaled_mv; + int xs, ys, subpel_x, subpel_y; + const int is_scaled = vp9_is_scaled(sf); + + if (is_scaled) { + // Co-ordinate of containing block to pixel precision. + const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); +#if CONFIG_BETTER_HW_COMPATIBILITY + assert(xd->mi[0]->sb_type != BLOCK_4X8 && + xd->mi[0]->sb_type != BLOCK_8X4); + assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) && + mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x))); +#endif + if (plane == 0) + pre_buf->buf = xd->block_refs[ref]->buf->y_buffer; + else if (plane == 1) + pre_buf->buf = xd->block_refs[ref]->buf->u_buffer; + else + pre_buf->buf = xd->block_refs[ref]->buf->v_buffer; + + pre_buf->buf += scaled_buffer_offset(x_start + x, y_start + y, + pre_buf->stride, sf); + pre = pre_buf->buf; + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; + } else { + pre = pre_buf->buf + (y * pre_buf->stride + x); + scaled_mv.row = mv_q4.row; + scaled_mv.col = mv_q4.col; + xs = ys = 16; + } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride + + (scaled_mv.col >> SUBPEL_BITS); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, + xd->bd); + } else { + inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys); + } +#else + inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +} + +static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int mi_row, int mi_col, + int plane_from, int plane_to) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, + &xd->plane[plane]); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + + if (xd->mi[0]->sb_type < BLOCK_8X8) { + int i = 0, x, y; + assert(bsize == BLOCK_8X8); + for (y = 0; y < num_4x4_h; ++y) + for (x = 0; x < num_4x4_w; ++x) + build_inter_predictors(xd, plane, i++, bw, bh, + 4 * x, 4 * y, 4, 4, mi_x, mi_y); + } else { + build_inter_predictors(xd, plane, 0, bw, bh, + 0, 0, bw, bh, mi_x, mi_y); + } + } +} + +void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0); +} + +void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize, int plane) { + 
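+  // Single-plane variant: the plane_from/plane_to range collapses to the
+  // one requested plane.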
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane); +} + +void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1, + MAX_MB_PLANE - 1); +} + +void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, + MAX_MB_PLANE - 1); +} + +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col) { + uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer, + src->v_buffer}; + const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride, + src->uv_stride}; + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &planes[i]; + setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, + pd->subsampling_x, pd->subsampling_y); + } +} + +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, + const struct scale_factors *sf) { + if (src != NULL) { + int i; + uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer, + src->v_buffer}; + const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride, + src->uv_stride}; + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col, + sf, pd->subsampling_x, pd->subsampling_y); + } + } +} diff --git a/thirdparty/libvpx/vp9/common/vp9_reconinter.h b/thirdparty/libvpx/vp9/common/vp9_reconinter.h new file mode 100644 index 000000000000..07745e3aaa5a --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_reconinter.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_RECONINTER_H_ +#define VP9_COMMON_VP9_RECONINTER_H_ + +#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE void inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int subpel_x, + const int subpel_y, + const struct scale_factors *sf, + int w, int h, int ref, + const InterpKernel *kernel, + int xs, int ys) { + sf->predict[subpel_x != 0][subpel_y != 0][ref]( + src, src_stride, dst, dst_stride, + kernel[subpel_x], xs, kernel[subpel_y], ys, w, h); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int subpel_x, + const int subpel_y, + const struct scale_factors *sf, + int w, int h, int ref, + const InterpKernel *kernel, + int xs, int ys, int bd) { + sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( + src, src_stride, dst, dst_stride, + kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi, + int ref, int block); + +MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, + int bw, int bh, int ss_x, int ss_y); + +void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize, int plane); + +void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +void vp9_build_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const MV *mv_q3, + const struct scale_factors *sf, + int w, int h, int do_avg, + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y); + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const MV *mv_q3, + const struct scale_factors *sf, + int w, int h, int do_avg, + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y, int bd); +#endif + +static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, + const struct scale_factors *sf) { + const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; + const int y = sf ? 
sf->scale_value_y(y_offset, sf) : y_offset; + return y * stride + x; +} + +static INLINE void setup_pred_plane(struct buf_2d *dst, + uint8_t *src, int stride, + int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { + const int x = (MI_SIZE * mi_col) >> subsampling_x; + const int y = (MI_SIZE * mi_row) >> subsampling_y; + dst->buf = src + scaled_buffer_offset(x, y, stride, scale); + dst->stride = stride; +} + +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col); + +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_RECONINTER_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_reconintra.c b/thirdparty/libvpx/vp9/common/vp9_reconintra.c new file mode 100644 index 000000000000..445785835a62 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_reconintra.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#if CONFIG_VP9_HIGHBITDEPTH +#include "vpx_dsp/vpx_dsp_common.h" +#endif // CONFIG_VP9_HIGHBITDEPTH +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/vpx_once.h" + +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_onyxc_int.h" + +const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = { + DCT_DCT, // DC + ADST_DCT, // V + DCT_ADST, // H + DCT_DCT, // D45 + ADST_ADST, // D135 + ADST_DCT, // D117 + DCT_ADST, // D153 + DCT_ADST, // D207 + ADST_DCT, // D63 + ADST_ADST, // TM +}; + +enum { + NEED_LEFT = 1 << 1, + NEED_ABOVE = 1 << 2, + NEED_ABOVERIGHT = 1 << 3, +}; + +static const uint8_t extend_modes[INTRA_MODES] = { + NEED_ABOVE | NEED_LEFT, // DC + NEED_ABOVE, // V + NEED_LEFT, // H + NEED_ABOVERIGHT, // D45 + NEED_LEFT | NEED_ABOVE, // D135 + NEED_LEFT | NEED_ABOVE, // D117 + NEED_LEFT | NEED_ABOVE, // D153 + NEED_LEFT, // D207 + NEED_ABOVERIGHT, // D63 + NEED_LEFT | NEED_ABOVE, // TM +}; + +typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); + +static intra_pred_fn pred[INTRA_MODES][TX_SIZES]; +static intra_pred_fn dc_pred[2][2][TX_SIZES]; + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd); +static intra_high_pred_fn pred_high[INTRA_MODES][4]; +static intra_high_pred_fn dc_pred_high[2][2][4]; +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void vp9_init_intra_predictors_internal(void) { +#define INIT_ALL_SIZES(p, type) \ + p[TX_4X4] = vpx_##type##_predictor_4x4; \ + p[TX_8X8] = vpx_##type##_predictor_8x8; \ + p[TX_16X16] = vpx_##type##_predictor_16x16; \ + p[TX_32X32] = vpx_##type##_predictor_32x32 + + INIT_ALL_SIZES(pred[V_PRED], v); + INIT_ALL_SIZES(pred[H_PRED], h); + INIT_ALL_SIZES(pred[D207_PRED], d207); + INIT_ALL_SIZES(pred[D45_PRED], d45); + INIT_ALL_SIZES(pred[D63_PRED], d63); + INIT_ALL_SIZES(pred[D117_PRED], d117); + INIT_ALL_SIZES(pred[D135_PRED], d135); + 
INIT_ALL_SIZES(pred[D153_PRED], d153); + INIT_ALL_SIZES(pred[TM_PRED], tm); + + INIT_ALL_SIZES(dc_pred[0][0], dc_128); + INIT_ALL_SIZES(dc_pred[0][1], dc_top); + INIT_ALL_SIZES(dc_pred[1][0], dc_left); + INIT_ALL_SIZES(dc_pred[1][1], dc); + +#if CONFIG_VP9_HIGHBITDEPTH + INIT_ALL_SIZES(pred_high[V_PRED], highbd_v); + INIT_ALL_SIZES(pred_high[H_PRED], highbd_h); + INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207); + INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45); + INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63); + INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117); + INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135); + INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153); + INIT_ALL_SIZES(pred_high[TM_PRED], highbd_tm); + + INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128); + INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top); + INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left); + INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#undef intra_pred_allsizes +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void build_intra_predictors_high(const MACROBLOCKD *xd, + const uint8_t *ref8, + int ref_stride, + uint8_t *dst8, + int dst_stride, + PREDICTION_MODE mode, + TX_SIZE tx_size, + int up_available, + int left_available, + int right_available, + int x, int y, + int plane, int bd) { + int i; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + DECLARE_ALIGNED(16, uint16_t, left_col[32]); + DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]); + uint16_t *above_row = above_data + 16; + const uint16_t *const_above_row = above_row; + const int bs = 4 << tx_size; + int frame_width, frame_height; + int x0, y0; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int need_left = extend_modes[mode] & NEED_LEFT; + const int need_above = extend_modes[mode] & NEED_ABOVE; + const int need_aboveright = extend_modes[mode] & NEED_ABOVERIGHT; + int base = 128 << (bd - 8); + // 127 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // For 10 bit and 12 bit, 127 and 129 are replaced by base -1 and base + 1. + + // Get current frame pointer, width and height. + if (plane == 0) { + frame_width = xd->cur_buf->y_width; + frame_height = xd->cur_buf->y_height; + } else { + frame_width = xd->cur_buf->uv_width; + frame_height = xd->cur_buf->uv_height; + } + + // Get block position in current frame. 
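+  // mb_to_left_edge and mb_to_top_edge are stored in 1/8-pel units
+  // (see set_mi_row_col), hence the >> 3 before the subsampling shift.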
+ x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // NEED_LEFT + if (need_left) { + if (left_available) { + if (xd->mb_to_bottom_edge < 0) { + /* slower path if the block needs border extension */ + if (y0 + bs <= frame_height) { + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } else { + const int extend_bottom = frame_height - y0; + for (i = 0; i < extend_bottom; ++i) + left_col[i] = ref[i * ref_stride - 1]; + for (; i < bs; ++i) + left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1]; + } + } else { + /* faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } + } else { + vpx_memset16(left_col, base + 1, bs); + } + } + + // NEED_ABOVE + if (need_above) { + if (up_available) { + const uint16_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + bs <= frame_width) { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + memcpy(above_row, above_ref, r * sizeof(above_row[0])); + vpx_memset16(above_row + r, above_row[r - 1], x0 + bs - frame_width); + } + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + } + } + above_row[-1] = left_available ? above_ref[-1] : (base + 1); + } else { + vpx_memset16(above_row, base - 1, bs); + above_row[-1] = base - 1; + } + } + + // NEED_ABOVERIGHT + if (need_aboveright) { + if (up_available) { + const uint16_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + 2 * bs <= frame_width) { + if (right_available && bs == 4) { + memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0])); + } else { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 + bs <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + memcpy(above_row, above_ref, r * sizeof(above_row[0])); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + memcpy(above_row, above_ref, r * sizeof(above_row[0])); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } + above_row[-1] = left_available ? above_ref[-1] : (base + 1); + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + memcpy(above_row, above_ref, bs * sizeof(above_row[0])); + if (bs == 4 && right_available) + memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0])); + else + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + above_row[-1] = left_available ? 
above_ref[-1] : (base + 1); + } + } + } else { + vpx_memset16(above_row, base - 1, bs * 2); + above_row[-1] = base - 1; + } + } + + // predict + if (mode == DC_PRED) { + dc_pred_high[left_available][up_available][tx_size](dst, dst_stride, + const_above_row, + left_col, xd->bd); + } else { + pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col, + xd->bd); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, + int up_available, int left_available, + int right_available, int x, int y, + int plane) { + int i; + DECLARE_ALIGNED(16, uint8_t, left_col[32]); + DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]); + uint8_t *above_row = above_data + 16; + const uint8_t *const_above_row = above_row; + const int bs = 4 << tx_size; + int frame_width, frame_height; + int x0, y0; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + + // 127 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // .. + + // Get current frame pointer, width and height. + if (plane == 0) { + frame_width = xd->cur_buf->y_width; + frame_height = xd->cur_buf->y_height; + } else { + frame_width = xd->cur_buf->uv_width; + frame_height = xd->cur_buf->uv_height; + } + + // Get block position in current frame. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // NEED_LEFT + if (extend_modes[mode] & NEED_LEFT) { + if (left_available) { + if (xd->mb_to_bottom_edge < 0) { + /* slower path if the block needs border extension */ + if (y0 + bs <= frame_height) { + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } else { + const int extend_bottom = frame_height - y0; + for (i = 0; i < extend_bottom; ++i) + left_col[i] = ref[i * ref_stride - 1]; + for (; i < bs; ++i) + left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1]; + } + } else { + /* faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } + } else { + memset(left_col, 129, bs); + } + } + + // NEED_ABOVE + if (extend_modes[mode] & NEED_ABOVE) { + if (up_available) { + const uint8_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + bs <= frame_width) { + memcpy(above_row, above_ref, bs); + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + memcpy(above_row, above_ref, r); + memset(above_row + r, above_row[r - 1], x0 + bs - frame_width); + } + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + memcpy(above_row, above_ref, bs); + } + } + above_row[-1] = left_available ? 
above_ref[-1] : 129; + } else { + memset(above_row, 127, bs); + above_row[-1] = 127; + } + } + + // NEED_ABOVERIGHT + if (extend_modes[mode] & NEED_ABOVERIGHT) { + if (up_available) { + const uint8_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + 2 * bs <= frame_width) { + if (right_available && bs == 4) { + memcpy(above_row, above_ref, 2 * bs); + } else { + memcpy(above_row, above_ref, bs); + memset(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 + bs <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + memcpy(above_row, above_ref, r); + memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); + } else { + memcpy(above_row, above_ref, bs); + memset(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + memcpy(above_row, above_ref, r); + memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); + } + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + memcpy(above_row, above_ref, bs); + if (bs == 4 && right_available) + memcpy(above_row + bs, above_ref + bs, bs); + else + memset(above_row + bs, above_row[bs - 1], bs); + } + } + above_row[-1] = left_available ? above_ref[-1] : 129; + } else { + memset(above_row, 127, bs * 2); + above_row[-1] = 127; + } + } + + // predict + if (mode == DC_PRED) { + dc_pred[left_available][up_available][tx_size](dst, dst_stride, + const_above_row, left_col); + } else { + pred[mode][tx_size](dst, dst_stride, const_above_row, left_col); + } +} + +void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, + TX_SIZE tx_size, PREDICTION_MODE mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, + int aoff, int loff, int plane) { + const int bw = (1 << bwl_in); + const int txw = (1 << tx_size); + const int have_top = loff || (xd->above_mi != NULL); + const int have_left = aoff || (xd->left_mi != NULL); + const int have_right = (aoff + txw) < bw; + const int x = aoff * 4; + const int y = loff * 4; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode, + tx_size, have_top, have_left, have_right, + x, y, plane, xd->bd); + return; + } +#endif + build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size, + have_top, have_left, have_right, x, y, plane); +} + +void vp9_init_intra_predictors(void) { + once(vp9_init_intra_predictors_internal); +} diff --git a/thirdparty/libvpx/vp9/common/vp9_reconintra.h b/thirdparty/libvpx/vp9/common/vp9_reconintra.h new file mode 100644 index 000000000000..de453808b785 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_reconintra.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_RECONINTRA_H_ +#define VP9_COMMON_VP9_RECONINTRA_H_ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_init_intra_predictors(void); + +void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, + TX_SIZE tx_size, PREDICTION_MODE mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, + int aoff, int loff, int plane); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_rtcd.c b/thirdparty/libvpx/vp9/common/vp9_rtcd.c new file mode 100644 index 000000000000..2dfa09f50e09 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_rtcd.c @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vp9_rtcd.h" +#include "vpx_ports/vpx_once.h" + +void vp9_rtcd() { + // TODO(JBB): Remove this once, by insuring that both the encoder and + // decoder setup functions are protected by once(); + once(setup_rtcd_internal); +} diff --git a/thirdparty/libvpx/vp9/common/vp9_scale.c b/thirdparty/libvpx/vp9/common/vp9_scale.c new file mode 100644 index 000000000000..b763b925b3ea --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_scale.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_scale.h" +#include "vpx_dsp/vpx_filter.h" + +static INLINE int scaled_x(int val, const struct scale_factors *sf) { + return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT); +} + +static INLINE int scaled_y(int val, const struct scale_factors *sf) { + return (int)((int64_t)val * sf->y_scale_fp >> REF_SCALE_SHIFT); +} + +static int unscaled_value(int val, const struct scale_factors *sf) { + (void) sf; + return val; +} + +static int get_fixed_point_scale_factor(int other_size, int this_size) { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. 
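+  //
+  // Illustrative example (an editorial sketch, not from upstream libvpx):
+  // with REF_SCALE_SHIFT == 14, a 1920-wide reference frame used by a
+  // 1280-wide current frame gives (1920 << 14) / 1280 == 24576, i.e. 1.5
+  // in Q14 fixed point, so scaled_x()/scaled_y() above can map positions
+  // with a single multiply and shift instead of a per-pixel division.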
+ return (other_size << REF_SCALE_SHIFT) / this_size; +} + +MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) { + const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK; + const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK; + const MV32 res = { + scaled_y(mv->row, sf) + y_off_q4, + scaled_x(mv->col, sf) + x_off_q4 + }; + return res; +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, + int other_w, int other_h, + int this_w, int this_h, + int use_highbd) { +#else +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, + int other_w, int other_h, + int this_w, int this_h) { +#endif + if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { + sf->x_scale_fp = REF_INVALID_SCALE; + sf->y_scale_fp = REF_INVALID_SCALE; + return; + } + + sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + sf->x_step_q4 = scaled_x(16, sf); + sf->y_step_q4 = scaled_y(16, sf); + + if (vp9_is_scaled(sf)) { + sf->scale_value_x = scaled_x; + sf->scale_value_y = scaled_y; + } else { + sf->scale_value_x = unscaled_value; + sf->scale_value_y = unscaled_value; + } + + // TODO(agrange): Investigate the best choice of functions to use here + // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what + // to do at full-pel offsets. The current selection, where the filter is + // applied in one direction only, and not at all for 0,0, seems to give the + // best quality, but it may be worth trying an additional mode that does + // do the filtering on full-pel. + + if (sf->x_step_q4 == 16) { + if (sf->y_step_q4 == 16) { + // No scaling in either direction. + sf->predict[0][0][0] = vpx_convolve_copy; + sf->predict[0][0][1] = vpx_convolve_avg; + sf->predict[0][1][0] = vpx_convolve8_vert; + sf->predict[0][1][1] = vpx_convolve8_avg_vert; + sf->predict[1][0][0] = vpx_convolve8_horiz; + sf->predict[1][0][1] = vpx_convolve8_avg_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. + sf->predict[0][0][0] = vpx_scaled_vert; + sf->predict[0][0][1] = vpx_scaled_avg_vert; + sf->predict[0][1][0] = vpx_scaled_vert; + sf->predict[0][1][1] = vpx_scaled_avg_vert; + sf->predict[1][0][0] = vpx_scaled_2d; + sf->predict[1][0][1] = vpx_scaled_avg_2d; + } + } else { + if (sf->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. + sf->predict[0][0][0] = vpx_scaled_horiz; + sf->predict[0][0][1] = vpx_scaled_avg_horiz; + sf->predict[0][1][0] = vpx_scaled_2d; + sf->predict[0][1][1] = vpx_scaled_avg_2d; + sf->predict[1][0][0] = vpx_scaled_horiz; + sf->predict[1][0][1] = vpx_scaled_avg_horiz; + } else { + // Must always scale in both directions. + sf->predict[0][0][0] = vpx_scaled_2d; + sf->predict[0][0][1] = vpx_scaled_avg_2d; + sf->predict[0][1][0] = vpx_scaled_2d; + sf->predict[0][1][1] = vpx_scaled_avg_2d; + sf->predict[1][0][0] = vpx_scaled_2d; + sf->predict[1][0][1] = vpx_scaled_avg_2d; + } + } + + // 2D subpel motion always gets filtered in both directions + + if ((sf->x_step_q4 != 16) || (sf->y_step_q4 != 16)) { + sf->predict[1][1][0] = vpx_scaled_2d; + sf->predict[1][1][1] = vpx_scaled_avg_2d; + } else { + sf->predict[1][1][0] = vpx_convolve8; + sf->predict[1][1][1] = vpx_convolve8_avg; + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbd) { + if (sf->x_step_q4 == 16) { + if (sf->y_step_q4 == 16) { + // No scaling in either direction. 
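+        // (Assumed indexing convention, mirroring the 8-bit table above:
+        // highbd_predict[x][y][avg] selects on sub-pixel motion in x, on
+        // sub-pixel motion in y, and on whether the result is averaged
+        // into dst for compound prediction.)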
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. + sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg; + } + } else { + if (sf->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. + sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz; + } else { + // Must always scale in both directions. + sf->highbd_predict[0][0][0] = vpx_highbd_convolve8; + sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg; + sf->highbd_predict[0][1][0] = vpx_highbd_convolve8; + sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg; + sf->highbd_predict[1][0][0] = vpx_highbd_convolve8; + sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg; + } + } + // 2D subpel motion always gets filtered in both directions. + sf->highbd_predict[1][1][0] = vpx_highbd_convolve8; + sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg; + } +#endif +} diff --git a/thirdparty/libvpx/vp9/common/vp9_scale.h b/thirdparty/libvpx/vp9/common/vp9_scale.h new file mode 100644 index 000000000000..5e9104107979 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_scale.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_SCALE_H_ +#define VP9_COMMON_VP9_SCALE_H_ + +#include "vp9/common/vp9_mv.h" +#include "vpx_dsp/vpx_convolve.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define REF_SCALE_SHIFT 14 +#define REF_NO_SCALE (1 << REF_SCALE_SHIFT) +#define REF_INVALID_SCALE -1 + +struct scale_factors { + int x_scale_fp; // horizontal fixed point scale factor + int y_scale_fp; // vertical fixed point scale factor + int x_step_q4; + int y_step_q4; + + int (*scale_value_x)(int val, const struct scale_factors *sf); + int (*scale_value_y)(int val, const struct scale_factors *sf); + + convolve_fn_t predict[2][2][2]; // horiz, vert, avg +#if CONFIG_VP9_HIGHBITDEPTH + highbd_convolve_fn_t highbd_predict[2][2][2]; // horiz, vert, avg +#endif +}; + +MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, + int other_w, int other_h, + int this_w, int this_h, + int use_high); +#else +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, + int other_w, int other_h, + int this_w, int this_h); +#endif + +static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; +} + +static INLINE int vp9_is_scaled(const struct scale_factors *sf) { + return vp9_is_valid_scale(sf) && + (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); +} + +static INLINE int valid_ref_frame_size(int ref_width, int ref_height, + int this_width, int this_height) { + return 2 * this_width >= ref_width && + 2 * this_height >= ref_height && + this_width <= 16 * ref_width && + this_height <= 16 * ref_height; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_SCALE_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_scan.c b/thirdparty/libvpx/vp9/common/vp9_scan.c new file mode 100644 index 000000000000..8b8b09f4a33e --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_scan.c @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "vp9/common/vp9_scan.h" + +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { + 0, 4, 1, 5, + 8, 2, 12, 9, + 3, 6, 13, 10, + 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = { + 0, 4, 8, 1, + 12, 5, 9, 2, + 13, 6, 10, 3, + 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = { + 0, 1, 4, 2, + 5, 3, 6, 8, + 9, 7, 12, 10, + 13, 11, 14, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { + 0, 8, 1, 16, 9, 2, 17, 24, + 10, 3, 18, 25, 32, 11, 4, 26, + 33, 19, 40, 12, 34, 27, 5, 41, + 20, 48, 13, 35, 42, 28, 21, 6, + 49, 56, 36, 43, 29, 7, 14, 50, + 57, 44, 22, 37, 15, 51, 58, 30, + 45, 23, 52, 59, 38, 31, 60, 53, + 46, 39, 61, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = { + 0, 8, 16, 1, 24, 9, 32, 17, + 2, 40, 25, 10, 33, 18, 48, 3, + 26, 41, 11, 56, 19, 34, 4, 49, + 27, 42, 12, 35, 20, 57, 50, 28, + 5, 43, 13, 36, 58, 51, 21, 44, + 6, 29, 59, 37, 14, 52, 22, 7, + 45, 60, 30, 15, 38, 53, 23, 46, + 31, 61, 39, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = { + 0, 1, 2, 8, 9, 3, 16, 10, + 4, 17, 11, 24, 5, 18, 25, 12, + 19, 26, 32, 6, 13, 20, 33, 27, + 7, 34, 40, 21, 28, 41, 14, 35, + 48, 42, 29, 36, 49, 22, 43, 15, + 56, 37, 50, 44, 30, 57, 23, 51, + 58, 45, 38, 52, 31, 59, 53, 46, + 60, 39, 61, 47, 54, 55, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, + 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, + 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, + 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146, + 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25, + 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119, + 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194, + 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59, + 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13, + 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, + 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, + 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, + 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, + 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, + 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, + 251, + 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = { + 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, + 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, + 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, + 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85, + 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179, + 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24, + 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227, + 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167, + 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229, + 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, +
200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, + 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, + 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, + 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, + 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, + 236, + 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = { + 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, + 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, + 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, + 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100, + 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102, + 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160, + 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176, + 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136, + 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166, + 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, + 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, + 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, + 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, + 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, + 158, + 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, + 175, + 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, + 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, + 68, 131, 37, 100, + 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, + 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, + 102, 352, 8, 197, + 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, + 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, + 41, 417, 199, 136, + 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, + 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, + 295, 420, 106, 451, + 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, + 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, + 453, 139, 44, 234, + 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108, + 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577, + 486, 77, 204, 362, + 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, + 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, + 111, 238, 48, 143, + 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51, + 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424, + 393, 300, 269, 176, 145, + 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301, + 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581, + 550, 519, 488, 457, 426, 395, + 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737, + 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241, + 210, 179, 117, 86, 55, 738, 707, + 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491, + 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676, + 645, 552, 521, 
428, 397, 304, + 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553, + 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26, + 864, 833, 802, 771, 740, 709, + 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, + 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741, + 710, 679, 617, 586, 555, 493, + 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, + 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, + 743, 619, 495, 371, 247, 123, + 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680, + 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929, + 898, 836, 805, 774, 712, 681, + 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, + 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, + 651, 620, 589, 558, 527, + 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, + 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590, + 559, 497, 466, 435, 373, + 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, + 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, + 499, 375, 251, 127, + 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560, + 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716, + 685, 654, 592, 561, + 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, + 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, + 438, 407, 376, 345, + 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718, + 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998, + 967, 874, 843, 750, + 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, + 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, + 564, 533, 440, 409, + 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534, + 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783, + 752, 721, 690, 659, + 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, + 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, + 350, 319, 1002, 971, + 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631, + 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568, + 537, 444, 413, 972, + 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414, + 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, + 570, 539, 508, 477, + 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571, + 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479, + 1007, 883, 759, 635, 511, + 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945, + 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915, + 884, 853, 822, 791, + 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, + 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607, + 1011, 887, 763, 639, + 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825, + 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, + 702, 671, 1013, 982, + 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, + 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, + 1016, 985, 954, 923, + 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863, + 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, + 990, 959, 1022, 991, 1023, +}; + +// Neighborhood 2-tuples for various scans 
and blocksizes, +// in {top, left} order for each position in corresponding scan order. +DECLARE_ALIGNED(16, static const int16_t, + default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, 2, 2, 5, 9, 12, 6, 9, + 3, 6, 10, 13, 7, 10, 11, 14, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, 9, 2, 2, 6, 6, 2, 2, 3, + 3, 10, 10, 7, 7, 11, 11, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8, 8, 6, 6, 8, 8, 9, 9, 12, + 12, 10, 10, 13, 13, 14, 14, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32, 32, 17, 17, 2, + 2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3, 48, 48, 11, 11, 26, + 26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12, 12, 49, 49, 42, 42, 20, + 20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43, 13, 13, 36, 36, 5, 5, 21, 21, + 51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6, 6, 37, 37, 52, 52, 22, 22, 7, 7, 30, + 30, 45, 45, 15, 15, 38, 38, 23, 23, 53, 53, 31, 31, 46, 46, 39, 39, 54, 54, + 47, 47, 55, 55, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, 16, 10, 10, 16, 16, + 4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, 5, 5, 12, 12, 19, 19, + 32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27, 27, 40, 40, 13, 13, 34, 34, + 40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21, 42, 42, 14, 14, 48, 48, 36, + 36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50, 50, 57, 57, 44, 44, 37, 37, + 51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59, 38, 38, 60, 60, 46, 46, 53, + 53, 54, 54, 61, 61, 62, 62, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2, 2, 10, 17, 17, + 24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32, 4, 11, 26, 33, 19, + 26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34, 41, 20, 27, 13, 20, 5, + 5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6, 6, 13, 42, 49, 49, 56, 36, + 43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, 29, 37, 44, 15, 22, 44, 51, + 51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, 31, 38, 53, 60, 46, 53, 39, + 46, 54, 61, 47, 54, 55, 62, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, 64, + 17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, 65, 65, + 18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128, 128, 3, 3, + 97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35, 113, 113, 3, 3, + 51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67, 67, 20, 20, 83, 83, + 114, 114, 36, 36, 176, 176, 4, 4, 145, 145, 52, 52, 99, 99, 5, 5, + 130, 130, 68, 68, 192, 192, 161, 161, 21, 21, 115, 115, 84, 84, 37, 37, + 146, 146, 208, 208, 53, 53, 5, 5, 100, 100, 177, 177, 131, 131, 69, 69, + 6, 6, 224, 224, 116, 116, 22, 22, 162, 162, 85, 85, 147, 147, 38, 38, + 193, 193, 101, 101, 54, 54, 6, 6, 132, 132, 178, 178, 70, 70, 163, 163, + 209, 209, 7, 7, 117, 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194, + 225, 225, 39, 39, 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8, + 71, 71, 210, 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40, + 56, 56, 134, 134, 180, 
180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211, + 72, 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41, + 135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151, 151, + 197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182, 10, 10, + 26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121, 121, 213, 213, + 58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168, 10, 10, 90, 90, + 229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27, 27, 199, 199, 43, 43, + 184, 184, 122, 122, 169, 169, 230, 230, 59, 59, 11, 11, 75, 75, 138, 138, + 200, 200, 215, 215, 91, 91, 12, 12, 28, 28, 185, 185, 107, 107, 154, 154, + 44, 44, 231, 231, 216, 216, 60, 60, 123, 123, 12, 12, 76, 76, 201, 201, + 170, 170, 232, 232, 139, 139, 92, 92, 13, 13, 108, 108, 29, 29, 186, 186, + 217, 217, 155, 155, 45, 45, 13, 13, 61, 61, 124, 124, 14, 14, 233, 233, + 77, 77, 14, 14, 171, 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109, + 46, 46, 156, 156, 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78, + 31, 31, 172, 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63, + 110, 110, 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219, + 142, 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220, + 220, 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221, + 175, 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223, + 223, 239, 239, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17, 17, + 16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32, 19, 19, + 48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7, 7, 35, 35, + 64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8, 65, 65, 51, 51, + 22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52, 52, 23, 23, 81, 81, + 67, 67, 80, 80, 38, 38, 10, 10, 53, 53, 82, 82, 96, 96, 68, 68, + 24, 24, 97, 97, 83, 83, 39, 39, 96, 96, 54, 54, 11, 11, 69, 69, + 98, 98, 112, 112, 84, 84, 25, 25, 40, 40, 55, 55, 113, 113, 99, 99, + 12, 12, 70, 70, 112, 112, 85, 85, 26, 26, 114, 114, 100, 100, 128, 128, + 41, 41, 56, 56, 71, 71, 115, 115, 13, 13, 86, 86, 129, 129, 101, 101, + 128, 128, 72, 72, 130, 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87, + 42, 42, 144, 144, 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144, + 88, 88, 132, 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43, + 160, 160, 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160, + 74, 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44, + 120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135, 135, + 164, 164, 178, 178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121, 60, 60, + 136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45, 45, 165, 165, + 166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208, 122, 122, 152, 152, + 208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181, 181, 224, 224, 107, 107, + 196, 196, 61, 61, 153, 153, 224, 224, 182, 182, 168, 168, 210, 210, 46, 46, + 138, 138, 92, 92, 183, 183, 225, 225, 211, 211, 240, 240, 197, 197, 169, 169, + 123, 123, 154, 154, 198, 198, 77, 77, 212, 212, 184, 184, 108, 108, 226, 226, + 199, 199, 62, 62, 227, 227, 241, 241, 139, 139, 213, 213, 170, 170, 185, 185, + 155, 155, 228, 228, 242, 242, 124, 124, 93, 93, 200, 200, 243, 243, 214, 214, + 215, 215, 229, 229, 140, 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109, + 156, 156, 244, 244, 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125, + 202, 202, 246, 246, 232, 232, 
172, 172, 217, 217, 141, 141, 110, 110, 157, + 157, 187, 187, 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188, + 203, 203, 142, 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219, + 219, 174, 174, 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235, + 206, 206, 236, 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238, + 238, 253, 253, 254, 254, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17, 32, + 2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, 64, 64, + 34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, 80, 35, 50, + 4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96, 5, 20, 36, 51, + 82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52, 67, 112, 112, 37, 52, + 6, 21, 83, 98, 98, 113, 68, 83, 6, 6, 113, 128, 22, 37, 53, 68, + 84, 99, 99, 114, 128, 128, 114, 129, 69, 84, 38, 53, 7, 22, 7, 7, + 129, 144, 23, 38, 54, 69, 100, 115, 85, 100, 115, 130, 144, 144, 130, 145, + 39, 54, 70, 85, 8, 23, 55, 70, 116, 131, 101, 116, 145, 160, 24, 39, + 8, 8, 86, 101, 131, 146, 160, 160, 146, 161, 71, 86, 40, 55, 9, 24, + 117, 132, 102, 117, 161, 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162, + 9, 9, 176, 176, 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118, + 10, 25, 148, 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192, + 10, 10, 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193, + 164, 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42, + 74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, 58, + 11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, 209, 224, + 195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181, 196, 12, 12, + 210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, 122, 137, 91, 106, + 225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, 183, 211, 226, 153, 168, + 226, 241, 60, 75, 197, 212, 138, 153, 29, 44, 76, 91, 13, 13, 183, 198, + 123, 138, 45, 60, 212, 227, 198, 213, 154, 169, 169, 184, 227, 242, 92, 107, + 61, 76, 139, 154, 14, 29, 14, 14, 184, 199, 213, 228, 108, 123, 199, 214, + 228, 243, 77, 92, 30, 45, 170, 185, 155, 170, 185, 200, 93, 108, 124, 139, + 214, 229, 46, 61, 200, 215, 229, 244, 15, 30, 109, 124, 62, 77, 140, 155, + 215, 230, 31, 46, 171, 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140, + 47, 62, 216, 231, 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217, + 187, 202, 110, 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141, + 203, 218, 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203, + 234, 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235, + 250, 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205, + 236, 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223, + 238, 239, 254, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, + default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = { + 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, 33, 64, + 2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, 97, 128, 3, 34, + 66, 97, 3, 3, 35, 66, 98, 129, 129, 160, 160, 160, 4, 35, 67, 98, + 192, 192, 4, 4, 130, 161, 161, 192, 36, 67, 99, 130, 5, 36, 68, 99, + 193, 224, 162, 193, 224, 224, 131, 162, 37, 68, 100, 131, 5, 5, 194, 225, + 225, 256, 256, 256, 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6, + 195, 226, 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289, + 227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7, 165, 196, + 39, 70, 
102, 133, 290, 321, 259, 290, 228, 259, 321, 352, 352, 352, 197, 228, + 134, 165, 71, 102, 8, 39, 322, 353, 291, 322, 260, 291, 103, 134, 353, 384, + 166, 197, 229, 260, 40, 71, 8, 8, 384, 384, 135, 166, 354, 385, 323, 354, + 198, 229, 292, 323, 72, 103, 261, 292, 9, 40, 385, 416, 167, 198, 104, 135, + 230, 261, 355, 386, 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417, + 199, 230, 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262, + 10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73, 294, 325, + 200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419, 74, 105, 419, 450, + 449, 480, 326, 357, 232, 263, 295, 326, 169, 200, 11, 42, 106, 137, 480, 480, + 450, 481, 358, 389, 264, 295, 201, 232, 138, 169, 389, 420, 43, 74, 420, 451, + 327, 358, 11, 11, 481, 512, 233, 264, 451, 482, 296, 327, 75, 106, 170, 201, + 482, 513, 512, 512, 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233, + 452, 483, 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265, + 297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545, 453, 484, + 544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, 140, 171, 515, + 546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, 45, 76, 172, 203, 330, + 361, 576, 576, 13, 13, 267, 298, 546, 577, 77, 108, 204, 235, 455, 486, 577, + 608, 299, 330, 109, 140, 547, 578, 14, 45, 14, 14, 141, 172, 578, 609, 331, + 362, 46, 77, 173, 204, 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46, + 142, 173, 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142, + 48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 17, 17, 207, 238, 49, 80, + 81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, 51, 82, 83, 114, 608, 608, + 484, 515, 360, 391, 236, 267, 112, 143, 19, 19, 640, 640, 609, 640, 516, 547, + 485, 516, 392, 423, 361, 392, 268, 299, 237, 268, 144, 175, 113, 144, 20, 51, + 20, 20, 672, 672, 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455, + 393, 424, 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114, + 145, 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642, 580, + 611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, 363, 394, + 332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, 146, 177, 115, + 146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736, 674, 705, 643, 674, 581, 612, + 550, 581, 519, 550, 457, 488, 426, 457, 395, 426, 333, 364, 302, 333, 271, + 302, 209, 240, 178, 209, 147, 178, 85, 116, 54, 85, 23, 54, 706, 737, 675, + 706, 582, 613, 551, 582, 458, 489, 427, 458, 334, 365, 303, 334, 210, 241, + 179, 210, 86, 117, 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242, + 87, 118, 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23, + 768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, 365, + 396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24, 800, 800, 769, + 800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, 521, 552, 490, 521, + 428, 459, 397, 428, 366, 397, 304, 335, 273, 304, 242, 273, 180, 211, 149, + 180, 118, 149, 56, 87, 25, 56, 25, 25, 832, 832, 801, 832, 770, 801, 739, + 770, 708, 739, 677, 708, 646, 677, 615, 646, 584, 615, 553, 584, 522, 553, + 491, 522, 460, 491, 429, 460, 398, 429, 367, 398, 336, 367, 305, 336, 274, + 305, 243, 274, 212, 243, 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26, + 57, 26, 26, 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585, + 616, 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337, + 275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 
58, 89, 27, 58, 834, 865, + 803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493, 431, 462, 338, + 369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, 835, 866, 711, 742, 587, + 618, 463, 494, 339, 370, 215, 246, 91, 122, 864, 864, 740, 771, 616, 647, + 492, 523, 368, 399, 244, 275, 120, 151, 27, 27, 896, 896, 865, 896, 772, 803, + 741, 772, 648, 679, 617, 648, 524, 555, 493, 524, 400, 431, 369, 400, 276, + 307, 245, 276, 152, 183, 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866, + 897, 804, 835, 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587, + 525, 556, 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246, + 277, 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960, 929, + 960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, 712, 743, + 681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, 495, 526, 464, + 495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, 278, 309, 247, 278, + 216, 247, 185, 216, 154, 185, 123, 154, 92, 123, 61, 92, 30, 61, 30, 30, + 961, 992, 930, 961, 899, 930, 837, 868, 806, 837, 775, 806, 713, 744, 682, + 713, 651, 682, 589, 620, 558, 589, 527, 558, 465, 496, 434, 465, 403, 434, + 341, 372, 310, 341, 279, 310, 217, 248, 186, 217, 155, 186, 93, 124, 62, 93, + 31, 62, 962, 993, 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621, + 559, 590, 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94, + 125, 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, 219, + 250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, 248, 279, + 124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, 621, 652, 528, + 559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, 156, 187, 125, 156, + 932, 963, 901, 932, 870, 901, 808, 839, 777, 808, 746, 777, 684, 715, 653, + 684, 622, 653, 560, 591, 529, 560, 498, 529, 436, 467, 405, 436, 374, 405, + 312, 343, 281, 312, 250, 281, 188, 219, 157, 188, 126, 157, 964, 995, 933, + 964, 902, 933, 871, 902, 840, 871, 809, 840, 778, 809, 747, 778, 716, 747, + 685, 716, 654, 685, 623, 654, 592, 623, 561, 592, 530, 561, 499, 530, 468, + 499, 437, 468, 406, 437, 375, 406, 344, 375, 313, 344, 282, 313, 251, 282, + 220, 251, 189, 220, 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841, + 872, 810, 841, 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593, + 531, 562, 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221, + 252, 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749, + 687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, 222, + 253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, 347, 378, + 223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, 252, 283, 904, + 935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, 532, 563, 501, 532, + 408, 439, 377, 408, 284, 315, 253, 284, 936, 967, 905, 936, 874, 905, 812, + 843, 781, 812, 750, 781, 688, 719, 657, 688, 626, 657, 564, 595, 533, 564, + 502, 533, 440, 471, 409, 440, 378, 409, 316, 347, 285, 316, 254, 285, 968, + 999, 937, 968, 906, 937, 875, 906, 844, 875, 813, 844, 782, 813, 751, 782, + 720, 751, 689, 720, 658, 689, 627, 658, 596, 627, 565, 596, 534, 565, 503, + 534, 472, 503, 441, 472, 410, 441, 379, 410, 348, 379, 317, 348, 286, 317, + 255, 286, 969, 1000, 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721, + 752, 690, 721, 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473, + 411, 442, 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815, + 846, 722, 753, 691, 722, 598, 629, 567, 598, 474, 
505, 443, 474, 350, 381, + 319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, 876, + 907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, 784, 815, + 753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, 381, 412, 940, + 971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, 692, 723, 661, 692, + 630, 661, 568, 599, 537, 568, 506, 537, 444, 475, 413, 444, 382, 413, 972, + 1003, 941, 972, 910, 941, 879, 910, 848, 879, 817, 848, 786, 817, 755, 786, + 724, 755, 693, 724, 662, 693, 631, 662, 600, 631, 569, 600, 538, 569, 507, + 538, 476, 507, 445, 476, 414, 445, 383, 414, 973, 1004, 942, 973, 911, 942, + 849, 880, 818, 849, 787, 818, 725, 756, 694, 725, 663, 694, 601, 632, 570, + 601, 539, 570, 477, 508, 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881, + 819, 850, 726, 757, 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975, + 1006, 851, 882, 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663, + 508, 539, 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540, + 571, 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789, + 696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, 945, + 976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, 728, 759, + 697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, 511, 542, 977, + 1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, 729, 760, 698, 729, + 667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, 947, 978, 854, 885, 823, + 854, 730, 761, 699, 730, 606, 637, 575, 606, 979, 1010, 855, 886, 731, 762, + 607, 638, 884, 915, 760, 791, 636, 667, 916, 947, 885, 916, 792, 823, 761, + 792, 668, 699, 637, 668, 948, 979, 917, 948, 886, 917, 824, 855, 793, 824, + 762, 793, 700, 731, 669, 700, 638, 669, 980, 1011, 949, 980, 918, 949, 887, + 918, 856, 887, 825, 856, 794, 825, 763, 794, 732, 763, 701, 732, 670, 701, + 639, 670, 981, 1012, 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733, + 764, 702, 733, 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765, + 703, 734, 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889, + 920, 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828, + 766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, 798, + 829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, 799, 830, + 986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, 892, 923, 924, + 955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019, 957, 988, 926, 957, + 895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, 959, 990, 991, 1022, 0, 0, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = { + 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = { + 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = { + 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = { + 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51, + 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56, + 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60, + 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = { + 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39, + 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52, + 18, 22, 25, 31, 35, 41, 50, 57, 26, 
29, 33, 38, 43, 49, 55, 59, + 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = { + 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44, + 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53, + 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60, + 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = { + 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198, + 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212, + 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216, + 3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218, + 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223, + 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228, + 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230, + 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235, + 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237, + 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240, + 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244, + 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247, + 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251, + 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253, + 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254, + 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = { + 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, 86, + 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, 115, 130, + 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, 119, 142, 167, + 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, 116, 135, 161, 185, + 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, 112, 133, 154, 179, 205, + 28, 34, 39, 45, 50, 58, 67, 77, 87, 96, 106, 121, 146, 169, 196, 212, + 41, 46, 49, 56, 63, 70, 79, 90, 98, 107, 122, 138, 159, 182, 207, 222, + 52, 57, 62, 69, 75, 83, 93, 102, 110, 120, 134, 150, 176, 195, 215, 226, + 66, 71, 78, 82, 91, 97, 108, 113, 127, 136, 148, 168, 188, 202, 221, 232, + 80, 89, 92, 101, 105, 114, 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, + 95, 104, 109, 117, 123, 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, + 111, 118, 124, 129, 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, + 243, 126, 132, 137, 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, + 244, 246, 141, 149, 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, + 242, 249, 251, 152, 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, + 245, 247, 252, 253, 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, + 241, 248, 250, 254, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = { + 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, 179, + 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, 178, 196, + 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, 164, 186, 201, + 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, 153, 169, 193, 208, + 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, 133, 161, 176, 198, 214, + 15, 21, 26, 34, 43, 52, 65, 77, 91, 106, 120, 140, 165, 185, 205, 221, + 22, 27, 32, 41, 48, 60, 73, 85, 99, 116, 130, 151, 175, 
190, 211, 225, + 29, 35, 42, 49, 59, 69, 81, 95, 108, 125, 139, 155, 182, 197, 217, 229, + 38, 45, 51, 61, 68, 80, 93, 105, 118, 134, 150, 168, 191, 207, 223, 234, + 50, 56, 63, 74, 83, 94, 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, + 62, 70, 76, 87, 97, 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, + 75, 82, 90, 102, 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, + 89, 100, 111, 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, + 103, 115, 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, + 252, 121, 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, + 251, 254, 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, + 249, 253, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { + 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, 170, 193, 204, + 210, 219, 229, 233, 245, 257, 275, 299, 342, 356, 377, 405, 455, 471, 495, + 527, 1, 4, 8, 15, 22, 30, 45, 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, + 228, 234, 237, 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, + 3, 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, 208, 217, 224, + 231, 235, 238, 273, 297, 316, 329, 375, 403, 425, 440, 493, 525, 550, 567, + 6, 11, 16, 23, 31, 43, 60, 73, 90, 109, 126, 150, 173, 196, 211, 220, 226, + 232, 236, 239, 296, 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, + 9, 14, 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, 223, 244, + 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, 582, 596, 617, 645, + 13, 20, 26, 35, 44, 54, 72, 85, 105, 123, 140, 163, 182, 205, 216, 225, + 254, 271, 294, 314, 353, 373, 400, 423, 468, 491, 522, 548, 595, 616, 644, + 666, 21, 27, 33, 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227, + 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, 643, 665, + 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, 159, 178, 197, 212, 221, 230, + 292, 312, 326, 334, 398, 421, 437, 446, 520, 546, 564, 574, 642, 664, 679, + 687, 34, 40, 46, 56, 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, + 340, 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, 723, + 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, 194, 252, 268, 290, + 311, 351, 370, 396, 420, 466, 488, 518, 545, 593, 613, 640, 663, 704, 722, + 746, 765, 51, 59, 66, 76, 89, 99, 119, 131, 149, 168, 181, 200, 267, 289, + 310, 325, 369, 395, 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, + 745, 764, 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, 288, + 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, 661, 677, 686, + 744, 763, 776, 783, 70, 79, 86, 97, 108, 122, 137, 155, 242, 251, 266, 287, + 339, 350, 368, 393, 452, 465, 486, 515, 580, 592, 611, 637, 692, 703, 720, + 743, 788, 798, 813, 833, 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, + 308, 349, 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719, + 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, 185, 264, + 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, 609, 635, 659, 676, + 718, 741, 761, 775, 811, 831, 847, 858, 117, 128, 136, 148, 160, 175, 188, + 198, 284, 306, 322, 332, 390, 415, 433, 444, 512, 540, 560, 572, 634, 658, + 675, 685, 740, 760, 774, 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, + 249, 263, 283, 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, + 691, 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, 174, + 183, 248, 262, 282, 305, 347, 364, 388, 414, 
462, 482, 510, 539, 589, 607, + 632, 657, 700, 716, 738, 759, 795, 809, 828, 845, 874, 886, 902, 915, 176, + 187, 195, 202, 261, 281, 304, 321, 363, 387, 413, 432, 481, 509, 538, 559, + 606, 631, 656, 674, 715, 737, 758, 773, 808, 827, 844, 856, 885, 901, 914, + 923, 192, 199, 206, 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, + 558, 571, 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900, + 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, 480, 507, + 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, 825, 866, 873, 884, + 899, 930, 936, 945, 957, 246, 259, 278, 302, 345, 361, 384, 411, 460, 479, + 506, 536, 587, 604, 628, 654, 698, 713, 734, 756, 793, 806, 824, 842, 872, + 883, 898, 912, 935, 944, 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, + 478, 505, 535, 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, + 854, 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, 409, + 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, 770, 780, 822, + 840, 853, 861, 896, 910, 920, 926, 954, 964, 971, 975, 336, 344, 359, 381, + 449, 459, 477, 503, 577, 586, 602, 625, 689, 697, 711, 731, 785, 792, 804, + 821, 865, 871, 881, 895, 929, 934, 942, 953, 977, 981, 987, 995, 343, 358, + 380, 408, 458, 476, 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, + 803, 820, 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001, + 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, 729, 752, + 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, 962, 970, 985, 993, + 1000, 1005, 378, 406, 427, 441, 500, 531, 554, 569, 622, 649, 669, 682, 728, + 751, 768, 779, 818, 837, 851, 860, 892, 907, 918, 925, 950, 961, 969, 974, + 992, 999, 1004, 1007, 448, 457, 474, 499, 576, 584, 599, 621, 688, 695, 708, + 727, 784, 790, 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, + 984, 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648, + 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, 938, 948, + 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, 529, 553, 597, + 619, 647, 668, 706, 725, 749, 767, 799, 815, 835, 850, 876, 889, 905, 917, + 937, 947, 959, 968, 982, 989, 997, 1003, 1011, 1015, 1019, 1022, 496, 528, + 552, 568, 618, 646, 667, 681, 724, 748, 766, 778, 814, 834, 849, 859, 888, + 904, 916, 924, 946, 958, 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, + 1023, +}; + +const scan_order vp9_default_scan_orders[TX_SIZES] = { + {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors}, + {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors}, + {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, +}; + +const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = { + { // TX_4X4 + {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors}, + {row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors}, + {col_scan_4x4, vp9_col_iscan_4x4, col_scan_4x4_neighbors}, + {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors} + }, { // TX_8X8 + {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors}, + {row_scan_8x8, vp9_row_iscan_8x8, row_scan_8x8_neighbors}, + {col_scan_8x8, vp9_col_iscan_8x8, col_scan_8x8_neighbors}, + {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors} + }, { // TX_16X16 + {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors}, + {row_scan_16x16, 
vp9_row_iscan_16x16, row_scan_16x16_neighbors}, + {col_scan_16x16, vp9_col_iscan_16x16, col_scan_16x16_neighbors}, + {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors} + }, { // TX_32X32 + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + } +}; diff --git a/thirdparty/libvpx/vp9/common/vp9_scan.h b/thirdparty/libvpx/vp9/common/vp9_scan.h new file mode 100644 index 000000000000..4c1ee8107c25 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_scan.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_SCAN_H_ +#define VP9_COMMON_VP9_SCAN_H_ + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_NEIGHBORS 2 + +typedef struct { + const int16_t *scan; + const int16_t *iscan; + const int16_t *neighbors; +} scan_order; + +extern const scan_order vp9_default_scan_orders[TX_SIZES]; +extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES]; + +static INLINE int get_coef_context(const int16_t *neighbors, + const uint8_t *token_cache, int c) { + return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; +} + +static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { + const MODE_INFO *const mi = xd->mi[0]; + + if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) { + return &vp9_default_scan_orders[tx_size]; + } else { + const PREDICTION_MODE mode = get_y_mode(mi, block_idx); + return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]]; + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_SCAN_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_seg_common.c b/thirdparty/libvpx/vp9/common/vp9_seg_common.c new file mode 100644 index 000000000000..7af61629a098 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_seg_common.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_quant_common.h" + +static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 }; + +static const int seg_feature_data_max[SEG_LVL_MAX] = { + MAXQ, MAX_LOOP_FILTER, 3, 0 }; + +// These functions provide access to new segment level features.
+// Eventually these function may be "optimized out" but for the moment, +// the coding mechanism is still subject to change so these provide a +// convenient single point of change. + +void vp9_clearall_segfeatures(struct segmentation *seg) { + vp9_zero(seg->feature_data); + vp9_zero(seg->feature_mask); + seg->aq_av_offset = 0; +} + +void vp9_enable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_mask[segment_id] |= 1 << feature_id; +} + +int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { + return seg_feature_data_max[feature_id]; +} + +int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { + return seg_feature_data_signed[feature_id]; +} + +void vp9_set_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id, int seg_data) { + assert(seg_data <= seg_feature_data_max[feature_id]); + if (seg_data < 0) { + assert(seg_feature_data_signed[feature_id]); + assert(-seg_data <= seg_feature_data_max[feature_id]); + } + + seg->feature_data[segment_id][feature_id] = seg_data; +} + +const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = { + 2, 4, 6, 8, 10, 12, + 0, -1, -2, -3, -4, -5, -6, -7 +}; + + +// TBD? Functions to read and write segment data with range / validity checking diff --git a/thirdparty/libvpx/vp9/common/vp9_seg_common.h b/thirdparty/libvpx/vp9/common/vp9_seg_common.h new file mode 100644 index 000000000000..99a9440c17ee --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_seg_common.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_SEG_COMMON_H_ +#define VP9_COMMON_VP9_SEG_COMMON_H_ + +#include "vpx_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 + +#define MAX_SEGMENTS 8 +#define SEG_TREE_PROBS (MAX_SEGMENTS-1) + +#define PREDICTION_PROBS 3 + +// Segment level features. +typedef enum { + SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... + SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... 
+ SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame + SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode + SEG_LVL_MAX = 4 // Number of features supported +} SEG_LVL_FEATURES; + + +struct segmentation { + uint8_t enabled; + uint8_t update_map; + uint8_t update_data; + uint8_t abs_delta; + uint8_t temporal_update; + + vpx_prob tree_probs[SEG_TREE_PROBS]; + vpx_prob pred_probs[PREDICTION_PROBS]; + + int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX]; + uint32_t feature_mask[MAX_SEGMENTS]; + int aq_av_offset; +}; + +static INLINE int segfeature_active(const struct segmentation *seg, + int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->enabled && + (seg->feature_mask[segment_id] & (1 << feature_id)); +} + +void vp9_clearall_segfeatures(struct segmentation *seg); + +void vp9_enable_segfeature(struct segmentation *seg, + int segment_id, + SEG_LVL_FEATURES feature_id); + +int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id); + +int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id); + +void vp9_set_segdata(struct segmentation *seg, + int segment_id, + SEG_LVL_FEATURES feature_id, + int seg_data); + +static INLINE int get_segdata(const struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->feature_data[segment_id][feature_id]; +} + +extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_SEG_COMMON_H_ + diff --git a/thirdparty/libvpx/vp9/common/vp9_thread_common.c b/thirdparty/libvpx/vp9/common/vp9_thread_common.c new file mode 100644 index 000000000000..db78d6be8946 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_thread_common.c @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_thread_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_loopfilter.h" + +#if CONFIG_MULTITHREAD +static INLINE void mutex_lock(pthread_mutex_t *const mutex) { + const int kMaxTryLocks = 4000; + int locked = 0; + int i; + + for (i = 0; i < kMaxTryLocks; ++i) { + if (!pthread_mutex_trylock(mutex)) { + locked = 1; + break; + } + } + + if (!locked) + pthread_mutex_lock(mutex); +} +#endif // CONFIG_MULTITHREAD + +static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; + mutex_lock(mutex); + + while (c > lf_sync->cur_sb_col[r - 1] - nsync) { + pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lf_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, + const int sb_cols) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. 
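Aside on the row synchronization implemented by sync_read() above and sync_write() below: row r may filter superblock column c only once row r - 1 has published progress past c + sync_range. A single-threaded sketch of that invariant (demo code; the shipped version blocks on a per-row mutex/condvar pair):

```c
#include <stdio.h>

/* Row r may start column c only once row r - 1 has advanced past
 * c + nsync; sync_read() waits on a per-row condition for this. */
static int may_proceed(const int *cur_sb_col, int r, int c, int nsync) {
  return r == 0 || c <= cur_sb_col[r - 1] - nsync;
}

int main(void) {
  int cur_sb_col[2] = { 5, -1 }; /* row 0 has filtered through column 5 */
  const int nsync = 2;           /* see get_sync_range() */
  printf("row 1, col 3 -> %s\n",
         may_proceed(cur_sb_col, 1, 3, nsync) ? "go" : "wait");
  printf("row 1, col 4 -> %s\n",
         may_proceed(cur_sb_col, 1, 4, nsync) ? "go" : "wait");
  return 0;
}
```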
+ int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) + sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + mutex_lock(&lf_sync->mutex_[r]); + + lf_sync->cur_sb_col[r] = cur; + + pthread_cond_signal(&lf_sync->cond_[r]); + pthread_mutex_unlock(&lf_sync->mutex_[r]); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)sb_cols; +#endif // CONFIG_MULTITHREAD +} + +// Implement row loopfiltering for each thread. +static INLINE +void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer, + VP9_COMMON *const cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only, + VP9LfSync *const lf_sync) { + const int num_planes = y_only ? 1 : MAX_MB_PLANE; + const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + int mi_row, mi_col; + enum lf_path path; + if (y_only) + path = LF_PATH_444; + else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) + path = LF_PATH_420; + else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) + path = LF_PATH_444; + else + path = LF_PATH_SLOW; + + for (mi_row = start; mi_row < stop; + mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) { + MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0); + + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE, ++lfm) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + + sync_read(lf_sync, r, c); + + vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); + + vp9_adjust_mask(cm, mi_row, mi_col, lfm); + + vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm); + for (plane = 1; plane < num_planes; ++plane) { + switch (path) { + case LF_PATH_420: + vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_444: + vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_SLOW: + vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); + break; + } + } + + sync_write(lf_sync, r, c, sb_cols); + } + } +} + +// Row-based multi-threaded loopfilter hook +static int loop_filter_row_worker(VP9LfSync *const lf_sync, + LFWorkerData *const lf_data) { + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + return 1; +} + +static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only, + VPxWorker *workers, int nworkers, + VP9LfSync *lf_sync) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + // Number of superblock rows and cols + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + // Decoder may allocate more threads than number of tiles based on user's + // input. + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = VPXMIN(nworkers, tile_cols); + int i; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + // Set up loopfilter thread data. 
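The dispatch loop that follows uses the usual libvpx worker pattern: jobs 0..n-2 are launched on their own threads and the last job runs on the calling thread, so a single-worker configuration never pays thread overhead. A plain pthread sketch of the same launch/execute/sync shape (illustration only, not the VPxWorker interface):

```c
#include <pthread.h>
#include <stdio.h>

static void *row_job(void *arg) {
  printf("worker %d filtering its rows\n", *(int *)arg);
  return NULL;
}

int main(void) {
  enum { N = 4 };
  pthread_t tid[N - 1];
  int ids[N];
  int i;
  for (i = 0; i < N; ++i) {
    ids[i] = i;
    if (i == N - 1)
      row_job(&ids[i]);                                /* "execute": run inline */
    else
      pthread_create(&tid[i], NULL, row_job, &ids[i]); /* "launch" */
  }
  for (i = 0; i < N - 1; ++i) /* "sync": join the launched workers */
    pthread_join(tid[i], NULL);
  return 0;
}
```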
+ // The decoder is capping num_workers because it has been observed that using + // more threads on the loopfilter than there are cores will hurt performance + // on Android. This is because the system will only schedule the tile decode + // workers on cores equal to the number of tile columns. Then if the decoder + // tries to use more threads for the loopfilter, it will hurt performance + // because of contention. If the multithreading code changes in the future + // then the number of workers used by the loopfilter should be revisited. + for (i = 0; i < num_workers; ++i) { + VPxWorker *const worker = &workers[i]; + LFWorkerData *const lf_data = &lf_sync->lfdata[i]; + + worker->hook = (VPxWorkerHook)loop_filter_row_worker; + worker->data1 = lf_sync; + worker->data2 = lf_data; + + // Loopfilter data + vp9_loop_filter_data_reset(lf_data, frame, cm, planes); + lf_data->start = start + i * MI_BLOCK_SIZE; + lf_data->stop = stop; + lf_data->y_only = y_only; + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&workers[i]); + } +} + +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int frame_filter_level, + int y_only, int partial_frame, + VPxWorker *workers, int num_workers, + VP9LfSync *lf_sync) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + + if (!frame_filter_level) return; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial_frame && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + vp9_loop_filter_frame_init(cm, frame_filter_level); + + loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, + y_only, workers, num_workers, lf_sync); +} + +// Set up nsync by width. +static INLINE int get_sync_range(int width) { + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +} + +// Allocate memory for lf row synchronization +void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, + int width, int num_workers) { + lf_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, lf_sync->mutex_, + vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); + if (lf_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->cond_, + vpx_malloc(sizeof(*lf_sync->cond_) * rows)); + if (lf_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond_[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, lf_sync->lfdata, + vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); + lf_sync->num_workers = num_workers; + + CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, + vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + + // Set up nsync. 
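get_sync_range() above returns powers of two on purpose: sync_read() gates on !(c & (nsync - 1)), which equals c % nsync == 0 only when nsync is a power of two, and it avoids a division on the hot path. A quick illustration:

```c
#include <stdio.h>

int main(void) {
  const int nsync = 4; /* e.g. get_sync_range()'s pick for 4k-wide video */
  int c;
  for (c = 0; c < 10; ++c) {
    /* same test as in sync_read(); valid only for power-of-two nsync */
    if (!(c & (nsync - 1)))
      printf("column %d is a sync point\n", c);
  }
  return 0;
}
```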
+ lf_sync->sync_range = get_sync_range(width); +} + +// Deallocate lf synchronization related mutex and data +void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { + if (lf_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (lf_sync->mutex_ != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex_[i]); + } + vpx_free(lf_sync->mutex_); + } + if (lf_sync->cond_ != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->cond_[i]); + } + vpx_free(lf_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + vpx_free(lf_sync->lfdata); + vpx_free(lf_sync->cur_sb_col); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + vp9_zero(*lf_sync); + } +} + +// Accumulate frame counts. +void vp9_accumulate_frame_counts(FRAME_COUNTS *accum, + const FRAME_COUNTS *counts, int is_dec) { + int i, j, k, l, m; + + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) + for (j = 0; j < INTRA_MODES; j++) + accum->y_mode[i][j] += counts->y_mode[i][j]; + + for (i = 0; i < INTRA_MODES; i++) + for (j = 0; j < INTRA_MODES; j++) + accum->uv_mode[i][j] += counts->uv_mode[i][j]; + + for (i = 0; i < PARTITION_CONTEXTS; i++) + for (j = 0; j < PARTITION_TYPES; j++) + accum->partition[i][j] += counts->partition[i][j]; + + if (is_dec) { + int n; + for (i = 0; i < TX_SIZES; i++) + for (j = 0; j < PLANE_TYPES; j++) + for (k = 0; k < REF_TYPES; k++) + for (l = 0; l < COEF_BANDS; l++) + for (m = 0; m < COEFF_CONTEXTS; m++) { + accum->eob_branch[i][j][k][l][m] += + counts->eob_branch[i][j][k][l][m]; + for (n = 0; n < UNCONSTRAINED_NODES + 1; n++) + accum->coef[i][j][k][l][m][n] += + counts->coef[i][j][k][l][m][n]; + } + } else { + for (i = 0; i < TX_SIZES; i++) + for (j = 0; j < PLANE_TYPES; j++) + for (k = 0; k < REF_TYPES; k++) + for (l = 0; l < COEF_BANDS; l++) + for (m = 0; m < COEFF_CONTEXTS; m++) + accum->eob_branch[i][j][k][l][m] += + counts->eob_branch[i][j][k][l][m]; + // In the encoder, coef is only updated at frame + // level, so not need to accumulate it here. 
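vp9_accumulate_frame_counts() is, at heart, histogram summation: each tile worker decodes into a private FRAME_COUNTS, and the tallies are added together before backward probability adaptation. Reduced to its essence (flat demo arrays instead of the real nested contexts):

```c
#include <stdio.h>

enum { DEMO_BINS = 4 };

static void demo_accumulate(unsigned accum[DEMO_BINS],
                            const unsigned counts[DEMO_BINS]) {
  int i;
  for (i = 0; i < DEMO_BINS; ++i) accum[i] += counts[i];
}

int main(void) {
  unsigned total[DEMO_BINS] = { 0 };
  const unsigned tile0[DEMO_BINS] = { 1, 0, 2, 0 };
  const unsigned tile1[DEMO_BINS] = { 0, 3, 1, 1 };
  demo_accumulate(total, tile0);
  demo_accumulate(total, tile1);
  printf("%u %u %u %u\n", total[0], total[1], total[2], total[3]); /* 1 3 3 1 */
  return 0;
}
```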
+ // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++) + // accum->coef[i][j][k][l][m][n] += + // counts->coef[i][j][k][l][m][n]; + } + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + for (j = 0; j < SWITCHABLE_FILTERS; j++) + accum->switchable_interp[i][j] += counts->switchable_interp[i][j]; + + for (i = 0; i < INTER_MODE_CONTEXTS; i++) + for (j = 0; j < INTER_MODES; j++) + accum->inter_mode[i][j] += counts->inter_mode[i][j]; + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + for (j = 0; j < 2; j++) + accum->intra_inter[i][j] += counts->intra_inter[i][j]; + + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + for (j = 0; j < 2; j++) + accum->comp_inter[i][j] += counts->comp_inter[i][j]; + + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + accum->single_ref[i][j][k] += counts->single_ref[i][j][k]; + + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + accum->comp_ref[i][j] += counts->comp_ref[i][j]; + + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { + for (j = 0; j < TX_SIZES; j++) + accum->tx.p32x32[i][j] += counts->tx.p32x32[i][j]; + + for (j = 0; j < TX_SIZES - 1; j++) + accum->tx.p16x16[i][j] += counts->tx.p16x16[i][j]; + + for (j = 0; j < TX_SIZES - 2; j++) + accum->tx.p8x8[i][j] += counts->tx.p8x8[i][j]; + } + + for (i = 0; i < TX_SIZES; i++) + accum->tx.tx_totals[i] += counts->tx.tx_totals[i]; + + for (i = 0; i < SKIP_CONTEXTS; i++) + for (j = 0; j < 2; j++) + accum->skip[i][j] += counts->skip[i][j]; + + for (i = 0; i < MV_JOINTS; i++) + accum->mv.joints[i] += counts->mv.joints[i]; + + for (k = 0; k < 2; k++) { + nmv_component_counts *const comps = &accum->mv.comps[k]; + const nmv_component_counts *const comps_t = &counts->mv.comps[k]; + + for (i = 0; i < 2; i++) { + comps->sign[i] += comps_t->sign[i]; + comps->class0_hp[i] += comps_t->class0_hp[i]; + comps->hp[i] += comps_t->hp[i]; + } + + for (i = 0; i < MV_CLASSES; i++) + comps->classes[i] += comps_t->classes[i]; + + for (i = 0; i < CLASS0_SIZE; i++) { + comps->class0[i] += comps_t->class0[i]; + for (j = 0; j < MV_FP_SIZE; j++) + comps->class0_fp[i][j] += comps_t->class0_fp[i][j]; + } + + for (i = 0; i < MV_OFFSET_BITS; i++) + for (j = 0; j < 2; j++) + comps->bits[i][j] += comps_t->bits[i][j]; + + for (i = 0; i < MV_FP_SIZE; i++) + comps->fp[i] += comps_t->fp[i]; + } +} diff --git a/thirdparty/libvpx/vp9/common/vp9_thread_common.h b/thirdparty/libvpx/vp9/common/vp9_thread_common.h new file mode 100644 index 000000000000..b3b60c253f29 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_thread_common.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_THREAD_COMMON_H_ +#define VP9_COMMON_VP9_THREAD_COMMON_H_ +#include "./vpx_config.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vpx_util/vpx_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; +struct FRAME_COUNTS; + +// Loopfilter row synchronization +typedef struct VP9LfSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + // Allocate memory to store the loop-filtered superblock index in each row. 
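One subtlety worth calling out: loop_filter_rows_mt() initializes cur_sb_col with memset(..., -1, ...). That is well defined here because every byte of each int becomes 0xFF, which reads back as -1 in two's complement:

```c
#include <stdio.h>
#include <string.h>

int main(void) {
  int cur_sb_col[4];
  /* every byte becomes 0xFF, i.e. -1 per two's-complement int */
  memset(cur_sb_col, -1, sizeof(cur_sb_col));
  printf("%d %d\n", cur_sb_col[0], cur_sb_col[3]); /* -1 -1 */
  return 0;
}
```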
+ int *cur_sb_col; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. + int sync_range; + int rows; + + // Row-based parallel loopfilter data + LFWorkerData *lfdata; + int num_workers; +} VP9LfSync; + +// Allocate memory for loopfilter row synchronization. +void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows, + int width, int num_workers); + +// Deallocate loopfilter synchronization related mutex and data. +void vp9_loop_filter_dealloc(VP9LfSync *lf_sync); + +// Multi-threaded loopfilter that uses the tile threads. +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + struct VP9Common *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int frame_filter_level, + int y_only, int partial_frame, + VPxWorker *workers, int num_workers, + VP9LfSync *lf_sync); + +void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, + const struct FRAME_COUNTS *counts, int is_dec); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_THREAD_COMMON_H_ diff --git a/thirdparty/libvpx/vp9/common/vp9_tile_common.c b/thirdparty/libvpx/vp9/common/vp9_tile_common.c new file mode 100644 index 000000000000..9fcb97c85427 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_tile_common.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_tile_common.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx_dsp/vpx_dsp_common.h" + +#define MIN_TILE_WIDTH_B64 4 +#define MAX_TILE_WIDTH_B64 64 + +static int get_tile_offset(int idx, int mis, int log2) { + const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2; + const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2; + return VPXMIN(offset, mis); +} + +void vp9_tile_set_row(TileInfo *tile, const VP9_COMMON *cm, int row) { + tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows); + tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows); +} + +void vp9_tile_set_col(TileInfo *tile, const VP9_COMMON *cm, int col) { + tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols); + tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols); +} + +void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { + vp9_tile_set_row(tile, cm, row); + vp9_tile_set_col(tile, cm, col); +} + +static int get_min_log2_tile_cols(const int sb64_cols) { + int min_log2 = 0; + while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols) + ++min_log2; + return min_log2; +} + +static int get_max_log2_tile_cols(const int sb64_cols) { + int max_log2 = 1; + while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64) + ++max_log2; + return max_log2 - 1; +} + +void vp9_get_tile_n_bits(int mi_cols, + int *min_log2_tile_cols, int *max_log2_tile_cols) { + const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2; + *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols); + *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols); + assert(*min_log2_tile_cols <= *max_log2_tile_cols); +} diff --git 
a/thirdparty/libvpx/vp9/common/vp9_tile_common.h b/thirdparty/libvpx/vp9/common/vp9_tile_common.h new file mode 100644 index 000000000000..ae58805de1c8 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/vp9_tile_common.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_TILE_COMMON_H_ +#define VP9_COMMON_VP9_TILE_COMMON_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; + +typedef struct TileInfo { + int mi_row_start, mi_row_end; + int mi_col_start, mi_col_end; +} TileInfo; + +// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on +// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' +void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, + int row, int col); + +void vp9_tile_set_row(TileInfo *tile, const struct VP9Common *cm, int row); +void vp9_tile_set_col(TileInfo *tile, const struct VP9Common *cm, int col); + +void vp9_get_tile_n_bits(int mi_cols, + int *min_log2_tile_cols, int *max_log2_tile_cols); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/thirdparty/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/thirdparty/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c new file mode 100644 index 000000000000..1c77b57ff115 --- /dev/null +++ b/thirdparty/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" +#include "vpx_ports/mem.h" + +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[2]; + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8); + + switch (tx_type) { + case 0: // DCT_DCT + idct4_sse2(in); + idct4_sse2(in); + break; + case 1: // ADST_DCT + idct4_sse2(in); + iadst4_sse2(in); + break; + case 2: // DCT_ADST + iadst4_sse2(in); + idct4_sse2(in); + break; + case 3: // ADST_ADST + iadst4_sse2(in); + iadst4_sse2(in); + break; + default: + assert(0); + break; + } + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d2 = _mm_unpacklo_epi32( + d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d0 = _mm_packus_epi16(d0, d2); + // store result[0] + *(int *)dest = _mm_cvtsi128_si32(d0); + // store result[1] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store result[2] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + // store result[3] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + } +} + +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + // load input data + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 1); + in[2] = load_input_data(input + 8 * 2); + in[3] = load_input_data(input + 8 * 3); + in[4] = load_input_data(input + 8 * 4); + in[5] = load_input_data(input + 8 * 5); + in[6] = load_input_data(input + 8 * 6); + in[7] = load_input_data(input + 8 * 7); + + switch (tx_type) { + case 0: // DCT_DCT + idct8_sse2(in); + idct8_sse2(in); + break; + case 1: // ADST_DCT + idct8_sse2(in); + iadst8_sse2(in); + break; + case 2: // DCT_ADST + iadst8_sse2(in); + idct8_sse2(in); + break; + case 3: // ADST_ADST + iadst8_sse2(in); + iadst8_sse2(in); + break; + default: + assert(0); + break; + } + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * 
stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); +} + +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + __m128i in0[16], in1[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + switch (tx_type) { + case 0: // DCT_DCT + idct16_sse2(in0, in1); + idct16_sse2(in0, in1); + break; + case 1: // ADST_DCT + idct16_sse2(in0, in1); + iadst16_sse2(in0, in1); + break; + case 2: // DCT_ADST + iadst16_sse2(in0, in1); + idct16_sse2(in0, in1); + break; + case 3: // ADST_ADST + iadst16_sse2(in0, in1); + iadst16_sse2(in0, in1); + break; + default: + assert(0); + break; + } + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} diff --git a/thirdparty/libvpx/vp9/decoder/vp9_decodeframe.c b/thirdparty/libvpx/vp9/decoder/vp9_decodeframe.c new file mode 100644 index 000000000000..d63912932c19 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_decodeframe.c @@ -0,0 +1,2271 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include // qsort() + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" + +#include "vpx_dsp/bitreader_buffer.h" +#include "vpx_dsp/bitreader.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/mem_ops.h" +#include "vpx_scale/vpx_scale.h" +#include "vpx_util/vpx_thread.h" + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/vp9_thread_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_tile_common.h" + +#include "vp9/decoder/vp9_decodeframe.h" +#include "vp9/decoder/vp9_detokenize.h" +#include "vp9/decoder/vp9_decodemv.h" +#include "vp9/decoder/vp9_decoder.h" +#include "vp9/decoder/vp9_dsubexp.h" + +#define MAX_VP9_HEADER_SIZE 80 + +static int is_compound_reference_allowed(const VP9_COMMON *cm) { + int i; + for (i = 1; i < REFS_PER_FRAME; ++i) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) + return 1; + + return 0; +} + +static void setup_compound_reference_mode(VP9_COMMON *cm) { + if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) { + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } else if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[ALTREF_FRAME]) { + cm->comp_fixed_ref = GOLDEN_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } else { + cm->comp_fixed_ref = LAST_FRAME; + cm->comp_var_ref[0] = GOLDEN_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } +} + +static int read_is_valid(const 
uint8_t *start, size_t len, const uint8_t *end) { + return len != 0 && len <= (size_t)(end - start); +} + +static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) { + const int data = vpx_rb_read_literal(rb, get_unsigned_bits(max)); + return data > max ? max : data; +} + +static TX_MODE read_tx_mode(vpx_reader *r) { + TX_MODE tx_mode = vpx_read_literal(r, 2); + if (tx_mode == ALLOW_32X32) + tx_mode += vpx_read_bit(r); + return tx_mode; +} + +static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) { + int i, j; + + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZES - 3; ++j) + vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); + + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZES - 2; ++j) + vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); + + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZES - 1; ++j) + vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); +} + +static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) { + int i, j; + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) + vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); +} + +static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) { + int i, j; + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + for (j = 0; j < INTER_MODES - 1; ++j) + vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); +} + +static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm, + vpx_reader *r) { + if (is_compound_reference_allowed(cm)) { + return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT + : COMPOUND_REFERENCE) + : SINGLE_REFERENCE; + } else { + return SINGLE_REFERENCE; + } +} + +static void read_frame_reference_mode_probs(VP9_COMMON *cm, vpx_reader *r) { + FRAME_CONTEXT *const fc = cm->fc; + int i; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_inter_prob[i]); + + if (cm->reference_mode != COMPOUND_REFERENCE) + for (i = 0; i < REF_CONTEXTS; ++i) { + vp9_diff_update_prob(r, &fc->single_ref_prob[i][0]); + vp9_diff_update_prob(r, &fc->single_ref_prob[i][1]); + } + + if (cm->reference_mode != SINGLE_REFERENCE) + for (i = 0; i < REF_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_ref_prob[i]); +} + +static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) { + int i; + for (i = 0; i < n; ++i) + if (vpx_read(r, MV_UPDATE_PROB)) + p[i] = (vpx_read_literal(r, 7) << 1) | 1; +} + +static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) { + int i, j; + + update_mv_probs(ctx->joints, MV_JOINTS - 1, r); + + for (i = 0; i < 2; ++i) { + nmv_component *const comp_ctx = &ctx->comps[i]; + update_mv_probs(&comp_ctx->sign, 1, r); + update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r); + update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r); + update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r); + } + + for (i = 0; i < 2; ++i) { + nmv_component *const comp_ctx = &ctx->comps[i]; + for (j = 0; j < CLASS0_SIZE; ++j) + update_mv_probs(comp_ctx->class0_fp[j], MV_FP_SIZE - 1, r); + update_mv_probs(comp_ctx->fp, 3, r); + } + + if (allow_hp) { + for (i = 0; i < 2; ++i) { + nmv_component *const comp_ctx = &ctx->comps[i]; + update_mv_probs(&comp_ctx->class0_hp, 1, r); + update_mv_probs(&comp_ctx->hp, 1, r); + } + } +} + +static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane, + const TX_SIZE tx_size, + uint8_t *dst, int stride, + int eob) { + struct macroblockd_plane *const pd = 
&xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff; + assert(eob > 0); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (xd->lossless) { + vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + } else { + switch (tx_size) { + case TX_4X4: + vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd); + break; + case TX_8X8: + vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd); + break; + case TX_16X16: + vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd); + break; + case TX_32X32: + vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + break; + default: + assert(0 && "Invalid transform size"); + } + } + } else { + if (xd->lossless) { + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + switch (tx_size) { + case TX_4X4: + vp9_idct4x4_add(dqcoeff, dst, stride, eob); + break; + case TX_8X8: + vp9_idct8x8_add(dqcoeff, dst, stride, eob); + break; + case TX_16X16: + vp9_idct16x16_add(dqcoeff, dst, stride, eob); + break; + case TX_32X32: + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(0 && "Invalid transform size"); + return; + } + } + } +#else + if (xd->lossless) { + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + switch (tx_size) { + case TX_4X4: + vp9_idct4x4_add(dqcoeff, dst, stride, eob); + break; + case TX_8X8: + vp9_idct8x8_add(dqcoeff, dst, stride, eob); + break; + case TX_16X16: + vp9_idct16x16_add(dqcoeff, dst, stride, eob); + break; + case TX_32X32: + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(0 && "Invalid transform size"); + return; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (eob == 1) { + dqcoeff[0] = 0; + } else { + if (tx_size <= TX_16X16 && eob <= 10) + memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); + else if (tx_size == TX_32X32 && eob <= 34) + memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); + else + memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); + } +} + +static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane, + const TX_TYPE tx_type, + const TX_SIZE tx_size, + uint8_t *dst, int stride, + int eob) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff; + assert(eob > 0); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (xd->lossless) { + vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + } else { + switch (tx_size) { + case TX_4X4: + vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + break; + case TX_8X8: + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + break; + case TX_16X16: + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + break; + case TX_32X32: + vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + break; + default: + assert(0 && "Invalid transform size"); + } + } + } else { + if (xd->lossless) { + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + switch (tx_size) { + case TX_4X4: + vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_8X8: + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_16X16: + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(0 && "Invalid transform size"); + return; + } + } + } +#else + if (xd->lossless) { + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + switch (tx_size) { + case TX_4X4: + vp9_iht4x4_add(tx_type, dqcoeff, 
dst, stride, eob); + break; + case TX_8X8: + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_16X16: + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(0 && "Invalid transform size"); + return; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (eob == 1) { + dqcoeff[0] = 0; + } else { + if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) + memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); + else if (tx_size == TX_32X32 && eob <= 34) + memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); + else + memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); + } +} + +static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd, + vpx_reader *r, + MODE_INFO *const mi, + int plane, + int row, int col, + TX_SIZE tx_size) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + uint8_t *dst; + dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) + mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode, + dst, pd->dst.stride, dst, pd->dst.stride, + col, row, plane); + + if (!mi->skip) { + const TX_TYPE tx_type = (plane || xd->lossless) ? + DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + const scan_order *sc = (plane || xd->lossless) ? + &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type]; + const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, + r, mi->segment_id); + if (eob > 0) { + inverse_transform_block_intra(xd, plane, tx_type, tx_size, + dst, pd->dst.stride, eob); + } + } +} + +static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r, + MODE_INFO *const mi, int plane, + int row, int col, TX_SIZE tx_size) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r, + mi->segment_id); + + if (eob > 0) { + inverse_transform_block_inter( + xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col], + pd->dst.stride, eob); + } + return eob; +} + +static void build_mc_border(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int x, int y, int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint8_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) + left = b_w; + + if (x + b_w > w) + right = x + b_w - w; + + if (right > b_w) + right = b_w; + + copy = b_w - left - right; + + if (left) + memset(dst, ref_row[0], left); + + if (copy) + memcpy(dst + left, ref_row + x + left, copy); + + if (right) + memset(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) + ref_row += src_stride; + } while (--b_h); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void high_build_mc_border(const uint8_t *src8, int src_stride, + uint16_t *dst, int dst_stride, + int x, int y, int b_w, int b_h, + int w, int h) { + // Get a pointer to the start of the real data for this row. 
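build_mc_border() above assembles each b_w-wide destination row from at most three pieces: a replicated left edge, a memcpy of the in-frame samples, and a replicated right edge; high_build_mc_border() repeats the same logic for 16-bit samples. A sketch of just the per-row split:

```c
#include <stdio.h>

/* A b_w-wide window starting at offset x over a w-wide source row is
 * served from up to three pieces, as in build_mc_border(). */
static void split_row(int x, int b_w, int w, int *left, int *copy, int *right) {
  *left = x < 0 ? -x : 0;
  if (*left > b_w) *left = b_w;
  *right = x + b_w > w ? x + b_w - w : 0;
  if (*right > b_w) *right = b_w;
  *copy = b_w - *left - *right; /* memset left, memcpy copy, memset right */
}

int main(void) {
  int l, c, r;
  split_row(-3, 8, 16, &l, &c, &r); /* window starts 3 px left of the frame */
  printf("left=%d copy=%d right=%d\n", l, c, r); /* left=3 copy=5 right=0 */
  return 0;
}
```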
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) + left = b_w; + + if (x + b_w > w) + right = x + b_w - w; + + if (right > b_w) + right = b_w; + + copy = b_w - left - right; + + if (left) + vpx_memset16(dst, ref_row[0], left); + + if (copy) + memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); + + if (right) + vpx_memset16(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) + ref_row += src_stride; + } while (--b_h); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, + int x0, int y0, int b_w, int b_h, + int frame_width, int frame_height, + int border_offset, + uint8_t *const dst, int dst_buf_stride, + int subpel_x, int subpel_y, + const InterpKernel *kernel, + const struct scale_factors *sf, + MACROBLOCKD *xd, + int w, int h, int ref, int xs, int ys) { + DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]); + const uint8_t *buf_ptr; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, + x0, y0, b_w, b_h, frame_width, frame_height); + buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset; + } else { + build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w, + x0, y0, b_w, b_h, frame_width, frame_height); + buf_ptr = ((uint8_t *)mc_buf_high) + border_offset; + } + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); + } else { + inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); + } +} +#else +static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, + int x0, int y0, int b_w, int b_h, + int frame_width, int frame_height, + int border_offset, + uint8_t *const dst, int dst_buf_stride, + int subpel_x, int subpel_y, + const InterpKernel *kernel, + const struct scale_factors *sf, + int w, int h, int ref, int xs, int ys) { + DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); + const uint8_t *buf_ptr; + + build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w, + x0, y0, b_w, b_h, frame_width, frame_height); + buf_ptr = mc_buf + border_offset; + + inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void dec_build_inter_predictors(VPxWorker *const worker, MACROBLOCKD *xd, + int plane, int bw, int bh, int x, + int y, int w, int h, int mi_x, int mi_y, + const InterpKernel *kernel, + const struct scale_factors *sf, + struct buf_2d *pre_buf, + struct buf_2d *dst_buf, const MV* mv, + RefCntBuffer *ref_frame_buf, + int is_scaled, int ref) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + MV32 scaled_mv; + int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, + buf_stride, subpel_x, subpel_y; + uint8_t *ref_frame, *buf_ptr; + + // Get reference frame pointer, width and height. 
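The motion-vector handling further down splits scaled_mv into a whole-pixel offset (>> SUBPEL_BITS) and a sub-pel phase (& SUBPEL_MASK) that selects the interpolation kernel. A toy example, assuming SUBPEL_BITS == 4 (1/16-pel units) as in libvpx:

```c
#include <stdio.h>

#define DEMO_SUBPEL_BITS 4 /* libvpx SUBPEL_BITS: 1/16-pel precision */
#define DEMO_SUBPEL_MASK ((1 << DEMO_SUBPEL_BITS) - 1)

int main(void) {
  const int mv_col = 37; /* 37/16 pel = 2 whole pixels plus phase 5/16 */
  printf("integer offset: %d, subpel phase: %d\n",
         mv_col >> DEMO_SUBPEL_BITS, mv_col & DEMO_SUBPEL_MASK);
  return 0;
}
```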
+ if (plane == 0) { + frame_width = ref_frame_buf->buf.y_crop_width; + frame_height = ref_frame_buf->buf.y_crop_height; + ref_frame = ref_frame_buf->buf.y_buffer; + } else { + frame_width = ref_frame_buf->buf.uv_crop_width; + frame_height = ref_frame_buf->buf.uv_crop_height; + ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer + : ref_frame_buf->buf.v_buffer; + } + + if (is_scaled) { + const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + // Co-ordinate of containing block to pixel precision. + int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); +#if CONFIG_BETTER_HW_COMPATIBILITY + assert(xd->mi[0]->sb_type != BLOCK_4X8 && + xd->mi[0]->sb_type != BLOCK_8X4); + assert(mv_q4.row == mv->row * (1 << (1 - pd->subsampling_y)) && + mv_q4.col == mv->col * (1 << (1 - pd->subsampling_x))); +#endif + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = (x_start + x) << SUBPEL_BITS; + y0_16 = (y_start + y) << SUBPEL_BITS; + + // Co-ordinate of current block in reference frame + // to 1/16th pixel precision. + x0_16 = sf->scale_value_x(x0_16, sf); + y0_16 = sf->scale_value_y(y0_16, sf); + + // Map the top left corner of the block into the reference frame. + x0 = sf->scale_value_x(x_start + x, sf); + y0 = sf->scale_value_y(y_start + y, sf); + + // Scale the MV and incorporate the sub-pixel offset of the block + // in the reference frame. + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; + } else { + // Co-ordinate of containing block to pixel precision. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = x0 << SUBPEL_BITS; + y0_16 = y0 << SUBPEL_BITS; + + scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y)); + scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x)); + xs = ys = 16; + } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + + // Calculate the top left corner of the best matching block in the + // reference frame. + x0 += scaled_mv.col >> SUBPEL_BITS; + y0 += scaled_mv.row >> SUBPEL_BITS; + x0_16 += scaled_mv.col; + y0_16 += scaled_mv.row; + + // Get reference block pointer. + buf_ptr = ref_frame + y0 * pre_buf->stride + x0; + buf_stride = pre_buf->stride; + + // Do border extension if there is motion or the + // width/height is not a multiple of 8 pixels. + if (is_scaled || scaled_mv.col || scaled_mv.row || + (frame_width & 0x7) || (frame_height & 0x7)) { + int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; + + // Get reference block bottom right horizontal coordinate. + int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; + int x_pad = 0, y_pad = 0; + + if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) { + x0 -= VP9_INTERP_EXTEND - 1; + x1 += VP9_INTERP_EXTEND; + x_pad = 1; + } + + if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) { + y0 -= VP9_INTERP_EXTEND - 1; + y1 += VP9_INTERP_EXTEND; + y_pad = 1; + } + + // Wait until reference block is ready. Pad 7 more pixels as last 7 + // pixels of each superblock row can be changed by next superblock row. + if (worker != NULL) + vp9_frameworker_wait(worker, ref_frame_buf, + VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1)); + + // Skip border extension if block is inside the frame. 
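The test that follows decides whether the reference patch spills outside the frame and therefore needs border extension. Since x1 >= x0 and y1 >= y0, checking the two extreme corners is equivalent to the decoder's longer chain of comparisons; a simplified sketch:

```c
#include <stdio.h>

/* Extension is needed when any part of the (x0,y0)-(x1,y1) patch falls
 * outside [0, w-1] x [0, h-1]. */
static int needs_extension(int x0, int y0, int x1, int y1, int w, int h) {
  return x0 < 0 || x1 > w - 1 || y0 < 0 || y1 > h - 1;
}

int main(void) {
  printf("%d\n", needs_extension(-2, 0, 9, 7, 64, 64));    /* 1: off the left edge */
  printf("%d\n", needs_extension(10, 10, 21, 17, 64, 64)); /* 0: fully inside */
  return 0;
}
```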
+ if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || + y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { + // Extend the border. + const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0; + const int b_w = x1 - x0 + 1; + const int b_h = y1 - y0 + 1; + const int border_offset = y_pad * 3 * b_w + x_pad * 3; + + extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h, + frame_width, frame_height, border_offset, + dst, dst_buf->stride, + subpel_x, subpel_y, + kernel, sf, +#if CONFIG_VP9_HIGHBITDEPTH + xd, +#endif + w, h, ref, xs, ys); + return; + } + } else { + // Wait until reference block is ready. Pad 7 more pixels as last 7 + // pixels of each superblock row can be changed by next superblock row. + if (worker != NULL) { + const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; + vp9_frameworker_wait(worker, ref_frame_buf, + VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1)); + } + } +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); + } else { + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); + } +#else + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, + MACROBLOCKD *xd, + int mi_row, int mi_col) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const MODE_INFO *mi = xd->mi[0]; + const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + const BLOCK_SIZE sb_type = mi->sb_type; + const int is_compound = has_second_ref(mi); + int ref; + int is_scaled; + VPxWorker *const fwo = pbi->frame_parallel_decode ? + pbi->frame_worker_owner : NULL; + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; + RefBuffer *ref_buf = &pbi->common.frame_refs[frame - LAST_FRAME]; + const struct scale_factors *const sf = &ref_buf->sf; + const int idx = ref_buf->idx; + BufferPool *const pool = pbi->common.buffer_pool; + RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx]; + + if (!vp9_is_valid_scale(sf)) + vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + + is_scaled = vp9_is_scaled(sf); + vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, + is_scaled ? 
sf : NULL); + xd->block_refs[ref] = ref_buf; + + if (sb_type < BLOCK_8X8) { + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int n4w_x4 = 4 * num_4x4_w; + const int n4h_x4 = 4 * num_4x4_h; + struct buf_2d *const pre_buf = &pd->pre[ref]; + int i = 0, x, y; + for (y = 0; y < num_4x4_h; ++y) { + for (x = 0; x < num_4x4_w; ++x) { + const MV mv = average_split_mvs(pd, mi, ref, i++); + dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, + 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel, + sf, pre_buf, dst_buf, &mv, + ref_frame_buf, is_scaled, ref); + } + } + } + } else { + const MV mv = mi->mv[ref].as_mv; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int n4w_x4 = 4 * num_4x4_w; + const int n4h_x4 = 4 * num_4x4_h; + struct buf_2d *const pre_buf = &pd->pre[ref]; + dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, + 0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel, + sf, pre_buf, dst_buf, &mv, + ref_frame_buf, is_scaled, ref); + } + } + } +} + +static INLINE TX_SIZE dec_get_uv_tx_size(const MODE_INFO *mi, + int n4_wl, int n4_hl) { + // get minimum log2 num4x4s dimension + const int x = VPXMIN(n4_wl, n4_hl); + return VPXMIN(mi->tx_size, x); +} + +static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_w); + memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_h); + } +} + +static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl, + int bhl) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x; + xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y; + xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x; + xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y; + } +} + +static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int bw, int bh, int x_mis, int y_mis, + int bwl, int bhl) { + const int offset = mi_row * cm->mi_stride + mi_col; + int x, y; + const TileInfo *const tile = &xd->tile; + + xd->mi = cm->mi_grid_visible + offset; + xd->mi[0] = &cm->mi[offset]; + // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of + // passing bsize from decode_partition(). + xd->mi[0]->sb_type = bsize; + for (y = 0; y < y_mis; ++y) + for (x = !y; x < x_mis; ++x) { + xd->mi[y * cm->mi_stride + x] = xd->mi[0]; + } + + set_plane_n4(xd, bw, bh, bwl, bhl); + + set_skip_context(xd, mi_row, mi_col); + + // Distance of Mb to the various image edges. 
These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + return xd->mi[0]; +} + +static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, + int mi_row, int mi_col, + vpx_reader *r, BLOCK_SIZE bsize, + int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int less8x8 = bsize < BLOCK_8X8; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); + const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + + MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, + bw, bh, x_mis, y_mis, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, + VPX_CODEC_CORRUPT_FRAME, "Invalid block size."); + } + + vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis); + + if (mi->skip) { + dec_reset_skip_context(xd); + } + + if (!is_inter_block(mi)) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = + plane ? dec_get_uv_tx_size(mi, pd->n4_wl, pd->n4_hl) + : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? + 0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? + 0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + predict_and_reconstruct_intra_block(xd, r, mi, plane, + row, col, tx_size); + } + } else { + // Prediction + dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col); + + // Reconstruction + if (!mi->skip) { + int eobtotal = 0; + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = + plane ? dec_get_uv_tx_size(mi, pd->n4_wl, pd->n4_hl) + : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? + 0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? + 0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 
0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + eobtotal += reconstruct_inter_block(xd, r, mi, plane, row, col, + tx_size); + } + + if (!less8x8 && eobtotal == 0) + mi->skip = 1; // skip loopfilter + } + } + + xd->corrupted |= vpx_reader_has_error(r); + + if (cm->lf.filter_level) { + vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh); + } +} + +static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd, + int mi_row, int mi_col, + int bsl) { + const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK); + int above = (*above_ctx >> bsl) & 1 , left = (*left_ctx >> bsl) & 1; + +// assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +static INLINE void dec_update_partition_context(MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE subsize, + int bw) { + PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col; + PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK); + + // update the partition context at the end notes. set partition bits + // of block sizes larger than the current one to be one, and partition + // bits of smaller block sizes to be zero. + memset(above_ctx, partition_context_lookup[subsize].above, bw); + memset(left_ctx, partition_context_lookup[subsize].left, bw); +} + +static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, + vpx_reader *r, + int has_rows, int has_cols, int bsl) { + const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl); + const vpx_prob *const probs = get_partition_probs(xd, ctx); + FRAME_COUNTS *counts = xd->counts; + PARTITION_TYPE p; + + if (has_rows && has_cols) + p = (PARTITION_TYPE)vpx_read_tree(r, vp9_partition_tree, probs); + else if (!has_rows && has_cols) + p = vpx_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ; + else if (has_rows && !has_cols) + p = vpx_read(r, probs[2]) ? 
PARTITION_SPLIT : PARTITION_VERT; + else + p = PARTITION_SPLIT; + + if (counts) + ++counts->partition[ctx][p]; + + return p; +} + +// TODO(slavarnway): eliminate bsize and subsize in future commits +static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd, + int mi_row, int mi_col, + vpx_reader* r, BLOCK_SIZE bsize, int n4x4_l2) { + VP9_COMMON *const cm = &pbi->common; + const int n8x8_l2 = n4x4_l2 - 1; + const int num_8x8_wh = 1 << n8x8_l2; + const int hbs = num_8x8_wh >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) + return; + + partition = read_partition(xd, mi_row, mi_col, r, has_rows, has_cols, + n8x8_l2); + subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition); + if (!hbs) { + // calculate bmode block dimensions (log 2) + xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); + xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); + decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1); + } else { + switch (partition) { + case PARTITION_NONE: + decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2); + break; + case PARTITION_HORZ: + decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2); + if (has_rows) + decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2, + n8x8_l2); + break; + case PARTITION_VERT: + decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2); + if (has_cols) + decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2, + n4x4_l2); + break; + case PARTITION_SPLIT: + decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2); + decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2); + decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2); + decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize, + n8x8_l2); + break; + default: + assert(0 && "Invalid partition type"); + } + } + + // update partition context + if (bsize >= BLOCK_8X8 && + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) + dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh); +} + +static void setup_token_decoder(const uint8_t *data, + const uint8_t *data_end, + size_t read_size, + struct vpx_internal_error_info *error_info, + vpx_reader *r, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + // Validate the calculated partition length. If the buffer + // described by the partition can't be fully read, then restrict + // it to the portion that can be (for EC mode) or throw an error. 
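+  // (Here read_is_valid() amounts to a bounds check: it only succeeds when
+  // 'read_size' bytes starting at 'data' fit entirely before 'data_end'.)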
+ if (!read_is_valid(data, read_size, data_end)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + if (vpx_reader_init(r, data, read_size, decrypt_cb, decrypt_state)) + vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder %d", 1); +} + +static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs, + vpx_reader *r) { + int i, j, k, l, m; + + if (vpx_read_bit(r)) + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]); +} + +static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, + vpx_reader *r) { + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) + read_coef_probs_common(fc->coef_probs[tx_size], r); +} + +static void setup_segmentation(struct segmentation *seg, + struct vpx_read_bit_buffer *rb) { + int i, j; + + seg->update_map = 0; + seg->update_data = 0; + + seg->enabled = vpx_rb_read_bit(rb); + if (!seg->enabled) + return; + + // Segmentation map update + seg->update_map = vpx_rb_read_bit(rb); + if (seg->update_map) { + for (i = 0; i < SEG_TREE_PROBS; i++) + seg->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8) + : MAX_PROB; + + seg->temporal_update = vpx_rb_read_bit(rb); + if (seg->temporal_update) { + for (i = 0; i < PREDICTION_PROBS; i++) + seg->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8) + : MAX_PROB; + } else { + for (i = 0; i < PREDICTION_PROBS; i++) + seg->pred_probs[i] = MAX_PROB; + } + } + + // Segmentation data update + seg->update_data = vpx_rb_read_bit(rb); + if (seg->update_data) { + seg->abs_delta = vpx_rb_read_bit(rb); + + vp9_clearall_segfeatures(seg); + + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < SEG_LVL_MAX; j++) { + int data = 0; + const int feature_enabled = vpx_rb_read_bit(rb); + if (feature_enabled) { + vp9_enable_segfeature(seg, i, j); + data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j)); + if (vp9_is_segfeature_signed(j)) + data = vpx_rb_read_bit(rb) ? -data : data; + } + vp9_set_segdata(seg, i, j, data); + } + } + } +} + +static void setup_loopfilter(struct loopfilter *lf, + struct vpx_read_bit_buffer *rb) { + lf->filter_level = vpx_rb_read_literal(rb, 6); + lf->sharpness_level = vpx_rb_read_literal(rb, 3); + + // Read in loop filter deltas applied at the MB level based on mode or ref + // frame. + lf->mode_ref_delta_update = 0; + + lf->mode_ref_delta_enabled = vpx_rb_read_bit(rb); + if (lf->mode_ref_delta_enabled) { + lf->mode_ref_delta_update = vpx_rb_read_bit(rb); + if (lf->mode_ref_delta_update) { + int i; + + for (i = 0; i < MAX_REF_LF_DELTAS; i++) + if (vpx_rb_read_bit(rb)) + lf->ref_deltas[i] = vpx_rb_read_signed_literal(rb, 6); + + for (i = 0; i < MAX_MODE_LF_DELTAS; i++) + if (vpx_rb_read_bit(rb)) + lf->mode_deltas[i] = vpx_rb_read_signed_literal(rb, 6); + } + } +} + +static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) { + return vpx_rb_read_bit(rb) ? 
vpx_rb_read_signed_literal(rb, 4) : 0; +} + +static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, + struct vpx_read_bit_buffer *rb) { + cm->base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS); + cm->y_dc_delta_q = read_delta_q(rb); + cm->uv_dc_delta_q = read_delta_q(rb); + cm->uv_ac_delta_q = read_delta_q(rb); + cm->dequant_bit_depth = cm->bit_depth; + xd->lossless = cm->base_qindex == 0 && + cm->y_dc_delta_q == 0 && + cm->uv_dc_delta_q == 0 && + cm->uv_ac_delta_q == 0; + +#if CONFIG_VP9_HIGHBITDEPTH + xd->bd = (int)cm->bit_depth; +#endif +} + +static void setup_segmentation_dequant(VP9_COMMON *const cm) { + // Build y/uv dequant values based on segmentation. + if (cm->seg.enabled) { + int i; + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = vp9_get_qindex(&cm->seg, i, cm->base_qindex); + cm->y_dequant[i][0] = vp9_dc_quant(qindex, cm->y_dc_delta_q, + cm->bit_depth); + cm->y_dequant[i][1] = vp9_ac_quant(qindex, 0, cm->bit_depth); + cm->uv_dequant[i][0] = vp9_dc_quant(qindex, cm->uv_dc_delta_q, + cm->bit_depth); + cm->uv_dequant[i][1] = vp9_ac_quant(qindex, cm->uv_ac_delta_q, + cm->bit_depth); + } + } else { + const int qindex = cm->base_qindex; + // When segmentation is disabled, only the first value is used. The + // remaining are don't cares. + cm->y_dequant[0][0] = vp9_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth); + cm->y_dequant[0][1] = vp9_ac_quant(qindex, 0, cm->bit_depth); + cm->uv_dequant[0][0] = vp9_dc_quant(qindex, cm->uv_dc_delta_q, + cm->bit_depth); + cm->uv_dequant[0][1] = vp9_ac_quant(qindex, cm->uv_ac_delta_q, + cm->bit_depth); + } +} + +static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) { + const INTERP_FILTER literal_to_filter[] = { EIGHTTAP_SMOOTH, + EIGHTTAP, + EIGHTTAP_SHARP, + BILINEAR }; + return vpx_rb_read_bit(rb) ? SWITCHABLE + : literal_to_filter[vpx_rb_read_literal(rb, 2)]; +} + +static void setup_render_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { + cm->render_width = cm->width; + cm->render_height = cm->height; + if (vpx_rb_read_bit(rb)) + vp9_read_frame_size(rb, &cm->render_width, &cm->render_height); +} + +static void resize_mv_buffer(VP9_COMMON *cm) { + vpx_free(cm->cur_frame->mvs); + cm->cur_frame->mi_rows = cm->mi_rows; + cm->cur_frame->mi_cols = cm->mi_cols; + CHECK_MEM_ERROR(cm, cm->cur_frame->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*cm->cur_frame->mvs))); +} + +static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Dimensions of %dx%d beyond allowed size of %dx%d.", + width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); +#endif + if (cm->width != width || cm->height != height) { + const int new_mi_rows = + ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + const int new_mi_cols = + ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + + // Allocations in vp9_alloc_context_buffers() depend on individual + // dimensions as well as the overall size. 
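+  // For example, shrinking both dimensions reuses the existing buffers,
+  // while growing either one past its previous maximum forces the
+  // vp9_alloc_context_buffers() call below.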
+ if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) { + if (vp9_alloc_context_buffers(cm, width, height)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } else { + vp9_set_mb_mi(cm, width, height); + } + vp9_init_context_buffers(cm); + cm->width = width; + cm->height = height; + } + if (cm->cur_frame->mvs == NULL || cm->mi_rows > cm->cur_frame->mi_rows || + cm->mi_cols > cm->cur_frame->mi_cols) { + resize_mv_buffer(cm); + } +} + +static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { + int width, height; + BufferPool *const pool = cm->buffer_pool; + vp9_read_frame_size(rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + + lock_buffer_pool(pool); + if (vpx_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_DEC_BORDER_IN_PIXELS, + cm->byte_alignment, + &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, + pool->cb_priv)) { + unlock_buffer_pool(pool); + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + unlock_buffer_pool(pool); + + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; + pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; + pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; + pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range; + pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width; + pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; +} + +static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth, + int ref_xss, int ref_yss, + vpx_bit_depth_t this_bit_depth, + int this_xss, int this_yss) { + return ref_bit_depth == this_bit_depth && ref_xss == this_xss && + ref_yss == this_yss; +} + +static void setup_frame_size_with_refs(VP9_COMMON *cm, + struct vpx_read_bit_buffer *rb) { + int width, height; + int found = 0, i; + int has_valid_ref_frame = 0; + BufferPool *const pool = cm->buffer_pool; + for (i = 0; i < REFS_PER_FRAME; ++i) { + if (vpx_rb_read_bit(rb)) { + if (cm->frame_refs[i].idx != INVALID_IDX) { + YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; + width = buf->y_crop_width; + height = buf->y_crop_height; + found = 1; + break; + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode frame size"); + } + } + } + + if (!found) + vp9_read_frame_size(rb, &width, &height); + + if (width <= 0 || height <= 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame size"); + + // Check to make sure at least one of frames that this frame references + // has valid dimensions. 
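+  // (valid_ref_frame_size() bounds how far a reference's dimensions may
+  // diverge from the current frame so that the scale factors remain
+  // representable.)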
+ for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_frame = &cm->frame_refs[i]; + has_valid_ref_frame |= (ref_frame->idx != INVALID_IDX && + valid_ref_frame_size(ref_frame->buf->y_crop_width, + ref_frame->buf->y_crop_height, + width, height)); + } + if (!has_valid_ref_frame) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Referenced frame has invalid size"); + for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_frame = &cm->frame_refs[i]; + if (ref_frame->idx == INVALID_IDX || + !valid_ref_frame_img_fmt(ref_frame->buf->bit_depth, + ref_frame->buf->subsampling_x, + ref_frame->buf->subsampling_y, + cm->bit_depth, + cm->subsampling_x, + cm->subsampling_y)) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Referenced frame has incompatible color format"); + } + + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + + lock_buffer_pool(pool); + if (vpx_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_DEC_BORDER_IN_PIXELS, + cm->byte_alignment, + &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, + pool->cb_priv)) { + unlock_buffer_pool(pool); + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + unlock_buffer_pool(pool); + + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; + pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; + pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; + pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space; + pool->frame_bufs[cm->new_fb_idx].buf.color_range = cm->color_range; + pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width; + pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height; +} + +static void setup_tile_info(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { + int min_log2_tile_cols, max_log2_tile_cols, max_ones; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + + // columns + max_ones = max_log2_tile_cols - min_log2_tile_cols; + cm->log2_tile_cols = min_log2_tile_cols; + while (max_ones-- && vpx_rb_read_bit(rb)) + cm->log2_tile_cols++; + + if (cm->log2_tile_cols > 6) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid number of tile columns"); + + // rows + cm->log2_tile_rows = vpx_rb_read_bit(rb); + if (cm->log2_tile_rows) + cm->log2_tile_rows += vpx_rb_read_bit(rb); +} + +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. 
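+// Every tile except the last carries a 4-byte big-endian length prefix;
+// e.g. the prefix bytes 00 00 01 2a announce a 298-byte tile payload. The
+// last tile has no prefix and simply runs to the end of the frame data.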
+static void get_tile_buffer(const uint8_t *const data_end, + int is_last, + struct vpx_internal_error_info *error_info, + const uint8_t **data, + vpx_decrypt_cb decrypt_cb, void *decrypt_state, + TileBuffer *buf) { + size_t size; + + if (!is_last) { + if (!read_is_valid(*data, 4, data_end)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + if (decrypt_cb) { + uint8_t be_data[4]; + decrypt_cb(decrypt_state, *data, be_data, 4); + size = mem_get_be32(be_data); + } else { + size = mem_get_be32(*data); + } + *data += 4; + + if (size > (size_t)(data_end - *data)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile size"); + } else { + size = data_end - *data; + } + + buf->data = *data; + buf->size = size; + + *data += size; +} + +static void get_tile_buffers(VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + int tile_cols, int tile_rows, + TileBuffer (*tile_buffers)[1 << 6]) { + int r, c; + + for (r = 0; r < tile_rows; ++r) { + for (c = 0; c < tile_cols; ++c) { + const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1); + TileBuffer *const buf = &tile_buffers[r][c]; + buf->col = c; + get_tile_buffer(data_end, is_last, &pbi->common.error, &data, + pbi->decrypt_cb, pbi->decrypt_state, buf); + } + } +} + +static const uint8_t *decode_tiles(VP9Decoder *pbi, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + TileBuffer tile_buffers[4][1 << 6]; + int tile_row, tile_col; + int mi_row, mi_col; + TileWorkerData *tile_data = NULL; + + if (cm->lf.filter_level && !cm->skip_loop_filter && + pbi->lf_worker.data1 == NULL) { + CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, + vpx_memalign(32, sizeof(LFWorkerData))); + pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker; + if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Loop filter thread creation failed"); + } + } + + if (cm->lf.filter_level && !cm->skip_loop_filter) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + // Be sure to sync as we might be resuming after a failed frame decode. + winterface->sync(&pbi->lf_worker); + vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm, + pbi->mb.plane); + } + + assert(tile_rows <= 4); + assert(tile_cols <= (1 << 6)); + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols); + + memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_cols); + + vp9_reset_lfm(cm); + + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers); + + // Load all tile information into tile_data. + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + const TileBuffer *const buf = &tile_buffers[tile_row][tile_col]; + tile_data = pbi->tile_worker_data + tile_cols * tile_row + tile_col; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? 
NULL : &cm->counts; + vp9_zero(tile_data->dqcoeff); + vp9_tile_init(&tile_data->xd.tile, cm, tile_row, tile_col); + setup_token_decoder(buf->data, data_end, buf->size, &cm->error, + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); + vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff); + } + } + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + TileInfo tile; + vp9_tile_set_row(&tile, cm, tile_row); + for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; + mi_row += MI_BLOCK_SIZE) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int col = pbi->inv_tile_order ? + tile_cols - tile_col - 1 : tile_col; + tile_data = pbi->tile_worker_data + tile_cols * tile_row + col; + vp9_tile_set_col(&tile, cm, col); + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_partition(pbi, &tile_data->xd, mi_row, + mi_col, &tile_data->bit_reader, BLOCK_64X64, 4); + } + pbi->mb.corrupted |= tile_data->xd.corrupted; + if (pbi->mb.corrupted) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + } + // Loopfilter one row. + if (cm->lf.filter_level && !cm->skip_loop_filter) { + const int lf_start = mi_row - MI_BLOCK_SIZE; + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + + // delay the loopfilter by 1 macroblock row. + if (lf_start < 0) continue; + + // decoding has completed: finish up the loop filter in this thread. + if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue; + + winterface->sync(&pbi->lf_worker); + lf_data->start = lf_start; + lf_data->stop = mi_row; + if (pbi->max_threads > 1) { + winterface->launch(&pbi->lf_worker); + } else { + winterface->execute(&pbi->lf_worker); + } + } + // After loopfiltering, the last 7 row pixels in each superblock row may + // still be changed by the longest loopfilter of the next superblock + // row. + if (pbi->frame_parallel_decode) + vp9_frameworker_broadcast(pbi->cur_buf, + mi_row << MI_BLOCK_SIZE_LOG2); + } + } + + // Loopfilter remaining rows in the frame. + if (cm->lf.filter_level && !cm->skip_loop_filter) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + winterface->sync(&pbi->lf_worker); + lf_data->start = lf_data->stop; + lf_data->stop = cm->mi_rows; + winterface->execute(&pbi->lf_worker); + } + + // Get last tile data. + tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1; + + if (pbi->frame_parallel_decode) + vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX); + return vpx_reader_find_end(&tile_data->bit_reader); +} + +// On entry 'tile_data->data_end' points to the end of the input frame, on exit +// it is updated to reflect the bitreader position of the final tile column if +// present in the tile buffer group or NULL otherwise. 
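+// Errors raised while a worker decodes are handled with setjmp/longjmp:
+// vpx_internal_error() jumps back to the setjmp() at the top of the hook,
+// which flags the tile data as corrupted and returns 0.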
+static int tile_worker_hook(TileWorkerData *const tile_data, + VP9Decoder *const pbi) { + TileInfo *volatile tile = &tile_data->xd.tile; + const int final_col = (1 << pbi->common.log2_tile_cols) - 1; + const uint8_t *volatile bit_reader_end = NULL; + volatile int n = tile_data->buf_start; + tile_data->error_info.setjmp = 1; + + if (setjmp(tile_data->error_info.jmp)) { + tile_data->error_info.setjmp = 0; + tile_data->xd.corrupted = 1; + tile_data->data_end = NULL; + return 0; + } + + tile_data->xd.error_info = &tile_data->error_info; + tile_data->xd.corrupted = 0; + + do { + int mi_row, mi_col; + const TileBuffer *const buf = pbi->tile_buffers + n; + vp9_zero(tile_data->dqcoeff); + vp9_tile_init(tile, &pbi->common, 0, buf->col); + setup_token_decoder(buf->data, tile_data->data_end, buf->size, + &tile_data->error_info, &tile_data->bit_reader, + pbi->decrypt_cb, pbi->decrypt_state); + vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff); + + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; + mi_row += MI_BLOCK_SIZE) { + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_partition(pbi, &tile_data->xd, mi_row, mi_col, + &tile_data->bit_reader, BLOCK_64X64, 4); + } + } + + if (buf->col == final_col) { + bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader); + } + } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end); + + tile_data->data_end = bit_reader_end; + return !tile_data->xd.corrupted; +} + +// sorts in descending order +static int compare_tile_buffers(const void *a, const void *b) { + const TileBuffer *const buf1 = (const TileBuffer*)a; + const TileBuffer *const buf2 = (const TileBuffer*)b; + return (int)(buf2->size - buf1->size); +} + +static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const uint8_t *bit_reader_end = NULL; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = VPXMIN(pbi->max_threads, tile_cols); + int n; + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + if (pbi->num_tile_workers == 0) { + const int num_threads = pbi->max_threads; + CHECK_MEM_ERROR(cm, pbi->tile_workers, + vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); + for (n = 0; n < num_threads; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + ++pbi->num_tile_workers; + + winterface->init(worker); + if (n < num_threads - 1 && !winterface->reset(worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + } + } + + // Reset tile decoding hook + for (n = 0; n < num_workers; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + TileWorkerData *const tile_data = + &pbi->tile_worker_data[n + pbi->total_tiles]; + winterface->sync(worker); + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; + worker->hook = (VPxWorkerHook)tile_worker_hook; + worker->data1 = tile_data; + worker->data2 = pbi; + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. 
+ memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); + memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_mi_cols); + + vp9_reset_lfm(cm); + + // Load tile data into tile_buffers + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, + &pbi->tile_buffers); + + // Sort the buffers based on size in descending order. + qsort(pbi->tile_buffers, tile_cols, sizeof(pbi->tile_buffers[0]), + compare_tile_buffers); + + if (num_workers == tile_cols) { + // Rearrange the tile buffers such that the largest, and + // presumably the most difficult, tile will be decoded in the main thread. + // This should help minimize the number of instances where the main thread + // is waiting for a worker to complete. + const TileBuffer largest = pbi->tile_buffers[0]; + memmove(pbi->tile_buffers, pbi->tile_buffers + 1, + (tile_cols - 1) * sizeof(pbi->tile_buffers[0])); + pbi->tile_buffers[tile_cols - 1] = largest; + } else { + int start = 0, end = tile_cols - 2; + TileBuffer tmp; + + // Interleave the tiles to distribute the load between threads, assuming a + // larger tile implies it is more difficult to decode. + while (start < end) { + tmp = pbi->tile_buffers[start]; + pbi->tile_buffers[start] = pbi->tile_buffers[end]; + pbi->tile_buffers[end] = tmp; + start += 2; + end -= 2; + } + } + + // Initialize thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (n = 0; n < num_workers; ++n) { + TileWorkerData *const tile_data = + (TileWorkerData*)pbi->tile_workers[n].data1; + vp9_zero(tile_data->counts); + } + } + + { + const int base = tile_cols / num_workers; + const int remain = tile_cols % num_workers; + int buf_start = 0; + + for (n = 0; n < num_workers; ++n) { + const int count = base + (remain + n) / num_workers; + VPxWorker *const worker = &pbi->tile_workers[n]; + TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; + + tile_data->buf_start = buf_start; + tile_data->buf_end = buf_start + count - 1; + tile_data->data_end = data_end; + buf_start += count; + + worker->had_error = 0; + if (n == num_workers - 1) { + assert(tile_data->buf_end == tile_cols - 1); + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + for (; n > 0; --n) { + VPxWorker *const worker = &pbi->tile_workers[n - 1]; + TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; + // TODO(jzern): The tile may have specific error data associated with + // its vpx_internal_error_info which could be propagated to the main info + // in cm. Additionally once the threads have been synced and an error is + // detected, there's no point in continuing to decode tiles. + pbi->mb.corrupted |= !winterface->sync(worker); + if (!bit_reader_end) bit_reader_end = tile_data->data_end; + } + } + + // Accumulate thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (n = 0; n < num_workers; ++n) { + TileWorkerData *const tile_data = + (TileWorkerData*)pbi->tile_workers[n].data1; + vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1); + } + } + + assert(bit_reader_end || pbi->mb.corrupted); + return bit_reader_end; +} + +static void error_handler(void *data) { + VP9_COMMON *const cm = (VP9_COMMON *)data; + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); +} + +static void read_bitdepth_colorspace_sampling( + VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { + if (cm->profile >= PROFILE_2) { + cm->bit_depth = vpx_rb_read_bit(rb) ? 
VPX_BITS_12 : VPX_BITS_10; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = 1; +#endif + } else { + cm->bit_depth = VPX_BITS_8; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = 0; +#endif + } + cm->color_space = vpx_rb_read_literal(rb, 3); + if (cm->color_space != VPX_CS_SRGB) { + cm->color_range = (vpx_color_range_t)vpx_rb_read_bit(rb); + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + cm->subsampling_x = vpx_rb_read_bit(rb); + cm->subsampling_y = vpx_rb_read_bit(rb); + if (cm->subsampling_x == 1 && cm->subsampling_y == 1) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "4:2:0 color not supported in profile 1 or 3"); + if (vpx_rb_read_bit(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reserved bit set"); + } else { + cm->subsampling_y = cm->subsampling_x = 1; + } + } else { + cm->color_range = VPX_CR_FULL_RANGE; + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed. + // 4:2:2 or 4:4:0 chroma sampling is not allowed. + cm->subsampling_y = cm->subsampling_x = 0; + if (vpx_rb_read_bit(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reserved bit set"); + } else { + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "4:4:4 color not supported in profile 0 or 2"); + } + } +} + +static size_t read_uncompressed_header(VP9Decoder *pbi, + struct vpx_read_bit_buffer *rb) { + VP9_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = pool->frame_bufs; + int i, mask, ref_index = 0; + size_t sz; + + cm->last_frame_type = cm->frame_type; + cm->last_intra_only = cm->intra_only; + + if (vpx_rb_read_literal(rb, 2) != VP9_FRAME_MARKER) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame marker"); + + cm->profile = vp9_read_profile(rb); +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->profile >= MAX_PROFILES) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Unsupported bitstream profile"); +#else + if (cm->profile >= PROFILE_2) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Unsupported bitstream profile"); +#endif + + cm->show_existing_frame = vpx_rb_read_bit(rb); + if (cm->show_existing_frame) { + // Show an existing frame directly. 
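+    // A 3-bit index selects one of the eight reference slots; that buffer
+    // is re-displayed as-is and no further frame data is decoded.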
+ const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)]; + lock_buffer_pool(pool); + if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { + unlock_buffer_pool(pool); + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Buffer %d does not contain a decoded frame", + frame_to_show); + } + + ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); + unlock_buffer_pool(pool); + pbi->refresh_frame_flags = 0; + cm->lf.filter_level = 0; + cm->show_frame = 1; + + if (pbi->frame_parallel_decode) { + for (i = 0; i < REF_FRAMES; ++i) + cm->next_ref_frame_map[i] = cm->ref_frame_map[i]; + } + return 0; + } + + cm->frame_type = (FRAME_TYPE) vpx_rb_read_bit(rb); + cm->show_frame = vpx_rb_read_bit(rb); + cm->error_resilient_mode = vpx_rb_read_bit(rb); + + if (cm->frame_type == KEY_FRAME) { + if (!vp9_read_sync_code(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + + read_bitdepth_colorspace_sampling(cm, rb); + pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; + + for (i = 0; i < REFS_PER_FRAME; ++i) { + cm->frame_refs[i].idx = INVALID_IDX; + cm->frame_refs[i].buf = NULL; + } + + setup_frame_size(cm, rb); + if (pbi->need_resync) { + memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + pbi->need_resync = 0; + } + } else { + cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb); + + cm->reset_frame_context = cm->error_resilient_mode ? + 0 : vpx_rb_read_literal(rb, 2); + + if (cm->intra_only) { + if (!vp9_read_sync_code(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + if (cm->profile > PROFILE_0) { + read_bitdepth_colorspace_sampling(cm, rb); + } else { + // NOTE: The intra-only frame header does not include the specification + // of either the color format or color sub-sampling in profile 0. VP9 + // specifies that the default color format should be YUV 4:2:0 in this + // case (normative). 
+ cm->color_space = VPX_CS_BT_601; + cm->color_range = VPX_CR_STUDIO_RANGE; + cm->subsampling_y = cm->subsampling_x = 1; + cm->bit_depth = VPX_BITS_8; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = 0; +#endif + } + + pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES); + setup_frame_size(cm, rb); + if (pbi->need_resync) { + memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + pbi->need_resync = 0; + } + } else if (pbi->need_resync != 1) { /* Skip if need resync */ + pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES); + for (i = 0; i < REFS_PER_FRAME; ++i) { + const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2); + const int idx = cm->ref_frame_map[ref]; + RefBuffer *const ref_frame = &cm->frame_refs[i]; + ref_frame->idx = idx; + ref_frame->buf = &frame_bufs[idx].buf; + cm->ref_frame_sign_bias[LAST_FRAME + i] = vpx_rb_read_bit(rb); + } + + setup_frame_size_with_refs(cm, rb); + + cm->allow_high_precision_mv = vpx_rb_read_bit(rb); + cm->interp_filter = read_interp_filter(rb); + + for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_buf = &cm->frame_refs[i]; +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&ref_buf->sf, + ref_buf->buf->y_crop_width, + ref_buf->buf->y_crop_height, + cm->width, cm->height, + cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame(&ref_buf->sf, + ref_buf->buf->y_crop_width, + ref_buf->buf->y_crop_height, + cm->width, cm->height); +#endif + } + } + } +#if CONFIG_VP9_HIGHBITDEPTH + get_frame_new_buffer(cm)->bit_depth = cm->bit_depth; +#endif + get_frame_new_buffer(cm)->color_space = cm->color_space; + get_frame_new_buffer(cm)->color_range = cm->color_range; + get_frame_new_buffer(cm)->render_width = cm->render_width; + get_frame_new_buffer(cm)->render_height = cm->render_height; + + if (pbi->need_resync) { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } + + if (!cm->error_resilient_mode) { + cm->refresh_frame_context = vpx_rb_read_bit(rb); + cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb); + if (!cm->frame_parallel_decoding_mode) + vp9_zero(cm->counts); + } else { + cm->refresh_frame_context = 0; + cm->frame_parallel_decoding_mode = 1; + } + + // This flag will be overridden by the call to vp9_setup_past_independence + // below, forcing the use of context 0 for those frame types. + cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); + + // Generate next_ref_frame_map. + lock_buffer_pool(pool); + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + if (mask & 1) { + cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; + ++frame_bufs[cm->new_fb_idx].ref_count; + } else { + cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; + } + // Current thread holds the reference frame. + if (cm->ref_frame_map[ref_index] >= 0) + ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; + ++ref_index; + } + + for (; ref_index < REF_FRAMES; ++ref_index) { + cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; + // Current thread holds the reference frame. 
+ if (cm->ref_frame_map[ref_index] >= 0) + ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; + } + unlock_buffer_pool(pool); + pbi->hold_ref_buf = 1; + + if (frame_is_intra_only(cm) || cm->error_resilient_mode) + vp9_setup_past_independence(cm); + + setup_loopfilter(&cm->lf, rb); + setup_quantization(cm, &pbi->mb, rb); + setup_segmentation(&cm->seg, rb); + setup_segmentation_dequant(cm); + + setup_tile_info(cm, rb); + sz = vpx_rb_read_literal(rb, 16); + + if (sz == 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid header size"); + + return sz; +} + +static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data, + size_t partition_size) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + FRAME_CONTEXT *const fc = cm->fc; + vpx_reader r; + int k; + + if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb, + pbi->decrypt_state)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder 0"); + + cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r); + if (cm->tx_mode == TX_MODE_SELECT) + read_tx_mode_probs(&fc->tx_probs, &r); + read_coef_probs(fc, cm->tx_mode, &r); + + for (k = 0; k < SKIP_CONTEXTS; ++k) + vp9_diff_update_prob(&r, &fc->skip_probs[k]); + + if (!frame_is_intra_only(cm)) { + nmv_context *const nmvc = &fc->nmvc; + int i, j; + + read_inter_mode_probs(fc, &r); + + if (cm->interp_filter == SWITCHABLE) + read_switchable_interp_probs(fc, &r); + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]); + + cm->reference_mode = read_frame_reference_mode(cm, &r); + if (cm->reference_mode != SINGLE_REFERENCE) + setup_compound_reference_mode(cm); + read_frame_reference_mode_probs(cm, &r); + + for (j = 0; j < BLOCK_SIZE_GROUPS; j++) + for (i = 0; i < INTRA_MODES - 1; ++i) + vp9_diff_update_prob(&r, &fc->y_mode_prob[j][i]); + + for (j = 0; j < PARTITION_CONTEXTS; ++j) + for (i = 0; i < PARTITION_TYPES - 1; ++i) + vp9_diff_update_prob(&r, &fc->partition_prob[j][i]); + + read_mv_probs(nmvc, cm->allow_high_precision_mv, &r); + } + + return vpx_reader_has_error(&r); +} + +static struct vpx_read_bit_buffer *init_read_bit_buffer( + VP9Decoder *pbi, + struct vpx_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t *data_end, + uint8_t clear_data[MAX_VP9_HEADER_SIZE]) { + rb->bit_offset = 0; + rb->error_handler = error_handler; + rb->error_handler_data = &pbi->common; + if (pbi->decrypt_cb) { + const int n = (int)VPXMIN(MAX_VP9_HEADER_SIZE, data_end - data); + pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n); + rb->bit_buffer = clear_data; + rb->bit_buffer_end = clear_data + n; + } else { + rb->bit_buffer = data; + rb->bit_buffer_end = data_end; + } + return rb; +} + +//------------------------------------------------------------------------------ + +int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb) { + return vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 && + vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 && + vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2; +} + +void vp9_read_frame_size(struct vpx_read_bit_buffer *rb, + int *width, int *height) { + *width = vpx_rb_read_literal(rb, 16) + 1; + *height = vpx_rb_read_literal(rb, 16) + 1; +} + +BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb) { + int profile = vpx_rb_read_bit(rb); + profile |= vpx_rb_read_bit(rb) << 1; + if (profile > 2) + profile += vpx_rb_read_bit(rb); + return (BITSTREAM_PROFILE) profile; +} + +void vp9_decode_frame(VP9Decoder *pbi, + const uint8_t 
*data, const uint8_t *data_end, + const uint8_t **p_data_end) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + struct vpx_read_bit_buffer rb; + int context_updated = 0; + uint8_t clear_data[MAX_VP9_HEADER_SIZE]; + const size_t first_partition_size = read_uncompressed_header(pbi, + init_read_bit_buffer(pbi, &rb, data, data_end, clear_data)); + const int tile_rows = 1 << cm->log2_tile_rows; + const int tile_cols = 1 << cm->log2_tile_cols; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + xd->cur_buf = new_fb; + + if (!first_partition_size) { + // showing a frame directly + *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2); + return; + } + + data += vpx_rb_bytes_read(&rb); + if (!read_is_valid(data, first_partition_size, data_end)) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt header length"); + + cm->use_prev_frame_mvs = !cm->error_resilient_mode && + cm->width == cm->last_width && + cm->height == cm->last_height && + !cm->last_intra_only && + cm->last_show_frame && + (cm->last_frame_type != KEY_FRAME); + + vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); + + *cm->fc = cm->frame_contexts[cm->frame_context_idx]; + if (!cm->fc->initialized) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + + xd->corrupted = 0; + new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); + if (new_fb->corrupted) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data header is corrupted."); + + if (cm->lf.filter_level && !cm->skip_loop_filter) { + vp9_loop_filter_frame_init(cm, cm->lf.filter_level); + } + + // If encoded in frame parallel mode, frame context is ready after decoding + // the frame header. + if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) { + VPxWorker *const worker = pbi->frame_worker_owner; + FrameWorkerData *const frame_worker_data = worker->data1; + if (cm->refresh_frame_context) { + context_updated = 1; + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; + } + vp9_frameworker_lock_stats(worker); + pbi->cur_buf->row = -1; + pbi->cur_buf->col = -1; + frame_worker_data->frame_context_ready = 1; + // Signal the main thread that context is ready. + vp9_frameworker_signal_stats(worker); + vp9_frameworker_unlock_stats(worker); + } + + if (pbi->tile_worker_data == NULL || + (tile_cols * tile_rows) != pbi->total_tiles) { + const int num_tile_workers = tile_cols * tile_rows + + ((pbi->max_threads > 1) ? pbi->max_threads : 0); + const size_t twd_size = num_tile_workers * sizeof(*pbi->tile_worker_data); + // Ensure tile data offsets will be properly aligned. This may fail on + // platforms without DECLARE_ALIGNED(). + assert((sizeof(*pbi->tile_worker_data) % 16) == 0); + vpx_free(pbi->tile_worker_data); + CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size)); + pbi->total_tiles = tile_rows * tile_cols; + } + + if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) { + // Multi-threaded tile decoder + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); + if (!xd->corrupted) { + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. 
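+        // vp9_loop_filter_frame_mt() reuses the worker pool that just
+        // decoded the tiles to filter rows of superblocks in parallel.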
+ vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, + cm->lf.filter_level, 0, 0, pbi->tile_workers, + pbi->num_tile_workers, &pbi->lf_row_sync); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + } else { + *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); + } + + if (!xd->corrupted) { + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + vp9_adapt_coef_probs(cm); + + if (!frame_is_intra_only(cm)) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + + // Non frame parallel update frame context here. + if (cm->refresh_frame_context && !context_updated) + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; +} diff --git a/thirdparty/libvpx/vp9/decoder/vp9_decodeframe.h b/thirdparty/libvpx/vp9/decoder/vp9_decodeframe.h new file mode 100644 index 000000000000..ce33cbdbd916 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_decodeframe.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_DECODER_VP9_DECODEFRAME_H_ +#define VP9_DECODER_VP9_DECODEFRAME_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp9/common/vp9_enums.h" + +struct VP9Decoder; +struct vpx_read_bit_buffer; + +int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb); +void vp9_read_frame_size(struct vpx_read_bit_buffer *rb, + int *width, int *height); +BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb); + +void vp9_decode_frame(struct VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + const uint8_t **p_data_end); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_DECODER_VP9_DECODEFRAME_H_ diff --git a/thirdparty/libvpx/vp9/decoder/vp9_decodemv.c b/thirdparty/libvpx/vp9/decoder/vp9_decodemv.c new file mode 100644 index 000000000000..ffc6839ad1a6 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_decodemv.c @@ -0,0 +1,911 @@ +/* + Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/decoder/vp9_decodemv.h" +#include "vp9/decoder/vp9_decodeframe.h" + +#include "vpx_dsp/vpx_dsp_common.h" + +static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) { + return (PREDICTION_MODE)vpx_read_tree(r, vp9_intra_mode_tree, p); +} + +static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, MACROBLOCKD *xd, + vpx_reader *r, int size_group) { + const PREDICTION_MODE y_mode = + read_intra_mode(r, cm->fc->y_mode_prob[size_group]); + FRAME_COUNTS *counts = xd->counts; + if (counts) + ++counts->y_mode[size_group][y_mode]; + return y_mode; +} + +static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, MACROBLOCKD *xd, + vpx_reader *r, + PREDICTION_MODE y_mode) { + const PREDICTION_MODE uv_mode = read_intra_mode(r, + cm->fc->uv_mode_prob[y_mode]); + FRAME_COUNTS *counts = xd->counts; + if (counts) + ++counts->uv_mode[y_mode][uv_mode]; + return uv_mode; +} + +static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, MACROBLOCKD *xd, + vpx_reader *r, int ctx) { + const int mode = vpx_read_tree(r, vp9_inter_mode_tree, + cm->fc->inter_mode_probs[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) + ++counts->inter_mode[ctx][mode]; + + return NEARESTMV + mode; +} + +static int read_segment_id(vpx_reader *r, const struct segmentation *seg) { + return vpx_read_tree(r, vp9_segment_tree, seg->tree_probs); +} + +static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, + TX_SIZE max_tx_size, vpx_reader *r) { + FRAME_COUNTS *counts = xd->counts; + const int ctx = get_tx_size_context(xd); + const vpx_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs); + int tx_size = vpx_read(r, tx_probs[0]); + if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { + tx_size += vpx_read(r, tx_probs[1]); + if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) + tx_size += vpx_read(r, tx_probs[2]); + } + + if (counts) + ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size]; + return (TX_SIZE)tx_size; +} + +static INLINE TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, + int allow_select, vpx_reader *r) { + TX_MODE tx_mode = cm->tx_mode; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) + return read_selected_tx_size(cm, xd, max_tx_size, r); + else + return VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]); +} + +static int dec_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, + int mi_offset, int x_mis, int y_mis) { + int x, y, segment_id = INT_MAX; + + for (y = 0; y < y_mis; y++) + for (x = 0; x < x_mis; x++) + segment_id = + VPXMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + return segment_id; +} + +static void set_segment_id(VP9_COMMON *cm, int mi_offset, + int x_mis, int y_mis, int segment_id) { + int x, y; + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + + for (y = 0; y < y_mis; y++) + for (x = 0; x < x_mis; x++) + cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; +} + +static void copy_segment_id(const VP9_COMMON *cm, + const uint8_t *last_segment_ids, + uint8_t
*current_segment_ids, + int mi_offset, int x_mis, int y_mis) { + int x, y; + + for (y = 0; y < y_mis; y++) + for (x = 0; x < x_mis; x++) + current_segment_ids[mi_offset + y * cm->mi_cols + x] = last_segment_ids ? + last_segment_ids[mi_offset + y * cm->mi_cols + x] : 0; +} + +static int read_intra_segment_id(VP9_COMMON *const cm, int mi_offset, + int x_mis, int y_mis, + vpx_reader *r) { + struct segmentation *const seg = &cm->seg; + int segment_id; + + if (!seg->enabled) + return 0; // Default for disabled segmentation + + if (!seg->update_map) { + copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map, + mi_offset, x_mis, y_mis); + return 0; + } + + segment_id = read_segment_id(r, seg); + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; +} + +static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, vpx_reader *r, + int x_mis, int y_mis) { + struct segmentation *const seg = &cm->seg; + MODE_INFO *const mi = xd->mi[0]; + int predicted_segment_id, segment_id; + const int mi_offset = mi_row * cm->mi_cols + mi_col; + + if (!seg->enabled) + return 0; // Default for disabled segmentation + + predicted_segment_id = cm->last_frame_seg_map ? + dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis, y_mis) : + 0; + + if (!seg->update_map) { + copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map, + mi_offset, x_mis, y_mis); + return predicted_segment_id; + } + + if (seg->temporal_update) { + const vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); + mi->seg_id_predicted = vpx_read(r, pred_prob); + segment_id = mi->seg_id_predicted ? predicted_segment_id + : read_segment_id(r, seg); + } else { + segment_id = read_segment_id(r, seg); + } + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; +} + +static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, vpx_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int ctx = vp9_get_skip_context(xd); + const int skip = vpx_read(r, cm->fc->skip_probs[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) + ++counts->skip[ctx][skip]; + return skip; + } +} + +static void read_intra_frame_mode_info(VP9_COMMON *const cm, + MACROBLOCKD *const xd, + int mi_row, int mi_col, vpx_reader *r, + int x_mis, int y_mis) { + MODE_INFO *const mi = xd->mi[0]; + const MODE_INFO *above_mi = xd->above_mi; + const MODE_INFO *left_mi = xd->left_mi; + const BLOCK_SIZE bsize = mi->sb_type; + int i; + const int mi_offset = mi_row * cm->mi_cols + mi_col; + + mi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r); + mi->skip = read_skip(cm, xd, mi->segment_id, r); + mi->tx_size = read_tx_size(cm, xd, 1, r); + mi->ref_frame[0] = INTRA_FRAME; + mi->ref_frame[1] = NONE; + + switch (bsize) { + case BLOCK_4X4: + for (i = 0; i < 4; ++i) + mi->bmi[i].as_mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, i)); + mi->mode = mi->bmi[3].as_mode; + break; + case BLOCK_4X8: + mi->bmi[0].as_mode = mi->bmi[2].as_mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0)); + mi->bmi[1].as_mode = mi->bmi[3].as_mode = mi->mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 1)); + break; + case BLOCK_8X4: + mi->bmi[0].as_mode = mi->bmi[1].as_mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0)); + mi->bmi[2].as_mode = mi->bmi[3].as_mode = mi->mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 
2)); + break; + default: + mi->mode = read_intra_mode(r, + get_y_mode_probs(mi, above_mi, left_mi, 0)); + } + + mi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mi->mode]); +} + +static int read_mv_component(vpx_reader *r, + const nmv_component *mvcomp, int usehp) { + int mag, d, fr, hp; + const int sign = vpx_read(r, mvcomp->sign); + const int mv_class = vpx_read_tree(r, vp9_mv_class_tree, mvcomp->classes); + const int class0 = mv_class == MV_CLASS_0; + + // Integer part + if (class0) { + d = vpx_read_tree(r, vp9_mv_class0_tree, mvcomp->class0); + mag = 0; + } else { + int i; + const int n = mv_class + CLASS0_BITS - 1; // number of bits + + d = 0; + for (i = 0; i < n; ++i) + d |= vpx_read(r, mvcomp->bits[i]) << i; + mag = CLASS0_SIZE << (mv_class + 2); + } + + // Fractional part + fr = vpx_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d] + : mvcomp->fp); + + // High precision part (if hp is not used, the default value of the hp is 1) + hp = usehp ? vpx_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp) + : 1; + + // Result + mag += ((d << 3) | (fr << 1) | hp) + 1; + return sign ? -mag : mag; +} + +static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref, + const nmv_context *ctx, + nmv_context_counts *counts, int allow_hp) { + const MV_JOINT_TYPE joint_type = + (MV_JOINT_TYPE)vpx_read_tree(r, vp9_mv_joint_tree, ctx->joints); + const int use_hp = allow_hp && use_mv_hp(ref); + MV diff = {0, 0}; + + if (mv_joint_vertical(joint_type)) + diff.row = read_mv_component(r, &ctx->comps[0], use_hp); + + if (mv_joint_horizontal(joint_type)) + diff.col = read_mv_component(r, &ctx->comps[1], use_hp); + + vp9_inc_mv(&diff, counts); + + mv->row = ref->row + diff.row; + mv->col = ref->col + diff.col; +} + +static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, + const MACROBLOCKD *xd, + vpx_reader *r) { + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + const int ctx = vp9_get_reference_mode_context(cm, xd); + const REFERENCE_MODE mode = + (REFERENCE_MODE)vpx_read(r, cm->fc->comp_inter_prob[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) + ++counts->comp_inter[ctx][mode]; + return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE + } else { + return cm->reference_mode; + } +} + +// Read the referncence frame +static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, + vpx_reader *r, + int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { + FRAME_CONTEXT *const fc = cm->fc; + FRAME_COUNTS *counts = xd->counts; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); + ref_frame[1] = NONE; + } else { + const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); + // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding + if (mode == COMPOUND_REFERENCE) { + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int bit = vpx_read(r, fc->comp_ref_prob[ctx]); + if (counts) + ++counts->comp_ref[ctx][bit]; + ref_frame[idx] = cm->comp_fixed_ref; + ref_frame[!idx] = cm->comp_var_ref[bit]; + } else if (mode == SINGLE_REFERENCE) { + const int ctx0 = vp9_get_pred_context_single_ref_p1(xd); + const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]); + if (counts) + ++counts->single_ref[ctx0][0][bit0]; + if (bit0) { + const int ctx1 = vp9_get_pred_context_single_ref_p2(xd); + const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]); + if (counts) + 
++counts->single_ref[ctx1][1][bit1]; + ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME; + } else { + ref_frame[0] = LAST_FRAME; + } + + ref_frame[1] = NONE; + } else { + assert(0 && "Invalid prediction mode."); + } + } +} + +// TODO(slavarnway): Move this decoder version of +// vp9_get_pred_context_switchable_interp() to vp9_pred_common.h and update the +// encoder. +// +// Returns a context number for the given MB prediction signal +static int dec_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + const MODE_INFO *const left_mi = xd->left_mi; + const int left_type = left_mi ? left_mi->interp_filter : SWITCHABLE_FILTERS; + const MODE_INFO *const above_mi = xd->above_mi; + const int above_type = above_mi ? above_mi->interp_filter + : SWITCHABLE_FILTERS; + + if (left_type == above_type) + return left_type; + else if (left_type == SWITCHABLE_FILTERS) + return above_type; + else if (above_type == SWITCHABLE_FILTERS) + return left_type; + else + return SWITCHABLE_FILTERS; +} + +static INLINE INTERP_FILTER read_switchable_interp_filter( + VP9_COMMON *const cm, MACROBLOCKD *const xd, + vpx_reader *r) { + const int ctx = dec_get_pred_context_switchable_interp(xd); + const INTERP_FILTER type = + (INTERP_FILTER)vpx_read_tree(r, vp9_switchable_interp_tree, + cm->fc->switchable_interp_prob[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) + ++counts->switchable_interp[ctx][type]; + return type; +} + +static void read_intra_block_mode_info(VP9_COMMON *const cm, + MACROBLOCKD *const xd, MODE_INFO *mi, + vpx_reader *r) { + const BLOCK_SIZE bsize = mi->sb_type; + int i; + + switch (bsize) { + case BLOCK_4X4: + for (i = 0; i < 4; ++i) + mi->bmi[i].as_mode = read_intra_mode_y(cm, xd, r, 0); + mi->mode = mi->bmi[3].as_mode; + break; + case BLOCK_4X8: + mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, xd, + r, 0); + mi->bmi[1].as_mode = mi->bmi[3].as_mode = mi->mode = + read_intra_mode_y(cm, xd, r, 0); + break; + case BLOCK_8X4: + mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, xd, + r, 0); + mi->bmi[2].as_mode = mi->bmi[3].as_mode = mi->mode = + read_intra_mode_y(cm, xd, r, 0); + break; + default: + mi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]); + } + + mi->uv_mode = read_intra_mode_uv(cm, xd, r, mi->mode); + + // Initialize interp_filter here so we do not have to check for inter block + // modes in dec_get_pred_context_switchable_interp() + mi->interp_filter = SWITCHABLE_FILTERS; + + mi->ref_frame[0] = INTRA_FRAME; + mi->ref_frame[1] = NONE; +} + +static INLINE int is_mv_valid(const MV *mv) { + return mv->row > MV_LOW && mv->row < MV_UPP && + mv->col > MV_LOW && mv->col < MV_UPP; +} + +static INLINE void copy_mv_pair(int_mv *dst, const int_mv *src) { + memcpy(dst, src, sizeof(*dst) * 2); +} + +static INLINE void zero_mv_pair(int_mv *dst) { + memset(dst, 0, sizeof(*dst) * 2); +} + +static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd, + PREDICTION_MODE mode, + int_mv mv[2], int_mv ref_mv[2], + int_mv near_nearest_mv[2], + int is_compound, int allow_hp, vpx_reader *r) { + int i; + int ret = 1; + + switch (mode) { + case NEWMV: { + FRAME_COUNTS *counts = xd->counts; + nmv_context_counts *const mv_counts = counts ? 
&counts->mv : NULL; + for (i = 0; i < 1 + is_compound; ++i) { + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts, + allow_hp); + ret = ret && is_mv_valid(&mv[i].as_mv); + } + break; + } + case NEARMV: + case NEARESTMV: { + copy_mv_pair(mv, near_nearest_mv); + break; + } + case ZEROMV: { + zero_mv_pair(mv); + break; + } + default: { + return 0; + } + } + return ret; +} + +static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int segment_id, vpx_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; + } else { + const int ctx = get_intra_inter_context(xd); + const int is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]); + FRAME_COUNTS *counts = xd->counts; + if (counts) + ++counts->intra_inter[ctx][is_inter]; + return is_inter; + } +} + +static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv, + int refmv_count) { + int i; + + // Make sure all the candidates are properly clamped etc + for (i = 0; i < refmv_count; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp); + *best_mv = mvlist[i]; + } +} + +static void fpm_sync(void *const data, int mi_row) { + VP9Decoder *const pbi = (VP9Decoder *)data; + vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame, + mi_row << MI_BLOCK_SIZE_LOG2); +} + +// This macro is used to add a motion vector mv_ref list if it isn't +// already in the list. If it's the second motion vector or early_break +// it will also skip all additional processing and jump to Done! +#define ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done) \ + do { \ + if (refmv_count) { \ + if ((mv).as_int != (mv_ref_list)[0].as_int) { \ + (mv_ref_list)[(refmv_count)] = (mv); \ + refmv_count++; \ + goto Done; \ + } \ + } else { \ + (mv_ref_list)[(refmv_count)++] = (mv); \ + if (early_break) \ + goto Done; \ + } \ + } while (0) + +// If either reference frame is different, not INTRA, and they +// are different from each other scale and add the mv to our list. +#define IF_DIFF_REF_FRAME_ADD_MV_EB(mbmi, ref_frame, ref_sign_bias, \ + refmv_count, mv_ref_list, Done) \ + do { \ + if (is_inter_block(mbmi)) { \ + if ((mbmi)->ref_frame[0] != ref_frame) \ + ADD_MV_REF_LIST_EB(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + if (has_second_ref(mbmi) && \ + (mbmi)->ref_frame[1] != ref_frame && \ + (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ + ADD_MV_REF_LIST_EB(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + } \ + } while (0) + +// This function searches the neighborhood of a given MB/SB +// to try and find candidate reference vectors. +static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, + const POSITION *const mv_ref_search, + int_mv *mv_ref_list, + int mi_row, int mi_col, int block, int is_sub8x8, + find_mv_refs_sync sync, void *const data) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + int different_ref_found = 0; + const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ? + cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL; + const TileInfo *const tile = &xd->tile; + // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop + // searching after the first mv is found. 
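The ADD_MV_REF_LIST_EB and IF_DIFF_REF_FRAME_ADD_MV_EB macros above are easier to follow as a plain function. A rough functional equivalent of the first one (a sketch: the union is trimmed to the one field the macro compares, and the goto is replaced by a return code):

typedef union { int as_int; } int_mv_like;  /* stand-in for libvpx's int_mv */

/* Returns 1 when the caller should stop scanning (the macro's "goto Done"). */
static int add_mv_candidate(int_mv_like mv, int_mv_like *list,
                            int *refmv_count, int early_break) {
  if (*refmv_count) {
    if (mv.as_int != list[0].as_int) {  /* only a distinct second mv counts */
      list[(*refmv_count)++] = mv;
      return 1;  /* two candidates found, the list is full */
    }
    return 0;
  }
  list[(*refmv_count)++] = mv;  /* first candidate */
  return early_break;  /* NEARESTMV/NEWMV need just one candidate */
}

The early_break flag computed on the next line is exactly that last argument: every mode except NEARMV is satisfied by a single candidate.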
+ const int early_break = (mode != NEARMV); + + // Blank the reference vector list + memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + i = 0; + if (is_sub8x8) { + // If the size < 8x8 we get the mv from the bmi substructure for the + // nearest two blocks. + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + different_ref_found = 1; + + if (candidate_mi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST_EB( + get_sub_block_mv(candidate_mi, 0, mv_ref->col, block), + refmv_count, mv_ref_list, Done); + else if (candidate_mi->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST_EB( + get_sub_block_mv(candidate_mi, 1, mv_ref->col, block), + refmv_count, mv_ref_list, Done); + } + } + } + + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. + for (; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + different_ref_found = 1; + + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST_EB(candidate->mv[0], refmv_count, mv_ref_list, Done); + else if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST_EB(candidate->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast + // on windows platform. The sync here is unnecessary if use_prev_frame_mvs + // is 0. But after removing it, there will be hang in the unit test on windows + // due to several threads waiting for a thread's signal. +#if defined(_WIN32) && !HAVE_PTHREAD_H + if (cm->frame_parallel_decode && sync != NULL) { + sync(data, mi_row); + } +#endif + + // Check the last frame's mode and mv info. + if (prev_frame_mvs) { + // Synchronize here for frame parallel decode if sync function is provided. + if (cm->frame_parallel_decode && sync != NULL) { + sync(data, mi_row); + } + + if (prev_frame_mvs->ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); + } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { + ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found) { + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + + // If the candidate is INTRA we don't want to consider its mv. + IF_DIFF_REF_FRAME_ADD_MV_EB(candidate, ref_frame, ref_sign_bias, + refmv_count, mv_ref_list, Done); + } + } + } + + // Since we still don't have a candidate we'll try the last frame. 
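The last-frame fallback that follows re-uses vectors that were stored against a different reference. When the candidate's reference and the wanted reference lie on opposite temporal sides of the current frame (their sign biases differ), the vector is negated before use, which is the mv.as_mv.row *= -1 step below. Isolated, the rule is just this (sketch):

typedef struct { short row, col; } MV_like;  /* stand-in for libvpx's MV */

/* Mirror a candidate vector when its reference points the opposite way
 * in time from the reference we are predicting against. */
static MV_like apply_sign_bias(MV_like mv, int cand_bias, int ref_bias) {
  if (cand_bias != ref_bias) {
    mv.row = (short)-mv.row;
    mv.col = (short)-mv.col;
  }
  return mv;
}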
+ if (prev_frame_mvs) { + if (prev_frame_mvs->ref_frame[0] != ref_frame && + prev_frame_mvs->ref_frame[0] > INTRA_FRAME) { + int_mv mv = prev_frame_mvs->mv[0]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done); + } + + if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME && + prev_frame_mvs->ref_frame[1] != ref_frame && + prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) { + int_mv mv = prev_frame_mvs->mv[1]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done); + } + } + + if (mode == NEARMV) + refmv_count = MAX_MV_REF_CANDIDATES; + else + // we only care about the nearestmv for the remaining modes + refmv_count = 1; + + Done: + // Clamp vectors + for (i = 0; i < refmv_count; ++i) + clamp_mv_ref(&mv_ref_list[i].as_mv, xd); + + return refmv_count; +} + +static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const POSITION *const mv_ref_search, + PREDICTION_MODE b_mode, int block, + int ref, int mi_row, int mi_col, + int_mv *best_sub8x8) { + int_mv mv_list[MAX_MV_REF_CANDIDATES]; + MODE_INFO *const mi = xd->mi[0]; + b_mode_info *bmi = mi->bmi; + int n; + int refmv_count; + + assert(MAX_MV_REF_CANDIDATES == 2); + + refmv_count = dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], + mv_ref_search, mv_list, mi_row, mi_col, block, + 1, NULL, NULL); + + switch (block) { + case 0: + best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; + break; + case 1: + case 2: + if (b_mode == NEARESTMV) { + best_sub8x8->as_int = bmi[0].as_mv[ref].as_int; + } else { + best_sub8x8->as_int = 0; + for (n = 0; n < refmv_count; ++n) + if (bmi[0].as_mv[ref].as_int != mv_list[n].as_int) { + best_sub8x8->as_int = mv_list[n].as_int; + break; + } + } + break; + case 3: + if (b_mode == NEARESTMV) { + best_sub8x8->as_int = bmi[2].as_mv[ref].as_int; + } else { + int_mv candidates[2 + MAX_MV_REF_CANDIDATES]; + candidates[0] = bmi[1].as_mv[ref]; + candidates[1] = bmi[0].as_mv[ref]; + candidates[2] = mv_list[0]; + candidates[3] = mv_list[1]; + best_sub8x8->as_int = 0; + for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) + if (bmi[2].as_mv[ref].as_int != candidates[n].as_int) { + best_sub8x8->as_int = candidates[n].as_int; + break; + } + } + break; + default: + assert(0 && "Invalid block index."); + } +} + +static uint8_t get_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const POSITION *const mv_ref_search, + int mi_row, int mi_col) { + int i; + int context_counter = 0; + const TileInfo *const tile = &xd->tile; + + // Get mode count from nearest 2 blocks + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + // Keep counts for entropy encoding. 
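For the summing line that follows: get_mode_context() folds the two nearest neighbours into one entropy context through two small tables, a per-mode counter value and a sum-to-context map. A miniature of that shape (the table contents here are made up; the real mode_2_counter and counter_to_context tables live with the mv-reference helpers in vp9_mvref_common):

/* Hypothetical two-table reduction, mirroring the structure only. */
static const int mode_weight[4] = { 0, 1, 1, 3 };  /* per-mode counter */
static const int sum_to_ctx[7] = { 0, 1, 1, 2, 2, 2, 2 };

static int mode_context(int mode_above, int mode_left) {
  return sum_to_ctx[mode_weight[mode_above] + mode_weight[mode_left]];
}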
+ context_counter += mode_2_counter[candidate->mode]; + } + } + + return counter_to_context[context_counter]; +} + +static void read_inter_block_mode_info(VP9Decoder *const pbi, + MACROBLOCKD *const xd, + MODE_INFO *const mi, + int mi_row, int mi_col, vpx_reader *r) { + VP9_COMMON *const cm = &pbi->common; + const BLOCK_SIZE bsize = mi->sb_type; + const int allow_hp = cm->allow_high_precision_mv; + int_mv best_ref_mvs[2]; + int ref, is_compound; + uint8_t inter_mode_ctx; + const POSITION *const mv_ref_search = mv_ref_blocks[bsize]; + + read_ref_frames(cm, xd, r, mi->segment_id, mi->ref_frame); + is_compound = has_second_ref(mi); + inter_mode_ctx = get_mode_context(cm, xd, mv_ref_search, mi_row, mi_col); + + if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) { + mi->mode = ZEROMV; + if (bsize < BLOCK_8X8) { + vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid usage of segement feature on small blocks"); + return; + } + } else { + if (bsize >= BLOCK_8X8) + mi->mode = read_inter_mode(cm, xd, r, inter_mode_ctx); + else + // Sub 8x8 blocks use the nearestmv as a ref_mv if the b_mode is NEWMV. + // Setting mode to NEARESTMV forces the search to stop after the nearestmv + // has been found. After b_modes have been read, mode will be overwritten + // by the last b_mode. + mi->mode = NEARESTMV; + + if (mi->mode != ZEROMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + int_mv tmp_mvs[MAX_MV_REF_CANDIDATES]; + const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; + int refmv_count; + + refmv_count = dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, + tmp_mvs, mi_row, mi_col, -1, 0, + fpm_sync, (void *)pbi); + + dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref], + refmv_count); + } + } + } + + mi->interp_filter = (cm->interp_filter == SWITCHABLE) + ? 
read_switchable_interp_filter(cm, xd, r) + : cm->interp_filter; + + if (bsize < BLOCK_8X8) { + const int num_4x4_w = 1 << xd->bmode_blocks_wl; + const int num_4x4_h = 1 << xd->bmode_blocks_hl; + int idx, idy; + PREDICTION_MODE b_mode; + int_mv best_sub8x8[2]; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx); + + if (b_mode == NEARESTMV || b_mode == NEARMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) + append_sub8x8_mvs_for_idx(cm, xd, mv_ref_search, b_mode, j, ref, + mi_row, mi_col, &best_sub8x8[ref]); + } + + if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs, + best_sub8x8, is_compound, allow_hp, r)) { + xd->corrupted |= 1; + break; + } + + if (num_4x4_h == 2) + mi->bmi[j + 2] = mi->bmi[j]; + if (num_4x4_w == 2) + mi->bmi[j + 1] = mi->bmi[j]; + } + } + + mi->mode = b_mode; + + copy_mv_pair(mi->mv, mi->bmi[3].as_mv); + } else { + xd->corrupted |= !assign_mv(cm, xd, mi->mode, mi->mv, best_ref_mvs, + best_ref_mvs, is_compound, allow_hp, r); + } +} + +static void read_inter_frame_mode_info(VP9Decoder *const pbi, + MACROBLOCKD *const xd, + int mi_row, int mi_col, vpx_reader *r, + int x_mis, int y_mis) { + VP9_COMMON *const cm = &pbi->common; + MODE_INFO *const mi = xd->mi[0]; + int inter_block; + + mi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r, x_mis, + y_mis); + mi->skip = read_skip(cm, xd, mi->segment_id, r); + inter_block = read_is_inter_block(cm, xd, mi->segment_id, r); + mi->tx_size = read_tx_size(cm, xd, !mi->skip || !inter_block, r); + + if (inter_block) + read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r); + else + read_intra_block_mode_info(cm, xd, mi, r); +} + +static INLINE void copy_ref_frame_pair(MV_REFERENCE_FRAME *dst, + const MV_REFERENCE_FRAME *src) { + memcpy(dst, src, sizeof(*dst) * 2); +} + +void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, + int mi_row, int mi_col, vpx_reader *r, + int x_mis, int y_mis) { + VP9_COMMON *const cm = &pbi->common; + MODE_INFO *const mi = xd->mi[0]; + MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col; + int w, h; + + if (frame_is_intra_only(cm)) { + read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r, x_mis, y_mis); + } else { + read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis); + + for (h = 0; h < y_mis; ++h) { + for (w = 0; w < x_mis; ++w) { + MV_REF *const mv = frame_mvs + w; + copy_ref_frame_pair(mv->ref_frame, mi->ref_frame); + copy_mv_pair(mv->mv, mi->mv); + } + frame_mvs += cm->mi_cols; + } + } +#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && + (xd->above_mi == NULL || xd->left_mi == NULL) && + !is_inter_block(mi) && need_top_left[mi->uv_mode]) + assert(0); +#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH +} diff --git a/thirdparty/libvpx/vp9/decoder/vp9_decodemv.h b/thirdparty/libvpx/vp9/decoder/vp9_decodemv.h new file mode 100644 index 000000000000..45569ec81f53 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_decodemv.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_DECODER_VP9_DECODEMV_H_
+#define VP9_DECODER_VP9_DECODEMV_H_
+
+#include "vpx_dsp/bitreader.h"
+
+#include "vp9/decoder/vp9_decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
+                        int x_mis, int y_mis);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_DECODER_VP9_DECODEMV_H_
diff --git a/thirdparty/libvpx/vp9/decoder/vp9_decoder.c b/thirdparty/libvpx/vp9/decoder/vp9_decoder.c
new file mode 100644
index 000000000000..935c04f3aa53
--- /dev/null
+++ b/thirdparty/libvpx/vp9/decoder/vp9_decoder.c
@@ -0,0 +1,518 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_thread.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
+
+#include "vp9/decoder/vp9_decodeframe.h"
+#include "vp9/decoder/vp9_decoder.h"
+#include "vp9/decoder/vp9_detokenize.h"
+
+static void initialize_dec(void) {
+  static volatile int init_done = 0;
+
+  if (!init_done) {
+    vp9_rtcd();
+    vpx_dsp_rtcd();
+    vpx_scale_rtcd();
+    vp9_init_intra_predictors();
+    init_done = 1;
+  }
+}
+
+static void vp9_dec_setup_mi(VP9_COMMON *cm) {
+  cm->mi = cm->mip + cm->mi_stride + 1;
+  cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+  memset(cm->mi_grid_base, 0,
+         cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) {
+  cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+  if (!cm->mip)
+    return 1;
+  cm->mi_alloc_size = mi_size;
+  cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+  if (!cm->mi_grid_base)
+    return 1;
+  return 0;
+}
+
+static void vp9_dec_free_mi(VP9_COMMON *cm) {
+  vpx_free(cm->mip);
+  cm->mip = NULL;
+  vpx_free(cm->mi_grid_base);
+  cm->mi_grid_base = NULL;
+}
+
+VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
+  VP9Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi));
+  VP9_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
+
+  if (!cm)
+    return NULL;
+
+  vp9_zero(*pbi);
+
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    vp9_decoder_remove(pbi);
+    return NULL;
+  }
+
+  cm->error.setjmp = 1;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+                                              sizeof(*cm->frame_contexts)));
+
+  pbi->need_resync = 1;
+  once(initialize_dec);
+
+  // Initialize the references to not point to any frame buffers.
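vp9_decoder_create() above relies on libvpx's setjmp/longjmp error convention: once cm->error.setjmp is armed, a failing CHECK_MEM_ERROR longjmps back to the setjmp site, so the half-built decoder is torn down in one place instead of leaking. The skeleton of that pattern, stripped of decoder specifics (a sketch; the allocation step is hypothetical):

#include <setjmp.h>
#include <stdlib.h>

static jmp_buf error_jmp;

static void *create_with_rollback(size_t sz) {
  void *volatile obj = malloc(sz);  /* volatile: read again after longjmp */
  if (obj == NULL)
    return NULL;
  if (setjmp(error_jmp)) {  /* re-entered via longjmp on any later failure */
    free(obj);
    return NULL;
  }
  /* ... further setup; failures call longjmp(error_jmp, 1) ... */
  return obj;
}

The memset calls that follow then implement the comment above: filling the maps with -1 bytes marks every reference slot as pointing at no frame buffer.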
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map)); + + cm->current_video_frame = 0; + pbi->ready_for_new_data = 1; + pbi->common.buffer_pool = pool; + + cm->bit_depth = VPX_BITS_8; + cm->dequant_bit_depth = VPX_BITS_8; + + cm->alloc_mi = vp9_dec_alloc_mi; + cm->free_mi = vp9_dec_free_mi; + cm->setup_mi = vp9_dec_setup_mi; + + vp9_loop_filter_init(cm); + + cm->error.setjmp = 0; + + vpx_get_worker_interface()->init(&pbi->lf_worker); + + return pbi; +} + +void vp9_decoder_remove(VP9Decoder *pbi) { + int i; + + if (!pbi) + return; + + vpx_get_worker_interface()->end(&pbi->lf_worker); + vpx_free(pbi->lf_worker.data1); + + for (i = 0; i < pbi->num_tile_workers; ++i) { + VPxWorker *const worker = &pbi->tile_workers[i]; + vpx_get_worker_interface()->end(worker); + } + + vpx_free(pbi->tile_worker_data); + vpx_free(pbi->tile_workers); + + if (pbi->num_tile_workers > 0) { + vp9_loop_filter_dealloc(&pbi->lf_row_sync); + } + + vpx_free(pbi); +} + +static int equal_dimensions(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width; +} + +vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + VP9_COMMON *cm = &pbi->common; + + /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the + * encoder is using the frame buffers for. This is just a stub to keep the + * vpxenc --test-decode functionality working, and will be replaced in a + * later commit that adds VP9-specific controls for this functionality. + */ + if (ref_frame_flag == VP9_LAST_FLAG) { + const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, 0); + if (cfg == NULL) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "No 'last' reference frame"); + return VPX_CODEC_ERROR; + } + if (!equal_dimensions(cfg, sd)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + vp8_yv12_copy_frame(cfg, sd); + } else { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Invalid reference frame"); + } + + return cm->error.error_code; +} + + +vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + RefBuffer *ref_buf = NULL; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + + // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the + // encoder is using the frame buffers for. This is just a stub to keep the + // vpxenc --test-decode functionality working, and will be replaced in a + // later commit that adds VP9-specific controls for this functionality. + if (ref_frame_flag == VP9_LAST_FLAG) { + ref_buf = &cm->frame_refs[0]; + } else if (ref_frame_flag == VP9_GOLD_FLAG) { + ref_buf = &cm->frame_refs[1]; + } else if (ref_frame_flag == VP9_ALT_FLAG) { + ref_buf = &cm->frame_refs[2]; + } else { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Invalid reference frame"); + return cm->error.error_code; + } + + if (!equal_dimensions(ref_buf->buf, sd)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + int *ref_fb_ptr = &ref_buf->idx; + + // Find an empty frame buffer. 
+ const int free_fb = get_free_fb(cm); + if (cm->new_fb_idx == INVALID_IDX) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + return cm->error.error_code; + } + + // Decrease ref_count since it will be increased again in + // ref_cnt_fb() below. + --frame_bufs[free_fb].ref_count; + + // Manage the reference counters and copy image. + ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb); + ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf; + vp8_yv12_copy_frame(sd, ref_buf->buf); + } + + return cm->error.error_code; +} + +/* If any buffer updating is signaled it should be done here. */ +static void swap_frame_buffers(VP9Decoder *pbi) { + int ref_index = 0, mask; + VP9_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + + lock_buffer_pool(pool); + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); + + // Release the reference frame in reference map. + if (mask & 1) { + decrease_ref_count(old_idx, frame_bufs, pool); + } + cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; + ++ref_index; + } + + // Current thread releases the holding of reference frame. + for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; + } + unlock_buffer_pool(pool); + pbi->hold_ref_buf = 0; + cm->frame_to_show = get_frame_new_buffer(cm); + + if (!pbi->frame_parallel_decode || !cm->show_frame) { + lock_buffer_pool(pool); + --frame_bufs[cm->new_fb_idx].ref_count; + unlock_buffer_pool(pool); + } + + // Invalidate these references until the next frame starts. + for (ref_index = 0; ref_index < 3; ref_index++) + cm->frame_refs[ref_index].idx = -1; +} + +int vp9_receive_compressed_data(VP9Decoder *pbi, + size_t size, const uint8_t **psource) { + VP9_COMMON *volatile const cm = &pbi->common; + BufferPool *volatile const pool = cm->buffer_pool; + RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs; + const uint8_t *source = *psource; + int retcode = 0; + cm->error.error_code = VPX_CODEC_OK; + + if (size == 0) { + // This is used to signal that we are missing frames. + // We do not know if the missing frame(s) was supposed to update + // any of the reference buffers, but we act conservative and + // mark only the last buffer as corrupted. + // + // TODO(jkoleszar): Error concealment is undefined and non-normative + // at this point, but if it becomes so, [0] may not always be the correct + // thing to do here. + if (cm->frame_refs[0].idx > 0) { + assert(cm->frame_refs[0].buf != NULL); + cm->frame_refs[0].buf->corrupted = 1; + } + } + + pbi->ready_for_new_data = 0; + + // Check if the previous frame was a frame without any references to it. + // Release frame buffer if not decoding in frame parallel mode. + if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 + && frame_bufs[cm->new_fb_idx].ref_count == 0) + pool->release_fb_cb(pool->cb_priv, + &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + // Find a free frame buffer. Return error if can not find any. 
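The lookup that get_free_fb() performs on the next line can be modelled as a linear scan of the pool's reference counts: a slot is free once nobody holds it. A minimal model (a sketch; POOL_SIZE is an assumed pool size, and INVALID_IDX mirrors the failure value checked below):

#define POOL_SIZE 12
#define INVALID_IDX (-1)

typedef struct { int ref_count; } RefCntBufferLike;

static int find_free_fb(RefCntBufferLike *bufs) {
  int i;
  for (i = 0; i < POOL_SIZE; ++i) {
    if (bufs[i].ref_count == 0) {
      bufs[i].ref_count = 1;  /* the caller now holds this buffer */
      return i;
    }
  }
  return INVALID_IDX;  /* exhausted: caller raises VPX_CODEC_MEM_ERROR */
}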
+ cm->new_fb_idx = get_free_fb(cm); + if (cm->new_fb_idx == INVALID_IDX) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + return cm->error.error_code; + } + + // Assign a MV array to the frame buffer. + cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; + + pbi->hold_ref_buf = 0; + if (pbi->frame_parallel_decode) { + VPxWorker *const worker = pbi->frame_worker_owner; + vp9_frameworker_lock_stats(worker); + frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; + // Reset decoding progress. + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; + pbi->cur_buf->row = -1; + pbi->cur_buf->col = -1; + vp9_frameworker_unlock_stats(worker); + } else { + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; + } + + + if (setjmp(cm->error.jmp)) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + int i; + + cm->error.setjmp = 0; + pbi->ready_for_new_data = 1; + + // Synchronize all threads immediately as a subsequent decode call may + // cause a resize invalidating some allocations. + winterface->sync(&pbi->lf_worker); + for (i = 0; i < pbi->num_tile_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } + + lock_buffer_pool(pool); + // Release all the reference buffers if worker thread is holding them. + if (pbi->hold_ref_buf == 1) { + int ref_index = 0, mask; + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); + + // Release the reference frame in reference map. + if (mask & 1) { + decrease_ref_count(old_idx, frame_bufs, pool); + } + ++ref_index; + } + + // Current thread releases the holding of reference frame. + for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + } + pbi->hold_ref_buf = 0; + } + // Release current frame. + decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); + unlock_buffer_pool(pool); + + vpx_clear_system_state(); + return -1; + } + + cm->error.setjmp = 1; + vp9_decode_frame(pbi, source, source + size, psource); + + swap_frame_buffers(pbi); + + vpx_clear_system_state(); + + if (!cm->show_existing_frame) { + cm->last_show_frame = cm->show_frame; + cm->prev_frame = cm->cur_frame; + if (cm->seg.enabled && !pbi->frame_parallel_decode) + vp9_swap_current_and_last_seg_map(cm); + } + + // Update progress in frame parallel decode. + if (pbi->frame_parallel_decode) { + // Need to lock the mutex here as another thread may + // be accessing this buffer. + VPxWorker *const worker = pbi->frame_worker_owner; + FrameWorkerData *const frame_worker_data = worker->data1; + vp9_frameworker_lock_stats(worker); + + if (cm->show_frame) { + cm->current_video_frame++; + } + frame_worker_data->frame_decoded = 1; + frame_worker_data->frame_context_ready = 1; + vp9_frameworker_signal_stats(worker); + vp9_frameworker_unlock_stats(worker); + } else { + cm->last_width = cm->width; + cm->last_height = cm->height; + if (cm->show_frame) { + cm->current_video_frame++; + } + } + + cm->error.setjmp = 0; + return retcode; +} + +int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, + vp9_ppflags_t *flags) { + VP9_COMMON *const cm = &pbi->common; + int ret = -1; +#if !CONFIG_VP9_POSTPROC + (void)*flags; +#endif + + if (pbi->ready_for_new_data == 1) + return ret; + + pbi->ready_for_new_data = 1; + + /* no raw frame to show!!! 
*/ + if (!cm->show_frame) + return ret; + + pbi->ready_for_new_data = 1; + +#if CONFIG_VP9_POSTPROC + if (!cm->show_existing_frame) { + ret = vp9_post_proc_frame(cm, sd, flags); + } else { + *sd = *cm->frame_to_show; + ret = 0; + } +#else + *sd = *cm->frame_to_show; + ret = 0; +#endif /*!CONFIG_POSTPROC*/ + vpx_clear_system_state(); + return ret; +} + +vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data, + size_t data_sz, + uint32_t sizes[8], int *count, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + // A chunk ending with a byte matching 0xc0 is an invalid chunk unless + // it is a super frame index. If the last byte of real video compression + // data is 0xc0 the encoder must add a 0 byte. If we have the marker but + // not the associated matching marker byte at the front of the index we have + // an invalid bitstream and need to return an error. + + uint8_t marker; + + assert(data_sz); + marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1); + *count = 0; + + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) + return VPX_CODEC_CORRUPT_FRAME; + + { + const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state, + data + data_sz - index_sz); + + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) + return VPX_CODEC_CORRUPT_FRAME; + } + + { + // Found a valid superframe index. + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + + // Frames has a maximum of 8 and mag has a maximum of 4. + uint8_t clear_buffer[32]; + assert(sizeof(clear_buffer) >= frames * mag); + if (decrypt_cb) { + decrypt_cb(decrypt_state, x, clear_buffer, frames * mag); + x = clear_buffer; + } + + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) + this_sz |= ((uint32_t)(*x++)) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; +} diff --git a/thirdparty/libvpx/vp9/decoder/vp9_decoder.h b/thirdparty/libvpx/vp9/decoder/vp9_decoder.h new file mode 100644 index 000000000000..7111a36d3702 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_decoder.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_DECODER_VP9_DECODER_H_ +#define VP9_DECODER_VP9_DECODER_H_ + +#include "./vpx_config.h" + +#include "vpx/vpx_codec.h" +#include "vpx_dsp/bitreader.h" +#include "vpx_scale/yv12config.h" +#include "vpx_util/vpx_thread.h" + +#include "vp9/common/vp9_thread_common.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_ppflags.h" +#include "vp9/decoder/vp9_dthread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct TileBuffer { + const uint8_t *data; + size_t size; + int col; // only used with multi-threaded decoding +} TileBuffer; + +typedef struct TileWorkerData { + const uint8_t *data_end; + int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive + vpx_reader bit_reader; + FRAME_COUNTS counts; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); + /* dqcoeff are shared by all the planes. So planes must be decoded serially */ + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + struct vpx_internal_error_info error_info; +} TileWorkerData; + +typedef struct VP9Decoder { + DECLARE_ALIGNED(16, MACROBLOCKD, mb); + + DECLARE_ALIGNED(16, VP9_COMMON, common); + + int ready_for_new_data; + + int refresh_frame_flags; + + int frame_parallel_decode; // frame-based threading. + + // TODO(hkuang): Combine this with cur_buf in macroblockd as they are + // the same. + RefCntBuffer *cur_buf; // Current decoding frame buffer. + + VPxWorker *frame_worker_owner; // frame_worker that owns this pbi. + VPxWorker lf_worker; + VPxWorker *tile_workers; + TileWorkerData *tile_worker_data; + TileBuffer tile_buffers[64]; + int num_tile_workers; + int total_tiles; + + VP9LfSync lf_row_sync; + + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; + + int max_threads; + int inv_tile_order; + int need_resync; // wait for key/intra-only frame. + int hold_ref_buf; // hold the reference buffer. +} VP9Decoder; + +int vp9_receive_compressed_data(struct VP9Decoder *pbi, + size_t size, const uint8_t **dest); + +int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, + vp9_ppflags_t *flags); + +vpx_codec_err_t vp9_copy_reference_dec(struct VP9Decoder *pbi, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb, + void *decrypt_state, + const uint8_t *data) { + if (decrypt_cb) { + uint8_t marker; + decrypt_cb(decrypt_state, data, &marker, 1); + return marker; + } + return *data; +} + +// This function is exposed for use in tests, as well as the inlined function +// "read_marker". +vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data, + size_t data_sz, + uint32_t sizes[8], int *count, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state); + +struct VP9Decoder *vp9_decoder_create(BufferPool *const pool); + +void vp9_decoder_remove(struct VP9Decoder *pbi); + +static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, + BufferPool *const pool) { + if (idx >= 0 && frame_bufs[idx].ref_count > 0) { + --frame_bufs[idx].ref_count; + // A worker may only get a free framebuffer index when calling get_free_fb. + // But the private buffer is not set up until finish decoding header. + // So any error happens during decoding header, the frame_bufs will not + // have valid priv buffer. 
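Before the reference-count check below, a brief aside on vp9_parse_superframe_index() from the end of vp9_decoder.c above: the index is a trailer appended after the last sub-frame, bracketed by two identical marker bytes. A worked byte layout with made-up sizes (three sub-frames, two bytes per size):

#include <stdint.h>

/* marker   = 0xc0 | ((mag - 1) << 3) | (frames - 1)
 *          = 0xc0 | (1 << 3)         | 2             = 0xca
 * index_sz = 2 + mag * frames = 2 + 2 * 3 = 8 bytes */
static const uint8_t superframe_trailer[8] = {
  0xca,        /* marker, expected again at data[data_sz - index_sz] */
  0x23, 0x01,  /* sizes[0] = 0x0123, little-endian */
  0x56, 0x04,  /* sizes[1] = 0x0456 */
  0x89, 0x07,  /* sizes[2] = 0x0789 */
  0xca,        /* marker repeated as the very last byte of the chunk */
};

decrease_ref_count() below is the release half of the pool refcounting: a buffer's raw_frame_buffer goes back to the pool only once its count reaches zero and its private buffer was actually set up.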
+ if (frame_bufs[idx].ref_count == 0 && + frame_bufs[idx].raw_frame_buffer.priv) { + pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); + } + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_DECODER_VP9_DECODER_H_ diff --git a/thirdparty/libvpx/vp9/decoder/vp9_detokenize.c b/thirdparty/libvpx/vp9/decoder/vp9_detokenize.c new file mode 100644 index 000000000000..47dc107fe217 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_detokenize.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#if CONFIG_COEFFICIENT_RANGE_CHECKING +#include "vp9/common/vp9_idct.h" +#endif + +#include "vp9/decoder/vp9_detokenize.h" + +#define EOB_CONTEXT_NODE 0 +#define ZERO_CONTEXT_NODE 1 +#define ONE_CONTEXT_NODE 2 + +#define INCREMENT_COUNT(token) \ + do { \ + if (counts) \ + ++coef_counts[band][ctx][token]; \ + } while (0) + +static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) { + int i, val = 0; + for (i = 0; i < n; ++i) + val = (val << 1) | vpx_read(r, probs[i]); + return val; +} + +static int decode_coefs(const MACROBLOCKD *xd, + PLANE_TYPE type, + tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq, + int ctx, const int16_t *scan, const int16_t *nb, + vpx_reader *r) { + FRAME_COUNTS *counts = xd->counts; + const int max_eob = 16 << (tx_size << 1); + const FRAME_CONTEXT *const fc = xd->fc; + const int ref = is_inter_block(xd->mi[0]); + int band, c = 0; + const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = + fc->coef_probs[tx_size][type][ref]; + const vpx_prob *prob; + unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1]; + unsigned int (*eob_branch_count)[COEFF_CONTEXTS]; + uint8_t token_cache[32 * 32]; + const uint8_t *band_translate = get_band_translate(tx_size); + const int dq_shift = (tx_size == TX_32X32); + int v, token; + int16_t dqv = dq[0]; + const uint8_t *const cat6_prob = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12 : + (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 : +#endif // CONFIG_VP9_HIGHBITDEPTH + vp9_cat6_prob; + const int cat6_bits = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->bd == VPX_BITS_12) ? 18 : + (xd->bd == VPX_BITS_10) ? 
16 : +#endif // CONFIG_VP9_HIGHBITDEPTH + 14; + + if (counts) { + coef_counts = counts->coef[tx_size][type][ref]; + eob_branch_count = counts->eob_branch[tx_size][type][ref]; + } + + while (c < max_eob) { + int val = -1; + band = *band_translate++; + prob = coef_probs[band][ctx]; + if (counts) + ++eob_branch_count[band][ctx]; + if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) { + INCREMENT_COUNT(EOB_MODEL_TOKEN); + break; + } + + while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) { + INCREMENT_COUNT(ZERO_TOKEN); + dqv = dq[1]; + token_cache[scan[c]] = 0; + ++c; + if (c >= max_eob) + return c; // zero tokens at the end (no eob token) + ctx = get_coef_context(nb, token_cache, c); + band = *band_translate++; + prob = coef_probs[band][ctx]; + } + + if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) { + INCREMENT_COUNT(ONE_TOKEN); + token = ONE_TOKEN; + val = 1; + } else { + INCREMENT_COUNT(TWO_TOKEN); + token = vpx_read_tree(r, vp9_coef_con_tree, + vp9_pareto8_full[prob[PIVOT_NODE] - 1]); + switch (token) { + case TWO_TOKEN: + case THREE_TOKEN: + case FOUR_TOKEN: + val = token; + break; + case CATEGORY1_TOKEN: + val = CAT1_MIN_VAL + read_coeff(vp9_cat1_prob, 1, r); + break; + case CATEGORY2_TOKEN: + val = CAT2_MIN_VAL + read_coeff(vp9_cat2_prob, 2, r); + break; + case CATEGORY3_TOKEN: + val = CAT3_MIN_VAL + read_coeff(vp9_cat3_prob, 3, r); + break; + case CATEGORY4_TOKEN: + val = CAT4_MIN_VAL + read_coeff(vp9_cat4_prob, 4, r); + break; + case CATEGORY5_TOKEN: + val = CAT5_MIN_VAL + read_coeff(vp9_cat5_prob, 5, r); + break; + case CATEGORY6_TOKEN: + val = CAT6_MIN_VAL + read_coeff(cat6_prob, cat6_bits, r); + break; + } + } + v = (val * dqv) >> dq_shift; +#if CONFIG_COEFFICIENT_RANGE_CHECKING +#if CONFIG_VP9_HIGHBITDEPTH + dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v), + xd->bd); +#else + dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v); +#endif // CONFIG_VP9_HIGHBITDEPTH +#else + dqcoeff[scan[c]] = vpx_read_bit(r) ? 
-v : v; +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + token_cache[scan[c]] = vp9_pt_energy_class[token]; + ++c; + ctx = get_coef_context(nb, token_cache, c); + dqv = dq[1]; + } + + return c; +} + +static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l, + int x, int y, unsigned int tx_size_in_blocks) { + if (xd->max_blocks_wide) { + if (tx_size_in_blocks + x > xd->max_blocks_wide) + *ctx_shift_a = (tx_size_in_blocks - (xd->max_blocks_wide - x)) * 8; + } + if (xd->max_blocks_high) { + if (tx_size_in_blocks + y > xd->max_blocks_high) + *ctx_shift_l = (tx_size_in_blocks - (xd->max_blocks_high - y)) * 8; + } +} + +int vp9_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc, + int x, int y, TX_SIZE tx_size, vpx_reader *r, + int seg_id) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int16_t *const dequant = pd->seg_dequant[seg_id]; + int eob; + ENTROPY_CONTEXT *a = pd->above_context + x; + ENTROPY_CONTEXT *l = pd->left_context + y; + int ctx; + int ctx_shift_a = 0; + int ctx_shift_l = 0; + + switch (tx_size) { + case TX_4X4: + ctx = a[0] != 0; + ctx += l[0] != 0; + eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size, + dequant, ctx, sc->scan, sc->neighbors, r); + a[0] = l[0] = (eob > 0); + break; + case TX_8X8: + get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_8X8); + ctx = !!*(const uint16_t *)a; + ctx += !!*(const uint16_t *)l; + eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size, + dequant, ctx, sc->scan, sc->neighbors, r); + *(uint16_t *)a = ((eob > 0) * 0x0101) >> ctx_shift_a; + *(uint16_t *)l = ((eob > 0) * 0x0101) >> ctx_shift_l; + break; + case TX_16X16: + get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_16X16); + ctx = !!*(const uint32_t *)a; + ctx += !!*(const uint32_t *)l; + eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size, + dequant, ctx, sc->scan, sc->neighbors, r); + *(uint32_t *)a = ((eob > 0) * 0x01010101) >> ctx_shift_a; + *(uint32_t *)l = ((eob > 0) * 0x01010101) >> ctx_shift_l; + break; + case TX_32X32: + get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_32X32); + // NOTE: casting to uint64_t here is safe because the default memory + // alignment is at least 8 bytes and the TX_32X32 is aligned on 8 byte + // boundaries. + ctx = !!*(const uint64_t *)a; + ctx += !!*(const uint64_t *)l; + eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size, + dequant, ctx, sc->scan, sc->neighbors, r); + *(uint64_t *)a = ((eob > 0) * 0x0101010101010101ULL) >> ctx_shift_a; + *(uint64_t *)l = ((eob > 0) * 0x0101010101010101ULL) >> ctx_shift_l; + break; + default: + assert(0 && "Invalid transform size."); + eob = 0; + break; + } + + return eob; +} diff --git a/thirdparty/libvpx/vp9/decoder/vp9_detokenize.h b/thirdparty/libvpx/vp9/decoder/vp9_detokenize.h new file mode 100644 index 000000000000..d242d4466e59 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_detokenize.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP9_DECODER_VP9_DETOKENIZE_H_ +#define VP9_DECODER_VP9_DETOKENIZE_H_ + +#include "vpx_dsp/bitreader.h" +#include "vp9/decoder/vp9_decoder.h" +#include "vp9/common/vp9_scan.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int vp9_decode_block_tokens(MACROBLOCKD *xd, + int plane, const scan_order *sc, + int x, int y, + TX_SIZE tx_size, vpx_reader *r, + int seg_id); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/thirdparty/libvpx/vp9/decoder/vp9_dsubexp.c b/thirdparty/libvpx/vp9/decoder/vp9_dsubexp.c new file mode 100644 index 000000000000..05b38538aef2 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_dsubexp.c @@ -0,0 +1,76 @@ +/* + Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/common/vp9_entropy.h" + +#include "vp9/decoder/vp9_dsubexp.h" + +static int inv_recenter_nonneg(int v, int m) { + if (v > 2 * m) + return v; + + return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1); +} + +static int decode_uniform(vpx_reader *r) { + const int l = 8; + const int m = (1 << l) - 191; + const int v = vpx_read_literal(r, l - 1); + return v < m ? v : (v << 1) - m + vpx_read_bit(r); +} + +static int inv_remap_prob(int v, int m) { + static uint8_t inv_map_table[MAX_PROB] = { + 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189, + 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, + 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, + 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, + 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, + 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, + 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, + 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, + 174, 175, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, + 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, + 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222, + 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, + 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 253 + }; + assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0]))); + v = inv_map_table[v]; + m--; + if ((m << 1) <= MAX_PROB) { + return 1 + inv_recenter_nonneg(v, m); + } else { + return MAX_PROB - inv_recenter_nonneg(v, MAX_PROB - 1 - m); + } +} + +static int decode_term_subexp(vpx_reader *r) { + if (!vpx_read_bit(r)) + return vpx_read_literal(r, 4); + if (!vpx_read_bit(r)) + return vpx_read_literal(r, 4) + 16; + if (!vpx_read_bit(r)) + return vpx_read_literal(r, 5) + 32; + return decode_uniform(r) + 64; +} + +void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p) { + if (vpx_read(r, DIFF_UPDATE_PROB)) { + const int delp = decode_term_subexp(r); + *p = (vpx_prob)inv_remap_prob(delp, *p); + } +} 
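vp9_dsubexp.c above decodes probability updates with a short-biased code: the cheaper the prefix, the smaller the delta, which suits the mostly small corrections the encoder sends. Counting bits from decode_term_subexp():

  prefix 0    -> 4-bit literal,         delta in [0, 15]   (5 bits total)
  prefix 10   -> 4-bit literal + 16,    delta in [16, 31]  (6 bits)
  prefix 110  -> 5-bit literal + 32,    delta in [32, 63]  (8 bits)
  prefix 111  -> decode_uniform() + 64, delta in [64, 254] (10-11 bits)

The decoded delta is then re-centred around the old probability, so small codes land closest to it. inv_recenter_nonneg() is self-contained enough to demo directly (sketch):

#include <stdio.h>

/* Same body as inv_recenter_nonneg() in vp9_dsubexp.c above: successive
 * codes alternate below and above the current value m. */
static int inv_recenter_nonneg(int v, int m) {
  if (v > 2 * m)
    return v;
  return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
}

int main(void) {
  int v;
  for (v = 0; v < 5; ++v)  /* prints: 10 9 11 8 12 */
    printf("%d ", inv_recenter_nonneg(v, 10));
  printf("\n");
  return 0;
}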
diff --git a/thirdparty/libvpx/vp9/decoder/vp9_dsubexp.h b/thirdparty/libvpx/vp9/decoder/vp9_dsubexp.h new file mode 100644 index 000000000000..a8bcc70be914 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_dsubexp.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_DECODER_VP9_DSUBEXP_H_ +#define VP9_DECODER_VP9_DSUBEXP_H_ + +#include "vpx_dsp/bitreader.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_DECODER_VP9_DSUBEXP_H_ diff --git a/thirdparty/libvpx/vp9/decoder/vp9_dthread.c b/thirdparty/libvpx/vp9/decoder/vp9_dthread.c new file mode 100644 index 000000000000..14a71448fe70 --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_dthread.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/decoder/vp9_dthread.h" +#include "vp9/decoder/vp9_decoder.h" + +// #define DEBUG_THREAD + +// TODO(hkuang): Clean up all the #ifdef in this file. +void vp9_frameworker_lock_stats(VPxWorker *const worker) { +#if CONFIG_MULTITHREAD + FrameWorkerData *const worker_data = worker->data1; + pthread_mutex_lock(&worker_data->stats_mutex); +#else + (void)worker; +#endif +} + +void vp9_frameworker_unlock_stats(VPxWorker *const worker) { +#if CONFIG_MULTITHREAD + FrameWorkerData *const worker_data = worker->data1; + pthread_mutex_unlock(&worker_data->stats_mutex); +#else + (void)worker; +#endif +} + +void vp9_frameworker_signal_stats(VPxWorker *const worker) { +#if CONFIG_MULTITHREAD + FrameWorkerData *const worker_data = worker->data1; + +// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper. +#if defined(_WIN32) && !HAVE_PTHREAD_H + pthread_cond_signal(&worker_data->stats_cond); +#else + pthread_cond_broadcast(&worker_data->stats_cond); +#endif + +#else + (void)worker; +#endif +} + +// This macro prevents thread_sanitizer from reporting known concurrent writes. +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define BUILDING_WITH_TSAN +#endif +#endif + +// TODO(hkuang): Remove worker parameter as it is only used in debug code. +void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, + int row) { +#if CONFIG_MULTITHREAD + if (!ref_buf) + return; + +#ifndef BUILDING_WITH_TSAN + // The following line of code will get harmless tsan error but it is the key + // to get best performance. + if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return; +#endif + + { + // Find the worker thread that owns the reference frame. If the reference + // frame has been fully decoded, it may not have owner. 
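The wait loop that follows is the classic condition-variable monitor: vp9_frameworker_broadcast() publishes a new decoded row under the stats mutex, and waiting workers re-test their predicate after every wakeup. Reduced to a generic row-progress monitor (a sketch assuming pthreads, i.e. the CONFIG_MULTITHREAD path):

#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int row;  /* last fully decoded row, -1 before decoding starts */
} RowProgress;

static void progress_wait(RowProgress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  while (p->row < row)  /* predicate re-checked on every wakeup */
    pthread_cond_wait(&p->cond, &p->mutex);
  pthread_mutex_unlock(&p->mutex);
}

static void progress_publish(RowProgress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  p->row = row;
  pthread_cond_broadcast(&p->cond);
  pthread_mutex_unlock(&p->mutex);
}

The real code adds two exits this sketch omits: the owner may have moved on to another frame (pbi->cur_buf != ref_buf) or flagged the buffer corrupt, and either condition also ends the wait.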
+ VPxWorker *const ref_worker = ref_buf->frame_worker_owner; + FrameWorkerData *const ref_worker_data = + (FrameWorkerData *)ref_worker->data1; + const VP9Decoder *const pbi = ref_worker_data->pbi; + +#ifdef DEBUG_THREAD + { + FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; + printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n", + worker_data->worker_id, worker, ref_worker_data->worker_id, + ref_buf->frame_worker_owner, row, ref_buf->row); + } +#endif + + vp9_frameworker_lock_stats(ref_worker); + while (ref_buf->row < row && pbi->cur_buf == ref_buf && + ref_buf->buf.corrupted != 1) { + pthread_cond_wait(&ref_worker_data->stats_cond, + &ref_worker_data->stats_mutex); + } + + if (ref_buf->buf.corrupted == 1) { + FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; + vp9_frameworker_unlock_stats(ref_worker); + vpx_internal_error(&worker_data->pbi->common.error, + VPX_CODEC_CORRUPT_FRAME, + "Worker %p failed to decode frame", worker); + } + vp9_frameworker_unlock_stats(ref_worker); + } +#else + (void)worker; + (void)ref_buf; + (void)row; + (void)ref_buf; +#endif // CONFIG_MULTITHREAD +} + +void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) { +#if CONFIG_MULTITHREAD + VPxWorker *worker = buf->frame_worker_owner; + +#ifdef DEBUG_THREAD + { + FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; + printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id, + buf->frame_worker_owner, row); + } +#endif + + vp9_frameworker_lock_stats(worker); + buf->row = row; + vp9_frameworker_signal_stats(worker); + vp9_frameworker_unlock_stats(worker); +#else + (void)buf; + (void)row; +#endif // CONFIG_MULTITHREAD +} + +void vp9_frameworker_copy_context(VPxWorker *const dst_worker, + VPxWorker *const src_worker) { +#if CONFIG_MULTITHREAD + FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1; + FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1; + VP9_COMMON *const src_cm = &src_worker_data->pbi->common; + VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common; + int i; + + // Wait until source frame's context is ready. + vp9_frameworker_lock_stats(src_worker); + while (!src_worker_data->frame_context_ready) { + pthread_cond_wait(&src_worker_data->stats_cond, + &src_worker_data->stats_mutex); + } + + dst_cm->last_frame_seg_map = src_cm->seg.enabled ? + src_cm->current_frame_seg_map : src_cm->last_frame_seg_map; + dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync; + vp9_frameworker_unlock_stats(src_worker); + + dst_cm->bit_depth = src_cm->bit_depth; +#if CONFIG_VP9_HIGHBITDEPTH + dst_cm->use_highbitdepth = src_cm->use_highbitdepth; +#endif + dst_cm->prev_frame = src_cm->show_existing_frame ? + src_cm->prev_frame : src_cm->cur_frame; + dst_cm->last_width = !src_cm->show_existing_frame ? + src_cm->width : src_cm->last_width; + dst_cm->last_height = !src_cm->show_existing_frame ? + src_cm->height : src_cm->last_height; + dst_cm->subsampling_x = src_cm->subsampling_x; + dst_cm->subsampling_y = src_cm->subsampling_y; + dst_cm->frame_type = src_cm->frame_type; + dst_cm->last_show_frame = !src_cm->show_existing_frame ? 
+ src_cm->show_frame : src_cm->last_show_frame; + for (i = 0; i < REF_FRAMES; ++i) + dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i]; + + memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr, + (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh)); + dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level; + dst_cm->lf.filter_level = src_cm->lf.filter_level; + memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS); + memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); + dst_cm->seg = src_cm->seg; + memcpy(dst_cm->frame_contexts, src_cm->frame_contexts, + FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0])); +#else + (void) dst_worker; + (void) src_worker; +#endif // CONFIG_MULTITHREAD +} diff --git a/thirdparty/libvpx/vp9/decoder/vp9_dthread.h b/thirdparty/libvpx/vp9/decoder/vp9_dthread.h new file mode 100644 index 000000000000..ba7c38a511fc --- /dev/null +++ b/thirdparty/libvpx/vp9/decoder/vp9_dthread.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_DECODER_VP9_DTHREAD_H_ +#define VP9_DECODER_VP9_DTHREAD_H_ + +#include "./vpx_config.h" +#include "vpx_util/vpx_thread.h" +#include "vpx/internal/vpx_codec_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; +struct VP9Decoder; + +// WorkerData for the FrameWorker thread. It contains all the information of +// the worker and decode structures for decoding a frame. +typedef struct FrameWorkerData { + struct VP9Decoder *pbi; + const uint8_t *data; + const uint8_t *data_end; + size_t data_size; + void *user_priv; + int result; + int worker_id; + int received_frame; + + // scratch_buffer is used in frame parallel mode only. + // It is used to make a copy of the compressed data. + uint8_t *scratch_buffer; + size_t scratch_buffer_size; + +#if CONFIG_MULTITHREAD + pthread_mutex_t stats_mutex; + pthread_cond_t stats_cond; +#endif + + int frame_context_ready; // Current frame's context is ready to read. + int frame_decoded; // Finished decoding current frame. +} FrameWorkerData; + +void vp9_frameworker_lock_stats(VPxWorker *const worker); +void vp9_frameworker_unlock_stats(VPxWorker *const worker); +void vp9_frameworker_signal_stats(VPxWorker *const worker); + +// Wait until ref_buf has been decoded to row in real pixel unit. +// Note: worker may already finish decoding ref_buf and release it in order to +// start decoding next frame. So need to check whether worker is still decoding +// ref_buf. +void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, + int row); + +// FrameWorker broadcasts its decoding progress so other workers that are +// waiting on it can resume decoding. +void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row); + +// Copy necessary decoding context from src worker to dst worker. 
+void vp9_frameworker_copy_context(VPxWorker *const dst_worker, + VPxWorker *const src_worker); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/thirdparty/libvpx/vp9/vp9_dx_iface.c b/thirdparty/libvpx/vp9/vp9_dx_iface.c new file mode 100644 index 000000000000..6531e2c618f7 --- /dev/null +++ b/thirdparty/libvpx/vp9/vp9_dx_iface.c @@ -0,0 +1,1093 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_version.h" + +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" +#include "vpx_dsp/bitreader_buffer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_util/vpx_thread.h" + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_frame_buffers.h" + +#include "vp9/decoder/vp9_decodeframe.h" + +#include "vp9/vp9_dx_iface.h" +#include "vp9/vp9_iface_common.h" + +#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) + +static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { + // This function only allocates space for the vpx_codec_alg_priv_t + // structure. More memory may be required at the time the stream + // information becomes known. + (void)data; + + if (!ctx->priv) { + vpx_codec_alg_priv_t *const priv = + (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv)); + if (priv == NULL) + return VPX_CODEC_MEM_ERROR; + + ctx->priv = (vpx_codec_priv_t *)priv; + ctx->priv->init_flags = ctx->init_flags; + priv->si.sz = sizeof(priv->si); + priv->flushed = 0; + // Only do frame parallel decode when threads > 1. + priv->frame_parallel_decode = + (ctx->config.dec && (ctx->config.dec->threads > 1) && + (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) ? 
1 : 0; + if (ctx->config.dec) { + priv->cfg = *ctx->config.dec; + ctx->config.dec = &priv->cfg; + } + } + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { + if (ctx->frame_workers != NULL) { + int i; + for (i = 0; i < ctx->num_frame_workers; ++i) { + VPxWorker *const worker = &ctx->frame_workers[i]; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + vpx_get_worker_interface()->end(worker); + vp9_remove_common(&frame_worker_data->pbi->common); +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(&frame_worker_data->pbi->common); +#endif + vp9_decoder_remove(frame_worker_data->pbi); + vpx_free(frame_worker_data->scratch_buffer); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&frame_worker_data->stats_mutex); + pthread_cond_destroy(&frame_worker_data->stats_cond); +#endif + vpx_free(frame_worker_data); + } +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); +#endif + } + + if (ctx->buffer_pool) { + vp9_free_ref_frame_buffers(ctx->buffer_pool); + vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); + } + + vpx_free(ctx->frame_workers); + vpx_free(ctx->buffer_pool); + vpx_free(ctx); + return VPX_CODEC_OK; +} + +static int parse_bitdepth_colorspace_sampling( + BITSTREAM_PROFILE profile, struct vpx_read_bit_buffer *rb) { + vpx_color_space_t color_space; + if (profile >= PROFILE_2) + rb->bit_offset += 1; // Bit-depth 10 or 12. + color_space = (vpx_color_space_t)vpx_rb_read_literal(rb, 3); + if (color_space != VPX_CS_SRGB) { + rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range. + if (profile == PROFILE_1 || profile == PROFILE_3) { + rb->bit_offset += 2; // subsampling x/y. + rb->bit_offset += 1; // unused. + } + } else { + if (profile == PROFILE_1 || profile == PROFILE_3) { + rb->bit_offset += 1; // unused + } else { + // RGB is only available in version 1. + return 0; + } + } + return 1; +} + +static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si, + int *is_intra_only, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + int intra_only_flag = 0; + uint8_t clear_buffer[10]; + + if (data + data_sz <= data) + return VPX_CODEC_INVALID_PARAM; + + si->is_kf = 0; + si->w = si->h = 0; + + if (decrypt_cb) { + data_sz = VPXMIN(sizeof(clear_buffer), data_sz); + decrypt_cb(decrypt_state, data, clear_buffer, data_sz); + data = clear_buffer; + } + + // A maximum of 6 bits are needed to read the frame marker, profile and + // show_existing_frame. + if (data_sz < 1) + return VPX_CODEC_UNSUP_BITSTREAM; + + { + int show_frame; + int error_resilient; + struct vpx_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; + const int frame_marker = vpx_rb_read_literal(&rb, 2); + const BITSTREAM_PROFILE profile = vp9_read_profile(&rb); + + if (frame_marker != VP9_FRAME_MARKER) + return VPX_CODEC_UNSUP_BITSTREAM; + + if (profile >= MAX_PROFILES) + return VPX_CODEC_UNSUP_BITSTREAM; + + if (vpx_rb_read_bit(&rb)) { // show an existing frame + // If profile is > 2 and show_existing_frame is true, then at least 1 more + // byte (6+3=9 bits) is needed. + if (profile > 2 && data_sz < 2) + return VPX_CODEC_UNSUP_BITSTREAM; + vpx_rb_read_literal(&rb, 3); // Frame buffer to show. + return VPX_CODEC_OK; + } + + // For the rest of the function, a maximum of 9 more bytes are needed + // (computed by taking the maximum possible bits needed in each case). 
Note + // that this has to be updated if we read any more bits in this function. + if (data_sz < 10) + return VPX_CODEC_UNSUP_BITSTREAM; + + si->is_kf = !vpx_rb_read_bit(&rb); + show_frame = vpx_rb_read_bit(&rb); + error_resilient = vpx_rb_read_bit(&rb); + + if (si->is_kf) { + if (!vp9_read_sync_code(&rb)) + return VPX_CODEC_UNSUP_BITSTREAM; + + if (!parse_bitdepth_colorspace_sampling(profile, &rb)) + return VPX_CODEC_UNSUP_BITSTREAM; + vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); + } else { + intra_only_flag = show_frame ? 0 : vpx_rb_read_bit(&rb); + + rb.bit_offset += error_resilient ? 0 : 2; // reset_frame_context + + if (intra_only_flag) { + if (!vp9_read_sync_code(&rb)) + return VPX_CODEC_UNSUP_BITSTREAM; + if (profile > PROFILE_0) { + if (!parse_bitdepth_colorspace_sampling(profile, &rb)) + return VPX_CODEC_UNSUP_BITSTREAM; + } + rb.bit_offset += REF_FRAMES; // refresh_frame_flags + vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); + } + } + } + if (is_intra_only != NULL) + *is_intra_only = intra_only_flag; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t decoder_peek_si(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si) { + return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL); +} + +static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx, + vpx_codec_stream_info_t *si) { + const size_t sz = (si->sz >= sizeof(vp9_stream_info_t)) + ? sizeof(vp9_stream_info_t) + : sizeof(vpx_codec_stream_info_t); + memcpy(si, &ctx->si, sz); + si->sz = (unsigned int)sz; + + return VPX_CODEC_OK; +} + +static void set_error_detail(vpx_codec_alg_priv_t *ctx, + const char *const error) { + ctx->base.err_detail = error; +} + +static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx, + const struct vpx_internal_error_info *error) { + if (error->error_code) + set_error_detail(ctx, error->has_detail ? 
error->detail : NULL); + + return error->error_code; +} + +static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { + int i; + + for (i = 0; i < ctx->num_frame_workers; ++i) { + VPxWorker *const worker = &ctx->frame_workers[i]; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + VP9_COMMON *const cm = &frame_worker_data->pbi->common; + BufferPool *const pool = cm->buffer_pool; + + cm->new_fb_idx = INVALID_IDX; + cm->byte_alignment = ctx->byte_alignment; + cm->skip_loop_filter = ctx->skip_loop_filter; + + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + pool->get_fb_cb = ctx->get_ext_fb_cb; + pool->release_fb_cb = ctx->release_ext_fb_cb; + pool->cb_priv = ctx->ext_priv; + } else { + pool->get_fb_cb = vp9_get_frame_buffer; + pool->release_fb_cb = vp9_release_frame_buffer; + + if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); + + pool->cb_priv = &pool->int_frame_buffers; + } + } +} + +static void set_default_ppflags(vp8_postproc_cfg_t *cfg) { + cfg->post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK; + cfg->deblocking_level = 4; + cfg->noise_level = 0; +} + +static void set_ppflags(const vpx_codec_alg_priv_t *ctx, + vp9_ppflags_t *flags) { + flags->post_proc_flag = + ctx->postproc_cfg.post_proc_flag; + + flags->deblocking_level = ctx->postproc_cfg.deblocking_level; + flags->noise_level = ctx->postproc_cfg.noise_level; +} + +static int frame_worker_hook(void *arg1, void *arg2) { + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1; + const uint8_t *data = frame_worker_data->data; + (void)arg2; + + frame_worker_data->result = + vp9_receive_compressed_data(frame_worker_data->pbi, + frame_worker_data->data_size, + &data); + frame_worker_data->data_end = data; + + if (frame_worker_data->pbi->frame_parallel_decode) { + // In frame parallel decoding, a worker thread must successfully decode all + // the compressed data. + if (frame_worker_data->result != 0 || + frame_worker_data->data + frame_worker_data->data_size - 1 > data) { + VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner; + BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool; + // Signal all the other threads that are waiting for this frame. + vp9_frameworker_lock_stats(worker); + frame_worker_data->frame_context_ready = 1; + lock_buffer_pool(pool); + frame_worker_data->pbi->cur_buf->buf.corrupted = 1; + unlock_buffer_pool(pool); + frame_worker_data->pbi->need_resync = 1; + vp9_frameworker_signal_stats(worker); + vp9_frameworker_unlock_stats(worker); + return 0; + } + } else if (frame_worker_data->result != 0) { + // Check decode result in serial decode. + frame_worker_data->pbi->cur_buf->buf.corrupted = 1; + frame_worker_data->pbi->need_resync = 1; + } + return !frame_worker_data->result; +} + +static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { + int i; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + + ctx->last_show_frame = -1; + ctx->next_submit_worker_id = 0; + ctx->last_submit_worker_id = 0; + ctx->next_output_worker_id = 0; + ctx->frame_cache_read = 0; + ctx->frame_cache_write = 0; + ctx->num_cache_frames = 0; + ctx->need_resync = 1; + ctx->num_frame_workers = + (ctx->frame_parallel_decode == 1) ? 
ctx->cfg.threads: 1; + if (ctx->num_frame_workers > MAX_DECODE_THREADS) + ctx->num_frame_workers = MAX_DECODE_THREADS; + ctx->available_threads = ctx->num_frame_workers; + ctx->flushed = 0; + + ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); + if (ctx->buffer_pool == NULL) + return VPX_CODEC_MEM_ERROR; + +#if CONFIG_MULTITHREAD + if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { + set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + return VPX_CODEC_MEM_ERROR; + } +#endif + + ctx->frame_workers = (VPxWorker *) + vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers)); + if (ctx->frame_workers == NULL) { + set_error_detail(ctx, "Failed to allocate frame_workers"); + return VPX_CODEC_MEM_ERROR; + } + + for (i = 0; i < ctx->num_frame_workers; ++i) { + VPxWorker *const worker = &ctx->frame_workers[i]; + FrameWorkerData *frame_worker_data = NULL; + winterface->init(worker); + worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData)); + if (worker->data1 == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker_data"); + return VPX_CODEC_MEM_ERROR; + } + frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool); + if (frame_worker_data->pbi == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker_data"); + return VPX_CODEC_MEM_ERROR; + } + frame_worker_data->pbi->frame_worker_owner = worker; + frame_worker_data->worker_id = i; + frame_worker_data->scratch_buffer = NULL; + frame_worker_data->scratch_buffer_size = 0; + frame_worker_data->frame_context_ready = 0; + frame_worker_data->received_frame = 0; +#if CONFIG_MULTITHREAD + if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) { + set_error_detail(ctx, "Failed to allocate frame_worker_data mutex"); + return VPX_CODEC_MEM_ERROR; + } + + if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) { + set_error_detail(ctx, "Failed to allocate frame_worker_data cond"); + return VPX_CODEC_MEM_ERROR; + } +#endif + // If decoding in serial mode, FrameWorker thread could create tile worker + // thread or loopfilter thread. + frame_worker_data->pbi->max_threads = + (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0; + + frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; + frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode; + frame_worker_data->pbi->common.frame_parallel_decode = + ctx->frame_parallel_decode; + worker->hook = (VPxWorkerHook)frame_worker_hook; + if (!winterface->reset(worker)) { + set_error_detail(ctx, "Frame Worker thread creation failed"); + return VPX_CODEC_MEM_ERROR; + } + } + + // If postprocessing was enabled by the application and a + // configuration has not been provided, default it. + if (!ctx->postproc_cfg_set && + (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) + set_default_ppflags(&ctx->postproc_cfg); + + init_buffer_callbacks(ctx); + + return VPX_CODEC_OK; +} + +static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, + const VP9Decoder *const pbi) { + // Clear resync flag if worker got a key frame or intra only frame. 
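+  // need_resync is raised when corrupt data is seen; only a key frame or an
+  // intra-only frame carries enough state to make decoding valid again.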
+ if (ctx->need_resync == 1 && pbi->need_resync == 0 && + (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME)) + ctx->need_resync = 0; +} + +static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, + const uint8_t **data, unsigned int data_sz, + void *user_priv, int64_t deadline) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + (void)deadline; + + // Determine the stream parameters. Note that we rely on peek_si to + // validate that we have a buffer that does not wrap around the top + // of the heap. + if (!ctx->si.h) { + int is_intra_only = 0; + const vpx_codec_err_t res = + decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only, + ctx->decrypt_cb, ctx->decrypt_state); + if (res != VPX_CODEC_OK) + return res; + + if (!ctx->si.is_kf && !is_intra_only) + return VPX_CODEC_ERROR; + } + + if (!ctx->frame_parallel_decode) { + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->data = *data; + frame_worker_data->data_size = data_sz; + frame_worker_data->user_priv = user_priv; + frame_worker_data->received_frame = 1; + + // Set these even if already initialized. The caller may have changed the + // decrypt config between frames. + frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb; + frame_worker_data->pbi->decrypt_state = ctx->decrypt_state; + + worker->had_error = 0; + winterface->execute(worker); + + // Update data pointer after decode. + *data = frame_worker_data->data_end; + + if (worker->had_error) + return update_error_state(ctx, &frame_worker_data->pbi->common.error); + + check_resync(ctx, frame_worker_data->pbi); + } else { + VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + // Copy context from last worker thread to next worker thread. + if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) + vp9_frameworker_copy_context( + &ctx->frame_workers[ctx->next_submit_worker_id], + &ctx->frame_workers[ctx->last_submit_worker_id]); + + frame_worker_data->pbi->ready_for_new_data = 0; + // Copy the compressed data into worker's internal buffer. + // TODO(hkuang): Will all the workers allocate the same size + // as the size of the first intra frame be better? This will + // avoid too many deallocate and allocate. 
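+    // The scratch buffer below only ever grows, so a long-running decode
+    // stops reallocating once the largest frame seen so far fits.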
+ if (frame_worker_data->scratch_buffer_size < data_sz) { + frame_worker_data->scratch_buffer = + (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz); + if (frame_worker_data->scratch_buffer == NULL) { + set_error_detail(ctx, "Failed to reallocate scratch buffer"); + return VPX_CODEC_MEM_ERROR; + } + frame_worker_data->scratch_buffer_size = data_sz; + } + frame_worker_data->data_size = data_sz; + memcpy(frame_worker_data->scratch_buffer, *data, data_sz); + + frame_worker_data->frame_decoded = 0; + frame_worker_data->frame_context_ready = 0; + frame_worker_data->received_frame = 1; + frame_worker_data->data = frame_worker_data->scratch_buffer; + frame_worker_data->user_priv = user_priv; + + if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) + ctx->last_submit_worker_id = + (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers; + + ctx->next_submit_worker_id = + (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers; + --ctx->available_threads; + worker->had_error = 0; + winterface->launch(worker); + } + + return VPX_CODEC_OK; +} + +static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = {0, 0, 0}; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + ctx->next_output_worker_id = + (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; + // TODO(hkuang): Add worker error handling here. + winterface->sync(worker); + frame_worker_data->received_frame = 0; + ++ctx->available_threads; + + check_resync(ctx, frame_worker_data->pbi); + + if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &frame_worker_data->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx; + yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd, + frame_worker_data->user_priv); + ctx->frame_cache[ctx->frame_cache_write].img.fb_priv = + frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + ctx->frame_cache_write = + (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE; + ++ctx->num_cache_frames; + } +} + +static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, + const uint8_t *data, unsigned int data_sz, + void *user_priv, long deadline) { + const uint8_t *data_start = data; + const uint8_t * const data_end = data + data_sz; + vpx_codec_err_t res; + uint32_t frame_sizes[8]; + int frame_count; + + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return VPX_CODEC_OK; + } + + // Reset flushed when receiving a valid frame. + ctx->flushed = 0; + + // Initialize the decoder workers on the first frame. + if (ctx->frame_workers == NULL) { + const vpx_codec_err_t res = init_decoder(ctx); + if (res != VPX_CODEC_OK) + return res; + } + + res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count, + ctx->decrypt_cb, ctx->decrypt_state); + if (res != VPX_CODEC_OK) + return res; + + if (ctx->frame_parallel_decode) { + // Decode in frame parallel mode. When decoding in this mode, the frame + // passed to the decoder must be either a normal frame or a superframe with + // superframe index so the decoder could get each frame's start position + // in the superframe. 
+    if (frame_count > 0) {
+      int i;
+
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        if (ctx->available_threads == 0) {
+          // No more threads for decoding. Wait until the next output worker
+          // finishes decoding. Then copy the decoded frame into cache.
+          if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+            wait_worker_and_cache_frame(ctx);
+          } else {
+            // TODO(hkuang): Add unit test to test this path.
+            set_error_detail(ctx, "Frame output cache is full.");
+            return VPX_CODEC_ERROR;
+          }
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+        data_start += frame_size;
+      }
+    } else {
+      if (ctx->available_threads == 0) {
+        // No more threads for decoding. Wait until the next output worker
+        // finishes decoding. Then copy the decoded frame into cache.
+        if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+          wait_worker_and_cache_frame(ctx);
+        } else {
+          // TODO(hkuang): Add unit test to test this path.
+          set_error_detail(ctx, "Frame output cache is full.");
+          return VPX_CODEC_ERROR;
+        }
+      }
+
+      res = decode_one(ctx, &data, data_sz, user_priv, deadline);
+      if (res != VPX_CODEC_OK)
+        return res;
+    }
+  } else {
+    // Decode in serial mode.
+    if (frame_count > 0) {
+      int i;
+
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        vpx_codec_err_t res;
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        data_start += frame_size;
+      }
+    } else {
+      while (data_start < data_end) {
+        const uint32_t frame_size = (uint32_t) (data_end - data_start);
+        const vpx_codec_err_t res = decode_one(ctx, &data_start, frame_size,
+                                               user_priv, deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        // Account for suboptimal termination by the encoder.
+        while (data_start < data_end) {
+          const uint8_t marker = read_marker(ctx->decrypt_cb,
+                                             ctx->decrypt_state, data_start);
+          if (marker)
+            break;
+          ++data_start;
+        }
+      }
+    }
+  }
+
+  return res;
+}
+
+static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
+  RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
+  // Decrease reference count of last output frame in frame parallel mode.
+  if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
+    BufferPool *const pool = ctx->buffer_pool;
+    lock_buffer_pool(pool);
+    decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
+    unlock_buffer_pool(pool);
+  }
+}
+
+static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
+                                      vpx_codec_iter_t *iter) {
+  vpx_image_t *img = NULL;
+
+  // Only return a frame when all the CPUs are busy or the
+  // application flushed the decoder in frame parallel decode.
+  if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
+      !ctx->flushed) {
+    return NULL;
+  }
+
+  // Output the frames in the cache first.
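+  // (These were stashed by wait_worker_and_cache_frame() whenever every
+  // worker thread was busy when a new frame arrived.)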
+ if (ctx->num_cache_frames > 0) { + release_last_output_frame(ctx); + ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx; + if (ctx->need_resync) + return NULL; + img = &ctx->frame_cache[ctx->frame_cache_read].img; + ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE; + --ctx->num_cache_frames; + return img; + } + + // iter acts as a flip flop, so an image is only returned on the first + // call to get_frame. + if (*iter == NULL && ctx->frame_workers != NULL) { + do { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = {0, 0, 0}; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VPxWorker *const worker = + &ctx->frame_workers[ctx->next_output_worker_id]; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + ctx->next_output_worker_id = + (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) + set_ppflags(ctx, &flags); + // Wait for the frame from worker thread. + if (winterface->sync(worker)) { + // Check if worker has received any frames. + if (frame_worker_data->received_frame == 1) { + ++ctx->available_threads; + frame_worker_data->received_frame = 0; + check_resync(ctx, frame_worker_data->pbi); + } + if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &frame_worker_data->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + release_last_output_frame(ctx); + ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx; + if (ctx->need_resync) + return NULL; + yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv); + ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + img = &ctx->img; + return img; + } + } else { + // Decoding failed. Release the worker thread. + frame_worker_data->received_frame = 0; + ++ctx->available_threads; + ctx->need_resync = 1; + if (ctx->flushed != 1) + return NULL; + } + } while (ctx->next_output_worker_id != ctx->next_submit_worker_id); + } + return NULL; +} + +static vpx_codec_err_t decoder_set_fb_fn( + vpx_codec_alg_priv_t *ctx, + vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + if (cb_get == NULL || cb_release == NULL) { + return VPX_CODEC_INVALID_PARAM; + } else if (ctx->frame_workers == NULL) { + // If the decoder has already been initialized, do not accept changes to + // the frame buffer functions. + ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return VPX_CODEC_OK; + } + + return VPX_CODEC_ERROR; +} + +static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); + + // Only support this function in serial decode. 
+ if (ctx->frame_parallel_decode) { + set_error_detail(ctx, "Not supported in frame parallel decode"); + return VPX_CODEC_INCAPABLE; + } + + if (data) { + vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; + YV12_BUFFER_CONFIG sd; + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + image2yuvconfig(&frame->img, &sd); + return vp9_set_reference_dec(&frame_worker_data->pbi->common, + (VP9_REFFRAME)frame->frame_type, &sd); + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + + // Only support this function in serial decode. + if (ctx->frame_parallel_decode) { + set_error_detail(ctx, "Not supported in frame parallel decode"); + return VPX_CODEC_INCAPABLE; + } + + if (data) { + vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data; + YV12_BUFFER_CONFIG sd; + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + image2yuvconfig(&frame->img, &sd); + return vp9_copy_reference_dec(frame_worker_data->pbi, + (VP9_REFFRAME)frame->frame_type, &sd); + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); + + // Only support this function in serial decode. + if (ctx->frame_parallel_decode) { + set_error_detail(ctx, "Not supported in frame parallel decode"); + return VPX_CODEC_INCAPABLE; + } + + if (data) { + YV12_BUFFER_CONFIG* fb; + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); + if (fb == NULL) return VPX_CODEC_ERROR; + yuvconfig2image(&data->img, fb, NULL); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_VP9_POSTPROC + vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); + + if (data) { + ctx->postproc_cfg_set = 1; + ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx, + va_list args) { + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +} + +static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const update_info = va_arg(args, int *); + + // Only support this function in serial decode. 
+ if (ctx->frame_parallel_decode) { + set_error_detail(ctx, "Not supported in frame parallel decode"); + return VPX_CODEC_INCAPABLE; + } + + if (update_info) { + if (ctx->frame_workers) { + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + *update_info = frame_worker_data->pbi->refresh_frame_flags; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *corrupted = va_arg(args, int *); + + if (corrupted) { + if (ctx->frame_workers) { + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + RefCntBuffer *const frame_bufs = + frame_worker_data->pbi->common.buffer_pool->frame_bufs; + if (frame_worker_data->pbi->common.frame_to_show == NULL) + return VPX_CODEC_ERROR; + if (ctx->last_show_frame >= 0) + *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const frame_size = va_arg(args, int *); + + // Only support this function in serial decode. + if (ctx->frame_parallel_decode) { + set_error_detail(ctx, "Not supported in frame parallel decode"); + return VPX_CODEC_INCAPABLE; + } + + if (frame_size) { + if (ctx->frame_workers) { + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + frame_size[0] = cm->width; + frame_size[1] = cm->height; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const render_size = va_arg(args, int *); + + // Only support this function in serial decode. 
+ if (ctx->frame_parallel_decode) { + set_error_detail(ctx, "Not supported in frame parallel decode"); + return VPX_CODEC_INCAPABLE; + } + + if (render_size) { + if (ctx->frame_workers) { + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + render_size[0] = cm->render_width; + render_size[1] = cm->render_height; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const bit_depth = va_arg(args, unsigned int *); + VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; + + if (bit_depth) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + *bit_depth = cm->bit_depth; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } + + return VPX_CODEC_INVALID_PARAM; +} + +static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->invert_tile_order = va_arg(args, int); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *); + ctx->decrypt_cb = init ? init->decrypt_cb : NULL; + ctx->decrypt_state = init ? init->decrypt_state : NULL; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, + va_list args) { + const int legacy_byte_alignment = 0; + const int min_byte_alignment = 32; + const int max_byte_alignment = 1024; + const int byte_alignment = va_arg(args, int); + + if (byte_alignment != legacy_byte_alignment && + (byte_alignment < min_byte_alignment || + byte_alignment > max_byte_alignment || + (byte_alignment & (byte_alignment - 1)) != 0)) + return VPX_CODEC_INVALID_PARAM; + + ctx->byte_alignment = byte_alignment; + if (ctx->frame_workers) { + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->common.byte_alignment = byte_alignment; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_loop_filter = va_arg(args, int); + + if (ctx->frame_workers) { + VPxWorker *const worker = ctx->frame_workers; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter; + } + + return VPX_CODEC_OK; +} + +static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { + {VP8_COPY_REFERENCE, ctrl_copy_reference}, + + // Setters + {VP8_SET_REFERENCE, ctrl_set_reference}, + {VP8_SET_POSTPROC, ctrl_set_postproc}, + {VP8_SET_DBG_COLOR_REF_FRAME, ctrl_set_dbg_options}, + {VP8_SET_DBG_COLOR_MB_MODES, ctrl_set_dbg_options}, + {VP8_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options}, + {VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options}, + {VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order}, + {VPXD_SET_DECRYPTOR, ctrl_set_decryptor}, + {VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment}, + {VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter}, + + // Getters + {VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates}, + {VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted}, + {VP9_GET_REFERENCE, ctrl_get_reference}, + 
{VP9D_GET_DISPLAY_SIZE, ctrl_get_render_size}, + {VP9D_GET_BIT_DEPTH, ctrl_get_bit_depth}, + {VP9D_GET_FRAME_SIZE, ctrl_get_frame_size}, + + { -1, NULL}, +}; + +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +CODEC_INTERFACE(vpx_codec_vp9_dx) = { + "WebM Project VP9 Decoder" VERSION_STRING, + VPX_CODEC_INTERNAL_ABI_VERSION, + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | + VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // vpx_codec_caps_t + decoder_init, // vpx_codec_init_fn_t + decoder_destroy, // vpx_codec_destroy_fn_t + decoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t + { // NOLINT + decoder_peek_si, // vpx_codec_peek_si_fn_t + decoder_get_si, // vpx_codec_get_si_fn_t + decoder_decode, // vpx_codec_decode_fn_t + decoder_get_frame, // vpx_codec_frame_get_fn_t + decoder_set_fb_fn, // vpx_codec_set_fb_fn_t + }, + { // NOLINT + 0, + NULL, // vpx_codec_enc_cfg_map_t + NULL, // vpx_codec_encode_fn_t + NULL, // vpx_codec_get_cx_data_fn_t + NULL, // vpx_codec_enc_config_set_fn_t + NULL, // vpx_codec_get_global_headers_fn_t + NULL, // vpx_codec_get_preview_frame_fn_t + NULL // vpx_codec_enc_mr_get_mem_loc_fn_t + } +}; diff --git a/thirdparty/libvpx/vp9/vp9_dx_iface.h b/thirdparty/libvpx/vp9/vp9_dx_iface.h new file mode 100644 index 000000000000..e0e948e16ca2 --- /dev/null +++ b/thirdparty/libvpx/vp9/vp9_dx_iface.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_VP9_DX_IFACE_H_ +#define VP9_VP9_DX_IFACE_H_ + +#include "vp9/decoder/vp9_decoder.h" + +typedef vpx_codec_stream_info_t vp9_stream_info_t; + +// This limit is due to framebuffer numbers. +// TODO(hkuang): Remove this limit after implementing ondemand framebuffers. +#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames. + +typedef struct cache_frame { + int fb_idx; + vpx_image_t img; +} cache_frame; + +struct vpx_codec_alg_priv { + vpx_codec_priv_t base; + vpx_codec_dec_cfg_t cfg; + vp9_stream_info_t si; + int postproc_cfg_set; + vp8_postproc_cfg_t postproc_cfg; + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; + vpx_image_t img; + int img_avail; + int flushed; + int invert_tile_order; + int last_show_frame; // Index of last output frame. + int byte_alignment; + int skip_loop_filter; + + // Frame parallel related. + int frame_parallel_decode; // frame-based threading. + VPxWorker *frame_workers; + int num_frame_workers; + int next_submit_worker_id; + int last_submit_worker_id; + int next_output_worker_id; + int available_threads; + cache_frame frame_cache[FRAME_CACHE_SIZE]; + int frame_cache_write; + int frame_cache_read; + int num_cache_frames; + int need_resync; // wait for key/intra-only frame + // BufferPool that holds all reference frames. Shared by all the FrameWorkers. + BufferPool *buffer_pool; + + // External frame buffer info to save for VP9 common. + void *ext_priv; // Private data associated with the external frame buffers. 
+ vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb; +}; + +#endif // VP9_VP9_DX_IFACE_H_ diff --git a/thirdparty/libvpx/vp9/vp9_iface_common.h b/thirdparty/libvpx/vp9/vp9_iface_common.h new file mode 100644 index 000000000000..938d4224ba30 --- /dev/null +++ b/thirdparty/libvpx/vp9/vp9_iface_common.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VP9_VP9_IFACE_COMMON_H_ +#define VP9_VP9_IFACE_COMMON_H_ + +#include "vpx_ports/mem.h" + +static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, + void *user_priv) { + /** vpx_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ + int bps; + if (!yv12->subsampling_y) { + if (!yv12->subsampling_x) { + img->fmt = VPX_IMG_FMT_I444; + bps = 24; + } else { + img->fmt = VPX_IMG_FMT_I422; + bps = 16; + } + } else { + if (!yv12->subsampling_x) { + img->fmt = VPX_IMG_FMT_I440; + bps = 16; + } else { + img->fmt = VPX_IMG_FMT_I420; + bps = 12; + } + } + img->cs = yv12->color_space; + img->range = yv12->color_range; + img->bit_depth = 8; + img->w = yv12->y_stride; + img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3); + img->d_w = yv12->y_crop_width; + img->d_h = yv12->y_crop_height; + img->r_w = yv12->render_width; + img->r_h = yv12->render_height; + img->x_chroma_shift = yv12->subsampling_x; + img->y_chroma_shift = yv12->subsampling_y; + img->planes[VPX_PLANE_Y] = yv12->y_buffer; + img->planes[VPX_PLANE_U] = yv12->u_buffer; + img->planes[VPX_PLANE_V] = yv12->v_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = yv12->y_stride; + img->stride[VPX_PLANE_U] = yv12->uv_stride; + img->stride[VPX_PLANE_V] = yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; +#if CONFIG_VP9_HIGHBITDEPTH + if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) { + // vpx_image_t uses byte strides and a pointer to the first byte + // of the image. 
+ img->fmt = (vpx_img_fmt_t)(img->fmt | VPX_IMG_FMT_HIGHBITDEPTH); + img->bit_depth = yv12->bit_depth; + img->planes[VPX_PLANE_Y] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->y_buffer); + img->planes[VPX_PLANE_U] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->u_buffer); + img->planes[VPX_PLANE_V] = (uint8_t*)CONVERT_TO_SHORTPTR(yv12->v_buffer); + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride; + img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride; + img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + img->bps = bps; + img->user_priv = user_priv; + img->img_data = yv12->buffer_alloc; + img->img_data_owner = 0; + img->self_allocd = 0; +} + +static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, + YV12_BUFFER_CONFIG *yv12) { + yv12->y_buffer = img->planes[VPX_PLANE_Y]; + yv12->u_buffer = img->planes[VPX_PLANE_U]; + yv12->v_buffer = img->planes[VPX_PLANE_V]; + + yv12->y_crop_width = img->d_w; + yv12->y_crop_height = img->d_h; + yv12->render_width = img->r_w; + yv12->render_height = img->r_h; + yv12->y_width = img->d_w; + yv12->y_height = img->d_h; + + yv12->uv_width = img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 + : yv12->y_width; + yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 + : yv12->y_height; + yv12->uv_crop_width = yv12->uv_width; + yv12->uv_crop_height = yv12->uv_height; + + yv12->y_stride = img->stride[VPX_PLANE_Y]; + yv12->uv_stride = img->stride[VPX_PLANE_U]; + yv12->color_space = img->cs; + yv12->color_range = img->range; + +#if CONFIG_VP9_HIGHBITDEPTH + if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + // In vpx_image_t + // planes point to uint8 address of start of data + // stride counts uint8s to reach next row + // In YV12_BUFFER_CONFIG + // y_buffer, u_buffer, v_buffer point to uint16 address of data + // stride and border counts in uint16s + // This means that all the address calculations in the main body of code + // should work correctly. + // However, before we do any pixel operations we need to cast the address + // to a uint16 ponter and double its value. + yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer); + yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer); + yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer); + yv12->y_stride >>= 1; + yv12->uv_stride >>= 1; + yv12->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + yv12->flags = 0; + } + yv12->border = (yv12->y_stride - img->w) / 2; +#else + yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; +#endif // CONFIG_VP9_HIGHBITDEPTH + yv12->subsampling_x = img->x_chroma_shift; + yv12->subsampling_y = img->y_chroma_shift; + return VPX_CODEC_OK; +} + +#endif // VP9_VP9_IFACE_COMMON_H_ diff --git a/thirdparty/libvpx/vp9_rtcd.h b/thirdparty/libvpx/vp9_rtcd.h new file mode 100644 index 000000000000..cf2b463d6393 --- /dev/null +++ b/thirdparty/libvpx/vp9_rtcd.h @@ -0,0 +1,9 @@ +#include "vpx_config.h" + +#if defined(WEBM_X86ASM) && (ARCH_X86 || ARCH_X86_64) + #include "rtcd/vp9_rtcd_x86.h" +#elif defined(WEBM_ARMASM) && ARCH_ARM + #include "rtcd/vp9_rtcd_arm.h" +#else + #include "rtcd/vp9_rtcd_c.h" +#endif diff --git a/thirdparty/libvpx/vpx/internal/vpx_codec_internal.h b/thirdparty/libvpx/vpx/internal/vpx_codec_internal.h new file mode 100644 index 000000000000..7380fcc7e24c --- /dev/null +++ b/thirdparty/libvpx/vpx/internal/vpx_codec_internal.h @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\file
+ * \brief Describes the decoder algorithm interface for algorithm
+ *        implementations.
+ *
+ * This file defines the private structures and data types that are only
+ * relevant to implementing an algorithm, as opposed to using it.
+ *
+ * To create a decoder algorithm class, an interface structure is put
+ * into the global namespace:
+ *     <pre>
+ *     my_codec.c:
+ *       vpx_codec_iface_t my_codec = {
+ *           "My Codec v1.0",
+ *           VPX_CODEC_ALG_ABI_VERSION,
+ *           ...
+ *       };
+ *     </pre>
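+ *
+ *     (In this tree, the CODEC_INTERFACE() convenience macro defined near
+ *     the bottom of this file expands to such a definition together with a
+ *     getter function.)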
+ *
+ * An application instantiates a specific decoder instance by using
+ * vpx_codec_init() and a pointer to the algorithm's interface structure:
+ *     <pre>
+ *     my_app.c:
+ *       extern vpx_codec_iface_t my_codec;
+ *       {
+ *           vpx_codec_ctx_t algo;
+ *           res = vpx_codec_init(&algo, &my_codec);
+ *       }
+ *     </pre>
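+ *
+ * Decoding then follows the usual submit/drain pattern. A minimal sketch,
+ * added here for illustration (process_image() stands in for whatever the
+ * application does with a picture; error handling omitted):
+ *     <pre>
+ *       vpx_codec_iter_t iter = NULL;
+ *       vpx_image_t *img;
+ *       res = vpx_codec_decode(&algo, buf, buf_sz, NULL, 0);
+ *       while ((img = vpx_codec_get_frame(&algo, &iter)) != NULL)
+ *           process_image(img);
+ *     </pre>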
+ *
+ * Once initialized, the instance is managed using other functions from
+ * the vpx_codec_* family.
+ */
+#ifndef VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
+#define VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
+#include "../vpx_decoder.h"
+#include "../vpx_encoder.h"
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
+
+typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t;
+typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t;
+
+/*!\brief init function pointer prototype
+ *
+ * Performs algorithm-specific initialization of the decoder context. This
+ * function is called by the generic vpx_codec_init() wrapper function, so
+ * plugins implementing this interface may trust the input parameters to be
+ * properly initialized.
+ *
+ * \param[in] ctx   Pointer to this instance's context
+ * \retval #VPX_CODEC_OK
+ *     The input stream was recognized and decoder initialized.
+ * \retval #VPX_CODEC_MEM_ERROR
+ *     Memory operation failed.
+ */
+typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx,
+                                               vpx_codec_priv_enc_mr_cfg_t *data);
+
+/*!\brief destroy function pointer prototype
+ *
+ * Performs algorithm-specific destruction of the decoder context. This
+ * function is called by the generic vpx_codec_destroy() wrapper function,
+ * so plugins implementing this interface may trust the input parameters
+ * to be properly initialized.
+ *
+ * \param[in] ctx   Pointer to this instance's context
+ * \retval #VPX_CODEC_OK
+ *     The input stream was recognized and decoder initialized.
+ * \retval #VPX_CODEC_MEM_ERROR
+ *     Memory operation failed.
+ */
+typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx);
+
+/*!\brief parse stream info function pointer prototype
+ *
+ * Performs high level parsing of the bitstream. This function is called by the
+ * generic vpx_codec_peek_stream_info() wrapper function, so plugins
+ * implementing this interface may trust the input parameters to be properly
+ * initialized.
+ *
+ * \param[in]      data    Pointer to a block of data to parse
+ * \param[in]      data_sz Size of the data buffer
+ * \param[in,out]  si      Pointer to stream info to update. The size member
+ *                         \ref MUST be properly initialized, but \ref MAY be
+ *                         clobbered by the algorithm. This parameter \ref MAY
+ *                         be NULL.
+ *
+ * \retval #VPX_CODEC_OK
+ *     Bitstream is parsable and stream information updated
+ */
+typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data,
+                                                  unsigned int data_sz,
+                                                  vpx_codec_stream_info_t *si);
+
+/*!\brief Return information about the current stream.
+ *
+ * Returns information about the stream that has been parsed during decoding.
+ *
+ * \param[in]      ctx     Pointer to this instance's context
+ * \param[in,out]  si      Pointer to stream info to update. The size member
+ *                         \ref MUST be properly initialized, but \ref MAY be
+ *                         clobbered by the algorithm. This parameter \ref MAY
+ *                         be NULL.
+ *
+ * \retval #VPX_CODEC_OK
+ *     Bitstream is parsable and stream information updated
+ */
+typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx,
+                                                 vpx_codec_stream_info_t *si);
+
+/*!\brief control function pointer prototype
+ *
+ * This function is used to exchange algorithm specific data with the decoder
+ * instance. This can be used to implement features specific to a particular
+ * algorithm.
+ *
+ * This function is called by the generic vpx_codec_control() wrapper
+ * function, so plugins implementing this interface may trust the input
+ * parameters to be properly initialized. However, this interface does not
+ * provide type safety for the exchanged data or assign meanings to the
+ * control codes. Those details should be specified in the algorithm's
+ * header file. In particular, the ctrl_id parameter is guaranteed to exist
+ * in the algorithm's control mapping table, and the data parameter may be NULL.
+ *
+ *
+ * \param[in]     ctx      Pointer to this instance's context
+ * \param[in]     ctrl_id  Algorithm specific control identifier
+ * \param[in,out] data     Data to exchange with algorithm instance.
+ *
+ * \retval #VPX_CODEC_OK
+ *     The internal state data was deserialized.
+ */
+typedef vpx_codec_err_t (*vpx_codec_control_fn_t)(vpx_codec_alg_priv_t *ctx,
+                                                  va_list ap);
+
+/*!\brief control function pointer mapping
+ *
+ * This structure stores the mapping between control identifiers and
+ * implementing functions. Each algorithm provides a list of these
+ * mappings. This list is searched by the vpx_codec_control() wrapper
+ * function to determine which function to invoke. The special
+ * value {0, NULL} is used to indicate end-of-list, and must be
+ * present. The special value {0, <non-null>} can be used as a catch-all
+ * mapping. This implies that ctrl_id values chosen by the algorithm
+ * \ref MUST be non-zero.
+ */
+typedef const struct vpx_codec_ctrl_fn_map {
+  int ctrl_id;
+  vpx_codec_control_fn_t fn;
+} vpx_codec_ctrl_fn_map_t;
+
+/*!\brief decode data function pointer prototype
+ *
+ * Processes a buffer of coded data. If the processing results in a new
+ * decoded frame becoming available, #VPX_CODEC_CB_PUT_SLICE and
+ * #VPX_CODEC_CB_PUT_FRAME events are generated as appropriate. This
+ * function is called by the generic vpx_codec_decode() wrapper function,
+ * so plugins implementing this interface may trust the input parameters
+ * to be properly initialized.
+ *
+ * \param[in] ctx      Pointer to this instance's context
+ * \param[in] data     Pointer to this block of new coded data. If
+ *                     NULL, a #VPX_CODEC_CB_PUT_FRAME event is posted
+ *                     for the previously decoded frame.
+ * \param[in] data_sz  Size of the coded data, in bytes.
+ *
+ * \return Returns #VPX_CODEC_OK if the coded data was processed completely
+ *         and future pictures can be decoded without error. Otherwise,
+ *         see the descriptions of the other error codes in ::vpx_codec_err_t
+ *         for recoverability capabilities.
+ */
+typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx,
+                                                 const uint8_t *data,
+                                                 unsigned int data_sz,
+                                                 void *user_priv,
+                                                 long deadline);
+
+/*!\brief Decoded frames iterator
+ *
+ * Iterates over a list of the frames available for display. The iterator
+ * storage should be initialized to NULL to start the iteration. Iteration is
+ * complete when this function returns NULL.
+ *
+ * The list of available frames becomes valid upon completion of the
+ * vpx_codec_decode call, and remains valid until the next call to
+ * vpx_codec_decode.
+ * + * \param[in] ctx Pointer to this instance's context + * \param[in out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. + */ +typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter); + +/*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libvpx needs a frame buffer + * to decode the current frame and a function to be called when libvpx does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libvpx will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #VPX_CODEC_OK + * External frame buffers will be used by libvpx. + * \retval #VPX_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * using external frame buffers. + * + * \note + * When decoding VP9, the application may be required to pass in at least + * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame + * buffers. + */ +typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)( + vpx_codec_alg_priv_t *ctx, + vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); + + +typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, + const vpx_image_t *img, + vpx_codec_pts_t pts, + unsigned long duration, + vpx_enc_frame_flags_t flags, + unsigned long deadline); +typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter); + +typedef vpx_codec_err_t +(*vpx_codec_enc_config_set_fn_t)(vpx_codec_alg_priv_t *ctx, + const vpx_codec_enc_cfg_t *cfg); +typedef vpx_fixed_buf_t * +(*vpx_codec_get_global_headers_fn_t)(vpx_codec_alg_priv_t *ctx); + +typedef vpx_image_t * +(*vpx_codec_get_preview_frame_fn_t)(vpx_codec_alg_priv_t *ctx); + +typedef vpx_codec_err_t +(*vpx_codec_enc_mr_get_mem_loc_fn_t)(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc); + +/*!\brief usage configuration mapping + * + * This structure stores the mapping between usage identifiers and + * configuration structures. Each algorithm provides a list of these + * mappings. This list is searched by the vpx_codec_enc_config_default() + * wrapper function to determine which config to return. The special value + * {-1, {0}} is used to indicate end-of-list, and must be present. At least + * one mapping must be present, in addition to the end-of-list. + * + */ +typedef const struct vpx_codec_enc_cfg_map { + int usage; + vpx_codec_enc_cfg_t cfg; +} vpx_codec_enc_cfg_map_t; + +/*!\brief Decoder algorithm interface interface + * + * All decoders \ref MUST expose a variable of this type. 
+ */ +struct vpx_codec_iface { + const char *name; /**< Identification String */ + int abi_version; /**< Implemented ABI version */ + vpx_codec_caps_t caps; /**< Decoder capabilities */ + vpx_codec_init_fn_t init; /**< \copydoc ::vpx_codec_init_fn_t */ + vpx_codec_destroy_fn_t destroy; /**< \copydoc ::vpx_codec_destroy_fn_t */ + vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */ + struct vpx_codec_dec_iface { + vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ + vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ + vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */ + vpx_codec_get_frame_fn_t get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */ + vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ + } dec; + struct vpx_codec_enc_iface { + int cfg_map_count; + vpx_codec_enc_cfg_map_t *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ + vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ + vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ + vpx_codec_enc_config_set_fn_t cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ + vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ + vpx_codec_get_preview_frame_fn_t get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ + vpx_codec_enc_mr_get_mem_loc_fn_t mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ + } enc; +}; + +/*!\brief Callback function pointer / user data pair storage */ +typedef struct vpx_codec_priv_cb_pair { + union { + vpx_codec_put_frame_cb_fn_t put_frame; + vpx_codec_put_slice_cb_fn_t put_slice; + } u; + void *user_priv; +} vpx_codec_priv_cb_pair_t; + + +/*!\brief Instance private storage + * + * This structure is allocated by the algorithm's init function. It can be + * extended in one of two ways. First, a second, algorithm specific structure + * can be allocated and the priv member pointed to it. Alternatively, this + * structure can be made the first member of the algorithm specific structure, + * and the pointer cast to the proper type. + */ +struct vpx_codec_priv { + const char *err_detail; + vpx_codec_flags_t init_flags; + struct { + vpx_codec_priv_cb_pair_t put_frame_cb; + vpx_codec_priv_cb_pair_t put_slice_cb; + } dec; + struct { + vpx_fixed_buf_t cx_data_dst_buf; + unsigned int cx_data_pad_before; + unsigned int cx_data_pad_after; + vpx_codec_cx_pkt_t cx_data_pkt; + unsigned int total_encoders; + } enc; +}; + +/* + * Multi-resolution encoding internal configuration + */ +struct vpx_codec_priv_enc_mr_cfg +{ + unsigned int mr_total_resolutions; + unsigned int mr_encoder_id; + struct vpx_rational mr_down_sampling_factor; + void* mr_low_res_mode_info; +}; + +#undef VPX_CTRL_USE_TYPE +#define VPX_CTRL_USE_TYPE(id, typ) \ + static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} + +#undef VPX_CTRL_USE_TYPE_DEPRECATED +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} + +#define CAST(id, arg) id##__value(arg) + +/* CODEC_INTERFACE convenience macro + * + * By convention, each codec interface is a struct with extern linkage, where + * the symbol is suffixed with _algo. A getter function is also defined to + * return a pointer to the struct, since in some cases it's easier to work + * with text symbols than data symbols (see issue #169). 
+ * This function has the same name as the struct, less the _algo suffix. The
+ * CODEC_INTERFACE macro is provided to define this getter function
+ * automatically.
+ */
+#define CODEC_INTERFACE(id)\
+  vpx_codec_iface_t* id(void) { return &id##_algo; }\
+  vpx_codec_iface_t id##_algo
+
+
+/* Internal Utility Functions
+ *
+ * The following functions are intended to be used inside algorithms as
+ * utilities for manipulating vpx_codec_* data structures.
+ */
+struct vpx_codec_pkt_list {
+  unsigned int cnt;
+  unsigned int max;
+  struct vpx_codec_cx_pkt pkts[1];
+};
+
+#define vpx_codec_pkt_list_decl(n)\
+  union {struct vpx_codec_pkt_list head;\
+    struct {struct vpx_codec_pkt_list head;\
+      struct vpx_codec_cx_pkt pkts[n];} alloc;}
+
+#define vpx_codec_pkt_list_init(m)\
+  (m)->alloc.head.cnt = 0,\
+  (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0])
+
+int
+vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *,
+                       const struct vpx_codec_cx_pkt *);
+
+const vpx_codec_cx_pkt_t *
+vpx_codec_pkt_list_get(struct vpx_codec_pkt_list *list,
+                       vpx_codec_iter_t *iter);
+
+
+#include <stdio.h>
+#include <setjmp.h>
+
+struct vpx_internal_error_info {
+  vpx_codec_err_t error_code;
+  int has_detail;
+  char detail[80];
+  int setjmp;
+  jmp_buf jmp;
+};
+
+#define CLANG_ANALYZER_NORETURN
+#if defined(__has_feature)
+#if __has_feature(attribute_analyzer_noreturn)
+#undef CLANG_ANALYZER_NORETURN
+#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#endif
+#endif
+
+void vpx_internal_error(struct vpx_internal_error_info *info,
+                        vpx_codec_err_t error,
+                        const char *fmt,
+                        ...) CLANG_ANALYZER_NORETURN;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
diff --git a/thirdparty/libvpx/vpx/internal/vpx_psnr.h b/thirdparty/libvpx/vpx/internal/vpx_psnr.h
new file mode 100644
index 000000000000..07d81bb8d904
--- /dev/null
+++ b/thirdparty/libvpx/vpx/internal/vpx_psnr.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_INTERNAL_VPX_PSNR_H_
+#define VPX_INTERNAL_VPX_PSNR_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in]    samples Number of samples
+ * \param[in]    peak    Max sample value
+ * \param[in]    sse     Sum of squared errors
+ */
+double vpx_sse_to_psnr(double samples, double peak, double sse);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_INTERNAL_VPX_PSNR_H_
diff --git a/thirdparty/libvpx/vpx/src/vpx_codec.c b/thirdparty/libvpx/vpx/src/vpx_codec.c
new file mode 100644
index 000000000000..5a495ce814b8
--- /dev/null
+++ b/thirdparty/libvpx/vpx/src/vpx_codec.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <stdarg.h>
+#include <stdlib.h>
+#include "vpx/vpx_integer.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+
+#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
+
+int vpx_codec_version(void) {
+  return VERSION_PACKED;
+}
+
+
+const char *vpx_codec_version_str(void) {
+  return VERSION_STRING_NOSP;
+}
+
+
+const char *vpx_codec_version_extra_str(void) {
+  return VERSION_EXTRA;
+}
+
+
+const char *vpx_codec_iface_name(vpx_codec_iface_t *iface) {
+  return iface ? iface->name : "<invalid interface>";
+}
+
+const char *vpx_codec_err_to_string(vpx_codec_err_t err) {
+  switch (err) {
+    case VPX_CODEC_OK:
+      return "Success";
+    case VPX_CODEC_ERROR:
+      return "Unspecified internal error";
+    case VPX_CODEC_MEM_ERROR:
+      return "Memory allocation error";
+    case VPX_CODEC_ABI_MISMATCH:
+      return "ABI version mismatch";
+    case VPX_CODEC_INCAPABLE:
+      return "Codec does not implement requested capability";
+    case VPX_CODEC_UNSUP_BITSTREAM:
+      return "Bitstream not supported by this decoder";
+    case VPX_CODEC_UNSUP_FEATURE:
+      return "Bitstream required feature not supported by this decoder";
+    case VPX_CODEC_CORRUPT_FRAME:
+      return "Corrupt frame detected";
+    case VPX_CODEC_INVALID_PARAM:
+      return "Invalid parameter";
+    case VPX_CODEC_LIST_END:
+      return "End of iterated list";
+  }
+
+  return "Unrecognized error code";
+}
+
+const char *vpx_codec_error(vpx_codec_ctx_t *ctx) {
+  return (ctx) ? vpx_codec_err_to_string(ctx->err)
+         : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM);
+}
+
+const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) {
+  if (ctx && ctx->err)
+    return ctx->priv ? ctx->priv->err_detail : ctx->err_detail;
+
+  return NULL;
+}
+
+
+vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) {
+  vpx_codec_err_t res;
+
+  if (!ctx)
+    res = VPX_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv)
+    res = VPX_CODEC_ERROR;
+  else {
+    ctx->iface->destroy((vpx_codec_alg_priv_t *)ctx->priv);
+
+    ctx->iface = NULL;
+    ctx->name = NULL;
+    ctx->priv = NULL;
+    res = VPX_CODEC_OK;
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+
+vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) {
+  return (iface) ? iface->caps : 0;
+}
+
+
+vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx,
+                                   int ctrl_id,
+                                   ...) {
+  vpx_codec_err_t res;
+
+  if (!ctx || !ctrl_id)
+    res = VPX_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps)
+    res = VPX_CODEC_ERROR;
+  else {
+    vpx_codec_ctrl_fn_map_t *entry;
+
+    res = VPX_CODEC_ERROR;
+
+    for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
+      if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
+        va_list ap;
+
+        va_start(ap, ctrl_id);
+        res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap);
+        va_end(ap);
+        break;
+      }
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+void vpx_internal_error(struct vpx_internal_error_info *info,
+                        vpx_codec_err_t error,
+                        const char *fmt,
+                        ...) {
+  va_list ap;
+
+  info->error_code = error;
+  info->has_detail = 0;
+
+  if (fmt) {
+    size_t sz = sizeof(info->detail);
+
+    info->has_detail = 1;
+    va_start(ap, fmt);
+    vsnprintf(info->detail, sz - 1, fmt, ap);
+    va_end(ap);
+    info->detail[sz - 1] = '\0';
+  }
+
+  if (info->setjmp)
+    longjmp(info->jmp, info->error_code);
+}
diff --git a/thirdparty/libvpx/vpx/src/vpx_decoder.c b/thirdparty/libvpx/vpx/src/vpx_decoder.c
new file mode 100644
index 000000000000..802d8edd8a43
--- /dev/null
+++ b/thirdparty/libvpx/vpx/src/vpx_decoder.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <string.h>
+#include "vpx/internal/vpx_codec_internal.h"
+
+#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
+
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+  return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
+vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
+                                       vpx_codec_iface_t *iface,
+                                       const vpx_codec_dec_cfg_t *cfg,
+                                       vpx_codec_flags_t flags,
+                                       int ver) {
+  vpx_codec_err_t res;
+
+  if (ver != VPX_DECODER_ABI_VERSION)
+    res = VPX_CODEC_ABI_MISMATCH;
+  else if (!ctx || !iface)
+    res = VPX_CODEC_INVALID_PARAM;
+  else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION)
+    res = VPX_CODEC_ABI_MISMATCH;
+  else if ((flags & VPX_CODEC_USE_POSTPROC) && !(iface->caps & VPX_CODEC_CAP_POSTPROC))
+    res = VPX_CODEC_INCAPABLE;
+  else if ((flags & VPX_CODEC_USE_ERROR_CONCEALMENT) &&
+           !(iface->caps & VPX_CODEC_CAP_ERROR_CONCEALMENT))
+    res = VPX_CODEC_INCAPABLE;
+  else if ((flags & VPX_CODEC_USE_INPUT_FRAGMENTS) &&
+           !(iface->caps & VPX_CODEC_CAP_INPUT_FRAGMENTS))
+    res = VPX_CODEC_INCAPABLE;
+  else if (!(iface->caps & VPX_CODEC_CAP_DECODER))
+    res = VPX_CODEC_INCAPABLE;
+  else {
+    memset(ctx, 0, sizeof(*ctx));
+    ctx->iface = iface;
+    ctx->name = iface->name;
+    ctx->priv = NULL;
+    ctx->init_flags = flags;
+    ctx->config.dec = cfg;
+
+    res = ctx->iface->init(ctx, NULL);
+    if (res) {
+      ctx->err_detail = ctx->priv ?
ctx->priv->err_detail : NULL; + vpx_codec_destroy(ctx); + } + } + + return SAVE_STATUS(ctx, res); +} + + +vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, + const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si) { + vpx_codec_err_t res; + + if (!iface || !data || !data_sz || !si + || si->sz < sizeof(vpx_codec_stream_info_t)) + res = VPX_CODEC_INVALID_PARAM; + else { + /* Set default/unknown values */ + si->w = 0; + si->h = 0; + + res = iface->dec.peek_si(data, data_sz, si); + } + + return res; +} + + +vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, + vpx_codec_stream_info_t *si) { + vpx_codec_err_t res; + + if (!ctx || !si || si->sz < sizeof(vpx_codec_stream_info_t)) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = VPX_CODEC_ERROR; + else { + /* Set default/unknown values */ + si->w = 0; + si->h = 0; + + res = ctx->iface->dec.get_si(get_alg_priv(ctx), si); + } + + return SAVE_STATUS(ctx, res); +} + + +vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, + const uint8_t *data, + unsigned int data_sz, + void *user_priv, + long deadline) { + vpx_codec_err_t res; + + /* Sanity checks */ + /* NULL data ptr allowed if data_sz is 0 too */ + if (!ctx || (!data && data_sz) || (data && !data_sz)) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = VPX_CODEC_ERROR; + else { + res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv, + deadline); + } + + return SAVE_STATUS(ctx, res); +} + +vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, + vpx_codec_iter_t *iter) { + vpx_image_t *img; + + if (!ctx || !iter || !ctx->iface || !ctx->priv) + img = NULL; + else + img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter); + + return img; +} + + +vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_frame_cb_fn_t cb, + void *user_priv) { + vpx_codec_err_t res; + + if (!ctx || !cb) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv + || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME)) + res = VPX_CODEC_ERROR; + else { + ctx->priv->dec.put_frame_cb.u.put_frame = cb; + ctx->priv->dec.put_frame_cb.user_priv = user_priv; + res = VPX_CODEC_OK; + } + + return SAVE_STATUS(ctx, res); +} + + +vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_slice_cb_fn_t cb, + void *user_priv) { + vpx_codec_err_t res; + + if (!ctx || !cb) + res = VPX_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv + || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) + res = VPX_CODEC_ERROR; + else { + ctx->priv->dec.put_slice_cb.u.put_slice = cb; + ctx->priv->dec.put_slice_cb.user_priv = user_priv; + res = VPX_CODEC_OK; + } + + return SAVE_STATUS(ctx, res); +} + +vpx_codec_err_t vpx_codec_set_frame_buffer_functions( + vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + vpx_codec_err_t res; + + if (!ctx || !cb_get || !cb_release) { + res = VPX_CODEC_INVALID_PARAM; + } else if (!ctx->iface || !ctx->priv || + !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) { + res = VPX_CODEC_ERROR; + } else { + res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release, + cb_priv); + } + + return SAVE_STATUS(ctx, res); +} diff --git a/thirdparty/libvpx/vpx/src/vpx_image.c b/thirdparty/libvpx/vpx/src/vpx_image.c new file mode 100644 index 000000000000..9aae12c794ba --- /dev/null +++ b/thirdparty/libvpx/vpx/src/vpx_image.c @@ -0,0 +1,285 @@ +/* + * 
Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx/vpx_image.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +static vpx_image_t *img_alloc_helper(vpx_image_t *img, + vpx_img_fmt_t fmt, + unsigned int d_w, + unsigned int d_h, + unsigned int buf_align, + unsigned int stride_align, + unsigned char *img_data) { + unsigned int h, w, s, xcs, ycs, bps; + unsigned int stride_in_bytes; + int align; + + /* Treat align==0 like align==1 */ + if (!buf_align) + buf_align = 1; + + /* Validate alignment (must be power of 2) */ + if (buf_align & (buf_align - 1)) + goto fail; + + /* Treat align==0 like align==1 */ + if (!stride_align) + stride_align = 1; + + /* Validate alignment (must be power of 2) */ + if (stride_align & (stride_align - 1)) + goto fail; + + /* Get sample size for this format */ + switch (fmt) { + case VPX_IMG_FMT_RGB32: + case VPX_IMG_FMT_RGB32_LE: + case VPX_IMG_FMT_ARGB: + case VPX_IMG_FMT_ARGB_LE: + bps = 32; + break; + case VPX_IMG_FMT_RGB24: + case VPX_IMG_FMT_BGR24: + bps = 24; + break; + case VPX_IMG_FMT_RGB565: + case VPX_IMG_FMT_RGB565_LE: + case VPX_IMG_FMT_RGB555: + case VPX_IMG_FMT_RGB555_LE: + case VPX_IMG_FMT_UYVY: + case VPX_IMG_FMT_YUY2: + case VPX_IMG_FMT_YVYU: + bps = 16; + break; + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_VPXI420: + case VPX_IMG_FMT_VPXYV12: + bps = 12; + break; + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I440: + bps = 16; + break; + case VPX_IMG_FMT_I444: + bps = 24; + break; + case VPX_IMG_FMT_I42016: + bps = 24; + break; + case VPX_IMG_FMT_I42216: + case VPX_IMG_FMT_I44016: + bps = 32; + break; + case VPX_IMG_FMT_I44416: + bps = 48; + break; + default: + bps = 16; + break; + } + + /* Get chroma shift values for this format */ + switch (fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_VPXI420: + case VPX_IMG_FMT_VPXYV12: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: + xcs = 1; + break; + default: + xcs = 0; + break; + } + + switch (fmt) { + case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_I440: + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_VPXI420: + case VPX_IMG_FMT_VPXYV12: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I44016: + ycs = 1; + break; + default: + ycs = 0; + break; + } + + /* Calculate storage sizes given the chroma subsampling */ + align = (1 << xcs) - 1; + w = (d_w + align) & ~align; + align = (1 << ycs) - 1; + h = (d_h + align) & ~align; + s = (fmt & VPX_IMG_FMT_PLANAR) ? w : bps * w / 8; + s = (s + stride_align - 1) & ~(stride_align - 1); + stride_in_bytes = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s; + + /* Allocate the new image */ + if (!img) { + img = (vpx_image_t *)calloc(1, sizeof(vpx_image_t)); + + if (!img) + goto fail; + + img->self_allocd = 1; + } else { + memset(img, 0, sizeof(vpx_image_t)); + } + + img->img_data = img_data; + + if (!img_data) { + const uint64_t alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ? 
+ (uint64_t)h * s * bps / 8 : (uint64_t)h * s; + + if (alloc_size != (size_t)alloc_size) + goto fail; + + img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size); + img->img_data_owner = 1; + } + + if (!img->img_data) + goto fail; + + img->fmt = fmt; + img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; + img->w = w; + img->h = h; + img->x_chroma_shift = xcs; + img->y_chroma_shift = ycs; + img->bps = bps; + + /* Calculate strides */ + img->stride[VPX_PLANE_Y] = img->stride[VPX_PLANE_ALPHA] = stride_in_bytes; + img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = stride_in_bytes >> xcs; + + /* Default viewport to entire image */ + if (!vpx_img_set_rect(img, 0, 0, d_w, d_h)) + return img; + +fail: + vpx_img_free(img); + return NULL; +} + +vpx_image_t *vpx_img_alloc(vpx_image_t *img, + vpx_img_fmt_t fmt, + unsigned int d_w, + unsigned int d_h, + unsigned int align) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL); +} + +vpx_image_t *vpx_img_wrap(vpx_image_t *img, + vpx_img_fmt_t fmt, + unsigned int d_w, + unsigned int d_h, + unsigned int stride_align, + unsigned char *img_data) { + /* By setting buf_align = 1, we don't change buffer alignment in this + * function. */ + return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data); +} + +int vpx_img_set_rect(vpx_image_t *img, + unsigned int x, + unsigned int y, + unsigned int w, + unsigned int h) { + unsigned char *data; + + if (x + w <= img->w && y + h <= img->h) { + img->d_w = w; + img->d_h = h; + + /* Calculate plane pointers */ + if (!(img->fmt & VPX_IMG_FMT_PLANAR)) { + img->planes[VPX_PLANE_PACKED] = + img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED]; + } else { + const int bytes_per_sample = + (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + data = img->img_data; + + if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) { + img->planes[VPX_PLANE_ALPHA] = + data + x * bytes_per_sample + y * img->stride[VPX_PLANE_ALPHA]; + data += img->h * img->stride[VPX_PLANE_ALPHA]; + } + + img->planes[VPX_PLANE_Y] = data + x * bytes_per_sample + + y * img->stride[VPX_PLANE_Y]; + data += img->h * img->stride[VPX_PLANE_Y]; + + if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) { + img->planes[VPX_PLANE_U] = + data + (x >> img->x_chroma_shift) * bytes_per_sample + + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; + data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; + img->planes[VPX_PLANE_V] = + data + (x >> img->x_chroma_shift) * bytes_per_sample + + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; + } else { + img->planes[VPX_PLANE_V] = + data + (x >> img->x_chroma_shift) * bytes_per_sample + + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; + data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; + img->planes[VPX_PLANE_U] = + data + (x >> img->x_chroma_shift) * bytes_per_sample + + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; + } + } + return 0; + } + return -1; +} + +void vpx_img_flip(vpx_image_t *img) { + /* Note: In the calculation pointer adjustment calculation, we want the + * rhs to be promoted to a signed type. Section 6.3.1.8 of the ISO C99 + * standard indicates that if the adjustment parameter is unsigned, the + * stride parameter will be promoted to unsigned, causing errors when + * the lhs is a larger type than the rhs. 
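+ *
+ * E.g. with stride == -64 and d_h == 2 (illustrative values assumed here
+ * for the sake of example), (d_h - 1) * stride is -64 only if the
+ * multiplication is signed; computed as an unsigned product it wraps to
+ * 0xFFFFFFC0, which is zero-extended on a 64-bit target and corrupts the
+ * plane pointer.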
+ */ + img->planes[VPX_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[VPX_PLANE_Y]; + img->stride[VPX_PLANE_Y] = -img->stride[VPX_PLANE_Y]; + + img->planes[VPX_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) + * img->stride[VPX_PLANE_U]; + img->stride[VPX_PLANE_U] = -img->stride[VPX_PLANE_U]; + + img->planes[VPX_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) + * img->stride[VPX_PLANE_V]; + img->stride[VPX_PLANE_V] = -img->stride[VPX_PLANE_V]; + + img->planes[VPX_PLANE_ALPHA] += (signed)(img->d_h - 1) * img->stride[VPX_PLANE_ALPHA]; + img->stride[VPX_PLANE_ALPHA] = -img->stride[VPX_PLANE_ALPHA]; +} + +void vpx_img_free(vpx_image_t *img) { + if (img) { + if (img->img_data && img->img_data_owner) + vpx_free(img->img_data); + + if (img->self_allocd) + free(img); + } +} diff --git a/thirdparty/libvpx/vpx/src/vpx_psnr.c b/thirdparty/libvpx/vpx/src/vpx_psnr.c new file mode 100644 index 000000000000..05843acb61f0 --- /dev/null +++ b/thirdparty/libvpx/vpx/src/vpx_psnr.c @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx/internal/vpx_psnr.h" + +#define MAX_PSNR 100.0 + +double vpx_sse_to_psnr(double samples, double peak, double sse) { + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > MAX_PSNR ? MAX_PSNR : psnr; + } else { + return MAX_PSNR; + } +} diff --git a/thirdparty/libvpx/vpx/vp8.h b/thirdparty/libvpx/vpx/vp8.h new file mode 100644 index 000000000000..8a035f977073 --- /dev/null +++ b/thirdparty/libvpx/vpx/vp8.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\defgroup vp8 VP8 + * \ingroup codecs + * VP8 is vpx's newest video compression algorithm that uses motion + * compensated prediction, Discrete Cosine Transform (DCT) coding of the + * prediction error signal and context dependent entropy coding techniques + * based on arithmetic principles. It features: + * - YUV 4:2:0 image format + * - Macro-block based coding (16x16 luma plus two 8x8 chroma) + * - 1/4 (1/8) pixel accuracy motion compensated prediction + * - 4x4 DCT transform + * - 128 level linear quantizer + * - In loop deblocking filter + * - Context-based entropy coding + * + * @{ + */ +/*!\file + * \brief Provides controls common to both the VP8 encoder and decoder. 
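+ *
+ * As a minimal usage sketch (assuming `ctx` is an already-initialized
+ * decoder context), a common control such as VP8_SET_POSTPROC is invoked
+ * through vpx_codec_control():
+ *     <pre>
+ *     vp8_postproc_cfg_t pp = { VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0 };
+ *     vpx_codec_control(&ctx, VP8_SET_POSTPROC, &pp);
+ *     </pre>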
+ */ +#ifndef VPX_VP8_H_ +#define VPX_VP8_H_ + +#include "./vpx_codec.h" +#include "./vpx_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Control functions + * + * The set of macros define the control functions of VP8 interface + */ +enum vp8_com_control_id { + VP8_SET_REFERENCE = 1, /**< pass in an external frame into decoder to be used as reference frame */ + VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ + VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ + VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< set the reference frames to color for each macroblock */ + VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */ + VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */ + VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */ + + /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+) + * for its control ids. These should be migrated to something like the + * VP8_DECODER_CTRL_ID_START range next time we're ready to break the ABI. + */ + VP9_GET_REFERENCE = 128, /**< get a pointer to a reference frame */ + VP8_COMMON_CTRL_ID_MAX, + VP8_DECODER_CTRL_ID_START = 256 +}; + +/*!\brief post process flags + * + * The set of macros define VP8 decoder post processing flags + */ +enum vp8_postproc_level { + VP8_NOFILTERING = 0, + VP8_DEBLOCK = 1 << 0, + VP8_DEMACROBLOCK = 1 << 1, + VP8_ADDNOISE = 1 << 2, + VP8_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */ + VP8_DEBUG_TXT_MBLK_MODES = 1 << 4, /**< print macro block modes over each macro block */ + VP8_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */ + VP8_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */ + VP8_MFQE = 1 << 10 +}; + +/*!\brief post process flags + * + * This define a structure that describe the post processing settings. For + * the best objective measure (using the PSNR metric) set post_proc_flag + * to VP8_DEBLOCK and deblocking_level to 1. + */ + +typedef struct vp8_postproc_cfg { + int post_proc_flag; /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */ + int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */ + int noise_level; /**< the strength of additive noise, valid range [0, 16] */ +} vp8_postproc_cfg_t; + +/*!\brief reference frame type + * + * The set of macros define the type of VP8 reference frames + */ +typedef enum vpx_ref_frame_type { + VP8_LAST_FRAME = 1, + VP8_GOLD_FRAME = 2, + VP8_ALTR_FRAME = 4 +} vpx_ref_frame_type_t; + +/*!\brief reference frame data struct + * + * Define the data struct to access vp8 reference frames. + */ +typedef struct vpx_ref_frame { + vpx_ref_frame_type_t frame_type; /**< which reference frame */ + vpx_image_t img; /**< reference frame data in image format */ +} vpx_ref_frame_t; + +/*!\brief VP9 specific reference frame data struct + * + * Define the data struct to access vp9 reference frames. 
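+ *
+ * A minimal sketch of use, assuming `ctx` is an initialized VP9 decoder
+ * context:
+ *     <pre>
+ *     vp9_ref_frame_t ref;
+ *     ref.idx = 0;                          /* frame index to fetch */
+ *     vpx_codec_control(&ctx, VP9_GET_REFERENCE, &ref);
+ *     /* ref.img now describes the requested reference frame */
+ *     </pre>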
+ */ +typedef struct vp9_ref_frame { + int idx; /**< frame index to get (input) */ + vpx_image_t img; /**< img structure to populate (output) */ +} vp9_ref_frame_t; + +/*!\cond */ +/*!\brief vp8 decoder control function parameter type + * + * defines the data type for each of VP8 decoder control function requires + */ +VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *) +#define VPX_CTRL_VP8_SET_REFERENCE +VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) +#define VPX_CTRL_VP8_COPY_REFERENCE +VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) +#define VPX_CTRL_VP8_SET_POSTPROC +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int) +#define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int) +#define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int) +#define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES +VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int) +#define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV +VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) +#define VPX_CTRL_VP9_GET_REFERENCE + +/*!\endcond */ +/*! @} - end defgroup vp8 */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_H_ diff --git a/thirdparty/libvpx/vpx/vp8dx.h b/thirdparty/libvpx/vpx/vp8dx.h new file mode 100644 index 000000000000..67c97bb6c9aa --- /dev/null +++ b/thirdparty/libvpx/vpx/vp8dx.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder + * \ingroup vp8 + * + * @{ + */ +/*!\file + * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder + * interface. + */ +#ifndef VPX_VP8DX_H_ +#define VPX_VP8DX_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Include controls common to both the encoder and decoder */ +#include "./vp8.h" + +/*!\name Algorithm interface for VP8 + * + * This interface provides the capability to decode VP8 streams. + * @{ + */ +extern vpx_codec_iface_t vpx_codec_vp8_dx_algo; +extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); +/*!@} - end algorithm interface member group*/ + +/*!\name Algorithm interface for VP9 + * + * This interface provides the capability to decode VP9 streams. + * @{ + */ +extern vpx_codec_iface_t vpx_codec_vp9_dx_algo; +extern vpx_codec_iface_t *vpx_codec_vp9_dx(void); +/*!@} - end algorithm interface member group*/ + +/*!\enum vp8_dec_control_id + * \brief VP8 decoder control functions + * + * This set of macros define the control functions available for the VP8 + * decoder interface. + * + * \sa #vpx_codec_control + */ +enum vp8_dec_control_id { + /** control function to get info on which reference frames were updated + * by the last decode + */ + VP8D_GET_LAST_REF_UPDATES = VP8_DECODER_CTRL_ID_START, + + /** check if the indicated frame is corrupted */ + VP8D_GET_FRAME_CORRUPTED, + + /** control function to get info on which reference frames were used + * by the last decode + */ + VP8D_GET_LAST_REF_USED, + + /** decryption function to decrypt encoded buffer data immediately + * before decoding. Takes a vpx_decrypt_init, which contains + * a callback function and opaque context pointer. 
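+ *
+ * Usage sketch (`my_decrypt_cb` and `my_state` are hypothetical
+ * application-side names):
+ *     <pre>
+ *     vpx_decrypt_init di = { my_decrypt_cb, my_state };
+ *     vpx_codec_control(&ctx, VPXD_SET_DECRYPTOR, &di);
+ *     </pre>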
+ */ + VPXD_SET_DECRYPTOR, + VP8D_SET_DECRYPTOR = VPXD_SET_DECRYPTOR, + + /** control function to get the dimensions that the current frame is decoded + * at. This may be different to the intended display size for the frame as + * specified in the wrapper or frame header (see VP9D_GET_DISPLAY_SIZE). */ + VP9D_GET_FRAME_SIZE, + + /** control function to get the current frame's intended display dimensions + * (as specified in the wrapper or frame header). This may be different to + * the decoded dimensions of this frame (see VP9D_GET_FRAME_SIZE). */ + VP9D_GET_DISPLAY_SIZE, + + /** control function to get the bit depth of the stream. */ + VP9D_GET_BIT_DEPTH, + + /** control function to set the byte alignment of the planes in the reference + * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets + * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly + * follows Y plane, and V plane directly follows U plane. Default value is 0. + */ + VP9_SET_BYTE_ALIGNMENT, + + /** control function to invert the decoding order to from right to left. The + * function is used in a test to confirm the decoding independence of tile + * columns. The function may be used in application where this order + * of decoding is desired. + * + * TODO(yaowu): Rework the unit test that uses this control, and in a future + * release, this test-only control shall be removed. + */ + VP9_INVERT_TILE_DECODE_ORDER, + + /** control function to set the skip loop filter flag. Valid values are + * integers. The decoder will skip the loop filter when its value is set to + * nonzero. If the loop filter is skipped the decoder may accumulate decode + * artifacts. The default value is 0. + */ + VP9_SET_SKIP_LOOP_FILTER, + + VP8_DECODER_CTRL_ID_MAX +}; + +/** Decrypt n bytes of data from input -> output, using the decrypt_state + * passed in VPXD_SET_DECRYPTOR. + */ +typedef void (*vpx_decrypt_cb)(void *decrypt_state, const unsigned char *input, + unsigned char *output, int count); + +/*!\brief Structure to hold decryption state + * + * Defines a structure to hold the decryption state and access function. + */ +typedef struct vpx_decrypt_init { + /*! Decrypt callback. */ + vpx_decrypt_cb decrypt_cb; + + /*! Decryption state. */ + void *decrypt_state; +} vpx_decrypt_init; + +/*!\brief A deprecated alias for vpx_decrypt_init. + */ +typedef vpx_decrypt_init vp8_decrypt_init; + + +/*!\cond */ +/*!\brief VP8 decoder control function parameter type + * + * Defines the data types that VP8D control functions take. Note that + * additional common controls are defined in vp8.h + * + */ + + +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) +#define VPX_CTRL_VP8D_GET_LAST_REF_UPDATES +VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) +#define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) +#define VPX_CTRL_VP8D_GET_LAST_REF_USED +VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) +#define VPX_CTRL_VPXD_SET_DECRYPTOR +VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) +#define VPX_CTRL_VP8D_SET_DECRYPTOR +VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) +#define VPX_CTRL_VP9D_GET_DISPLAY_SIZE +VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) +#define VPX_CTRL_VP9D_GET_BIT_DEPTH +VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) +#define VPX_CTRL_VP9D_GET_FRAME_SIZE +VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) +#define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER + +/*!\endcond */ +/*! 
@} - end defgroup vp8_decoder */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8DX_H_ diff --git a/thirdparty/libvpx/vpx/vpx_codec.h b/thirdparty/libvpx/vpx/vpx_codec.h new file mode 100644 index 000000000000..b6037bb4d7a9 --- /dev/null +++ b/thirdparty/libvpx/vpx/vpx_codec.h @@ -0,0 +1,479 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/*!\defgroup codec Common Algorithm Interface + * This abstraction allows applications to easily support multiple video + * formats with minimal code duplication. This section describes the interface + * common to all codecs (both encoders and decoders). + * @{ + */ + +/*!\file + * \brief Describes the codec algorithm interface to applications. + * + * This file describes the interface between an application and a + * video codec algorithm. + * + * An application instantiates a specific codec instance by using + * vpx_codec_init() and a pointer to the algorithm's interface structure: + *
+ *     <pre>
+ *     my_app.c:
+ *       extern vpx_codec_iface_t my_codec;
+ *       {
+ *           vpx_codec_ctx_t algo;
+ *           res = vpx_codec_init(&algo, &my_codec);
+ *       }
+ *     </pre>
+ * + * Once initialized, the instance is manged using other functions from + * the vpx_codec_* family. + */ +#ifndef VPX_VPX_CODEC_H_ +#define VPX_VPX_CODEC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_integer.h" +#include "./vpx_image.h" + + /*!\brief Decorator indicating a function is deprecated */ +#ifndef DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define DEPRECATED __attribute__ ((deprecated)) +#elif defined(_MSC_VER) +#define DEPRECATED +#else +#define DEPRECATED +#endif +#endif /* DEPRECATED */ + +#ifndef DECLSPEC_DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#elif defined(_MSC_VER) +#define DECLSPEC_DEPRECATED __declspec(deprecated) /**< \copydoc #DEPRECATED */ +#else +#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#endif +#endif /* DECLSPEC_DEPRECATED */ + + /*!\brief Decorator indicating a function is potentially unused */ +#ifdef UNUSED +#elif defined(__GNUC__) || defined(__clang__) +#define UNUSED __attribute__ ((unused)) +#else +#define UNUSED +#endif + + /*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ + + /*!\brief Algorithm return codes */ + typedef enum { + /*!\brief Operation completed without error */ + VPX_CODEC_OK, + + /*!\brief Unspecified error */ + VPX_CODEC_ERROR, + + /*!\brief Memory operation failed */ + VPX_CODEC_MEM_ERROR, + + /*!\brief ABI version mismatch */ + VPX_CODEC_ABI_MISMATCH, + + /*!\brief Algorithm does not have required capability */ + VPX_CODEC_INCAPABLE, + + /*!\brief The given bitstream is not supported. + * + * The bitstream was unable to be parsed at the highest level. The decoder + * is unable to proceed. This error \ref SHOULD be treated as fatal to the + * stream. */ + VPX_CODEC_UNSUP_BITSTREAM, + + /*!\brief Encoded bitstream uses an unsupported feature + * + * The decoder does not implement a feature required by the encoder. This + * return code should only be used for features that prevent future + * pictures from being properly decoded. This error \ref MAY be treated as + * fatal to the stream or \ref MAY be treated as fatal to the current GOP. + */ + VPX_CODEC_UNSUP_FEATURE, + + /*!\brief The coded data for this stream is corrupt or incomplete + * + * There was a problem decoding the current frame. This return code + * should only be used for failures that prevent future pictures from + * being properly decoded. This error \ref MAY be treated as fatal to the + * stream or \ref MAY be treated as fatal to the current GOP. If decoding + * is continued for the current GOP, artifacts may be present. + */ + VPX_CODEC_CORRUPT_FRAME, + + /*!\brief An application-supplied parameter is not valid. + * + */ + VPX_CODEC_INVALID_PARAM, + + /*!\brief An iterator reached the end of list. + * + */ + VPX_CODEC_LIST_END + + } + vpx_codec_err_t; + + + /*! \brief Codec capabilities bitfield + * + * Each codec advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. 
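+ *
+ * A capability test is a plain bitwise check, e.g. (illustrative sketch
+ * using the vpx_codec_get_caps() accessor declared below):
+ *     <pre>
+ *     if (vpx_codec_get_caps(iface) & VPX_CODEC_CAP_DECODER) {
+ *         /* iface can be used through the decoder interface */
+ *     }
+ *     </pre>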
+ */ + typedef long vpx_codec_caps_t; +#define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ +#define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ + + + /*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ + typedef long vpx_codec_flags_t; + + + /*!\brief Codec interface structure. + * + * Contains function pointers and other data private to the codec + * implementation. This structure is opaque to the application. + */ + typedef const struct vpx_codec_iface vpx_codec_iface_t; + + + /*!\brief Codec private data structure. + * + * Contains data private to the codec implementation. This structure is opaque + * to the application. + */ + typedef struct vpx_codec_priv vpx_codec_priv_t; + + + /*!\brief Iterator + * + * Opaque storage used for iterating over lists. + */ + typedef const void *vpx_codec_iter_t; + + + /*!\brief Codec context structure + * + * All codecs \ref MUST support this context structure fully. In general, + * this data should be considered private to the codec algorithm, and + * not be manipulated or examined by the calling application. Applications + * may reference the 'name' member to get a printable description of the + * algorithm. + */ + typedef struct vpx_codec_ctx { + const char *name; /**< Printable interface name */ + vpx_codec_iface_t *iface; /**< Interface pointers */ + vpx_codec_err_t err; /**< Last returned error */ + const char *err_detail; /**< Detailed info, if available */ + vpx_codec_flags_t init_flags; /**< Flags passed at init time */ + union { + /**< Decoder Configuration Pointer */ + const struct vpx_codec_dec_cfg *dec; + /**< Encoder Configuration Pointer */ + const struct vpx_codec_enc_cfg *enc; + const void *raw; + } config; /**< Configuration pointer aliasing union */ + vpx_codec_priv_t *priv; /**< Algorithm private storage */ + } vpx_codec_ctx_t; + + /*!\brief Bit depth for codec + * * + * This enumeration determines the bit depth of the codec. + */ + typedef enum vpx_bit_depth { + VPX_BITS_8 = 8, /**< 8 bits */ + VPX_BITS_10 = 10, /**< 10 bits */ + VPX_BITS_12 = 12, /**< 12 bits */ + } vpx_bit_depth_t; + + /* + * Library Version Number Interface + * + * For example, see the following sample return values: + * vpx_codec_version() (1<<16 | 2<<8 | 3) + * vpx_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" + * vpx_codec_version_extra_str() "rc1-16-gec6a1ba" + */ + + /*!\brief Return the version information (as an integer) + * + * Returns a packed encoding of the library version number. This will only include + * the major.minor.patch component of the version number. Note that this encoded + * value should be accessed through the macros provided, as the encoding may change + * in the future. 
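+ *
+ * For example (sketch), the packed value is unpacked with the macros
+ * defined below:
+ *     <pre>
+ *     const int v = vpx_codec_version();
+ *     printf("libvpx %d.%d.%d\n", VPX_VERSION_MAJOR(v),
+ *            VPX_VERSION_MINOR(v), VPX_VERSION_PATCH(v));
+ *     </pre>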
+ * + */ + int vpx_codec_version(void); +#define VPX_VERSION_MAJOR(v) ((v>>16)&0xff) /**< extract major from packed version */ +#define VPX_VERSION_MINOR(v) ((v>>8)&0xff) /**< extract minor from packed version */ +#define VPX_VERSION_PATCH(v) ((v>>0)&0xff) /**< extract patch from packed version */ + + /*!\brief Return the version major number */ +#define vpx_codec_version_major() ((vpx_codec_version()>>16)&0xff) + + /*!\brief Return the version minor number */ +#define vpx_codec_version_minor() ((vpx_codec_version()>>8)&0xff) + + /*!\brief Return the version patch number */ +#define vpx_codec_version_patch() ((vpx_codec_version()>>0)&0xff) + + + /*!\brief Return the version information (as a string) + * + * Returns a printable string containing the full library version number. This may + * contain additional text following the three digit version number, as to indicate + * release candidates, prerelease versions, etc. + * + */ + const char *vpx_codec_version_str(void); + + + /*!\brief Return the version information (as a string) + * + * Returns a printable "extra string". This is the component of the string returned + * by vpx_codec_version_str() following the three digit version number. + * + */ + const char *vpx_codec_version_extra_str(void); + + + /*!\brief Return the build configuration + * + * Returns a printable string containing an encoded version of the build + * configuration. This may be useful to vpx support. + * + */ + const char *vpx_codec_build_config(void); + + + /*!\brief Return the name for a given interface + * + * Returns a human readable string for name of the given codec interface. + * + * \param[in] iface Interface pointer + * + */ + const char *vpx_codec_iface_name(vpx_codec_iface_t *iface); + + + /*!\brief Convert error number to printable string + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] err Error number. + * + */ + const char *vpx_codec_err_to_string(vpx_codec_err_t err); + + + /*!\brief Retrieve error synopsis for codec context + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] ctx Pointer to this instance's context. + * + */ + const char *vpx_codec_error(vpx_codec_ctx_t *ctx); + + + /*!\brief Retrieve detailed error information for codec context + * + * Returns a human readable string providing detailed information about + * the last error. + * + * \param[in] ctx Pointer to this instance's context. + * + * \retval NULL + * No detailed information is available. + */ + const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); + + + /* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all codecs. + * They represent the base case functionality expected of all codecs. + */ + + /*!\brief Destroy a codec instance + * + * Destroys a codec context, freeing any associated memory buffers. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval #VPX_CODEC_OK + * The codec algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ + vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); + + + /*!\brief Get the capabilities of an algorithm. + * + * Retrieves the capabilities bitfield from the algorithm's interface. 
+ * + * \param[in] iface Pointer to the algorithm interface + * + */ + vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface); + + + /*!\brief Control algorithm + * + * This function is used to exchange algorithm specific data with the codec + * instance. This can be used to implement features specific to a particular + * algorithm. + * + * This wrapper function dispatches the request to the helper function + * associated with the given ctrl_id. It tries to call this function + * transparently, but will return #VPX_CODEC_ERROR if the request could not + * be dispatched. + * + * Note that this function should not be used directly. Call the + * #vpx_codec_control wrapper macro instead. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] ctrl_id Algorithm specific control identifier + * + * \retval #VPX_CODEC_OK + * The control request was processed. + * \retval #VPX_CODEC_ERROR + * The control request was not processed. + * \retval #VPX_CODEC_INVALID_PARAM + * The data was not valid. + */ + vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, + int ctrl_id, + ...); +#if defined(VPX_DISABLE_CTRL_TYPECHECKS) && VPX_DISABLE_CTRL_TYPECHECKS +# define vpx_codec_control(ctx,id,data) vpx_codec_control_(ctx,id,data) +# define VPX_CTRL_USE_TYPE(id, typ) +# define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) +# define VPX_CTRL_VOID(id, typ) + +#else + /*!\brief vpx_codec_control wrapper macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). + * + * \internal + * It works by dispatching the call to the control function through a wrapper + * function named with the id parameter. + */ +# define vpx_codec_control(ctx,id,data) vpx_codec_control_##id(ctx,id,data)\ + /**<\hideinitializer*/ + + + /*!\brief vpx_codec_control type definition macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). It defines the type of the argument for a given + * control identifier. + * + * \internal + * It defines a static function with + * the correctly typed arguments as a wrapper to the type-unsafe internal + * function. + */ +# define VPX_CTRL_USE_TYPE(id, typ) \ + static vpx_codec_err_t \ + vpx_codec_control_##id(vpx_codec_ctx_t*, int, typ) UNUSED;\ + \ + static vpx_codec_err_t \ + vpx_codec_control_##id(vpx_codec_ctx_t *ctx, int ctrl_id, typ data) {\ + return vpx_codec_control_(ctx, ctrl_id, data);\ + } /**<\hideinitializer*/ + + + /*!\brief vpx_codec_control deprecated type definition macro + * + * Like #VPX_CTRL_USE_TYPE, but indicates that the specified control is + * deprecated and should not be used. Consult the documentation for your + * codec for more information. + * + * \internal + * It defines a static function with the correctly typed arguments as a + * wrapper to the type-unsafe internal function. + */ +# define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + DECLSPEC_DEPRECATED static vpx_codec_err_t \ + vpx_codec_control_##id(vpx_codec_ctx_t*, int, typ) DEPRECATED UNUSED;\ + \ + DECLSPEC_DEPRECATED static vpx_codec_err_t \ + vpx_codec_control_##id(vpx_codec_ctx_t *ctx, int ctrl_id, typ data) {\ + return vpx_codec_control_(ctx, ctrl_id, data);\ + } /**<\hideinitializer*/ + + + /*!\brief vpx_codec_control void type definition macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). It indicates that a given control identifier takes + * no argument. 
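+ *
+ * A hypothetical no-argument control (the identifier below is illustrative
+ * only) would be declared as:
+ *     <pre>
+ *     VPX_CTRL_VOID(SOME_FLUSH_CONTROL)
+ *     </pre>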
+ * + * \internal + * It defines a static function without a data argument as a wrapper to the + * type-unsafe internal function. + */ +# define VPX_CTRL_VOID(id) \ + static vpx_codec_err_t \ + vpx_codec_control_##id(vpx_codec_ctx_t*, int) UNUSED;\ + \ + static vpx_codec_err_t \ + vpx_codec_control_##id(vpx_codec_ctx_t *ctx, int ctrl_id) {\ + return vpx_codec_control_(ctx, ctrl_id);\ + } /**<\hideinitializer*/ + + +#endif + + /*!@} - end defgroup codec*/ +#ifdef __cplusplus +} +#endif +#endif // VPX_VPX_CODEC_H_ + diff --git a/thirdparty/libvpx/vpx/vpx_decoder.h b/thirdparty/libvpx/vpx/vpx_decoder.h new file mode 100644 index 000000000000..62fd91975642 --- /dev/null +++ b/thirdparty/libvpx/vpx/vpx_decoder.h @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_DECODER_H_ +#define VPX_VPX_DECODER_H_ + +/*!\defgroup decoder Decoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this decoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all decoders. + * @{ + */ + +/*!\file + * \brief Describes the decoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video decoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_codec.h" +#include "./vpx_frame_buffer.h" + + /*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_DECODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + + /*! \brief Decoder capabilities bitfield + * + * Each decoder advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported by a decoder. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +#define VPX_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */ +#define VPX_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */ +#define VPX_CODEC_CAP_POSTPROC 0x40000 /**< Can postprocess decoded frame */ +#define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000 /**< Can conceal errors due to + packet loss */ +#define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000 /**< Can receive encoded frames + one fragment at a time */ + + /*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. 
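+ *
+ * For example (illustrative sketch; vp8dx.h must be included for the VP8
+ * interface), post processing support has to be requested when the decoder
+ * is created:
+ *     <pre>
+ *     vpx_codec_ctx_t ctx;
+ *     vpx_codec_dec_init(&ctx, vpx_codec_vp8_dx(), NULL,
+ *                        VPX_CODEC_USE_POSTPROC);
+ *     </pre>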
+ */ +#define VPX_CODEC_CAP_FRAME_THREADING 0x200000 /**< Can support frame-based + multi-threading */ +#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 /**< Can support external + frame buffers */ + +#define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */ +#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 /**< Conceal errors in decoded + frames */ +#define VPX_CODEC_USE_INPUT_FRAGMENTS 0x40000 /**< The input frame should be + passed to the decoder one + fragment at a time */ +#define VPX_CODEC_USE_FRAME_THREADING 0x80000 /**< Enable frame-based + multi-threading */ + + /*!\brief Stream properties + * + * This structure is used to query or set properties of the decoded + * stream. Algorithms may extend this structure with data specific + * to their bitstream by setting the sz member appropriately. + */ + typedef struct vpx_codec_stream_info { + unsigned int sz; /**< Size of this structure */ + unsigned int w; /**< Width (or 0 for unknown/default) */ + unsigned int h; /**< Height (or 0 for unknown/default) */ + unsigned int is_kf; /**< Current frame is a keyframe */ + } vpx_codec_stream_info_t; + + /* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all decoders. + * They represent the base case functionality expected of all decoders. + */ + + + /*!\brief Initialization Configurations + * + * This structure is used to pass init time configuration options to the + * decoder. + */ + typedef struct vpx_codec_dec_cfg { + unsigned int threads; /**< Maximum number of threads to use, default 1 */ + unsigned int w; /**< Width */ + unsigned int h; /**< Height */ + } vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */ + + + /*!\brief Initialize a decoder instance + * + * Initializes a decoder context using the given interface. Applications + * should call the vpx_codec_dec_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with --disable-multithread, this call + * is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * VPX_DECODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ + vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + const vpx_codec_dec_cfg_t *cfg, + vpx_codec_flags_t flags, + int ver); + + /*!\brief Convenience macro for vpx_codec_dec_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_dec_init(ctx, iface, cfg, flags) \ + vpx_codec_dec_init_ver(ctx, iface, cfg, flags, VPX_DECODER_ABI_VERSION) + + + /*!\brief Parse stream info from a buffer + * + * Performs high level parsing of the bitstream. Construction of a decoder + * context is not necessary. Can be used to determine if the bitstream is + * of the proper format, and to extract information from the stream. + * + * \param[in] iface Pointer to the algorithm interface + * \param[in] data Pointer to a block of data to parse + * \param[in] data_sz Size of the data buffer + * \param[in,out] si Pointer to stream info to update. 
The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. + * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ + vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, + const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si); + + + /*!\brief Return information about the current stream. + * + * Returns information about the stream that has been parsed during decoding. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] si Pointer to stream info to update. The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. + * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ + vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, + vpx_codec_stream_info_t *si); + + + /*!\brief Decode data + * + * Processes a buffer of coded data. If the processing results in a new + * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be + * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode + * time stamp) order. Frames produced will always be in PTS (presentation + * time stamp) order. + * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled, + * data and data_sz can contain a fragment of the encoded frame. Fragment + * \#n must contain at least partition \#n, but can also contain subsequent + * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must + * be empty. When no more data is available, this function should be called + * with NULL as data and 0 as data_sz. The memory passed to this function + * must be available until the frame has been decoded. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] data Pointer to this block of new coded data. If + * NULL, a VPX_CODEC_CB_PUT_FRAME event is posted + * for the previously decoded frame. + * \param[in] data_sz Size of the coded data, in bytes. + * \param[in] user_priv Application specific data to associate with + * this frame. + * \param[in] deadline Soft deadline the decoder should attempt to meet, + * in us. Set to zero for unlimited. + * + * \return Returns #VPX_CODEC_OK if the coded data was processed completely + * and future pictures can be decoded without error. Otherwise, + * see the descriptions of the other error codes in ::vpx_codec_err_t + * for recoverability capabilities. + */ + vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, + const uint8_t *data, + unsigned int data_sz, + void *user_priv, + long deadline); + + + /*!\brief Decoded frames iterator + * + * Iterates over a list of the frames available for display. The iterator + * storage should be initialized to NULL to start the iteration. Iteration is + * complete when this function returns NULL. + * + * The list of available frames becomes valid upon completion of the + * vpx_codec_decode call, and remains valid until the next call to vpx_codec_decode. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. 
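+ *
+ * A typical decode-and-display loop (illustrative sketch; error handling
+ * omitted, `buf`/`buf_sz` are assumed to hold one coded frame):
+ *     <pre>
+ *     vpx_codec_iter_t iter = NULL;
+ *     vpx_image_t *img;
+ *     vpx_codec_decode(&ctx, buf, buf_sz, NULL, 0);
+ *     while ((img = vpx_codec_get_frame(&ctx, &iter)) != NULL) {
+ *         /* consume img */
+ *     }
+ *     </pre>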
+ */ + vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, + vpx_codec_iter_t *iter); + + + /*!\defgroup cap_put_frame Frame-Based Decoding Functions + * + * The following functions are required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these functions + * for codecs that don't advertise this capability will result in an error + * code being returned, usually VPX_CODEC_ERROR + * @{ + */ + + /*!\brief put frame callback prototype + * + * This callback is invoked by the decoder to notify the application of + * the availability of decoded image data. + */ + typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv, + const vpx_image_t *img); + + + /*!\brief Register for notification of frame completion. + * + * Registers a given function to be called when a decoded frame is + * available. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb Pointer to the callback function + * \param[in] user_priv User's private data + * + * \retval #VPX_CODEC_OK + * Callback successfully registered. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * posting slice completion. + */ + vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_frame_cb_fn_t cb, + void *user_priv); + + + /*!@} - end defgroup cap_put_frame */ + + /*!\defgroup cap_put_slice Slice-Based Decoding Functions + * + * The following functions are required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these functions + * for codecs that don't advertise this capability will result in an error + * code being returned, usually VPX_CODEC_ERROR + * @{ + */ + + /*!\brief put slice callback prototype + * + * This callback is invoked by the decoder to notify the application of + * the availability of partially decoded image data. The + */ + typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv, + const vpx_image_t *img, + const vpx_image_rect_t *valid, + const vpx_image_rect_t *update); + + + /*!\brief Register for notification of slice completion. + * + * Registers a given function to be called when a decoded slice is + * available. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb Pointer to the callback function + * \param[in] user_priv User's private data + * + * \retval #VPX_CODEC_OK + * Callback successfully registered. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * posting slice completion. + */ + vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_slice_cb_fn_t cb, + void *user_priv); + + + /*!@} - end defgroup cap_put_slice*/ + + /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions + * + * The following section is required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. + * Calling this function for codecs that don't advertise this capability + * will result in an error code being returned, usually VPX_CODEC_ERROR. + * + * \note + * Currently this only works with VP9. + * @{ + */ + + /*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libvpx needs a frame buffer + * to decode the current frame and a function to be called when libvpx does + * not internally reference the frame buffer. 
This set function must + * be called before the first call to decode or libvpx will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #VPX_CODEC_OK + * External frame buffers will be used by libvpx. + * \retval #VPX_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * using external frame buffers. + * + * \note + * When decoding VP9, the application may be required to pass in at least + * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame + * buffers. + */ + vpx_codec_err_t vpx_codec_set_frame_buffer_functions( + vpx_codec_ctx_t *ctx, + vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); + + /*!@} - end defgroup cap_external_frame_buffer */ + + /*!@} - end defgroup decoder*/ +#ifdef __cplusplus +} +#endif +#endif // VPX_VPX_DECODER_H_ + diff --git a/thirdparty/libvpx/vpx/vpx_encoder.h b/thirdparty/libvpx/vpx/vpx_encoder.h new file mode 100644 index 000000000000..955e8735193b --- /dev/null +++ b/thirdparty/libvpx/vpx/vpx_encoder.h @@ -0,0 +1,1043 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_ENCODER_H_ +#define VPX_VPX_ENCODER_H_ + +/*!\defgroup encoder Encoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this encoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all encoders. + * @{ + */ + +/*!\file + * \brief Describes the encoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video encoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_codec.h" + + /*! Temporal Scalability: Maximum length of the sequence defining frame + * layer membership + */ +#define VPX_TS_MAX_PERIODICITY 16 + + /*! Temporal Scalability: Maximum number of coding layers */ +#define VPX_TS_MAX_LAYERS 5 + + /*!\deprecated Use #VPX_TS_MAX_PERIODICITY instead. */ +#define MAX_PERIODICITY VPX_TS_MAX_PERIODICITY + +/*! Temporal+Spatial Scalability: Maximum number of coding layers */ +#define VPX_MAX_LAYERS 12 // 3 temporal + 4 spatial layers are allowed. + +/*!\deprecated Use #VPX_MAX_LAYERS instead. */ +#define MAX_LAYERS VPX_MAX_LAYERS // 3 temporal + 4 spatial layers allowed. + +/*! Spatial Scalability: Maximum number of coding layers */ +#define VPX_SS_MAX_LAYERS 5 + +/*! Spatial Scalability: Default number of coding layers */ +#define VPX_SS_DEFAULT_LAYERS 1 + + /*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. 
Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_ENCODER_ABI_VERSION (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + + + /*! \brief Encoder capabilities bitfield + * + * Each encoder advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra + * interfaces or functionality, and are not required to be supported + * by an encoder. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +#define VPX_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ + + /*! Can output one partition at a time. Each partition is returned in its + * own VPX_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for + * every partition but the last. In this mode all frames are always + * returned partition by partition. + */ +#define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 + +/*! Can support input images at greater than 8 bitdepth. + */ +#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 + + /*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow + * for proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +#define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ +#define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 /**< Make the encoder output one + partition at a time. */ +#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ + + + /*!\brief Generic fixed size buffer structure + * + * This structure is able to hold a reference to any fixed size buffer. + */ + typedef struct vpx_fixed_buf { + void *buf; /**< Pointer to the data */ + size_t sz; /**< Length of the buffer, in chars */ + } vpx_fixed_buf_t; /**< alias for struct vpx_fixed_buf */ + + + /*!\brief Time Stamp Type + * + * An integer, which when multiplied by the stream's time base, provides + * the absolute time of a sample. + */ + typedef int64_t vpx_codec_pts_t; + + + /*!\brief Compressed Frame Flags + * + * This type represents a bitfield containing information about a compressed + * frame that may be useful to an application. The most significant 16 bits + * can be used by an algorithm to provide additional detail, for example to + * support frame types that are codec specific (MPEG-1 D-frames for example) + */ + typedef uint32_t vpx_codec_frame_flags_t; +#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ +#define VPX_FRAME_IS_DROPPABLE 0x2 /**< frame can be dropped without affecting + the stream (no future frame depends on + this one) */ +#define VPX_FRAME_IS_INVISIBLE 0x4 /**< frame should be decoded but will not + be shown */ +#define VPX_FRAME_IS_FRAGMENT 0x8 /**< this is a fragment of the encoded + frame */ + + /*!\brief Error Resilient flags + * + * These flags define which error resilient features to enable in the + * encoder. The flags are specified through the + * vpx_codec_enc_cfg::g_error_resilient variable. + */ + typedef uint32_t vpx_codec_er_flags_t; +#define VPX_ERROR_RESILIENT_DEFAULT 0x1 /**< Improve resiliency against + losses of whole frames */ +#define VPX_ERROR_RESILIENT_PARTITIONS 0x2 /**< The frame partitions are + independently decodable by the + bool decoder, meaning that + partitions can be decoded even + though earlier partitions have + been lost. Note that intra + prediction is still done over + the partition boundary. 
*/ + + /*!\brief Encoder output packet variants + * + * This enumeration lists the different kinds of data packets that can be + * returned by calls to vpx_codec_get_cx_data(). Algorithms \ref MAY + * extend this list to provide additional functionality. + */ + enum vpx_codec_cx_pkt_kind { + VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ + // Spatial SVC is still experimental and may be removed before the next ABI + // bump. +#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) + VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ + VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/ +#endif + VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ + }; + + + /*!\brief Encoder output packet + * + * This structure contains the different kinds of output data the encoder + * may produce while compressing a frame. + */ + typedef struct vpx_codec_cx_pkt { + enum vpx_codec_cx_pkt_kind kind; /**< packet variant */ + union { + struct { + void *buf; /**< compressed data buffer */ + size_t sz; /**< length of compressed data */ + vpx_codec_pts_t pts; /**< time stamp to show frame + (in timebase units) */ + unsigned long duration; /**< duration to show frame + (in timebase units) */ + vpx_codec_frame_flags_t flags; /**< flags for this frame */ + int partition_id; /**< the partition id + defines the decoding order + of the partitions. Only + applicable when "output partition" + mode is enabled. First partition + has id 0.*/ + + } frame; /**< data for compressed frame packet */ + vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ + struct vpx_psnr_pkt { + unsigned int samples[4]; /**< Number of samples, total/y/u/v */ + uint64_t sse[4]; /**< sum squared error, total/y/u/v */ + double psnr[4]; /**< PSNR, total/y/u/v */ + } psnr; /**< data for PSNR packet */ + vpx_fixed_buf_t raw; /**< data for arbitrary packets */ + // Spatial SVC is still experimental and may be removed before the next + // ABI bump. +#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) + size_t layer_sizes[VPX_SS_MAX_LAYERS]; + struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; +#endif + + /* This packet size is fixed to allow codecs to extend this + * interface without having to manage storage for raw packets, + * i.e., if it's smaller than 128 bytes, you can store in the + * packet list directly. + */ + char pad[128 - sizeof(enum vpx_codec_cx_pkt_kind)]; /**< fixed sz */ + } data; /**< packet data */ + } vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */ + + + /*!\brief Encoder return output buffer callback + * + * This callback function, when registered, returns with packets when each + * spatial layer is encoded. + */ + // putting the definitions here for now. (agrange: find if there + // is a better place for this) + typedef void (* vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt, + void *user_data); + + /*!\brief Callback function pointer / user data pair storage */ + typedef struct vpx_codec_enc_output_cx_cb_pair { + vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */ + void *user_priv; /**< Pointer to private data */ + } vpx_codec_priv_output_cx_pkt_cb_pair_t; + + /*!\brief Rational Number + * + * This structure holds a fractional value. 
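+   *
+   * For example, the 1001/30000 NTSC timebase suggested for
+   * vpx_codec_enc_cfg::g_timebase would be written as:
+   * \code
+   * vpx_rational_t timebase = { 1001, 30000 };  // seconds per frame
+   * \endcode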
+ */ + typedef struct vpx_rational { + int num; /**< fraction numerator */ + int den; /**< fraction denominator */ + } vpx_rational_t; /**< alias for struct vpx_rational */ + + + /*!\brief Multi-pass Encoding Pass */ + enum vpx_enc_pass { + VPX_RC_ONE_PASS, /**< Single pass mode */ + VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */ + VPX_RC_LAST_PASS /**< Final pass of multi-pass mode */ + }; + + + /*!\brief Rate control mode */ + enum vpx_rc_mode { + VPX_VBR, /**< Variable Bit Rate (VBR) mode */ + VPX_CBR, /**< Constant Bit Rate (CBR) mode */ + VPX_CQ, /**< Constrained Quality (CQ) mode */ + VPX_Q, /**< Constant Quality (Q) mode */ + }; + + + /*!\brief Keyframe placement mode. + * + * This enumeration determines whether keyframes are placed automatically by + * the encoder or whether this behavior is disabled. Older releases of this + * SDK were implemented such that VPX_KF_FIXED meant keyframes were disabled. + * This name is confusing for this behavior, so the new symbols to be used + * are VPX_KF_AUTO and VPX_KF_DISABLED. + */ + enum vpx_kf_mode { + VPX_KF_FIXED, /**< deprecated, implies VPX_KF_DISABLED */ + VPX_KF_AUTO, /**< Encoder determines optimal placement automatically */ + VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ + }; + + + /*!\brief Encoded Frame Flags + * + * This type indicates a bitfield to be passed to vpx_codec_encode(), defining + * per-frame boolean values. By convention, bits common to all codecs will be + * named VPX_EFLAG_*, and bits specific to an algorithm will be named + * /algo/_eflag_*. The lower order 16 bits are reserved for common use. + */ + typedef long vpx_enc_frame_flags_t; +#define VPX_EFLAG_FORCE_KF (1<<0) /**< Force this frame to be a keyframe */ + + + /*!\brief Encoder configuration structure + * + * This structure contains the encoder settings that have common representations + * across all codecs. This doesn't imply that all codecs support all features, + * however. + */ + typedef struct vpx_codec_enc_cfg { + /* + * generic settings (g) + */ + + /*!\brief Algorithm specific "usage" value + * + * Algorithms may define multiple values for usage, which may convey the + * intent of how the application intends to use the stream. If this value + * is non-zero, consult the documentation for the codec to determine its + * meaning. + */ + unsigned int g_usage; + + + /*!\brief Maximum number of threads to use + * + * For multi-threaded implementations, use no more than this number of + * threads. The codec may use fewer threads than allowed. The value + * 0 is equivalent to the value 1. + */ + unsigned int g_threads; + + + /*!\brief Bitstream profile to use + * + * Some codecs support a notion of multiple bitstream profiles. Typically + * this maps to a set of features that are turned on or off. Often the + * profile to use is determined by the features of the intended decoder. + * Consult the documentation for the codec to determine the valid values + * for this parameter, or set to zero for a sane default. + */ + unsigned int g_profile; /**< profile of bitstream to use */ + + + + /*!\brief Width of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. 
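+   *
+   * For example, a 640x480, 30 fps stream might be set up as follows
+   * (a sketch; |cfg| is assumed to have been filled in previously by
+   * vpx_codec_enc_config_default()):
+   * \code
+   * cfg.g_w = 640;
+   * cfg.g_h = 480;
+   * cfg.g_timebase.num = 1;
+   * cfg.g_timebase.den = 30;  // reciprocal of the frame rate
+   * \endcode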
+   */
+  unsigned int           g_w;
+
+
+  /*!\brief Height of the frame
+   *
+   * This value identifies the presentation resolution of the frame,
+   * in pixels. Note that the frames passed as input to the encoder must
+   * have this resolution. Frames will be presented by the decoder in this
+   * resolution, independent of any spatial resampling the encoder may do.
+   */
+  unsigned int           g_h;
+
+  /*!\brief Bit-depth of the codec
+   *
+   * This value identifies the bit-depth of the codec. Only certain
+   * bit-depths are supported as identified in the vpx_bit_depth_t enum.
+   */
+  vpx_bit_depth_t        g_bit_depth;
+
+  /*!\brief Bit-depth of the input frames
+   *
+   * This value identifies the bit-depth of the input frames in bits.
+   * Note that the frames passed as input to the encoder must have
+   * this bit-depth.
+   */
+  unsigned int           g_input_bit_depth;
+
+  /*!\brief Stream timebase units
+   *
+   * Indicates the smallest interval of time, in seconds, used by the stream.
+   * For fixed frame rate material, or variable frame rate material where
+   * frames are timed at a multiple of a given clock (ex: video capture),
+   * the \ref RECOMMENDED method is to set the timebase to the reciprocal
+   * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the
+   * pts to correspond to the frame number, which can be handy. For
+   * re-encoding video from containers with absolute time timestamps, the
+   * \ref RECOMMENDED method is to set the timebase to that of the parent
+   * container or multimedia framework (ex: 1/1000 for ms, as in FLV).
+   */
+  struct vpx_rational    g_timebase;
+
+
+  /*!\brief Enable error resilient modes.
+   *
+   * The error resilient bitfield indicates to the encoder which features
+   * it should enable to take measures for streaming over lossy or noisy
+   * links.
+   */
+  vpx_codec_er_flags_t   g_error_resilient;
+
+
+  /*!\brief Multi-pass Encoding Mode
+   *
+   * This value should be set to the current phase for multi-pass encoding.
+   * For single pass, set to #VPX_RC_ONE_PASS.
+   */
+  enum vpx_enc_pass      g_pass;
+
+
+  /*!\brief Allow lagged encoding
+   *
+   * If set, this value allows the encoder to consume a number of input
+   * frames before producing output frames. This allows the encoder to
+   * base decisions for the current frame on future frames. This does
+   * increase the latency of the encoding pipeline, so it is not appropriate
+   * in all situations (ex: realtime encoding).
+   *
+   * Note that this is a maximum value -- the encoder may produce frames
+   * sooner than the given limit. Set this value to 0 to disable this
+   * feature.
+   */
+  unsigned int           g_lag_in_frames;
+
+
+  /*
+   * rate control settings (rc)
+   */
+
+  /*!\brief Temporal resampling configuration, if supported by the codec.
+   *
+   * Temporal resampling allows the codec to "drop" frames as a strategy to
+   * meet its target data rate. This can cause temporal discontinuities in
+   * the encoded video, which may appear as stuttering during playback. This
+   * trade-off is often acceptable, but for many applications is not. It can
+   * be disabled in these cases.
+   *
+   * Note that not all codecs support this feature. All VPx codecs do.
+   * For other codecs, consult the documentation for that algorithm.
+   *
+   * This threshold is described as a percentage of the target data buffer.
+   * When the data buffer falls below this percentage of fullness, a
+   * dropped frame is indicated. Set the threshold to zero (0) to disable
+   * this feature.
+   */
+  unsigned int           rc_dropframe_thresh;
+
+
+  /*!\brief Enable/disable spatial resampling, if supported by the codec.
+   *
+   * Spatial resampling allows the codec to compress a lower resolution
+   * version of the frame, which is then upscaled by the decoder to the
+   * correct presentation resolution. This increases visual quality at
+   * low data rates, at the expense of CPU time on the encoder/decoder.
+   */
+  unsigned int           rc_resize_allowed;
+
+  /*!\brief Internal coded frame width.
+   *
+   * If spatial resampling is enabled this specifies the width of the
+   * encoded frame.
+   */
+  unsigned int           rc_scaled_width;
+
+  /*!\brief Internal coded frame height.
+   *
+   * If spatial resampling is enabled this specifies the height of the
+   * encoded frame.
+   */
+  unsigned int           rc_scaled_height;
+
+  /*!\brief Spatial resampling up watermark.
+   *
+   * This threshold is described as a percentage of the target data buffer.
+   * When the data buffer rises above this percentage of fullness, the
+   * encoder will step up to a higher resolution version of the frame.
+   */
+  unsigned int           rc_resize_up_thresh;
+
+
+  /*!\brief Spatial resampling down watermark.
+   *
+   * This threshold is described as a percentage of the target data buffer.
+   * When the data buffer falls below this percentage of fullness, the
+   * encoder will step down to a lower resolution version of the frame.
+   */
+  unsigned int           rc_resize_down_thresh;
+
+
+  /*!\brief Rate control algorithm to use.
+   *
+   * Indicates whether the end usage of this stream is to be streamed over
+   * a bandwidth constrained link, indicating that Constant Bit Rate (CBR)
+   * mode should be used, or whether it will be played back on a high
+   * bandwidth link, as from a local disk, where higher variations in
+   * bitrate are acceptable.
+   */
+  enum vpx_rc_mode       rc_end_usage;
+
+
+  /*!\brief Two-pass stats buffer.
+   *
+   * A buffer containing all of the stats packets produced in the first
+   * pass, concatenated.
+   */
+  vpx_fixed_buf_t        rc_twopass_stats_in;
+
+  /*!\brief first pass mb stats buffer.
+   *
+   * A buffer containing all of the first pass mb stats packets produced
+   * in the first pass, concatenated.
+   */
+  vpx_fixed_buf_t        rc_firstpass_mb_stats_in;
+
+  /*!\brief Target data rate
+   *
+   * Target bandwidth to use for this stream, in kilobits per second.
+   */
+  unsigned int           rc_target_bitrate;
+
+
+  /*
+   * quantizer settings
+   */
+
+
+  /*!\brief Minimum (Best Quality) Quantizer
+   *
+   * The quantizer is the most direct control over the quality of the
+   * encoded image. The range of valid values for the quantizer is codec
+   * specific. Consult the documentation for the codec to determine the
+   * values to use. To determine the range programmatically, call
+   * vpx_codec_enc_config_default() with a usage value of 0.
+   */
+  unsigned int           rc_min_quantizer;
+
+
+  /*!\brief Maximum (Worst Quality) Quantizer
+   *
+   * The quantizer is the most direct control over the quality of the
+   * encoded image. The range of valid values for the quantizer is codec
+   * specific. Consult the documentation for the codec to determine the
+   * values to use. To determine the range programmatically, call
+   * vpx_codec_enc_config_default() with a usage value of 0.
+   */
+  unsigned int           rc_max_quantizer;
+
+
+  /*
+   * bitrate tolerance
+   */
+
+
+  /*!\brief Rate control adaptation undershoot control
+   *
+   * This value, expressed as a percentage of the target bitrate,
+   * controls the maximum allowed adaptation speed of the codec.
+   * This factor controls the maximum amount of bits that can
+   * be subtracted from the target bitrate in order to compensate
+   * for prior overshoot.
+   *
+   * Valid values in the range 0-1000.
+   */
+  unsigned int           rc_undershoot_pct;
+
+
+  /*!\brief Rate control adaptation overshoot control
+   *
+   * This value, expressed as a percentage of the target bitrate,
+   * controls the maximum allowed adaptation speed of the codec.
+   * This factor controls the maximum amount of bits that can
+   * be added to the target bitrate in order to compensate for
+   * prior undershoot.
+   *
+   * Valid values in the range 0-1000.
+   */
+  unsigned int           rc_overshoot_pct;
+
+
+  /*
+   * decoder buffer model parameters
+   */
+
+
+  /*!\brief Decoder Buffer Size
+   *
+   * This value indicates the amount of data that may be buffered by the
+   * decoding application. Note that this value is expressed in units of
+   * time (milliseconds). For example, a value of 5000 indicates that the
+   * client will buffer (at least) 5000ms worth of encoded data. Use the
+   * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if
+   * necessary.
+   */
+  unsigned int           rc_buf_sz;
+
+
+  /*!\brief Decoder Buffer Initial Size
+   *
+   * This value indicates the amount of data that will be buffered by the
+   * decoding application prior to beginning playback. This value is
+   * expressed in units of time (milliseconds). Use the target bitrate
+   * (#rc_target_bitrate) to convert to bits/bytes, if necessary.
+   */
+  unsigned int           rc_buf_initial_sz;
+
+
+  /*!\brief Decoder Buffer Optimal Size
+   *
+   * This value indicates the amount of data that the encoder should try
+   * to maintain in the decoder's buffer. This value is expressed in units
+   * of time (milliseconds). Use the target bitrate (#rc_target_bitrate)
+   * to convert to bits/bytes, if necessary.
+   */
+  unsigned int           rc_buf_optimal_sz;
+
+
+  /*
+   * 2 pass rate control parameters
+   */
+
+
+  /*!\brief Two-pass mode CBR/VBR bias
+   *
+   * Bias, expressed on a scale of 0 to 100, for determining target size
+   * for the current frame. The value 0 indicates the optimal CBR mode
+   * value should be used. The value 100 indicates the optimal VBR mode
+   * value should be used. Values in between indicate which way the
+   * encoder should "lean."
+   */
+  unsigned int           rc_2pass_vbr_bias_pct;  /**< RC mode bias between CBR and VBR (0-100: 0->CBR, 100->VBR) */
+
+
+  /*!\brief Two-pass mode per-GOP minimum bitrate
+   *
+   * This value, expressed as a percentage of the target bitrate, indicates
+   * the minimum bitrate to be used for a single GOP (aka "section").
+   */
+  unsigned int           rc_2pass_vbr_minsection_pct;
+
+
+  /*!\brief Two-pass mode per-GOP maximum bitrate
+   *
+   * This value, expressed as a percentage of the target bitrate, indicates
+   * the maximum bitrate to be used for a single GOP (aka "section").
+   */
+  unsigned int           rc_2pass_vbr_maxsection_pct;
+
+
+  /*
+   * keyframing settings (kf)
+   */
+
+  /*!\brief Keyframe placement mode
+   *
+   * This value indicates whether the encoder should place keyframes at a
+   * fixed interval, or determine the optimal placement automatically
+   * (as governed by the #kf_min_dist and #kf_max_dist parameters)
+   */
+  enum vpx_kf_mode       kf_mode;
+
+
+  /*!\brief Keyframe minimum interval
+   *
+   * This value, expressed as a number of frames, prevents the encoder from
+   * placing a keyframe nearer than kf_min_dist to the previous keyframe. At
+   * least kf_min_dist non-keyframe frames will be coded before the next
+   * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval.
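+   *
+   * For example, a fixed keyframe every 60 frames might be requested with
+   * the following (|cfg| is an assumed vpx_codec_enc_cfg_t):
+   * \code
+   * cfg.kf_mode = VPX_KF_AUTO;
+   * cfg.kf_min_dist = 60;  // equal min/max gives a fixed interval
+   * cfg.kf_max_dist = 60;
+   * \endcode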
+ */ + unsigned int kf_min_dist; + + + /*!\brief Keyframe maximum interval + * + * This value, expressed as a number of frames, forces the encoder to code + * a keyframe if one has not been coded in the last kf_max_dist frames. + * A value of 0 implies all frames will be keyframes. Set kf_min_dist + * equal to kf_max_dist for a fixed interval. + */ + unsigned int kf_max_dist; + + /* + * Spatial scalability settings (ss) + */ + + /*!\brief Number of spatial coding layers. + * + * This value specifies the number of spatial coding layers to be used. + */ + unsigned int ss_number_layers; + + /*!\brief Enable auto alt reference flags for each spatial layer. + * + * These values specify if auto alt reference frame is enabled for each + * spatial layer. + */ + int ss_enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; + + /*!\brief Target bitrate for each spatial layer. + * + * These values specify the target coding bitrate to be used for each + * spatial layer. + */ + unsigned int ss_target_bitrate[VPX_SS_MAX_LAYERS]; + + /*!\brief Number of temporal coding layers. + * + * This value specifies the number of temporal layers to be used. + */ + unsigned int ts_number_layers; + + /*!\brief Target bitrate for each temporal layer. + * + * These values specify the target coding bitrate to be used for each + * temporal layer. + */ + unsigned int ts_target_bitrate[VPX_TS_MAX_LAYERS]; + + /*!\brief Frame rate decimation factor for each temporal layer. + * + * These values specify the frame rate decimation factors to apply + * to each temporal layer. + */ + unsigned int ts_rate_decimator[VPX_TS_MAX_LAYERS]; + + /*!\brief Length of the sequence defining frame temporal layer membership. + * + * This value specifies the length of the sequence that defines the + * membership of frames to temporal layers. For example, if the + * ts_periodicity = 8, then the frames are assigned to coding layers with a + * repeated sequence of length 8. + */ + unsigned int ts_periodicity; + + /*!\brief Template defining the membership of frames to temporal layers. + * + * This array defines the membership of frames to temporal coding layers. + * For a 2-layer encoding that assigns even numbered frames to one temporal + * layer (0) and odd numbered frames to a second temporal layer (1) with + * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1). + */ + unsigned int ts_layer_id[VPX_TS_MAX_PERIODICITY]; + + /*!\brief Target bitrate for each spatial/temporal layer. + * + * These values specify the target coding bitrate to be used for each + * spatial/temporal layer. + * + */ + unsigned int layer_target_bitrate[VPX_MAX_LAYERS]; + + /*!\brief Temporal layering mode indicating which temporal layering scheme to use. + * + * The value (refer to VP9E_TEMPORAL_LAYERING_MODE) specifies the + * temporal layering mode to use. 
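+   *
+   * As an illustration, the two-layer scheme described above for
+   * ts_layer_id might be configured as follows (|cfg| is an assumed
+   * vpx_codec_enc_cfg_t and the bitrates are illustrative):
+   * \code
+   * unsigned int i;
+   * cfg.ts_number_layers = 2;
+   * cfg.ts_periodicity = 8;
+   * for (i = 0; i < cfg.ts_periodicity; ++i)
+   *   cfg.ts_layer_id[i] = i % 2;    // 0,1,0,1,0,1,0,1
+   * cfg.ts_rate_decimator[0] = 2;    // base layer at half the frame rate
+   * cfg.ts_rate_decimator[1] = 1;
+   * cfg.ts_target_bitrate[0] = 400;  // kbit/s
+   * cfg.ts_target_bitrate[1] = 600;
+   * \endcode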
+   *
+   */
+  int temporal_layering_mode;
+  } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
+
+  /*!\brief vp9 svc extra configuration parameters
+   *
+   * This defines max/min quantizers and scale factors for each layer
+   *
+   */
+  typedef struct vpx_svc_parameters {
+    int max_quantizers[VPX_MAX_LAYERS]; /**< Max Q for each layer */
+    int min_quantizers[VPX_MAX_LAYERS]; /**< Min Q for each layer */
+    int scaling_factor_num[VPX_MAX_LAYERS]; /**< Scaling factor-numerator */
+    int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */
+    int temporal_layering_mode; /**< Temporal layering mode */
+  } vpx_svc_extra_cfg_t;
+
+
+  /*!\brief Initialize an encoder instance
+   *
+   * Initializes an encoder context using the given interface. Applications
+   * should call the vpx_codec_enc_init convenience macro instead of this
+   * function directly, to ensure that the ABI version number parameter
+   * is properly initialized.
+   *
+   * If the library was configured with --disable-multithread, this call
+   * is not thread safe and should be guarded with a lock if being used
+   * in a multithreaded context.
+   *
+   * \param[in]    ctx     Pointer to this instance's context.
+   * \param[in]    iface   Pointer to the algorithm interface to use.
+   * \param[in]    cfg     Configuration to use, if known. May be NULL.
+   * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
+   * \param[in]    ver     ABI version number. Must be set to
+   *                       VPX_ENCODER_ABI_VERSION
+   * \retval #VPX_CODEC_OK
+   *     The encoder algorithm initialized.
+   * \retval #VPX_CODEC_MEM_ERROR
+   *     Memory allocation failed.
+   */
+  vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t           *ctx,
+                                         vpx_codec_iface_t         *iface,
+                                         const vpx_codec_enc_cfg_t *cfg,
+                                         vpx_codec_flags_t          flags,
+                                         int                        ver);
+
+
+  /*!\brief Convenience macro for vpx_codec_enc_init_ver()
+   *
+   * Ensures the ABI version parameter is properly set.
+   */
+#define vpx_codec_enc_init(ctx, iface, cfg, flags) \
+  vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION)
+
+
+  /*!\brief Initialize multi-encoder instance
+   *
+   * Initializes a multi-encoder context using the given interface.
+   * Applications should call the vpx_codec_enc_init_multi convenience macro
+   * instead of this function directly, to ensure that the ABI version number
+   * parameter is properly initialized.
+   *
+   * \param[in]    ctx     Pointer to this instance's context.
+   * \param[in]    iface   Pointer to the algorithm interface to use.
+   * \param[in]    cfg     Configuration to use, if known. May be NULL.
+   * \param[in]    num_enc Total number of encoders.
+   * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
+   * \param[in]    dsf     Pointer to down-sampling factors.
+   * \param[in]    ver     ABI version number. Must be set to
+   *                       VPX_ENCODER_ABI_VERSION
+   * \retval #VPX_CODEC_OK
+   *     The encoder algorithm initialized.
+   * \retval #VPX_CODEC_MEM_ERROR
+   *     Memory allocation failed.
+   */
+  vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t      *ctx,
+                                               vpx_codec_iface_t    *iface,
+                                               vpx_codec_enc_cfg_t  *cfg,
+                                               int                   num_enc,
+                                               vpx_codec_flags_t     flags,
+                                               vpx_rational_t       *dsf,
+                                               int                   ver);
+
+
+  /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver()
+   *
+   * Ensures the ABI version parameter is properly set.
+   */
+#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
+  vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \
+                               VPX_ENCODER_ABI_VERSION)
+
+
+  /*!\brief Get a default configuration
+   *
+   * Initializes an encoder configuration structure with default values.
Supports
+   * the notion of "usages" so that an algorithm may offer different default
+   * settings depending on the user's intended goal. This function \ref SHOULD
+   * be called by all applications to initialize the configuration structure
+   * before specializing the configuration with application specific values.
+   *
+   * \param[in]    iface     Pointer to the algorithm interface to use.
+   * \param[out]   cfg       Configuration buffer to populate.
+   * \param[in]    reserved  Must be set to 0 for VP8 and VP9.
+   *
+   * \retval #VPX_CODEC_OK
+   *     The configuration was populated.
+   * \retval #VPX_CODEC_INCAPABLE
+   *     Interface is not an encoder interface.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     A parameter was NULL, or the usage value was not recognized.
+   */
+  vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t    *iface,
+                                               vpx_codec_enc_cfg_t  *cfg,
+                                               unsigned int          reserved);
+
+
+  /*!\brief Set or change configuration
+   *
+   * Reconfigures an encoder instance according to the given configuration.
+   *
+   * \param[in]    ctx     Pointer to this instance's context
+   * \param[in]    cfg     Configuration buffer to use
+   *
+   * \retval #VPX_CODEC_OK
+   *     The configuration was applied.
+   * \retval #VPX_CODEC_INCAPABLE
+   *     Interface is not an encoder interface.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     A parameter was NULL, or the usage value was not recognized.
+   */
+  vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t           *ctx,
+                                           const vpx_codec_enc_cfg_t *cfg);
+
+
+  /*!\brief Get global stream headers
+   *
+   * Retrieves a stream level global header packet, if supported by the codec.
+   *
+   * \param[in]    ctx     Pointer to this instance's context
+   *
+   * \retval NULL
+   *     Encoder does not support global header
+   * \retval Non-NULL
+   *     Pointer to buffer containing global header packet
+   */
+  vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
+
+
+#define VPX_DL_REALTIME     (1)        /**< deadline parameter analogous to
+                                        *   VPx REALTIME mode. */
+#define VPX_DL_GOOD_QUALITY (1000000)  /**< deadline parameter analogous to
+                                        *   VPx GOOD QUALITY mode. */
+#define VPX_DL_BEST_QUALITY (0)        /**< deadline parameter analogous to
+                                        *   VPx BEST QUALITY mode. */
+  /*!\brief Encode a frame
+   *
+   * Encodes a video frame at the given "presentation time." The presentation
+   * time stamp (PTS) \ref MUST be strictly increasing.
+   *
+   * The encoder supports the notion of a soft real-time deadline. Given a
+   * non-zero value to the deadline parameter, the encoder will make a "best
+   * effort" guarantee to return before the given time slice expires. It is
+   * implicit that limiting the available time to encode will degrade the
+   * output quality. The encoder can be given an unlimited time to produce the
+   * best possible frame by specifying a deadline of '0'. This deadline
+   * supersedes the VPx notion of "best quality, good quality, realtime".
+   * Applications that wish to map these former settings to the new deadline
+   * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY,
+   * and #VPX_DL_BEST_QUALITY.
+   *
+   * When the last frame has been passed to the encoder, this function should
+   * continue to be called, with the img parameter set to NULL. This will
+   * signal the end-of-stream condition to the encoder and allow it to encode
+   * any held buffers. Encoding is complete when vpx_codec_encode() is called
+   * and vpx_codec_get_cx_data() returns no data.
+   *
+   * \param[in]    ctx       Pointer to this instance's context
+   * \param[in]    img       Image data to encode, NULL to flush.
+   * \param[in]    pts       Presentation time stamp, in timebase units.
+   * \param[in]    duration  Duration to show frame, in timebase units.
+   * \param[in]    flags     Flags to use for encoding this frame.
+   * \param[in]    deadline  Time to spend encoding, in microseconds. (0=infinite)
+   *
+   * \retval #VPX_CODEC_OK
+   *     The frame was processed successfully.
+   * \retval #VPX_CODEC_INCAPABLE
+   *     Interface is not an encoder interface.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     A parameter was NULL, the image format is unsupported, etc.
+   */
+  vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t       *ctx,
+                                   const vpx_image_t     *img,
+                                   vpx_codec_pts_t        pts,
+                                   unsigned long          duration,
+                                   vpx_enc_frame_flags_t  flags,
+                                   unsigned long          deadline);
+
+  /*!\brief Set compressed data output buffer
+   *
+   * Sets the buffer that the codec should output the compressed data
+   * into. This call effectively sets the buffer pointer returned in the
+   * next VPX_CODEC_CX_FRAME_PKT packet. Subsequent packets will be
+   * appended into this buffer. The buffer is preserved across frames,
+   * so applications must periodically call this function after flushing
+   * the accumulated compressed data to disk or to the network to reset
+   * the pointer to the buffer's head.
+   *
+   * `pad_before` bytes will be skipped before writing the compressed
+   * data, and `pad_after` bytes will be appended to the packet. The size
+   * of the packet will be the sum of the size of the actual compressed
+   * data, pad_before, and pad_after. The padding bytes will be preserved
+   * (not overwritten).
+   *
+   * Note that calling this function does not guarantee that the returned
+   * compressed data will be placed into the specified buffer. In the
+   * event that the encoded data will not fit into the buffer provided,
+   * the returned packet \ref MAY point to an internal buffer, as it would
+   * if this call were never used. In this event, the output packet will
+   * NOT have any padding, and the application must free space and copy it
+   * to the proper place. This is of particular note in configurations
+   * that may output multiple packets for a single encoded frame (e.g., lagged
+   * encoding) or if the application does not reset the buffer periodically.
+   *
+   * Applications may restore the default behavior of the codec providing
+   * the compressed data buffer by calling this function with a NULL
+   * buffer.
+   *
+   * Applications \ref MUST NOT call this function during iteration of
+   * vpx_codec_get_cx_data().
+   *
+   * \param[in]    ctx         Pointer to this instance's context
+   * \param[in]    buf         Buffer to store compressed data into
+   * \param[in]    pad_before  Bytes to skip before writing compressed data
+   * \param[in]    pad_after   Bytes to append after writing compressed data
+   *
+   * \retval #VPX_CODEC_OK
+   *     The buffer was set successfully.
+   * \retval #VPX_CODEC_INVALID_PARAM
+   *     A parameter was NULL, the image format is unsupported, etc.
+   */
+  vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t       *ctx,
+                                            const vpx_fixed_buf_t *buf,
+                                            unsigned int           pad_before,
+                                            unsigned int           pad_after);
+
+
+  /*!\brief Encoded data iterator
+   *
+   * Iterates over a list of data packets to be passed from the encoder to the
+   * application. The different kinds of packets available are enumerated in
+   * #vpx_codec_cx_pkt_kind.
+   *
+   * #VPX_CODEC_CX_FRAME_PKT packets should be passed to the application's
+   * muxer. Multiple compressed frames may be in the list.
+   * #VPX_CODEC_STATS_PKT packets should be appended to a global buffer.
+   *
+   * The application \ref MUST silently ignore any packet kinds that it does
+   * not recognize or support.
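+   *
+   * A minimal encode loop might look as follows (a sketch only; the
+   * initialized context |codec|, the input image |img| and the frame
+   * counter |frame_cnt| are assumed to exist):
+   * \code
+   * vpx_codec_iter_t iter = NULL;
+   * const vpx_codec_cx_pkt_t *pkt;
+   * vpx_codec_encode(&codec, img, frame_cnt, 1, 0, VPX_DL_GOOD_QUALITY);
+   * while ((pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
+   *   if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+   *     // pass pkt->data.frame.buf / pkt->data.frame.sz to the muxer
+   *   }
+   * }
+   * \endcode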
+ * + * The data buffers returned from this function are only guaranteed to be + * valid until the application makes another call to any vpx_codec_* function. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an output data packet (compressed frame data, + * two-pass statistics, etc.) or NULL to signal end-of-list. + * + */ + const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, + vpx_codec_iter_t *iter); + + + /*!\brief Get Preview Frame + * + * Returns an image that can be used as a preview. Shows the image as it would + * exist at the decompressor. The application \ref MUST NOT write into this + * image buffer. + * + * \param[in] ctx Pointer to this instance's context + * + * \return Returns a pointer to a preview image, or NULL if no image is + * available. + * + */ + const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx); + + + /*!@} - end defgroup encoder*/ +#ifdef __cplusplus +} +#endif +#endif // VPX_VPX_ENCODER_H_ + diff --git a/thirdparty/libvpx/vpx/vpx_frame_buffer.h b/thirdparty/libvpx/vpx/vpx_frame_buffer.h new file mode 100644 index 000000000000..9036459af0a3 --- /dev/null +++ b/thirdparty/libvpx/vpx/vpx_frame_buffer.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_FRAME_BUFFER_H_ +#define VPX_VPX_FRAME_BUFFER_H_ + +/*!\file + * \brief Describes the decoder external frame buffer interface. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_integer.h" + +/*!\brief The maximum number of work buffers used by libvpx. + * Support maximum 4 threads to decode video in parallel. + * Each thread will use one work buffer. + * TODO(hkuang): Add support to set number of worker threads dynamically. + */ +#define VPX_MAXIMUM_WORK_BUFFERS 8 + +/*!\brief The maximum number of reference buffers that a VP9 encoder may use. + */ +#define VP9_MAXIMUM_REF_BUFFERS 8 + +/*!\brief External frame buffer + * + * This structure holds allocated frame buffers used by the decoder. + */ +typedef struct vpx_codec_frame_buffer { + uint8_t *data; /**< Pointer to the data buffer */ + size_t size; /**< Size of data in bytes */ + void *priv; /**< Frame's private data */ +} vpx_codec_frame_buffer_t; + +/*!\brief get frame buffer callback prototype + * + * This callback is invoked by the decoder to retrieve data for the frame + * buffer in order for the decode call to complete. The callback must + * allocate at least min_size in bytes and assign it to fb->data. The callback + * must zero out all the data allocated. Then the callback must set fb->size + * to the allocated size. The application does not need to align the allocated + * data. The callback is triggered when the decoder needs a frame buffer to + * decode a compressed image into. This function may be called more than once + * for every call to vpx_codec_decode. The application may set fb->priv to + * some data which will be passed back in the ximage and the release function + * call. |fb| is guaranteed to not be NULL. On success the callback must + * return 0. 
On failure, the callback must return a value less than 0.
+ *
+ * \param[in] priv         Callback's private data
+ * \param[in] min_size     Size in bytes needed by the buffer
+ * \param[in,out] fb       Pointer to vpx_codec_frame_buffer_t
+ */
+typedef int (*vpx_get_frame_buffer_cb_fn_t)(
+    void *priv, size_t min_size, vpx_codec_frame_buffer_t *fb);
+
+/*!\brief release frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder when the frame buffer is not
+ * referenced by any other buffers. |fb| is guaranteed to not be NULL. On
+ * success the callback must return 0. On failure, the callback must return
+ * a value less than 0.
+ *
+ * \param[in] priv         Callback's private data
+ * \param[in] fb           Pointer to vpx_codec_frame_buffer_t
+ */
+typedef int (*vpx_release_frame_buffer_cb_fn_t)(
+    void *priv, vpx_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_FRAME_BUFFER_H_
diff --git a/thirdparty/libvpx/vpx/vpx_image.h b/thirdparty/libvpx/vpx/vpx_image.h
new file mode 100644
index 000000000000..7958c69806ed
--- /dev/null
+++ b/thirdparty/libvpx/vpx/vpx_image.h
@@ -0,0 +1,235 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*!\file
+ * \brief Describes the vpx image descriptor and associated operations
+ *
+ */
+#ifndef VPX_VPX_IMAGE_H_
+#define VPX_VPX_IMAGE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /*!\brief Current ABI version number
+   *
+   * \internal
+   * If this file is altered in any way that changes the ABI, this value
+   * must be bumped. Examples include, but are not limited to, changing
+   * types, removing or reassigning enums, adding/removing/rearranging
+   * fields to structures
+   */
+#define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/
+
+
+#define VPX_IMG_FMT_PLANAR       0x100  /**< Image is a planar format. */
+#define VPX_IMG_FMT_UV_FLIP      0x200  /**< V plane precedes U in memory. */
+#define VPX_IMG_FMT_HAS_ALPHA    0x400  /**< Image has an alpha channel. */
+#define VPX_IMG_FMT_HIGHBITDEPTH 0x800  /**< Image uses 16-bit framebuffer.
*/ + + /*!\brief List of supported image formats */ + typedef enum vpx_img_fmt { + VPX_IMG_FMT_NONE, + VPX_IMG_FMT_RGB24, /**< 24 bit per pixel packed RGB */ + VPX_IMG_FMT_RGB32, /**< 32 bit per pixel packed 0RGB */ + VPX_IMG_FMT_RGB565, /**< 16 bit per pixel, 565 */ + VPX_IMG_FMT_RGB555, /**< 16 bit per pixel, 555 */ + VPX_IMG_FMT_UYVY, /**< UYVY packed YUV */ + VPX_IMG_FMT_YUY2, /**< YUYV packed YUV */ + VPX_IMG_FMT_YVYU, /**< YVYU packed YUV */ + VPX_IMG_FMT_BGR24, /**< 24 bit per pixel packed BGR */ + VPX_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */ + VPX_IMG_FMT_ARGB, /**< 32 bit packed ARGB, alpha=255 */ + VPX_IMG_FMT_ARGB_LE, /**< 32 bit packed BGRA, alpha=255 */ + VPX_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */ + VPX_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */ + VPX_IMG_FMT_YV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ + VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2, + VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */ + VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, + VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, + VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, + VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7, + VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6, + VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH + } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ + + /*!\brief List of supported color spaces */ + typedef enum vpx_color_space { + VPX_CS_UNKNOWN = 0, /**< Unknown */ + VPX_CS_BT_601 = 1, /**< BT.601 */ + VPX_CS_BT_709 = 2, /**< BT.709 */ + VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */ + VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */ + VPX_CS_BT_2020 = 5, /**< BT.2020 */ + VPX_CS_RESERVED = 6, /**< Reserved */ + VPX_CS_SRGB = 7 /**< sRGB */ + } vpx_color_space_t; /**< alias for enum vpx_color_space */ + + /*!\brief List of supported color range */ + typedef enum vpx_color_range { + VPX_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */ + VPX_CR_FULL_RANGE = 1 /**< YUV/RGB [0..255] */ + } vpx_color_range_t; /**< alias for enum vpx_color_range */ + + /**\brief Image Descriptor */ + typedef struct vpx_image { + vpx_img_fmt_t fmt; /**< Image Format */ + vpx_color_space_t cs; /**< Color Space */ + vpx_color_range_t range; /**< Color Range */ + + /* Image storage dimensions */ + unsigned int w; /**< Stored image width */ + unsigned int h; /**< Stored image height */ + unsigned int bit_depth; /**< Stored image bit-depth */ + + /* Image display dimensions */ + unsigned int d_w; /**< Displayed image width */ + unsigned int d_h; /**< Displayed image height */ + + /* Image intended rendering dimensions */ + unsigned int r_w; /**< Intended rendering image width */ + unsigned int r_h; /**< Intended rendering image height */ + + /* Chroma subsampling info */ + unsigned int x_chroma_shift; /**< subsampling order, X */ + unsigned int y_chroma_shift; /**< subsampling order, Y */ + + /* Image data pointers. 
 */
+#define VPX_PLANE_PACKED 0   /**< To be used for all packed formats */
+#define VPX_PLANE_Y      0   /**< Y (Luminance) plane */
+#define VPX_PLANE_U      1   /**< U (Chroma) plane */
+#define VPX_PLANE_V      2   /**< V (Chroma) plane */
+#define VPX_PLANE_ALPHA  3   /**< A (Transparency) plane */
+    unsigned char *planes[4];  /**< pointer to the top left pixel for each plane */
+    int      stride[4];  /**< stride between rows for each plane */
+
+    int     bps; /**< bits per sample (for packed formats) */
+
+    /* The following member may be set by the application to associate data
+     * with this image.
+     */
+    void    *user_priv; /**< may be set by the application to associate data
+                         *   with this image. */
+
+    /* The following members should be treated as private. */
+    unsigned char *img_data;       /**< private */
+    int      img_data_owner; /**< private */
+    int      self_allocd;    /**< private */
+
+    void *fb_priv; /**< Frame buffer data associated with the image. */
+  } vpx_image_t; /**< alias for struct vpx_image */
+
+  /**\brief Representation of a rectangle on a surface */
+  typedef struct vpx_image_rect {
+    unsigned int x; /**< leftmost column */
+    unsigned int y; /**< topmost row */
+    unsigned int w; /**< width */
+    unsigned int h; /**< height */
+  } vpx_image_rect_t; /**< alias for struct vpx_image_rect */
+
+  /*!\brief Open a descriptor, allocating storage for the underlying image
+   *
+   * Returns a descriptor for storing an image of the given format. The
+   * storage for the descriptor is allocated on the heap.
+   *
+   * \param[in]    img       Pointer to storage for descriptor. If this parameter
+   *                         is NULL, the storage for the descriptor will be
+   *                         allocated on the heap.
+   * \param[in]    fmt       Format for the image
+   * \param[in]    d_w       Width of the image
+   * \param[in]    d_h       Height of the image
+   * \param[in]    align     Alignment, in bytes, of the image buffer and
+   *                         each row in the image (stride).
+   *
+   * \return Returns a pointer to the initialized image descriptor. If the img
+   *         parameter is non-null, the value of the img parameter will be
+   *         returned.
+   */
+  vpx_image_t *vpx_img_alloc(vpx_image_t   *img,
+                             vpx_img_fmt_t  fmt,
+                             unsigned int   d_w,
+                             unsigned int   d_h,
+                             unsigned int   align);
+
+  /*!\brief Open a descriptor, using existing storage for the underlying image
+   *
+   * Returns a descriptor for storing an image of the given format. The
+   * storage for the descriptor has been allocated elsewhere, and a descriptor
+   * is desired to "wrap" that storage.
+   *
+   * \param[in]    img       Pointer to storage for descriptor. If this parameter
+   *                         is NULL, the storage for the descriptor will be
+   *                         allocated on the heap.
+   * \param[in]    fmt       Format for the image
+   * \param[in]    d_w       Width of the image
+   * \param[in]    d_h       Height of the image
+   * \param[in]    align     Alignment, in bytes, of each row in the image.
+   * \param[in]    img_data  Storage to use for the image
+   *
+   * \return Returns a pointer to the initialized image descriptor. If the img
+   *         parameter is non-null, the value of the img parameter will be
+   *         returned.
+   */
+  vpx_image_t *vpx_img_wrap(vpx_image_t   *img,
+                            vpx_img_fmt_t  fmt,
+                            unsigned int   d_w,
+                            unsigned int   d_h,
+                            unsigned int   align,
+                            unsigned char *img_data);
+
+
+  /*!\brief Set the rectangle identifying the displayed portion of the image
+   *
+   * Updates the displayed rectangle (aka viewport) on the image surface to
+   * match the specified coordinates and size.
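+   *
+   * A typical allocate/crop/free sequence might look as follows (the
+   * dimensions are illustrative):
+   * \code
+   * vpx_image_t img;
+   * if (vpx_img_alloc(&img, VPX_IMG_FMT_I420, 640, 480, 16)) {
+   *   vpx_img_set_rect(&img, 0, 0, 320, 240);  // display top-left quadrant
+   *   vpx_img_free(&img);
+   * }
+   * \endcode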
+   *
+   * \param[in]    img       Image descriptor
+   * \param[in]    x         leftmost column
+   * \param[in]    y         topmost row
+   * \param[in]    w         width
+   * \param[in]    h         height
+   *
+   * \return 0 if the requested rectangle is valid, nonzero otherwise.
+   */
+  int vpx_img_set_rect(vpx_image_t  *img,
+                       unsigned int  x,
+                       unsigned int  y,
+                       unsigned int  w,
+                       unsigned int  h);
+
+
+  /*!\brief Flip the image vertically (top for bottom)
+   *
+   * Adjusts the image descriptor's pointers and strides to make the image
+   * be referenced upside-down.
+   *
+   * \param[in]    img       Image descriptor
+   */
+  void vpx_img_flip(vpx_image_t *img);
+
+  /*!\brief Close an image descriptor
+   *
+   * Frees all allocated storage associated with an image descriptor.
+   *
+   * \param[in]    img       Image descriptor
+   */
+  void vpx_img_free(vpx_image_t *img);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_IMAGE_H_
diff --git a/thirdparty/libvpx/vpx/vpx_integer.h b/thirdparty/libvpx/vpx/vpx_integer.h
new file mode 100644
index 000000000000..829c9d132c8c
--- /dev/null
+++ b/thirdparty/libvpx/vpx/vpx_integer.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_VPX_INTEGER_H_
+#define VPX_VPX_INTEGER_H_
+
+/* get ptrdiff_t, size_t, wchar_t, NULL */
+#include <stddef.h>
+
+#if defined(_MSC_VER)
+#define VPX_FORCE_INLINE __forceinline
+#define VPX_INLINE __inline
+#else
+#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline))
+// TODO(jbb): Allow a way to force inline off for older compilers.
+#define VPX_INLINE inline
+#endif
+
+#if (defined(_MSC_VER) && (_MSC_VER < 1600)) || defined(VPX_EMULATE_INTTYPES)
+typedef signed char  int8_t;
+typedef signed short int16_t;
+typedef signed int   int32_t;
+
+typedef unsigned char  uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int   uint32_t;
+
+#if (defined(_MSC_VER) && (_MSC_VER < 1600))
+typedef signed __int64   int64_t;
+typedef unsigned __int64 uint64_t;
+#define INT64_MAX _I64_MAX
+#define INT32_MAX _I32_MAX
+#define INT32_MIN _I32_MIN
+#define INT16_MAX _I16_MAX
+#define INT16_MIN _I16_MIN
+#endif
+
+#ifndef _UINTPTR_T_DEFINED
+typedef size_t uintptr_t;
+#endif
+
+#else
+
+/* Most platforms have the C99 standard integer types.
 */
+
+#if defined(__cplusplus)
+# if !defined(__STDC_FORMAT_MACROS)
+#  define __STDC_FORMAT_MACROS
+# endif
+# if !defined(__STDC_LIMIT_MACROS)
+#  define __STDC_LIMIT_MACROS
+# endif
+#endif  // __cplusplus
+
+#include <stdint.h>
+
+#endif
+
+/* VS2010 defines stdint.h, but not inttypes.h */
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define PRId64 "I64d"
+#else
+#include <inttypes.h>
+#endif
+
+#endif  // VPX_VPX_INTEGER_H_
diff --git a/thirdparty/libvpx/vpx_config.asm b/thirdparty/libvpx/vpx_config.asm
new file mode 100644
index 000000000000..8f2119e7bc2e
--- /dev/null
+++ b/thirdparty/libvpx/vpx_config.asm
@@ -0,0 +1,65 @@
+%ifdef X86_32
+  ARCH_X86 equ 1
+  ARCH_X86_64 equ 0
+%elifdef X86_64
+  ARCH_X86 equ 0
+  ARCH_X86_64 equ 1
+%endif
+
+HAVE_VPX_PORTS equ 1
+CONFIG_DEPENDENCY_TRACKING equ 0
+CONFIG_EXTERNAL_BUILD equ 0
+CONFIG_INSTALL_DOCS equ 0
+CONFIG_INSTALL_BINS equ 0
+CONFIG_INSTALL_LIBS equ 0
+CONFIG_INSTALL_SRCS equ 0
+CONFIG_USE_X86INC equ 1
+CONFIG_DEBUG equ 0
+CONFIG_GPROF equ 0
+CONFIG_GCOV equ 0
+CONFIG_RVCT equ 0
+CONFIG_PIC equ 1 ;TODO: autodetect
+CONFIG_CODEC_SRCS equ 0
+CONFIG_DEBUG_LIBS equ 0
+CONFIG_DEQUANT_TOKENS equ 0
+CONFIG_DC_RECON equ 0
+CONFIG_RUNTIME_CPU_DETECT equ 1
+CONFIG_POSTPROC equ 0
+CONFIG_VP9_POSTPROC equ 0
+CONFIG_MULTITHREAD equ 1
+CONFIG_INTERNAL_STATS equ 0
+CONFIG_VP8_ENCODER equ 0
+CONFIG_VP8_DECODER equ 1
+CONFIG_VP9_ENCODER equ 0
+CONFIG_VP9_DECODER equ 1
+CONFIG_VP8 equ 1
+CONFIG_VP9 equ 1
+CONFIG_ENCODERS equ 0
+CONFIG_DECODERS equ 1
+CONFIG_STATIC_MSVCRT equ 0
+CONFIG_SPATIAL_RESAMPLING equ 0
+CONFIG_REALTIME_ONLY equ 0
+CONFIG_ONTHEFLY_BITPACKING equ 0
+CONFIG_ERROR_CONCEALMENT equ 0
+CONFIG_SHARED equ 0
+CONFIG_STATIC equ 0
+CONFIG_SMALL equ 0
+CONFIG_POSTPROC_VISUALIZER equ 0
+CONFIG_OS_SUPPORT equ 1
+CONFIG_UNIT_TESTS equ 0
+CONFIG_WEBM_IO equ 0
+CONFIG_LIBYUV equ 0
+CONFIG_DECODE_PERF_TESTS equ 0
+CONFIG_ENCODE_PERF_TESTS equ 0
+CONFIG_MULTI_RES_ENCODING equ 0
+CONFIG_TEMPORAL_DENOISING equ 1
+CONFIG_VP9_TEMPORAL_DENOISING equ 0
+CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+CONFIG_VP9_HIGHBITDEPTH equ 0
+CONFIG_BETTER_HW_COMPATIBILITY equ 0
+CONFIG_EXPERIMENTAL equ 0
+CONFIG_SIZE_LIMIT equ 0
+CONFIG_SPATIAL_SVC equ 0
+CONFIG_FP_MB_STATS equ 0
+CONFIG_EMULATE_HARDWARE equ 0
+CONFIG_MISC_FIXES equ 0
diff --git a/thirdparty/libvpx/vpx_config.h b/thirdparty/libvpx/vpx_config.h
new file mode 100644
index 000000000000..e8e91fa6effc
--- /dev/null
+++ b/thirdparty/libvpx/vpx_config.h
@@ -0,0 +1,140 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS. All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+/* This file automatically generated by configure. Do not edit!
*/ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#if defined(_MSC_VER) && (_MSC_VER < 1900) + #define INLINE __inline +#else + #define INLINE inline +#endif + +#define HAVE_MIPS32 0 +#define HAVE_MEDIA 0 + +#if defined(__i386) || defined(__i386__) || defined(_M_IX86) + #define ARCH_X86 1 + #define ARCH_X86_64 0 + + #define ARCH_ARM 0 + #define HAVE_NEON 0 + #define HAVE_NEON_ASM 0 + + #define HAVE_MMX 1 + #define HAVE_SSE2 1 + #define HAVE_SSSE3 1 + #define HAVE_AVX2 0 +#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64) + #define ARCH_X86 0 + #define ARCH_X86_64 1 + + #define ARCH_ARM 0 + #define HAVE_NEON 0 + #define HAVE_NEON_ASM 0 + + #define HAVE_MMX 1 + #define HAVE_SSE2 1 + #define HAVE_SSSE3 1 + #define HAVE_AVX2 0 +#elif defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) + #define ARCH_X86 0 + #define ARCH_X86_64 0 + + #define ARCH_ARM 1 + #define HAVE_NEON 1 + #define HAVE_NEON_ASM 1 +#elif defined(__aarch64__) + #define ARCH_X86 0 + #define ARCH_X86_64 0 + + #define ARCH_ARM 1 + #define HAVE_NEON 0 + #define HAVE_NEON_ASM 0 +#else + #define ARCH_X86 0 + #define ARCH_X86_64 0 + + #define ARCH_ARM 0 + #define HAVE_NEON 0 + #define HAVE_NEON_ASM 0 +#endif + +#define CONFIG_BIG_ENDIAN 0 //TODO: Autodetect + +#ifdef __EMSCRIPTEN__ +#define CONFIG_MULTITHREAD 0 +#else +#define CONFIG_MULTITHREAD 1 +#endif + +#ifdef _WIN32 + #define HAVE_PTHREAD_H 0 + #define HAVE_UNISTD_H 0 +#else + #define HAVE_PTHREAD_H 1 + #define HAVE_UNISTD_H 1 +#endif + +/**/ + +#define HAVE_VPX_PORTS 1 +#define CONFIG_DEPENDENCY_TRACKING 0 +#define CONFIG_EXTERNAL_BUILD 0 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 0 +#define CONFIG_INSTALL_LIBS 0 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 0 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 0 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 0 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 0 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 0 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 0 +#define CONFIG_LIBYUV 0 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 0 +#define CONFIG_TEMPORAL_DENOISING 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_MISC_FIXES 0 +#endif /* VPX_CONFIG_H */ diff --git a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm new file mode 100644 index 000000000000..b2846c410bb0 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/intrapred_neon_asm.asm @@ -0,0 +1,643 @@ +; This file was 
created from a .asm file +; using the ads2armasm_ms.pl script. +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vpx_v_predictor_4x4_neon| + EXPORT |vpx_v_predictor_8x8_neon| + EXPORT |vpx_v_predictor_16x16_neon| + EXPORT |vpx_v_predictor_32x32_neon| + EXPORT |vpx_h_predictor_4x4_neon| + EXPORT |vpx_h_predictor_8x8_neon| + EXPORT |vpx_h_predictor_16x16_neon| + EXPORT |vpx_h_predictor_32x32_neon| + EXPORT |vpx_tm_predictor_4x4_neon| + EXPORT |vpx_tm_predictor_8x8_neon| + EXPORT |vpx_tm_predictor_16x16_neon| + EXPORT |vpx_tm_predictor_32x32_neon| + + + + AREA |.text|, CODE, READONLY, ALIGN=2 + +;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_v_predictor_4x4_neon| PROC + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vpx_v_predictor_4x4_neon| + ALIGN 4 + +;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_v_predictor_8x8_neon| PROC + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + ENDP ; |vpx_v_predictor_8x8_neon| + ALIGN 4 + +;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_v_predictor_16x16_neon| PROC + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vpx_v_predictor_16x16_neon| + ALIGN 4 + +;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_v_predictor_32x32_neon| PROC + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + ENDP ; |vpx_v_predictor_32x32_neon| + ALIGN 4 + +;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, 
+; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_h_predictor_4x4_neon| PROC + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vpx_h_predictor_4x4_neon| + ALIGN 4 + +;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_h_predictor_8x8_neon| PROC + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + ENDP ; |vpx_h_predictor_8x8_neon| + ALIGN 4 + +;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_h_predictor_16x16_neon| PROC + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vpx_h_predictor_16x16_neon| + ALIGN 4 + +;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_h_predictor_32x32_neon| PROC + sub r1, r1, #16 + mov r2, #2 +loop_h + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! 
+ vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + ENDP ; |vpx_h_predictor_32x32_neon| + ALIGN 4 + +;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_tm_predictor_4x4_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.u8 {d0[]}, [r12] + + ; Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3]! + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + ; 3rd row and 4th row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3] + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + ENDP ; |vpx_tm_predictor_4x4_neon| + ALIGN 4 + +;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_tm_predictor_8x8_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; preload 8 left + vld1.8 {d30}, [r3] + + ; Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + ; 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + ENDP ; |vpx_tm_predictor_8x8_neon| + ALIGN 4 + +;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_tm_predictor_16x16_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; Load above 8 pixels + vld1.8 {q1}, [r2] + + ; preload 8 left into r12 + vld1.8 {d18}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d0 + + vmovl.u8 q10, d18 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon + ; Process two rows. 
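+ ; Each pair of rows: widen two left pixels (lanes of q10), add the (above - ytop_left) deltas held in q2/q3, then saturate back to bytes with vqmovun.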
+ vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] ; proload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + ; Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] ; proload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] ; proload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! ; preload 8 left into r12 + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + ENDP ; |vpx_tm_predictor_16x16_neon| + ALIGN 4 + +;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vpx_tm_predictor_32x32_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + ; preload 8 left pixels + vld1.8 {d26}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d0 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d0 + + vmovl.u8 q3, d26 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon + ; Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. 
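+ ; (q0/q2 now hold left pixels 4 and 5 of this 8-row batch, preloaded above)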
+ vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! ; preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + ENDP ; |vpx_tm_predictor_32x32_neon| + ALIGN 4 + + END diff --git a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm new file mode 100644 index 000000000000..9c3736faf84e --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/loopfilter_mb_neon.asm @@ -0,0 +1,641 @@ +; This file was created from a .asm file +; using the ads2armasm_ms.pl script. +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vpx_lpf_horizontal_edge_8_neon| + EXPORT |vpx_lpf_horizontal_edge_16_neon| + EXPORT |vpx_lpf_vertical_16_neon| + + AREA |.text|, CODE, READONLY, ALIGN=2 + +; void mb_lpf_horizontal_edge(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; r12 int count +|mb_lpf_horizontal_edge| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + +h_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines + + vld1.u8 {d0}, [r8@64], r1 ; p7 + vld1.u8 {d1}, [r8@64], r1 ; p6 + vld1.u8 {d2}, [r8@64], r1 ; p5 + vld1.u8 {d3}, [r8@64], r1 ; p4 + vld1.u8 {d4}, [r8@64], r1 ; p3 + vld1.u8 {d5}, [r8@64], r1 ; p2 + vld1.u8 {d6}, [r8@64], r1 ; p1 + vld1.u8 {d7}, [r8@64], r1 ; p0 + vld1.u8 {d8}, [r8@64], r1 ; q0 + vld1.u8 {d9}, [r8@64], r1 ; q1 + vld1.u8 {d10}, [r8@64], r1 ; q2 + vld1.u8 {d11}, [r8@64], r1 ; q3 + vld1.u8 {d12}, [r8@64], r1 ; q4 + vld1.u8 {d13}, [r8@64], r1 ; q5 + vld1.u8 {d14}, [r8@64], r1 ; q6 + vld1.u8 {d15}, [r8@64], r1 ; q7 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. 
+ sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8@64], r1 ; store op1 + vst1.u8 {d24}, [r8@64], r1 ; store op0 + vst1.u8 {d23}, [r8@64], r1 ; store oq0 + vst1.u8 {d26}, [r8@64], r1 ; store oq1 + + b h_next + +h_mbfilter + tst r7, #2 + beq h_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8@64], r1 ; store op2 + vst1.u8 {d19}, [r8@64], r1 ; store op1 + vst1.u8 {d20}, [r8@64], r1 ; store op0 + vst1.u8 {d21}, [r8@64], r1 ; store oq0 + vst1.u8 {d22}, [r8@64], r1 ; store oq1 + vst1.u8 {d23}, [r8@64], r1 ; store oq2 + + b h_next + +h_wide_mbfilter + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8@64], r1 ; store op6 + vst1.u8 {d24}, [r8@64], r1 ; store op5 + vst1.u8 {d25}, [r8@64], r1 ; store op4 + vst1.u8 {d26}, [r8@64], r1 ; store op3 + vst1.u8 {d27}, [r8@64], r1 ; store op2 + vst1.u8 {d18}, [r8@64], r1 ; store op1 + vst1.u8 {d19}, [r8@64], r1 ; store op0 + vst1.u8 {d20}, [r8@64], r1 ; store oq0 + vst1.u8 {d21}, [r8@64], r1 ; store oq1 + vst1.u8 {d22}, [r8@64], r1 ; store oq2 + vst1.u8 {d23}, [r8@64], r1 ; store oq3 + vst1.u8 {d1}, [r8@64], r1 ; store oq4 + vst1.u8 {d2}, [r8@64], r1 ; store oq5 + vst1.u8 {d3}, [r8@64], r1 ; store oq6 + +h_next + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |mb_lpf_horizontal_edge| + ALIGN 4 + +; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_edge_8_neon| PROC + mov r12, #1 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_edge_8_neon| + ALIGN 4 + +; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|vpx_lpf_horizontal_edge_16_neon| PROC + mov r12, #2 + b mb_lpf_horizontal_edge + ENDP ; |vpx_lpf_horizontal_edge_16_neon| + ALIGN 4 + +; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|vpx_lpf_vertical_16_neon| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8@64], r1 + vld1.8 {d8}, [r0@64], r1 + vld1.8 {d1}, [r8@64], r1 + vld1.8 {d9}, [r0@64], r1 + vld1.8 {d2}, [r8@64], r1 + vld1.8 {d10}, [r0@64], r1 + vld1.8 {d3}, [r8@64], r1 + vld1.8 {d11}, [r0@64], r1 + vld1.8 {d4}, [r8@64], r1 + vld1.8 {d12}, [r0@64], r1 + vld1.8 {d5}, [r8@64], r1 + vld1.8 {d13}, [r0@64], r1 + vld1.8 {d6}, [r8@64], r1 + vld1.8 {d14}, [r0@64], r1 + vld1.8 {d7}, [r8@64], r1 + vld1.8 {d15}, [r0@64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + ; flat && mask were not set for 
any of the channels. Just store the values + ; from filter. + sub r8, r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + + b v_end + +v_mbfilter + tst r7, #2 + beq v_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_end + +v_wide_mbfilter + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8@64], r1 + vst1.8 {d20}, [r0@64], r1 + vst1.8 {d16}, [r8@64], r1 + vst1.8 {d21}, [r0@64], r1 + vst1.8 {d24}, [r8@64], r1 + vst1.8 {d22}, [r0@64], r1 + vst1.8 {d25}, [r8@64], r1 + vst1.8 {d23}, [r0@64], r1 + vst1.8 {d26}, [r8@64], r1 + vst1.8 {d1}, [r0@64], r1 + vst1.8 {d27}, [r8@64], r1 + vst1.8 {d2}, [r0@64], r1 + vst1.8 {d18}, [r8@64], r1 + vst1.8 {d3}, [r0@64], r1 + vst1.8 {d19}, [r8@64], r1 + vst1.8 {d15}, [r0@64], r1 + +v_end + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |vpx_lpf_vertical_16_neon| + ALIGN 4 + +; void vpx_wide_mbfilter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. 
+; +; r0-r3 PRESERVE +; d16 blimit +; d17 limit +; d18 thresh +; d0 p7 +; d1 p6 +; d2 p5 +; d3 p4 +; d4 p3 +; d5 p2 +; d6 p1 +; d7 p0 +; d8 q0 +; d9 q1 +; d10 q2 +; d11 q3 +; d12 q4 +; d13 q5 +; d14 q6 +; d15 q7 +|vpx_wide_mbfilter_neon| PROC + mov r7, #0 + + ; filter_mask + vabd.u8 d19, d4, d5 ; abs(p3 - p2) + vabd.u8 d20, d5, d6 ; abs(p2 - p1) + vabd.u8 d21, d6, d7 ; abs(p1 - p0) + vabd.u8 d22, d9, d8 ; abs(q1 - q0) + vabd.u8 d23, d10, d9 ; abs(q2 - q1) + vabd.u8 d24, d11, d10 ; abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 ; abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d17, d19 + + ; flatmask4 + vabd.u8 d25, d7, d5 ; abs(p0 - p2) + vabd.u8 d26, d8, d10 ; abs(q0 - q2) + vabd.u8 d27, d4, d7 ; abs(p3 - p0) + vabd.u8 d28, d11, d8 ; abs(q3 - q0) + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 ; a = a / 2 + vqadd.u8 d24, d24, d23 ; a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 + + vcge.u8 d20, d30, d20 ; flat + + vand d19, d19, d24 ; mask + + ; hevmask + vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 ; hev + + vand d16, d20, d19 ; flat && mask + vmov r5, r6, d16 + + ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 ; abs(p4 - p0) + vabd.u8 d23, d12, d8 ; abs(q4 - q0) + vabd.u8 d24, d7, d2 ; abs(p0 - p5) + vabd.u8 d25, d8, d13 ; abs(q0 - q5) + vabd.u8 d26, d1, d7 ; abs(p6 - p0) + vabd.u8 d27, d14, d8 ; abs(q6 - q0) + vabd.u8 d28, d0, d7 ; abs(p7 - p0) + vabd.u8 d29, d15, d8 ; abs(q7 - q0) + + ; only compare the largest value to thresh + vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 ; flat2 + + vmov.u8 d22, #0x80 + + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch + + vand d17, d18, d16 ; flat2 && flat && mask + vmov r5, r6, d17 + + ; mbfilter() function + + ; filter() function + ; convert to signed + veor d23, d8, d22 ; qs0 + veor d24, d7, d22 ; ps0 + veor d25, d6, d22 ; ps1 + veor d26, d9, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 ; ( qs0 - ps0) + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + vand d29, d29, d21 ; filter &= hev + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 
d29, d29, #1 + vbic d29, d29, d21 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d23, d23, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + + ; mbfilter flat && mask branch + ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's + ; and using vibt on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 + vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 ; r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 ; r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 ; r_op0 + + vsubw.u8 q15, d4 ; oq0 = op0 - p3 + vsubw.u8 q15, d7 ; oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 ; r_oq0 + + vsubw.u8 q15, d5 ; oq1 = oq0 - p2 + vsubw.u8 q15, d8 ; oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 ; r_oq1 + + vsubw.u8 q15, d6 ; oq2 = oq0 - p1 + vsubw.u8 q15, d9 ; oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 ; r_oq2 + + ; Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + ; wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 ; w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 ; w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 ; w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 ; w_op3 + + vaddw.u8 q15, q14, d5 ; op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 ; op2 += q4 + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 ; w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d13 ; op1 += q5 + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 ; w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d14 ; op0 += q6 + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 ; w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d15 ; oq0 += q7 + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 ; w_oq0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vaddw.u8 q15, d9 ; 
oq1 += q1 + vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 ; oq1 += q7 + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 ; w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 ; w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 ; w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 ; w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 ; w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 ; w_oq6 + vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) + + bx lr + ENDP ; |vpx_wide_mbfilter_neon| + ALIGN 4 + + END diff --git a/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm new file mode 100644 index 000000000000..4cf9988e659d --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/armasm_ms/save_reg_neon.asm @@ -0,0 +1,39 @@ +; This file was created from a .asm file +; using the ads2armasm_ms.pl script. +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vpx_push_neon| + EXPORT |vpx_pop_neon| + + + + + AREA |.text|, CODE, READONLY, ALIGN=2 + +|vpx_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + ALIGN 4 + +|vpx_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + ALIGN 4 + + END + diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s b/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s new file mode 100644 index 000000000000..3932227fc5b1 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas/intrapred_neon_asm.s @@ -0,0 +1,658 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. + .equ DO1STROUNDING, 0 +@ +@ Copyright (c) 2014 The WebM project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. 
+@ + + .global vpx_v_predictor_4x4_neon + .type vpx_v_predictor_4x4_neon, function + .global vpx_v_predictor_8x8_neon + .type vpx_v_predictor_8x8_neon, function + .global vpx_v_predictor_16x16_neon + .type vpx_v_predictor_16x16_neon, function + .global vpx_v_predictor_32x32_neon + .type vpx_v_predictor_32x32_neon, function + .global vpx_h_predictor_4x4_neon + .type vpx_h_predictor_4x4_neon, function + .global vpx_h_predictor_8x8_neon + .type vpx_h_predictor_8x8_neon, function + .global vpx_h_predictor_16x16_neon + .type vpx_h_predictor_16x16_neon, function + .global vpx_h_predictor_32x32_neon + .type vpx_h_predictor_32x32_neon, function + .global vpx_tm_predictor_4x4_neon + .type vpx_tm_predictor_4x4_neon, function + .global vpx_tm_predictor_8x8_neon + .type vpx_tm_predictor_8x8_neon, function + .global vpx_tm_predictor_16x16_neon + .type vpx_tm_predictor_16x16_neon, function + .global vpx_tm_predictor_32x32_neon + .type vpx_tm_predictor_32x32_neon, function + .arm + .eabi_attribute 24, 1 @Tag_ABI_align_needed + .eabi_attribute 25, 1 @Tag_ABI_align_preserved + +.text +.p2align 2 + +@void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_v_predictor_4x4_neon: + vpx_v_predictor_4x4_neon: @ PROC + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + .size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon @ ENDP @ |vpx_v_predictor_4x4_neon| + +@void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_v_predictor_8x8_neon: + vpx_v_predictor_8x8_neon: @ PROC + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + .size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon @ ENDP @ |vpx_v_predictor_8x8_neon| + +@void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_v_predictor_16x16_neon: + vpx_v_predictor_16x16_neon: @ PROC + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + .size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon @ ENDP @ |vpx_v_predictor_16x16_neon| + +@void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_v_predictor_32x32_neon: + vpx_v_predictor_32x32_neon: @ PROC + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v: + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, 
q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + .size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon @ ENDP @ |vpx_v_predictor_32x32_neon| + +@void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_h_predictor_4x4_neon: + vpx_h_predictor_4x4_neon: @ PROC + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + .size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon @ ENDP @ |vpx_h_predictor_4x4_neon| + +@void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_h_predictor_8x8_neon: + vpx_h_predictor_8x8_neon: @ PROC + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + .size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon @ ENDP @ |vpx_h_predictor_8x8_neon| + +@void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_h_predictor_16x16_neon: + vpx_h_predictor_16x16_neon: @ PROC + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + .size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon @ ENDP @ |vpx_h_predictor_16x16_neon| + +@void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_h_predictor_32x32_neon: + vpx_h_predictor_32x32_neon: @ PROC + sub r1, r1, #16 + mov r2, #2 +loop_h: + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! 
+ vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + .size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon @ ENDP @ |vpx_h_predictor_32x32_neon| + +@void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_tm_predictor_4x4_neon: + vpx_tm_predictor_4x4_neon: @ PROC + @ Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.u8 {d0[]}, [r12] + + @ Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + @ Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + @ Load left row by row and compute left + (above - ytop_left) + @ 1st row and 2nd row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3]! + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + @ 3rd row and 4th row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3] + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + .size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon @ ENDP @ |vpx_tm_predictor_4x4_neon| + +@void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_tm_predictor_8x8_neon: + vpx_tm_predictor_8x8_neon: @ PROC + @ Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ preload 8 left + vld1.8 {d30}, [r3] + + @ Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + @ Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + @ Load left row by row and compute left + (above - ytop_left) + @ 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + @ 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + @ 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + @ 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + .size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon @ ENDP @ |vpx_tm_predictor_8x8_neon| + +@void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t 
*above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_tm_predictor_16x16_neon: + vpx_tm_predictor_16x16_neon: @ PROC + @ Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ Load above 8 pixels + vld1.8 {q1}, [r2] + + @ preload 8 left into r12 + vld1.8 {d18}, [r3]! + + @ Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d0 + + vmovl.u8 q10, d18 + + @ Load left row by row and compute left + (above - ytop_left) + @ Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon: + @ Process two rows. + vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] @ proload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + @ Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] @ proload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] @ proload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! @ preload 8 left into r12 + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + .size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon @ ENDP @ |vpx_tm_predictor_16x16_neon| + +@void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, +@ const uint8_t *above, +@ const uint8_t *left) +@ r0 uint8_t *dst +@ r1 ptrdiff_t y_stride +@ r2 const uint8_t *above +@ r3 const uint8_t *left + +_vpx_tm_predictor_32x32_neon: + vpx_tm_predictor_32x32_neon: @ PROC + @ Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + @ preload 8 left pixels + vld1.8 {d26}, [r3]! + + @ Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d0 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d0 + + vmovl.u8 q3, d26 + + @ Load left row by row and compute left + (above - ytop_left) + @ Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon: + @ Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. 
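+ @ (q1/q2 hold left pixels 2 and 3 of this 8-row batch, preloaded above)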
+ vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! @ preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + .size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon @ ENDP @ |vpx_tm_predictor_32x32_neon| + + .section .note.GNU-stack,"",%progbits diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s new file mode 100644 index 000000000000..f6b05406fb20 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas/loopfilter_mb_neon.s @@ -0,0 +1,647 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. + .equ DO1STROUNDING, 0 +@ +@ Copyright (c) 2013 The WebM project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. 
+@ + + .global vpx_lpf_horizontal_edge_8_neon + .type vpx_lpf_horizontal_edge_8_neon, function + .global vpx_lpf_horizontal_edge_16_neon + .type vpx_lpf_horizontal_edge_16_neon, function + .global vpx_lpf_vertical_16_neon + .type vpx_lpf_vertical_16_neon, function + .arm + +.text +.p2align 2 + +@ void mb_lpf_horizontal_edge(uint8_t *s, int p, +@ const uint8_t *blimit, +@ const uint8_t *limit, +@ const uint8_t *thresh, +@ int count) +@ r0 uint8_t *s, +@ r1 int p, /* pitch */ +@ r2 const uint8_t *blimit, +@ r3 const uint8_t *limit, +@ sp const uint8_t *thresh, +@ r12 int count +_mb_lpf_horizontal_edge: + mb_lpf_horizontal_edge: @ PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] @ load thresh + +h_count: + vld1.8 {d16[]}, [r2] @ load *blimit + vld1.8 {d17[]}, [r3] @ load *limit + vld1.8 {d18[]}, [r4] @ load *thresh + + sub r8, r0, r1, lsl #3 @ move src pointer down by 8 lines + + vld1.u8 {d0}, [r8,:64], r1 @ p7 + vld1.u8 {d1}, [r8,:64], r1 @ p6 + vld1.u8 {d2}, [r8,:64], r1 @ p5 + vld1.u8 {d3}, [r8,:64], r1 @ p4 + vld1.u8 {d4}, [r8,:64], r1 @ p3 + vld1.u8 {d5}, [r8,:64], r1 @ p2 + vld1.u8 {d6}, [r8,:64], r1 @ p1 + vld1.u8 {d7}, [r8,:64], r1 @ p0 + vld1.u8 {d8}, [r8,:64], r1 @ q0 + vld1.u8 {d9}, [r8,:64], r1 @ q1 + vld1.u8 {d10}, [r8,:64], r1 @ q2 + vld1.u8 {d11}, [r8,:64], r1 @ q3 + vld1.u8 {d12}, [r8,:64], r1 @ q4 + vld1.u8 {d13}, [r8,:64], r1 @ q5 + vld1.u8 {d14}, [r8,:64], r1 @ q6 + vld1.u8 {d15}, [r8,:64], r1 @ q7 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + @ flat && mask were not set for any of the channels. Just store the values + @ from filter. + sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8,:64], r1 @ store op1 + vst1.u8 {d24}, [r8,:64], r1 @ store op0 + vst1.u8 {d23}, [r8,:64], r1 @ store oq0 + vst1.u8 {d26}, [r8,:64], r1 @ store oq1 + + b h_next + +h_mbfilter: + tst r7, #2 + beq h_wide_mbfilter + + @ flat2 was not set for any of the channels. Just store the values from + @ mbfilter. 
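+ @ (r8 = s - 3 * pitch: the six mbfilter outputs op2..oq2 are centered on the edge)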
+ sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8,:64], r1 @ store op2 + vst1.u8 {d19}, [r8,:64], r1 @ store op1 + vst1.u8 {d20}, [r8,:64], r1 @ store op0 + vst1.u8 {d21}, [r8,:64], r1 @ store oq0 + vst1.u8 {d22}, [r8,:64], r1 @ store oq1 + vst1.u8 {d23}, [r8,:64], r1 @ store oq2 + + b h_next + +h_wide_mbfilter: + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8,:64], r1 @ store op6 + vst1.u8 {d24}, [r8,:64], r1 @ store op5 + vst1.u8 {d25}, [r8,:64], r1 @ store op4 + vst1.u8 {d26}, [r8,:64], r1 @ store op3 + vst1.u8 {d27}, [r8,:64], r1 @ store op2 + vst1.u8 {d18}, [r8,:64], r1 @ store op1 + vst1.u8 {d19}, [r8,:64], r1 @ store op0 + vst1.u8 {d20}, [r8,:64], r1 @ store oq0 + vst1.u8 {d21}, [r8,:64], r1 @ store oq1 + vst1.u8 {d22}, [r8,:64], r1 @ store oq2 + vst1.u8 {d23}, [r8,:64], r1 @ store oq3 + vst1.u8 {d1}, [r8,:64], r1 @ store oq4 + vst1.u8 {d2}, [r8,:64], r1 @ store oq5 + vst1.u8 {d3}, [r8,:64], r1 @ store oq6 + +h_next: + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + .size mb_lpf_horizontal_edge, .-mb_lpf_horizontal_edge @ ENDP @ |mb_lpf_horizontal_edge| + +@ void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, +@ const uint8_t *blimit, +@ const uint8_t *limit, +@ const uint8_t *thresh) +@ r0 uint8_t *s, +@ r1 int pitch, +@ r2 const uint8_t *blimit, +@ r3 const uint8_t *limit, +@ sp const uint8_t *thresh +_vpx_lpf_horizontal_edge_8_neon: + vpx_lpf_horizontal_edge_8_neon: @ PROC + mov r12, #1 + b mb_lpf_horizontal_edge + .size vpx_lpf_horizontal_edge_8_neon, .-vpx_lpf_horizontal_edge_8_neon @ ENDP @ |vpx_lpf_horizontal_edge_8_neon| + +@ void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, +@ const uint8_t *blimit, +@ const uint8_t *limit, +@ const uint8_t *thresh) +@ r0 uint8_t *s, +@ r1 int pitch, +@ r2 const uint8_t *blimit, +@ r3 const uint8_t *limit, +@ sp const uint8_t *thresh +_vpx_lpf_horizontal_edge_16_neon: + vpx_lpf_horizontal_edge_16_neon: @ PROC + mov r12, #2 + b mb_lpf_horizontal_edge + .size vpx_lpf_horizontal_edge_16_neon, .-vpx_lpf_horizontal_edge_16_neon @ ENDP @ |vpx_lpf_horizontal_edge_16_neon| + +@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, +@ const uint8_t *blimit, +@ const uint8_t *limit, +@ const uint8_t *thresh) +@ r0 uint8_t *s, +@ r1 int p, /* pitch */ +@ r2 const uint8_t *blimit, +@ r3 const uint8_t *limit, +@ sp const uint8_t *thresh, +_vpx_lpf_vertical_16_neon: + vpx_lpf_vertical_16_neon: @ PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] @ load thresh + + vld1.8 {d16[]}, [r2] @ load *blimit + vld1.8 {d17[]}, [r3] @ load *limit + vld1.8 {d18[]}, [r4] @ load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8,:64], r1 + vld1.8 {d8}, [r0,:64], r1 + vld1.8 {d1}, [r8,:64], r1 + vld1.8 {d9}, [r0,:64], r1 + vld1.8 {d2}, [r8,:64], r1 + vld1.8 {d10}, [r0,:64], r1 + vld1.8 {d3}, [r8,:64], r1 + vld1.8 {d11}, [r0,:64], r1 + vld1.8 {d4}, [r8,:64], r1 + vld1.8 {d12}, [r0,:64], r1 + vld1.8 {d5}, [r8,:64], r1 + vld1.8 {d13}, [r0,:64], r1 + vld1.8 {d6}, [r8,:64], r1 + vld1.8 {d14}, [r0,:64], r1 + vld1.8 {d7}, [r8,:64], r1 + vld1.8 {d15}, [r0,:64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + @ flat && mask were not set for any of the channels. 
Just store the values + @ from filter. + sub r8, r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + + b v_end + +v_mbfilter: + tst r7, #2 + beq v_wide_mbfilter + + @ flat2 was not set for any of the channels. Just store the values from + @ mbfilter. + sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_end + +v_wide_mbfilter: + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8,:64], r1 + vst1.8 {d20}, [r0,:64], r1 + vst1.8 {d16}, [r8,:64], r1 + vst1.8 {d21}, [r0,:64], r1 + vst1.8 {d24}, [r8,:64], r1 + vst1.8 {d22}, [r0,:64], r1 + vst1.8 {d25}, [r8,:64], r1 + vst1.8 {d23}, [r0,:64], r1 + vst1.8 {d26}, [r8,:64], r1 + vst1.8 {d1}, [r0,:64], r1 + vst1.8 {d27}, [r8,:64], r1 + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d18}, [r8,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + vst1.8 {d19}, [r8,:64], r1 + vst1.8 {d15}, [r0,:64], r1 + +v_end: + vpop {d8-d15} + pop {r4-r8, pc} + + .size vpx_lpf_vertical_16_neon, .-vpx_lpf_vertical_16_neon @ ENDP @ |vpx_lpf_vertical_16_neon| + +@ void vpx_wide_mbfilter_neon(); +@ This is a helper function for the loopfilters. The invidual functions do the +@ necessary load, transpose (if necessary) and store. 
+@ +@ r0-r3 PRESERVE +@ d16 blimit +@ d17 limit +@ d18 thresh +@ d0 p7 +@ d1 p6 +@ d2 p5 +@ d3 p4 +@ d4 p3 +@ d5 p2 +@ d6 p1 +@ d7 p0 +@ d8 q0 +@ d9 q1 +@ d10 q2 +@ d11 q3 +@ d12 q4 +@ d13 q5 +@ d14 q6 +@ d15 q7 +_vpx_wide_mbfilter_neon: + vpx_wide_mbfilter_neon: @ PROC + mov r7, #0 + + @ filter_mask + vabd.u8 d19, d4, d5 @ abs(p3 - p2) + vabd.u8 d20, d5, d6 @ abs(p2 - p1) + vabd.u8 d21, d6, d7 @ abs(p1 - p0) + vabd.u8 d22, d9, d8 @ abs(q1 - q0) + vabd.u8 d23, d10, d9 @ abs(q2 - q1) + vabd.u8 d24, d11, d10 @ abs(q3 - q2) + + @ only compare the largest value to limit + vmax.u8 d19, d19, d20 @ max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 @ max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 @ max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 @ abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 @ a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 @ b = abs(p0 - q0) * 2 + + @ abs () > limit + vcge.u8 d19, d17, d19 + + @ flatmask4 + vabd.u8 d25, d7, d5 @ abs(p0 - p2) + vabd.u8 d26, d8, d10 @ abs(q0 - q2) + vabd.u8 d27, d4, d7 @ abs(p3 - p0) + vabd.u8 d28, d11, d8 @ abs(q3 - q0) + + @ only compare the largest value to thresh + vmax.u8 d25, d25, d26 @ max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 @ max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 @ a = a / 2 + vqadd.u8 d24, d24, d23 @ a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 @ (a > blimit * 2 + limit) * -1 + + vcge.u8 d20, d30, d20 @ flat + + vand d19, d19, d24 @ mask + + @ hevmask + vcgt.u8 d21, d21, d18 @ (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 @ (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 @ hev + + vand d16, d20, d19 @ flat && mask + vmov r5, r6, d16 + + @ flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 @ abs(p4 - p0) + vabd.u8 d23, d12, d8 @ abs(q4 - q0) + vabd.u8 d24, d7, d2 @ abs(p0 - p5) + vabd.u8 d25, d8, d13 @ abs(q0 - q5) + vabd.u8 d26, d1, d7 @ abs(p6 - p0) + vabd.u8 d27, d14, d8 @ abs(q6 - q0) + vabd.u8 d28, d0, d7 @ abs(p7 - p0) + vabd.u8 d29, d15, d8 @ abs(q7 - q0) + + @ only compare the largest value to thresh + vmax.u8 d22, d22, d23 @ max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 @ max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 @ max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 @ max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 @ flat2 + + vmov.u8 d22, #0x80 + + orrs r5, r5, r6 @ Check for 0 + orreq r7, r7, #1 @ Only do filter branch + + vand d17, d18, d16 @ flat2 && flat && mask + vmov r5, r6, d17 + + @ mbfilter() function + + @ filter() function + @ convert to signed + veor d23, d8, d22 @ qs0 + veor d24, d7, d22 @ ps0 + veor d25, d6, d22 @ ps1 + veor d26, d9, d22 @ qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 @ ( qs0 - ps0) + vqsub.s8 d29, d25, d26 @ filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 @ 3 * ( qs0 - ps0) + vand d29, d29, d21 @ filter &= hev + vaddw.s8 q15, q15, d29 @ filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 + + @ filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 @ filter &= mask + + vqadd.s8 d30, d28, d27 @ filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 @ filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 @ filter2 >>= 3 + vshr.s8 d29, d29, #3 @ filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 @ op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 @ oq0 = clamp(qs0 - filter1) + + @ outer tap adjustments: 
++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 @ filter &= ~hev + + vqadd.s8 d25, d25, d29 @ op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 @ oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 @ *f_op0 = u^0x80 + veor d23, d23, d22 @ *f_oq0 = u^0x80 + veor d25, d25, d22 @ *f_op1 = u^0x80 + veor d26, d26, d22 @ *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + orrs r5, r5, r6 @ Check for 0 + orreq r7, r7, #2 @ Only do mbfilter branch + + @ mbfilter flat && mask branch + @ TODO(fgalligan): Can I decrease the cycles shifting to consective d's + @ and using vibt on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 @ op2 = p0 + q0 + vmlal.u8 q15, d4, d27 @ op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 @ op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 @ op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 @ r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 @ r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 @ r_op0 + + vsubw.u8 q15, d4 @ oq0 = op0 - p3 + vsubw.u8 q15, d7 @ oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 @ r_oq0 + + vsubw.u8 q15, d5 @ oq1 = oq0 - p2 + vsubw.u8 q15, d8 @ oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 @ r_oq1 + + vsubw.u8 q15, d6 @ oq2 = oq0 - p1 + vsubw.u8 q15, d9 @ oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 @ r_oq2 + + @ Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 @ t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 @ t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 @ t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 @ t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 @ t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 @ t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 @ t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + @ wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 @ op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 @ op6 += p7 * 3 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 @ w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 @ w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 @ w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 @ w_op3 + + vaddw.u8 q15, q14, d5 @ op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 @ op2 += q4 + vbif d26, d4, d17 @ op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 @ w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 @ op1 += p1 + vaddw.u8 q15, d13 @ op1 += q5 + vbif d27, d18, d17 @ op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 @ w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 @ op0 += p0 + vaddw.u8 q15, d14 @ op0 += q6 + vbif d18, d19, d17 @ op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 @ w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 @ oq0 += q0 + vaddw.u8 q15, d15 @ oq0 += q7 + vbif d19, d20, d17 @ op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 @ w_oq0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d2, 
d9 + vaddw.u8 q15, d9 @ oq1 += q1 + vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 @ oq1 += q7 + vbif d20, d21, d17 @ oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 @ w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 @ oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 @ w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 @ oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 @ w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 @ op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 @ w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 @ op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 @ w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 @ op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 @ oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 @ w_oq6 + vbif d1, d12, d17 @ oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 @ oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 @ oq6 |= q6 & ~(f2 & f & m) + + bx lr + .size vpx_wide_mbfilter_neon, .-vpx_wide_mbfilter_neon @ ENDP @ |vpx_wide_mbfilter_neon| + + .section .note.GNU-stack,"",%progbits diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s new file mode 100644 index 000000000000..e8852fa0d0d1 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas/save_reg_neon.s @@ -0,0 +1,44 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. + .equ DO1STROUNDING, 0 +@ +@ Copyright (c) 2010 The WebM project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ + + + .global vpx_push_neon + .type vpx_push_neon, function + .global vpx_pop_neon + .type vpx_pop_neon, function + + .arm + .eabi_attribute 24, 1 @Tag_ABI_align_needed + .eabi_attribute 25, 1 @Tag_ABI_align_preserved + +.text +.p2align 2 + +_vpx_push_neon: + vpx_push_neon: @ PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + .size vpx_push_neon, .-vpx_push_neon @ ENDP + +_vpx_pop_neon: + vpx_pop_neon: @ PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + .size vpx_pop_neon, .-vpx_pop_neon @ ENDP + + + .section .note.GNU-stack,"",%progbits diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s new file mode 100644 index 000000000000..1c527afcff6d --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/intrapred_neon_asm.s @@ -0,0 +1,660 @@ +@ This file was created from a .asm file +@ using the ads2gas_apple.pl script. + + .set WIDE_REFERENCE, 0 + .set ARCHITECTURE, 5 + .set DO1STROUNDING, 0 + @ + @ Copyright (c) 2014 The WebM project authors. All Rights Reserved. + @ + @ Use of this source code is governed by a BSD-style license + @ that can be found in the LICENSE file in the root of the source + @ tree. An additional intellectual property rights grant can be found + @ in the file PATENTS. 
All contributing project authors may + @ be found in the AUTHORS file in the root of the source tree. + @ + + .globl _vpx_v_predictor_4x4_neon + .globl vpx_v_predictor_4x4_neon + .globl _vpx_v_predictor_8x8_neon + .globl vpx_v_predictor_8x8_neon + .globl _vpx_v_predictor_16x16_neon + .globl vpx_v_predictor_16x16_neon + .globl _vpx_v_predictor_32x32_neon + .globl vpx_v_predictor_32x32_neon + .globl _vpx_h_predictor_4x4_neon + .globl vpx_h_predictor_4x4_neon + .globl _vpx_h_predictor_8x8_neon + .globl vpx_h_predictor_8x8_neon + .globl _vpx_h_predictor_16x16_neon + .globl vpx_h_predictor_16x16_neon + .globl _vpx_h_predictor_32x32_neon + .globl vpx_h_predictor_32x32_neon + .globl _vpx_tm_predictor_4x4_neon + .globl vpx_tm_predictor_4x4_neon + .globl _vpx_tm_predictor_8x8_neon + .globl vpx_tm_predictor_8x8_neon + .globl _vpx_tm_predictor_16x16_neon + .globl vpx_tm_predictor_16x16_neon + .globl _vpx_tm_predictor_32x32_neon + .globl vpx_tm_predictor_32x32_neon + @ ARM + @ + @ PRESERVE8 + +.text +.p2align 2 + + @void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_v_predictor_4x4_neon: + vpx_v_predictor_4x4_neon: @ + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + @ @ |vpx_v_predictor_4x4_neon| + + @void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_v_predictor_8x8_neon: + vpx_v_predictor_8x8_neon: @ + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + @ @ |vpx_v_predictor_8x8_neon| + + @void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_v_predictor_16x16_neon: + vpx_v_predictor_16x16_neon: @ + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + @ @ |vpx_v_predictor_16x16_neon| + + @void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_v_predictor_32x32_neon: + vpx_v_predictor_32x32_neon: @ + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v: + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + 
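+ @ loop_v writes 16 rows of the 32 "above" bytes held in q0/q1 per pass;
+ @ with r2 = 2 the two passes cover the full 32x32 block. A scalar sketch
+ @ of the same predictor (assuming <string.h> memcpy; not part of this file):
+ @   for (int r = 0; r < 32; ++r)
+ @     memcpy(dst + r * y_stride, above, 32);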
bx lr + @ @ |vpx_v_predictor_32x32_neon| + + @void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_h_predictor_4x4_neon: + vpx_h_predictor_4x4_neon: @ + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + @ @ |vpx_h_predictor_4x4_neon| + + @void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_h_predictor_8x8_neon: + vpx_h_predictor_8x8_neon: @ + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + @ @ |vpx_h_predictor_8x8_neon| + + @void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_h_predictor_16x16_neon: + vpx_h_predictor_16x16_neon: @ + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + @ @ |vpx_h_predictor_16x16_neon| + + @void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_h_predictor_32x32_neon: + vpx_h_predictor_32x32_neon: @ + sub r1, r1, #16 + mov r2, #2 +loop_h: + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! 
+ vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + @ @ |vpx_h_predictor_32x32_neon| + + @void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_tm_predictor_4x4_neon: + vpx_tm_predictor_4x4_neon: @ + @ Load ytop_left = above[-1] @ + sub r12, r2, #1 + vld1.u8 {d0[]}, [r12] + + @ Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + @ Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + @ Load left row by row and compute left + (above - ytop_left) + @ 1st row and 2nd row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3]! + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + @ 3rd row and 4th row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3] + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + @ @ |vpx_tm_predictor_4x4_neon| + + @void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_tm_predictor_8x8_neon: + vpx_tm_predictor_8x8_neon: @ + @ Load ytop_left = above[-1] @ + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ preload 8 left + vld1.8 {d30}, [r3] + + @ Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + @ Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + @ Load left row by row and compute left + (above - ytop_left) + @ 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + @ 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + @ 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + @ 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + @ @ |vpx_tm_predictor_8x8_neon| + + @void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_tm_predictor_16x16_neon: + vpx_tm_predictor_16x16_neon: @ + @ Load ytop_left = above[-1] @ + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ Load above 8 pixels + vld1.8 {q1}, [r2] + + @ preload 8 left into r12 + vld1.8 {d18}, [r3]! + + @ Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d0 + + vmovl.u8 q10, d18 + + @ Load left row by row and compute left + (above - ytop_left) + @ Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon: + @ Process two rows. 
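+ @ TrueMotion: every output pixel is left[r] + (above[c] - above[-1]),
+ @ saturated to 8 bits; q2/q3 hold (above - ytop_left) widened to 16 bits.
+ @ A scalar sketch, assuming a hypothetical clip_u8() that clamps to [0, 255]:
+ @   for (int r = 0; r < 16; ++r)
+ @     for (int c = 0; c < 16; ++c)
+ @       dst[r * y_stride + c] = clip_u8(left[r] + above[c] - above[-1]);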
+ vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] @ proload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + @ Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] @ proload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] @ proload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! @ preload 8 left into r12 + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + @ @ |vpx_tm_predictor_16x16_neon| + + @void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, + @ const uint8_t *above, + @ const uint8_t *left) + @ r0 uint8_t *dst + @ r1 ptrdiff_t y_stride + @ r2 const uint8_t *above + @ r3 const uint8_t *left + +_vpx_tm_predictor_32x32_neon: + vpx_tm_predictor_32x32_neon: @ + @ Load ytop_left = above[-1] @ + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + @ Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + @ preload 8 left pixels + vld1.8 {d26}, [r3]! + + @ Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d0 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d0 + + vmovl.u8 q3, d26 + + @ Load left row by row and compute left + (above - ytop_left) + @ Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon: + @ Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. 
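+ @ q3 (d6/d7) holds eight widened left pixels; each pass of loop_32x32_neon
+ @ consumes them two rows at a time and reloads eight more at the end, so
+ @ four passes cover all 32 rows.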
+ vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + @ Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! @ preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + @ @ |vpx_tm_predictor_32x32_neon| + diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s new file mode 100644 index 000000000000..69f7e5207eda --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/loopfilter_mb_neon.s @@ -0,0 +1,649 @@ +@ This file was created from a .asm file +@ using the ads2gas_apple.pl script. + + .set WIDE_REFERENCE, 0 + .set ARCHITECTURE, 5 + .set DO1STROUNDING, 0 + @ + @ Copyright (c) 2013 The WebM project authors. All Rights Reserved. + @ + @ Use of this source code is governed by a BSD-style license + @ that can be found in the LICENSE file in the root of the source + @ tree. An additional intellectual property rights grant can be found + @ in the file PATENTS. All contributing project authors may + @ be found in the AUTHORS file in the root of the source tree. + @ + + .globl _vpx_lpf_horizontal_edge_8_neon + .globl vpx_lpf_horizontal_edge_8_neon + .globl _vpx_lpf_horizontal_edge_16_neon + .globl vpx_lpf_horizontal_edge_16_neon + .globl _vpx_lpf_vertical_16_neon + .globl vpx_lpf_vertical_16_neon + @ ARM + +.text +.p2align 2 + + @ void mb_lpf_horizontal_edge(uint8_t *s, int p, + @ const uint8_t *blimit, + @ const uint8_t *limit, + @ const uint8_t *thresh, + @ int count) + @ r0 uint8_t *s, + @ r1 int p, /* pitch */ + @ r2 const uint8_t *blimit, + @ r3 const uint8_t *limit, + @ sp const uint8_t *thresh, + @ r12 int count +_mb_lpf_horizontal_edge: + mb_lpf_horizontal_edge: @ + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] @ load thresh + +h_count: + vld1.8 {d16[]}, [r2] @ load *blimit + vld1.8 {d17[]}, [r3] @ load *limit + vld1.8 {d18[]}, [r4] @ load *thresh + + sub r8, r0, r1, lsl #3 @ move src pointer down by 8 lines + + vld1.u8 {d0}, [r8,:64], r1 @ p7 + vld1.u8 {d1}, [r8,:64], r1 @ p6 + vld1.u8 {d2}, [r8,:64], r1 @ p5 + vld1.u8 {d3}, [r8,:64], r1 @ p4 + vld1.u8 {d4}, [r8,:64], r1 @ p3 + vld1.u8 {d5}, [r8,:64], r1 @ p2 + vld1.u8 {d6}, [r8,:64], r1 @ p1 + vld1.u8 {d7}, [r8,:64], r1 @ p0 + vld1.u8 {d8}, [r8,:64], r1 @ q0 + vld1.u8 {d9}, [r8,:64], r1 @ q1 + vld1.u8 {d10}, [r8,:64], r1 @ q2 + vld1.u8 {d11}, [r8,:64], r1 @ q3 + vld1.u8 {d12}, [r8,:64], r1 @ q4 + vld1.u8 {d13}, [r8,:64], r1 @ q5 + vld1.u8 {d14}, [r8,:64], r1 @ q6 + vld1.u8 {d15}, [r8,:64], r1 @ q7 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + @ flat && mask were not set for any of the channels. Just store the values + @ from filter. 
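+ @ r7 bit 0 was set by vpx_wide_mbfilter_neon because no lane passed
+ @ flat && mask, so only the four pixels nearest the edge (op1, op0,
+ @ oq0, oq1) need to be written back, starting two rows above the edge.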
+ sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8,:64], r1 @ store op1 + vst1.u8 {d24}, [r8,:64], r1 @ store op0 + vst1.u8 {d23}, [r8,:64], r1 @ store oq0 + vst1.u8 {d26}, [r8,:64], r1 @ store oq1 + + b h_next + +h_mbfilter: + tst r7, #2 + beq h_wide_mbfilter + + @ flat2 was not set for any of the channels. Just store the values from + @ mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8,:64], r1 @ store op2 + vst1.u8 {d19}, [r8,:64], r1 @ store op1 + vst1.u8 {d20}, [r8,:64], r1 @ store op0 + vst1.u8 {d21}, [r8,:64], r1 @ store oq0 + vst1.u8 {d22}, [r8,:64], r1 @ store oq1 + vst1.u8 {d23}, [r8,:64], r1 @ store oq2 + + b h_next + +h_wide_mbfilter: + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8,:64], r1 @ store op6 + vst1.u8 {d24}, [r8,:64], r1 @ store op5 + vst1.u8 {d25}, [r8,:64], r1 @ store op4 + vst1.u8 {d26}, [r8,:64], r1 @ store op3 + vst1.u8 {d27}, [r8,:64], r1 @ store op2 + vst1.u8 {d18}, [r8,:64], r1 @ store op1 + vst1.u8 {d19}, [r8,:64], r1 @ store op0 + vst1.u8 {d20}, [r8,:64], r1 @ store oq0 + vst1.u8 {d21}, [r8,:64], r1 @ store oq1 + vst1.u8 {d22}, [r8,:64], r1 @ store oq2 + vst1.u8 {d23}, [r8,:64], r1 @ store oq3 + vst1.u8 {d1}, [r8,:64], r1 @ store oq4 + vst1.u8 {d2}, [r8,:64], r1 @ store oq5 + vst1.u8 {d3}, [r8,:64], r1 @ store oq6 + +h_next: + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + @ @ |mb_lpf_horizontal_edge| + + @ void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, + @ const uint8_t *blimit, + @ const uint8_t *limit, + @ const uint8_t *thresh) + @ r0 uint8_t *s, + @ r1 int pitch, + @ r2 const uint8_t *blimit, + @ r3 const uint8_t *limit, + @ sp const uint8_t *thresh +_vpx_lpf_horizontal_edge_8_neon: + vpx_lpf_horizontal_edge_8_neon: @ + mov r12, #1 + b mb_lpf_horizontal_edge + @ @ |vpx_lpf_horizontal_edge_8_neon| + + @ void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, + @ const uint8_t *blimit, + @ const uint8_t *limit, + @ const uint8_t *thresh) + @ r0 uint8_t *s, + @ r1 int pitch, + @ r2 const uint8_t *blimit, + @ r3 const uint8_t *limit, + @ sp const uint8_t *thresh +_vpx_lpf_horizontal_edge_16_neon: + vpx_lpf_horizontal_edge_16_neon: @ + mov r12, #2 + b mb_lpf_horizontal_edge + @ @ |vpx_lpf_horizontal_edge_16_neon| + + @ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, + @ const uint8_t *blimit, + @ const uint8_t *limit, + @ const uint8_t *thresh) + @ r0 uint8_t *s, + @ r1 int p, /* pitch */ + @ r2 const uint8_t *blimit, + @ r3 const uint8_t *limit, + @ sp const uint8_t *thresh, +_vpx_lpf_vertical_16_neon: + vpx_lpf_vertical_16_neon: @ + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] @ load thresh + + vld1.8 {d16[]}, [r2] @ load *blimit + vld1.8 {d17[]}, [r3] @ load *limit + vld1.8 {d18[]}, [r4] @ load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8,:64], r1 + vld1.8 {d8}, [r0,:64], r1 + vld1.8 {d1}, [r8,:64], r1 + vld1.8 {d9}, [r0,:64], r1 + vld1.8 {d2}, [r8,:64], r1 + vld1.8 {d10}, [r0,:64], r1 + vld1.8 {d3}, [r8,:64], r1 + vld1.8 {d11}, [r0,:64], r1 + vld1.8 {d4}, [r8,:64], r1 + vld1.8 {d12}, [r0,:64], r1 + vld1.8 {d5}, [r8,:64], r1 + vld1.8 {d13}, [r0,:64], r1 + vld1.8 {d6}, [r8,:64], r1 + vld1.8 {d14}, [r0,:64], r1 + vld1.8 {d7}, [r8,:64], r1 + vld1.8 {d15}, [r0,:64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 
+ vtrn.8 d14, d15 + + bl vpx_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + @ flat && mask were not set for any of the channels. Just store the values + @ from filter. + sub r8, r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + + b v_end + +v_mbfilter: + tst r7, #2 + beq v_wide_mbfilter + + @ flat2 was not set for any of the channels. Just store the values from + @ mbfilter. + sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_end + +v_wide_mbfilter: + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8,:64], r1 + vst1.8 {d20}, [r0,:64], r1 + vst1.8 {d16}, [r8,:64], r1 + vst1.8 {d21}, [r0,:64], r1 + vst1.8 {d24}, [r8,:64], r1 + vst1.8 {d22}, [r0,:64], r1 + vst1.8 {d25}, [r8,:64], r1 + vst1.8 {d23}, [r0,:64], r1 + vst1.8 {d26}, [r8,:64], r1 + vst1.8 {d1}, [r0,:64], r1 + vst1.8 {d27}, [r8,:64], r1 + vst1.8 {d2}, [r0,:64], r1 + vst1.8 {d18}, [r8,:64], r1 + vst1.8 {d3}, [r0,:64], r1 + vst1.8 {d19}, [r8,:64], r1 + vst1.8 {d15}, [r0,:64], r1 + +v_end: + vpop {d8-d15} + pop {r4-r8, pc} + + @ @ |vpx_lpf_vertical_16_neon| + + @ void vpx_wide_mbfilter_neon() @ + @ This is a helper function for the loopfilters. The invidual functions do the + @ necessary load, transpose (if necessary) and store. 
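+ @ It also reports its per-block decisions through r7: bit 0 set means no
+ @ lane passed flat && mask (only the 4-tap filter output is needed), and
+ @ bit 1 set means no lane passed flat2 (the 15-tap wide filter is skipped).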
+ @ + @ r0-r3 PRESERVE + @ d16 blimit + @ d17 limit + @ d18 thresh + @ d0 p7 + @ d1 p6 + @ d2 p5 + @ d3 p4 + @ d4 p3 + @ d5 p2 + @ d6 p1 + @ d7 p0 + @ d8 q0 + @ d9 q1 + @ d10 q2 + @ d11 q3 + @ d12 q4 + @ d13 q5 + @ d14 q6 + @ d15 q7 +_vpx_wide_mbfilter_neon: + vpx_wide_mbfilter_neon: @ + mov r7, #0 + + @ filter_mask + vabd.u8 d19, d4, d5 @ abs(p3 - p2) + vabd.u8 d20, d5, d6 @ abs(p2 - p1) + vabd.u8 d21, d6, d7 @ abs(p1 - p0) + vabd.u8 d22, d9, d8 @ abs(q1 - q0) + vabd.u8 d23, d10, d9 @ abs(q2 - q1) + vabd.u8 d24, d11, d10 @ abs(q3 - q2) + + @ only compare the largest value to limit + vmax.u8 d19, d19, d20 @ max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 @ max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 @ max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 @ abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 @ a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 @ b = abs(p0 - q0) * 2 + + @ abs () > limit + vcge.u8 d19, d17, d19 + + @ flatmask4 + vabd.u8 d25, d7, d5 @ abs(p0 - p2) + vabd.u8 d26, d8, d10 @ abs(q0 - q2) + vabd.u8 d27, d4, d7 @ abs(p3 - p0) + vabd.u8 d28, d11, d8 @ abs(q3 - q0) + + @ only compare the largest value to thresh + vmax.u8 d25, d25, d26 @ max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 @ max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 @ a = a / 2 + vqadd.u8 d24, d24, d23 @ a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 @ (a > blimit * 2 + limit) * -1 + + vcge.u8 d20, d30, d20 @ flat + + vand d19, d19, d24 @ mask + + @ hevmask + vcgt.u8 d21, d21, d18 @ (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 @ (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 @ hev + + vand d16, d20, d19 @ flat && mask + vmov r5, r6, d16 + + @ flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 @ abs(p4 - p0) + vabd.u8 d23, d12, d8 @ abs(q4 - q0) + vabd.u8 d24, d7, d2 @ abs(p0 - p5) + vabd.u8 d25, d8, d13 @ abs(q0 - q5) + vabd.u8 d26, d1, d7 @ abs(p6 - p0) + vabd.u8 d27, d14, d8 @ abs(q6 - q0) + vabd.u8 d28, d0, d7 @ abs(p7 - p0) + vabd.u8 d29, d15, d8 @ abs(q7 - q0) + + @ only compare the largest value to thresh + vmax.u8 d22, d22, d23 @ max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 @ max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 @ max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 @ max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 @ flat2 + + vmov.u8 d22, #0x80 + + orrs r5, r5, r6 @ Check for 0 + orreq r7, r7, #1 @ Only do filter branch + + vand d17, d18, d16 @ flat2 && flat && mask + vmov r5, r6, d17 + + @ mbfilter() function + + @ filter() function + @ convert to signed + veor d23, d8, d22 @ qs0 + veor d24, d7, d22 @ ps0 + veor d25, d6, d22 @ ps1 + veor d26, d9, d22 @ qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 @ ( qs0 - ps0) + vqsub.s8 d29, d25, d26 @ filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 @ 3 * ( qs0 - ps0) + vand d29, d29, d21 @ filter &= hev + vaddw.s8 q15, q15, d29 @ filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 + + @ filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 @ filter &= mask + + vqadd.s8 d30, d28, d27 @ filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 @ filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 @ filter2 >>= 3 + vshr.s8 d29, d29, #3 @ filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 @ op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 @ oq0 = clamp(qs0 - filter1) + + @ outer 
tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 @ filter &= ~hev + + vqadd.s8 d25, d25, d29 @ op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 @ oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 @ *f_op0 = u^0x80 + veor d23, d23, d22 @ *f_oq0 = u^0x80 + veor d25, d25, d22 @ *f_op1 = u^0x80 + veor d26, d26, d22 @ *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + orrs r5, r5, r6 @ Check for 0 + orreq r7, r7, #2 @ Only do mbfilter branch + + @ mbfilter flat && mask branch + @ TODO(fgalligan): Can I decrease the cycles shifting to consective d's + @ and using vibt on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 @ op2 = p0 + q0 + vmlal.u8 q15, d4, d27 @ op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 @ op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 @ op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 @ r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 @ r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 @ r_op0 + + vsubw.u8 q15, d4 @ oq0 = op0 - p3 + vsubw.u8 q15, d7 @ oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 @ r_oq0 + + vsubw.u8 q15, d5 @ oq1 = oq0 - p2 + vsubw.u8 q15, d8 @ oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 @ r_oq1 + + vsubw.u8 q15, d6 @ oq2 = oq0 - p1 + vsubw.u8 q15, d9 @ oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 @ r_oq2 + + @ Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 @ t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 @ t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 @ t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 @ t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 @ t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 @ t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 @ t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + @ wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 @ op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 @ op6 += p7 * 3 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 @ w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 @ w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 @ w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 @ w_op3 + + vaddw.u8 q15, q14, d5 @ op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 @ op2 += q4 + vbif d26, d4, d17 @ op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 @ w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 @ op1 += p1 + vaddw.u8 q15, d13 @ op1 += q5 + vbif d27, d18, d17 @ op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 @ w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 @ op0 += p0 + vaddw.u8 q15, d14 @ op0 += q6 + vbif d18, d19, d17 @ op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 @ w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 @ oq0 += q0 + vaddw.u8 q15, d15 @ oq0 += q7 + vbif d19, d20, d17 @ op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 @ w_oq0 + + vsub.i16 q15, q14 + 
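+ @ The wide filter is computed as a sliding sum in q15: each successive
+ @ output subtracts the taps that fall out of the 15-tap window, adds the
+ @ ones that enter it, then rounds and narrows with a shift by 4 (vqrshrn).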
vaddl.u8 q14, d2, d9 + vaddw.u8 q15, d9 @ oq1 += q1 + vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 @ oq1 += q7 + vbif d20, d21, d17 @ oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 @ w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 @ oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 @ w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 @ oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 @ w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 @ op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 @ w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 @ op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 @ w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 @ op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 @ oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 @ w_oq6 + vbif d1, d12, d17 @ oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 @ oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 @ oq6 |= q6 & ~(f2 & f & m) + + bx lr + @ @ |vpx_wide_mbfilter_neon| + diff --git a/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s new file mode 100644 index 000000000000..f322b698b443 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/gas_apple/save_reg_neon.s @@ -0,0 +1,46 @@ +@ This file was created from a .asm file +@ using the ads2gas_apple.pl script. + + .set WIDE_REFERENCE, 0 + .set ARCHITECTURE, 5 + .set DO1STROUNDING, 0 + @ + @ Copyright (c) 2010 The WebM project authors. All Rights Reserved. + @ + @ Use of this source code is governed by a BSD-style license + @ that can be found in the LICENSE file in the root of the source + @ tree. An additional intellectual property rights grant can be found + @ in the file PATENTS. All contributing project authors may + @ be found in the AUTHORS file in the root of the source tree. + @ + + + .globl _vpx_push_neon + .globl vpx_push_neon + .globl _vpx_pop_neon + .globl vpx_pop_neon + + @ ARM + @ + @ PRESERVE8 + +.text +.p2align 2 + +_vpx_push_neon: + vpx_push_neon: @ + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + @ + +_vpx_pop_neon: + vpx_pop_neon: @ + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + @ + + diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c new file mode 100644 index 000000000000..f734e4802794 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct16x16_1_add_neon(
+        int16_t *input,
+        uint8_t *dest,
+        int dest_stride) {
+    uint8x8_t d2u8, d3u8, d30u8, d31u8;
+    uint64x1_t d2u64, d3u64, d4u64, d5u64;
+    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+    int16x8_t q0s16;
+    uint8_t *d1, *d2;
+    int16_t i, j, a1, cospi_16_64 = 11585;
+    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+    out = dct_const_round_shift(out * cospi_16_64);
+    a1 = ROUND_POWER_OF_TWO(out, 6);
+
+    q0s16 = vdupq_n_s16(a1);
+    q0u16 = vreinterpretq_u16_s16(q0s16);
+
+    for (d1 = d2 = dest, i = 0; i < 4; i++) {
+        for (j = 0; j < 2; j++) {
+            d2u64 = vld1_u64((const uint64_t *)d1);
+            d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+            d1 += dest_stride;
+            d4u64 = vld1_u64((const uint64_t *)d1);
+            d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+            d1 += dest_stride;
+
+            q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+            q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+            q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+            q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+            d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+            d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+            d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+            d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+            d2 += dest_stride;
+            vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+            vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+            d2 += dest_stride;
+        }
+    }
+    return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
new file mode 100644
index 000000000000..651ebb21f996
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -0,0 +1,1317 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
+    *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16 = q0x2s16.val[0];
+    *q9s16 = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
+void vpx_idct16x16_256_add_neon_pass1(
+        int16_t *in,
+        int16_t *out,
+        int output_stride) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+    uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+    int16x8x2_t q0x2s16;
+
+    q0x2s16 = vld2q_s16(in);
+    q8s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q9s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 = vld2q_s16(in);
+    q10s16 = q0x2s16.val[0];
+    in += 16;
+    q0x2s16 =
vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d18s16, d1s16); + q6s32 = vmull_s16(d19s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlal_s16(q5s32, d30s16, d0s16); + q6s32 = vmlal_s16(q6s32, d31s16, d0s16); + + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q5s32, 14); + d15s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + q2s32 = vmull_s16(d26s16, d2s16); + q3s32 = vmull_s16(d27s16, d2s16); + q9s32 = vmull_s16(d26s16, d3s16); + q15s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q15s32 = vmlal_s16(q15s32, d23s16, d2s16); + + d10s16 = vqrshrn_n_s32(q2s32, 14); + d11s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q15s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + d30s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d30s16); + q11s32 = vmull_s16(d17s16, d30s16); + q0s32 = vmull_s16(d24s16, d30s16); + q1s32 = vmull_s16(d25s16, d30s16); + + d30s16 = vdup_n_s16(cospi_24_64); + d31s16 = vdup_n_s16(cospi_8_64); + + q3s32 = vaddq_s32(q2s32, q0s32); + q12s32 = vaddq_s32(q11s32, q1s32); + q13s32 = vsubq_s32(q2s32, q0s32); + q1s32 = vsubq_s32(q11s32, q1s32); + + d16s16 = vqrshrn_n_s32(q3s32, 14); + d17s16 = vqrshrn_n_s32(q12s32, 14); + d18s16 = vqrshrn_n_s32(q13s32, 14); + d19s16 = vqrshrn_n_s32(q1s32, 14); + q8s16 = vcombine_s16(d16s16, d17s16); + q9s16 = vcombine_s16(d18s16, d19s16); + + q0s32 = vmull_s16(d20s16, d31s16); + q1s32 = vmull_s16(d21s16, d31s16); + q12s32 = vmull_s16(d20s16, d30s16); + q13s32 = vmull_s16(d21s16, d30s16); + + q0s32 = vmlal_s16(q0s32, d28s16, d30s16); + q1s32 = vmlal_s16(q1s32, d29s16, d30s16); + q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); + + d22s16 = vqrshrn_n_s32(q0s32, 14); + d23s16 = vqrshrn_n_s32(q1s32, 14); + d20s16 = vqrshrn_n_s32(q12s32, 14); + d21s16 = vqrshrn_n_s32(q13s32, 14); + q10s16 = vcombine_s16(d20s16, d21s16); + q11s16 = vcombine_s16(d22s16, d23s16); + + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q15s16 = vaddq_s16(q6s16, q7s16); + 
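+    // End of stage 4: the q4/q15 sums pass straight to stage 6, while the
+    // q13/q14 differences are split into d halves below and rotated by
+    // cospi_16_64 in stage 5.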
d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + // stage 5 + q0s16 = vaddq_s16(q8s16, q11s16); + q1s16 = vaddq_s16(q9s16, q10s16); + q2s16 = vsubq_s16(q9s16, q10s16); + q3s16 = vsubq_s16(q8s16, q11s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q11s32 = vmull_s16(d26s16, d16s16); + q12s32 = vmull_s16(d27s16, d16s16); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + + q6s32 = vsubq_s32(q9s32, q11s32); + q13s32 = vsubq_s32(q10s32, q12s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d10s16 = vqrshrn_n_s32(q6s32, 14); + d11s16 = vqrshrn_n_s32(q13s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q8s16 = vaddq_s16(q0s16, q15s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d16u64); + out += output_stride; + vst1_u64((uint64_t *)out, d17u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vpx_idct16x16_256_add_neon_pass2( + int16_t *src, + int16_t *out, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride) { + uint8_t *d; + uint8x8_t d12u8, d13u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, 
d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d24u64, d25u64, d26u64, d27u64; + int64x1_t d12s64, d13s64; + uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; + uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d12s16 = vdup_n_s16(cospi_30_64); + d13s16 = vdup_n_s16(cospi_2_64); + + q2s32 = vmull_s16(d16s16, d12s16); + q3s32 = vmull_s16(d17s16, d12s16); + q1s32 = vmull_s16(d16s16, d13s16); + q4s32 = vmull_s16(d17s16, d13s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); + q1s32 = vmlal_s16(q1s32, d30s16, d12s16); + q4s32 = vmlal_s16(q4s32, d31s16, d12s16); + + d0s16 = vqrshrn_n_s32(q2s32, 14); + d1s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q1s32, 14); + d15s16 = vqrshrn_n_s32(q4s32, 14); + q0s16 = vcombine_s16(d0s16, d1s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d30s16 = vdup_n_s16(cospi_14_64); + d31s16 = vdup_n_s16(cospi_18_64); + + q2s32 = vmull_s16(d24s16, d30s16); + q3s32 = vmull_s16(d25s16, d30s16); + q4s32 = vmull_s16(d24s16, d31s16); + q5s32 = vmull_s16(d25s16, d31s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); + q4s32 = vmlal_s16(q4s32, d22s16, d30s16); + q5s32 = vmlal_s16(q5s32, d23s16, d30s16); + + d2s16 = vqrshrn_n_s32(q2s32, 14); + d3s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q4s32, 14); + d13s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(cospi_22_64); + d31s16 = vdup_n_s16(cospi_10_64); + + q11s32 = vmull_s16(d20s16, d30s16); + q12s32 = vmull_s16(d21s16, d30s16); + q4s32 = vmull_s16(d20s16, d31s16); + q5s32 = vmull_s16(d21s16, d31s16); + + q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); + q4s32 = vmlal_s16(q4s32, d26s16, d30s16); + q5s32 = vmlal_s16(q5s32, d27s16, d30s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d11s16 = vqrshrn_n_s32(q5s32, 14); + d10s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, 
d11s16); + + d30s16 = vdup_n_s16(cospi_6_64); + d31s16 = vdup_n_s16(cospi_26_64); + + q10s32 = vmull_s16(d28s16, d30s16); + q11s32 = vmull_s16(d29s16, d30s16); + q12s32 = vmull_s16(d28s16, d31s16); + q13s32 = vmull_s16(d29s16, d31s16); + + q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); + q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); + q12s32 = vmlal_s16(q12s32, d18s16, d30s16); + q13s32 = vmlal_s16(q13s32, d19s16, d30s16); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q11s32, 14); + d8s16 = vqrshrn_n_s32(q12s32, 14); + d9s16 = vqrshrn_n_s32(q13s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 3 + q9s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q10s16 = vsubq_s16(q3s16, q2s16); + q11s16 = vaddq_s16(q2s16, q3s16); + q12s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q6s16, q7s16); + + // stage 4 + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q2s32 = vmull_s16(d18s16, d31s16); + q3s32 = vmull_s16(d19s16, d31s16); + q4s32 = vmull_s16(d28s16, d31s16); + q5s32 = vmull_s16(d29s16, d31s16); + + q2s32 = vmlal_s16(q2s32, d28s16, d30s16); + q3s32 = vmlal_s16(q3s32, d29s16, d30s16); + q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); + + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q3s32, 14); + d2s16 = vqrshrn_n_s32(q4s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + q3s16 = q11s16; + q4s16 = q12s16; + + d30s16 = vdup_n_s16(-cospi_8_64); + q11s32 = vmull_s16(d26s16, d30s16); + q12s32 = vmull_s16(d27s16, d30s16); + q8s32 = vmull_s16(d20s16, d30s16); + q9s32 = vmull_s16(d21s16, d30s16); + + q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); + q8s32 = vmlal_s16(q8s32, d26s16, d31s16); + q9s32 = vmlal_s16(q9s32, d27s16, d31s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q10s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q10s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 
14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + if (skip_adding != 0) { + d = dest; + // load the data in pass1 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, 
q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), + vreinterpret_u8_s64(d12s64)); + q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), + vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + // store the data out 8,9,10,11,12,13,14,15 + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q8s16 = vrshrq_n_s16(q8s16, 6); + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q9s16 = vrshrq_n_s16(q9s16, 6); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q2s16 = vrshrq_n_s16(q2s16, 6); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q3s16 = vrshrq_n_s16(q3s16, 6); + q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q4s16 = vrshrq_n_s16(q4s16, 6); + q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q5s16 = vrshrq_n_s16(q5s16, 6); + q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q14s16 = vrshrq_n_s16(q14s16, 6); + q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + q15s16 = vrshrq_n_s16(q15s16, 6); + q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), + vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + } else { // skip_adding_dest + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 
4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); + out += 4; + vst1_u64((uint64_t 
*)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + } + return; +} + +void vpx_idct16x16_10_add_neon_pass1( + int16_t *in, + int16_t *out, + int output_stride) { + int16x4_t d4s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // stage 3 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + // stage 4 + q1s16 = vdupq_n_s16(cospi_16_64 * 2); + d4s16 = vdup_n_s16(cospi_16_64); + + q8s16 = vqrdmulhq_s16(q8s16, q1s16); + + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + q9s32 = vmull_s16(d14s16, d4s16); + q10s32 = vmull_s16(d15s16, d4s16); + q12s32 = vmull_s16(d9s16, d4s16); + q11s32 = vmull_s16(d8s16, d4s16); + + q15s32 = vsubq_s32(q10s32, q12s32); + q6s32 = vsubq_s32(q9s32, q11s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d11s16 = vqrshrn_n_s32(q15s32, 14); + d10s16 = vqrshrn_n_s32(q6s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q2s16 = vaddq_s16(q8s16, q7s16); + q9s16 = vaddq_s16(q8s16, q6s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q8s16, q4s16); + q12s16 = vsubq_s16(q8s16, q4s16); + q13s16 = vsubq_s16(q8s16, q5s16); + q14s16 = vsubq_s16(q8s16, q6s16); + q15s16 = vsubq_s16(q8s16, q7s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d4u64); + out += output_stride; + vst1_u64((uint64_t 
*)out, d5u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vpx_idct16x16_10_add_neon_pass2( + int16_t *src, + int16_t *out, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; + uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; + uint64x1_t d16u64, d17u64, d18u64, d19u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + (void)skip_adding; + (void)dest; + (void)dest_stride; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // stage 3 + q6s16 = vdupq_n_s16(cospi_30_64 * 2); + q0s16 = vqrdmulhq_s16(q8s16, q6s16); + q6s16 = vdupq_n_s16(cospi_2_64 * 2); + q7s16 = vqrdmulhq_s16(q8s16, q6s16); + + q15s16 = vdupq_n_s16(-cospi_26_64 * 2); + q14s16 = vdupq_n_s16(cospi_6_64 * 2); + q3s16 = vqrdmulhq_s16(q9s16, q15s16); + q4s16 = vqrdmulhq_s16(q9s16, q14s16); + + // stage 4 + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d6s16 = vget_low_s16(q3s16); + d7s16 = vget_high_s16(q3s16); + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q12s32 = vmull_s16(d14s16, d31s16); + q5s32 = vmull_s16(d15s16, d31s16); + q2s32 = vmull_s16(d0s16, d31s16); + q11s32 = vmull_s16(d1s16, d31s16); + + q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); + q2s32 = vmlal_s16(q2s32, d14s16, d30s16); + q11s32 = vmlal_s16(q11s32, d15s16, d30s16); + + d2s16 = vqrshrn_n_s32(q12s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + d12s16 = vqrshrn_n_s32(q2s32, 
14); + d13s16 = vqrshrn_n_s32(q11s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(-cospi_8_64); + q10s32 = vmull_s16(d8s16, d30s16); + q13s32 = vmull_s16(d9s16, d30s16); + q8s32 = vmull_s16(d6s16, d30s16); + q9s32 = vmull_s16(d7s16, d30s16); + + q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); + q8s32 = vmlal_s16(q8s32, d8s16, d31s16); + q9s32 = vmlal_s16(q9s32, d9s16, d31s16); + + d4s16 = vqrshrn_n_s32(q10s32, 14); + d5s16 = vqrshrn_n_s32(q13s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q0s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q0s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); 
+ out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); + d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); + d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); + d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); + d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); + d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + vst1_u64((uint64_t *)out, d16u64); + out += 4; + vst1_u64((uint64_t *)out, d17u64); + out += 12; + vst1_u64((uint64_t *)out, d18u64); + out += 4; + vst1_u64((uint64_t *)out, d19u64); + out += 12; + vst1_u64((uint64_t *)out, d4u64); + out += 4; + vst1_u64((uint64_t *)out, d5u64); + out += 12; + vst1_u64((uint64_t *)out, d6u64); + out += 4; + vst1_u64((uint64_t *)out, d7u64); + out += 12; + vst1_u64((uint64_t *)out, d8u64); + out += 4; + vst1_u64((uint64_t *)out, d9u64); + out += 12; + vst1_u64((uint64_t *)out, d10u64); + out += 4; + vst1_u64((uint64_t *)out, d11u64); + out += 12; + vst1_u64((uint64_t *)out, d28u64); + out += 4; + vst1_u64((uint64_t *)out, d29u64); + out += 12; + vst1_u64((uint64_t *)out, d30u64); + out += 4; + vst1_u64((uint64_t *)out, d31u64); + return; +} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c new file mode 100644 index 000000000000..352979aa16f7 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/idct16x16_neon.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/vpx_dsp_common.h" + +void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); +void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); + +#if HAVE_NEON_ASM +/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ +extern void vpx_push_neon(int64_t *store); +extern void vpx_pop_neon(int64_t *store); +#endif // HAVE_NEON_ASM + +void vpx_idct16x16_256_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { +#if HAVE_NEON_ASM + int64_t store_reg[8]; +#endif + int16_t pass1_output[16*16] = {0}; + int16_t row_idct_output[16*16] = {0}; + +#if HAVE_NEON_ASM + // save d8-d15 register values. + vpx_push_neon(store_reg); +#endif + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vpx_idct16x16_256_add_neon_pass2(input+1, + row_idct_output, + pass1_output, + 0, + dest, + dest_stride); + + /* Parallel idct on the lower 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vpx_idct16x16_256_add_neon_pass2(input+8*16+1, + row_idct_output+8, + pass1_output, + 0, + dest, + dest_stride); + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, + row_idct_output, + pass1_output, + 1, + dest, + dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. 
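+ // dest+8 points the adds at the right half of the 16-pixel-wide block.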
+ vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, + row_idct_output+8, + pass1_output, + 1, + dest+8, + dest_stride); + +#if HAVE_NEON_ASM + // restore d8-d15 register values. + vpx_pop_neon(store_reg); +#endif + + return; +} + +void vpx_idct16x16_10_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { +#if HAVE_NEON_ASM + int64_t store_reg[8]; +#endif + int16_t pass1_output[16*16] = {0}; + int16_t row_idct_output[16*16] = {0}; + +#if HAVE_NEON_ASM + // save d8-d15 register values. + vpx_push_neon(store_reg); +#endif + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vpx_idct16x16_10_add_neon_pass2(input+1, + row_idct_output, + pass1_output, + 0, + dest, + dest_stride); + + /* Skip Parallel idct on the lower 8 rows as they are all 0s */ + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, + row_idct_output, + pass1_output, + 1, + dest, + dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, + row_idct_output+8, + pass1_output, + 1, + dest+8, + dest_stride); + +#if HAVE_NEON_ASM + // restore d8-d15 register values. + vpx_pop_neon(store_reg); +#endif + + return; +} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c new file mode 100644 index 000000000000..c25c0c4a5c20 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" + +#include "vpx_dsp/inv_txfm.h" +#include "vpx_ports/mem.h" + +static INLINE void LD_16x8( + uint8_t *d, + int d_stride, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vld1q_u8(d); + d += d_stride; + *q9u8 = vld1q_u8(d); + d += d_stride; + *q10u8 = vld1q_u8(d); + d += d_stride; + *q11u8 = vld1q_u8(d); + d += d_stride; + *q12u8 = vld1q_u8(d); + d += d_stride; + *q13u8 = vld1q_u8(d); + d += d_stride; + *q14u8 = vld1q_u8(d); + d += d_stride; + *q15u8 = vld1q_u8(d); + return; +} + +static INLINE void ADD_DIFF_16x8( + uint8x16_t qdiffu8, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqaddq_u8(*q8u8, qdiffu8); + *q9u8 = vqaddq_u8(*q9u8, qdiffu8); + *q10u8 = vqaddq_u8(*q10u8, qdiffu8); + *q11u8 = vqaddq_u8(*q11u8, qdiffu8); + *q12u8 = vqaddq_u8(*q12u8, qdiffu8); + *q13u8 = vqaddq_u8(*q13u8, qdiffu8); + *q14u8 = vqaddq_u8(*q14u8, qdiffu8); + *q15u8 = vqaddq_u8(*q15u8, qdiffu8); + return; +} + +static INLINE void SUB_DIFF_16x8( + uint8x16_t qdiffu8, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqsubq_u8(*q8u8, qdiffu8); + *q9u8 = vqsubq_u8(*q9u8, qdiffu8); + *q10u8 = vqsubq_u8(*q10u8, qdiffu8); + *q11u8 = vqsubq_u8(*q11u8, qdiffu8); + *q12u8 = vqsubq_u8(*q12u8, qdiffu8); + *q13u8 = vqsubq_u8(*q13u8, qdiffu8); + *q14u8 = vqsubq_u8(*q14u8, qdiffu8); + *q15u8 = vqsubq_u8(*q15u8, qdiffu8); + return; +} + +static INLINE void ST_16x8( + uint8_t *d, + int d_stride, + uint8x16_t *q8u8, + uint8x16_t *q9u8, + uint8x16_t *q10u8, + uint8x16_t *q11u8, + uint8x16_t *q12u8, + uint8x16_t *q13u8, + uint8x16_t *q14u8, + uint8x16_t *q15u8) { + vst1q_u8(d, *q8u8); + d += d_stride; + vst1q_u8(d, *q9u8); + d += d_stride; + vst1q_u8(d, *q10u8); + d += d_stride; + vst1q_u8(d, *q11u8); + d += d_stride; + vst1q_u8(d, *q12u8); + d += d_stride; + vst1q_u8(d, *q13u8); + d += d_stride; + vst1q_u8(d, *q14u8); + d += d_stride; + vst1q_u8(d, *q15u8); + return; +} + +void vpx_idct32x32_1_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int i, j, dest_stride8; + uint8_t *d; + int16_t a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + dest_stride8 = dest_stride * 8; + if (a1 >= 0) { // diff_positive_32_32 + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + d += dest_stride8; + } + } + } else { // diff_negative_32_32 + a1 = -a1; + a1 = a1 < 0 ? 0 : a1 > 255 ?
255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, + &q12u8, &q13u8, &q14u8, &q15u8); + d += dest_stride8; + } + } + } + return; +} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c new file mode 100644 index 000000000000..025437eb963b --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/idct32x32_add_neon.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "vpx_dsp/txfm_common.h" + +#define LOAD_FROM_TRANSPOSED(prev, first, second) \ + q14s16 = vld1q_s16(trans_buf + first * 8); \ + q13s16 = vld1q_s16(trans_buf + second * 8); + +#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ + qA = vld1q_s16(out + first * 32); \ + qB = vld1q_s16(out + second * 32); + +#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ + vst1q_s16(out + first * 32, qA); \ + vst1q_s16(out + second * 32, qB); + +#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ + __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ + q6s16, q7s16, q8s16, q9s16); +static INLINE void __STORE_COMBINE_CENTER_RESULTS( + uint8_t *p1, + uint8_t *p2, + int stride, + int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16) { + int16x4_t d8s16, d9s16, d10s16, d11s16; + + d8s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d11s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d9s16 = vld1_s16((int16_t *)p1); + d10s16 = vld1_s16((int16_t *)p2); + + q7s16 = vrshrq_n_s16(q7s16, 6); + q8s16 = vrshrq_n_s16(q8s16, 6); + q9s16 = vrshrq_n_s16(q9s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + + q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), + vreinterpret_u8_s16(d9s16))); + q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_s16(d10s16))); + q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_s16(d11s16))); + q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), + vreinterpret_u8_s16(d8s16))); + + d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); + d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); + d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + + vst1_s16((int16_t *)p1, d9s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d10s16); + p2 += stride; + vst1_s16((int16_t *)p1, d8s16); + vst1_s16((int16_t *)p2, d11s16); + return; +} + +#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ + __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ + q4s16, q5s16, q6s16, q7s16); +static INLINE void __STORE_COMBINE_EXTREME_RESULTS( + uint8_t *p1, + uint8_t *p2, + int stride, + int16x8_t q4s16, + int16x8_t q5s16, + int16x8_t q6s16, + int16x8_t q7s16) { + int16x4_t d4s16, d5s16, d6s16, d7s16; + + d4s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d7s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d5s16
= vld1_s16((int16_t *)p1); + d6s16 = vld1_s16((int16_t *)p2); + + q5s16 = vrshrq_n_s16(q5s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + q7s16 = vrshrq_n_s16(q7s16, 6); + q4s16 = vrshrq_n_s16(q4s16, 6); + + q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), + vreinterpret_u8_s16(d5s16))); + q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), + vreinterpret_u8_s16(d6s16))); + q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), + vreinterpret_u8_s16(d7s16))); + q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), + vreinterpret_u8_s16(d4s16))); + + d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); + d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + + vst1_s16((int16_t *)p1, d5s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d6s16); + p2 += stride; + vst1_s16((int16_t *)p2, d7s16); + vst1_s16((int16_t *)p1, d4s16); + return; +} + +#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ + DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); +static INLINE void DO_BUTTERFLY( + int16x8_t q14s16, + int16x8_t q13s16, + int16_t first_const, + int16_t second_const, + int16x8_t *qAs16, + int16x8_t *qBs16) { + int16x4_t d30s16, d31s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; + int16x4_t dCs16, dDs16, dAs16, dBs16; + + dCs16 = vget_low_s16(q14s16); + dDs16 = vget_high_s16(q14s16); + dAs16 = vget_low_s16(q13s16); + dBs16 = vget_high_s16(q13s16); + + d30s16 = vdup_n_s16(first_const); + d31s16 = vdup_n_s16(second_const); + + q8s32 = vmull_s16(dCs16, d30s16); + q10s32 = vmull_s16(dAs16, d31s16); + q9s32 = vmull_s16(dDs16, d30s16); + q11s32 = vmull_s16(dBs16, d31s16); + q12s32 = vmull_s16(dCs16, d31s16); + + q8s32 = vsubq_s32(q8s32, q10s32); + q9s32 = vsubq_s32(q9s32, q11s32); + + q10s32 = vmull_s16(dDs16, d31s16); + q11s32 = vmull_s16(dAs16, d30s16); + q15s32 = vmull_s16(dBs16, d30s16); + + q11s32 = vaddq_s32(q12s32, q11s32); + q10s32 = vaddq_s32(q10s32, q15s32); + + *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), + vqrshrn_n_s32(q9s32, 14)); + *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), + vqrshrn_n_s32(q10s32, 14)); + return; +} + +static INLINE void idct32_transpose_pair( + int16_t *input, + int16_t *t_buf) { + int16_t *in; + int i; + const int stride = 32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + for (i = 0; i < 4; i++, input += 8) { + in = input; + q8s16 = vld1q_s16(in); + in += stride; + q9s16 = vld1q_s16(in); + in += stride; + q10s16 = vld1q_s16(in); + in += stride; + q11s16 = vld1q_s16(in); + in += stride; + q12s16 = vld1q_s16(in); + in += stride; + q13s16 = vld1q_s16(in); + in += stride; + q14s16 = vld1q_s16(in); + in += stride; + q15s16 = vld1q_s16(in); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = 
vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + q12s16 = vcombine_s16(d17s16, d25s16); + q13s16 = vcombine_s16(d19s16, d27s16); + q14s16 = vcombine_s16(d21s16, d29s16); + q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), + vreinterpretq_s32_s16(q10s16)); + q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16), + vreinterpretq_s32_s16(q11s16)); + q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16), + vreinterpretq_s32_s16(q14s16)); + q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16), + vreinterpretq_s32_s16(q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + vst1q_s16(t_buf, q0x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q0x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[1]); + t_buf += 8; + } + return; +} + +static INLINE void idct32_bands_end_1st_pass( + int16_t *out, + int16x8_t q2s16, + int16x8_t q3s16, + int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16, + int16x8_t q10s16, + int16x8_t q11s16, + int16x8_t q12s16, + int16x8_t q13s16, + int16x8_t q14s16, + int16x8_t q15s16) { + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); + STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); + + LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); + STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); + + LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); + STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); + + LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); + STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); + + LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(21, 20, 21, 
q6s16, q7s16); + STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); + + LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); + STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); + + LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); + STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); + + LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); + STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); + return; +} + +static INLINE void idct32_bands_end_2nd_pass( + int16_t *out, + uint8_t *dest, + int stride, + int16x8_t q2s16, + int16x8_t q3s16, + int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16, + int16x8_t q10s16, + int16x8_t q11s16, + int16x8_t q12s16, + int16x8_t q13s16, + int16x8_t q14s16, + int16x8_t q15s16) { + uint8_t *r6 = dest + 31 * stride; + uint8_t *r7 = dest/* + 0 * stride*/; + uint8_t *r9 = dest + 15 * stride; + uint8_t *r10 = dest + 16 * stride; + int str2 = stride << 1; + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; r9 -= str2; + + LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; r6 -= str2; + + LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; r9 -= str2; + + LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; r6 -= str2; + + LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; r9 -= str2; + + LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; r6 -= str2; + + LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + 
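// fold rows 22/23 of the intermediate buffer into the terms derived from + // rows 8/9 above, then write the combined centre rows of the destination. + 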
LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + + LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + return; +} + +void vpx_idct32x32_1024_add_neon( + int16_t *input, + uint8_t *dest, + int stride) { + int i, idct32_pass_loop; + int16_t trans_buf[32 * 8]; + int16_t pass1[32 * 32]; + int16_t pass2[32 * 32]; + int16_t *out; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + + for (idct32_pass_loop = 0, out = pass1; + idct32_pass_loop < 2; + idct32_pass_loop++, + input = pass1, // the input of pass2 is the result of pass1 + out = pass2) { + for (i = 0; + i < 4; i++, + input += 32 * 8, out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + LOAD_FROM_TRANSPOSED(0, 1, 31) + DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(31, 17, 15) + DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + // part of stage 2 + q4s16 = vaddq_s16(q0s16, q1s16); + q13s16 = vsubq_s16(q0s16, q1s16); + q6s16 = vaddq_s16(q2s16, q3s16); + q14s16 = vsubq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + + // generate 18,19,28,29 + // part of stage 1 + LOAD_FROM_TRANSPOSED(15, 9, 23) + DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(23, 25, 7) + DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q3s16, q2s16); + q3s16 = vaddq_s16(q3s16, q2s16); + q14s16 = vsubq_s16(q1s16, q0s16); + q2s16 = vaddq_s16(q1s16, q0s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + // part of stage 4 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q10s16 = vaddq_s16(q7s16, q1s16); + q15s16 = vaddq_s16(q6s16, q3s16); + q13s16 = vsubq_s16(q5s16, q0s16); + q14s16 = vsubq_s16(q7s16, q1s16); + STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) + STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) + STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + // part of stage 4 + q13s16 = vsubq_s16(q4s16, q2s16); + q14s16 = vsubq_s16(q6s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) + STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + LOAD_FROM_TRANSPOSED(7, 5, 27) + DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(27, 21, 11) + DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + + // generate 22,23,24,25 + // part of stage 1 + LOAD_FROM_TRANSPOSED(11, 13, 19) + 
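// DO_BUTTERFLY_STD computes a saturating Q14 rotation of the loaded pair: + // qA = (q14*c1 - q13*c2 + (1 << 13)) >> 14 and + // qB = (q14*c2 + q13*c1 + (1 << 13)) >> 14. + 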
DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(19, 29, 3) + DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + // part of stage 2 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + // part of stage 4 + q10s16 = vaddq_s16(q7s16, q1s16); + q11s16 = vaddq_s16(q5s16, q0s16); + q12s16 = vaddq_s16(q6s16, q2s16); + q15s16 = vaddq_s16(q4s16, q3s16); + // part of stage 6 + LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q11s16); + q9s16 = vaddq_s16(q13s16, q10s16); + q13s16 = vsubq_s16(q13s16, q10s16); + q11s16 = vsubq_s16(q14s16, q11s16); + STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) + LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) + q8s16 = vsubq_s16(q9s16, q12s16); + q10s16 = vaddq_s16(q14s16, q15s16); + q14s16 = vsubq_s16(q14s16, q15s16); + q12s16 = vaddq_s16(q9s16, q12s16); + STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) + q13s16 = q11s16; + q14s16 = q8s16; + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + // part of stage 4 + q14s16 = vsubq_s16(q5s16, q0s16); + q13s16 = vsubq_s16(q6s16, q2s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); + q14s16 = vsubq_s16(q7s16, q1s16); + q13s16 = vsubq_s16(q4s16, q3s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + // part of stage 6 + LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q1s16); + q9s16 = vaddq_s16(q13s16, q6s16); + q13s16 = vsubq_s16(q13s16, q6s16); + q1s16 = vsubq_s16(q14s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) + LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) + q14s16 = vsubq_s16(q8s16, q5s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q9s16, q0s16); + q0s16 = vsubq_s16(q9s16, q0s16); + STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) + DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, + &q1s16, &q0s16); + STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + LOAD_FROM_TRANSPOSED(3, 2, 30) + DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(30, 18, 14) + DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + // part of stage 3 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 4 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + + // generate 10,11,12,13 + // part of stage 2 + LOAD_FROM_TRANSPOSED(14, 10, 22) + DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(22, 26, 6) + DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + // part of stage 3 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 4 + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + // part of stage 5 + q8s16 = vaddq_s16(q0s16, q5s16); + q9s16 = vaddq_s16(q1s16, q7s16); + 
q13s16 = vsubq_s16(q1s16, q7s16); + q14s16 = vsubq_s16(q3s16, q4s16); + q10s16 = vaddq_s16(q3s16, q4s16); + q15s16 = vaddq_s16(q2s16, q6s16); + STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) + STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + // part of stage 6 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) + q13s16 = vsubq_s16(q0s16, q5s16); + q14s16 = vsubq_s16(q2s16, q6s16); + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + LOAD_FROM_TRANSPOSED(6, 4, 28) + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(28, 20, 12) + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + // part of stage 4 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + + // generate 0,1,2,3 + // part of stage 4 + LOAD_FROM_TRANSPOSED(12, 0, 16) + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(16, 8, 24) + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + // part of stage 5 + q4s16 = vaddq_s16(q7s16, q6s16); + q7s16 = vsubq_s16(q7s16, q6s16); + q6s16 = vsubq_s16(q5s16, q14s16); + q5s16 = vaddq_s16(q5s16, q14s16); + // part of stage 6 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q3s16); + q10s16 = vaddq_s16(q6s16, q1s16); + q11s16 = vaddq_s16(q7s16, q0s16); + q12s16 = vsubq_s16(q7s16, q0s16); + q13s16 = vsubq_s16(q6s16, q1s16); + q14s16 = vsubq_s16(q5s16, q3s16); + q15s16 = vsubq_s16(q4s16, q2s16); + // part of stage 7 + LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) + q2s16 = vaddq_s16(q8s16, q1s16); + q3s16 = vaddq_s16(q9s16, q0s16); + q4s16 = vsubq_s16(q9s16, q0s16); + q5s16 = vsubq_s16(q8s16, q1s16); + LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, + q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); + } else { + idct32_bands_end_2nd_pass(out, dest, stride, + q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); + dest += 8; + } + } + } + return; +} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c new file mode 100644 index 000000000000..ea618700c954 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct4x4_1_add_neon(
+    int16_t *input,
+    uint8_t *dest,
+    int dest_stride) {
+  uint8x8_t d6u8;
+  uint32x2_t d2u32 = vdup_n_u32(0);
+  uint16x8_t q8u16;
+  int16x8_t q0s16;
+  uint8_t *d1, *d2;
+  int16_t i, a1, cospi_16_64 = 11585;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  q0s16 = vdupq_n_s16(a1);
+
+  // dc_only_idct_add
+  d1 = d2 = dest;
+  for (i = 0; i < 2; i++) {
+    d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+    d1 += dest_stride;
+    d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+    d1 += dest_stride;
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
+                     vreinterpret_u8_u32(d2u32));
+    d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+
+    vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+    d2 += dest_stride;
+    vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+    d2 += dest_stride;
+  }
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
new file mode 100644
index 000000000000..3c975c99b771
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vpx_idct4x4_16_add_neon(
+    int16_t *input,
+    uint8_t *dest,
+    int dest_stride) {
+  uint8x8_t d26u8, d27u8;
+  uint32x2_t d26u32, d27u32;
+  uint16x8_t q8u16, q9u16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+  int16x8_t q8s16, q9s16, q13s16, q14s16;
+  int32x4_t q1s32, q13s32, q14s32, q15s32;
+  int16x4x2_t d0x2s16, d1x2s16;
+  int32x4x2_t q0x2s32;
+  uint8_t *d;
+  int16_t cospi_8_64 = 15137;
+  int16_t cospi_16_64 = 11585;
+  int16_t cospi_24_64 = 6270;
+
+  d26u32 = d27u32 = vdup_n_u32(0);
+
+  q8s16 = vld1q_s16(input);
+  q9s16 = vld1q_s16(input + 8);
+
+  d16s16 = vget_low_s16(q8s16);
+  d17s16 = vget_high_s16(q8s16);
+  d18s16 = vget_low_s16(q9s16);
+  d19s16 = vget_high_s16(q9s16);
+
+  d0x2s16 = vtrn_s16(d16s16, d17s16);
+  d1x2s16 = vtrn_s16(d18s16, d19s16);
+  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+  d20s16 = vdup_n_s16(cospi_8_64);
+  d21s16 = vdup_n_s16(cospi_16_64);
+
+  q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+                      vreinterpretq_s32_s16(q9s16));
+  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+  d22s16 = vdup_n_s16(cospi_24_64);
+
+  // stage 1
+  d23s16 = vadd_s16(d16s16, d18s16);
+  d24s16 = vsub_s16(d16s16, d18s16);
+
+  q15s32 = vmull_s16(d17s16, d22s16);
+  q1s32 = vmull_s16(d17s16, d20s16);
+  q13s32 = vmull_s16(d23s16, d21s16);
+  q14s32 = vmull_s16(d24s16, d21s16);
+
+  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+  d26s16 = vqrshrn_n_s32(q13s32, 14);
+  d27s16 = vqrshrn_n_s32(q14s32, 14);
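+  // vqrshrn_n_s32(x, 14) rounds, saturates and narrows the 32-bit Q14
+  // products back to int16; it is the vector form of
+  // dct_const_round_shift(), i.e. (x + (1 << 13)) >> 14 with
+  // DCT_CONST_BITS = 14.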
d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_high_s16(q9s16); // vswp d18 d19 + d19s16 = vget_low_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), + vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + // do the transform on columns + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + q8s16 = vrshrq_n_s16(q8s16, 4); + q9s16 = vrshrq_n_s16(q9s16, 4); + + d = dest; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); + d += dest_stride; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u32(d26u32)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u32(d27u32)); + + d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + + d = dest; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); + return; +} diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c new file mode 100644 index 000000000000..c1b801fad543 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_ports/mem.h"
+
+void vpx_idct8x8_1_add_neon(
+    int16_t *input,
+    uint8_t *dest,
+    int dest_stride) {
+  uint8x8_t d2u8, d3u8, d30u8, d31u8;
+  uint64x1_t d2u64, d3u64, d4u64, d5u64;
+  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+  int16x8_t q0s16;
+  uint8_t *d1, *d2;
+  int16_t i, a1, cospi_16_64 = 11585;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+
+  q0s16 = vdupq_n_s16(a1);
+  q0u16 = vreinterpretq_u16_s16(q0s16);
+
+  d1 = d2 = dest;
+  for (i = 0; i < 2; i++) {
+    d2u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    d4u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    d5u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+
+    q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+    q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+    q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+    q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+    d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+    d2 += dest_stride;
+  }
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
new file mode 100644
index 000000000000..4b2c2a6f83c0
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,540 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void TRANSPOSE8X8(
+    int16x8_t *q8s16,
+    int16x8_t *q9s16,
+    int16x8_t *q10s16,
+    int16x8_t *q11s16,
+    int16x8_t *q12s16,
+    int16x8_t *q13s16,
+    int16x8_t *q14s16,
+    int16x8_t *q15s16) {
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+  d16s16 = vget_low_s16(*q8s16);
+  d17s16 = vget_high_s16(*q8s16);
+  d18s16 = vget_low_s16(*q9s16);
+  d19s16 = vget_high_s16(*q9s16);
+  d20s16 = vget_low_s16(*q10s16);
+  d21s16 = vget_high_s16(*q10s16);
+  d22s16 = vget_low_s16(*q11s16);
+  d23s16 = vget_high_s16(*q11s16);
+  d24s16 = vget_low_s16(*q12s16);
+  d25s16 = vget_high_s16(*q12s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+  d30s16 = vget_low_s16(*q15s16);
+  d31s16 = vget_high_s16(*q15s16);
+
+  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
+  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
+  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+  *q12s16 = vcombine_s16(d17s16, d25s16);
+  *q13s16 = vcombine_s16(d19s16, d27s16);
+  *q14s16 = vcombine_s16(d21s16, d29s16);
+  *q15s16 = vcombine_s16(d23s16, d31s16);
+
+  q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                      vreinterpretq_s32_s16(*q10s16));
+  q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                      vreinterpretq_s32_s16(*q11s16));
+  q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                      vreinterpretq_s32_s16(*q14s16));
+  q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                      vreinterpretq_s32_s16(*q15s16));
+
+  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+  *q8s16 = q0x2s16.val[0];
+  *q9s16 = q0x2s16.val[1];
+  *q10s16 = q1x2s16.val[0];
+  *q11s16 = q1x2s16.val[1];
+  *q12s16 = q2x2s16.val[0];
+  *q13s16 = q2x2s16.val[1];
+  *q14s16 = q3x2s16.val[0];
+  *q15s16 = q3x2s16.val[1];
+  return;
+}
+
+static INLINE void IDCT8x8_1D(
+    int16x8_t *q8s16,
+    int16x8_t *q9s16,
+    int16x8_t *q10s16,
+    int16x8_t *q11s16,
+    int16x8_t *q12s16,
+    int16x8_t *q13s16,
+    int16x8_t *q14s16,
+    int16x8_t *q15s16) {
+  int16x4_t d0s16, d1s16, d2s16, d3s16;
+  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+  d0s16 = vdup_n_s16(cospi_28_64);
+  d1s16 = vdup_n_s16(cospi_4_64);
+  d2s16 = vdup_n_s16(cospi_12_64);
+  d3s16 = vdup_n_s16(cospi_20_64);
+
+  d16s16 = vget_low_s16(*q8s16);
+  d17s16 = vget_high_s16(*q8s16);
+  d18s16 = vget_low_s16(*q9s16);
+  d19s16 = vget_high_s16(*q9s16);
+  d20s16 = vget_low_s16(*q10s16);
+  d21s16 = vget_high_s16(*q10s16);
+  d22s16 =
vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d26s16, d2s16); + q6s32 = vmull_s16(d27s16, d2s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); + q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q2s32 = vmull_s16(d18s16, d1s16); + q3s32 = vmull_s16(d19s16, d1s16); + q9s32 = vmull_s16(d26s16, d3s16); + q13s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlal_s16(q2s32, d30s16, d0s16); + q3s32 = vmlal_s16(q3s32, d31s16, d0s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q13s32 = vmlal_s16(q13s32, d23s16, d2s16); + + d14s16 = vqrshrn_n_s32(q2s32, 14); + d15s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q13s32, 14); + q6s16 = vcombine_s16(d12s16, d13s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d0s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d0s16); + q3s32 = vmull_s16(d17s16, d0s16); + q13s32 = vmull_s16(d16s16, d0s16); + q15s32 = vmull_s16(d17s16, d0s16); + + q2s32 = vmlal_s16(q2s32, d24s16, d0s16); + q3s32 = vmlal_s16(q3s32, d25s16, d0s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); + q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); + + d0s16 = vdup_n_s16(cospi_24_64); + d1s16 = vdup_n_s16(cospi_8_64); + + d18s16 = vqrshrn_n_s32(q2s32, 14); + d19s16 = vqrshrn_n_s32(q3s32, 14); + d22s16 = vqrshrn_n_s32(q13s32, 14); + d23s16 = vqrshrn_n_s32(q15s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q2s32 = vmull_s16(d20s16, d0s16); + q3s32 = vmull_s16(d21s16, d0s16); + q8s32 = vmull_s16(d20s16, d1s16); + q12s32 = vmull_s16(d21s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); + q8s32 = vmlal_s16(q8s32, d28s16, d0s16); + q12s32 = vmlal_s16(q12s32, d29s16, d0s16); + + d26s16 = vqrshrn_n_s32(q2s32, 14); + d27s16 = vqrshrn_n_s32(q3s32, 14); + d30s16 = vqrshrn_n_s32(q8s32, 14); + d31s16 = vqrshrn_n_s32(q12s32, 14); + *q13s16 = vcombine_s16(d26s16, d27s16); + *q15s16 = vcombine_s16(d30s16, d31s16); + + q0s16 = vaddq_s16(*q9s16, *q15s16); + q1s16 = vaddq_s16(*q11s16, *q13s16); + q2s16 = vsubq_s16(*q11s16, *q13s16); + q3s16 = vsubq_s16(*q9s16, *q15s16); + + *q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + *q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + 
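+  // q5s16/q6s16 built below hold (q14 - q13) * cospi_16_64 and
+  // (q14 + q13) * cospi_16_64, the odd-half cospi_16_64 rotation.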
d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + *q8s16 = vaddq_s16(q0s16, q7s16); + *q9s16 = vaddq_s16(q1s16, q6s16); + *q10s16 = vaddq_s16(q2s16, q5s16); + *q11s16 = vaddq_s16(q3s16, q4s16); + *q12s16 = vsubq_s16(q3s16, q4s16); + *q13s16 = vsubq_s16(q2s16, q5s16); + *q14s16 = vsubq_s16(q1s16, q6s16); + *q15s16 = vsubq_s16(q0s16, q7s16); + return; +} + +void vpx_idct8x8_64_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), + vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), + vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), + vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), + vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = 
vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; +} + +void vpx_idct8x8_12_add_neon( + int16_t *input, + uint8_t *dest, + int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + // First transform rows + // stage 1 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + + q0s16 = vdupq_n_s16(-cospi_20_64 * 2); + + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + q1s16 = vdupq_n_s16(cospi_12_64 * 2); + + q5s16 = vqrdmulhq_s16(q11s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_16_64 * 2); + + q6s16 = vqrdmulhq_s16(q11s16, q1s16); + + // stage 2 & stage 3 - even half + q1s16 = vdupq_n_s16(cospi_24_64 * 2); + + q9s16 = vqrdmulhq_s16(q8s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_8_64 * 2); + + q13s16 = vqrdmulhq_s16(q10s16, q1s16); + + q15s16 = vqrdmulhq_s16(q10s16, q0s16); + + // stage 3 -odd half + q0s16 = vaddq_s16(q9s16, q15s16); + q1s16 = vaddq_s16(q9s16, q13s16); + q2s16 = vsubq_s16(q9s16, q13s16); + q3s16 = vsubq_s16(q9s16, q15s16); + + // stage 2 - odd half + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + q8s16 = vaddq_s16(q0s16, q7s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q7s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, + &q12s16, &q13s16, &q14s16, &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = 
vrshrq_n_s16(q9s16, 5);
+  q10s16 = vrshrq_n_s16(q10s16, 5);
+  q11s16 = vrshrq_n_s16(q11s16, 5);
+  q12s16 = vrshrq_n_s16(q12s16, 5);
+  q13s16 = vrshrq_n_s16(q13s16, 5);
+  q14s16 = vrshrq_n_s16(q14s16, 5);
+  q15s16 = vrshrq_n_s16(q15s16, 5);
+
+  d1 = d2 = dest;
+
+  d0u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d1u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d2u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d3u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                   vreinterpret_u8_u64(d0u64));
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                   vreinterpret_u8_u64(d1u64));
+  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                    vreinterpret_u8_u64(d2u64));
+  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                    vreinterpret_u8_u64(d3u64));
+
+  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+  d2 += dest_stride;
+
+  q8s16 = q12s16;
+  q9s16 = q13s16;
+  q10s16 = q14s16;
+  q11s16 = q15s16;
+
+  d0u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d1u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d2u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+  d3u64 = vld1_u64((uint64_t *)d1);
+  d1 += dest_stride;
+
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                   vreinterpret_u8_u64(d0u64));
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                   vreinterpret_u8_u64(d1u64));
+  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                    vreinterpret_u8_u64(d2u64));
+  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                    vreinterpret_u8_u64(d3u64));
+
+  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+  d2 += dest_stride;
+  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+  d2 += dest_stride;
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c b/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c
new file mode 100644
index 000000000000..0a376104d2bb
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/intrapred_neon.c
@@ -0,0 +1,822 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
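+// dc_4x4 sums the four top and/or four left pixels with pairwise adds
+// (vpaddl/vpadd); vrshrn_n_u16 then divides by 8 (both edges) or 4 (one
+// edge) with rounding, falling back to 0x80 when neither edge is
+// available, and the DC value is broadcast across the block.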
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_top = vcombine_u16(p1, p1); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_left = vcombine_u16(p1, p1); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 3); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 2); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 2); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 4; ++i) { + vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0); + } + } +} + +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_4x4(dst, stride, above, left, 1, 1); +} + +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_4x4(dst, stride, NULL, left, 0, 1); +} + +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_4x4(dst, stride, above, NULL, 1, 0); +} + +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_4x4(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
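+// dc_8x8 repeats the dc_4x4 scheme with one more pairwise-add level, since
+// each edge now contributes eight pixels (rounding shifts of 4 and 3).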
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_top = vcombine_u16(p2, p2); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_left = vcombine_u16(p2, p2); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 4); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 3); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 3); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 8; ++i) { + vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); + } + } +} + +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_8x8(dst, stride, above, left, 1, 1); +} + +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_8x8(dst, stride, NULL, left, 0, 1); +} + +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_8x8(dst, stride, above, NULL, 1, 0); +} + +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_8x8(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A = vld1q_u8(above); // top row + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_top = vcombine_u16(p3, p3); + } + + if (do_left) { + const uint8x16_t L = vld1q_u8(left); // left row + const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_left = vcombine_u16(p3, p3); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 5); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 4); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 4); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 16; ++i) { + vst1q_u8(dst + i * stride, dc); + } + } +} + +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_16x16(dst, stride, above, left, 1, 1); +} + +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_16x16(dst, stride, NULL, left, 0, 1); +} + +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_16x16(dst, stride, above, NULL, 1, 0); +} + +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_16x16(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
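+// dc_32x32 loads each 32-pixel edge as two q-registers and reduces them
+// together; the final rounding shifts are 6 (both edges) and 5 (one edge).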
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A0 = vld1q_u8(above); // top row + const uint8x16_t A1 = vld1q_u8(above + 16); + const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top + const uint16x8_t p1 = vpaddlq_u8(A1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_top = vcombine_u16(p5, p5); + } + + if (do_left) { + const uint8x16_t L0 = vld1q_u8(left); // left row + const uint8x16_t L1 = vld1q_u8(left + 16); + const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left + const uint16x8_t p1 = vpaddlq_u8(L1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_left = vcombine_u16(p5, p5); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 6); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 5); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 5); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 32; ++i) { + vst1q_u8(dst + i * stride, dc); + vst1q_u8(dst + i * stride + 16, dc); + } + } +} + +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_32x32(dst, stride, above, left, 1, 1); +} + +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_32x32(dst, stride, NULL, left, 0, 1); +} + +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_32x32(dst, stride, above, NULL, 1, 0); +} + +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_32x32(dst, stride, NULL, NULL, 0, 0); +} + +// ----------------------------------------------------------------------------- + +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row + const uint64x1_t A1 = vshr_n_u64(A0, 8); + const uint64x1_t A2 = vshr_n_u64(A0, 16); + const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); + const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); + const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); + const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); + const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r0 = vreinterpret_u32_u8(avg2); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + (void)left; + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); + dst[3 * stride + 3] = above[7]; +} + +void 
vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; + static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; + const uint8x8_t sh_12345677 = vld1_u8(shuffle1); + const uint8x8_t sh_23456777 = vld1_u8(shuffle2); + const uint8x8_t A0 = vld1_u8(above); // top row + const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); + const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); + const uint8x8_t avg1 = vhadd_u8(A0, A2); + uint8x8_t row = vrhadd_u8(avg1, A1); + int i; + (void)left; + for (i = 0; i < 7; ++i) { + vst1_u8(dst + i * stride, row); + row = vtbl1_u8(row, sh_12345677); + } + vst1_u8(dst + i * stride, row); +} + +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t A0 = vld1q_u8(above); // top row + const uint8x16_t above_right = vld1q_dup_u8(above + 15); + const uint8x16_t A1 = vextq_u8(A0, above_right, 1); + const uint8x16_t A2 = vextq_u8(A0, above_right, 2); + const uint8x16_t avg1 = vhaddq_u8(A0, A2); + uint8x16_t row = vrhaddq_u8(avg1, A1); + int i; + (void)left; + for (i = 0; i < 15; ++i) { + vst1q_u8(dst + i * stride, row); + row = vextq_u8(row, above_right, 1); + } + vst1q_u8(dst + i * stride, row); +} + +// ----------------------------------------------------------------------------- + +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XABCD_u8 = vld1_u8(above - 1); + const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); + const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint32x2_t zero = vdup_n_u32(0); + const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); + const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); + const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); + const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); + const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); + const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); + const uint8_t D = vget_lane_u8(XABCD_u8, 4); + const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); + const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); + const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r3 = vreinterpret_u32_u8(avg2); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); +} + +#if !HAVE_NEON_ASM + +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint32x2_t d0u32 = vdup_n_u32(0); + (void)left; + + d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) + vst1_lane_u32((uint32_t *)dst, d0u32, 0); +} + +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x8_t d0u8 = vdup_n_u8(0); + (void)left; + + d0u8 = vld1_u8(above); + for (i = 0; i < 8; i++, dst += stride) + vst1_u8(dst, 
d0u8); +} + +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + for (i = 0; i < 16; i++, dst += stride) + vst1q_u8(dst, q0u8); +} + +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + q1u8 = vld1q_u8(above + 16); + for (i = 0; i < 32; i++, dst += stride) { + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + } +} + +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d1u32 = vdup_n_u32(0); + (void)above; + + d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); +} + +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint64x1_t d1u64 = vdup_n_u64(0); + (void)above; + + d1u64 = vld1_u64((const uint64_t *)left); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); + vst1_u8(dst, d0u8); +} + +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + dst += stride; + } +} + +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t 
q1u8 = vdupq_n_u8(0); + (void)above; + + for (k = 0; k < 2; k++, left += 16) { + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + } + } +} + +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint16x8_t q1u16, q3u16; + int16x8_t q1s16; + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d2u32 = vdup_n_u32(0); + + d0u8 = vld1_dup_u8(above - 1); + d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); + q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); + for (i = 0; i < 4; i++, dst += stride) { + q1u16 = vdupq_n_u16((uint16_t)left[i]); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), + vreinterpretq_s16_u16(q3u16)); + d0u8 = vqmovun_s16(q1s16); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + } +} + +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint16x8_t q0u16, q3u16, q10u16; + int16x8_t q0s16; + uint16x4_t d20u16; + uint8x8_t d0u8, d2u8, d30u8; + + d0u8 = vld1_dup_u8(above - 1); + d30u8 = vld1_u8(left); + d2u8 = vld1_u8(above); + q10u16 = vmovl_u8(d30u8); + q3u16 = vsubl_u8(d2u8, d0u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 1); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 2); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 3); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), + vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + } +} + +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; + uint8x16_t q0u8, q1u8; + int16x8_t q0s16, q1s16, q8s16, q11s16; + uint16x4_t d20u16; + uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; + + q0u8 = vld1q_dup_u8(above - 1); + q1u8 = vld1q_u8(above); + q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + for (k = 0; k < 2; k++, left += 8) { + 
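+    // TrueMotion: each output pixel is clamp(left[row] + above[col] -
+    // above[-1]). q2u16/q3u16 already hold (above - top_left) widened to
+    // 16 bits; every iteration broadcasts one left pixel, adds it, and
+    // saturates back to 8 bits with vqmovun_s16.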
d18u8 = vld1_u8(left); + q10u16 = vmovl_u8(d18u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q8u16 = vdupq_lane_u16(d20u16, 1); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d20u16, 2); + q8u16 = vdupq_lane_u16(d20u16, 3); + q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q2u16)); + q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q3u16)); + q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q2u16)); + q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), + vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += stride; + } + } +} + +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; + uint8x16_t q0u8, q1u8, q2u8; + int16x8_t q12s16, q13s16, q14s16, q15s16; + uint16x4_t d6u16; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; + + q0u8 = vld1q_dup_u8(above - 1); + q1u8 = vld1q_u8(above); + q2u8 = vld1q_u8(above + 16); + q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); + q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); + for (k = 0; k < 4; k++, left += 8) { + d26u8 = vld1_u8(left); + q3u16 = vmovl_u8(d26u8); + d6u16 = vget_low_u16(q3u16); + for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { + q0u16 = vdupq_lane_u16(d6u16, 0); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 1); + q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q8u16)); + q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + 
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 2);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 3);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+    }
+  }
+}
+#endif // !HAVE_NEON_ASM
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
new file mode 100644
index 000000000000..d24e6adc8a64
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_16_neon.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE void loop_filter_neon_16(
+    uint8x16_t qblimit,  // blimit
+    uint8x16_t qlimit,   // limit
+    uint8x16_t qthresh,  // thresh
+    uint8x16_t q3,       // p3
+    uint8x16_t q4,       // p2
+    uint8x16_t q5,       // p1
+    uint8x16_t q6,       // p0
+    uint8x16_t q7,       // q0
+    uint8x16_t q8,       // q1
+    uint8x16_t q9,       // q2
+    uint8x16_t q10,      // q3
+    uint8x16_t *q5r,     // p1
+    uint8x16_t *q6r,     // p0
+    uint8x16_t *q7r,     // q0
+    uint8x16_t *q8r) {   // q1
+  uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+  int16x8_t q2s16, q11s16;
+  uint16x8_t q4u16;
+  int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+  int8x8_t d2s8, d3s8;
+
+  q11u8 = vabdq_u8(q3, q4);
+  q12u8 = vabdq_u8(q4, q5);
+  q13u8 = vabdq_u8(q5, q6);
+  q14u8 = vabdq_u8(q8, q7);
+  q3 = vabdq_u8(q9, q8);
+  q4 = vabdq_u8(q10, q9);
+
+  q11u8 = vmaxq_u8(q11u8, q12u8);
+  q12u8 = vmaxq_u8(q13u8, q14u8);
+  q3 = vmaxq_u8(q3, q4);
+  q15u8 = vmaxq_u8(q11u8, q12u8);
+
+  q9 = vabdq_u8(q6, q7);
+
+  // vp8_hevmask
+  q13u8 = vcgtq_u8(q13u8, qthresh);
+  q14u8 = vcgtq_u8(q14u8, qthresh);
+  q15u8 = vmaxq_u8(q15u8, q3);
+
+  q2u8 = vabdq_u8(q5, q8);
+  q9 = vqaddq_u8(q9, q9);
+
+  q15u8 = vcgeq_u8(qlimit, q15u8);
+
+  // vp8_filter() function
+  // convert to signed
+  q10 = vdupq_n_u8(0x80);
+  q8 = veorq_u8(q8, q10);
+  q7 = veorq_u8(q7, q10);
+  q6 = veorq_u8(q6, q10);
+  q5 = veorq_u8(q5, q10);
+
+  q2u8 = vshrq_n_u8(q2u8, 1);
+  q9 = vqaddq_u8(q9, q2u8);
+
+  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                   vget_low_s8(vreinterpretq_s8_u8(q6)));
+  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                    vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+  q9 = vcgeq_u8(qblimit, q9);
+
+  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                   vreinterpretq_s8_u8(q8));
+
+  q14u8 = vorrq_u8(q13u8, q14u8);
+
+  q4u16 = vdupq_n_u16(3);
+  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+  q15u8 = vandq_u8(q15u8, q9);
+
+  q1s8 = vreinterpretq_s8_u8(q1u8);
+  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+  q4 = vdupq_n_u8(3);
+  q9 = vdupq_n_u8(4);
+  // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+  d2s8 = vqmovn_s16(q2s16);
+  d3s8 = vqmovn_s16(q11s16);
+  q1s8 = vcombine_s8(d2s8, d3s8);
+  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+  q1s8 = vreinterpretq_s8_u8(q1u8);
+
+  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+  q2s8 = vshrq_n_s8(q2s8, 3);
+  q1s8 = vshrq_n_s8(q1s8, 3);
+
+  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+  q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+  q1s8 = vrshrq_n_s8(q1s8, 1);
+  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+  *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
+  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+  return;
+}
+
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
+                                    const uint8_t *blimit0,
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+  uint8x16_t qblimit, qlimit, qthresh;
+  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+
+  dblimit0 = vld1_u8(blimit0);
+  dlimit0 = vld1_u8(limit0);
+  dthresh0 = vld1_u8(thresh0);
+  dblimit1 = vld1_u8(blimit1);
+  dlimit1 = vld1_u8(limit1);
+  dthresh1 = vld1_u8(thresh1);
+  qblimit = vcombine_u8(dblimit0, dblimit1);
+  qlimit = vcombine_u8(dlimit0, dlimit1);
+  qthresh = vcombine_u8(dthresh0, dthresh1);
+
+  s -= (p << 2);
+
+  q3u8 = vld1q_u8(s);
+  s += p;
+  q4u8 = vld1q_u8(s);
+  s += p;
+  q5u8 = vld1q_u8(s);
+  s += p;
+  q6u8 = vld1q_u8(s);
+  s += p;
+  q7u8 = vld1q_u8(s);
+  s += p;
+  q8u8 = vld1q_u8(s);
+  s += p;
+  q9u8 = vld1q_u8(s);
+  s += p;
+  q10u8 = vld1q_u8(s);
+
+  loop_filter_neon_16(qblimit, qlimit, qthresh,
+                      q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+                      &q5u8, &q6u8, &q7u8, &q8u8);
+
+  s -= (p * 5);
+  vst1q_u8(s, q5u8);
+  s += p;
+  vst1q_u8(s, q6u8);
+  s += p;
+  vst1q_u8(s, q7u8);
+  s += p;
+  vst1q_u8(s, q8u8);
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
new file mode 100644
index 000000000000..7f3ee70b9487
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_4_neon.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void loop_filter_neon(
+    uint8x8_t dblimit,   // flimit
+    uint8x8_t dlimit,    // limit
+    uint8x8_t dthresh,   // thresh
+    uint8x8_t d3u8,      // p3
+    uint8x8_t d4u8,      // p2
+    uint8x8_t d5u8,      // p1
+    uint8x8_t d6u8,      // p0
+    uint8x8_t d7u8,      // q0
+    uint8x8_t d16u8,     // q1
+    uint8x8_t d17u8,     // q2
+    uint8x8_t d18u8,     // q3
+    uint8x8_t *d4ru8,    // p1
+    uint8x8_t *d5ru8,    // p0
+    uint8x8_t *d6ru8,    // q0
+    uint8x8_t *d7ru8) {  // q1
+  uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+  int16x8_t q12s16;
+  int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+  d19u8 = vabd_u8(d3u8, d4u8);
+  d20u8 = vabd_u8(d4u8, d5u8);
+  d21u8 = vabd_u8(d5u8, d6u8);
+  d22u8 = vabd_u8(d16u8, d7u8);
+  d3u8 = vabd_u8(d17u8, d16u8);
+  d4u8 = vabd_u8(d18u8, d17u8);
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+  d20u8 = vmax_u8(d21u8, d22u8);
+  d3u8 = vmax_u8(d3u8, d4u8);
+  d23u8 = vmax_u8(d19u8, d20u8);
+
+  d17u8 = vabd_u8(d6u8, d7u8);
+
+  d21u8 = vcgt_u8(d21u8, dthresh);
+  d22u8 = vcgt_u8(d22u8, dthresh);
+  d23u8 = vmax_u8(d23u8, d3u8);
+
+  d28u8 = vabd_u8(d5u8, d16u8);
+  d17u8 = vqadd_u8(d17u8, d17u8);
+
+  d23u8 = vcge_u8(dlimit, d23u8);
+
+  d18u8 = vdup_n_u8(0x80);
+  d5u8 = veor_u8(d5u8, d18u8);
+  d6u8 = veor_u8(d6u8, d18u8);
+  d7u8 = veor_u8(d7u8, d18u8);
+  d16u8 = veor_u8(d16u8, d18u8);
+
+  d28u8 = vshr_n_u8(d28u8, 1);
+  d17u8 = vqadd_u8(d17u8, d28u8);
+
+  d19u8 = vdup_n_u8(3);
+
+  d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+                  vreinterpret_s8_u8(d6u8));
+
+  d17u8 = vcge_u8(dblimit, d17u8);
+
+  d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+                   vreinterpret_s8_u8(d16u8));
+
+  d22u8 = vorr_u8(d21u8, d22u8);
+
+  q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+  d23u8 = vand_u8(d23u8, d17u8);
+
+  q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+  d17u8 = vdup_n_u8(4);
+
+  d27s8 = vqmovn_s16(q12s16);
+  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+  d27s8 = vreinterpret_s8_u8(d27u8);
+
+  d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+  d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+  d28s8 = vshr_n_s8(d28s8, 3);
+  d27s8 = vshr_n_s8(d27s8, 3);
+
+  d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+  d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+  d27s8 = vrshr_n_s8(d27s8, 1);
+  d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+  d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+  d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+  *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+  *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+  *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+  *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+  return;
+}
+
+void vpx_lpf_horizontal_4_neon(
+    uint8_t *src,
+    int pitch,
+    const uint8_t *blimit,
+    const uint8_t *limit,
+    const uint8_t *thresh) {
+  int i;
+  uint8_t *s, *psrc;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  psrc = src - (pitch << 2);
+  for (i = 0; i < 1; i++) {
+    s = psrc + i * 8;
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    loop_filter_neon(dblimit, dlimit, dthresh,
+                     d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                     &d4u8, &d5u8, &d6u8, &d7u8);
+
+    s -= (pitch * 5);
+    vst1_u8(s, d4u8);
+    s += pitch;
+    vst1_u8(s, d5u8);
+    s += pitch;
+    vst1_u8(s, d6u8);
+    s += pitch;
+    vst1_u8(s, d7u8);
+  }
+  return;
+}
+
+void vpx_lpf_vertical_4_neon(
+    uint8_t *src,
+    int pitch,
+    const uint8_t *blimit,
+    const uint8_t *limit,
+    const uint8_t *thresh) {
+  int i, pitch8;
+  uint8_t *s;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+  uint8x8x4_t d4Result;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  pitch8 = pitch * 8;
+  for (i = 0; i < 1; i++, src += pitch8) {
+    s = src - (i + 1) * 4;
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+                      vreinterpret_u32_u8(d7u8));
+    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                      vreinterpret_u32_u8(d16u8));
+    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                      vreinterpret_u32_u8(d17u8));
+    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                      vreinterpret_u32_u8(d18u8));
+
+    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                      vreinterpret_u16_u32(d2tmp2.val[0]));
+    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                      vreinterpret_u16_u32(d2tmp3.val[0]));
+    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                      vreinterpret_u16_u32(d2tmp2.val[1]));
+    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                      vreinterpret_u16_u32(d2tmp3.val[1]));
+
+    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                     vreinterpret_u8_u16(d2tmp5.val[0]));
+    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                     vreinterpret_u8_u16(d2tmp5.val[1]));
+    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                      vreinterpret_u8_u16(d2tmp7.val[0]));
+    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                      vreinterpret_u8_u16(d2tmp7.val[1]));
+
+    d3u8 = d2tmp8.val[0];
+    d4u8 = d2tmp8.val[1];
+    d5u8 = d2tmp9.val[0];
+    d6u8 = d2tmp9.val[1];
+    d7u8 = d2tmp10.val[0];
+    d16u8 = d2tmp10.val[1];
+    d17u8 = d2tmp11.val[0];
+    d18u8 = d2tmp11.val[1];
+
+    loop_filter_neon(dblimit, dlimit, dthresh,
+                     d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                     &d4u8, &d5u8, &d6u8, &d7u8);
+
+    d4Result.val[0] = d4u8;
+    d4Result.val[1] = d5u8;
+    d4Result.val[2] = d6u8;
+    d4Result.val[3] = d7u8;
+
+    src -= 2;
+    vst4_lane_u8(src, d4Result, 0);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 1);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 2);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 3);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 4);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 5);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 6);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 7);
+  }
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
new file mode 100644
index 000000000000..ec3757380d57
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_8_neon.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static INLINE void mbloop_filter_neon(
+    uint8x8_t dblimit,   // mblimit
+    uint8x8_t dlimit,    // limit
+    uint8x8_t dthresh,   // thresh
+    uint8x8_t d3u8,      // p3
+    uint8x8_t d4u8,      // p2
+    uint8x8_t d5u8,      // p1
+    uint8x8_t d6u8,      // p0
+    uint8x8_t d7u8,      // q0
+    uint8x8_t d16u8,     // q1
+    uint8x8_t d17u8,     // q2
+    uint8x8_t d18u8,     // q3
+    uint8x8_t *d0ru8,    // p2
+    uint8x8_t *d1ru8,    // p1
+    uint8x8_t *d2ru8,    // p0
+    uint8x8_t *d3ru8,    // q0
+    uint8x8_t *d4ru8,    // q1
+    uint8x8_t *d5ru8) {  // q2
+  uint32_t flat;
+  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+  int16x8_t q15s16;
+  uint16x8_t q10u16, q14u16;
+  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+  d19u8 = vabd_u8(d3u8, d4u8);
+  d20u8 = vabd_u8(d4u8, d5u8);
+  d21u8 = vabd_u8(d5u8, d6u8);
+  d22u8 = vabd_u8(d16u8, d7u8);
+  d23u8 = vabd_u8(d17u8, d16u8);
+  d24u8 = vabd_u8(d18u8, d17u8);
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+  d20u8 = vmax_u8(d21u8, d22u8);
+
+  d25u8 = vabd_u8(d6u8, d4u8);
+
+  d23u8 = vmax_u8(d23u8, d24u8);
+
+  d26u8 = vabd_u8(d7u8, d17u8);
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+
+  d24u8 = vabd_u8(d6u8, d7u8);
+  d27u8 = vabd_u8(d3u8, d6u8);
+  d28u8 = vabd_u8(d18u8, d7u8);
+
+  d19u8 = vmax_u8(d19u8, d23u8);
+
+  d23u8 = vabd_u8(d5u8, d16u8);
+  d24u8 = vqadd_u8(d24u8, d24u8);
+
+  d19u8 = vcge_u8(dlimit, d19u8);
+
+  d25u8 = vmax_u8(d25u8, d26u8);
+  d26u8 = vmax_u8(d27u8, d28u8);
+
+  d23u8 = vshr_n_u8(d23u8, 1);
+
+  d25u8 = vmax_u8(d25u8, d26u8);
+
+  d24u8 = vqadd_u8(d24u8, d23u8);
+
+  d20u8 = vmax_u8(d20u8, d25u8);
+
+  d23u8 = vdup_n_u8(1);
+  d24u8 = vcge_u8(dblimit, d24u8);
+
+  d21u8 = vcgt_u8(d21u8, dthresh);
+
+  d20u8 = vcge_u8(d23u8, d20u8);
+
+  d19u8 = vand_u8(d19u8, d24u8);
+
+  d23u8 = vcgt_u8(d22u8, dthresh);
+
+  d20u8 = vand_u8(d20u8, d19u8);
+
+  d22u8 = vdup_n_u8(0x80);
+
+  d23u8 = vorr_u8(d21u8, d23u8);
+
+  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+                        vreinterpret_u16_u8(d21u8));
+
+  d30u8 = vshrn_n_u16(q10u16, 4);
+  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
+    d27u8 = vdup_n_u8(3);
+    d21u8 = vdup_n_u8(2);
+    q14u16 = vaddl_u8(d6u8, d7u8);
+    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+    *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+    *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d7u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+    *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vsubw_u8(q14u16, d7u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+    *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vsubw_u8(q14u16, d16u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+  } else {
+    d21u8 = veor_u8(d7u8, d22u8);
+    d24u8 = veor_u8(d6u8, d22u8);
+    d25u8 = veor_u8(d5u8, d22u8);
+    d26u8 = veor_u8(d16u8, d22u8);
+
+    d27u8 = vdup_n_u8(3);
+
+    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+    q15s16 = vaddw_s8(q15s16, d29s8);
+
+    d29u8 = vdup_n_u8(4);
+
+    d28s8 = vqmovn_s16(q15s16);
+
+    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+    d30s8 = vshr_n_s8(d30s8, 3);
+    d29s8 = vshr_n_s8(d29s8, 3);
+
+    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+    d29s8 = vrshr_n_s8(d29s8, 1);
+    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+    if (flat == 0) {  // filter_branch_only
+      *d0ru8 = d4u8;
+      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+      *d5ru8 = d17u8;
+      return;
+    }
+
+    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+    d23u8 = vdup_n_u8(2);
+    q14u16 = vaddl_u8(d6u8, d7u8);
+    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+    q14u16 = vaddw_u8(q14u16, d5u8);
+
+    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+    d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+
+    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+    d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+
+    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+    d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d3u8);
+    q14u16 = vsubw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d7u8);
+
+    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+    q14u16 = vaddw_u8(q14u16, d18u8);
+
+    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+    d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vsubw_u8(q14u16, d7u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+
+    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+    q14u16 = vaddw_u8(q14u16, d18u8);
+
+    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+    d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vsubw_u8(q14u16, d16u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+
+    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+    d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+  }
+  return;
+}
+
+void vpx_lpf_horizontal_8_neon(
+    uint8_t *src,
+    int pitch,
+    const uint8_t *blimit,
+    const uint8_t *limit,
+    const uint8_t *thresh) {
+  int i;
+  uint8_t *s, *psrc;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+  uint8x8_t d16u8, d17u8, d18u8;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  psrc = src - (pitch << 2);
+  for (i = 0; i < 1; i++) {
+    s = psrc + i * 8;
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    mbloop_filter_neon(dblimit, dlimit, dthresh,
+                       d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                       &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+    s -= (pitch * 6);
+    vst1_u8(s, d0u8);
+    s += pitch;
+    vst1_u8(s, d1u8);
+    s += pitch;
+    vst1_u8(s, d2u8);
+    s += pitch;
+    vst1_u8(s, d3u8);
+    s += pitch;
+    vst1_u8(s, d4u8);
+    s += pitch;
+    vst1_u8(s, d5u8);
+  }
+  return;
+}
+
+void vpx_lpf_vertical_8_neon(
+    uint8_t *src,
+    int pitch,
+    const uint8_t *blimit,
+    const uint8_t *limit,
+    const uint8_t *thresh) {
+  int i;
+  uint8_t *s;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+  uint8x8_t d16u8, d17u8, d18u8;
+  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+  uint8x8x4_t d4Result;
+  uint8x8x2_t d2Result;
+
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  for (i = 0; i < 1; i++) {
+    s = src + (i * (pitch << 3)) - 4;
+
+    d3u8 = vld1_u8(s);
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+
+    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+                      vreinterpret_u32_u8(d7u8));
+    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+                      vreinterpret_u32_u8(d16u8));
+    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+                      vreinterpret_u32_u8(d17u8));
+    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+                      vreinterpret_u32_u8(d18u8));
+
+    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                      vreinterpret_u16_u32(d2tmp2.val[0]));
+    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                      vreinterpret_u16_u32(d2tmp3.val[0]));
+    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                      vreinterpret_u16_u32(d2tmp2.val[1]));
+    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                      vreinterpret_u16_u32(d2tmp3.val[1]));
+
+    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                     vreinterpret_u8_u16(d2tmp5.val[0]));
+    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                     vreinterpret_u8_u16(d2tmp5.val[1]));
+    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                      vreinterpret_u8_u16(d2tmp7.val[0]));
+    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                      vreinterpret_u8_u16(d2tmp7.val[1]));
+
+    d3u8 = d2tmp8.val[0];
+    d4u8 = d2tmp8.val[1];
+    d5u8 = d2tmp9.val[0];
+    d6u8 = d2tmp9.val[1];
+    d7u8 = d2tmp10.val[0];
+    d16u8 = d2tmp10.val[1];
+    d17u8 = d2tmp11.val[0];
+    d18u8 = d2tmp11.val[1];
+
+    mbloop_filter_neon(dblimit, dlimit, dthresh,
+                       d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                       &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+    d4Result.val[0] = d0u8;
+    d4Result.val[1] = d1u8;
+    d4Result.val[2] = d2u8;
+    d4Result.val[3] = d3u8;
+
+    d2Result.val[0] = d4u8;
+    d2Result.val[1] = d5u8;
+
+    s = src - 3;
+    vst4_lane_u8(s, d4Result, 0);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 1);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 2);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 3);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 4);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 5);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 6);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 7);
+
+    s = src + 1;
+    vst2_lane_u8(s, d2Result, 0);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 1);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 2);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 3);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 4);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 5);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 6);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 7);
+  }
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c
new file mode 100644
index 000000000000..aa31f293588c
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+                                  const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+#if HAVE_NEON_ASM
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+                                    const uint8_t *blimit0,
+                                    const uint8_t *limit0,
+                                    const uint8_t *thresh0,
+                                    const uint8_t *blimit1,
+                                    const uint8_t *limit1,
+                                    const uint8_t *thresh1) {
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+                                  const uint8_t *blimit0,
+                                  const uint8_t *limit0,
+                                  const uint8_t *thresh0,
+                                  const uint8_t *blimit1,
+                                  const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+                                   const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh) {
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+}
+#endif // HAVE_NEON_ASM
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
new file mode 100644
index 000000000000..8632250138c1
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+    int16x4_t dsrc0,
+    int16x4_t dsrc1,
+    int16x4_t dsrc2,
+    int16x4_t dsrc3,
+    int16x4_t dsrc4,
+    int16x4_t dsrc5,
+    int16x4_t dsrc6,
+    int16x4_t dsrc7,
+    int16x8_t q0s16) {
+  int32x4_t qdst;
+  int16x4_t d0s16, d1s16;
+
+  d0s16 = vget_low_s16(q0s16);
+  d1s16 = vget_high_s16(q0s16);
+
+  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+  return qdst;
+}
+
+void vpx_convolve8_avg_horiz_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,
+    int x_step_q4,
+    const int16_t *filter_y,  // unused
+    int y_step_q4,            // unused
+    int w,
+    int h) {
+  int width;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  assert(x_step_q4 == 16);
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;  // adjust for taps
+  for (; h > 0; h -= 4) {  // loop_horiz_v
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+                        vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    src += 7;
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    for (width = w;
+         width > 0;
+         width -= 4, src += 4, dst += 4) {  // loop_horiz
+      s = src;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(src + 64);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+                         vreinterpret_u16_u32(d31u32));
+      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+                         vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(src + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+                          vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(src + 64 + src_stride * 2);
+
+      d = dst;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+                             d18s16, d19s16, d23s16, d24s16, q0s16);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+                             d19s16, d23s16, d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+                              d23s16, d24s16, d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(src + 64 + src_stride * 3);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+                         vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      d = dst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+    src += src_stride * 4 - w - 7;
+    dst += dst_stride * 4 - w;
+  }
+  return;
+}
+
+void vpx_convolve8_avg_vert_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // unused
+    int x_step_q4,            // unused
+    const int16_t *filter_y,
+    int y_step_q4,
+    int w,
+    int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32;
+  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+  uint8x16_t q1u8, q3u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+      d -= dst_stride * 3;
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+                             d20s16, d21s16, d22s16, d24s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+                             d21s16, d22s16, d24s16, d26s16, q0s16);
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+                              d22s16, d24s16, d26s16, d27s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
new file mode 100644
index 000000000000..9bd715e2c630
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+    int16x4_t dsrc0,
+    int16x4_t dsrc1,
+    int16x4_t dsrc2,
+    int16x4_t dsrc3,
+    int16x4_t dsrc4,
+    int16x4_t dsrc5,
+    int16x4_t dsrc6,
+    int16x4_t dsrc7,
+    int16x8_t q0s16) {
+  int32x4_t qdst;
+  int16x4_t d0s16, d1s16;
+
+  d0s16 = vget_low_s16(q0s16);
+  d1s16 = vget_high_s16(q0s16);
+
+  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+  return qdst;
+}
+
+void vpx_convolve8_horiz_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,
+    int x_step_q4,
+    const int16_t *filter_y,  // unused
+    int y_step_q4,            // unused
+    int w,
+    int h) {
+  int width;
+  const uint8_t *s, *psrc;
+  uint8_t *d, *pdst;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  assert(x_step_q4 == 16);
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;  // adjust for taps
+  for (; h > 0; h -= 4,
+       src += src_stride * 4,
+       dst += dst_stride * 4) {  // loop_horiz_v
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+                        vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+    __builtin_prefetch(src + src_stride * 6);
+
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    for (width = w, psrc = src + 7, pdst = dst;
+         width > 0;
+         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
+      s = psrc;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(psrc + 64);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+                         vreinterpret_u16_u32(d31u32));
+      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+                         vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(psrc + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+                          vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+                             d18s16, d19s16, d23s16, d24s16, q0s16);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+                             d19s16, d23s16, d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+                              d23s16, d24s16, d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+                         vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+      d = pdst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+  }
+  return;
+}
+
+void vpx_convolve8_vert_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // unused
+    int x_step_q4,            // unused
+    const int16_t *filter_y,
+    int y_step_q4,
+    int w,
+    int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint32x2_t d2u32, d3u32;
+  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+                             d20s16, d21s16, d22s16, d24s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+                             d21s16, d22s16, d24s16, d26s16, q0s16);
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+                              d22s16, d24s16, d26s16, d27s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
new file mode 100644
index 000000000000..dc58a332f81d
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_avg_neon(
+    const uint8_t *src,    // r0
+    ptrdiff_t src_stride,  // r1
+    uint8_t *dst,          // r2
+    ptrdiff_t dst_stride,  // r3
+    const int16_t *filter_x,
+    int filter_x_stride,
+    const int16_t *filter_y,
+    int filter_y_stride,
+    int w,
+    int h) {
+  uint8_t *d;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  uint32x2_t d0u32, d2u32;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+  (void)filter_x; (void)filter_x_stride;
+  (void)filter_y; (void)filter_y_stride;
+
+  d = dst;
+  if (w > 32) {  // avg64
+    for (; h > 0; h -= 1) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      q10u8 = vld1q_u8(d + 32);
+      q11u8 = vld1q_u8(d + 48);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // avg32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+      q10u8 = vld1q_u8(d);
+      q11u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // avg16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+      q2u8 = vld1q_u8(d);
+      d += dst_stride;
+      q3u8 = vld1q_u8(d);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q2u8);
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // avg8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d1u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(d);
+      d += dst_stride;
+      d3u8 = vld1_u8(d);
+      d += dst_stride;
+
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+      vst1_u8(dst, vget_low_u8(q0u8));
+      dst += dst_stride;
+      vst1_u8(dst, vget_high_u8(q0u8));
+      dst += dst_stride;
+    }
+  } else {  // avg4
+    for (; h > 0; h -= 2) {
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+      src += src_stride;
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+      src += src_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+
+      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+                       vreinterpret_u8_u32(d2u32));
+
+      d0u32 = vreinterpret_u32_u8(d0u8);
+      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+      dst += dst_stride;
+    }
+  }
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
new file mode 100644
index 000000000000..d8fb97a86190
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_convolve_copy_neon(
+    const uint8_t *src,    // r0
+    ptrdiff_t src_stride,  // r1
+    uint8_t *dst,          // r2
+    ptrdiff_t dst_stride,  // r3
+    const int16_t *filter_x,
+    int filter_x_stride,
+    const int16_t *filter_y,
+    int filter_y_stride,
+    int w,
+    int h) {
+  uint8x8_t d0u8, d2u8;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8;
+  (void)filter_x; (void)filter_x_stride;
+  (void)filter_y; (void)filter_y_stride;
+
+  if (w > 32) {  // copy64
+    for (; h > 0; h--) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // copy32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // copy16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // copy8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(src);
+      src += src_stride;
+
+      vst1_u8(dst, d0u8);
+      dst += dst_stride;
+      vst1_u8(dst, d2u8);
+      dst += dst_stride;
+    }
+  } else {  // copy4
+    for (; h > 0; h--) {
+      *(uint32_t *)dst = *(const uint32_t *)src;
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+  return;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
new file mode 100644
index 000000000000..1506ce6203de
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
+   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
+   */
+  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+
+  // Account for the vertical phase needing 3 lines prior and 4 lines post
+  int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  /* Filter starting 3 lines back.
The neon implementation will ignore the + * given height and filter a multiple of 4 lines. Since this goes in to + * the temp buffer which has lots of extra room and is subsequently discarded + * this is safe if somewhat less than ideal. + */ + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, + temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + vpx_convolve8_vert_neon(temp + 64 * 3, 64, + dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} + +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* This implementation has the same issues as above. In addition, we only want + * to average the values after both passes. + */ + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, + temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height); + vpx_convolve8_avg_vert_neon(temp + 64 * 3, + 64, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} diff --git a/thirdparty/libvpx/vpx_dsp/bitreader.c b/thirdparty/libvpx/vpx_dsp/bitreader.c new file mode 100644 index 000000000000..8140e78e70e8 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/bitreader.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/prob.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/endian_inl.h"
+
+int vpx_reader_init(vpx_reader *r,
+                    const uint8_t *buffer,
+                    size_t size,
+                    vpx_decrypt_cb decrypt_cb,
+                    void *decrypt_state) {
+  if (size && !buffer) {
+    return 1;
+  } else {
+    r->buffer_end = buffer + size;
+    r->buffer = buffer;
+    r->value = 0;
+    r->count = -8;
+    r->range = 255;
+    r->decrypt_cb = decrypt_cb;
+    r->decrypt_state = decrypt_state;
+    vpx_reader_fill(r);
+    return vpx_read_bit(r) != 0;  // marker bit
+  }
+}
+
+void vpx_reader_fill(vpx_reader *r) {
+  const uint8_t *const buffer_end = r->buffer_end;
+  const uint8_t *buffer = r->buffer;
+  const uint8_t *buffer_start = buffer;
+  BD_VALUE value = r->value;
+  int count = r->count;
+  const size_t bytes_left = buffer_end - buffer;
+  const size_t bits_left = bytes_left * CHAR_BIT;
+  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
+
+  if (r->decrypt_cb) {
+    size_t n = VPXMIN(sizeof(r->clear_buffer), bytes_left);
+    r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
+    buffer = r->clear_buffer;
+    buffer_start = r->clear_buffer;
+  }
+  if (bits_left > BD_VALUE_SIZE) {
+    const int bits = (shift & 0xfffffff8) + CHAR_BIT;
+    BD_VALUE nv;
+    BD_VALUE big_endian_values;
+    memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
+#if SIZE_MAX == 0xffffffffffffffffULL
+    big_endian_values = HToBE64(big_endian_values);
+#else
+    big_endian_values = HToBE32(big_endian_values);
+#endif
+    nv = big_endian_values >> (BD_VALUE_SIZE - bits);
+    count += bits;
+    buffer += (bits >> 3);
+    value = r->value | (nv << (shift & 0x7));
+  } else {
+    const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
+    int loop_end = 0;
+    if (bits_over >= 0) {
+      count += LOTS_OF_BITS;
+      loop_end = bits_over;
+    }
+
+    if (bits_over < 0 || bits_left) {
+      while (shift >= loop_end) {
+        count += CHAR_BIT;
+        value |= (BD_VALUE)*buffer++ << shift;
+        shift -= CHAR_BIT;
+      }
+    }
+  }
+
+  // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption,
+  // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than
+  // assign 'buffer' to 'r->buffer'.
+  r->buffer += buffer - buffer_start;
+  r->value = value;
+  r->count = count;
+}
+
+const uint8_t *vpx_reader_find_end(vpx_reader *r) {
+  // Find the end of the coded buffer
+  while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) {
+    r->count -= CHAR_BIT;
+    r->buffer--;
+  }
+  return r->buffer;
+}
diff --git a/thirdparty/libvpx/vpx_dsp/bitreader.h b/thirdparty/libvpx/vpx_dsp/bitreader.h
new file mode 100644
index 000000000000..9a441b41077e
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/bitreader.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITREADER_H_
+#define VPX_DSP_BITREADER_H_
+
+#include <stddef.h>
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef size_t BD_VALUE;
+
+#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
+
+// This is meant to be a large, positive constant that can still be efficiently
+// loaded as an immediate (on platforms like ARM, for example).
+// Even relatively modest values like 100 would work fine.
+#define LOTS_OF_BITS 0x40000000
+
+typedef struct {
+  // Be careful when reordering this struct, it may impact the cache negatively.
+  BD_VALUE value;
+  unsigned int range;
+  int count;
+  const uint8_t *buffer_end;
+  const uint8_t *buffer;
+  vpx_decrypt_cb decrypt_cb;
+  void *decrypt_state;
+  uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
+} vpx_reader;
+
+int vpx_reader_init(vpx_reader *r,
+                    const uint8_t *buffer,
+                    size_t size,
+                    vpx_decrypt_cb decrypt_cb,
+                    void *decrypt_state);
+
+void vpx_reader_fill(vpx_reader *r);
+
+const uint8_t *vpx_reader_find_end(vpx_reader *r);
+
+static INLINE int vpx_reader_has_error(vpx_reader *r) {
+  // Check if we have reached the end of the buffer.
+  //
+  // Variable 'count' stores the number of bits in the 'value' buffer, minus
+  // 8. The top byte is part of the algorithm, and the remainder is buffered
+  // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+  // occupied, 8 for the algorithm and 8 in the buffer.
+  //
+  // When reading a byte from the user's buffer, count is filled with 8 and
+  // one byte is filled into the value buffer. When we reach the end of the
+  // data, count is additionally filled with LOTS_OF_BITS. So when
+  // count == LOTS_OF_BITS - 1, the user's data has been exhausted.
+  //
+  // 1 if we have tried to decode bits after the end of stream was encountered.
+  // 0 No error.
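// A rough worked example of the 'count' bookkeeping, assuming a 64-bit
// BD_VALUE (so BD_VALUE_SIZE == 64): while input bytes remain,
// vpx_reader_fill() tops 'count' up to at most BD_VALUE_SIZE - CHAR_BIT,
// so 'count' never exceeds 64 during normal decoding. Once the buffer is
// exhausted, fill() adds LOTS_OF_BITS (0x40000000), parking 'count' far
// above 64. Each vpx_read() then drains 'count' again, and only by
// consuming phantom bits past the end of the stream can it drop below
// LOTS_OF_BITS while still sitting above BD_VALUE_SIZE -- exactly the
// window tested below.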
+ return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS; +} + +static INLINE int vpx_read(vpx_reader *r, int prob) { + unsigned int bit = 0; + BD_VALUE value; + BD_VALUE bigsplit; + int count; + unsigned int range; + unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; + + if (r->count < 0) + vpx_reader_fill(r); + + value = r->value; + count = r->count; + + bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); + + range = split; + + if (value >= bigsplit) { + range = r->range - split; + value = value - bigsplit; + bit = 1; + } + + { + register int shift = vpx_norm[range]; + range <<= shift; + value <<= shift; + count -= shift; + } + r->value = value; + r->count = count; + r->range = range; + + return bit; +} + +static INLINE int vpx_read_bit(vpx_reader *r) { + return vpx_read(r, 128); // vpx_prob_half +} + +static INLINE int vpx_read_literal(vpx_reader *r, int bits) { + int literal = 0, bit; + + for (bit = bits - 1; bit >= 0; bit--) + literal |= vpx_read_bit(r) << bit; + + return literal; +} + +static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree, + const vpx_prob *probs) { + vpx_tree_index i = 0; + + while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) + continue; + + return -i; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_BITREADER_H_ diff --git a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.c b/thirdparty/libvpx/vpx_dsp/bitreader_buffer.c new file mode 100644 index 000000000000..d7b55cf9f410 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/bitreader_buffer.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "./bitreader_buffer.h" + +size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) { + return (rb->bit_offset + 7) >> 3; +} + +int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) { + const size_t off = rb->bit_offset; + const size_t p = off >> 3; + const int q = 7 - (int)(off & 0x7); + if (rb->bit_buffer + p < rb->bit_buffer_end) { + const int bit = (rb->bit_buffer[p] >> q) & 1; + rb->bit_offset = off + 1; + return bit; + } else { + rb->error_handler(rb->error_handler_data); + return 0; + } +} + +int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) { + int value = 0, bit; + for (bit = bits - 1; bit >= 0; bit--) + value |= vpx_rb_read_bit(rb) << bit; + return value; +} + +int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, + int bits) { + const int value = vpx_rb_read_literal(rb, bits); + return vpx_rb_read_bit(rb) ? -value : value; +} + +int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, + int bits) { +#if CONFIG_MISC_FIXES + const int nbits = sizeof(unsigned) * 8 - bits - 1; + const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits; + return ((int) value) >> nbits; +#else + return vpx_rb_read_signed_literal(rb, bits); +#endif +} diff --git a/thirdparty/libvpx/vpx_dsp/bitreader_buffer.h b/thirdparty/libvpx/vpx_dsp/bitreader_buffer.h new file mode 100644 index 000000000000..8a48a95ed192 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/bitreader_buffer.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITREADER_BUFFER_H_
+#define VPX_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*vpx_rb_error_handler)(void *data);
+
+struct vpx_read_bit_buffer {
+  const uint8_t *bit_buffer;
+  const uint8_t *bit_buffer_end;
+  size_t bit_offset;
+
+  void *error_handler_data;
+  vpx_rb_error_handler error_handler;
+};
+
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_BITREADER_BUFFER_H_
diff --git a/thirdparty/libvpx/vpx_dsp/intrapred.c b/thirdparty/libvpx/vpx_dsp/intrapred.c
new file mode 100644
index 000000000000..cc4a74bd260b
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/intrapred.c
@@ -0,0 +1,870 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define DST(x, y) dst[(x) + (y) * stride]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  (void) above;
+  // first column
+  for (r = 0; r < bs - 1; ++r)
+    dst[r * stride] = AVG2(left[r], left[r + 1]);
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
+
+  // second column
+  for (r = 0; r < bs - 2; ++r)
+    dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]);
+  dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]);
+  dst[(bs - 1) * stride] = left[bs - 1];
+  dst++;
+
+  // rest of last row
+  for (c = 0; c < bs - 2; ++c)
+    dst[(bs - 1) * stride + c] = left[bs - 1];
+
+  for (r = bs - 2; r >= 0; --r)
+    for (c = 0; c < bs - 2; ++c)
+      dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+}
+
+#if CONFIG_MISC_FIXES
+static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                   const uint8_t *above, const uint8_t *left) {
+  int r, c;
+  (void) above;
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      dst[c] = c & 1 ? 
AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], + left[(c >> 1) + r + 2]) + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + } + dst += stride; + } +} +#endif // CONFIG_MISC_FIXES + +static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + int size; + (void)left; + for (c = 0; c < bs; ++c) { + dst[c] = AVG2(above[c], above[c + 1]); + dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) { + memcpy(dst + (r + 0) * stride, dst + (r >> 1), size); + memset(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), size); + memset(dst + (r + 1) * stride + size, above[bs - 1], bs - size); + } +} + +#if CONFIG_MISC_FIXES +static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + (void) left; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], + above[(r >> 1) + c + 2]) + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + } + dst += stride; + } +} +#endif // CONFIG_MISC_FIXES + +static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + const uint8_t above_right = above[bs - 1]; + const uint8_t *const dst_row0 = dst; + int x, size; + (void)left; + + for (x = 0; x < bs - 1; ++x) { + dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = above_right; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) { + memcpy(dst, dst_row0 + x, size); + memset(dst + size, above_right, x + 1); + dst += stride; + } +} + +#if CONFIG_MISC_FIXES +static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + (void) left; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = AVG3(above[r + c], above[r + c + 1], + above[r + c + 1 + (r + c + 2 < bs * 2)]); + } + dst += stride; + } +} +#endif // CONFIG_MISC_FIXES + +static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + + // first row + for (c = 0; c < bs; c++) + dst[c] = AVG2(above[c - 1], above[c]); + dst += stride; + + // second row + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) + dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + dst += stride; + + // the rest of first col + dst[0] = AVG3(above[-1], left[0], left[1]); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) + dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i; +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint8_t border[69]; +#else + uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right +#endif + + // dst(bs, bs - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bs - 2; ++i) { + border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + } + border[bs - 2] = AVG3(above[-1], left[0], left[1]); + border[bs - 1] 
= AVG3(left[0], above[-1], above[0]); + border[bs - 0] = AVG3(above[-1], above[0], above[1]); + // dst[0][2, size), i.e., remaining top border ascending + for (i = 0; i < bs - 2; ++i) { + border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); + } + + for (i = 0; i < bs; ++i) { + memcpy(dst + i * stride, border + bs - 1 - i, bs); + } +} + +static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + dst[0] = AVG2(above[-1], left[0]); + for (r = 1; r < bs; r++) + dst[r * stride] = AVG2(left[r - 1], left[r]); + dst++; + + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; r++) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) + dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void) left; + + for (r = 0; r < bs; r++) { + memcpy(dst, above, bs); + dst += stride; + } +} + +static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void) above; + + for (r = 0; r < bs; r++) { + memset(dst, left[r], bs); + dst += stride; + } +} + +static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + int ytop_left = above[-1]; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel(left[r] + above[c] - ytop_left); + dst += stride; + } +} + +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void) above; + (void) left; + + for (r = 0; r < bs; r++) { + memset(dst, 128, bs); + dst += stride; + } +} + +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void) above; + + for (i = 0; i < bs; i++) + sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void) left; + + for (i = 0; i < bs; i++) + sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int H = above[-1]; + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + + memset(dst + stride * 0, AVG3(H, I, J), 4); + memset(dst + stride * 1, AVG3(I, J, K), 4); + memset(dst + stride * 2, AVG3(J, K, L), 4); + memset(dst + stride * 3, AVG3(K, L, L), 4); +} + +void 
vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int H = above[-1]; + const int I = above[0]; + const int J = above[1]; + const int K = above[2]; + const int L = above[3]; + const int M = above[4]; + (void)left; + + dst[0] = AVG3(H, I, J); + dst[1] = AVG3(I, J, K); + dst[2] = AVG3(J, K, L); + dst[3] = AVG3(K, L, M); + memcpy(dst + stride * 1, dst, 4); + memcpy(dst + stride * 2, dst, 4); + memcpy(dst + stride * 3, dst, 4); +} + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + (void)above; + DST(0, 0) = AVG2(I, J); + DST(2, 0) = DST(0, 1) = AVG2(J, K); + DST(2, 1) = DST(0, 2) = AVG2(K, L); + DST(1, 0) = AVG3(I, J, K); + DST(3, 0) = DST(1, 1) = AVG3(J, K, L); + DST(3, 1) = DST(1, 2) = AVG3(K, L, L); + DST(3, 2) = DST(2, 2) = + DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; +} + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + (void)left; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + DST(3, 2) = AVG2(E, F); // differs from vp8 + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 3) = AVG3(E, F, G); // differs from vp8 +} + +void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)left; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + DST(3, 2) = AVG3(E, F, G); + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 3) = AVG3(F, G, H); +} + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)stride; + (void)left; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = H; // differs from vp8 +} + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)stride; + (void)left; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 
0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(G, H, H); +} + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)stride; + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) above; + (void) bd; + + // First column. + for (r = 0; r < bs - 1; ++r) { + dst[r * stride] = AVG2(left[r], left[r + 1]); + } + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Second column. + for (r = 0; r < bs - 2; ++r) { + dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]); + } + dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]); + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Rest of last row. + for (c = 0; c < bs - 2; ++c) + dst[(bs - 1) * stride + c] = left[bs - 1]; + + for (r = bs - 2; r >= 0; --r) { + for (c = 0; c < bs - 2; ++c) + dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; + } +} + +#if CONFIG_MISC_FIXES +static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) above; + (void) bd; + + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = c & 1 ? 
AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], + left[(c >> 1) + r + 2]) + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + } + dst += stride; + } +} +#endif // CONFIG_MISC_FIXES + +static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) left; + (void) bd; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], + above[(r >> 1) + c + 2]) + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + } + dst += stride; + } +} + +#define highbd_d63e_predictor highbd_d63_predictor + +static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) left; + (void) bd; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1], + above[r + c + 2]) + : above[bs * 2 - 1]; + } + dst += stride; + } +} + +#if CONFIG_MISC_FIXES +static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) left; + (void) bd; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = AVG3(above[r + c], above[r + c + 1], + above[r + c + 1 + (r + c + 2 < bs * 2)]); + } + dst += stride; + } +} +#endif // CONFIG_MISC_FIXES + +static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) bd; + + // first row + for (c = 0; c < bs; c++) + dst[c] = AVG2(above[c - 1], above[c]); + dst += stride; + + // second row + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) + dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + dst += stride; + + // the rest of first col + dst[0] = AVG3(above[-1], left[0], left[1]); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) + dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) bd; + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) + dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; ++r) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + + dst += stride; + for (r = 1; r < bs; ++r) { + for (c = 1; c < bs; c++) + dst[c] = dst[-stride + c - 1]; + dst += stride; + } +} + +static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) bd; + dst[0] = AVG2(above[-1], left[0]); + for (r = 1; r < bs; r++) + dst[r * stride] = AVG2(left[r - 1], left[r]); + dst++; + + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; r++) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) + dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void highbd_v_predictor(uint16_t *dst, 
ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void) left; + (void) bd; + for (r = 0; r < bs; r++) { + memcpy(dst, above, bs * sizeof(uint16_t)); + dst += stride; + } +} + +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void) above; + (void) bd; + for (r = 0; r < bs; r++) { + vpx_memset16(dst, left[r], bs); + dst += stride; + } +} + +static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + int ytop_left = above[-1]; + (void) bd; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); + dst += stride; + } +} + +static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void) above; + (void) left; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, 128 << (bd - 8), bs); + dst += stride; + } +} + +static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void) above; + (void) bd; + + for (i = 0; i < bs; i++) + sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void) left; + (void) bd; + + for (i = 0; i < bs; i++) + sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + (void) bd; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +// This serves as a wrapper function, so that all the prediction functions +// can be unified and accessed as a pointer array. Note that the boundary +// above and left are not necessarily used all the time. 
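// For illustration, the intra_pred_sized() macro below stamps out one thin
// exported wrapper per block size; e.g. intra_pred_sized(dc, 8) expands to:
//
//   void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride,
//                               const uint8_t *above, const uint8_t *left) {
//     dc_predictor(dst, stride, 8, above, left);
//   }
//
// The generic bs-parameterized helpers above stay static INLINE; only these
// fixed-size wrappers end up in the prediction function-pointer tables.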
+#define intra_pred_sized(type, size) \
+  void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  const uint8_t *above, \
+                                                  const uint8_t *left) { \
+    type##_predictor(dst, stride, size, above, left); \
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define intra_pred_highbd_sized(type, size) \
+  void vpx_highbd_##type##_predictor_##size##x##size##_c( \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+      const uint16_t *left, int bd) { \
+    highbd_##type##_predictor(dst, stride, size, above, left, bd); \
+  }
+
+#define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 4) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_highbd_sized(type, 4) \
+  intra_pred_highbd_sized(type, 8) \
+  intra_pred_highbd_sized(type, 16) \
+  intra_pred_highbd_sized(type, 32)
+
+#define intra_pred_no_4x4(type) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32) \
+  intra_pred_highbd_sized(type, 4) \
+  intra_pred_highbd_sized(type, 8) \
+  intra_pred_highbd_sized(type, 16) \
+  intra_pred_highbd_sized(type, 32)
+
+#else
+#define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 4) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32)
+
+#define intra_pred_no_4x4(type) \
+  intra_pred_sized(type, 8) \
+  intra_pred_sized(type, 16) \
+  intra_pred_sized(type, 32)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+intra_pred_no_4x4(d207)
+intra_pred_no_4x4(d63)
+intra_pred_no_4x4(d45)
+#if CONFIG_MISC_FIXES
+intra_pred_allsizes(d207e)
+intra_pred_allsizes(d63e)
+intra_pred_no_4x4(d45e)
+#endif
+intra_pred_no_4x4(d117)
+intra_pred_no_4x4(d135)
+intra_pred_no_4x4(d153)
+intra_pred_allsizes(v)
+intra_pred_allsizes(h)
+intra_pred_allsizes(tm)
+intra_pred_allsizes(dc_128)
+intra_pred_allsizes(dc_left)
+intra_pred_allsizes(dc_top)
+intra_pred_allsizes(dc)
+#undef intra_pred_allsizes
diff --git a/thirdparty/libvpx/vpx_dsp/inv_txfm.c b/thirdparty/libvpx/vpx_dsp/inv_txfm.c
new file mode 100644
index 000000000000..e18d31d7aa2e
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/inv_txfm.c
@@ -0,0 +1,2518 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/inv_txfm.h"
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+   0.5 shifts per pixel. 
*/ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = WRAPLOW(a1); + op[1] = WRAPLOW(b1); + op[2] = WRAPLOW(c1); + op[3] = WRAPLOW(d1); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1)); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1)); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1)); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1)); + + ip++; + dest++; + } +} + +void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = WRAPLOW(a1); + op[1] = op[2] = op[3] = WRAPLOW(e1); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); + dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); + dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); + dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); + ip++; + dest++; + } +} + +void idct4_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = WRAPLOW(dct_const_round_shift(temp1)); + step[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = WRAPLOW(dct_const_round_shift(temp1)); + step[3] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3]); + output[1] = WRAPLOW(step[1] + step[2]); + output[2] = WRAPLOW(step[1] - step[2]); + output[3] = WRAPLOW(step[0] - step[3]); +} + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + idct4_c(input, outptr); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + idct4_c(temp_in, temp_out); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 4)); + } + } +} + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, + int dest_stride) { + int i; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = clip_pixel_add(dest[0], a1); + dest[1] = clip_pixel_add(dest[1], a1); + dest[2] = clip_pixel_add(dest[2], a1); + dest[3] = clip_pixel_add(dest[3], a1); + dest += dest_stride; + } +} + +void idct8_c(const tran_low_t 
*input, tran_low_t *output) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + temp1 = (step1[0] + step1[2]) * cospi_16_64; + temp2 = (step1[0] - step1[2]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + // stage 3 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); +} + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + for (i = 0; i < 8; ++i) { + idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) + dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void iadst4_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = WRAPLOW(x0 - x2 + x3); + + s0 
= s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); + output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); + output[2] = WRAPLOW(dct_const_round_shift(s2)); + output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); +} + +void iadst8_c(const tran_low_t *input, tran_low_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); + s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); + s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); + s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); + s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); + s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); + s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); + s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); + + x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); + + // stage 2 + s0 = (int)x0; + s1 = (int)x1; + s2 = (int)x2; + s3 = (int)x3; + s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); + s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); + s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); + s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + + // stage 3 + s2 = (int)(cospi_16_64 * (x2 + x3)); + s3 = (int)(cospi_16_64 * (x2 - x3)); + s6 = (int)(cospi_16_64 * (x6 + x7)); + s7 = (int)(cospi_16_64 * (x6 - x7)); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x4); + output[2] = WRAPLOW(x6); + output[3] = WRAPLOW(-x2); + output[4] = WRAPLOW(x3); + output[5] = WRAPLOW(-x7); + output[6] = WRAPLOW(x5); + output[7] = WRAPLOW(-x1); +} + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + 
ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void idct16_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[16], step2[16]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = input[0/2]; + step1[1] = input[16/2]; + step1[2] = input[8/2]; + step1[3] = input[24/2]; + step1[4] = input[4/2]; + step1[5] = input[20/2]; + step1[6] = input[12/2]; + step1[7] = input[28/2]; + step1[8] = input[2/2]; + step1[9] = input[18/2]; + step1[10] = input[10/2]; + step1[11] = input[26/2]; + step1[12] = input[6/2]; + step1[13] = input[22/2]; + step1[14] = input[14/2]; + step1[15] = input[30/2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - 
step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = WRAPLOW(step2[0] + step2[15]); + output[1] = WRAPLOW(step2[1] + step2[14]); + output[2] = WRAPLOW(step2[2] + step2[13]); + output[3] = WRAPLOW(step2[3] + step2[12]); + output[4] = WRAPLOW(step2[4] + step2[11]); + output[5] = WRAPLOW(step2[5] + step2[10]); + output[6] = WRAPLOW(step2[6] + step2[9]); + output[7] = WRAPLOW(step2[7] + step2[8]); + output[8] = WRAPLOW(step2[7] - step2[8]); + output[9] = WRAPLOW(step2[6] - step2[9]); + output[10] = WRAPLOW(step2[5] - step2[10]); + output[11] = WRAPLOW(step2[4] - step2[11]); + output[12] = WRAPLOW(step2[3] - step2[12]); + output[13] = WRAPLOW(step2[2] - step2[13]); + output[14] = WRAPLOW(step2[1] - step2[14]); + output[15] = WRAPLOW(step2[0] - step2[15]); +} + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void iadst16_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = 
input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = output[8] + = output[9] = output[10] = output[11] = output[12] + = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * 
cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); +} + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. 
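+// Editor's note (hedged sketch, not upstream code): the 4-row shortcut
+// described above is safe because idct16_c() maps an all-zero input row to
+// an all-zero output row, and `out` is zero-initialized by `= { 0 }`.
+// Running the skipped rows would therefore be a no-op, roughly:
+//
+//   for (i = 4; i < 16; ++i)  // rows 4..15 hold only zero coefficients
+//     memset(&out[i * 16], 0, 16 * sizeof(tran_low_t));  // already zeroed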
+ for (i = 0; i < 4; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) + dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void idct32_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = WRAPLOW(dct_const_round_shift(temp1)); + step1[31] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 
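+// Editor's note (hedged sketch, not upstream code): every temp1/temp2 pair
+// in these stages is the same Q14 "rotate and round" idiom. In libvpx,
+// cospi_N_64 is approximately round(cos(N * pi / 64) * (1 << 14)) and
+// dct_const_round_shift() rounds away the 14 fraction bits, so the pair
+// above could be factored as (butterfly_q14 is a hypothetical helper):
+//
+//   static void butterfly_q14(tran_high_t a, tran_high_t b,
+//                             tran_high_t c0, tran_high_t c1,
+//                             tran_low_t *lo, tran_low_t *hi) {
+//     *lo = WRAPLOW(dct_const_round_shift(a * c0 - b * c1));
+//     *hi = WRAPLOW(dct_const_round_shift(a * c1 + b * c0));
+//   }
+//
+//   butterfly_q14(step1[8], step1[15], cospi_30_64, cospi_2_64,
+//                 &step2[8], &step2[15]);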
+ + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + step2[16] = WRAPLOW(step1[16] + step1[17]); + step2[17] = WRAPLOW(step1[16] - step1[17]); + step2[18] = WRAPLOW(-step1[18] + step1[19]); + step2[19] = WRAPLOW(step1[18] + step1[19]); + step2[20] = WRAPLOW(step1[20] + step1[21]); + step2[21] = WRAPLOW(step1[20] - step1[21]); + step2[22] = WRAPLOW(-step1[22] + step1[23]); + step2[23] = WRAPLOW(step1[22] + step1[23]); + step2[24] = WRAPLOW(step1[24] + step1[25]); + step2[25] = WRAPLOW(step1[24] - step1[25]); + step2[26] = WRAPLOW(-step1[26] + step1[27]); + step2[27] = WRAPLOW(step1[26] + step1[27]); + step2[28] = WRAPLOW(step1[28] + step1[29]); + step2[29] = WRAPLOW(step1[28] - step1[29]); + step2[30] = WRAPLOW(-step1[30] + step1[31]); + step2[31] = WRAPLOW(step1[30] + step1[31]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) 
* cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = WRAPLOW(step1[16] + step1[19]); + step2[17] = WRAPLOW(step1[17] + step1[18]); + step2[18] = WRAPLOW(step1[17] - step1[18]); + step2[19] = WRAPLOW(step1[16] - step1[19]); + step2[20] = WRAPLOW(-step1[20] + step1[23]); + step2[21] = WRAPLOW(-step1[21] + step1[22]); + step2[22] = WRAPLOW(step1[21] + step1[22]); + step2[23] = WRAPLOW(step1[20] + step1[23]); + + step2[24] = WRAPLOW(step1[24] + step1[27]); + step2[25] = WRAPLOW(step1[25] + step1[26]); + step2[26] = WRAPLOW(step1[25] - step1[26]); + step2[27] = WRAPLOW(step1[24] - step1[27]); + step2[28] = WRAPLOW(-step1[28] + step1[31]); + step2[29] = WRAPLOW(-step1[29] + step1[30]); + step2[30] = WRAPLOW(step1[29] + step1[30]); + step2[31] = WRAPLOW(step1[28] + step1[31]); + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = 
WRAPLOW(dct_const_round_shift(temp2)); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = WRAPLOW(step1[16] + step1[23]); + step2[17] = WRAPLOW(step1[17] + step1[22]); + step2[18] = WRAPLOW(step1[18] + step1[21]); + step2[19] = WRAPLOW(step1[19] + step1[20]); + step2[20] = WRAPLOW(step1[19] - step1[20]); + step2[21] = WRAPLOW(step1[18] - step1[21]); + step2[22] = WRAPLOW(step1[17] - step1[22]); + step2[23] = WRAPLOW(step1[16] - step1[23]); + + step2[24] = WRAPLOW(-step1[24] + step1[31]); + step2[25] = WRAPLOW(-step1[25] + step1[30]); + step2[26] = WRAPLOW(-step1[26] + step1[29]); + step2[27] = WRAPLOW(-step1[27] + step1[28]); + step2[28] = WRAPLOW(step1[27] + step1[28]); + step2[29] = WRAPLOW(step1[26] + step1[29]); + step2[30] = WRAPLOW(step1[25] + step1[30]); + step2[31] = WRAPLOW(step1[24] + step1[31]); + + // stage 7 + step1[0] = WRAPLOW(step2[0] + step2[15]); + step1[1] = WRAPLOW(step2[1] + step2[14]); + step1[2] = WRAPLOW(step2[2] + step2[13]); + step1[3] = WRAPLOW(step2[3] + step2[12]); + step1[4] = WRAPLOW(step2[4] + step2[11]); + step1[5] = WRAPLOW(step2[5] + step2[10]); + step1[6] = WRAPLOW(step2[6] + step2[9]); + step1[7] = WRAPLOW(step2[7] + step2[8]); + step1[8] = WRAPLOW(step2[7] - step2[8]); + step1[9] = WRAPLOW(step2[6] - step2[9]); + step1[10] = WRAPLOW(step2[5] - step2[10]); + step1[11] = WRAPLOW(step2[4] - step2[11]); + step1[12] = WRAPLOW(step2[3] - step2[12]); + step1[13] = WRAPLOW(step2[2] - step2[13]); + step1[14] = WRAPLOW(step2[1] - step2[14]); + step1[15] = WRAPLOW(step2[0] - step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final 
stage + output[0] = WRAPLOW(step1[0] + step1[31]); + output[1] = WRAPLOW(step1[1] + step1[30]); + output[2] = WRAPLOW(step1[2] + step1[29]); + output[3] = WRAPLOW(step1[3] + step1[28]); + output[4] = WRAPLOW(step1[4] + step1[27]); + output[5] = WRAPLOW(step1[5] + step1[26]); + output[6] = WRAPLOW(step1[6] + step1[25]); + output[7] = WRAPLOW(step1[7] + step1[24]); + output[8] = WRAPLOW(step1[8] + step1[23]); + output[9] = WRAPLOW(step1[9] + step1[22]); + output[10] = WRAPLOW(step1[10] + step1[21]); + output[11] = WRAPLOW(step1[11] + step1[20]); + output[12] = WRAPLOW(step1[12] + step1[19]); + output[13] = WRAPLOW(step1[13] + step1[18]); + output[14] = WRAPLOW(step1[14] + step1[17]); + output[15] = WRAPLOW(step1[15] + step1[16]); + output[16] = WRAPLOW(step1[15] - step1[16]); + output[17] = WRAPLOW(step1[14] - step1[17]); + output[18] = WRAPLOW(step1[13] - step1[18]); + output[19] = WRAPLOW(step1[12] - step1[19]); + output[20] = WRAPLOW(step1[11] - step1[20]); + output[21] = WRAPLOW(step1[10] - step1[21]); + output[22] = WRAPLOW(step1[9] - step1[22]); + output[23] = WRAPLOW(step1[8] - step1[23]); + output[24] = WRAPLOW(step1[7] - step1[24]); + output[25] = WRAPLOW(step1[6] - step1[25]); + output[26] = WRAPLOW(step1[5] - step1[26]); + output[27] = WRAPLOW(step1[4] - step1[27]); + output[28] = WRAPLOW(step1[3] - step1[28]); + output[29] = WRAPLOW(step1[2] - step1[29]); + output[30] = WRAPLOW(step1[1] - step1[30]); + output[31] = WRAPLOW(step1[0] - step1[31]); +} + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + for (i = 0; i < 32; ++i) { + int16_t zero_coeff[16]; + for (j = 0; j < 16; ++j) + zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 8; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 4; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 2; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + + if (zero_coeff[0] | zero_coeff[1]) + idct32_c(input, outptr); + else + memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32] = {0}; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 16x16 has non-zero coeff + for (i = 0; i < 16; ++i) { + idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32] = {0}; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; 
++j) + temp_in[j] = out[j * 32 + i]; + idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) + dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = HIGHBD_WRAPLOW(b1, bd); + op[2] = HIGHBD_WRAPLOW(c1, bd); + op[3] = HIGHBD_WRAPLOW(d1, bd); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], + HIGHBD_WRAPLOW(a1, bd), bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], + HIGHBD_WRAPLOW(b1, bd), bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], + HIGHBD_WRAPLOW(c1, bd), bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], + HIGHBD_WRAPLOW(d1, bd), bd); + + ip++; + dest++; + } +} + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void) bd; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = highbd_clip_pixel_add( + dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = highbd_clip_pixel_add( + dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = highbd_clip_pixel_add( + dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = highbd_clip_pixel_add( + dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + +void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + (void) bd; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step[3] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + // stage 2 + output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); + output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); + output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); + output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); +} + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 4; ++i) { + vpx_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + vpx_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } +} + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1; + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); + dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); + dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); + dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); + dest += dest_stride; + } +} + +void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + // stage 2 & stage 3 - even half + vpx_highbd_idct4_c(step1, step1, bd); + + // stage 2 - odd half + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + // stage 3 - odd half + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + // stage 4 + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); +} + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + 
tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. + for (i = 0; i < 8; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Then transform columns. + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + tran_high_t a1; + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) + dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} + +void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + (void) bd; + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. 
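+// Editor's note (hedged arithmetic check): with ~14b signed inputs
+// (|x| < 2^13) and sinpi constants below 2^14, each product stays under
+// 2^27, and the three-term sums s0 and s1 stay under 3 * 2^27 < 2^29,
+// matching the 29b bound above; highbd_dct_const_round_shift() then drops
+// DCT_CONST_BITS (14) bits, leaving roughly 15 signed bits for the output.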
+ output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd); +} + +void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[7]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[5]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[3]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[1]; + tran_low_t x7 = input[6]; + (void) bd; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x4, bd); + output[2] = HIGHBD_WRAPLOW(x6, bd); + output[3] = HIGHBD_WRAPLOW(-x2, bd); + output[4] = HIGHBD_WRAPLOW(x3, bd); + output[5] = HIGHBD_WRAPLOW(-x7, bd); + output[6] = HIGHBD_WRAPLOW(x5, bd); + output[7] = HIGHBD_WRAPLOW(-x1, bd); +} + +void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. + // Only first 4 row has non-zero coefs. + for (i = 0; i < 4; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + // Then transform columns. 
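+// Editor's note (hedged sketch, not upstream code): this row-then-column
+// split is the separable 2-D pattern shared by all the *_add_c wrappers in
+// this file: a row pass fills the scratch block `out`, then each column is
+// gathered with stride N, inverse-transformed, rounded by the per-size
+// shift (4 for 4x4, 5 for 8x8, 6 for 16x16 and 32x32) and clip-added onto
+// the prediction. Here `idct1d` and `clip_add` are stand-ins for the
+// size-specific 1-D transform and clip_pixel_add/highbd_clip_pixel_add:
+//
+//   for (i = 0; i < N; ++i) {                        // column i
+//     for (j = 0; j < N; ++j) temp_in[j] = out[j * N + i];
+//     idct1d(temp_in, temp_out);                     // 1-D inverse pass
+//     for (j = 0; j < N; ++j)
+//       dest[j * stride + i] = clip_add(dest[j * stride + i],
+//                                       ROUND_POWER_OF_TWO(temp_out[j], shift));
+//   }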
+ for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[16], step2[16]; + tran_high_t temp1, temp2; + (void) bd; + + // stage 1 + step1[0] = input[0/2]; + step1[1] = input[16/2]; + step1[2] = input[8/2]; + step1[3] = input[24/2]; + step1[4] = input[4/2]; + step1[5] = input[20/2]; + step1[6] = input[12/2]; + step1[7] = input[28/2]; + step1[8] = input[2/2]; + step1[9] = input[18/2]; + step1[10] = input[10/2]; + step1[11] = input[26/2]; + step1[12] = input[6/2]; + step1[13] = input[22/2]; + step1[14] = input[14/2]; + step1[15] = input[30/2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * 
cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); + + // stage 6 + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); 
+ output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); +} + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. + for (i = 0; i < 16; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns. + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_low_t x0 = input[15]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[13]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[11]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[9]; + tran_low_t x7 = input[6]; + tran_low_t x8 = input[7]; + tran_low_t x9 = input[8]; + tran_low_t x10 = input[5]; + tran_low_t x11 = input[10]; + tran_low_t x12 = input[3]; + tran_low_t x13 = input[12]; + tran_low_t x14 = input[1]; + tran_low_t x15 = input[14]; + (void) bd; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); + x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd); + x14 = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = HIGHBD_WRAPLOW(s0 + s4, bd); + x1 = HIGHBD_WRAPLOW(s1 + s5, bd); + x2 = HIGHBD_WRAPLOW(s2 + s6, bd); + x3 = HIGHBD_WRAPLOW(s3 + s7, bd); + x4 = HIGHBD_WRAPLOW(s0 - s4, bd); + x5 = HIGHBD_WRAPLOW(s1 - s5, bd); + x6 = HIGHBD_WRAPLOW(s2 - s6, bd); + x7 = HIGHBD_WRAPLOW(s3 - s7, bd); + x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + x8 = HIGHBD_WRAPLOW(s8 + s10, bd); + x9 = HIGHBD_WRAPLOW(s9 + s11, bd); + x10 = HIGHBD_WRAPLOW(s8 - s10, bd); + x11 = HIGHBD_WRAPLOW(s9 - s11, bd); + x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd); + + output[0] = 
HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x8, bd); + output[2] = HIGHBD_WRAPLOW(x12, bd); + output[3] = HIGHBD_WRAPLOW(-x4, bd); + output[4] = HIGHBD_WRAPLOW(x6, bd); + output[5] = HIGHBD_WRAPLOW(x14, bd); + output[6] = HIGHBD_WRAPLOW(x10, bd); + output[7] = HIGHBD_WRAPLOW(x2, bd); + output[8] = HIGHBD_WRAPLOW(x3, bd); + output[9] = HIGHBD_WRAPLOW(x11, bd); + output[10] = HIGHBD_WRAPLOW(x15, bd); + output[11] = HIGHBD_WRAPLOW(x7, bd); + output[12] = HIGHBD_WRAPLOW(x5, bd); + output[13] = HIGHBD_WRAPLOW(-x13, bd); + output[14] = HIGHBD_WRAPLOW(x9, bd); + output[15] = HIGHBD_WRAPLOW(-x1, bd); +} + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + for (i = 0; i < 4; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns. + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + tran_high_t a1; + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) + dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} + +static void highbd_idct32_c(const tran_low_t *input, + tran_low_t *output, int bd) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; + (void) bd; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[28] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); + step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); + step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); + step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); + step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); + step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); + step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); + step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); + step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); + step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + 
step1[18], bd); + step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); + step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); + step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); + step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd); + step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd); + step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd); + step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd); + step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd); + step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd); + + // stage 5 + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = 
(-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd); + step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd); + step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd); + step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd); + step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd); + step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd); + step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd); + step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd); + step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd); + + // stage 7 + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = 
HIGHBD_WRAPLOW(step1[0] + step1[31], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd); + output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd); + output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd); + output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd); + output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd); + output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd); + output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd); + output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd); + output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd); + output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd); + output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd); + output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd); + output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd); + output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd); + output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd); + output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd); + output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd); + output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd); + output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd); + output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd); + output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd); + output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd); + output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd); + output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd); + output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd); + output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd); + output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd); + output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd); + output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); +} + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 32; ++i) { + tran_low_t zero_coeff[16]; + for (j = 0; j < 16; ++j) + zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 8; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 4; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 2; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + + if (zero_coeff[0] | zero_coeff[1]) + highbd_idct32_c(input, outptr, bd); + else + memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[32 * 32] = {0}; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + // Only upper-left 8x8 has non-zero coeff. 
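+ // (This "_34" path is used when the block's end-of-block count is at + // most 34; all such coefficients fall inside the top-left 8x8, so only + // the first 8 rows need a full 32-point IDCT pass.)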
+ for (i = 0; i < 8; ++i) { + highbd_idct32_c(input, outptr, bd); + input += 32; + outptr += 32; + } + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } +} + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + int a1; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 6); + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) + dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vpx_dsp/inv_txfm.h b/thirdparty/libvpx/vpx_dsp/inv_txfm.h new file mode 100644 index 000000000000..9cfe1be3a7dd --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/inv_txfm.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_INV_TXFM_H_ +#define VPX_DSP_INV_TXFM_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE tran_high_t check_range(tran_high_t input) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid VP9 input streams, intermediate stage coefficients should always + // stay within the range of a signed 16 bit integer. Coefficients can go out + // of this range for invalid/corrupt VP9 streams. However, strictly checking + // this range for every intermediate coefficient can be burdensome for a + // decoder, therefore the following assertion is only enabled when configured + // with --enable-coefficient-range-checking.
+ assert(INT16_MIN <= input); + assert(input <= INT16_MAX); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + return input; +} + +static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return (tran_high_t)rv; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE tran_high_t highbd_check_range(tran_high_t input, + int bd) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid highbitdepth VP9 streams, intermediate stage coefficients will + // stay within the ranges: + // - 8 bit: signed 16 bit integer + // - 10 bit: signed 18 bit integer + // - 12 bit: signed 20 bit integer + const int32_t int_max = (1 << (7 + bd)) - 1; + const int32_t int_min = -int_max - 1; + assert(int_min <= input); + assert(input <= int_max); + (void) int_min; +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + (void) bd; + return input; +} + +static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return (tran_high_t)rv; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_EMULATE_HARDWARE +// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a +// non-normative method to handle overflows. A stream that causes +// overflows in the inverse transform is considered invalid in VP9, +// and a hardware implementer is free to choose any reasonable +// method to handle overflows. However to aid in hardware +// verification they can use a specific implementation of the +// WRAPLOW() macro below that is identical to their intended +// hardware implementation (and also use configure options to trigger +// the C-implementation of the transform). +// +// The particular WRAPLOW implementation below performs strict +// overflow wrapping to match common hardware implementations. 
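+// As a concrete illustration: with bd = 8, HIGHBD_WRAPLOW(x, bd) below +// reduces to ((x << 16) >> 16), i.e. sign-extension from bit 15, which +// reproduces the wraparound of a signed 16 bit hardware register.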
+// bd of 8 uses trans_low with 16bits, need to remove 16bits +// bd of 10 uses trans_low with 18bits, need to remove 14bits +// bd of 12 uses trans_low with 20bits, need to remove 12bits +// bd of x uses trans_low with 8+x bits, need to remove 24-x bits + +#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) \ + ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) +#endif // CONFIG_VP9_HIGHBITDEPTH + +#else // CONFIG_EMULATE_HARDWARE + +#define WRAPLOW(x) ((int32_t)check_range(x)) +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) \ + ((int32_t)highbd_check_range((x), bd)) +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_EMULATE_HARDWARE + +void idct4_c(const tran_low_t *input, tran_low_t *output); +void idct8_c(const tran_low_t *input, tran_low_t *output); +void idct16_c(const tran_low_t *input, tran_low_t *output); +void idct32_c(const tran_low_t *input, tran_low_t *output); +void iadst4_c(const tran_low_t *input, tran_low_t *output); +void iadst8_c(const tran_low_t *input, tran_low_t *output); +void iadst16_c(const tran_low_t *input, tran_low_t *output); + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd); +void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd); +void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd); + +void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd); +void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd); +void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); + +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + trans = HIGHBD_WRAPLOW(trans, bd); + return clip_pixel_highbd(dest + (int)trans, bd); +} +#endif + +static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { + trans = WRAPLOW(trans); + return clip_pixel(dest + (int)trans); +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_INV_TXFM_H_ diff --git a/thirdparty/libvpx/vpx_dsp/loopfilter.c b/thirdparty/libvpx/vpx_dsp/loopfilter.c new file mode 100644 index 000000000000..645a1ab95ee9 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/loopfilter.c @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <stdlib.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +static INLINE int8_t signed_char_clamp(int t) { + return (int8_t)clamp(t, -128, 127); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE int16_t signed_char_clamp_high(int t, int bd) { + switch (bd) { + case 10: + return (int16_t)clamp(t, -128*4, 128*4-1); + case 12: + return (int16_t)clamp(t, -128*16, 128*16-1); + case 8: + default: + return (int16_t)clamp(t, -128, 128-1); + } +} +#endif + +// should we apply any filter at all: 11111111 yes, 00000000 no +static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, + uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p3 - p2) > limit) * -1; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(q3 - q2) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask4(uint8_t thresh, + uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + mask |= (abs(p3 - p0) > thresh) * -1; + mask |= (abs(q3 - q0) > thresh) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask5(uint8_t thresh, + uint8_t p4, uint8_t p3, + uint8_t p2, uint8_t p1, + uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, + uint8_t q3, uint8_t q4) { + int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + return ~mask; +} + +// is there high edge variance internal edge: 11111111 yes, 00000000 no +static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { + int8_t hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { + int8_t filter1, filter2; + + const int8_t ps1 = (int8_t) *op1 ^ 0x80; + const int8_t ps0 = (int8_t) *op0 ^ 0x80; + const int8_t qs0 = (int8_t) *oq0 ^ 0x80; + const int8_t qs1 = (int8_t) *oq1 ^ 0x80; + const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); + + // add outer taps if we have high edge variance + int8_t filter = signed_char_clamp(ps1 - qs1) & hev; + + // inner taps + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; + + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; + *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; + + // outer tap adjustments + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; + *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; +} + +void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd
instructions. + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } +} + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); +} + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); + s += pitch; + } +} + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { + if (flat && mask) { + const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
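+ // Each call filters one 8-pixel edge segment; per pixel, flat_mask4 decides + // whether filter8 may apply the smoother 7-tap kernel or must fall back to + // the 4-tap filter4 path.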
+ for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p); + ++s; + } +} + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); +} + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3); + s += pitch; + } +} + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter16(int8_t mask, uint8_t thresh, + uint8_t flat, uint8_t flat2, + uint8_t *op7, uint8_t *op6, + uint8_t *op5, uint8_t *op4, + uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, + uint8_t *oq6, uint8_t *oq7) { + if (flat2 && flat && mask) { + const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, + p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, + q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + + q0, 4); + *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + + q0 + q1 + q2 + q3 + q4, 4); + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + + q0 + q1 + q2 + q3 + q4 + q5, 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + + q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + + q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); + *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); + *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 
4); + *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO(p0 + + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} + +static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask5(1, + s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, + q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); + + filter16(mask, *thresh, flat, flat2, + s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p, + s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); + ++s; + } +} + +void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +} + +void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); +} + +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + int i; + + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, + q0, s[4], s[5], s[6], s[7]); + + filter16(mask, *thresh, flat, flat2, + s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); + s += p; + } +} + +void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); +} + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); +} + +#if CONFIG_VP9_HIGHBITDEPTH +// Should we apply any filter at all: 11111111 yes, 00000000 no ? 
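+// The high bitdepth helpers below mirror the 8 bit versions above; the only +// difference is that limit/blimit/thresh are scaled up by (bd - 8) bits to +// match the wider sample range.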
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, + uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, + uint16_t q2, uint16_t q3, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p3 - p2) > limit16) * -1; + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(q3 - q2) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask4(uint8_t thresh, + uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, + uint16_t q2, uint16_t q3, int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + mask |= (abs(p3 - p0) > thresh16) * -1; + mask |= (abs(q3 - q0) > thresh16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask5(uint8_t thresh, + uint16_t p4, uint16_t p3, + uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, + uint16_t q3, uint16_t q4, int bd) { + int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p4 - p0) > thresh16) * -1; + mask |= (abs(q4 - q0) > thresh16) * -1; + return ~mask; +} + +// Is there high edge variance internal edge: +// 11111111_11111111 yes, 00000000_00000000 no ? +static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, int bd) { + int16_t hev = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + hev |= (abs(p1 - p0) > thresh16) * -1; + hev |= (abs(q1 - q0) > thresh16) * -1; + return hev; +} + +static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + int bd) { + int16_t filter1, filter2; + // ^0x80 equivalent to subtracting 0x80 from the values to turn them + // into -128 to +127 instead of 0 to 255. + int shift = bd - 8; + const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); + const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); + const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); + const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); + const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); + + // Add outer taps if we have high edge variance. + int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; + + // Inner taps. + filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; + + // Save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way. + filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; + filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; + + *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); + *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); + + // Outer tap adjustments. 
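+ // (Masking with ~hev means the outer taps p1/q1 are only adjusted where + // edge variance is low; high-variance pixels keep their original values.)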
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); + *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); +} + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4 * p]; + const uint16_t p2 = s[-3 * p]; + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const uint16_t q2 = s[2 * p]; + const uint16_t q3 = s[3 * p]; + const int8_t mask = highbd_filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + ++s; + } +} + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1, + int bd) { + vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = highbd_filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); + s += pitch; + } +} + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1, + int bd) { + vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, + thresh1, bd); +} + +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, + uint16_t *op3, uint16_t *op2, + uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, + uint16_t *oq2, uint16_t *oq3, int bd) { + if (flat && mask) { + const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
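+ // (The "chars" note above is inherited from the 8 bit code; here samples + // are uint16_t and thresholds are widened inside the highbd mask helpers.)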
+ for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = highbd_filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, + bd); + highbd_filter8(mask, *thresh, flat, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + ++s; + } +} + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1, + int bd) { + vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = highbd_filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, + bd); + highbd_filter8(mask, *thresh, flat, + s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3, + bd); + s += pitch; + } +} + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1, + int bd) { + vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, + thresh1, bd); +} + +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, + uint8_t flat, uint8_t flat2, + uint16_t *op7, uint16_t *op6, + uint16_t *op5, uint16_t *op4, + uint16_t *op3, uint16_t *op2, + uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, + uint16_t *oq2, uint16_t *oq3, + uint16_t *oq4, uint16_t *oq5, + uint16_t *oq6, uint16_t *oq7, int bd) { + if (flat2 && flat && mask) { + const uint16_t p7 = *op7; + const uint16_t p6 = *op6; + const uint16_t p5 = *op5; + const uint16_t p4 = *op4; + const uint16_t p3 = *op3; + const uint16_t p2 = *op2; + const uint16_t p1 = *op1; + const uint16_t p0 = *op0; + const uint16_t q0 = *oq0; + const uint16_t q1 = *oq1; + const uint16_t q2 = *oq2; + const uint16_t q3 = *oq3; + const uint16_t q4 = *oq4; + const uint16_t q5 = *oq5; + const uint16_t q6 = *oq6; + const uint16_t q7 = *oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + + q0, 4); + *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + + q0 + q1 + q2 + q3 + q4, 4); + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + + q0 + q1 + q2 + q3 + q4 + q5, 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + + 
q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + + q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); + *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); + *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO(p0 + + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + } else { + highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + bd); + } +} + +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count, int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint16_t p3 = s[-4 * p]; + const uint16_t p2 = s[-3 * p]; + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const uint16_t q2 = s[2 * p]; + const uint16_t q3 = s[3 * p]; + const int8_t mask = highbd_filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, + bd); + const int8_t flat2 = highbd_flat_mask5( + 1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, + q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); + + highbd_filter16(mask, *thresh, flat, flat2, + s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p, + s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p, + bd); + ++s; + } +} + +void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +} + +void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); +} + +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count, int bd) { + int i; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4]; + const uint16_t p2 = s[-3]; + const uint16_t p1 = s[-2]; + const uint16_t p0 = s[-1]; + const uint16_t q0 = s[0]; + const uint16_t q1 = s[1]; + const uint16_t q2 = s[2]; + const uint16_t q3 = s[3]; + const int8_t mask = highbd_filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, + bd); + const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, + q0, s[4], s[5], s[6], s[7], bd); + + highbd_filter16(mask, *thresh, flat, flat2, + s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, + bd); + s += p; + } +} + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); +} + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t 
*limit, + const uint8_t *thresh, + int bd) { + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vpx_dsp/prob.c b/thirdparty/libvpx/vpx_dsp/prob.c new file mode 100644 index 000000000000..639d24dd2f01 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/prob.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./prob.h" + +const uint8_t vpx_norm[256] = { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static unsigned int tree_merge_probs_impl(unsigned int i, + const vpx_tree_index *tree, + const vpx_prob *pre_probs, + const unsigned int *counts, + vpx_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct); + return left_count + right_count; +} + +void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs, + const unsigned int *counts, vpx_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, probs); +} diff --git a/thirdparty/libvpx/vpx_dsp/prob.h b/thirdparty/libvpx/vpx_dsp/prob.h new file mode 100644 index 000000000000..c3cb103ffb5e --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/prob.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_PROB_H_ +#define VPX_DSP_PROB_H_ + +#include "./vpx_config.h" +#include "./vpx_dsp_common.h" + +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint8_t vpx_prob; + +#define MAX_PROB 255 + +#define vpx_prob_half ((vpx_prob) 128) + +typedef int8_t vpx_tree_index; + +#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2) + +#define vpx_complement(x) (255 - x) + +#define MODE_MV_COUNT_SAT 20 + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of vpx_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const vpx_tree_index vpx_tree[]; + +static INLINE vpx_prob clip_prob(int p) { + return (p > 255) ? 255 : (p < 1) ? 1 : p; +} + +static INLINE vpx_prob get_prob(int num, int den) { + return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); +} + +static INLINE vpx_prob get_binary_prob(int n0, int n1) { + return get_prob(n0, n0 + n1); +} + +/* This function assumes prob1 and prob2 are already within [1,255] range. */ +static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) { + return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); +} + +static INLINE vpx_prob merge_probs(vpx_prob pre_prob, + const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + const vpx_prob prob = get_binary_prob(ct[0], ct[1]); + const unsigned int count = VPXMIN(ct[0] + ct[1], count_sat); + const unsigned int factor = max_update_factor * count / count_sat; + return weighted_prob(pre_prob, prob, factor); +} + +// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; +static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = { + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, + 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 +}; + +static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob, + const unsigned int ct[2]) { + const unsigned int den = ct[0] + ct[1]; + if (den == 0) { + return pre_prob; + } else { + const unsigned int count = VPXMIN(den, MODE_MV_COUNT_SAT); + const unsigned int factor = count_to_update_factor[count]; + const vpx_prob prob = + clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den); + return weighted_prob(pre_prob, prob, factor); + } +} + +void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs, + const unsigned int *counts, vpx_prob *probs); + + +DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_PROB_H_ diff --git a/thirdparty/libvpx/vpx_dsp/txfm_common.h b/thirdparty/libvpx/vpx_dsp/txfm_common.h new file mode 100644 index 000000000000..442e6a57b5ba --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/txfm_common.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_TXFM_COMMON_H_ +#define VPX_DSP_TXFM_COMMON_H_ + +#include "vpx_dsp/vpx_dsp_common.h" + +// Constants and Macros used by all idct/dct functions +#define DCT_CONST_BITS 14 +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) + +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) + +// Constants: +// for (int i = 1; i< 32; ++i) +// printf("static const int cospi_%d_64 = %.0f;\n", i, +// round(16384 * cos(i*M_PI/64))); +// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) +static const tran_high_t cospi_1_64 = 16364; +static const tran_high_t cospi_2_64 = 16305; +static const tran_high_t cospi_3_64 = 16207; +static const tran_high_t cospi_4_64 = 16069; +static const tran_high_t cospi_5_64 = 15893; +static const tran_high_t cospi_6_64 = 15679; +static const tran_high_t cospi_7_64 = 15426; +static const tran_high_t cospi_8_64 = 15137; +static const tran_high_t cospi_9_64 = 14811; +static const tran_high_t cospi_10_64 = 14449; +static const tran_high_t cospi_11_64 = 14053; +static const tran_high_t cospi_12_64 = 13623; +static const tran_high_t cospi_13_64 = 13160; +static const tran_high_t cospi_14_64 = 12665; +static const tran_high_t cospi_15_64 = 12140; +static const tran_high_t cospi_16_64 = 11585; +static const tran_high_t cospi_17_64 = 11003; +static const tran_high_t cospi_18_64 = 10394; +static const tran_high_t cospi_19_64 = 9760; +static const tran_high_t cospi_20_64 = 9102; +static const tran_high_t cospi_21_64 = 8423; +static const tran_high_t cospi_22_64 = 7723; +static const tran_high_t cospi_23_64 = 7005; +static const tran_high_t cospi_24_64 = 6270; +static const tran_high_t cospi_25_64 = 5520; +static const tran_high_t cospi_26_64 = 4756; +static const tran_high_t cospi_27_64 = 3981; +static const tran_high_t cospi_28_64 = 3196; +static const tran_high_t cospi_29_64 = 2404; +static const tran_high_t cospi_30_64 = 1606; +static const tran_high_t cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const tran_high_t sinpi_1_9 = 5283; +static const tran_high_t sinpi_2_9 = 9929; +static const tran_high_t sinpi_3_9 = 13377; +static const tran_high_t sinpi_4_9 = 15212; + +#endif // VPX_DSP_TXFM_COMMON_H_ diff --git a/thirdparty/libvpx/vpx_dsp/vpx_convolve.c b/thirdparty/libvpx/vpx_dsp/vpx_convolve.c new file mode 100644 index 000000000000..2d1c927cbea7 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/vpx_convolve.c @@ -0,0 +1,612 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <string.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_convolve.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO(dst[x] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint8_t temp[135 * 64]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + x_filters, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + y_filters, y0_q4, y_step_q4, w, h); +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. + return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h); +} + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h); +} + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + 
const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve(src, src_stride, dst, dst_stride, + filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_c(src, src_stride, temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); +} + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int r; + + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) + dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + + src += src_stride; + dst += dst_stride; + } +} + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + 
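// Note (editorial, hedged): the vpx_scaled_* wrappers, this one included, + // simply forward to the generic convolve8 C paths, which also handle + // x/y_step_q4 != 16. A step of 16 (1/16th-pel units) advances one source + // pixel per output pixel (no scaling), while the maximum step of 32 advances + // two source pixels per output pixel, i.e. the x1/2 downscale bound asserted + // in convolve(). + 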
vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, + int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, + int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO(dst[x] + + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel_highbd( + ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const 
InterpKernel *const y_filters, + int y0_q4, int y_step_q4, + int w, int h, int bd) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint16_t temp[64 * 135]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, CONVERT_TO_BYTEPTR(temp), 64, + x_filters, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1), + 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, + w, h, bd); +} + + +void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int 
y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + highbd_convolve(src, src_stride, dst, dst_stride, + filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, + NULL, 0, NULL, 0, w, h, bd); +} + +void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int r; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w * sizeof(uint16_t)); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + } + src += src_stride; + dst += dst_stride; + } +} +#endif diff --git a/thirdparty/libvpx/vpx_dsp/vpx_convolve.h b/thirdparty/libvpx/vpx_dsp/vpx_convolve.h new file mode 100644 index 000000000000..9ed3f1750f2d --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/vpx_convolve.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#ifndef VPX_DSP_VPX_CONVOLVE_H_ +#define VPX_DSP_VPX_CONVOLVE_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_VPX_CONVOLVE_H_ diff --git a/thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h b/thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h new file mode 100644 index 000000000000..a1d0a51ef565 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/vpx_dsp_common.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_VPX_DSP_COMMON_H_ +#define VPX_DSP_VPX_DSP_COMMON_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y)) +#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y)) + +#if CONFIG_VP9_HIGHBITDEPTH +// Note: +// tran_low_t is the datatype used for final transform coefficients. +// tran_high_t is the datatype used for intermediate transform stages. +typedef int64_t tran_high_t; +typedef int32_t tran_low_t; +#else +// Note: +// tran_low_t is the datatype used for final transform coefficients. +// tran_high_t is the datatype used for intermediate transform stages. +typedef int32_t tran_high_t; +typedef int16_t tran_low_t; +#endif // CONFIG_VP9_HIGHBITDEPTH + +static INLINE uint8_t clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} + +static INLINE int clamp(int value, int low, int high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE double fclamp(double value, double low, double high) { + return value < low ? low : (value > high ? high : value); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE uint16_t clip_pixel_highbd(int val, int bd) { + switch (bd) { + case 8: + default: + return (uint16_t)clamp(val, 0, 255); + case 10: + return (uint16_t)clamp(val, 0, 1023); + case 12: + return (uint16_t)clamp(val, 0, 4095); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_VPX_DSP_COMMON_H_ diff --git a/thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c new file mode 100644 index 000000000000..5fe27b614bdc --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/vpx_dsp_rtcd.c @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/vpx_once.h" + +void vpx_dsp_rtcd() { + once(setup_rtcd_internal); +} diff --git a/thirdparty/libvpx/vpx_dsp/vpx_filter.h b/thirdparty/libvpx/vpx_dsp/vpx_filter.h new file mode 100644 index 000000000000..2617febf3b3d --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/vpx_filter.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_VPX_FILTER_H_ +#define VPX_DSP_VPX_FILTER_H_ + +#include "vpx/vpx_integer.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 + +#define SUBPEL_BITS 4 +#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) +#define SUBPEL_SHIFTS (1 << SUBPEL_BITS) +#define SUBPEL_TAPS 8 + +typedef int16_t InterpKernel[SUBPEL_TAPS]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_VPX_FILTER_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/convolve.h b/thirdparty/libvpx/vpx_dsp/x86/convolve.h new file mode 100644 index 000000000000..7e43eb7c7240 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/convolve.h @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#ifndef VPX_DSP_X86_CONVOLVE_H_ +#define VPX_DSP_X86_CONVOLVE_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +typedef void filter8_1dfunction ( + const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter +); + +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + assert(filter[3] != 128); \ + assert(step_q4 == 16); \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + } \ + } else { \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + } \ + } \ +} + +#define FUN_CONV_2D(avg, opt) \ +void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + assert(filter_x[3] != 128); \ + assert(filter_y[3] != 128); \ + assert(w <= 64); \ + assert(h <= 64); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if (filter_x[0] | filter_x[1] | filter_x[2]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ +} + +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void highbd_filter8_1dfunction ( + const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, + int bd +); + +#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ + ptrdiff_t src_stride, \ + uint8_t *dst8, \ + ptrdiff_t dst_stride, \ + const int16_t *filter_x, \ + int x_step_q4, \ + const int16_t *filter_y, \ + int y_step_q4, \ + int w, int h, int bd) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (filter[0] | 
filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter, \ + bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ +} + +#define HIGH_FUN_CONV_2D(avg, opt) \ +void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h, int bd) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ + 64, dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), 64, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ + dst, dst_stride, \ + filter_x, x_step_q4, \ + filter_y, y_step_q4, \ + w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h, bd); \ + } \ +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_DSP_X86_CONVOLVE_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm new file mode 100644 index 000000000000..cd6a6ae982c7 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm @@ -0,0 +1,860 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pb_1: times 16 db 1 +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +dc_128: times 16 db 128 +pw2_4: times 8 dw 2 +pw2_8: times 8 dw 4 +pw2_16: times 8 dw 8 +pw2_32: times 8 dw 16 + +SECTION .text + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM sse2 +cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + DEFINE_ARGS dst, stride, temp + psrldq m1, m0, 1 + psrldq m2, m0, 2 + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + + ; store 4 lines + movd [dstq ], m3 + psrlq m3, 8 + movd [dstq+strideq ], m3 + lea dstq, [dstq+strideq*2] + psrlq m3, 8 + movd [dstq ], m3 + psrlq m3, 8 + movd [dstq+strideq ], m3 + psrlq m0, 56 + movd tempq, m0 + mov [dstq+strideq+3], tempb + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset + GET_GOT goffsetq + + movu m1, [aboveq] + pslldq m0, m1, 1 + psrldq m2, m1, 1 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + punpckhbw m0, m0 ; 7 7 + punpcklwd m0, m0 ; 7 7 7 7 + punpckldq m0, m0 ; 7 7 7 7 7 7 7 7 + punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7 + + ; store 4 lines + psrldq m3, 1 + movq [dstq ], m3 + psrldq m3, 1 + movq [dstq+strideq ], m3 + psrldq m3, 1 + movq [dstq+strideq*2], m3 + psrldq m3, 1 + movq [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + + ; store next 4 lines + psrldq m3, 1 + movq [dstq ], m3 + psrldq m3, 1 + movq [dstq+strideq ], m3 + psrldq m3, 1 + movq [dstq+strideq*2], m3 + psrldq m3, 1 + movq [dstq+stride3q ], m3 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset + GET_GOT goffsetq + + movd m0, [leftq] ; abcd [byte] + punpcklbw m4, m0, m0 ; aabb ccdd + punpcklwd m4, m4 ; aaaa bbbb cccc dddd + psrldq m4, 12 ; dddd + punpckldq m0, m4 ; abcd dddd + psrldq m1, m0, 1 ; bcdd + psrldq m2, m0, 2 ; cddd + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d + pavgb m1, m0 ; ab, bc, cd, d [byte] + + punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d + movd [dstq ], m1 + psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d + movd [dstq+strideq], m1 + + lea dstq, [dstq+strideq*2] + psrlq m1, 16 ; cd, c3d, d, d + movd [dstq ], m1 + movd [dstq+strideq], m4 ; d, d, d, d + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + movd m2, [leftq] + movd m0, [aboveq] + pxor m1, m1 + punpckldq m0, m2 + psadbw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd 
[dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [GLOBAL(pw_8)] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movd m0, [GLOBAL(dc_128)] + movd [dstq ], m0 + movd [dstq+strideq ], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq m0, [GLOBAL(dc_128)] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_16)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + 
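; Illustrative arithmetic (not upstream) for the DC value prepared above this + ; loop: psadbw sums the 16 above and 16 left pixels, pw_16 adds the rounding + ; bias, and psraw by 5 divides by 32. E.g. all above = 100 and all left = 121 + ; gives (1600 + 1936 + 16) >> 5 = 111, rounding the exact mean 110.5 up. + 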
jnz .loop + + RESTORE_GOT + REP_RET + + +INIT_XMM sse2 +cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_32)] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + mova m2, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, 
[GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left + movifnidn leftq, leftmp + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 + pshufd m1, m0, 0x1 + movd [dstq ], m0 + movd [dstq+strideq], m1 + pshufd m2, m0, 0x2 + lea dstq, [dstq+strideq*2] + pshufd m3, m0, 0x3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -2 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] + movq m0, [leftq ] + punpcklbw m0, m0 ; l1 l1 l2 l2 ... 
l8 l8 +.loop: + pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 + pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 + movq [dstq ], m1 + movq [dstq+strideq], m2 + pshuflw m1, m0, 0xaa + pshuflw m2, m0, 0xff + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 + inc lineq + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -4 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+strideq ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -8 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+16 ], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16 ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2 ], m1 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left + pxor m1, m1 + movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x + punpcklbw m0, m1 + pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word] + psrldq m0, 2 + psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word] + movd m2, [leftq] + punpcklbw m2, m1 + pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] + pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] + paddw m4, m0 + paddw m3, m0 + packuswb m4, m4 + packuswb m3, m3 + movd [dstq ], m4 + movd [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + pshuflw m4, m2, 0xaa + pshuflw m3, m2, 0xff + paddw m4, m0 + paddw m3, m0 + packuswb m4, m4 + packuswb m3, m3 + movd [dstq ], m4 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + movq m0, [aboveq] + punpcklbw m2, m1 + punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] + pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] + DEFINE_ARGS dst, stride, line, left + mov lineq, -4 + punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] + psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] + movq m2, [leftq] + punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] +.loop + pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] + pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] + punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] + punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] + paddw m4, m0 + paddw m3, m0 + packuswb m4, m3 + movq [dstq ], m4 + movhps [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m2, 4 + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left + pxor m1, m1 + mova m2, [aboveq-16]; + mova m0, [aboveq] ; t1 t2 ... 
t16 [byte] + punpckhbw m2, m1 ; [127:112] tl [word] + punpckhbw m4, m0, m1 + punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word] + DEFINE_ARGS dst, stride, line, left, stride8 + mov lineq, -8 + pshufhw m2, m2, 0xff + mova m3, [leftq] ; l1 l2 ... l16 [byte] + punpckhqdq m2, m2 ; tl repeated 8 times [word] + psubw m0, m2 + psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word] + punpckhbw m5, m3, m1 + punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word] + lea stride8q, [strideq*8] +.loop: + pshuflw m6, m3, 0x0 + pshuflw m7, m5, 0x0 + punpcklqdq m6, m6 ; l1 repeated 8 times [word] + punpcklqdq m7, m7 ; l8 repeated 8 times [word] + paddw m1, m6, m0 + paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word] + psrldq m5, 2 + packuswb m1, m6 + mova [dstq ], m1 + paddw m1, m7, m0 + paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word] + psrldq m3, 2 + packuswb m1, m7 + mova [dstq+stride8q], m1 + inc lineq + lea dstq, [dstq+strideq] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + mova m0, [aboveq] + mova m4, [aboveq+16] + punpcklbw m2, m1 + punpckhbw m3, m0, m1 + punpckhbw m5, m4, m1 + punpcklbw m0, m1 + punpcklbw m4, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -16 + punpcklqdq m2, m2 + add leftq, 32 + psubw m0, m2 + psubw m3, m2 + psubw m4, m2 + psubw m5, m2 +.loop: + movd m2, [leftq+lineq*2] + pxor m1, m1 + punpcklbw m2, m1 + pshuflw m7, m2, 0x55 + pshuflw m2, m2, 0x0 + punpcklqdq m2, m2 + punpcklqdq m7, m7 + paddw m6, m2, m3 + paddw m1, m2, m0 + packuswb m1, m6 + mova [dstq ], m1 + paddw m6, m2, m5 + paddw m1, m2, m4 + packuswb m1, m6 + mova [dstq+16 ], m1 + paddw m6, m7, m3 + paddw m1, m7, m0 + packuswb m1, m6 + mova [dstq+strideq ], m1 + paddw m6, m7, m5 + paddw m1, m7, m4 + packuswb m1, m6 + mova [dstq+strideq+16], m1 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm new file mode 100644 index 000000000000..5e0139fa8d3f --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm @@ -0,0 +1,871 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pb_1: times 16 db 1 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 +sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 +sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +SECTION .text + +INIT_XMM ssse3 +cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, dst8, line + lea stride3q, [strideq*3] + lea dst8q, [dstq+strideq*8] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + pavgb m3, m2, m0 + pxor m2, m0 + pshufb m0, m1 + pand m2, [GLOBAL(pb_1)] + psubb m3, m2 + pavgb m0, m3 + + ; first 4 lines and first half of 3rd 4 lines + mov lined, 2 +.loop: + mova [dstq ], m0 + movhps [dst8q ], m0 + pshufb m0, m1 + mova [dstq +strideq ], m0 + movhps [dst8q+strideq ], m0 + pshufb m0, m1 + mova [dstq +strideq*2 ], m0 + movhps [dst8q+strideq*2 ], m0 + pshufb m0, m1 + mova [dstq +stride3q ], m0 + movhps [dst8q+stride3q ], m0 + pshufb m0, m1 + lea dstq, [dstq +strideq*4] + lea dst8q, [dst8q+strideq*4] + dec lined + jnz .loop + + ; bottom-right 8x8 block + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + lea dstq, [dstq+strideq*4] + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m4, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, dst16, line + lea stride3q, [strideq*3] + lea dst16q, [dstq +strideq*8] + lea dst16q, [dst16q+strideq*8] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)] + pavgb m3, m2, m4 + pxor m2, m4 + palignr m5, m4, m0, 1 + palignr m6, m4, m0, 2 + pshufb m4, m1 + pand m2, [GLOBAL(pb_1)] + psubb m3, m2 + pavgb m4, m3 + pavgb m3, m0, m6 + pxor m0, m6 + pand m0, [GLOBAL(pb_1)] + psubb m3, m0 + pavgb m5, m3 + + ; write 4x4 lines (and the first half of the second 4x4 lines) + mov lined, 4 +.loop: + mova [dstq ], m5 + mova [dstq +16], m4 + mova [dst16q ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +strideq ], m3 + mova [dstq +strideq +16], m4 + mova [dst16q+strideq ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + mova [dstq +strideq*2 ], m5 + mova [dstq +strideq*2+16], m4 + mova [dst16q+strideq*2 ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +stride3q ], m3 + mova [dstq +stride3q +16], m4 + mova [dst16q+stride3q ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + lea dstq, [dstq +strideq*4] + lea dst16q, 
[dst16q+strideq*4] + dec lined + jnz .loop + + ; write second half of second 4x4 lines + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + + RESTORE_GOT + RET + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM ssse3 +cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + pshufb m1, m3, [GLOBAL(sh_b23456777)] + pshufb m2, m3, [GLOBAL(sh_b12345677)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movd [dstq ], m3 + movd [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m3, 1 + psrldq m4, 1 + movd [dstq ], m3 + movd [dstq+strideq], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + pshufb m3, [GLOBAL(sh_b0123456777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + lea dstq, [dstq+strideq*4] + psrldq m3, 1 + psrldq m4, 1 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, line + lea stride3q, [strideq*3] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m0, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 + pavgb m0, m3 + + mov lined, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m4 + pshufb m0, m1 + pshufb m4, m1 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m4 + pshufb m0, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m7, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, line + mova m1, [GLOBAL(sh_b123456789abcdeff)] + lea stride3q, [strideq*3] + pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m7, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 + palignr m6, m7, m0, 1 + palignr m5, m7, m0, 2 + pavgb m7, m3 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 + pavgb m0, m6 + + mov lined, 8 +.loop: + mova [dstq ], m0 + mova [dstq 
+16], m7 + mova [dstq+strideq ], m2 + mova [dstq+strideq +16], m4 + palignr m3, m7, m0, 1 + palignr m5, m4, m2, 1 + pshufb m7, m1 + pshufb m4, m1 + + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m5 + mova [dstq+stride3q +16], m4 + palignr m0, m7, m3, 1 + palignr m2, m4, m5, 1 + pshufb m7, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + movd m0, [leftq] ; l1, l2, l3, l4 + movd m1, [aboveq-1] ; tl, t1, t2, t3 + punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 + pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 + psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 + psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 + ; comments below are for a predictor like this + ; A1 B1 C1 D1 + ; A2 B2 A1 B1 + ; A3 B3 A2 B2 + ; A4 B4 A3 B3 + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 + pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 + + punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+stride3q ], m3 + psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq*2], m3 + psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq ], m3 + psrldq m3, 2 ; A1 B1 C1 D1 .. + movd [dstq ], m3 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + movq m0, [leftq] ; [0- 7] l1-8 [byte] + movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] + pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] + pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] + pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] + pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] + psrldq m4, m0, 1 ; t1-7 [word] + psrldq m5, m0, 2 ; t2-7 [word] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 + ; A2 B2 A1 B1 C1 D1 E1 F1 + ; A3 B3 A2 B2 A1 B1 C1 D1 + ; A4 B4 A3 B3 A2 B2 A1 B1 + ; A5 B5 A4 B4 A3 B3 A2 B2 + ; A6 B6 A5 B5 A4 B4 A3 B3 + ; A7 B7 A6 B6 A5 B5 A4 B4 + ; A8 B8 A7 B7 A6 B6 A5 B5 + pavgb m6, m1, m2 ; 2-tap avg A8-A1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 + + punpcklbw m6, m0 ; A-B8, A-B7 ... 
A-B2, A-B1 + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 + palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 + movq [dstq+strideq*2], m0 + psrldq m0, 2 ; A-B2, A-B1, C-H1 + movq [dstq+strideq ], m0 + psrldq m0, 2 ; A-H1 + movq [dstq ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 + psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 + movq [dstq+strideq*2], m6 + psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 + movq [dstq+strideq ], m6 + psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 + movq [dstq ], m6 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 + ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 + ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 + ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 + ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 + ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 + ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 + ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 + ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 + ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 + ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 + ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 + ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 + ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 + ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 + ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 + pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m6, 15 + palignr m3, m0, m6, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] + pavgb m5, m0 ; A1 - Ag + + punpcklbw m0, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... 
A-Bg + + pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] + pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 + + pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + palignr m2, m1, m6, 14 + mova [dstq ], m2 + palignr m2, m1, m6, 12 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 10 + mova [dstq+strideq*2], m2 + palignr m2, m1, m6, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m1, m6, 6 + mova [dstq ], m2 + palignr m2, m1, m6, 4 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 2 + mova [dstq+strideq*2], m2 + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + mova [dstq+stride3q ], m6 + lea dstq, [dstq+strideq*4] + + palignr m2, m6, m4, 14 + mova [dstq ], m2 + palignr m2, m6, m4, 12 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 10 + mova [dstq+strideq*2], m2 + palignr m2, m6, m4, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m6, m4, 6 + mova [dstq ], m2 + palignr m2, m6, m4, 4 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 2 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + movu m1, [aboveq+15] + + pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] + + palignr m3, m1, m7, 1 + palignr m5, m1, m7, 2 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] + + pshufb m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m7, 15 + palignr m3, m0, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pavgb m5, m0 ; A1 - Ag + punpcklbw m6, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... 
A-Bg + pshufb m6, [GLOBAL(sh_bfedcba9876543210)] + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + + DEFINE_ARGS dst, stride, stride3, left, line + lea stride3q, [strideq*3] + + palignr m5, m2, m1, 14 + palignr m7, m1, m6, 14 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 12 + palignr m7, m1, m6, 12 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 10 + palignr m7, m1, m6, 10 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + palignr m5, m2, m1, 8 + palignr m7, m1, m6, 8 + mova [dstq+stride3q ], m7 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m2, m1, 6 + palignr m7, m1, m6, 6 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 4 + palignr m7, m1, m6, 4 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 2 + palignr m7, m1, m6, 2 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m6 + mova [dstq+stride3q+16 ], m1 + lea dstq, [dstq+strideq*4] + + palignr m5, m1, m6, 14 + palignr m3, m6, m4, 14 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 12 + palignr m3, m6, m4, 12 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 10 + palignr m3, m6, m4, 10 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + palignr m5, m1, m6, 8 + palignr m3, m6, m4, 8 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m1, m6, 6 + palignr m3, m6, m4, 6 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 4 + palignr m3, m6, m4, 4 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 2 + palignr m3, m6, m4, 2 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m4 + mova [dstq+stride3q+16 ], m6 + lea dstq, [dstq+strideq*4] + + mova m7, [leftq] + mova m3, [leftq+16] + palignr m5, m3, m7, 15 + palignr m0, m3, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - + pavgb m5, m3 ; Ah - + punpcklbw m3, m2, m5 ; A-B8 ... A-B1 + punpckhbw m2, m5 ; A-B9 ... 
A-Bg + pshufb m3, [GLOBAL(sh_bfedcba9876543210)] + pshufb m2, [GLOBAL(sh_bfedcba9876543210)] + + palignr m7, m6, m4, 14 + palignr m0, m4, m3, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 12 + palignr m0, m4, m3, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 10 + palignr m0, m4, m3, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m6, m4, 8 + palignr m0, m4, m3, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m6, m4, 6 + palignr m0, m4, m3, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 4 + palignr m0, m4, m3, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 2 + palignr m0, m4, m3, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m4 + lea dstq, [dstq+strideq*4] + + palignr m7, m4, m3, 14 + palignr m0, m3, m2, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 12 + palignr m0, m3, m2, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 10 + palignr m0, m3, m2, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m4, m3, 8 + palignr m0, m3, m2, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m4, m3, 6 + palignr m0, m3, m2, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 4 + palignr m0, m3, m2, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 2 + palignr m0, m3, m2, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m3 + + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset + GET_GOT goffsetq + movq m3, [leftq] ; abcdefgh [byte] + lea stride3q, [strideq*3] + + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 + pavgb m0, m2 + punpcklbw m0, m3 ; interleaved output + + movq [dstq ], m0 + psrldq m0, 2 + movq [dstq+strideq ], m0 + psrldq m0, 2 + movq [dstq+strideq*2], m0 + psrldq m0, 2 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh + psrldq m0, 2 + movq [dstq ], m0 + psrldq m0, 2 + movq [dstq+strideq ], m0 + psrldq m0, 2 + movq [dstq+strideq*2], m0 + psrldq m0, 2 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset + GET_GOT goffsetq + lea stride3q, [strideq*3] + mova m0, [leftq] ; abcdefghijklmnop [byte] + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + pavgb m1, m0 ; ab, bc, cd .. 
no, op, pp [byte] + + punpckhbw m4, m1, m3 ; interleaved input + punpcklbw m1, m3 ; interleaved output + mova [dstq ], m1 + palignr m3, m4, m1, 2 + mova [dstq+strideq ], m3 + palignr m3, m4, m1, 4 + mova [dstq+strideq*2], m3 + palignr m3, m4, m1, 6 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + palignr m3, m4, m1, 8 + mova [dstq ], m3 + palignr m3, m4, m1, 10 + mova [dstq+strideq ], m3 + palignr m3, m4, m1, 12 + mova [dstq+strideq*2], m3 + palignr m3, m4, m1, 14 + mova [dstq+stride3q ], m3 + DEFINE_ARGS dst, stride, stride3, line + mov lined, 2 + mova m0, [GLOBAL(sh_b23456789abcdefff)] +.loop: + lea dstq, [dstq+strideq*4] + mova [dstq ], m4 + pshufb m4, m0 + mova [dstq+strideq ], m4 + pshufb m4, m0 + mova [dstq+strideq*2], m4 + pshufb m4, m0 + mova [dstq+stride3q ], m4 + pshufb m4, m0 + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset + GET_GOT goffsetq + lea stride3q, [strideq*3] + mova m1, [leftq] ; 0-15 [byte] + mova m2, [leftq+16] ; 16-31 [byte] + pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] + pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 + palignr m6, m2, m1, 1 + palignr m5, m2, m1, 2 + pavgb m2, m4 ; high 16px even lines + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 + pavgb m1, m6 ; low 16px even lines + + punpckhbw m6, m1, m0 ; interleaved output 2 + punpcklbw m1, m0 ; interleaved output 1 + + punpckhbw m7, m2, m3 ; interleaved output 4 + punpcklbw m2, m3 ; interleaved output 3 + + ; output 1st 8 lines (and half of 2nd 8 lines) + DEFINE_ARGS dst, stride, stride3, dst8 + lea dst8q, [dstq+strideq*8] + mova [dstq ], m1 + mova [dstq +16], m6 + mova [dst8q ], m6 + palignr m0, m6, m1, 2 + palignr m4, m2, m6, 2 + mova [dstq +strideq ], m0 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m0, m6, m1, 4 + palignr m4, m2, m6, 4 + mova [dstq +strideq*2 ], m0 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m0, m6, m1, 6 + palignr m4, m2, m6, 6 + mova [dstq +stride3q ], m0 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq +strideq*4] + lea dst8q, [dst8q+strideq*4] + palignr m0, m6, m1, 8 + palignr m4, m2, m6, 8 + mova [dstq ], m0 + mova [dstq +16], m4 + mova [dst8q ], m4 + palignr m0, m6, m1, 10 + palignr m4, m2, m6, 10 + mova [dstq +strideq ], m0 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m0, m6, m1, 12 + palignr m4, m2, m6, 12 + mova [dstq +strideq*2 ], m0 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m0, m6, m1, 14 + palignr m4, m2, m6, 14 + mova [dstq +stride3q ], m0 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + + ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines + mova [dstq +16], m2 + mova [dst8q ], m2 + palignr m4, m7, m2, 2 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m4, m7, m2, 4 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m4, m7, m2, 6 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + palignr m4, m7, m2, 8 + mova [dstq +16], m4 + mova [dst8q ], m4 + palignr m4, m7, m2, 10 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m4, m7, m2, 12 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m4, m7, m2, 14 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea 
dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + + ; output 2nd half of 3rd 8 lines and half of 4th 8 lines + mova m0, [GLOBAL(sh_b23456789abcdefff)] + mova [dstq +16], m7 + mova [dst8q ], m7 + pshufb m7, m0 + mova [dstq +strideq +16], m7 + mova [dst8q+strideq ], m7 + pshufb m7, m0 + mova [dstq +strideq*2+16], m7 + mova [dst8q+strideq*2 ], m7 + pshufb m7, m0 + mova [dstq +stride3q +16], m7 + mova [dst8q+stride3q ], m7 + pshufb m7, m0 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + mova [dstq +16], m7 + mova [dst8q ], m7 + pshufb m7, m0 + mova [dstq +strideq +16], m7 + mova [dst8q+strideq ], m7 + pshufb m7, m0 + mova [dstq +strideq*2+16], m7 + mova [dst8q+strideq*2 ], m7 + pshufb m7, m0 + mova [dstq +stride3q +16], m7 + mova [dst8q+stride3q ], m7 + pshufb m7, m0 + lea dstq, [dstq+strideq*4] + + ; output last half of 4th 8 lines + mova [dstq +16], m7 + mova [dstq +strideq +16], m7 + mova [dstq +strideq*2+16], m7 + mova [dstq +stride3q +16], m7 + lea dstq, [dstq+strideq*4] + mova [dstq +16], m7 + mova [dstq +strideq +16], m7 + mova [dstq +strideq*2+16], m7 + mova [dstq +stride3q +16], m7 + + ; done! + RESTORE_GOT + RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c new file mode 100644 index 000000000000..df5068c624b4 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -0,0 +1,4046 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +#define RECON_AND_STORE4X4(dest, in_x) \ +{ \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)(dest) = _mm_cvtsi128_si32(d0); \ +} + +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + const __m128i cst = _mm_setr_epi16( + (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i input0, input1, input2, input3; + + // Rows + input0 = load_input_data(input); + input2 = load_input_data(input + 8); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input0, 0xd8); + input0 = _mm_shufflehi_epi16(input0, 0xd8); + input2 = _mm_shufflelo_epi16(input2, 0xd8); + input2 = _mm_shufflehi_epi16(input2, 0xd8); + + input1 = _mm_unpackhi_epi32(input0, input0); + input0 = _mm_unpacklo_epi32(input0, input0); + input3 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpacklo_epi32(input2, input2); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input1); + input1 = _mm_packs_epi32(input2, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. 
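The _mm_shuffle_epi32(input1, 0x4e) that follows swaps the two 64-bit halves of input1, lining the columns up for the add/sub butterfly described by the comment above. The reconstruction step itself is the RECON_AND_STORE4X4 macro defined at the top of the file (used by the DC-only path below); in scalar terms it is a saturating add of the residual onto the predictor. A minimal sketch, assuming <stdint.h> types (helper name illustrative):

    /* Scalar equivalent of RECON_AND_STORE4X4 for one 4-pixel row: widen the
     * four predictor bytes, add the inverse-transformed residual, and clamp
     * to [0, 255] exactly as _mm_packus_epi16 does. */
    static void recon_and_store4_scalar(uint8_t *dest, const int16_t *residual) {
      int c;
      for (c = 0; c < 4; ++c) {
        const int v = dest[c] + residual[c];
        dest[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
    }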
+ input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Columns + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_unpacklo_epi32(input2, input2); + input1 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpackhi_epi32(input3, input3); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input2); + input1 = _mm_packs_epi32(input1, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. + input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Final round and shift + input2 = _mm_add_epi16(input2, eight); + input3 = _mm_add_epi16(input3, eight); + + input2 = _mm_srai_epi16(input2, 4); + input3 = _mm_srai_epi16(input3, 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d2 = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, input2); + d2 = _mm_add_epi16(d2, input3); + d0 = _mm_packus_epi16(d0, d2); + // store input0 + *(int *)dest = _mm_cvtsi128_si32(d0); + // store input1 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store input2 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + // store input3 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + } +} + +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE4X4(dest + 0 * stride, dc_value); + RECON_AND_STORE4X4(dest + 1 * stride, dc_value); + RECON_AND_STORE4X4(dest + 2 * stride, dc_value); + RECON_AND_STORE4X4(dest + 3 * stride, dc_value); +} + +static INLINE void transpose_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + +void idct4_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + 
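These k__cospi_* tables are built with pair_set_epi16(a, b) from vpx_dsp/x86/txfm_common_sse2.h (included above), which repeats the 16-bit pair (a, b) across the register; against punpcklwd/punpckhwd-interleaved data, _mm_madd_epi16 then computes a*x + b*y in every 32-bit lane. The vendored helper expands to essentially:

    /* pair_set_epi16 as defined in txfm_common_sse2.h (formatting aside):
     * lanes read (a, b) from low to high, matching the word order that
     * _mm_unpacklo_epi16/_mm_unpackhi_epi16 produce. */
    #define pair_set_epi16(a, b)                                            \
      _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                    (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))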
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + + transpose_4x4(in); + // stage 1 + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[3], v[2]); + + // stage 2 + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); +} + +void iadst4_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); + const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8], in7; + + transpose_4x4(in); + in7 = _mm_srli_si128(in[1], 8); + in7 = _mm_add_epi16(in7, in[0]); + in7 = _mm_sub_epi16(in7, in[1]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpackhi_epi16(in[0], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 + v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(v[3], v[4]); + u[2] = v[2]; + u[3] = _mm_add_epi32(u[0], u[1]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_add_epi32(u[3], v[5]); + u[6] = _mm_sub_epi32(u[5], u[4]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); +} + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, 
in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + } + +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ + out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + } + +#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + } + +// Define Macro for multiplying elements by constants and adding them together. 
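In scalar terms, each output of the macro defined next is eight lanes of one rounded rotation: a lane drawn from the interleaved pair (x, y) becomes dct_const_round_shift(x*ca + y*cb), with (ca, cb) taken from the corresponding cst argument. A sketch of a single lane, assuming <stdint.h> and the constants from the vendored txfm_common.h:

    #define DCT_CONST_BITS 14                   /* as in vpx_dsp/txfm_common.h */
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* One 16-bit output lane of MULTIPLICATION_AND_ADD, where cst = (ca, cb). */
    static int16_t mult_add_lane(int16_t x, int16_t y, int16_t ca, int16_t cb) {
      const int32_t sum = (int32_t)x * ca + (int32_t)y * cb;
      return (int16_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }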
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ + cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + } + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_4, \ + stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_0, \ + stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, 
stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + } + +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. 
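One orientation note before the loads: the IDCT8 macro above is a single 1-D pass, and the loop below runs it twice with an 8x8 transpose in between, which is the standard row/column decomposition of the separable 2-D inverse transform. A scalar rendering of one pass (a sketch: plain int arithmetic where the SSE2 code uses saturating adds/subs; assumes <stdint.h>, constant values as tabulated in the vendored txfm_common.h):

    /* Q14 trig constants, per vpx_dsp/txfm_common.h. */
    enum { cospi_4_64 = 16069, cospi_8_64 = 15137, cospi_12_64 = 13623,
           cospi_16_64 = 11585, cospi_20_64 = 9102, cospi_24_64 = 6270,
           cospi_28_64 = 3196 };

    /* dct_const_round_shift() in scalar form (DCT_CONST_BITS == 14). */
    static int round_shift(int x) { return (x + (1 << 13)) >> 14; }

    static void idct8_1d_scalar(const int16_t *in, int16_t *out) {
      /* Stage 1: odd inputs. */
      const int s4 = round_shift(in[1] * cospi_28_64 - in[7] * cospi_4_64);
      const int s7 = round_shift(in[1] * cospi_4_64 + in[7] * cospi_28_64);
      const int s5 = round_shift(in[5] * cospi_12_64 - in[3] * cospi_20_64);
      const int s6 = round_shift(in[5] * cospi_20_64 + in[3] * cospi_12_64);
      /* Stage 2: even inputs form a 4-point IDCT; odd half adds/subs. */
      const int t0 = round_shift((in[0] + in[4]) * cospi_16_64);
      const int t1 = round_shift((in[0] - in[4]) * cospi_16_64);
      const int t2 = round_shift(in[2] * cospi_24_64 - in[6] * cospi_8_64);
      const int t3 = round_shift(in[2] * cospi_8_64 + in[6] * cospi_24_64);
      const int t4 = s4 + s5, t5 = s4 - s5, t6 = s7 - s6, t7 = s7 + s6;
      /* Stage 3: even butterfly; rotate (t5, t6) by cospi_16_64. */
      const int e0 = t0 + t3, e1 = t1 + t2, e2 = t1 - t2, e3 = t0 - t3;
      const int o5 = round_shift((t6 - t5) * cospi_16_64);
      const int o6 = round_shift((t6 + t5) * cospi_16_64);
      /* Stage 4: final butterflies, mirroring out0..out7 of the macro. */
      out[0] = (int16_t)(e0 + t7);  out[7] = (int16_t)(e0 - t7);
      out[1] = (int16_t)(e1 + o6);  out[6] = (int16_t)(e1 - o6);
      out[2] = (int16_t)(e2 + o5);  out[5] = (int16_t)(e2 - o5);
      out[3] = (int16_t)(e3 + t4);  out[4] = (int16_t)(e3 - t4);
    }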
+ in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + in4 = load_input_data(input + 8 * 4); + in5 = load_input_data(input + 8 * 5); + in6 = load_input_data(input + 8 * 6); + in7 = load_input_data(input + 8 * 7); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from vpx_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 5); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE(dest + 0 * stride, dc_value); + RECON_AND_STORE(dest + 1 * stride, dc_value); + RECON_AND_STORE(dest + 2 * stride, dc_value); + RECON_AND_STORE(dest + 3 * stride, dc_value); + RECON_AND_STORE(dest + 4 * stride, dc_value); + RECON_AND_STORE(dest + 5 * stride, dc_value); + RECON_AND_STORE(dest + 6 * stride, dc_value); + RECON_AND_STORE(dest + 7 * stride, dc_value); +} + +void idct8_sse2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // 8x8 Transpose is copied from vpx_fdct8x8_sse2() + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], + in0, in1, in2, in3, in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); +} + +void 
iadst8_sse2(__m128i *in) { + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // transpose + array_transpose_8x8(in, in); + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 
= _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = _mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit integers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 
= _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); +} + +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // Rows. Load 4-row input data. 
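Only four input rows are loaded here: this variant is entered when the block's end-of-block position guarantees at most 12 nonzero coefficients, all of which lie in the top-left 4x4 of the 8x8 grid, so rows 4-7 are known to be zero (the unpacks against zero fold that in). For context, the caller-side selection, roughly as vp9's 8x8 inverse-transform wrapper does it (a sketch using the entry points from this file):

    /* Illustrative dispatch on eob; mirrors vp9's idct8x8 wrapper logic. */
    static void idct8x8_add_dispatch(const tran_low_t *input, uint8_t *dest,
                                     int stride, int eob) {
      if (eob == 1)                /* DC coefficient only */
        vpx_idct8x8_1_add_sse2(input, dest, stride);
      else if (eob <= 12)          /* nonzeros confined to the top-left 4x4 */
        vpx_idct8x8_12_add_sse2(input, dest, stride);
      else
        vpx_idct8x8_64_add_sse2(input, dest, stride);
    }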
+ in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + + // 8x4 Transpose + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + // Stage1 + { + const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + stp1_5 = _mm_packs_epi32(tmp4, tmp6); + } + + // Stage2 + { + const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_04, stg2_0); + tmp2 = _mm_madd_epi16(lo_04, stg2_1); + tmp4 = _mm_madd_epi16(lo_26, stg2_2); + tmp6 = _mm_madd_epi16(lo_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, tmp2); + stp2_2 = _mm_packs_epi32(tmp6, tmp4); + + tmp0 = _mm_adds_epi16(stp1_4, stp1_5); + tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage3 + { + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); + + tmp4 = _mm_adds_epi16(stp2_0, stp2_2); + tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); + stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); + + tmp0 = _mm_madd_epi16(lo_56, stg3_0); + tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + tmp0 = _mm_adds_epi16(stp1_3, stp2_4); + tmp1 = _mm_adds_epi16(stp1_2, stp1_5); + tmp2 = _mm_subs_epi16(stp1_3, stp2_4); + tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, + in0, in1, in2, in3, in4, in5, in6, in7); + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); 
+ RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +#define IDCT16 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ + stg2_0, stg2_1, stg2_2, stg2_3, \ + stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ + stg2_4, stg2_5, stg2_6, stg2_7, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ + stg3_0, stg3_1, stg3_2, stg3_3, \ + stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ + stg4_0, stg4_1, stg4_2, stg4_3, \ + stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + 
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ + stg2_0, stg2_1, stg2_6, stg2_7, \ + stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ + stg3_0, stg3_1, \ + stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ + stg4_0, stg4_1, \ + stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, 
DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[16], l[16], r[16], *curr1; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + curr1 = 
l; + for (i = 0; i < 2; i++) { + // 1-D idct + + // Load input data. + in[0] = load_input_data(input); + in[8] = load_input_data(input + 8 * 1); + in[1] = load_input_data(input + 8 * 2); + in[9] = load_input_data(input + 8 * 3); + in[2] = load_input_data(input + 8 * 4); + in[10] = load_input_data(input + 8 * 5); + in[3] = load_input_data(input + 8 * 6); + in[11] = load_input_data(input + 8 * 7); + in[4] = load_input_data(input + 8 * 8); + in[12] = load_input_data(input + 8 * 9); + in[5] = load_input_data(input + 8 * 10); + in[13] = load_input_data(input + 8 * 11); + in[6] = load_input_data(input + 8 * 12); + in[14] = load_input_data(input + 8 * 13); + in[7] = load_input_data(input + 8 * 14); + in[15] = load_input_data(input + 8 * 15); + + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + + IDCT16 + + // Stage7 + curr1[0] = _mm_add_epi16(stp2_0, stp1_15); + curr1[1] = _mm_add_epi16(stp2_1, stp1_14); + curr1[2] = _mm_add_epi16(stp2_2, stp2_13); + curr1[3] = _mm_add_epi16(stp2_3, stp2_12); + curr1[4] = _mm_add_epi16(stp2_4, stp2_11); + curr1[5] = _mm_add_epi16(stp2_5, stp2_10); + curr1[6] = _mm_add_epi16(stp2_6, stp1_9); + curr1[7] = _mm_add_epi16(stp2_7, stp1_8); + curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); + curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); + curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); + curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); + curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); + curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); + curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); + curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); + + curr1 = r; + input += 128; + } + for (i = 0; i < 2; i++) { + int j; + // 1-D idct + array_transpose_8x8(l + i * 8, in); + array_transpose_8x8(r + i * 8, in + 8); + + IDCT16 + + // 2-D + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 16; ++i) { + RECON_AND_STORE(dest + 0, dc_value); + RECON_AND_STORE(dest + 8, dc_value); + dest += stride; + } +} + +static void iadst16_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + 
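/* each k__cospi constant packs two Q14 cosine factors (cospi_N_64 ~= cos(N * pi / 64) * 2^14, DCT_CONST_BITS = 14) so a single _mm_madd_epi16 yields a * c0 + b * c1 in each 32-bit lane of the ADST butterfly */ +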
const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = 
_mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = 
_mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] 
= _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], 
k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], 
s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +static void idct16_8col(__m128i *in) { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i 
k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] = in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = _mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], 
DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(t[0], t[1]); + u[1] = _mm_unpackhi_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] 
= _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = _mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_unpacklo_epi16(s[5], s[6]); + u[1] = _mm_unpackhi_epi16(s[5], s[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + t[5] = _mm_packs_epi32(u[0], u[1]); + 
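/* t[5] = (s[6] - s[5]) * cospi_16_64 and t[6] = (s[5] + s[6]) * cospi_16_64, rounded, shifted and packed back to saturated 16-bit lanes */ +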
t[6] = _mm_packs_epi32(u[2], u[3]); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = _mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +void idct16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idct16_8col(in0); + idct16_8col(in1); +} + +void iadst16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + iadst16_8col(in0); + iadst16_8col(in1); +} + +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_6 = 
pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i in[16], l[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + // First 1-D inverse DCT + // Load input data. + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 2); + in[2] = load_input_data(input + 8 * 4); + in[3] = load_input_data(input + 8 * 6); + + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); + + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_11 = _mm_packs_epi32(tmp5, tmp7); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); + + tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); + + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = 
_mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp1_0 = _mm_packs_epi32(tmp0, tmp0); + stp1_1 = _mm_packs_epi32(tmp2, tmp2); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp5, tmp7); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); + } + + // Stage5 and Stage6 + { + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); + tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_6 = _mm_packs_epi32(tmp3, tmp1); + + stp2_10 = _mm_packs_epi32(tmp0, zero); + stp2_13 = _mm_packs_epi32(tmp2, zero); + stp2_11 = _mm_packs_epi32(tmp4, zero); + stp2_12 = _mm_packs_epi32(tmp6, zero); + + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage7. Left 8x16 only. 
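+ // In this *_10 path only the top-left 4x4 coefficients are nonzero, so the first pass produces just this left 8x16 half.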
+ l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 2; i++) { + int j; + array_transpose_4X8(l + 8 * i, in); + + IDCT16_10 + + // Stage7 + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +#define LOAD_DQCOEFF(reg, input) \ + { \ + reg = load_input_data(input); \ + input += 8; \ + } \ + +#define IDCT32_34 \ +/* Stage1 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ + \ + const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ + \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ + stg1_1, stp1_16, stp1_31); \ + MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ + stg1_7, stp1_19, stp1_28); \ + MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ + stg1_9, stp1_20, stp1_27); \ + MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ + stg1_15, stp1_23, stp1_24); \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ + stg2_1, stp2_8, stp2_15); \ + MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ + stg2_7, stp2_11, stp2_12); \ + \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ + \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ + \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ + \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_4_28 = 
_mm_unpacklo_epi16(in[4], zero); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ + \ + MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ + stg3_1, stp1_4, stp1_7); \ + \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ + stg4_1, stp2_0, stp2_1); \ + \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = 
_mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + 
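/* each (a, b) pair unpacked here feeds the stg6_0/stg4_0 rotation: (b - a) * cospi_16_64 and (a + b) * cospi_16_64, with the usual rounding shift */ \ +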
const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + + +#define IDCT32 \ +/* Stage1 */ \ +{ \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ + stp1_17, stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ + \ + 
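/* remaining Stage2 inputs: in[10]/in[22] and in[26]/in[6] take the stg2_4..stg2_7 rotations */ \ +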
const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ + stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const 
__m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ + stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + 
stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, 
hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + +// Only upper-left 8x8 has non-zero coeff +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[32], col[32]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. Only need to load the top left 8x8 block. + in[0] = load_input_data(input); + in[1] = load_input_data(input + 32); + in[2] = load_input_data(input + 64); + in[3] = load_input_data(input + 96); + in[4] = load_input_data(input + 128); + in[5] = load_input_data(input + 160); + in[6] = load_input_data(input + 192); + in[7] = load_input_data(input + 224); + + for (i = 8; i < 32; ++i) { + in[i] = _mm_setzero_si128(); + } + + array_transpose_8x8(in, in); + // TODO(hkuang): Following transposes are unnecessary. But remove them will + // lead to performance drop on some devices. 
+ array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + IDCT32_34 + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[0] = _mm_add_epi16(stp1_0, stp1_31); + col[1] = _mm_add_epi16(stp1_1, stp1_30); + col[2] = _mm_add_epi16(stp1_2, stp1_29); + col[3] = _mm_add_epi16(stp1_3, stp1_28); + col[4] = _mm_add_epi16(stp1_4, stp1_27); + col[5] = _mm_add_epi16(stp1_5, stp1_26); + col[6] = _mm_add_epi16(stp1_6, stp1_25); + col[7] = _mm_add_epi16(stp1_7, stp1_24); + col[8] = _mm_add_epi16(stp1_8, stp1_23); + col[9] = _mm_add_epi16(stp1_9, stp1_22); + col[10] = _mm_add_epi16(stp1_10, stp1_21); + col[11] = _mm_add_epi16(stp1_11, stp1_20); + col[12] = _mm_add_epi16(stp1_12, stp1_19); + col[13] = _mm_add_epi16(stp1_13, stp1_18); + col[14] = _mm_add_epi16(stp1_14, stp1_17); + col[15] = _mm_add_epi16(stp1_15, stp1_16); + col[16] = _mm_sub_epi16(stp1_15, stp1_16); + col[17] = _mm_sub_epi16(stp1_14, stp1_17); + col[18] = _mm_sub_epi16(stp1_13, stp1_18); + col[19] = _mm_sub_epi16(stp1_12, stp1_19); + col[20] = _mm_sub_epi16(stp1_11, stp1_20); + col[21] = _mm_sub_epi16(stp1_10, stp1_21); + col[22] = _mm_sub_epi16(stp1_9, stp1_22); + col[23] = _mm_sub_epi16(stp1_8, stp1_23); + col[24] = _mm_sub_epi16(stp1_7, stp1_24); + col[25] = _mm_sub_epi16(stp1_6, stp1_25); + col[26] = _mm_sub_epi16(stp1_5, stp1_26); + col[27] = _mm_sub_epi16(stp1_4, stp1_27); + col[28] = _mm_sub_epi16(stp1_3, stp1_28); + col[29] = _mm_sub_epi16(stp1_2, stp1_29); + col[30] = _mm_sub_epi16(stp1_1, stp1_30); + col[31] = _mm_sub_epi16(stp1_0, stp1_31); + for (i = 0; i < 4; i++) { + int j; + const __m128i zero = _mm_setzero_si128(); + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + i * 8, in); + IDCT32_34 + + // 2_D: Calculate the results and store them to destination. 
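For reference: each MULTIPLICATION_AND_ADD step expanded by the IDCT32_34/IDCT32 macros above takes the 16-bit pairs interleaved by _mm_unpacklo_epi16/_mm_unpackhi_epi16 together with one of the stg*_* coefficient pairs built by pair_set_epi16, and rotates them in Q14 fixed point (DCT_CONST_BITS is 14, so the rounding constant matches the pd_8192 value that also appears in the assembly further below). A minimal scalar sketch of one output lane, with butterfly_lane as an illustrative name that is not part of this patch:

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* One lane of pmaddwd + paddd + psrad + packssdw: rotate the pair (x, y)
     * by the coefficient pair (c0, c1); packssdw also saturates to 16 bits. */
    static int16_t butterfly_lane(int16_t x, int16_t y, int16_t c0, int16_t c1) {
      int32_t acc = (int32_t)x * c0 + (int32_t)y * c1; /* pmaddwd */
      acc += DCT_CONST_ROUNDING;                       /* paddd */
      return (int16_t)(acc >> DCT_CONST_BITS);         /* psrad, then pack */
    }

In Stage1 of IDCT32, for example, each lane of stp1_16 is butterfly_lane(in[1], in[31], cospi_31_64, -cospi_1_64). The 32 unrolled adds and subtracts above then fold the stage-7 results symmetrically into the 1-D outputs, and the same fold reappears in the second pass that follows; a scalar sketch using wrapping 16-bit arithmetic like _mm_add_epi16/_mm_sub_epi16 (fold_outputs32 is likewise illustrative):

    static void fold_outputs32(const int16_t s[32], int16_t out[32]) {
      int k;
      for (k = 0; k < 16; ++k) {
        out[k]      = (int16_t)(s[k] + s[31 - k]); /* col[0]  = stp1_0 + stp1_31 */
        out[31 - k] = (int16_t)(s[k] - s[31 - k]); /* col[31] = stp1_0 - stp1_31 */
      }
    }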
+ in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = 
pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[32], col[128], zero_idx[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j, i32; + + for (i = 0; i < 4; i++) { + i32 = (i << 5); + // First 1-D idct + // Load input data. 
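The LOAD_DQCOEFF sequence below reads the current 8x32 slab in an interleaved register order. Assuming the macro loads eight coefficients and advances input (which its purely sequential use here implies), register in[r + 8 * c] receives row r, eight-column block c of the slab. An equivalent scalar indexing sketch, with load_8x32_slab as an illustrative name that is not part of this patch (coefficients shown as int16_t for simplicity; tran_low_t widens under CONFIG_VP9_HIGHBITDEPTH):

    #include <stdint.h>

    static void load_8x32_slab(const int16_t *input, int16_t in[32][8]) {
      int r, c, k;
      for (r = 0; r < 8; ++r)       /* 8 rows of the slab */
        for (c = 0; c < 4; ++c)     /* 4 blocks of 8 columns each */
          for (k = 0; k < 8; ++k)
            in[r + 8 * c][k] = input[32 * r + 8 * c + k];
    }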
+ LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); + + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); + + // checking if all entries are zero + zero_idx[0] = _mm_or_si128(in[0], in[1]); + zero_idx[1] = _mm_or_si128(in[2], in[3]); + zero_idx[2] = _mm_or_si128(in[4], in[5]); + zero_idx[3] = _mm_or_si128(in[6], in[7]); + zero_idx[4] = _mm_or_si128(in[8], in[9]); + zero_idx[5] = _mm_or_si128(in[10], in[11]); + zero_idx[6] = _mm_or_si128(in[12], in[13]); + zero_idx[7] = _mm_or_si128(in[14], in[15]); + zero_idx[8] = _mm_or_si128(in[16], in[17]); + zero_idx[9] = _mm_or_si128(in[18], in[19]); + zero_idx[10] = _mm_or_si128(in[20], in[21]); + zero_idx[11] = _mm_or_si128(in[22], in[23]); + zero_idx[12] = _mm_or_si128(in[24], in[25]); + zero_idx[13] = _mm_or_si128(in[26], in[27]); + zero_idx[14] = _mm_or_si128(in[28], in[29]); + zero_idx[15] = _mm_or_si128(in[30], in[31]); + + zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); + zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); + + zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); + + if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] 
= _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = _mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 31] = _mm_setzero_si128(); + continue; + } + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + IDCT32 + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } + for (i = 0; i < 4; i++) { + // Second 1-D idct + j = i << 3; + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + j, in); + array_transpose_8x8(col + j + 32, in + 8); + array_transpose_8x8(col + j + 64, in + 16); + array_transpose_8x8(col + j + 96, in + 24); + + IDCT32 + + // 2_D: Calculate the results and store them to destination. 
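The block below mirrors the first pass: the same symmetric fold of the stage-7 results into in[0..31], followed by per-lane rounding with (x + 32) >> 6, the SIMD counterpart of ROUND_POWER_OF_TWO(x, 6) in the C fallbacks, and reconstruction against the prediction. A per-pixel scalar sketch, with recon_pixel as an illustrative name that is not part of this patch (the vector path saturates the rounding add via _mm_adds_epi16 and clamps to 8 bits via the packus step inside RECON_AND_STORE):

    #include <stdint.h>

    static uint8_t recon_pixel(int16_t residual, uint8_t pred) {
      int v = ((residual + 32) >> 6) + pred;             /* round, reconstruct */
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* packus-style clamp */
    }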
+ in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, j; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (j = 0; j < 32; ++j) { + RECON_AND_STORE(dest + 0 + j * stride, dc_value); + RECON_AND_STORE(dest + 8 + j * stride, dc_value); + RECON_AND_STORE(dest + 16 + j * stride, dc_value); + RECON_AND_STORE(dest + 24 + j * stride, dc_value); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { + __m128i ubounded, retval; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + ubounded = _mm_cmpgt_epi16(value, max); + retval = _mm_andnot_si128(ubounded, value); + ubounded = _mm_and_si128(ubounded, max); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); + return retval; +} + +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + __m128i inptr[4]; + __m128i sign_bits[2]; + __m128i temp_mm, min_input, max_input; + int test; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int optimised_cols = 0; + const __m128i zero = _mm_set1_epi16(0); + const __m128i eight = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(12043); + const __m128i min = _mm_set1_epi16(-12043); + // Load input into __m128i + inptr[0] = _mm_loadu_si128((const __m128i *)input); + inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); + inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); + inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); + + // Pack to 16 bits + inptr[0] = 
_mm_packs_epi32(inptr[0], inptr[1]); + inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); + + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (!test) { + // Do the row transform + idct4_sse2(inptr); + + // Check the min & max values + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (test) { + transpose_4x4(inptr); + sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); + sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); + inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); + inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); + inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); + inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); + _mm_storeu_si128((__m128i *)outptr, inptr[0]); + _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); + _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); + _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vpx_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + } + + if (optimised_cols) { + idct4_sse2(inptr); + + // Final round and shift + inptr[0] = _mm_add_epi16(inptr[0], eight); + inptr[1] = _mm_add_epi16(inptr[1], eight); + + inptr[0] = _mm_srai_epi16(inptr[0], 4); + inptr[1] = _mm_srai_epi16(inptr[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); + __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi64( + d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); + d2 = _mm_unpacklo_epi64( + d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); + d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); + d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); + // store input0 + _mm_storel_epi64((__m128i *)dest, d0); + // store input1 + d0 = _mm_srli_si128(d0, 8); + _mm_storel_epi64((__m128i *)(dest + stride), d0); + // store input2 + _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); + // store input3 + d2 = _mm_srli_si128(d2, 8); + _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[4], temp_out[4]; + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + vpx_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } + } +} + +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 
bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_8x8(inptr, inptr); + for (i = 0; i < 8; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 8; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} + +void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // only first 4 row has non-zero coefs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], 
inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct8_sse2(inptr); + + // Find the min & max for the column transform + // N.B. Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_4X8(inptr, inptr); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} + +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + 
min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 32; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + array_transpose_16x16(inptr, inptr + 16); + for (i = 0; i < 16; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 16; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i ] = _mm_add_epi16(inptr[i ], rounding); + inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); + inptr[i ] = _mm_srai_epi16(inptr[i ], 6); + inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} + +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[32]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i rounding = _mm_set1_epi16(32); + const __m128i max = _mm_set1_epi16(3155); + const __m128i min = _mm_set1_epi16(-3155); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 16; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); + inptr[i] = 
_mm_packs_epi32(temp1, temp2); + temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); + inptr[i + 16] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // Since all non-zero dct coefficients are in upper-left 4x4 area, + // we only need to consider first 4 rows here. + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform (N.B. This transposes inptr) + idct16_sse2(inptr, inptr + 16); + + // Find the min & max for the column transform + // N.B. Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 16; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_8x8(inptr, inptr); + array_transpose_8x8(inptr + 8, inptr + 16); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); + sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); + temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); + temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + } + + if (optimised_cols) { + idct16_sse2(inptr, inptr + 16); + + // Final round & shift and Reconstruction and Store + { + __m128i d[2]; + for (i = 0; i < 16; i++) { + inptr[i ] = _mm_add_epi16(inptr[i ], rounding); + inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); + inptr[i ] = _mm_srai_epi16(inptr[i ], 6); + inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[16], temp_out[16]; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + } + } + } +} 
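All of the high-bit-depth paths above clamp reconstructed samples to the range implied by the bit depth bd, either through clamp_high_sse2() or through highbd_clip_pixel_add() in the C fallbacks. A scalar sketch, with clamp_high as an illustrative name that is not part of this patch:

    #include <stdint.h>

    /* Clamp a reconstructed sample to [0, (1 << bd) - 1],
     * e.g. [0, 1023] for 10-bit content. */
    static uint16_t clamp_high(int32_t value, int bd) {
      const int32_t max = (1 << bd) - 1;
      return (uint16_t)(value < 0 ? 0 : (value > max ? max : value));
    }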
+#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h new file mode 100644 index 000000000000..bd520c18e56f --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_ +#define VPX_DSP_X86_INV_TXFM_SSE2_H_ + +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +// perform 8x8 transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + } + +static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); +
array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +// Function to allow 8 bit optimisations to be used when profile 0 is used with +// highbitdepth enabled +static INLINE __m128i load_input_data(const tran_low_t *data) { +#if CONFIG_VP9_HIGHBITDEPTH + return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], + data[6], data[7]); +#else + return _mm_load_si128((const __m128i *)data); +#endif +} + +static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { + in[0] = load_input_data(input + 0 * 16); + in[1] = load_input_data(input + 1 * 16); + in[2] = load_input_data(input + 2 * 16); + in[3] = load_input_data(input + 3 * 16); + in[4] = load_input_data(input + 4 * 16); + in[5] = load_input_data(input + 5 * 16); + in[6] = load_input_data(input + 6 * 16); + in[7] = load_input_data(input + 7 * 16); + + in[8] = load_input_data(input + 8 * 16); + in[9] = load_input_data(input + 9 * 16); + in[10] = load_input_data(input + 10 * 16); + in[11] = load_input_data(input + 11 * 16); + in[12] = load_input_data(input + 12 * 16); + in[13] = load_input_data(input + 13 * 16); + in[14] = load_input_data(input + 14 * 16); + in[15] = load_input_data(input + 15 * 16); +} + +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ + } + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + 
RECON_AND_STORE(dest + 7 * stride, in[7]); + RECON_AND_STORE(dest + 8 * stride, in[8]); + RECON_AND_STORE(dest + 9 * stride, in[9]); + RECON_AND_STORE(dest + 10 * stride, in[10]); + RECON_AND_STORE(dest + 11 * stride, in[11]); + RECON_AND_STORE(dest + 12 * stride, in[12]); + RECON_AND_STORE(dest + 13 * stride, in[13]); + RECON_AND_STORE(dest + 14 * stride, in[14]); + RECON_AND_STORE(dest + 15 * stride, in[15]); +} + +void idct4_sse2(__m128i *in); +void idct8_sse2(__m128i *in); +void idct16_sse2(__m128i *in0, __m128i *in1); +void iadst4_sse2(__m128i *in); +void iadst8_sse2(__m128i *in); +void iadst16_sse2(__m128i *in0, __m128i *in1); + +#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm new file mode 100644 index 000000000000..20baf820f6b9 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm @@ -0,0 +1,1793 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the inverse transformation. Part +; of the functions are originally derived from the ffmpeg project. +; Note that the current version applies to x86 64-bit only. + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 + +pw_m2404x2: times 8 dw -2404*2 +pw_m4756x2: times 8 dw -4756*2 +pw_m5520x2: times 8 dw -5520*2 +pw_m8423x2: times 8 dw -8423*2 +pw_m9102x2: times 8 dw -9102*2 +pw_m10394x2: times 8 dw -10394*2 +pw_m11003x2: times 8 dw -11003*2 + +pw_16364x2: times 8 dw 16364*2 +pw_16305x2: times 8 dw 16305*2 +pw_16207x2: times 8 dw 16207*2 +pw_16069x2: times 8 dw 16069*2 +pw_15893x2: times 8 dw 15893*2 +pw_15679x2: times 8 dw 15679*2 +pw_15426x2: times 8 dw 15426*2 +pw_15137x2: times 8 dw 15137*2 +pw_14811x2: times 8 dw 14811*2 +pw_14449x2: times 8 dw 14449*2 +pw_14053x2: times 8 dw 14053*2 +pw_13623x2: times 8 dw 13623*2 +pw_13160x2: times 8 dw 13160*2 +pw_12665x2: times 8 dw 12665*2 +pw_12140x2: times 8 dw 12140*2 +pw__9760x2: times 8 dw 9760*2 +pw__7723x2: times 8 dw 7723*2 +pw__7005x2: times 8 dw 7005*2 +pw__6270x2: times 8 dw 6270*2 +pw__3981x2: times 8 dw 3981*2 +pw__3196x2: times 8 dw 3196*2 +pw__1606x2: times 8 dw 1606*2 +pw___804x2: times 8 dw 804*2 + +pd_8192: times 4 dd 8192 +pw_32: times 8 dw 32 +pw_16: times 8 dw 16 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 +pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 +%endmacro + +TRANSFORM_COEFFS 6270, 15137 +TRANSFORM_COEFFS 3196, 16069 +TRANSFORM_COEFFS 13623, 9102 + +; constants for 32x32_34 +TRANSFORM_COEFFS 804, 16364 +TRANSFORM_COEFFS 15426, 5520 +TRANSFORM_COEFFS 3981, 15893 +TRANSFORM_COEFFS 16207, 2404 +TRANSFORM_COEFFS 1606, 16305 +TRANSFORM_COEFFS 15679, 4756 +TRANSFORM_COEFFS 11585, 11585 + +; constants for 32x32_1024 +TRANSFORM_COEFFS 12140, 11003 +TRANSFORM_COEFFS 7005, 14811 +TRANSFORM_COEFFS 14053, 8423 +TRANSFORM_COEFFS 9760, 13160 +TRANSFORM_COEFFS 12665, 10394 +TRANSFORM_COEFFS 7723, 14449 + +%macro PAIR_PP_COEFFS 2 +dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MP_COEFFS 2 +dpw_m%1_%2: dw -%1, 
-%1, -%1, -%1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MM_COEFFS 2 +dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 +%endmacro + +PAIR_PP_COEFFS 30274, 12540 +PAIR_PP_COEFFS 6392, 32138 +PAIR_MP_COEFFS 18204, 27246 + +PAIR_PP_COEFFS 12540, 12540 +PAIR_PP_COEFFS 30274, 30274 +PAIR_PP_COEFFS 6392, 6392 +PAIR_PP_COEFFS 32138, 32138 +PAIR_MM_COEFFS 18204, 18204 +PAIR_PP_COEFFS 27246, 27246 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +%macro IDCT8_1D 0 + SUM_SUB 0, 4, 9 + BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 + pmulhrsw m0, m12 + pmulhrsw m4, m12 + BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 + BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 + + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + SUM_SUB 0, 6, 9 + SUM_SUB 4, 2, 9 + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 4, 3, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 4 +%endmacro + +; This macro handles 8 pixels per line +%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero + paddw m%1, m11 + paddw m%2, m11 + psraw m%1, 5 + psraw m%2, 5 + + movh m%3, [outputq] + movh m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%3, m%1 + paddw m%4, m%2 + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [outputq], m%3 + movh [outputq + strideq], m%4 +%endmacro + +INIT_XMM ssse3 +; full inverse 8x8 2D-DCT transform +cglobal idct8x8_64_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] +%if CONFIG_VP9_HIGHBITDEPTH + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] + mova m2, [inputq + 64] + packssdw m2, [inputq + 80] + mova m3, [inputq + 96] + packssdw m3, [inputq + 112] + mova m4, [inputq + 128] + packssdw m4, [inputq + 144] + mova m5, [inputq + 160] + packssdw m5, [inputq + 176] + mova m6, [inputq + 192] + packssdw m6, [inputq + 208] + mova m7, [inputq + 224] + packssdw m7, [inputq + 240] +%else + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + mova m4, [inputq + 64] + mova m5, [inputq + 80] + mova m6, [inputq + 96] + mova 
m7, [inputq + 112] +%endif + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero +cglobal idct8x8_12_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + +%if CONFIG_VP9_HIGHBITDEPTH + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] + mova m2, [inputq + 64] + packssdw m2, [inputq + 80] + mova m3, [inputq + 96] + packssdw m3, [inputq + 112] +%else + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] +%endif + + punpcklwd m0, m1 + punpcklwd m2, m3 + punpckhdq m9, m0, m2 + punpckldq m0, m2 + SWAP 2, 9 + + ; m0 -> [0], [0] + ; m1 -> [1], [1] + ; m2 -> [2], [2] + ; m3 -> [3], [3] + punpckhqdq m10, m0, m0 + punpcklqdq m0, m0 + punpckhqdq m9, m2, m2 + punpcklqdq m2, m2 + SWAP 1, 10 + SWAP 3, 9 + + pmulhrsw m0, m12 + pmulhrsw m2, [dpw_30274_12540] + pmulhrsw m1, [dpw_6392_32138] + pmulhrsw m3, [dpw_m18204_27246] + + SUM_SUB 0, 2, 9 + SUM_SUB 1, 3, 9 + + punpcklqdq m9, m3, m3 + punpckhqdq m5, m3, m9 + + SUM_SUB 3, 5, 9 + punpckhqdq m5, m3 + pmulhrsw m5, m12 + + punpckhqdq m9, m1, m5 + punpcklqdq m1, m5 + SWAP 5, 9 + + SUM_SUB 0, 5, 9 + SUM_SUB 2, 1, 9 + + punpckhqdq m3, m0, m0 + punpckhqdq m4, m1, m1 + punpckhqdq m6, m5, m5 + punpckhqdq m7, m2, m2 + + punpcklwd m0, m3 + punpcklwd m7, m2 + punpcklwd m1, m4 + punpcklwd m6, m5 + + punpckhdq m4, m0, m7 + punpckldq m0, m7 + punpckhdq m10, m1, m6 + punpckldq m5, m1, m6 + + punpckhqdq m1, m0, m5 + punpcklqdq m0, m5 + punpckhqdq m3, m4, m10 + punpcklqdq m2, m4, m10 + + + pmulhrsw m0, m12 + pmulhrsw m6, m2, [dpw_30274_30274] + pmulhrsw m4, m2, [dpw_12540_12540] + + pmulhrsw m7, m1, [dpw_32138_32138] + pmulhrsw m1, [dpw_6392_6392] + pmulhrsw m5, m3, [dpw_m18204_m18204] + pmulhrsw m3, [dpw_27246_27246] + + mova m2, m0 + SUM_SUB 0, 6, 9 + SUM_SUB 2, 4, 9 + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 2, 3, 9 + SUM_SUB 4, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 2 + SWAP 2, 4 + + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +%define idx0 16 * 0 +%define idx1 16 * 1 +%define idx2 16 * 2 +%define idx3 16 * 3 +%define idx4 16 * 4 +%define idx5 16 * 5 +%define idx6 16 * 6 +%define idx7 16 * 7 +%define idx8 16 * 0 +%define idx9 16 * 1 +%define idx10 16 * 2 +%define idx11 16 * 3 +%define idx12 16 * 4 +%define idx13 16 * 5 +%define idx14 16 * 6 +%define idx15 16 * 7 +%define idx16 16 * 0 +%define idx17 16 * 1 +%define idx18 16 * 2 +%define idx19 16 * 3 +%define idx20 16 * 4 +%define idx21 16 * 5 +%define idx22 16 * 6 +%define idx23 16 * 7 +%define idx24 16 * 0 +%define idx25 16 * 1 +%define idx26 16 * 2 +%define idx27 16 * 3 +%define idx28 16 * 4 +%define idx29 16 * 5 +%define idx30 16 * 6 +%define idx31 16 * 7 + +; FROM idct32x32_add_neon.asm +; +; Instead of doing the transforms stage by stage, it is done by loading +; some input values and 
doing as many stages as possible to minimize the +; storing/loading of intermediate results. To fit within registers, the +; final coefficients are cut into four blocks: +; BLOCK A: 16-19,28-31 +; BLOCK B: 20-23,24-27 +; BLOCK C: 8-11,12-15 +; BLOCK D: 0-3,4-7 +; Blocks A and C are straight calculation through the various stages. In +; block B, further calculations are performed using the results from +; block A. In block D, further calculations are performed using the results +; from block C and then the final calculations are done using results from +; block A and B which have been combined at the end of block B. +; + +%macro IDCT32X32_34 4 + ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m11, m1 + pmulhrsw m1, [pw___804x2] ; stp1_16 + mova [r4 + 0], m0 + pmulhrsw m11, [pw_16364x2] ; stp2_31 + mova [r4 + 16 * 2], m2 + mova m12, m7 + pmulhrsw m7, [pw_15426x2] ; stp1_28 + mova [r4 + 16 * 4], m4 + pmulhrsw m12, [pw_m5520x2] ; stp2_19 + mova [r4 + 16 * 6], m6 + + ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m2, m1 ; stp1_16 + mova m0, m11 ; stp1_31 + mova m4, m7 ; stp1_28 + mova m15, m12 ; stp1_19 + + ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 + BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 + + ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 + SUM_SUB 0, 15, 9 ; stp2_17, stp2_18 + SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 + SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 + + ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 + BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 + + ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m6, m5 + pmulhrsw m5, [pw__3981x2] ; stp1_20 + mova [stp + %4 + idx28], m12 + mova [stp + %4 + idx29], m15 + pmulhrsw m6, [pw_15893x2] ; stp2_27 + mova [stp + %4 + idx30], m2 + mova m2, m3 + pmulhrsw m3, [pw_m2404x2] ; stp1_23 + mova [stp + %4 + idx31], m11 + pmulhrsw m2, [pw_16207x2] ; stp2_24 + + ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m13, m5 ; stp1_20 + mova m14, m6 ; stp1_27 + mova m15, m3 ; stp1_23 + mova m11, m2 ; stp1_24 + + ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 + BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 + + ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 + SUM_SUB 15, 14, 9 ; stp2_22, stp2_21 + SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 + SUM_SUB 11, 13, 9 ; stp2_25, stp2_26 + + ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 + BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 + + ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 1, 3, 9 ; stp2_16, stp2_23 + SUM_SUB 0, 15, 9 ; stp2_17, stp2_22 + SUM_SUB 4, 14, 9 ; stp2_18, stp2_21 + SUM_SUB 7, 5, 9 ; stp2_19, stp2_20 + mova [stp + %3 + idx16], m1 + mova [stp + %3 + idx17], m0 + mova [stp + %3 + idx18], m4 + mova [stp + %3 + idx19], m7 + + mova m4, [stp + %4 + idx28] + mova m7, [stp + %4 + idx29] + mova m10, [stp + %4 + idx30] + mova m12, [stp + %4 + idx31] + 
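; Editor's note (not upstream code): SUM_SUB is an in-place butterfly; per + ; 16-bit lane it computes a' = a + b and b' = a - b (wrapping arithmetic). + ; Below it combines the reloaded 28-31 values with 27-24. +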
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 + SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 + SUM_SUB 10, 11, 9 ; stp2_30, stp2_25 + SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 + mova [stp + %4 + idx28], m4 + mova [stp + %4 + idx29], m7 + mova [stp + %4 + idx30], m10 + mova [stp + %4 + idx31], m12 + + ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 6, 5, 9 + pmulhrsw m6, m10 ; stp1_27 + pmulhrsw m5, m10 ; stp1_20 + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_26 + pmulhrsw m14, m10 ; stp1_21 + SUM_SUB 11, 15, 9 + pmulhrsw m11, m10 ; stp1_25 + pmulhrsw m15, m10 ; stp1_22 + SUM_SUB 2, 3, 9 + pmulhrsw m2, m10 ; stp1_24 + pmulhrsw m3, m10 ; stp1_23 +%else + BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 + SWAP 6, 5 + BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 + SWAP 13, 14 + BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 + SWAP 11, 15 + BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 + SWAP 2, 3 +%endif + + mova [stp + %4 + idx24], m2 + mova [stp + %4 + idx25], m11 + mova [stp + %4 + idx26], m13 + mova [stp + %4 + idx27], m6 + + ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in + 16 * 2] + mova m6, [rsp + transposed_in + 16 * 6] + + mova m1, m0 + pmulhrsw m0, [pw__1606x2] ; stp1_8 + mova [stp + %3 + idx20], m5 + mova [stp + %3 + idx21], m14 + pmulhrsw m1, [pw_16305x2] ; stp2_15 + mova [stp + %3 + idx22], m15 + mova m7, m6 + pmulhrsw m7, [pw_m4756x2] ; stp2_11 + mova [stp + %3 + idx23], m3 + pmulhrsw m6, [pw_15679x2] ; stp1_12 + + ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m3, m0 ; stp1_8 + mova m2, m1 ; stp1_15 + + ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 + mova m4, m7 ; stp1_11 + mova m5, m6 ; stp1_12 + BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 + + ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 + SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 + SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 + SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 + + ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 5, 4, 9 + pmulhrsw m5, m10 ; stp1_13 + pmulhrsw m4, m10 ; stp1_10 + SUM_SUB 6, 7, 9 + pmulhrsw m6, m10 ; stp1_12 + pmulhrsw m7, m10 ; stp1_11 +%else + BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 + SWAP 5, 4 + BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 + SWAP 6, 7 +%endif + + ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova [stp + %2 + idx8], m0 + mova [stp + %2 + idx9], m2 + mova [stp + %2 + idx10], m4 + mova [stp + %2 + idx11], m7 + + ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m11, [rsp + transposed_in + 16 * 4] + mova m12, m11 + pmulhrsw m11, [pw__3196x2] ; stp1_4 + pmulhrsw m12, [pw_16069x2] ; stp1_7 + + ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in 
+ 16 * 0] + mova m10, [pw_11585x2] + pmulhrsw m0, m10 ; stp1_1 + + mova m14, m11 ; stp1_4 + mova m13, m12 ; stp1_7 + + ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_6 + pmulhrsw m14, m10 ; stp1_5 +%else + BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 + SWAP 13, 14 +%endif + mova m7, m0 ; stp1_0 = stp1_1 + mova m4, m0 ; stp1_1 + mova m2, m7 ; stp1_0 + + ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 + SUM_SUB 7, 13, 9 ; stp1_1, stp1_6 + SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 + SUM_SUB 4, 11, 9 ; stp1_3, stp1_4 + + ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 1, 9 ; stp1_0, stp1_15 + SUM_SUB 7, 3, 9 ; stp1_1, stp1_14 + SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 + SUM_SUB 4, 6, 9 ; stp1_3, stp1_12 + + ; 0-3, 28-31 final stage + mova m15, [stp + %4 + idx30] + mova m10, [stp + %4 + idx31] + SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 + SUM_SUB 7, 15, 9 ; stp1_1, stp1_30 + mova [stp + %1 + idx0], m0 + mova [stp + %1 + idx1], m7 + mova [stp + %4 + idx30], m15 + mova [stp + %4 + idx31], m10 + mova m7, [stp + %4 + idx28] + mova m0, [stp + %4 + idx29] + SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 + SUM_SUB 4, 7, 9 ; stp1_3, stp1_28 + mova [stp + %1 + idx2], m2 + mova [stp + %1 + idx3], m4 + mova [stp + %4 + idx28], m7 + mova [stp + %4 + idx29], m0 + + ; 12-15, 16-19 final stage + mova m0, [stp + %3 + idx16] + mova m7, [stp + %3 + idx17] + mova m2, [stp + %3 + idx18] + mova m4, [stp + %3 + idx19] + SUM_SUB 1, 0, 9 ; stp1_15, stp1_16 + SUM_SUB 3, 7, 9 ; stp1_14, stp1_17 + SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 + SUM_SUB 6, 4, 9 ; stp1_12, stp1_19 + mova [stp + %2 + idx12], m6 + mova [stp + %2 + idx13], m5 + mova [stp + %2 + idx14], m3 + mova [stp + %2 + idx15], m1 + mova [stp + %3 + idx16], m0 + mova [stp + %3 + idx17], m7 + mova [stp + %3 + idx18], m2 + mova [stp + %3 + idx19], m4 + + mova m4, [stp + %2 + idx8] + mova m5, [stp + %2 + idx9] + mova m6, [stp + %2 + idx10] + mova m7, [stp + %2 + idx11] + SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 + SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 + SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 + SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 + + ; 4-7, 24-27 final stage + mova m0, [stp + %4 + idx27] + mova m1, [stp + %4 + idx26] + mova m2, [stp + %4 + idx25] + mova m3, [stp + %4 + idx24] + SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 + SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 + SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 + SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 + mova [stp + %4 + idx27], m0 + mova [stp + %4 + idx26], m1 + mova [stp + %4 + idx25], m2 + mova [stp + %4 + idx24], m3 + mova [stp + %1 + idx4], m11 + mova [stp + %1 + idx5], m14 + mova [stp + %1 + idx6], m13 + mova [stp + %1 + idx7], m12 + + ; 8-11, 20-23 final stage + mova m0, [stp + %3 + idx20] + mova m1, [stp + %3 + idx21] + mova m2, [stp + %3 + idx22] + mova m3, [stp + %3 + idx23] + SUM_SUB 7, 0, 9 ; stp1_11, stp_20 + SUM_SUB 6, 1, 9 ; stp1_10, stp_21 + SUM_SUB 5, 2, 9 ; stp1_9, stp_22 + SUM_SUB 4, 3, 9 ; stp1_8, stp_23 + mova [stp + %2 + idx8], m4 + mova [stp + %2 + idx9], m5 + mova [stp + %2 + idx10], m6 + mova [stp + %2 + idx11], m7 + mova [stp + %3 + idx20], m0 + mova [stp + %3 + idx21], m1 + mova [stp + %3 + idx22], m2 + mova [stp + %3 + idx23], m3 +%endmacro + +%macro RECON_AND_STORE 1 + mova m11, [pw_32] + lea stp, [rsp + %1] + mov r6, 32 + pxor m8, m8 +%%recon_and_store: + mova m0, [stp + 16 * 32 * 0] + mova m1, [stp + 16 * 32 * 1] 
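+ ; Editor's note: each pass of this loop reconstructs one 32-pixel output + ; row: the four 16-byte loads fetch 8 words from each column block of the + ; 32x32 result, which are then rounded ((x + 32) >> 6), added to the + ; prediction and saturated to 8 bits before being stored.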
+ mova m2, [stp + 16 * 32 * 2] + mova m3, [stp + 16 * 32 * 3] + add stp, 16 + + paddw m0, m11 + paddw m1, m11 + paddw m2, m11 + paddw m3, m11 + psraw m0, 6 + psraw m1, 6 + psraw m2, 6 + psraw m3, 6 + movh m4, [outputq + 0] + movh m5, [outputq + 8] + movh m6, [outputq + 16] + movh m7, [outputq + 24] + punpcklbw m4, m8 + punpcklbw m5, m8 + punpcklbw m6, m8 + punpcklbw m7, m8 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + packuswb m0, m1 + packuswb m2, m3 + mova [outputq + 0], m0 + mova [outputq + 16], m2 + lea outputq, [outputq + strideq] + dec r6 + jnz %%recon_and_store +%endmacro + +%define i32x32_size 16*32*5 +%define pass_two_start 16*32*0 +%define transposed_in 16*32*4 +%define pass_one_start 16*32*0 +%define stp r8 + +INIT_XMM ssse3 +cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride + mova m8, [pd_8192] + lea stp, [rsp + pass_one_start] + +idct32x32_34: + mov r3, inputq + lea r4, [rsp + transposed_in] + +idct32x32_34_transpose: +%if CONFIG_VP9_HIGHBITDEPTH + mova m0, [r3 + 0] + packssdw m0, [r3 + 16] + mova m1, [r3 + 32 * 4] + packssdw m1, [r3 + 32 * 4 + 16] + mova m2, [r3 + 32 * 8] + packssdw m2, [r3 + 32 * 8 + 16] + mova m3, [r3 + 32 * 12] + packssdw m3, [r3 + 32 * 12 + 16] + mova m4, [r3 + 32 * 16] + packssdw m4, [r3 + 32 * 16 + 16] + mova m5, [r3 + 32 * 20] + packssdw m5, [r3 + 32 * 20 + 16] + mova m6, [r3 + 32 * 24] + packssdw m6, [r3 + 32 * 24 + 16] + mova m7, [r3 + 32 * 28] + packssdw m7, [r3 + 32 * 28 + 16] +%else + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 4] + mova m2, [r3 + 16 * 8] + mova m3, [r3 + 16 * 12] + mova m4, [r3 + 16 * 16] + mova m5, [r3 + 16 * 20] + mova m6, [r3 + 16 * 24] + mova m7, [r3 + 16 * 28] +%endif + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + IDCT32X32_34 16*0, 16*32, 16*64, 16*96 + lea stp, [stp + 16 * 8] + mov r6, 4 + lea stp, [rsp + pass_one_start] + lea r9, [rsp + pass_one_start] + +idct32x32_34_2: + lea r4, [rsp + transposed_in] + mov r3, r9 + +idct32x32_34_transpose_2: + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 1] + mova m2, [r3 + 16 * 2] + mova m3, [r3 + 16 * 3] + mova m4, [r3 + 16 * 4] + mova m5, [r3 + 16 * 5] + mova m6, [r3 + 16 * 6] + mova m7, [r3 + 16 * 7] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + IDCT32X32_34 16*0, 16*8, 16*16, 16*24 + + lea stp, [stp + 16 * 32] + add r9, 16 * 32 + dec r6 + jnz idct32x32_34_2 + + RECON_AND_STORE pass_two_start + + RET + +%macro IDCT32X32_135 4 + ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m1, [rsp + transposed_in + 16 * 1] + mova m11, m1 + pmulhrsw m1, [pw___804x2] ; stp1_16 + pmulhrsw m11, [pw_16364x2] ; stp2_31 + + mova m7, [rsp + transposed_in + 16 * 7] + mova m12, m7 + pmulhrsw m7, [pw_15426x2] ; stp1_28 + pmulhrsw m12, [pw_m5520x2] ; stp2_19 + + mova m3, [rsp + transposed_in + 16 * 9] + mova m4, m3 + pmulhrsw m3, [pw__7005x2] ; stp1_18 + pmulhrsw m4, [pw_14811x2] ; stp2_29 + + mova m0, [rsp + transposed_in + 16 * 15] + mova m2, m0 + pmulhrsw m0, [pw_12140x2] ; stp1_30 + pmulhrsw m2, [pw_m11003x2] ; stp2_17 + + ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 + SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 + SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 + SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 + + ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 + BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 + + ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 
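; Editor's note: stage 4 pairs each output with its mirrored counterpart + ; (16 with 19, 17 with 18, 31 with 28, 30 with 29) via SUM_SUB butterflies. +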
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 + SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 + SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 + SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 + + ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 + BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 + + mova [stp + %3 + idx16], m1 + mova [stp + %3 + idx17], m0 + mova [stp + %3 + idx18], m4 + mova [stp + %3 + idx19], m7 + mova [stp + %4 + idx28], m12 + mova [stp + %4 + idx29], m3 + mova [stp + %4 + idx30], m2 + mova [stp + %4 + idx31], m11 + + ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m2, [rsp + transposed_in + 16 * 3] + mova m3, m2 + pmulhrsw m3, [pw_m2404x2] ; stp1_23 + pmulhrsw m2, [pw_16207x2] ; stp2_24 + + mova m5, [rsp + transposed_in + 16 * 5] + mova m6, m5 + pmulhrsw m5, [pw__3981x2] ; stp1_20 + pmulhrsw m6, [pw_15893x2] ; stp2_27 + + mova m14, [rsp + transposed_in + 16 * 11] + mova m13, m14 + pmulhrsw m13, [pw_m8423x2] ; stp1_21 + pmulhrsw m14, [pw_14053x2] ; stp2_26 + + mova m0, [rsp + transposed_in + 16 * 13] + mova m1, m0 + pmulhrsw m0, [pw__9760x2] ; stp1_22 + pmulhrsw m1, [pw_13160x2] ; stp2_25 + + ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 + SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 + SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 + SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 + + ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 + BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 + + ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 + SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 + SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 + SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 + + ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 + BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 + + ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m4, [stp + %3 + idx16] + mova m7, [stp + %3 + idx17] + mova m11, [stp + %3 + idx18] + mova m12, [stp + %3 + idx19] + SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 + SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 + SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 + SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 + mova [stp + %3 + idx16], m4 + mova [stp + %3 + idx17], m7 + mova [stp + %3 + idx18], m11 + mova [stp + %3 + idx19], m12 + + mova m4, [stp + %4 + idx28] + mova m7, [stp + %4 + idx29] + mova m11, [stp + %4 + idx30] + mova m12, [stp + %4 + idx31] + SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 + SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 + SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 + SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 + mova [stp + %4 + idx28], m4 + mova [stp + %4 + idx29], m7 + mova [stp + %4 + idx30], m11 + mova [stp + %4 + idx31], m12 + + ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 6, 5, 9 + pmulhrsw m6, m10 ; stp1_27 + pmulhrsw m5, m10 ; stp1_20 + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_26 + pmulhrsw m14, m10 ; stp1_21 + SUM_SUB 1, 0, 9 + pmulhrsw m1, m10 ; stp1_25 + pmulhrsw m0, m10 ; stp1_22 + SUM_SUB 2, 3, 9 + pmulhrsw m2, m10 ; stp1_25 + pmulhrsw m3, m10 ; stp1_22 +%else + BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 + SWAP 6, 5 + 
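; Editor's note: with both coefficients 11585 (cos(pi/4) in Q14 fixed point), + ; BUTTERFLY_4X produces (a - b)/sqrt(2) and (a + b)/sqrt(2) using 32-bit + ; intermediates, avoiding the 16-bit overflow of the disabled path above. +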
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 + SWAP 13, 14 + BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 + SWAP 1, 0 + BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 + SWAP 2, 3 +%endif + mova [stp + %3 + idx20], m5 + mova [stp + %3 + idx21], m14 + mova [stp + %3 + idx22], m0 + mova [stp + %3 + idx23], m3 + mova [stp + %4 + idx24], m2 + mova [stp + %4 + idx25], m1 + mova [stp + %4 + idx26], m13 + mova [stp + %4 + idx27], m6 + + ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in + 16 * 2] + mova m1, m0 + pmulhrsw m0, [pw__1606x2] ; stp1_8 + pmulhrsw m1, [pw_16305x2] ; stp2_15 + + mova m6, [rsp + transposed_in + 16 * 6] + mova m7, m6 + pmulhrsw m7, [pw_m4756x2] ; stp2_11 + pmulhrsw m6, [pw_15679x2] ; stp1_12 + + mova m4, [rsp + transposed_in + 16 * 10] + mova m5, m4 + pmulhrsw m4, [pw__7723x2] ; stp1_10 + pmulhrsw m5, [pw_14449x2] ; stp2_13 + + mova m2, [rsp + transposed_in + 16 * 14] + mova m3, m2 + pmulhrsw m3, [pw_m10394x2] ; stp1_9 + pmulhrsw m2, [pw_12665x2] ; stp2_14 + + ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 + SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 + SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 + SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 + + ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 + BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 + + ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 + SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 + SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 + SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 + + ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 5, 4, 9 + pmulhrsw m5, m10 ; stp1_13 + pmulhrsw m4, m10 ; stp1_10 + SUM_SUB 6, 7, 9 + pmulhrsw m6, m10 ; stp1_12 + pmulhrsw m7, m10 ; stp1_11 +%else + BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 + SWAP 5, 4 + BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 + SWAP 6, 7 +%endif + ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova [stp + %2 + idx8], m0 + mova [stp + %2 + idx9], m2 + mova [stp + %2 + idx10], m4 + mova [stp + %2 + idx11], m7 + mova [stp + %2 + idx12], m6 + mova [stp + %2 + idx13], m5 + mova [stp + %2 + idx14], m3 + mova [stp + %2 + idx15], m1 + + ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m11, [rsp + transposed_in + 16 * 4] + mova m12, m11 + pmulhrsw m11, [pw__3196x2] ; stp1_4 + pmulhrsw m12, [pw_16069x2] ; stp1_7 + + mova m13, [rsp + transposed_in + 16 * 12] + mova m14, m13 + pmulhrsw m13, [pw_13623x2] ; stp1_6 + pmulhrsw m14, [pw_m9102x2] ; stp1_5 + + ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in + 16 * 0] + mova m2, [rsp + transposed_in + 16 * 8] + pmulhrsw m0, [pw_11585x2] ; stp1_1 + mova m3, m2 + pmulhrsw m2, [pw__6270x2] ; stp1_2 + pmulhrsw m3, [pw_15137x2] ; stp1_3 + + SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 + SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 + + ; BLOCK D 
STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_6 + pmulhrsw m14, m10 ; stp1_5 +%else + BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 + SWAP 13, 14 +%endif + mova m1, m0 ; stp1_0 = stp1_1 + SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 + SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 + + ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 + SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 + SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 + SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 + + ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m4, [stp + %2 + idx12] + mova m5, [stp + %2 + idx13] + mova m6, [stp + %2 + idx14] + mova m7, [stp + %2 + idx15] + SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 + SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 + SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 + SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 + + ; 0-3, 28-31 final stage + mova m10, [stp + %4 + idx31] + mova m15, [stp + %4 + idx30] + SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 + SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 + mova [stp + %1 + idx0], m0 + mova [stp + %1 + idx1], m1 + mova [stp + %4 + idx31], m10 + mova [stp + %4 + idx30], m15 + mova m0, [stp + %4 + idx29] + mova m1, [stp + %4 + idx28] + SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 + SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 + mova [stp + %1 + idx2], m2 + mova [stp + %1 + idx3], m3 + mova [stp + %4 + idx29], m0 + mova [stp + %4 + idx28], m1 + + ; 12-15, 16-19 final stage + mova m0, [stp + %3 + idx16] + mova m1, [stp + %3 + idx17] + mova m2, [stp + %3 + idx18] + mova m3, [stp + %3 + idx19] + SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 + SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 + SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 + SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 + mova [stp + %2 + idx12], m4 + mova [stp + %2 + idx13], m5 + mova [stp + %2 + idx14], m6 + mova [stp + %2 + idx15], m7 + mova [stp + %3 + idx16], m0 + mova [stp + %3 + idx17], m1 + mova [stp + %3 + idx18], m2 + mova [stp + %3 + idx19], m3 + + mova m4, [stp + %2 + idx8] + mova m5, [stp + %2 + idx9] + mova m6, [stp + %2 + idx10] + mova m7, [stp + %2 + idx11] + SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 + SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 + SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 + SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 + + ; 4-7, 24-27 final stage + mova m3, [stp + %4 + idx24] + mova m2, [stp + %4 + idx25] + mova m1, [stp + %4 + idx26] + mova m0, [stp + %4 + idx27] + SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 + SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 + SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 + SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 + mova [stp + %4 + idx24], m3 + mova [stp + %4 + idx25], m2 + mova [stp + %4 + idx26], m1 + mova [stp + %4 + idx27], m0 + mova [stp + %1 + idx4], m11 + mova [stp + %1 + idx5], m14 + mova [stp + %1 + idx6], m13 + mova [stp + %1 + idx7], m12 + + ; 8-11, 20-23 final stage + mova m0, [stp + %3 + idx20] + mova m1, [stp + %3 + idx21] + mova m2, [stp + %3 + idx22] + mova m3, [stp + %3 + idx23] + SUM_SUB 7, 0, 9 ; stp1_11, stp_20 + SUM_SUB 6, 1, 9 ; stp1_10, stp_21 + SUM_SUB 5, 2, 9 ; stp1_9, stp_22 + SUM_SUB 4, 3, 9 ; stp1_8, stp_23 + mova [stp + %2 + idx8], m4 + mova [stp + %2 + idx9], m5 + mova [stp + %2 + idx10], m6 + mova [stp + %2 + idx11], m7 + mova [stp + %3 + idx20], m0 + mova [stp + %3 + idx21], m1 + mova [stp + %3 + idx22], m2 + mova [stp + %3 + idx23], m3 +%endmacro + +INIT_XMM ssse3 +cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride + mova m8, [pd_8192] + mov r6, 2 + lea stp, 
[rsp + pass_one_start] + +idct32x32_135: + mov r3, inputq + lea r4, [rsp + transposed_in] + mov r7, 2 + +idct32x32_135_transpose: +%if CONFIG_VP9_HIGHBITDEPTH + mova m0, [r3 + 0] + packssdw m0, [r3 + 16] + mova m1, [r3 + 32 * 4] + packssdw m1, [r3 + 32 * 4 + 16] + mova m2, [r3 + 32 * 8] + packssdw m2, [r3 + 32 * 8 + 16] + mova m3, [r3 + 32 * 12] + packssdw m3, [r3 + 32 * 12 + 16] + mova m4, [r3 + 32 * 16] + packssdw m4, [r3 + 32 * 16 + 16] + mova m5, [r3 + 32 * 20] + packssdw m5, [r3 + 32 * 20 + 16] + mova m6, [r3 + 32 * 24] + packssdw m6, [r3 + 32 * 24 + 16] + mova m7, [r3 + 32 * 28] + packssdw m7, [r3 + 32 * 28 + 16] +%else + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 4] + mova m2, [r3 + 16 * 8] + mova m3, [r3 + 16 * 12] + mova m4, [r3 + 16 * 16] + mova m5, [r3 + 16 * 20] + mova m6, [r3 + 16 * 24] + mova m7, [r3 + 16 * 28] +%endif + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + mova [r4 + 0], m0 + mova [r4 + 16 * 1], m1 + mova [r4 + 16 * 2], m2 + mova [r4 + 16 * 3], m3 + mova [r4 + 16 * 4], m4 + mova [r4 + 16 * 5], m5 + mova [r4 + 16 * 6], m6 + mova [r4 + 16 * 7], m7 + +%if CONFIG_VP9_HIGHBITDEPTH + add r3, 32 +%else + add r3, 16 +%endif + add r4, 16 * 8 + dec r7 + jne idct32x32_135_transpose + + IDCT32X32_135 16*0, 16*32, 16*64, 16*96 + lea stp, [stp + 16 * 8] +%if CONFIG_VP9_HIGHBITDEPTH + lea inputq, [inputq + 32 * 32] +%else + lea inputq, [inputq + 16 * 32] +%endif + dec r6 + jnz idct32x32_135 + + mov r6, 4 + lea stp, [rsp + pass_one_start] + lea r9, [rsp + pass_one_start] + +idct32x32_135_2: + lea r4, [rsp + transposed_in] + mov r3, r9 + mov r7, 2 + +idct32x32_135_transpose_2: + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 1] + mova m2, [r3 + 16 * 2] + mova m3, [r3 + 16 * 3] + mova m4, [r3 + 16 * 4] + mova m5, [r3 + 16 * 5] + mova m6, [r3 + 16 * 6] + mova m7, [r3 + 16 * 7] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + mova [r4 + 0], m0 + mova [r4 + 16 * 1], m1 + mova [r4 + 16 * 2], m2 + mova [r4 + 16 * 3], m3 + mova [r4 + 16 * 4], m4 + mova [r4 + 16 * 5], m5 + mova [r4 + 16 * 6], m6 + mova [r4 + 16 * 7], m7 + + add r3, 16 * 8 + add r4, 16 * 8 + dec r7 + jne idct32x32_135_transpose_2 + + IDCT32X32_135 16*0, 16*8, 16*16, 16*24 + + lea stp, [stp + 16 * 32] + add r9, 16 * 32 + dec r6 + jnz idct32x32_135_2 + + RECON_AND_STORE pass_two_start + + RET + +%macro IDCT32X32_1024 4 + ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m1, [rsp + transposed_in + 16 * 1] + mova m11, [rsp + transposed_in + 16 * 31] + BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31 + + mova m0, [rsp + transposed_in + 16 * 15] + mova m2, [rsp + transposed_in + 16 * 17] + BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30 + + mova m7, [rsp + transposed_in + 16 * 7] + mova m12, [rsp + transposed_in + 16 * 25] + BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28 + + mova m3, [rsp + transposed_in + 16 * 9] + mova m4, [rsp + transposed_in + 16 * 23] + BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29 + + ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 + SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 + SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 + SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 + + ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 + BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 + + ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 + 
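; (editor's note: same mirrored stage-4 pairing as the 135-coefficient path) +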
SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 + SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 + SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 + + ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 + BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 + + mova [stp + %3 + idx16], m1 + mova [stp + %3 + idx17], m0 + mova [stp + %3 + idx18], m4 + mova [stp + %3 + idx19], m7 + mova [stp + %4 + idx28], m12 + mova [stp + %4 + idx29], m3 + mova [stp + %4 + idx30], m2 + mova [stp + %4 + idx31], m11 + + ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m5, [rsp + transposed_in + 16 * 5] + mova m6, [rsp + transposed_in + 16 * 27] + BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27 + + mova m13, [rsp + transposed_in + 16 * 21] + mova m14, [rsp + transposed_in + 16 * 11] + BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26 + + mova m0, [rsp + transposed_in + 16 * 13] + mova m1, [rsp + transposed_in + 16 * 19] + BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25 + + mova m2, [rsp + transposed_in + 16 * 3] + mova m3, [rsp + transposed_in + 16 * 29] + BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24 + + ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 + SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 + SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 + SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 + + ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 + BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 + + ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 + SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 + SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 + SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 + + ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 + BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 + + ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m4, [stp + %3 + idx16] + mova m7, [stp + %3 + idx17] + mova m11, [stp + %3 + idx18] + mova m12, [stp + %3 + idx19] + SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 + SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 + SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 + SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 + mova [stp + %3 + idx16], m4 + mova [stp + %3 + idx17], m7 + mova [stp + %3 + idx18], m11 + mova [stp + %3 + idx19], m12 + + mova m4, [stp + %4 + idx28] + mova m7, [stp + %4 + idx29] + mova m11, [stp + %4 + idx30] + mova m12, [stp + %4 + idx31] + SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 + SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 + SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 + SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 + mova [stp + %4 + idx28], m4 + mova [stp + %4 + idx29], m7 + mova [stp + %4 + idx30], m11 + mova [stp + %4 + idx31], m12 + + ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 6, 5, 9 + pmulhrsw m6, m10 ; stp1_27 + pmulhrsw m5, m10 ; stp1_20 + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_26 + pmulhrsw m14, m10 ; stp1_21 + SUM_SUB 1, 0, 9 + pmulhrsw m1, m10 ; stp1_25 + pmulhrsw m0, m10 ; stp1_22 + SUM_SUB 2, 3, 9 + pmulhrsw m2, m10 ; stp1_25 + pmulhrsw m3, m10 ; stp1_22 +%else + BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, 
stp1_27 + SWAP 6, 5 + BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 + SWAP 13, 14 + BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 + SWAP 1, 0 + BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 + SWAP 2, 3 +%endif + mova [stp + %3 + idx20], m5 + mova [stp + %3 + idx21], m14 + mova [stp + %3 + idx22], m0 + mova [stp + %3 + idx23], m3 + mova [stp + %4 + idx24], m2 + mova [stp + %4 + idx25], m1 + mova [stp + %4 + idx26], m13 + mova [stp + %4 + idx27], m6 + + ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in + 16 * 2] + mova m1, [rsp + transposed_in + 16 * 30] + BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15 + + mova m2, [rsp + transposed_in + 16 * 14] + mova m3, [rsp + transposed_in + 16 * 18] + BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14 + + mova m4, [rsp + transposed_in + 16 * 10] + mova m5, [rsp + transposed_in + 16 * 22] + BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13 + + mova m6, [rsp + transposed_in + 16 * 6] + mova m7, [rsp + transposed_in + 16 * 26] + BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12 + + ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 + SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 + SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 + SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 + + ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 + BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 + + ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 + SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 + SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 + SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 + + ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 5, 4, 9 + pmulhrsw m5, m10 ; stp1_13 + pmulhrsw m4, m10 ; stp1_10 + SUM_SUB 6, 7, 9 + pmulhrsw m6, m10 ; stp1_12 + pmulhrsw m7, m10 ; stp1_11 +%else + BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 + SWAP 5, 4 + BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 + SWAP 6, 7 +%endif + ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova [stp + %2 + idx8], m0 + mova [stp + %2 + idx9], m2 + mova [stp + %2 + idx10], m4 + mova [stp + %2 + idx11], m7 + mova [stp + %2 + idx12], m6 + mova [stp + %2 + idx13], m5 + mova [stp + %2 + idx14], m3 + mova [stp + %2 + idx15], m1 + + ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m11, [rsp + transposed_in + 16 * 4] + mova m12, [rsp + transposed_in + 16 * 28] + BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7 + + mova m13, [rsp + transposed_in + 16 * 12] + mova m14, [rsp + transposed_in + 16 * 20] + BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6 + + ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in + 16 * 0] + mova m1, [rsp + transposed_in + 16 * 16] + +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 
0, 1, 9 + pmulhrsw m0, m10 ; stp1_1 + pmulhrsw m1, m10 ; stp1_0 +%else + BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0 + SWAP 0, 1 +%endif + mova m2, [rsp + transposed_in + 16 * 8] + mova m3, [rsp + transposed_in + 16 * 24] + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3 + + mova m10, [pw_11585x2] + SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 + SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 + + ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_6 + pmulhrsw m14, m10 ; stp1_5 +%else + BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 + SWAP 13, 14 +%endif + SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 + SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 + + ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 + SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 + SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 + SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 + + ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m4, [stp + %2 + idx12] + mova m5, [stp + %2 + idx13] + mova m6, [stp + %2 + idx14] + mova m7, [stp + %2 + idx15] + SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 + SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 + SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 + SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 + + ; 0-3, 28-31 final stage + mova m10, [stp + %4 + idx31] + mova m15, [stp + %4 + idx30] + SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 + SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 + mova [stp + %1 + idx0], m0 + mova [stp + %1 + idx1], m1 + mova [stp + %4 + idx31], m10 + mova [stp + %4 + idx30], m15 + mova m0, [stp + %4 + idx29] + mova m1, [stp + %4 + idx28] + SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 + SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 + mova [stp + %1 + idx2], m2 + mova [stp + %1 + idx3], m3 + mova [stp + %4 + idx29], m0 + mova [stp + %4 + idx28], m1 + + ; 12-15, 16-19 final stage + mova m0, [stp + %3 + idx16] + mova m1, [stp + %3 + idx17] + mova m2, [stp + %3 + idx18] + mova m3, [stp + %3 + idx19] + SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 + SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 + SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 + SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 + mova [stp + %2 + idx12], m4 + mova [stp + %2 + idx13], m5 + mova [stp + %2 + idx14], m6 + mova [stp + %2 + idx15], m7 + mova [stp + %3 + idx16], m0 + mova [stp + %3 + idx17], m1 + mova [stp + %3 + idx18], m2 + mova [stp + %3 + idx19], m3 + + mova m4, [stp + %2 + idx8] + mova m5, [stp + %2 + idx9] + mova m6, [stp + %2 + idx10] + mova m7, [stp + %2 + idx11] + SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 + SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 + SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 + SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 + + ; 4-7, 24-27 final stage + mova m3, [stp + %4 + idx24] + mova m2, [stp + %4 + idx25] + mova m1, [stp + %4 + idx26] + mova m0, [stp + %4 + idx27] + SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 + SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 + SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 + SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 + mova [stp + %4 + idx24], m3 + mova [stp + %4 + idx25], m2 + mova [stp + %4 + idx26], m1 + mova [stp + %4 + idx27], m0 + mova [stp + %1 + idx4], m11 + mova [stp + %1 + idx5], m14 + mova [stp + %1 + idx6], m13 + mova [stp + %1 + idx7], m12 + + ; 8-11, 20-23 final stage + mova m0, [stp + %3 + idx20] + mova m1, [stp + %3 + idx21] + mova m2, [stp + %3 + idx22] + mova m3, [stp + %3 + idx23] + SUM_SUB 7, 0, 9 ; stp1_11, stp_20 + SUM_SUB 6, 1, 9 ; stp1_10, stp_21 + SUM_SUB 5, 2, 9 ; stp1_9, stp_22 + SUM_SUB 4, 3, 9 ; stp1_8, stp_23 + mova [stp + %2 + idx8], m4 + 
mova [stp + %2 + idx9], m5 + mova [stp + %2 + idx10], m6 + mova [stp + %2 + idx11], m7 + mova [stp + %3 + idx20], m0 + mova [stp + %3 + idx21], m1 + mova [stp + %3 + idx22], m2 + mova [stp + %3 + idx23], m3 +%endmacro + +INIT_XMM ssse3 +cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride + mova m8, [pd_8192] + mov r6, 4 + lea stp, [rsp + pass_one_start] + +idct32x32_1024: + mov r3, inputq + lea r4, [rsp + transposed_in] + mov r7, 4 + +idct32x32_1024_transpose: +%if CONFIG_VP9_HIGHBITDEPTH + mova m0, [r3 + 0] + packssdw m0, [r3 + 16] + mova m1, [r3 + 32 * 4] + packssdw m1, [r3 + 32 * 4 + 16] + mova m2, [r3 + 32 * 8] + packssdw m2, [r3 + 32 * 8 + 16] + mova m3, [r3 + 32 * 12] + packssdw m3, [r3 + 32 * 12 + 16] + mova m4, [r3 + 32 * 16] + packssdw m4, [r3 + 32 * 16 + 16] + mova m5, [r3 + 32 * 20] + packssdw m5, [r3 + 32 * 20 + 16] + mova m6, [r3 + 32 * 24] + packssdw m6, [r3 + 32 * 24 + 16] + mova m7, [r3 + 32 * 28] + packssdw m7, [r3 + 32 * 28 + 16] +%else + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 4] + mova m2, [r3 + 16 * 8] + mova m3, [r3 + 16 * 12] + mova m4, [r3 + 16 * 16] + mova m5, [r3 + 16 * 20] + mova m6, [r3 + 16 * 24] + mova m7, [r3 + 16 * 28] +%endif + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + mova [r4 + 0], m0 + mova [r4 + 16 * 1], m1 + mova [r4 + 16 * 2], m2 + mova [r4 + 16 * 3], m3 + mova [r4 + 16 * 4], m4 + mova [r4 + 16 * 5], m5 + mova [r4 + 16 * 6], m6 + mova [r4 + 16 * 7], m7 +%if CONFIG_VP9_HIGHBITDEPTH + add r3, 32 +%else + add r3, 16 +%endif + add r4, 16 * 8 + dec r7 + jne idct32x32_1024_transpose + + IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 + + lea stp, [stp + 16 * 8] +%if CONFIG_VP9_HIGHBITDEPTH + lea inputq, [inputq + 32 * 32] +%else + lea inputq, [inputq + 16 * 32] +%endif + dec r6 + jnz idct32x32_1024 + + mov r6, 4 + lea stp, [rsp + pass_one_start] + lea r9, [rsp + pass_one_start] + +idct32x32_1024_2: + lea r4, [rsp + transposed_in] + mov r3, r9 + mov r7, 4 + +idct32x32_1024_transpose_2: + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 1] + mova m2, [r3 + 16 * 2] + mova m3, [r3 + 16 * 3] + mova m4, [r3 + 16 * 4] + mova m5, [r3 + 16 * 5] + mova m6, [r3 + 16 * 6] + mova m7, [r3 + 16 * 7] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + mova [r4 + 0], m0 + mova [r4 + 16 * 1], m1 + mova [r4 + 16 * 2], m2 + mova [r4 + 16 * 3], m3 + mova [r4 + 16 * 4], m4 + mova [r4 + 16 * 5], m5 + mova [r4 + 16 * 6], m6 + mova [r4 + 16 * 7], m7 + + add r3, 16 * 8 + add r4, 16 * 8 + dec r7 + jne idct32x32_1024_transpose_2 + + IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 + + lea stp, [stp + 16 * 32] + add r9, 16 * 32 + dec r6 + jnz idct32x32_1024_2 + + RECON_AND_STORE pass_two_start + + RET +%endif diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm new file mode 100644 index 000000000000..fbbcd76bd7b4 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm @@ -0,0 +1,109 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro REORDER_INPUTS 0 + ; a c d b to a b c d + SWAP 1, 3, 2 +%endmacro + +%macro TRANSFORM_COLS 0 + ; input: + ; m0 a + ; m1 b + ; m2 c + ; m3 d + paddw m0, m2 + psubw m3, m1 + + ; wide subtract + punpcklwd m4, m0 + punpcklwd m5, m3 + psrad m4, 16 + psrad m5, 16 + psubd m4, m5 + psrad m4, 1 + packssdw m4, m4 ; e + + psubw m5, m4, m1 ; b + psubw m4, m2 ; c + psubw m0, m5 + paddw m3, m4 + ; m0 a + SWAP 1, 5 ; m1 b + SWAP 2, 4 ; m2 c + ; m3 d +%endmacro + +%macro TRANSPOSE_4X4 0 + punpcklwd m0, m2 + punpcklwd m1, m3 + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 +%macro TRANSPOSE_4X4_WIDE 0 + mova m3, m0 + punpcklwd m0, m1 + punpckhwd m3, m1 + mova m2, m0 + punpcklwd m0, m3 + punpckhwd m2, m3 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero + movd m%3, [outputq] + movd m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%1, m%3 + paddw m%2, m%4 + packuswb m%1, m%5 + packuswb m%2, m%5 + movd [outputq], m%1 + movd [outputq + strideq], m%2 +%endmacro + +INIT_XMM sse2 +cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride +%if CONFIG_VP9_HIGHBITDEPTH + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] +%else + mova m0, [inputq + 0] + mova m1, [inputq + 16] +%endif + psraw m0, 2 + psraw m1, 2 + + TRANSPOSE_4X4_WIDE + REORDER_INPUTS + TRANSFORM_COLS + TRANSPOSE_4X4 + REORDER_INPUTS + TRANSFORM_COLS + + pxor m4, m4 + ADD_STORE_4P_2X 0, 1, 5, 6, 4 + lea outputq, [outputq + 2 * strideq] + ADD_STORE_4P_2X 2, 3, 5, 6, 4 + + RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c new file mode 100644 index 000000000000..be1087c1e951 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c @@ -0,0 +1,979 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> /* AVX2 */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt 
= _mm_adds_epi8(filt, work_a); + /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128( + _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *) (s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *) (s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), + _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), + _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *) (s + 7 * p))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), + _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), + _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + 
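/* Editor's note: pixelFilter_p/q and pixetFilter_* ("pixet" is upstream's + * spelling) hold running tap sums for the wide (flat2) and narrow (flat) + * smoothing filters; each output below is (sum + rounding) >> 4 or >> 3. */ +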
pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, + _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, + _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, + _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), + 4); + flat2_q4p4 = 
_mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), + 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { + 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, + 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 +}; + +void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + 
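+  /* NOTE: filters one 16-pixel-wide horizontal edge in place; s points at
+   * the q0 row and p is the row pitch in bytes.  Only the first byte of
+   * _blimit/_limit/_thresh is used and is broadcast to all 16 lanes.  The
+   * filt_loopfilter_avx2 table above is a shuffle pattern that zero-extends
+   * each byte to a 16-bit lane, replacing the unpacklo/unpackhi pairs used
+   * by the SSE2 path. */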
__m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, + q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, + p256_0, q256_0; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 5 * p))); + p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 4 * p))); + p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 3 * p))); + p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 2 * p))); + p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 1 * p))); + q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 0 * p))); + q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 1 * p))); + q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 2 * p))); + q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 3 * p))); + q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 4 * p))); + + p4 = _mm256_castsi256_si128(p256_4); + p3 = _mm256_castsi256_si128(p256_3); + p2 = _mm256_castsi256_si128(p256_2); + p1 = _mm256_castsi256_si128(p256_1); + p0 = _mm256_castsi256_si128(p256_0); + q0 = _mm256_castsi256_si128(q256_0); + q1 = _mm256_castsi256_si128(q256_1); + q2 = _mm256_castsi256_si128(q256_2); + q3 = _mm256_castsi256_si128(q256_3); + q4 = _mm256_castsi256_si128(q256_4); + + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + 
const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, + flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, + flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 6 * p))); + q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 5 * p))); + p5 = _mm256_castsi256_si128(p256_5); + q5 = _mm256_castsi256_si128(q256_5); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 7 * p))); + q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 6 * p))); + p6 = _mm256_castsi256_si128(p256_6); + q6 = _mm256_castsi256_si128(q256_6); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s - 8 * p))); + q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( + (__m128d const *)(s + 7 * p))); + 
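+      /* NOTE: after the final max/compare below, a flat2 lane is all-ones
+       * exactly when |p4-p0| .. |p7-p0| and |q4-q0| .. |q7-q0| are all <= 1
+       * and the lane already passed the flat and filter masks; only those
+       * lanes take the wide 15-tap result instead of the filter8 one. */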
p7 = _mm256_castsi256_si128(p256_7); + q7 = _mm256_castsi256_si128(q256_7); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, + res_q; + + const __m256i filter = _mm256_load_si256( + (__m256i const *)filt_loopfilter_avx2); + p256_7 = _mm256_shuffle_epi8(p256_7, filter); + p256_6 = _mm256_shuffle_epi8(p256_6, filter); + p256_5 = _mm256_shuffle_epi8(p256_5, filter); + p256_4 = _mm256_shuffle_epi8(p256_4, filter); + p256_3 = _mm256_shuffle_epi8(p256_3, filter); + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = _mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + q256_3 = _mm256_shuffle_epi8(q256_3, filter); + q256_4 = _mm256_shuffle_epi8(q256_4, filter); + q256_5 = _mm256_shuffle_epi8(q256_5, filter); + q256_6 = _mm256_shuffle_epi8(q256_6, filter); + q256_7 = _mm256_shuffle_epi8(q256_7, filter); + + pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, + _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, + _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + + pixelFilter_p = _mm256_add_epi16(eight, + _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(four, + _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(p256_7, p256_0)), 4); + + flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(q256_7, q256_0)), 4); + + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), 3); + + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), 3); + + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(p256_7, p256_7); + + sum_q7 = _mm256_add_epi16(q256_7, q256_7); + + sum_p3 = _mm256_add_epi16(p256_3, p256_3); + + sum_q3 = _mm256_add_epi16(q256_3, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + + res_p = _mm256_srli_epi16( + 
_mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_1)), 4); + + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_1)), 4); + + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), 3); + + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), 3); + + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_2)), 4); + + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_2)), 4); + + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), 3); + + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), 3); + + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_3)), 4); + + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_3)), 4); + + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_4)), 4); + + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_4)), 4); + + flat2_q4 = _mm256_castsi256_si128( + 
_mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_5)), 4); + + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_5)), 4); + + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_6)), 4); + + flat2_p6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_6)), 4); + + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *) (s - 7 * p), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *) (s - 6 * p), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *) (s - 5 * p), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *) (s - 4 * p), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *) (s - 3 * p), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = _mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *) (s - 2 * p), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *) (s - 1 * p), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *) (s - 0 * p), q0); + + q1 = _mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = 
_mm_or_si128(flat2_q1, q1);
+    _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+    q2 = _mm_andnot_si128(flat2, q2);
+    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+    q2 = _mm_or_si128(flat2_q2, q2);
+    _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+    q3 = _mm_andnot_si128(flat2, q3);
+    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+    q3 = _mm_or_si128(flat2_q3, q3);
+    _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+    q4 = _mm_andnot_si128(flat2, q4);
+    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+    q4 = _mm_or_si128(flat2_q4, q4);
+    _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+    q5 = _mm_andnot_si128(flat2, q5);
+    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+    q5 = _mm_or_si128(flat2_q5, q5);
+    _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+    q6 = _mm_andnot_si128(flat2, q6);
+    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+    q6 = _mm_or_si128(flat2_q6, q6);
+    _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+  }
+}
diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c
new file mode 100644
index 000000000000..739adf31d067
--- /dev/null
+++ b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c
@@ -0,0 +1,1776 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+// filter_mask and hev_mask
+#define FILTER_HEV_MASK do { \
+  /* (abs(q1 - q0), abs(p1 - p0) */ \
+  __m128i flat = abs_diff(q1p1, q0p0); \
+  /* abs(p1 - q1), abs(p0 - q0) */ \
+  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
+  __m128i abs_p0q0, abs_p1q1, work; \
+  \
+  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
+  hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
+  hev = _mm_cmpgt_epi16(hev, thresh); \
+  hev = _mm_packs_epi16(hev, hev); \
+  \
+  /* const int8_t mask = filter_mask(*limit, *blimit, */ \
+  /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */ \
+  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\
+  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
+  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
+  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
+  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
+  /* abs(p3 - p2), abs(p2 - p1) */ \
+  work = abs_diff(p3p2, p2p1); \
+  flat = _mm_max_epu8(work, flat); \
+  /* abs(q3 - q2), abs(q2 - q1) */ \
+  work = abs_diff(q3q2, q2q1); \
+  flat = _mm_max_epu8(work, flat); \
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
+  mask = _mm_unpacklo_epi64(mask, flat); \
+  mask = _mm_subs_epu8(mask, limit); \
+  mask = _mm_cmpeq_epi8(mask, zero); \
+  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
+} while (0)
+
+#define FILTER4 do { \
+  const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \
+                                    4, 4, 4, 4, 4, 4, 4, 4); \
+  const __m128i t80 = _mm_set1_epi8(0x80); \
+  __m128i filter, filter2filter1, work; \
+  \
+  ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
+  qs1qs0 =
_mm_xor_si128(q1q0, t80); \ + \ + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ + work = _mm_subs_epi8(ps1ps0, qs1qs0); \ + filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ + filter = _mm_and_si128(filter, mask); /* & mask */ \ + filter = _mm_unpacklo_epi64(filter, filter); \ + \ + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ + \ + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ + filter = _mm_unpacklo_epi8(filter, filter); \ + filter = _mm_srai_epi16(filter, 9); /* round */ \ + filter = _mm_packs_epi16(filter, filter); \ + filter = _mm_andnot_si128(hev, filter); \ + \ + hev = _mm_unpackhi_epi64(filter2filter1, filter); \ + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ + \ + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ + qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ + ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ + qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ + ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ +} while (0) + +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i limit = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + const __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; + + p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s - 4 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s + 0 * p))); + q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); + + FILTER_HEV_MASK; + FILTER4; + + _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 +} + +void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i limit = + 
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + const __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i x0, x1, x2, x3; + __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); + + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); + + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); + + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); + + // Transpose 8x8 + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + p1p0 = _mm_unpacklo_epi16(q1q0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x0 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + p3p2 = _mm_unpacklo_epi32(p1p0, x0); + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + p1p0 = _mm_unpackhi_epi32(p1p0, x0); + p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high + p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + q1q0 = _mm_unpackhi_epi16(q1q0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x2 = _mm_unpackhi_epi16(x2, x3); + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + q3q2 = _mm_unpackhi_epi32(q1q0, x2); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + q1q0 = _mm_unpacklo_epi32(q1q0, x2); + + q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); + q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); + + FILTER_HEV_MASK; + FILTER4; + + // Transpose 8x4 to 4x8 + // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37 + // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); + // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 + x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); + // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); + + *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + + *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 7 * p - 2) = 
_mm_cvtsi128_si32(qs1qs0); +} + +void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), + (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), + (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), + (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), + (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), + (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), + abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + // 
Filter1 >> 3 + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *)(s + 6 * p))); + flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *)(s + 7 * p))); + work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero);; + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, + pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, + pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + 
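+      /* NOTE: from here on, both running sums slide their window by one
+       * sample per output: another copy of p7 (resp. q7) is accumulated via
+       * sum_p7/sum_q7 while the outermost sample still counted on the
+       * opposite side is subtracted, so each further flat_/flat2_ result
+       * costs a handful of adds instead of a full re-summation. */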
res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = 
_mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +static INLINE __m128i filter_add2_sub2(const __m128i *const total, + const __m128i *const a1, + const __m128i *const a2, + const __m128i *const s1, + const __m128i *const s2) { + __m128i x = _mm_add_epi16(*a1, *total); + x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); + return x; +} + +static INLINE __m128i filter8_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f8_lo, + const __m128i *const f8_hi) { + const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), + _mm_srli_epi16(*f8_hi, 3)); + const __m128i result = _mm_and_si128(*flat, f8); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +static INLINE __m128i filter16_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f_lo, + const __m128i *const f_hi) { + const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), + _mm_srli_epi16(*f_hi, 4)); + const __m128i result = _mm_and_si128(*flat, f); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, 
+ const unsigned char *_limit, + const unsigned char *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + + __m128i op2, op1, op0, oq0, oq1, oq2; + + __m128i max_abs_p1p0q1q0; + + p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); + p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + + { + const __m128i abs_p1p0 = abs_diff(p1, p0); + const __m128i abs_q1q0 = abs_diff(q1, q0); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i abs_p0q0 = abs_diff(p0, q0); + __m128i abs_p1q1 = abs_diff(p1, q1); + __m128i work; + max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + { + __m128i work; + work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); + flat = _mm_max_epu8(work, max_abs_p1p0q1q0); + work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i ff = 
_mm_cmpeq_epi8(t4, t4); + + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + op1 = _mm_xor_si128(p1, t80); + op0 = _mm_xor_si128(p0, t80); + oq0 = _mm_xor_si128(q0, t80); + oq1 = _mm_xor_si128(q1, t80); + + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); + + work_a = _mm_subs_epi8(oq0, op0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); + oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); + // loopfilter done + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter8 + { + const __m128i four = _mm_set1_epi16(4); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + __m128i f8_lo, f8_hi; + + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), + _mm_add_epi16(p3_lo, p2_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); + + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), + _mm_add_epi16(p3_hi, p2_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); + + op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); + op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); + op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); + + f8_lo = 
filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); + oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); + oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); + oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); + const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); + const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); + const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); + const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); + const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); + const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); + + const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); + const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); + const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); + const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); + const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); + const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); + const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); + + __m128i f_lo; + __m128i f_hi; + + f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 + f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), + _mm_add_epi16(p4_lo, f_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); + f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); + + f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 + f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), + _mm_add_epi16(p4_hi, f_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); + f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); + + p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + + f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); + p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + + f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, 
&q2_hi, &p4_hi, &p5_hi, &p7_hi); + p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + + f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); + p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + + f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); + op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + + f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); + op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); + op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); + oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); + oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); + oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); + q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); + q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); + q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); + q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + } +} + +void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + 
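// The 128-bit qXpX registers below pack the 8-pixel pX row into the low + // half and the qX row into the high half; _mm_shuffle_epi32(x, 78) then + // swaps the 64-bit halves to build the matching pXqX order, so both sides + // of the edge share one filter pass. +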
__m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; + + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + // filter_mask and hev_mask + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), + abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), + abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[0], + 
_mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + } + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 11); + filter1 = _mm_packs_epi16(filter1, filter1); + + // Filter2 >> 3 + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 11); + filter2 = _mm_packs_epi16(filter2, zero); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + filt = _mm_unpacklo_epi8(zero, filt); + filt = _mm_srai_epi16(filt, 9); + filt = _mm_packs_epi16(filt, zero); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_loadl_epi64((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_loadl_epi64((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_loadl_epi64((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_loadl_epi64((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_loadl_epi64((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = 
_mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + } +} + +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + + // filter_mask and hev_mask + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = 
_mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + int i = 0; + + do { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + src += 8; + } while (++i < 2); + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + 
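// 3 * (qs0 - ps0) is built from three saturating byte adds since SSE2 has + // no 8-bit multiply; adding the same signed value three times saturates to + // the same result as the scalar code's single clamp. +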
filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } +} + +void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + const __m128i zero = _mm_set1_epi16(0); + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i mask, hev, flat; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i 
*)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + + // filter_mask and hev_mask + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q1 = 
_mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + } +} + +static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, + int in_p, unsigned char *out, int out_p) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i x8, x9, x10, x11, x12, x13, x14, x15; + + // 2-way interleave w/hoisting of unpacks + x0 = _mm_loadl_epi64((__m128i *)in0); // 1 + x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 + x0 = _mm_unpacklo_epi8(x0, x1); // 1 + + x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 + x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7 + x1 = _mm_unpacklo_epi8(x2, x3); // 2 + + x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9 + x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11 + x2 = _mm_unpacklo_epi8(x4, x5); // 3 + + x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13 + x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15 + x3 = _mm_unpacklo_epi8(x6, x7); // 4 + x4 = _mm_unpacklo_epi16(x0, x1); // 9 + + x8 = _mm_loadl_epi64((__m128i *)in1); // 2 + x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 + x8 = _mm_unpacklo_epi8(x8, x9); // 5 + x5 = _mm_unpacklo_epi16(x2, x3); // 10 + + x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 + x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8 + x9 = _mm_unpacklo_epi8(x10, x11); // 6 + + x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10 + x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12 + x10 = _mm_unpacklo_epi8(x12, x13); // 7 + x12 = _mm_unpacklo_epi16(x8, x9); // 11 + + x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14 + x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16 + x11 = _mm_unpacklo_epi8(x14, x15); // 8 + x13 = _mm_unpacklo_epi16(x10, x11); // 12 + + x6 = _mm_unpacklo_epi32(x4, x5); // 13 + x7 = _mm_unpackhi_epi32(x4, x5); // 14 + x14 = _mm_unpacklo_epi32(x12, x13); // 15 + x15 = _mm_unpackhi_epi32(x12, x13); // 16 + + // Store first 4-line result + _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); + + x4 = _mm_unpackhi_epi16(x0, x1); + x5 = _mm_unpackhi_epi16(x2, x3); + x12 = _mm_unpackhi_epi16(x8, x9); + x13 = _mm_unpackhi_epi16(x10, x11); + + x6 = _mm_unpacklo_epi32(x4, x5); + x7 = _mm_unpackhi_epi32(x4, x5); + x14 = _mm_unpacklo_epi32(x12, x13); + x15 = _mm_unpackhi_epi32(x12, x13); + + // Store second 4-line result + _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); +} + +static INLINE void transpose(unsigned char *src[], int in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + do { + unsigned char *in = src[idx8x8]; + unsigned char *out = dst[idx8x8]; + + x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 + x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 
11 12 13 14 15 16 17 + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + x0 = _mm_unpacklo_epi8(x0, x1); + + x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 + x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(x2, x3); + + x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 + x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(x4, x5); + + x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 + x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(x6, x7); + + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + x4 = _mm_unpacklo_epi16(x0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x5 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + x6 = _mm_unpacklo_epi32(x4, x5); + _mm_storel_pd((double *)(out + 0*out_p), + _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 + _mm_storeh_pd((double *)(out + 1*out_p), + _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi32(x4, x5); + _mm_storel_pd((double *)(out + 2*out_p), + _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 + _mm_storeh_pd((double *)(out + 3*out_p), + _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi16(x0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi16(x2, x3); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + x6 = _mm_unpacklo_epi32(x4, x5); + _mm_storel_pd((double *)(out + 4*out_p), + _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 + _mm_storeh_pd((double *)(out + 5*out_p), + _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi32(x4, x5); + + _mm_storel_pd((double *)(out + 6*out_p), + _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 + _mm_storeh_pd((double *)(out + 7*out_p), + _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + transpose(src, 16, dst, p, 2); +} + +void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); + unsigned char *src[1]; + unsigned char *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + transpose(src, p, dst, 8, 1); + + // Loop filtering + vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + transpose(src, 8, dst, p, 1); +} + +void 
vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + transpose(src, 16, dst, p, 2); +} + +void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); + unsigned char *src[2]; + unsigned char *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + transpose(src, p, dst, 8, 2); + + // Loop filtering + vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + transpose(src, 8, dst, p, 2); +} + +void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + DECLARE_ALIGNED(16, unsigned char, t_dst[256]); + + // Transpose 16x16 + transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); + + // Transpose back + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h new file mode 100644 index 000000000000..536b206876c2 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_ + +#include <emmintrin.h> +#include "vpx/vpx_integer.h" + +#define pair_set_epi16(a, b) \ + _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +#define dual_set_epi16(a, b) \ + _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ + (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) + +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ + _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ + (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) + +#endif // VPX_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c new file mode 100644 index 000000000000..422b0fc422d6 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" + +#if HAVE_SSE2 +filter8_1dfunction vpx_filter_block1d16_v8_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_sse2; +filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; + +filter8_1dfunction vpx_filter_block1d16_v2_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_sse2; +filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; + +// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); + +// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2); +FUN_CONV_2D(avg_ , sse2); + +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction 
vpx_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; + +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; + +// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + sse2); + +// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2); +HIGH_FUN_CONV_2D(avg_ , sse2); +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +#endif // HAVE_SSE2 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm new file mode 100644 index 000000000000..abc027065559 --- /dev/null +++ 
b/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -0,0 +1,228 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1-2 +%ifidn %1, avg +%define AUX_XMM_REGS 4 +%else +%define AUX_XMM_REGS 0 +%endif +%ifidn %2, highbd +%define pavg pavgw +cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h, bd +%else +%define pavg pavgb +cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h +%endif + mov r4d, dword wm +%ifidn %2, highbd + shl r4d, 1 + shl srcq, 1 + shl src_strideq, 1 + shl dstq, 1 + shl dst_strideq, 1 +%else + cmp r4d, 4 + je .w4 +%endif + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 +%ifidn %2, highbd + cmp r4d, 64 + je .w64 + + mov r4d, dword hm +.loop128: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + add dstq, dst_strideq + dec r4d + jnz .loop128 + RET +%endif + +.w64: + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + dec r4d + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq +16] + pavg m2, [dstq+dst_strideq] + pavg m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+dst_strideq] + pavg m2, [dstq+dst_strideq*2] + pavg m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movh m4, [dstq] + movh m5, [dstq+dst_strideq]
+ movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +%ifnidn %2, highbd +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movd m0, [srcq] + movd m1, [srcq+src_strideq] + movd m2, [srcq+src_strideq*2] + movd m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movd m4, [dstq] + movd m5, [dstq+dst_strideq] + movd m6, [dstq+dst_strideq*2] + movd m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movd [dstq ], m0 + movd [dstq+dst_strideq ], m1 + movd [dstq+dst_strideq*2], m2 + movd [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endif +%endmacro + +INIT_XMM sse2 +convolve_fn copy +convolve_fn avg +%if CONFIG_VP9_HIGHBITDEPTH +convolve_fn copy, highbd +convolve_fn avg, highbd +%endif diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c new file mode 100644 index 000000000000..d8a92354c98c --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -0,0 +1,606 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Due to a header conflict between math.h and intrinsics includes with ceil() +// in certain configurations under vs9 this include needs to precede +// immintrin.h. + +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" +#include "vpx_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#if defined(__clang__) +// -- GODOT start -- +# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (!defined(__MACPORTS__) && defined(__APPLE__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +// -- GODOT end -- +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# else // clang > 3.3, and not 5.0 on macosx.
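+// This is the standard AVX2 intrinsic name; the branches above only exist +// to cope with compilers that shipped it under a different name or signature.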
+# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // clang <= 3.3 +#elif defined(__GNUC__) +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +# else // gcc > 4.7 +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // gcc <= 4.6 +#else // !(gcc || clang) +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i-=2) { + // load the 2 strides of source + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr - 3))); + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm_loadu_si128((const __m128i *) + (src_ptr+src_pixels_per_line-3)), 1); + + // filter the source buffer + srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + 
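// (filt2Reg/filt3Reg gather the middle tap pairs of the 8-tap kernel) +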
srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 5))); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm_loadu_si128((const __m128i *) + (src_ptr+src_pixels_per_line+5)), 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, + srcRegFilt32b2_1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr+=dst_stride; + } + + // if the number of strides is odd. 
+ // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(addFilterReg64)); + + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + } +} + +static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t 
*output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr))); + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); + srcReg32b3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); + srcReg32b4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); + srcReg32b5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + srcReg32b6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm256_castsi256_si128(srcReg32b2), 1); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm256_castsi256_si128(srcReg32b3), 1); + srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, + _mm256_castsi256_si128(srcReg32b4), 1); + srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, + _mm256_castsi256_si128(srcReg32b5), 1); + srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, + _mm256_castsi256_si128(srcReg32b6), 1); + srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, + _mm256_castsi256_si128(srcReg32b7), 1); + + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + // save + srcReg32b5 = 
_mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + + for (i = output_height; i > 1; i-=2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr+=dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = _mm_unpacklo_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + 
srcRegFilt7 = _mm_unpackhi_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + + + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + } +} + +#if HAVE_AVX2 && HAVE_SSSE3 +filter8_1dfunction vpx_filter_block1d4_v8_ssse3; +#if ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 +#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 +#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction vpx_filter_block1d8_v8_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_ssse3; +#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 +#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 +#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 +#endif // ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d16_v2_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_ssse3; +#define 
vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 +#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 +#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 +#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 +#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 +#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 +#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 +// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000000..6fd52087c7b9 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,915 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Due to a header conflict between math.h and intrinsics includes with ceil() +// in certain configurations under vs9 this include needs to precede +// tmmintrin.h. + +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_dsp/x86/convolve.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +// These are reused by the avx2 intrinsics.
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; + +void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i firstFilters, secondFilters, shuffle1, shuffle2; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 =_mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bit in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the seconds 16 bits in the filter into the second lane + // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the forth 16 bits in the filter into the second lane + // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); + shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // extract the higher half of the lane + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + + minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + + // add and saturate all the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + src_ptr+=src_pixels_per_line; + + // save only 4 bytes + *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, + ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 
= _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); + + // add and saturate all the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pixels_per_line; + + // save only 8 bytes + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_pitch, + uint8_t *output_ptr, + ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i addFilterReg64, filtersReg, minReg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; + __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; + __m128i srcReg8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + // load the first 7 rows of 8 bytes + srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + + for (i = 0; i < output_height; i++) { + // load the last 8 bytes + srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the result together + srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); + srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + + // merge the result together + srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); + + // add and saturate the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pitch; + + // shift down a row + srcReg1 = srcReg2; + srcReg2 = srcReg3; + srcReg3 = srcReg4; + srcReg4 = srcReg5; + srcReg5 = srcReg6; + srcReg6 = srcReg7; + srcReg7 = srcReg8; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=out_pitch; + } +} + +filter8_1dfunction vpx_filter_block1d16_v8_ssse3; +filter8_1dfunction vpx_filter_block1d16_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_ssse3; +filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; + +filter8_1dfunction vpx_filter_block1d16_v2_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_ssse3; 
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_ssse3; +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; + +// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + ssse3); + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ + const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ + const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ + const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ + \ + const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ + const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ + const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ + const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ + \ + out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ + out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ + out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ + out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ + out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ + out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ + out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ + out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ +} + +static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); + const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); + const __m128i C = 
_mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); + const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); + const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); + const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); + const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); + const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); + // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 + const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); + // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 + const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); + // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 + const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); + // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 + const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); + // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 + const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); + // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 + const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); + // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); + const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); + const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); + const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i*)dst, temp); +} + +static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride) { + __m128i A, B, C, D, E, F, G, H; + + A = _mm_loadl_epi64((const __m128i *)src); + B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); + C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); + E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); + F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); + G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); + H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); + + TRANSPOSE_8X8(A, B, C, D, E, F, G, H, + A, B, C, D, E, F, G, H); + + _mm_storel_epi64((__m128i*)dst, A); + _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B); + _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C); + _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D); + _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E); + _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F); + _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G); + _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H); +} + +static void 
scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 8) { + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + transpose8x8_to_dst(temp, 8, dst + x, dst_stride); + } + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + // TRANSPOSE... 
+ // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // + // TO + // + // 00 10 20 30 + // 01 11 21 31 + // 02 12 22 32 + // 03 13 23 33 + // 04 14 24 34 + // 05 15 25 35 + // 06 16 26 36 + // 07 17 27 37 + // + // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 + const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); + // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 + const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); + // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 + const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 + const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); + // 02 03 12 13 22 23 32 33 + const __m128i s3s2 = _mm_srli_si128(s1s0, 8); + // 06 07 16 17 26 27 36 37 + const __m128i s7s6 = _mm_srli_si128(s5s4, 8); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride) { + __m128i A = _mm_cvtsi32_si128(*(const int *)src); + __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); + __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); + __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); + // 00 10 01 11 02 12 03 13 + const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); + // 20 30 21 31 22 32 23 33 + const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + A = _mm_unpacklo_epi16(tr0_0, tr0_1); + B = _mm_srli_si128(A, 4); + C = _mm_srli_si128(A, 8); + D = _mm_srli_si128(A, 12); + + *(int *)(dst) = _mm_cvtsi128_si32(A); + *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); + *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); + *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 4) { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + transpose4x4_to_dst(temp, 4, dst + x, dst_stride); + } + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static void filter_vert_w4_ssse3(const uint8_t *src_ptr, 
ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); + const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); + const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); + const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); + const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); + const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); + const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); + const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); + const __m128i s1s0 = _mm_unpacklo_epi8(A, B); + const __m128i s3s2 = _mm_unpacklo_epi8(C, D); + const __m128i s5s4 = _mm_unpacklo_epi8(E, F); + const __m128i s7s6 = _mm_unpacklo_epi8(G, H); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + + y_q4 += y_step_q4; + } +} + +static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + 
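  // (The row loads A through H here feed the same 8-tap pattern as the
  // horizontal kernels, applied down a column: rows are interleaved in
  // pairs so each _mm_maddubs_epi16 applies one tap pair per column. The
  // scaled-convolve helpers round with _mm_mulhrs_epi16(temp, 1 << 8),
  // i.e. (temp * 256 + (1 << 14)) >> 15, which is exactly the
  // (temp + 64) >> 7 rounding of the fixed-kernel paths above.)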
const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + const __m128i s1s0 = _mm_unpacklo_epi8(A, B); + const __m128i s3s2 = _mm_unpacklo_epi8(C, D); + const __m128i s5s4 = _mm_unpacklo_epi8(E, F); + const __m128i s7s6 = _mm_unpacklo_epi8(G, H); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i*)dst, temp); +} + +static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter, int w) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + int i; + + for (i = 0; i < w; i += 16) { + const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); + const __m128i C = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + const __m128i D = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + const __m128i E = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + const __m128i F = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + const __m128i G = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + const __m128i H = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + // merge the result together + const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); + const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); + const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); + const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); + // multiply 2 adjacent 
elements with the filter and add the result + const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); + const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); + const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); + const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); + // add and saturate the results together + const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); + const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); + // merge the result together + const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); + const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); + const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); + // merge the result together + const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); + const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); + const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); + // add and saturate the results together + __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); + __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); + + // add and saturate the results together + temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); + temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); + // round and shift by 7 bit each 16 bit + temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); + temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + temp_hi = _mm_packus_epi16(temp_lo, temp_hi); + src_ptr += 16; + // save 16 bytes convolve result + _mm_store_si128((__m128i*)&dst[i], temp_hi); + } +} + +static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
+ // --Require an additional 8 rows for the horiz_w8 transpose tail. + DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, x_filters, x0_q4, x_step_q4, + w, intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, x_filters, x0_q4, x_step_q4, + w, intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, y_filters, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, y_filters, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, y_filters, y0_q4, y_step_q4, w, h); + } +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. + return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + scaledconvolve2d(src, src_stride, dst, dst_stride, + filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h); +} + +// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_ , ssse3); diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm new file mode 100644 index 000000000000..08f3d6a6cf35 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm @@ -0,0 +1,987 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. 
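This note, like the min/max ordering used throughout the intrinsics earlier in this patch, guards against the same hazard: the partial pair sums live in int16, the two center taps (taps 3 and 4) are by far the largest, and a center-first summation can saturate before the negative outer terms pull the total back into range. Applied outer-taps-first, a clip can only happen on the final add, where 32767 still shifts and packs to the correct saturated 255. A minimal C demonstration with made-up partial sums (the helper models paddsw / _mm_adds_epi16):

static int16_t adds16(int a, int b) {
  int s = a + b;                       // saturating 16-bit add
  return (int16_t)(s > 32767 ? 32767 : (s < -32768 ? -32768 : s));
}
// Illustrative partial pair sums: x0 = -2000, x1 = 17000, x2 = 18000,
// x3 = -1000; the exact total, 32000, fits in int16.
//
// Center-first: adds16(17000, 18000) clips to 32767, and the negative
// terms then land on the clipped value: 32767 - 2000 - 1000 = 29767 (wrong).
// Outer-first:  adds16(-2000, -1000) = -3000; -3000 + 17000 = 14000;
//               14000 + 18000 = 32000 (exact).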
+ +%macro GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + punpcklqdq xmm5, xmm4 + punpcklqdq xmm6, xmm7 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm2 + movdqa k5k4, xmm5 + movdqa k6k7, xmm6 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro APPLY_FILTER_4 1 + punpckldq xmm0, xmm1 ;two row in one register + punpckldq xmm6, xmm7 + punpckldq xmm2, xmm3 + punpckldq xmm5, xmm4 + + punpcklbw xmm0, zero ;unpack to word + punpcklbw xmm6, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + + pmullw xmm0, k0k1 ;multiply the filter factors + pmullw xmm6, k6k7 + pmullw xmm2, k2k3 + pmullw xmm5, k5k4 + + paddsw xmm0, xmm6 ;sum + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + paddsw xmm0, xmm2 + psrldq xmm2, 8 + paddsw xmm0, xmm5 + psrldq xmm5, 8 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 +%endm + +%macro GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm0 + punpcklwd xmm1, xmm1 + punpcklwd xmm2, xmm2 + punpcklwd xmm3, xmm3 + punpckhwd xmm4, xmm4 + punpckhwd xmm5, xmm5 + punpckhwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movdqa k0, xmm0 ;store filter factors on stack + movdqa k1, xmm1 + movdqa k2, xmm2 + movdqa k3, xmm3 + movdqa k4, xmm4 + movdqa k5, xmm5 + movdqa k6, xmm6 + movdqa k7, xmm7 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro LOAD_VERT_8 1 + movq xmm0, [rsi + %1] ;0 + movq xmm1, [rsi + rax + %1] ;1 + movq xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2 + %1] ;7 + movq xmm2, [rsi + rax + %1] ;2 + movq xmm3, [rsi + rax * 2 + %1] ;3 + movq xmm4, [rsi + rdx + %1] ;4 + movq xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro APPLY_FILTER_8 2 + punpcklbw xmm0, zero + punpcklbw xmm1, zero + punpcklbw xmm6, zero + punpcklbw xmm7, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + punpcklbw xmm3, zero + punpcklbw xmm4, zero + + pmullw xmm0, k0 + pmullw xmm1, k1 + pmullw xmm6, k6 + pmullw xmm7, k7 + pmullw xmm2, k2 + pmullw xmm5, k5 + pmullw xmm3, k3 + pmullw xmm4, k4 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm6 + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm3 + paddsw xmm0, xmm4 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi + %2] + pavgb xmm0, xmm1 +%endif + movq [rdi + %2], xmm0 +%endm + +;void vpx_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; 
short *filter +;) +global sym(vpx_filter_block1d4_v8_sse2) PRIVATE +sym(vpx_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vpx_filter_block1d8_v8_sse2) PRIVATE +sym(vpx_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vpx_filter_block1d16_v8_sse2) PRIVATE +sym(vpx_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 0, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE +sym(vpx_filter_block1d4_v8_avg_sse2): + push 
rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE +sym(vpx_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE +sym(vpx_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 1, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vpx_filter_block1d4_h8_sse2) PRIVATE +sym(vpx_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define 
zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vpx_filter_block1d8_h8_sse2) PRIVATE +sym(vpx_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vpx_filter_block1d16_h8_sse2) PRIVATE +sym(vpx_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + 
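+    ; second half of the 16-wide row: the byte shifts below realign the
+    ; taps against the pixels just reloaded from [rsi + 5]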
psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE +sym(vpx_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE +sym(vpx_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE +sym(vpx_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + 
psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm new file mode 100644 index 000000000000..d2cb8ea29274 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -0,0 +1,629 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_64: times 8 dw 64 + +; %define USE_PMULHRSW +; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss +; when using this instruction. +; +; The add order below (based on ffvp9) must be followed to prevent outranges. +; x = k0k1 + k4k5 +; y = k2k3 + k6k7 +; z = signed SAT(x + y) + +SECTION .text +%if ARCH_X86_64 + %define LOCAL_VARS_SIZE 16*4 +%else + %define LOCAL_VARS_SIZE 16*6 +%endif + +%macro SETUP_LOCAL_VARS 0 + ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + + ; pmaddubsw has a higher latency on some platforms, this might be eased by + ; interleaving the instructions. + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + packsswb m4, m4 + ; TODO(slavarnway): multiple pshufb instructions had a higher latency on + ; some platforms. 
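+  ; After the packsswb above, each word of m4 holds one adjacent tap
+  ; pair as two signed bytes: (k0,k1)(k2,k3)(k4,k5)(k6,k7). The
+  ; shuffles below broadcast one pair per register, so a single
+  ; pmaddubsw later computes src[i]*k_even + src[i+1]*k_odd per lane.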
+ pshuflw m0, m4, 0b ;k0_k1 + pshuflw m1, m4, 01010101b ;k2_k3 + pshuflw m2, m4, 10101010b ;k4_k5 + pshuflw m3, m4, 11111111b ;k6_k7 + punpcklqdq m0, m0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + mova k0k1, m0 + mova k2k3, m1 + mova k4k5, m2 + mova k6k7, m3 +%if ARCH_X86_64 + %define krd m12 + %define tmp m13 + mova krd, [GLOBAL(pw_64)] +%else + %define tmp [rsp + 16*4] + %define krd [rsp + 16*5] +%if CONFIG_PIC=0 + mova m6, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m6, m6 ;all ones + psrlw m6, 15 + psllw m6, 6 ;aka pw_64 +%endif + mova krd, m6 +%endif +%endm + +%macro HORIZx4_ROW 2 + mova %2, %1 + punpcklbw %1, %1 + punpckhbw %2, %2 + + mova m3, %2 + palignr %2, %1, 1 + palignr m3, %1, 5 + + pmaddubsw %2, k0k1k4k5 + pmaddubsw m3, k2k3k6k7 + mova m4, %2 ;k0k1 + mova m5, m3 ;k2k3 + psrldq %2, 8 ;k4k5 + psrldq m3, 8 ;k6k7 + paddsw %2, m4 + paddsw m5, m3 + paddsw %2, m5 + paddsw %2, krd + psraw %2, 7 + packuswb %2, %2 +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER4 1 +cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + packsswb m4, m4 +%if ARCH_X86_64 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + %define orig_height r7d + mova krd, [GLOBAL(pw_64)] + pshuflw k0k1k4k5, m4, 0b ;k0_k1 + pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 + pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 + pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 +%else + %define k0k1k4k5 [rsp + 16*0] + %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + %define orig_height [rsp + 16*3] + pshuflw m6, m4, 0b ;k0_k1 + pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 + pshuflw m7, m4, 01010101b ;k2_k3 + pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 +%if CONFIG_PIC=0 + mova m1, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m1, m1 ;all ones + psrlw m1, 15 + psllw m1, 6 ;aka pw_64 +%endif + mova k0k1k4k5, m6 + mova k2k3k6k7, m7 + mova krd, m1 +%endif + mov orig_height, heightd + shr heightd, 1 +.loop: + ;Do two rows at once + movh m0, [srcq - 3] + movh m1, [srcq + 5] + punpcklqdq m0, m1 + mova m1, m0 + movh m2, [srcq + sstrideq - 3] + movh m3, [srcq + sstrideq + 5] + punpcklqdq m2, m3 + mova m3, m2 + punpcklbw m0, m0 + punpckhbw m1, m1 + punpcklbw m2, m2 + punpckhbw m3, m3 + mova m4, m1 + palignr m4, m0, 1 + pmaddubsw m4, k0k1k4k5 + palignr m1, m0, 5 + pmaddubsw m1, k2k3k6k7 + mova m7, m3 + palignr m7, m2, 1 + pmaddubsw m7, k0k1k4k5 + palignr m3, m2, 5 + pmaddubsw m3, k2k3k6k7 + mova m0, m4 ;k0k1 + mova m5, m1 ;k2k3 + mova m2, m7 ;k0k1 upper + psrldq m4, 8 ;k4k5 + psrldq m1, 8 ;k6k7 + paddsw m4, m0 + paddsw m5, m1 + mova m1, m3 ;k2k3 upper + psrldq m7, 8 ;k4k5 upper + psrldq m3, 8 ;k6k7 upper + paddsw m7, m2 + paddsw m4, m5 + paddsw m1, m3 + paddsw m7, m1 + paddsw m4, krd + psraw m4, 7 + packuswb m4, m4 + paddsw m7, krd + psraw m7, 7 + packuswb m7, m7 + +%ifidn %1, h8_avg + movd m0, [dstq] + pavgb m4, m0 + movd m2, [dstq + dstrideq] + pavgb m7, m2 +%endif + movd [dstq], m4 + movd [dstq + dstrideq], m7 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + + dec heightd + jnz .loop + + ; Do last row if output_height is odd + mov heightd, orig_height + and heightd, 1 + je .done + + movh m0, [srcq - 3] ; load src + movh m1, [srcq + 5] + punpcklqdq m0, m1 + + 
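+  ; HORIZx4_ROW consumes the row gathered in m0 and leaves the
+  ; filtered, packed result in m1 (its second argument)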
HORIZx4_ROW m0, m1 +%ifidn %1, h8_avg + movd m0, [dstq] + pavgb m1, m0 +%endif + movd [dstq], m1 +.done + RET +%endm + +%macro HORIZx8_ROW 5 + mova %2, %1 + punpcklbw %1, %1 + punpckhbw %2, %2 + + mova %3, %2 + mova %4, %2 + mova %5, %2 + + palignr %2, %1, 1 + palignr %3, %1, 5 + palignr %4, %1, 9 + palignr %5, %1, 13 + + pmaddubsw %2, k0k1 + pmaddubsw %3, k2k3 + pmaddubsw %4, k4k5 + pmaddubsw %5, k6k7 + paddsw %2, %4 + paddsw %5, %3 + paddsw %2, %5 + paddsw %2, krd + psraw %2, 7 + packuswb %2, %2 + SWAP %1, %2 +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER8 1 +cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS +%if ARCH_X86_64 + %define orig_height r7d +%else + %define orig_height heightmp +%endif + mov orig_height, heightd + shr heightd, 1 + +.loop: + movh m0, [srcq - 3] + movh m3, [srcq + 5] + movh m4, [srcq + sstrideq - 3] + movh m7, [srcq + sstrideq + 5] + punpcklqdq m0, m3 + mova m1, m0 + punpcklbw m0, m0 + punpckhbw m1, m1 + mova m5, m1 + palignr m5, m0, 13 + pmaddubsw m5, k6k7 + mova m2, m1 + mova m3, m1 + palignr m1, m0, 1 + pmaddubsw m1, k0k1 + punpcklqdq m4, m7 + mova m6, m4 + punpcklbw m4, m4 + palignr m2, m0, 5 + punpckhbw m6, m6 + palignr m3, m0, 9 + mova m7, m6 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + + palignr m7, m4, 13 + mova m0, m6 + palignr m0, m4, 5 + pmaddubsw m7, k6k7 + paddsw m1, m3 + paddsw m2, m5 + paddsw m1, m2 + mova m5, m6 + palignr m6, m4, 1 + pmaddubsw m0, k2k3 + pmaddubsw m6, k0k1 + palignr m5, m4, 9 + paddsw m1, krd + pmaddubsw m5, k4k5 + psraw m1, 7 + paddsw m0, m7 +%ifidn %1, h8_avg + movh m7, [dstq] + movh m2, [dstq + dstrideq] +%endif + packuswb m1, m1 + paddsw m6, m5 + paddsw m6, m0 + paddsw m6, krd + psraw m6, 7 + packuswb m6, m6 +%ifidn %1, h8_avg + pavgb m1, m7 + pavgb m6, m2 +%endif + movh [dstq], m1 + movh [dstq + dstrideq], m6 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + dec heightd + jnz .loop + + ;Do last row if output_height is odd + mov heightd, orig_height + and heightd, 1 + je .done + + movh m0, [srcq - 3] + movh m3, [srcq + 5] + punpcklqdq m0, m3 + + HORIZx8_ROW m0, m1, m2, m3, m4 + +%ifidn %1, h8_avg + movh m1, [dstq] + pavgb m0, m1 +%endif + movh [dstq], m0 +.done: + RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER16 1 +cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS +.loop: + prefetcht0 [srcq + 2 * sstrideq -3] + + movh m0, [srcq - 3] + movh m4, [srcq + 5] + movh m6, [srcq + 13] + punpcklqdq m0, m4 + mova m7, m0 + punpckhbw m0, m0 + mova m1, m0 + punpcklqdq m4, m6 + mova m3, m0 + punpcklbw m7, m7 + + palignr m3, m7, 13 + mova m2, m0 + pmaddubsw m3, k6k7 + palignr m0, m7, 1 + pmaddubsw m0, k0k1 + palignr m1, m7, 5 + pmaddubsw m1, k2k3 + palignr m2, m7, 9 + pmaddubsw m2, k4k5 + paddsw m1, m3 + mova m3, m4 + punpckhbw m4, m4 + mova m5, m4 + punpcklbw m3, m3 + mova m7, m4 + palignr m5, m3, 5 + mova m6, m4 + palignr m4, m3, 1 + pmaddubsw m4, k0k1 + pmaddubsw m5, k2k3 + palignr m6, m3, 9 + pmaddubsw m6, k4k5 + palignr m7, m3, 13 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, m1 +%ifidn %1, h8_avg + mova m1, [dstq] +%endif + paddsw m4, m6 + paddsw m5, m7 + 
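+  ; combine in the order noted at the top of the file:
+  ; (k0k1 + k4k5) + (k2k3 + k6k7), saturating at every step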
paddsw m4, m5 + paddsw m0, krd + paddsw m4, krd + psraw m0, 7 + psraw m4, 7 + packuswb m0, m4 +%ifidn %1, h8_avg + pavgb m0, m1 +%endif + lea srcq, [srcq + sstrideq] + mova [dstq], m0 + lea dstq, [dstq + dstrideq] + dec heightd + jnz .loop + RET +%endm + +INIT_XMM ssse3 +SUBPIX_HFILTER16 h8 +SUBPIX_HFILTER16 h8_avg +SUBPIX_HFILTER8 h8 +SUBPIX_HFILTER8 h8_avg +SUBPIX_HFILTER4 h8 +SUBPIX_HFILTER4 h8_avg + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER 2 +cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +%ifidn %2, 8 + %define movx movh +%else + %define movx movd +%endif +.loop: + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + paddsw m0, m4 + paddsw m2, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m5 + packuswb m0, m0 + + paddsw m3, m7 + paddsw m1, m3 + paddsw m1, krd + psraw m1, 7 + lea srcq, [srcq + sstrideq * 2 ] + lea src1q, [src1q + sstrideq * 2] + packuswb m1, m1 + +%ifidn %1, v8_avg + movx m2, [dstq] + pavgb m0, m2 +%endif + movx [dstq], m0 + add dstq, dst_stride +%ifidn %1, v8_avg + movx m3, [dstq] + pavgb m1, m3 +%endif + movx [dstq], m1 + add dstq, dst_stride + sub heightd, 2 + cmp heightd, 1 + jg .loop + + cmp heightd, 0 + je .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m2, m6 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 +.done: + RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER16 1 +cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, 
sstrideq ;pitch * 6 + +.loop: + movh m0, [srcq ] ;A + movh m1, [srcq + sstrideq ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + movh m5, [src1q + sstrideq * 2 + 8] ;D + punpcklbw m7, m5 ;C D + paddsw m2, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + pmaddubsw m2, k6k7 +%ifidn %1, v8_avg + mova m4, [dstq] +%endif + movh [dstq], m0 + paddsw m7, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 + packuswb m0, m3 + + add srcq, sstrideq + add src1q, sstrideq +%ifidn %1, v8_avg + pavgb m0, m4 +%endif + mova [dstq], m0 + add dstq, dst_stride + dec heightd + jnz .loop + RET +%endm + +INIT_XMM ssse3 +SUBPIX_VFILTER16 v8 +SUBPIX_VFILTER16 v8_avg +SUBPIX_VFILTER v8, 8 +SUBPIX_VFILTER v8_avg, 8 +SUBPIX_VFILTER v8, 4 +SUBPIX_VFILTER v8_avg, 4 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000000..a378dd040282 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm @@ -0,0 +1,448 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vpx_filter_block1d4_v2_sse2) PRIVATE +sym(vpx_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_v2_sse2) PRIVATE +sym(vpx_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_v2_sse2) PRIVATE +sym(vpx_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + 
UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE +sym(vpx_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE +sym(vpx_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE +sym(vpx_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d4_h2_sse2) PRIVATE +sym(vpx_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_h2_sse2) PRIVATE +sym(vpx_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_h2_sse2) PRIVATE +sym(vpx_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE +sym(vpx_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE +sym(vpx_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE +sym(vpx_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + 
pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000000..3c8cfd225378 --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movq xmm2, rcx ;rounding + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + paddsw xmm0, xmm2 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movq xmm6, rcx ;rounding + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + paddsw xmm0, xmm6 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + paddsw xmm0, xmm6 ;rounding + paddsw xmm2, xmm6 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE +sym(vpx_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE +sym(vpx_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE 
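+; vertical 2-tap over a full 16-byte row: rows n and n+1 are
+; interleaved bytewise, so each pmaddubsw against the packed k3/k4
+; pair filters eight pixels at once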
+sym(vpx_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE +sym(vpx_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE +sym(vpx_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE +sym(vpx_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(vpx_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE 
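+; as with the other _avg entry points, the filtered bytes are averaged
+; into the existing destination contents with pavgb before the store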
+sym(vpx_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/thirdparty/libvpx/vpx_dsp_rtcd.h b/thirdparty/libvpx/vpx_dsp_rtcd.h new file mode 100644 index 000000000000..4d5ad895337c --- /dev/null +++ b/thirdparty/libvpx/vpx_dsp_rtcd.h @@ -0,0 +1,9 @@ +#include "vpx_config.h" + +#if defined(WEBM_X86ASM) && (ARCH_X86 || ARCH_X86_64) + #include "rtcd/vpx_dsp_rtcd_x86.h" +#elif defined(WEBM_ARMASM) && ARCH_ARM + #include "rtcd/vpx_dsp_rtcd_arm.h" +#else + #include "rtcd/vpx_dsp_rtcd_c.h" +#endif diff --git a/thirdparty/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/thirdparty/libvpx/vpx_mem/include/vpx_mem_intrnl.h new file mode 100644 index 000000000000..c4dd78550f35 --- /dev/null +++ b/thirdparty/libvpx/vpx_mem/include/vpx_mem_intrnl.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#include "./vpx_config.h" + +#define ADDRESS_STORAGE_SIZE sizeof(size_t) + +#ifndef DEFAULT_ALIGNMENT +# if defined(VXWORKS) +# define DEFAULT_ALIGNMENT 32 /*default addr alignment to use in +calls to vpx_* functions other +than vpx_memalign*/ +# else +# define DEFAULT_ALIGNMENT (2 * sizeof(void*)) /* NOLINT */ +# endif +#endif + +/*returns an addr aligned to the byte boundary specified by align*/ +#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align)) + +#endif // VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ diff --git a/thirdparty/libvpx/vpx_mem/vpx_mem.c b/thirdparty/libvpx/vpx_mem/vpx_mem.c new file mode 100644 index 000000000000..b261fc0da117 --- /dev/null +++ b/thirdparty/libvpx/vpx_mem/vpx_mem.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include "vpx_mem.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "include/vpx_mem_intrnl.h"
+#include "vpx/vpx_integer.h"
+
+void *vpx_memalign(size_t align, size_t size) {
+  void *addr,
+       * x = NULL;
+
+  addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE);
+
+  if (addr) {
+    x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
+    /* save the actual malloc address */
+    ((size_t *)x)[-1] = (size_t)addr;
+  }
+
+  return x;
+}
+
+void *vpx_malloc(size_t size) {
+  return vpx_memalign(DEFAULT_ALIGNMENT, size);
+}
+
+void *vpx_calloc(size_t num, size_t size) {
+  void *x;
+
+  x = vpx_memalign(DEFAULT_ALIGNMENT, num * size);
+
+  if (x)
+    memset(x, 0, num * size);
+
+  return x;
+}
+
+void *vpx_realloc(void *memblk, size_t size) {
+  void *addr,
+       * new_addr = NULL;
+  int align = DEFAULT_ALIGNMENT;
+
+  /*
+    The realloc() function changes the size of the object pointed to by
+    ptr to the size specified by size, and returns a pointer to the
+    possibly moved block. The contents are unchanged up to the lesser
+    of the new and old sizes. If ptr is null, realloc() behaves like
+    malloc() for the specified size. If size is zero (0) and ptr is
+    not a null pointer, the object pointed to is freed.
+  */
+  if (!memblk)
+    new_addr = vpx_malloc(size);
+  else if (!size)
+    vpx_free(memblk);
+  else {
+    addr = (void *)(((size_t *)memblk)[-1]);
+    memblk = NULL;
+
+    new_addr = realloc(addr, size + align + ADDRESS_STORAGE_SIZE);
+
+    if (new_addr) {
+      addr = new_addr;
+      new_addr = (void *)(((size_t)
+                           ((unsigned char *)new_addr + ADDRESS_STORAGE_SIZE) + (align - 1)) &
+                          (size_t) - align);
+      /* save the actual malloc address */
+      ((size_t *)new_addr)[-1] = (size_t)addr;
+    }
+  }
+
+  return new_addr;
+}
+
+void vpx_free(void *memblk) {
+  if (memblk) {
+    void *addr = (void *)(((size_t *)memblk)[-1]);
+    free(addr);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void *vpx_memset16(void *dest, int val, size_t length) {
+  size_t i;
+  uint16_t *dest16 = (uint16_t *)dest;
+  for (i = 0; i < length; i++)
+    *dest16++ = val;
+  return dest;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/thirdparty/libvpx/vpx_mem/vpx_mem.h b/thirdparty/libvpx/vpx_mem/vpx_mem.h
new file mode 100644
index 000000000000..a006e0f00b3b
--- /dev/null
+++ b/thirdparty/libvpx/vpx_mem/vpx_mem.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_MEM_VPX_MEM_H_
+#define VPX_MEM_VPX_MEM_H_
+
+#include "vpx_config.h"
+#if defined(__uClinux__)
+# include <lddk.h>
+#endif
+
+#include <stdlib.h>
+#include <stddef.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+  void *vpx_memalign(size_t align, size_t size);
+  void *vpx_malloc(size_t size);
+  void *vpx_calloc(size_t num, size_t size);
+  void *vpx_realloc(void *memblk, size_t size);
+  void vpx_free(void *memblk);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  void *vpx_memset16(void *dest, int val, size_t length);
+#endif
+
+#include <string.h>
+
+#ifdef VPX_MEM_PLTFRM
+# include VPX_MEM_PLTFRM
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // VPX_MEM_VPX_MEM_H_
diff --git a/thirdparty/libvpx/vpx_ports/arm.h b/thirdparty/libvpx/vpx_ports/arm.h
new file mode 100644
index 000000000000..42c98f5a8382
--- /dev/null
+++ b/thirdparty/libvpx/vpx_ports/arm.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_PORTS_ARM_H_
+#define VPX_PORTS_ARM_H_
+#include <stdlib.h>
+#include "vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*ARMv5TE "Enhanced DSP" instructions.*/
+#define HAS_EDSP 0x01
+/*ARMv6 "Parallel" or "Media" instructions.*/
+#define HAS_MEDIA 0x02
+/*ARMv7 optional NEON instructions.*/
+#define HAS_NEON 0x04
+
+int arm_cpu_caps(void);
+
+// Earlier gcc compilers have issues with some neon intrinsics
+#if !defined(__clang__) && defined(__GNUC__) && \
+    __GNUC__ == 4 && __GNUC_MINOR__ <= 6
+#define VPX_INCOMPATIBLE_GCC
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_PORTS_ARM_H_
+
diff --git a/thirdparty/libvpx/vpx_ports/arm_cpudetect.c b/thirdparty/libvpx/vpx_ports/arm_cpudetect.c
new file mode 100644
index 000000000000..7eb74a7dc9f5
--- /dev/null
+++ b/thirdparty/libvpx/vpx_ports/arm_cpudetect.c
@@ -0,0 +1,170 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "vpx_ports/arm.h"
+#include "./vpx_config.h"
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+static int arm_cpu_env_flags(int *flags) {
+  char *env;
+  env = getenv("VPX_SIMD_CAPS");
+  if (env && *env) {
+    *flags = (int)strtol(env, NULL, 0);
+    return 0;
+  }
+  *flags = 0;
+  return -1;
+}
+
+static int arm_cpu_env_mask(void) {
+  char *env;
+  env = getenv("VPX_SIMD_CAPS_MASK");
+  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+  #error "CONFIG_RUNTIME_CPU_DETECT should be enabled!"
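+/* Each branch below implements arm_cpu_caps() for one platform:
+ * MSVC probes candidate instructions under SEH, Android queries the
+ * NDK cpufeatures library, Linux parses /proc/cpuinfo, and the
+ * fallback assumes the capabilities the build was configured with.
+ */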
+#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+#define WIN32_LEAN_AND_MEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+
+int arm_cpu_caps(void) {
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+  /* MSVC has no inline __asm support for ARM, but it does let you __emit
+   * instructions via their assembled hex code.
+   * All of these instructions should be essentially nops.
+   */
+#if HAVE_MEDIA
+  if (mask & HAS_MEDIA) {
+    __try {
+      /*SHADD8 r3,r3,r3*/
+      __emit(0xE6333F93);
+      flags |= HAS_MEDIA;
+    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+      /*Ignore exception.*/
+    }
+  }
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON || HAVE_NEON_ASM
+  if (mask & HAS_NEON) {
+    __try {
+      /*VORR q0,q0,q0*/
+      __emit(0xF2200150);
+      flags |= HAS_NEON;
+    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+      /*Ignore exception.*/
+    }
+  }
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
+  return flags & mask;
+}
+
+#elif defined(__ANDROID__) /* end _MSC_VER */
+#include <cpu-features.h>
+
+int arm_cpu_caps(void) {
+  int flags;
+  int mask;
+  uint64_t features;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+  features = android_getCpuFeatures();
+
+#if HAVE_MEDIA
+  flags |= HAS_MEDIA;
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON || HAVE_NEON_ASM
+  if (features & ANDROID_CPU_ARM_FEATURE_NEON)
+    flags |= HAS_NEON;
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
+  return flags & mask;
+}
+
+#elif defined(__linux__) /* end __ANDROID__ */
+
+#include <stdio.h>
+
+int arm_cpu_caps(void) {
+  FILE *fin;
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
+   * on Android.
+   * This also means that detection will fail in Scratchbox.
+   */
+  fin = fopen("/proc/cpuinfo", "r");
+  if (fin != NULL) {
+    /* 512 should be enough for anybody (it's even enough for all the flags
+     * that x86 has accumulated... so far).
+     */
+    char buf[512];
+    while (fgets(buf, 511, fin) != NULL) {
+#if HAVE_NEON || HAVE_NEON_ASM
+      if (memcmp(buf, "Features", 8) == 0) {
+        char *p;
+        p = strstr(buf, " neon");
+        if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
+          flags |= HAS_NEON;
+        }
+      }
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
+#if HAVE_MEDIA
+      if (memcmp(buf, "CPU architecture:", 17) == 0) {
+        int version;
+        version = atoi(buf + 17);
+        if (version >= 6) {
+          flags |= HAS_MEDIA;
+        }
+      }
+#endif /* HAVE_MEDIA */
+    }
+    fclose(fin);
+  }
+  return flags & mask;
+}
+#else /* end __linux__ */
+int arm_cpu_caps(void) {
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+#if HAVE_MEDIA
+  flags |= HAS_MEDIA;
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON || HAVE_NEON_ASM
+  flags |= HAS_NEON;
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
+  return flags & mask;
+}
+#warning "ARM run-time CPU detection is disabled for this platform..."
+#endif
diff --git a/thirdparty/libvpx/vpx_ports/bitops.h b/thirdparty/libvpx/vpx_ports/bitops.h
new file mode 100644
index 000000000000..84ff3659fee8
--- /dev/null
+++ b/thirdparty/libvpx/vpx_ports/bitops.h
@@ -0,0 +1,76 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_BITOPS_H_
+#define VPX_PORTS_BITOPS_H_
+
+#include <assert.h>
+
+#include "vpx_ports/msvc.h"
+
+#ifdef _MSC_VER
+# include <math.h>  // the ceil() definition must precede intrin.h
+# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86))
+#  include <intrin.h>
+#  define USE_MSC_INTRINSICS
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// These versions of get_msb() are only valid when n != 0 because all
+// of the optimized versions are undefined when n == 0:
+// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_msb(unsigned int n) {
+  assert(n != 0);
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanReverse)
+
+static INLINE int get_msb(unsigned int n) {
+  unsigned long first_set_bit;
+  assert(n != 0);
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#undef USE_MSC_INTRINSICS
+#else
+// Returns (int)floor(log2(n)). n must be > 0.
+static INLINE int get_msb(unsigned int n) {
+  int log = 0;
+  unsigned int value = n;
+  int i;
+
+  assert(n != 0);
+
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const unsigned int x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_PORTS_BITOPS_H_
diff --git a/thirdparty/libvpx/vpx_ports/config.h b/thirdparty/libvpx/vpx_ports/config.h
new file mode 100644
index 000000000000..3c1ab99f4aa3
--- /dev/null
+++ b/thirdparty/libvpx/vpx_ports/config.h
@@ -0,0 +1,16 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_CONFIG_H_
+#define VPX_PORTS_CONFIG_H_
+
+#include "vpx_config.h"
+
+#endif  // VPX_PORTS_CONFIG_H_
diff --git a/thirdparty/libvpx/vpx_ports/emmintrin_compat.h b/thirdparty/libvpx/vpx_ports/emmintrin_compat.h
new file mode 100644
index 000000000000..16176383d290
--- /dev/null
+++ b/thirdparty/libvpx/vpx_ports/emmintrin_compat.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_EMMINTRIN_COMPAT_H_
+#define VPX_PORTS_EMMINTRIN_COMPAT_H_
+
+#if defined(__GNUC__) && __GNUC__ < 4
+/* From emmintrin.h (gcc 4.5.3) */
+/* Casts between various SP, DP, INT vector types.  Note that these do no
+   conversion of values, they just change the type.
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_ps(__m128d __A) +{ + return (__m128) __A; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_si128(__m128d __A) +{ + return (__m128i) __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_pd(__m128 __A) +{ + return (__m128d) __A; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_si128(__m128 __A) +{ + return (__m128i) __A; +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_ps(__m128i __A) +{ + return (__m128) __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_pd(__m128i __A) +{ + return (__m128d) __A; +} +#endif + +#endif // VPX_PORTS_EMMINTRIN_COMPAT_H_ diff --git a/thirdparty/libvpx/vpx_ports/emms.asm b/thirdparty/libvpx/vpx_ports/emms.asm new file mode 100644 index 000000000000..db8da2873752 --- /dev/null +++ b/thirdparty/libvpx/vpx_ports/emms.asm @@ -0,0 +1,38 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +section .text +global sym(vpx_reset_mmx_state) PRIVATE +sym(vpx_reset_mmx_state): + emms + ret + + +%if LIBVPX_YASM_WIN64 +global sym(vpx_winx64_fldcw) PRIVATE +sym(vpx_winx64_fldcw): + sub rsp, 8 + mov [rsp], rcx ; win x64 specific + fldcw [rsp] + add rsp, 8 + ret + + +global sym(vpx_winx64_fstcw) PRIVATE +sym(vpx_winx64_fstcw): + sub rsp, 8 + fstcw [rsp] + mov rax, [rsp] + add rsp, 8 + ret +%endif diff --git a/thirdparty/libvpx/vpx_ports/mem.h b/thirdparty/libvpx/vpx_ports/mem.h new file mode 100644 index 000000000000..7502f9063259 --- /dev/null +++ b/thirdparty/libvpx/vpx_ports/mem.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VPX_PORTS_MEM_H_ +#define VPX_PORTS_MEM_H_ + +#include "vpx_config.h" +#include "vpx/vpx_integer.h" + +#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C) +#define DECLARE_ALIGNED(n,typ,val) typ val __attribute__ ((aligned (n))) +#elif defined(_MSC_VER) +#define DECLARE_ALIGNED(n,typ,val) __declspec(align(n)) typ val +#else +#warning No alignment directives known for this compiler. +#define DECLARE_ALIGNED(n,typ,val) typ val +#endif + +/* Indicates that the usage of the specified variable has been audited to assure + * that it's safe to use uninitialized. Silences 'may be used uninitialized' + * warnings on gcc. 
+ */ +#if defined(__GNUC__) && __GNUC__ +#define UNINITIALIZED_IS_SAFE(x) x=x +#else +#define UNINITIALIZED_IS_SAFE(x) x +#endif + +#if HAVE_NEON && defined(_MSC_VER) +#define __builtin_prefetch(x) +#endif + +/* Shift down with rounding */ +#define ROUND_POWER_OF_TWO(value, n) \ + (((value) + (1 << ((n) - 1))) >> (n)) + +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) + +#if CONFIG_VP9_HIGHBITDEPTH +#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1)) +#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1)) +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_PORTS_MEM_H_ diff --git a/thirdparty/libvpx/vpx_ports/mem_ops.h b/thirdparty/libvpx/vpx_ports/mem_ops.h new file mode 100644 index 000000000000..620df31b223e --- /dev/null +++ b/thirdparty/libvpx/vpx_ports/mem_ops.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_PORTS_MEM_OPS_H_ +#define VPX_PORTS_MEM_OPS_H_ + +/* \file + * \brief Provides portable memory access primitives + * + * This function provides portable primitives for getting and setting of + * signed and unsigned integers in 16, 24, and 32 bit sizes. The operations + * can be performed on unaligned data regardless of hardware support for + * unaligned accesses. + * + * The type used to pass the integral values may be changed by defining + * MEM_VALUE_T with the appropriate type. The type given must be an integral + * numeric type. + * + * The actual functions instantiated have the MEM_VALUE_T type name pasted + * on to the symbol name. This allows the developer to instantiate these + * operations for multiple types within the same translation unit. This is + * of somewhat questionable utility, but the capability exists nonetheless. + * Users not making use of this functionality should call the functions + * without the type name appended, and the preprocessor will take care of + * it. + * + * NOTE: This code is not supported on platforms where char > 1 octet ATM. 
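+ *
+ * Editor's illustration (hypothetical data): reading a 32-bit big-endian
+ * length field from an unaligned byte stream yields the same result on
+ * any host:
+ *
+ *   const unsigned char hdr[4] = { 0x00, 0x00, 0x01, 0x00 };
+ *   unsigned int len = mem_get_be32(hdr);  // 256 on LE and BE machines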
+ */ + +#ifndef MAU_T +/* Minimum Access Unit for this target */ +#define MAU_T unsigned char +#endif + +#ifndef MEM_VALUE_T +#define MEM_VALUE_T int +#endif + +#undef MEM_VALUE_T_SZ_BITS +#define MEM_VALUE_T_SZ_BITS (sizeof(MEM_VALUE_T) << 3) + +#undef mem_ops_wrap_symbol +#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T) +#undef mem_ops_wrap_symbol2 +#define mem_ops_wrap_symbol2(fn,typ) mem_ops_wrap_symbol3(fn,typ) +#undef mem_ops_wrap_symbol3 +#define mem_ops_wrap_symbol3(fn,typ) fn##_as_##typ + +/* + * Include aligned access routines + */ +#define INCLUDED_BY_MEM_OPS_H +#include "mem_ops_aligned.h" +#undef INCLUDED_BY_MEM_OPS_H + +#undef mem_get_be16 +#define mem_get_be16 mem_ops_wrap_symbol(mem_get_be16) +static unsigned MEM_VALUE_T mem_get_be16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 8; + val |= mem[1]; + return val; +} + +#undef mem_get_be24 +#define mem_get_be24 mem_ops_wrap_symbol(mem_get_be24) +static unsigned MEM_VALUE_T mem_get_be24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 16; + val |= mem[1] << 8; + val |= mem[2]; + return val; +} + +#undef mem_get_be32 +#define mem_get_be32 mem_ops_wrap_symbol(mem_get_be32) +static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[0]) << 24; + val |= mem[1] << 16; + val |= mem[2] << 8; + val |= mem[3]; + return val; +} + +#undef mem_get_le16 +#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16) +static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le24 +#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24) +static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le32 +#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32) +static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[3]) << 24; + val |= mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#define mem_get_s_generic(end,sz) \ + static VPX_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) {\ + const MAU_T *mem = (const MAU_T*)vmem;\ + signed MEM_VALUE_T val = mem_get_##end##sz(mem);\ + return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz);\ + } + +#undef mem_get_sbe16 +#define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16) +mem_get_s_generic(be, 16) + +#undef mem_get_sbe24 +#define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24) +mem_get_s_generic(be, 24) + +#undef mem_get_sbe32 +#define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32) +mem_get_s_generic(be, 32) + +#undef mem_get_sle16 +#define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16) +mem_get_s_generic(le, 16) + +#undef mem_get_sle24 +#define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24) +mem_get_s_generic(le, 24) + +#undef mem_get_sle32 +#define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32) +mem_get_s_generic(le, 32) + +#undef mem_put_be16 +#define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) +static VPX_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) { + 
MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 8) & 0xff); + mem[1] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be24 +#define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) +static VPX_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 16) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be32 +#define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) +static VPX_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 24) & 0xff); + mem[1] = (MAU_T)((val >> 16) & 0xff); + mem[2] = (MAU_T)((val >> 8) & 0xff); + mem[3] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_le16 +#define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) +static VPX_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); +} + +#undef mem_put_le24 +#define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) +static VPX_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); +} + +#undef mem_put_le32 +#define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) +static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); + mem[3] = (MAU_T)((val >> 24) & 0xff); +} + +#endif // VPX_PORTS_MEM_OPS_H_ diff --git a/thirdparty/libvpx/vpx_ports/mem_ops_aligned.h b/thirdparty/libvpx/vpx_ports/mem_ops_aligned.h new file mode 100644 index 000000000000..46f61738ba42 --- /dev/null +++ b/thirdparty/libvpx/vpx_ports/mem_ops_aligned.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_PORTS_MEM_OPS_ALIGNED_H_ +#define VPX_PORTS_MEM_OPS_ALIGNED_H_ + +#include "vpx/vpx_integer.h" + +/* \file + * \brief Provides portable memory access primitives for operating on aligned + * data + * + * This file is split from mem_ops.h for easier maintenance. See mem_ops.h + * for a more detailed description of these primitives. + */ +#ifndef INCLUDED_BY_MEM_OPS_H +#error Include mem_ops.h, not mem_ops_aligned.h directly. +#endif + +/* Architectures that provide instructions for doing this byte swapping + * could redefine these macros. 
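+ *
+ * Editor's worked example (hypothetical values): with raw = 0x1234,
+ * swap_endian_16 stores val = 0x3412; with raw = 0x11223344,
+ * swap_endian_32 stores val = 0x44332211.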
+ */ +#define swap_endian_16(val,raw) do {\ + val = (uint16_t)(((raw>>8) & 0x00ff) \ + | ((raw<<8) & 0xff00));\ + } while(0) +#define swap_endian_32(val,raw) do {\ + val = ((raw>>24) & 0x000000ff) \ + | ((raw>>8) & 0x0000ff00) \ + | ((raw<<8) & 0x00ff0000) \ + | ((raw<<24) & 0xff000000); \ + } while(0) +#define swap_endian_16_se(val,raw) do {\ + swap_endian_16(val,raw);\ + val = ((val << 16) >> 16);\ + } while(0) +#define swap_endian_32_se(val,raw) swap_endian_32(val,raw) + +#define mem_get_ne_aligned_generic(end,sz) \ + static VPX_INLINE unsigned MEM_VALUE_T \ + mem_get_##end##sz##_aligned(const void *vmem) {\ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\ + return *mem;\ + } + +#define mem_get_sne_aligned_generic(end,sz) \ + static VPX_INLINE signed MEM_VALUE_T \ + mem_get_s##end##sz##_aligned(const void *vmem) {\ + const int##sz##_t *mem = (const int##sz##_t *)vmem;\ + return *mem;\ + } + +#define mem_get_se_aligned_generic(end,sz) \ + static VPX_INLINE unsigned MEM_VALUE_T \ + mem_get_##end##sz##_aligned(const void *vmem) {\ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\ + unsigned MEM_VALUE_T val, raw = *mem;\ + swap_endian_##sz(val,raw);\ + return val;\ + } + +#define mem_get_sse_aligned_generic(end,sz) \ + static VPX_INLINE signed MEM_VALUE_T \ + mem_get_s##end##sz##_aligned(const void *vmem) {\ + const int##sz##_t *mem = (const int##sz##_t *)vmem;\ + unsigned MEM_VALUE_T val, raw = *mem;\ + swap_endian_##sz##_se(val,raw);\ + return val;\ + } + +#define mem_put_ne_aligned_generic(end,sz) \ + static VPX_INLINE void \ + mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ + uint##sz##_t *mem = (uint##sz##_t *)vmem;\ + *mem = (uint##sz##_t)val;\ + } + +#define mem_put_se_aligned_generic(end,sz) \ + static VPX_INLINE void \ + mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ + uint##sz##_t *mem = (uint##sz##_t *)vmem, raw;\ + swap_endian_##sz(raw,val);\ + *mem = (uint##sz##_t)raw;\ + } + +#include "vpx_config.h" +#if CONFIG_BIG_ENDIAN +#define mem_get_be_aligned_generic(sz) mem_get_ne_aligned_generic(be,sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be,sz) +#define mem_get_le_aligned_generic(sz) mem_get_se_aligned_generic(le,sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sse_aligned_generic(le,sz) +#define mem_put_be_aligned_generic(sz) mem_put_ne_aligned_generic(be,sz) +#define mem_put_le_aligned_generic(sz) mem_put_se_aligned_generic(le,sz) +#else +#define mem_get_be_aligned_generic(sz) mem_get_se_aligned_generic(be,sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sse_aligned_generic(be,sz) +#define mem_get_le_aligned_generic(sz) mem_get_ne_aligned_generic(le,sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sne_aligned_generic(le,sz) +#define mem_put_be_aligned_generic(sz) mem_put_se_aligned_generic(be,sz) +#define mem_put_le_aligned_generic(sz) mem_put_ne_aligned_generic(le,sz) +#endif + +#undef mem_get_be16_aligned +#define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned) +mem_get_be_aligned_generic(16) + +#undef mem_get_be32_aligned +#define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned) +mem_get_be_aligned_generic(32) + +#undef mem_get_le16_aligned +#define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned) +mem_get_le_aligned_generic(16) + +#undef mem_get_le32_aligned +#define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned) +mem_get_le_aligned_generic(32) + +#undef mem_get_sbe16_aligned +#define mem_get_sbe16_aligned 
mem_ops_wrap_symbol(mem_get_sbe16_aligned)
+mem_get_sbe_aligned_generic(16)
+
+#undef mem_get_sbe32_aligned
+#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned)
+mem_get_sbe_aligned_generic(32)
+
+#undef mem_get_sle16_aligned
+#define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned)
+mem_get_sle_aligned_generic(16)
+
+#undef mem_get_sle32_aligned
+#define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned)
+mem_get_sle_aligned_generic(32)
+
+#undef mem_put_be16_aligned
+#define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned)
+mem_put_be_aligned_generic(16)
+
+#undef mem_put_be32_aligned
+#define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned)
+mem_put_be_aligned_generic(32)
+
+#undef mem_put_le16_aligned
+#define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned)
+mem_put_le_aligned_generic(16)
+
+#undef mem_put_le32_aligned
+#define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned)
+mem_put_le_aligned_generic(32)
+
+#undef mem_get_ne_aligned_generic
+#undef mem_get_se_aligned_generic
+#undef mem_get_sne_aligned_generic
+#undef mem_get_sse_aligned_generic
+#undef mem_put_ne_aligned_generic
+#undef mem_put_se_aligned_generic
+#undef swap_endian_16
+#undef swap_endian_32
+#undef swap_endian_16_se
+#undef swap_endian_32_se
+
+#endif  // VPX_PORTS_MEM_OPS_ALIGNED_H_ diff --git a/thirdparty/libvpx/vpx_ports/msvc.h b/thirdparty/libvpx/vpx_ports/msvc.h new file mode 100644 index 000000000000..cab77405f47e --- /dev/null +++ b/thirdparty/libvpx/vpx_ports/msvc.h @@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_MSVC_H_
+#define VPX_PORTS_MSVC_H_
+#ifdef _MSC_VER
+
+#include "./vpx_config.h"
+
+# if _MSC_VER < 1900 // VS2015 provides snprintf
+# define snprintf _snprintf
+# endif // _MSC_VER < 1900
+
+#if _MSC_VER < 1800 // VS2013 provides round
+#include <math.h>
+static INLINE double round(double x) {
+  if (x < 0)
+    return ceil(x - 0.5);
+  else
+    return floor(x + 0.5);
+}
+#endif // _MSC_VER < 1800
+
+#endif // _MSC_VER
+#endif // VPX_PORTS_MSVC_H_ diff --git a/thirdparty/libvpx/vpx_ports/system_state.h b/thirdparty/libvpx/vpx_ports/system_state.h new file mode 100644 index 000000000000..01989dcafc17 --- /dev/null +++ b/thirdparty/libvpx/vpx_ports/system_state.h @@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_SYSTEM_STATE_H_
+#define VPX_PORTS_SYSTEM_STATE_H_
+
+#include "./vpx_config.h"
+
+#if defined(WEBM_X86ASM) && (ARCH_X86 || ARCH_X86_64)
+  void vpx_reset_mmx_state(void);
+  #define vpx_clear_system_state() vpx_reset_mmx_state()
+#else
+  #define vpx_clear_system_state()
+#endif // ARCH_X86 || ARCH_X86_64
+#endif // VPX_PORTS_SYSTEM_STATE_H_ diff --git a/thirdparty/libvpx/vpx_ports/vpx_once.h b/thirdparty/libvpx/vpx_ports/vpx_once.h new file mode 100644 index 000000000000..da04db459075 --- /dev/null +++ b/thirdparty/libvpx/vpx_ports/vpx_once.h @@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_VPX_ONCE_H_
+#define VPX_PORTS_VPX_ONCE_H_
+
+#include "vpx_config.h"
+
+/* Implement a function wrapper to guarantee initialization
+ * thread-safety for library singletons.
+ *
+ * NOTE: These functions use static locks, and can only be
+ * used with one common argument per compilation unit. So
+ *
+ * file1.c:
+ *   vpx_once(foo);
+ *   ...
+ *   vpx_once(foo);
+ *
+ * file2.c:
+ *   vpx_once(bar);
+ *
+ * will ensure foo() and bar() are each called only once, but in
+ *
+ * file1.c:
+ *   vpx_once(foo);
+ *   vpx_once(bar);
+ *
+ * bar() will never be called because the lock is used up
+ * by the call to foo().
+ */
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+#include <stdlib.h>
+/* Declare a per-compilation-unit state variable to track the progress
+ * of calling func() only once. This must be at global scope because
+ * local initializers are not thread-safe in MSVC prior to Visual
+ * Studio 2015.
+ *
+ * As a static, once_state will be zero-initialized at program start.
+ */
+static LONG once_state;
+static void once(void (*func)(void))
+{
+    /* Try to advance once_state from its initial value of 0 to 1.
+     * Only one thread can succeed in doing so.
+     */
+    if (InterlockedCompareExchange(&once_state, 1, 0) == 0) {
+        /* We're the winning thread, having set once_state to 1.
+         * Call our function. */
+        func();
+        /* Now advance once_state to 2, unblocking any other threads. */
+        InterlockedIncrement(&once_state);
+        return;
+    }
+
+    /* We weren't the winning thread, but we want to block on
+     * the state variable so we don't return before func()
+     * has finished executing elsewhere.
+     *
+     * Try to advance once_state from 2 to 2, which is only possible
+     * after the winning thread advances it from 1 to 2.
+     */
+    while (InterlockedCompareExchange(&once_state, 2, 2) != 2) {
+        /* State isn't yet 2. Try again.
+         *
+         * We are used for singleton initialization functions,
+         * which should complete quickly. Contention will likewise
+         * be rare, so it's worthwhile to use a simple but cpu-
+         * intensive busy-wait instead of successive backoff,
+         * waiting on a kernel object, or another heavier-weight scheme.
+         *
+         * We can at least yield our timeslice.
+         */
+        Sleep(0);
+    }
+
+    /* We've seen once_state advance to 2, so we know func()
+     * has been called. And we've left once_state as we found it,
+     * so other threads will have the same experience.
+     *
+     * It's safe to return now.
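+ *
+ * Editor's sketch of the intended call pattern (init_tables is a
+ * hypothetical initializer, not part of libvpx):
+ *
+ *   static void init_tables(void) { ... }
+ *   void decoder_entry(void) {
+ *     once(init_tables);  // safe from any number of threads
+ *   }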
+ */
+    return;
+}
+
+
+#elif CONFIG_MULTITHREAD && defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>
+static void once(void (*func)(void))
+{
+    static int done;
+
+    /* If the initialization is complete, return early. */
+    if(done)
+        return;
+
+    /* Causes all other threads in the process to block themselves
+     * and give up their time slice.
+     */
+    DosEnterCritSec();
+
+    if (!done)
+    {
+        func();
+        done = 1;
+    }
+
+    /* Restores normal thread dispatching for the current process. */
+    DosExitCritSec();
+}
+
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void once(void (*func)(void))
+{
+    static pthread_once_t lock = PTHREAD_ONCE_INIT;
+    pthread_once(&lock, func);
+}
+
+
+#else
+/* No-op version that performs no synchronization. *_rtcd() is idempotent,
+ * so as long as your platform provides atomic loads/stores of pointers
+ * no synchronization is strictly necessary.
+ */
+
+static void once(void (*func)(void))
+{
+    static int done;
+
+    if(!done)
+    {
+        func();
+        done = 1;
+    }
+}
+#endif
+
+#endif // VPX_PORTS_VPX_ONCE_H_ diff --git a/thirdparty/libvpx/vpx_ports/vpx_timer.h b/thirdparty/libvpx/vpx_ports/vpx_timer.h new file mode 100644 index 000000000000..dd98e291c2f7 --- /dev/null +++ b/thirdparty/libvpx/vpx_ports/vpx_timer.h @@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPX_PORTS_VPX_TIMER_H_
+#define VPX_PORTS_VPX_TIMER_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#if CONFIG_OS_SUPPORT
+
+#if defined(_WIN32)
+/*
+ * Win32 specific includes
+ */
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#else
+/*
+ * POSIX specific includes
+ */
+#include <sys/time.h>
+
+/* timersub is not provided by msys at this time.
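+
+   Editor's worked example of the fallback's borrow handling (hypothetical
+   values): a = 5s/100us minus b = 3s/200us yields 1s/999900us, because the
+   negative microsecond difference borrows one second from tv_sec.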
*/ +#ifndef timersub +#define timersub(a, b, result) \ + do { \ + (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ + (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ + if ((result)->tv_usec < 0) { \ + --(result)->tv_sec; \ + (result)->tv_usec += 1000000; \ + } \ + } while (0) +#endif +#endif + + +struct vpx_usec_timer { +#if defined(_WIN32) + LARGE_INTEGER begin, end; +#else + struct timeval begin, end; +#endif +}; + + +static INLINE void +vpx_usec_timer_start(struct vpx_usec_timer *t) { +#if defined(_WIN32) + QueryPerformanceCounter(&t->begin); +#else + gettimeofday(&t->begin, NULL); +#endif +} + + +static INLINE void +vpx_usec_timer_mark(struct vpx_usec_timer *t) { +#if defined(_WIN32) + QueryPerformanceCounter(&t->end); +#else + gettimeofday(&t->end, NULL); +#endif +} + + +static INLINE int64_t +vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { +#if defined(_WIN32) + LARGE_INTEGER freq, diff; + + diff.QuadPart = t->end.QuadPart - t->begin.QuadPart; + + QueryPerformanceFrequency(&freq); + return diff.QuadPart * 1000000 / freq.QuadPart; +#else + struct timeval diff; + + timersub(&t->end, &t->begin, &diff); + return diff.tv_sec * 1000000 + diff.tv_usec; +#endif +} + +#else /* CONFIG_OS_SUPPORT = 0*/ + +/* Empty timer functions if CONFIG_OS_SUPPORT = 0 */ +#ifndef timersub +#define timersub(a, b, result) +#endif + +struct vpx_usec_timer { + void *dummy; +}; + +static INLINE void +vpx_usec_timer_start(struct vpx_usec_timer *t) { } + +static INLINE void +vpx_usec_timer_mark(struct vpx_usec_timer *t) { } + +static INLINE int +vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { + return 0; +} + +#endif /* CONFIG_OS_SUPPORT */ + +#endif // VPX_PORTS_VPX_TIMER_H_ diff --git a/thirdparty/libvpx/vpx_ports/x86.h b/thirdparty/libvpx/vpx_ports/x86.h new file mode 100644 index 000000000000..bae25ac34546 --- /dev/null +++ b/thirdparty/libvpx/vpx_ports/x86.h @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#ifndef VPX_PORTS_X86_H_
+#define VPX_PORTS_X86_H_
+#include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h> /* For __cpuidex, __rdtsc */
+#endif
+
+#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  VPX_CPU_UNKNOWN = -1,
+  VPX_CPU_AMD,
+  VPX_CPU_AMD_OLD,
+  VPX_CPU_CENTAUR,
+  VPX_CPU_CYRIX,
+  VPX_CPU_INTEL,
+  VPX_CPU_NEXGEN,
+  VPX_CPU_NSC,
+  VPX_CPU_RISE,
+  VPX_CPU_SIS,
+  VPX_CPU_TRANSMETA,
+  VPX_CPU_TRANSMETA_OLD,
+  VPX_CPU_UMC,
+  VPX_CPU_VIA,
+
+  VPX_CPU_LAST
+} vpx_cpu_t;
+
+#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__)
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx)\
+  __asm__ __volatile__ (\
+    "cpuid           \n\t" \
+    : "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) \
+    : "a" (func), "c" (func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx)\
+  __asm__ __volatile__ (\
+    "mov %%ebx, %%edi  \n\t" \
+    "cpuid             \n\t" \
+    "xchg %%edi, %%ebx \n\t" \
+    : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
+    : "a" (func), "c" (func2));
+#endif
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx)\
+  asm volatile (\
+    "xchg %rsi, %rbx \n\t" \
+    "cpuid           \n\t" \
+    "movl %ebx, %edi \n\t" \
+    "xchg %rsi, %rbx \n\t" \
+    : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
+    : "a" (func), "c" (func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx)\
+  asm volatile (\
+    "pushl %ebx       \n\t" \
+    "cpuid            \n\t" \
+    "movl %ebx, %edi  \n\t" \
+    "popl %ebx        \n\t" \
+    : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
+    : "a" (func), "c" (func2));
+#endif
+#else /* end __SUNPRO__ */
+#if ARCH_X86_64
+#if defined(_MSC_VER) && _MSC_VER > 1500
+#define cpuid(func, func2, a, b, c, d) do {\
+    int regs[4];\
+    __cpuidex(regs, func, func2); \
+    a = regs[0]; b = regs[1]; c = regs[2]; d = regs[3];\
+  } while(0)
+#else
+#define cpuid(func, func2, a, b, c, d) do {\
+    int regs[4];\
+    __cpuid(regs, func); \
+    a = regs[0]; b = regs[1]; c = regs[2]; d = regs[3];\
+  } while (0)
+#endif
+#else
+#define cpuid(func, func2, a, b, c, d)\
+  __asm mov eax, func\
+  __asm mov ecx, func2\
+  __asm cpuid\
+  __asm mov a, eax\
+  __asm mov b, ebx\
+  __asm mov c, ecx\
+  __asm mov d, edx
+#endif
+#endif /* end others */
+
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile (
+    ".byte 0x0f, 0x01, 0xd0\n"
+    : "=a"(eax), "=d"(edx) : "c" (ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
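+
+/* Editor's note (illustrative): xgetbv() reports which register states the
+ * OS saves on context switch. The AVX detection below requires both XMM and
+ * YMM state to be OS-managed, i.e. (xgetbv() & 0x6) == 0x6. A caller would
+ * then branch on the flags returned by x86_simd_caps(), for example:
+ *
+ *   if (x86_simd_caps() & HAS_AVX2) {
+ *     // select AVX2 code paths
+ *   }
+ */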
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700
+#include <windows.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#define HAS_MMX     0x01
+#define HAS_SSE     0x02
+#define HAS_SSE2    0x04
+#define HAS_SSE3    0x08
+#define HAS_SSSE3   0x10
+#define HAS_SSE4_1  0x20
+#define HAS_AVX     0x40
+#define HAS_AVX2    0x80
+#ifndef BIT
+#define BIT(n) (1 << n)
+#endif
+
+static INLINE int
+x86_simd_caps(void) {
+  unsigned int flags = 0;
+  unsigned int mask = ~0;
+  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
+  char *env;
+  (void)reg_ebx;
+
+  /* See if the CPU capabilities are being overridden by the environment */
+  env = getenv("VPX_SIMD_CAPS");
+
+  if (env && *env)
+    return strtol(env, NULL, 0);
+
+  env = getenv("VPX_SIMD_CAPS_MASK");
+
+  if (env && *env)
+    mask = strtol(env, NULL, 0);
+
+  /* Ensure that the CPUID instruction supports extended features */
+  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
+
+  if (max_cpuid_val < 1)
+    return 0;
+
+  /* Get the standard feature flags */
+  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+  if (reg_edx & BIT(23)) flags |= HAS_MMX;
+
+  if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */
+
+  if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
+
+  if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
+
+  if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
+
+  if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+
+  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+    // Check for OS support of YMM state, necessary for AVX and AVX2.
+    if ((xgetbv() & 0x6) == 0x6) {
+      flags |= HAS_AVX;
+
+      if (max_cpuid_val >= 7) {
+        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+        if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+      }
+    }
+  }
+
+  return flags & mask;
+}
+
+// Note:
+//  The 32-bit CPU cycle counter is lightweight enough for most function
+//  performance measurement. For long-running functions (CPU time greater
+//  than a couple of seconds), the 64-bit counter should be used.
+// 32-bit CPU cycle counter
+static INLINE unsigned int
+x86_readtsc(void) {
+#if defined(__GNUC__) && __GNUC__
+  unsigned int tsc;
+  __asm__ __volatile__("rdtsc\n\t":"=a"(tsc):);
+  return tsc;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  unsigned int tsc;
+  asm volatile("rdtsc\n\t":"=a"(tsc):);
+  return tsc;
+#else
+#if ARCH_X86_64
+  return (unsigned int)__rdtsc();
+#else
+  __asm rdtsc;
+#endif
+#endif
+}
+// 64-bit CPU cycle counter
+static INLINE uint64_t
+x86_readtsc64(void) {
+#if defined(__GNUC__) && __GNUC__
+  uint32_t hi, lo;
+  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+  return ((uint64_t)hi << 32) | lo;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  uint_t hi, lo;
+  asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
+  return ((uint64_t)hi << 32) | lo;
+#else
+#if ARCH_X86_64
+  return (uint64_t)__rdtsc();
+#else
+  __asm rdtsc;
+#endif
+#endif
+}
+
+#if defined(__GNUC__) && __GNUC__
+#define x86_pause_hint()\
+  __asm__ __volatile__ ("pause \n\t")
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#define x86_pause_hint()\
+  asm volatile ("pause \n\t")
+#else
+#if ARCH_X86_64
+#define x86_pause_hint()\
+  _mm_pause();
+#else
+#define x86_pause_hint()\
+  __asm pause
+#endif
+#endif
+
+#if defined(__GNUC__) && __GNUC__
+static void
+x87_set_control_word(unsigned short mode) {
+  __asm__ __volatile__("fldcw %0" : : "m"(*&mode));
+}
+static unsigned short
+x87_get_control_word(void) {
+  unsigned short mode;
+  __asm__ __volatile__("fstcw %0\n\t":"=m"(*&mode):);
+  return mode;
+}
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+static void
+x87_set_control_word(unsigned short mode) {
+  asm volatile("fldcw %0" : : "m"(*&mode));
+}
+static unsigned short
+x87_get_control_word(void) {
+  unsigned short mode;
+  asm volatile("fstcw %0\n\t":"=m"(*&mode):);
+  return mode;
+}
+#elif ARCH_X86_64
+/* No fldcw intrinsics on Windows x64, punt to external asm */
+extern void vpx_winx64_fldcw(unsigned short mode);
+extern unsigned short vpx_winx64_fstcw(void);
+#define x87_set_control_word vpx_winx64_fldcw
+#define x87_get_control_word vpx_winx64_fstcw
+#else
+static void
+x87_set_control_word(unsigned short mode) {
+  __asm { fldcw mode }
+}
+static unsigned short
+x87_get_control_word(void) {
+  unsigned short mode;
+  __asm { fstcw mode }
+  return mode;
+}
+#endif
+
+static INLINE unsigned int
+x87_set_double_precision(void) {
+  unsigned int mode = x87_get_control_word();
+  x87_set_control_word((mode&~0x300) | 0x200);
+  return mode;
+}
+
+
+extern void vpx_reset_mmx_state(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_PORTS_X86_H_ diff --git a/thirdparty/libvpx/vpx_ports/x86_abi_support.asm b/thirdparty/libvpx/vpx_ports/x86_abi_support.asm new file mode 100644 index 000000000000..708fa101c5dd
--- /dev/null +++ b/thirdparty/libvpx/vpx_ports/x86_abi_support.asm @@ -0,0 +1,404 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_config.asm" + +; 32/64 bit compatibility macros +; +; In general, we make the source use 64 bit syntax, then twiddle with it using +; the preprocessor to get the 32 bit syntax on 32 bit platforms. +; +%ifidn __OUTPUT_FORMAT__,elf32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,macho32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,win32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,aout +%define ABI_IS_32BIT 1 +%else +%define ABI_IS_32BIT 0 +%endif + +%if ABI_IS_32BIT +%define rax eax +%define rbx ebx +%define rcx ecx +%define rdx edx +%define rsi esi +%define rdi edi +%define rsp esp +%define rbp ebp +%define movsxd mov +%macro movq 2 + %ifidn %1,eax + movd %1,%2 + %elifidn %2,eax + movd %1,%2 + %elifidn %1,ebx + movd %1,%2 + %elifidn %2,ebx + movd %1,%2 + %elifidn %1,ecx + movd %1,%2 + %elifidn %2,ecx + movd %1,%2 + %elifidn %1,edx + movd %1,%2 + %elifidn %2,edx + movd %1,%2 + %elifidn %1,esi + movd %1,%2 + %elifidn %2,esi + movd %1,%2 + %elifidn %1,edi + movd %1,%2 + %elifidn %2,edi + movd %1,%2 + %elifidn %1,esp + movd %1,%2 + %elifidn %2,esp + movd %1,%2 + %elifidn %1,ebp + movd %1,%2 + %elifidn %2,ebp + movd %1,%2 + %else + movq %1,%2 + %endif +%endmacro +%endif + + +; LIBVPX_YASM_WIN64 +; Set LIBVPX_YASM_WIN64 if output is Windows 64bit so the code will work if x64 +; or win64 is defined on the Yasm command line. +%ifidn __OUTPUT_FORMAT__,win64 +%define LIBVPX_YASM_WIN64 1 +%elifidn __OUTPUT_FORMAT__,x64 +%define LIBVPX_YASM_WIN64 1 +%else +%define LIBVPX_YASM_WIN64 0 +%endif + +; sym() +; Return the proper symbol name for the target ABI. +; +; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols +; with C linkage be prefixed with an underscore. +; +%ifidn __OUTPUT_FORMAT__,elf32 +%define sym(x) x +%elifidn __OUTPUT_FORMAT__,elf64 +%define sym(x) x +%elifidn __OUTPUT_FORMAT__,elfx32 +%define sym(x) x +%elif LIBVPX_YASM_WIN64 +%define sym(x) x +%else +%define sym(x) _ %+ x +%endif + +; PRIVATE +; Macro for the attribute to hide a global symbol for the target ABI. +; This is only active if CHROMIUM is defined. +; +; Chromium doesn't like exported global symbols due to symbol clashing with +; plugins among other things. +; +; Requires Chromium's patched copy of yasm: +; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761 +; http://www.tortall.net/projects/yasm/ticket/236 +; +%ifdef CHROMIUM + %ifidn __OUTPUT_FORMAT__,elf32 + %define PRIVATE :hidden + %elifidn __OUTPUT_FORMAT__,elf64 + %define PRIVATE :hidden + %elifidn __OUTPUT_FORMAT__,elfx32 + %define PRIVATE :hidden + %elif LIBVPX_YASM_WIN64 + %define PRIVATE + %else + %define PRIVATE :private_extern + %endif +%else + %define PRIVATE +%endif + +; arg() +; Return the address specification of the given argument +; +%if ABI_IS_32BIT + %define arg(x) [ebp+8+4*x] +%else + ; 64 bit ABI passes arguments in registers. This is a workaround to get up + ; and running quickly. 
Relies on SHADOW_ARGS_TO_STACK + %if LIBVPX_YASM_WIN64 + %define arg(x) [rbp+16+8*x] + %else + %define arg(x) [rbp-8-8*x] + %endif +%endif + +; REG_SZ_BYTES, REG_SZ_BITS +; Size of a register +%if ABI_IS_32BIT +%define REG_SZ_BYTES 4 +%define REG_SZ_BITS 32 +%else +%define REG_SZ_BYTES 8 +%define REG_SZ_BITS 64 +%endif + + +; ALIGN_STACK +; This macro aligns the stack to the given alignment (in bytes). The stack +; is left such that the previous value of the stack pointer is the first +; argument on the stack (ie, the inverse of this macro is 'pop rsp.') +; This macro uses one temporary register, which is not preserved, and thus +; must be specified as an argument. +%macro ALIGN_STACK 2 + mov %2, rsp + and rsp, -%1 + lea rsp, [rsp - (%1 - REG_SZ_BYTES)] + push %2 +%endmacro + + +; +; The Microsoft assembler tries to impose a certain amount of type safety in +; its register usage. YASM doesn't recognize these directives, so we just +; %define them away to maintain as much compatibility as possible with the +; original inline assembler we're porting from. +; +%idefine PTR +%idefine XMMWORD +%idefine MMWORD + +; PIC macros +; +%if ABI_IS_32BIT + %if CONFIG_PIC=1 + %ifidn __OUTPUT_FORMAT__,elf32 + %define WRT_PLT wrt ..plt + %macro GET_GOT 1 + extern _GLOBAL_OFFSET_TABLE_ + push %1 + call %%get_got + %%sub_offset: + jmp %%exitGG + %%get_got: + mov %1, [esp] + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc + ret + %%exitGG: + %undef GLOBAL + %define GLOBAL(x) x + %1 wrt ..gotoff + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %elifidn __OUTPUT_FORMAT__,macho32 + %macro GET_GOT 1 + push %1 + call %%get_got + %%get_got: + pop %1 + %undef GLOBAL + %define GLOBAL(x) x + %1 - %%get_got + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %endif + %endif + + %ifdef CHROMIUM + %ifidn __OUTPUT_FORMAT__,macho32 + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%else + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) rel x + %ifidn __OUTPUT_FORMAT__,elf64 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,elfx32 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,macho64 + %ifdef CHROMIUM + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%endif +%ifnmacro GET_GOT + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) x +%endif +%ifndef RESTORE_GOT +%define RESTORE_GOT +%endif +%ifndef WRT_PLT +%define WRT_PLT +%endif + +%if ABI_IS_32BIT + %macro SHADOW_ARGS_TO_STACK 1 + %endm + %define UNSHADOW_ARGS +%else +%if LIBVPX_YASM_WIN64 + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + mov arg(0),rcx + %endif + %if %1 > 1 + mov arg(1),rdx + %endif + %if %1 > 2 + mov arg(2),r8 + %endif + %if %1 > 3 + mov arg(3),r9 + %endif + %endm +%else + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + push rdi + %endif + %if %1 > 1 + push rsi + %endif + %if %1 > 2 + push rdx + %endif + %if %1 > 3 + push rcx + %endif + %if %1 > 4 + push r8 + %endif + %if %1 > 5 + push r9 + %endif + %if %1 > 6 + %assign i %1-6 + %assign off 16 + %rep i + mov rax,[rbp+off] + push rax + %assign off off+8 + %endrep + %endif + %endm +%endif + %define UNSHADOW_ARGS mov rsp, rbp +%endif + +; Win64 ABI requires that XMM6:XMM15 are callee saved +; SAVE_XMM n, [u] +; store registers 6-n on the stack +; if u is specified, use unaligned movs. 
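+;
+; Editor's sketch (illustrative, hypothetical routine) of how the macros
+; pair up in a Win64 function that clobbers xmm6 and xmm7:
+;
+;   SAVE_XMM 7
+;   ; ... body may freely use xmm6 and xmm7 ...
+;   RESTORE_XMM
+;   ret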
+; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return +; value. Typically we follow this up with 'push rbp' - re-aligning the stack - +; but in some cases this is not done and unaligned movs must be used. +%if LIBVPX_YASM_WIN64 +%macro SAVE_XMM 1-2 a + %if %1 < 6 + %error Only xmm registers 6-15 must be preserved + %else + %assign last_xmm %1 + %define movxmm movdq %+ %2 + %assign xmm_stack_space ((last_xmm - 5) * 16) + sub rsp, xmm_stack_space + %assign i 6 + %rep (last_xmm - 5) + movxmm [rsp + ((i - 6) * 16)], xmm %+ i + %assign i i+1 + %endrep + %endif +%endmacro +%macro RESTORE_XMM 0 + %ifndef last_xmm + %error RESTORE_XMM must be paired with SAVE_XMM n + %else + %assign i last_xmm + %rep (last_xmm - 5) + movxmm xmm %+ i, [rsp +((i - 6) * 16)] + %assign i i-1 + %endrep + add rsp, xmm_stack_space + ; there are a couple functions which return from multiple places. + ; otherwise, we could uncomment these: + ; %undef last_xmm + ; %undef xmm_stack_space + ; %undef movxmm + %endif +%endmacro +%else +%macro SAVE_XMM 1-2 +%endmacro +%macro RESTORE_XMM 0 +%endmacro +%endif + +; Name of the rodata section +; +; .rodata seems to be an elf-ism, as it doesn't work on OSX. +; +%ifidn __OUTPUT_FORMAT__,macho64 +%define SECTION_RODATA section .text +%elifidn __OUTPUT_FORMAT__,macho32 +%macro SECTION_RODATA 0 +section .text +%endmacro +%elifidn __OUTPUT_FORMAT__,aout +%define SECTION_RODATA section .data +%else +%define SECTION_RODATA section .rodata +%endif + + +; Tell GNU ld that we don't require an executable stack. +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%elifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%elifidn __OUTPUT_FORMAT__,elfx32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif + +; On Android platforms use lrand48 when building postproc routines. Prior to L +; rand() was not available. +%if CONFIG_POSTPROC=1 || CONFIG_VP9_POSTPROC=1 +%ifdef __ANDROID__ +extern sym(lrand48) +%define LIBVPX_RAND lrand48 +%else +extern sym(rand) +%define LIBVPX_RAND rand +%endif +%endif ; CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/thirdparty/libvpx/vpx_scale/generic/yv12config.c b/thirdparty/libvpx/vpx_scale/generic/yv12config.c new file mode 100644 index 000000000000..6bbb6d8d4823 --- /dev/null +++ b/thirdparty/libvpx/vpx_scale/generic/yv12config.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+
+#include "vpx_scale/yv12config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+/****************************************************************************
+*  Exports
+****************************************************************************/
+
+/****************************************************************************
+ *
+ ****************************************************************************/
+#define yv12_align_addr(addr, align) \
+    (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
+
+int
+vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  if (ybf) {
+    // If libvpx is using frame buffer callbacks then buffer_alloc_sz must
+    // not be set.
+    if (ybf->buffer_alloc_sz > 0) {
+      vpx_free(ybf->buffer_alloc);
+    }
+
+    /* buffer_alloc isn't accessed by most functions. Rather y_buffer,
+       u_buffer and v_buffer point to buffer_alloc and are used. Clear out
+       all of this so that a freed pointer isn't inadvertently used */
+    memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+  } else {
+    return -1;
+  }
+
+  return 0;
+}
+
+int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                  int width, int height, int border) {
+  if (ybf) {
+    int aligned_width = (width + 15) & ~15;
+    int aligned_height = (height + 15) & ~15;
+    int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+    int yplane_size = (aligned_height + 2 * border) * y_stride;
+    int uv_width = aligned_width >> 1;
+    int uv_height = aligned_height >> 1;
+    /** There is currently a bunch of code which assumes
+      * uv_stride == y_stride/2, so enforce this here. */
+    int uv_stride = y_stride >> 1;
+    int uvplane_size = (uv_height + border) * uv_stride;
+    const int frame_size = yplane_size + 2 * uvplane_size;
+
+    if (!ybf->buffer_alloc) {
+      ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
+      ybf->buffer_alloc_sz = frame_size;
+    }
+
+    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size)
+      return -1;
+
+    /* Only support allocating buffers that have a border that's a multiple
+     * of 32. The border restriction is required to get 16-byte alignment of
+     * the start of the chroma rows without introducing an arbitrary gap
+     * between planes, which would break the semantics of things like
+     * vpx_img_set_rect().
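+ *
+ * Editor's worked example (hypothetical 640x480 frame with border 32):
+ * aligned_width/height stay 640 and 480, y_stride = ((640 + 64) + 31) & ~31
+ * = 704, yplane_size = (480 + 64) * 704 = 382976 bytes, and uv_stride = 352,
+ * so each chroma row starts 16-byte aligned.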
 */
+    if (border & 0x1f)
+      return -3;
+
+    ybf->y_crop_width = width;
+    ybf->y_crop_height = height;
+    ybf->y_width  = aligned_width;
+    ybf->y_height = aligned_height;
+    ybf->y_stride = y_stride;
+
+    ybf->uv_crop_width = (width + 1) / 2;
+    ybf->uv_crop_height = (height + 1) / 2;
+    ybf->uv_width = uv_width;
+    ybf->uv_height = uv_height;
+    ybf->uv_stride = uv_stride;
+
+    ybf->alpha_width = 0;
+    ybf->alpha_height = 0;
+    ybf->alpha_stride = 0;
+
+    ybf->border = border;
+    ybf->frame_size = frame_size;
+
+    ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
+    ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * uv_stride) + border / 2;
+    ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * uv_stride) + border / 2;
+    ybf->alpha_buffer = NULL;
+
+    ybf->corrupted = 0; /* assume not corrupted by errors */
+    return 0;
+  }
+  return -2;
+}
+
+int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                int width, int height, int border) {
+  if (ybf) {
+    vp8_yv12_de_alloc_frame_buffer(ybf);
+    return vp8_yv12_realloc_frame_buffer(ybf, width, height, border);
+  }
+  return -2;
+}
+
+#if CONFIG_VP9
+// TODO(jkoleszar): Maybe replace this with struct vpx_image
+
+int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  if (ybf) {
+    if (ybf->buffer_alloc_sz > 0) {
+      vpx_free(ybf->buffer_alloc);
+    }
+
+    /* buffer_alloc isn't accessed by most functions. Rather y_buffer,
+       u_buffer and v_buffer point to buffer_alloc and are used. Clear out
+       all of this so that a freed pointer isn't inadvertently used */
+    memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+  } else {
+    return -1;
+  }
+
+  return 0;
+}
+
+int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                             int width, int height,
+                             int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                             int use_highbitdepth,
+#endif
+                             int border,
+                             int byte_alignment,
+                             vpx_codec_frame_buffer_t *fb,
+                             vpx_get_frame_buffer_cb_fn_t cb,
+                             void *cb_priv) {
+  if (ybf) {
+    const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
+    const int aligned_width = (width + 7) & ~7;
+    const int aligned_height = (height + 7) & ~7;
+    const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+    const uint64_t yplane_size = (aligned_height + 2 * border) *
+                                 (uint64_t)y_stride + byte_alignment;
+    const int uv_width = aligned_width >> ss_x;
+    const int uv_height = aligned_height >> ss_y;
+    const int uv_stride = y_stride >> ss_x;
+    const int uv_border_w = border >> ss_x;
+    const int uv_border_h = border >> ss_y;
+    const uint64_t uvplane_size = (uv_height + 2 * uv_border_h) *
+                                  (uint64_t)uv_stride + byte_alignment;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint64_t frame_size =
+        (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
+#else
+    const uint64_t frame_size = yplane_size + 2 * uvplane_size;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    uint8_t *buf = NULL;
+
+    if (cb != NULL) {
+      const int align_addr_extra_size = 31;
+      const uint64_t external_frame_size = frame_size + align_addr_extra_size;
+
+      assert(fb != NULL);
+
+      if (external_frame_size != (size_t)external_frame_size)
+        return -1;
+
+      // Allocation to hold larger frame, or first allocation.
+      if (cb(cb_priv, (size_t)external_frame_size, fb) < 0)
+        return -1;
+
+      if (fb->data == NULL || fb->size < external_frame_size)
+        return -1;
+
+      ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32);
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+      // This memset is needed for fixing the issue of using uninitialized
+      // value in msan test.
It will cause a perf loss, so only do this for + // msan test. + memset(ybf->buffer_alloc, 0, (int)frame_size); +#endif +#endif + } else if (frame_size > (size_t)ybf->buffer_alloc_sz) { + // Allocation to hold larger frame, or first allocation. + vpx_free(ybf->buffer_alloc); + ybf->buffer_alloc = NULL; + + if (frame_size != (size_t)frame_size) + return -1; + + ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); + if (!ybf->buffer_alloc) + return -1; + + ybf->buffer_alloc_sz = (int)frame_size; + + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); + } + + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). */ + if (border & 0x1f) + return -3; + + ybf->y_crop_width = width; + ybf->y_crop_height = height; + ybf->y_width = aligned_width; + ybf->y_height = aligned_height; + ybf->y_stride = y_stride; + + ybf->uv_crop_width = (width + ss_x) >> ss_x; + ybf->uv_crop_height = (height + ss_y) >> ss_y; + ybf->uv_width = uv_width; + ybf->uv_height = uv_height; + ybf->uv_stride = uv_stride; + + ybf->border = border; + ybf->frame_size = (int)frame_size; + ybf->subsampling_x = ss_x; + ybf->subsampling_y = ss_y; + + buf = ybf->buffer_alloc; +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbitdepth) { + // Store uint16 addresses when using 16bit framebuffers + buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc); + ybf->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + ybf->flags = 0; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + ybf->y_buffer = (uint8_t *)yv12_align_addr( + buf + (border * y_stride) + border, vp9_byte_align); + ybf->u_buffer = (uint8_t *)yv12_align_addr( + buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w, + vp9_byte_align); + ybf->v_buffer = (uint8_t *)yv12_align_addr( + buf + yplane_size + uvplane_size + (uv_border_h * uv_stride) + + uv_border_w, vp9_byte_align); + + ybf->corrupted = 0; /* assume not corrupted by errors */ + return 0; + } + return -2; +} + +int vpx_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, + int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border, + int byte_alignment) { + if (ybf) { + vpx_free_frame_buffer(ybf); + return vpx_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, byte_alignment, NULL, NULL, NULL); + } + return -2; +} +#endif diff --git a/thirdparty/libvpx/vpx_scale/generic/yv12extend.c b/thirdparty/libvpx/vpx_scale/generic/yv12extend.c new file mode 100644 index 000000000000..52f0aff1f25e --- /dev/null +++ b/thirdparty/libvpx/vpx_scale/generic/yv12extend.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_scale/yv12config.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
+
+static void extend_plane(uint8_t *const src, int src_stride,
+                         int width, int height,
+                         int extend_top, int extend_left,
+                         int extend_bottom, int extend_right) {
+  int i;
+  const int linesize = extend_left + extend_right + width;
+
+  /* copy the left and right most columns out */
+  uint8_t *src_ptr1 = src;
+  uint8_t *src_ptr2 = src + width - 1;
+  uint8_t *dst_ptr1 = src - extend_left;
+  uint8_t *dst_ptr2 = src + width;
+
+  for (i = 0; i < height; ++i) {
+    memset(dst_ptr1, src_ptr1[0], extend_left);
+    memset(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_stride;
+    src_ptr2 += src_stride;
+    dst_ptr1 += src_stride;
+    dst_ptr2 += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = src - extend_left;
+  src_ptr2 = src + src_stride * (height - 1) - extend_left;
+  dst_ptr1 = src + src_stride * -extend_top - extend_left;
+  dst_ptr2 = src + src_stride * height - extend_left;
+
+  for (i = 0; i < extend_top; ++i) {
+    memcpy(dst_ptr1, src_ptr1, linesize);
+    dst_ptr1 += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; ++i) {
+    memcpy(dst_ptr2, src_ptr2, linesize);
+    dst_ptr2 += src_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void extend_plane_high(uint8_t *const src8, int src_stride,
+                              int width, int height,
+                              int extend_top, int extend_left,
+                              int extend_bottom, int extend_right) {
+  int i;
+  const int linesize = extend_left + extend_right + width;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+  /* copy the left and right most columns out */
+  uint16_t *src_ptr1 = src;
+  uint16_t *src_ptr2 = src + width - 1;
+  uint16_t *dst_ptr1 = src - extend_left;
+  uint16_t *dst_ptr2 = src + width;
+
+  for (i = 0; i < height; ++i) {
+    vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+    vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_stride;
+    src_ptr2 += src_stride;
+    dst_ptr1 += src_stride;
+    dst_ptr2 += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = src - extend_left;
+  src_ptr2 = src + src_stride * (height - 1) - extend_left;
+  dst_ptr1 = src + src_stride * -extend_top - extend_left;
+  dst_ptr2 = src + src_stride * height - extend_left;
+
+  for (i = 0; i < extend_top; ++i) {
+    memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+    dst_ptr1 += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; ++i) {
+    memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+    dst_ptr2 += src_stride;
+  }
+}
+#endif
+
+void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+  const int uv_border = ybf->border / 2;
+
+  assert(ybf->border % 2 == 0);
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(
+        ybf->y_buffer, ybf->y_stride,
+        ybf->y_crop_width, ybf->y_crop_height,
+        ybf->border, ybf->border,
+        ybf->border + ybf->y_height - ybf->y_crop_height,
+        ybf->border + ybf->y_width - ybf->y_crop_width);
+
+    extend_plane_high(
+        ybf->u_buffer, ybf->uv_stride,
+        ybf->uv_crop_width, ybf->uv_crop_height,
+        uv_border, uv_border,
+        uv_border
+ ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); + + extend_plane_high( + ybf->v_buffer, ybf->uv_stride, + ybf->uv_crop_width, ybf->uv_crop_height, + uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); + return; + } +#endif + extend_plane(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ybf->border, ybf->border, + ybf->border + ybf->y_height - ybf->y_crop_height, + ybf->border + ybf->y_width - ybf->y_crop_width); + + extend_plane(ybf->u_buffer, ybf->uv_stride, + ybf->uv_crop_width, ybf->uv_crop_height, + uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); + + extend_plane(ybf->v_buffer, ybf->uv_stride, + ybf->uv_crop_width, ybf->uv_crop_height, + uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); +} + +#if CONFIG_VP9 +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) { + const int c_w = ybf->uv_crop_width; + const int c_h = ybf->uv_crop_height; + const int ss_x = ybf->uv_width < ybf->y_width; + const int ss_y = ybf->uv_height < ybf->y_height; + const int c_et = ext_size >> ss_y; + const int c_el = ext_size >> ss_x; + const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height; + const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width; + + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + extend_plane_high(ybf->u_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); + extend_plane_high(ybf->v_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); + return; + } +#endif + extend_plane(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + + extend_plane(ybf->u_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); + + extend_plane(ybf->v_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); +} + +void vpx_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { + extend_frame(ybf, ybf->border); +} + +void vpx_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) { + const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ? + VP9INNERBORDERINPIXELS : ybf->border; + extend_frame(ybf, inner_bw); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + memcpy(dst, src, num * sizeof(uint16_t)); +} +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_VP9 + +// Copies the source image into the destination image and updates the +// destination's UMV borders. +// Note: The frames are assumed to be identical in size. 
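+//
+// Editor's sketch of a typical call sequence (hypothetical dimensions):
+//
+//   YV12_BUFFER_CONFIG src = { 0 }, dst = { 0 };
+//   vp8_yv12_alloc_frame_buffer(&src, 640, 480, VP8BORDERINPIXELS);
+//   vp8_yv12_alloc_frame_buffer(&dst, 640, 480, VP8BORDERINPIXELS);
+//   vp8_yv12_copy_frame_c(&src, &dst);  // copies planes, re-extends borders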
+void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if 0 + /* These assertions are valid in the codec, but the libvpx-tester uses + * this code slightly differently. + */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH); + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); + return; + } else { + assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH)); + } +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); +} + +void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if CONFIG_VP9_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } +} diff --git a/thirdparty/libvpx/vpx_scale/vpx_scale.h b/thirdparty/libvpx/vpx_scale/vpx_scale.h new file mode 100644 index 000000000000..43fcf9d66e18 --- /dev/null +++ b/thirdparty/libvpx/vpx_scale/vpx_scale.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VPX_SCALE_VPX_SCALE_H_ +#define VPX_SCALE_VPX_SCALE_H_ + +#include "vpx_scale/yv12config.h" + +extern void vpx_scale_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, + unsigned char temp_height, + unsigned int hscale, + unsigned int hratio, + unsigned int vscale, + unsigned int vratio, + unsigned int interlaced); + +#endif // VPX_SCALE_VPX_SCALE_H_ diff --git a/thirdparty/libvpx/vpx_scale/vpx_scale_rtcd.c b/thirdparty/libvpx/vpx_scale/vpx_scale_rtcd.c new file mode 100644 index 000000000000..bea603fd1046 --- /dev/null +++ b/thirdparty/libvpx/vpx_scale/vpx_scale_rtcd.c @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#define RTCD_C +#include "./vpx_scale_rtcd.h" +#include "vpx_ports/vpx_once.h" + +void vpx_scale_rtcd() +{ + once(setup_rtcd_internal); +} diff --git a/thirdparty/libvpx/vpx_scale/yv12config.h b/thirdparty/libvpx/vpx_scale/yv12config.h new file mode 100644 index 000000000000..37b255d4d392 --- /dev/null +++ b/thirdparty/libvpx/vpx_scale/yv12config.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_SCALE_YV12CONFIG_H_ +#define VPX_SCALE_YV12CONFIG_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_config.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_frame_buffer.h" +#include "vpx/vpx_integer.h" + +#define VP8BORDERINPIXELS 32 +#define VP9INNERBORDERINPIXELS 96 +#define VP9_INTERP_EXTEND 4 +#define VP9_ENC_BORDER_IN_PIXELS 160 +#define VP9_DEC_BORDER_IN_PIXELS 32 + +typedef struct yv12_buffer_config { + int y_width; + int y_height; + int y_crop_width; + int y_crop_height; + int y_stride; + + int uv_width; + int uv_height; + int uv_crop_width; + int uv_crop_height; + int uv_stride; + + int alpha_width; + int alpha_height; + int alpha_stride; + + uint8_t *y_buffer; + uint8_t *u_buffer; + uint8_t *v_buffer; + uint8_t *alpha_buffer; + + uint8_t *buffer_alloc; + int buffer_alloc_sz; + int border; + int frame_size; + int subsampling_x; + int subsampling_y; + unsigned int bit_depth; + vpx_color_space_t color_space; + vpx_color_range_t color_range; + int render_width; + int render_height; + + int corrupted; + int flags; +} YV12_BUFFER_CONFIG; + +#define YV12_FLAG_HIGHBITDEPTH 8 + +int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, int border); +int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, int border); +int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf); + +int vpx_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border, int byte_alignment); + +// Updates the yv12 buffer config with the frame buffer. 
|byte_alignment| must +// be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not +// NULL, then libvpx is using the frame buffer callbacks to handle memory. +// If cb is not NULL, libvpx will call cb with minimum size in bytes needed +// to decode the current frame. If cb is NULL, libvpx will allocate memory +// internally to decode the current frame. Returns 0 on success. Returns < 0 +// on failure. +int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border, + int byte_alignment, + vpx_codec_frame_buffer_t *fb, + vpx_get_frame_buffer_cb_fn_t cb, + void *cb_priv); +int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_SCALE_YV12CONFIG_H_ diff --git a/thirdparty/libvpx/vpx_scale_rtcd.h b/thirdparty/libvpx/vpx_scale_rtcd.h new file mode 100644 index 000000000000..4d59467fe932 --- /dev/null +++ b/thirdparty/libvpx/vpx_scale_rtcd.h @@ -0,0 +1,44 @@ +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ + //Only MIPS has something here, but it is not supported +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/thirdparty/libvpx/vpx_util/endian_inl.h b/thirdparty/libvpx/vpx_util/endian_inl.h new file mode 100644 index 000000000000..37bdce1ccd96 --- /dev/null +++ b/thirdparty/libvpx/vpx_util/endian_inl.h @@ -0,0 +1,120 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Endian related functions. 
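+//
+// Illustrative sketch, not from the upstream sources: since each BSwapNN
+// function is its own inverse, the HToLE*/HToBE* macros below can be used
+// both to store and to load fixed-endian values. A hypothetical reader of a
+// little-endian 32-bit field could look like:
+//
+// static INLINE uint32_t read_le32(const uint8_t *p) {
+// uint32_t v;
+// memcpy(&v, p, sizeof(v)); // host-order load (needs <string.h>)
+// return HToLE32(v); // swaps only on big-endian hosts
+// }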
+
+#ifndef VPX_UTIL_ENDIAN_INL_H_
+#define VPX_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+ (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+// handle clang compatibility
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+ (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+ (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#define HToBE64(X) BSwap64(X)
+#define HToBE32(X) BSwap32(X)
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#define HAVE_BUILTIN_BSWAP32
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
+#endif
+
+#if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \
+ defined(__mips_isa_rev) && (__mips_isa_rev >= 2) && (__mips_isa_rev < 6)
+#define VPX_USE_MIPS32_R2
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+ return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+ return _byteswap_ushort(x);
+#else
+ // gcc will recognize a 'rorw $8, ...' here:
+ return (x >> 8) | ((x & 0xff) << 8);
+#endif // HAVE_BUILTIN_BSWAP16
+}
+
+static INLINE uint32_t BSwap32(uint32_t x) {
+#if defined(VPX_USE_MIPS32_R2)
+ uint32_t ret;
+ __asm__ volatile (
+ "wsbh %[ret], %[x] \n\t"
+ "rotr %[ret], %[ret], 16 \n\t"
+ : [ret]"=r"(ret)
+ : [x]"r"(x)
+ );
+ return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
+ return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)
+ uint32_t swapped_bytes;
+ __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint32_t)_byteswap_ulong(x);
+#else
+ return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif // HAVE_BUILTIN_BSWAP32
+}
+
+static INLINE uint64_t BSwap64(uint64_t x) {
+#if defined(HAVE_BUILTIN_BSWAP64)
+ return __builtin_bswap64(x);
+#elif defined(__x86_64__)
+ uint64_t swapped_bytes;
+ __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint64_t)_byteswap_uint64(x);
+#else // generic code for swapping 64-bit values (suggested by bdb@)
+ x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+ x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+ x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8);
+ return x;
+#endif // HAVE_BUILTIN_BSWAP64
+}
+
+#endif // VPX_UTIL_ENDIAN_INL_H_
diff --git a/thirdparty/libvpx/vpx_util/vpx_thread.c b/thirdparty/libvpx/vpx_util/vpx_thread.c
new file mode 100644
index 000000000000..0bb0125bd49e
--- /dev/null
+++ b/thirdparty/libvpx/vpx_util/vpx_thread.c
@@ -0,0 +1,184 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Original source:
+// http://git.chromium.org/webm/libwebp.git
+// 100644 blob 264210ba2807e4da47eb5d18c04cf869d89b9784 src/utils/thread.c
+
+#include <assert.h>
+#include <string.h> // for memset()
+#include "./vpx_thread.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_MULTITHREAD
+
+struct VPxWorkerImpl {
+ pthread_mutex_t mutex_;
+ pthread_cond_t condition_;
+ pthread_t thread_;
+};
+
+//------------------------------------------------------------------------------
+
+static void execute(VPxWorker *const worker); // Forward declaration.
+
+static THREADFN thread_loop(void *ptr) {
+ VPxWorker *const worker = (VPxWorker*)ptr;
+ int done = 0;
+ while (!done) {
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ while (worker->status_ == OK) { // wait in idling mode
+ pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+ }
+ if (worker->status_ == WORK) {
+ execute(worker);
+ worker->status_ = OK;
+ } else if (worker->status_ == NOT_OK) { // finish the worker
+ done = 1;
+ }
+ // signal to the main thread that we're done (for sync())
+ pthread_cond_signal(&worker->impl_->condition_);
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ }
+ return THREAD_RETURN(NULL); // Thread is finished
+}
+
+// main thread state control
+static void change_state(VPxWorker *const worker,
+ VPxWorkerStatus new_status) {
+ // No-op when attempting to change state on a thread that didn't come up.
+ // Checking status_ without acquiring the lock first would result in a data
+ // race.
+ if (worker->impl_ == NULL) return; + + pthread_mutex_lock(&worker->impl_->mutex_); + if (worker->status_ >= OK) { + // wait for the worker to finish + while (worker->status_ != OK) { + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + // assign new status and release the working thread if needed + if (new_status != OK) { + worker->status_ = new_status; + pthread_cond_signal(&worker->impl_->condition_); + } + } + pthread_mutex_unlock(&worker->impl_->mutex_); +} + +#endif // CONFIG_MULTITHREAD + +//------------------------------------------------------------------------------ + +static void init(VPxWorker *const worker) { + memset(worker, 0, sizeof(*worker)); + worker->status_ = NOT_OK; +} + +static int sync(VPxWorker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, OK); +#endif + assert(worker->status_ <= OK); + return !worker->had_error; +} + +static int reset(VPxWorker *const worker) { + int ok = 1; + worker->had_error = 0; + if (worker->status_ < OK) { +#if CONFIG_MULTITHREAD + worker->impl_ = (VPxWorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_)); + if (worker->impl_ == NULL) { + return 0; + } + if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) { + goto Error; + } + if (pthread_cond_init(&worker->impl_->condition_, NULL)) { + pthread_mutex_destroy(&worker->impl_->mutex_); + goto Error; + } + pthread_mutex_lock(&worker->impl_->mutex_); + ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker); + if (ok) worker->status_ = OK; + pthread_mutex_unlock(&worker->impl_->mutex_); + if (!ok) { + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + Error: + vpx_free(worker->impl_); + worker->impl_ = NULL; + return 0; + } +#else + worker->status_ = OK; +#endif + } else if (worker->status_ > OK) { + ok = sync(worker); + } + assert(!ok || (worker->status_ == OK)); + return ok; +} + +static void execute(VPxWorker *const worker) { + if (worker->hook != NULL) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); + } +} + +static void launch(VPxWorker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, WORK); +#else + execute(worker); +#endif +} + +static void end(VPxWorker *const worker) { +#if CONFIG_MULTITHREAD + if (worker->impl_ != NULL) { + change_state(worker, NOT_OK); + pthread_join(worker->impl_->thread_, NULL); + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + vpx_free(worker->impl_); + worker->impl_ = NULL; + } +#else + worker->status_ = NOT_OK; + assert(worker->impl_ == NULL); +#endif + assert(worker->status_ == NOT_OK); +} + +//------------------------------------------------------------------------------ + +static VPxWorkerInterface g_worker_interface = { + init, reset, sync, launch, execute, end +}; + +int vpx_set_worker_interface(const VPxWorkerInterface* const winterface) { + if (winterface == NULL || + winterface->init == NULL || winterface->reset == NULL || + winterface->sync == NULL || winterface->launch == NULL || + winterface->execute == NULL || winterface->end == NULL) { + return 0; + } + g_worker_interface = *winterface; + return 1; +} + +const VPxWorkerInterface *vpx_get_worker_interface(void) { + return &g_worker_interface; +} + +//------------------------------------------------------------------------------ diff --git a/thirdparty/libvpx/vpx_util/vpx_thread.h b/thirdparty/libvpx/vpx_util/vpx_thread.h new file mode 100644 index 000000000000..2062abd75f9f --- /dev/null +++ 
b/thirdparty/libvpx/vpx_util/vpx_thread.h
@@ -0,0 +1,369 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Original source:
+// http://git.chromium.org/webm/libwebp.git
+// 100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38 src/utils/thread.h
+
+#ifndef VPX_THREAD_H_
+#define VPX_THREAD_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set maximum decode threads to be 8 due to the limit of frame buffers
+// and not enough semaphores in the emulation layer on windows.
+#define MAX_DECODE_THREADS 8
+
+#if CONFIG_MULTITHREAD
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#include <errno.h> // NOLINT
+#include <process.h> // NOLINT
+#include <windows.h> // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+typedef struct {
+ HANDLE waiting_sem_;
+ HANDLE received_sem_;
+ HANDLE signal_event_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+static INLINE int pthread_create(pthread_t* const thread, const void* attr,
+ unsigned int (__stdcall *start)(void*),
+ void* arg) {
+ (void)attr;
+ *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
+ 0, /* unsigned stack_size */
+ start,
+ arg,
+ 0, /* unsigned initflag */
+ NULL); /* unsigned *thrdaddr */
+ if (*thread == NULL) return 1;
+ SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+ return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void** value_ptr) {
+ (void)value_ptr;
+ return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+ CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+ void* mutexattr) {
+ (void)mutexattr;
+ InitializeCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+ return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ EnterCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ LeaveCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ DeleteCriticalSection(mutex);
+ return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ int ok = 1;
+ ok &= (CloseHandle(condition->waiting_sem_) != 0);
+ ok &= (CloseHandle(condition->received_sem_) != 0);
+ ok &= (CloseHandle(condition->signal_event_) != 0);
+ return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void* cond_attr) {
+ (void)cond_attr;
+ condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+ condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+ condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
+ if (condition->waiting_sem_ == NULL ||
+ condition->received_sem_ == NULL ||
+ condition->signal_event_ == NULL) {
+ pthread_cond_destroy(condition);
+ return 1;
+ }
+ return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ int ok = 1;
+ if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+ // a thread is waiting in pthread_cond_wait: allow it to be notified
+ ok = SetEvent(condition->signal_event_);
+ // wait until the event is consumed so the signaler cannot consume
+ // the event via its own pthread_cond_wait.
+ ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
+ WAIT_OBJECT_0);
+ }
+ return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+ pthread_mutex_t *const mutex) {
+ int ok;
+ // note that there is a consumer available so the signal isn't dropped in
+ // pthread_cond_signal
+ if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
+ return 1;
+ // now unlock the mutex so pthread_cond_signal may be issued
+ pthread_mutex_unlock(mutex);
+ ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
+ WAIT_OBJECT_0);
+ ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
+ pthread_mutex_lock(mutex);
+ return !ok;
+}
+#elif defined(__OS2__)
+#define INCL_DOS
+#include <os2.h> // NOLINT
+
+#include <errno.h> // NOLINT
+#include <stdlib.h> // NOLINT
+#include <sys/builtin.h> // NOLINT
+
+#define pthread_t TID
+#define pthread_mutex_t HMTX
+
+typedef struct {
+ HEV event_sem_;
+ HEV ack_sem_;
+ volatile unsigned wait_count_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+#define THREADFN void *
+#define THREAD_RETURN(val) (val)
+
+typedef struct {
+ void* (*start_)(void*);
+ void* arg_;
+} thread_arg;
+
+static void thread_start(void* arg) {
+ thread_arg targ = *(thread_arg *)arg;
+ free(arg);
+
+ targ.start_(targ.arg_);
+}
+
+static INLINE int pthread_create(pthread_t* const thread, const void* attr,
+ void* (*start)(void*),
+ void* arg) {
+ int tid;
+ thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
+ if (targ == NULL) return 1;
+
+ (void)attr;
+
+ targ->start_ = start;
+ targ->arg_ = arg;
+ tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
+ if (tid == -1) {
+ free(targ);
+ return 1;
+ }
+
+ *thread = tid;
+ return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void** value_ptr) {
+ (void)value_ptr;
+ return DosWaitThread(&thread, DCWW_WAIT) != 0;
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+ void* mutexattr) {
+ (void)mutexattr;
+ return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+ return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ return DosReleaseMutexSem(*mutex) != 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ return DosCloseMutexSem(*mutex) != 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ int ok = 1;
+ ok &= DosCloseEventSem(condition->event_sem_) == 0;
+ ok &= DosCloseEventSem(condition->ack_sem_) == 0;
+ return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void* cond_attr) {
+ int ok = 1;
+ (void)cond_attr;
+
+ ok &= DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE)
+ == 0;
+ ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
+ if (!ok) {
+ pthread_cond_destroy(condition);
+ return 1;
+ }
+ condition->wait_count_ = 0;
+ return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ int ok = 1;
+
+ if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
+ ok &= DosPostEventSem(condition->event_sem_) == 0;
+ ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
+ }
+
+ return !ok;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+ int ok = 1;
+
+ while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
+ ok &= pthread_cond_signal(condition) == 0;
+
+ return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+ pthread_mutex_t *const mutex) {
+ int ok = 1;
+
+ __atomic_increment(&condition->wait_count_);
+
+ ok &= pthread_mutex_unlock(mutex) == 0;
+
+ ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
+
+ __atomic_decrement(&condition->wait_count_);
+
+ ok &= DosPostEventSem(condition->ack_sem_) == 0;
+
+ pthread_mutex_lock(mutex);
+
+ return !ok;
+}
+#else // _WIN32
+#include <pthread.h> // NOLINT
+# define THREADFN void*
+# define THREAD_RETURN(val) val
+#endif
+
+#endif // CONFIG_MULTITHREAD
+
+// State of the worker thread object
+typedef enum {
+ NOT_OK = 0, // object is unusable
+ OK, // ready to work
+ WORK // busy finishing the current task
+} VPxWorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2), and should return false in case of error.
+typedef int (*VPxWorkerHook)(void*, void*);
+
+// Platform-dependent implementation details for the worker.
+typedef struct VPxWorkerImpl VPxWorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
+typedef struct {
+ VPxWorkerImpl *impl_;
+ VPxWorkerStatus status_;
+ VPxWorkerHook hook; // hook to call
+ void *data1; // first argument passed to 'hook'
+ void *data2; // second argument passed to 'hook'
+ int had_error; // return value of the last call to 'hook'
+} VPxWorker;
+
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+ // Must be called first, before any other method.
+ void (*init)(VPxWorker *const worker);
+ // Must be called to initialize the object and spawn the thread. Re-entrant.
+ // Will potentially launch the thread. Returns false in case of error. + int (*reset)(VPxWorker *const worker); + // Makes sure the previous work is finished. Returns true if worker->had_error + // was not set and no error condition was triggered by the working thread. + int (*sync)(VPxWorker *const worker); + // Triggers the thread to call hook() with data1 and data2 arguments. These + // hook/data1/data2 values can be changed at any time before calling this + // function, but not be changed afterward until the next call to Sync(). + void (*launch)(VPxWorker *const worker); + // This function is similar to launch() except that it calls the + // hook directly instead of using a thread. Convenient to bypass the thread + // mechanism while still using the VPxWorker structs. sync() must + // still be called afterward (for error reporting). + void (*execute)(VPxWorker *const worker); + // Kill the thread and terminate the object. To use the object again, one + // must call reset() again. + void (*end)(VPxWorker *const worker); +} VPxWorkerInterface; + +// Install a new set of threading functions, overriding the defaults. This +// should be done before any workers are started, i.e., before any encoding or +// decoding takes place. The contents of the interface struct are copied, it +// is safe to free the corresponding memory after this call. This function is +// not thread-safe. Return false in case of invalid pointer or methods. +int vpx_set_worker_interface(const VPxWorkerInterface *const winterface); + +// Retrieve the currently set thread worker interface. +const VPxWorkerInterface *vpx_get_worker_interface(void); + +//------------------------------------------------------------------------------ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_THREAD_H_ diff --git a/thirdparty/libvpx/vpx_version.h b/thirdparty/libvpx/vpx_version.h new file mode 100644 index 000000000000..5cff3b429f43 --- /dev/null +++ b/thirdparty/libvpx/vpx_version.h @@ -0,0 +1,7 @@ +#define VERSION_MAJOR 1 +#define VERSION_MINOR 6 +#define VERSION_PATCH 0 +#define VERSION_EXTRA "" +#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) +#define VERSION_STRING_NOSP "v1.6.0" +#define VERSION_STRING " v1.6.0" diff --git a/thirdparty/opus/COPYING b/thirdparty/opus/COPYING new file mode 100644 index 000000000000..9c739c34a3a9 --- /dev/null +++ b/thirdparty/opus/COPYING @@ -0,0 +1,44 @@ +Copyright 2001-2011 Xiph.Org, Skype Limited, Octasic, + Jean-Marc Valin, Timothy B. Terriberry, + CSIRO, Gregory Maxwell, Mark Borgerding, + Erik de Castro Lopo + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +- Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Opus is subject to the royalty-free patent licenses which are +specified at: + +Xiph.Org Foundation: +https://datatracker.ietf.org/ipr/1524/ + +Microsoft Corporation: +https://datatracker.ietf.org/ipr/1914/ + +Broadcom Corporation: +https://datatracker.ietf.org/ipr/1526/ diff --git a/thirdparty/opus/analysis.c b/thirdparty/opus/analysis.c new file mode 100644 index 000000000000..663431a436a1 --- /dev/null +++ b/thirdparty/opus/analysis.c @@ -0,0 +1,672 @@ +/* Copyright (c) 2011 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "kiss_fft.h"
+#include "celt.h"
+#include "modes.h"
+#include "arch.h"
+#include "quant_bands.h"
+#include <math.h>
+#include "analysis.h"
+#include "mlp.h"
+#include "stack_alloc.h"
+
+#ifndef M_PI
+#define M_PI 3.141592653
+#endif
+
+static const float dct_table[128] = {
+ 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
+ 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
+ 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
+ -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
+ 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
+ -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
+ 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
+ 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
+ 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
+ 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
+ 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
+ -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
+ 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
+ -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
+ 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
+ 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
+};
+
+static const float analysis_window[240] = {
+ 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
+ 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
+ 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
+ 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
+ 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
+ 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
+ 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
+ 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
+ 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
+ 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
+ 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
+ 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
+ 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
+ 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
+ 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
+ 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
+ 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
+ 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
+ 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
+ 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
+ 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
+ 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
+ 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
+ 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
+ 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
+ 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
+ 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
+ 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
+ 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
+ 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
+};
+
+static const int tbands[NB_TBANDS+1] = {
+ 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
+};
+
+static const int extra_bands[NB_TOT_BANDS+1] = {
+ 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
+};
+
+/*static const float tweight[NB_TBANDS+1] = {
+ .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
+};*/
+
+#define NB_TONAL_SKIP_BANDS 9
+
+#define cA 0.43157974f
+#define cB 0.67848403f
+#define cC 0.08595542f
+#define cE ((float)M_PI/2)
+static OPUS_INLINE float fast_atan2f(float y, float x) {
+ float x2, y2;
+ /* Should avoid underflow on the values we'll get */
+ if (ABS16(x)+ABS16(y)<1e-9f)
+ {
+ x*=1e12f;
+ y*=1e12f;
+ }
+ x2 = x*x;
+ y2 = y*y;
+ if(x2<y2){
+ float den = (y2 + cB*x2) * (y2 + cC*x2);
+ if (den!=0)
+ return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
+ else
+ return (y<0 ? -cE : cE);
+ }else{
+ float den = (x2 + cB*y2) * (x2 + cC*y2);
+ if (den!=0)
+ return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
+ else
+ return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
+ }
+}
+
+void tonality_analysis_init(TonalityAnalysisState *tonal)
+{
+ /* Initialize reusable fields. */
+ tonal->arch = opus_select_arch();
+ /* Clear remaining fields. */
+ tonality_analysis_reset(tonal);
+}
+
+void tonality_analysis_reset(TonalityAnalysisState *tonal)
+{
+ /* Clear non-reusable fields. */
+ char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
+ OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
+}
+
+void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
+{
+ int pos;
+ int curr_lookahead;
+ float psum;
+ int i;
+
+ pos = tonal->read_pos;
+ curr_lookahead = tonal->write_pos-tonal->read_pos;
+ if (curr_lookahead<0)
+ curr_lookahead += DETECT_SIZE;
+
+ if (len > 480 && pos != tonal->write_pos)
+ {
+ pos++;
+ if (pos==DETECT_SIZE)
+ pos=0;
+ }
+ if (pos == tonal->write_pos)
+ pos--;
+ if (pos<0)
+ pos = DETECT_SIZE-1;
+ OPUS_COPY(info_out, &tonal->info[pos], 1);
+ tonal->read_subframe += len/120;
+ while (tonal->read_subframe>=4)
+ {
+ tonal->read_subframe -= 4;
+ tonal->read_pos++;
+ }
+ if (tonal->read_pos>=DETECT_SIZE)
+ tonal->read_pos-=DETECT_SIZE;
+
+ /* Compensate for the delay in the features themselves.
+ FIXME: Need a better estimate the 10 I just made up */
+ curr_lookahead = IMAX(curr_lookahead-10, 0);
+
+ psum=0;
+ /* Summing the probability of transition patterns that involve music at
+ time (DETECT_SIZE-curr_lookahead-1) */
+ for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
+ psum += tonal->pmusic[i];
+ for (;i<DETECT_SIZE;i++)
+ psum += tonal->pspeech[i];
+ psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
+ /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
+
+ info_out->music_prob = psum;
+}
+
+static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
+{
+ int i, b;
+ const kiss_fft_state *kfft;
+ VARDECL(kiss_fft_cpx, in);
+ VARDECL(kiss_fft_cpx, out);
+ int N = 480, N2=240;
+ float * OPUS_RESTRICT A = tonal->angle;
+ float * OPUS_RESTRICT dA = tonal->d_angle;
+ float * OPUS_RESTRICT d2A = tonal->d2_angle;
+ VARDECL(float, tonality);
+ VARDECL(float, noisiness);
+ float band_tonality[NB_TBANDS];
+ float logE[NB_TBANDS];
+ float BFCC[8];
+ float features[25];
+ float frame_tonality;
+ float max_frame_tonality;
+ /*float tw_sum=0;*/
+ float frame_noisiness;
+ const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
+ float slope=0;
+ float frame_stationarity;
+ float relativeE;
+ float frame_probs[2];
+ float alpha, alphaE, alphaE2;
+ float frame_loudness;
+ float bandwidth_mask;
+ int bandwidth=0;
+ float maxE = 0;
+ float noise_floor;
+ int remaining;
+ AnalysisInfo *info;
+ SAVE_STACK;
+
+ tonal->last_transition++;
+ alpha = 1.f/IMIN(20, 1+tonal->count);
+ alphaE = 1.f/IMIN(50, 1+tonal->count);
+ alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
+
+ if (tonal->count<4)
+ tonal->music_prob = .5;
+ kfft = celt_mode->mdct.kfft[0];
+ if (tonal->count==0)
+ tonal->mem_fill = 240;
+ downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
+ if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
+ {
+ tonal->mem_fill += len;
+ /* Don't have enough to update the analysis */
+ RESTORE_STACK;
+ return;
+ }
+ info = &tonal->info[tonal->write_pos++];
+ if (tonal->write_pos>=DETECT_SIZE)
+ tonal->write_pos-=DETECT_SIZE;
+
+ ALLOC(in, 480, kiss_fft_cpx);
+ ALLOC(out, 480, kiss_fft_cpx);
+ ALLOC(tonality, 240, float);
+ ALLOC(noisiness, 240, float);
+ for (i=0;i<N2;i++)
+ {
+ float w = analysis_window[i];
+ in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
+ in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
+ in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
+ in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
+ }
+ OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
+ remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
+ downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
+ tonal->mem_fill = 240 + remaining;
+ opus_fft(kfft, in, out, tonal->arch);
+#ifndef FIXED_POINT
+ /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
+ if (celt_isnan(out[0].r))
+ {
+ info->valid = 0;
+ RESTORE_STACK;
+ return;
+ }
+#endif
+
+ for (i=1;i<N2;i++)
+ {
+ float X1r, X2r, X1i, X2i;
+ float angle, d_angle, d2_angle;
+ float angle2, d_angle2, d2_angle2;
+ float mod1, mod2, avg_mod;
+ X1r = (float)out[i].r+out[N-i].r;
+ X1i = (float)out[i].i-out[N-i].i;
+ X2r = (float)out[i].i+out[N-i].i;
+ X2i = (float)out[N-i].r-out[i].r;
+
+ angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
+ d_angle = angle - A[i];
+ d2_angle = d_angle - dA[i];
+
+ angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
+ d_angle2 = angle2 - angle;
+ d2_angle2 = d_angle2 - d_angle;
+
+ mod1 = d2_angle - (float)float2int(d2_angle);
+ noisiness[i] = ABS16(mod1);
+ mod1 *= mod1;
+ mod1 *= mod1;
+
+ mod2 = d2_angle2 - (float)float2int(d2_angle2);
+ noisiness[i] += ABS16(mod2);
+ mod2 *= mod2;
+ mod2 *= mod2;
+
+ avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
+ tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
+
+ A[i] = angle2;
+ dA[i] = d_angle2;
+ d2A[i] = mod2;
+ }
+
+ frame_tonality = 0;
+ max_frame_tonality = 0;
+ /*tw_sum = 0;*/
+ info->activity = 0;
+ frame_noisiness = 0;
+ frame_stationarity = 0;
+ if (!tonal->count)
+ {
+ for (b=0;b<NB_TBANDS;b++)
+ {
+ tonal->lowE[b] = 1e10;
+ tonal->highE[b] = -1e10;
+ }
+ }
+ relativeE = 0;
+ frame_loudness = 0;
+ for (b=0;b<NB_TBANDS;b++)
+ {
+ float E=0, tE=0, nE=0;
+ float L1, L2;
+ float stationarity;
+ for (i=tbands[b];i<tbands[b+1];i++)
+ {
+ float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
+ + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
+#ifdef FIXED_POINT
+ /* FIXME: It's probably best to change the BFCC filter initial state instead */
+ binE *= 5.55e-17f;
+#endif
+ E += binE;
+ tE += binE*tonality[i];
+ nE += binE*2.f*(.5f-noisiness[i]);
+ }
+#ifndef FIXED_POINT
+ /* Check for extreme band energies that could cause NaNs later. */
+ if (!(E<1e9f) || celt_isnan(E))
+ {
+ info->valid = 0;
+ RESTORE_STACK;
+ return;
+ }
+#endif
+
+ tonal->E[tonal->E_count][b] = E;
+ frame_noisiness += nE/(1e-15f+E);
+
+ frame_loudness += (float)sqrt(E+1e-10f);
+ logE[b] = (float)log(E+1e-10f);
+ tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
+ tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
+ if (tonal->highE[b] < tonal->lowE[b]+1.f)
+ {
+ tonal->highE[b]+=.5f;
+ tonal->lowE[b]-=.5f;
+ }
+ relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
+
+ L1=L2=0;
+ for (i=0;i<NB_FRAMES;i++)
+ {
+ L1 += (float)sqrt(tonal->E[i][b]);
+ L2 += tonal->E[i][b];
+ }
+
+ stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
+ stationarity *= stationarity;
+ stationarity *= stationarity;
+ frame_stationarity += stationarity;
+ /*band_tonality[b] = tE/(1e-15+E)*/;
+ band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
+#if 0
+ if (b>=NB_TONAL_SKIP_BANDS)
+ {
+ frame_tonality += tweight[b]*band_tonality[b];
+ tw_sum += tweight[b];
+ }
+#else
+ frame_tonality += band_tonality[b];
+ if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
+ frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
+#endif
+ max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
+ slope += band_tonality[b]*(b-8);
+ /*printf("%f %f ", band_tonality[b], stationarity);*/
+ tonal->prev_band_tonality[b] = band_tonality[b];
+ }
+
+ bandwidth_mask = 0;
+ bandwidth = 0;
+ maxE = 0;
+ noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
+#ifdef FIXED_POINT
+ noise_floor *= 1<<(15+SIG_SHIFT);
+#endif
+ noise_floor *= noise_floor;
+ for (b=0;b<NB_TOT_BANDS;b++)
+ {
+ float E=0;
+ int band_start, band_end;
+ /* Keep a margin of 300 Hz for aliasing */
+ band_start = extra_bands[b];
+ band_end = extra_bands[b+1];
+ for (i=band_start;i<band_end;i++)
+ {
+ float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
+ + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
+ E += binE;
+ }
+ maxE = MAX32(maxE, E);
+ tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
+ E = MAX32(E, tonal->meanE[b]);
+ /* Use a simple follower with 13 dB/Bark slope for spreading function */
+ bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
+ /* Consider the band "active" only if all these conditions are met:
+ 1) less than 10 dB below the simple follower
+ 2) less than 90 dB below the peak band (maximal masking possible considering
+ both the ATH and the loudness-dependent slope of the spreading function)
+ 3) above the PCM quantization noise floor
+ */
+ if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
+ bandwidth = b;
+ }
+ if (tonal->count<=2)
+ bandwidth = 20;
+ frame_loudness = 20*(float)log10(frame_loudness);
+ tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
+ tonal->lowECount *= (1-alphaE);
+ if (frame_loudness < tonal->Etracker-30)
+ tonal->lowECount += alphaE;
+
+ for (i=0;i<8;i++)
+ {
+ float sum=0;
+ for (b=0;b<16;b++)
+ sum += dct_table[i*16+b]*logE[b];
+ BFCC[i] = sum;
+ }
+
+ frame_stationarity /= NB_TBANDS;
+ relativeE /= NB_TBANDS;
+ if (tonal->count<10)
+ relativeE = .5;
+ frame_noisiness /= NB_TBANDS;
+#if 1
+ info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
+#else
+ info->activity = .5*(1+frame_noisiness-frame_stationarity);
+#endif
+ frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
+ frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
+ tonal->prev_tonality = frame_tonality;
+
+ slope /= 8*8;
+ info->tonality_slope = slope;
+
+ tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
+ tonal->count++;
+ info->tonality = frame_tonality;
+
+ for (i=0;i<4;i++)
+ features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
+
+ for (i=0;i<4;i++)
+ tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
+
+ for (i=0;i<4;i++)
+ features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
+ for (i=0;i<3;i++)
+ features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
+
+ if (tonal->count > 5)
+ {
+ for (i=0;i<9;i++)
+ tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
+ }
+
+ for (i=0;i<8;i++)
+ {
+ tonal->mem[i+24] = tonal->mem[i+16];
+ tonal->mem[i+16] = tonal->mem[i+8];
+ tonal->mem[i+8] = tonal->mem[i];
+ tonal->mem[i] = BFCC[i];
+ }
+ for (i=0;i<9;i++)
+ features[11+i] = (float)sqrt(tonal->std[i]);
+ features[20] = info->tonality;
+ features[21] = info->activity;
+ features[22] = frame_stationarity;
+ features[23] = info->tonality_slope;
+ features[24] = tonal->lowECount;
+
+#ifndef DISABLE_FLOAT_API
+ mlp_process(&net, features, frame_probs);
+ frame_probs[0] = .5f*(frame_probs[0]+1);
+ /* Curve fitting between the MLP probability and the actual probability */
+ frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
+ /* Probability of active audio (as opposed to silence) */
+ frame_probs[1] = .5f*frame_probs[1]+.5f;
+ /* Consider that silence has a 50-50 probability. */
+ frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
+
+ /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
+ {
+ /* Probability of state transition */
+ float tau;
+ /* Represents independence of the MLP probabilities, where
+ beta=1 means fully independent. */
+ float beta;
+ /* Denormalized probability of speech (p0) and music (p1) after update */
+ float p0, p1;
+ /* Probabilities for "all speech" and "all music" */
+ float s0, m0;
+ /* Probability sum for renormalisation */
+ float psum;
+ /* Instantaneous probability of speech and music, with beta pre-applied. */
+ float speech0;
+ float music0;
+ float p, q;
+
+ /* One transition every 3 minutes of active audio */
+ tau = .00005f*frame_probs[1];
+ /* Adapt beta based on how "unexpected" the new prob is */
+ p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
+ q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
+ beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
+ /* p0 and p1 are the probabilities of speech and music at this frame
+ using only information from previous frame and applying the
+ state transition model */
+ p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
+ p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
+ /* We apply the current probability with exponent beta to work around
+ the fact that the probability estimates aren't independent. */
+ p0 *= (float)pow(1-frame_probs[0], beta);
+ p1 *= (float)pow(frame_probs[0], beta);
+ /* Normalise the probabilities to get the Markov probability of music. */
+ tonal->music_prob = p1/(p0+p1);
+ info->music_prob = tonal->music_prob;
+
+ /* This chunk of code deals with delayed decision. */
+ psum=1e-20f;
+ /* Instantaneous probability of speech and music, with beta pre-applied. */
+ speech0 = (float)pow(1-frame_probs[0], beta);
+ music0 = (float)pow(frame_probs[0], beta);
+ if (tonal->count==1)
+ {
+ tonal->pspeech[0]=.5;
+ tonal->pmusic [0]=.5;
+ }
+ /* Updated probability of having only speech (s0) or only music (m0),
+ before considering the new observation.
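+ (Delayed decision: each pspeech[i]/pmusic[i] entry tracks one candidate
+ transition point inside the DETECT_SIZE window, so the speech/music call
+ can still be revised as later frames arrive.)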
+ */
+ s0 = tonal->pspeech[0] + tonal->pspeech[1];
+ m0 = tonal->pmusic [0] + tonal->pmusic [1];
+ /* Updates s0 and m0 with instantaneous probability. */
+ tonal->pspeech[0] = s0*(1-tau)*speech0;
+ tonal->pmusic [0] = m0*(1-tau)*music0;
+ /* Propagate the transition probabilities */
+ for (i=1;i<DETECT_SIZE-1;i++)
+ {
+ tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
+ tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
+ }
+ /* Probability that the latest frame is speech, when all the previous ones were music. */
+ tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
+ /* Probability that the latest frame is music, when all the previous ones were speech. */
+ tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
+
+ /* Renormalise probabilities to 1 */
+ for (i=0;i<DETECT_SIZE;i++)
+ psum += tonal->pspeech[i] + tonal->pmusic[i];
+ psum = 1.f/psum;
+ for (i=0;i<DETECT_SIZE;i++)
+ {
+ tonal->pspeech[i] *= psum;
+ tonal->pmusic [i] *= psum;
+ }
+ psum = tonal->pmusic[0];
+ for (i=1;i<DETECT_SIZE;i++)
+ psum += tonal->pspeech[i];
+
+ /* Estimate our confidence in the speech/music decisions */
+ if (frame_probs[1]>.75)
+ {
+ if (tonal->music_prob>.9)
+ {
+ float adapt;
+ adapt = 1.f/(++tonal->music_confidence_count);
+ tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
+ tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
+ }
+ if (tonal->music_prob<.1)
+ {
+ float adapt;
+ adapt = 1.f/(++tonal->speech_confidence_count);
+ tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
+ tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
+ }
+ } else {
+ if (tonal->music_confidence_count==0)
+ tonal->music_confidence = .9f;
+ if (tonal->speech_confidence_count==0)
+ tonal->speech_confidence = .1f;
+ }
+ }
+ if (tonal->last_music != (tonal->music_prob>.5f))
+ tonal->last_transition=0;
+ tonal->last_music = tonal->music_prob>.5f;
+#else
+ info->music_prob = 0;
+#endif
+ /*for (i=0;i<25;i++)
+ printf("%f ", features[i]);
+ printf("\n");*/
+
+ info->bandwidth = bandwidth;
+ /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
+ info->noisiness = frame_noisiness;
+ info->valid = 1;
+ RESTORE_STACK;
+}
+
+void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
+ int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
+ int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
+{
+ int offset;
+ int pcm_len;
+
+ if (analysis_pcm != NULL)
+ {
+ /* Avoid overflow/wrap-around of the analysis buffer */
+ analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
+
+ pcm_len = analysis_frame_size - analysis->analysis_offset;
+ offset = analysis->analysis_offset;
+ do {
+ tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
+ offset += 480;
+ pcm_len -= 480;
+ } while (pcm_len>0);
+ analysis->analysis_offset = analysis_frame_size;
+
+ analysis->analysis_offset -= frame_size;
+ }
+
+ analysis_info->valid = 0;
+ tonality_get_info(analysis, analysis_info, frame_size);
+}
diff --git a/thirdparty/opus/analysis.h b/thirdparty/opus/analysis.h
new file mode 100644
index 000000000000..9eae56a52537
--- /dev/null
+++ b/thirdparty/opus/analysis.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2011 Xiph.Org Foundation
+ Written by Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the
following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef ANALYSIS_H
+#define ANALYSIS_H
+
+#include "celt.h"
+#include "opus_private.h"
+
+#define NB_FRAMES 8
+#define NB_TBANDS 18
+#define NB_TOT_BANDS 21
+#define ANALYSIS_BUF_SIZE 720 /* 15 ms at 48 kHz */
+
+#define DETECT_SIZE 200
+
+typedef struct {
+ int arch;
+#define TONALITY_ANALYSIS_RESET_START angle
+ float angle[240];
+ float d_angle[240];
+ float d2_angle[240];
+ opus_val32 inmem[ANALYSIS_BUF_SIZE];
+ int mem_fill; /* number of usable samples in the buffer */
+ float prev_band_tonality[NB_TBANDS];
+ float prev_tonality;
+ float E[NB_FRAMES][NB_TBANDS];
+ float lowE[NB_TBANDS];
+ float highE[NB_TBANDS];
+ float meanE[NB_TOT_BANDS];
+ float mem[32];
+ float cmean[8];
+ float std[9];
+ float music_prob;
+ float Etracker;
+ float lowECount;
+ int E_count;
+ int last_music;
+ int last_transition;
+ int count;
+ float subframe_mem[3];
+ int analysis_offset;
+ /** Probability of having speech for time i to DETECT_SIZE-1 (and music before).
+ pspeech[0] is the probability that all frames in the window are speech. */
+ float pspeech[DETECT_SIZE];
+ /** Probability of having music for time i to DETECT_SIZE-1 (and speech before).
+ pmusic[0] is the probability that all frames in the window are music. */
+ float pmusic[DETECT_SIZE];
+ float speech_confidence;
+ float music_confidence;
+ int speech_confidence_count;
+ int music_confidence_count;
+ int write_pos;
+ int read_pos;
+ int read_subframe;
+ AnalysisInfo info[DETECT_SIZE];
+} TonalityAnalysisState;
+
+/** Initialize a TonalityAnalysisState struct.
+ *
+ * This performs some possibly slow initialization steps which should
+ * not be repeated every analysis step. No allocated memory is retained
+ * by the state struct, so no cleanup call is required.
+ */
+void tonality_analysis_init(TonalityAnalysisState *analysis);
+
+/** Reset a TonalityAnalysisState struct.
+ *
+ * Call this when there's a discontinuity in the data.
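+ * (For example after a seek or a switch to a different input stream; this
+ * keeps stale filter state and speech/music probabilities from being
+ * carried across the gap.)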
+ */ +void tonality_analysis_reset(TonalityAnalysisState *analysis); + +void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len); + +void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm, + int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs, + int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info); + +#endif diff --git a/thirdparty/opus/celt/_kiss_fft_guts.h b/thirdparty/opus/celt/_kiss_fft_guts.h new file mode 100644 index 000000000000..5e3d58fd6641 --- /dev/null +++ b/thirdparty/opus/celt/_kiss_fft_guts.h @@ -0,0 +1,182 @@ +/*Copyright (c) 2003-2004, Mark Borgerding + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE.*/ + +#ifndef KISS_FFT_GUTS_H +#define KISS_FFT_GUTS_H + +#define MIN(a,b) ((a)<(b) ? (a):(b)) +#define MAX(a,b) ((a)>(b) ? (a):(b)) + +/* kiss_fft.h + defines kiss_fft_scalar as either short or a float type + and defines + typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */ +#include "kiss_fft.h" + +/* + Explanation of macros dealing with complex math: + + C_MUL(m,a,b) : m = a*b + C_FIXDIV( c , div ) : if a fixed point impl., c /= div. 
noop otherwise + C_SUB( res, a,b) : res = a - b + C_SUBFROM( res , a) : res -= a + C_ADDTO( res , a) : res += a + * */ +#ifdef FIXED_POINT +#include "arch.h" + + +#define SAMP_MAX 2147483647 +#define TWID_MAX 32767 +#define TRIG_UPSCALE 1 + +#define SAMP_MIN -SAMP_MAX + + +# define S_MUL(a,b) MULT16_32_Q15(b, a) + +# define C_MUL(m,a,b) \ + do{ (m).r = SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \ + (m).i = ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0) + +# define C_MULC(m,a,b) \ + do{ (m).r = ADD32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \ + (m).i = SUB32(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0) + +# define C_MULBYSCALAR( c, s ) \ + do{ (c).r = S_MUL( (c).r , s ) ;\ + (c).i = S_MUL( (c).i , s ) ; }while(0) + +# define DIVSCALAR(x,k) \ + (x) = S_MUL( x, (TWID_MAX-((k)>>1))/(k)+1 ) + +# define C_FIXDIV(c,div) \ + do { DIVSCALAR( (c).r , div); \ + DIVSCALAR( (c).i , div); }while (0) + +#define C_ADD( res, a,b)\ + do {(res).r=ADD32((a).r,(b).r); (res).i=ADD32((a).i,(b).i); \ + }while(0) +#define C_SUB( res, a,b)\ + do {(res).r=SUB32((a).r,(b).r); (res).i=SUB32((a).i,(b).i); \ + }while(0) +#define C_ADDTO( res , a)\ + do {(res).r = ADD32((res).r, (a).r); (res).i = ADD32((res).i,(a).i);\ + }while(0) + +#define C_SUBFROM( res , a)\ + do {(res).r = ADD32((res).r,(a).r); (res).i = SUB32((res).i,(a).i); \ + }while(0) + +#if defined(OPUS_ARM_INLINE_ASM) +#include "arm/kiss_fft_armv4.h" +#endif + +#if defined(OPUS_ARM_INLINE_EDSP) +#include "arm/kiss_fft_armv5e.h" +#endif +#if defined(MIPSr1_ASM) +#include "mips/kiss_fft_mipsr1.h" +#endif + +#else /* not FIXED_POINT*/ + +# define S_MUL(a,b) ( (a)*(b) ) +#define C_MUL(m,a,b) \ + do{ (m).r = (a).r*(b).r - (a).i*(b).i;\ + (m).i = (a).r*(b).i + (a).i*(b).r; }while(0) +#define C_MULC(m,a,b) \ + do{ (m).r = (a).r*(b).r + (a).i*(b).i;\ + (m).i = (a).i*(b).r - (a).r*(b).i; }while(0) + +#define C_MUL4(m,a,b) C_MUL(m,a,b) + +# define C_FIXDIV(c,div) /* NOOP */ +# define C_MULBYSCALAR( c, s ) \ + do{ (c).r *= (s);\ + (c).i *= (s); }while(0) +#endif + +#ifndef CHECK_OVERFLOW_OP +# define CHECK_OVERFLOW_OP(a,op,b) /* noop */ +#endif + +#ifndef C_ADD +#define C_ADD( res, a,b)\ + do { \ + CHECK_OVERFLOW_OP((a).r,+,(b).r)\ + CHECK_OVERFLOW_OP((a).i,+,(b).i)\ + (res).r=(a).r+(b).r; (res).i=(a).i+(b).i; \ + }while(0) +#define C_SUB( res, a,b)\ + do { \ + CHECK_OVERFLOW_OP((a).r,-,(b).r)\ + CHECK_OVERFLOW_OP((a).i,-,(b).i)\ + (res).r=(a).r-(b).r; (res).i=(a).i-(b).i; \ + }while(0) +#define C_ADDTO( res , a)\ + do { \ + CHECK_OVERFLOW_OP((res).r,+,(a).r)\ + CHECK_OVERFLOW_OP((res).i,+,(a).i)\ + (res).r += (a).r; (res).i += (a).i;\ + }while(0) + +#define C_SUBFROM( res , a)\ + do {\ + CHECK_OVERFLOW_OP((res).r,-,(a).r)\ + CHECK_OVERFLOW_OP((res).i,-,(a).i)\ + (res).r -= (a).r; (res).i -= (a).i; \ + }while(0) +#endif /* C_ADD defined */ + +#ifdef FIXED_POINT +/*# define KISS_FFT_COS(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * cos (phase)))) +# define KISS_FFT_SIN(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * sin (phase))))*/ +# define KISS_FFT_COS(phase) floor(.5+TWID_MAX*cos (phase)) +# define KISS_FFT_SIN(phase) floor(.5+TWID_MAX*sin (phase)) +# define HALF_OF(x) ((x)>>1) +#elif defined(USE_SIMD) +# define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) ) +# define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) ) +# define HALF_OF(x) ((x)*_mm_set1_ps(.5f)) +#else +# define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase) +# define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase) +# define HALF_OF(x) ((x)*.5f) +#endif + 
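+/* Illustrative sketch, not part of the upstream header: in the float build
+ * the macros above reduce to ordinary complex arithmetic, so a twiddle
+ * multiply inside an FFT butterfly could be spelled (k, N and x are
+ * hypothetical here):
+ *
+ * kiss_fft_cpx t, tw;
+ * kf_cexp(&tw, -2.0*M_PI*k/N); // tw = e^(-2*pi*i*k/N), macro defined below
+ * C_MUL(t, x, tw); // t = x * tw
+ * C_FIXDIV(t, 2); // scales in fixed point, no-op in float
+ *
+ * Routing all complex math through these macros is what lets the same FFT
+ * source build in both the fixed-point and float configurations. */
+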
+#define kf_cexp(x,phase) \ + do{ \ + (x)->r = KISS_FFT_COS(phase);\ + (x)->i = KISS_FFT_SIN(phase);\ + }while(0) + +#define kf_cexp2(x,phase) \ + do{ \ + (x)->r = TRIG_UPSCALE*celt_cos_norm((phase));\ + (x)->i = TRIG_UPSCALE*celt_cos_norm((phase)-32768);\ +}while(0) + +#endif /* KISS_FFT_GUTS_H */ diff --git a/thirdparty/opus/celt/arch.h b/thirdparty/opus/celt/arch.h new file mode 100644 index 000000000000..8ceab5fe10e9 --- /dev/null +++ b/thirdparty/opus/celt/arch.h @@ -0,0 +1,252 @@ +/* Copyright (c) 2003-2008 Jean-Marc Valin + Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/** + @file arch.h + @brief Various architecture definitions for CELT +*/ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef ARCH_H +#define ARCH_H + +#include "opus_types.h" +#include "opus_defines.h" + +# if !defined(__GNUC_PREREQ) +# if defined(__GNUC__)&&defined(__GNUC_MINOR__) +# define __GNUC_PREREQ(_maj,_min) \ + ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min)) +# else +# define __GNUC_PREREQ(_maj,_min) 0 +# endif +# endif + +#define CELT_SIG_SCALE 32768.f + +#define celt_fatal(str) _celt_fatal(str, __FILE__, __LINE__); +#ifdef ENABLE_ASSERTIONS +#include <stdio.h> +#include <stdlib.h> +#ifdef __GNUC__ +__attribute__((noreturn)) +#endif +static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line) +{ + fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str); + abort(); +} +#define celt_assert(cond) {if (!(cond)) {celt_fatal("assertion failed: " #cond);}} +#define celt_assert2(cond, message) {if (!(cond)) {celt_fatal("assertion failed: " #cond "\n" message);}} +#else +#define celt_assert(cond) +#define celt_assert2(cond, message) +#endif + +#define IMUL32(a,b) ((a)*(b)) + +#define MIN16(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 16-bit value. */ +#define MAX16(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 16-bit value. */ +#define MIN32(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 32-bit value. */ +#define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */ +#define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */ +#define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. 
*/ +#define UADD32(a,b) ((a)+(b)) +#define USUB32(a,b) ((a)-(b)) + +/* Set this if opus_int64 is a native type of the CPU. */ +/* Assume that all LP64 architectures have fast 64-bit types; also x86_64 + (which can be ILP32 for x32) and Win64 (which is LLP64). */ +#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64) +#define OPUS_FAST_INT64 1 +#else +#define OPUS_FAST_INT64 0 +#endif + +#define PRINT_MIPS(file) + +#ifdef FIXED_POINT + +typedef opus_int16 opus_val16; +typedef opus_int32 opus_val32; + +typedef opus_val32 celt_sig; +typedef opus_val16 celt_norm; +typedef opus_val32 celt_ener; + +#define Q15ONE 32767 + +#define SIG_SHIFT 12 + +#define NORM_SCALING 16384 + +#define DB_SHIFT 10 + +#define EPSILON 1 +#define VERY_SMALL 0 +#define VERY_LARGE16 ((opus_val16)32767) +#define Q15_ONE ((opus_val16)32767) + +#define SCALEIN(a) (a) +#define SCALEOUT(a) (a) + +#define ABS16(x) ((x) < 0 ? (-(x)) : (x)) +#define ABS32(x) ((x) < 0 ? (-(x)) : (x)) + +static OPUS_INLINE opus_int16 SAT16(opus_int32 x) { + return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x; +} + +#ifdef FIXED_DEBUG +#include "fixed_debug.h" +#else + +#include "fixed_generic.h" + +#ifdef OPUS_ARM_PRESUME_AARCH64_NEON_INTR +#include "arm/fixed_arm64.h" +#elif OPUS_ARM_INLINE_EDSP +#include "arm/fixed_armv5e.h" +#elif defined (OPUS_ARM_INLINE_ASM) +#include "arm/fixed_armv4.h" +#elif defined (BFIN_ASM) +#include "fixed_bfin.h" +#elif defined (TI_C5X_ASM) +#include "fixed_c5x.h" +#elif defined (TI_C6X_ASM) +#include "fixed_c6x.h" +#endif + +#endif + +#else /* FIXED_POINT */ + +typedef float opus_val16; +typedef float opus_val32; + +typedef float celt_sig; +typedef float celt_norm; +typedef float celt_ener; + +#ifdef FLOAT_APPROX +/* This code should reliably detect NaN/inf even when -ffast-math is used. + Assumes IEEE 754 format. */ +static OPUS_INLINE int celt_isnan(float x) +{ + union {float f; opus_uint32 i;} in; + in.f = x; + return ((in.i>>23)&0xFF)==0xFF && (in.i&0x007FFFFF)!=0; +} +#else +#ifdef __FAST_MATH__ +#error Cannot build libopus with -ffast-math unless FLOAT_APPROX is defined. This could result in crashes on extreme (e.g. NaN) input +#endif +#define celt_isnan(x) ((x)!=(x)) +#endif + +#define Q15ONE 1.0f + +#define NORM_SCALING 1.f + +#define EPSILON 1e-15f +#define VERY_SMALL 1e-30f +#define VERY_LARGE16 1e15f +#define Q15_ONE ((opus_val16)1.f) + +/* This appears to be the same speed as C99's fabsf() but it's more portable. 
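+ (fabsf() is only guaranteed from C99 on, while fabs() is plain C89; the cast back to float keeps the macro usable as an opus_val16/opus_val32 expression.)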
*/ +#define ABS16(x) ((float)fabs(x)) +#define ABS32(x) ((float)fabs(x)) + +#define QCONST16(x,bits) (x) +#define QCONST32(x,bits) (x) + +#define NEG16(x) (-(x)) +#define NEG32(x) (-(x)) +#define EXTRACT16(x) (x) +#define EXTEND32(x) (x) +#define SHR16(a,shift) (a) +#define SHL16(a,shift) (a) +#define SHR32(a,shift) (a) +#define SHL32(a,shift) (a) +#define PSHR32(a,shift) (a) +#define VSHR32(a,shift) (a) + +#define PSHR(a,shift) (a) +#define SHR(a,shift) (a) +#define SHL(a,shift) (a) +#define SATURATE(x,a) (x) +#define SATURATE16(x) (x) + +#define ROUND16(a,shift) (a) +#define HALF16(x) (.5f*(x)) +#define HALF32(x) (.5f*(x)) + +#define ADD16(a,b) ((a)+(b)) +#define SUB16(a,b) ((a)-(b)) +#define ADD32(a,b) ((a)+(b)) +#define SUB32(a,b) ((a)-(b)) +#define MULT16_16_16(a,b) ((a)*(b)) +#define MULT16_16(a,b) ((opus_val32)(a)*(opus_val32)(b)) +#define MAC16_16(c,a,b) ((c)+(opus_val32)(a)*(opus_val32)(b)) + +#define MULT16_32_Q15(a,b) ((a)*(b)) +#define MULT16_32_Q16(a,b) ((a)*(b)) + +#define MULT32_32_Q31(a,b) ((a)*(b)) + +#define MAC16_32_Q15(c,a,b) ((c)+(a)*(b)) +#define MAC16_32_Q16(c,a,b) ((c)+(a)*(b)) + +#define MULT16_16_Q11_32(a,b) ((a)*(b)) +#define MULT16_16_Q11(a,b) ((a)*(b)) +#define MULT16_16_Q13(a,b) ((a)*(b)) +#define MULT16_16_Q14(a,b) ((a)*(b)) +#define MULT16_16_Q15(a,b) ((a)*(b)) +#define MULT16_16_P15(a,b) ((a)*(b)) +#define MULT16_16_P13(a,b) ((a)*(b)) +#define MULT16_16_P14(a,b) ((a)*(b)) +#define MULT16_32_P16(a,b) ((a)*(b)) + +#define DIV32_16(a,b) (((opus_val32)(a))/(opus_val16)(b)) +#define DIV32(a,b) (((opus_val32)(a))/(opus_val32)(b)) + +#define SCALEIN(a) ((a)*CELT_SIG_SCALE) +#define SCALEOUT(a) ((a)*(1/CELT_SIG_SCALE)) + +#define SIG2WORD16(x) (x) + +#endif /* !FIXED_POINT */ + +#ifndef GLOBAL_STACK_SIZE +#ifdef FIXED_POINT +#define GLOBAL_STACK_SIZE 100000 +#else +#define GLOBAL_STACK_SIZE 100000 +#endif +#endif + +#endif /* ARCH_H */ diff --git a/thirdparty/opus/celt/bands.c b/thirdparty/opus/celt/bands.c new file mode 100644 index 000000000000..87eaa6c0311d --- /dev/null +++ b/thirdparty/opus/celt/bands.c @@ -0,0 +1,1529 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2008-2009 Gregory Maxwell + Written by Jean-Marc Valin and Gregory Maxwell */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include "bands.h" +#include "modes.h" +#include "vq.h" +#include "cwrs.h" +#include "stack_alloc.h" +#include "os_support.h" +#include "mathops.h" +#include "rate.h" +#include "quant_bands.h" +#include "pitch.h" + +int hysteresis_decision(opus_val16 val, const opus_val16 *thresholds, const opus_val16 *hysteresis, int N, int prev) +{ + int i; + for (i=0;iprev && val < thresholds[prev]+hysteresis[prev]) + i=prev; + if (i thresholds[prev-1]-hysteresis[prev-1]) + i=prev; + return i; +} + +opus_uint32 celt_lcg_rand(opus_uint32 seed) +{ + return 1664525 * seed + 1013904223; +} + +/* This is a cos() approximation designed to be bit-exact on any platform. Bit exactness + with this approximation is important because it has an impact on the bit allocation */ +static opus_int16 bitexact_cos(opus_int16 x) +{ + opus_int32 tmp; + opus_int16 x2; + tmp = (4096+((opus_int32)(x)*(x)))>>13; + celt_assert(tmp<=32767); + x2 = tmp; + x2 = (32767-x2) + FRAC_MUL16(x2, (-7651 + FRAC_MUL16(x2, (8277 + FRAC_MUL16(-626, x2))))); + celt_assert(x2<=32766); + return 1+x2; +} + +static int bitexact_log2tan(int isin,int icos) +{ + int lc; + int ls; + lc=EC_ILOG(icos); + ls=EC_ILOG(isin); + icos<<=15-lc; + isin<<=15-ls; + return (ls-lc)*(1<<11) + +FRAC_MUL16(isin, FRAC_MUL16(isin, -2597) + 7932) + -FRAC_MUL16(icos, FRAC_MUL16(icos, -2597) + 7932); +} + +#ifdef FIXED_POINT +/* Compute the amplitude (sqrt energy) in each of the bands */ +void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM) +{ + int i, c, N; + const opus_int16 *eBands = m->eBands; + N = m->shortMdctSize< 0) + { + int shift = celt_ilog2(maxval) - 14 + (((m->logN[i]>>BITRES)+LM+1)>>1); + j=eBands[i]<0) + { + do { + sum = MAC16_16(sum, EXTRACT16(SHR32(X[j+c*N],shift)), + EXTRACT16(SHR32(X[j+c*N],shift))); + } while (++jnbEBands] = EPSILON+VSHR32(EXTEND32(celt_sqrt(sum)),-shift); + } else { + bandE[i+c*m->nbEBands] = EPSILON; + } + /*printf ("%f ", bandE[i+c*m->nbEBands]);*/ + } + } while (++ceBands; + N = M*m->shortMdctSize; + c=0; do { + i=0; do { + opus_val16 g; + int j,shift; + opus_val16 E; + shift = celt_zlog2(bandE[i+c*m->nbEBands])-13; + E = VSHR32(bandE[i+c*m->nbEBands], shift); + g = EXTRACT16(celt_rcp(SHL32(E,3))); + j=M*eBands[i]; do { + X[j+c*N] = MULT16_16_Q15(VSHR32(freq[j+c*N],shift-1),g); + } while (++jeBands; + N = m->shortMdctSize<nbEBands] = celt_sqrt(sum); + /*printf ("%f ", bandE[i+c*m->nbEBands]);*/ + } + } while (++ceBands; + N = M*m->shortMdctSize; + c=0; do { + for (i=0;inbEBands]); + for (j=M*eBands[i];jeBands; + N = M*m->shortMdctSize; + bound = M*eBands[end]; + if (downsample!=1) + bound = IMIN(bound, N/downsample); + if (silence) + { + bound = 0; + start = end = 0; + } + f = freq; + x = X+M*eBands[start]; + for (i=0;i>DB_SHIFT); + if (shift>31) + { + shift=0; + g=0; + } else { + /* Handle the fractional part. 
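+ lg is a log2 band energy in Q(DB_SHIFT); its integer part was turned into the shift above, and celt_exp2_frac() maps the remaining DB_SHIFT fractional bits back to a linear gain.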
*/ + g = celt_exp2_frac(lg&((1<eBands[i+1]-m->eBands[i]; + /* depth in 1/8 bits */ + celt_assert(pulses[i]>=0); + depth = celt_udiv(1+pulses[i], (m->eBands[i+1]-m->eBands[i]))>>LM; + +#ifdef FIXED_POINT + thresh32 = SHR32(celt_exp2(-SHL16(depth, 10-BITRES)),1); + thresh = MULT16_32_Q15(QCONST16(0.5f, 15), MIN32(32767,thresh32)); + { + opus_val32 t; + t = N0<>1; + t = SHL32(t, (7-shift)<<1); + sqrt_1 = celt_rsqrt_norm(t); + } +#else + thresh = .5f*celt_exp2(-.125f*depth); + sqrt_1 = celt_rsqrt(N0<nbEBands+i]; + prev2 = prev2logE[c*m->nbEBands+i]; + if (C==1) + { + prev1 = MAX16(prev1,prev1logE[m->nbEBands+i]); + prev2 = MAX16(prev2,prev2logE[m->nbEBands+i]); + } + Ediff = EXTEND32(logE[c*m->nbEBands+i])-EXTEND32(MIN16(prev1,prev2)); + Ediff = MAX32(0, Ediff); + +#ifdef FIXED_POINT + if (Ediff < 16384) + { + opus_val32 r32 = SHR32(celt_exp2(-EXTRACT16(Ediff)),1); + r = 2*MIN16(16383,r32); + } else { + r = 0; + } + if (LM==3) + r = MULT16_16_Q14(23170, MIN32(23169, r)); + r = SHR16(MIN16(thresh, r),1); + r = SHR32(MULT16_16_Q15(sqrt_1, r),shift); +#else + /* r needs to be multiplied by 2 or 2*sqrt(2) depending on LM because + short blocks don't have the same energy as long */ + r = 2.f*celt_exp2(-Ediff); + if (LM==3) + r *= 1.41421356f; + r = MIN16(thresh, r); + r = r*sqrt_1; +#endif + X = X_+c*size+(m->eBands[i]<nbEBands]))-13; +#endif + left = VSHR32(bandE[i],shift); + right = VSHR32(bandE[i+m->nbEBands],shift); + norm = EPSILON + celt_sqrt(EPSILON+MULT16_16(left,left)+MULT16_16(right,right)); + a1 = DIV32_16(SHL32(EXTEND32(left),14),norm); + a2 = DIV32_16(SHL32(EXTEND32(right),14),norm); + for (j=0;j>1; + kr = celt_ilog2(Er)>>1; +#endif + t = VSHR32(El, (kl-7)<<1); + lgain = celt_rsqrt_norm(t); + t = VSHR32(Er, (kr-7)<<1); + rgain = celt_rsqrt_norm(t); + +#ifdef FIXED_POINT + if (kl < 7) + kl = 7; + if (kr < 7) + kr = 7; +#endif + + for (j=0;jeBands; + int decision; + int hf_sum=0; + + celt_assert(end>0); + + N0 = M*m->shortMdctSize; + + if (M*(eBands[end]-eBands[end-1]) <= 8) + return SPREAD_NONE; + c=0; do { + for (i=0;im->nbEBands-4) + hf_sum += celt_udiv(32*(tcount[1]+tcount[0]), N); + tmp = (2*tcount[2] >= N) + (2*tcount[1] >= N) + (2*tcount[0] >= N); + sum += tmp*256; + nbBands++; + } + } while (++cnbEBands+end)); + *hf_average = (*hf_average+hf_sum)>>1; + hf_sum = *hf_average; + if (*tapset_decision==2) + hf_sum += 4; + else if (*tapset_decision==0) + hf_sum -= 4; + if (hf_sum > 22) + *tapset_decision=2; + else if (hf_sum > 18) + *tapset_decision=1; + else + *tapset_decision=0; + } + /*printf("%d %d %d\n", hf_sum, *hf_average, *tapset_decision);*/ + celt_assert(nbBands>0); /* end has to be non-zero */ + celt_assert(sum>=0); + sum = celt_udiv(sum, nbBands); + /* Recursive averaging */ + sum = (sum+*average)>>1; + *average = sum; + /* Hysteresis */ + sum = (3*sum + (((3-last_decision)<<7) + 64) + 2)>>2; + if (sum < 80) + { + decision = SPREAD_AGGRESSIVE; + } else if (sum < 256) + { + decision = SPREAD_NORMAL; + } else if (sum < 384) + { + decision = SPREAD_LIGHT; + } else { + decision = SPREAD_NONE; + } +#ifdef FUZZING + decision = rand()&0x3; + *tapset_decision=rand()%3; +#endif + return decision; +} + +/* Indexing table for converting from natural Hadamard to ordery Hadamard + This is essentially a bit-reversed Gray, on top of which we've added + an inversion of the order because we want the DC at the end rather than + the beginning. 
The lines are for N=2, 4, 8, 16 */ +static const int ordery_table[] = { + 1, 0, + 3, 0, 2, 1, + 7, 0, 4, 3, 6, 1, 5, 2, + 15, 0, 8, 7, 12, 3, 11, 4, 14, 1, 9, 6, 13, 2, 10, 5, +}; + +static void deinterleave_hadamard(celt_norm *X, int N0, int stride, int hadamard) +{ + int i,j; + VARDECL(celt_norm, tmp); + int N; + SAVE_STACK; + N = N0*stride; + ALLOC(tmp, N, celt_norm); + celt_assert(stride>0); + if (hadamard) + { + const int *ordery = ordery_table+stride-2; + for (i=0;i>= 1; + for (i=0;i>1)) { + qn = 1; + } else { + qn = exp2_table8[qb&0x7]>>(14-(qb>>BITRES)); + qn = (qn+1)>>1<<1; + } + celt_assert(qn <= 256); + return qn; +} + +struct band_ctx { + int encode; + const CELTMode *m; + int i; + int intensity; + int spread; + int tf_change; + ec_ctx *ec; + opus_int32 remaining_bits; + const celt_ener *bandE; + opus_uint32 seed; + int arch; +}; + +struct split_ctx { + int inv; + int imid; + int iside; + int delta; + int itheta; + int qalloc; +}; + +static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx, + celt_norm *X, celt_norm *Y, int N, int *b, int B, int B0, + int LM, + int stereo, int *fill) +{ + int qn; + int itheta=0; + int delta; + int imid, iside; + int qalloc; + int pulse_cap; + int offset; + opus_int32 tell; + int inv=0; + int encode; + const CELTMode *m; + int i; + int intensity; + ec_ctx *ec; + const celt_ener *bandE; + + encode = ctx->encode; + m = ctx->m; + i = ctx->i; + intensity = ctx->intensity; + ec = ctx->ec; + bandE = ctx->bandE; + + /* Decide on the resolution to give to the split parameter theta */ + pulse_cap = m->logN[i]+LM*(1<>1) - (stereo&&N==2 ? QTHETA_OFFSET_TWOPHASE : QTHETA_OFFSET); + qn = compute_qn(N, *b, offset, pulse_cap, stereo); + if (stereo && i>=intensity) + qn = 1; + if (encode) + { + /* theta is the atan() of the ratio between the (normalized) + side and mid. With just that parameter, we can re-scale both + mid and side because we know that 1) they have unit norm and + 2) they are orthogonal. */ + itheta = stereo_itheta(X, Y, stereo, N, ctx->arch); + } + tell = ec_tell_frac(ec); + if (qn!=1) + { + if (encode) + itheta = (itheta*(opus_int32)qn+8192)>>14; + + /* Entropy coding of the angle. We use a uniform pdf for the + time split, a step for stereo, and a triangular one for the rest. */ + if (stereo && N>2) + { + int p0 = 3; + int x = itheta; + int x0 = qn/2; + int ft = p0*(x0+1) + x0; + /* Use a probability of p0 up to itheta=8192 and then use 1 after */ + if (encode) + { + ec_encode(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft); + } else { + int fs; + fs=ec_decode(ec,ft); + if (fs<(x0+1)*p0) + x=fs/p0; + else + x=x0+1+(fs-(x0+1)*p0); + ec_dec_update(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft); + itheta = x; + } + } else if (B0>1 || stereo) { + /* Uniform pdf */ + if (encode) + ec_enc_uint(ec, itheta, qn+1); + else + itheta = ec_dec_uint(ec, qn+1); + } else { + int fs=1, ft; + ft = ((qn>>1)+1)*((qn>>1)+1); + if (encode) + { + int fl; + + fs = itheta <= (qn>>1) ? itheta + 1 : qn + 1 - itheta; + fl = itheta <= (qn>>1) ? 
itheta*(itheta + 1)>>1 : + ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1); + + ec_encode(ec, fl, fl+fs, ft); + } else { + /* Triangular pdf */ + int fl=0; + int fm; + fm = ec_decode(ec, ft); + + if (fm < ((qn>>1)*((qn>>1) + 1)>>1)) + { + itheta = (isqrt32(8*(opus_uint32)fm + 1) - 1)>>1; + fs = itheta + 1; + fl = itheta*(itheta + 1)>>1; + } + else + { + itheta = (2*(qn + 1) + - isqrt32(8*(opus_uint32)(ft - fm - 1) + 1))>>1; + fs = qn + 1 - itheta; + fl = ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1); + } + + ec_dec_update(ec, fl, fl+fs, ft); + } + } + celt_assert(itheta>=0); + itheta = celt_udiv((opus_int32)itheta*16384, qn); + if (encode && stereo) + { + if (itheta==0) + intensity_stereo(m, X, Y, bandE, i, N); + else + stereo_split(X, Y, N); + } + /* NOTE: Renormalising X and Y *may* help fixed-point a bit at very high rate. + Let's do that at higher complexity */ + } else if (stereo) { + if (encode) + { + inv = itheta > 8192; + if (inv) + { + int j; + for (j=0;j2<remaining_bits > 2<inv = inv; + sctx->imid = imid; + sctx->iside = iside; + sctx->delta = delta; + sctx->itheta = itheta; + sctx->qalloc = qalloc; +} +static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b, + celt_norm *lowband_out) +{ +#ifdef RESYNTH + int resynth = 1; +#else + int resynth = !ctx->encode; +#endif + int c; + int stereo; + celt_norm *x = X; + int encode; + ec_ctx *ec; + + encode = ctx->encode; + ec = ctx->ec; + + stereo = Y != NULL; + c=0; do { + int sign=0; + if (ctx->remaining_bits>=1<remaining_bits -= 1<encode; +#endif + celt_norm *Y=NULL; + int encode; + const CELTMode *m; + int i; + int spread; + ec_ctx *ec; + + encode = ctx->encode; + m = ctx->m; + i = ctx->i; + spread = ctx->spread; + ec = ctx->ec; + + /* If we need 1.5 more bit than we can produce, split the band in two. 
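+ (Bit counts here are in 1/8-bit units, BITRES, so the +12 in the test below is exactly those 1.5 bits; cache[cache[0]] is the cost of the largest available codebook for this band.)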
*/ + cache = m->cache.bits + m->cache.index[(LM+1)*m->nbEBands+i]; + if (LM != -1 && b > cache[cache[0]]+12 && N>2) + { + int mbits, sbits, delta; + int itheta; + int qalloc; + struct split_ctx sctx; + celt_norm *next_lowband2=NULL; + opus_int32 rebalance; + + N >>= 1; + Y = X+N; + LM -= 1; + if (B==1) + fill = (fill&1)|(fill<<1); + B = (B+1)>>1; + + compute_theta(ctx, &sctx, X, Y, N, &b, B, B0, + LM, 0, &fill); + imid = sctx.imid; + iside = sctx.iside; + delta = sctx.delta; + itheta = sctx.itheta; + qalloc = sctx.qalloc; +#ifdef FIXED_POINT + mid = imid; + side = iside; +#else + mid = (1.f/32768)*imid; + side = (1.f/32768)*iside; +#endif + + /* Give more bits to low-energy MDCTs than they would otherwise deserve */ + if (B0>1 && (itheta&0x3fff)) + { + if (itheta > 8192) + /* Rough approximation for pre-echo masking */ + delta -= delta>>(4-LM); + else + /* Corresponds to a forward-masking slope of 1.5 dB per 10 ms */ + delta = IMIN(0, delta + (N<>(5-LM))); + } + mbits = IMAX(0, IMIN(b, (b-delta)/2)); + sbits = b-mbits; + ctx->remaining_bits -= qalloc; + + if (lowband) + next_lowband2 = lowband+N; /* >32-bit split case */ + + rebalance = ctx->remaining_bits; + if (mbits >= sbits) + { + cm = quant_partition(ctx, X, N, mbits, B, + lowband, LM, + MULT16_16_P15(gain,mid), fill); + rebalance = mbits - (rebalance-ctx->remaining_bits); + if (rebalance > 3<>B)<<(B0>>1); + } else { + cm = quant_partition(ctx, Y, N, sbits, B, + next_lowband2, LM, + MULT16_16_P15(gain,side), fill>>B)<<(B0>>1); + rebalance = sbits - (rebalance-ctx->remaining_bits); + if (rebalance > 3<remaining_bits -= curr_bits; + + /* Ensures we can never bust the budget */ + while (ctx->remaining_bits < 0 && q > 0) + { + ctx->remaining_bits += curr_bits; + q--; + curr_bits = pulses2bits(m, i, LM, q); + ctx->remaining_bits -= curr_bits; + } + + if (q!=0) + { + int K = get_pulses(q); + + /* Finally do the actual quantization */ + if (encode) + { + cm = alg_quant(X, N, K, spread, B, ec +#ifdef RESYNTH + , gain +#endif + ); + } else { + cm = alg_unquant(X, N, K, spread, B, ec, gain); + } + } else { + /* If there's no pulse, fill the band anyway */ + int j; + if (resynth) + { + unsigned cm_mask; + /* B can be as large as 16, so this shift might overflow an int on a + 16-bit platform; use a long to get defined behavior.*/ + cm_mask = (unsigned)(1UL<seed = celt_lcg_rand(ctx->seed); + X[j] = (celt_norm)((opus_int32)ctx->seed>>20); + } + cm = cm_mask; + } else { + /* Folded spectrum */ + for (j=0;jseed = celt_lcg_rand(ctx->seed); + /* About 48 dB below the "normal" folding level */ + tmp = QCONST16(1.0f/256, 10); + tmp = (ctx->seed)&0x8000 ? tmp : -tmp; + X[j] = lowband[j]+tmp; + } + cm = fill; + } + renormalise_vector(X, N, gain, ctx->arch); + } + } + } + } + + return cm; +} + + +/* This function is responsible for encoding and decoding a band for the mono case. 
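+ It wraps the recursive quant_partition() call with the TF-resolution bookkeeping: Haar recombination or splitting to match tf_change, plus Hadamard reordering into time order for short blocks, undoing all of it on the way out.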
*/ +static unsigned quant_band(struct band_ctx *ctx, celt_norm *X, + int N, int b, int B, celt_norm *lowband, + int LM, celt_norm *lowband_out, + opus_val16 gain, celt_norm *lowband_scratch, int fill) +{ + int N0=N; + int N_B=N; + int N_B0; + int B0=B; + int time_divide=0; + int recombine=0; + int longBlocks; + unsigned cm=0; +#ifdef RESYNTH + int resynth = 1; +#else + int resynth = !ctx->encode; +#endif + int k; + int encode; + int tf_change; + + encode = ctx->encode; + tf_change = ctx->tf_change; + + longBlocks = B0==1; + + N_B = celt_udiv(N_B, B); + + /* Special case for one sample */ + if (N==1) + { + return quant_band_n1(ctx, X, NULL, b, lowband_out); + } + + if (tf_change>0) + recombine = tf_change; + /* Band recombining to increase frequency resolution */ + + if (lowband_scratch && lowband && (recombine || ((N_B&1) == 0 && tf_change<0) || B0>1)) + { + OPUS_COPY(lowband_scratch, lowband, N); + lowband = lowband_scratch; + } + + for (k=0;k>k, 1<>k, 1<>4]<<2; + } + B>>=recombine; + N_B<<=recombine; + + /* Increasing the time resolution */ + while ((N_B&1) == 0 && tf_change<0) + { + if (encode) + haar1(X, N_B, B); + if (lowband) + haar1(lowband, N_B, B); + fill |= fill<>= 1; + time_divide++; + tf_change++; + } + B0=B; + N_B0 = N_B; + + /* Reorganize the samples in time order instead of frequency order */ + if (B0>1) + { + if (encode) + deinterleave_hadamard(X, N_B>>recombine, B0<>recombine, B0<1) + interleave_hadamard(X, N_B>>recombine, B0<>= 1; + N_B <<= 1; + cm |= cm>>B; + haar1(X, N_B, B); + } + + for (k=0;k>k, 1<encode; +#endif + int mbits, sbits, delta; + int itheta; + int qalloc; + struct split_ctx sctx; + int orig_fill; + int encode; + ec_ctx *ec; + + encode = ctx->encode; + ec = ctx->ec; + + /* Special case for one sample */ + if (N==1) + { + return quant_band_n1(ctx, X, Y, b, lowband_out); + } + + orig_fill = fill; + + compute_theta(ctx, &sctx, X, Y, N, &b, B, B, + LM, 1, &fill); + inv = sctx.inv; + imid = sctx.imid; + iside = sctx.iside; + delta = sctx.delta; + itheta = sctx.itheta; + qalloc = sctx.qalloc; +#ifdef FIXED_POINT + mid = imid; + side = iside; +#else + mid = (1.f/32768)*imid; + side = (1.f/32768)*iside; +#endif + + /* This is a special case for N=2 that only works for stereo and takes + advantage of the fact that mid and side are orthogonal to encode + the side with just one bit. */ + if (N==2) + { + int c; + int sign=0; + celt_norm *x2, *y2; + mbits = b; + sbits = 0; + /* Only need one bit for the side. */ + if (itheta != 0 && itheta != 16384) + sbits = 1< 8192; + ctx->remaining_bits -= qalloc+sbits; + + x2 = c ? Y : X; + y2 = c ? X : Y; + if (sbits) + { + if (encode) + { + /* Here we only need to encode a sign for the side. */ + sign = x2[0]*y2[1] - x2[1]*y2[0] < 0; + ec_enc_bits(ec, sign, 1); + } else { + sign = ec_dec_bits(ec, 1); + } + } + sign = 1-2*sign; + /* We use orig_fill here because we want to fold the side, but if + itheta==16384, we'll have cleared the low bits of fill. */ + cm = quant_band(ctx, x2, N, mbits, B, lowband, + LM, lowband_out, Q15ONE, lowband_scratch, orig_fill); + /* We don't split N=2 bands, so cm is either 1 or 0 (for a fold-collapse), + and there's no need to worry about mixing with the other channel. 
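+ The side is reconstructed right below as a 90-degree rotation of the unit-norm mid (y2 = sign*rot90(x2)), which is why a single sign bit suffices for it.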
*/ + y2[0] = -sign*x2[1]; + y2[1] = sign*x2[0]; + if (resynth) + { + celt_norm tmp; + X[0] = MULT16_16_Q15(mid, X[0]); + X[1] = MULT16_16_Q15(mid, X[1]); + Y[0] = MULT16_16_Q15(side, Y[0]); + Y[1] = MULT16_16_Q15(side, Y[1]); + tmp = X[0]; + X[0] = SUB16(tmp,Y[0]); + Y[0] = ADD16(tmp,Y[0]); + tmp = X[1]; + X[1] = SUB16(tmp,Y[1]); + Y[1] = ADD16(tmp,Y[1]); + } + } else { + /* "Normal" split code */ + opus_int32 rebalance; + + mbits = IMAX(0, IMIN(b, (b-delta)/2)); + sbits = b-mbits; + ctx->remaining_bits -= qalloc; + + rebalance = ctx->remaining_bits; + if (mbits >= sbits) + { + /* In stereo mode, we do not apply a scaling to the mid because we need the normalized + mid for folding later. */ + cm = quant_band(ctx, X, N, mbits, B, + lowband, LM, lowband_out, + Q15ONE, lowband_scratch, fill); + rebalance = mbits - (rebalance-ctx->remaining_bits); + if (rebalance > 3<>B); + } else { + /* For a stereo split, the high bits of fill are always zero, so no + folding will be done to the side. */ + cm = quant_band(ctx, Y, N, sbits, B, + NULL, LM, NULL, + side, NULL, fill>>B); + rebalance = sbits - (rebalance-ctx->remaining_bits); + if (rebalance > 3<arch); + if (inv) + { + int j; + for (j=0;jeBands; + celt_norm * OPUS_RESTRICT norm, * OPUS_RESTRICT norm2; + VARDECL(celt_norm, _norm); + celt_norm *lowband_scratch; + int B; + int M; + int lowband_offset; + int update_lowband = 1; + int C = Y_ != NULL ? 2 : 1; + int norm_offset; +#ifdef RESYNTH + int resynth = 1; +#else + int resynth = !encode; +#endif + struct band_ctx ctx; + SAVE_STACK; + + M = 1<nbEBands-1]-norm_offset), celt_norm); + norm = _norm; + norm2 = norm + M*eBands[m->nbEBands-1]-norm_offset; + /* We can use the last band as scratch space because we don't need that + scratch space for the last band. */ + lowband_scratch = X_+M*eBands[m->nbEBands-1]; + + lowband_offset = 0; + ctx.bandE = bandE; + ctx.ec = ec; + ctx.encode = encode; + ctx.intensity = intensity; + ctx.m = m; + ctx.seed = *seed; + ctx.spread = spread; + ctx.arch = arch; + for (i=start;i= M*eBands[start] && (update_lowband || lowband_offset==0)) + lowband_offset = i; + + tf_change = tf_res[i]; + ctx.tf_change = tf_change; + if (i>=m->effEBands) + { + X=norm; + if (Y_!=NULL) + Y = norm; + lowband_scratch = NULL; + } + if (i==end-1) + lowband_scratch = NULL; + + /* Get a conservative estimate of the collapse_mask's for the bands we're + going to be folding from. 
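+ The loop below ORs together, per channel, the collapse masks of every band overlapping the folding source region [effective_lowband, effective_lowband+N).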
*/ + if (lowband_offset != 0 && (spread!=SPREAD_AGGRESSIVE || B>1 || tf_change<0)) + { + int fold_start; + int fold_end; + int fold_i; + /* This ensures we never repeat spectral content within one band */ + effective_lowband = IMAX(0, M*eBands[lowband_offset]-norm_offset-N); + fold_start = lowband_offset; + while(M*eBands[--fold_start] > effective_lowband+norm_offset); + fold_end = lowband_offset-1; + while(M*eBands[++fold_end] < effective_lowband+norm_offset+N); + x_cm = y_cm = 0; + fold_i = fold_start; do { + x_cm |= collapse_masks[fold_i*C+0]; + y_cm |= collapse_masks[fold_i*C+C-1]; + } while (++fold_i(N< +#include "celt.h" +#include "pitch.h" +#include "bands.h" +#include "modes.h" +#include "entcode.h" +#include "quant_bands.h" +#include "rate.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "float_cast.h" +#include +#include "celt_lpc.h" +#include "vq.h" + +#ifndef PACKAGE_VERSION +#define PACKAGE_VERSION "unknown" +#endif + +#if defined(MIPSr1_ASM) +#include "mips/celt_mipsr1.h" +#endif + + +int resampling_factor(opus_int32 rate) +{ + int ret; + switch (rate) + { + case 48000: + ret = 1; + break; + case 24000: + ret = 2; + break; + case 16000: + ret = 3; + break; + case 12000: + ret = 4; + break; + case 8000: + ret = 6; + break; + default: +#ifndef CUSTOM_MODES + celt_assert(0); +#endif + ret = 0; + break; + } + return ret; +} + +#if !defined(OVERRIDE_COMB_FILTER_CONST) || defined(NON_STATIC_COMB_FILTER_CONST_C) +/* This version should be faster on ARM */ +#ifdef OPUS_ARM_ASM +#ifndef NON_STATIC_COMB_FILTER_CONST_C +static +#endif +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12) +{ + opus_val32 x0, x1, x2, x3, x4; + int i; + x4 = SHL32(x[-T-2], 1); + x3 = SHL32(x[-T-1], 1); + x2 = SHL32(x[-T], 1); + x1 = SHL32(x[-T+1], 1); + for (i=0;inbEBands;i++) + { + int N; + N=(m->eBands[i+1]-m->eBands[i])<cache.caps[m->nbEBands*(2*LM+C-1)+i]+64)*C*N>>2; + } +} + + + +const char *opus_strerror(int error) +{ + static const char * const error_strings[8] = { + "success", + "invalid argument", + "buffer too small", + "internal error", + "corrupted stream", + "request not implemented", + "invalid state", + "memory allocation failed" + }; + if (error > 0 || error < -7) + return "unknown error"; + else + return error_strings[-error]; +} + +const char *opus_get_version_string(void) +{ + return "libopus " PACKAGE_VERSION + /* Applications may rely on the presence of this substring in the version + string to determine if they have a fixed-point or floating-point build + at runtime. */ +#ifdef FIXED_POINT + "-fixed" +#endif +#ifdef FUZZING + "-fuzzing" +#endif + ; +} diff --git a/thirdparty/opus/celt/celt.h b/thirdparty/opus/celt/celt.h new file mode 100644 index 000000000000..d1f7eb690db3 --- /dev/null +++ b/thirdparty/opus/celt/celt.h @@ -0,0 +1,229 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2008 Gregory Maxwell + Written by Jean-Marc Valin and Gregory Maxwell */ +/** + @file celt.h + @brief Contains all the functions for encoding and decoding audio + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef CELT_H +#define CELT_H + +#include "opus_types.h" +#include "opus_defines.h" +#include "opus_custom.h" +#include "entenc.h" +#include "entdec.h" +#include "arch.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define CELTEncoder OpusCustomEncoder +#define CELTDecoder OpusCustomDecoder +#define CELTMode OpusCustomMode + +typedef struct { + int valid; + float tonality; + float tonality_slope; + float noisiness; + float activity; + float music_prob; + int bandwidth; +}AnalysisInfo; + +#define __celt_check_mode_ptr_ptr(ptr) ((ptr) + ((ptr) - (const CELTMode**)(ptr))) + +#define __celt_check_analysis_ptr(ptr) ((ptr) + ((ptr) - (const AnalysisInfo*)(ptr))) + +/* Encoder/decoder Requests */ + +/* Expose this option again when variable framesize actually works */ +#define OPUS_FRAMESIZE_VARIABLE 5010 /**< Optimize the frame size dynamically */ + + +#define CELT_SET_PREDICTION_REQUEST 10002 +/** Controls the use of interframe prediction. 
+ 0=Independent frames + 1=Short term interframe prediction allowed + 2=Long term prediction allowed + */ +#define CELT_SET_PREDICTION(x) CELT_SET_PREDICTION_REQUEST, __opus_check_int(x) + +#define CELT_SET_INPUT_CLIPPING_REQUEST 10004 +#define CELT_SET_INPUT_CLIPPING(x) CELT_SET_INPUT_CLIPPING_REQUEST, __opus_check_int(x) + +#define CELT_GET_AND_CLEAR_ERROR_REQUEST 10007 +#define CELT_GET_AND_CLEAR_ERROR(x) CELT_GET_AND_CLEAR_ERROR_REQUEST, __opus_check_int_ptr(x) + +#define CELT_SET_CHANNELS_REQUEST 10008 +#define CELT_SET_CHANNELS(x) CELT_SET_CHANNELS_REQUEST, __opus_check_int(x) + + +/* Internal */ +#define CELT_SET_START_BAND_REQUEST 10010 +#define CELT_SET_START_BAND(x) CELT_SET_START_BAND_REQUEST, __opus_check_int(x) + +#define CELT_SET_END_BAND_REQUEST 10012 +#define CELT_SET_END_BAND(x) CELT_SET_END_BAND_REQUEST, __opus_check_int(x) + +#define CELT_GET_MODE_REQUEST 10015 +/** Get the CELTMode used by an encoder or decoder */ +#define CELT_GET_MODE(x) CELT_GET_MODE_REQUEST, __celt_check_mode_ptr_ptr(x) + +#define CELT_SET_SIGNALLING_REQUEST 10016 +#define CELT_SET_SIGNALLING(x) CELT_SET_SIGNALLING_REQUEST, __opus_check_int(x) + +#define CELT_SET_TONALITY_REQUEST 10018 +#define CELT_SET_TONALITY(x) CELT_SET_TONALITY_REQUEST, __opus_check_int(x) +#define CELT_SET_TONALITY_SLOPE_REQUEST 10020 +#define CELT_SET_TONALITY_SLOPE(x) CELT_SET_TONALITY_SLOPE_REQUEST, __opus_check_int(x) + +#define CELT_SET_ANALYSIS_REQUEST 10022 +#define CELT_SET_ANALYSIS(x) CELT_SET_ANALYSIS_REQUEST, __celt_check_analysis_ptr(x) + +#define OPUS_SET_LFE_REQUEST 10024 +#define OPUS_SET_LFE(x) OPUS_SET_LFE_REQUEST, __opus_check_int(x) + +#define OPUS_SET_ENERGY_MASK_REQUEST 10026 +#define OPUS_SET_ENERGY_MASK(x) OPUS_SET_ENERGY_MASK_REQUEST, __opus_check_val16_ptr(x) + +/* Encoder stuff */ + +int celt_encoder_get_size(int channels); + +int celt_encode_with_ec(OpusCustomEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc); + +int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels, + int arch); + + + +/* Decoder stuff */ + +int celt_decoder_get_size(int channels); + + +int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels); + +int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum); + +#define celt_encoder_ctl opus_custom_encoder_ctl +#define celt_decoder_ctl opus_custom_decoder_ctl + + +#ifdef CUSTOM_MODES +#define OPUS_CUSTOM_NOSTATIC +#else +#define OPUS_CUSTOM_NOSTATIC static OPUS_INLINE +#endif + +static const unsigned char trim_icdf[11] = {126, 124, 119, 109, 87, 41, 19, 9, 4, 2, 0}; +/* Probs: NONE: 21.875%, LIGHT: 6.25%, NORMAL: 65.625%, AGGRESSIVE: 6.25% */ +static const unsigned char spread_icdf[4] = {25, 23, 2, 0}; + +static const unsigned char tapset_icdf[3]={2,1,0}; + +#ifdef CUSTOM_MODES +static const unsigned char toOpusTable[20] = { + 0xE0, 0xE8, 0xF0, 0xF8, + 0xC0, 0xC8, 0xD0, 0xD8, + 0xA0, 0xA8, 0xB0, 0xB8, + 0x00, 0x00, 0x00, 0x00, + 0x80, 0x88, 0x90, 0x98, +}; + +static const unsigned char fromOpusTable[16] = { + 0x80, 0x88, 0x90, 0x98, + 0x40, 0x48, 0x50, 0x58, + 0x20, 0x28, 0x30, 0x38, + 0x00, 0x08, 0x10, 0x18 +}; + +static OPUS_INLINE int toOpus(unsigned char c) +{ + int ret=0; + if (c<0xA0) + ret = toOpusTable[c>>3]; + if (ret == 0) + return -1; + else + return ret|(c&0x7); +} + +static OPUS_INLINE int fromOpus(unsigned char c) +{ + if (c<0x80) + 
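/* TOC bytes below 0x80 have no CELT custom-mode equivalent in fromOpusTable */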
return -1; + else + return fromOpusTable[(c>>3)-16] | (c&0x7); +} +#endif /* CUSTOM_MODES */ + +#define COMBFILTER_MAXPERIOD 1024 +#define COMBFILTER_MINPERIOD 15 + +extern const signed char tf_select_table[4][8]; + +int resampling_factor(opus_int32 rate); + +void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp, + int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip); + +void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, + opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, + const opus_val16 *window, int overlap, int arch); + +#ifdef NON_STATIC_COMB_FILTER_CONST_C +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12); +#endif + +#ifndef OVERRIDE_COMB_FILTER_CONST +# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12)) +#endif + +void init_caps(const CELTMode *m,int *cap,int LM,int C); + +#ifdef RESYNTH +void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem); +void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[], + opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient, + int LM, int downsample, int silence); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* CELT_H */ diff --git a/thirdparty/opus/celt/celt_decoder.c b/thirdparty/opus/celt/celt_decoder.c new file mode 100644 index 000000000000..b978bb34d1b7 --- /dev/null +++ b/thirdparty/opus/celt/celt_decoder.c @@ -0,0 +1,1248 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2010 Xiph.Org Foundation + Copyright (c) 2008 Gregory Maxwell + Written by Jean-Marc Valin and Gregory Maxwell */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define CELT_DECODER_C + +#include "cpu_support.h" +#include "os_support.h" +#include "mdct.h" +#include <math.h> +#include "celt.h" +#include "pitch.h" +#include "bands.h" +#include "modes.h" +#include "entcode.h" +#include "quant_bands.h" +#include "rate.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "float_cast.h" +#include <stdarg.h> +#include "celt_lpc.h" +#include "vq.h" + +#if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT) +#define NORM_ALIASING_HACK +#endif +/**********************************************************************/ +/* */ +/* DECODER */ +/* */ +/**********************************************************************/ +#define DECODE_BUFFER_SIZE 2048 + +/** Decoder state + @brief Decoder state + */ +struct OpusCustomDecoder { + const OpusCustomMode *mode; + int overlap; + int channels; + int stream_channels; + + int downsample; + int start, end; + int signalling; + int arch; + + /* Everything beyond this point gets cleared on a reset */ +#define DECODER_RESET_START rng + + opus_uint32 rng; + int error; + int last_pitch_index; + int loss_count; + int skip_plc; + int postfilter_period; + int postfilter_period_old; + opus_val16 postfilter_gain; + opus_val16 postfilter_gain_old; + int postfilter_tapset; + int postfilter_tapset_old; + + celt_sig preemph_memD[2]; + + celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */ + /* opus_val16 lpc[], Size = channels*LPC_ORDER */ + /* opus_val16 oldEBands[], Size = 2*mode->nbEBands */ + /* opus_val16 oldLogE[], Size = 2*mode->nbEBands */ + /* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */ + /* opus_val16 backgroundLogE[], Size = 2*mode->nbEBands */ +}; + +int celt_decoder_get_size(int channels) +{ + const CELTMode *mode = opus_custom_mode_create(48000, 960, NULL); + return opus_custom_decoder_get_size(mode, channels); +} + +OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int channels) +{ + int size = sizeof(struct CELTDecoder) + + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig) + + channels*LPC_ORDER*sizeof(opus_val16) + + 4*2*mode->nbEBands*sizeof(opus_val16); + return size; +} + +#ifdef CUSTOM_MODES +CELTDecoder *opus_custom_decoder_create(const CELTMode *mode, int channels, int *error) +{ + int ret; + CELTDecoder *st = (CELTDecoder *)opus_alloc(opus_custom_decoder_get_size(mode, channels)); + ret = opus_custom_decoder_init(st, mode, channels); + if (ret != OPUS_OK) + { + opus_custom_decoder_destroy(st); + st = NULL; + } + if (error) + *error = ret; + return st; +} +#endif /* CUSTOM_MODES */ + +int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels) +{ + int ret; + ret = opus_custom_decoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels); + if (ret != OPUS_OK) + return ret; + st->downsample = resampling_factor(sampling_rate); + if (st->downsample==0) + return OPUS_BAD_ARG; + else + return OPUS_OK; +} + +OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_init(CELTDecoder *st, const CELTMode *mode, int channels) +{ + if (channels < 0 || channels > 2) + return OPUS_BAD_ARG; + + if (st==NULL) + return OPUS_ALLOC_FAIL; + + OPUS_CLEAR((char*)st, opus_custom_decoder_get_size(mode, channels)); + + st->mode = mode; + st->overlap = mode->overlap; + st->stream_channels = st->channels = channels; + + st->downsample = 1; + st->start = 0; + st->end = st->mode->effEBands; + st->signalling = 1; + st->arch = opus_select_arch(); + + opus_custom_decoder_ctl(st, OPUS_RESET_STATE); + + return 
OPUS_OK; +} + +#ifdef CUSTOM_MODES +void opus_custom_decoder_destroy(CELTDecoder *st) +{ + opus_free(st); +} +#endif /* CUSTOM_MODES */ + + +#ifndef RESYNTH +static +#endif +void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, + celt_sig *mem, int accum) +{ + int c; + int Nd; + int apply_downsampling=0; + opus_val16 coef0; + VARDECL(celt_sig, scratch); + SAVE_STACK; +#ifndef FIXED_POINT + (void)accum; + celt_assert(accum==0); +#endif + ALLOC(scratch, N, celt_sig); + coef0 = coef[0]; + Nd = N/downsample; + c=0; do { + int j; + celt_sig * OPUS_RESTRICT x; + opus_val16 * OPUS_RESTRICT y; + celt_sig m = mem[c]; + x =in[c]; + y = pcm+c; +#ifdef CUSTOM_MODES + if (coef[1] != 0) + { + opus_val16 coef1 = coef[1]; + opus_val16 coef3 = coef[3]; + for (j=0;j1) + { + /* Shortcut for the standard (non-custom modes) case */ + for (j=0;joverlap; + nbEBands = mode->nbEBands; + N = mode->shortMdctSize<shortMdctSize; + shift = mode->maxLM; + } else { + B = 1; + NB = mode->shortMdctSize<maxLM-LM; + } + + if (CC==2&&C==1) + { + /* Copying a mono streams to two channels */ + celt_sig *freq2; + denormalise_bands(mode, X, freq, oldBandE, start, effEnd, M, + downsample, silence); + /* Store a temporary copy in the output buffer because the IMDCT destroys its input. */ + freq2 = out_syn[1]+overlap/2; + OPUS_COPY(freq2, freq, N); + for (b=0;bmdct, &freq2[b], out_syn[0]+NB*b, mode->window, overlap, shift, B, arch); + for (b=0;bmdct, &freq[b], out_syn[1]+NB*b, mode->window, overlap, shift, B, arch); + } else if (CC==1&&C==2) + { + /* Downmixing a stereo stream to mono */ + celt_sig *freq2; + freq2 = out_syn[0]+overlap/2; + denormalise_bands(mode, X, freq, oldBandE, start, effEnd, M, + downsample, silence); + /* Use the output buffer as temp array before downmixing. */ + denormalise_bands(mode, X+N, freq2, oldBandE+nbEBands, start, effEnd, M, + downsample, silence); + for (i=0;imdct, &freq[b], out_syn[0]+NB*b, mode->window, overlap, shift, B, arch); + } else { + /* Normal case (mono or stereo) */ + c=0; do { + denormalise_bands(mode, X+c*N, freq, oldBandE+c*nbEBands, start, effEnd, M, + downsample, silence); + for (b=0;bmdct, &freq[b], out_syn[c]+NB*b, mode->window, overlap, shift, B, arch); + } while (++cstorage*8; + tell = ec_tell(dec); + logp = isTransient ? 
2 : 4; + tf_select_rsv = LM>0 && tell+logp+1<=budget; + budget -= tf_select_rsv; + tf_changed = curr = 0; + for (i=start;i>1, opus_val16 ); + pitch_downsample(decode_mem, lp_pitch_buf, + DECODE_BUFFER_SIZE, C, arch); + pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf, + DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX, + PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, arch); + pitch_index = PLC_PITCH_LAG_MAX-pitch_index; + RESTORE_STACK; + return pitch_index; +} + +static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) +{ + int c; + int i; + const int C = st->channels; + celt_sig *decode_mem[2]; + celt_sig *out_syn[2]; + opus_val16 *lpc; + opus_val16 *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE; + const OpusCustomMode *mode; + int nbEBands; + int overlap; + int start; + int loss_count; + int noise_based; + const opus_int16 *eBands; + SAVE_STACK; + + mode = st->mode; + nbEBands = mode->nbEBands; + overlap = mode->overlap; + eBands = mode->eBands; + + c=0; do { + decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap); + out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N; + } while (++c_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C); + oldBandE = lpc+C*LPC_ORDER; + oldLogE = oldBandE + 2*nbEBands; + oldLogE2 = oldLogE + 2*nbEBands; + backgroundLogE = oldLogE2 + 2*nbEBands; + + loss_count = st->loss_count; + start = st->start; + noise_based = loss_count >= 5 || start != 0 || st->skip_plc; + if (noise_based) + { + /* Noise-based PLC/CNG */ +#ifdef NORM_ALIASING_HACK + celt_norm *X; +#else + VARDECL(celt_norm, X); +#endif + opus_uint32 seed; + int end; + int effEnd; + opus_val16 decay; + end = st->end; + effEnd = IMAX(start, IMIN(end, mode->effEBands)); + +#ifdef NORM_ALIASING_HACK + /* This is an ugly hack that breaks aliasing rules and would be easily broken, + but it saves almost 4kB of stack. */ + X = (celt_norm*)(out_syn[C-1]+overlap/2); +#else + ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */ +#endif + + /* Energy decay */ + decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT); + c=0; do + { + for (i=start;irng; + for (c=0;c>20); + } + renormalise_vector(X+boffs, blen, Q15ONE, st->arch); + } + } + st->rng = seed; + + c=0; do { + OPUS_MOVE(decode_mem[c], decode_mem[c]+N, + DECODE_BUFFER_SIZE-N+(overlap>>1)); + } while (++cdownsample, 0, st->arch); + } else { + /* Pitch-based PLC */ + const opus_val16 *window; + opus_val16 fade = Q15ONE; + int pitch_index; + VARDECL(opus_val32, etmp); + VARDECL(opus_val16, exc); + + if (loss_count == 0) + { + st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch); + } else { + pitch_index = st->last_pitch_index; + fade = QCONST16(.8f,15); + } + + ALLOC(etmp, overlap, opus_val32); + ALLOC(exc, MAX_PERIOD, opus_val16); + window = mode->window; + c=0; do { + opus_val16 decay; + opus_val16 attenuation; + opus_val32 S1=0; + celt_sig *buf; + int extrapolation_offset; + int extrapolation_len; + int exc_length; + int j; + + buf = decode_mem[c]; + for (i=0;iarch); + /* Add a noise floor of -40 dB. */ +#ifdef FIXED_POINT + ac[0] += SHR32(ac[0],13); +#else + ac[0] *= 1.0001f; +#endif + /* Use lag windowing to stabilize the Levinson-Durbin recursion. 
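+ Each autocorrelation lag is damped by a small amount growing as i^2 (an approximation of the Gaussian window in the commented-out formula below), which slightly broadens spectral peaks and keeps the recursion numerically well behaved.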
*/ + for (i=1;i<=LPC_ORDER;i++) + { + /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/ +#ifdef FIXED_POINT + ac[i] -= MULT16_32_Q15(2*i*i, ac[i]); +#else + ac[i] -= ac[i]*(0.008f*0.008f)*i*i; +#endif + } + _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER); + } + /* We want the excitation for 2 pitch periods in order to look for a + decaying signal, but we can't get more than MAX_PERIOD. */ + exc_length = IMIN(2*pitch_index, MAX_PERIOD); + /* Initialize the LPC history with the samples just before the start + of the region for which we're computing the excitation. */ + { + opus_val16 lpc_mem[LPC_ORDER]; + for (i=0;iarch); + } + + /* Check if the waveform is decaying, and if so how fast. + We do this to avoid adding energy when concealing in a segment + with decaying energy. */ + { + opus_val32 E1=1, E2=1; + int decay_length; +#ifdef FIXED_POINT + int shift = IMAX(0,2*celt_zlog2(celt_maxabs16(&exc[MAX_PERIOD-exc_length], exc_length))-20); +#endif + decay_length = exc_length>>1; + for (i=0;i= pitch_index) { + j -= pitch_index; + attenuation = MULT16_16_Q15(attenuation, decay); + } + buf[DECODE_BUFFER_SIZE-N+i] = + SHL32(EXTEND32(MULT16_16_Q15(attenuation, + exc[extrapolation_offset+j])), SIG_SHIFT); + /* Compute the energy of the previously decoded signal whose + excitation we're copying. */ + tmp = ROUND16( + buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j], + SIG_SHIFT); + S1 += SHR32(MULT16_16(tmp, tmp), 8); + } + + { + opus_val16 lpc_mem[LPC_ORDER]; + /* Copy the last decoded samples (prior to the overlap region) to + synthesis filter memory so we can have a continuous signal. */ + for (i=0;iarch); + } + + /* Check if the synthesis energy is higher than expected, which can + happen with the signal changes during our window. If so, + attenuate. */ + { + opus_val32 S2=0; + for (i=0;i SHR32(S2,2))) +#else + /* The float test is written this way to catch NaNs in the output + of the IIR filter at the same time. */ + if (!(S1 > 0.2f*S2)) +#endif + { + for (i=0;ipostfilter_period, st->postfilter_period, overlap, + -st->postfilter_gain, -st->postfilter_gain, + st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch); + + /* Simulate TDAC on the concealed audio so that it blends with the + MDCT of the next frame. 
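+ That is, the windowed overlap region is folded onto itself the same way an MDCT analysis/synthesis round trip would fold it, so the concealed samples carry the time-domain aliasing the next frame's overlap-add expects to cancel.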
*/ + for (i=0;iloss_count = loss_count+1; + + RESTORE_STACK; +} + +int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, + int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum) +{ + int c, i, N; + int spread_decision; + opus_int32 bits; + ec_dec _dec; +#ifdef NORM_ALIASING_HACK + celt_norm *X; +#else + VARDECL(celt_norm, X); +#endif + VARDECL(int, fine_quant); + VARDECL(int, pulses); + VARDECL(int, cap); + VARDECL(int, offsets); + VARDECL(int, fine_priority); + VARDECL(int, tf_res); + VARDECL(unsigned char, collapse_masks); + celt_sig *decode_mem[2]; + celt_sig *out_syn[2]; + opus_val16 *lpc; + opus_val16 *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE; + + int shortBlocks; + int isTransient; + int intra_ener; + const int CC = st->channels; + int LM, M; + int start; + int end; + int effEnd; + int codedBands; + int alloc_trim; + int postfilter_pitch; + opus_val16 postfilter_gain; + int intensity=0; + int dual_stereo=0; + opus_int32 total_bits; + opus_int32 balance; + opus_int32 tell; + int dynalloc_logp; + int postfilter_tapset; + int anti_collapse_rsv; + int anti_collapse_on=0; + int silence; + int C = st->stream_channels; + const OpusCustomMode *mode; + int nbEBands; + int overlap; + const opus_int16 *eBands; + ALLOC_STACK; + + mode = st->mode; + nbEBands = mode->nbEBands; + overlap = mode->overlap; + eBands = mode->eBands; + start = st->start; + end = st->end; + frame_size *= st->downsample; + + lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC); + oldBandE = lpc+CC*LPC_ORDER; + oldLogE = oldBandE + 2*nbEBands; + oldLogE2 = oldLogE + 2*nbEBands; + backgroundLogE = oldLogE2 + 2*nbEBands; + +#ifdef CUSTOM_MODES + if (st->signalling && data!=NULL) + { + int data0=data[0]; + /* Convert "standard mode" to Opus header */ + if (mode->Fs==48000 && mode->shortMdctSize==120) + { + data0 = fromOpus(data0); + if (data0<0) + return OPUS_INVALID_PACKET; + } + st->end = end = IMAX(1, mode->effEBands-2*(data0>>5)); + LM = (data0>>3)&0x3; + C = 1 + ((data0>>2)&0x1); + data++; + len--; + if (LM>mode->maxLM) + return OPUS_INVALID_PACKET; + if (frame_size < mode->shortMdctSize<shortMdctSize<maxLM;LM++) + if (mode->shortMdctSize<mode->maxLM) + return OPUS_BAD_ARG; + } + M=1<1275 || pcm==NULL) + return OPUS_BAD_ARG; + + N = M*mode->shortMdctSize; + c=0; do { + decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap); + out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N; + } while (++c mode->effEBands) + effEnd = mode->effEBands; + + if (data == NULL || len<=1) + { + celt_decode_lost(st, N, LM); + deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); + RESTORE_STACK; + return frame_size/st->downsample; + } + + /* Check if there are at least two packets received consecutively before + * turning on the pitch-based PLC */ + st->skip_plc = st->loss_count != 0; + + if (dec == NULL) + { + ec_dec_init(&_dec,(unsigned char*)data,len); + dec = &_dec; + } + + if (C==1) + { + for (i=0;i= total_bits) + silence = 1; + else if (tell==1) + silence = ec_dec_bit_logp(dec, 15); + else + silence = 0; + if (silence) + { + /* Pretend we've read all the remaining bits */ + tell = len*8; + dec->nbits_total+=tell-ec_tell(dec); + } + + postfilter_gain = 0; + postfilter_pitch = 0; + postfilter_tapset = 0; + if (start==0 && tell+16 <= total_bits) + { + if(ec_dec_bit_logp(dec, 1)) + { + int qg, octave; + octave = ec_dec_uint(dec, 6); + postfilter_pitch = (16< 0 && tell+3 <= total_bits) + { + isTransient = ec_dec_bit_logp(dec, 
3); + tell = ec_tell(dec); + } + else + isTransient = 0; + + if (isTransient) + shortBlocks = M; + else + shortBlocks = 0; + + /* Decode the global flags (first symbols in the stream) */ + intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0; + /* Get band energies */ + unquant_coarse_energy(mode, start, end, oldBandE, + intra_ener, dec, C, LM); + + ALLOC(tf_res, nbEBands, int); + tf_decode(start, end, isTransient, tf_res, LM, dec); + + tell = ec_tell(dec); + spread_decision = SPREAD_NORMAL; + if (tell+4 <= total_bits) + spread_decision = ec_dec_icdf(dec, spread_icdf, 5); + + ALLOC(cap, nbEBands, int); + + init_caps(mode,cap,LM,C); + + ALLOC(offsets, nbEBands, int); + + dynalloc_logp = 6; + total_bits<<=BITRES; + tell = ec_tell_frac(dec); + for (i=start;i0) + dynalloc_logp = IMAX(2, dynalloc_logp-1); + } + + ALLOC(fine_quant, nbEBands, int); + alloc_trim = tell+(6<=2&&bits>=((LM+2)<rng, st->arch); + + if (anti_collapse_rsv > 0) + { + anti_collapse_on = ec_dec_bits(dec, 1); + } + + unquant_energy_finalise(mode, start, end, oldBandE, + fine_quant, fine_priority, len*8-ec_tell(dec), dec, C); + + if (anti_collapse_on) + anti_collapse(mode, X, collapse_masks, LM, C, N, + start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch); + + if (silence) + { + for (i=0;idownsample, silence, st->arch); + + c=0; do { + st->postfilter_period=IMAX(st->postfilter_period, COMBFILTER_MINPERIOD); + st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD); + comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, mode->shortMdctSize, + st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset, + mode->window, overlap, st->arch); + if (LM!=0) + comb_filter(out_syn[c]+mode->shortMdctSize, out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-mode->shortMdctSize, + st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset, + mode->window, overlap, st->arch); + + } while (++cpostfilter_period_old = st->postfilter_period; + st->postfilter_gain_old = st->postfilter_gain; + st->postfilter_tapset_old = st->postfilter_tapset; + st->postfilter_period = postfilter_pitch; + st->postfilter_gain = postfilter_gain; + st->postfilter_tapset = postfilter_tapset; + if (LM!=0) + { + st->postfilter_period_old = st->postfilter_period; + st->postfilter_gain_old = st->postfilter_gain; + st->postfilter_tapset_old = st->postfilter_tapset; + } + + if (C==1) + OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands); + + /* In case start or end were to change */ + if (!isTransient) + { + opus_val16 max_background_increase; + OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands); + OPUS_COPY(oldLogE, oldBandE, 2*nbEBands); + /* In normal circumstances, we only allow the noise floor to increase by + up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB + increase for each update.*/ + if (st->loss_count < 10) + max_background_increase = M*QCONST16(0.001f,DB_SHIFT); + else + max_background_increase = QCONST16(1.f,DB_SHIFT); + for (i=0;i<2*nbEBands;i++) + backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]); + } else { + for (i=0;i<2*nbEBands;i++) + oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]); + } + c=0; do + { + for (i=0;irng = dec->rng; + + deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); + st->loss_count = 0; + RESTORE_STACK; + if (ec_tell(dec) > 8*len) + return OPUS_INTERNAL_ERROR; + if(ec_get_error(dec)) + st->error = 1; + return 
frame_size/st->downsample; +} + + +#ifdef CUSTOM_MODES + +#ifdef FIXED_POINT +int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size) +{ + return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL, 0); +} + +#ifndef DISABLE_FLOAT_API +int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size) +{ + int j, ret, C, N; + VARDECL(opus_int16, out); + ALLOC_STACK; + + if (pcm==NULL) + return OPUS_BAD_ARG; + + C = st->channels; + N = frame_size; + + ALLOC(out, C*N, opus_int16); + ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL, 0); + if (ret>0) + for (j=0;jchannels; + N = frame_size; + ALLOC(out, C*N, celt_sig); + + ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL, 0); + + if (ret>0) + for (j=0;j=st->mode->nbEBands) + goto bad_arg; + st->start = value; + } + break; + case CELT_SET_END_BAND_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<1 || value>st->mode->nbEBands) + goto bad_arg; + st->end = value; + } + break; + case CELT_SET_CHANNELS_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<1 || value>2) + goto bad_arg; + st->stream_channels = value; + } + break; + case CELT_GET_AND_CLEAR_ERROR_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (value==NULL) + goto bad_arg; + *value=st->error; + st->error = 0; + } + break; + case OPUS_GET_LOOKAHEAD_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (value==NULL) + goto bad_arg; + *value = st->overlap/st->downsample; + } + break; + case OPUS_RESET_STATE: + { + int i; + opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2; + lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels); + oldBandE = lpc+st->channels*LPC_ORDER; + oldLogE = oldBandE + 2*st->mode->nbEBands; + oldLogE2 = oldLogE + 2*st->mode->nbEBands; + OPUS_CLEAR((char*)&st->DECODER_RESET_START, + opus_custom_decoder_get_size(st->mode, st->channels)- + ((char*)&st->DECODER_RESET_START - (char*)st)); + for (i=0;i<2*st->mode->nbEBands;i++) + oldLogE[i]=oldLogE2[i]=-QCONST16(28.f,DB_SHIFT); + st->skip_plc = 1; + } + break; + case OPUS_GET_PITCH_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (value==NULL) + goto bad_arg; + *value = st->postfilter_period; + } + break; + case CELT_GET_MODE_REQUEST: + { + const CELTMode ** value = va_arg(ap, const CELTMode**); + if (value==0) + goto bad_arg; + *value=st->mode; + } + break; + case CELT_SET_SIGNALLING_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->signalling = value; + } + break; + case OPUS_GET_FINAL_RANGE_REQUEST: + { + opus_uint32 * value = va_arg(ap, opus_uint32 *); + if (value==0) + goto bad_arg; + *value=st->rng; + } + break; + default: + goto bad_request; + } + va_end(ap); + return OPUS_OK; +bad_arg: + va_end(ap); + return OPUS_BAD_ARG; +bad_request: + va_end(ap); + return OPUS_UNIMPLEMENTED; +} diff --git a/thirdparty/opus/celt/celt_encoder.c b/thirdparty/opus/celt/celt_encoder.c new file mode 100644 index 000000000000..3ee7a4d3f7a1 --- /dev/null +++ b/thirdparty/opus/celt/celt_encoder.c @@ -0,0 +1,2410 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2010 Xiph.Org Foundation + Copyright (c) 2008 Gregory Maxwell + Written by Jean-Marc Valin and Gregory Maxwell */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define CELT_ENCODER_C + +#include "cpu_support.h" +#include "os_support.h" +#include "mdct.h" +#include +#include "celt.h" +#include "pitch.h" +#include "bands.h" +#include "modes.h" +#include "entcode.h" +#include "quant_bands.h" +#include "rate.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "float_cast.h" +#include +#include "celt_lpc.h" +#include "vq.h" + + +/** Encoder state + @brief Encoder state + */ +struct OpusCustomEncoder { + const OpusCustomMode *mode; /**< Mode used by the encoder */ + int channels; + int stream_channels; + + int force_intra; + int clip; + int disable_pf; + int complexity; + int upsample; + int start, end; + + opus_int32 bitrate; + int vbr; + int signalling; + int constrained_vbr; /* If zero, VBR can do whatever it likes with the rate */ + int loss_rate; + int lsb_depth; + int variable_duration; + int lfe; + int arch; + + /* Everything beyond this point gets cleared on a reset */ +#define ENCODER_RESET_START rng + + opus_uint32 rng; + int spread_decision; + opus_val32 delayedIntra; + int tonal_average; + int lastCodedBands; + int hf_average; + int tapset_decision; + + int prefilter_period; + opus_val16 prefilter_gain; + int prefilter_tapset; +#ifdef RESYNTH + int prefilter_period_old; + opus_val16 prefilter_gain_old; + int prefilter_tapset_old; +#endif + int consec_transient; + AnalysisInfo analysis; + + opus_val32 preemph_memE[2]; + opus_val32 preemph_memD[2]; + + /* VBR-related parameters */ + opus_int32 vbr_reservoir; + opus_int32 vbr_drift; + opus_int32 vbr_offset; + opus_int32 vbr_count; + opus_val32 overlap_max; + opus_val16 stereo_saving; + int intensity; + opus_val16 *energy_mask; + opus_val16 spec_avg; + +#ifdef RESYNTH + /* +MAX_PERIOD/2 to make space for overlap */ + celt_sig syn_mem[2][2*MAX_PERIOD+MAX_PERIOD/2]; +#endif + + celt_sig in_mem[1]; /* Size = channels*mode->overlap */ + /* celt_sig prefilter_mem[], Size = channels*COMBFILTER_MAXPERIOD */ + /* opus_val16 oldBandE[], Size = channels*mode->nbEBands */ + /* opus_val16 oldLogE[], Size = channels*mode->nbEBands */ + /* opus_val16 oldLogE2[], Size = channels*mode->nbEBands */ +}; + +int celt_encoder_get_size(int channels) +{ + CELTMode *mode = opus_custom_mode_create(48000, 960, NULL); + return opus_custom_encoder_get_size(mode, channels); +} + +OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_get_size(const CELTMode *mode, 
int channels) +{ + int size = sizeof(struct CELTEncoder) + + (channels*mode->overlap-1)*sizeof(celt_sig) /* celt_sig in_mem[channels*mode->overlap]; */ + + channels*COMBFILTER_MAXPERIOD*sizeof(celt_sig) /* celt_sig prefilter_mem[channels*COMBFILTER_MAXPERIOD]; */ + + 3*channels*mode->nbEBands*sizeof(opus_val16); /* opus_val16 oldBandE[channels*mode->nbEBands]; */ + /* opus_val16 oldLogE[channels*mode->nbEBands]; */ + /* opus_val16 oldLogE2[channels*mode->nbEBands]; */ + return size; +} + +#ifdef CUSTOM_MODES +CELTEncoder *opus_custom_encoder_create(const CELTMode *mode, int channels, int *error) +{ + int ret; + CELTEncoder *st = (CELTEncoder *)opus_alloc(opus_custom_encoder_get_size(mode, channels)); + /* init will handle the NULL case */ + ret = opus_custom_encoder_init(st, mode, channels); + if (ret != OPUS_OK) + { + opus_custom_encoder_destroy(st); + st = NULL; + } + if (error) + *error = ret; + return st; +} +#endif /* CUSTOM_MODES */ + +static int opus_custom_encoder_init_arch(CELTEncoder *st, const CELTMode *mode, + int channels, int arch) +{ + if (channels < 0 || channels > 2) + return OPUS_BAD_ARG; + + if (st==NULL || mode==NULL) + return OPUS_ALLOC_FAIL; + + OPUS_CLEAR((char*)st, opus_custom_encoder_get_size(mode, channels)); + + st->mode = mode; + st->stream_channels = st->channels = channels; + + st->upsample = 1; + st->start = 0; + st->end = st->mode->effEBands; + st->signalling = 1; + + st->arch = arch; + + st->constrained_vbr = 1; + st->clip = 1; + + st->bitrate = OPUS_BITRATE_MAX; + st->vbr = 0; + st->force_intra = 0; + st->complexity = 5; + st->lsb_depth=24; + + opus_custom_encoder_ctl(st, OPUS_RESET_STATE); + + return OPUS_OK; +} + +#ifdef CUSTOM_MODES +int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels) +{ + return opus_custom_encoder_init_arch(st, mode, channels, opus_select_arch()); +} +#endif + +int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels, + int arch) +{ + int ret; + ret = opus_custom_encoder_init_arch(st, + opus_custom_mode_create(48000, 960, NULL), channels, arch); + if (ret != OPUS_OK) + return ret; + st->upsample = resampling_factor(sampling_rate); + return OPUS_OK; +} + +#ifdef CUSTOM_MODES +void opus_custom_encoder_destroy(CELTEncoder *st) +{ + opus_free(st); +} +#endif /* CUSTOM_MODES */ + + +static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int C, + opus_val16 *tf_estimate, int *tf_chan) +{ + int i; + VARDECL(opus_val16, tmp); + opus_val32 mem0,mem1; + int is_transient = 0; + opus_int32 mask_metric = 0; + int c; + opus_val16 tf_max; + int len2; + /* Table of 6*64/x, trained on real data to minimize the average error */ + static const unsigned char inv_table[128] = { + 255,255,156,110, 86, 70, 59, 51, 45, 40, 37, 33, 31, 28, 26, 25, + 23, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, + 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, + }; + SAVE_STACK; + ALLOC(tmp, len, opus_val16); + + len2=len/2; + for (c=0;c=0;i--) + { +#ifdef FIXED_POINT + /* FIXME: Use PSHR16() instead */ + tmp[i] = mem0 + PSHR32(tmp[i]-mem0,3); +#else + tmp[i] = mem0 + MULT16_16_P15(QCONST16(0.125f,15),tmp[i]-mem0); +#endif + mem0 = tmp[i]; + maxE = MAX16(maxE, mem0); + } + /*for (i=0;i>1))); +#else + mean = celt_sqrt(mean * 
maxE*.5*len2); +#endif + /* Inverse of the mean energy in Q15+6 */ + norm = SHL32(EXTEND32(len2),6+14)/ADD32(EPSILON,SHR32(mean,1)); + /* Compute harmonic mean discarding the unreliable boundaries + The data is smooth, so we only take 1/4th of the samples */ + unmask=0; + for (i=12;imask_metric) + { + *tf_chan = c; + mask_metric = unmask; + } + } + is_transient = mask_metric>200; + + /* Arbitrary metric for VBR boost */ + tf_max = MAX16(0,celt_sqrt(27*mask_metric)-42); + /* *tf_estimate = 1 + MIN16(1, sqrt(MAX16(0, tf_max-30))/20); */ + *tf_estimate = celt_sqrt(MAX32(0, SHL32(MULT16_16(QCONST16(0.0069,14),MIN16(163,tf_max)),14)-QCONST32(0.139,28))); + /*printf("%d %f\n", tf_max, mask_metric);*/ + RESTORE_STACK; +#ifdef FUZZING + is_transient = rand()&0x1; +#endif + /*printf("%d %f %d\n", is_transient, (float)*tf_estimate, tf_max);*/ + return is_transient; +} + +/* Looks for sudden increases of energy to decide whether we need to patch + the transient decision */ +static int patch_transient_decision(opus_val16 *newE, opus_val16 *oldE, int nbEBands, + int start, int end, int C) +{ + int i, c; + opus_val32 mean_diff=0; + opus_val16 spread_old[26]; + /* Apply an aggressive (-6 dB/Bark) spreading function to the old frame to + avoid false detection caused by irrelevant bands */ + if (C==1) + { + spread_old[start] = oldE[start]; + for (i=start+1;i=start;i--) + spread_old[i] = MAX16(spread_old[i], spread_old[i+1]-QCONST16(1.0f, DB_SHIFT)); + /* Compute mean increase */ + c=0; do { + for (i=IMAX(2,start);i QCONST16(1.f, DB_SHIFT); +} + +/** Apply window and compute the MDCT for all sub-frames and + all channels in a frame */ +static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS_RESTRICT in, + celt_sig * OPUS_RESTRICT out, int C, int CC, int LM, int upsample, + int arch) +{ + const int overlap = mode->overlap; + int N; + int B; + int shift; + int i, b, c; + if (shortBlocks) + { + B = shortBlocks; + N = mode->shortMdctSize; + shift = mode->maxLM; + } else { + B = 1; + N = mode->shortMdctSize<maxLM-LM; + } + c=0; do { + for (b=0;bmdct, in+c*(B*N+overlap)+b*N, + &out[b+c*N*B], mode->window, overlap, shift, B, + arch); + } + } while (++ceBands[len]-m->eBands[len-1])<eBands[len]-m->eBands[len-1])<eBands[i+1]-m->eBands[i])<eBands[i+1]-m->eBands[i])==1; + OPUS_COPY(tmp, &X[tf_chan*N0 + (m->eBands[i]<eBands[i]<>LM, 1<>k, 1<=0;i--) + { + if (tf_res[i+1] == 1) + tf_res[i] = path1[i+1]; + else + tf_res[i] = path0[i+1]; + } + /*printf("%d %f\n", *tf_sum, tf_estimate);*/ + RESTORE_STACK; +#ifdef FUZZING + tf_select = rand()&0x1; + tf_res[0] = rand()&0x1; + for (i=1;istorage*8; + tell = ec_tell(enc); + logp = isTransient ? 2 : 4; + /* Reserve space to code the tf_select decision. 
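tf_analysis() above is a two-state dynamic program: for each band it weighs the metric cost of coding with frame-level versus sub-frame time resolution against a switching penalty lambda, then backtracks the cheapest path through path0/path1. A minimal sketch of that Viterbi recurrence with invented per-band costs (cost0/cost1 and lambda here are placeholders, not the library's bit metrics; bands are assumed <= 64):

    #include <stdio.h>

    /* Two-state Viterbi over bands: state 0 = keep frame-level time
       resolution, state 1 = finer resolution. 'lambda' is the cost of
       toggling between consecutive bands. Chosen states land in res[]. */
    static void tf_viterbi(const int *cost0, const int *cost1, int nbands,
                           int lambda, int *res)
    {
        int path0[64], path1[64]; /* predecessor state per band (nbands<=64) */
        int c0 = cost0[0];          /* best total cost ending in state 0 */
        int c1 = cost1[0] + lambda; /* entering state 1 costs a toggle   */
        int i;
        for (i = 1; i < nbands; i++) {
            int n0, n1;
            /* cheapest way to be in state 0 at band i */
            if (c0 <= c1 + lambda) { path0[i] = 0; n0 = c0; }
            else                   { path0[i] = 1; n0 = c1 + lambda; }
            /* cheapest way to be in state 1 at band i */
            if (c1 <= c0 + lambda) { path1[i] = 1; n1 = c1; }
            else                   { path1[i] = 0; n1 = c0 + lambda; }
            c0 = n0 + cost0[i];
            c1 = n1 + cost1[i];
        }
        res[nbands - 1] = (c1 < c0);
        for (i = nbands - 2; i >= 0; i--)
            res[i] = res[i + 1] ? path1[i + 1] : path0[i + 1];
    }

    int main(void)
    {
        const int cost0[] = {3, 8, 9, 2, 1};
        const int cost1[] = {7, 2, 1, 6, 5};
        int res[5], i;
        tf_viterbi(cost0, cost1, 5, 2, res);
        for (i = 0; i < 5; i++)
            printf("%d", res[i]); /* prints 01100: bands 1-2 switch */
        printf("\n");
        return 0;
    }

The toggle penalty is what keeps the per-band decisions from flickering; raising lambda trades per-band optimality for a cheaper-to-code, smoother tf_res sequence, which is the trade-off the lambda schedule in celt_encode_with_ec() tunes by rate.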
*/ + tf_select_rsv = LM>0 && tell+logp+1 <= budget; + budget -= tf_select_rsv; + curr = tf_changed = 0; + for (i=start;ieBands[i]<eBands[i]<eBands[i+1]-m->eBands[i])<eBands[i]<eBands[i]<eBands[i+1]-m->eBands[i])<nbEBands]*(opus_int32)(2+2*i-end); + } + } while (++cvalid) + { + trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8), + (opus_val16)(QCONST16(2.f, 8)*(analysis->tonality_slope+.05f)))); + } +#else + (void)analysis; +#endif + +#ifdef FIXED_POINT + trim_index = PSHR32(trim, 8); +#else + trim_index = (int)floor(.5f+trim); +#endif + trim_index = IMAX(0, IMIN(10, trim_index)); + /*printf("%d\n", trim_index);*/ +#ifdef FUZZING + trim_index = rand()%11; +#endif + return trim_index; +} + +static int stereo_analysis(const CELTMode *m, const celt_norm *X, + int LM, int N0) +{ + int i; + int thetas; + opus_val32 sumLR = EPSILON, sumMS = EPSILON; + + /* Use the L1 norm to model the entropy of the L/R signal vs the M/S signal */ + for (i=0;i<13;i++) + { + int j; + for (j=m->eBands[i]<eBands[i+1]<eBands[13]<<(LM+1))+thetas, sumMS) + > MULT16_32_Q15(m->eBands[13]<<(LM+1), sumLR); +} + +#define MSWAP(a,b) do {opus_val16 tmp = a;a=b;b=tmp;} while(0) +static opus_val16 median_of_5(const opus_val16 *x) +{ + opus_val16 t0, t1, t2, t3, t4; + t2 = x[2]; + if (x[0] > x[1]) + { + t0 = x[1]; + t1 = x[0]; + } else { + t0 = x[0]; + t1 = x[1]; + } + if (x[3] > x[4]) + { + t3 = x[4]; + t4 = x[3]; + } else { + t3 = x[3]; + t4 = x[4]; + } + if (t0 > t3) + { + MSWAP(t0, t3); + MSWAP(t1, t4); + } + if (t2 > t1) + { + if (t1 < t3) + return MIN16(t2, t3); + else + return MIN16(t4, t1); + } else { + if (t2 < t3) + return MIN16(t1, t3); + else + return MIN16(t2, t4); + } +} + +static opus_val16 median_of_3(const opus_val16 *x) +{ + opus_val16 t0, t1, t2; + if (x[0] > x[1]) + { + t0 = x[1]; + t1 = x[0]; + } else { + t0 = x[0]; + t1 = x[1]; + } + t2 = x[2]; + if (t1 < t2) + return t1; + else if (t0 < t2) + return t2; + else + return t0; +} + +static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2, + int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN, + int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM, + int effectiveBytes, opus_int32 *tot_boost_, int lfe, opus_val16 *surround_dynalloc) +{ + int i, c; + opus_int32 tot_boost=0; + opus_val16 maxDepth; + VARDECL(opus_val16, follower); + VARDECL(opus_val16, noise_floor); + SAVE_STACK; + ALLOC(follower, C*nbEBands, opus_val16); + ALLOC(noise_floor, C*nbEBands, opus_val16); + OPUS_CLEAR(offsets, nbEBands); + /* Dynamic allocation code */ + maxDepth=-QCONST16(31.9f, DB_SHIFT); + for (i=0;i 50 && LM>=1 && !lfe) + { + int last=0; + c=0;do + { + opus_val16 offset; + opus_val16 tmp; + opus_val16 *f; + f = &follower[c*nbEBands]; + f[0] = bandLogE2[c*nbEBands]; + for (i=1;i bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT)) + last=i; + f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]); + } + for (i=last-1;i>=0;i--) + f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i])); + + /* Combine with a median filter to avoid dynalloc triggering unnecessarily. + The "offset" value controls how conservative we are -- a higher offset + reduces the impact of the median filter and makes dynalloc use more bits. 
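The forward/backward loops that open dynalloc_analysis() implement a slope-limited envelope follower: the envelope may rise at most 1.5 dB per band moving up the spectrum and 2 dB moving down, so an isolated spectral peak stands well clear of it. A float sketch under those same slopes (band energies invented for the demo; the real code works in opus_val16 Q-format and then applies the median filter described above):

    #include <stdio.h>

    #define NBANDS 8

    static float minf(float a, float b) { return a < b ? a : b; }

    /* Two-pass follower: never rises faster than slope_up dB/band going
       forwards, nor faster than slope_down dB/band going backwards. */
    static void follower_envelope(const float *bandLogE, float *f, int n,
                                  float slope_up, float slope_down)
    {
        int i;
        f[0] = bandLogE[0];
        for (i = 1; i < n; i++)
            f[i] = minf(f[i - 1] + slope_up, bandLogE[i]);
        for (i = n - 2; i >= 0; i--)
            f[i] = minf(f[i], minf(f[i + 1] + slope_down, bandLogE[i]));
    }

    int main(void)
    {
        /* A lone peak at band 3: it stands far above the follower, and
           bandLogE - follower is exactly what dynalloc uses to decide
           which bands deserve boosted allocation. */
        const float e[NBANDS] = {1, 1, 1, 9, 1, 1, 1, 1};
        float f[NBANDS];
        int i;
        follower_envelope(e, f, NBANDS, 1.5f, 2.0f);
        for (i = 0; i < NBANDS; i++)
            printf("band %d: E=%.1f follower=%.1f boost=%.1f\n",
                   i, e[i], f[i], e[i] - f[i]);
        return 0;
    }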
*/ + offset = QCONST16(1.f, DB_SHIFT); + for (i=2;i=12) + follower[i] = HALF16(follower[i]); + follower[i] = MIN16(follower[i], QCONST16(4, DB_SHIFT)); + + width = C*(eBands[i+1]-eBands[i])< 48) { + boost = (int)SHR32(EXTEND32(follower[i])*8,DB_SHIFT); + boost_bits = (boost*width<>BITRES>>3 > effectiveBytes/4) + { + opus_int32 cap = ((effectiveBytes/4)<mode; + overlap = mode->overlap; + ALLOC(_pre, CC*(N+COMBFILTER_MAXPERIOD), celt_sig); + + pre[0] = _pre; + pre[1] = _pre + (N+COMBFILTER_MAXPERIOD); + + + c=0; do { + OPUS_COPY(pre[c], prefilter_mem+c*COMBFILTER_MAXPERIOD, COMBFILTER_MAXPERIOD); + OPUS_COPY(pre[c]+COMBFILTER_MAXPERIOD, in+c*(N+overlap)+overlap, N); + } while (++c>1, opus_val16); + + pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC, st->arch); + /* Don't search for the fir last 1.5 octave of the range because + there's too many false-positives due to short-term correlation */ + pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N, + COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index, + st->arch); + pitch_index = COMBFILTER_MAXPERIOD-pitch_index; + + gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD, + N, &pitch_index, st->prefilter_period, st->prefilter_gain, st->arch); + if (pitch_index > COMBFILTER_MAXPERIOD-2) + pitch_index = COMBFILTER_MAXPERIOD-2; + gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1); + /*printf("%d %d %f %f\n", pitch_change, pitch_index, gain1, st->analysis.tonality);*/ + if (st->loss_rate>2) + gain1 = HALF32(gain1); + if (st->loss_rate>4) + gain1 = HALF32(gain1); + if (st->loss_rate>8) + gain1 = 0; + } else { + gain1 = 0; + pitch_index = COMBFILTER_MINPERIOD; + } + + /* Gain threshold for enabling the prefilter/postfilter */ + pf_threshold = QCONST16(.2f,15); + + /* Adjusting the threshold based on rate and continuity */ + if (abs(pitch_index-st->prefilter_period)*10>pitch_index) + pf_threshold += QCONST16(.2f,15); + if (nbAvailableBytes<25) + pf_threshold += QCONST16(.1f,15); + if (nbAvailableBytes<35) + pf_threshold += QCONST16(.1f,15); + if (st->prefilter_gain > QCONST16(.4f,15)) + pf_threshold -= QCONST16(.1f,15); + if (st->prefilter_gain > QCONST16(.55f,15)) + pf_threshold -= QCONST16(.1f,15); + + /* Hard threshold at 0.2 */ + pf_threshold = MAX16(pf_threshold, QCONST16(.2f,15)); + if (gain1prefilter_gain)prefilter_gain; + +#ifdef FIXED_POINT + qg = ((gain1+1536)>>10)/3-1; +#else + qg = (int)floor(.5f+gain1*32/3)-1; +#endif + qg = IMAX(0, IMIN(7, qg)); + gain1 = QCONST16(0.09375f,15)*(qg+1); + pf_on = 1; + } + /*printf("%d %f\n", pitch_index, gain1);*/ + + c=0; do { + int offset = mode->shortMdctSize-overlap; + st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD); + OPUS_COPY(in+c*(N+overlap), st->in_mem+c*(overlap), overlap); + if (offset) + comb_filter(in+c*(N+overlap)+overlap, pre[c]+COMBFILTER_MAXPERIOD, + st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain, + st->prefilter_tapset, st->prefilter_tapset, NULL, 0, st->arch); + + comb_filter(in+c*(N+overlap)+overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset, + st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1, + st->prefilter_tapset, prefilter_tapset, mode->window, overlap, st->arch); + OPUS_COPY(st->in_mem+c*(overlap), in+c*(N+overlap)+N, overlap); + + if (N>COMBFILTER_MAXPERIOD) + { + OPUS_COPY(prefilter_mem+c*COMBFILTER_MAXPERIOD, pre[c]+N, COMBFILTER_MAXPERIOD); + } else { + OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD, prefilter_mem+c*COMBFILTER_MAXPERIOD+N, 
COMBFILTER_MAXPERIOD-N); + OPUS_COPY(prefilter_mem+c*COMBFILTER_MAXPERIOD+COMBFILTER_MAXPERIOD-N, pre[c]+COMBFILTER_MAXPERIOD, N); + } + } while (++cnbEBands; + eBands = mode->eBands; + + coded_bands = lastCodedBands ? lastCodedBands : nbEBands; + coded_bins = eBands[coded_bands]<analysis.activity, st->analysis.tonality, tf_estimate, st->stereo_saving, tot_boost, coded_bands);*/ +#ifndef DISABLE_FLOAT_API + if (analysis->valid && analysis->activity<.4) + target -= (opus_int32)((coded_bins<activity)); +#endif + /* Stereo savings */ + if (C==2) + { + int coded_stereo_bands; + int coded_stereo_dof; + opus_val16 max_frac; + coded_stereo_bands = IMIN(intensity, coded_bands); + coded_stereo_dof = (eBands[coded_stereo_bands]<valid && !lfe) + { + opus_int32 tonal_target; + float tonal; + + /* Tonality boost (compensating for the average). */ + tonal = MAX16(0.f,analysis->tonality-.15f)-0.09f; + tonal_target = target + (opus_int32)((coded_bins<tonality, tonal);*/ + target = tonal_target; + } +#else + (void)analysis; + (void)pitch_change; +#endif + + if (has_surround_mask&&!lfe) + { + opus_int32 surround_target = target + (opus_int32)SHR32(MULT16_16(surround_masking,coded_bins<end, st->intensity, surround_target, target, st->bitrate);*/ + target = IMAX(target/4, surround_target); + } + + { + opus_int32 floor_depth; + int bins; + bins = eBands[nbEBands-2]<>2); + target = IMIN(target, floor_depth); + /*printf("%f %d\n", maxDepth, floor_depth);*/ + } + + if ((!has_surround_mask||lfe) && (constrained_vbr || bitrate<64000)) + { + opus_val16 rate_factor = Q15ONE; + if (bitrate < 64000) + { +#ifdef FIXED_POINT + rate_factor = MAX16(0,(bitrate-32000)); +#else + rate_factor = MAX16(0,(1.f/32768)*(bitrate-32000)); +#endif + } + if (constrained_vbr) + rate_factor = MIN16(rate_factor, QCONST16(0.67f, 15)); + target = base_target + (opus_int32)MULT16_32_Q15(rate_factor, target-base_target); + + } + + if (!has_surround_mask && tf_estimate < QCONST16(.2f, 14)) + { + opus_val16 amount; + opus_val16 tvbr_factor; + amount = MULT16_16_Q15(QCONST16(.0000031f, 30), IMAX(0, IMIN(32000, 96000-bitrate))); + tvbr_factor = SHR32(MULT16_16(temporal_vbr, amount), DB_SHIFT); + target += (opus_int32)MULT16_32_Q15(tvbr_factor, target); + } + + /* Don't allow more than doubling the rate */ + target = IMIN(2*base_target, target); + + return target; +} + +int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc) +{ + int i, c, N; + opus_int32 bits; + ec_enc _enc; + VARDECL(celt_sig, in); + VARDECL(celt_sig, freq); + VARDECL(celt_norm, X); + VARDECL(celt_ener, bandE); + VARDECL(opus_val16, bandLogE); + VARDECL(opus_val16, bandLogE2); + VARDECL(int, fine_quant); + VARDECL(opus_val16, error); + VARDECL(int, pulses); + VARDECL(int, cap); + VARDECL(int, offsets); + VARDECL(int, fine_priority); + VARDECL(int, tf_res); + VARDECL(unsigned char, collapse_masks); + celt_sig *prefilter_mem; + opus_val16 *oldBandE, *oldLogE, *oldLogE2; + int shortBlocks=0; + int isTransient=0; + const int CC = st->channels; + const int C = st->stream_channels; + int LM, M; + int tf_select; + int nbFilledBytes, nbAvailableBytes; + int start; + int end; + int effEnd; + int codedBands; + int tf_sum; + int alloc_trim; + int pitch_index=COMBFILTER_MINPERIOD; + opus_val16 gain1 = 0; + int dual_stereo=0; + int effectiveBytes; + int dynalloc_logp; + opus_int32 vbr_rate; + opus_int32 total_bits; + opus_int32 total_boost; + opus_int32 balance; + opus_int32 tell; + int 
prefilter_tapset=0; + int pf_on; + int anti_collapse_rsv; + int anti_collapse_on=0; + int silence=0; + int tf_chan = 0; + opus_val16 tf_estimate; + int pitch_change=0; + opus_int32 tot_boost; + opus_val32 sample_max; + opus_val16 maxDepth; + const OpusCustomMode *mode; + int nbEBands; + int overlap; + const opus_int16 *eBands; + int secondMdct; + int signalBandwidth; + int transient_got_disabled=0; + opus_val16 surround_masking=0; + opus_val16 temporal_vbr=0; + opus_val16 surround_trim = 0; + opus_int32 equiv_rate = 510000; + VARDECL(opus_val16, surround_dynalloc); + ALLOC_STACK; + + mode = st->mode; + nbEBands = mode->nbEBands; + overlap = mode->overlap; + eBands = mode->eBands; + start = st->start; + end = st->end; + tf_estimate = 0; + if (nbCompressedBytes<2 || pcm==NULL) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + + frame_size *= st->upsample; + for (LM=0;LM<=mode->maxLM;LM++) + if (mode->shortMdctSize<mode->maxLM) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + M=1<shortMdctSize; + + prefilter_mem = st->in_mem+CC*(overlap); + oldBandE = (opus_val16*)(st->in_mem+CC*(overlap+COMBFILTER_MAXPERIOD)); + oldLogE = oldBandE + CC*nbEBands; + oldLogE2 = oldLogE + CC*nbEBands; + + if (enc==NULL) + { + tell=1; + nbFilledBytes=0; + } else { + tell=ec_tell(enc); + nbFilledBytes=(tell+4)>>3; + } + +#ifdef CUSTOM_MODES + if (st->signalling && enc==NULL) + { + int tmp = (mode->effEBands-end)>>1; + end = st->end = IMAX(1, mode->effEBands-tmp); + compressed[0] = tmp<<5; + compressed[0] |= LM<<3; + compressed[0] |= (C==2)<<2; + /* Convert "standard mode" to Opus header */ + if (mode->Fs==48000 && mode->shortMdctSize==120) + { + int c0 = toOpus(compressed[0]); + if (c0<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + compressed[0] = c0; + } + compressed++; + nbCompressedBytes--; + } +#else + celt_assert(st->signalling==0); +#endif + + /* Can't produce more than 1275 output bytes */ + nbCompressedBytes = IMIN(nbCompressedBytes,1275); + nbAvailableBytes = nbCompressedBytes - nbFilledBytes; + + if (st->vbr && st->bitrate!=OPUS_BITRATE_MAX) + { + opus_int32 den=mode->Fs>>BITRES; + vbr_rate=(st->bitrate*frame_size+(den>>1))/den; +#ifdef CUSTOM_MODES + if (st->signalling) + vbr_rate -= 8<>(3+BITRES); + } else { + opus_int32 tmp; + vbr_rate = 0; + tmp = st->bitrate*frame_size; + if (tell>1) + tmp += tell; + if (st->bitrate!=OPUS_BITRATE_MAX) + nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes, + (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling)); + effectiveBytes = nbCompressedBytes; + } + if (st->bitrate != OPUS_BITRATE_MAX) + equiv_rate = st->bitrate - (40*C+20)*((400>>LM) - 50); + + if (enc==NULL) + { + ec_enc_init(&_enc, compressed, nbCompressedBytes); + enc = &_enc; + } + + if (vbr_rate>0) + { + /* Computes the max bit-rate allowed in VBR mode to avoid violating the + target rate and buffering. + We must do this up front so that bust-prevention logic triggers + correctly if we don't have enough bits. */ + if (st->constrained_vbr) + { + opus_int32 vbr_bound; + opus_int32 max_allowed; + /* We could use any multiple of vbr_rate as bound (depending on the + delay). + This is clamped to ensure we use at least two bytes if the encoder + was entirely empty, but to allow 0 in hybrid mode. 
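The clamp that follows this comment computes how many bytes constrained VBR may spend on the current frame from the target rate and the reservoir. A sketch of that arithmetic with plain ints (BITRES=3 as in this file; the numbers in main() are invented, not from a real encoder run):

    #include <stdio.h>

    #define BITRES 3 /* rates are tracked in 1/8th bits, as in the code */

    static int imax(int a, int b) { return a > b ? a : b; }
    static int imin(int a, int b) { return a < b ? a : b; }

    /* Allow at most (vbr_rate + vbr_bound - reservoir) eighth-bits this
       frame, converted to whole bytes, never more than the buffer holds,
       and at least 2 bytes when the stream is empty (tell==1). */
    static int max_allowed_bytes(int vbr_rate, int vbr_reservoir,
                                 int available_bytes, int tell)
    {
        int vbr_bound = vbr_rate; /* one frame's worth of slack */
        return imin(imax(tell == 1 ? 2 : 0,
                         (vbr_rate + vbr_bound - vbr_reservoir)
                             >> (BITRES + 3)),
                    available_bytes);
    }

    int main(void)
    {
        /* 64 kb/s at 20 ms -> 1280 bits/frame -> 10240 eighth-bits. */
        int rate = 1280 << BITRES;
        printf("empty reservoir: %d bytes\n",
               max_allowed_bytes(rate, 0, 255, 1));  /* buffer-limited 255 */
        printf("full reservoir:  %d bytes\n",
               max_allowed_bytes(rate, 2 * rate, 255, 1)); /* floor of 2 */
        return 0;
    }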
*/ + vbr_bound = vbr_rate; + max_allowed = IMIN(IMAX(tell==1?2:0, + (vbr_rate+vbr_bound-st->vbr_reservoir)>>(BITRES+3)), + nbAvailableBytes); + if(max_allowed < nbAvailableBytes) + { + nbCompressedBytes = nbFilledBytes+max_allowed; + nbAvailableBytes = max_allowed; + ec_enc_shrink(enc, nbCompressedBytes); + } + } + } + total_bits = nbCompressedBytes*8; + + effEnd = end; + if (effEnd > mode->effEBands) + effEnd = mode->effEBands; + + ALLOC(in, CC*(N+overlap), celt_sig); + + sample_max=MAX32(st->overlap_max, celt_maxabs16(pcm, C*(N-overlap)/st->upsample)); + st->overlap_max=celt_maxabs16(pcm+C*(N-overlap)/st->upsample, C*overlap/st->upsample); + sample_max=MAX32(sample_max, st->overlap_max); +#ifdef FIXED_POINT + silence = (sample_max==0); +#else + silence = (sample_max <= (opus_val16)1/(1<lsb_depth)); +#endif +#ifdef FUZZING + if ((rand()&0x3F)==0) + silence = 1; +#endif + if (tell==1) + ec_enc_bit_logp(enc, silence, 15); + else + silence=0; + if (silence) + { + /*In VBR mode there is no need to send more than the minimum. */ + if (vbr_rate>0) + { + effectiveBytes=nbCompressedBytes=IMIN(nbCompressedBytes, nbFilledBytes+2); + total_bits=nbCompressedBytes*8; + nbAvailableBytes=2; + ec_enc_shrink(enc, nbCompressedBytes); + } + /* Pretend we've filled all the remaining bits with zeros + (that's what the initialiser did anyway) */ + tell = nbCompressedBytes*8; + enc->nbits_total+=tell-ec_tell(enc); + } + c=0; do { + int need_clip=0; +#ifndef FIXED_POINT + need_clip = st->clip && sample_max>65536.f; +#endif + celt_preemphasis(pcm+c, in+c*(N+overlap)+overlap, N, CC, st->upsample, + mode->preemph, st->preemph_memE+c, need_clip); + } while (++clfe&&nbAvailableBytes>3) || nbAvailableBytes>12*C) && start==0 && !silence && !st->disable_pf + && st->complexity >= 5 && !(st->consec_transient && LM!=3 && st->variable_duration==OPUS_FRAMESIZE_VARIABLE); + + prefilter_tapset = st->tapset_decision; + pf_on = run_prefilter(st, in, prefilter_mem, CC, N, prefilter_tapset, &pitch_index, &gain1, &qg, enabled, nbAvailableBytes); + if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && (!st->analysis.valid || st->analysis.tonality > .3) + && (pitch_index > 1.26*st->prefilter_period || pitch_index < .79*st->prefilter_period)) + pitch_change = 1; + if (pf_on==0) + { + if(start==0 && tell+16<=total_bits) + ec_enc_bit_logp(enc, 0, 1); + } else { + /*This block is not gated by a total bits check only because + of the nbAvailableBytes check above.*/ + int octave; + ec_enc_bit_logp(enc, 1, 1); + pitch_index += 1; + octave = EC_ILOG(pitch_index)-5; + ec_enc_uint(enc, octave, 6); + ec_enc_bits(enc, pitch_index-(16<complexity >= 1 && !st->lfe) + { + isTransient = transient_analysis(in, N+overlap, CC, + &tf_estimate, &tf_chan); + } + if (LM>0 && ec_tell(enc)+3<=total_bits) + { + if (isTransient) + shortBlocks = M; + } else { + isTransient = 0; + transient_got_disabled=1; + } + + ALLOC(freq, CC*N, celt_sig); /**< Interleaved signal MDCTs */ + ALLOC(bandE,nbEBands*CC, celt_ener); + ALLOC(bandLogE,nbEBands*CC, opus_val16); + + secondMdct = shortBlocks && st->complexity>=8; + ALLOC(bandLogE2, C*nbEBands, opus_val16); + if (secondMdct) + { + compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch); + compute_band_energies(mode, freq, bandE, effEnd, C, LM); + amp2Log2(mode, effEnd, end, bandE, bandLogE2, C); + for (i=0;iupsample, st->arch); + if (CC==2&&C==1) + tf_chan = 0; + compute_band_energies(mode, freq, bandE, effEnd, C, LM); + + if (st->lfe) + { + for (i=2;ienergy_mask&&!st->lfe) + { + int 
mask_end; + int midband; + int count_dynalloc; + opus_val32 mask_avg=0; + opus_val32 diff=0; + int count=0; + mask_end = IMAX(2,st->lastCodedBands); + for (c=0;cenergy_mask[nbEBands*c+i], + QCONST16(.25f, DB_SHIFT)), -QCONST16(2.0f, DB_SHIFT)); + if (mask > 0) + mask = HALF16(mask); + mask_avg += MULT16_16(mask, eBands[i+1]-eBands[i]); + count += eBands[i+1]-eBands[i]; + diff += MULT16_16(mask, 1+2*i-mask_end); + } + } + celt_assert(count>0); + mask_avg = DIV32_16(mask_avg,count); + mask_avg += QCONST16(.2f, DB_SHIFT); + diff = diff*6/(C*(mask_end-1)*(mask_end+1)*mask_end); + /* Again, being conservative */ + diff = HALF32(diff); + diff = MAX32(MIN32(diff, QCONST32(.031f, DB_SHIFT)), -QCONST32(.031f, DB_SHIFT)); + /* Find the band that's in the middle of the coded spectrum */ + for (midband=0;eBands[midband+1] < eBands[mask_end]/2;midband++); + count_dynalloc=0; + for(i=0;ienergy_mask[i], st->energy_mask[nbEBands+i]); + else + unmask = st->energy_mask[i]; + unmask = MIN16(unmask, QCONST16(.0f, DB_SHIFT)); + unmask -= lin; + if (unmask > QCONST16(.25f, DB_SHIFT)) + { + surround_dynalloc[i] = unmask - QCONST16(.25f, DB_SHIFT); + count_dynalloc++; + } + } + if (count_dynalloc>=3) + { + /* If we need dynalloc in many bands, it's probably because our + initial masking rate was too low. */ + mask_avg += QCONST16(.25f, DB_SHIFT); + if (mask_avg>0) + { + /* Something went really wrong in the original calculations, + disabling masking. */ + mask_avg = 0; + diff = 0; + OPUS_CLEAR(surround_dynalloc, mask_end); + } else { + for(i=0;ilfe) + { + opus_val16 follow=-QCONST16(10.0f,DB_SHIFT); + opus_val32 frame_avg=0; + opus_val16 offset = shortBlocks?HALF16(SHL16(LM, DB_SHIFT)):0; + for(i=start;ispec_avg); + temporal_vbr = MIN16(QCONST16(3.f, DB_SHIFT), MAX16(-QCONST16(1.5f, DB_SHIFT), temporal_vbr)); + st->spec_avg += MULT16_16_Q15(QCONST16(.02f, 15), temporal_vbr); + } + /*for (i=0;i<21;i++) + printf("%f ", bandLogE[i]); + printf("\n");*/ + + if (!secondMdct) + { + OPUS_COPY(bandLogE2, bandLogE, C*nbEBands); + } + + /* Last chance to catch any transient we might have missed in the + time-domain analysis */ + if (LM>0 && ec_tell(enc)+3<=total_bits && !isTransient && st->complexity>=5 && !st->lfe) + { + if (patch_transient_decision(bandLogE, oldBandE, nbEBands, start, end, C)) + { + isTransient = 1; + shortBlocks = M; + compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch); + compute_band_energies(mode, freq, bandE, effEnd, C, LM); + amp2Log2(mode, effEnd, end, bandE, bandLogE, C); + /* Compensate for the scaling of short vs long mdcts */ + for (i=0;i0 && ec_tell(enc)+3<=total_bits) + ec_enc_bit_logp(enc, isTransient, 3); + + ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */ + + /* Band normalisation */ + normalise_bands(mode, freq, X, bandE, effEnd, C, M); + + ALLOC(tf_res, nbEBands, int); + /* Disable variable tf resolution for hybrid and at very low bitrate */ + if (effectiveBytes>=15*C && start==0 && st->complexity>=2 && !st->lfe) + { + int lambda; + if (effectiveBytes<40) + lambda = 12; + else if (effectiveBytes<60) + lambda = 6; + else if (effectiveBytes<100) + lambda = 4; + else + lambda = 3; + lambda*=2; + tf_select = tf_analysis(mode, effEnd, isTransient, tf_res, lambda, X, N, LM, &tf_sum, tf_estimate, tf_chan); + for (i=effEnd;iforce_intra, + &st->delayedIntra, st->complexity >= 4, st->loss_rate, st->lfe); + + tf_encode(start, end, isTransient, tf_res, LM, tf_select, enc); + + if (ec_tell(enc)+4<=total_bits) + { + if (st->lfe) + { + st->tapset_decision = 0; + 
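The temporal-VBR block above compares each frame's average band log-energy against a slowly adapting running average (st->spec_avg) and keeps the clamped difference as a loudness term that compute_vbr() later turns into a rate adjustment. A float sketch of that tracker using the 0.02 update constant and the [-1.5, 3] dB clamp visible above (frame energies invented):

    #include <stdio.h>

    static float clampf(float x, float lo, float hi)
    {
        return x < lo ? lo : (x > hi ? hi : x);
    }

    /* spec_avg adapts towards each frame's average band log-energy; the
       clamped difference is the temporal VBR term: >0 boosts the frame's
       target rate, <0 trims it. */
    static float temporal_vbr_update(float frame_avg, float *spec_avg)
    {
        float t = clampf(frame_avg - *spec_avg, -1.5f, 3.f);
        *spec_avg += 0.02f * t;
        return t;
    }

    int main(void)
    {
        float spec_avg = 0.f;
        float frames[6] = {0.f, 0.f, 6.f, 6.f, 0.f, 0.f};
        int i;
        for (i = 0; i < 6; i++) {
            float t = temporal_vbr_update(frames[i], &spec_avg);
            printf("frame %d: tvbr=%+.3f spec_avg=%.3f\n", i, t, spec_avg);
        }
        return 0;
    }

The small 0.02 step means spec_avg reacts over hundreds of frames, so the term measures short bursts of loudness against the programme's long-term level rather than chasing every frame.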
st->spread_decision = SPREAD_NORMAL; + } else if (shortBlocks || st->complexity < 3 || nbAvailableBytes < 10*C || start != 0) + { + if (st->complexity == 0) + st->spread_decision = SPREAD_NONE; + else + st->spread_decision = SPREAD_NORMAL; + } else { + /* Disable new spreading+tapset estimator until we can show it works + better than the old one. So far it seems like spreading_decision() + works best. */ +#if 0 + if (st->analysis.valid) + { + static const opus_val16 spread_thresholds[3] = {-QCONST16(.6f, 15), -QCONST16(.2f, 15), -QCONST16(.07f, 15)}; + static const opus_val16 spread_histeresis[3] = {QCONST16(.15f, 15), QCONST16(.07f, 15), QCONST16(.02f, 15)}; + static const opus_val16 tapset_thresholds[2] = {QCONST16(.0f, 15), QCONST16(.15f, 15)}; + static const opus_val16 tapset_histeresis[2] = {QCONST16(.1f, 15), QCONST16(.05f, 15)}; + st->spread_decision = hysteresis_decision(-st->analysis.tonality, spread_thresholds, spread_histeresis, 3, st->spread_decision); + st->tapset_decision = hysteresis_decision(st->analysis.tonality_slope, tapset_thresholds, tapset_histeresis, 2, st->tapset_decision); + } else +#endif + { + st->spread_decision = spreading_decision(mode, X, + &st->tonal_average, st->spread_decision, &st->hf_average, + &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M); + } + /*printf("%d %d\n", st->tapset_decision, st->spread_decision);*/ + /*printf("%f %d %f %d\n\n", st->analysis.tonality, st->spread_decision, st->analysis.tonality_slope, st->tapset_decision);*/ + } + ec_enc_icdf(enc, st->spread_decision, spread_icdf, 5); + } + + ALLOC(offsets, nbEBands, int); + + maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, start, end, C, offsets, + st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr, + eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc); + /* For LFE, everything interesting is in the first band */ + if (st->lfe) + offsets[0] = IMIN(8, effectiveBytes/3); + ALLOC(cap, nbEBands, int); + init_caps(mode,cap,LM,C); + + dynalloc_logp = 6; + total_bits<<=BITRES; + total_boost = 0; + tell = ec_tell_frac(enc); + for (i=start;iintensity = hysteresis_decision((opus_val16)(equiv_rate/1000), + intensity_thresholds, intensity_histeresis, 21, st->intensity); + st->intensity = IMIN(end,IMAX(start, st->intensity)); + } + + alloc_trim = 5; + if (tell+(6<lfe) + alloc_trim = 5; + else + alloc_trim = alloc_trim_analysis(mode, X, bandLogE, + end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, + st->intensity, surround_trim, st->arch); + ec_enc_icdf(enc, alloc_trim, trim_icdf, 7); + tell = ec_tell_frac(enc); + } + + /* Variable bitrate */ + if (vbr_rate>0) + { + opus_val16 alpha; + opus_int32 delta; + /* The target rate in 8th bits per frame */ + opus_int32 target, base_target; + opus_int32 min_allowed; + int lm_diff = mode->maxLM - LM; + + /* Don't attempt to use more than 510 kb/s, even for frames smaller than 20 ms. + The CELT allocator will just not be able to use more than that anyway. 
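The st->intensity update above goes through hysteresis_decision() so the intensity-stereo band index does not flap when the equivalent rate hovers near a threshold. A sketch of that style of thresholding with hysteresis; the threshold and margin tables here are placeholders, not the intensity_thresholds/intensity_histeresis tables used above:

    #include <stdio.h>

    /* Pick the largest i with val above thresholds[0..i-1], but refuse to
       move away from the previous decision unless val clears the old
       threshold by the hysteresis margin. */
    static int hysteresis_decision(float val, const float *thresholds,
                                   const float *hysteresis, int N, int prev)
    {
        int i;
        for (i = 0; i < N; i++)
            if (val < thresholds[i])
                break;
        if (i > prev && val < thresholds[prev] + hysteresis[prev])
            i = prev;
        if (i < prev && val > thresholds[prev - 1] - hysteresis[prev - 1])
            i = prev;
        return i;
    }

    int main(void)
    {
        const float thr[3] = {10.f, 20.f, 30.f}; /* toy kb/s breakpoints */
        const float hys[3] = {2.f, 2.f, 2.f};
        float rates[5] = {15.f, 21.f, 19.f, 9.f, 7.f};
        int d = 0, i;
        for (i = 0; i < 5; i++) {
            d = hysteresis_decision(rates[i], thr, hys, 3, d);
            printf("rate %.0f -> index %d\n", rates[i], d);
        }
        /* 15 -> 1; 21 -> 1 (held, needs >22); 19 -> 1;
           9 -> 1 (held, needs <8); 7 -> 0 */
        return 0;
    }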
*/ + nbCompressedBytes = IMIN(nbCompressedBytes,1275>>(3-LM)); + base_target = vbr_rate - ((40*C+20)<constrained_vbr) + base_target += (st->vbr_offset>>lm_diff); + + target = compute_vbr(mode, &st->analysis, base_target, LM, equiv_rate, + st->lastCodedBands, C, st->intensity, st->constrained_vbr, + st->stereo_saving, tot_boost, tf_estimate, pitch_change, maxDepth, + st->variable_duration, st->lfe, st->energy_mask!=NULL, surround_masking, + temporal_vbr); + + /* The current offset is removed from the target and the space used + so far is added*/ + target=target+tell; + /* In VBR mode the frame size must not be reduced so much that it would + result in the encoder running out of bits. + The margin of 2 bytes ensures that none of the bust-prevention logic + in the decoder will have triggered so far. */ + min_allowed = ((tell+total_boost+(1<<(BITRES+3))-1)>>(BITRES+3)) + 2 - nbFilledBytes; + + nbAvailableBytes = (target+(1<<(BITRES+2)))>>(BITRES+3); + nbAvailableBytes = IMAX(min_allowed,nbAvailableBytes); + nbAvailableBytes = IMIN(nbCompressedBytes,nbAvailableBytes+nbFilledBytes) - nbFilledBytes; + + /* By how much did we "miss" the target on that frame */ + delta = target - vbr_rate; + + target=nbAvailableBytes<<(BITRES+3); + + /*If the frame is silent we don't adjust our drift, otherwise + the encoder will shoot to very high rates after hitting a + span of silence, but we do allow the bitres to refill. + This means that we'll undershoot our target in CVBR/VBR modes + on files with lots of silence. */ + if(silence) + { + nbAvailableBytes = 2; + target = 2*8<vbr_count < 970) + { + st->vbr_count++; + alpha = celt_rcp(SHL32(EXTEND32(st->vbr_count+20),16)); + } else + alpha = QCONST16(.001f,15); + /* How many bits have we used in excess of what we're allowed */ + if (st->constrained_vbr) + st->vbr_reservoir += target - vbr_rate; + /*printf ("%d\n", st->vbr_reservoir);*/ + + /* Compute the offset we need to apply in order to reach the target */ + if (st->constrained_vbr) + { + st->vbr_drift += (opus_int32)MULT16_32_Q15(alpha,(delta*(1<vbr_offset-st->vbr_drift); + st->vbr_offset = -st->vbr_drift; + } + /*printf ("%d\n", st->vbr_drift);*/ + + if (st->constrained_vbr && st->vbr_reservoir < 0) + { + /* We're under the min value -- increase rate */ + int adjust = (-st->vbr_reservoir)/(8<vbr_reservoir = 0; + /*printf ("+%d\n", adjust);*/ + } + nbCompressedBytes = IMIN(nbCompressedBytes,nbAvailableBytes+nbFilledBytes); + /*printf("%d\n", nbCompressedBytes*50*8);*/ + /* This moves the raw bits to take into account the new compressed size */ + ec_enc_shrink(enc, nbCompressedBytes); + } + + /* Bit allocation */ + ALLOC(fine_quant, nbEBands, int); + ALLOC(pulses, nbEBands, int); + ALLOC(fine_priority, nbEBands, int); + + /* bits = packet size - where we are - safety*/ + bits = (((opus_int32)nbCompressedBytes*8)<=2&&bits>=((LM+2)<analysis.valid) + { + int min_bandwidth; + if (equiv_rate < (opus_int32)32000*C) + min_bandwidth = 13; + else if (equiv_rate < (opus_int32)48000*C) + min_bandwidth = 16; + else if (equiv_rate < (opus_int32)60000*C) + min_bandwidth = 18; + else if (equiv_rate < (opus_int32)80000*C) + min_bandwidth = 19; + else + min_bandwidth = 20; + signalBandwidth = IMAX(st->analysis.bandwidth, min_bandwidth); + } +#endif + if (st->lfe) + signalBandwidth = 1; + codedBands = compute_allocation(mode, start, end, offsets, cap, + alloc_trim, &st->intensity, &dual_stereo, bits, &balance, pulses, + fine_quant, fine_priority, C, LM, enc, 1, st->lastCodedBands, signalBandwidth); + if (st->lastCodedBands) + 
st->lastCodedBands = IMIN(st->lastCodedBands+1,IMAX(st->lastCodedBands-1,codedBands)); + else + st->lastCodedBands = codedBands; + + quant_fine_energy(mode, start, end, oldBandE, error, fine_quant, enc, C); + + /* Residual quantisation */ + ALLOC(collapse_masks, C*nbEBands, unsigned char); + quant_all_bands(1, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks, + bandE, pulses, shortBlocks, st->spread_decision, + dual_stereo, st->intensity, tf_res, nbCompressedBytes*(8<rng, st->arch); + + if (anti_collapse_rsv > 0) + { + anti_collapse_on = st->consec_transient<2; +#ifdef FUZZING + anti_collapse_on = rand()&0x1; +#endif + ec_enc_bits(enc, anti_collapse_on, 1); + } + quant_energy_finalise(mode, start, end, oldBandE, error, fine_quant, fine_priority, nbCompressedBytes*8-ec_tell(enc), enc, C); + + if (silence) + { + for (i=0;irng); + } + + c=0; do { + OPUS_MOVE(st->syn_mem[c], st->syn_mem[c]+N, 2*MAX_PERIOD-N+overlap/2); + } while (++csyn_mem[c]+2*MAX_PERIOD-N; + } while (++cupsample, silence, st->arch); + + c=0; do { + st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD); + st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD); + comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize, + st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset, + mode->window, overlap); + if (LM!=0) + comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize, + st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset, + mode->window, overlap); + } while (++cupsample, mode->preemph, st->preemph_memD); + st->prefilter_period_old = st->prefilter_period; + st->prefilter_gain_old = st->prefilter_gain; + st->prefilter_tapset_old = st->prefilter_tapset; + } +#endif + + st->prefilter_period = pitch_index; + st->prefilter_gain = gain1; + st->prefilter_tapset = prefilter_tapset; +#ifdef RESYNTH + if (LM!=0) + { + st->prefilter_period_old = st->prefilter_period; + st->prefilter_gain_old = st->prefilter_gain; + st->prefilter_tapset_old = st->prefilter_tapset; + } +#endif + + if (CC==2&&C==1) { + OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands); + } + + if (!isTransient) + { + OPUS_COPY(oldLogE2, oldLogE, CC*nbEBands); + OPUS_COPY(oldLogE, oldBandE, CC*nbEBands); + } else { + for (i=0;iconsec_transient++; + else + st->consec_transient=0; + st->rng = enc->rng; + + /* If there's any room left (can only happen for very high rates), + it's already filled with zeros */ + ec_enc_done(enc); + +#ifdef CUSTOM_MODES + if (st->signalling) + nbCompressedBytes++; +#endif + + RESTORE_STACK; + if (ec_get_error(enc)) + return OPUS_INTERNAL_ERROR; + else + return nbCompressedBytes; +} + + +#ifdef CUSTOM_MODES + +#ifdef FIXED_POINT +int opus_custom_encode(CELTEncoder * OPUS_RESTRICT st, const opus_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes) +{ + return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL); +} + +#ifndef DISABLE_FLOAT_API +int opus_custom_encode_float(CELTEncoder * OPUS_RESTRICT st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes) +{ + int j, ret, C, N; + VARDECL(opus_int16, in); + ALLOC_STACK; + + if (pcm==NULL) + return OPUS_BAD_ARG; + + C = st->channels; + N = frame_size; + ALLOC(in, C*N, opus_int16); + + for (j=0;jchannels; + N=frame_size; + ALLOC(in, C*N, celt_sig); + for (j=0;j10) + goto bad_arg; + st->complexity = 
value; + } + break; + case CELT_SET_START_BAND_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<0 || value>=st->mode->nbEBands) + goto bad_arg; + st->start = value; + } + break; + case CELT_SET_END_BAND_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<1 || value>st->mode->nbEBands) + goto bad_arg; + st->end = value; + } + break; + case CELT_SET_PREDICTION_REQUEST: + { + int value = va_arg(ap, opus_int32); + if (value<0 || value>2) + goto bad_arg; + st->disable_pf = value<=1; + st->force_intra = value==0; + } + break; + case OPUS_SET_PACKET_LOSS_PERC_REQUEST: + { + int value = va_arg(ap, opus_int32); + if (value<0 || value>100) + goto bad_arg; + st->loss_rate = value; + } + break; + case OPUS_SET_VBR_CONSTRAINT_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->constrained_vbr = value; + } + break; + case OPUS_SET_VBR_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->vbr = value; + } + break; + case OPUS_SET_BITRATE_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<=500 && value!=OPUS_BITRATE_MAX) + goto bad_arg; + value = IMIN(value, 260000*st->channels); + st->bitrate = value; + } + break; + case CELT_SET_CHANNELS_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<1 || value>2) + goto bad_arg; + st->stream_channels = value; + } + break; + case OPUS_SET_LSB_DEPTH_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<8 || value>24) + goto bad_arg; + st->lsb_depth=value; + } + break; + case OPUS_GET_LSB_DEPTH_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + *value=st->lsb_depth; + } + break; + case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->variable_duration = value; + } + break; + case OPUS_RESET_STATE: + { + int i; + opus_val16 *oldBandE, *oldLogE, *oldLogE2; + oldBandE = (opus_val16*)(st->in_mem+st->channels*(st->mode->overlap+COMBFILTER_MAXPERIOD)); + oldLogE = oldBandE + st->channels*st->mode->nbEBands; + oldLogE2 = oldLogE + st->channels*st->mode->nbEBands; + OPUS_CLEAR((char*)&st->ENCODER_RESET_START, + opus_custom_encoder_get_size(st->mode, st->channels)- + ((char*)&st->ENCODER_RESET_START - (char*)st)); + for (i=0;ichannels*st->mode->nbEBands;i++) + oldLogE[i]=oldLogE2[i]=-QCONST16(28.f,DB_SHIFT); + st->vbr_offset = 0; + st->delayedIntra = 1; + st->spread_decision = SPREAD_NORMAL; + st->tonal_average = 256; + st->hf_average = 0; + st->tapset_decision = 0; + } + break; +#ifdef CUSTOM_MODES + case CELT_SET_INPUT_CLIPPING_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->clip = value; + } + break; +#endif + case CELT_SET_SIGNALLING_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->signalling = value; + } + break; + case CELT_SET_ANALYSIS_REQUEST: + { + AnalysisInfo *info = va_arg(ap, AnalysisInfo *); + if (info) + OPUS_COPY(&st->analysis, info, 1); + } + break; + case CELT_GET_MODE_REQUEST: + { + const CELTMode ** value = va_arg(ap, const CELTMode**); + if (value==0) + goto bad_arg; + *value=st->mode; + } + break; + case OPUS_GET_FINAL_RANGE_REQUEST: + { + opus_uint32 * value = va_arg(ap, opus_uint32 *); + if (value==0) + goto bad_arg; + *value=st->rng; + } + break; + case OPUS_SET_LFE_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->lfe = value; + } + break; + case OPUS_SET_ENERGY_MASK_REQUEST: + { + opus_val16 *value = va_arg(ap, opus_val16*); + st->energy_mask = value; + } + break; + default: + goto bad_request; + } + va_end(ap); + return OPUS_OK; 
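Both opus_custom_decoder_ctl() earlier and the opus_custom_encoder_ctl() ending here follow one pattern: a varargs switch where each request pulls a value or pointer off the list, validates it, and funnels failures through shared bad_arg/bad_request exits so va_end() always runs. A self-contained sketch of that dispatcher shape with toy request codes (not the real CELT_*/OPUS_* macros):

    #include <stdarg.h>
    #include <stdio.h>

    enum { SET_GAIN = 1, GET_GAIN = 2 }; /* toy request codes */
    #define TOY_OK 0
    #define TOY_BAD_ARG (-1)
    #define TOY_UNIMPLEMENTED (-5)

    typedef struct { int gain; } toy_state;

    /* Fetch the vararg, validate, store or report; shared error exits
       guarantee va_end() runs on every path. */
    static int toy_ctl(toy_state *st, int request, ...)
    {
        va_list ap;
        va_start(ap, request);
        switch (request) {
        case SET_GAIN: {
            int value = va_arg(ap, int);
            if (value < 0 || value > 100)
                goto bad_arg;
            st->gain = value;
        } break;
        case GET_GAIN: {
            int *value = va_arg(ap, int *);
            if (value == NULL)
                goto bad_arg;
            *value = st->gain;
        } break;
        default:
            goto bad_request;
        }
        va_end(ap);
        return TOY_OK;
    bad_arg:
        va_end(ap);
        return TOY_BAD_ARG;
    bad_request:
        va_end(ap);
        return TOY_UNIMPLEMENTED;
    }

    int main(void)
    {
        toy_state st = {0};
        int g = -1;
        printf("set: %d\n", toy_ctl(&st, SET_GAIN, 42)); /*  0 */
        printf("get: %d ", toy_ctl(&st, GET_GAIN, &g));  /*  0 */
        printf("gain=%d\n", g);                          /* 42 */
        printf("bad: %d\n", toy_ctl(&st, 99));           /* -5 */
        return 0;
    }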
+bad_arg: + va_end(ap); + return OPUS_BAD_ARG; +bad_request: + va_end(ap); + return OPUS_UNIMPLEMENTED; +} diff --git a/thirdparty/opus/celt/celt_lpc.c b/thirdparty/opus/celt/celt_lpc.c new file mode 100644 index 000000000000..b410a21c5f6b --- /dev/null +++ b/thirdparty/opus/celt/celt_lpc.c @@ -0,0 +1,314 @@ +/* Copyright (c) 2009-2010 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "pitch.h" + +void _celt_lpc( + opus_val16 *_lpc, /* out: [0...p-1] LPC coefficients */ +const opus_val32 *ac, /* in: [0...p] autocorrelation values */ +int p +) +{ + int i, j; + opus_val32 r; + opus_val32 error = ac[0]; +#ifdef FIXED_POINT + opus_val32 lpc[LPC_ORDER]; +#else + float *lpc = _lpc; +#endif + + OPUS_CLEAR(lpc, p); + if (ac[0] != 0) + { + for (i = 0; i < p; i++) { + /* Sum up this iteration's reflection coefficient */ + opus_val32 rr = 0; + for (j = 0; j < i; j++) + rr += MULT32_32_Q31(lpc[j],ac[i - j]); + rr += SHR32(ac[i + 1],3); + r = -frac_div32(SHL32(rr,3), error); + /* Update LPC coefficients and total error */ + lpc[i] = SHR32(r,3); + for (j = 0; j < (i+1)>>1; j++) + { + opus_val32 tmp1, tmp2; + tmp1 = lpc[j]; + tmp2 = lpc[i-1-j]; + lpc[j] = tmp1 + MULT32_32_Q31(r,tmp2); + lpc[i-1-j] = tmp2 + MULT32_32_Q31(r,tmp1); + } + + error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error); + /* Bail out once we get 30 dB gain */ +#ifdef FIXED_POINT + if (error=1;j--) + { + mem[j]=mem[j-1]; + } + mem[0] = ROUND16(sum,SIG_SHIFT); + _y[i] = sum; + } +#else + int i,j; + VARDECL(opus_val16, rden); + VARDECL(opus_val16, y); + SAVE_STACK; + + celt_assert((ord&3)==0); + ALLOC(rden, ord, opus_val16); + ALLOC(y, N+ord, opus_val16); + for(i=0;i0); + celt_assert(overlap>=0); + if (overlap == 0) + { + xptr = x; + } else { + for (i=0;i0) + { + for(i=0;i= 536870912) + { + int shift2=1; + if (ac[0] >= 1073741824) + shift2++; + for (i=0;i<=lag;i++) + ac[i] = SHR32(ac[i], shift2); + shift += shift2; + } +#endif + + RESTORE_STACK; + return shift; +} diff --git a/thirdparty/opus/celt/celt_lpc.h b/thirdparty/opus/celt/celt_lpc.h new file mode 100644 index 000000000000..323459eb1a83 --- /dev/null +++ 
b/thirdparty/opus/celt/celt_lpc.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2009-2010 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef PLC_H +#define PLC_H + +#include "arch.h" +#include "cpu_support.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#include "x86/celt_lpc_sse.h" +#endif + +#define LPC_ORDER 24 + +void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p); + +void celt_fir_c( + const opus_val16 *x, + const opus_val16 *num, + opus_val16 *y, + int N, + int ord, + opus_val16 *mem, + int arch); + +#if !defined(OVERRIDE_CELT_FIR) +#define celt_fir(x, num, y, N, ord, mem, arch) \ + (celt_fir_c(x, num, y, N, ord, mem, arch)) +#endif + +void celt_iir(const opus_val32 *x, + const opus_val16 *den, + opus_val32 *y, + int N, + int ord, + opus_val16 *mem, + int arch); + +int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, + const opus_val16 *window, int overlap, int lag, int n, int arch); + +#endif /* PLC_H */ diff --git a/thirdparty/opus/celt/cpu_support.h b/thirdparty/opus/celt/cpu_support.h new file mode 100644 index 000000000000..68fc60678f6a --- /dev/null +++ b/thirdparty/opus/celt/cpu_support.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
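_celt_lpc() in celt_lpc.c above is the classic Levinson-Durbin recursion: each order derives a reflection coefficient from the autocorrelation, updates the coefficients symmetrically in place, and shrinks the residual error (the fixed-point build also bails out once it reaches about 30 dB of prediction gain). A plain float sketch of the same recursion, without the Q-format scaling or the early exit; the test autocorrelation is invented:

    #include <stdio.h>
    #include <string.h>

    /* Solve for LPC coefficients given autocorrelation ac[0..p]. */
    static void lpc_from_autocorr(float *lpc, const float *ac, int p)
    {
        int i, j;
        float error = ac[0];
        memset(lpc, 0, p * sizeof(*lpc));
        if (ac[0] == 0)
            return;
        for (i = 0; i < p; i++) {
            /* Reflection coefficient for this order. */
            float rr = 0, r;
            for (j = 0; j < i; j++)
                rr += lpc[j] * ac[i - j];
            rr += ac[i + 1];
            r = -rr / error;
            /* Symmetric in-place update of the coefficients so far. */
            lpc[i] = r;
            for (j = 0; j < (i + 1) >> 1; j++) {
                float tmp1 = lpc[j], tmp2 = lpc[i - 1 - j];
                lpc[j]         = tmp1 + r * tmp2;
                lpc[i - 1 - j] = tmp2 + r * tmp1;
            }
            error -= r * r * error; /* residual error always shrinks */
        }
    }

    int main(void)
    {
        /* Autocorrelation of a strongly correlated (low-pass) toy signal. */
        const float ac[5] = {1.f, 0.9f, 0.8f, 0.7f, 0.6f};
        float lpc[4];
        int i;
        lpc_from_autocorr(lpc, ac, 4);
        for (i = 0; i < 4; i++)
            printf("a[%d] = %+.4f\n", i + 1, lpc[i]);
        return 0;
    }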
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef CPU_SUPPORT_H +#define CPU_SUPPORT_H + +#include "opus_types.h" +#include "opus_defines.h" + +#if defined(OPUS_HAVE_RTCD) && \ + (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +#include "arm/armcpu.h" + +/* We currently support 4 ARM variants: + * arch[0] -> ARMv4 + * arch[1] -> ARMv5E + * arch[2] -> ARMv6 + * arch[3] -> NEON + */ +#define OPUS_ARCHMASK 3 + +#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ + (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)) + +#include "x86/x86cpu.h" +/* We currently support 5 x86 variants: + * arch[0] -> non-sse + * arch[1] -> sse + * arch[2] -> sse2 + * arch[3] -> sse4.1 + * arch[4] -> avx + */ +#define OPUS_ARCHMASK 7 +int opus_select_arch(void); + +#else +#define OPUS_ARCHMASK 0 + +static OPUS_INLINE int opus_select_arch(void) +{ + return 0; +} +#endif +#endif diff --git a/thirdparty/opus/celt/cwrs.c b/thirdparty/opus/celt/cwrs.c new file mode 100644 index 000000000000..9722f0ac86c9 --- /dev/null +++ b/thirdparty/opus/celt/cwrs.c @@ -0,0 +1,715 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2007-2009 Timothy B. Terriberry + Written by Timothy B. Terriberry and Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "os_support.h" +#include "cwrs.h" +#include "mathops.h" +#include "arch.h" + +#ifdef CUSTOM_MODES + +/*Guaranteed to return a conservatively large estimate of the binary logarithm + with frac bits of fractional precision. 
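cpu_support.h above reduces runtime CPU detection to a small integer: opus_select_arch() probes the CPU once, OPUS_ARCHMASK bounds the result, and hot functions are dispatched through per-arch tables indexed by (arch & OPUS_ARCHMASK). A sketch of that table-dispatch pattern with stand-in implementations (all entries alias the C version here; in the real code the upper slots would be SSE/NEON variants):

    #include <stdio.h>

    #define ARCHMASK 3 /* same role as OPUS_ARCHMASK: bounds the index */

    /* C fallback; specialised variants would share this signature. */
    static int inner_prod_c(const int *x, const int *y, int n)
    {
        int i, s = 0;
        for (i = 0; i < n; i++)
            s += x[i] * y[i];
        return s;
    }

    static int (*const inner_prod_impl[ARCHMASK + 1])(const int *,
                                                      const int *, int) = {
        inner_prod_c, /* arch 0: plain C      */
        inner_prod_c, /* arch 1: e.g. SSE     */
        inner_prod_c, /* arch 2: e.g. SSE2    */
        inner_prod_c, /* arch 3: e.g. SSE4.1  */
    };

    /* Stub; the real opus_select_arch() probes CPUID or /proc. */
    static int select_arch(void) { return 2; }

    #define inner_prod(x, y, n, arch) \
        (inner_prod_impl[(arch) & ARCHMASK](x, y, n))

    int main(void)
    {
        const int a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1};
        int arch = select_arch();
        printf("arch=%d dot=%d\n", arch, inner_prod(a, b, 4, arch));
        return 0; /* prints dot=20 */
    }

Masking the index is what lets builds that presume an instruction set (the OPUS_X86_PRESUME_* case) collapse the whole table to a single entry with no runtime cost.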
+ Tested for all possible 32-bit inputs with frac=4, where the maximum + overestimation is 0.06254243 bits.*/ +int log2_frac(opus_uint32 val, int frac) +{ + int l; + l=EC_ILOG(val); + if(val&(val-1)){ + /*This is (val>>l-16), but guaranteed to round up, even if adding a bias + before the shift would cause overflow (e.g., for 0xFFFFxxxx). + Doesn't work for val=0, but that case fails the test above.*/ + if(l>16)val=((val-1)>>(l-16))+1; + else val<<=16-l; + l=(l-1)<<frac; + do{ + int b; + b=(int)(val>>16); + l+=b<<frac; + val>>=b; + val=(val*val+0x7FFF)>>15; + } + while(frac-->0); + /*If val is not exactly 0x8000, then we have to round up the remainder.*/ + return l+(val>0x8000); + } + /*Exact powers of two require no rounding.*/ + else return (l-1)<<frac; +} +#endif + +/*Although derived separately, the pulse vector coding scheme is equivalent to + a Pyramid Vector Quantizer \cite{Fis86}. + Some additional notes about an early version appear at + http://people.xiph.org/~tterribe/notes/cwrs.html, but the codebook ordering + and the definitions of some terms have evolved since that was written. + + The conversion from a pulse vector to an integer index (encoding) and back + (decoding) is governed by two related functions, V(N,K) and U(N,K). + + V(N,K) = the number of combinations, with replacement, of N items, taken up + to K at a time, when a sign bit is added to each item taken at least once + (i.e., the number of N-dimensional unit pulse vectors with K pulses). + One way to compute this is via + V(N,K) = K>0 ? sum(k=1...K,2**k*choose(N,k)*choose(K-1,k-1)) : 1, + where choose() is the binomial function. + A table of values for N<10 and K<10 looks like: + V[10][10] = { + {1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {1, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + {1, 4, 8, 12, 16, 20, 24, 28, 32, 36}, + {1, 6, 18, 38, 66, 102, 146, 198, 258, 326}, + {1, 8, 32, 88, 192, 360, 608, 952, 1408, 1992}, + {1, 10, 50, 170, 450, 1002, 1970, 3530, 5890, 9290}, + {1, 12, 72, 292, 912, 2364, 5336, 10836, 20256, 35436}, + {1, 14, 98, 462, 1666, 4942, 12642, 28814, 59906, 115598}, + {1, 16, 128, 688, 2816, 9424, 27008, 68464, 157184, 332688}, + {1, 18, 162, 978, 4482, 16722, 53154, 148626, 374274, 864146} + }; + + U(N,K) = the number of such combinations wherein N-1 objects are taken at + most K-1 at a time. + This is given by + U(N,K) = sum(k=0...K-1,V(N-1,k)) + = K>0 ? (V(N-1,K-1) + V(N,K-1))/2 : 0. + The latter expression also makes clear that U(N,K) is half the number of such + combinations wherein the first object is taken at least once. + Although it may not be clear from either of these definitions, U(N,K) is the + natural function to work with when enumerating the pulse vector codebooks, + not V(N,K). + U(N,K) is not well-defined for N=0, but with the extension + U(0,K) = K>0 ? 0 : 1, + the function becomes symmetric: U(N,K) = U(K,N), with a similar table: + U[10][10] = { + {1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 1, 3, 5, 7, 9, 11, 13, 15, 17}, + {0, 1, 5, 13, 25, 41, 61, 85, 113, 145}, + {0, 1, 7, 25, 63, 129, 231, 377, 575, 833}, + {0, 1, 9, 41, 129, 321, 681, 1289, 2241, 3649}, + {0, 1, 11, 61, 231, 681, 1683, 3653, 7183, 13073}, + {0, 1, 13, 85, 377, 1289, 3653, 8989, 19825, 40081}, + {0, 1, 15, 113, 575, 2241, 7183, 19825, 48639, 108545}, + {0, 1, 17, 145, 833, 3649, 13073, 40081, 108545, 265729} + }; + + With this extension, V(N,K) may be written in terms of U(N,K): + V(N,K) = U(N,K) + U(N,K+1) + for all N>=0, K>=0. + Thus U(N,K+1) represents the number of combinations where the first element + is positive or zero, and U(N,K) represents the number of combinations where + it is negative. + With a large enough table of U(N,K) values, we could write O(N) encoding + and O(min(N*log(K),N+K)) decoding routines, but such a table would be + prohibitively large for small embedded devices (K may be as large as 32767 + for small N, and N may be as large as 200). + + Both functions obey the same recurrence relation: + V(N,K) = V(N-1,K) + V(N,K-1) + V(N-1,K-1), + U(N,K) = U(N-1,K) + U(N,K-1) + U(N-1,K-1), + for all N>0, K>0, with different initial conditions at N=0 or K=0. + This allows us to construct a row of one of the tables above given the + previous row or the next row. + Thus we can derive O(NK) encoding and decoding routines with O(K) memory + using only addition and subtraction.
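The shared recurrence is easy to check numerically. The following standalone sketch (illustrative only, not part of the patch) rebuilds the first rows of U(N,K) from the recurrence and initial conditions quoted above and compares a few entries against the tables:

```c
/* Illustrative sketch, not part of the patch: rebuild U(N,K) for N,K < 10
   from U(N,K) = U(N-1,K) + U(N,K-1) + U(N-1,K-1) with the stated initial
   conditions, then compare against the tables quoted in the comment. */
#include <assert.h>

int main(void){
  unsigned u[10][10] = {{0}};
  int n, k;
  u[0][0] = 1;                     /* U(0,K) = K>0 ? 0 : 1; U(N,0) = 0 for N>0 */
  for(n = 1; n < 10; n++)
    for(k = 1; k < 10; k++)
      u[n][k] = u[n-1][k] + u[n][k-1] + u[n-1][k-1];
  assert(u[3][3] == 13);           /* matches U[3][3] in the table above */
  assert(u[9][9] == 265729);       /* matches U[9][9] */
  assert(u[3][3] + u[3][4] == 38); /* V(3,3) = U(3,3)+U(3,4), matches V[3][3] */
  return 0;
}
```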
+ + When encoding, we build up from the U(2,K) row and work our way forwards. + When decoding, we need to start at the U(N,K) row and work our way backwards, + which requires a means of computing U(N,K). + U(N,K) may be computed from two previous values with the same N: + U(N,K) = ((2*N-1)*U(N,K-1) - U(N,K-2))/(K-1) + U(N,K-2) + for all N>1, and since U(N,K) is symmetric, a similar relation holds for two + previous values with the same K: + U(N,K>1) = ((2*K-1)*U(N-1,K) - U(N-2,K))/(N-1) + U(N-2,K) + for all K>1. + This allows us to construct an arbitrary row of the U(N,K) table by starting + with the first two values, which are constants. + This saves roughly 2/3 the work in our O(NK) decoding routine, but costs O(K) + multiplications. + Similar relations can be derived for V(N,K), but are not used here. + + For N>0 and K>0, U(N,K) and V(N,K) take on the form of an (N-1)-degree + polynomial for fixed N. + The first few are + U(1,K) = 1, + U(2,K) = 2*K-1, + U(3,K) = (2*K-2)*K+1, + U(4,K) = (((4*K-6)*K+8)*K-3)/3, + U(5,K) = ((((2*K-4)*K+10)*K-8)*K+3)/3, + and + V(1,K) = 2, + V(2,K) = 4*K, + V(3,K) = 4*K*K+2, + V(4,K) = 8*(K*K+2)*K/3, + V(5,K) = ((4*K*K+20)*K*K+6)/3, + for all K>0. + This allows us to derive O(N) encoding and O(N*log(K)) decoding routines for + small N (and indeed decoding is also O(N) for N<3). + + @ARTICLE{Fis86, + author="Thomas R. Fischer", + title="A Pyramid Vector Quantizer", + journal="IEEE Transactions on Information Theory", + volume="IT-32", + number=4, + pages="568--583", + month=Jul, + year=1986 + }*/ + +#if !defined(SMALL_FOOTPRINT) + +/*U(N,K) = U(K,N) := N>0?K>0?U(N-1,K)+U(N,K-1)+U(N-1,K-1):0:K>0?1:0*/ +# define CELT_PVQ_U(_n,_k) (CELT_PVQ_U_ROW[IMIN(_n,_k)][IMAX(_n,_k)]) +/*V(N,K) := U(N,K)+U(N,K+1) = the number of PVQ codewords for a band of size N + with K pulses allocated to it.*/ +# define CELT_PVQ_V(_n,_k) (CELT_PVQ_U(_n,_k)+CELT_PVQ_U(_n,(_k)+1)) + +/*For each V(N,K) supported, we will access element U(min(N,K+1),max(N,K+1)). + Thus, the number of entries in row I is the larger of the maximum number of + pulses we will ever allocate for a given N=I (K=128, or however many fit in + 32 bits, whichever is smaller), plus one, and the maximum N for which + K=I-1 pulses fit in 32 bits. + The largest band size in an Opus Custom mode is 208. 
+ Otherwise, we can limit things to the set of N which can be achieved by + splitting a band from a standard Opus mode: 176, 144, 96, 88, 72, 64, 48, + 44, 36, 32, 24, 22, 18, 16, 8, 4, 2).*/ +#if defined(CUSTOM_MODES) +static const opus_uint32 CELT_PVQ_U_DATA[1488]={ +#else +static const opus_uint32 CELT_PVQ_U_DATA[1272]={ +#endif + /*N=0, K=0...176:*/ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +#if defined(CUSTOM_MODES) + /*...208:*/ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, +#endif + /*N=1, K=1...176:*/ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +#if defined(CUSTOM_MODES) + /*...208:*/ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, +#endif + /*N=2, K=2...176:*/ + 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, + 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, + 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139, 141, 143, + 145, 147, 149, 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, + 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203, + 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, + 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, 257, 259, 261, 263, + 265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293, + 295, 297, 299, 301, 303, 305, 307, 309, 311, 313, 315, 317, 319, 321, 323, + 325, 327, 329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351, +#if defined(CUSTOM_MODES) + /*...208:*/ + 353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 377, 379, 381, + 383, 385, 387, 389, 391, 393, 395, 397, 399, 401, 403, 405, 407, 409, 411, + 413, 415, +#endif + /*N=3, K=3...176:*/ + 13, 25, 41, 61, 85, 113, 145, 181, 221, 265, 313, 365, 421, 481, 545, 613, + 685, 761, 841, 925, 1013, 1105, 1201, 1301, 1405, 1513, 1625, 1741, 1861, + 1985, 2113, 2245, 2381, 2521, 2665, 2813, 2965, 3121, 3281, 3445, 3613, 3785, + 3961, 4141, 4325, 4513, 4705, 4901, 5101, 5305, 5513, 5725, 5941, 6161, 6385, + 6613, 6845, 7081, 7321, 7565, 7813, 8065, 8321, 8581, 8845, 9113, 9385, 9661, + 9941, 10225, 10513, 10805, 11101, 11401, 11705, 12013, 12325, 12641, 12961, + 13285, 13613, 13945, 14281, 14621, 14965, 15313, 15665, 16021, 16381, 16745, + 17113, 17485, 17861, 18241, 18625, 19013, 19405, 19801, 20201, 20605, 21013, + 21425, 21841, 22261, 22685, 23113, 23545, 23981, 24421, 
24865, 25313, 25765, + 26221, 26681, 27145, 27613, 28085, 28561, 29041, 29525, 30013, 30505, 31001, + 31501, 32005, 32513, 33025, 33541, 34061, 34585, 35113, 35645, 36181, 36721, + 37265, 37813, 38365, 38921, 39481, 40045, 40613, 41185, 41761, 42341, 42925, + 43513, 44105, 44701, 45301, 45905, 46513, 47125, 47741, 48361, 48985, 49613, + 50245, 50881, 51521, 52165, 52813, 53465, 54121, 54781, 55445, 56113, 56785, + 57461, 58141, 58825, 59513, 60205, 60901, 61601, +#if defined(CUSTOM_MODES) + /*...208:*/ + 62305, 63013, 63725, 64441, 65161, 65885, 66613, 67345, 68081, 68821, 69565, + 70313, 71065, 71821, 72581, 73345, 74113, 74885, 75661, 76441, 77225, 78013, + 78805, 79601, 80401, 81205, 82013, 82825, 83641, 84461, 85285, 86113, +#endif + /*N=4, K=4...176:*/ + 63, 129, 231, 377, 575, 833, 1159, 1561, 2047, 2625, 3303, 4089, 4991, 6017, + 7175, 8473, 9919, 11521, 13287, 15225, 17343, 19649, 22151, 24857, 27775, + 30913, 34279, 37881, 41727, 45825, 50183, 54809, 59711, 64897, 70375, 76153, + 82239, 88641, 95367, 102425, 109823, 117569, 125671, 134137, 142975, 152193, + 161799, 171801, 182207, 193025, 204263, 215929, 228031, 240577, 253575, + 267033, 280959, 295361, 310247, 325625, 341503, 357889, 374791, 392217, + 410175, 428673, 447719, 467321, 487487, 508225, 529543, 551449, 573951, + 597057, 620775, 645113, 670079, 695681, 721927, 748825, 776383, 804609, + 833511, 863097, 893375, 924353, 956039, 988441, 1021567, 1055425, 1090023, + 1125369, 1161471, 1198337, 1235975, 1274393, 1313599, 1353601, 1394407, + 1436025, 1478463, 1521729, 1565831, 1610777, 1656575, 1703233, 1750759, + 1799161, 1848447, 1898625, 1949703, 2001689, 2054591, 2108417, 2163175, + 2218873, 2275519, 2333121, 2391687, 2451225, 2511743, 2573249, 2635751, + 2699257, 2763775, 2829313, 2895879, 2963481, 3032127, 3101825, 3172583, + 3244409, 3317311, 3391297, 3466375, 3542553, 3619839, 3698241, 3777767, + 3858425, 3940223, 4023169, 4107271, 4192537, 4278975, 4366593, 4455399, + 4545401, 4636607, 4729025, 4822663, 4917529, 5013631, 5110977, 5209575, + 5309433, 5410559, 5512961, 5616647, 5721625, 5827903, 5935489, 6044391, + 6154617, 6266175, 6379073, 6493319, 6608921, 6725887, 6844225, 6963943, + 7085049, 7207551, +#if defined(CUSTOM_MODES) + /*...208:*/ + 7331457, 7456775, 7583513, 7711679, 7841281, 7972327, 8104825, 8238783, + 8374209, 8511111, 8649497, 8789375, 8930753, 9073639, 9218041, 9363967, + 9511425, 9660423, 9810969, 9963071, 10116737, 10271975, 10428793, 10587199, + 10747201, 10908807, 11072025, 11236863, 11403329, 11571431, 11741177, + 11912575, +#endif + /*N=5, K=5...176:*/ + 321, 681, 1289, 2241, 3649, 5641, 8361, 11969, 16641, 22569, 29961, 39041, + 50049, 63241, 78889, 97281, 118721, 143529, 172041, 204609, 241601, 283401, + 330409, 383041, 441729, 506921, 579081, 658689, 746241, 842249, 947241, + 1061761, 1186369, 1321641, 1468169, 1626561, 1797441, 1981449, 2179241, + 2391489, 2618881, 2862121, 3121929, 3399041, 3694209, 4008201, 4341801, + 4695809, 5071041, 5468329, 5888521, 6332481, 6801089, 7295241, 7815849, + 8363841, 8940161, 9545769, 10181641, 10848769, 11548161, 12280841, 13047849, + 13850241, 14689089, 15565481, 16480521, 17435329, 18431041, 19468809, + 20549801, 21675201, 22846209, 24064041, 25329929, 26645121, 28010881, + 29428489, 30899241, 32424449, 34005441, 35643561, 37340169, 39096641, + 40914369, 42794761, 44739241, 46749249, 48826241, 50971689, 53187081, + 55473921, 57833729, 60268041, 62778409, 65366401, 68033601, 70781609, + 73612041, 76526529, 79526721, 82614281, 85790889, 89058241, 
92418049, + 95872041, 99421961, 103069569, 106816641, 110664969, 114616361, 118672641, + 122835649, 127107241, 131489289, 135983681, 140592321, 145317129, 150160041, + 155123009, 160208001, 165417001, 170752009, 176215041, 181808129, 187533321, + 193392681, 199388289, 205522241, 211796649, 218213641, 224775361, 231483969, + 238341641, 245350569, 252512961, 259831041, 267307049, 274943241, 282741889, + 290705281, 298835721, 307135529, 315607041, 324252609, 333074601, 342075401, + 351257409, 360623041, 370174729, 379914921, 389846081, 399970689, 410291241, + 420810249, 431530241, 442453761, 453583369, 464921641, 476471169, 488234561, + 500214441, 512413449, 524834241, 537479489, 550351881, 563454121, 576788929, + 590359041, 604167209, 618216201, 632508801, +#if defined(CUSTOM_MODES) + /*...208:*/ + 647047809, 661836041, 676876329, 692171521, 707724481, 723538089, 739615241, + 755958849, 772571841, 789457161, 806617769, 824056641, 841776769, 859781161, + 878072841, 896654849, 915530241, 934702089, 954173481, 973947521, 994027329, + 1014416041, 1035116809, 1056132801, 1077467201, 1099123209, 1121104041, + 1143412929, 1166053121, 1189027881, 1212340489, 1235994241, +#endif + /*N=6, K=6...96:*/ + 1683, 3653, 7183, 13073, 22363, 36365, 56695, 85305, 124515, 177045, 246047, + 335137, 448427, 590557, 766727, 982729, 1244979, 1560549, 1937199, 2383409, + 2908411, 3522221, 4235671, 5060441, 6009091, 7095093, 8332863, 9737793, + 11326283, 13115773, 15124775, 17372905, 19880915, 22670725, 25765455, + 29189457, 32968347, 37129037, 41699767, 46710137, 52191139, 58175189, + 64696159, 71789409, 79491819, 87841821, 96879431, 106646281, 117185651, + 128542501, 140763503, 153897073, 167993403, 183104493, 199284183, 216588185, + 235074115, 254801525, 275831935, 298228865, 322057867, 347386557, 374284647, + 402823977, 433078547, 465124549, 499040399, 534906769, 572806619, 612825229, + 655050231, 699571641, 746481891, 795875861, 847850911, 902506913, 959946283, + 1020274013, 1083597703, 1150027593, 1219676595, 1292660325, 1369097135, + 1449108145, 1532817275, 1620351277, 1711839767, 1807415257, 1907213187, + 2011371957, 2120032959, +#if defined(CUSTOM_MODES) + /*...109:*/ + 2233340609U, 2351442379U, 2474488829U, 2602633639U, 2736033641U, 2874848851U, + 3019242501U, 3169381071U, 3325434321U, 3487575323U, 3655980493U, 3830829623U, + 4012305913U, +#endif + /*N=7, K=7...54*/ + 8989, 19825, 40081, 75517, 134245, 227305, 369305, 579125, 880685, 1303777, + 1884961, 2668525, 3707509, 5064793, 6814249, 9041957, 11847485, 15345233, + 19665841, 24957661, 31388293, 39146185, 48442297, 59511829, 72616013, + 88043969, 106114625, 127178701, 151620757, 179861305, 212358985, 249612805, + 292164445, 340600625, 395555537, 457713341, 527810725, 606639529, 695049433, + 793950709, 904317037, 1027188385, 1163673953, 1314955181, 1482288821, + 1667010073, 1870535785, 2094367717, +#if defined(CUSTOM_MODES) + /*...60:*/ + 2340095869U, 2609401873U, 2904062449U, 3225952925U, 3577050821U, 3959439497U, +#endif + /*N=8, K=8...37*/ + 48639, 108545, 224143, 433905, 795455, 1392065, 2340495, 3800305, 5984767, + 9173505, 13726991, 20103025, 28875327, 40754369, 56610575, 77500017, + 104692735, 139703809, 184327311, 240673265, 311207743, 398796225, 506750351, + 638878193, 799538175, 993696769, 1226990095, 1505789553, 1837271615, + 2229491905U, +#if defined(CUSTOM_MODES) + /*...40:*/ + 2691463695U, 3233240945U, 3866006015U, +#endif + /*N=9, K=9...28:*/ + 265729, 598417, 1256465, 2485825, 4673345, 8405905, 14546705, 24331777, + 39490049, 62390545, 
96220561, 145198913, 214828609, 312193553, 446304145, + 628496897, 872893441, 1196924561, 1621925137, 2173806145U, +#if defined(CUSTOM_MODES) + /*...29:*/ + 2883810113U, +#endif + /*N=10, K=10...24:*/ + 1462563, 3317445, 7059735, 14218905, 27298155, 50250765, 89129247, 152951073, + 254831667, 413442773, 654862247, 1014889769, 1541911931, 2300409629U, + 3375210671U, + /*N=11, K=11...19:*/ + 8097453, 18474633, 39753273, 81270333, 158819253, 298199265, 540279585, + 948062325, 1616336765, +#if defined(CUSTOM_MODES) + /*...20:*/ + 2684641785U, +#endif + /*N=12, K=12...18:*/ + 45046719, 103274625, 224298231, 464387817, 921406335, 1759885185, + 3248227095U, + /*N=13, K=13...16:*/ + 251595969, 579168825, 1267854873, 2653649025U, + /*N=14, K=14:*/ + 1409933619 +}; + +#if defined(CUSTOM_MODES) +static const opus_uint32 *const CELT_PVQ_U_ROW[15]={ + CELT_PVQ_U_DATA+ 0,CELT_PVQ_U_DATA+ 208,CELT_PVQ_U_DATA+ 415, + CELT_PVQ_U_DATA+ 621,CELT_PVQ_U_DATA+ 826,CELT_PVQ_U_DATA+1030, + CELT_PVQ_U_DATA+1233,CELT_PVQ_U_DATA+1336,CELT_PVQ_U_DATA+1389, + CELT_PVQ_U_DATA+1421,CELT_PVQ_U_DATA+1441,CELT_PVQ_U_DATA+1455, + CELT_PVQ_U_DATA+1464,CELT_PVQ_U_DATA+1470,CELT_PVQ_U_DATA+1473 +}; +#else +static const opus_uint32 *const CELT_PVQ_U_ROW[15]={ + CELT_PVQ_U_DATA+ 0,CELT_PVQ_U_DATA+ 176,CELT_PVQ_U_DATA+ 351, + CELT_PVQ_U_DATA+ 525,CELT_PVQ_U_DATA+ 698,CELT_PVQ_U_DATA+ 870, + CELT_PVQ_U_DATA+1041,CELT_PVQ_U_DATA+1131,CELT_PVQ_U_DATA+1178, + CELT_PVQ_U_DATA+1207,CELT_PVQ_U_DATA+1226,CELT_PVQ_U_DATA+1240, + CELT_PVQ_U_DATA+1248,CELT_PVQ_U_DATA+1254,CELT_PVQ_U_DATA+1257 +}; +#endif + +#if defined(CUSTOM_MODES) +void get_required_bits(opus_int16 *_bits,int _n,int _maxk,int _frac){ + int k; + /*_maxk==0 => there's nothing to do.*/ + celt_assert(_maxk>0); + _bits[0]=0; + for(k=1;k<=_maxk;k++)_bits[k]=log2_frac(CELT_PVQ_V(_n,k),_frac); +} +#endif + +static opus_uint32 icwrs(int _n,const int *_y){ + opus_uint32 i; + int j; + int k; + celt_assert(_n>=2); + j=_n-1; + i=_y[j]<0; + k=abs(_y[j]); + do{ + j--; + i+=CELT_PVQ_U(_n-j,k); + k+=abs(_y[j]); + if(_y[j]<0)i+=CELT_PVQ_U(_n-j,k+1); + } + while(j>0); + return i; +} + +void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){ + celt_assert(_k>0); + ec_enc_uint(_enc,icwrs(_n,_y),CELT_PVQ_V(_n,_k)); +} + +static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y){ + opus_uint32 p; + int s; + int k0; + opus_int16 val; + opus_val32 yy=0; + celt_assert(_k>0); + celt_assert(_n>1); + while(_n>2){ + opus_uint32 q; + /*Lots of pulses case:*/ + if(_k>=_n){ + const opus_uint32 *row; + row=CELT_PVQ_U_ROW[_n]; + /*Are the pulses in this dimension negative?*/ + p=row[_k+1]; + s=-(_i>=p); + _i-=p&s; + /*Count how many pulses were placed in this dimension.*/ + k0=_k; + q=row[_n]; + if(q>_i){ + celt_assert(p>q); + _k=_n; + do p=CELT_PVQ_U_ROW[--_k][_n]; + while(p>_i); + } + else for(p=row[_k];p>_i;p=row[_k])_k--; + _i-=p; + val=(k0-_k+s)^s; + *_y++=val; + yy=MAC16_16(yy,val,val); + } + /*Lots of dimensions case:*/ + else{ + /*Are there any pulses in this dimension at all?*/ + p=CELT_PVQ_U_ROW[_k][_n]; + q=CELT_PVQ_U_ROW[_k+1][_n]; + if(p<=_i&&_i<q){ + /*If no, move on to the next dimension.*/ + _i-=p; + *_y++=0; + } + else{ + /*If yes, is it negative?*/ + s=-(_i>=q); + _i-=q&s; + /*Count how many pulses were placed in this dimension.*/ + k0=_k; + do p=CELT_PVQ_U_ROW[--_k][_n]; + while(p>_i); + _i-=p; + val=(k0-_k+s)^s; + *_y++=val; + yy=MAC16_16(yy,val,val); + } + } + _n--; + } + /*_n==2*/ + p=2*_k+1; + s=-(_i>=p); + _i-=p&s; + k0=_k; + _k=(_i+1)>>1; + if(_k)_i-=2*_k-1; + val=(k0-_k+s)^s; + *_y++=val; + yy=MAC16_16(yy,val,val); + /*_n==1*/ + s=-(int)_i; + val=(_k+s)^s; + *_y=val; +
yy=MAC16_16(yy,val,val); + return yy; +} + +opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){ + return cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y); +} + +#else /* SMALL_FOOTPRINT */ + +/*Computes the next row/column of any recurrence that obeys the relation + u[i][j]=u[i-1][j]+u[i][j-1]+u[i-1][j-1]. + _ui0 is the base case for the new row/column.*/ +static OPUS_INLINE void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){ + opus_uint32 ui1; + unsigned j; + /*This do-while will overrun the array if we don't have storage for at least + 2 values.*/ + j=1; do { + ui1=UADD32(UADD32(_ui[j],_ui[j-1]),_ui0); + _ui[j-1]=_ui0; + _ui0=ui1; + } while (++j<_len); + _ui[j-1]=_ui0; +} + +/*Computes the previous row/column of any recurrence that obeys the relation + u[i-1][j]=u[i][j]-u[i][j-1]-u[i-1][j-1]. + _ui0 is the base case for the new row/column.*/ +static OPUS_INLINE void uprev(opus_uint32 *_ui,unsigned _n,opus_uint32 _ui0){ + opus_uint32 ui1; + unsigned j; + /*This do-while will overrun the array if we don't have storage for at least + 2 values.*/ + j=1; do { + ui1=USUB32(USUB32(_ui[j],_ui[j-1]),_ui0); + _ui[j-1]=_ui0; + _ui0=ui1; + } while (++j<_n); + _ui[j-1]=_ui0; +} + +/*Compute V(_n,_k), as well as U(_n,0..._k+1). + _u: On exit, _u[i] contains U(_n,i) for i in [0..._k+1].*/ +static opus_uint32 ncwrs_urow(unsigned _n,unsigned _k,opus_uint32 *_u){ + opus_uint32 um2; + unsigned len; + unsigned k; + len=_k+2; + /*We require storage at least 3 values (e.g., _k>0).*/ + celt_assert(len>=3); + _u[0]=0; + _u[1]=um2=1; + /*If _n==0, _u[0] should be 1 and the rest should be 0.*/ + /*If _n==1, _u[i] should be 1 for i>1.*/ + celt_assert(_n>=2); + /*If _k==0, the following do-while loop will overflow the buffer.*/ + celt_assert(_k>0); + k=2; + do _u[k]=(k<<1)-1; + while(++k<len); + for(k=2;k<_n;k++)unext(_u,_k+2,1); + return _u[_k]+_u[_k+1]; +} + +static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){ + int j; + opus_int16 val; + opus_val32 yy=0; + celt_assert(_n>0); + j=0; + do{ + opus_uint32 p; + int s; + int yj; + p=_u[_k+1]; + s=-(_i>=p); + _i-=p&s; + yj=_k; + p=_u[_k]; + while(p>_i)p=_u[--_k]; + _i-=p; + yj-=_k; + val=(yj+s)^s; + _y[j]=val; + yy=MAC16_16(yy,val,val); + uprev(_u,_k+2,0); + } + while(++j<_n); + return yy; +} + +/*Returns the index of the given combination of K elements chosen from a set + of size 1 with associated sign bits. + _y: The vector of pulses, whose sum of absolute values is K. + _k: Returns K.*/ +static OPUS_INLINE opus_uint32 icwrs1(const int *_y,int *_k){ + *_k=abs(_y[0]); + return _y[0]<0; +} + +/*Returns the index of the given combination of K elements chosen from a set + of size _n with associated sign bits. + _y: The vector of pulses, whose sum of absolute values must be _k.
+ _nc: Returns V(_n,_k).*/ +static OPUS_INLINE opus_uint32 icwrs(int _n,int _k,opus_uint32 *_nc,const int *_y, + opus_uint32 *_u){ + opus_uint32 i; + int j; + int k; + /*We can't unroll the first two iterations of the loop unless _n>=2.*/ + celt_assert(_n>=2); + _u[0]=0; + for(k=1;k<=_k+1;k++)_u[k]=(k<<1)-1; + i=icwrs1(_y+_n-1,&k); + j=_n-2; + i+=_u[k]; + k+=abs(_y[j]); + if(_y[j]<0)i+=_u[k+1]; + while(j-->0){ + unext(_u,_k+2,0); + i+=_u[k]; + k+=abs(_y[j]); + if(_y[j]<0)i+=_u[k+1]; + } + *_nc=_u[k]+_u[k+1]; + return i; +} + +#ifdef CUSTOM_MODES +void get_required_bits(opus_int16 *_bits,int _n,int _maxk,int _frac){ + int k; + /*_maxk==0 => there's nothing to do.*/ + celt_assert(_maxk>0); + _bits[0]=0; + if (_n==1) + { + for (k=1;k<=_maxk;k++) + _bits[k] = 1<<_frac; + } + else { + VARDECL(opus_uint32,u); + SAVE_STACK; + ALLOC(u,_maxk+2U,opus_uint32); + ncwrs_urow(_n,_maxk,u); + for(k=1;k<=_maxk;k++) + _bits[k]=log2_frac(u[k]+u[k+1],_frac); + RESTORE_STACK; + } +} +#endif /* CUSTOM_MODES */ + +void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){ + opus_uint32 i; + VARDECL(opus_uint32,u); + opus_uint32 nc; + SAVE_STACK; + celt_assert(_k>0); + ALLOC(u,_k+2U,opus_uint32); + i=icwrs(_n,_k,&nc,_y,u); + ec_enc_uint(_enc,i,nc); + RESTORE_STACK; +} + +opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){ + VARDECL(opus_uint32,u); + int ret; + SAVE_STACK; + celt_assert(_k>0); + ALLOC(u,_k+2U,opus_uint32); + ret = cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u); + RESTORE_STACK; + return ret; +} + +#endif /* SMALL_FOOTPRINT */ diff --git a/thirdparty/opus/celt/cwrs.h b/thirdparty/opus/celt/cwrs.h new file mode 100644 index 000000000000..7cd47174593c --- /dev/null +++ b/thirdparty/opus/celt/cwrs.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2007-2009 Timothy B. Terriberry + Written by Timothy B. Terriberry and Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef CWRS_H +#define CWRS_H + +#include "arch.h" +#include "stack_alloc.h" +#include "entenc.h" +#include "entdec.h" + +#ifdef CUSTOM_MODES +int log2_frac(opus_uint32 val, int frac); +#endif + +void get_required_bits(opus_int16 *bits, int N, int K, int frac); + +void encode_pulses(const int *_y, int N, int K, ec_enc *enc); + +opus_val32 decode_pulses(int *_y, int N, int K, ec_dec *dec); + +#endif /* CWRS_H */ diff --git a/thirdparty/opus/celt/ecintrin.h b/thirdparty/opus/celt/ecintrin.h new file mode 100644 index 000000000000..2263cff6bdf3 --- /dev/null +++ b/thirdparty/opus/celt/ecintrin.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2003-2008 Timothy B. Terriberry + Copyright (c) 2008 Xiph.Org Foundation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/*Some common macros for potential platform-specific optimization.*/ +#include "opus_types.h" +#include <math.h> +#include <limits.h> +#include "arch.h" +#if !defined(_ecintrin_H) +# define _ecintrin_H (1) + +/*Some specific platforms may have optimized intrinsic or OPUS_INLINE assembly + versions of these functions which can substantially improve performance. + We define macros for them to allow easy incorporation of these non-ANSI + features.*/ + +/*Modern gcc (4.x) can compile the naive versions of min and max with cmov if + given an appropriate architecture, but the branchless bit-twiddling versions + are just as fast, and do not require any special target architecture. + Earlier gcc versions (3.x) compiled both code to the same assembly + instructions, because of the way they represented ((_b)>(_a)) internally.*/ +# define EC_MINI(_a,_b) ((_a)+(((_b)-(_a))&-((_b)<(_a)))) + +/*Count leading zeros. + This macro should only be used for implementing ec_ilog(), if it is defined.
+ All other code should use EC_ILOG() instead.*/ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) +# include <intrin.h> +/*In _DEBUG mode this is not an intrinsic by default.*/ +# pragma intrinsic(_BitScanReverse) + +static __inline int ec_bsr(unsigned long _x){ + unsigned long ret; + _BitScanReverse(&ret,_x); + return (int)ret; +} +# define EC_CLZ0 (1) +# define EC_CLZ(_x) (-ec_bsr(_x)) +#elif defined(ENABLE_TI_DSPLIB) +# include "dsplib.h" +# define EC_CLZ0 (31) +# define EC_CLZ(_x) (_lnorm(_x)) +#elif __GNUC_PREREQ(3,4) +# if INT_MAX>=2147483647 +# define EC_CLZ0 ((int)sizeof(unsigned)*CHAR_BIT) +# define EC_CLZ(_x) (__builtin_clz(_x)) +# elif LONG_MAX>=2147483647L +# define EC_CLZ0 ((int)sizeof(unsigned long)*CHAR_BIT) +# define EC_CLZ(_x) (__builtin_clzl(_x)) +# endif +#endif + +#if defined(EC_CLZ) +/*Note that __builtin_clz is not defined when _x==0, according to the gcc + documentation (and that of the BSR instruction that implements it on x86). + The majority of the time we can never pass it zero. + When we need to, it can be special cased.*/ +# define EC_ILOG(_x) (EC_CLZ0-EC_CLZ(_x)) +#else +int ec_ilog(opus_uint32 _v); +# define EC_ILOG(_x) (ec_ilog(_x)) +#endif +#endif diff --git a/thirdparty/opus/celt/entcode.c b/thirdparty/opus/celt/entcode.c new file mode 100644 index 000000000000..70f32016ecee --- /dev/null +++ b/thirdparty/opus/celt/entcode.c @@ -0,0 +1,153 @@ +/* Copyright (c) 2001-2011 Timothy B. Terriberry +*/ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "entcode.h" +#include "arch.h" + +#if !defined(EC_CLZ) +/*This is a fallback for systems where we don't know how to access + a BSR or CLZ instruction (see ecintrin.h). + If you are optimizing Opus on a new platform and it has a native CLZ or + BZR (e.g.
cell, MIPS, x86, etc) then making it available to Opus will be + an easy performance win.*/ +int ec_ilog(opus_uint32 _v){ + /*On a Pentium M, this branchless version tested as the fastest on + 1,000,000,000 random 32-bit integers, edging out a similar version with + branches, and a 256-entry LUT version.*/ + int ret; + int m; + ret=!!_v; + m=!!(_v&0xFFFF0000)<<4; + _v>>=m; + ret|=m; + m=!!(_v&0xFF00)<<3; + _v>>=m; + ret|=m; + m=!!(_v&0xF0)<<2; + _v>>=m; + ret|=m; + m=!!(_v&0xC)<<1; + _v>>=m; + ret|=m; + ret+=!!(_v&0x2); + return ret; +} +#endif + +#if 1 +/* This is a faster version of ec_tell_frac() that takes advantage + of the low (1/8 bit) resolution to use just a linear function + followed by a lookup to determine the exact transition thresholds. */ +opus_uint32 ec_tell_frac(ec_ctx *_this){ + static const unsigned correction[8] = + {35733, 38967, 42495, 46340, + 50535, 55109, 60097, 65535}; + opus_uint32 nbits; + opus_uint32 r; + int l; + unsigned b; + nbits=_this->nbits_total<<BITRES; + l=EC_ILOG(_this->rng); + r=_this->rng>>(l-16); + b = (r>>12)-8; + b += r>correction[b]; + l = (l<<3)+b; + return nbits-l; +} +#else +opus_uint32 ec_tell_frac(ec_ctx *_this){ + opus_uint32 nbits; + opus_uint32 r; + int l; + int i; + /*To handle the non-integral number of bits still left in the encoder/decoder + state, we compute the worst-case number of bits of val that must be + encoded to ensure that the value is inside the range for any possible + subsequent bits. + The computation here is independent of val itself (the decoder does not + even track that value), even though the real number of bits used after + ec_enc_done() may be 1 smaller if rng is a power of two and the + corresponding trailing bits of val are all zeros. + If we did try to track that special case, then coding a value with a + probability of 1/(1<<n) might sometimes appear to use more than n bits. + This may help explain the surprising result that a newly initialized + encoder or decoder claims to have used 1 bit.*/ + nbits=_this->nbits_total<<BITRES; + l=EC_ILOG(_this->rng); + r=_this->rng>>(l-16); + for(i=BITRES;i-->0;){ + int b; + r=r*r>>15; + b=(int)(r>>16); + l=l<<1|b; + r>>=b; + } + return nbits-l; +} +#endif + +#ifdef USE_SMALL_DIV_TABLE +/* Result of 2^32/(2*i+1), except for i=0.
*/ +const opus_uint32 SMALL_DIV_TABLE[129] = { + 0xFFFFFFFF, 0x55555555, 0x33333333, 0x24924924, + 0x1C71C71C, 0x1745D174, 0x13B13B13, 0x11111111, + 0x0F0F0F0F, 0x0D79435E, 0x0C30C30C, 0x0B21642C, + 0x0A3D70A3, 0x097B425E, 0x08D3DCB0, 0x08421084, + 0x07C1F07C, 0x07507507, 0x06EB3E45, 0x06906906, + 0x063E7063, 0x05F417D0, 0x05B05B05, 0x0572620A, + 0x05397829, 0x05050505, 0x04D4873E, 0x04A7904A, + 0x047DC11F, 0x0456C797, 0x04325C53, 0x04104104, + 0x03F03F03, 0x03D22635, 0x03B5CC0E, 0x039B0AD1, + 0x0381C0E0, 0x0369D036, 0x03531DEC, 0x033D91D2, + 0x0329161F, 0x03159721, 0x03030303, 0x02F14990, + 0x02E05C0B, 0x02D02D02, 0x02C0B02C, 0x02B1DA46, + 0x02A3A0FD, 0x0295FAD4, 0x0288DF0C, 0x027C4597, + 0x02702702, 0x02647C69, 0x02593F69, 0x024E6A17, + 0x0243F6F0, 0x0239E0D5, 0x02302302, 0x0226B902, + 0x021D9EAD, 0x0214D021, 0x020C49BA, 0x02040810, + 0x01FC07F0, 0x01F44659, 0x01ECC07B, 0x01E573AC, + 0x01DE5D6E, 0x01D77B65, 0x01D0CB58, 0x01CA4B30, + 0x01C3F8F0, 0x01BDD2B8, 0x01B7D6C3, 0x01B20364, + 0x01AC5701, 0x01A6D01A, 0x01A16D3F, 0x019C2D14, + 0x01970E4F, 0x01920FB4, 0x018D3018, 0x01886E5F, + 0x0183C977, 0x017F405F, 0x017AD220, 0x01767DCE, + 0x01724287, 0x016E1F76, 0x016A13CD, 0x01661EC6, + 0x01623FA7, 0x015E75BB, 0x015AC056, 0x01571ED3, + 0x01539094, 0x01501501, 0x014CAB88, 0x0149539E, + 0x01460CBC, 0x0142D662, 0x013FB013, 0x013C995A, + 0x013991C2, 0x013698DF, 0x0133AE45, 0x0130D190, + 0x012E025C, 0x012B404A, 0x01288B01, 0x0125E227, + 0x01234567, 0x0120B470, 0x011E2EF3, 0x011BB4A4, + 0x01194538, 0x0116E068, 0x011485F0, 0x0112358E, + 0x010FEF01, 0x010DB20A, 0x010B7E6E, 0x010953F3, + 0x01073260, 0x0105197F, 0x0103091B, 0x01010101 +}; +#endif diff --git a/thirdparty/opus/celt/entcode.h b/thirdparty/opus/celt/entcode.h new file mode 100644 index 000000000000..13d6c84ef0fa --- /dev/null +++ b/thirdparty/opus/celt/entcode.h @@ -0,0 +1,152 @@ +/* Copyright (c) 2001-2011 Timothy B. Terriberry + Copyright (c) 2008-2009 Xiph.Org Foundation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "opus_types.h" +#include "opus_defines.h" + +#if !defined(_entcode_H) +# define _entcode_H (1) +# include <limits.h> +# include <stddef.h> +# include "ecintrin.h" + +extern const opus_uint32 SMALL_DIV_TABLE[129]; + +#ifdef OPUS_ARM_ASM +#define USE_SMALL_DIV_TABLE +#endif + +/*OPT: ec_window must be at least 32 bits, but if you have fast arithmetic on a + larger type, you can speed up the decoder by using it here.*/ +typedef opus_uint32 ec_window; +typedef struct ec_ctx ec_ctx; +typedef struct ec_ctx ec_enc; +typedef struct ec_ctx ec_dec; + +# define EC_WINDOW_SIZE ((int)sizeof(ec_window)*CHAR_BIT) + +/*The number of bits to use for the range-coded part of unsigned integers.*/ +# define EC_UINT_BITS (8) + +/*The resolution of fractional-precision bit usage measurements, i.e., + 3 => 1/8th bits.*/ +# define BITRES 3 + +/*The entropy encoder/decoder context. + We use the same structure for both, so that common functions like ec_tell() + can be used on either one.*/ +struct ec_ctx{ + /*Buffered input/output.*/ + unsigned char *buf; + /*The size of the buffer.*/ + opus_uint32 storage; + /*The offset at which the last byte containing raw bits was read/written.*/ + opus_uint32 end_offs; + /*Bits that will be read from/written at the end.*/ + ec_window end_window; + /*Number of valid bits in end_window.*/ + int nend_bits; + /*The total number of whole bits read/written. + This does not include partial bits currently in the range coder.*/ + int nbits_total; + /*The offset at which the next range coder byte will be read/written.*/ + opus_uint32 offs; + /*The number of values in the current range.*/ + opus_uint32 rng; + /*In the decoder: the difference between the top of the current range and + the input value, minus one. + In the encoder: the low end of the current range.*/ + opus_uint32 val; + /*In the decoder: the saved normalization factor from ec_decode(). + In the encoder: the number of oustanding carry propagating symbols.*/ + opus_uint32 ext; + /*A buffered input/output symbol, awaiting carry propagation.*/ + int rem; + /*Nonzero if an error occurred.*/ + int error; +}; + +static OPUS_INLINE opus_uint32 ec_range_bytes(ec_ctx *_this){ + return _this->offs; +} + +static OPUS_INLINE unsigned char *ec_get_buffer(ec_ctx *_this){ + return _this->buf; +} + +static OPUS_INLINE int ec_get_error(ec_ctx *_this){ + return _this->error; +} + +/*Returns the number of bits "used" by the encoded or decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +static OPUS_INLINE int ec_tell(ec_ctx *_this){ + return _this->nbits_total-EC_ILOG(_this->rng); +} + +/*Returns the number of bits "used" by the encoded or decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits scaled by 2**BITRES.
+ This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +opus_uint32 ec_tell_frac(ec_ctx *_this); + +/* Tested exhaustively for all n and for 1<=d<=256 */ +static OPUS_INLINE opus_uint32 celt_udiv(opus_uint32 n, opus_uint32 d) { + celt_assert(d>0); +#ifdef USE_SMALL_DIV_TABLE + if (d>256) + return n/d; + else { + opus_uint32 t, q; + t = EC_ILOG(d&-d); + q = (opus_uint64)SMALL_DIV_TABLE[d>>t]*(n>>(t-1))>>32; + return q+(n-q*d >= d); + } +#else + return n/d; +#endif +} + +static OPUS_INLINE opus_int32 celt_sudiv(opus_int32 n, opus_int32 d) { + celt_assert(d>0); +#ifdef USE_SMALL_DIV_TABLE + if (n<0) + return -(opus_int32)celt_udiv(-n, d); + else + return celt_udiv(n, d); +#else + return n/d; +#endif +} + +#endif diff --git a/thirdparty/opus/celt/entdec.c b/thirdparty/opus/celt/entdec.c new file mode 100644 index 000000000000..0b3433ed8b91 --- /dev/null +++ b/thirdparty/opus/celt/entdec.c @@ -0,0 +1,245 @@ +/* Copyright (c) 2001-2011 Timothy B. Terriberry + Copyright (c) 2008-2009 Xiph.Org Foundation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include "os_support.h" +#include "arch.h" +#include "entdec.h" +#include "mfrngcod.h" + +/*A range decoder. + This is an entropy decoder based upon \cite{Mar79}, which is itself a + rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}. + It is very similar to arithmetic encoding, except that encoding is done with + digits in any base, instead of with bits, and so it is faster when using + larger bases (i.e.: a byte). + The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$ + is the base, longer than the theoretical optimum, but to my knowledge there + is no published justification for this claim. + This only seems true when using near-infinite precision arithmetic so that + the process is carried out with no rounding errors. + + An excellent description of implementation details is available at + http://www.arturocampos.com/ac_range.html + A recent work \cite{MNW98} which proposes several changes to arithmetic + encoding for efficiency actually re-discovers many of the principles + behind range encoding, and presents a good theoretical analysis of them. 
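The celt_udiv() shortcut above works because multiplying by a truncated 32-bit reciprocal undershoots the true quotient by at most one, so a single compare-and-add finishes the division. A standalone sketch (illustrative only, not part of the patch; a computed reciprocal stands in for the SMALL_DIV_TABLE entries):

```c
/* Illustrative sketch, not part of the patch: for odd d, a truncated 2^32/d
   reciprocal gives a quotient that is exact or one too small, so the same
   compare-and-add used by celt_udiv() fixes it up. */
#include <assert.h>
#include <stdint.h>

int main(void){
  uint32_t d, n;
  for(d = 1; d <= 257; d += 2){               /* odd divisors, as in the table */
    uint32_t r = 0xFFFFFFFFu/d;               /* plays the role of SMALL_DIV_TABLE[d>>1] */
    for(n = 0; n < 1000000; n += 997){
      uint32_t q = (uint32_t)(((uint64_t)r*n)>>32);
      q += (n - q*d >= d);                    /* same correction step as celt_udiv() */
      assert(q == n/d);
    }
  }
  return 0;
}
```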
+ + End of stream is handled by writing out the smallest number of bits that + ensures that the stream will be correctly decoded regardless of the value of + any subsequent bits. + ec_tell() can be used to determine how many bits were needed to decode + all the symbols thus far; other data can be packed in the remaining bits of + the input buffer. + @PHDTHESIS{Pas76, + author="Richard Clark Pasco", + title="Source coding algorithms for fast data compression", + school="Dept. of Electrical Engineering, Stanford University", + address="Stanford, CA", + month=May, + year=1976 + } + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video & Data Recording Conference", + year=1979, + address="Southampton", + month=Jul + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://www.stanford.edu/class/ee398a/handouts/papers/Moffat98ArithmCoding.pdf" + }*/ + +static int ec_read_byte(ec_dec *_this){ + return _this->offs<_this->storage?_this->buf[_this->offs++]:0; +} + +static int ec_read_byte_from_end(ec_dec *_this){ + return _this->end_offs<_this->storage? + _this->buf[_this->storage-++(_this->end_offs)]:0; +} + +/*Normalizes the contents of val and rng so that rng lies entirely in the + high-order symbol.*/ +static void ec_dec_normalize(ec_dec *_this){ + /*If the range is too small, rescale it and input some bits.*/ + while(_this->rng<=EC_CODE_BOT){ + int sym; + _this->nbits_total+=EC_SYM_BITS; + _this->rng<<=EC_SYM_BITS; + /*Use up the remaining bits from our last symbol.*/ + sym=_this->rem; + /*Read the next value from the input.*/ + _this->rem=ec_read_byte(_this); + /*Take the rest of the bits we need from this new symbol.*/ + sym=(sym<<EC_SYM_BITS|_this->rem)>>(EC_SYM_BITS-EC_CODE_EXTRA); + /*And subtract them from val, capped to be less than EC_CODE_TOP.*/ + _this->val=((_this->val<<EC_SYM_BITS)+(EC_SYM_MAX&~sym))&(EC_CODE_TOP-1); + } +} + +void ec_dec_init(ec_dec *_this,unsigned char *_buf,opus_uint32 _storage){ + _this->buf=_buf; + _this->storage=_storage; + _this->end_offs=0; + _this->end_window=0; + _this->nend_bits=0; + /*This is the offset from which ec_tell() will subtract partial bits.
+ The final value after the ec_dec_normalize() call will be the same as in + the encoder, but we have to compensate for the bits that are added there.*/ + _this->nbits_total=EC_CODE_BITS+1 + -((EC_CODE_BITS-EC_CODE_EXTRA)/EC_SYM_BITS)*EC_SYM_BITS; + _this->offs=0; + _this->rng=1U<<EC_CODE_EXTRA; + _this->rem=ec_read_byte(_this); + _this->val=_this->rng-1-(_this->rem>>(EC_SYM_BITS-EC_CODE_EXTRA)); + _this->error=0; + /*Normalize the interval.*/ + ec_dec_normalize(_this); +} + +unsigned ec_decode(ec_dec *_this,unsigned _ft){ + unsigned s; + _this->ext=celt_udiv(_this->rng,_ft); + s=(unsigned)(_this->val/_this->ext); + return _ft-EC_MINI(s+1,_ft); +} + +unsigned ec_decode_bin(ec_dec *_this,unsigned _bits){ + unsigned s; + _this->ext=_this->rng>>_bits; + s=(unsigned)(_this->val/_this->ext); + return (1U<<_bits)-EC_MINI(s+1U,1U<<_bits); +} + +void ec_dec_update(ec_dec *_this,unsigned _fl,unsigned _fh,unsigned _ft){ + opus_uint32 s; + s=IMUL32(_this->ext,_ft-_fh); + _this->val-=s; + _this->rng=_fl>0?IMUL32(_this->ext,_fh-_fl):_this->rng-s; + ec_dec_normalize(_this); +} + +/*The probability of having a "one" is 1/(1<<_logp).*/ +int ec_dec_bit_logp(ec_dec *_this,unsigned _logp){ + opus_uint32 r; + opus_uint32 d; + opus_uint32 s; + int ret; + r=_this->rng; + d=_this->val; + s=r>>_logp; + ret=d<s; + if(!ret)_this->val=d-s; + _this->rng=ret?s:r-s; + ec_dec_normalize(_this); + return ret; +} + +int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb){ + opus_uint32 r; + opus_uint32 d; + opus_uint32 s; + opus_uint32 t; + int ret; + s=_this->rng; + d=_this->val; + r=s>>_ftb; + ret=-1; + do{ + t=s; + s=IMUL32(r,_icdf[++ret]); + } + while(d<s); + _this->val=d-s; + _this->rng=t-s; + ec_dec_normalize(_this); + return ret; +} + +opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft){ + unsigned ft; + unsigned s; + int ftb; + /*In order to optimize EC_ILOG(), it is undefined for the value 0.*/ + celt_assert(_ft>1); + _ft--; + ftb=EC_ILOG(_ft); + if(ftb>EC_UINT_BITS){ + opus_uint32 t; + ftb-=EC_UINT_BITS; + ft=(unsigned)(_ft>>ftb)+1; + s=ec_decode(_this,ft); + ec_dec_update(_this,s,s+1,ft); + t=(opus_uint32)s<<ftb|ec_dec_bits(_this,ftb); + if(t<=_ft)return t; + _this->error=1; + return _ft; + } + else{ + _ft++; + s=ec_decode(_this,(unsigned)_ft); + ec_dec_update(_this,s,s+1,(unsigned)_ft); + return s; + } +} + +opus_uint32 ec_dec_bits(ec_dec *_this,unsigned _bits){ + ec_window window; + int available; + opus_uint32 ret; + window=_this->end_window; + available=_this->nend_bits; + if((unsigned)available<_bits){ + do{ + window|=(ec_window)ec_read_byte_from_end(_this)<<available; + available+=EC_SYM_BITS; + } + while(available<=EC_WINDOW_SIZE-EC_SYM_BITS); + } + ret=(opus_uint32)window&(((opus_uint32)1<<_bits)-1U); + window>>=_bits; + available-=_bits; + _this->end_window=window; + _this->nend_bits=available; + _this->nbits_total+=_bits; + return ret; +} diff --git a/thirdparty/opus/celt/entdec.h b/thirdparty/opus/celt/entdec.h new file mode 100644 index 000000000000..d8ab31873088 --- /dev/null +++ b/thirdparty/opus/celt/entdec.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2001-2011 Timothy B. Terriberry + Copyright (c) 2008-2009 Xiph.Org Foundation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(_entdec_H) +# define _entdec_H (1) +# include +# include "entcode.h" + +/*Initializes the decoder. + _buf: The input buffer to use. + Return: 0 on success, or a negative value on error.*/ +void ec_dec_init(ec_dec *_this,unsigned char *_buf,opus_uint32 _storage); + +/*Calculates the cumulative frequency for the next symbol. + This can then be fed into the probability model to determine what that + symbol is, and the additional frequency information required to advance to + the next symbol. + This function cannot be called more than once without a corresponding call to + ec_dec_update(), or decoding will not proceed correctly. + _ft: The total frequency of the symbols in the alphabet the next symbol was + encoded with. + Return: A cumulative frequency representing the encoded symbol. + If the cumulative frequency of all the symbols before the one that + was encoded was fl, and the cumulative frequency of all the symbols + up to and including the one encoded is fh, then the returned value + will fall in the range [fl,fh).*/ +unsigned ec_decode(ec_dec *_this,unsigned _ft); + +/*Equivalent to ec_decode() with _ft==1<<_bits.*/ +unsigned ec_decode_bin(ec_dec *_this,unsigned _bits); + +/*Advance the decoder past the next symbol using the frequency information the + symbol was encoded with. + Exactly one call to ec_decode() must have been made so that all necessary + intermediate calculations are performed. + _fl: The cumulative frequency of all symbols that come before the symbol + decoded. + _fh: The cumulative frequency of all symbols up to and including the symbol + decoded. + Together with _fl, this defines the range [_fl,_fh) in which the value + returned above must fall. + _ft: The total frequency of the symbols in the alphabet the symbol decoded + was encoded in. + This must be the same as passed to the preceding call to ec_decode().*/ +void ec_dec_update(ec_dec *_this,unsigned _fl,unsigned _fh,unsigned _ft); + +/* Decode a bit that has a 1/(1<<_logp) probability of being a one */ +int ec_dec_bit_logp(ec_dec *_this,unsigned _logp); + +/*Decodes a symbol given an "inverse" CDF table. + No call to ec_dec_update() is necessary after this call. + _icdf: The "inverse" CDF, such that symbol s falls in the range + [s>0?ft-_icdf[s-1]:0,ft-_icdf[s]), where ft=1<<_ftb. + The values must be monotonically non-increasing, and the last value + must be 0. + _ftb: The number of bits of precision in the cumulative distribution. + Return: The decoded symbol s.*/ +int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb); + +/*Extracts a raw unsigned integer with a non-power-of-2 range from the stream. + The bits must have been encoded with ec_enc_uint(). 
+ No call to ec_dec_update() is necessary after this call. + _ft: The number of integers that can be decoded (one more than the max). + This must be at least one, and no more than 2**32-1. + Return: The decoded bits.*/ +opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft); + +/*Extracts a sequence of raw bits from the stream. + The bits must have been encoded with ec_enc_bits(). + No call to ec_dec_update() is necessary after this call. + _ftb: The number of bits to extract. + This must be between 0 and 25, inclusive. + Return: The decoded bits.*/ +opus_uint32 ec_dec_bits(ec_dec *_this,unsigned _ftb); + +#endif diff --git a/thirdparty/opus/celt/entenc.c b/thirdparty/opus/celt/entenc.c new file mode 100644 index 000000000000..f1750d25b842 --- /dev/null +++ b/thirdparty/opus/celt/entenc.c @@ -0,0 +1,294 @@ +/* Copyright (c) 2001-2011 Timothy B. Terriberry + Copyright (c) 2008-2009 Xiph.Org Foundation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if defined(HAVE_CONFIG_H) +# include "config.h" +#endif +#include "os_support.h" +#include "arch.h" +#include "entenc.h" +#include "mfrngcod.h" + +/*A range encoder. + See entdec.c and the references for implementation details \cite{Mar79,MNW98}. + + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video \& Data Recording Conference", + year=1979, + address="Southampton", + month=Jul + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://www.stanford.edu/class/ee398/handouts/papers/Moffat98ArithmCoding.pdf" + }*/ + +static int ec_write_byte(ec_enc *_this,unsigned _value){ + if(_this->offs+_this->end_offs>=_this->storage)return -1; + _this->buf[_this->offs++]=(unsigned char)_value; + return 0; +} + +static int ec_write_byte_at_end(ec_enc *_this,unsigned _value){ + if(_this->offs+_this->end_offs>=_this->storage)return -1; + _this->buf[_this->storage-++(_this->end_offs)]=(unsigned char)_value; + return 0; +} + +/*Outputs a symbol, with a carry bit. 
+ If there is a potential to propagate a carry over several symbols, they are + buffered until it can be determined whether or not an actual carry will + occur. + If the counter for the buffered symbols overflows, then the stream becomes + undecodable. + This gives a theoretical limit of a few billion symbols in a single packet on + 32-bit systems. + The alternative is to truncate the range in order to force a carry, but + requires similar carry tracking in the decoder, needlessly slowing it down.*/ +static void ec_enc_carry_out(ec_enc *_this,int _c){ + if(_c!=EC_SYM_MAX){ + /*No further carry propagation possible, flush buffer.*/ + int carry; + carry=_c>>EC_SYM_BITS; + /*Don't output a byte on the first write. + This compare should be taken care of by branch-prediction thereafter.*/ + if(_this->rem>=0)_this->error|=ec_write_byte(_this,_this->rem+carry); + if(_this->ext>0){ + unsigned sym; + sym=(EC_SYM_MAX+carry)&EC_SYM_MAX; + do _this->error|=ec_write_byte(_this,sym); + while(--(_this->ext)>0); + } + _this->rem=_c&EC_SYM_MAX; + } + else _this->ext++; +} + +static OPUS_INLINE void ec_enc_normalize(ec_enc *_this){ + /*If the range is too small, output some bits and rescale it.*/ + while(_this->rng<=EC_CODE_BOT){ + ec_enc_carry_out(_this,(int)(_this->val>>EC_CODE_SHIFT)); + /*Move the next-to-high-order symbol into the high-order position.*/ + _this->val=(_this->val<<EC_SYM_BITS)&(EC_CODE_TOP-1); + _this->rng<<=EC_SYM_BITS; + _this->nbits_total+=EC_SYM_BITS; + } +} + +void ec_enc_init(ec_enc *_this,unsigned char *_buf,opus_uint32 _size){ + _this->buf=_buf; + _this->end_offs=0; + _this->end_window=0; + _this->nend_bits=0; + /*This is the offset from which ec_tell() will subtract partial bits.*/ + _this->nbits_total=EC_CODE_BITS+1; + _this->offs=0; + _this->rng=EC_CODE_TOP; + _this->rem=-1; + _this->val=0; + _this->ext=0; + _this->storage=_size; + _this->error=0; +} + +void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){ + opus_uint32 r; + r=celt_udiv(_this->rng,_ft); + if(_fl>0){ + _this->val+=_this->rng-IMUL32(r,(_ft-_fl)); + _this->rng=IMUL32(r,(_fh-_fl)); + } + else _this->rng-=IMUL32(r,(_ft-_fh)); + ec_enc_normalize(_this); +} + +void ec_encode_bin(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _bits){ + opus_uint32 r; + r=_this->rng>>_bits; + if(_fl>0){ + _this->val+=_this->rng-IMUL32(r,((1U<<_bits)-_fl)); + _this->rng=IMUL32(r,(_fh-_fl)); + } + else _this->rng-=IMUL32(r,((1U<<_bits)-_fh)); + ec_enc_normalize(_this); +} + +/*The probability of having a "one" is 1/(1<<_logp).*/ +void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp){ + opus_uint32 r; + opus_uint32 s; + opus_uint32 l; + r=_this->rng; + l=_this->val; + s=r>>_logp; + r-=s; + if(_val)_this->val=l+r; + _this->rng=_val?s:r; + ec_enc_normalize(_this); +} + +void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb){ + opus_uint32 r; + r=_this->rng>>_ftb; + if(_s>0){ + _this->val+=_this->rng-IMUL32(r,_icdf[_s-1]); + _this->rng=IMUL32(r,_icdf[_s-1]-_icdf[_s]); + } + else _this->rng-=IMUL32(r,_icdf[_s]); + ec_enc_normalize(_this); +} + +void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft){ + unsigned ft; + unsigned fl; + int ftb; + /*In order to optimize EC_ILOG(), it is undefined for the value 0.*/ + celt_assert(_ft>1); + _ft--; + ftb=EC_ILOG(_ft); + if(ftb>EC_UINT_BITS){ + ftb-=EC_UINT_BITS; + ft=(_ft>>ftb)+1; + fl=(unsigned)(_fl>>ftb); + ec_encode(_this,fl,fl+1,ft); + ec_enc_bits(_this,_fl&(((opus_uint32)1<<ftb)-1U),ftb); + } + else ec_encode(_this,_fl,_fl+1,(unsigned)_ft+1); +} + +void ec_enc_bits(ec_enc *_this,opus_uint32 _fl,unsigned _bits){ + ec_window window; + int used; + window=_this->end_window; + used=_this->nend_bits; + celt_assert(_bits>0); + if(used+_bits>EC_WINDOW_SIZE){ + do{ +
_this->error|=ec_write_byte_at_end(_this,(unsigned)window&EC_SYM_MAX); + window>>=EC_SYM_BITS; + used-=EC_SYM_BITS; + } + while(used>=EC_SYM_BITS); + } + window|=(ec_window)_fl<end_window=window; + _this->nend_bits=used; + _this->nbits_total+=_bits; +} + +void ec_enc_patch_initial_bits(ec_enc *_this,unsigned _val,unsigned _nbits){ + int shift; + unsigned mask; + celt_assert(_nbits<=EC_SYM_BITS); + shift=EC_SYM_BITS-_nbits; + mask=((1<<_nbits)-1)<offs>0){ + /*The first byte has been finalized.*/ + _this->buf[0]=(unsigned char)((_this->buf[0]&~mask)|_val<rem>=0){ + /*The first byte is still awaiting carry propagation.*/ + _this->rem=(_this->rem&~mask)|_val<rng<=(EC_CODE_TOP>>_nbits)){ + /*The renormalization loop has never been run.*/ + _this->val=(_this->val&~((opus_uint32)mask<error=-1; +} + +void ec_enc_shrink(ec_enc *_this,opus_uint32 _size){ + celt_assert(_this->offs+_this->end_offs<=_size); + OPUS_MOVE(_this->buf+_size-_this->end_offs, + _this->buf+_this->storage-_this->end_offs,_this->end_offs); + _this->storage=_size; +} + +void ec_enc_done(ec_enc *_this){ + ec_window window; + int used; + opus_uint32 msk; + opus_uint32 end; + int l; + /*We output the minimum number of bits that ensures that the symbols encoded + thus far will be decoded correctly regardless of the bits that follow.*/ + l=EC_CODE_BITS-EC_ILOG(_this->rng); + msk=(EC_CODE_TOP-1)>>l; + end=(_this->val+msk)&~msk; + if((end|msk)>=_this->val+_this->rng){ + l++; + msk>>=1; + end=(_this->val+msk)&~msk; + } + while(l>0){ + ec_enc_carry_out(_this,(int)(end>>EC_CODE_SHIFT)); + end=(end<rem>=0||_this->ext>0)ec_enc_carry_out(_this,0); + /*If we have buffered extra bits, flush them as well.*/ + window=_this->end_window; + used=_this->nend_bits; + while(used>=EC_SYM_BITS){ + _this->error|=ec_write_byte_at_end(_this,(unsigned)window&EC_SYM_MAX); + window>>=EC_SYM_BITS; + used-=EC_SYM_BITS; + } + /*Clear any excess space and add any remaining extra bits to the last byte.*/ + if(!_this->error){ + OPUS_CLEAR(_this->buf+_this->offs, + _this->storage-_this->offs-_this->end_offs); + if(used>0){ + /*If there's no range coder data at all, give up.*/ + if(_this->end_offs>=_this->storage)_this->error=-1; + else{ + l=-l; + /*If we've busted, don't add too many extra bits to the last byte; it + would corrupt the range coder data, and that's more important.*/ + if(_this->offs+_this->end_offs>=_this->storage&&lerror=-1; + } + _this->buf[_this->storage-_this->end_offs-1]|=(unsigned char)window; + } + } + } +} diff --git a/thirdparty/opus/celt/entenc.h b/thirdparty/opus/celt/entenc.h new file mode 100644 index 000000000000..796bc4d5727f --- /dev/null +++ b/thirdparty/opus/celt/entenc.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2001-2011 Timothy B. Terriberry + Copyright (c) 2008-2009 Xiph.Org Foundation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
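Editor's note for reviewers: entenc.c pairs with the decoder in entdec.c/entdec.h above. The round trip below is a minimal sketch, not part of the patch; the three-symbol alphabet with cumulative frequencies {0,2,3,4} and the 16-byte buffer are illustrative only.

    #include "entenc.h"
    #include "entdec.h"

    static int range_coder_roundtrip(void){
      unsigned char buf[16];
      ec_enc enc;
      ec_dec dec;
      unsigned fm;
      int sym;
      ec_enc_init(&enc,buf,sizeof(buf));
      /* Encode symbol 1 of {0,1,2}: _fl=2, _fh=3, _ft=4, i.e. P(1)=1/4. */
      ec_encode(&enc,2,3,4);
      ec_enc_done(&enc);
      if(enc.error)return -1;
      ec_dec_init(&dec,buf,sizeof(buf));
      /* ec_decode() returns a frequency in [0,_ft); map it back to a symbol,
         then tell the decoder which range [_fl,_fh) was chosen. */
      fm=ec_decode(&dec,4);
      sym=fm<2?0:fm<3?1:2;
      ec_dec_update(&dec,sym==0?0:sym==1?2:3,sym==0?2:sym==1?3:4,4);
      return sym==1?0:-1;
    }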
diff --git a/thirdparty/opus/celt/entenc.h b/thirdparty/opus/celt/entenc.h
new file mode 100644
index 000000000000..796bc4d5727f
--- /dev/null
+++ b/thirdparty/opus/celt/entenc.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2001-2011 Timothy B. Terriberry
+   Copyright (c) 2008-2009 Xiph.Org Foundation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(_entenc_H)
+# define _entenc_H (1)
+# include <stddef.h>
+# include "entcode.h"
+
+/*Initializes the encoder.
+  _buf:  The buffer to store output bytes in.
+  _size: The size of the buffer, in chars.*/
+void ec_enc_init(ec_enc *_this,unsigned char *_buf,opus_uint32 _size);
+/*Encodes a symbol given its frequency information.
+  The frequency information must be discernable by the decoder, assuming it
+   has read only the previous symbols from the stream.
+  It is allowable to change the frequency information, or even the entire
+   source alphabet, so long as the decoder can tell from the context of the
+   previously encoded information that it is supposed to do so as well.
+  _fl: The cumulative frequency of all symbols that come before the one to be
+        encoded.
+  _fh: The cumulative frequency of all symbols up to and including the one to
+        be encoded.
+       Together with _fl, this defines the range [_fl,_fh) in which the
+        decoded value will fall.
+  _ft: The sum of the frequencies of all the symbols*/
+void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft);
+
+/*Equivalent to ec_encode() with _ft==1<<_bits.*/
+void ec_encode_bin(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _bits);
+
+/* Encode a bit that has a 1/(1<<_logp) probability of being a one */
+void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp);
+
+/*Encodes a symbol given an "inverse" CDF table.
+  _s:    The index of the symbol to encode.
+  _icdf: The "inverse" CDF, such that symbol _s falls in the range
+          [_s>0?ft-_icdf[_s-1]:0,ft-_icdf[_s]), where ft=1<<_ftb.
+         The values must be monotonically non-increasing, and the last value
+          must be 0.
+  _ftb:  The number of bits of precision in the cumulative distribution.*/
+void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb);
+
+/*Encodes a raw unsigned integer in the stream.
+  _fl: The integer to encode.
+  _ft: The number of integers that can be encoded (one more than the max).
+       This must be at least one, and no more than 2**32-1.*/
+void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft);
+
+/*Encodes a sequence of raw bits in the stream.
+  _fl:  The bits to encode.
+  _ftb: The number of bits to encode.
+        This must be between 1 and 25, inclusive.*/
+void ec_enc_bits(ec_enc *_this,opus_uint32 _fl,unsigned _ftb);
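A quick illustration of the difference between the two integer entry points above: ec_enc_uint() charges a fractional bit cost for a uniformly distributed integer, while ec_enc_bits() appends whole raw bits at the tail of the buffer. The helper below is a hypothetical editor's sketch, not an API in this patch.

    #include "entenc.h"

    /* Hypothetical header writer: 'mode' in [0,10), 'flags' is 3 raw bits. */
    static void write_header_fields(ec_enc *enc,unsigned mode,unsigned flags){
      /* Uniform integer: costs about log2(10) ~= 3.32 bits of range. */
      ec_enc_uint(enc,mode,10);
      /* Raw bits: stored back-to-front at the end of the buffer, so a reader
         can fetch them cheaply without running the range decoder. */
      ec_enc_bits(enc,flags&0x7,3);
    }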
+
+/*Overwrites a few bits at the very start of an existing stream, after they
+   have already been encoded.
+  This makes it possible to have a few flags up front, where it is easy for
+   decoders to access them without parsing the whole stream, even if their
+   values are not determined until late in the encoding process, without having
+   to buffer all the intermediate symbols in the encoder.
+  In order for this to work, at least _nbits bits must have already been
+   encoded using probabilities that are an exact power of two.
+  The encoder can verify the number of encoded bits is sufficient, but cannot
+   check this latter condition.
+  _val:   The bits to encode (in the least _nbits significant bits).
+          They will be decoded in order from most-significant to least.
+  _nbits: The number of bits to overwrite.
+          This must be no more than 8.*/
+void ec_enc_patch_initial_bits(ec_enc *_this,unsigned _val,unsigned _nbits);
+
+/*Compacts the data to fit in the target size.
+  This moves up the raw bits at the end of the current buffer so they are at
+   the end of the new buffer size.
+  The caller must ensure that the amount of data that's already been written
+   will fit in the new size.
+  _size: The number of bytes in the new buffer.
+         This must be large enough to contain the bits already written, and
+          must be no larger than the existing size.*/
+void ec_enc_shrink(ec_enc *_this,opus_uint32 _size);
+
+/*Indicates that there are no more symbols to encode.
+  All remaining output bytes are flushed to the output buffer.
+  ec_enc_init() must be called before the encoder can be used again.*/
+void ec_enc_done(ec_enc *_this);
+
+#endif
diff --git a/thirdparty/opus/celt/fixed_debug.h b/thirdparty/opus/celt/fixed_debug.h
new file mode 100644
index 000000000000..d28227f5dc72
--- /dev/null
+++ b/thirdparty/opus/celt/fixed_debug.h
@@ -0,0 +1,784 @@
+/* Copyright (C) 2003-2008 Jean-Marc Valin
+   Copyright (C) 2007-2012 Xiph.Org Foundation */
+/**
+   @file fixed_debug.h
+   @brief Fixed-point operations with debugging
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_DEBUG_H
+#define FIXED_DEBUG_H
+
+#include <stdio.h>
+#include "opus_defines.h"
+
+#ifdef CELT_C
+OPUS_EXPORT opus_int64 celt_mips=0;
+#else
+extern opus_int64 celt_mips;
+#endif
+
+#define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b))
+#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL32(MULT16_16(SHR32((a),16),SHR((b),16)),1), SHR32(MULT16_16SU(SHR32((a),16),((b)&0x0000ffff)),15)), SHR32(MULT16_16SU(SHR32((b),16),((a)&0x0000ffff)),15))
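The macros that follow mirror the arithmetic of fixed_generic.h while range-checking every operand and charging a pseudo-MIPS cost to celt_mips. As a reminder of what a Q15 multiply computes, this standalone editor's snippet (no Opus dependencies) evaluates 0.5 * 0.25 = 0.125:

    #include <stdio.h>

    /* Q15: the integer v represents the real value v/32768. */
    static int mult16_16_q15(int a,int b){
      return (int)(((long long)a*(long long)b)>>15);
    }

    int main(void){
      /* 16384 (0.5) * 8192 (0.25) >> 15 == 4096, i.e. 0.125 in Q15. */
      printf("%d\n",mult16_16_q15(16384,8192));
      return 0;
    }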
+
+/** 16x32 multiplication, followed by a 16-bit shift right.
+    Results fits in 32 bits */
+#define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR32((b),16)), SHR32(MULT16_16SU((a),((b)&0x0000ffff)),16))
+
+#define MULT16_32_P16(a,b) MULT16_32_PX(a,b,16)
+
+#define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
+#define QCONST32(x,bits) ((opus_val32)(.5+(x)*(((opus_val32)1)<<(bits))))
+
+#define VERIFY_SHORT(x) ((x)<=32767&&(x)>=-32768)
+#define VERIFY_INT(x) ((x)<=2147483647LL&&(x)>=-2147483648LL)
+#define VERIFY_UINT(x) ((x)<=(2147483647LLU<<1))
+
+#define SHR(a,b) SHR32(a,b)
+#define PSHR(a,b) PSHR32(a,b)
+
+static OPUS_INLINE short NEG16(int x)
+{
+   int res;
+   if (!VERIFY_SHORT(x))
+   {
+      fprintf (stderr, "NEG16: input is not short: %d\n", (int)x);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = -x;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "NEG16: output is not short: %d\n", (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+static OPUS_INLINE int NEG32(opus_int64 x)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(x))
+   {
+      fprintf (stderr, "NEG16: input is not int: %d\n", (int)x);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = -x;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "NEG16: output is not int: %d\n", (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+
+#define EXTRACT16(x) EXTRACT16_(x, __FILE__, __LINE__)
+static OPUS_INLINE short EXTRACT16_(int x, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(x))
+   {
+      fprintf (stderr, "EXTRACT16: input is not short: %d in %s: line %d\n", x, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = x;
+   celt_mips++;
+   return res;
+}
+
+#define EXTEND32(x) EXTEND32_(x, __FILE__, __LINE__)
+static OPUS_INLINE int EXTEND32_(int x, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(x))
+   {
+      fprintf (stderr, "EXTEND32: input is not short: %d in %s: line %d\n", x, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = x;
+   celt_mips++;
+   return res;
+}
+
+#define SHR16(a, shift) SHR16_(a, shift, __FILE__, __LINE__)
+static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
+   {
+      fprintf (stderr, "SHR16: inputs are not short: %d >> %d in %s: line %d\n", a, shift, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a>>shift;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "SHR16: output is not short: %d in %s: line %d\n", res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+#define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__)
+static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
+{
+   int res;
+   if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
+   {
+      fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a<<shift;
+   if (!VERIFY_SHORT(res))
+   {
+      fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips++;
+   return res;
+}
+
+#define SHR32(a, shift) SHR32_(a, shift, __FILE__, __LINE__)
+static OPUS_INLINE int SHR32_(opus_int64 a, int shift, char *file, int line)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
+   {
+      fprintf (stderr, "SHR32: inputs are not int: %d %d in %s: line %d\n", (int)a, shift, file, line);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a>>shift;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "SHR32: output is not int: %d\n", (int)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=2;
+   return res;
+}
+#define SHL32(a, shift) SHL32_(a, shift, __FILE__, __LINE__)
+static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
+   {
+      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line);
+#ifdef FIXED_DEBUG_ASSERT
celt_assert(0); +#endif + } + res = a<>1))),shift)) +#define VSHR32(a, shift) (((shift)>0) ? SHR32(a, shift) : SHL32(a, -(shift))) + +#define ROUND16(x,a) (celt_mips--,EXTRACT16(PSHR32((x),(a)))) +#define HALF16(x) (SHR16(x,1)) +#define HALF32(x) (SHR32(x,1)) + +//#define SHR(a,shift) ((a) >> (shift)) +//#define SHL(a,shift) ((a) << (shift)) + +#define ADD16(a, b) ADD16_(a, b, __FILE__, __LINE__) +static OPUS_INLINE short ADD16_(int a, int b, char *file, int line) +{ + int res; + if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "ADD16: inputs are not short: %d %d in %s: line %d\n", a, b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = a+b; + if (!VERIFY_SHORT(res)) + { + fprintf (stderr, "ADD16: output is not short: %d+%d=%d in %s: line %d\n", a,b,res, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips++; + return res; +} + +#define SUB16(a, b) SUB16_(a, b, __FILE__, __LINE__) +static OPUS_INLINE short SUB16_(int a, int b, char *file, int line) +{ + int res; + if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "SUB16: inputs are not short: %d %d in %s: line %d\n", a, b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = a-b; + if (!VERIFY_SHORT(res)) + { + fprintf (stderr, "SUB16: output is not short: %d in %s: line %d\n", res, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips++; + return res; +} + +#define ADD32(a, b) ADD32_(a, b, __FILE__, __LINE__) +static OPUS_INLINE int ADD32_(opus_int64 a, opus_int64 b, char *file, int line) +{ + opus_int64 res; + if (!VERIFY_INT(a) || !VERIFY_INT(b)) + { + fprintf (stderr, "ADD32: inputs are not int: %d %d in %s: line %d\n", (int)a, (int)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = a+b; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "ADD32: output is not int: %d in %s: line %d\n", (int)res, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=2; + return res; +} + +#define SUB32(a, b) SUB32_(a, b, __FILE__, __LINE__) +static OPUS_INLINE int SUB32_(opus_int64 a, opus_int64 b, char *file, int line) +{ + opus_int64 res; + if (!VERIFY_INT(a) || !VERIFY_INT(b)) + { + fprintf (stderr, "SUB32: inputs are not int: %d %d in %s: line %d\n", (int)a, (int)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = a-b; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "SUB32: output is not int: %d in %s: line %d\n", (int)res, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=2; + return res; +} + +#undef UADD32 +#define UADD32(a, b) UADD32_(a, b, __FILE__, __LINE__) +static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file, int line) +{ + opus_uint64 res; + if (!VERIFY_UINT(a) || !VERIFY_UINT(b)) + { + fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = a+b; + if (!VERIFY_UINT(res)) + { + fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=2; + return res; +} + +#undef USUB32 +#define USUB32(a, b) USUB32_(a, b, __FILE__, __LINE__) +static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file, int line) +{ + opus_uint64 res; + if (!VERIFY_UINT(a) || !VERIFY_UINT(b)) + { + fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line 
%d\n", a, b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + if (a=((opus_val32)(1)<<(15+Q))) + { + fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = (((opus_int64)a)*(opus_int64)b) >> Q; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT16_32_Q%d: output is not int: %d*%d=%d in %s: line %d\n", Q, (int)a, (int)b,(int)res, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + if (Q==15) + celt_mips+=3; + else + celt_mips+=4; + return res; +} + +#define MULT16_32_PX(a, b, Q) MULT16_32_PX_(a, b, Q, __FILE__, __LINE__) +static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int line) +{ + opus_int64 res; + if (!VERIFY_SHORT(a) || !VERIFY_INT(b)) + { + fprintf (stderr, "MULT16_32_P%d: inputs are not short+int: %d %d in %s: line %d\n\n", Q, (int)a, (int)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + if (ABS32(b)>=((opus_int64)(1)<<(15+Q))) + { + fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n\n", Q, (int)a, (int)b,file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((((opus_int64)a)*(opus_int64)b) + (((opus_val32)(1)<>1))>> Q; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT16_32_P%d: output is not int: %d*%d=%d in %s: line %d\n\n", Q, (int)a, (int)b,(int)res, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + if (Q==15) + celt_mips+=4; + else + celt_mips+=5; + return res; +} + +#define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15) +#define MAC16_32_Q15(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q15((a),(b)))) +#define MAC16_32_Q16(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q16((a),(b)))) + +static OPUS_INLINE int SATURATE(int a, int b) +{ + if (a>b) + a=b; + if (a<-b) + a = -b; + celt_mips+=3; + return a; +} + +static OPUS_INLINE opus_int16 SATURATE16(opus_int32 a) +{ + celt_mips+=3; + if (a>32767) + return 32767; + else if (a<-32768) + return -32768; + else return a; +} + +static OPUS_INLINE int MULT16_16_Q11_32(int a, int b) +{ + opus_int64 res; + if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "MULT16_16_Q11: inputs are not short: %d %d\n", a, b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((opus_int64)a)*b; + res >>= 11; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT16_16_Q11: output is not short: %d*%d=%d\n", (int)a, (int)b, (int)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=3; + return res; +} +static OPUS_INLINE short MULT16_16_Q13(int a, int b) +{ + opus_int64 res; + if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "MULT16_16_Q13: inputs are not short: %d %d\n", a, b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((opus_int64)a)*b; + res >>= 13; + if (!VERIFY_SHORT(res)) + { + fprintf (stderr, "MULT16_16_Q13: output is not short: %d*%d=%d\n", a, b, (int)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=3; + return res; +} +static OPUS_INLINE short MULT16_16_Q14(int a, int b) +{ + opus_int64 res; + if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "MULT16_16_Q14: inputs are not short: %d %d\n", a, b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((opus_int64)a)*b; + res >>= 14; + if (!VERIFY_SHORT(res)) + { + fprintf (stderr, "MULT16_16_Q14: output is not short: %d\n", (int)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); 
+#endif + } + celt_mips+=3; + return res; +} + +#define MULT16_16_Q15(a, b) MULT16_16_Q15_(a, b, __FILE__, __LINE__) +static OPUS_INLINE short MULT16_16_Q15_(int a, int b, char *file, int line) +{ + opus_int64 res; + if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "MULT16_16_Q15: inputs are not short: %d %d in %s: line %d\n", a, b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((opus_int64)a)*b; + res >>= 15; + if (!VERIFY_SHORT(res)) + { + fprintf (stderr, "MULT16_16_Q15: output is not short: %d in %s: line %d\n", (int)res, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=1; + return res; +} + +static OPUS_INLINE short MULT16_16_P13(int a, int b) +{ + opus_int64 res; + if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "MULT16_16_P13: inputs are not short: %d %d\n", a, b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((opus_int64)a)*b; + res += 4096; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT16_16_P13: overflow: %d*%d=%d\n", a, b, (int)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res >>= 13; + if (!VERIFY_SHORT(res)) + { + fprintf (stderr, "MULT16_16_P13: output is not short: %d*%d=%d\n", a, b, (int)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=4; + return res; +} +static OPUS_INLINE short MULT16_16_P14(int a, int b) +{ + opus_int64 res; + if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "MULT16_16_P14: inputs are not short: %d %d\n", a, b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((opus_int64)a)*b; + res += 8192; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT16_16_P14: overflow: %d*%d=%d\n", a, b, (int)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res >>= 14; + if (!VERIFY_SHORT(res)) + { + fprintf (stderr, "MULT16_16_P14: output is not short: %d*%d=%d\n", a, b, (int)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=4; + return res; +} +static OPUS_INLINE short MULT16_16_P15(int a, int b) +{ + opus_int64 res; + if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "MULT16_16_P15: inputs are not short: %d %d\n", a, b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((opus_int64)a)*b; + res += 16384; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT16_16_P15: overflow: %d*%d=%d\n", a, b, (int)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res >>= 15; + if (!VERIFY_SHORT(res)) + { + fprintf (stderr, "MULT16_16_P15: output is not short: %d*%d=%d\n", a, b, (int)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=2; + return res; +} + +#define DIV32_16(a, b) DIV32_16_(a, b, __FILE__, __LINE__) + +static OPUS_INLINE int DIV32_16_(opus_int64 a, opus_int64 b, char *file, int line) +{ + opus_int64 res; + if (b==0) + { + fprintf(stderr, "DIV32_16: divide by zero: %d/%d in %s: line %d\n", (int)a, (int)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + return 0; + } + if (!VERIFY_INT(a) || !VERIFY_SHORT(b)) + { + fprintf (stderr, "DIV32_16: inputs are not int/short: %d %d in %s: line %d\n", (int)a, (int)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = a/b; + if (!VERIFY_SHORT(res)) + { + fprintf (stderr, "DIV32_16: output is not short: %d / %d = %d in %s: line %d\n", (int)a,(int)b,(int)res, file, line); + if (res>32767) + res = 32767; + if (res<-32768) + res = -32768; +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); 
+#endif + } + celt_mips+=35; + return res; +} + +#define DIV32(a, b) DIV32_(a, b, __FILE__, __LINE__) +static OPUS_INLINE int DIV32_(opus_int64 a, opus_int64 b, char *file, int line) +{ + opus_int64 res; + if (b==0) + { + fprintf(stderr, "DIV32: divide by zero: %d/%d in %s: line %d\n", (int)a, (int)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + return 0; + } + + if (!VERIFY_INT(a) || !VERIFY_INT(b)) + { + fprintf (stderr, "DIV32: inputs are not int/short: %d %d in %s: line %d\n", (int)a, (int)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = a/b; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "DIV32: output is not int: %d in %s: line %d\n", (int)res, file, line); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=70; + return res; +} + +static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x) +{ + x = PSHR32(x, SIG_SHIFT); + x = MAX32(x, -32768); + x = MIN32(x, 32767); + return EXTRACT16(x); +} +#define SIG2WORD16(x) (SIG2WORD16_generic(x)) + + +#undef PRINT_MIPS +#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0); + +#endif diff --git a/thirdparty/opus/celt/fixed_generic.h b/thirdparty/opus/celt/fixed_generic.h new file mode 100644 index 000000000000..1cfd6d698906 --- /dev/null +++ b/thirdparty/opus/celt/fixed_generic.h @@ -0,0 +1,167 @@ +/* Copyright (C) 2007-2009 Xiph.Org Foundation + Copyright (C) 2003-2008 Jean-Marc Valin + Copyright (C) 2007-2008 CSIRO */ +/** + @file fixed_generic.h + @brief Generic fixed-point operations +*/ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef FIXED_GENERIC_H +#define FIXED_GENERIC_H + +/** Multiply a 16-bit signed value by a 16-bit unsigned value. The result is a 32-bit signed value */ +#define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b)) + +/** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */ +#if OPUS_FAST_INT64 +#define MULT16_32_Q16(a,b) ((opus_val32)SHR((opus_int64)((opus_val16)(a))*(b),16)) +#else +#define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)) +#endif + +/** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). 
+    Results fits in 32 bits */
+#if OPUS_FAST_INT64
+#define MULT16_32_P16(a,b) ((opus_val32)PSHR((opus_int64)((opus_val16)(a))*(b),16))
+#else
+#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16SU((a),((b)&0x0000ffff)),16))
+#endif
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
+#if OPUS_FAST_INT64
+#define MULT16_32_Q15(a,b) ((opus_val32)SHR((opus_int64)((opus_val16)(a))*(b),15))
+#else
+#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
+#endif
+
+/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
+#if OPUS_FAST_INT64
+#define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31))
+#else
+#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))
+#endif
+
+/** Compile-time conversion of float constant to 16-bit value */
+#define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
+
+/** Compile-time conversion of float constant to 32-bit value */
+#define QCONST32(x,bits) ((opus_val32)(.5+(x)*(((opus_val32)1)<<(bits))))
+
+/** Negate a 16-bit value */
+#define NEG16(x) (-(x))
+/** Negate a 32-bit value */
+#define NEG32(x) (-(x))
+
+/** Change a 32-bit value into a 16-bit value. The value is assumed to fit in 16-bit, otherwise the result is undefined */
+#define EXTRACT16(x) ((opus_val16)(x))
+/** Change a 16-bit value into a 32-bit value */
+#define EXTEND32(x) ((opus_val32)(x))
+
+/** Arithmetic shift-right of a 16-bit value */
+#define SHR16(a,shift) ((a) >> (shift))
+/** Arithmetic shift-left of a 16-bit value */
+#define SHL16(a,shift) ((opus_int16)((opus_uint16)(a)<<(shift)))
+/** Arithmetic shift-right of a 32-bit value */
+#define SHR32(a,shift) ((a) >> (shift))
+/** Arithmetic shift-left of a 32-bit value */
+#define SHL32(a,shift) ((opus_int32)((opus_uint32)(a)<<(shift)))
+
+/** 32-bit arithmetic shift right with rounding-to-nearest instead of rounding down */
+#define PSHR32(a,shift) (SHR32((a)+((EXTEND32(1)<<((shift))>>1)),shift))
+/** 32-bit arithmetic shift right where the argument can be negative */
+#define VSHR32(a, shift) (((shift)>0) ? SHR32(a, shift) : SHL32(a, -(shift)))
+
+/** "RAW" macros, should not be used outside of this header file */
+#define SHR(a,shift) ((a) >> (shift))
+#define SHL(a,shift) SHL32(a,shift)
+#define PSHR(a,shift) (SHR((a)+((EXTEND32(1)<<((shift))>>1)),shift))
+#define SATURATE(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
+
+#define SATURATE16(x) (EXTRACT16((x)>32767 ? 32767 : (x)<-32768 ? -32768 : (x)))
+
+/** Shift by a and round-to-nearest 32-bit value. Result is a 16-bit value */
+#define ROUND16(x,a) (EXTRACT16(PSHR32((x),(a))))
+/** Divide by two */
+#define HALF16(x) (SHR16(x,1))
+#define HALF32(x) (SHR32(x,1))
+
+/** Add two 16-bit values */
+#define ADD16(a,b) ((opus_val16)((opus_val16)(a)+(opus_val16)(b)))
+/** Subtract two 16-bit values */
+#define SUB16(a,b) ((opus_val16)(a)-(opus_val16)(b))
+/** Add two 32-bit values */
+#define ADD32(a,b) ((opus_val32)(a)+(opus_val32)(b))
+/** Subtract two 32-bit values */
+#define SUB32(a,b) ((opus_val32)(a)-(opus_val32)(b))
+
+/** 16x16 multiplication where the result fits in 16 bits */
+#define MULT16_16_16(a,b) ((((opus_val16)(a))*((opus_val16)(b))))
+
+/* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */
+/** 16x16 multiplication where the result fits in 32 bits */
+#define MULT16_16(a,b) (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b)))
+
+/** 16x16 multiply-add where the result fits in 32 bits */
+#define MAC16_16(c,a,b) (ADD32((c),MULT16_16((a),(b))))
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#define MAC16_32_Q15(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+
+/** 16x32 multiplication, followed by a 16-bit shift right and 32-bit add.
+    Results fits in 32 bits */
+#define MAC16_32_Q16(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)))
+
+#define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))
+#define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))
+#define MULT16_16_Q13(a,b) (SHR(MULT16_16((a),(b)),13))
+#define MULT16_16_Q14(a,b) (SHR(MULT16_16((a),(b)),14))
+#define MULT16_16_Q15(a,b) (SHR(MULT16_16((a),(b)),15))
+
+#define MULT16_16_P13(a,b) (SHR(ADD32(4096,MULT16_16((a),(b))),13))
+#define MULT16_16_P14(a,b) (SHR(ADD32(8192,MULT16_16((a),(b))),14))
+#define MULT16_16_P15(a,b) (SHR(ADD32(16384,MULT16_16((a),(b))),15))
+
+/** Divide a 32-bit value by a 16-bit value. Result fits in 16 bits */
+#define DIV32_16(a,b) ((opus_val16)(((opus_val32)(a))/((opus_val16)(b))))
+
+/** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */
+#define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b)))
+
+#if defined(MIPSr1_ASM)
+#include "mips/fixed_generic_mipsr1.h"
+#endif
+
+static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
+{
+   x = PSHR32(x, SIG_SHIFT);
+   x = MAX32(x, -32768);
+   x = MIN32(x, 32767);
+   return EXTRACT16(x);
+}
+#define SIG2WORD16(x) (SIG2WORD16_generic(x))
+
+#endif
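Editor's sketch of how these macros compose in practice, assuming a FIXED_POINT build where arch.h pulls in this header; the function name is hypothetical and n is assumed small enough that the Q30 accumulator cannot overflow:

    #include "arch.h"  /* provides opus_val16/opus_val32 and, in FIXED_POINT
                          builds, the fixed_generic.h macros used below */

    /* Q15 dot product: each MAC16_16 product is Q30 (Q15*Q15), so a single
       truncating shift at the end returns the sum to Q15. */
    static opus_val32 dot_q15(const opus_val16 *x, const opus_val16 *h, int n)
    {
       opus_val32 acc = 0;
       int i;
       for (i=0;i<n;i++)
          acc = MAC16_16(acc, x[i], h[i]);
       return SHR32(acc, 15);
    }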
diff --git a/thirdparty/opus/celt/float_cast.h b/thirdparty/opus/celt/float_cast.h
new file mode 100644
index 000000000000..ed5a39b5433b
--- /dev/null
+++ b/thirdparty/opus/celt/float_cast.h
@@ -0,0 +1,140 @@
+/* Copyright (C) 2001 Erik de Castro Lopo */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Version 1.1 */ + +#ifndef FLOAT_CAST_H +#define FLOAT_CAST_H + + +#include "arch.h" + +/*============================================================================ +** On Intel Pentium processors (especially PIII and probably P4), converting +** from float to int is very slow. To meet the C specs, the code produced by +** most C compilers targeting Pentium needs to change the FPU rounding mode +** before the float to int conversion is performed. +** +** Changing the FPU rounding mode causes the FPU pipeline to be flushed. It +** is this flushing of the pipeline which is so slow. +** +** Fortunately the ISO C99 specifications define the functions lrint, lrintf, +** llrint and llrintf which fix this problem as a side effect. +** +** On Unix-like systems, the configure process should have detected the +** presence of these functions. If they weren't found we have to replace them +** here with a standard C cast. +*/ + +/* +** The C99 prototypes for lrint and lrintf are as follows: +** +** long int lrintf (float x) ; +** long int lrint (double x) ; +*/ + +/* The presence of the required functions are detected during the configure +** process and the values HAVE_LRINT and HAVE_LRINTF are set accordingly in +** the config.h file. +*/ + +#if (HAVE_LRINTF) + +/* These defines enable functionality introduced with the 1999 ISO C +** standard. They must be defined before the inclusion of math.h to +** engage them. If optimisation is enabled, these functions will be +** inlined. With optimisation switched off, you have to link in the +** maths library using -lm. +*/ + +#define _ISOC9X_SOURCE 1 +#define _ISOC99_SOURCE 1 + +#define __USE_ISOC9X 1 +#define __USE_ISOC99 1 + +#include +#define float2int(x) lrintf(x) + +#elif (defined(HAVE_LRINT)) + +#define _ISOC9X_SOURCE 1 +#define _ISOC99_SOURCE 1 + +#define __USE_ISOC9X 1 +#define __USE_ISOC99 1 + +#include +#define float2int(x) lrint(x) + +#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_X64) + #include + + __inline long int float2int(float value) + { + return _mm_cvtss_si32(_mm_load_ss(&value)); + } +#elif (defined(_MSC_VER) && _MSC_VER >= 1400) && defined (_M_IX86) + #include + + /* Win32 doesn't seem to have these functions. + ** Therefore implement OPUS_INLINE versions of these functions here. + */ + + __inline long int + float2int (float flt) + { int intgr; + + _asm + { fld flt + fistp intgr + } ; + + return intgr ; + } + +#else + +#if (defined(__GNUC__) && defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L) + /* supported by gcc in C99 mode, but not by all other compilers */ + #warning "Don't have the functions lrint() and lrintf ()." + #warning "Replacing these functions with a standard C cast." 
+#endif /* __STDC_VERSION__ >= 199901L */ + #include + #define float2int(flt) ((int)(floor(.5+flt))) +#endif + +#ifndef DISABLE_FLOAT_API +static OPUS_INLINE opus_int16 FLOAT2INT16(float x) +{ + x = x*CELT_SIG_SCALE; + x = MAX32(x, -32768); + x = MIN32(x, 32767); + return (opus_int16)float2int(x); +} +#endif /* DISABLE_FLOAT_API */ + +#endif /* FLOAT_CAST_H */ diff --git a/thirdparty/opus/celt/kiss_fft.c b/thirdparty/opus/celt/kiss_fft.c new file mode 100644 index 000000000000..1f8fd05321a4 --- /dev/null +++ b/thirdparty/opus/celt/kiss_fft.c @@ -0,0 +1,604 @@ +/*Copyright (c) 2003-2004, Mark Borgerding + Lots of modifications by Jean-Marc Valin + Copyright (c) 2005-2007, Xiph.Org Foundation + Copyright (c) 2008, Xiph.Org Foundation, CSIRO + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE.*/ + +/* This code is originally from Mark Borgerding's KISS-FFT but has been + heavily modified to better suit Opus */ + +#ifndef SKIP_CONFIG_H +# ifdef HAVE_CONFIG_H +# include "config.h" +# endif +#endif + +#include "_kiss_fft_guts.h" +#include "arch.h" +#include "os_support.h" +#include "mathops.h" +#include "stack_alloc.h" + +/* The guts header contains all the multiplication and addition macros that are defined for + complex numbers. It also delares the kf_ internal functions. +*/ + +static void kf_bfly2( + kiss_fft_cpx * Fout, + int m, + int N + ) +{ + kiss_fft_cpx * Fout2; + int i; + (void)m; +#ifdef CUSTOM_MODES + if (m==1) + { + celt_assert(m==1); + for (i=0;itwiddles; + /* m is guaranteed to be a multiple of 4. */ + for (j=0;jtwiddles[fstride*m]; +#endif + for (i=0;itwiddles; + /* For non-custom modes, m is guaranteed to be a multiple of 4. 
*/ + k=m; + do { + + C_MUL(scratch[1],Fout[m] , *tw1); + C_MUL(scratch[2],Fout[m2] , *tw2); + + C_ADD(scratch[3],scratch[1],scratch[2]); + C_SUB(scratch[0],scratch[1],scratch[2]); + tw1 += fstride; + tw2 += fstride*2; + + Fout[m].r = Fout->r - HALF_OF(scratch[3].r); + Fout[m].i = Fout->i - HALF_OF(scratch[3].i); + + C_MULBYSCALAR( scratch[0] , epi3.i ); + + C_ADDTO(*Fout,scratch[3]); + + Fout[m2].r = Fout[m].r + scratch[0].i; + Fout[m2].i = Fout[m].i - scratch[0].r; + + Fout[m].r -= scratch[0].i; + Fout[m].i += scratch[0].r; + + ++Fout; + } while(--k); + } +} + + +#ifndef OVERRIDE_kf_bfly5 +static void kf_bfly5( + kiss_fft_cpx * Fout, + const size_t fstride, + const kiss_fft_state *st, + int m, + int N, + int mm + ) +{ + kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4; + int i, u; + kiss_fft_cpx scratch[13]; + const kiss_twiddle_cpx *tw; + kiss_twiddle_cpx ya,yb; + kiss_fft_cpx * Fout_beg = Fout; + +#ifdef FIXED_POINT + ya.r = 10126; + ya.i = -31164; + yb.r = -26510; + yb.i = -19261; +#else + ya = st->twiddles[fstride*m]; + yb = st->twiddles[fstride*2*m]; +#endif + tw=st->twiddles; + + for (i=0;ir += scratch[7].r + scratch[8].r; + Fout0->i += scratch[7].i + scratch[8].i; + + scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r); + scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r); + + scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i); + scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i); + + C_SUB(*Fout1,scratch[5],scratch[6]); + C_ADD(*Fout4,scratch[5],scratch[6]); + + scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r); + scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r); + scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i); + scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i); + + C_ADD(*Fout2,scratch[11],scratch[12]); + C_SUB(*Fout3,scratch[11],scratch[12]); + + ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4; + } + } +} +#endif /* OVERRIDE_kf_bfly5 */ + + +#endif + + +#ifdef CUSTOM_MODES + +static +void compute_bitrev_table( + int Fout, + opus_int16 *f, + const size_t fstride, + int in_stride, + opus_int16 * factors, + const kiss_fft_state *st + ) +{ + const int p=*factors++; /* the radix */ + const int m=*factors++; /* stage's fft length/p */ + + /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/ + if (m==1) + { + int j; + for (j=0;j32000 || (opus_int32)p*(opus_int32)p > n) + p = n; /* no more factors, skip to end */ + } + n /= p; +#ifdef RADIX_TWO_ONLY + if (p!=2 && p != 4) +#else + if (p>5) +#endif + { + return 0; + } + facbuf[2*stages] = p; + if (p==2 && stages > 1) + { + facbuf[2*stages] = 4; + facbuf[2] = 2; + } + stages++; + } while (n > 1); + n = nbak; + /* Reverse the order to get the radix 4 at the end, so we can use the + fast degenerate case. It turns out that reversing the order also + improves the noise behaviour. 
*/ + for (i=0;i= memneeded) + st = (kiss_fft_state*)mem; + *lenmem = memneeded; + } + if (st) { + opus_int16 *bitrev; + kiss_twiddle_cpx *twiddles; + + st->nfft=nfft; +#ifdef FIXED_POINT + st->scale_shift = celt_ilog2(st->nfft); + if (st->nfft == 1<scale_shift) + st->scale = Q15ONE; + else + st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift); +#else + st->scale = 1.f/nfft; +#endif + if (base != NULL) + { + st->twiddles = base->twiddles; + st->shift = 0; + while (st->shift < 32 && nfft<shift != base->nfft) + st->shift++; + if (st->shift>=32) + goto fail; + } else { + st->twiddles = twiddles = (kiss_twiddle_cpx*)KISS_FFT_MALLOC(sizeof(kiss_twiddle_cpx)*nfft); + compute_twiddles(twiddles, nfft); + st->shift = -1; + } + if (!kf_factor(nfft,st->factors)) + { + goto fail; + } + + /* bitrev */ + st->bitrev = bitrev = (opus_int16*)KISS_FFT_MALLOC(sizeof(opus_int16)*nfft); + if (st->bitrev==NULL) + goto fail; + compute_bitrev_table(0, bitrev, 1,1, st->factors,st); + + /* Initialize architecture specific fft parameters */ + if (opus_fft_alloc_arch(st, arch)) + goto fail; + } + return st; +fail: + opus_fft_free(st, arch); + return NULL; +} + +kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch) +{ + return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL, arch); +} + +void opus_fft_free_arch_c(kiss_fft_state *st) { + (void)st; +} + +void opus_fft_free(const kiss_fft_state *cfg, int arch) +{ + if (cfg) + { + opus_fft_free_arch((kiss_fft_state *)cfg, arch); + opus_free((opus_int16*)cfg->bitrev); + if (cfg->shift < 0) + opus_free((kiss_twiddle_cpx*)cfg->twiddles); + opus_free((kiss_fft_state*)cfg); + } +} + +#endif /* CUSTOM_MODES */ + +void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout) +{ + int m2, m; + int p; + int L; + int fstride[MAXFACTORS]; + int i; + int shift; + + /* st->shift can be -1 */ + shift = st->shift>0 ? st->shift : 0; + + fstride[0] = 1; + L=0; + do { + p = st->factors[2*L]; + m = st->factors[2*L+1]; + fstride[L+1] = fstride[L]*p; + L++; + } while(m!=1); + m = st->factors[2*L-1]; + for (i=L-1;i>=0;i--) + { + if (i!=0) + m2 = st->factors[2*i-1]; + else + m2 = 1; + switch (st->factors[2*i]) + { + case 2: + kf_bfly2(fout, m, fstride[i]); + break; + case 4: + kf_bfly4(fout,fstride[i]<scale_shift-1; +#endif + scale = st->scale; + + celt_assert2 (fin != fout, "In-place FFT not supported"); + /* Bit-reverse the input */ + for (i=0;infft;i++) + { + kiss_fft_cpx x = fin[i]; + fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift); + fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift); + } + opus_fft_impl(st, fout); +} + + +void opus_ifft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) +{ + int i; + celt_assert2 (fin != fout, "In-place FFT not supported"); + /* Bit-reverse the input */ + for (i=0;infft;i++) + fout[st->bitrev[i]] = fin[i]; + for (i=0;infft;i++) + fout[i].i = -fout[i].i; + opus_fft_impl(st, fout); + for (i=0;infft;i++) + fout[i].i = -fout[i].i; +} diff --git a/thirdparty/opus/celt/kiss_fft.h b/thirdparty/opus/celt/kiss_fft.h new file mode 100644 index 000000000000..bffa2bfad639 --- /dev/null +++ b/thirdparty/opus/celt/kiss_fft.h @@ -0,0 +1,200 @@ +/*Copyright (c) 2003-2004, Mark Borgerding + Lots of modifications by Jean-Marc Valin + Copyright (c) 2005-2007, Xiph.Org Foundation + Copyright (c) 2008, Xiph.Org Foundation, CSIRO + + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE.*/ + +#ifndef KISS_FFT_H +#define KISS_FFT_H + +#include +#include +#include "arch.h" +#include "cpu_support.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef USE_SIMD +# include +# define kiss_fft_scalar __m128 +#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes) +#else +#define KISS_FFT_MALLOC opus_alloc +#endif + +#ifdef FIXED_POINT +#include "arch.h" + +# define kiss_fft_scalar opus_int32 +# define kiss_twiddle_scalar opus_int16 + + +#else +# ifndef kiss_fft_scalar +/* default is float */ +# define kiss_fft_scalar float +# define kiss_twiddle_scalar float +# define KF_SUFFIX _celt_single +# endif +#endif + +typedef struct { + kiss_fft_scalar r; + kiss_fft_scalar i; +}kiss_fft_cpx; + +typedef struct { + kiss_twiddle_scalar r; + kiss_twiddle_scalar i; +}kiss_twiddle_cpx; + +#define MAXFACTORS 8 +/* e.g. an fft of length 128 has 4 factors + as far as kissfft is concerned + 4*4*4*2 + */ + +typedef struct arch_fft_state{ + int is_supported; + void *priv; +} arch_fft_state; + +typedef struct kiss_fft_state{ + int nfft; + opus_val16 scale; +#ifdef FIXED_POINT + int scale_shift; +#endif + int shift; + opus_int16 factors[2*MAXFACTORS]; + const opus_int16 *bitrev; + const kiss_twiddle_cpx *twiddles; + arch_fft_state *arch_fft; +} kiss_fft_state; + +#if defined(HAVE_ARM_NE10) +#include "arm/fft_arm.h" +#endif + +/*typedef struct kiss_fft_state* kiss_fft_cfg;*/ + +/** + * opus_fft_alloc + * + * Initialize a FFT (or IFFT) algorithm's cfg/state buffer. + * + * typical usage: kiss_fft_cfg mycfg=opus_fft_alloc(1024,0,NULL,NULL); + * + * The return value from fft_alloc is a cfg buffer used internally + * by the fft routine or NULL. + * + * If lenmem is NULL, then opus_fft_alloc will allocate a cfg buffer using malloc. + * The returned value should be free()d when done to avoid memory leaks. + * + * The state can be placed in a user supplied buffer 'mem': + * If lenmem is not NULL and mem is not NULL and *lenmem is large enough, + * then the function places the cfg in mem and the size used in *lenmem + * and returns mem. + * + * If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough), + * then the function returns NULL and places the minimum cfg + * buffer size in *lenmem. 
+ * */ + +kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base, int arch); + +kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch); + +/** + * opus_fft(cfg,in_out_buf) + * + * Perform an FFT on a complex input buffer. + * for a forward FFT, + * fin should be f[0] , f[1] , ... ,f[nfft-1] + * fout will be F[0] , F[1] , ... ,F[nfft-1] + * Note that each element is complex and can be accessed like + f[k].r and f[k].i + * */ +void opus_fft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout); +void opus_ifft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout); + +void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout); +void opus_ifft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout); + +void opus_fft_free(const kiss_fft_state *cfg, int arch); + + +void opus_fft_free_arch_c(kiss_fft_state *st); +int opus_fft_alloc_arch_c(kiss_fft_state *st); + +#if !defined(OVERRIDE_OPUS_FFT) +/* Is run-time CPU detection enabled on this platform? */ +#if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) + +extern int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])( + kiss_fft_state *st); + +#define opus_fft_alloc_arch(_st, arch) \ + ((*OPUS_FFT_ALLOC_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st)) + +extern void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])( + kiss_fft_state *st); +#define opus_fft_free_arch(_st, arch) \ + ((*OPUS_FFT_FREE_ARCH_IMPL[(arch)&OPUS_ARCHMASK])(_st)) + +extern void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, + const kiss_fft_cpx *fin, kiss_fft_cpx *fout); +#define opus_fft(_cfg, _fin, _fout, arch) \ + ((*OPUS_FFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout)) + +extern void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, + const kiss_fft_cpx *fin, kiss_fft_cpx *fout); +#define opus_ifft(_cfg, _fin, _fout, arch) \ + ((*OPUS_IFFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout)) + +#else /* else for if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */ + +#define opus_fft_alloc_arch(_st, arch) \ + ((void)(arch), opus_fft_alloc_arch_c(_st)) + +#define opus_fft_free_arch(_st, arch) \ + ((void)(arch), opus_fft_free_arch_c(_st)) + +#define opus_fft(_cfg, _fin, _fout, arch) \ + ((void)(arch), opus_fft_c(_cfg, _fin, _fout)) + +#define opus_ifft(_cfg, _fin, _fout, arch) \ + ((void)(arch), opus_ifft_c(_cfg, _fin, _fout)) + +#endif /* end if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */ +#endif /* end if !defined(OVERRIDE_OPUS_FFT) */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/celt/laplace.c b/thirdparty/opus/celt/laplace.c new file mode 100644 index 000000000000..a7bca874b6e6 --- /dev/null +++ b/thirdparty/opus/celt/laplace.c @@ -0,0 +1,134 @@ +/* Copyright (c) 2007 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "laplace.h" +#include "mathops.h" + +/* The minimum probability of an energy delta (out of 32768). */ +#define LAPLACE_LOG_MINP (0) +#define LAPLACE_MINP (1<>15; +} + +void ec_laplace_encode(ec_enc *enc, int *value, unsigned fs, int decay) +{ + unsigned fl; + int val = *value; + fl = 0; + if (val) + { + int s; + int i; + s = -(val<0); + val = (val+s)^s; + fl = fs; + fs = ec_laplace_get_freq1(fs, decay); + /* Search the decaying part of the PDF.*/ + for (i=1; fs > 0 && i < val; i++) + { + fs *= 2; + fl += fs+2*LAPLACE_MINP; + fs = (fs*(opus_int32)decay)>>15; + } + /* Everything beyond that has probability LAPLACE_MINP. */ + if (!fs) + { + int di; + int ndi_max; + ndi_max = (32768-fl+LAPLACE_MINP-1)>>LAPLACE_LOG_MINP; + ndi_max = (ndi_max-s)>>1; + di = IMIN(val - i, ndi_max - 1); + fl += (2*di+1+s)*LAPLACE_MINP; + fs = IMIN(LAPLACE_MINP, 32768-fl); + *value = (i+di+s)^s; + } + else + { + fs += LAPLACE_MINP; + fl += fs&~s; + } + celt_assert(fl+fs<=32768); + celt_assert(fs>0); + } + ec_encode_bin(enc, fl, fl+fs, 15); +} + +int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay) +{ + int val=0; + unsigned fl; + unsigned fm; + fm = ec_decode_bin(dec, 15); + fl = 0; + if (fm >= fs) + { + val++; + fl = fs; + fs = ec_laplace_get_freq1(fs, decay)+LAPLACE_MINP; + /* Search the decaying part of the PDF.*/ + while(fs > LAPLACE_MINP && fm >= fl+2*fs) + { + fs *= 2; + fl += fs; + fs = ((fs-2*LAPLACE_MINP)*(opus_int32)decay)>>15; + fs += LAPLACE_MINP; + val++; + } + /* Everything beyond that has probability LAPLACE_MINP. 
*/ + if (fs <= LAPLACE_MINP) + { + int di; + di = (fm-fl)>>(LAPLACE_LOG_MINP+1); + val += di; + fl += 2*di*LAPLACE_MINP; + } + if (fm < fl+fs) + val = -val; + else + fl += fs; + } + celt_assert(fl<32768); + celt_assert(fs>0); + celt_assert(fl<=fm); + celt_assert(fm>1; + b=1U<>=1; + bshift--; + } + while(bshift>=0); + return g; +} + +#ifdef FIXED_POINT + +opus_val32 frac_div32(opus_val32 a, opus_val32 b) +{ + opus_val16 rcp; + opus_val32 result, rem; + int shift = celt_ilog2(b)-29; + a = VSHR32(a,shift); + b = VSHR32(b,shift); + /* 16-bit reciprocal */ + rcp = ROUND16(celt_rcp(ROUND16(b,16)),3); + result = MULT16_32_Q15(rcp, a); + rem = PSHR32(a,2)-MULT32_32_Q31(result, b); + result = ADD32(result, SHL32(MULT16_32_Q15(rcp, rem),2)); + if (result >= 536870912) /* 2^29 */ + return 2147483647; /* 2^31 - 1 */ + else if (result <= -536870912) /* -2^29 */ + return -2147483647; /* -2^31 */ + else + return SHL32(result, 2); +} + +/** Reciprocal sqrt approximation in the range [0.25,1) (Q16 in, Q14 out) */ +opus_val16 celt_rsqrt_norm(opus_val32 x) +{ + opus_val16 n; + opus_val16 r; + opus_val16 r2; + opus_val16 y; + /* Range of n is [-16384,32767] ([-0.5,1) in Q15). */ + n = x-32768; + /* Get a rough initial guess for the root. + The optimal minimax quadratic approximation (using relative error) is + r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485). + Coefficients here, and the final result r, are Q14.*/ + r = ADD16(23557, MULT16_16_Q15(n, ADD16(-13490, MULT16_16_Q15(n, 6713)))); + /* We want y = x*r*r-1 in Q15, but x is 32-bit Q16 and r is Q14. + We can compute the result from n and r using Q15 multiplies with some + adjustment, carefully done to avoid overflow. + Range of y is [-1564,1594]. */ + r2 = MULT16_16_Q15(r, r); + y = SHL16(SUB16(ADD16(MULT16_16_Q15(r2, n), r2), 16384), 1); + /* Apply a 2nd-order Householder iteration: r += r*y*(y*0.375-0.5). + This yields the Q14 reciprocal square root of the Q16 x, with a maximum + relative error of 1.04956E-4, a (relative) RMSE of 2.80979E-5, and a + peak absolute error of 2.26591/16384. */ + return ADD16(r, MULT16_16_Q15(r, MULT16_16_Q15(y, + SUB16(MULT16_16_Q15(y, 12288), 16384)))); +} + +/** Sqrt approximation (QX input, QX/2 output) */ +opus_val32 celt_sqrt(opus_val32 x) +{ + int k; + opus_val16 n; + opus_val32 rt; + static const opus_val16 C[5] = {23175, 11561, -3011, 1699, -664}; + if (x==0) + return 0; + else if (x>=1073741824) + return 32767; + k = (celt_ilog2(x)>>1)-7; + x = VSHR32(x, 2*k); + n = x-32768; + rt = ADD16(C[0], MULT16_16_Q15(n, ADD16(C[1], MULT16_16_Q15(n, ADD16(C[2], + MULT16_16_Q15(n, ADD16(C[3], MULT16_16_Q15(n, (C[4]))))))))); + rt = VSHR32(rt,7-k); + return rt; +} + +#define L1 32767 +#define L2 -7651 +#define L3 8277 +#define L4 -626 + +static OPUS_INLINE opus_val16 _celt_cos_pi_2(opus_val16 x) +{ + opus_val16 x2; + + x2 = MULT16_16_P15(x,x); + return ADD16(1,MIN16(32766,ADD32(SUB16(L1,x2), MULT16_16_P15(x2, ADD32(L2, MULT16_16_P15(x2, ADD32(L3, MULT16_16_P15(L4, x2 + )))))))); +} + +#undef L1 +#undef L2 +#undef L3 +#undef L4 + +opus_val16 celt_cos_norm(opus_val32 x) +{ + x = x&0x0001ffff; + if (x>SHL32(EXTEND32(1), 16)) + x = SUB32(SHL32(EXTEND32(1), 17),x); + if (x&0x00007fff) + { + if (x0, "celt_rcp() only defined for positive values"); + i = celt_ilog2(x); + /* n is Q15 with range [0,1). */ + n = VSHR32(x,i-15)-32768; + /* Start with a linear approximation: + r = 1.8823529411764706-0.9411764705882353*n. 
+ The coefficients and the result are Q14 in the range [15420,30840].*/ + r = ADD16(30840, MULT16_16_Q15(-15420, n)); + /* Perform two Newton iterations: + r -= r*((r*n)-1.Q15) + = r*((r*n)+(r-1.Q15)). */ + r = SUB16(r, MULT16_16_Q15(r, + ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768)))); + /* We subtract an extra 1 in the second iteration to avoid overflow; it also + neatly compensates for truncation error in the rest of the process. */ + r = SUB16(r, ADD16(1, MULT16_16_Q15(r, + ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768))))); + /* r is now the Q15 solution to 2/(n+1), with a maximum relative error + of 7.05346E-5, a (relative) RMSE of 2.14418E-5, and a peak absolute + error of 1.24665/32768. */ + return VSHR32(EXTEND32(r),i-16); +} + +#endif diff --git a/thirdparty/opus/celt/mathops.h b/thirdparty/opus/celt/mathops.h new file mode 100644 index 000000000000..a0525a961030 --- /dev/null +++ b/thirdparty/opus/celt/mathops.h @@ -0,0 +1,258 @@ +/* Copyright (c) 2002-2008 Jean-Marc Valin + Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/** + @file mathops.h + @brief Various math functions +*/ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef MATHOPS_H +#define MATHOPS_H + +#include "arch.h" +#include "entcode.h" +#include "os_support.h" + +/* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */ +#define FRAC_MUL16(a,b) ((16384+((opus_int32)(opus_int16)(a)*(opus_int16)(b)))>>15) + +unsigned isqrt32(opus_uint32 _val); + +#ifndef OVERRIDE_CELT_MAXABS16 +static OPUS_INLINE opus_val32 celt_maxabs16(const opus_val16 *x, int len) +{ + int i; + opus_val16 maxval = 0; + opus_val16 minval = 0; + for (i=0;i>23)-127; + in.i -= integer<<23; + frac = in.f - 1.5f; + frac = -0.41445418f + frac*(0.95909232f + + frac*(-0.33951290f + frac*0.16541097f)); + return 1+integer+frac; +} + +/** Base-2 exponential approximation (2^x). 
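The float celt_log2() just above leans directly on the IEEE-754 layout: the exponent field supplies the integer part of the logarithm and a cubic polynomial in the mantissa supplies the rest. A self-contained sketch of the same trick (same coefficients, memcpy instead of the union; good to a few parts in 1e4 for normal positive inputs, like the original):

#include <stdint.h>
#include <string.h>

// Sketch of the exponent/mantissa split used by the float celt_log2() above.
static float log2_sketch(float x)
{
    uint32_t bits;
    float m, frac;
    int e;
    memcpy(&bits, &x, sizeof bits);                // portable type pun
    e = (int)(bits >> 23) - 127;                   // unbiased exponent = floor(log2(x))
    bits = (bits & 0x007fffffu) | (127u << 23);    // force stored exponent to 127: m in [1, 2)
    memcpy(&m, &bits, sizeof m);
    frac = m - 1.5f;                               // centre the cubic on the interval
    frac = -0.41445418f + frac*(0.95909232f
         + frac*(-0.33951290f + frac*0.16541097f));
    return 1 + e + frac;
}

The leading 1 compensates for the polynomial being fitted to log2(m) − 1 around m = 1.5, matching the original's return 1+integer+frac.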
*/ +static OPUS_INLINE float celt_exp2(float x) +{ + int integer; + float frac; + union { + float f; + opus_uint32 i; + } res; + integer = floor(x); + if (integer < -50) + return 0; + frac = x-integer; + /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */ + res.f = 0.99992522f + frac * (0.69583354f + + frac * (0.22606716f + 0.078024523f*frac)); + res.i = (res.i + (integer<<23)) & 0x7fffffff; + return res.f; +} + +#else +#define celt_log2(x) ((float)(1.442695040888963387*log(x))) +#define celt_exp2(x) ((float)exp(0.6931471805599453094*(x))) +#endif + +#endif + +#ifdef FIXED_POINT + +#include "os_support.h" + +#ifndef OVERRIDE_CELT_ILOG2 +/** Integer log in base2. Undefined for zero and negative numbers */ +static OPUS_INLINE opus_int16 celt_ilog2(opus_int32 x) +{ + celt_assert2(x>0, "celt_ilog2() only defined for strictly positive numbers"); + return EC_ILOG(x)-1; +} +#endif + + +/** Integer log in base2. Defined for zero, but not for negative numbers */ +static OPUS_INLINE opus_int16 celt_zlog2(opus_val32 x) +{ + return x <= 0 ? 0 : celt_ilog2(x); +} + +opus_val16 celt_rsqrt_norm(opus_val32 x); + +opus_val32 celt_sqrt(opus_val32 x); + +opus_val16 celt_cos_norm(opus_val32 x); + +/** Base-2 logarithm approximation (log2(x)). (Q14 input, Q10 output) */ +static OPUS_INLINE opus_val16 celt_log2(opus_val32 x) +{ + int i; + opus_val16 n, frac; + /* -0.41509302963303146, 0.9609890551383969, -0.31836011537636605, + 0.15530808010959576, -0.08556153059057618 */ + static const opus_val16 C[5] = {-6801+(1<<(13-DB_SHIFT)), 15746, -5217, 2545, -1401}; + if (x==0) + return -32767; + i = celt_ilog2(x); + n = VSHR32(x,i-15)-32768-16384; + frac = ADD16(C[0], MULT16_16_Q15(n, ADD16(C[1], MULT16_16_Q15(n, ADD16(C[2], MULT16_16_Q15(n, ADD16(C[3], MULT16_16_Q15(n, C[4])))))))); + return SHL16(i-13,DB_SHIFT)+SHR16(frac,14-DB_SHIFT); +} + +/* + K0 = 1 + K1 = log(2) + K2 = 3-4*log(2) + K3 = 3*log(2) - 2 +*/ +#define D0 16383 +#define D1 22804 +#define D2 14819 +#define D3 10204 + +static OPUS_INLINE opus_val32 celt_exp2_frac(opus_val16 x) +{ + opus_val16 frac; + frac = SHL16(x, 4); + return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac)))))); +} +/** Base-2 exponential approximation (2^x). (Q10 input, Q16 output) */ +static OPUS_INLINE opus_val32 celt_exp2(opus_val16 x) +{ + int integer; + opus_val16 frac; + integer = SHR16(x,10); + if (integer>14) + return 0x7f000000; + else if (integer < -15) + return 0; + frac = celt_exp2_frac(x-SHL16(integer,10)); + return VSHR32(EXTEND32(frac), -integer-2); +} + +opus_val32 celt_rcp(opus_val32 x); + +#define celt_div(a,b) MULT32_32_Q31((opus_val32)(a),celt_rcp(b)) + +opus_val32 frac_div32(opus_val32 a, opus_val32 b); + +#define M1 32767 +#define M2 -21 +#define M3 -11943 +#define M4 4936 + +/* Atan approximation using a 4th order polynomial. Input is in Q15 format + and normalized by pi/4. 
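(That is, the Q15 input represents tan θ for θ ∈ [0, π/4], and the polynomial below returns θ itself, so the largest possible result is about 25736 ≈ (π/4)·2^15. celt_atan2p() then maps a general positive pair into this range with the complementary identity atan2(y, x) = π/2 − atan2(x, y) when y ≥ x, halves the result into Q14 radians, and adds 25736 = round((π/2)·2^14) in the complementary branch.)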
Output is in Q15 format */ +static OPUS_INLINE opus_val16 celt_atan01(opus_val16 x) +{ + return MULT16_16_P15(x, ADD32(M1, MULT16_16_P15(x, ADD32(M2, MULT16_16_P15(x, ADD32(M3, MULT16_16_P15(M4, x))))))); +} + +#undef M1 +#undef M2 +#undef M3 +#undef M4 + +/* atan2() approximation valid for positive input values */ +static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x) +{ + if (y < x) + { + opus_val32 arg; + arg = celt_div(SHL32(EXTEND32(y),15),x); + if (arg >= 32767) + arg = 32767; + return SHR16(celt_atan01(EXTRACT16(arg)),1); + } else { + opus_val32 arg; + arg = celt_div(SHL32(EXTEND32(x),15),y); + if (arg >= 32767) + arg = 32767; + return 25736-SHR16(celt_atan01(EXTRACT16(arg)),1); + } +} + +#endif /* FIXED_POINT */ +#endif /* MATHOPS_H */ diff --git a/thirdparty/opus/celt/mdct.c b/thirdparty/opus/celt/mdct.c new file mode 100644 index 000000000000..5315ad11a37b --- /dev/null +++ b/thirdparty/opus/celt/mdct.c @@ -0,0 +1,343 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2008 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* This is a simple MDCT implementation that uses a N/4 complex FFT + to do most of the work. It should be relatively straightforward to + plug in pretty much and FFT here. + + This replaces the Vorbis FFT (and uses the exact same API), which + was a bit too messy and that was ending up duplicating code + (might as well use the same FFT everywhere). + + The algorithm is similar to (and inspired from) Fabrice Bellard's + MDCT implementation in FFMPEG, but has differences in signs, ordering + and scaling in many places. 
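For orientation, the transform computed here is the length-N MDCT

\[
X[k]=\sum_{n=0}^{N-1} x[n]\,\cos\!\Big[\tfrac{2\pi}{N}\big(n+\tfrac12+\tfrac{N}{4}\big)\big(k+\tfrac12\big)\Big],
\qquad k=0,\dots,\tfrac{N}{2}-1,
\]

up to the sign and scaling differences noted above, evaluated as: fold the windowed N real inputs down to N/2, pre-twiddle into N/4 complex points, run the N/4 complex FFT, and post-twiddle back out. clt_mdct_init() below precomputes the twiddle tables for every supported downshift of N.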
+*/ + +#ifndef SKIP_CONFIG_H +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#endif + +#include "mdct.h" +#include "kiss_fft.h" +#include "_kiss_fft_guts.h" +#include +#include "os_support.h" +#include "mathops.h" +#include "stack_alloc.h" + +#if defined(MIPSr1_ASM) +#include "mips/mdct_mipsr1.h" +#endif + + +#ifdef CUSTOM_MODES + +int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch) +{ + int i; + kiss_twiddle_scalar *trig; + int shift; + int N2=N>>1; + l->n = N; + l->maxshift = maxshift; + for (i=0;i<=maxshift;i++) + { + if (i==0) + l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch); + else + l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch); +#ifndef ENABLE_TI_DSPLIB55 + if (l->kfft[i]==NULL) + return 0; +#endif + } + l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar)); + if (l->trig==NULL) + return 0; + for (shift=0;shift<=maxshift;shift++) + { + /* We have enough points that sine isn't necessary */ +#if defined(FIXED_POINT) +#if 1 + for (i=0;i>= 1; + N >>= 1; + } + return 1; +} + +void clt_mdct_clear(mdct_lookup *l, int arch) +{ + int i; + for (i=0;i<=l->maxshift;i++) + opus_fft_free(l->kfft[i], arch); + opus_free((kiss_twiddle_scalar*)l->trig); +} + +#endif /* CUSTOM_MODES */ + +/* Forward MDCT trashes the input array */ +#ifndef OVERRIDE_clt_mdct_forward +void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, + const opus_val16 *window, int overlap, int shift, int stride, int arch) +{ + int i; + int N, N2, N4; + VARDECL(kiss_fft_scalar, f); + VARDECL(kiss_fft_cpx, f2); + const kiss_fft_state *st = l->kfft[shift]; + const kiss_twiddle_scalar *trig; + opus_val16 scale; +#ifdef FIXED_POINT + /* Allows us to scale with MULT16_32_Q16(), which is faster than + MULT16_32_Q15() on ARM. 
*/ + int scale_shift = st->scale_shift-1; +#endif + SAVE_STACK; + (void)arch; + scale = st->scale; + + N = l->n; + trig = l->trig; + for (i=0;i>= 1; + trig += N; + } + N2 = N>>1; + N4 = N>>2; + + ALLOC(f, N2, kiss_fft_scalar); + ALLOC(f2, N4, kiss_fft_cpx); + + /* Consider the input to be composed of four blocks: [a, b, c, d] */ + /* Window, shuffle, fold */ + { + /* Temp pointers to make it really clear to the compiler what we're doing */ + const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); + const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); + kiss_fft_scalar * OPUS_RESTRICT yp = f; + const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); + const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; + for(i=0;i<((overlap+3)>>2);i++) + { + /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ + *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2); + *yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]); + xp1+=2; + xp2-=2; + wp1+=2; + wp2-=2; + } + wp1 = window; + wp2 = window+overlap-1; + for(;i>2);i++) + { + /* Real part arranged as a-bR, Imag part arranged as -c-dR */ + *yp++ = *xp2; + *yp++ = *xp1; + xp1+=2; + xp2-=2; + } + for(;ibitrev[i]] = yc; + } + } + + /* N/4 complex FFT, does not downscale anymore */ + opus_fft_impl(st, f2); + + /* Post-rotate */ + { + /* Temp pointers to make it really clear to the compiler what we're doing */ + const kiss_fft_cpx * OPUS_RESTRICT fp = f2; + kiss_fft_scalar * OPUS_RESTRICT yp1 = out; + kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); + const kiss_twiddle_scalar *t = &trig[0]; + /* Temp pointers to make it really clear to the compiler what we're doing */ + for(i=0;ii,t[N4+i]) - S_MUL(fp->r,t[i]); + yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]); + *yp1 = yr; + *yp2 = yi; + fp++; + yp1 += 2*stride; + yp2 -= 2*stride; + } + } + RESTORE_STACK; +} +#endif /* OVERRIDE_clt_mdct_forward */ + +#ifndef OVERRIDE_clt_mdct_backward +void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, + const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch) +{ + int i; + int N, N2, N4; + const kiss_twiddle_scalar *trig; + (void) arch; + + N = l->n; + trig = l->trig; + for (i=0;i>= 1; + trig += N; + } + N2 = N>>1; + N4 = N>>2; + + /* Pre-rotate */ + { + /* Temp pointers to make it really clear to the compiler what we're doing */ + const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; + const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); + kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); + const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; + const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; + for(i=0;ikfft[shift], (kiss_fft_cpx*)(out+(overlap>>1))); + + /* Post-rotate and de-shuffle from both ends of the buffer at once to make + it in-place. */ + { + kiss_fft_scalar * yp0 = out+(overlap>>1); + kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; + const kiss_twiddle_scalar *t = &trig[0]; + /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the + middle pair will be computed twice. */ + for(i=0;i<(N4+1)>>1;i++) + { + kiss_fft_scalar re, im, yr, yi; + kiss_twiddle_scalar t0, t1; + /* We swap real and imag because we're using an FFT instead of an IFFT. 
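A note on the "window, shuffle, fold" stage of clt_mdct_forward_c() above: splitting the windowed N-sample input into four N/4 blocks [a, b, c, d], the loops build the standard TDAC fold

\[
\big(\,-c_R - d,\;\; a - b_R\,\big)
\]

(with R denoting time reversal, and modulo this implementation's sign and ordering conventions), which is what reduces an N-point MDCT to an N/2-point real sequence and hence an N/4 complex FFT. The window weights are applied on the fly, and only across the overlap region; the middle of the frame is folded unweighted.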
*/ + re = yp0[1]; + im = yp0[0]; + t0 = t[i]; + t1 = t[N4+i]; + /* We'd scale up by 2 here, but instead it's done when mixing the windows */ + yr = S_MUL(re,t0) + S_MUL(im,t1); + yi = S_MUL(re,t1) - S_MUL(im,t0); + /* We swap real and imag because we're using an FFT instead of an IFFT. */ + re = yp1[1]; + im = yp1[0]; + yp0[0] = yr; + yp1[1] = yi; + + t0 = t[(N4-i-1)]; + t1 = t[(N2-i-1)]; + /* We'd scale up by 2 here, but instead it's done when mixing the windows */ + yr = S_MUL(re,t0) + S_MUL(im,t1); + yi = S_MUL(re,t1) - S_MUL(im,t0); + yp1[0] = yr; + yp0[1] = yi; + yp0 += 2; + yp1 -= 2; + } + } + + /* Mirror on both sides for TDAC */ + { + kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; + kiss_fft_scalar * OPUS_RESTRICT yp1 = out; + const opus_val16 * OPUS_RESTRICT wp1 = window; + const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; + + for(i = 0; i < overlap/2; i++) + { + kiss_fft_scalar x1, x2; + x1 = *xp1; + x2 = *yp1; + *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); + *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); + wp1++; + wp2--; + } + } +} +#endif /* OVERRIDE_clt_mdct_backward */ diff --git a/thirdparty/opus/celt/mdct.h b/thirdparty/opus/celt/mdct.h new file mode 100644 index 000000000000..160ae4e0f3ba --- /dev/null +++ b/thirdparty/opus/celt/mdct.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2008 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* This is a simple MDCT implementation that uses a N/4 complex FFT + to do most of the work. It should be relatively straightforward to + plug in pretty much and FFT here. + + This replaces the Vorbis FFT (and uses the exact same API), which + was a bit too messy and that was ending up duplicating code + (might as well use the same FFT everywhere). + + The algorithm is similar to (and inspired from) Fabrice Bellard's + MDCT implementation in FFMPEG, but has differences in signs, ordering + and scaling in many places. 
+*/ + +#ifndef MDCT_H +#define MDCT_H + +#include "opus_defines.h" +#include "kiss_fft.h" +#include "arch.h" + +typedef struct { + int n; + int maxshift; + const kiss_fft_state *kfft[4]; + const kiss_twiddle_scalar * OPUS_RESTRICT trig; +} mdct_lookup; + +#if defined(HAVE_ARM_NE10) +#include "arm/mdct_arm.h" +#endif + + +int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch); +void clt_mdct_clear(mdct_lookup *l, int arch); + +/** Compute a forward MDCT and scale by 4/N, trashes the input array */ +void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, + kiss_fft_scalar * OPUS_RESTRICT out, + const opus_val16 *window, int overlap, + int shift, int stride, int arch); + +/** Compute a backward MDCT (no scaling) and performs weighted overlap-add + (scales implicitly by 1/2) */ +void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, + kiss_fft_scalar * OPUS_RESTRICT out, + const opus_val16 * OPUS_RESTRICT window, + int overlap, int shift, int stride, int arch); + +#if !defined(OVERRIDE_OPUS_MDCT) +/* Is run-time CPU detection enabled on this platform? */ +#if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10) + +extern void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])( + const mdct_lookup *l, kiss_fft_scalar *in, + kiss_fft_scalar * OPUS_RESTRICT out, const opus_val16 *window, + int overlap, int shift, int stride, int arch); + +#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \ + ((*CLT_MDCT_FORWARD_IMPL[(arch)&OPUS_ARCHMASK])(_l, _in, _out, \ + _window, _overlap, _shift, \ + _stride, _arch)) + +extern void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])( + const mdct_lookup *l, kiss_fft_scalar *in, + kiss_fft_scalar * OPUS_RESTRICT out, const opus_val16 *window, + int overlap, int shift, int stride, int arch); + +#define clt_mdct_backward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \ + (*CLT_MDCT_BACKWARD_IMPL[(arch)&OPUS_ARCHMASK])(_l, _in, _out, \ + _window, _overlap, _shift, \ + _stride, _arch) + +#else /* if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10) */ + +#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \ + clt_mdct_forward_c(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) + +#define clt_mdct_backward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \ + clt_mdct_backward_c(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) + +#endif /* end if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10) && !defined(FIXED_POINT) */ +#endif /* end if !defined(OVERRIDE_OPUS_MDCT) */ + +#endif diff --git a/thirdparty/opus/celt/mfrngcod.h b/thirdparty/opus/celt/mfrngcod.h new file mode 100644 index 000000000000..809152a59a96 --- /dev/null +++ b/thirdparty/opus/celt/mfrngcod.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2001-2008 Timothy B. Terriberry + Copyright (c) 2008-2009 Xiph.Org Foundation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
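The OPUS_HAVE_RTCD section of mdct.h above is Opus's run-time CPU dispatch pattern: an array of implementation pointers indexed by the detected architecture, masked to the table size, wrapped in a macro whose fallback is the portable *_c function. A minimal self-contained sketch of the same pattern, with all names hypothetical:

#include <stdio.h>

typedef void (*mdct_fn)(const float *in, float *out, int n);

static void mdct_c(const float *in, float *out, int n)    { (void)in; (void)out; (void)n; puts("portable C path"); }
static void mdct_simd(const float *in, float *out, int n) { (void)in; (void)out; (void)n; puts("SIMD path"); }

#define ARCHMASK 3   // table size minus one, as with OPUS_ARCHMASK

// One slot per detectable arch level; levels without a SIMD build fall back to C.
static mdct_fn const MDCT_IMPL[ARCHMASK + 1] = { mdct_c, mdct_c, mdct_c, mdct_simd };

#define my_mdct(in, out, n, arch) (MDCT_IMPL[(arch) & ARCHMASK](in, out, n))

int main(void)
{
    float in[4] = {0}, out[4];
    my_mdct(in, out, 4, 0);  // baseline CPU detected
    my_mdct(in, out, 4, 3);  // highest arch level detected at run time
    return 0;
}

The mask keeps the lookup branch-free and in bounds even if detection reports a higher level than the table provides.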
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(_mfrngcode_H)
+# define _mfrngcode_H (1)
+# include "entcode.h"
+
+/*Constants used by the entropy encoder/decoder.*/
+
+/*The number of bits to output at a time.*/
+# define EC_SYM_BITS (8)
+/*The total number of bits in each of the state registers.*/
+# define EC_CODE_BITS (32)
+/*The maximum symbol value.*/
+# define EC_SYM_MAX ((1U<<EC_SYM_BITS)-1)
+/*Bits to shift by to move a symbol into the high-order position.*/
+# define EC_CODE_SHIFT (EC_CODE_BITS-EC_SYM_BITS-1)
+/*Carry bit of the high-order range symbol.*/
+# define EC_CODE_TOP (((opus_uint32)1U)<<(EC_CODE_BITS-1))
+/*Low-order bit of the high-order range symbol.*/
+# define EC_CODE_BOT (EC_CODE_TOP>>EC_SYM_BITS)
+/*The number of bits available for the last, partial symbol in the code field.*/
+# define EC_CODE_EXTRA ((EC_CODE_BITS-2)%EC_SYM_BITS+1)
+#endif
diff --git a/thirdparty/opus/celt/mips/celt_mipsr1.h b/thirdparty/opus/celt/mips/celt_mipsr1.h
new file mode 100644
index 000000000000..e85661a6618c
--- /dev/null
+++ b/thirdparty/opus/celt/mips/celt_mipsr1.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/ + +#ifndef __CELT_MIPSR1_H__ +#define __CELT_MIPSR1_H__ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define CELT_C + +#include "os_support.h" +#include "mdct.h" +#include +#include "celt.h" +#include "pitch.h" +#include "bands.h" +#include "modes.h" +#include "entcode.h" +#include "quant_bands.h" +#include "rate.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "float_cast.h" +#include +#include "celt_lpc.h" +#include "vq.h" + +#define OVERRIDE_comb_filter +void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, + opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, + const opus_val16 *window, int overlap, int arch) +{ + int i; + opus_val32 x0, x1, x2, x3, x4; + + (void)arch; + + /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */ + opus_val16 g00, g01, g02, g10, g11, g12; + static const opus_val16 gains[3][3] = { + {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)}, + {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)}, + {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}}; + + if (g0==0 && g1==0) + { + /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */ + if (x!=y) + OPUS_MOVE(y, x, N); + return; + } + + g00 = MULT16_16_P15(g0, gains[tapset0][0]); + g01 = MULT16_16_P15(g0, gains[tapset0][1]); + g02 = MULT16_16_P15(g0, gains[tapset0][2]); + g10 = MULT16_16_P15(g1, gains[tapset1][0]); + g11 = MULT16_16_P15(g1, gains[tapset1][1]); + g12 = MULT16_16_P15(g1, gains[tapset1][2]); + x1 = x[-T1+1]; + x2 = x[-T1 ]; + x3 = x[-T1-1]; + x4 = x[-T1-2]; + /* If the filter didn't change, we don't need the overlap */ + if (g0==g1 && T0==T1 && tapset0==tapset1) + overlap=0; + + for (i=0;itwiddles[fstride*m]; + yb = st->twiddles[fstride*2*m]; +#endif + + tw=st->twiddles; + + for (i=0;ir += scratch[7].r + scratch[8].r; + Fout0->i += scratch[7].i + scratch[8].i; + scratch[5].r = scratch[0].r + S_MUL_ADD(scratch[7].r,ya.r,scratch[8].r,yb.r); + scratch[5].i = scratch[0].i + S_MUL_ADD(scratch[7].i,ya.r,scratch[8].i,yb.r); + + scratch[6].r = S_MUL_ADD(scratch[10].i,ya.i,scratch[9].i,yb.i); + scratch[6].i = -S_MUL_ADD(scratch[10].r,ya.i,scratch[9].r,yb.i); + + C_SUB(*Fout1,scratch[5],scratch[6]); + C_ADD(*Fout4,scratch[5],scratch[6]); + + scratch[11].r = scratch[0].r + S_MUL_ADD(scratch[7].r,yb.r,scratch[8].r,ya.r); + scratch[11].i = scratch[0].i + S_MUL_ADD(scratch[7].i,yb.r,scratch[8].i,ya.r); + + scratch[12].r = S_MUL_SUB(scratch[9].i,ya.i,scratch[10].i,yb.i); + scratch[12].i = S_MUL_SUB(scratch[10].r,yb.i,scratch[9].r,ya.i); + + C_ADD(*Fout2,scratch[11],scratch[12]); + C_SUB(*Fout3,scratch[11],scratch[12]); + + ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4; + } + } +} + + +#endif /* KISS_FFT_MIPSR1_H */ diff --git a/thirdparty/opus/celt/mips/mdct_mipsr1.h b/thirdparty/opus/celt/mips/mdct_mipsr1.h new file mode 100644 index 000000000000..2934dab7768a --- /dev/null +++ b/thirdparty/opus/celt/mips/mdct_mipsr1.h @@ -0,0 +1,288 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2008 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
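The comb_filter() override above is CELT's pitch postfilter: for lag T and gain g it adds a 5-tap symmetric FIR centred on x[n−T],

\[
y[n] = x[n] + g\big(w_0\,x[n-T] + w_1(x[n-T+1]+x[n-T-1]) + w_2(x[n-T+2]+x[n-T-2])\big),
\]

with (w0, w1, w2) drawn from the gains[tapset] rows, and it cross-fades from the old (T0, g0, tapset0) parameters to the new (T1, g1, tapset1) ones over the first overlap samples using the window. When both gains are zero the pass reduces to at most a copy, as the early-out at the top shows.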
+ + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* This is a simple MDCT implementation that uses a N/4 complex FFT + to do most of the work. It should be relatively straightforward to + plug in pretty much and FFT here. + + This replaces the Vorbis FFT (and uses the exact same API), which + was a bit too messy and that was ending up duplicating code + (might as well use the same FFT everywhere). + + The algorithm is similar to (and inspired from) Fabrice Bellard's + MDCT implementation in FFMPEG, but has differences in signs, ordering + and scaling in many places. +*/ +#ifndef __MDCT_MIPSR1_H__ +#define __MDCT_MIPSR1_H__ + +#ifndef SKIP_CONFIG_H +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#endif + +#include "mdct.h" +#include "kiss_fft.h" +#include "_kiss_fft_guts.h" +#include +#include "os_support.h" +#include "mathops.h" +#include "stack_alloc.h" + +/* Forward MDCT trashes the input array */ +#define OVERRIDE_clt_mdct_forward +void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, + const opus_val16 *window, int overlap, int shift, int stride, int arch) +{ + int i; + int N, N2, N4; + VARDECL(kiss_fft_scalar, f); + VARDECL(kiss_fft_cpx, f2); + const kiss_fft_state *st = l->kfft[shift]; + const kiss_twiddle_scalar *trig; + opus_val16 scale; +#ifdef FIXED_POINT + /* Allows us to scale with MULT16_32_Q16(), which is faster than + MULT16_32_Q15() on ARM. 
*/ + int scale_shift = st->scale_shift-1; +#endif + + (void)arch; + + SAVE_STACK; + scale = st->scale; + + N = l->n; + trig = l->trig; + for (i=0;i>= 1; + trig += N; + } + N2 = N>>1; + N4 = N>>2; + + ALLOC(f, N2, kiss_fft_scalar); + ALLOC(f2, N4, kiss_fft_cpx); + + /* Consider the input to be composed of four blocks: [a, b, c, d] */ + /* Window, shuffle, fold */ + { + /* Temp pointers to make it really clear to the compiler what we're doing */ + const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); + const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); + kiss_fft_scalar * OPUS_RESTRICT yp = f; + const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); + const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; + for(i=0;i<((overlap+3)>>2);i++) + { + /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ + *yp++ = S_MUL_ADD(*wp2, xp1[N2],*wp1,*xp2); + *yp++ = S_MUL_SUB(*wp1, *xp1,*wp2, xp2[-N2]); + xp1+=2; + xp2-=2; + wp1+=2; + wp2-=2; + } + wp1 = window; + wp2 = window+overlap-1; + for(;i>2);i++) + { + /* Real part arranged as a-bR, Imag part arranged as -c-dR */ + *yp++ = *xp2; + *yp++ = *xp1; + xp1+=2; + xp2-=2; + } + for(;ibitrev[i]] = yc; + } + } + + /* N/4 complex FFT, does not downscale anymore */ + opus_fft_impl(st, f2); + + /* Post-rotate */ + { + /* Temp pointers to make it really clear to the compiler what we're doing */ + const kiss_fft_cpx * OPUS_RESTRICT fp = f2; + kiss_fft_scalar * OPUS_RESTRICT yp1 = out; + kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); + const kiss_twiddle_scalar *t = &trig[0]; + /* Temp pointers to make it really clear to the compiler what we're doing */ + for(i=0;ii,t[N4+i] , fp->r,t[i]); + yi = S_MUL_ADD(fp->r,t[N4+i] ,fp->i,t[i]); + *yp1 = yr; + *yp2 = yi; + fp++; + yp1 += 2*stride; + yp2 -= 2*stride; + } + } + RESTORE_STACK; +} + +#define OVERRIDE_clt_mdct_backward +void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, + const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch) +{ + int i; + int N, N2, N4; + const kiss_twiddle_scalar *trig; + + (void)arch; + + N = l->n; + trig = l->trig; + for (i=0;i>= 1; + trig += N; + } + N2 = N>>1; + N4 = N>>2; + + /* Pre-rotate */ + { + /* Temp pointers to make it really clear to the compiler what we're doing */ + const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; + const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); + kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); + const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; + const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; + for(i=0;ikfft[shift], (kiss_fft_cpx*)(out+(overlap>>1))); + + /* Post-rotate and de-shuffle from both ends of the buffer at once to make + it in-place. */ + { + kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1); + kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2; + const kiss_twiddle_scalar *t = &trig[0]; + /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the + middle pair will be computed twice. */ + for(i=0;i<(N4+1)>>1;i++) + { + kiss_fft_scalar re, im, yr, yi; + kiss_twiddle_scalar t0, t1; + /* We swap real and imag because we're using an FFT instead of an IFFT. */ + re = yp0[1]; + im = yp0[0]; + t0 = t[i]; + t1 = t[N4+i]; + /* We'd scale up by 2 here, but instead it's done when mixing the windows */ + yr = S_MUL_ADD(re,t0 , im,t1); + yi = S_MUL_SUB(re,t1 , im,t0); + /* We swap real and imag because we're using an FFT instead of an IFFT. 
*/ + re = yp1[1]; + im = yp1[0]; + yp0[0] = yr; + yp1[1] = yi; + + t0 = t[(N4-i-1)]; + t1 = t[(N2-i-1)]; + /* We'd scale up by 2 here, but instead it's done when mixing the windows */ + yr = S_MUL_ADD(re,t0,im,t1); + yi = S_MUL_SUB(re,t1,im,t0); + yp1[0] = yr; + yp0[1] = yi; + yp0 += 2; + yp1 -= 2; + } + } + + /* Mirror on both sides for TDAC */ + { + kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; + kiss_fft_scalar * OPUS_RESTRICT yp1 = out; + const opus_val16 * OPUS_RESTRICT wp1 = window; + const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; + + for(i = 0; i < overlap/2; i++) + { + kiss_fft_scalar x1, x2; + x1 = *xp1; + x2 = *yp1; + *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); + *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); + wp1++; + wp2--; + } + } +} +#endif /* __MDCT_MIPSR1_H__ */ diff --git a/thirdparty/opus/celt/mips/pitch_mipsr1.h b/thirdparty/opus/celt/mips/pitch_mipsr1.h new file mode 100644 index 000000000000..a9500aff58e9 --- /dev/null +++ b/thirdparty/opus/celt/mips/pitch_mipsr1.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/** + @file pitch.h + @brief Pitch analysis + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef PITCH_MIPSR1_H +#define PITCH_MIPSR1_H + +#define OVERRIDE_DUAL_INNER_PROD +static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2, int arch) +{ + int j; + opus_val32 xy01=0; + opus_val32 xy02=0; + + (void)arch; + + asm volatile("MULT $ac1, $0, $0"); + asm volatile("MULT $ac2, $0, $0"); + /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */ + for (j=0;j=0;i--) + { + celt_norm x1, x2; + x1 = Xptr[0]; + x2 = Xptr[stride]; + Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2), s, x1), 15)); + *Xptr-- = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15)); + } +} + +#define OVERRIDE_renormalise_vector + +#define renormalise_vector(X, N, gain, arch) \ + (renormalise_vector_mips(X, N, gain, arch)) + +void renormalise_vector_mips(celt_norm *X, int N, opus_val16 gain, int arch) +{ + int i; +#ifdef FIXED_POINT + int k; +#endif + opus_val32 E = EPSILON; + opus_val16 g; + opus_val32 t; + celt_norm *xptr = X; + int X0, X1; + + (void)arch; + + asm volatile("mult $ac1, $0, $0"); + asm volatile("MTLO %0, $ac1" : :"r" (E)); + /*if(N %4) + printf("error");*/ + for (i=0;i>1; +#endif + t = VSHR32(E, 2*(k-7)); + g = MULT16_16_P15(celt_rsqrt_norm(t),gain); + + xptr = X; + for (i=0;i= Fs) + break; + + /* Find where the linear part ends (i.e. where the spacing is more than min_width */ + for (lin=0;lin= res) + break; + + low = (bark_freq[lin]+res/2)/res; + high = nBark-lin; + *nbEBands = low+high; + eBands = opus_alloc(sizeof(opus_int16)*(*nbEBands+2)); + + if (eBands==NULL) + return NULL; + + /* Linear spacing (min_width) */ + for (i=0;i0) + offset = eBands[low-1]*res - bark_freq[lin-1]; + /* Spacing follows critical bands */ + for (i=0;i frame_size) + eBands[*nbEBands] = frame_size; + for (i=1;i<*nbEBands-1;i++) + { + if (eBands[i+1]-eBands[i] < eBands[i]-eBands[i-1]) + { + eBands[i] -= (2*eBands[i]-eBands[i-1]-eBands[i+1])/2; + } + } + /* Remove any empty bands. */ + for (i=j=0;i<*nbEBands;i++) + if(eBands[i+1]>eBands[j]) + eBands[++j]=eBands[i+1]; + *nbEBands=j; + + for (i=1;i<*nbEBands;i++) + { + /* Every band must be smaller than the last band. */ + celt_assert(eBands[i]-eBands[i-1]<=eBands[*nbEBands]-eBands[*nbEBands-1]); + /* Each band must be no larger than twice the size of the previous one. 
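(Stepping back: compute_ebands() above spaces bands linearly, min_width bins apart, while the Bark critical bands are still narrower than one bin of resolution res ≈ Fs/(2·shortMdctSize) Hz, roughly 200 Hz per bin for the 48 kHz, 120-sample short-MDCT case; beyond that it follows the critical-band edges, then smooths band sizes and drops empty bands. These asserts pin down the monotonicity invariants the bit allocator depends on.)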
*/ + celt_assert(eBands[i+1]-eBands[i]<=2*(eBands[i]-eBands[i-1])); + } + + return eBands; +} + +static void compute_allocation_table(CELTMode *mode) +{ + int i, j; + unsigned char *allocVectors; + int maxBands = sizeof(eband5ms)/sizeof(eband5ms[0])-1; + + mode->nbAllocVectors = BITALLOC_SIZE; + allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands)); + if (allocVectors==NULL) + return; + + /* Check for standard mode */ + if (mode->Fs == 400*(opus_int32)mode->shortMdctSize) + { + for (i=0;inbEBands;i++) + allocVectors[i] = band_allocation[i]; + mode->allocVectors = allocVectors; + return; + } + /* If not the standard mode, interpolate */ + /* Compute per-codec-band allocation from per-critical-band matrix */ + for (i=0;inbEBands;j++) + { + int k; + for (k=0;k mode->eBands[j]*(opus_int32)mode->Fs/mode->shortMdctSize) + break; + } + if (k>maxBands-1) + allocVectors[i*mode->nbEBands+j] = band_allocation[i*maxBands + maxBands-1]; + else { + opus_int32 a0, a1; + a1 = mode->eBands[j]*(opus_int32)mode->Fs/mode->shortMdctSize - 400*(opus_int32)eband5ms[k-1]; + a0 = 400*(opus_int32)eband5ms[k] - mode->eBands[j]*(opus_int32)mode->Fs/mode->shortMdctSize; + allocVectors[i*mode->nbEBands+j] = (a0*band_allocation[i*maxBands+k-1] + + a1*band_allocation[i*maxBands+k])/(a0+a1); + } + } + } + + /*printf ("\n"); + for (i=0;inbEBands;j++) + printf ("%d ", allocVectors[i*mode->nbEBands+j]); + printf ("\n"); + } + exit(0);*/ + + mode->allocVectors = allocVectors; +} + +#endif /* CUSTOM_MODES */ + +CELTMode *opus_custom_mode_create(opus_int32 Fs, int frame_size, int *error) +{ + int i; +#ifdef CUSTOM_MODES + CELTMode *mode=NULL; + int res; + opus_val16 *window; + opus_int16 *logN; + int LM; + int arch = opus_select_arch(); + ALLOC_STACK; +#if !defined(VAR_ARRAYS) && !defined(USE_ALLOCA) + if (global_stack==NULL) + goto failure; +#endif +#endif + +#ifndef CUSTOM_MODES_ONLY + for (i=0;iFs && + (frame_size<shortMdctSize*static_mode_list[i]->nbShortMdcts) + { + if (error) + *error = OPUS_OK; + return (CELTMode*)static_mode_list[i]; + } + } + } +#endif /* CUSTOM_MODES_ONLY */ + +#ifndef CUSTOM_MODES + if (error) + *error = OPUS_BAD_ARG; + return NULL; +#else + + /* The good thing here is that permutation of the arguments will automatically be invalid */ + + if (Fs < 8000 || Fs > 96000) + { + if (error) + *error = OPUS_BAD_ARG; + return NULL; + } + if (frame_size < 40 || frame_size > 1024 || frame_size%2!=0) + { + if (error) + *error = OPUS_BAD_ARG; + return NULL; + } + /* Frames of less than 1ms are not supported. */ + if ((opus_int32)frame_size*1000 < Fs) + { + if (error) + *error = OPUS_BAD_ARG; + return NULL; + } + + if ((opus_int32)frame_size*75 >= Fs && (frame_size%16)==0) + { + LM = 3; + } else if ((opus_int32)frame_size*150 >= Fs && (frame_size%8)==0) + { + LM = 2; + } else if ((opus_int32)frame_size*300 >= Fs && (frame_size%4)==0) + { + LM = 1; + } else + { + LM = 0; + } + + /* Shorts longer than 3.3ms are not supported. */ + if ((opus_int32)(frame_size>>LM)*300 > Fs) + { + if (error) + *error = OPUS_BAD_ARG; + return NULL; + } + + mode = opus_alloc(sizeof(CELTMode)); + if (mode==NULL) + goto failure; + mode->Fs = Fs; + + /* Pre/de-emphasis depends on sampling rate. The "standard" pre-emphasis + is defined as A(z) = 1 - 0.85*z^-1 at 48 kHz. Other rates should + approximate that. 
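Concretely, the encoder applies the first-order high-pass y[n] = x[n] − α·x[n−1] with α = preemph[0] (exactly 0.85 at 48 kHz, where preemph[1] is zero; the lower-rate entries add a small preemph[1] term to approximate the same response), and the decoder undoes it with the matching one-pole filter y[n] = x[n] + α·y[n−1]. preemph[2] is kept as the exact reciprocal of preemph[3] so the fixed-point de-emphasis can multiply instead of divide.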
*/ + if(Fs < 12000) /* 8 kHz */ + { + mode->preemph[0] = QCONST16(0.3500061035f, 15); + mode->preemph[1] = -QCONST16(0.1799926758f, 15); + mode->preemph[2] = QCONST16(0.2719968125f, SIG_SHIFT); /* exact 1/preemph[3] */ + mode->preemph[3] = QCONST16(3.6765136719f, 13); + } else if(Fs < 24000) /* 16 kHz */ + { + mode->preemph[0] = QCONST16(0.6000061035f, 15); + mode->preemph[1] = -QCONST16(0.1799926758f, 15); + mode->preemph[2] = QCONST16(0.4424998650f, SIG_SHIFT); /* exact 1/preemph[3] */ + mode->preemph[3] = QCONST16(2.2598876953f, 13); + } else if(Fs < 40000) /* 32 kHz */ + { + mode->preemph[0] = QCONST16(0.7799987793f, 15); + mode->preemph[1] = -QCONST16(0.1000061035f, 15); + mode->preemph[2] = QCONST16(0.7499771125f, SIG_SHIFT); /* exact 1/preemph[3] */ + mode->preemph[3] = QCONST16(1.3333740234f, 13); + } else /* 48 kHz */ + { + mode->preemph[0] = QCONST16(0.8500061035f, 15); + mode->preemph[1] = QCONST16(0.0f, 15); + mode->preemph[2] = QCONST16(1.f, SIG_SHIFT); + mode->preemph[3] = QCONST16(1.f, 13); + } + + mode->maxLM = LM; + mode->nbShortMdcts = 1<shortMdctSize = frame_size/mode->nbShortMdcts; + res = (mode->Fs+mode->shortMdctSize)/(2*mode->shortMdctSize); + + mode->eBands = compute_ebands(Fs, mode->shortMdctSize, res, &mode->nbEBands); + if (mode->eBands==NULL) + goto failure; +#if !defined(SMALL_FOOTPRINT) + /* Make sure we don't allocate a band larger than our PVQ table. + 208 should be enough, but let's be paranoid. */ + if ((mode->eBands[mode->nbEBands] - mode->eBands[mode->nbEBands-1])< + 208) { + goto failure; + } +#endif + + mode->effEBands = mode->nbEBands; + while (mode->eBands[mode->effEBands] > mode->shortMdctSize) + mode->effEBands--; + + /* Overlap must be divisible by 4 */ + mode->overlap = ((mode->shortMdctSize>>2)<<2); + + compute_allocation_table(mode); + if (mode->allocVectors==NULL) + goto failure; + + window = (opus_val16*)opus_alloc(mode->overlap*sizeof(opus_val16)); + if (window==NULL) + goto failure; + +#ifndef FIXED_POINT + for (i=0;ioverlap;i++) + window[i] = Q15ONE*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap)); +#else + for (i=0;ioverlap;i++) + window[i] = MIN32(32767,floor(.5+32768.*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap)))); +#endif + mode->window = window; + + logN = (opus_int16*)opus_alloc(mode->nbEBands*sizeof(opus_int16)); + if (logN==NULL) + goto failure; + + for (i=0;inbEBands;i++) + logN[i] = log2_frac(mode->eBands[i+1]-mode->eBands[i], BITRES); + mode->logN = logN; + + compute_pulse_cache(mode, mode->maxLM); + + if (clt_mdct_init(&mode->mdct, 2*mode->shortMdctSize*mode->nbShortMdcts, + mode->maxLM, arch) == 0) + goto failure; + + if (error) + *error = OPUS_OK; + + return mode; +failure: + if (error) + *error = OPUS_ALLOC_FAIL; + if (mode!=NULL) + opus_custom_mode_destroy(mode); + return NULL; +#endif /* !CUSTOM_MODES */ +} + +#ifdef CUSTOM_MODES +void opus_custom_mode_destroy(CELTMode *mode) +{ + int arch = opus_select_arch(); + + if (mode == NULL) + return; +#ifndef CUSTOM_MODES_ONLY + { + int i; + for (i=0;ieBands); + opus_free((opus_int16*)mode->allocVectors); + + opus_free((opus_val16*)mode->window); + opus_free((opus_int16*)mode->logN); + + opus_free((opus_int16*)mode->cache.index); + opus_free((unsigned char*)mode->cache.bits); + opus_free((unsigned char*)mode->cache.caps); + clt_mdct_clear(&mode->mdct, arch); + + opus_free((CELTMode *)mode); +} +#endif diff --git a/thirdparty/opus/celt/modes.h b/thirdparty/opus/celt/modes.h new file mode 100644 index 
000000000000..be813ccc8b4d --- /dev/null +++ b/thirdparty/opus/celt/modes.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2008 Gregory Maxwell + Written by Jean-Marc Valin and Gregory Maxwell */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef MODES_H +#define MODES_H + +#include "opus_types.h" +#include "celt.h" +#include "arch.h" +#include "mdct.h" +#include "entenc.h" +#include "entdec.h" + +#define MAX_PERIOD 1024 + +typedef struct { + int size; + const opus_int16 *index; + const unsigned char *bits; + const unsigned char *caps; +} PulseCache; + +/** Mode definition (opaque) + @brief Mode definition + */ +struct OpusCustomMode { + opus_int32 Fs; + int overlap; + + int nbEBands; + int effEBands; + opus_val16 preemph[4]; + const opus_int16 *eBands; /**< Definition for each "pseudo-critical band" */ + + int maxLM; + int nbShortMdcts; + int shortMdctSize; + + int nbAllocVectors; /**< Number of lines in the matrix below */ + const unsigned char *allocVectors; /**< Number of bits in each band for several rates */ + const opus_int16 *logN; + + const opus_val16 *window; + mdct_lookup mdct; + PulseCache cache; +}; + + +#endif diff --git a/thirdparty/opus/celt/opus_custom_demo.c b/thirdparty/opus/celt/opus_custom_demo.c new file mode 100644 index 000000000000..ae41c0de5a28 --- /dev/null +++ b/thirdparty/opus/celt/opus_custom_demo.c @@ -0,0 +1,210 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
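The demo below drives the full opus_custom round trip from the command line; its core call sequence boils down to the following sketch (fixed parameters, asserts in place of the demo's error handling, and assuming the vendored opus_custom.h is on the include path):

#include <assert.h>
#include "opus_custom.h"

// Minimal encode/decode round trip: a custom mode binds (Fs, frame_size),
// and the encoder and decoder must be created from the same mode.
int custom_roundtrip(void)
{
    int err = 0;
    short pcm[960 * 2] = {0};     // one 20 ms stereo frame at 48 kHz
    short out[960 * 2];
    unsigned char packet[1275];   // MAX_PACKET, as in the demo

    OpusCustomMode *mode = opus_custom_mode_create(48000, 960, &err);
    OpusCustomEncoder *enc = opus_custom_encoder_create(mode, 2, &err);
    OpusCustomDecoder *dec = opus_custom_decoder_create(mode, 2, &err);
    assert(mode && enc && dec && err == 0);

    int len = opus_custom_encode(enc, pcm, 960, packet, (int)sizeof(packet));
    assert(len > 0);                                   // bytes written
    int ret = opus_custom_decode(dec, packet, len, out, 960);
    assert(ret == 960);                                // samples decoded per channel

    opus_custom_encoder_destroy(enc);
    opus_custom_decoder_destroy(dec);
    opus_custom_mode_destroy(mode);
    return 0;
}

For frame sizes outside the standard set, the library must be built with CUSTOM_MODES, as the opus_custom_mode_create() path in modes.c above shows.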
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opus_custom.h" +#include "arch.h" +#include +#include +#include +#include + +#define MAX_PACKET 1275 + +int main(int argc, char *argv[]) +{ + int err; + char *inFile, *outFile; + FILE *fin, *fout; + OpusCustomMode *mode=NULL; + OpusCustomEncoder *enc; + OpusCustomDecoder *dec; + int len; + opus_int32 frame_size, channels, rate; + int bytes_per_packet; + unsigned char data[MAX_PACKET]; + int complexity; +#if !(defined (FIXED_POINT) && !defined(CUSTOM_MODES)) && defined(RESYNTH) + int i; + double rmsd = 0; +#endif + int count = 0; + opus_int32 skip; + opus_int16 *in, *out; + if (argc != 9 && argc != 8 && argc != 7) + { + fprintf (stderr, "Usage: test_opus_custom " + " [ [packet loss rate]] " + " \n"); + return 1; + } + + rate = (opus_int32)atol(argv[1]); + channels = atoi(argv[2]); + frame_size = atoi(argv[3]); + mode = opus_custom_mode_create(rate, frame_size, NULL); + if (mode == NULL) + { + fprintf(stderr, "failed to create a mode\n"); + return 1; + } + + bytes_per_packet = atoi(argv[4]); + if (bytes_per_packet < 0 || bytes_per_packet > MAX_PACKET) + { + fprintf (stderr, "bytes per packet must be between 0 and %d\n", + MAX_PACKET); + return 1; + } + + inFile = argv[argc-2]; + fin = fopen(inFile, "rb"); + if (!fin) + { + fprintf (stderr, "Could not open input file %s\n", argv[argc-2]); + return 1; + } + outFile = argv[argc-1]; + fout = fopen(outFile, "wb+"); + if (!fout) + { + fprintf (stderr, "Could not open output file %s\n", argv[argc-1]); + fclose(fin); + return 1; + } + + enc = opus_custom_encoder_create(mode, channels, &err); + if (err != 0) + { + fprintf(stderr, "Failed to create the encoder: %s\n", opus_strerror(err)); + fclose(fin); + fclose(fout); + return 1; + } + dec = opus_custom_decoder_create(mode, channels, &err); + if (err != 0) + { + fprintf(stderr, "Failed to create the decoder: %s\n", opus_strerror(err)); + fclose(fin); + fclose(fout); + return 1; + } + opus_custom_decoder_ctl(dec, OPUS_GET_LOOKAHEAD(&skip)); + + if (argc>7) + { + complexity=atoi(argv[5]); + opus_custom_encoder_ctl(enc,OPUS_SET_COMPLEXITY(complexity)); + } + + in = (opus_int16*)malloc(frame_size*channels*sizeof(opus_int16)); + out = (opus_int16*)malloc(frame_size*channels*sizeof(opus_int16)); + + while (!feof(fin)) + { + int ret; + err = fread(in, sizeof(short), frame_size*channels, fin); + if (feof(fin)) + break; + len = opus_custom_encode(enc, in, frame_size, data, bytes_per_packet); + if (len <= 0) + fprintf (stderr, "opus_custom_encode() failed: %s\n", opus_strerror(len)); + + /* This is for simulating bit errors */ +#if 0 + int errors = 0; + int eid = 0; + /* This simulates random bit error */ + for (i=0;i 0) + { + rmsd = sqrt(rmsd/(1.0*frame_size*channels*count)); + fprintf (stderr, "Error: encoder doesn't match decoder\n"); + fprintf (stderr, "RMS mismatch is %f\n", rmsd); + return 1; + } else { + fprintf (stderr, "Encoder matches decoder!!\n"); + } +#endif + return 0; +} + diff --git 
a/thirdparty/opus/celt/os_support.h b/thirdparty/opus/celt/os_support.h new file mode 100644 index 000000000000..a2171971e9d6 --- /dev/null +++ b/thirdparty/opus/celt/os_support.h @@ -0,0 +1,92 @@ +/* Copyright (C) 2007 Jean-Marc Valin + + File: os_support.h + This is the (tiny) OS abstraction layer. Aside from math.h, this is the + only place where system headers are allowed. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef OS_SUPPORT_H +#define OS_SUPPORT_H + +#ifdef CUSTOM_SUPPORT +# include "custom_support.h" +#endif + +#include "opus_types.h" +#include "opus_defines.h" + +#include +#include +#include + +/** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */ +#ifndef OVERRIDE_OPUS_ALLOC +static OPUS_INLINE void *opus_alloc (size_t size) +{ + return malloc(size); +} +#endif + +/** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */ +#ifndef OVERRIDE_OPUS_ALLOC_SCRATCH +static OPUS_INLINE void *opus_alloc_scratch (size_t size) +{ + /* Scratch space doesn't need to be cleared */ + return opus_alloc(size); +} +#endif + +/** Opus wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */ +#ifndef OVERRIDE_OPUS_FREE +static OPUS_INLINE void opus_free (void *ptr) +{ + free(ptr); +} +#endif + +/** Copy n elements from src to dst. The 0* term provides compile-time type checking */ +#ifndef OVERRIDE_OPUS_COPY +#define OPUS_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) )) +#endif + +/** Copy n elements from src to dst, allowing overlapping regions. 
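As the remainder of this comment notes, the 0*((dst)-(src)) term costs nothing at run time but only compiles when dst and src point to compatible types, since pointer subtraction between unrelated pointer types is ill-formed. A hypothetical illustration:

float a[8], b[8];
int   c[8];
OPUS_COPY(a, b, 8);   // fine: float* minus float* is valid, and the 0*(...) vanishes
OPUS_COPY(a, c, 8);   // compile error: invalid operands to binary '-' (float* and int*)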
The 0* term + provides compile-time type checking */ +#ifndef OVERRIDE_OPUS_MOVE +#define OPUS_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) )) +#endif + +/** Set n elements of dst to zero */ +#ifndef OVERRIDE_OPUS_CLEAR +#define OPUS_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst)))) +#endif + +/*#ifdef __GNUC__ +#pragma GCC poison printf sprintf +#pragma GCC poison malloc free realloc calloc +#endif*/ + +#endif /* OS_SUPPORT_H */ + diff --git a/thirdparty/opus/celt/pitch.c b/thirdparty/opus/celt/pitch.c new file mode 100644 index 000000000000..bf46e7d562b3 --- /dev/null +++ b/thirdparty/opus/celt/pitch.c @@ -0,0 +1,557 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/** + @file pitch.c + @brief Pitch analysis + */ + +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
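In outline, the pitch estimator in this file proceeds in three passes: pitch_downsample() halves the rate (mixing stereo down to mono) and LPC-whitens the result; pitch_search() correlates at 4x decimation for a coarse lag, then re-scores around the two best coarse candidates at 2x; finally the winning lag is refined by pseudo-interpolation, nudged by ±1 when the correlations (a, b, c) around the peak satisfy c − a > 0.7(b − a) or a − c > 0.7(b − c).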
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pitch.h" +#include "os_support.h" +#include "modes.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "celt_lpc.h" + +static void find_best_pitch(opus_val32 *xcorr, opus_val16 *y, int len, + int max_pitch, int *best_pitch +#ifdef FIXED_POINT + , int yshift, opus_val32 maxcorr +#endif + ) +{ + int i, j; + opus_val32 Syy=1; + opus_val16 best_num[2]; + opus_val32 best_den[2]; +#ifdef FIXED_POINT + int xshift; + + xshift = celt_ilog2(maxcorr)-14; +#endif + + best_num[0] = -1; + best_num[1] = -1; + best_den[0] = 0; + best_den[1] = 0; + best_pitch[0] = 0; + best_pitch[1] = 1; + for (j=0;j0) + { + opus_val16 num; + opus_val32 xcorr16; + xcorr16 = EXTRACT16(VSHR32(xcorr[i], xshift)); +#ifndef FIXED_POINT + /* Considering the range of xcorr16, this should avoid both underflows + and overflows (inf) when squaring xcorr16 */ + xcorr16 *= 1e-12f; +#endif + num = MULT16_16_Q15(xcorr16,xcorr16); + if (MULT16_32_Q15(num,best_den[1]) > MULT16_32_Q15(best_num[1],Syy)) + { + if (MULT16_32_Q15(num,best_den[0]) > MULT16_32_Q15(best_num[0],Syy)) + { + best_num[1] = best_num[0]; + best_den[1] = best_den[0]; + best_pitch[1] = best_pitch[0]; + best_num[0] = num; + best_den[0] = Syy; + best_pitch[0] = i; + } else { + best_num[1] = num; + best_den[1] = Syy; + best_pitch[1] = i; + } + } + } + Syy += SHR32(MULT16_16(y[i+len],y[i+len]),yshift) - SHR32(MULT16_16(y[i],y[i]),yshift); + Syy = MAX32(1, Syy); + } +} + +static void celt_fir5(const opus_val16 *x, + const opus_val16 *num, + opus_val16 *y, + int N, + opus_val16 *mem) +{ + int i; + opus_val16 num0, num1, num2, num3, num4; + opus_val32 mem0, mem1, mem2, mem3, mem4; + num0=num[0]; + num1=num[1]; + num2=num[2]; + num3=num[3]; + num4=num[4]; + mem0=mem[0]; + mem1=mem[1]; + mem2=mem[2]; + mem3=mem[3]; + mem4=mem[4]; + for (i=0;i>1;i++) + x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift); + x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift); + if (C==2) + { + for (i=1;i>1;i++) + x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift); + x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift); + } + + _celt_autocorr(x_lp, ac, NULL, 0, + 4, len>>1, arch); + + /* Noise floor -40 dB */ +#ifdef FIXED_POINT + ac[0] += SHR32(ac[0],13); +#else + ac[0] *= 1.0001f; +#endif + /* Lag windowing */ + for (i=1;i<=4;i++) + { + /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/ +#ifdef FIXED_POINT + ac[i] -= MULT16_32_Q15(2*i*i, ac[i]); +#else + ac[i] -= ac[i]*(.008f*i)*(.008f*i); +#endif + } + + _celt_lpc(lpc, ac, 4); + for (i=0;i<4;i++) + { + tmp = MULT16_16_Q15(QCONST16(.9f,15), tmp); + lpc[i] = MULT16_16_Q15(lpc[i], tmp); + } + /* Add a zero */ + lpc2[0] = lpc[0] + QCONST16(.8f,SIG_SHIFT); + lpc2[1] = lpc[1] + MULT16_16_Q15(c1,lpc[0]); + lpc2[2] = lpc[2] + MULT16_16_Q15(c1,lpc[1]); + lpc2[3] = lpc[3] + MULT16_16_Q15(c1,lpc[2]); + lpc2[4] = MULT16_16_Q15(c1,lpc[3]); + celt_fir5(x_lp, lpc2, x_lp, len>>1, mem); +} + +/* Pure C implementation. 
*/ +#ifdef FIXED_POINT +opus_val32 +#else +void +#endif +#if defined(OVERRIDE_PITCH_XCORR) +celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch) +#else +celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch, int arch) +#endif +{ + +#if 0 /* This is a simple version of the pitch correlation that should work + well on DSPs like Blackfin and TI C5x/C6x */ + int i, j; +#ifdef FIXED_POINT + opus_val32 maxcorr=1; +#endif +#if !defined(OVERRIDE_PITCH_XCORR) + (void)arch; +#endif + for (i=0;i0); + celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); + for (i=0;i0); + celt_assert(max_pitch>0); + lag = len+max_pitch; + + ALLOC(x_lp4, len>>2, opus_val16); + ALLOC(y_lp4, lag>>2, opus_val16); + ALLOC(xcorr, max_pitch>>1, opus_val32); + + /* Downsample by 2 again */ + for (j=0;j>2;j++) + x_lp4[j] = x_lp[2*j]; + for (j=0;j>2;j++) + y_lp4[j] = y[2*j]; + +#ifdef FIXED_POINT + xmax = celt_maxabs16(x_lp4, len>>2); + ymax = celt_maxabs16(y_lp4, lag>>2); + shift = celt_ilog2(MAX32(1, MAX32(xmax, ymax)))-11; + if (shift>0) + { + for (j=0;j>2;j++) + x_lp4[j] = SHR16(x_lp4[j], shift); + for (j=0;j>2;j++) + y_lp4[j] = SHR16(y_lp4[j], shift); + /* Use double the shift for a MAC */ + shift *= 2; + } else { + shift = 0; + } +#endif + + /* Coarse search with 4x decimation */ + +#ifdef FIXED_POINT + maxcorr = +#endif + celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2, arch); + + find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch +#ifdef FIXED_POINT + , 0, maxcorr +#endif + ); + + /* Finer search with 2x decimation */ +#ifdef FIXED_POINT + maxcorr=1; +#endif + for (i=0;i>1;i++) + { + opus_val32 sum; + xcorr[i] = 0; + if (abs(i-2*best_pitch[0])>2 && abs(i-2*best_pitch[1])>2) + continue; +#ifdef FIXED_POINT + sum = 0; + for (j=0;j>1;j++) + sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift); +#else + sum = celt_inner_prod_c(x_lp, y+i, len>>1); +#endif + xcorr[i] = MAX32(-1, sum); +#ifdef FIXED_POINT + maxcorr = MAX32(maxcorr, sum); +#endif + } + find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch +#ifdef FIXED_POINT + , shift+1, maxcorr +#endif + ); + + /* Refine by pseudo-interpolation */ + if (best_pitch[0]>0 && best_pitch[0]<(max_pitch>>1)-1) + { + opus_val32 a, b, c; + a = xcorr[best_pitch[0]-1]; + b = xcorr[best_pitch[0]]; + c = xcorr[best_pitch[0]+1]; + if ((c-a) > MULT16_32_Q15(QCONST16(.7f,15),b-a)) + offset = 1; + else if ((a-c) > MULT16_32_Q15(QCONST16(.7f,15),b-c)) + offset = -1; + else + offset = 0; + } else { + offset = 0; + } + *pitch = 2*best_pitch[0]-offset; + + RESTORE_STACK; +} + +#ifdef FIXED_POINT +static opus_val16 compute_pitch_gain(opus_val32 xy, opus_val32 xx, opus_val32 yy) +{ + opus_val32 x2y2; + int sx, sy, shift; + opus_val32 g; + opus_val16 den; + if (xy == 0 || xx == 0 || yy == 0) + return 0; + sx = celt_ilog2(xx)-14; + sy = celt_ilog2(yy)-14; + shift = sx + sy; + x2y2 = MULT16_16_Q14(VSHR32(xx, sx), VSHR32(yy, sy)); + if (shift & 1) { + if (x2y2 < 32768) + { + x2y2 <<= 1; + shift--; + } else { + x2y2 >>= 1; + shift++; + } + } + den = celt_rsqrt_norm(x2y2); + g = MULT16_32_Q15(den, xy); + g = VSHR32(g, (shift>>1)-1); + return EXTRACT16(MIN32(g, Q15ONE)); +} +#else +static opus_val16 compute_pitch_gain(opus_val32 xy, opus_val32 xx, opus_val32 yy) +{ + return xy/celt_sqrt(1+xx*yy); +} +#endif + +static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2}; +opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, + 
int N, int *T0_, int prev_period, opus_val16 prev_gain, int arch) +{ + int k, i, T, T0; + opus_val16 g, g0; + opus_val16 pg; + opus_val32 xy,xx,yy,xy2; + opus_val32 xcorr[3]; + opus_val32 best_xy, best_yy; + int offset; + int minperiod0; + VARDECL(opus_val32, yy_lookup); + SAVE_STACK; + + minperiod0 = minperiod; + maxperiod /= 2; + minperiod /= 2; + *T0_ /= 2; + prev_period /= 2; + N /= 2; + x += maxperiod; + if (*T0_>=maxperiod) + *T0_=maxperiod-1; + + T = T0 = *T0_; + ALLOC(yy_lookup, maxperiod+1, opus_val32); + dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch); + yy_lookup[0] = xx; + yy=xx; + for (i=1;i<=maxperiod;i++) + { + yy = yy+MULT16_16(x[-i],x[-i])-MULT16_16(x[N-i],x[N-i]); + yy_lookup[i] = MAX32(0, yy); + } + yy = yy_lookup[T0]; + best_xy = xy; + best_yy = yy; + g = g0 = compute_pitch_gain(xy, xx, yy); + /* Look for any pitch at T/k */ + for (k=2;k<=15;k++) + { + int T1, T1b; + opus_val16 g1; + opus_val16 cont=0; + opus_val16 thresh; + T1 = celt_udiv(2*T0+k, 2*k); + if (T1 < minperiod) + break; + /* Look for another strong correlation at T1b */ + if (k==2) + { + if (T1+T0>maxperiod) + T1b = T0; + else + T1b = T0+T1; + } else + { + T1b = celt_udiv(2*second_check[k]*T0+k, 2*k); + } + dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2, arch); + xy = HALF32(xy + xy2); + yy = HALF32(yy_lookup[T1] + yy_lookup[T1b]); + g1 = compute_pitch_gain(xy, xx, yy); + if (abs(T1-prev_period)<=1) + cont = prev_gain; + else if (abs(T1-prev_period)<=2 && 5*k*k < T0) + cont = HALF16(prev_gain); + else + cont = 0; + thresh = MAX16(QCONST16(.3f,15), MULT16_16_Q15(QCONST16(.7f,15),g0)-cont); + /* Bias against very high pitch (very short period) to avoid false-positives + due to short-term correlation */ + if (T1<3*minperiod) + thresh = MAX16(QCONST16(.4f,15), MULT16_16_Q15(QCONST16(.85f,15),g0)-cont); + else if (T1<2*minperiod) + thresh = MAX16(QCONST16(.5f,15), MULT16_16_Q15(QCONST16(.9f,15),g0)-cont); + if (g1 > thresh) + { + best_xy = xy; + best_yy = yy; + T = T1; + g = g1; + } + } + best_xy = MAX32(0, best_xy); + if (best_yy <= best_xy) + pg = Q15ONE; + else + pg = SHR32(frac_div32(best_xy,best_yy+1),16); + + for (k=0;k<3;k++) + xcorr[k] = celt_inner_prod(x, x-(T+k-1), N, arch); + if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0])) + offset = 1; + else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2])) + offset = -1; + else + offset = 0; + if (pg > g) + pg = g; + *T0_ = 2*T+offset; + + if (*T0_=3); + y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ + y_0=*y++; + y_1=*y++; + y_2=*y++; + for (j=0;j +#include "os_support.h" +#include "arch.h" +#include "mathops.h" +#include "stack_alloc.h" +#include "rate.h" + +#ifdef FIXED_POINT +/* Mean energy in each band quantized in Q4 */ +const signed char eMeans[25] = { + 103,100, 92, 85, 81, + 77, 72, 70, 78, 75, + 73, 71, 78, 74, 69, + 72, 70, 74, 76, 71, + 60, 60, 60, 60, 60 +}; +#else +/* Mean energy in each band quantized in Q4 and converted back to float */ +const opus_val16 eMeans[25] = { + 6.437500f, 6.250000f, 5.750000f, 5.312500f, 5.062500f, + 4.812500f, 4.500000f, 4.375000f, 4.875000f, 4.687500f, + 4.562500f, 4.437500f, 4.875000f, 4.625000f, 4.312500f, + 4.500000f, 4.375000f, 4.625000f, 4.750000f, 4.437500f, + 3.750000f, 3.750000f, 3.750000f, 3.750000f, 3.750000f +}; +#endif +/* prediction coefficients: 0.9, 0.8, 0.65, 0.5 */ +#ifdef FIXED_POINT +static const opus_val16 pred_coef[4] = {29440, 26112, 21248, 16384}; +static const opus_val16 beta_coef[4] = {30147, 22282, 12124, 6554}; 
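These coefficient tables use the Q15 fixed-point convention that runs through CELT: a real value x is stored as the integer x * 2^15, so 29440 decodes to 29440 / 32768 = 0.8984375, the quantized form of the nominal 0.9 in the comment above (the float build a few lines below stores exactly these decoded values, e.g. 0.796875 for 26112). A minimal sketch of the convention follows; the helper names are illustrative only and not part of the Opus sources:

   #include <math.h>

   typedef short q15; /* same width as opus_val16 in the fixed-point build */

   /* Hypothetical helpers, not Opus API; shown only to make the mapping
      between stored integers and real values concrete. */
   static q15 q15_from_double(double x) { return (q15)floor(.5 + x*32768.0); }
   static double q15_to_double(q15 q) { return q/32768.0; }

   /* q15_to_double(29440) == 0.8984375 (the nominal 0.9 above);
      q15_to_double(16384) == 0.5 exactly. */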
+static const opus_val16 beta_intra = 4915; +#else +// static const opus_val16 pred_coef[4] = {29440/32768., 26112/32768., 21248/32768., 16384/32768.}; +// static const opus_val16 beta_coef[4] = {30147/32768., 22282/32768., 12124/32768., 6554/32768.}; +// static const opus_val16 beta_intra = 4915/32768.; +static const opus_val16 pred_coef[4] = {0.8984375f, 0.796875f, 0.6484375f, 0.5f}; +static const opus_val16 beta_coef[4] = {0.920013427734375f, 0.67999267578125f, 0.3699951171875f, 0.20001220703125f}; +static const opus_val16 beta_intra = 0.149993896484375f; +#endif + +/*Parameters of the Laplace-like probability models used for the coarse energy. + There is one pair of parameters for each frame size, prediction type + (inter/intra), and band number. + The first number of each pair is the probability of 0, and the second is the + decay rate, both in Q8 precision.*/ +static const unsigned char e_prob_model[4][2][42] = { + /*120 sample frames.*/ + { + /*Inter*/ + { + 72, 127, 65, 129, 66, 128, 65, 128, 64, 128, 62, 128, 64, 128, + 64, 128, 92, 78, 92, 79, 92, 78, 90, 79, 116, 41, 115, 40, + 114, 40, 132, 26, 132, 26, 145, 17, 161, 12, 176, 10, 177, 11 + }, + /*Intra*/ + { + 24, 179, 48, 138, 54, 135, 54, 132, 53, 134, 56, 133, 55, 132, + 55, 132, 61, 114, 70, 96, 74, 88, 75, 88, 87, 74, 89, 66, + 91, 67, 100, 59, 108, 50, 120, 40, 122, 37, 97, 43, 78, 50 + } + }, + /*240 sample frames.*/ + { + /*Inter*/ + { + 83, 78, 84, 81, 88, 75, 86, 74, 87, 71, 90, 73, 93, 74, + 93, 74, 109, 40, 114, 36, 117, 34, 117, 34, 143, 17, 145, 18, + 146, 19, 162, 12, 165, 10, 178, 7, 189, 6, 190, 8, 177, 9 + }, + /*Intra*/ + { + 23, 178, 54, 115, 63, 102, 66, 98, 69, 99, 74, 89, 71, 91, + 73, 91, 78, 89, 86, 80, 92, 66, 93, 64, 102, 59, 103, 60, + 104, 60, 117, 52, 123, 44, 138, 35, 133, 31, 97, 38, 77, 45 + } + }, + /*480 sample frames.*/ + { + /*Inter*/ + { + 61, 90, 93, 60, 105, 42, 107, 41, 110, 45, 116, 38, 113, 38, + 112, 38, 124, 26, 132, 27, 136, 19, 140, 20, 155, 14, 159, 16, + 158, 18, 170, 13, 177, 10, 187, 8, 192, 6, 175, 9, 159, 10 + }, + /*Intra*/ + { + 21, 178, 59, 110, 71, 86, 75, 85, 84, 83, 91, 66, 88, 73, + 87, 72, 92, 75, 98, 72, 105, 58, 107, 54, 115, 52, 114, 55, + 112, 56, 129, 51, 132, 40, 150, 33, 140, 29, 98, 35, 77, 42 + } + }, + /*960 sample frames.*/ + { + /*Inter*/ + { + 42, 121, 96, 66, 108, 43, 111, 40, 117, 44, 123, 32, 120, 36, + 119, 33, 127, 33, 134, 34, 139, 21, 147, 23, 152, 20, 158, 25, + 154, 26, 166, 21, 173, 16, 184, 13, 184, 10, 150, 13, 139, 15 + }, + /*Intra*/ + { + 22, 178, 63, 114, 74, 82, 84, 83, 92, 82, 103, 62, 96, 72, + 96, 67, 101, 73, 107, 72, 113, 55, 118, 52, 125, 52, 118, 52, + 117, 55, 135, 49, 137, 39, 157, 32, 145, 29, 97, 33, 77, 40 + } + } +}; + +static const unsigned char small_energy_icdf[3]={2,1,0}; + +static opus_val32 loss_distortion(const opus_val16 *eBands, opus_val16 *oldEBands, int start, int end, int len, int C) +{ + int c, i; + opus_val32 dist = 0; + c=0; do { + for (i=start;inbEBands]; + oldE = MAX16(-QCONST16(9.f,DB_SHIFT), oldEBands[i+c*m->nbEBands]); +#ifdef FIXED_POINT + f = SHL32(EXTEND32(x),7) - PSHR32(MULT16_16(coef,oldE), 8) - prev[c]; + /* Rounding to nearest integer here is really important! */ + qi = (f+QCONST32(.5f,DB_SHIFT+7))>>(DB_SHIFT+7); + decay_bound = EXTRACT16(MAX32(-QCONST16(28.f,DB_SHIFT), + SUB32((opus_val32)oldEBands[i+c*m->nbEBands],max_decay))); +#else + f = x-coef*oldE-prev[c]; + /* Rounding to nearest integer here is really important! 
*/ + qi = (int)floor(.5f+f); + decay_bound = MAX16(-QCONST16(28.f,DB_SHIFT), oldEBands[i+c*m->nbEBands]) - max_decay; +#endif + /* Prevent the energy from going down too quickly (e.g. for bands + that have just one bin) */ + if (qi < 0 && x < decay_bound) + { + qi += (int)SHR16(SUB16(decay_bound,x), DB_SHIFT); + if (qi > 0) + qi = 0; + } + qi0 = qi; + /* If we don't have enough bits to encode all the energy, just assume + something safe. */ + tell = ec_tell(enc); + bits_left = budget-tell-3*C*(end-i); + if (i!=start && bits_left < 30) + { + if (bits_left < 24) + qi = IMIN(1, qi); + if (bits_left < 16) + qi = IMAX(-1, qi); + } + if (lfe && i>=2) + qi = IMIN(qi, 0); + if (budget-tell >= 15) + { + int pi; + pi = 2*IMIN(i,20); + ec_laplace_encode(enc, &qi, + prob_model[pi]<<7, prob_model[pi+1]<<6); + } + else if(budget-tell >= 2) + { + qi = IMAX(-1, IMIN(qi, 1)); + ec_enc_icdf(enc, 2*qi^-(qi<0), small_energy_icdf, 2); + } + else if(budget-tell >= 1) + { + qi = IMIN(0, qi); + ec_enc_bit_logp(enc, -qi, 1); + } + else + qi = -1; + error[i+c*m->nbEBands] = PSHR32(f,7) - SHL16(qi,DB_SHIFT); + badness += abs(qi0-qi); + q = (opus_val32)SHL32(EXTEND32(qi),DB_SHIFT); + + tmp = PSHR32(MULT16_16(coef,oldE),8) + prev[c] + SHL32(q,7); +#ifdef FIXED_POINT + tmp = MAX32(-QCONST32(28.f, DB_SHIFT+7), tmp); +#endif + oldEBands[i+c*m->nbEBands] = PSHR32(tmp, 7); + prev[c] = prev[c] + SHL32(q,7) - MULT16_16(beta,PSHR32(q,8)); + } while (++c < C); + } + return lfe ? 0 : badness; +} + +void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd, + const opus_val16 *eBands, opus_val16 *oldEBands, opus_uint32 budget, + opus_val16 *error, ec_enc *enc, int C, int LM, int nbAvailableBytes, + int force_intra, opus_val32 *delayedIntra, int two_pass, int loss_rate, int lfe) +{ + int intra; + opus_val16 max_decay; + VARDECL(opus_val16, oldEBands_intra); + VARDECL(opus_val16, error_intra); + ec_enc enc_start_state; + opus_uint32 tell; + int badness1=0; + opus_int32 intra_bias; + opus_val32 new_distortion; + SAVE_STACK; + + intra = force_intra || (!two_pass && *delayedIntra>2*C*(end-start) && nbAvailableBytes > (end-start)*C); + intra_bias = (opus_int32)((budget**delayedIntra*loss_rate)/(C*512)); + new_distortion = loss_distortion(eBands, oldEBands, start, effEnd, m->nbEBands, C); + + tell = ec_tell(enc); + if (tell+3 > budget) + two_pass = intra = 0; + + max_decay = QCONST16(16.f,DB_SHIFT); + if (end-start>10) + { +#ifdef FIXED_POINT + max_decay = MIN32(max_decay, SHL32(EXTEND32(nbAvailableBytes),DB_SHIFT-3)); +#else + max_decay = MIN32(max_decay, .125f*nbAvailableBytes); +#endif + } + if (lfe) + max_decay = QCONST16(3.f,DB_SHIFT); + enc_start_state = *enc; + + ALLOC(oldEBands_intra, C*m->nbEBands, opus_val16); + ALLOC(error_intra, C*m->nbEBands, opus_val16); + OPUS_COPY(oldEBands_intra, oldEBands, C*m->nbEBands); + + if (two_pass || intra) + { + badness1 = quant_coarse_energy_impl(m, start, end, eBands, oldEBands_intra, budget, + tell, e_prob_model[LM][1], error_intra, enc, C, LM, 1, max_decay, lfe); + } + + if (!intra) + { + unsigned char *intra_buf; + ec_enc enc_intra_state; + opus_int32 tell_intra; + opus_uint32 nstart_bytes; + opus_uint32 nintra_bytes; + opus_uint32 save_bytes; + int badness2; + VARDECL(unsigned char, intra_bits); + + tell_intra = ec_tell_frac(enc); + + enc_intra_state = *enc; + + nstart_bytes = ec_range_bytes(&enc_start_state); + nintra_bytes = ec_range_bytes(&enc_intra_state); + intra_buf = ec_get_buffer(&enc_intra_state) + nstart_bytes; + save_bytes = nintra_bytes-nstart_bytes; + if 
(save_bytes == 0) + save_bytes = ALLOC_NONE; + ALLOC(intra_bits, save_bytes, unsigned char); + /* Copy bits from intra bit-stream */ + OPUS_COPY(intra_bits, intra_buf, nintra_bytes - nstart_bytes); + + *enc = enc_start_state; + + badness2 = quant_coarse_energy_impl(m, start, end, eBands, oldEBands, budget, + tell, e_prob_model[LM][intra], error, enc, C, LM, 0, max_decay, lfe); + + if (two_pass && (badness1 < badness2 || (badness1 == badness2 && ((opus_int32)ec_tell_frac(enc))+intra_bias > tell_intra))) + { + *enc = enc_intra_state; + /* Copy intra bits to bit-stream */ + OPUS_COPY(intra_buf, intra_bits, nintra_bytes - nstart_bytes); + OPUS_COPY(oldEBands, oldEBands_intra, C*m->nbEBands); + OPUS_COPY(error, error_intra, C*m->nbEBands); + intra = 1; + } + } else { + OPUS_COPY(oldEBands, oldEBands_intra, C*m->nbEBands); + OPUS_COPY(error, error_intra, C*m->nbEBands); + } + + if (intra) + *delayedIntra = new_distortion; + else + *delayedIntra = ADD32(MULT16_32_Q15(MULT16_16_Q15(pred_coef[LM], pred_coef[LM]),*delayedIntra), + new_distortion); + + RESTORE_STACK; +} + +void quant_fine_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, opus_val16 *error, int *fine_quant, ec_enc *enc, int C) +{ + int i, c; + + /* Encode finer resolution */ + for (i=start;inbEBands]+QCONST16(.5f,DB_SHIFT))>>(DB_SHIFT-fine_quant[i]); +#else + q2 = (int)floor((error[i+c*m->nbEBands]+.5f)*frac); +#endif + if (q2 > frac-1) + q2 = frac-1; + if (q2<0) + q2 = 0; + ec_enc_bits(enc, q2, fine_quant[i]); +#ifdef FIXED_POINT + offset = SUB16(SHR32(SHL32(EXTEND32(q2),DB_SHIFT)+QCONST16(.5f,DB_SHIFT),fine_quant[i]),QCONST16(.5f,DB_SHIFT)); +#else + offset = (q2+.5f)*(1<<(14-fine_quant[i]))*(1.f/16384) - .5f; +#endif + oldEBands[i+c*m->nbEBands] += offset; + error[i+c*m->nbEBands] -= offset; + /*printf ("%f ", error[i] - offset);*/ + } while (++c < C); + } +} + +void quant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *oldEBands, opus_val16 *error, int *fine_quant, int *fine_priority, int bits_left, ec_enc *enc, int C) +{ + int i, prio, c; + + /* Use up the remaining bits */ + for (prio=0;prio<2;prio++) + { + for (i=start;i=C ;i++) + { + if (fine_quant[i] >= MAX_FINE_BITS || fine_priority[i]!=prio) + continue; + c=0; + do { + int q2; + opus_val16 offset; + q2 = error[i+c*m->nbEBands]<0 ? 
0 : 1; + ec_enc_bits(enc, q2, 1); +#ifdef FIXED_POINT + offset = SHR16(SHL16(q2,DB_SHIFT)-QCONST16(.5f,DB_SHIFT),fine_quant[i]+1); +#else + offset = (q2-.5f)*(1<<(14-fine_quant[i]-1))*(1.f/16384); +#endif + oldEBands[i+c*m->nbEBands] += offset; + bits_left--; + } while (++c < C); + } + } +} + +void unquant_coarse_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int intra, ec_dec *dec, int C, int LM) +{ + const unsigned char *prob_model = e_prob_model[LM][intra]; + int i, c; + opus_val32 prev[2] = {0, 0}; + opus_val16 coef; + opus_val16 beta; + opus_int32 budget; + opus_int32 tell; + + if (intra) + { + coef = 0; + beta = beta_intra; + } else { + beta = beta_coef[LM]; + coef = pred_coef[LM]; + } + + budget = dec->storage*8; + + /* Decode at a fixed coarse resolution */ + for (i=start;i=15) + { + int pi; + pi = 2*IMIN(i,20); + qi = ec_laplace_decode(dec, + prob_model[pi]<<7, prob_model[pi+1]<<6); + } + else if(budget-tell>=2) + { + qi = ec_dec_icdf(dec, small_energy_icdf, 2); + qi = (qi>>1)^-(qi&1); + } + else if(budget-tell>=1) + { + qi = -ec_dec_bit_logp(dec, 1); + } + else + qi = -1; + q = (opus_val32)SHL32(EXTEND32(qi),DB_SHIFT); + + oldEBands[i+c*m->nbEBands] = MAX16(-QCONST16(9.f,DB_SHIFT), oldEBands[i+c*m->nbEBands]); + tmp = PSHR32(MULT16_16(coef,oldEBands[i+c*m->nbEBands]),8) + prev[c] + SHL32(q,7); +#ifdef FIXED_POINT + tmp = MAX32(-QCONST32(28.f, DB_SHIFT+7), tmp); +#endif + oldEBands[i+c*m->nbEBands] = PSHR32(tmp, 7); + prev[c] = prev[c] + SHL32(q,7) - MULT16_16(beta,PSHR32(q,8)); + } while (++c < C); + } +} + +void unquant_fine_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int *fine_quant, ec_dec *dec, int C) +{ + int i, c; + /* Decode finer resolution */ + for (i=start;inbEBands] += offset; + } while (++c < C); + } +} + +void unquant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int *fine_quant, int *fine_priority, int bits_left, ec_dec *dec, int C) +{ + int i, prio, c; + + /* Use up the remaining bits */ + for (prio=0;prio<2;prio++) + { + for (i=start;i=C ;i++) + { + if (fine_quant[i] >= MAX_FINE_BITS || fine_priority[i]!=prio) + continue; + c=0; + do { + int q2; + opus_val16 offset; + q2 = ec_dec_bits(dec, 1); +#ifdef FIXED_POINT + offset = SHR16(SHL16(q2,DB_SHIFT)-QCONST16(.5f,DB_SHIFT),fine_quant[i]+1); +#else + offset = (q2-.5f)*(1<<(14-fine_quant[i]-1))*(1.f/16384); +#endif + oldEBands[i+c*m->nbEBands] += offset; + bits_left--; + } while (++c < C); + } + } +} + +void amp2Log2(const CELTMode *m, int effEnd, int end, + celt_ener *bandE, opus_val16 *bandLogE, int C) +{ + int c, i; + c=0; + do { + for (i=0;inbEBands] = + celt_log2(SHL32(bandE[i+c*m->nbEBands],2)) + - SHL16((opus_val16)eMeans[i],6); + for (i=effEnd;inbEBands+i] = -QCONST16(14.f,DB_SHIFT); + } while (++c < C); +} diff --git a/thirdparty/opus/celt/quant_bands.h b/thirdparty/opus/celt/quant_bands.h new file mode 100644 index 000000000000..0490bca4b404 --- /dev/null +++ b/thirdparty/opus/celt/quant_bands.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef QUANT_BANDS +#define QUANT_BANDS + +#include "arch.h" +#include "modes.h" +#include "entenc.h" +#include "entdec.h" +#include "mathops.h" + +#ifdef FIXED_POINT +extern const signed char eMeans[25]; +#else +extern const opus_val16 eMeans[25]; +#endif + +void amp2Log2(const CELTMode *m, int effEnd, int end, + celt_ener *bandE, opus_val16 *bandLogE, int C); + +void log2Amp(const CELTMode *m, int start, int end, + celt_ener *eBands, const opus_val16 *oldEBands, int C); + +void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd, + const opus_val16 *eBands, opus_val16 *oldEBands, opus_uint32 budget, + opus_val16 *error, ec_enc *enc, int C, int LM, + int nbAvailableBytes, int force_intra, opus_val32 *delayedIntra, + int two_pass, int loss_rate, int lfe); + +void quant_fine_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, opus_val16 *error, int *fine_quant, ec_enc *enc, int C); + +void quant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *oldEBands, opus_val16 *error, int *fine_quant, int *fine_priority, int bits_left, ec_enc *enc, int C); + +void unquant_coarse_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int intra, ec_dec *dec, int C, int LM); + +void unquant_fine_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int *fine_quant, ec_dec *dec, int C); + +void unquant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *oldEBands, int *fine_quant, int *fine_priority, int bits_left, ec_dec *dec, int C); + +#endif /* QUANT_BANDS */ diff --git a/thirdparty/opus/celt/rate.c b/thirdparty/opus/celt/rate.c new file mode 100644 index 000000000000..7dfa5be8a639 --- /dev/null +++ b/thirdparty/opus/celt/rate.c @@ -0,0 +1,639 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include "modes.h" +#include "cwrs.h" +#include "arch.h" +#include "os_support.h" + +#include "entcode.h" +#include "rate.h" + +static const unsigned char LOG2_FRAC_TABLE[24]={ + 0, + 8,13, + 16,19,21,23, + 24,26,27,28,29,30,31,32, + 32,33,34,34,35,36,36,37,37 +}; + +#ifdef CUSTOM_MODES + +/*Determines if V(N,K) fits in a 32-bit unsigned integer. + N and K are themselves limited to 15 bits.*/ +static int fits_in32(int _n, int _k) +{ + static const opus_int16 maxN[15] = { + 32767, 32767, 32767, 1476, 283, 109, 60, 40, + 29, 24, 20, 18, 16, 14, 13}; + static const opus_int16 maxK[15] = { + 32767, 32767, 32767, 32767, 1172, 238, 95, 53, + 36, 27, 22, 18, 16, 15, 13}; + if (_n>=14) + { + if (_k>=14) + return 0; + else + return _n <= maxN[_k]; + } else { + return _k <= maxK[_n]; + } +} + +void compute_pulse_cache(CELTMode *m, int LM) +{ + int C; + int i; + int j; + int curr=0; + int nbEntries=0; + int entryN[100], entryK[100], entryI[100]; + const opus_int16 *eBands = m->eBands; + PulseCache *cache = &m->cache; + opus_int16 *cindex; + unsigned char *bits; + unsigned char *cap; + + cindex = (opus_int16 *)opus_alloc(sizeof(cache->index[0])*m->nbEBands*(LM+2)); + cache->index = cindex; + + /* Scan for all unique band sizes */ + for (i=0;i<=LM+1;i++) + { + for (j=0;jnbEBands;j++) + { + int k; + int N = (eBands[j+1]-eBands[j])<>1; + cindex[i*m->nbEBands+j] = -1; + /* Find other bands that have the same size */ + for (k=0;k<=i;k++) + { + int n; + for (n=0;nnbEBands && (k!=i || n>1) + { + cindex[i*m->nbEBands+j] = cindex[k*m->nbEBands+n]; + break; + } + } + } + if (cache->index[i*m->nbEBands+j] == -1 && N!=0) + { + int K; + entryN[nbEntries] = N; + K = 0; + while (fits_in32(N,get_pulses(K+1)) && KnbEBands+j] = curr; + entryI[nbEntries] = curr; + + curr += K+1; + nbEntries++; + } + } + } + bits = (unsigned char *)opus_alloc(sizeof(unsigned char)*curr); + cache->bits = bits; + cache->size = curr; + /* Compute the cache for all unique sizes */ + for (i=0;icaps = cap = (unsigned char *)opus_alloc(sizeof(cache->caps[0])*(LM+1)*2*m->nbEBands); + for (i=0;i<=LM;i++) + { + for (C=1;C<=2;C++) + { + for (j=0;jnbEBands;j++) + { + int N0; + int max_bits; + N0 = m->eBands[j+1]-m->eBands[j]; + /* N=1 bands only have a sign bit and fine bits. */ + if (N0<1 are even, including custom modes.*/ + if (N0 > 2) + { + N0>>=1; + LM0--; + } + /* N0=1 bands can't be split down to N<2. */ + else if (N0 <= 1) + { + LM0=IMIN(i,1); + N0<<=LM0; + } + /* Compute the cost for the lowest-level PVQ of a fully split + band. */ + pcache = bits + cindex[(LM0+1)*m->nbEBands+j]; + max_bits = pcache[pcache[0]]+1; + /* Add in the cost of coding regular splits. 
*/ + N = N0; + for(k=0;klogN[j]+((LM0+k)<>1)-QTHETA_OFFSET; + /* The number of qtheta bits we'll allocate if the remainder + is to be max_bits. + The average measured cost for theta is 0.89701 times qb, + approximated here as 459/512. */ + num=459*(opus_int32)((2*N-1)*offset+max_bits); + den=((opus_int32)(2*N-1)<<9)-459; + qb = IMIN((num+(den>>1))/den, 57); + celt_assert(qb >= 0); + max_bits += qb; + N <<= 1; + } + /* Add in the cost of a stereo split, if necessary. */ + if (C==2) + { + max_bits <<= 1; + offset = ((m->logN[j]+(i<>1)-(N==2?QTHETA_OFFSET_TWOPHASE:QTHETA_OFFSET); + ndof = 2*N-1-(N==2); + /* The average measured cost for theta with the step PDF is + 0.95164 times qb, approximated here as 487/512. */ + num = (N==2?512:487)*(opus_int32)(max_bits+ndof*offset); + den = ((opus_int32)ndof<<9)-(N==2?512:487); + qb = IMIN((num+(den>>1))/den, (N==2?64:61)); + celt_assert(qb >= 0); + max_bits += qb; + } + /* Add the fine bits we'll use. */ + /* Compensate for the extra DoF in stereo */ + ndof = C*N + ((C==2 && N>2) ? 1 : 0); + /* Offset the number of fine bits by log2(N)/2 + FINE_OFFSET + compared to their "fair share" of total/N */ + offset = ((m->logN[j] + (i<>1)-FINE_OFFSET; + /* N=2 is the only point that doesn't match the curve */ + if (N==2) + offset += 1<>2; + /* The number of fine bits we'll allocate if the remainder is + to be max_bits. */ + num = max_bits+ndof*offset; + den = (ndof-1)<>1))/den, MAX_FINE_BITS); + celt_assert(qb >= 0); + max_bits += C*qb<eBands[j+1]-m->eBands[j])<= 0); + celt_assert(max_bits < 256); + *cap++ = (unsigned char)max_bits; + } + } + } +} + +#endif /* CUSTOM_MODES */ + +#define ALLOC_STEPS 6 + +static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end, int skip_start, + const int *bits1, const int *bits2, const int *thresh, const int *cap, opus_int32 total, opus_int32 *_balance, + int skip_rsv, int *intensity, int intensity_rsv, int *dual_stereo, int dual_stereo_rsv, int *bits, + int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth) +{ + opus_int32 psum; + int lo, hi; + int i, j; + int logM; + int stereo; + int codedBands=-1; + int alloc_floor; + opus_int32 left, percoeff; + int done; + opus_int32 balance; + SAVE_STACK; + + alloc_floor = C<1; + + logM = LM<>1; + psum = 0; + done = 0; + for (j=end;j-->start;) + { + int tmp = bits1[j] + (mid*(opus_int32)bits2[j]>>ALLOC_STEPS); + if (tmp >= thresh[j] || done) + { + done = 1; + /* Don't allocate more than we can actually use */ + psum += IMIN(tmp, cap[j]); + } else { + if (tmp >= alloc_floor) + psum += alloc_floor; + } + } + if (psum > total) + hi = mid; + else + lo = mid; + } + psum = 0; + /*printf ("interp bisection gave %d\n", lo);*/ + done = 0; + for (j=end;j-->start;) + { + int tmp = bits1[j] + ((opus_int32)lo*bits2[j]>>ALLOC_STEPS); + if (tmp < thresh[j] && !done) + { + if (tmp >= alloc_floor) + tmp = alloc_floor; + else + tmp = 0; + } else + done = 1; + /* Don't allocate more than we can actually use */ + tmp = IMIN(tmp, cap[j]); + bits[j] = tmp; + psum += tmp; + } + + /* Decide which bands to skip, working backwards from the end. */ + for (codedBands=end;;codedBands--) + { + int band_width; + int band_bits; + int rem; + j = codedBands-1; + /* Never skip the first band, nor a band that has been boosted by + dynalloc. + In the first case, we'd be coding a bit to signal we're going to waste + all the other bits. 
+ In the second case, we'd be coding a bit to redistribute all the bits + we just signaled should be cocentrated in this band. */ + if (j<=skip_start) + { + /* Give the bit we reserved to end skipping back. */ + total += skip_rsv; + break; + } + /*Figure out how many left-over bits we would be adding to this band. + This can include bits we've stolen back from higher, skipped bands.*/ + left = total-psum; + percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]); + left -= (m->eBands[codedBands]-m->eBands[start])*percoeff; + rem = IMAX(left-(m->eBands[j]-m->eBands[start]),0); + band_width = m->eBands[codedBands]-m->eBands[j]; + band_bits = (int)(bits[j] + percoeff*band_width + rem); + /*Only code a skip decision if we're above the threshold for this band. + Otherwise it is force-skipped. + This ensures that we have enough bits to code the skip flag.*/ + if (band_bits >= IMAX(thresh[j], alloc_floor+(1< ((j>4 && j<=signalBandwidth)) +#endif + { + ec_enc_bit_logp(ec, 1, 1); + break; + } + ec_enc_bit_logp(ec, 0, 1); + } else if (ec_dec_bit_logp(ec, 1)) { + break; + } + /*We used a bit to skip this band.*/ + psum += 1< 0) + intensity_rsv = LOG2_FRAC_TABLE[j-start]; + psum += intensity_rsv; + if (band_bits >= alloc_floor) + { + /*If we have enough for a fine energy bit per channel, use it.*/ + psum += alloc_floor; + bits[j] = alloc_floor; + } else { + /*Otherwise this band gets nothing at all.*/ + bits[j] = 0; + } + } + + celt_assert(codedBands > start); + /* Code the intensity and dual stereo parameters. */ + if (intensity_rsv > 0) + { + if (encode) + { + *intensity = IMIN(*intensity, codedBands); + ec_enc_uint(ec, *intensity-start, codedBands+1-start); + } + else + *intensity = start+ec_dec_uint(ec, codedBands+1-start); + } + else + *intensity = 0; + if (*intensity <= start) + { + total += dual_stereo_rsv; + dual_stereo_rsv = 0; + } + if (dual_stereo_rsv > 0) + { + if (encode) + ec_enc_bit_logp(ec, *dual_stereo, 1); + else + *dual_stereo = ec_dec_bit_logp(ec, 1); + } + else + *dual_stereo = 0; + + /* Allocate the remaining bits */ + left = total-psum; + percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]); + left -= (m->eBands[codedBands]-m->eBands[start])*percoeff; + for (j=start;jeBands[j+1]-m->eBands[j])); + for (j=start;jeBands[j+1]-m->eBands[j]); + bits[j] += tmp; + left -= tmp; + } + /*for (j=0;j= 0); + N0 = m->eBands[j+1]-m->eBands[j]; + N=N0<1) + { + excess = MAX32(bit-cap[j],0); + bits[j] = bit-excess; + + /* Compensate for the extra DoF in stereo */ + den=(C*N+ ((C==2 && N>2 && !*dual_stereo && j<*intensity) ? 
1 : 0)); + + NClogN = den*(m->logN[j] + logM); + + /* Offset for the number of fine bits by log2(N)/2 + FINE_OFFSET + compared to their "fair share" of total/N */ + offset = (NClogN>>1)-den*FINE_OFFSET; + + /* N=2 is the only point that doesn't match the curve */ + if (N==2) + offset += den<>2; + + /* Changing the offset for allocating the second and third + fine energy bit */ + if (bits[j] + offset < den*2<>2; + else if (bits[j] + offset < den*3<>3; + + /* Divide with rounding */ + ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1)))); + ebits[j] = celt_udiv(ebits[j], den)>>BITRES; + + /* Make sure not to bust */ + if (C*ebits[j] > (bits[j]>>BITRES)) + ebits[j] = bits[j] >> stereo >> BITRES; + + /* More than that is useless because that's about as far as PVQ can go */ + ebits[j] = IMIN(ebits[j], MAX_FINE_BITS); + + /* If we rounded down or capped this band, make it a candidate for the + final fine energy pass */ + fine_priority[j] = ebits[j]*(den<= bits[j]+offset; + + /* Remove the allocated fine bits; the rest are assigned to PVQ */ + bits[j] -= C*ebits[j]< 0) + { + int extra_fine; + int extra_bits; + extra_fine = IMIN(excess>>(stereo+BITRES),MAX_FINE_BITS-ebits[j]); + ebits[j] += extra_fine; + extra_bits = extra_fine*C<= excess-balance; + excess -= extra_bits; + } + balance = excess; + + celt_assert(bits[j] >= 0); + celt_assert(ebits[j] >= 0); + } + /* Save any remaining bits over the cap for the rebalancing in + quant_all_bands(). */ + *_balance = balance; + + /* The skipped bands use all their bits for fine energy. */ + for (;j> stereo >> BITRES; + celt_assert(C*ebits[j]<nbEBands; + skip_start = start; + /* Reserve a bit to signal the end of manually skipped bands. */ + skip_rsv = total >= 1<total) + intensity_rsv = 0; + else + { + total -= intensity_rsv; + dual_stereo_rsv = total>=1<eBands[j+1]-m->eBands[j])<>4); + /* Tilt of the allocation curve */ + trim_offset[j] = C*(m->eBands[j+1]-m->eBands[j])*(alloc_trim-5-LM)*(end-j-1) + *(1<<(LM+BITRES))>>6; + /* Giving less resolution to single-coefficient bands because they get + more benefit from having one coarse value per coefficient*/ + if ((m->eBands[j+1]-m->eBands[j])<nbAllocVectors - 1; + do + { + int done = 0; + int psum = 0; + int mid = (lo+hi) >> 1; + for (j=end;j-->start;) + { + int bitsj; + int N = m->eBands[j+1]-m->eBands[j]; + bitsj = C*N*m->allocVectors[mid*len+j]<>2; + if (bitsj > 0) + bitsj = IMAX(0, bitsj + trim_offset[j]); + bitsj += offsets[j]; + if (bitsj >= thresh[j] || done) + { + done = 1; + /* Don't allocate more than we can actually use */ + psum += IMIN(bitsj, cap[j]); + } else { + if (bitsj >= C< total) + hi = mid - 1; + else + lo = mid + 1; + /*printf ("lo = %d, hi = %d\n", lo, hi);*/ + } + while (lo <= hi); + hi = lo--; + /*printf ("interp between %d and %d\n", lo, hi);*/ + for (j=start;jeBands[j+1]-m->eBands[j]; + bits1j = C*N*m->allocVectors[lo*len+j]<>2; + bits2j = hi>=m->nbAllocVectors ? 
cap[j] : C*N*m->allocVectors[hi*len+j]<<LM>>2;
+      if (bits1j > 0)
+         bits1j = IMAX(0, bits1j + trim_offset[j]);
+      if (bits2j > 0)
+         bits2j = IMAX(0, bits2j + trim_offset[j]);
+      if (lo > 0)
+         bits1j += offsets[j];
+      bits2j += offsets[j];
+      if (offsets[j]>0)
+         skip_start = j;
+      bits2j = IMAX(0,bits2j-bits1j);
+      bits1[j] = bits1j;
+      bits2[j] = bits2j;
+   }
+   codedBands = interp_bits2pulses(m, start, end, skip_start, bits1, bits2, thresh, cap,
+         total, balance, skip_rsv, intensity, intensity_rsv, dual_stereo, dual_stereo_rsv,
+         pulses, ebits, fine_priority, C, LM, ec, encode, prev, signalBandwidth);
+   RESTORE_STACK;
+   return codedBands;
+}
+
diff --git a/thirdparty/opus/celt/rate.h b/thirdparty/opus/celt/rate.h
new file mode 100644
index 000000000000..515f7687cec6
--- /dev/null
+++ b/thirdparty/opus/celt/rate.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef RATE_H
+#define RATE_H
+
+#define MAX_PSEUDO 40
+#define LOG_MAX_PSEUDO 6
+
+#define CELT_MAX_PULSES 128
+
+#define MAX_FINE_BITS 8
+
+#define FINE_OFFSET 21
+#define QTHETA_OFFSET 4
+#define QTHETA_OFFSET_TWOPHASE 16
+
+#include "cwrs.h"
+#include "modes.h"
+
+void compute_pulse_cache(CELTMode *m, int LM);
+
+static OPUS_INLINE int get_pulses(int i)
+{
+   return i<8 ? i : (8 + (i&7)) << ((i>>3)-1);
+}
+
+static OPUS_INLINE int bits2pulses(const CELTMode *m, int band, int LM, int bits)
+{
+   int i;
+   int lo, hi;
+   const unsigned char *cache;
+
+   LM++;
+   cache = m->cache.bits + m->cache.index[LM*m->nbEBands+band];
+
+   lo = 0;
+   hi = cache[0];
+   bits--;
+   for (i=0;i<LOG_MAX_PSEUDO;i++)
+   {
+      int mid = (lo+hi)>>1;
+      /* OPT: Make sure this is implemented with a conditional move */
+      if ((int)cache[mid] >= bits)
+         hi = mid;
+      else
+         lo = mid;
+   }
+   if (bits- (lo == 0 ? -1 : (int)cache[lo]) <= (int)cache[hi]-bits)
+      return lo;
+   else
+      return hi;
+}
+
+static OPUS_INLINE int pulses2bits(const CELTMode *m, int band, int LM, int pulses)
+{
+   const unsigned char *cache;
+
+   LM++;
+   cache = m->cache.bits + m->cache.index[LM*m->nbEBands+band];
+   return pulses == 0 ? 0 : cache[pulses]+1;
+}
+
+/** Compute the pulse allocation, i.e. how many pulses will go in each
+  * band.
+ @param m mode + @param offsets Requested increase or decrease in the number of bits for + each band + @param total Number of bands + @param pulses Number of pulses per band (returned) + @return Total number of bits allocated +*/ +int compute_allocation(const CELTMode *m, int start, int end, const int *offsets, const int *cap, int alloc_trim, int *intensity, int *dual_stero, + opus_int32 total, opus_int32 *balance, int *pulses, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth); + +#endif diff --git a/thirdparty/opus/celt/stack_alloc.h b/thirdparty/opus/celt/stack_alloc.h new file mode 100644 index 000000000000..2b51c8d80cc5 --- /dev/null +++ b/thirdparty/opus/celt/stack_alloc.h @@ -0,0 +1,184 @@ +/* Copyright (C) 2002-2003 Jean-Marc Valin + Copyright (C) 2007-2009 Xiph.Org Foundation */ +/** + @file stack_alloc.h + @brief Temporary memory allocation on stack +*/ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef STACK_ALLOC_H +#define STACK_ALLOC_H + +#include "opus_types.h" +#include "opus_defines.h" + +#if (!defined (VAR_ARRAYS) && !defined (USE_ALLOCA) && !defined (NONTHREADSAFE_PSEUDOSTACK)) +#error "Opus requires one of VAR_ARRAYS, USE_ALLOCA, or NONTHREADSAFE_PSEUDOSTACK be defined to select the temporary allocation mode." 
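Whichever backend the build selects (C99 variable-length arrays, alloca(), or the non-thread-safe pseudostack), the macros defined below give callers one common shape, visible throughout the CELT sources above: declare with VARDECL, bracket the body with SAVE_STACK/RESTORE_STACK, and allocate with ALLOC. A minimal usage sketch, assuming the surrounding Opus headers are included; the function and buffer names are illustrative only:

   /* Illustrative sketch, not part of the Opus sources. */
   static void clear_scratch_example(int n)
   {
      int i;
      VARDECL(opus_val16, tmp);  /* pointer declaration, or nothing for VLAs */
      SAVE_STACK;                /* remembers the pseudostack top, if in use */
      ALLOC(tmp, n, opus_val16); /* VLA, alloca(), or pseudostack PUSH */
      for (i=0;i<n;i++)
         tmp[i] = 0;
      RESTORE_STACK;             /* unwinds the pseudostack, if in use */
   }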
+#endif
+
+#ifdef USE_ALLOCA
+# ifdef WIN32
+#  include <malloc.h>
+# else
+#  ifdef HAVE_ALLOCA_H
+#   include <alloca.h>
+#  else
+#   include <stdlib.h>
+#  endif
+# endif
+#endif
+
+/**
+ * @def ALIGN(stack, size)
+ *
+ * Aligns the stack to a 'size' boundary
+ *
+ * @param stack Stack
+ * @param size New size boundary
+ */
+
+/**
+ * @def PUSH(stack, size, type)
+ *
+ * Allocates 'size' elements of type 'type' on the stack
+ *
+ * @param stack Stack
+ * @param size Number of elements
+ * @param type Type of element
+ */
+
+/**
+ * @def VARDECL(var)
+ *
+ * Declare variable on stack
+ *
+ * @param var Variable to declare
+ */
+
+/**
+ * @def ALLOC(var, size, type)
+ *
+ * Allocate 'size' elements of 'type' on stack
+ *
+ * @param var Name of variable to allocate
+ * @param size Number of elements
+ * @param type Type of element
+ */
+
+#if defined(VAR_ARRAYS)
+
+#define VARDECL(type, var)
+#define ALLOC(var, size, type) type var[size]
+#define SAVE_STACK
+#define RESTORE_STACK
+#define ALLOC_STACK
+/* C99 does not allow VLAs of size zero */
+#define ALLOC_NONE 1
+
+#elif defined(USE_ALLOCA)
+
+#define VARDECL(type, var) type *var
+
+# ifdef WIN32
+# define ALLOC(var, size, type) var = ((type*)_alloca(sizeof(type)*(size)))
+# else
+# define ALLOC(var, size, type) var = ((type*)alloca(sizeof(type)*(size)))
+# endif
+
+#define SAVE_STACK
+#define RESTORE_STACK
+#define ALLOC_STACK
+#define ALLOC_NONE 0
+
+#else
+
+#ifdef CELT_C
+char *scratch_ptr=0;
+char *global_stack=0;
+#else
+extern char *global_stack;
+extern char *scratch_ptr;
+#endif /* CELT_C */
+
+#ifdef ENABLE_VALGRIND
+
+#include <valgrind/memcheck.h>
+
+#ifdef CELT_C
+char *global_stack_top=0;
+#else
+extern char *global_stack_top;
+#endif /* CELT_C */
+
+#define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
+#define PUSH(stack, size, type) (VALGRIND_MAKE_MEM_NOACCESS(stack, global_stack_top-stack),ALIGN((stack),sizeof(type)/sizeof(char)),VALGRIND_MAKE_MEM_UNDEFINED(stack, ((size)*sizeof(type)/sizeof(char))),(stack)+=(2*(size)*sizeof(type)/sizeof(char)),(type*)((stack)-(2*(size)*sizeof(type)/sizeof(char))))
+#define RESTORE_STACK ((global_stack = _saved_stack),VALGRIND_MAKE_MEM_NOACCESS(global_stack, global_stack_top-global_stack))
+#define ALLOC_STACK char *_saved_stack; ((global_stack = (global_stack==0) ? ((global_stack_top=opus_alloc_scratch(GLOBAL_STACK_SIZE*2)+(GLOBAL_STACK_SIZE*2))-(GLOBAL_STACK_SIZE*2)) : global_stack),VALGRIND_MAKE_MEM_NOACCESS(global_stack, global_stack_top-global_stack)); _saved_stack = global_stack;
+
+#else
+
+#define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
+#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char))))
+#if 0 /* Set this to 1 to instrument pseudostack usage */
+#define RESTORE_STACK (printf("%ld %s:%d\n", global_stack-scratch_ptr, __FILE__, __LINE__),global_stack = _saved_stack)
+#else
+#define RESTORE_STACK (global_stack = _saved_stack)
+#endif
+#define ALLOC_STACK char *_saved_stack; (global_stack = (global_stack==0) ?
(scratch_ptr=opus_alloc_scratch(GLOBAL_STACK_SIZE)) : global_stack); _saved_stack = global_stack; + +#endif /* ENABLE_VALGRIND */ + +#include "os_support.h" +#define VARDECL(type, var) type *var +#define ALLOC(var, size, type) var = PUSH(global_stack, size, type) +#define SAVE_STACK char *_saved_stack = global_stack; +#define ALLOC_NONE 0 + +#endif /* VAR_ARRAYS */ + + +#ifdef ENABLE_VALGRIND + +#include +#define OPUS_CHECK_ARRAY(ptr, len) VALGRIND_CHECK_MEM_IS_DEFINED(ptr, len*sizeof(*ptr)) +#define OPUS_CHECK_VALUE(value) VALGRIND_CHECK_VALUE_IS_DEFINED(value) +#define OPUS_CHECK_ARRAY_COND(ptr, len) VALGRIND_CHECK_MEM_IS_DEFINED(ptr, len*sizeof(*ptr)) +#define OPUS_CHECK_VALUE_COND(value) VALGRIND_CHECK_VALUE_IS_DEFINED(value) +#define OPUS_PRINT_INT(value) do {fprintf(stderr, #value " = %d at %s:%d\n", value, __FILE__, __LINE__);}while(0) +#define OPUS_FPRINTF fprintf + +#else + +static OPUS_INLINE int _opus_false(void) {return 0;} +#define OPUS_CHECK_ARRAY(ptr, len) _opus_false() +#define OPUS_CHECK_VALUE(value) _opus_false() +#define OPUS_PRINT_INT(value) do{}while(0) +#define OPUS_FPRINTF (void) + +#endif + + +#endif /* STACK_ALLOC_H */ diff --git a/thirdparty/opus/celt/static_modes_fixed.h b/thirdparty/opus/celt/static_modes_fixed.h new file mode 100644 index 000000000000..8717d626cbee --- /dev/null +++ b/thirdparty/opus/celt/static_modes_fixed.h @@ -0,0 +1,892 @@ +/* The contents of this file was automatically generated by dump_modes.c + with arguments: 48000 960 + It contains static definitions for some pre-defined modes. */ +#include "modes.h" +#include "rate.h" + +#ifdef HAVE_ARM_NE10 +#define OVERRIDE_FFT 1 +#include "static_modes_fixed_arm_ne10.h" +#endif + +#ifndef DEF_WINDOW120 +#define DEF_WINDOW120 +static const opus_val16 window120[120] = { +2, 20, 55, 108, 178, +266, 372, 494, 635, 792, +966, 1157, 1365, 1590, 1831, +2089, 2362, 2651, 2956, 3276, +3611, 3961, 4325, 4703, 5094, +5499, 5916, 6346, 6788, 7241, +7705, 8179, 8663, 9156, 9657, +10167, 10684, 11207, 11736, 12271, +12810, 13353, 13899, 14447, 14997, +15547, 16098, 16648, 17197, 17744, +18287, 18827, 19363, 19893, 20418, +20936, 21447, 21950, 22445, 22931, +23407, 23874, 24330, 24774, 25208, +25629, 26039, 26435, 26819, 27190, +27548, 27893, 28224, 28541, 28845, +29135, 29411, 29674, 29924, 30160, +30384, 30594, 30792, 30977, 31151, +31313, 31463, 31602, 31731, 31849, +31958, 32057, 32148, 32229, 32303, +32370, 32429, 32481, 32528, 32568, +32604, 32634, 32661, 32683, 32701, +32717, 32729, 32740, 32748, 32754, +32758, 32762, 32764, 32766, 32767, +32767, 32767, 32767, 32767, 32767, +}; +#endif + +#ifndef DEF_LOGN400 +#define DEF_LOGN400 +static const opus_int16 logN400[21] = { +0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 16, 16, 16, 21, 21, 24, 29, 34, 36, }; +#endif + +#ifndef DEF_PULSE_CACHE50 +#define DEF_PULSE_CACHE50 +static const opus_int16 cache_index50[105] = { +-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 41, 41, 41, +82, 82, 123, 164, 200, 222, 0, 0, 0, 0, 0, 0, 0, 0, 41, +41, 41, 41, 123, 123, 123, 164, 164, 240, 266, 283, 295, 41, 41, 41, +41, 41, 41, 41, 41, 123, 123, 123, 123, 240, 240, 240, 266, 266, 305, +318, 328, 336, 123, 123, 123, 123, 123, 123, 123, 123, 240, 240, 240, 240, +305, 305, 305, 318, 318, 343, 351, 358, 364, 240, 240, 240, 240, 240, 240, +240, 240, 305, 305, 305, 305, 343, 343, 343, 351, 351, 370, 376, 382, 387, +}; +static const unsigned char cache_bits50[392] = { +40, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 
40, 15, 23, 28, +31, 34, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50, +51, 52, 53, 54, 55, 55, 57, 58, 59, 60, 61, 62, 63, 63, 65, +66, 67, 68, 69, 70, 71, 71, 40, 20, 33, 41, 48, 53, 57, 61, +64, 66, 69, 71, 73, 75, 76, 78, 80, 82, 85, 87, 89, 91, 92, +94, 96, 98, 101, 103, 105, 107, 108, 110, 112, 114, 117, 119, 121, 123, +124, 126, 128, 40, 23, 39, 51, 60, 67, 73, 79, 83, 87, 91, 94, +97, 100, 102, 105, 107, 111, 115, 118, 121, 124, 126, 129, 131, 135, 139, +142, 145, 148, 150, 153, 155, 159, 163, 166, 169, 172, 174, 177, 179, 35, +28, 49, 65, 78, 89, 99, 107, 114, 120, 126, 132, 136, 141, 145, 149, +153, 159, 165, 171, 176, 180, 185, 189, 192, 199, 205, 211, 216, 220, 225, +229, 232, 239, 245, 251, 21, 33, 58, 79, 97, 112, 125, 137, 148, 157, +166, 174, 182, 189, 195, 201, 207, 217, 227, 235, 243, 251, 17, 35, 63, +86, 106, 123, 139, 152, 165, 177, 187, 197, 206, 214, 222, 230, 237, 250, +25, 31, 55, 75, 91, 105, 117, 128, 138, 146, 154, 161, 168, 174, 180, +185, 190, 200, 208, 215, 222, 229, 235, 240, 245, 255, 16, 36, 65, 89, +110, 128, 144, 159, 173, 185, 196, 207, 217, 226, 234, 242, 250, 11, 41, +74, 103, 128, 151, 172, 191, 209, 225, 241, 255, 9, 43, 79, 110, 138, +163, 186, 207, 227, 246, 12, 39, 71, 99, 123, 144, 164, 182, 198, 214, +228, 241, 253, 9, 44, 81, 113, 142, 168, 192, 214, 235, 255, 7, 49, +90, 127, 160, 191, 220, 247, 6, 51, 95, 134, 170, 203, 234, 7, 47, +87, 123, 155, 184, 212, 237, 6, 52, 97, 137, 174, 208, 240, 5, 57, +106, 151, 192, 231, 5, 59, 111, 158, 202, 243, 5, 55, 103, 147, 187, +224, 5, 60, 113, 161, 206, 248, 4, 65, 122, 175, 224, 4, 67, 127, +182, 234, }; +static const unsigned char cache_caps50[168] = { +224, 224, 224, 224, 224, 224, 224, 224, 160, 160, 160, 160, 185, 185, 185, +178, 178, 168, 134, 61, 37, 224, 224, 224, 224, 224, 224, 224, 224, 240, +240, 240, 240, 207, 207, 207, 198, 198, 183, 144, 66, 40, 160, 160, 160, +160, 160, 160, 160, 160, 185, 185, 185, 185, 193, 193, 193, 183, 183, 172, +138, 64, 38, 240, 240, 240, 240, 240, 240, 240, 240, 207, 207, 207, 207, +204, 204, 204, 193, 193, 180, 143, 66, 40, 185, 185, 185, 185, 185, 185, +185, 185, 193, 193, 193, 193, 193, 193, 193, 183, 183, 172, 138, 65, 39, +207, 207, 207, 207, 207, 207, 207, 207, 204, 204, 204, 204, 201, 201, 201, +188, 188, 176, 141, 66, 40, 193, 193, 193, 193, 193, 193, 193, 193, 193, +193, 193, 193, 194, 194, 194, 184, 184, 173, 139, 65, 39, 204, 204, 204, +204, 204, 204, 204, 204, 201, 201, 201, 201, 198, 198, 198, 187, 187, 175, +140, 66, 40, }; +#endif + +#ifndef FFT_TWIDDLES48000_960 +#define FFT_TWIDDLES48000_960 +static const kiss_twiddle_cpx fft_twiddles48000_960[480] = { +{32767, 0}, {32766, -429}, +{32757, -858}, {32743, -1287}, +{32724, -1715}, {32698, -2143}, +{32667, -2570}, {32631, -2998}, +{32588, -3425}, {32541, -3851}, +{32488, -4277}, {32429, -4701}, +{32364, -5125}, {32295, -5548}, +{32219, -5971}, {32138, -6393}, +{32051, -6813}, {31960, -7231}, +{31863, -7650}, {31760, -8067}, +{31652, -8481}, {31539, -8895}, +{31419, -9306}, {31294, -9716}, +{31165, -10126}, {31030, -10532}, +{30889, -10937}, {30743, -11340}, +{30592, -11741}, {30436, -12141}, +{30274, -12540}, {30107, -12935}, +{29936, -13328}, {29758, -13718}, +{29577, -14107}, {29390, -14493}, +{29197, -14875}, {29000, -15257}, +{28797, -15635}, {28590, -16010}, +{28379, -16384}, {28162, -16753}, +{27940, -17119}, {27714, -17484}, +{27482, -17845}, {27246, -18205}, +{27006, -18560}, {26760, -18911}, +{26510, -19260}, {26257, -19606}, +{25997, -19947}, {25734, -20286}, +{25466, -20621}, 
{25194, -20952}, +{24918, -21281}, {24637, -21605}, +{24353, -21926}, {24063, -22242}, +{23770, -22555}, {23473, -22865}, +{23171, -23171}, {22866, -23472}, +{22557, -23769}, {22244, -24063}, +{21927, -24352}, {21606, -24636}, +{21282, -24917}, {20954, -25194}, +{20622, -25465}, {20288, -25733}, +{19949, -25997}, {19607, -26255}, +{19261, -26509}, {18914, -26760}, +{18561, -27004}, {18205, -27246}, +{17846, -27481}, {17485, -27713}, +{17122, -27940}, {16755, -28162}, +{16385, -28378}, {16012, -28590}, +{15636, -28797}, {15258, -28999}, +{14878, -29197}, {14494, -29389}, +{14108, -29576}, {13720, -29757}, +{13329, -29934}, {12937, -30107}, +{12540, -30274}, {12142, -30435}, +{11744, -30592}, {11342, -30743}, +{10939, -30889}, {10534, -31030}, +{10127, -31164}, {9718, -31294}, +{9307, -31418}, {8895, -31537}, +{8482, -31652}, {8067, -31759}, +{7650, -31862}, {7233, -31960}, +{6815, -32051}, {6393, -32138}, +{5973, -32219}, {5549, -32294}, +{5127, -32364}, {4703, -32429}, +{4278, -32487}, {3852, -32541}, +{3426, -32588}, {2999, -32630}, +{2572, -32667}, {2144, -32698}, +{1716, -32724}, {1287, -32742}, +{860, -32757}, {430, -32766}, +{0, -32767}, {-429, -32766}, +{-858, -32757}, {-1287, -32743}, +{-1715, -32724}, {-2143, -32698}, +{-2570, -32667}, {-2998, -32631}, +{-3425, -32588}, {-3851, -32541}, +{-4277, -32488}, {-4701, -32429}, +{-5125, -32364}, {-5548, -32295}, +{-5971, -32219}, {-6393, -32138}, +{-6813, -32051}, {-7231, -31960}, +{-7650, -31863}, {-8067, -31760}, +{-8481, -31652}, {-8895, -31539}, +{-9306, -31419}, {-9716, -31294}, +{-10126, -31165}, {-10532, -31030}, +{-10937, -30889}, {-11340, -30743}, +{-11741, -30592}, {-12141, -30436}, +{-12540, -30274}, {-12935, -30107}, +{-13328, -29936}, {-13718, -29758}, +{-14107, -29577}, {-14493, -29390}, +{-14875, -29197}, {-15257, -29000}, +{-15635, -28797}, {-16010, -28590}, +{-16384, -28379}, {-16753, -28162}, +{-17119, -27940}, {-17484, -27714}, +{-17845, -27482}, {-18205, -27246}, +{-18560, -27006}, {-18911, -26760}, +{-19260, -26510}, {-19606, -26257}, +{-19947, -25997}, {-20286, -25734}, +{-20621, -25466}, {-20952, -25194}, +{-21281, -24918}, {-21605, -24637}, +{-21926, -24353}, {-22242, -24063}, +{-22555, -23770}, {-22865, -23473}, +{-23171, -23171}, {-23472, -22866}, +{-23769, -22557}, {-24063, -22244}, +{-24352, -21927}, {-24636, -21606}, +{-24917, -21282}, {-25194, -20954}, +{-25465, -20622}, {-25733, -20288}, +{-25997, -19949}, {-26255, -19607}, +{-26509, -19261}, {-26760, -18914}, +{-27004, -18561}, {-27246, -18205}, +{-27481, -17846}, {-27713, -17485}, +{-27940, -17122}, {-28162, -16755}, +{-28378, -16385}, {-28590, -16012}, +{-28797, -15636}, {-28999, -15258}, +{-29197, -14878}, {-29389, -14494}, +{-29576, -14108}, {-29757, -13720}, +{-29934, -13329}, {-30107, -12937}, +{-30274, -12540}, {-30435, -12142}, +{-30592, -11744}, {-30743, -11342}, +{-30889, -10939}, {-31030, -10534}, +{-31164, -10127}, {-31294, -9718}, +{-31418, -9307}, {-31537, -8895}, +{-31652, -8482}, {-31759, -8067}, +{-31862, -7650}, {-31960, -7233}, +{-32051, -6815}, {-32138, -6393}, +{-32219, -5973}, {-32294, -5549}, +{-32364, -5127}, {-32429, -4703}, +{-32487, -4278}, {-32541, -3852}, +{-32588, -3426}, {-32630, -2999}, +{-32667, -2572}, {-32698, -2144}, +{-32724, -1716}, {-32742, -1287}, +{-32757, -860}, {-32766, -430}, +{-32767, 0}, {-32766, 429}, +{-32757, 858}, {-32743, 1287}, +{-32724, 1715}, {-32698, 2143}, +{-32667, 2570}, {-32631, 2998}, +{-32588, 3425}, {-32541, 3851}, +{-32488, 4277}, {-32429, 4701}, +{-32364, 5125}, {-32295, 5548}, +{-32219, 5971}, 
{-32138, 6393}, +{-32051, 6813}, {-31960, 7231}, +{-31863, 7650}, {-31760, 8067}, +{-31652, 8481}, {-31539, 8895}, +{-31419, 9306}, {-31294, 9716}, +{-31165, 10126}, {-31030, 10532}, +{-30889, 10937}, {-30743, 11340}, +{-30592, 11741}, {-30436, 12141}, +{-30274, 12540}, {-30107, 12935}, +{-29936, 13328}, {-29758, 13718}, +{-29577, 14107}, {-29390, 14493}, +{-29197, 14875}, {-29000, 15257}, +{-28797, 15635}, {-28590, 16010}, +{-28379, 16384}, {-28162, 16753}, +{-27940, 17119}, {-27714, 17484}, +{-27482, 17845}, {-27246, 18205}, +{-27006, 18560}, {-26760, 18911}, +{-26510, 19260}, {-26257, 19606}, +{-25997, 19947}, {-25734, 20286}, +{-25466, 20621}, {-25194, 20952}, +{-24918, 21281}, {-24637, 21605}, +{-24353, 21926}, {-24063, 22242}, +{-23770, 22555}, {-23473, 22865}, +{-23171, 23171}, {-22866, 23472}, +{-22557, 23769}, {-22244, 24063}, +{-21927, 24352}, {-21606, 24636}, +{-21282, 24917}, {-20954, 25194}, +{-20622, 25465}, {-20288, 25733}, +{-19949, 25997}, {-19607, 26255}, +{-19261, 26509}, {-18914, 26760}, +{-18561, 27004}, {-18205, 27246}, +{-17846, 27481}, {-17485, 27713}, +{-17122, 27940}, {-16755, 28162}, +{-16385, 28378}, {-16012, 28590}, +{-15636, 28797}, {-15258, 28999}, +{-14878, 29197}, {-14494, 29389}, +{-14108, 29576}, {-13720, 29757}, +{-13329, 29934}, {-12937, 30107}, +{-12540, 30274}, {-12142, 30435}, +{-11744, 30592}, {-11342, 30743}, +{-10939, 30889}, {-10534, 31030}, +{-10127, 31164}, {-9718, 31294}, +{-9307, 31418}, {-8895, 31537}, +{-8482, 31652}, {-8067, 31759}, +{-7650, 31862}, {-7233, 31960}, +{-6815, 32051}, {-6393, 32138}, +{-5973, 32219}, {-5549, 32294}, +{-5127, 32364}, {-4703, 32429}, +{-4278, 32487}, {-3852, 32541}, +{-3426, 32588}, {-2999, 32630}, +{-2572, 32667}, {-2144, 32698}, +{-1716, 32724}, {-1287, 32742}, +{-860, 32757}, {-430, 32766}, +{0, 32767}, {429, 32766}, +{858, 32757}, {1287, 32743}, +{1715, 32724}, {2143, 32698}, +{2570, 32667}, {2998, 32631}, +{3425, 32588}, {3851, 32541}, +{4277, 32488}, {4701, 32429}, +{5125, 32364}, {5548, 32295}, +{5971, 32219}, {6393, 32138}, +{6813, 32051}, {7231, 31960}, +{7650, 31863}, {8067, 31760}, +{8481, 31652}, {8895, 31539}, +{9306, 31419}, {9716, 31294}, +{10126, 31165}, {10532, 31030}, +{10937, 30889}, {11340, 30743}, +{11741, 30592}, {12141, 30436}, +{12540, 30274}, {12935, 30107}, +{13328, 29936}, {13718, 29758}, +{14107, 29577}, {14493, 29390}, +{14875, 29197}, {15257, 29000}, +{15635, 28797}, {16010, 28590}, +{16384, 28379}, {16753, 28162}, +{17119, 27940}, {17484, 27714}, +{17845, 27482}, {18205, 27246}, +{18560, 27006}, {18911, 26760}, +{19260, 26510}, {19606, 26257}, +{19947, 25997}, {20286, 25734}, +{20621, 25466}, {20952, 25194}, +{21281, 24918}, {21605, 24637}, +{21926, 24353}, {22242, 24063}, +{22555, 23770}, {22865, 23473}, +{23171, 23171}, {23472, 22866}, +{23769, 22557}, {24063, 22244}, +{24352, 21927}, {24636, 21606}, +{24917, 21282}, {25194, 20954}, +{25465, 20622}, {25733, 20288}, +{25997, 19949}, {26255, 19607}, +{26509, 19261}, {26760, 18914}, +{27004, 18561}, {27246, 18205}, +{27481, 17846}, {27713, 17485}, +{27940, 17122}, {28162, 16755}, +{28378, 16385}, {28590, 16012}, +{28797, 15636}, {28999, 15258}, +{29197, 14878}, {29389, 14494}, +{29576, 14108}, {29757, 13720}, +{29934, 13329}, {30107, 12937}, +{30274, 12540}, {30435, 12142}, +{30592, 11744}, {30743, 11342}, +{30889, 10939}, {31030, 10534}, +{31164, 10127}, {31294, 9718}, +{31418, 9307}, {31537, 8895}, +{31652, 8482}, {31759, 8067}, +{31862, 7650}, {31960, 7233}, +{32051, 6815}, {32138, 6393}, +{32219, 5973}, {32294, 5549}, +{32364, 
5127}, {32429, 4703}, +{32487, 4278}, {32541, 3852}, +{32588, 3426}, {32630, 2999}, +{32667, 2572}, {32698, 2144}, +{32724, 1716}, {32742, 1287}, +{32757, 860}, {32766, 430}, +}; +#ifndef FFT_BITREV480 +#define FFT_BITREV480 +static const opus_int16 fft_bitrev480[480] = { +0, 96, 192, 288, 384, 32, 128, 224, 320, 416, 64, 160, 256, 352, 448, +8, 104, 200, 296, 392, 40, 136, 232, 328, 424, 72, 168, 264, 360, 456, +16, 112, 208, 304, 400, 48, 144, 240, 336, 432, 80, 176, 272, 368, 464, +24, 120, 216, 312, 408, 56, 152, 248, 344, 440, 88, 184, 280, 376, 472, +4, 100, 196, 292, 388, 36, 132, 228, 324, 420, 68, 164, 260, 356, 452, +12, 108, 204, 300, 396, 44, 140, 236, 332, 428, 76, 172, 268, 364, 460, +20, 116, 212, 308, 404, 52, 148, 244, 340, 436, 84, 180, 276, 372, 468, +28, 124, 220, 316, 412, 60, 156, 252, 348, 444, 92, 188, 284, 380, 476, +1, 97, 193, 289, 385, 33, 129, 225, 321, 417, 65, 161, 257, 353, 449, +9, 105, 201, 297, 393, 41, 137, 233, 329, 425, 73, 169, 265, 361, 457, +17, 113, 209, 305, 401, 49, 145, 241, 337, 433, 81, 177, 273, 369, 465, +25, 121, 217, 313, 409, 57, 153, 249, 345, 441, 89, 185, 281, 377, 473, +5, 101, 197, 293, 389, 37, 133, 229, 325, 421, 69, 165, 261, 357, 453, +13, 109, 205, 301, 397, 45, 141, 237, 333, 429, 77, 173, 269, 365, 461, +21, 117, 213, 309, 405, 53, 149, 245, 341, 437, 85, 181, 277, 373, 469, +29, 125, 221, 317, 413, 61, 157, 253, 349, 445, 93, 189, 285, 381, 477, +2, 98, 194, 290, 386, 34, 130, 226, 322, 418, 66, 162, 258, 354, 450, +10, 106, 202, 298, 394, 42, 138, 234, 330, 426, 74, 170, 266, 362, 458, +18, 114, 210, 306, 402, 50, 146, 242, 338, 434, 82, 178, 274, 370, 466, +26, 122, 218, 314, 410, 58, 154, 250, 346, 442, 90, 186, 282, 378, 474, +6, 102, 198, 294, 390, 38, 134, 230, 326, 422, 70, 166, 262, 358, 454, +14, 110, 206, 302, 398, 46, 142, 238, 334, 430, 78, 174, 270, 366, 462, +22, 118, 214, 310, 406, 54, 150, 246, 342, 438, 86, 182, 278, 374, 470, +30, 126, 222, 318, 414, 62, 158, 254, 350, 446, 94, 190, 286, 382, 478, +3, 99, 195, 291, 387, 35, 131, 227, 323, 419, 67, 163, 259, 355, 451, +11, 107, 203, 299, 395, 43, 139, 235, 331, 427, 75, 171, 267, 363, 459, +19, 115, 211, 307, 403, 51, 147, 243, 339, 435, 83, 179, 275, 371, 467, +27, 123, 219, 315, 411, 59, 155, 251, 347, 443, 91, 187, 283, 379, 475, +7, 103, 199, 295, 391, 39, 135, 231, 327, 423, 71, 167, 263, 359, 455, +15, 111, 207, 303, 399, 47, 143, 239, 335, 431, 79, 175, 271, 367, 463, +23, 119, 215, 311, 407, 55, 151, 247, 343, 439, 87, 183, 279, 375, 471, +31, 127, 223, 319, 415, 63, 159, 255, 351, 447, 95, 191, 287, 383, 479, +}; +#endif + +#ifndef FFT_BITREV240 +#define FFT_BITREV240 +static const opus_int16 fft_bitrev240[240] = { +0, 48, 96, 144, 192, 16, 64, 112, 160, 208, 32, 80, 128, 176, 224, +4, 52, 100, 148, 196, 20, 68, 116, 164, 212, 36, 84, 132, 180, 228, +8, 56, 104, 152, 200, 24, 72, 120, 168, 216, 40, 88, 136, 184, 232, +12, 60, 108, 156, 204, 28, 76, 124, 172, 220, 44, 92, 140, 188, 236, +1, 49, 97, 145, 193, 17, 65, 113, 161, 209, 33, 81, 129, 177, 225, +5, 53, 101, 149, 197, 21, 69, 117, 165, 213, 37, 85, 133, 181, 229, +9, 57, 105, 153, 201, 25, 73, 121, 169, 217, 41, 89, 137, 185, 233, +13, 61, 109, 157, 205, 29, 77, 125, 173, 221, 45, 93, 141, 189, 237, +2, 50, 98, 146, 194, 18, 66, 114, 162, 210, 34, 82, 130, 178, 226, +6, 54, 102, 150, 198, 22, 70, 118, 166, 214, 38, 86, 134, 182, 230, +10, 58, 106, 154, 202, 26, 74, 122, 170, 218, 42, 90, 138, 186, 234, +14, 62, 110, 158, 206, 30, 78, 126, 174, 222, 46, 94, 142, 190, 238, +3, 51, 99, 147, 195, 
19, 67, 115, 163, 211, 35, 83, 131, 179, 227, +7, 55, 103, 151, 199, 23, 71, 119, 167, 215, 39, 87, 135, 183, 231, +11, 59, 107, 155, 203, 27, 75, 123, 171, 219, 43, 91, 139, 187, 235, +15, 63, 111, 159, 207, 31, 79, 127, 175, 223, 47, 95, 143, 191, 239, +}; +#endif + +#ifndef FFT_BITREV120 +#define FFT_BITREV120 +static const opus_int16 fft_bitrev120[120] = { +0, 24, 48, 72, 96, 8, 32, 56, 80, 104, 16, 40, 64, 88, 112, +4, 28, 52, 76, 100, 12, 36, 60, 84, 108, 20, 44, 68, 92, 116, +1, 25, 49, 73, 97, 9, 33, 57, 81, 105, 17, 41, 65, 89, 113, +5, 29, 53, 77, 101, 13, 37, 61, 85, 109, 21, 45, 69, 93, 117, +2, 26, 50, 74, 98, 10, 34, 58, 82, 106, 18, 42, 66, 90, 114, +6, 30, 54, 78, 102, 14, 38, 62, 86, 110, 22, 46, 70, 94, 118, +3, 27, 51, 75, 99, 11, 35, 59, 83, 107, 19, 43, 67, 91, 115, +7, 31, 55, 79, 103, 15, 39, 63, 87, 111, 23, 47, 71, 95, 119, +}; +#endif + +#ifndef FFT_BITREV60 +#define FFT_BITREV60 +static const opus_int16 fft_bitrev60[60] = { +0, 12, 24, 36, 48, 4, 16, 28, 40, 52, 8, 20, 32, 44, 56, +1, 13, 25, 37, 49, 5, 17, 29, 41, 53, 9, 21, 33, 45, 57, +2, 14, 26, 38, 50, 6, 18, 30, 42, 54, 10, 22, 34, 46, 58, +3, 15, 27, 39, 51, 7, 19, 31, 43, 55, 11, 23, 35, 47, 59, +}; +#endif + +#ifndef FFT_STATE48000_960_0 +#define FFT_STATE48000_960_0 +static const kiss_fft_state fft_state48000_960_0 = { +480, /* nfft */ +17476, /* scale */ +8, /* scale_shift */ +-1, /* shift */ +{5, 96, 3, 32, 4, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, }, /* factors */ +fft_bitrev480, /* bitrev */ +fft_twiddles48000_960, /* bitrev */ +#ifdef OVERRIDE_FFT +(arch_fft_state *)&cfg_arch_480, +#else +NULL, +#endif +}; +#endif + +#ifndef FFT_STATE48000_960_1 +#define FFT_STATE48000_960_1 +static const kiss_fft_state fft_state48000_960_1 = { +240, /* nfft */ +17476, /* scale */ +7, /* scale_shift */ +1, /* shift */ +{5, 48, 3, 16, 4, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, }, /* factors */ +fft_bitrev240, /* bitrev */ +fft_twiddles48000_960, /* bitrev */ +#ifdef OVERRIDE_FFT +(arch_fft_state *)&cfg_arch_240, +#else +NULL, +#endif +}; +#endif + +#ifndef FFT_STATE48000_960_2 +#define FFT_STATE48000_960_2 +static const kiss_fft_state fft_state48000_960_2 = { +120, /* nfft */ +17476, /* scale */ +6, /* scale_shift */ +2, /* shift */ +{5, 24, 3, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, }, /* factors */ +fft_bitrev120, /* bitrev */ +fft_twiddles48000_960, /* bitrev */ +#ifdef OVERRIDE_FFT +(arch_fft_state *)&cfg_arch_120, +#else +NULL, +#endif +}; +#endif + +#ifndef FFT_STATE48000_960_3 +#define FFT_STATE48000_960_3 +static const kiss_fft_state fft_state48000_960_3 = { +60, /* nfft */ +17476, /* scale */ +5, /* scale_shift */ +3, /* shift */ +{5, 12, 3, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* factors */ +fft_bitrev60, /* bitrev */ +fft_twiddles48000_960, /* bitrev */ +#ifdef OVERRIDE_FFT +(arch_fft_state *)&cfg_arch_60, +#else +NULL, +#endif +}; +#endif + +#endif + +#ifndef MDCT_TWIDDLES960 +#define MDCT_TWIDDLES960 +static const opus_val16 mdct_twiddles960[1800] = { +32767, 32767, 32767, 32766, 32765, +32763, 32761, 32759, 32756, 32753, +32750, 32746, 32742, 32738, 32733, +32728, 32722, 32717, 32710, 32704, +32697, 32690, 32682, 32674, 32666, +32657, 32648, 32639, 32629, 32619, +32609, 32598, 32587, 32576, 32564, +32552, 32539, 32526, 32513, 32500, +32486, 32472, 32457, 32442, 32427, +32411, 32395, 32379, 32362, 32345, +32328, 32310, 32292, 32274, 32255, +32236, 32217, 32197, 32177, 32157, +32136, 32115, 32093, 32071, 32049, +32027, 32004, 31981, 31957, 31933, +31909, 31884, 31859, 31834, 31809, +31783, 31756, 31730, 31703, 
31676, +31648, 31620, 31592, 31563, 31534, +31505, 31475, 31445, 31415, 31384, +31353, 31322, 31290, 31258, 31226, +31193, 31160, 31127, 31093, 31059, +31025, 30990, 30955, 30920, 30884, +30848, 30812, 30775, 30738, 30701, +30663, 30625, 30587, 30548, 30509, +30470, 30430, 30390, 30350, 30309, +30269, 30227, 30186, 30144, 30102, +30059, 30016, 29973, 29930, 29886, +29842, 29797, 29752, 29707, 29662, +29616, 29570, 29524, 29477, 29430, +29383, 29335, 29287, 29239, 29190, +29142, 29092, 29043, 28993, 28943, +28892, 28842, 28791, 28739, 28688, +28636, 28583, 28531, 28478, 28425, +28371, 28317, 28263, 28209, 28154, +28099, 28044, 27988, 27932, 27876, +27820, 27763, 27706, 27648, 27591, +27533, 27474, 27416, 27357, 27298, +27238, 27178, 27118, 27058, 26997, +26936, 26875, 26814, 26752, 26690, +26628, 26565, 26502, 26439, 26375, +26312, 26247, 26183, 26119, 26054, +25988, 25923, 25857, 25791, 25725, +25658, 25592, 25524, 25457, 25389, +25322, 25253, 25185, 25116, 25047, +24978, 24908, 24838, 24768, 24698, +24627, 24557, 24485, 24414, 24342, +24270, 24198, 24126, 24053, 23980, +23907, 23834, 23760, 23686, 23612, +23537, 23462, 23387, 23312, 23237, +23161, 23085, 23009, 22932, 22856, +22779, 22701, 22624, 22546, 22468, +22390, 22312, 22233, 22154, 22075, +21996, 21916, 21836, 21756, 21676, +21595, 21515, 21434, 21352, 21271, +21189, 21107, 21025, 20943, 20860, +20777, 20694, 20611, 20528, 20444, +20360, 20276, 20192, 20107, 20022, +19937, 19852, 19767, 19681, 19595, +19509, 19423, 19336, 19250, 19163, +19076, 18988, 18901, 18813, 18725, +18637, 18549, 18460, 18372, 18283, +18194, 18104, 18015, 17925, 17835, +17745, 17655, 17565, 17474, 17383, +17292, 17201, 17110, 17018, 16927, +16835, 16743, 16650, 16558, 16465, +16372, 16279, 16186, 16093, 15999, +15906, 15812, 15718, 15624, 15529, +15435, 15340, 15245, 15150, 15055, +14960, 14864, 14769, 14673, 14577, +14481, 14385, 14288, 14192, 14095, +13998, 13901, 13804, 13706, 13609, +13511, 13414, 13316, 13218, 13119, +13021, 12923, 12824, 12725, 12626, +12527, 12428, 12329, 12230, 12130, +12030, 11930, 11831, 11730, 11630, +11530, 11430, 11329, 11228, 11128, +11027, 10926, 10824, 10723, 10622, +10520, 10419, 10317, 10215, 10113, +10011, 9909, 9807, 9704, 9602, +9499, 9397, 9294, 9191, 9088, +8985, 8882, 8778, 8675, 8572, +8468, 8364, 8261, 8157, 8053, +7949, 7845, 7741, 7637, 7532, +7428, 7323, 7219, 7114, 7009, +6905, 6800, 6695, 6590, 6485, +6380, 6274, 6169, 6064, 5958, +5853, 5747, 5642, 5536, 5430, +5325, 5219, 5113, 5007, 4901, +4795, 4689, 4583, 4476, 4370, +4264, 4157, 4051, 3945, 3838, +3732, 3625, 3518, 3412, 3305, +3198, 3092, 2985, 2878, 2771, +2664, 2558, 2451, 2344, 2237, +2130, 2023, 1916, 1809, 1702, +1594, 1487, 1380, 1273, 1166, +1059, 952, 844, 737, 630, +523, 416, 308, 201, 94, +-13, -121, -228, -335, -442, +-550, -657, -764, -871, -978, +-1086, -1193, -1300, -1407, -1514, +-1621, -1728, -1835, -1942, -2049, +-2157, -2263, -2370, -2477, -2584, +-2691, -2798, -2905, -3012, -3118, +-3225, -3332, -3439, -3545, -3652, +-3758, -3865, -3971, -4078, -4184, +-4290, -4397, -4503, -4609, -4715, +-4821, -4927, -5033, -5139, -5245, +-5351, -5457, -5562, -5668, -5774, +-5879, -5985, -6090, -6195, -6301, +-6406, -6511, -6616, -6721, -6826, +-6931, -7036, -7140, -7245, -7349, +-7454, -7558, -7663, -7767, -7871, +-7975, -8079, -8183, -8287, -8390, +-8494, -8597, -8701, -8804, -8907, +-9011, -9114, -9217, -9319, -9422, +-9525, -9627, -9730, -9832, -9934, +-10037, -10139, -10241, -10342, -10444, +-10546, -10647, -10748, -10850, -10951, +-11052, -11153, 
-11253, -11354, -11455, +-11555, -11655, -11756, -11856, -11955, +-12055, -12155, -12254, -12354, -12453, +-12552, -12651, -12750, -12849, -12947, +-13046, -13144, -13242, -13340, -13438, +-13536, -13633, -13731, -13828, -13925, +-14022, -14119, -14216, -14312, -14409, +-14505, -14601, -14697, -14793, -14888, +-14984, -15079, -15174, -15269, -15364, +-15459, -15553, -15647, -15741, -15835, +-15929, -16023, -16116, -16210, -16303, +-16396, -16488, -16581, -16673, -16766, +-16858, -16949, -17041, -17133, -17224, +-17315, -17406, -17497, -17587, -17678, +-17768, -17858, -17948, -18037, -18127, +-18216, -18305, -18394, -18483, -18571, +-18659, -18747, -18835, -18923, -19010, +-19098, -19185, -19271, -19358, -19444, +-19531, -19617, -19702, -19788, -19873, +-19959, -20043, -20128, -20213, -20297, +-20381, -20465, -20549, -20632, -20715, +-20798, -20881, -20963, -21046, -21128, +-21210, -21291, -21373, -21454, -21535, +-21616, -21696, -21776, -21856, -21936, +-22016, -22095, -22174, -22253, -22331, +-22410, -22488, -22566, -22643, -22721, +-22798, -22875, -22951, -23028, -23104, +-23180, -23256, -23331, -23406, -23481, +-23556, -23630, -23704, -23778, -23852, +-23925, -23998, -24071, -24144, -24216, +-24288, -24360, -24432, -24503, -24574, +-24645, -24716, -24786, -24856, -24926, +-24995, -25064, -25133, -25202, -25270, +-25339, -25406, -25474, -25541, -25608, +-25675, -25742, -25808, -25874, -25939, +-26005, -26070, -26135, -26199, -26264, +-26327, -26391, -26455, -26518, -26581, +-26643, -26705, -26767, -26829, -26891, +-26952, -27013, -27073, -27133, -27193, +-27253, -27312, -27372, -27430, -27489, +-27547, -27605, -27663, -27720, -27777, +-27834, -27890, -27946, -28002, -28058, +-28113, -28168, -28223, -28277, -28331, +-28385, -28438, -28491, -28544, -28596, +-28649, -28701, -28752, -28803, -28854, +-28905, -28955, -29006, -29055, -29105, +-29154, -29203, -29251, -29299, -29347, +-29395, -29442, -29489, -29535, -29582, +-29628, -29673, -29719, -29764, -29808, +-29853, -29897, -29941, -29984, -30027, +-30070, -30112, -30154, -30196, -30238, +-30279, -30320, -30360, -30400, -30440, +-30480, -30519, -30558, -30596, -30635, +-30672, -30710, -30747, -30784, -30821, +-30857, -30893, -30929, -30964, -30999, +-31033, -31068, -31102, -31135, -31168, +-31201, -31234, -31266, -31298, -31330, +-31361, -31392, -31422, -31453, -31483, +-31512, -31541, -31570, -31599, -31627, +-31655, -31682, -31710, -31737, -31763, +-31789, -31815, -31841, -31866, -31891, +-31915, -31939, -31963, -31986, -32010, +-32032, -32055, -32077, -32099, -32120, +-32141, -32162, -32182, -32202, -32222, +-32241, -32260, -32279, -32297, -32315, +-32333, -32350, -32367, -32383, -32399, +-32415, -32431, -32446, -32461, -32475, +-32489, -32503, -32517, -32530, -32542, +-32555, -32567, -32579, -32590, -32601, +-32612, -32622, -32632, -32641, -32651, +-32659, -32668, -32676, -32684, -32692, +-32699, -32706, -32712, -32718, -32724, +-32729, -32734, -32739, -32743, -32747, +-32751, -32754, -32757, -32760, -32762, +-32764, -32765, -32767, -32767, -32767, +32767, 32767, 32765, 32761, 32756, +32750, 32742, 32732, 32722, 32710, +32696, 32681, 32665, 32647, 32628, +32608, 32586, 32562, 32538, 32512, +32484, 32455, 32425, 32393, 32360, +32326, 32290, 32253, 32214, 32174, +32133, 32090, 32046, 32001, 31954, +31906, 31856, 31805, 31753, 31700, +31645, 31588, 31530, 31471, 31411, +31349, 31286, 31222, 31156, 31089, +31020, 30951, 30880, 30807, 30733, +30658, 30582, 30504, 30425, 30345, +30263, 30181, 30096, 30011, 29924, +29836, 29747, 29656, 29564, 
29471, +29377, 29281, 29184, 29086, 28987, +28886, 28784, 28681, 28577, 28471, +28365, 28257, 28147, 28037, 27925, +27812, 27698, 27583, 27467, 27349, +27231, 27111, 26990, 26868, 26744, +26620, 26494, 26367, 26239, 26110, +25980, 25849, 25717, 25583, 25449, +25313, 25176, 25038, 24900, 24760, +24619, 24477, 24333, 24189, 24044, +23898, 23751, 23602, 23453, 23303, +23152, 22999, 22846, 22692, 22537, +22380, 22223, 22065, 21906, 21746, +21585, 21423, 21261, 21097, 20933, +20767, 20601, 20434, 20265, 20096, +19927, 19756, 19584, 19412, 19239, +19065, 18890, 18714, 18538, 18361, +18183, 18004, 17824, 17644, 17463, +17281, 17098, 16915, 16731, 16546, +16361, 16175, 15988, 15800, 15612, +15423, 15234, 15043, 14852, 14661, +14469, 14276, 14083, 13889, 13694, +13499, 13303, 13107, 12910, 12713, +12515, 12317, 12118, 11918, 11718, +11517, 11316, 11115, 10913, 10710, +10508, 10304, 10100, 9896, 9691, +9486, 9281, 9075, 8869, 8662, +8455, 8248, 8040, 7832, 7623, +7415, 7206, 6996, 6787, 6577, +6366, 6156, 5945, 5734, 5523, +5311, 5100, 4888, 4675, 4463, +4251, 4038, 3825, 3612, 3399, +3185, 2972, 2758, 2544, 2330, +2116, 1902, 1688, 1474, 1260, +1045, 831, 617, 402, 188, +-27, -241, -456, -670, -885, +-1099, -1313, -1528, -1742, -1956, +-2170, -2384, -2598, -2811, -3025, +-3239, -3452, -3665, -3878, -4091, +-4304, -4516, -4728, -4941, -5153, +-5364, -5576, -5787, -5998, -6209, +-6419, -6629, -6839, -7049, -7258, +-7467, -7676, -7884, -8092, -8300, +-8507, -8714, -8920, -9127, -9332, +-9538, -9743, -9947, -10151, -10355, +-10558, -10761, -10963, -11165, -11367, +-11568, -11768, -11968, -12167, -12366, +-12565, -12762, -12960, -13156, -13352, +-13548, -13743, -13937, -14131, -14324, +-14517, -14709, -14900, -15091, -15281, +-15470, -15659, -15847, -16035, -16221, +-16407, -16593, -16777, -16961, -17144, +-17326, -17508, -17689, -17869, -18049, +-18227, -18405, -18582, -18758, -18934, +-19108, -19282, -19455, -19627, -19799, +-19969, -20139, -20308, -20475, -20642, +-20809, -20974, -21138, -21301, -21464, +-21626, -21786, -21946, -22105, -22263, +-22420, -22575, -22730, -22884, -23037, +-23189, -23340, -23490, -23640, -23788, +-23935, -24080, -24225, -24369, -24512, +-24654, -24795, -24934, -25073, -25211, +-25347, -25482, -25617, -25750, -25882, +-26013, -26143, -26272, -26399, -26526, +-26651, -26775, -26898, -27020, -27141, +-27260, -27379, -27496, -27612, -27727, +-27841, -27953, -28065, -28175, -28284, +-28391, -28498, -28603, -28707, -28810, +-28911, -29012, -29111, -29209, -29305, +-29401, -29495, -29587, -29679, -29769, +-29858, -29946, -30032, -30118, -30201, +-30284, -30365, -30445, -30524, -30601, +-30677, -30752, -30825, -30897, -30968, +-31038, -31106, -31172, -31238, -31302, +-31365, -31426, -31486, -31545, -31602, +-31658, -31713, -31766, -31818, -31869, +-31918, -31966, -32012, -32058, -32101, +-32144, -32185, -32224, -32262, -32299, +-32335, -32369, -32401, -32433, -32463, +-32491, -32518, -32544, -32568, -32591, +-32613, -32633, -32652, -32669, -32685, +-32700, -32713, -32724, -32735, -32744, +-32751, -32757, -32762, -32766, -32767, +32767, 32764, 32755, 32741, 32720, +32694, 32663, 32626, 32583, 32535, +32481, 32421, 32356, 32286, 32209, +32128, 32041, 31948, 31850, 31747, +31638, 31523, 31403, 31278, 31148, +31012, 30871, 30724, 30572, 30415, +30253, 30086, 29913, 29736, 29553, +29365, 29172, 28974, 28771, 28564, +28351, 28134, 27911, 27684, 27452, +27216, 26975, 26729, 26478, 26223, +25964, 25700, 25432, 25159, 24882, +24601, 24315, 24026, 23732, 23434, +23133, 22827, 22517, 22204, 
21886, +21565, 21240, 20912, 20580, 20244, +19905, 19563, 19217, 18868, 18516, +18160, 17802, 17440, 17075, 16708, +16338, 15964, 15588, 15210, 14829, +14445, 14059, 13670, 13279, 12886, +12490, 12093, 11693, 11291, 10888, +10482, 10075, 9666, 9255, 8843, +8429, 8014, 7597, 7180, 6760, +6340, 5919, 5496, 5073, 4649, +4224, 3798, 3372, 2945, 2517, +2090, 1661, 1233, 804, 375, +-54, -483, -911, -1340, -1768, +-2197, -2624, -3052, -3479, -3905, +-4330, -4755, -5179, -5602, -6024, +-6445, -6865, -7284, -7702, -8118, +-8533, -8946, -9358, -9768, -10177, +-10584, -10989, -11392, -11793, -12192, +-12589, -12984, -13377, -13767, -14155, +-14541, -14924, -15305, -15683, -16058, +-16430, -16800, -17167, -17531, -17892, +-18249, -18604, -18956, -19304, -19649, +-19990, -20329, -20663, -20994, -21322, +-21646, -21966, -22282, -22595, -22904, +-23208, -23509, -23806, -24099, -24387, +-24672, -24952, -25228, -25499, -25766, +-26029, -26288, -26541, -26791, -27035, +-27275, -27511, -27741, -27967, -28188, +-28405, -28616, -28823, -29024, -29221, +-29412, -29599, -29780, -29957, -30128, +-30294, -30455, -30611, -30761, -30906, +-31046, -31181, -31310, -31434, -31552, +-31665, -31773, -31875, -31972, -32063, +-32149, -32229, -32304, -32373, -32437, +-32495, -32547, -32594, -32635, -32671, +-32701, -32726, -32745, -32758, -32766, +32767, 32754, 32717, 32658, 32577, +32473, 32348, 32200, 32029, 31837, +31624, 31388, 31131, 30853, 30553, +30232, 29891, 29530, 29148, 28746, +28324, 27883, 27423, 26944, 26447, +25931, 25398, 24847, 24279, 23695, +23095, 22478, 21846, 21199, 20538, +19863, 19174, 18472, 17757, 17030, +16291, 15541, 14781, 14010, 13230, +12441, 11643, 10837, 10024, 9204, +8377, 7545, 6708, 5866, 5020, +4171, 3319, 2464, 1608, 751, +-107, -965, -1822, -2678, -3532, +-4383, -5232, -6077, -6918, -7754, +-8585, -9409, -10228, -11039, -11843, +-12639, -13426, -14204, -14972, -15730, +-16477, -17213, -17937, -18648, -19347, +-20033, -20705, -21363, -22006, -22634, +-23246, -23843, -24423, -24986, -25533, +-26062, -26573, -27066, -27540, -27995, +-28431, -28848, -29245, -29622, -29979, +-30315, -30630, -30924, -31197, -31449, +-31679, -31887, -32074, -32239, -32381, +-32501, -32600, -32675, -32729, -32759, +}; +#endif + +static const CELTMode mode48000_960_120 = { +48000, /* Fs */ +120, /* overlap */ +21, /* nbEBands */ +21, /* effEBands */ +{27853, 0, 4096, 8192, }, /* preemph */ +eband5ms, /* eBands */ +3, /* maxLM */ +8, /* nbShortMdcts */ +120, /* shortMdctSize */ +11, /* nbAllocVectors */ +band_allocation, /* allocVectors */ +logN400, /* logN */ +window120, /* window */ +{1920, 3, {&fft_state48000_960_0, &fft_state48000_960_1, &fft_state48000_960_2, &fft_state48000_960_3, }, mdct_twiddles960}, /* mdct */ +{392, cache_index50, cache_bits50, cache_caps50}, /* cache */ +}; + +/* List of all the available modes */ +#define TOTAL_MODES 1 +static const CELTMode * const static_mode_list[TOTAL_MODES] = { +&mode48000_960_120, +}; diff --git a/thirdparty/opus/celt/static_modes_fixed_arm_ne10.h b/thirdparty/opus/celt/static_modes_fixed_arm_ne10.h new file mode 100644 index 000000000000..b8ef0cee9838 --- /dev/null +++ b/thirdparty/opus/celt/static_modes_fixed_arm_ne10.h @@ -0,0 +1,388 @@ +/* The contents of this file was automatically generated by + * dump_mode_arm_ne10.c with arguments: 48000 960 + * It contains static definitions for some pre-defined modes. 
*/ +#include <NE10_init.h> + +#ifndef NE10_FFT_PARAMS48000_960 +#define NE10_FFT_PARAMS48000_960 +static const ne10_int32_t ne10_factors_480[64] = { +4, 40, 4, 30, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, }; +static const ne10_int32_t ne10_factors_240[64] = { +3, 20, 4, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, }; +static const ne10_int32_t ne10_factors_120[64] = { +3, 10, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, }; +static const ne10_int32_t ne10_factors_60[64] = { +2, 5, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, }; +static const ne10_fft_cpx_int32_t ne10_twiddles_480[480] = { +{0,0}, {2147483647,0}, {2147483647,0}, +{2147483647,0}, {1961823921,-873460313}, {1436946998,-1595891394}, +{2147483647,0}, {1436946998,-1595891394}, {-224473265,-2135719496}, +{2147483647,0}, {663608871,-2042378339}, {-1737350854,-1262259096}, +{2147483647,0}, {-224473265,-2135719496}, {-2100555935,446487152}, +{2147483647,0}, {2100555974,-446486968}, {1961823921,-873460313}, +{1737350743,-1262259248}, {1436946998,-1595891394}, {1073741769,-1859775424}, +{663608871,-2042378339}, {224473078,-2135719516}, {-224473265,-2135719496}, +{-663609049,-2042378281}, {-1073741932,-1859775330}, {-1436947137,-1595891268}, +{-1737350854,-1262259096}, {-1961823997,-873460141}, {-2100556013,-446486785}, +{2147483647,0}, {2144540595,-112390613}, {2135719506,-224473172}, +{2121044558,-335940465}, {2100555974,-446486968}, {2074309912,-555809682}, +{2042378310,-663608960}, {2004848691,-769589332}, {1961823921,-873460313}, +{1913421927,-974937199}, {1859775377,-1073741851}, {1801031311,-1169603450}, +{1737350743,-1262259248}, {1668908218,-1351455280}, {1595891331,-1436947067}, +{1518500216,-1518500282}, {1436946998,-1595891394}, {1351455207,-1668908277}, +{1262259172,-1737350799}, {1169603371,-1801031362}, {1073741769,-1859775424}, +{974937230,-1913421912}, {873460227,-1961823959}, {769589125,-2004848771}, +{663608871,-2042378339}, {555809715,-2074309903}, {446486876,-2100555994}, +{335940246,-2121044593}, {224473078,-2135719516}, {112390647,-2144540593}, +{2147483647,0}, {2135719506,-224473172}, {2100555974,-446486968}, +{2042378310,-663608960}, {1961823921,-873460313}, {1859775377,-1073741851}, +{1737350743,-1262259248}, {1595891331,-1436947067}, {1436946998,-1595891394}, +{1262259172,-1737350799}, {1073741769,-1859775424}, {873460227,-1961823959}, +{663608871,-2042378339}, {446486876,-2100555994}, {224473078,-2135719516}, +{-94,-2147483647}, {-224473265,-2135719496}, {-446487060,-2100555955}, +{-663609049,-2042378281}, {-873460398,-1961823883}, {-1073741932,-1859775330}, +{-1262259116,-1737350839}, {-1436947137,-1595891268}, {-1595891628,-1436946738}, +{-1737350854,-1262259096}, {-1859775343,-1073741910}, {-1961823997,-873460141}, +{-2042378447,-663608538}, {-2100556013,-446486785}, {-2135719499,-224473240}, +{2147483647,0}, {2121044558,-335940465}, {2042378310,-663608960}, +{1913421927,-974937199}, {1737350743,-1262259248}, {1518500216,-1518500282}, 
+{1262259172,-1737350799}, {974937230,-1913421912}, {663608871,-2042378339}, +{335940246,-2121044593}, {-94,-2147483647}, {-335940431,-2121044564}, +{-663609049,-2042378281}, {-974937397,-1913421827}, {-1262259116,-1737350839}, +{-1518500258,-1518500240}, {-1737350854,-1262259096}, {-1913422071,-974936918}, +{-2042378447,-663608538}, {-2121044568,-335940406}, {-2147483647,188}, +{-2121044509,335940777}, {-2042378331,663608895}, {-1913421900,974937252}, +{-1737350633,1262259400}, {-1518499993,1518500506}, {-1262258813,1737351059}, +{-974936606,1913422229}, {-663609179,2042378239}, {-335940566,2121044542}, +{2147483647,0}, {2147299667,-28109693}, {2146747758,-56214570}, +{2145828015,-84309815}, {2144540595,-112390613}, {2142885719,-140452154}, +{2140863671,-168489630}, {2138474797,-196498235}, {2135719506,-224473172}, +{2132598271,-252409646}, {2129111626,-280302871}, {2125260168,-308148068}, +{2121044558,-335940465}, {2116465518,-363675300}, {2111523833,-391347822}, +{2106220349,-418953288}, {2100555974,-446486968}, {2094531681,-473944146}, +{2088148500,-501320115}, {2081407525,-528610186}, {2074309912,-555809682}, +{2066856885,-582913912}, {2059049696,-609918325}, {2050889698,-636818231}, +{2042378310,-663608960}, {2033516972,-690285983}, {2024307180,-716844791}, +{2014750533,-743280770}, {2004848691,-769589332}, {1994603329,-795766029}, +{1984016179,-821806435}, {1973089077,-847706028}, {1961823921,-873460313}, +{1950222618,-899064934}, {1938287127,-924515564}, {1926019520,-949807783}, +{1913421927,-974937199}, {1900496481,-999899565}, {1887245364,-1024690661}, +{1873670877,-1049306180}, {1859775377,-1073741851}, {1845561215,-1097993541}, +{1831030826,-1122057097}, {1816186632,-1145928502}, {1801031311,-1169603450}, +{1785567394,-1193077993}, {1769797456,-1216348214}, {1753724345,-1239409914}, +{1737350743,-1262259248}, {1720679456,-1284892300}, {1703713340,-1307305194}, +{1686455222,-1329494189}, {1668908218,-1351455280}, {1651075255,-1373184807}, +{1632959307,-1394679144}, {1614563642,-1415934412}, {1595891331,-1436947067}, +{1576945572,-1457713510}, {1557729613,-1478230181}, {1538246655,-1498493658}, +{1518500216,-1518500282}, {1498493590,-1538246721}, {1478230113,-1557729677}, +{1457713441,-1576945636}, {1436946998,-1595891394}, {1415934341,-1614563704}, +{1394679073,-1632959368}, {1373184735,-1651075315}, {1351455207,-1668908277}, +{1329494115,-1686455280}, {1307305120,-1703713397}, {1284892225,-1720679512}, +{1262259172,-1737350799}, {1239409837,-1753724400}, {1216348136,-1769797510}, +{1193077915,-1785567446}, {1169603371,-1801031362}, {1145928423,-1816186682}, +{1122057017,-1831030875}, {1097993571,-1845561197}, {1073741769,-1859775424}, +{1049305987,-1873670985}, {1024690635,-1887245378}, {999899482,-1900496524}, +{974937230,-1913421912}, {949807699,-1926019561}, {924515422,-1938287195}, +{899064965,-1950222603}, {873460227,-1961823959}, {847705824,-1973089164}, +{821806407,-1984016190}, {795765941,-1994603364}, {769589125,-2004848771}, +{743280682,-2014750566}, {716844642,-2024307233}, {690286016,-2033516961}, +{663608871,-2042378339}, {636818019,-2050889764}, {609918296,-2059049705}, +{582913822,-2066856911}, {555809715,-2074309903}, {528610126,-2081407540}, +{501319962,-2088148536}, {473944148,-2094531680}, {446486876,-2100555994}, +{418953102,-2106220386}, {391347792,-2111523838}, {363675176,-2116465540}, +{335940246,-2121044593}, {308148006,-2125260177}, {280302715,-2129111646}, +{252409648,-2132598271}, {224473078,-2135719516}, {196498046,-2138474814}, 
+{168489600,-2140863674}, {140452029,-2142885728}, {112390647,-2144540593}, +{84309753,-2145828017}, {56214412,-2146747762}, {28109695,-2147299667}, +{2147483647,0}, {2146747758,-56214570}, {2144540595,-112390613}, +{2140863671,-168489630}, {2135719506,-224473172}, {2129111626,-280302871}, +{2121044558,-335940465}, {2111523833,-391347822}, {2100555974,-446486968}, +{2088148500,-501320115}, {2074309912,-555809682}, {2059049696,-609918325}, +{2042378310,-663608960}, {2024307180,-716844791}, {2004848691,-769589332}, +{1984016179,-821806435}, {1961823921,-873460313}, {1938287127,-924515564}, +{1913421927,-974937199}, {1887245364,-1024690661}, {1859775377,-1073741851}, +{1831030826,-1122057097}, {1801031311,-1169603450}, {1769797456,-1216348214}, +{1737350743,-1262259248}, {1703713340,-1307305194}, {1668908218,-1351455280}, +{1632959307,-1394679144}, {1595891331,-1436947067}, {1557729613,-1478230181}, +{1518500216,-1518500282}, {1478230113,-1557729677}, {1436946998,-1595891394}, +{1394679073,-1632959368}, {1351455207,-1668908277}, {1307305120,-1703713397}, +{1262259172,-1737350799}, {1216348136,-1769797510}, {1169603371,-1801031362}, +{1122057017,-1831030875}, {1073741769,-1859775424}, {1024690635,-1887245378}, +{974937230,-1913421912}, {924515422,-1938287195}, {873460227,-1961823959}, +{821806407,-1984016190}, {769589125,-2004848771}, {716844642,-2024307233}, +{663608871,-2042378339}, {609918296,-2059049705}, {555809715,-2074309903}, +{501319962,-2088148536}, {446486876,-2100555994}, {391347792,-2111523838}, +{335940246,-2121044593}, {280302715,-2129111646}, {224473078,-2135719516}, +{168489600,-2140863674}, {112390647,-2144540593}, {56214412,-2146747762}, +{-94,-2147483647}, {-56214600,-2146747757}, {-112390835,-2144540584}, +{-168489787,-2140863659}, {-224473265,-2135719496}, {-280302901,-2129111622}, +{-335940431,-2121044564}, {-391347977,-2111523804}, {-446487060,-2100555955}, +{-501320144,-2088148493}, {-555809896,-2074309855}, {-609918476,-2059049651}, +{-663609049,-2042378281}, {-716844819,-2024307170}, {-769589300,-2004848703}, +{-821806581,-1984016118}, {-873460398,-1961823883}, {-924515591,-1938287114}, +{-974937397,-1913421827}, {-1024690575,-1887245411}, {-1073741932,-1859775330}, +{-1122057395,-1831030643}, {-1169603421,-1801031330}, {-1216348291,-1769797403}, +{-1262259116,-1737350839}, {-1307305268,-1703713283}, {-1351455453,-1668908078}, +{-1394679021,-1632959413}, {-1436947137,-1595891268}, {-1478230435,-1557729372}, +{-1518500258,-1518500240}, {-1557729742,-1478230045}, {-1595891628,-1436946738}, +{-1632959429,-1394679001}, {-1668908417,-1351455035}, {-1703713298,-1307305248}, +{-1737350854,-1262259096}, {-1769797708,-1216347848}, {-1801031344,-1169603400}, +{-1831030924,-1122056937}, {-1859775343,-1073741910}, {-1887245423,-1024690552}, +{-1913422071,-974936918}, {-1938287125,-924515568}, {-1961823997,-873460141}, +{-1984016324,-821806084}, {-2004848713,-769589276}, {-2024307264,-716844553}, +{-2042378447,-663608538}, {-2059049731,-609918206}, {-2074309994,-555809377}, +{-2088148499,-501320119}, {-2100556013,-446486785}, {-2111523902,-391347448}, +{-2121044568,-335940406}, {-2129111659,-280302621}, {-2135719499,-224473240}, +{-2140863681,-168489506}, {-2144540612,-112390298}, {-2146747758,-56214574}, +{2147483647,0}, {2145828015,-84309815}, {2140863671,-168489630}, +{2132598271,-252409646}, {2121044558,-335940465}, {2106220349,-418953288}, +{2088148500,-501320115}, {2066856885,-582913912}, {2042378310,-663608960}, +{2014750533,-743280770}, {1984016179,-821806435}, 
{1950222618,-899064934}, +{1913421927,-974937199}, {1873670877,-1049306180}, {1831030826,-1122057097}, +{1785567394,-1193077993}, {1737350743,-1262259248}, {1686455222,-1329494189}, +{1632959307,-1394679144}, {1576945572,-1457713510}, {1518500216,-1518500282}, +{1457713441,-1576945636}, {1394679073,-1632959368}, {1329494115,-1686455280}, +{1262259172,-1737350799}, {1193077915,-1785567446}, {1122057017,-1831030875}, +{1049305987,-1873670985}, {974937230,-1913421912}, {899064965,-1950222603}, +{821806407,-1984016190}, {743280682,-2014750566}, {663608871,-2042378339}, +{582913822,-2066856911}, {501319962,-2088148536}, {418953102,-2106220386}, +{335940246,-2121044593}, {252409648,-2132598271}, {168489600,-2140863674}, +{84309753,-2145828017}, {-94,-2147483647}, {-84309940,-2145828010}, +{-168489787,-2140863659}, {-252409834,-2132598249}, {-335940431,-2121044564}, +{-418953286,-2106220349}, {-501320144,-2088148493}, {-582914003,-2066856860}, +{-663609049,-2042378281}, {-743280858,-2014750501}, {-821806581,-1984016118}, +{-899065136,-1950222525}, {-974937397,-1913421827}, {-1049306374,-1873670768}, +{-1122057395,-1831030643}, {-1193078284,-1785567199}, {-1262259116,-1737350839}, +{-1329494061,-1686455323}, {-1394679021,-1632959413}, {-1457713485,-1576945595}, +{-1518500258,-1518500240}, {-1576945613,-1457713466}, {-1632959429,-1394679001}, +{-1686455338,-1329494041}, {-1737350854,-1262259096}, {-1785567498,-1193077837}, +{-1831030924,-1122056937}, {-1873671031,-1049305905}, {-1913422071,-974936918}, +{-1950222750,-899064648}, {-1984016324,-821806084}, {-2014750687,-743280354}, +{-2042378447,-663608538}, {-2066856867,-582913978}, {-2088148499,-501320119}, +{-2106220354,-418953261}, {-2121044568,-335940406}, {-2132598282,-252409555}, +{-2140863681,-168489506}, {-2145828021,-84309659}, {-2147483647,188}, +{-2145828006,84310034}, {-2140863651,168489881}, {-2132598237,252409928}, +{-2121044509,335940777}, {-2106220281,418953629}, {-2088148411,501320484}, +{-2066856765,582914339}, {-2042378331,663608895}, {-2014750557,743280706}, +{-1984016181,821806431}, {-1950222593,899064989}, {-1913421900,974937252}, +{-1873670848,1049306232}, {-1831030728,1122057257}, {-1785567289,1193078149}, +{-1737350633,1262259400}, {-1686455106,1329494336}, {-1632959185,1394679287}, +{-1576945358,1457713742}, {-1518499993,1518500506}, {-1457713209,1576945850}, +{-1394678735,1632959656}, {-1329493766,1686455555}, {-1262258813,1737351059}, +{-1193077546,1785567692}, {-1122056638,1831031107}, {-1049305599,1873671202}, +{-974936606,1913422229}, {-899064330,1950222896}, {-821805761,1984016458}, +{-743280025,2014750808}, {-663609179,2042378239}, {-582914134,2066856823}, +{-501320277,2088148461}, {-418953420,2106220322}, {-335940566,2121044542}, +{-252409716,2132598263}, {-168489668,2140863668}, {-84309821,2145828015}, +}; +static const ne10_fft_cpx_int32_t ne10_twiddles_240[240] = { +{0,0}, {2147483647,0}, {2147483647,0}, +{2147483647,0}, {1961823921,-873460313}, {1436946998,-1595891394}, +{2147483647,0}, {1436946998,-1595891394}, {-224473265,-2135719496}, +{2147483647,0}, {663608871,-2042378339}, {-1737350854,-1262259096}, +{2147483647,0}, {-224473265,-2135719496}, {-2100555935,446487152}, +{2147483647,0}, {2135719506,-224473172}, {2100555974,-446486968}, +{2042378310,-663608960}, {1961823921,-873460313}, {1859775377,-1073741851}, +{1737350743,-1262259248}, {1595891331,-1436947067}, {1436946998,-1595891394}, +{1262259172,-1737350799}, {1073741769,-1859775424}, {873460227,-1961823959}, +{663608871,-2042378339}, 
{446486876,-2100555994}, {224473078,-2135719516}, +{2147483647,0}, {2100555974,-446486968}, {1961823921,-873460313}, +{1737350743,-1262259248}, {1436946998,-1595891394}, {1073741769,-1859775424}, +{663608871,-2042378339}, {224473078,-2135719516}, {-224473265,-2135719496}, +{-663609049,-2042378281}, {-1073741932,-1859775330}, {-1436947137,-1595891268}, +{-1737350854,-1262259096}, {-1961823997,-873460141}, {-2100556013,-446486785}, +{2147483647,0}, {2042378310,-663608960}, {1737350743,-1262259248}, +{1262259172,-1737350799}, {663608871,-2042378339}, {-94,-2147483647}, +{-663609049,-2042378281}, {-1262259116,-1737350839}, {-1737350854,-1262259096}, +{-2042378447,-663608538}, {-2147483647,188}, {-2042378331,663608895}, +{-1737350633,1262259400}, {-1262258813,1737351059}, {-663609179,2042378239}, +{2147483647,0}, {2146747758,-56214570}, {2144540595,-112390613}, +{2140863671,-168489630}, {2135719506,-224473172}, {2129111626,-280302871}, +{2121044558,-335940465}, {2111523833,-391347822}, {2100555974,-446486968}, +{2088148500,-501320115}, {2074309912,-555809682}, {2059049696,-609918325}, +{2042378310,-663608960}, {2024307180,-716844791}, {2004848691,-769589332}, +{1984016179,-821806435}, {1961823921,-873460313}, {1938287127,-924515564}, +{1913421927,-974937199}, {1887245364,-1024690661}, {1859775377,-1073741851}, +{1831030826,-1122057097}, {1801031311,-1169603450}, {1769797456,-1216348214}, +{1737350743,-1262259248}, {1703713340,-1307305194}, {1668908218,-1351455280}, +{1632959307,-1394679144}, {1595891331,-1436947067}, {1557729613,-1478230181}, +{1518500216,-1518500282}, {1478230113,-1557729677}, {1436946998,-1595891394}, +{1394679073,-1632959368}, {1351455207,-1668908277}, {1307305120,-1703713397}, +{1262259172,-1737350799}, {1216348136,-1769797510}, {1169603371,-1801031362}, +{1122057017,-1831030875}, {1073741769,-1859775424}, {1024690635,-1887245378}, +{974937230,-1913421912}, {924515422,-1938287195}, {873460227,-1961823959}, +{821806407,-1984016190}, {769589125,-2004848771}, {716844642,-2024307233}, +{663608871,-2042378339}, {609918296,-2059049705}, {555809715,-2074309903}, +{501319962,-2088148536}, {446486876,-2100555994}, {391347792,-2111523838}, +{335940246,-2121044593}, {280302715,-2129111646}, {224473078,-2135719516}, +{168489600,-2140863674}, {112390647,-2144540593}, {56214412,-2146747762}, +{2147483647,0}, {2144540595,-112390613}, {2135719506,-224473172}, +{2121044558,-335940465}, {2100555974,-446486968}, {2074309912,-555809682}, +{2042378310,-663608960}, {2004848691,-769589332}, {1961823921,-873460313}, +{1913421927,-974937199}, {1859775377,-1073741851}, {1801031311,-1169603450}, +{1737350743,-1262259248}, {1668908218,-1351455280}, {1595891331,-1436947067}, +{1518500216,-1518500282}, {1436946998,-1595891394}, {1351455207,-1668908277}, +{1262259172,-1737350799}, {1169603371,-1801031362}, {1073741769,-1859775424}, +{974937230,-1913421912}, {873460227,-1961823959}, {769589125,-2004848771}, +{663608871,-2042378339}, {555809715,-2074309903}, {446486876,-2100555994}, +{335940246,-2121044593}, {224473078,-2135719516}, {112390647,-2144540593}, +{-94,-2147483647}, {-112390835,-2144540584}, {-224473265,-2135719496}, +{-335940431,-2121044564}, {-446487060,-2100555955}, {-555809896,-2074309855}, +{-663609049,-2042378281}, {-769589300,-2004848703}, {-873460398,-1961823883}, +{-974937397,-1913421827}, {-1073741932,-1859775330}, {-1169603421,-1801031330}, +{-1262259116,-1737350839}, {-1351455453,-1668908078}, {-1436947137,-1595891268}, +{-1518500258,-1518500240}, {-1595891628,-1436946738}, 
{-1668908417,-1351455035}, +{-1737350854,-1262259096}, {-1801031344,-1169603400}, {-1859775343,-1073741910}, +{-1913422071,-974936918}, {-1961823997,-873460141}, {-2004848713,-769589276}, +{-2042378447,-663608538}, {-2074309994,-555809377}, {-2100556013,-446486785}, +{-2121044568,-335940406}, {-2135719499,-224473240}, {-2144540612,-112390298}, +{2147483647,0}, {2140863671,-168489630}, {2121044558,-335940465}, +{2088148500,-501320115}, {2042378310,-663608960}, {1984016179,-821806435}, +{1913421927,-974937199}, {1831030826,-1122057097}, {1737350743,-1262259248}, +{1632959307,-1394679144}, {1518500216,-1518500282}, {1394679073,-1632959368}, +{1262259172,-1737350799}, {1122057017,-1831030875}, {974937230,-1913421912}, +{821806407,-1984016190}, {663608871,-2042378339}, {501319962,-2088148536}, +{335940246,-2121044593}, {168489600,-2140863674}, {-94,-2147483647}, +{-168489787,-2140863659}, {-335940431,-2121044564}, {-501320144,-2088148493}, +{-663609049,-2042378281}, {-821806581,-1984016118}, {-974937397,-1913421827}, +{-1122057395,-1831030643}, {-1262259116,-1737350839}, {-1394679021,-1632959413}, +{-1518500258,-1518500240}, {-1632959429,-1394679001}, {-1737350854,-1262259096}, +{-1831030924,-1122056937}, {-1913422071,-974936918}, {-1984016324,-821806084}, +{-2042378447,-663608538}, {-2088148499,-501320119}, {-2121044568,-335940406}, +{-2140863681,-168489506}, {-2147483647,188}, {-2140863651,168489881}, +{-2121044509,335940777}, {-2088148411,501320484}, {-2042378331,663608895}, +{-1984016181,821806431}, {-1913421900,974937252}, {-1831030728,1122057257}, +{-1737350633,1262259400}, {-1632959185,1394679287}, {-1518499993,1518500506}, +{-1394678735,1632959656}, {-1262258813,1737351059}, {-1122056638,1831031107}, +{-974936606,1913422229}, {-821805761,1984016458}, {-663609179,2042378239}, +{-501320277,2088148461}, {-335940566,2121044542}, {-168489668,2140863668}, +}; +static const ne10_fft_cpx_int32_t ne10_twiddles_120[120] = { +{0,0}, {2147483647,0}, {2147483647,0}, +{2147483647,0}, {1961823921,-873460313}, {1436946998,-1595891394}, +{2147483647,0}, {1436946998,-1595891394}, {-224473265,-2135719496}, +{2147483647,0}, {663608871,-2042378339}, {-1737350854,-1262259096}, +{2147483647,0}, {-224473265,-2135719496}, {-2100555935,446487152}, +{2147483647,0}, {2100555974,-446486968}, {1961823921,-873460313}, +{1737350743,-1262259248}, {1436946998,-1595891394}, {1073741769,-1859775424}, +{663608871,-2042378339}, {224473078,-2135719516}, {-224473265,-2135719496}, +{-663609049,-2042378281}, {-1073741932,-1859775330}, {-1436947137,-1595891268}, +{-1737350854,-1262259096}, {-1961823997,-873460141}, {-2100556013,-446486785}, +{2147483647,0}, {2144540595,-112390613}, {2135719506,-224473172}, +{2121044558,-335940465}, {2100555974,-446486968}, {2074309912,-555809682}, +{2042378310,-663608960}, {2004848691,-769589332}, {1961823921,-873460313}, +{1913421927,-974937199}, {1859775377,-1073741851}, {1801031311,-1169603450}, +{1737350743,-1262259248}, {1668908218,-1351455280}, {1595891331,-1436947067}, +{1518500216,-1518500282}, {1436946998,-1595891394}, {1351455207,-1668908277}, +{1262259172,-1737350799}, {1169603371,-1801031362}, {1073741769,-1859775424}, +{974937230,-1913421912}, {873460227,-1961823959}, {769589125,-2004848771}, +{663608871,-2042378339}, {555809715,-2074309903}, {446486876,-2100555994}, +{335940246,-2121044593}, {224473078,-2135719516}, {112390647,-2144540593}, +{2147483647,0}, {2135719506,-224473172}, {2100555974,-446486968}, +{2042378310,-663608960}, {1961823921,-873460313}, {1859775377,-1073741851}, 
+{1737350743,-1262259248}, {1595891331,-1436947067}, {1436946998,-1595891394}, +{1262259172,-1737350799}, {1073741769,-1859775424}, {873460227,-1961823959}, +{663608871,-2042378339}, {446486876,-2100555994}, {224473078,-2135719516}, +{-94,-2147483647}, {-224473265,-2135719496}, {-446487060,-2100555955}, +{-663609049,-2042378281}, {-873460398,-1961823883}, {-1073741932,-1859775330}, +{-1262259116,-1737350839}, {-1436947137,-1595891268}, {-1595891628,-1436946738}, +{-1737350854,-1262259096}, {-1859775343,-1073741910}, {-1961823997,-873460141}, +{-2042378447,-663608538}, {-2100556013,-446486785}, {-2135719499,-224473240}, +{2147483647,0}, {2121044558,-335940465}, {2042378310,-663608960}, +{1913421927,-974937199}, {1737350743,-1262259248}, {1518500216,-1518500282}, +{1262259172,-1737350799}, {974937230,-1913421912}, {663608871,-2042378339}, +{335940246,-2121044593}, {-94,-2147483647}, {-335940431,-2121044564}, +{-663609049,-2042378281}, {-974937397,-1913421827}, {-1262259116,-1737350839}, +{-1518500258,-1518500240}, {-1737350854,-1262259096}, {-1913422071,-974936918}, +{-2042378447,-663608538}, {-2121044568,-335940406}, {-2147483647,188}, +{-2121044509,335940777}, {-2042378331,663608895}, {-1913421900,974937252}, +{-1737350633,1262259400}, {-1518499993,1518500506}, {-1262258813,1737351059}, +{-974936606,1913422229}, {-663609179,2042378239}, {-335940566,2121044542}, +}; +static const ne10_fft_cpx_int32_t ne10_twiddles_60[60] = { +{0,0}, {2147483647,0}, {2147483647,0}, +{2147483647,0}, {1961823921,-873460313}, {1436946998,-1595891394}, +{2147483647,0}, {1436946998,-1595891394}, {-224473265,-2135719496}, +{2147483647,0}, {663608871,-2042378339}, {-1737350854,-1262259096}, +{2147483647,0}, {-224473265,-2135719496}, {-2100555935,446487152}, +{2147483647,0}, {2135719506,-224473172}, {2100555974,-446486968}, +{2042378310,-663608960}, {1961823921,-873460313}, {1859775377,-1073741851}, +{1737350743,-1262259248}, {1595891331,-1436947067}, {1436946998,-1595891394}, +{1262259172,-1737350799}, {1073741769,-1859775424}, {873460227,-1961823959}, +{663608871,-2042378339}, {446486876,-2100555994}, {224473078,-2135719516}, +{2147483647,0}, {2100555974,-446486968}, {1961823921,-873460313}, +{1737350743,-1262259248}, {1436946998,-1595891394}, {1073741769,-1859775424}, +{663608871,-2042378339}, {224473078,-2135719516}, {-224473265,-2135719496}, +{-663609049,-2042378281}, {-1073741932,-1859775330}, {-1436947137,-1595891268}, +{-1737350854,-1262259096}, {-1961823997,-873460141}, {-2100556013,-446486785}, +{2147483647,0}, {2042378310,-663608960}, {1737350743,-1262259248}, +{1262259172,-1737350799}, {663608871,-2042378339}, {-94,-2147483647}, +{-663609049,-2042378281}, {-1262259116,-1737350839}, {-1737350854,-1262259096}, +{-2042378447,-663608538}, {-2147483647,188}, {-2042378331,663608895}, +{-1737350633,1262259400}, {-1262258813,1737351059}, {-663609179,2042378239}, +}; +static const ne10_fft_state_int32_t ne10_fft_state_int32_t_480 = { +120, +(ne10_int32_t *)ne10_factors_480, +(ne10_fft_cpx_int32_t *)ne10_twiddles_480, +NULL, +(ne10_fft_cpx_int32_t *)&ne10_twiddles_480[120], +}; +static const arch_fft_state cfg_arch_480 = { +1, +(void *)&ne10_fft_state_int32_t_480, +}; + +static const ne10_fft_state_int32_t ne10_fft_state_int32_t_240 = { +60, +(ne10_int32_t *)ne10_factors_240, +(ne10_fft_cpx_int32_t *)ne10_twiddles_240, +NULL, +(ne10_fft_cpx_int32_t *)&ne10_twiddles_240[60], +}; +static const arch_fft_state cfg_arch_240 = { +1, +(void *)&ne10_fft_state_int32_t_240, +}; + +static const ne10_fft_state_int32_t 
ne10_fft_state_int32_t_120 = { +30, +(ne10_int32_t *)ne10_factors_120, +(ne10_fft_cpx_int32_t *)ne10_twiddles_120, +NULL, +(ne10_fft_cpx_int32_t *)&ne10_twiddles_120[30], +}; +static const arch_fft_state cfg_arch_120 = { +1, +(void *)&ne10_fft_state_int32_t_120, +}; + +static const ne10_fft_state_int32_t ne10_fft_state_int32_t_60 = { +15, +(ne10_int32_t *)ne10_factors_60, +(ne10_fft_cpx_int32_t *)ne10_twiddles_60, +NULL, +(ne10_fft_cpx_int32_t *)&ne10_twiddles_60[15], +}; +static const arch_fft_state cfg_arch_60 = { +1, +(void *)&ne10_fft_state_int32_t_60, +}; + +#endif /* end NE10_FFT_PARAMS48000_960 */ diff --git a/thirdparty/opus/celt/static_modes_float.h b/thirdparty/opus/celt/static_modes_float.h new file mode 100644 index 000000000000..e102a3839184 --- /dev/null +++ b/thirdparty/opus/celt/static_modes_float.h @@ -0,0 +1,888 @@ +/* The contents of this file was automatically generated by dump_modes.c + with arguments: 48000 960 + It contains static definitions for some pre-defined modes. */ +#include "modes.h" +#include "rate.h" + +#ifdef HAVE_ARM_NE10 +#define OVERRIDE_FFT 1 +#include "static_modes_float_arm_ne10.h" +#endif + +#ifndef DEF_WINDOW120 +#define DEF_WINDOW120 +static const opus_val16 window120[120] = { +6.7286966e-05f, 0.00060551348f, 0.0016815970f, 0.0032947962f, 0.0054439943f, +0.0081276923f, 0.011344001f, 0.015090633f, 0.019364886f, 0.024163635f, +0.029483315f, 0.035319905f, 0.041668911f, 0.048525347f, 0.055883718f, +0.063737999f, 0.072081616f, 0.080907428f, 0.090207705f, 0.099974111f, +0.11019769f, 0.12086883f, 0.13197729f, 0.14351214f, 0.15546177f, +0.16781389f, 0.18055550f, 0.19367290f, 0.20715171f, 0.22097682f, +0.23513243f, 0.24960208f, 0.26436860f, 0.27941419f, 0.29472040f, +0.31026818f, 0.32603788f, 0.34200931f, 0.35816177f, 0.37447407f, +0.39092462f, 0.40749142f, 0.42415215f, 0.44088423f, 0.45766484f, +0.47447104f, 0.49127978f, 0.50806798f, 0.52481261f, 0.54149077f, +0.55807973f, 0.57455701f, 0.59090049f, 0.60708841f, 0.62309951f, +0.63891306f, 0.65450896f, 0.66986776f, 0.68497077f, 0.69980010f, +0.71433873f, 0.72857055f, 0.74248043f, 0.75605424f, 0.76927895f, +0.78214257f, 0.79463430f, 0.80674445f, 0.81846456f, 0.82978733f, +0.84070669f, 0.85121779f, 0.86131698f, 0.87100183f, 0.88027111f, +0.88912479f, 0.89756398f, 0.90559094f, 0.91320904f, 0.92042270f, +0.92723738f, 0.93365955f, 0.93969656f, 0.94535671f, 0.95064907f, +0.95558353f, 0.96017067f, 0.96442171f, 0.96834849f, 0.97196334f, +0.97527906f, 0.97830883f, 0.98106616f, 0.98356480f, 0.98581869f, +0.98784191f, 0.98964856f, 0.99125274f, 0.99266849f, 0.99390969f, +0.99499004f, 0.99592297f, 0.99672162f, 0.99739874f, 0.99796667f, +0.99843728f, 0.99882195f, 0.99913147f, 0.99937606f, 0.99956527f, +0.99970802f, 0.99981248f, 0.99988613f, 0.99993565f, 0.99996697f, +0.99998518f, 0.99999457f, 0.99999859f, 0.99999982f, 1.0000000f, +}; +#endif + +#ifndef DEF_LOGN400 +#define DEF_LOGN400 +static const opus_int16 logN400[21] = { +0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 16, 16, 16, 21, 21, 24, 29, 34, 36, }; +#endif + +#ifndef DEF_PULSE_CACHE50 +#define DEF_PULSE_CACHE50 +static const opus_int16 cache_index50[105] = { +-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 41, 41, 41, +82, 82, 123, 164, 200, 222, 0, 0, 0, 0, 0, 0, 0, 0, 41, +41, 41, 41, 123, 123, 123, 164, 164, 240, 266, 283, 295, 41, 41, 41, +41, 41, 41, 41, 41, 123, 123, 123, 123, 240, 240, 240, 266, 266, 305, +318, 328, 336, 123, 123, 123, 123, 123, 123, 123, 123, 240, 240, 240, 240, +305, 305, 305, 318, 318, 343, 351, 358, 364, 240, 240, 240, 240, 240, 240, +240, 240, 
305, 305, 305, 305, 343, 343, 343, 351, 351, 370, 376, 382, 387, +}; +static const unsigned char cache_bits50[392] = { +40, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, +7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 40, 15, 23, 28, +31, 34, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 47, 49, 50, +51, 52, 53, 54, 55, 55, 57, 58, 59, 60, 61, 62, 63, 63, 65, +66, 67, 68, 69, 70, 71, 71, 40, 20, 33, 41, 48, 53, 57, 61, +64, 66, 69, 71, 73, 75, 76, 78, 80, 82, 85, 87, 89, 91, 92, +94, 96, 98, 101, 103, 105, 107, 108, 110, 112, 114, 117, 119, 121, 123, +124, 126, 128, 40, 23, 39, 51, 60, 67, 73, 79, 83, 87, 91, 94, +97, 100, 102, 105, 107, 111, 115, 118, 121, 124, 126, 129, 131, 135, 139, +142, 145, 148, 150, 153, 155, 159, 163, 166, 169, 172, 174, 177, 179, 35, +28, 49, 65, 78, 89, 99, 107, 114, 120, 126, 132, 136, 141, 145, 149, +153, 159, 165, 171, 176, 180, 185, 189, 192, 199, 205, 211, 216, 220, 225, +229, 232, 239, 245, 251, 21, 33, 58, 79, 97, 112, 125, 137, 148, 157, +166, 174, 182, 189, 195, 201, 207, 217, 227, 235, 243, 251, 17, 35, 63, +86, 106, 123, 139, 152, 165, 177, 187, 197, 206, 214, 222, 230, 237, 250, +25, 31, 55, 75, 91, 105, 117, 128, 138, 146, 154, 161, 168, 174, 180, +185, 190, 200, 208, 215, 222, 229, 235, 240, 245, 255, 16, 36, 65, 89, +110, 128, 144, 159, 173, 185, 196, 207, 217, 226, 234, 242, 250, 11, 41, +74, 103, 128, 151, 172, 191, 209, 225, 241, 255, 9, 43, 79, 110, 138, +163, 186, 207, 227, 246, 12, 39, 71, 99, 123, 144, 164, 182, 198, 214, +228, 241, 253, 9, 44, 81, 113, 142, 168, 192, 214, 235, 255, 7, 49, +90, 127, 160, 191, 220, 247, 6, 51, 95, 134, 170, 203, 234, 7, 47, +87, 123, 155, 184, 212, 237, 6, 52, 97, 137, 174, 208, 240, 5, 57, +106, 151, 192, 231, 5, 59, 111, 158, 202, 243, 5, 55, 103, 147, 187, +224, 5, 60, 113, 161, 206, 248, 4, 65, 122, 175, 224, 4, 67, 127, +182, 234, }; +static const unsigned char cache_caps50[168] = { +224, 224, 224, 224, 224, 224, 224, 224, 160, 160, 160, 160, 185, 185, 185, +178, 178, 168, 134, 61, 37, 224, 224, 224, 224, 224, 224, 224, 224, 240, +240, 240, 240, 207, 207, 207, 198, 198, 183, 144, 66, 40, 160, 160, 160, +160, 160, 160, 160, 160, 185, 185, 185, 185, 193, 193, 193, 183, 183, 172, +138, 64, 38, 240, 240, 240, 240, 240, 240, 240, 240, 207, 207, 207, 207, +204, 204, 204, 193, 193, 180, 143, 66, 40, 185, 185, 185, 185, 185, 185, +185, 185, 193, 193, 193, 193, 193, 193, 193, 183, 183, 172, 138, 65, 39, +207, 207, 207, 207, 207, 207, 207, 207, 204, 204, 204, 204, 201, 201, 201, +188, 188, 176, 141, 66, 40, 193, 193, 193, 193, 193, 193, 193, 193, 193, +193, 193, 193, 194, 194, 194, 184, 184, 173, 139, 65, 39, 204, 204, 204, +204, 204, 204, 204, 204, 201, 201, 201, 201, 198, 198, 198, 187, 187, 175, +140, 66, 40, }; +#endif + +#ifndef FFT_TWIDDLES48000_960 +#define FFT_TWIDDLES48000_960 +static const kiss_twiddle_cpx fft_twiddles48000_960[480] = { +{1.0000000f, -0.0000000f}, {0.99991433f, -0.013089596f}, +{0.99965732f, -0.026176948f}, {0.99922904f, -0.039259816f}, +{0.99862953f, -0.052335956f}, {0.99785892f, -0.065403129f}, +{0.99691733f, -0.078459096f}, {0.99580493f, -0.091501619f}, +{0.99452190f, -0.10452846f}, {0.99306846f, -0.11753740f}, +{0.99144486f, -0.13052619f}, {0.98965139f, -0.14349262f}, +{0.98768834f, -0.15643447f}, {0.98555606f, -0.16934950f}, +{0.98325491f, -0.18223553f}, {0.98078528f, -0.19509032f}, +{0.97814760f, -0.20791169f}, {0.97534232f, -0.22069744f}, +{0.97236992f, -0.23344536f}, {0.96923091f, -0.24615329f}, +{0.96592583f, -0.25881905f}, {0.96245524f, -0.27144045f}, 
+{0.95881973f, -0.28401534f}, {0.95501994f, -0.29654157f}, +{0.95105652f, -0.30901699f}, {0.94693013f, -0.32143947f}, +{0.94264149f, -0.33380686f}, {0.93819134f, -0.34611706f}, +{0.93358043f, -0.35836795f}, {0.92880955f, -0.37055744f}, +{0.92387953f, -0.38268343f}, {0.91879121f, -0.39474386f}, +{0.91354546f, -0.40673664f}, {0.90814317f, -0.41865974f}, +{0.90258528f, -0.43051110f}, {0.89687274f, -0.44228869f}, +{0.89100652f, -0.45399050f}, {0.88498764f, -0.46561452f}, +{0.87881711f, -0.47715876f}, {0.87249601f, -0.48862124f}, +{0.86602540f, -0.50000000f}, {0.85940641f, -0.51129309f}, +{0.85264016f, -0.52249856f}, {0.84572782f, -0.53361452f}, +{0.83867057f, -0.54463904f}, {0.83146961f, -0.55557023f}, +{0.82412619f, -0.56640624f}, {0.81664156f, -0.57714519f}, +{0.80901699f, -0.58778525f}, {0.80125381f, -0.59832460f}, +{0.79335334f, -0.60876143f}, {0.78531693f, -0.61909395f}, +{0.77714596f, -0.62932039f}, {0.76884183f, -0.63943900f}, +{0.76040597f, -0.64944805f}, {0.75183981f, -0.65934582f}, +{0.74314483f, -0.66913061f}, {0.73432251f, -0.67880075f}, +{0.72537437f, -0.68835458f}, {0.71630194f, -0.69779046f}, +{0.70710678f, -0.70710678f}, {0.69779046f, -0.71630194f}, +{0.68835458f, -0.72537437f}, {0.67880075f, -0.73432251f}, +{0.66913061f, -0.74314483f}, {0.65934582f, -0.75183981f}, +{0.64944805f, -0.76040597f}, {0.63943900f, -0.76884183f}, +{0.62932039f, -0.77714596f}, {0.61909395f, -0.78531693f}, +{0.60876143f, -0.79335334f}, {0.59832460f, -0.80125381f}, +{0.58778525f, -0.80901699f}, {0.57714519f, -0.81664156f}, +{0.56640624f, -0.82412619f}, {0.55557023f, -0.83146961f}, +{0.54463904f, -0.83867057f}, {0.53361452f, -0.84572782f}, +{0.52249856f, -0.85264016f}, {0.51129309f, -0.85940641f}, +{0.50000000f, -0.86602540f}, {0.48862124f, -0.87249601f}, +{0.47715876f, -0.87881711f}, {0.46561452f, -0.88498764f}, +{0.45399050f, -0.89100652f}, {0.44228869f, -0.89687274f}, +{0.43051110f, -0.90258528f}, {0.41865974f, -0.90814317f}, +{0.40673664f, -0.91354546f}, {0.39474386f, -0.91879121f}, +{0.38268343f, -0.92387953f}, {0.37055744f, -0.92880955f}, +{0.35836795f, -0.93358043f}, {0.34611706f, -0.93819134f}, +{0.33380686f, -0.94264149f}, {0.32143947f, -0.94693013f}, +{0.30901699f, -0.95105652f}, {0.29654157f, -0.95501994f}, +{0.28401534f, -0.95881973f}, {0.27144045f, -0.96245524f}, +{0.25881905f, -0.96592583f}, {0.24615329f, -0.96923091f}, +{0.23344536f, -0.97236992f}, {0.22069744f, -0.97534232f}, +{0.20791169f, -0.97814760f}, {0.19509032f, -0.98078528f}, +{0.18223553f, -0.98325491f}, {0.16934950f, -0.98555606f}, +{0.15643447f, -0.98768834f}, {0.14349262f, -0.98965139f}, +{0.13052619f, -0.99144486f}, {0.11753740f, -0.99306846f}, +{0.10452846f, -0.99452190f}, {0.091501619f, -0.99580493f}, +{0.078459096f, -0.99691733f}, {0.065403129f, -0.99785892f}, +{0.052335956f, -0.99862953f}, {0.039259816f, -0.99922904f}, +{0.026176948f, -0.99965732f}, {0.013089596f, -0.99991433f}, +{6.1230318e-17f, -1.0000000f}, {-0.013089596f, -0.99991433f}, +{-0.026176948f, -0.99965732f}, {-0.039259816f, -0.99922904f}, +{-0.052335956f, -0.99862953f}, {-0.065403129f, -0.99785892f}, +{-0.078459096f, -0.99691733f}, {-0.091501619f, -0.99580493f}, +{-0.10452846f, -0.99452190f}, {-0.11753740f, -0.99306846f}, +{-0.13052619f, -0.99144486f}, {-0.14349262f, -0.98965139f}, +{-0.15643447f, -0.98768834f}, {-0.16934950f, -0.98555606f}, +{-0.18223553f, -0.98325491f}, {-0.19509032f, -0.98078528f}, +{-0.20791169f, -0.97814760f}, {-0.22069744f, -0.97534232f}, +{-0.23344536f, -0.97236992f}, {-0.24615329f, -0.96923091f}, +{-0.25881905f, -0.96592583f}, 
{-0.27144045f, -0.96245524f}, +{-0.28401534f, -0.95881973f}, {-0.29654157f, -0.95501994f}, +{-0.30901699f, -0.95105652f}, {-0.32143947f, -0.94693013f}, +{-0.33380686f, -0.94264149f}, {-0.34611706f, -0.93819134f}, +{-0.35836795f, -0.93358043f}, {-0.37055744f, -0.92880955f}, +{-0.38268343f, -0.92387953f}, {-0.39474386f, -0.91879121f}, +{-0.40673664f, -0.91354546f}, {-0.41865974f, -0.90814317f}, +{-0.43051110f, -0.90258528f}, {-0.44228869f, -0.89687274f}, +{-0.45399050f, -0.89100652f}, {-0.46561452f, -0.88498764f}, +{-0.47715876f, -0.87881711f}, {-0.48862124f, -0.87249601f}, +{-0.50000000f, -0.86602540f}, {-0.51129309f, -0.85940641f}, +{-0.52249856f, -0.85264016f}, {-0.53361452f, -0.84572782f}, +{-0.54463904f, -0.83867057f}, {-0.55557023f, -0.83146961f}, +{-0.56640624f, -0.82412619f}, {-0.57714519f, -0.81664156f}, +{-0.58778525f, -0.80901699f}, {-0.59832460f, -0.80125381f}, +{-0.60876143f, -0.79335334f}, {-0.61909395f, -0.78531693f}, +{-0.62932039f, -0.77714596f}, {-0.63943900f, -0.76884183f}, +{-0.64944805f, -0.76040597f}, {-0.65934582f, -0.75183981f}, +{-0.66913061f, -0.74314483f}, {-0.67880075f, -0.73432251f}, +{-0.68835458f, -0.72537437f}, {-0.69779046f, -0.71630194f}, +{-0.70710678f, -0.70710678f}, {-0.71630194f, -0.69779046f}, +{-0.72537437f, -0.68835458f}, {-0.73432251f, -0.67880075f}, +{-0.74314483f, -0.66913061f}, {-0.75183981f, -0.65934582f}, +{-0.76040597f, -0.64944805f}, {-0.76884183f, -0.63943900f}, +{-0.77714596f, -0.62932039f}, {-0.78531693f, -0.61909395f}, +{-0.79335334f, -0.60876143f}, {-0.80125381f, -0.59832460f}, +{-0.80901699f, -0.58778525f}, {-0.81664156f, -0.57714519f}, +{-0.82412619f, -0.56640624f}, {-0.83146961f, -0.55557023f}, +{-0.83867057f, -0.54463904f}, {-0.84572782f, -0.53361452f}, +{-0.85264016f, -0.52249856f}, {-0.85940641f, -0.51129309f}, +{-0.86602540f, -0.50000000f}, {-0.87249601f, -0.48862124f}, +{-0.87881711f, -0.47715876f}, {-0.88498764f, -0.46561452f}, +{-0.89100652f, -0.45399050f}, {-0.89687274f, -0.44228869f}, +{-0.90258528f, -0.43051110f}, {-0.90814317f, -0.41865974f}, +{-0.91354546f, -0.40673664f}, {-0.91879121f, -0.39474386f}, +{-0.92387953f, -0.38268343f}, {-0.92880955f, -0.37055744f}, +{-0.93358043f, -0.35836795f}, {-0.93819134f, -0.34611706f}, +{-0.94264149f, -0.33380686f}, {-0.94693013f, -0.32143947f}, +{-0.95105652f, -0.30901699f}, {-0.95501994f, -0.29654157f}, +{-0.95881973f, -0.28401534f}, {-0.96245524f, -0.27144045f}, +{-0.96592583f, -0.25881905f}, {-0.96923091f, -0.24615329f}, +{-0.97236992f, -0.23344536f}, {-0.97534232f, -0.22069744f}, +{-0.97814760f, -0.20791169f}, {-0.98078528f, -0.19509032f}, +{-0.98325491f, -0.18223553f}, {-0.98555606f, -0.16934950f}, +{-0.98768834f, -0.15643447f}, {-0.98965139f, -0.14349262f}, +{-0.99144486f, -0.13052619f}, {-0.99306846f, -0.11753740f}, +{-0.99452190f, -0.10452846f}, {-0.99580493f, -0.091501619f}, +{-0.99691733f, -0.078459096f}, {-0.99785892f, -0.065403129f}, +{-0.99862953f, -0.052335956f}, {-0.99922904f, -0.039259816f}, +{-0.99965732f, -0.026176948f}, {-0.99991433f, -0.013089596f}, +{-1.0000000f, -1.2246064e-16f}, {-0.99991433f, 0.013089596f}, +{-0.99965732f, 0.026176948f}, {-0.99922904f, 0.039259816f}, +{-0.99862953f, 0.052335956f}, {-0.99785892f, 0.065403129f}, +{-0.99691733f, 0.078459096f}, {-0.99580493f, 0.091501619f}, +{-0.99452190f, 0.10452846f}, {-0.99306846f, 0.11753740f}, +{-0.99144486f, 0.13052619f}, {-0.98965139f, 0.14349262f}, +{-0.98768834f, 0.15643447f}, {-0.98555606f, 0.16934950f}, +{-0.98325491f, 0.18223553f}, {-0.98078528f, 0.19509032f}, +{-0.97814760f, 0.20791169f}, {-0.97534232f, 
0.22069744f}, +{-0.97236992f, 0.23344536f}, {-0.96923091f, 0.24615329f}, +{-0.96592583f, 0.25881905f}, {-0.96245524f, 0.27144045f}, +{-0.95881973f, 0.28401534f}, {-0.95501994f, 0.29654157f}, +{-0.95105652f, 0.30901699f}, {-0.94693013f, 0.32143947f}, +{-0.94264149f, 0.33380686f}, {-0.93819134f, 0.34611706f}, +{-0.93358043f, 0.35836795f}, {-0.92880955f, 0.37055744f}, +{-0.92387953f, 0.38268343f}, {-0.91879121f, 0.39474386f}, +{-0.91354546f, 0.40673664f}, {-0.90814317f, 0.41865974f}, +{-0.90258528f, 0.43051110f}, {-0.89687274f, 0.44228869f}, +{-0.89100652f, 0.45399050f}, {-0.88498764f, 0.46561452f}, +{-0.87881711f, 0.47715876f}, {-0.87249601f, 0.48862124f}, +{-0.86602540f, 0.50000000f}, {-0.85940641f, 0.51129309f}, +{-0.85264016f, 0.52249856f}, {-0.84572782f, 0.53361452f}, +{-0.83867057f, 0.54463904f}, {-0.83146961f, 0.55557023f}, +{-0.82412619f, 0.56640624f}, {-0.81664156f, 0.57714519f}, +{-0.80901699f, 0.58778525f}, {-0.80125381f, 0.59832460f}, +{-0.79335334f, 0.60876143f}, {-0.78531693f, 0.61909395f}, +{-0.77714596f, 0.62932039f}, {-0.76884183f, 0.63943900f}, +{-0.76040597f, 0.64944805f}, {-0.75183981f, 0.65934582f}, +{-0.74314483f, 0.66913061f}, {-0.73432251f, 0.67880075f}, +{-0.72537437f, 0.68835458f}, {-0.71630194f, 0.69779046f}, +{-0.70710678f, 0.70710678f}, {-0.69779046f, 0.71630194f}, +{-0.68835458f, 0.72537437f}, {-0.67880075f, 0.73432251f}, +{-0.66913061f, 0.74314483f}, {-0.65934582f, 0.75183981f}, +{-0.64944805f, 0.76040597f}, {-0.63943900f, 0.76884183f}, +{-0.62932039f, 0.77714596f}, {-0.61909395f, 0.78531693f}, +{-0.60876143f, 0.79335334f}, {-0.59832460f, 0.80125381f}, +{-0.58778525f, 0.80901699f}, {-0.57714519f, 0.81664156f}, +{-0.56640624f, 0.82412619f}, {-0.55557023f, 0.83146961f}, +{-0.54463904f, 0.83867057f}, {-0.53361452f, 0.84572782f}, +{-0.52249856f, 0.85264016f}, {-0.51129309f, 0.85940641f}, +{-0.50000000f, 0.86602540f}, {-0.48862124f, 0.87249601f}, +{-0.47715876f, 0.87881711f}, {-0.46561452f, 0.88498764f}, +{-0.45399050f, 0.89100652f}, {-0.44228869f, 0.89687274f}, +{-0.43051110f, 0.90258528f}, {-0.41865974f, 0.90814317f}, +{-0.40673664f, 0.91354546f}, {-0.39474386f, 0.91879121f}, +{-0.38268343f, 0.92387953f}, {-0.37055744f, 0.92880955f}, +{-0.35836795f, 0.93358043f}, {-0.34611706f, 0.93819134f}, +{-0.33380686f, 0.94264149f}, {-0.32143947f, 0.94693013f}, +{-0.30901699f, 0.95105652f}, {-0.29654157f, 0.95501994f}, +{-0.28401534f, 0.95881973f}, {-0.27144045f, 0.96245524f}, +{-0.25881905f, 0.96592583f}, {-0.24615329f, 0.96923091f}, +{-0.23344536f, 0.97236992f}, {-0.22069744f, 0.97534232f}, +{-0.20791169f, 0.97814760f}, {-0.19509032f, 0.98078528f}, +{-0.18223553f, 0.98325491f}, {-0.16934950f, 0.98555606f}, +{-0.15643447f, 0.98768834f}, {-0.14349262f, 0.98965139f}, +{-0.13052619f, 0.99144486f}, {-0.11753740f, 0.99306846f}, +{-0.10452846f, 0.99452190f}, {-0.091501619f, 0.99580493f}, +{-0.078459096f, 0.99691733f}, {-0.065403129f, 0.99785892f}, +{-0.052335956f, 0.99862953f}, {-0.039259816f, 0.99922904f}, +{-0.026176948f, 0.99965732f}, {-0.013089596f, 0.99991433f}, +{-1.8369095e-16f, 1.0000000f}, {0.013089596f, 0.99991433f}, +{0.026176948f, 0.99965732f}, {0.039259816f, 0.99922904f}, +{0.052335956f, 0.99862953f}, {0.065403129f, 0.99785892f}, +{0.078459096f, 0.99691733f}, {0.091501619f, 0.99580493f}, +{0.10452846f, 0.99452190f}, {0.11753740f, 0.99306846f}, +{0.13052619f, 0.99144486f}, {0.14349262f, 0.98965139f}, +{0.15643447f, 0.98768834f}, {0.16934950f, 0.98555606f}, +{0.18223553f, 0.98325491f}, {0.19509032f, 0.98078528f}, +{0.20791169f, 0.97814760f}, {0.22069744f, 0.97534232f}, 
+{0.23344536f, 0.97236992f}, {0.24615329f, 0.96923091f}, +{0.25881905f, 0.96592583f}, {0.27144045f, 0.96245524f}, +{0.28401534f, 0.95881973f}, {0.29654157f, 0.95501994f}, +{0.30901699f, 0.95105652f}, {0.32143947f, 0.94693013f}, +{0.33380686f, 0.94264149f}, {0.34611706f, 0.93819134f}, +{0.35836795f, 0.93358043f}, {0.37055744f, 0.92880955f}, +{0.38268343f, 0.92387953f}, {0.39474386f, 0.91879121f}, +{0.40673664f, 0.91354546f}, {0.41865974f, 0.90814317f}, +{0.43051110f, 0.90258528f}, {0.44228869f, 0.89687274f}, +{0.45399050f, 0.89100652f}, {0.46561452f, 0.88498764f}, +{0.47715876f, 0.87881711f}, {0.48862124f, 0.87249601f}, +{0.50000000f, 0.86602540f}, {0.51129309f, 0.85940641f}, +{0.52249856f, 0.85264016f}, {0.53361452f, 0.84572782f}, +{0.54463904f, 0.83867057f}, {0.55557023f, 0.83146961f}, +{0.56640624f, 0.82412619f}, {0.57714519f, 0.81664156f}, +{0.58778525f, 0.80901699f}, {0.59832460f, 0.80125381f}, +{0.60876143f, 0.79335334f}, {0.61909395f, 0.78531693f}, +{0.62932039f, 0.77714596f}, {0.63943900f, 0.76884183f}, +{0.64944805f, 0.76040597f}, {0.65934582f, 0.75183981f}, +{0.66913061f, 0.74314483f}, {0.67880075f, 0.73432251f}, +{0.68835458f, 0.72537437f}, {0.69779046f, 0.71630194f}, +{0.70710678f, 0.70710678f}, {0.71630194f, 0.69779046f}, +{0.72537437f, 0.68835458f}, {0.73432251f, 0.67880075f}, +{0.74314483f, 0.66913061f}, {0.75183981f, 0.65934582f}, +{0.76040597f, 0.64944805f}, {0.76884183f, 0.63943900f}, +{0.77714596f, 0.62932039f}, {0.78531693f, 0.61909395f}, +{0.79335334f, 0.60876143f}, {0.80125381f, 0.59832460f}, +{0.80901699f, 0.58778525f}, {0.81664156f, 0.57714519f}, +{0.82412619f, 0.56640624f}, {0.83146961f, 0.55557023f}, +{0.83867057f, 0.54463904f}, {0.84572782f, 0.53361452f}, +{0.85264016f, 0.52249856f}, {0.85940641f, 0.51129309f}, +{0.86602540f, 0.50000000f}, {0.87249601f, 0.48862124f}, +{0.87881711f, 0.47715876f}, {0.88498764f, 0.46561452f}, +{0.89100652f, 0.45399050f}, {0.89687274f, 0.44228869f}, +{0.90258528f, 0.43051110f}, {0.90814317f, 0.41865974f}, +{0.91354546f, 0.40673664f}, {0.91879121f, 0.39474386f}, +{0.92387953f, 0.38268343f}, {0.92880955f, 0.37055744f}, +{0.93358043f, 0.35836795f}, {0.93819134f, 0.34611706f}, +{0.94264149f, 0.33380686f}, {0.94693013f, 0.32143947f}, +{0.95105652f, 0.30901699f}, {0.95501994f, 0.29654157f}, +{0.95881973f, 0.28401534f}, {0.96245524f, 0.27144045f}, +{0.96592583f, 0.25881905f}, {0.96923091f, 0.24615329f}, +{0.97236992f, 0.23344536f}, {0.97534232f, 0.22069744f}, +{0.97814760f, 0.20791169f}, {0.98078528f, 0.19509032f}, +{0.98325491f, 0.18223553f}, {0.98555606f, 0.16934950f}, +{0.98768834f, 0.15643447f}, {0.98965139f, 0.14349262f}, +{0.99144486f, 0.13052619f}, {0.99306846f, 0.11753740f}, +{0.99452190f, 0.10452846f}, {0.99580493f, 0.091501619f}, +{0.99691733f, 0.078459096f}, {0.99785892f, 0.065403129f}, +{0.99862953f, 0.052335956f}, {0.99922904f, 0.039259816f}, +{0.99965732f, 0.026176948f}, {0.99991433f, 0.013089596f}, +}; +#ifndef FFT_BITREV480 +#define FFT_BITREV480 +static const opus_int16 fft_bitrev480[480] = { +0, 96, 192, 288, 384, 32, 128, 224, 320, 416, 64, 160, 256, 352, 448, +8, 104, 200, 296, 392, 40, 136, 232, 328, 424, 72, 168, 264, 360, 456, +16, 112, 208, 304, 400, 48, 144, 240, 336, 432, 80, 176, 272, 368, 464, +24, 120, 216, 312, 408, 56, 152, 248, 344, 440, 88, 184, 280, 376, 472, +4, 100, 196, 292, 388, 36, 132, 228, 324, 420, 68, 164, 260, 356, 452, +12, 108, 204, 300, 396, 44, 140, 236, 332, 428, 76, 172, 268, 364, 460, +20, 116, 212, 308, 404, 52, 148, 244, 340, 436, 84, 180, 276, 372, 468, +28, 124, 220, 316, 412, 60, 156, 
252, 348, 444, 92, 188, 284, 380, 476, +1, 97, 193, 289, 385, 33, 129, 225, 321, 417, 65, 161, 257, 353, 449, +9, 105, 201, 297, 393, 41, 137, 233, 329, 425, 73, 169, 265, 361, 457, +17, 113, 209, 305, 401, 49, 145, 241, 337, 433, 81, 177, 273, 369, 465, +25, 121, 217, 313, 409, 57, 153, 249, 345, 441, 89, 185, 281, 377, 473, +5, 101, 197, 293, 389, 37, 133, 229, 325, 421, 69, 165, 261, 357, 453, +13, 109, 205, 301, 397, 45, 141, 237, 333, 429, 77, 173, 269, 365, 461, +21, 117, 213, 309, 405, 53, 149, 245, 341, 437, 85, 181, 277, 373, 469, +29, 125, 221, 317, 413, 61, 157, 253, 349, 445, 93, 189, 285, 381, 477, +2, 98, 194, 290, 386, 34, 130, 226, 322, 418, 66, 162, 258, 354, 450, +10, 106, 202, 298, 394, 42, 138, 234, 330, 426, 74, 170, 266, 362, 458, +18, 114, 210, 306, 402, 50, 146, 242, 338, 434, 82, 178, 274, 370, 466, +26, 122, 218, 314, 410, 58, 154, 250, 346, 442, 90, 186, 282, 378, 474, +6, 102, 198, 294, 390, 38, 134, 230, 326, 422, 70, 166, 262, 358, 454, +14, 110, 206, 302, 398, 46, 142, 238, 334, 430, 78, 174, 270, 366, 462, +22, 118, 214, 310, 406, 54, 150, 246, 342, 438, 86, 182, 278, 374, 470, +30, 126, 222, 318, 414, 62, 158, 254, 350, 446, 94, 190, 286, 382, 478, +3, 99, 195, 291, 387, 35, 131, 227, 323, 419, 67, 163, 259, 355, 451, +11, 107, 203, 299, 395, 43, 139, 235, 331, 427, 75, 171, 267, 363, 459, +19, 115, 211, 307, 403, 51, 147, 243, 339, 435, 83, 179, 275, 371, 467, +27, 123, 219, 315, 411, 59, 155, 251, 347, 443, 91, 187, 283, 379, 475, +7, 103, 199, 295, 391, 39, 135, 231, 327, 423, 71, 167, 263, 359, 455, +15, 111, 207, 303, 399, 47, 143, 239, 335, 431, 79, 175, 271, 367, 463, +23, 119, 215, 311, 407, 55, 151, 247, 343, 439, 87, 183, 279, 375, 471, +31, 127, 223, 319, 415, 63, 159, 255, 351, 447, 95, 191, 287, 383, 479, +}; +#endif + +#ifndef FFT_BITREV240 +#define FFT_BITREV240 +static const opus_int16 fft_bitrev240[240] = { +0, 48, 96, 144, 192, 16, 64, 112, 160, 208, 32, 80, 128, 176, 224, +4, 52, 100, 148, 196, 20, 68, 116, 164, 212, 36, 84, 132, 180, 228, +8, 56, 104, 152, 200, 24, 72, 120, 168, 216, 40, 88, 136, 184, 232, +12, 60, 108, 156, 204, 28, 76, 124, 172, 220, 44, 92, 140, 188, 236, +1, 49, 97, 145, 193, 17, 65, 113, 161, 209, 33, 81, 129, 177, 225, +5, 53, 101, 149, 197, 21, 69, 117, 165, 213, 37, 85, 133, 181, 229, +9, 57, 105, 153, 201, 25, 73, 121, 169, 217, 41, 89, 137, 185, 233, +13, 61, 109, 157, 205, 29, 77, 125, 173, 221, 45, 93, 141, 189, 237, +2, 50, 98, 146, 194, 18, 66, 114, 162, 210, 34, 82, 130, 178, 226, +6, 54, 102, 150, 198, 22, 70, 118, 166, 214, 38, 86, 134, 182, 230, +10, 58, 106, 154, 202, 26, 74, 122, 170, 218, 42, 90, 138, 186, 234, +14, 62, 110, 158, 206, 30, 78, 126, 174, 222, 46, 94, 142, 190, 238, +3, 51, 99, 147, 195, 19, 67, 115, 163, 211, 35, 83, 131, 179, 227, +7, 55, 103, 151, 199, 23, 71, 119, 167, 215, 39, 87, 135, 183, 231, +11, 59, 107, 155, 203, 27, 75, 123, 171, 219, 43, 91, 139, 187, 235, +15, 63, 111, 159, 207, 31, 79, 127, 175, 223, 47, 95, 143, 191, 239, +}; +#endif + +#ifndef FFT_BITREV120 +#define FFT_BITREV120 +static const opus_int16 fft_bitrev120[120] = { +0, 24, 48, 72, 96, 8, 32, 56, 80, 104, 16, 40, 64, 88, 112, +4, 28, 52, 76, 100, 12, 36, 60, 84, 108, 20, 44, 68, 92, 116, +1, 25, 49, 73, 97, 9, 33, 57, 81, 105, 17, 41, 65, 89, 113, +5, 29, 53, 77, 101, 13, 37, 61, 85, 109, 21, 45, 69, 93, 117, +2, 26, 50, 74, 98, 10, 34, 58, 82, 106, 18, 42, 66, 90, 114, +6, 30, 54, 78, 102, 14, 38, 62, 86, 110, 22, 46, 70, 94, 118, +3, 27, 51, 75, 99, 11, 35, 59, 83, 107, 19, 43, 67, 91, 115, +7, 31, 55, 79, 
103, 15, 39, 63, 87, 111, 23, 47, 71, 95, 119,
+};
+#endif
+
+#ifndef FFT_BITREV60
+#define FFT_BITREV60
+static const opus_int16 fft_bitrev60[60] = {
+0, 12, 24, 36, 48, 4, 16, 28, 40, 52, 8, 20, 32, 44, 56,
+1, 13, 25, 37, 49, 5, 17, 29, 41, 53, 9, 21, 33, 45, 57,
+2, 14, 26, 38, 50, 6, 18, 30, 42, 54, 10, 22, 34, 46, 58,
+3, 15, 27, 39, 51, 7, 19, 31, 43, 55, 11, 23, 35, 47, 59,
+};
+#endif
+
+#ifndef FFT_STATE48000_960_0
+#define FFT_STATE48000_960_0
+static const kiss_fft_state fft_state48000_960_0 = {
+480, /* nfft */
+0.002083333f, /* scale */
+-1, /* shift */
+{5, 96, 3, 32, 4, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, }, /* factors */
+fft_bitrev480, /* bitrev */
+fft_twiddles48000_960, /* twiddles */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_480,
+#else
+NULL,
+#endif
+};
+#endif
+
+#ifndef FFT_STATE48000_960_1
+#define FFT_STATE48000_960_1
+static const kiss_fft_state fft_state48000_960_1 = {
+240, /* nfft */
+0.004166667f, /* scale */
+1, /* shift */
+{5, 48, 3, 16, 4, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, }, /* factors */
+fft_bitrev240, /* bitrev */
+fft_twiddles48000_960, /* twiddles */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_240,
+#else
+NULL,
+#endif
+};
+#endif
+
+#ifndef FFT_STATE48000_960_2
+#define FFT_STATE48000_960_2
+static const kiss_fft_state fft_state48000_960_2 = {
+120, /* nfft */
+0.008333333f, /* scale */
+2, /* shift */
+{5, 24, 3, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, }, /* factors */
+fft_bitrev120, /* bitrev */
+fft_twiddles48000_960, /* twiddles */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_120,
+#else
+NULL,
+#endif
+};
+#endif
+
+#ifndef FFT_STATE48000_960_3
+#define FFT_STATE48000_960_3
+static const kiss_fft_state fft_state48000_960_3 = {
+60, /* nfft */
+0.016666667f, /* scale */
+3, /* shift */
+{5, 12, 3, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, /* factors */
+fft_bitrev60, /* bitrev */
+fft_twiddles48000_960, /* twiddles */
+#ifdef OVERRIDE_FFT
+(arch_fft_state *)&cfg_arch_60,
+#else
+NULL,
+#endif
+};
+#endif
+
+#endif
+
+#ifndef MDCT_TWIDDLES960
+#define MDCT_TWIDDLES960
+static const opus_val16 mdct_twiddles960[1800] = {
+0.99999994f, 0.99999321f, 0.99997580f, 0.99994773f, 0.99990886f,
+0.99985933f, 0.99979913f, 0.99972820f, 0.99964654f, 0.99955416f,
+0.99945110f, 0.99933738f, 0.99921292f, 0.99907774f, 0.99893188f,
+0.99877530f, 0.99860805f, 0.99843007f, 0.99824142f, 0.99804211f,
+0.99783206f, 0.99761140f, 0.99737996f, 0.99713790f, 0.99688518f,
+0.99662173f, 0.99634761f, 0.99606287f, 0.99576741f, 0.99546129f,
+0.99514455f, 0.99481714f, 0.99447906f, 0.99413031f, 0.99377096f,
+0.99340093f, 0.99302030f, 0.99262899f, 0.99222708f, 0.99181455f,
+0.99139136f, 0.99095762f, 0.99051321f, 0.99005818f, 0.98959261f,
+0.98911643f, 0.98862964f, 0.98813224f, 0.98762429f, 0.98710573f,
+0.98657662f, 0.98603696f, 0.98548669f, 0.98492593f, 0.98435456f,
+0.98377270f, 0.98318028f, 0.98257732f, 0.98196387f, 0.98133987f,
+0.98070538f, 0.98006040f, 0.97940493f, 0.97873890f, 0.97806245f,
+0.97737551f, 0.97667813f, 0.97597027f, 0.97525197f, 0.97452319f,
+0.97378403f, 0.97303438f, 0.97227436f, 0.97150391f, 0.97072303f,
+0.96993178f, 0.96913016f, 0.96831810f, 0.96749574f, 0.96666300f,
+0.96581990f, 0.96496642f, 0.96410263f, 0.96322852f, 0.96234411f,
+0.96144938f, 0.96054435f, 0.95962906f, 0.95870346f, 0.95776761f,
+0.95682150f, 0.95586514f, 0.95489854f, 0.95392174f, 0.95293468f,
+0.95193744f, 0.95093000f, 0.94991243f, 0.94888461f, 0.94784665f,
+0.94679856f, 0.94574034f, 0.94467193f, 0.94359344f, 0.94250488f,
+0.94140619f, 0.94029742f, 0.93917859f, 0.93804967f,
0.93691075f, +0.93576175f, 0.93460274f, 0.93343377f, 0.93225473f, 0.93106574f, +0.92986679f, 0.92865789f, 0.92743903f, 0.92621022f, 0.92497152f, +0.92372292f, 0.92246443f, 0.92119598f, 0.91991776f, 0.91862965f, +0.91733170f, 0.91602397f, 0.91470635f, 0.91337901f, 0.91204184f, +0.91069490f, 0.90933824f, 0.90797186f, 0.90659571f, 0.90520984f, +0.90381432f, 0.90240908f, 0.90099424f, 0.89956969f, 0.89813554f, +0.89669174f, 0.89523834f, 0.89377540f, 0.89230281f, 0.89082074f, +0.88932908f, 0.88782793f, 0.88631725f, 0.88479710f, 0.88326746f, +0.88172835f, 0.88017982f, 0.87862182f, 0.87705445f, 0.87547767f, +0.87389153f, 0.87229604f, 0.87069118f, 0.86907703f, 0.86745358f, +0.86582077f, 0.86417878f, 0.86252749f, 0.86086690f, 0.85919720f, +0.85751826f, 0.85583007f, 0.85413277f, 0.85242635f, 0.85071075f, +0.84898609f, 0.84725231f, 0.84550947f, 0.84375757f, 0.84199661f, +0.84022665f, 0.83844769f, 0.83665979f, 0.83486289f, 0.83305705f, +0.83124226f, 0.82941860f, 0.82758605f, 0.82574469f, 0.82389444f, +0.82203537f, 0.82016748f, 0.81829083f, 0.81640542f, 0.81451124f, +0.81260836f, 0.81069672f, 0.80877650f, 0.80684757f, 0.80490994f, +0.80296379f, 0.80100900f, 0.79904562f, 0.79707366f, 0.79509324f, +0.79310423f, 0.79110676f, 0.78910083f, 0.78708643f, 0.78506362f, +0.78303236f, 0.78099275f, 0.77894479f, 0.77688843f, 0.77482378f, +0.77275085f, 0.77066964f, 0.76858020f, 0.76648247f, 0.76437658f, +0.76226246f, 0.76014024f, 0.75800985f, 0.75587130f, 0.75372469f, +0.75157005f, 0.74940729f, 0.74723655f, 0.74505776f, 0.74287105f, +0.74067634f, 0.73847371f, 0.73626316f, 0.73404479f, 0.73181850f, +0.72958434f, 0.72734243f, 0.72509271f, 0.72283524f, 0.72057003f, +0.71829706f, 0.71601641f, 0.71372813f, 0.71143216f, 0.70912862f, +0.70681745f, 0.70449871f, 0.70217246f, 0.69983864f, 0.69749737f, +0.69514859f, 0.69279242f, 0.69042879f, 0.68805778f, 0.68567938f, +0.68329364f, 0.68090063f, 0.67850029f, 0.67609268f, 0.67367786f, +0.67125577f, 0.66882652f, 0.66639012f, 0.66394657f, 0.66149592f, +0.65903819f, 0.65657341f, 0.65410155f, 0.65162271f, 0.64913690f, +0.64664418f, 0.64414448f, 0.64163786f, 0.63912445f, 0.63660413f, +0.63407701f, 0.63154310f, 0.62900239f, 0.62645501f, 0.62390089f, +0.62134010f, 0.61877263f, 0.61619854f, 0.61361790f, 0.61103064f, +0.60843682f, 0.60583651f, 0.60322970f, 0.60061646f, 0.59799677f, +0.59537065f, 0.59273821f, 0.59009939f, 0.58745426f, 0.58480281f, +0.58214509f, 0.57948118f, 0.57681108f, 0.57413477f, 0.57145232f, +0.56876373f, 0.56606907f, 0.56336832f, 0.56066155f, 0.55794877f, +0.55523002f, 0.55250537f, 0.54977477f, 0.54703826f, 0.54429591f, +0.54154772f, 0.53879374f, 0.53603399f, 0.53326851f, 0.53049731f, +0.52772039f, 0.52493787f, 0.52214974f, 0.51935595f, 0.51655668f, +0.51375180f, 0.51094145f, 0.50812566f, 0.50530440f, 0.50247771f, +0.49964568f, 0.49680826f, 0.49396557f, 0.49111754f, 0.48826426f, +0.48540577f, 0.48254207f, 0.47967321f, 0.47679919f, 0.47392011f, +0.47103590f, 0.46814668f, 0.46525243f, 0.46235323f, 0.45944905f, +0.45653993f, 0.45362595f, 0.45070711f, 0.44778344f, 0.44485497f, +0.44192174f, 0.43898380f, 0.43604112f, 0.43309379f, 0.43014181f, +0.42718524f, 0.42422408f, 0.42125839f, 0.41828820f, 0.41531351f, +0.41233435f, 0.40935081f, 0.40636289f, 0.40337059f, 0.40037400f, +0.39737311f, 0.39436796f, 0.39135858f, 0.38834500f, 0.38532731f, +0.38230544f, 0.37927949f, 0.37624949f, 0.37321547f, 0.37017745f, +0.36713544f, 0.36408952f, 0.36103970f, 0.35798600f, 0.35492846f, +0.35186714f, 0.34880206f, 0.34573323f, 0.34266070f, 0.33958447f, +0.33650464f, 0.33342120f, 0.33033419f, 
0.32724363f, 0.32414958f, +0.32105204f, 0.31795108f, 0.31484672f, 0.31173897f, 0.30862790f, +0.30551350f, 0.30239585f, 0.29927495f, 0.29615086f, 0.29302359f, +0.28989318f, 0.28675964f, 0.28362307f, 0.28048345f, 0.27734083f, +0.27419522f, 0.27104670f, 0.26789525f, 0.26474094f, 0.26158381f, +0.25842386f, 0.25526115f, 0.25209570f, 0.24892756f, 0.24575676f, +0.24258332f, 0.23940729f, 0.23622867f, 0.23304754f, 0.22986393f, +0.22667783f, 0.22348931f, 0.22029841f, 0.21710514f, 0.21390954f, +0.21071166f, 0.20751151f, 0.20430915f, 0.20110460f, 0.19789790f, +0.19468907f, 0.19147816f, 0.18826519f, 0.18505022f, 0.18183327f, +0.17861435f, 0.17539354f, 0.17217083f, 0.16894630f, 0.16571994f, +0.16249183f, 0.15926196f, 0.15603039f, 0.15279715f, 0.14956227f, +0.14632578f, 0.14308774f, 0.13984816f, 0.13660708f, 0.13336454f, +0.13012058f, 0.12687522f, 0.12362850f, 0.12038045f, 0.11713112f, +0.11388054f, 0.11062872f, 0.10737573f, 0.10412160f, 0.10086634f, +0.097609997f, 0.094352618f, 0.091094226f, 0.087834857f, 0.084574550f, +0.081313334f, 0.078051247f, 0.074788325f, 0.071524605f, 0.068260118f, +0.064994894f, 0.061728980f, 0.058462404f, 0.055195201f, 0.051927410f, +0.048659060f, 0.045390189f, 0.042120833f, 0.038851023f, 0.035580799f, +0.032310195f, 0.029039243f, 0.025767982f, 0.022496443f, 0.019224664f, +0.015952680f, 0.012680525f, 0.0094082337f, 0.0061358409f, 0.0028633832f, +-0.00040910527f, -0.0036815894f, -0.0069540343f, -0.010226404f, -0.013498665f, +-0.016770782f, -0.020042717f, -0.023314439f, -0.026585912f, -0.029857099f, +-0.033127967f, -0.036398482f, -0.039668605f, -0.042938303f, -0.046207540f, +-0.049476285f, -0.052744497f, -0.056012146f, -0.059279196f, -0.062545612f, +-0.065811358f, -0.069076397f, -0.072340697f, -0.075604223f, -0.078866936f, +-0.082128808f, -0.085389800f, -0.088649876f, -0.091909006f, -0.095167145f, +-0.098424271f, -0.10168034f, -0.10493532f, -0.10818918f, -0.11144188f, +-0.11469338f, -0.11794366f, -0.12119267f, -0.12444039f, -0.12768677f, +-0.13093179f, -0.13417540f, -0.13741758f, -0.14065829f, -0.14389749f, +-0.14713514f, -0.15037122f, -0.15360570f, -0.15683852f, -0.16006967f, +-0.16329910f, -0.16652679f, -0.16975269f, -0.17297678f, -0.17619900f, +-0.17941935f, -0.18263777f, -0.18585424f, -0.18906870f, -0.19228116f, +-0.19549155f, -0.19869985f, -0.20190603f, -0.20511003f, -0.20831184f, +-0.21151142f, -0.21470875f, -0.21790376f, -0.22109644f, -0.22428675f, +-0.22747467f, -0.23066014f, -0.23384315f, -0.23702365f, -0.24020162f, +-0.24337701f, -0.24654980f, -0.24971995f, -0.25288740f, -0.25605217f, +-0.25921419f, -0.26237345f, -0.26552987f, -0.26868346f, -0.27183419f, +-0.27498198f, -0.27812684f, -0.28126872f, -0.28440759f, -0.28754342f, +-0.29067615f, -0.29380578f, -0.29693225f, -0.30005556f, -0.30317566f, +-0.30629250f, -0.30940607f, -0.31251630f, -0.31562322f, -0.31872672f, +-0.32182685f, -0.32492352f, -0.32801670f, -0.33110636f, -0.33419248f, +-0.33727503f, -0.34035397f, -0.34342924f, -0.34650084f, -0.34956875f, +-0.35263291f, -0.35569328f, -0.35874987f, -0.36180258f, -0.36485144f, +-0.36789638f, -0.37093741f, -0.37397444f, -0.37700745f, -0.38003644f, +-0.38306138f, -0.38608220f, -0.38909888f, -0.39211139f, -0.39511973f, +-0.39812380f, -0.40112361f, -0.40411916f, -0.40711036f, -0.41009718f, +-0.41307965f, -0.41605768f, -0.41903123f, -0.42200032f, -0.42496487f, +-0.42792490f, -0.43088034f, -0.43383113f, -0.43677729f, -0.43971881f, +-0.44265559f, -0.44558764f, -0.44851488f, -0.45143735f, -0.45435500f, +-0.45726776f, -0.46017563f, -0.46307856f, -0.46597654f, -0.46886954f, 
+-0.47175750f, -0.47464043f, -0.47751826f, -0.48039100f, -0.48325855f, +-0.48612097f, -0.48897815f, -0.49183011f, -0.49467680f, -0.49751821f, +-0.50035429f, -0.50318497f, -0.50601029f, -0.50883019f, -0.51164466f, +-0.51445359f, -0.51725709f, -0.52005500f, -0.52284735f, -0.52563411f, +-0.52841520f, -0.53119069f, -0.53396046f, -0.53672451f, -0.53948283f, +-0.54223537f, -0.54498214f, -0.54772300f, -0.55045801f, -0.55318713f, +-0.55591035f, -0.55862761f, -0.56133890f, -0.56404412f, -0.56674337f, +-0.56943649f, -0.57212353f, -0.57480448f, -0.57747924f, -0.58014780f, +-0.58281022f, -0.58546633f, -0.58811617f, -0.59075975f, -0.59339696f, +-0.59602785f, -0.59865236f, -0.60127044f, -0.60388207f, -0.60648727f, +-0.60908598f, -0.61167812f, -0.61426371f, -0.61684275f, -0.61941516f, +-0.62198097f, -0.62454009f, -0.62709254f, -0.62963831f, -0.63217729f, +-0.63470948f, -0.63723493f, -0.63975352f, -0.64226526f, -0.64477009f, +-0.64726806f, -0.64975911f, -0.65224314f, -0.65472025f, -0.65719032f, +-0.65965337f, -0.66210932f, -0.66455823f, -0.66700000f, -0.66943461f, +-0.67186207f, -0.67428231f, -0.67669535f, -0.67910111f, -0.68149966f, +-0.68389088f, -0.68627477f, -0.68865126f, -0.69102043f, -0.69338220f, +-0.69573659f, -0.69808346f, -0.70042288f, -0.70275480f, -0.70507920f, +-0.70739603f, -0.70970529f, -0.71200693f, -0.71430099f, -0.71658736f, +-0.71886611f, -0.72113711f, -0.72340041f, -0.72565591f, -0.72790372f, +-0.73014367f, -0.73237586f, -0.73460019f, -0.73681659f, -0.73902518f, +-0.74122584f, -0.74341851f, -0.74560326f, -0.74778003f, -0.74994880f, +-0.75210953f, -0.75426215f, -0.75640678f, -0.75854325f, -0.76067162f, +-0.76279181f, -0.76490390f, -0.76700771f, -0.76910341f, -0.77119076f, +-0.77326995f, -0.77534080f, -0.77740335f, -0.77945763f, -0.78150350f, +-0.78354102f, -0.78557014f, -0.78759086f, -0.78960317f, -0.79160696f, +-0.79360235f, -0.79558921f, -0.79756755f, -0.79953730f, -0.80149853f, +-0.80345118f, -0.80539525f, -0.80733067f, -0.80925739f, -0.81117553f, +-0.81308490f, -0.81498563f, -0.81687760f, -0.81876087f, -0.82063532f, +-0.82250100f, -0.82435787f, -0.82620591f, -0.82804507f, -0.82987541f, +-0.83169687f, -0.83350939f, -0.83531296f, -0.83710766f, -0.83889335f, +-0.84067005f, -0.84243774f, -0.84419644f, -0.84594607f, -0.84768665f, +-0.84941816f, -0.85114056f, -0.85285389f, -0.85455805f, -0.85625303f, +-0.85793889f, -0.85961550f, -0.86128294f, -0.86294121f, -0.86459017f, +-0.86622989f, -0.86786032f, -0.86948150f, -0.87109333f, -0.87269586f, +-0.87428904f, -0.87587279f, -0.87744725f, -0.87901229f, -0.88056785f, +-0.88211405f, -0.88365078f, -0.88517809f, -0.88669586f, -0.88820416f, +-0.88970292f, -0.89119220f, -0.89267188f, -0.89414203f, -0.89560264f, +-0.89705360f, -0.89849502f, -0.89992678f, -0.90134889f, -0.90276134f, +-0.90416414f, -0.90555727f, -0.90694070f, -0.90831441f, -0.90967834f, +-0.91103262f, -0.91237706f, -0.91371179f, -0.91503674f, -0.91635185f, +-0.91765714f, -0.91895264f, -0.92023826f, -0.92151409f, -0.92277998f, +-0.92403603f, -0.92528218f, -0.92651838f, -0.92774469f, -0.92896110f, +-0.93016750f, -0.93136400f, -0.93255049f, -0.93372697f, -0.93489349f, +-0.93604994f, -0.93719643f, -0.93833286f, -0.93945926f, -0.94057560f, +-0.94168180f, -0.94277799f, -0.94386405f, -0.94494003f, -0.94600588f, +-0.94706154f, -0.94810712f, -0.94914252f, -0.95016778f, -0.95118284f, +-0.95218778f, -0.95318246f, -0.95416695f, -0.95514119f, -0.95610523f, +-0.95705903f, -0.95800257f, -0.95893586f, -0.95985889f, -0.96077162f, +-0.96167403f, -0.96256620f, -0.96344805f, -0.96431959f, -0.96518075f, 
+-0.96603161f, -0.96687216f, -0.96770233f, -0.96852213f, -0.96933156f, +-0.97013056f, -0.97091925f, -0.97169751f, -0.97246534f, -0.97322279f, +-0.97396982f, -0.97470641f, -0.97543252f, -0.97614825f, -0.97685349f, +-0.97754824f, -0.97823256f, -0.97890645f, -0.97956979f, -0.98022264f, +-0.98086500f, -0.98149687f, -0.98211825f, -0.98272908f, -0.98332942f, +-0.98391914f, -0.98449844f, -0.98506713f, -0.98562527f, -0.98617285f, +-0.98670989f, -0.98723638f, -0.98775226f, -0.98825759f, -0.98875231f, +-0.98923647f, -0.98971003f, -0.99017298f, -0.99062532f, -0.99106705f, +-0.99149817f, -0.99191868f, -0.99232858f, -0.99272782f, -0.99311644f, +-0.99349445f, -0.99386179f, -0.99421853f, -0.99456459f, -0.99489999f, +-0.99522477f, -0.99553883f, -0.99584228f, -0.99613506f, -0.99641716f, +-0.99668860f, -0.99694937f, -0.99719942f, -0.99743885f, -0.99766755f, +-0.99788558f, -0.99809295f, -0.99828959f, -0.99847561f, -0.99865085f, +-0.99881548f, -0.99896932f, -0.99911255f, -0.99924499f, -0.99936682f, +-0.99947786f, -0.99957830f, -0.99966794f, -0.99974692f, -0.99981517f, +-0.99987274f, -0.99991959f, -0.99995571f, -0.99998116f, -0.99999589f, +0.99999964f, 0.99997288f, 0.99990326f, 0.99979085f, 0.99963558f, +0.99943751f, 0.99919659f, 0.99891287f, 0.99858636f, 0.99821711f, +0.99780506f, 0.99735034f, 0.99685282f, 0.99631262f, 0.99572974f, +0.99510419f, 0.99443603f, 0.99372530f, 0.99297196f, 0.99217612f, +0.99133772f, 0.99045694f, 0.98953366f, 0.98856801f, 0.98756003f, +0.98650974f, 0.98541719f, 0.98428243f, 0.98310548f, 0.98188645f, +0.98062533f, 0.97932225f, 0.97797716f, 0.97659022f, 0.97516143f, +0.97369087f, 0.97217858f, 0.97062469f, 0.96902919f, 0.96739221f, +0.96571374f, 0.96399397f, 0.96223283f, 0.96043050f, 0.95858705f, +0.95670253f, 0.95477700f, 0.95281059f, 0.95080340f, 0.94875544f, +0.94666684f, 0.94453770f, 0.94236809f, 0.94015813f, 0.93790787f, +0.93561745f, 0.93328691f, 0.93091643f, 0.92850608f, 0.92605597f, +0.92356616f, 0.92103678f, 0.91846794f, 0.91585976f, 0.91321236f, +0.91052586f, 0.90780038f, 0.90503591f, 0.90223277f, 0.89939094f, +0.89651060f, 0.89359182f, 0.89063478f, 0.88763964f, 0.88460642f, +0.88153529f, 0.87842643f, 0.87527996f, 0.87209594f, 0.86887461f, +0.86561602f, 0.86232042f, 0.85898781f, 0.85561842f, 0.85221243f, +0.84876984f, 0.84529096f, 0.84177583f, 0.83822471f, 0.83463764f, +0.83101481f, 0.82735640f, 0.82366252f, 0.81993335f, 0.81616908f, +0.81236988f, 0.80853581f, 0.80466717f, 0.80076402f, 0.79682660f, +0.79285502f, 0.78884947f, 0.78481019f, 0.78073722f, 0.77663082f, +0.77249116f, 0.76831841f, 0.76411277f, 0.75987434f, 0.75560343f, +0.75130010f, 0.74696463f, 0.74259710f, 0.73819780f, 0.73376691f, +0.72930455f, 0.72481096f, 0.72028631f, 0.71573079f, 0.71114463f, +0.70652801f, 0.70188117f, 0.69720417f, 0.69249737f, 0.68776089f, +0.68299496f, 0.67819971f, 0.67337549f, 0.66852236f, 0.66364062f, +0.65873051f, 0.65379208f, 0.64882571f, 0.64383155f, 0.63880974f, +0.63376063f, 0.62868434f, 0.62358117f, 0.61845124f, 0.61329484f, +0.60811216f, 0.60290343f, 0.59766883f, 0.59240872f, 0.58712316f, +0.58181250f, 0.57647687f, 0.57111657f, 0.56573176f, 0.56032276f, +0.55488980f, 0.54943299f, 0.54395270f, 0.53844911f, 0.53292239f, +0.52737290f, 0.52180082f, 0.51620632f, 0.51058978f, 0.50495136f, +0.49929130f, 0.49360985f, 0.48790723f, 0.48218375f, 0.47643960f, +0.47067502f, 0.46489030f, 0.45908567f, 0.45326138f, 0.44741765f, +0.44155475f, 0.43567297f, 0.42977250f, 0.42385364f, 0.41791660f, +0.41196167f, 0.40598908f, 0.39999911f, 0.39399201f, 0.38796803f, +0.38192743f, 0.37587047f, 0.36979741f, 
0.36370850f, 0.35760403f, +0.35148421f, 0.34534934f, 0.33919969f, 0.33303553f, 0.32685706f, +0.32066461f, 0.31445843f, 0.30823877f, 0.30200592f, 0.29576012f, +0.28950164f, 0.28323078f, 0.27694780f, 0.27065292f, 0.26434645f, +0.25802869f, 0.25169984f, 0.24536023f, 0.23901010f, 0.23264973f, +0.22627939f, 0.21989937f, 0.21350993f, 0.20711134f, 0.20070387f, +0.19428782f, 0.18786344f, 0.18143101f, 0.17499080f, 0.16854310f, +0.16208819f, 0.15562633f, 0.14915779f, 0.14268288f, 0.13620184f, +0.12971498f, 0.12322257f, 0.11672486f, 0.11022217f, 0.10371475f, +0.097202882f, 0.090686858f, 0.084166944f, 0.077643424f, 0.071116582f, +0.064586692f, 0.058054037f, 0.051518895f, 0.044981543f, 0.038442269f, +0.031901345f, 0.025359053f, 0.018815678f, 0.012271495f, 0.0057267868f, +-0.00081816671f, -0.0073630852f, -0.013907688f, -0.020451695f, -0.026994826f, +-0.033536803f, -0.040077340f, -0.046616159f, -0.053152986f, -0.059687532f, +-0.066219524f, -0.072748676f, -0.079274714f, -0.085797355f, -0.092316322f, +-0.098831341f, -0.10534211f, -0.11184838f, -0.11834986f, -0.12484626f, +-0.13133731f, -0.13782275f, -0.14430228f, -0.15077563f, -0.15724251f, +-0.16370267f, -0.17015581f, -0.17660165f, -0.18303993f, -0.18947038f, +-0.19589271f, -0.20230664f, -0.20871192f, -0.21510825f, -0.22149536f, +-0.22787298f, -0.23424086f, -0.24059868f, -0.24694622f, -0.25328314f, +-0.25960925f, -0.26592422f, -0.27222782f, -0.27851975f, -0.28479972f, +-0.29106751f, -0.29732284f, -0.30356544f, -0.30979502f, -0.31601134f, +-0.32221413f, -0.32840309f, -0.33457801f, -0.34073856f, -0.34688455f, +-0.35301566f, -0.35913166f, -0.36523229f, -0.37131724f, -0.37738630f, +-0.38343921f, -0.38947567f, -0.39549544f, -0.40149832f, -0.40748394f, +-0.41345215f, -0.41940263f, -0.42533514f, -0.43124944f, -0.43714526f, +-0.44302234f, -0.44888046f, -0.45471936f, -0.46053877f, -0.46633846f, +-0.47211814f, -0.47787762f, -0.48361665f, -0.48933494f, -0.49503228f, +-0.50070840f, -0.50636309f, -0.51199609f, -0.51760709f, -0.52319598f, +-0.52876246f, -0.53430629f, -0.53982723f, -0.54532504f, -0.55079949f, +-0.55625033f, -0.56167740f, -0.56708032f, -0.57245898f, -0.57781315f, +-0.58314258f, -0.58844697f, -0.59372622f, -0.59897995f, -0.60420811f, +-0.60941035f, -0.61458647f, -0.61973625f, -0.62485951f, -0.62995601f, +-0.63502556f, -0.64006782f, -0.64508271f, -0.65007001f, -0.65502942f, +-0.65996075f, -0.66486382f, -0.66973841f, -0.67458433f, -0.67940134f, +-0.68418926f, -0.68894786f, -0.69367695f, -0.69837630f, -0.70304573f, +-0.70768511f, -0.71229410f, -0.71687263f, -0.72142041f, -0.72593731f, +-0.73042315f, -0.73487765f, -0.73930067f, -0.74369204f, -0.74805158f, +-0.75237900f, -0.75667429f, -0.76093709f, -0.76516730f, -0.76936477f, +-0.77352923f, -0.77766061f, -0.78175867f, -0.78582323f, -0.78985411f, +-0.79385114f, -0.79781419f, -0.80174309f, -0.80563760f, -0.80949765f, +-0.81332302f, -0.81711352f, -0.82086903f, -0.82458937f, -0.82827437f, +-0.83192390f, -0.83553779f, -0.83911592f, -0.84265804f, -0.84616417f, +-0.84963393f, -0.85306740f, -0.85646427f, -0.85982448f, -0.86314780f, +-0.86643422f, -0.86968350f, -0.87289548f, -0.87607014f, -0.87920725f, +-0.88230664f, -0.88536829f, -0.88839203f, -0.89137769f, -0.89432514f, +-0.89723432f, -0.90010506f, -0.90293723f, -0.90573072f, -0.90848541f, +-0.91120118f, -0.91387796f, -0.91651553f, -0.91911387f, -0.92167282f, +-0.92419231f, -0.92667222f, -0.92911243f, -0.93151283f, -0.93387336f, +-0.93619382f, -0.93847424f, -0.94071442f, -0.94291431f, -0.94507378f, +-0.94719279f, -0.94927126f, -0.95130903f, -0.95330608f, 
-0.95526224f, +-0.95717752f, -0.95905179f, -0.96088499f, -0.96267700f, -0.96442777f, +-0.96613729f, -0.96780539f, -0.96943200f, -0.97101706f, -0.97256058f, +-0.97406244f, -0.97552258f, -0.97694093f, -0.97831738f, -0.97965199f, +-0.98094457f, -0.98219514f, -0.98340368f, -0.98457009f, -0.98569429f, +-0.98677629f, -0.98781598f, -0.98881340f, -0.98976845f, -0.99068111f, +-0.99155134f, -0.99237907f, -0.99316430f, -0.99390697f, -0.99460709f, +-0.99526459f, -0.99587947f, -0.99645168f, -0.99698120f, -0.99746799f, +-0.99791211f, -0.99831343f, -0.99867201f, -0.99898779f, -0.99926084f, +-0.99949104f, -0.99967843f, -0.99982297f, -0.99992472f, -0.99998361f, +0.99999869f, 0.99989158f, 0.99961317f, 0.99916345f, 0.99854255f, +0.99775058f, 0.99678761f, 0.99565387f, 0.99434954f, 0.99287480f, +0.99122995f, 0.98941529f, 0.98743105f, 0.98527765f, 0.98295540f, +0.98046476f, 0.97780609f, 0.97497988f, 0.97198665f, 0.96882683f, +0.96550101f, 0.96200979f, 0.95835376f, 0.95453346f, 0.95054960f, +0.94640291f, 0.94209403f, 0.93762374f, 0.93299282f, 0.92820197f, +0.92325211f, 0.91814411f, 0.91287869f, 0.90745693f, 0.90187967f, +0.89614785f, 0.89026248f, 0.88422459f, 0.87803519f, 0.87169534f, +0.86520612f, 0.85856867f, 0.85178405f, 0.84485358f, 0.83777827f, +0.83055943f, 0.82319832f, 0.81569612f, 0.80805415f, 0.80027372f, +0.79235619f, 0.78430289f, 0.77611518f, 0.76779449f, 0.75934225f, +0.75075996f, 0.74204898f, 0.73321080f, 0.72424710f, 0.71515924f, +0.70594883f, 0.69661748f, 0.68716675f, 0.67759830f, 0.66791373f, +0.65811473f, 0.64820296f, 0.63818014f, 0.62804794f, 0.61780810f, +0.60746247f, 0.59701276f, 0.58646071f, 0.57580817f, 0.56505698f, +0.55420899f, 0.54326600f, 0.53222996f, 0.52110273f, 0.50988621f, +0.49858227f, 0.48719296f, 0.47572014f, 0.46416581f, 0.45253196f, +0.44082057f, 0.42903364f, 0.41717321f, 0.40524128f, 0.39323992f, +0.38117120f, 0.36903715f, 0.35683987f, 0.34458145f, 0.33226398f, +0.31988961f, 0.30746040f, 0.29497850f, 0.28244606f, 0.26986524f, +0.25723818f, 0.24456702f, 0.23185398f, 0.21910121f, 0.20631088f, +0.19348522f, 0.18062639f, 0.16773662f, 0.15481812f, 0.14187308f, +0.12890373f, 0.11591230f, 0.10290100f, 0.089872077f, 0.076827750f, +0.063770257f, 0.050701842f, 0.037624735f, 0.024541186f, 0.011453429f, +-0.0016362892f, -0.014725727f, -0.027812643f, -0.040894791f, -0.053969935f, +-0.067035832f, -0.080090240f, -0.093130924f, -0.10615565f, -0.11916219f, +-0.13214831f, -0.14511178f, -0.15805040f, -0.17096193f, -0.18384418f, +-0.19669491f, -0.20951195f, -0.22229309f, -0.23503613f, -0.24773891f, +-0.26039925f, -0.27301496f, -0.28558388f, -0.29810387f, -0.31057280f, +-0.32298848f, -0.33534884f, -0.34765175f, -0.35989508f, -0.37207675f, +-0.38419467f, -0.39624676f, -0.40823093f, -0.42014518f, -0.43198743f, +-0.44375566f, -0.45544785f, -0.46706200f, -0.47859612f, -0.49004826f, +-0.50141639f, -0.51269865f, -0.52389306f, -0.53499764f, -0.54601061f, +-0.55693001f, -0.56775403f, -0.57848072f, -0.58910829f, -0.59963489f, +-0.61005878f, -0.62037814f, -0.63059121f, -0.64069623f, -0.65069145f, +-0.66057515f, -0.67034572f, -0.68000144f, -0.68954057f, -0.69896162f, +-0.70826286f, -0.71744281f, -0.72649974f, -0.73543227f, -0.74423873f, +-0.75291771f, -0.76146764f, -0.76988715f, -0.77817470f, -0.78632891f, +-0.79434842f, -0.80223179f, -0.80997771f, -0.81758487f, -0.82505190f, +-0.83237761f, -0.83956063f, -0.84659988f, -0.85349399f, -0.86024189f, +-0.86684239f, -0.87329435f, -0.87959671f, -0.88574833f, -0.89174819f, +-0.89759529f, -0.90328854f, -0.90882701f, -0.91420978f, -0.91943592f, +-0.92450452f, 
-0.92941469f, -0.93416560f, -0.93875647f, -0.94318646f, +-0.94745487f, -0.95156091f, -0.95550388f, -0.95928317f, -0.96289814f, +-0.96634805f, -0.96963239f, -0.97275060f, -0.97570217f, -0.97848648f, +-0.98110318f, -0.98355180f, -0.98583186f, -0.98794299f, -0.98988485f, +-0.99165714f, -0.99325943f, -0.99469161f, -0.99595332f, -0.99704438f, +-0.99796462f, -0.99871385f, -0.99929196f, -0.99969882f, -0.99993443f, +0.99999464f, 0.99956632f, 0.99845290f, 0.99665523f, 0.99417448f, +0.99101239f, 0.98717111f, 0.98265326f, 0.97746199f, 0.97160077f, +0.96507365f, 0.95788515f, 0.95004016f, 0.94154406f, 0.93240267f, +0.92262226f, 0.91220951f, 0.90117162f, 0.88951606f, 0.87725091f, +0.86438453f, 0.85092574f, 0.83688372f, 0.82226819f, 0.80708915f, +0.79135692f, 0.77508235f, 0.75827658f, 0.74095112f, 0.72311783f, +0.70478898f, 0.68597710f, 0.66669506f, 0.64695615f, 0.62677377f, +0.60616189f, 0.58513457f, 0.56370622f, 0.54189157f, 0.51970547f, +0.49716324f, 0.47428027f, 0.45107225f, 0.42755505f, 0.40374488f, +0.37965798f, 0.35531086f, 0.33072025f, 0.30590299f, 0.28087607f, +0.25565663f, 0.23026201f, 0.20470956f, 0.17901683f, 0.15320139f, +0.12728097f, 0.10127331f, 0.075196236f, 0.049067631f, 0.022905400f, +-0.0032725304f, -0.029448219f, -0.055603724f, -0.081721120f, -0.10778251f, +-0.13377003f, -0.15966587f, -0.18545228f, -0.21111161f, -0.23662624f, +-0.26197869f, -0.28715160f, -0.31212771f, -0.33688989f, -0.36142120f, +-0.38570482f, -0.40972409f, -0.43346253f, -0.45690393f, -0.48003218f, +-0.50283146f, -0.52528608f, -0.54738069f, -0.56910020f, -0.59042966f, +-0.61135447f, -0.63186026f, -0.65193301f, -0.67155898f, -0.69072473f, +-0.70941705f, -0.72762316f, -0.74533063f, -0.76252723f, -0.77920127f, +-0.79534131f, -0.81093621f, -0.82597536f, -0.84044844f, -0.85434550f, +-0.86765707f, -0.88037395f, -0.89248747f, -0.90398932f, -0.91487163f, +-0.92512697f, -0.93474823f, -0.94372886f, -0.95206273f, -0.95974404f, +-0.96676767f, -0.97312868f, -0.97882277f, -0.98384601f, -0.98819500f, +-0.99186671f, -0.99485862f, -0.99716878f, -0.99879545f, -0.99973762f, +}; +#endif + +static const CELTMode mode48000_960_120 = { +48000, /* Fs */ +120, /* overlap */ +21, /* nbEBands */ +21, /* effEBands */ +{0.85000610f, 0.0000000f, 1.0000000f, 1.0000000f, }, /* preemph */ +eband5ms, /* eBands */ +3, /* maxLM */ +8, /* nbShortMdcts */ +120, /* shortMdctSize */ +11, /* nbAllocVectors */ +band_allocation, /* allocVectors */ +logN400, /* logN */ +window120, /* window */ +{1920, 3, {&fft_state48000_960_0, &fft_state48000_960_1, &fft_state48000_960_2, &fft_state48000_960_3, }, mdct_twiddles960}, /* mdct */ +{392, cache_index50, cache_bits50, cache_caps50}, /* cache */ +}; + +/* List of all the available modes */ +#define TOTAL_MODES 1 +static const CELTMode * const static_mode_list[TOTAL_MODES] = { +&mode48000_960_120, +}; diff --git a/thirdparty/opus/celt/static_modes_float_arm_ne10.h b/thirdparty/opus/celt/static_modes_float_arm_ne10.h new file mode 100644 index 000000000000..934a82a420a2 --- /dev/null +++ b/thirdparty/opus/celt/static_modes_float_arm_ne10.h @@ -0,0 +1,404 @@ +/* The contents of this file was automatically generated by + * dump_mode_arm_ne10.c with arguments: 48000 960 + * It contains static definitions for some pre-defined modes. 
*/ +#include + +#ifndef NE10_FFT_PARAMS48000_960 +#define NE10_FFT_PARAMS48000_960 +static const ne10_int32_t ne10_factors_480[64] = { +4, 40, 4, 30, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, }; +static const ne10_int32_t ne10_factors_240[64] = { +3, 20, 4, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, }; +static const ne10_int32_t ne10_factors_120[64] = { +3, 10, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, }; +static const ne10_int32_t ne10_factors_60[64] = { +2, 5, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, }; +static const ne10_fft_cpx_float32_t ne10_twiddles_480[480] = { +{1.0000000f,0.0000000f}, {1.0000000f,-0.0000000f}, {1.0000000f,-0.0000000f}, +{1.0000000f,-0.0000000f}, {0.91354543f,-0.40673664f}, {0.66913056f,-0.74314487f}, +{1.0000000f,-0.0000000f}, {0.66913056f,-0.74314487f}, {-0.10452851f,-0.99452192f}, +{1.0000000f,-0.0000000f}, {0.30901697f,-0.95105654f}, {-0.80901700f,-0.58778518f}, +{1.0000000f,-0.0000000f}, {-0.10452851f,-0.99452192f}, {-0.97814757f,0.20791179f}, +{1.0000000f,-0.0000000f}, {0.97814763f,-0.20791170f}, {0.91354543f,-0.40673664f}, +{0.80901700f,-0.58778524f}, {0.66913056f,-0.74314487f}, {0.49999997f,-0.86602545f}, +{0.30901697f,-0.95105654f}, {0.10452842f,-0.99452192f}, {-0.10452851f,-0.99452192f}, +{-0.30901703f,-0.95105648f}, {-0.50000006f,-0.86602533f}, {-0.66913068f,-0.74314475f}, +{-0.80901700f,-0.58778518f}, {-0.91354549f,-0.40673658f}, {-0.97814763f,-0.20791161f}, +{1.0000000f,-0.0000000f}, {0.99862951f,-0.052335959f}, {0.99452192f,-0.10452846f}, +{0.98768836f,-0.15643448f}, {0.97814763f,-0.20791170f}, {0.96592581f,-0.25881904f}, +{0.95105648f,-0.30901700f}, {0.93358040f,-0.35836795f}, {0.91354543f,-0.40673664f}, +{0.89100653f,-0.45399052f}, {0.86602545f,-0.50000000f}, {0.83867055f,-0.54463905f}, +{0.80901700f,-0.58778524f}, {0.77714598f,-0.62932038f}, {0.74314475f,-0.66913062f}, +{0.70710677f,-0.70710683f}, {0.66913056f,-0.74314487f}, {0.62932038f,-0.77714598f}, +{0.58778524f,-0.80901700f}, {0.54463899f,-0.83867055f}, {0.49999997f,-0.86602545f}, +{0.45399052f,-0.89100653f}, {0.40673661f,-0.91354549f}, {0.35836786f,-0.93358046f}, +{0.30901697f,-0.95105654f}, {0.25881907f,-0.96592581f}, {0.20791166f,-0.97814763f}, +{0.15643437f,-0.98768836f}, {0.10452842f,-0.99452192f}, {0.052335974f,-0.99862951f}, +{1.0000000f,-0.0000000f}, {0.99452192f,-0.10452846f}, {0.97814763f,-0.20791170f}, +{0.95105648f,-0.30901700f}, {0.91354543f,-0.40673664f}, {0.86602545f,-0.50000000f}, +{0.80901700f,-0.58778524f}, {0.74314475f,-0.66913062f}, {0.66913056f,-0.74314487f}, +{0.58778524f,-0.80901700f}, {0.49999997f,-0.86602545f}, {0.40673661f,-0.91354549f}, +{0.30901697f,-0.95105654f}, {0.20791166f,-0.97814763f}, {0.10452842f,-0.99452192f}, +{-4.3711388e-08f,-1.0000000f}, {-0.10452851f,-0.99452192f}, {-0.20791174f,-0.97814757f}, +{-0.30901703f,-0.95105648f}, {-0.40673670f,-0.91354543f}, {-0.50000006f,-0.86602533f}, +{-0.58778518f,-0.80901700f}, {-0.66913068f,-0.74314475f}, {-0.74314493f,-0.66913044f}, 
+{-0.80901700f,-0.58778518f}, {-0.86602539f,-0.50000006f}, {-0.91354549f,-0.40673658f}, +{-0.95105654f,-0.30901679f}, {-0.97814763f,-0.20791161f}, {-0.99452192f,-0.10452849f}, +{1.0000000f,-0.0000000f}, {0.98768836f,-0.15643448f}, {0.95105648f,-0.30901700f}, +{0.89100653f,-0.45399052f}, {0.80901700f,-0.58778524f}, {0.70710677f,-0.70710683f}, +{0.58778524f,-0.80901700f}, {0.45399052f,-0.89100653f}, {0.30901697f,-0.95105654f}, +{0.15643437f,-0.98768836f}, {-4.3711388e-08f,-1.0000000f}, {-0.15643445f,-0.98768836f}, +{-0.30901703f,-0.95105648f}, {-0.45399061f,-0.89100647f}, {-0.58778518f,-0.80901700f}, +{-0.70710677f,-0.70710677f}, {-0.80901700f,-0.58778518f}, {-0.89100659f,-0.45399037f}, +{-0.95105654f,-0.30901679f}, {-0.98768836f,-0.15643445f}, {-1.0000000f,8.7422777e-08f}, +{-0.98768830f,0.15643461f}, {-0.95105654f,0.30901697f}, {-0.89100653f,0.45399055f}, +{-0.80901694f,0.58778536f}, {-0.70710665f,0.70710689f}, {-0.58778507f,0.80901712f}, +{-0.45399022f,0.89100665f}, {-0.30901709f,0.95105648f}, {-0.15643452f,0.98768830f}, +{1.0000000f,-0.0000000f}, {0.99991435f,-0.013089596f}, {0.99965733f,-0.026176950f}, +{0.99922901f,-0.039259817f}, {0.99862951f,-0.052335959f}, {0.99785894f,-0.065403134f}, +{0.99691731f,-0.078459099f}, {0.99580491f,-0.091501623f}, {0.99452192f,-0.10452846f}, +{0.99306846f,-0.11753740f}, {0.99144489f,-0.13052620f}, {0.98965138f,-0.14349262f}, +{0.98768836f,-0.15643448f}, {0.98555607f,-0.16934951f}, {0.98325491f,-0.18223552f}, +{0.98078525f,-0.19509032f}, {0.97814763f,-0.20791170f}, {0.97534233f,-0.22069745f}, +{0.97236991f,-0.23344538f}, {0.96923089f,-0.24615330f}, {0.96592581f,-0.25881904f}, +{0.96245521f,-0.27144045f}, {0.95881975f,-0.28401536f}, {0.95501995f,-0.29654160f}, +{0.95105648f,-0.30901700f}, {0.94693011f,-0.32143945f}, {0.94264150f,-0.33380687f}, +{0.93819129f,-0.34611708f}, {0.93358040f,-0.35836795f}, {0.92880952f,-0.37055743f}, +{0.92387956f,-0.38268346f}, {0.91879117f,-0.39474389f}, {0.91354543f,-0.40673664f}, +{0.90814316f,-0.41865975f}, {0.90258527f,-0.43051112f}, {0.89687270f,-0.44228873f}, +{0.89100653f,-0.45399052f}, {0.88498765f,-0.46561453f}, {0.87881708f,-0.47715878f}, +{0.87249601f,-0.48862126f}, {0.86602545f,-0.50000000f}, {0.85940641f,-0.51129311f}, +{0.85264015f,-0.52249855f}, {0.84572786f,-0.53361452f}, {0.83867055f,-0.54463905f}, +{0.83146960f,-0.55557024f}, {0.82412618f,-0.56640625f}, {0.81664151f,-0.57714522f}, +{0.80901700f,-0.58778524f}, {0.80125380f,-0.59832460f}, {0.79335332f,-0.60876143f}, +{0.78531694f,-0.61909395f}, {0.77714598f,-0.62932038f}, {0.76884180f,-0.63943899f}, +{0.76040596f,-0.64944810f}, {0.75183982f,-0.65934587f}, {0.74314475f,-0.66913062f}, +{0.73432249f,-0.67880076f}, {0.72537434f,-0.68835455f}, {0.71630192f,-0.69779050f}, +{0.70710677f,-0.70710683f}, {0.69779044f,-0.71630198f}, {0.68835455f,-0.72537440f}, +{0.67880070f,-0.73432255f}, {0.66913056f,-0.74314487f}, {0.65934581f,-0.75183982f}, +{0.64944804f,-0.76040596f}, {0.63943899f,-0.76884186f}, {0.62932038f,-0.77714598f}, +{0.61909395f,-0.78531694f}, {0.60876137f,-0.79335338f}, {0.59832460f,-0.80125386f}, +{0.58778524f,-0.80901700f}, {0.57714516f,-0.81664151f}, {0.56640625f,-0.82412618f}, +{0.55557019f,-0.83146960f}, {0.54463899f,-0.83867055f}, {0.53361452f,-0.84572786f}, +{0.52249849f,-0.85264015f}, {0.51129311f,-0.85940641f}, {0.49999997f,-0.86602545f}, +{0.48862118f,-0.87249601f}, {0.47715876f,-0.87881708f}, {0.46561447f,-0.88498765f}, +{0.45399052f,-0.89100653f}, {0.44228867f,-0.89687276f}, {0.43051103f,-0.90258533f}, +{0.41865975f,-0.90814316f}, 
{0.40673661f,-0.91354549f}, {0.39474380f,-0.91879129f}, +{0.38268343f,-0.92387956f}, {0.37055740f,-0.92880958f}, {0.35836786f,-0.93358046f}, +{0.34611705f,-0.93819135f}, {0.33380681f,-0.94264150f}, {0.32143947f,-0.94693011f}, +{0.30901697f,-0.95105654f}, {0.29654151f,-0.95501995f}, {0.28401533f,-0.95881975f}, +{0.27144039f,-0.96245527f}, {0.25881907f,-0.96592581f}, {0.24615327f,-0.96923089f}, +{0.23344530f,-0.97236991f}, {0.22069745f,-0.97534233f}, {0.20791166f,-0.97814763f}, +{0.19509023f,-0.98078531f}, {0.18223552f,-0.98325491f}, {0.16934945f,-0.98555607f}, +{0.15643437f,-0.98768836f}, {0.14349259f,-0.98965138f}, {0.13052613f,-0.99144489f}, +{0.11753740f,-0.99306846f}, {0.10452842f,-0.99452192f}, {0.091501534f,-0.99580491f}, +{0.078459084f,-0.99691731f}, {0.065403074f,-0.99785894f}, {0.052335974f,-0.99862951f}, +{0.039259788f,-0.99922901f}, {0.026176875f,-0.99965733f}, {0.013089597f,-0.99991435f}, +{1.0000000f,-0.0000000f}, {0.99965733f,-0.026176950f}, {0.99862951f,-0.052335959f}, +{0.99691731f,-0.078459099f}, {0.99452192f,-0.10452846f}, {0.99144489f,-0.13052620f}, +{0.98768836f,-0.15643448f}, {0.98325491f,-0.18223552f}, {0.97814763f,-0.20791170f}, +{0.97236991f,-0.23344538f}, {0.96592581f,-0.25881904f}, {0.95881975f,-0.28401536f}, +{0.95105648f,-0.30901700f}, {0.94264150f,-0.33380687f}, {0.93358040f,-0.35836795f}, +{0.92387956f,-0.38268346f}, {0.91354543f,-0.40673664f}, {0.90258527f,-0.43051112f}, +{0.89100653f,-0.45399052f}, {0.87881708f,-0.47715878f}, {0.86602545f,-0.50000000f}, +{0.85264015f,-0.52249855f}, {0.83867055f,-0.54463905f}, {0.82412618f,-0.56640625f}, +{0.80901700f,-0.58778524f}, {0.79335332f,-0.60876143f}, {0.77714598f,-0.62932038f}, +{0.76040596f,-0.64944810f}, {0.74314475f,-0.66913062f}, {0.72537434f,-0.68835455f}, +{0.70710677f,-0.70710683f}, {0.68835455f,-0.72537440f}, {0.66913056f,-0.74314487f}, +{0.64944804f,-0.76040596f}, {0.62932038f,-0.77714598f}, {0.60876137f,-0.79335338f}, +{0.58778524f,-0.80901700f}, {0.56640625f,-0.82412618f}, {0.54463899f,-0.83867055f}, +{0.52249849f,-0.85264015f}, {0.49999997f,-0.86602545f}, {0.47715876f,-0.87881708f}, +{0.45399052f,-0.89100653f}, {0.43051103f,-0.90258533f}, {0.40673661f,-0.91354549f}, +{0.38268343f,-0.92387956f}, {0.35836786f,-0.93358046f}, {0.33380681f,-0.94264150f}, +{0.30901697f,-0.95105654f}, {0.28401533f,-0.95881975f}, {0.25881907f,-0.96592581f}, +{0.23344530f,-0.97236991f}, {0.20791166f,-0.97814763f}, {0.18223552f,-0.98325491f}, +{0.15643437f,-0.98768836f}, {0.13052613f,-0.99144489f}, {0.10452842f,-0.99452192f}, +{0.078459084f,-0.99691731f}, {0.052335974f,-0.99862951f}, {0.026176875f,-0.99965733f}, +{-4.3711388e-08f,-1.0000000f}, {-0.026176963f,-0.99965733f}, {-0.052336060f,-0.99862951f}, +{-0.078459173f,-0.99691731f}, {-0.10452851f,-0.99452192f}, {-0.13052621f,-0.99144489f}, +{-0.15643445f,-0.98768836f}, {-0.18223560f,-0.98325491f}, {-0.20791174f,-0.97814757f}, +{-0.23344538f,-0.97236991f}, {-0.25881916f,-0.96592581f}, {-0.28401542f,-0.95881969f}, +{-0.30901703f,-0.95105648f}, {-0.33380687f,-0.94264150f}, {-0.35836795f,-0.93358040f}, +{-0.38268352f,-0.92387950f}, {-0.40673670f,-0.91354543f}, {-0.43051112f,-0.90258527f}, +{-0.45399061f,-0.89100647f}, {-0.47715873f,-0.87881708f}, {-0.50000006f,-0.86602533f}, +{-0.52249867f,-0.85264009f}, {-0.54463905f,-0.83867055f}, {-0.56640631f,-0.82412612f}, +{-0.58778518f,-0.80901700f}, {-0.60876143f,-0.79335332f}, {-0.62932050f,-0.77714586f}, +{-0.64944804f,-0.76040596f}, {-0.66913068f,-0.74314475f}, {-0.68835467f,-0.72537428f}, +{-0.70710677f,-0.70710677f}, 
{-0.72537446f,-0.68835449f}, {-0.74314493f,-0.66913044f}, +{-0.76040596f,-0.64944804f}, {-0.77714604f,-0.62932026f}, {-0.79335332f,-0.60876143f}, +{-0.80901700f,-0.58778518f}, {-0.82412624f,-0.56640613f}, {-0.83867055f,-0.54463899f}, +{-0.85264021f,-0.52249849f}, {-0.86602539f,-0.50000006f}, {-0.87881714f,-0.47715873f}, +{-0.89100659f,-0.45399037f}, {-0.90258527f,-0.43051112f}, {-0.91354549f,-0.40673658f}, +{-0.92387956f,-0.38268328f}, {-0.93358040f,-0.35836792f}, {-0.94264150f,-0.33380675f}, +{-0.95105654f,-0.30901679f}, {-0.95881975f,-0.28401530f}, {-0.96592587f,-0.25881892f}, +{-0.97236991f,-0.23344538f}, {-0.97814763f,-0.20791161f}, {-0.98325491f,-0.18223536f}, +{-0.98768836f,-0.15643445f}, {-0.99144489f,-0.13052608f}, {-0.99452192f,-0.10452849f}, +{-0.99691737f,-0.078459039f}, {-0.99862957f,-0.052335810f}, {-0.99965733f,-0.026176952f}, +{1.0000000f,-0.0000000f}, {0.99922901f,-0.039259817f}, {0.99691731f,-0.078459099f}, +{0.99306846f,-0.11753740f}, {0.98768836f,-0.15643448f}, {0.98078525f,-0.19509032f}, +{0.97236991f,-0.23344538f}, {0.96245521f,-0.27144045f}, {0.95105648f,-0.30901700f}, +{0.93819129f,-0.34611708f}, {0.92387956f,-0.38268346f}, {0.90814316f,-0.41865975f}, +{0.89100653f,-0.45399052f}, {0.87249601f,-0.48862126f}, {0.85264015f,-0.52249855f}, +{0.83146960f,-0.55557024f}, {0.80901700f,-0.58778524f}, {0.78531694f,-0.61909395f}, +{0.76040596f,-0.64944810f}, {0.73432249f,-0.67880076f}, {0.70710677f,-0.70710683f}, +{0.67880070f,-0.73432255f}, {0.64944804f,-0.76040596f}, {0.61909395f,-0.78531694f}, +{0.58778524f,-0.80901700f}, {0.55557019f,-0.83146960f}, {0.52249849f,-0.85264015f}, +{0.48862118f,-0.87249601f}, {0.45399052f,-0.89100653f}, {0.41865975f,-0.90814316f}, +{0.38268343f,-0.92387956f}, {0.34611705f,-0.93819135f}, {0.30901697f,-0.95105654f}, +{0.27144039f,-0.96245527f}, {0.23344530f,-0.97236991f}, {0.19509023f,-0.98078531f}, +{0.15643437f,-0.98768836f}, {0.11753740f,-0.99306846f}, {0.078459084f,-0.99691731f}, +{0.039259788f,-0.99922901f}, {-4.3711388e-08f,-1.0000000f}, {-0.039259877f,-0.99922901f}, +{-0.078459173f,-0.99691731f}, {-0.11753749f,-0.99306846f}, {-0.15643445f,-0.98768836f}, +{-0.19509032f,-0.98078525f}, {-0.23344538f,-0.97236991f}, {-0.27144048f,-0.96245521f}, +{-0.30901703f,-0.95105648f}, {-0.34611711f,-0.93819129f}, {-0.38268352f,-0.92387950f}, +{-0.41865984f,-0.90814310f}, {-0.45399061f,-0.89100647f}, {-0.48862135f,-0.87249595f}, +{-0.52249867f,-0.85264009f}, {-0.55557036f,-0.83146954f}, {-0.58778518f,-0.80901700f}, +{-0.61909389f,-0.78531694f}, {-0.64944804f,-0.76040596f}, {-0.67880076f,-0.73432249f}, +{-0.70710677f,-0.70710677f}, {-0.73432249f,-0.67880070f}, {-0.76040596f,-0.64944804f}, +{-0.78531694f,-0.61909389f}, {-0.80901700f,-0.58778518f}, {-0.83146966f,-0.55557019f}, +{-0.85264021f,-0.52249849f}, {-0.87249607f,-0.48862115f}, {-0.89100659f,-0.45399037f}, +{-0.90814322f,-0.41865960f}, {-0.92387956f,-0.38268328f}, {-0.93819135f,-0.34611690f}, +{-0.95105654f,-0.30901679f}, {-0.96245521f,-0.27144048f}, {-0.97236991f,-0.23344538f}, +{-0.98078531f,-0.19509031f}, {-0.98768836f,-0.15643445f}, {-0.99306846f,-0.11753736f}, +{-0.99691737f,-0.078459039f}, {-0.99922901f,-0.039259743f}, {-1.0000000f,8.7422777e-08f}, +{-0.99922901f,0.039259918f}, {-0.99691731f,0.078459218f}, {-0.99306846f,0.11753753f}, +{-0.98768830f,0.15643461f}, {-0.98078525f,0.19509049f}, {-0.97236985f,0.23344554f}, +{-0.96245515f,0.27144065f}, {-0.95105654f,0.30901697f}, {-0.93819135f,0.34611705f}, +{-0.92387956f,0.38268346f}, {-0.90814316f,0.41865975f}, {-0.89100653f,0.45399055f}, 
+{-0.87249601f,0.48862129f}, {-0.85264015f,0.52249861f}, {-0.83146960f,0.55557030f}, +{-0.80901694f,0.58778536f}, {-0.78531688f,0.61909401f}, {-0.76040590f,0.64944816f}, +{-0.73432243f,0.67880082f}, {-0.70710665f,0.70710689f}, {-0.67880058f,0.73432261f}, +{-0.64944792f,0.76040608f}, {-0.61909378f,0.78531706f}, {-0.58778507f,0.80901712f}, +{-0.55557001f,0.83146977f}, {-0.52249837f,0.85264033f}, {-0.48862100f,0.87249613f}, +{-0.45399022f,0.89100665f}, {-0.41865945f,0.90814328f}, {-0.38268313f,0.92387968f}, +{-0.34611672f,0.93819147f}, {-0.30901709f,0.95105648f}, {-0.27144054f,0.96245521f}, +{-0.23344545f,0.97236991f}, {-0.19509038f,0.98078525f}, {-0.15643452f,0.98768830f}, +{-0.11753743f,0.99306846f}, {-0.078459114f,0.99691731f}, {-0.039259821f,0.99922901f}, +}; +static const ne10_fft_cpx_float32_t ne10_twiddles_240[240] = { +{1.0000000f,0.0000000f}, {1.0000000f,-0.0000000f}, {1.0000000f,-0.0000000f}, +{1.0000000f,-0.0000000f}, {0.91354543f,-0.40673664f}, {0.66913056f,-0.74314487f}, +{1.0000000f,-0.0000000f}, {0.66913056f,-0.74314487f}, {-0.10452851f,-0.99452192f}, +{1.0000000f,-0.0000000f}, {0.30901697f,-0.95105654f}, {-0.80901700f,-0.58778518f}, +{1.0000000f,-0.0000000f}, {-0.10452851f,-0.99452192f}, {-0.97814757f,0.20791179f}, +{1.0000000f,-0.0000000f}, {0.99452192f,-0.10452846f}, {0.97814763f,-0.20791170f}, +{0.95105648f,-0.30901700f}, {0.91354543f,-0.40673664f}, {0.86602545f,-0.50000000f}, +{0.80901700f,-0.58778524f}, {0.74314475f,-0.66913062f}, {0.66913056f,-0.74314487f}, +{0.58778524f,-0.80901700f}, {0.49999997f,-0.86602545f}, {0.40673661f,-0.91354549f}, +{0.30901697f,-0.95105654f}, {0.20791166f,-0.97814763f}, {0.10452842f,-0.99452192f}, +{1.0000000f,-0.0000000f}, {0.97814763f,-0.20791170f}, {0.91354543f,-0.40673664f}, +{0.80901700f,-0.58778524f}, {0.66913056f,-0.74314487f}, {0.49999997f,-0.86602545f}, +{0.30901697f,-0.95105654f}, {0.10452842f,-0.99452192f}, {-0.10452851f,-0.99452192f}, +{-0.30901703f,-0.95105648f}, {-0.50000006f,-0.86602533f}, {-0.66913068f,-0.74314475f}, +{-0.80901700f,-0.58778518f}, {-0.91354549f,-0.40673658f}, {-0.97814763f,-0.20791161f}, +{1.0000000f,-0.0000000f}, {0.95105648f,-0.30901700f}, {0.80901700f,-0.58778524f}, +{0.58778524f,-0.80901700f}, {0.30901697f,-0.95105654f}, {-4.3711388e-08f,-1.0000000f}, +{-0.30901703f,-0.95105648f}, {-0.58778518f,-0.80901700f}, {-0.80901700f,-0.58778518f}, +{-0.95105654f,-0.30901679f}, {-1.0000000f,8.7422777e-08f}, {-0.95105654f,0.30901697f}, +{-0.80901694f,0.58778536f}, {-0.58778507f,0.80901712f}, {-0.30901709f,0.95105648f}, +{1.0000000f,-0.0000000f}, {0.99965733f,-0.026176950f}, {0.99862951f,-0.052335959f}, +{0.99691731f,-0.078459099f}, {0.99452192f,-0.10452846f}, {0.99144489f,-0.13052620f}, +{0.98768836f,-0.15643448f}, {0.98325491f,-0.18223552f}, {0.97814763f,-0.20791170f}, +{0.97236991f,-0.23344538f}, {0.96592581f,-0.25881904f}, {0.95881975f,-0.28401536f}, +{0.95105648f,-0.30901700f}, {0.94264150f,-0.33380687f}, {0.93358040f,-0.35836795f}, +{0.92387956f,-0.38268346f}, {0.91354543f,-0.40673664f}, {0.90258527f,-0.43051112f}, +{0.89100653f,-0.45399052f}, {0.87881708f,-0.47715878f}, {0.86602545f,-0.50000000f}, +{0.85264015f,-0.52249855f}, {0.83867055f,-0.54463905f}, {0.82412618f,-0.56640625f}, +{0.80901700f,-0.58778524f}, {0.79335332f,-0.60876143f}, {0.77714598f,-0.62932038f}, +{0.76040596f,-0.64944810f}, {0.74314475f,-0.66913062f}, {0.72537434f,-0.68835455f}, +{0.70710677f,-0.70710683f}, {0.68835455f,-0.72537440f}, {0.66913056f,-0.74314487f}, +{0.64944804f,-0.76040596f}, {0.62932038f,-0.77714598f}, {0.60876137f,-0.79335338f}, 
+{0.58778524f,-0.80901700f}, {0.56640625f,-0.82412618f}, {0.54463899f,-0.83867055f}, +{0.52249849f,-0.85264015f}, {0.49999997f,-0.86602545f}, {0.47715876f,-0.87881708f}, +{0.45399052f,-0.89100653f}, {0.43051103f,-0.90258533f}, {0.40673661f,-0.91354549f}, +{0.38268343f,-0.92387956f}, {0.35836786f,-0.93358046f}, {0.33380681f,-0.94264150f}, +{0.30901697f,-0.95105654f}, {0.28401533f,-0.95881975f}, {0.25881907f,-0.96592581f}, +{0.23344530f,-0.97236991f}, {0.20791166f,-0.97814763f}, {0.18223552f,-0.98325491f}, +{0.15643437f,-0.98768836f}, {0.13052613f,-0.99144489f}, {0.10452842f,-0.99452192f}, +{0.078459084f,-0.99691731f}, {0.052335974f,-0.99862951f}, {0.026176875f,-0.99965733f}, +{1.0000000f,-0.0000000f}, {0.99862951f,-0.052335959f}, {0.99452192f,-0.10452846f}, +{0.98768836f,-0.15643448f}, {0.97814763f,-0.20791170f}, {0.96592581f,-0.25881904f}, +{0.95105648f,-0.30901700f}, {0.93358040f,-0.35836795f}, {0.91354543f,-0.40673664f}, +{0.89100653f,-0.45399052f}, {0.86602545f,-0.50000000f}, {0.83867055f,-0.54463905f}, +{0.80901700f,-0.58778524f}, {0.77714598f,-0.62932038f}, {0.74314475f,-0.66913062f}, +{0.70710677f,-0.70710683f}, {0.66913056f,-0.74314487f}, {0.62932038f,-0.77714598f}, +{0.58778524f,-0.80901700f}, {0.54463899f,-0.83867055f}, {0.49999997f,-0.86602545f}, +{0.45399052f,-0.89100653f}, {0.40673661f,-0.91354549f}, {0.35836786f,-0.93358046f}, +{0.30901697f,-0.95105654f}, {0.25881907f,-0.96592581f}, {0.20791166f,-0.97814763f}, +{0.15643437f,-0.98768836f}, {0.10452842f,-0.99452192f}, {0.052335974f,-0.99862951f}, +{-4.3711388e-08f,-1.0000000f}, {-0.052336060f,-0.99862951f}, {-0.10452851f,-0.99452192f}, +{-0.15643445f,-0.98768836f}, {-0.20791174f,-0.97814757f}, {-0.25881916f,-0.96592581f}, +{-0.30901703f,-0.95105648f}, {-0.35836795f,-0.93358040f}, {-0.40673670f,-0.91354543f}, +{-0.45399061f,-0.89100647f}, {-0.50000006f,-0.86602533f}, {-0.54463905f,-0.83867055f}, +{-0.58778518f,-0.80901700f}, {-0.62932050f,-0.77714586f}, {-0.66913068f,-0.74314475f}, +{-0.70710677f,-0.70710677f}, {-0.74314493f,-0.66913044f}, {-0.77714604f,-0.62932026f}, +{-0.80901700f,-0.58778518f}, {-0.83867055f,-0.54463899f}, {-0.86602539f,-0.50000006f}, +{-0.89100659f,-0.45399037f}, {-0.91354549f,-0.40673658f}, {-0.93358040f,-0.35836792f}, +{-0.95105654f,-0.30901679f}, {-0.96592587f,-0.25881892f}, {-0.97814763f,-0.20791161f}, +{-0.98768836f,-0.15643445f}, {-0.99452192f,-0.10452849f}, {-0.99862957f,-0.052335810f}, +{1.0000000f,-0.0000000f}, {0.99691731f,-0.078459099f}, {0.98768836f,-0.15643448f}, +{0.97236991f,-0.23344538f}, {0.95105648f,-0.30901700f}, {0.92387956f,-0.38268346f}, +{0.89100653f,-0.45399052f}, {0.85264015f,-0.52249855f}, {0.80901700f,-0.58778524f}, +{0.76040596f,-0.64944810f}, {0.70710677f,-0.70710683f}, {0.64944804f,-0.76040596f}, +{0.58778524f,-0.80901700f}, {0.52249849f,-0.85264015f}, {0.45399052f,-0.89100653f}, +{0.38268343f,-0.92387956f}, {0.30901697f,-0.95105654f}, {0.23344530f,-0.97236991f}, +{0.15643437f,-0.98768836f}, {0.078459084f,-0.99691731f}, {-4.3711388e-08f,-1.0000000f}, +{-0.078459173f,-0.99691731f}, {-0.15643445f,-0.98768836f}, {-0.23344538f,-0.97236991f}, +{-0.30901703f,-0.95105648f}, {-0.38268352f,-0.92387950f}, {-0.45399061f,-0.89100647f}, +{-0.52249867f,-0.85264009f}, {-0.58778518f,-0.80901700f}, {-0.64944804f,-0.76040596f}, +{-0.70710677f,-0.70710677f}, {-0.76040596f,-0.64944804f}, {-0.80901700f,-0.58778518f}, +{-0.85264021f,-0.52249849f}, {-0.89100659f,-0.45399037f}, {-0.92387956f,-0.38268328f}, +{-0.95105654f,-0.30901679f}, {-0.97236991f,-0.23344538f}, {-0.98768836f,-0.15643445f}, 
+{-0.99691737f,-0.078459039f}, {-1.0000000f,8.7422777e-08f}, {-0.99691731f,0.078459218f}, +{-0.98768830f,0.15643461f}, {-0.97236985f,0.23344554f}, {-0.95105654f,0.30901697f}, +{-0.92387956f,0.38268346f}, {-0.89100653f,0.45399055f}, {-0.85264015f,0.52249861f}, +{-0.80901694f,0.58778536f}, {-0.76040590f,0.64944816f}, {-0.70710665f,0.70710689f}, +{-0.64944792f,0.76040608f}, {-0.58778507f,0.80901712f}, {-0.52249837f,0.85264033f}, +{-0.45399022f,0.89100665f}, {-0.38268313f,0.92387968f}, {-0.30901709f,0.95105648f}, +{-0.23344545f,0.97236991f}, {-0.15643452f,0.98768830f}, {-0.078459114f,0.99691731f}, +}; +static const ne10_fft_cpx_float32_t ne10_twiddles_120[120] = { +{1.0000000f,0.0000000f}, {1.0000000f,-0.0000000f}, {1.0000000f,-0.0000000f}, +{1.0000000f,-0.0000000f}, {0.91354543f,-0.40673664f}, {0.66913056f,-0.74314487f}, +{1.0000000f,-0.0000000f}, {0.66913056f,-0.74314487f}, {-0.10452851f,-0.99452192f}, +{1.0000000f,-0.0000000f}, {0.30901697f,-0.95105654f}, {-0.80901700f,-0.58778518f}, +{1.0000000f,-0.0000000f}, {-0.10452851f,-0.99452192f}, {-0.97814757f,0.20791179f}, +{1.0000000f,-0.0000000f}, {0.97814763f,-0.20791170f}, {0.91354543f,-0.40673664f}, +{0.80901700f,-0.58778524f}, {0.66913056f,-0.74314487f}, {0.49999997f,-0.86602545f}, +{0.30901697f,-0.95105654f}, {0.10452842f,-0.99452192f}, {-0.10452851f,-0.99452192f}, +{-0.30901703f,-0.95105648f}, {-0.50000006f,-0.86602533f}, {-0.66913068f,-0.74314475f}, +{-0.80901700f,-0.58778518f}, {-0.91354549f,-0.40673658f}, {-0.97814763f,-0.20791161f}, +{1.0000000f,-0.0000000f}, {0.99862951f,-0.052335959f}, {0.99452192f,-0.10452846f}, +{0.98768836f,-0.15643448f}, {0.97814763f,-0.20791170f}, {0.96592581f,-0.25881904f}, +{0.95105648f,-0.30901700f}, {0.93358040f,-0.35836795f}, {0.91354543f,-0.40673664f}, +{0.89100653f,-0.45399052f}, {0.86602545f,-0.50000000f}, {0.83867055f,-0.54463905f}, +{0.80901700f,-0.58778524f}, {0.77714598f,-0.62932038f}, {0.74314475f,-0.66913062f}, +{0.70710677f,-0.70710683f}, {0.66913056f,-0.74314487f}, {0.62932038f,-0.77714598f}, +{0.58778524f,-0.80901700f}, {0.54463899f,-0.83867055f}, {0.49999997f,-0.86602545f}, +{0.45399052f,-0.89100653f}, {0.40673661f,-0.91354549f}, {0.35836786f,-0.93358046f}, +{0.30901697f,-0.95105654f}, {0.25881907f,-0.96592581f}, {0.20791166f,-0.97814763f}, +{0.15643437f,-0.98768836f}, {0.10452842f,-0.99452192f}, {0.052335974f,-0.99862951f}, +{1.0000000f,-0.0000000f}, {0.99452192f,-0.10452846f}, {0.97814763f,-0.20791170f}, +{0.95105648f,-0.30901700f}, {0.91354543f,-0.40673664f}, {0.86602545f,-0.50000000f}, +{0.80901700f,-0.58778524f}, {0.74314475f,-0.66913062f}, {0.66913056f,-0.74314487f}, +{0.58778524f,-0.80901700f}, {0.49999997f,-0.86602545f}, {0.40673661f,-0.91354549f}, +{0.30901697f,-0.95105654f}, {0.20791166f,-0.97814763f}, {0.10452842f,-0.99452192f}, +{-4.3711388e-08f,-1.0000000f}, {-0.10452851f,-0.99452192f}, {-0.20791174f,-0.97814757f}, +{-0.30901703f,-0.95105648f}, {-0.40673670f,-0.91354543f}, {-0.50000006f,-0.86602533f}, +{-0.58778518f,-0.80901700f}, {-0.66913068f,-0.74314475f}, {-0.74314493f,-0.66913044f}, +{-0.80901700f,-0.58778518f}, {-0.86602539f,-0.50000006f}, {-0.91354549f,-0.40673658f}, +{-0.95105654f,-0.30901679f}, {-0.97814763f,-0.20791161f}, {-0.99452192f,-0.10452849f}, +{1.0000000f,-0.0000000f}, {0.98768836f,-0.15643448f}, {0.95105648f,-0.30901700f}, +{0.89100653f,-0.45399052f}, {0.80901700f,-0.58778524f}, {0.70710677f,-0.70710683f}, +{0.58778524f,-0.80901700f}, {0.45399052f,-0.89100653f}, {0.30901697f,-0.95105654f}, +{0.15643437f,-0.98768836f}, {-4.3711388e-08f,-1.0000000f}, 
{-0.15643445f,-0.98768836f}, +{-0.30901703f,-0.95105648f}, {-0.45399061f,-0.89100647f}, {-0.58778518f,-0.80901700f}, +{-0.70710677f,-0.70710677f}, {-0.80901700f,-0.58778518f}, {-0.89100659f,-0.45399037f}, +{-0.95105654f,-0.30901679f}, {-0.98768836f,-0.15643445f}, {-1.0000000f,8.7422777e-08f}, +{-0.98768830f,0.15643461f}, {-0.95105654f,0.30901697f}, {-0.89100653f,0.45399055f}, +{-0.80901694f,0.58778536f}, {-0.70710665f,0.70710689f}, {-0.58778507f,0.80901712f}, +{-0.45399022f,0.89100665f}, {-0.30901709f,0.95105648f}, {-0.15643452f,0.98768830f}, +}; +static const ne10_fft_cpx_float32_t ne10_twiddles_60[60] = { +{1.0000000f,0.0000000f}, {1.0000000f,-0.0000000f}, {1.0000000f,-0.0000000f}, +{1.0000000f,-0.0000000f}, {0.91354543f,-0.40673664f}, {0.66913056f,-0.74314487f}, +{1.0000000f,-0.0000000f}, {0.66913056f,-0.74314487f}, {-0.10452851f,-0.99452192f}, +{1.0000000f,-0.0000000f}, {0.30901697f,-0.95105654f}, {-0.80901700f,-0.58778518f}, +{1.0000000f,-0.0000000f}, {-0.10452851f,-0.99452192f}, {-0.97814757f,0.20791179f}, +{1.0000000f,-0.0000000f}, {0.99452192f,-0.10452846f}, {0.97814763f,-0.20791170f}, +{0.95105648f,-0.30901700f}, {0.91354543f,-0.40673664f}, {0.86602545f,-0.50000000f}, +{0.80901700f,-0.58778524f}, {0.74314475f,-0.66913062f}, {0.66913056f,-0.74314487f}, +{0.58778524f,-0.80901700f}, {0.49999997f,-0.86602545f}, {0.40673661f,-0.91354549f}, +{0.30901697f,-0.95105654f}, {0.20791166f,-0.97814763f}, {0.10452842f,-0.99452192f}, +{1.0000000f,-0.0000000f}, {0.97814763f,-0.20791170f}, {0.91354543f,-0.40673664f}, +{0.80901700f,-0.58778524f}, {0.66913056f,-0.74314487f}, {0.49999997f,-0.86602545f}, +{0.30901697f,-0.95105654f}, {0.10452842f,-0.99452192f}, {-0.10452851f,-0.99452192f}, +{-0.30901703f,-0.95105648f}, {-0.50000006f,-0.86602533f}, {-0.66913068f,-0.74314475f}, +{-0.80901700f,-0.58778518f}, {-0.91354549f,-0.40673658f}, {-0.97814763f,-0.20791161f}, +{1.0000000f,-0.0000000f}, {0.95105648f,-0.30901700f}, {0.80901700f,-0.58778524f}, +{0.58778524f,-0.80901700f}, {0.30901697f,-0.95105654f}, {-4.3711388e-08f,-1.0000000f}, +{-0.30901703f,-0.95105648f}, {-0.58778518f,-0.80901700f}, {-0.80901700f,-0.58778518f}, +{-0.95105654f,-0.30901679f}, {-1.0000000f,8.7422777e-08f}, {-0.95105654f,0.30901697f}, +{-0.80901694f,0.58778536f}, {-0.58778507f,0.80901712f}, {-0.30901709f,0.95105648f}, +}; +static const ne10_fft_state_float32_t ne10_fft_state_float32_t_480 = { +120, +(ne10_int32_t *)ne10_factors_480, +(ne10_fft_cpx_float32_t *)ne10_twiddles_480, +NULL, +(ne10_fft_cpx_float32_t *)&ne10_twiddles_480[120], +/* is_forward_scaled = true */ +(ne10_int32_t) 1, +/* is_backward_scaled = false */ +(ne10_int32_t) 0, +}; +static const arch_fft_state cfg_arch_480 = { +1, +(void *)&ne10_fft_state_float32_t_480, +}; + +static const ne10_fft_state_float32_t ne10_fft_state_float32_t_240 = { +60, +(ne10_int32_t *)ne10_factors_240, +(ne10_fft_cpx_float32_t *)ne10_twiddles_240, +NULL, +(ne10_fft_cpx_float32_t *)&ne10_twiddles_240[60], +/* is_forward_scaled = true */ +(ne10_int32_t) 1, +/* is_backward_scaled = false */ +(ne10_int32_t) 0, +}; +static const arch_fft_state cfg_arch_240 = { +1, +(void *)&ne10_fft_state_float32_t_240, +}; + +static const ne10_fft_state_float32_t ne10_fft_state_float32_t_120 = { +30, +(ne10_int32_t *)ne10_factors_120, +(ne10_fft_cpx_float32_t *)ne10_twiddles_120, +NULL, +(ne10_fft_cpx_float32_t *)&ne10_twiddles_120[30], +/* is_forward_scaled = true */ +(ne10_int32_t) 1, +/* is_backward_scaled = false */ +(ne10_int32_t) 0, +}; +static const arch_fft_state cfg_arch_120 = { +1, +(void 
*)&ne10_fft_state_float32_t_120, +}; + +static const ne10_fft_state_float32_t ne10_fft_state_float32_t_60 = { +15, +(ne10_int32_t *)ne10_factors_60, +(ne10_fft_cpx_float32_t *)ne10_twiddles_60, +NULL, +(ne10_fft_cpx_float32_t *)&ne10_twiddles_60[15], +/* is_forward_scaled = true */ +(ne10_int32_t) 1, +/* is_backward_scaled = false */ +(ne10_int32_t) 0, +}; +static const arch_fft_state cfg_arch_60 = { +1, +(void *)&ne10_fft_state_float32_t_60, +}; + +#endif /* end NE10_FFT_PARAMS48000_960 */ diff --git a/thirdparty/opus/celt/tests/test_unit_cwrs32.c b/thirdparty/opus/celt/tests/test_unit_cwrs32.c new file mode 100644 index 000000000000..36dd8af5f5c4 --- /dev/null +++ b/thirdparty/opus/celt/tests/test_unit_cwrs32.c @@ -0,0 +1,161 @@ +/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation, + Gregory Maxwell + Written by Jean-Marc Valin, Gregory Maxwell, and Timothy B. Terriberry */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <string.h> + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#else +#define TEST_CUSTOM_MODES +#endif + +#define CELT_C +#include "stack_alloc.h" +#include "entenc.c" +#include "entdec.c" +#include "entcode.c" +#include "cwrs.c" +#include "mathops.c" +#include "rate.h" + +#define NMAX (240) +#define KMAX (128) + +#ifdef TEST_CUSTOM_MODES + +#define NDIMS (44) +static const int pn[NDIMS]={ + 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 18, 20, 22, + 24, 26, 28, 30, 32, 36, 40, 44, 48, + 52, 56, 60, 64, 72, 80, 88, 96, 104, + 112, 120, 128, 144, 160, 176, 192, 208 +}; +static const int pkmax[NDIMS]={ + 128, 128, 128, 128, 88, 52, 36, 26, 22, + 18, 16, 15, 13, 12, 12, 11, 10, 9, + 9, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 6, 5, 5, 5, 5, 5, 5, + 4, 4, 4, 4, 4, 4, 4, 4 +}; + +#else /* TEST_CUSTOM_MODES */ + +#define NDIMS (22) +static const int pn[NDIMS]={ + 2, 3, 4, 6, 8, 9, 11, 12, 16, + 18, 22, 24, 32, 36, 44, 48, 64, 72, + 88, 96, 144, 176 +}; +static const int pkmax[NDIMS]={ + 128, 128, 128, 88, 36, 26, 18, 16, 12, + 11, 9, 9, 7, 7, 6, 6, 5, 5, + 5, 5, 4, 4 +}; + +#endif + +int main(void){ + int t; + int n; + ALLOC_STACK; + for(t=0;t<NDIMS;t++){ + int pseudo; + n=pn[t]; + for(pseudo=1;pseudo<41;pseudo++){ + int k; +#if defined(SMALL_FOOTPRINT) + opus_uint32 uu[KMAX+2U]; +#endif + opus_uint32 inc; + opus_uint32 nc; + opus_uint32 i; + k=get_pulses(pseudo); + if(k>pkmax[t])break; + printf("Testing CWRS with N=%i, K=%i...\n",n,k); +#if defined(SMALL_FOOTPRINT) + nc=ncwrs_urow(n,k,uu); +#else + nc=CELT_PVQ_V(n,k); +#endif + inc=nc/20000; + if(inc<1)inc=1; + for(i=0;i<nc;i+=inc){ +#if defined(SMALL_FOOTPRINT) + opus_uint32 u[KMAX+2U]; +#endif + int y[NMAX]; + opus_uint32 v; + opus_uint32 ii; +#if defined(SMALL_FOOTPRINT) + memcpy(u,uu,(k+2U)*sizeof(*u)); + cwrsi(n,k,i,y,u); +#else + cwrsi(n,k,i,y); +#endif + /*printf("%6u of %u:",i,nc); + for(j=0;j<n;j++)printf(" %+3i",y[j]); + printf(" ->");*/ +#if defined(SMALL_FOOTPRINT) + ii=icwrs(n,k,&v,y,u); +#else + ii=icwrs(n,y); + v=CELT_PVQ_V(n,k); +#endif + if(ii!=i){ + fprintf(stderr,"Combination-index mismatch (%lu!=%lu).\n", + (long)ii,(long)i); + return 1; + } + if(v!=nc){ + fprintf(stderr,"Combination count mismatch (%lu!=%lu).\n", + (long)v,(long)nc); + return 2; + } + /*printf(" %6u\n",i);*/ + } + /*printf("\n");*/ + } + } + return 0; +} diff --git a/thirdparty/opus/celt/tests/test_unit_dft.c b/thirdparty/opus/celt/tests/test_unit_dft.c new file mode 100644 index 000000000000..6166eb0e4fa9 --- /dev/null +++ b/thirdparty/opus/celt/tests/test_unit_dft.c @@ -0,0 +1,189 @@ +/* Copyright (c) 2008 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#include + +#define CELT_C +#define TEST_UNIT_DFT_C +#include "stack_alloc.h" +#include "kiss_fft.h" +#include "kiss_fft.c" +#include "mathops.c" +#include "entcode.c" + +#if defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) +# include "x86/x86cpu.c" +#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/armcpu.c" +# include "celt_lpc.c" +# include "pitch.c" +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_neon_intr.c" +# if defined(HAVE_ARM_NE10) +# include "mdct.c" +# include "arm/celt_ne10_fft.c" +# include "arm/celt_ne10_mdct.c" +# endif +# endif +# include "arm/arm_celt_map.c" +#endif + +#ifndef M_PI +#define M_PI 3.141592653 +#endif + +int ret = 0; + +void check(kiss_fft_cpx * in,kiss_fft_cpx * out,int nfft,int isinverse) +{ + int bin,k; + double errpow=0,sigpow=0, snr; + + for (bin=0;bin1) { + int k; + for (k=1;k +#include +#include +#include +#include "entcode.h" +#include "entenc.h" +#include "entdec.h" +#include + +#include "entenc.c" +#include "entdec.c" +#include "entcode.c" + +#ifndef M_LOG2E +# define M_LOG2E 1.4426950408889634074 +#endif +#define DATA_SIZE 10000000 +#define DATA_SIZE2 10000 + +int main(int _argc,char **_argv){ + ec_enc enc; + ec_dec dec; + long nbits; + long nbits2; + double entropy; + int ft; + int ftb; + int sz; + int i; + int ret; + unsigned int sym; + unsigned int seed; + unsigned char *ptr; + const char *env_seed; + ret=0; + entropy=0; + if (_argc > 2) { + fprintf(stderr, "Usage: %s []\n", _argv[0]); + return 1; + } + env_seed = getenv("SEED"); + if (_argc > 1) + seed = atoi(_argv[1]); + else if (env_seed) + seed = atoi(env_seed); + else + seed = time(NULL); + /*Testing encoding of raw bit values.*/ + ptr = (unsigned char *)malloc(DATA_SIZE); + ec_enc_init(&enc,ptr, DATA_SIZE); + for(ft=2;ft<1024;ft++){ + for(i=0;i>(rand()%11U))+1U)+10; + sz=rand()/((RAND_MAX>>(rand()%9U))+1U); + data=(unsigned *)malloc(sz*sizeof(*data)); + tell=(unsigned *)malloc((sz+1)*sizeof(*tell)); + ec_enc_init(&enc,ptr,DATA_SIZE2); + zeros = rand()%13==0; + tell[0]=ec_tell_frac(&enc); + for(j=0;j>(rand()%9U))+1U); + logp1=(unsigned *)malloc(sz*sizeof(*logp1)); + data=(unsigned *)malloc(sz*sizeof(*data)); + tell=(unsigned *)malloc((sz+1)*sizeof(*tell)); + enc_method=(unsigned *)malloc(sz*sizeof(*enc_method)); + ec_enc_init(&enc,ptr,DATA_SIZE2); + tell[0]=ec_tell_frac(&enc); + for(j=0;j>1)+1); + logp1[j]=(rand()%15)+1; + enc_method[j]=rand()/((RAND_MAX>>2)+1); + switch(enc_method[j]){ + case 0:{ + ec_encode(&enc,data[j]?(1<>2)+1); + switch(dec_method){ + case 0:{ + fs=ec_decode(&dec,1<=(1<=(1< +#include +#include "laplace.h" +#define CELT_C +#include "stack_alloc.h" + +#include "entenc.c" +#include "entdec.c" +#include "entcode.c" +#include "laplace.c" + +#define DATA_SIZE 40000 + +int ec_laplace_get_start_freq(int decay) +{ + opus_uint32 ft = 32768 - LAPLACE_MINP*(2*LAPLACE_NMIN+1); + int fs = (ft*(16384-decay))/(16384+decay); + return fs+LAPLACE_MINP; +} + +int main(void) +{ + int i; + int ret = 0; + ec_enc enc; + ec_dec dec; + unsigned char *ptr; + int val[10000], decay[10000]; + ALLOC_STACK; + ptr = (unsigned char *)malloc(DATA_SIZE); + ec_enc_init(&enc,ptr,DATA_SIZE); + + val[0] = 3; decay[0] = 6000; + val[1] = 0; decay[1] = 5800; + val[2] = -1; decay[2] = 5600; + for (i=3;i<10000;i++) + { + val[i] = rand()%15-7; + decay[i] = rand()%11000+5000; + } + for (i=0;i<10000;i++) + 
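/*Encode each value with the start frequency derived from its decay.*/ +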
ec_laplace_encode(&enc, &val[i], + ec_laplace_get_start_freq(decay[i]), decay[i]); + + ec_enc_done(&enc); + + ec_dec_init(&dec,ec_get_buffer(&enc),ec_range_bytes(&enc)); + + for (i=0;i<10000;i++) + { + int d = ec_laplace_decode(&dec, + ec_laplace_get_start_freq(decay[i]), decay[i]); + if (d != val[i]) + { + fprintf (stderr, "Got %d instead of %d\n", d, val[i]); + ret = 1; + } + } + + free(ptr); + return ret; +} diff --git a/thirdparty/opus/celt/tests/test_unit_mathops.c b/thirdparty/opus/celt/tests/test_unit_mathops.c new file mode 100644 index 000000000000..fd3319da91ce --- /dev/null +++ b/thirdparty/opus/celt/tests/test_unit_mathops.c @@ -0,0 +1,304 @@ +/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation, + Gregory Maxwell + Written by Jean-Marc Valin, Gregory Maxwell, and Timothy B. Terriberry */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#define CELT_C + +#include +#include +#include "mathops.c" +#include "entenc.c" +#include "entdec.c" +#include "entcode.c" +#include "bands.c" +#include "quant_bands.c" +#include "laplace.c" +#include "vq.c" +#include "cwrs.c" +#include "pitch.c" +#include "celt_lpc.c" +#include "celt.c" + +#if defined(OPUS_X86_MAY_HAVE_SSE) || defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) +# if defined(OPUS_X86_MAY_HAVE_SSE) +# include "x86/pitch_sse.c" +# endif +# if defined(OPUS_X86_MAY_HAVE_SSE2) +# include "x86/pitch_sse2.c" +# endif +# if defined(OPUS_X86_MAY_HAVE_SSE4_1) +# include "x86/pitch_sse4_1.c" +# include "x86/celt_lpc_sse.c" +# endif +# include "x86/x86_celt_map.c" +#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/armcpu.c" +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_neon_intr.c" +# if defined(HAVE_ARM_NE10) +# include "kiss_fft.c" +# include "mdct.c" +# include "arm/celt_ne10_fft.c" +# include "arm/celt_ne10_mdct.c" +# endif +# endif +# include "arm/arm_celt_map.c" +#endif + +#ifdef FIXED_POINT +#define WORD "%d" +#else +#define WORD "%f" +#endif + +int ret = 0; + +void testdiv(void) +{ + opus_int32 i; + for (i=1;i<=327670;i++) + { + double prod; + opus_val32 val; + val = celt_rcp(i); +#ifdef FIXED_POINT + prod = (1./32768./65526.)*val*i; +#else + prod = val*i; +#endif + if (fabs(prod-1) > .00025) + { + fprintf (stderr, "div failed: 1/%d="WORD" (product = %f)\n", i, val, prod); + ret = 1; + } + } +} + +void testsqrt(void) +{ + opus_int32 i; + for (i=1;i<=1000000000;i++) + { + double ratio; + opus_val16 val; + val = celt_sqrt(i); + ratio = val/sqrt(i); + if (fabs(ratio - 1) > .0005 && fabs(val-sqrt(i)) > 2) + { + fprintf (stderr, "sqrt failed: sqrt(%d)="WORD" (ratio = %f)\n", i, val, ratio); + ret = 1; + } + i+= i>>10; + } +} + +void testbitexactcos(void) +{ + int i; + opus_int32 min_d,max_d,last,chk; + chk=max_d=0; + last=min_d=32767; + for(i=64;i<=16320;i++) + { + opus_int32 d; + opus_int32 q=bitexact_cos(i); + chk ^= q*i; + d = last - q; + if (d>max_d)max_d=d; + if (dmax_d)max_d=d; + if (d0.0009) + { + fprintf (stderr, "celt_log2 failed: fabs((1.442695040888963387*log(x))-celt_log2(x))>0.001 (x = %f, error = %f)\n", x,error); + ret = 1; + } + } +} + +void testexp2(void) +{ + float x; + for (x=-11.0;x<24.0;x+=0.0007) + { + float error = fabs(x-(1.442695040888963387*log(celt_exp2(x)))); + if (error>0.0002) + { + fprintf (stderr, "celt_exp2 failed: fabs(x-(1.442695040888963387*log(celt_exp2(x))))>0.0005 (x = %f, error = %f)\n", x,error); + ret = 1; + } + } +} + +void testexp2log2(void) +{ + float x; + for (x=-11.0;x<24.0;x+=0.0007) + { + float error = fabs(x-(celt_log2(celt_exp2(x)))); + if (error>0.001) + { + fprintf (stderr, "celt_log2/celt_exp2 failed: fabs(x-(celt_log2(celt_exp2(x))))>0.001 (x = %f, error = %f)\n", x,error); + ret = 1; + } + } +} +#else +void testlog2(void) +{ + opus_val32 x; + for (x=8;x<1073741824;x+=(x>>3)) + { + float error = fabs((1.442695040888963387*log(x/16384.0))-celt_log2(x)/1024.0); + if (error>0.003) + { + fprintf (stderr, "celt_log2 failed: x = %ld, error = %f\n", (long)x,error); + ret = 1; + } + } +} + +void testexp2(void) +{ + opus_val16 x; + for (x=-32768;x<15360;x++) + { + float error1 = fabs(x/1024.0-(1.442695040888963387*log(celt_exp2(x)/65536.0))); + float error2 = fabs(exp(0.6931471805599453094*x/1024.0)-celt_exp2(x)/65536.0); + if 
(error1>0.0002&&error2>0.00004) + { + fprintf (stderr, "celt_exp2 failed: x = "WORD", error1 = %f, error2 = %f\n", x,error1,error2); + ret = 1; + } + } +} + +void testexp2log2(void) +{ + opus_val32 x; + for (x=8;x<65536;x+=(x>>3)) + { + float error = fabs(x-0.25*celt_exp2(celt_log2(x)))/16384; + if (error>0.004) + { + fprintf (stderr, "celt_log2/celt_exp2 failed: fabs(x-(celt_exp2(celt_log2(x))))>0.001 (x = %ld, error = %f)\n", (long)x,error); + ret = 1; + } + } +} + +void testilog2(void) +{ + opus_val32 x; + for (x=1;x<=268435455;x+=127) + { + opus_val32 lg; + opus_val32 y; + + lg = celt_ilog2(x); + if (lg<0 || lg>=31) + { + printf("celt_ilog2 failed: 0<=celt_ilog2(x)<31 (x = %d, celt_ilog2(x) = %d)\n",x,lg); + ret = 1; + } + y = 1<<lg; + if (x<y || (x>>1)>=y) + { + printf("celt_ilog2 failed: 2**celt_ilog2(x)<=x<2**(celt_ilog2(x)+1) (x = %d, 2**celt_ilog2(x) = %d)\n",x,y); + ret = 1; + } + } +} +#endif + +int main(void) +{ + testbitexactcos(); + testbitexactlog2tan(); + testdiv(); + testsqrt(); + testlog2(); + testexp2(); + testexp2log2(); +#ifdef FIXED_POINT + testilog2(); +#endif + return ret; +} diff --git a/thirdparty/opus/celt/tests/test_unit_mdct.c b/thirdparty/opus/celt/tests/test_unit_mdct.c new file mode 100644 index 000000000000..8dbb9caa2e8c --- /dev/null +++ b/thirdparty/opus/celt/tests/test_unit_mdct.c @@ -0,0 +1,230 @@ +/* Copyright (c) 2008-2011 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define SKIP_CONFIG_H + +#ifndef CUSTOM_MODES +#define CUSTOM_MODES +#endif + +#include + +#define CELT_C +#include "mdct.h" +#include "stack_alloc.h" + +#include "kiss_fft.c" +#include "mdct.c" +#include "mathops.c" +#include "entcode.c" + +#if defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) +# include "x86/x86cpu.c" +#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/armcpu.c" +# include "pitch.c" +# include "celt_lpc.c" +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_neon_intr.c" +# if defined(HAVE_ARM_NE10) +# include "arm/celt_ne10_fft.c" +# include "arm/celt_ne10_mdct.c" +# endif +# endif +# include "arm/arm_celt_map.c" +#endif + +#ifndef M_PI +#define M_PI 3.141592653 +#endif + +int ret = 0; +void check(kiss_fft_scalar * in,kiss_fft_scalar * out,int nfft,int isinverse) +{ + int bin,k; + double errpow=0,sigpow=0; + double snr; + for (bin=0;bin1) { + int k; + for (k=1;k +#include +#include "vq.c" +#include "cwrs.c" +#include "entcode.c" +#include "entenc.c" +#include "entdec.c" +#include "mathops.c" +#include "bands.h" +#include "pitch.c" +#include "celt_lpc.c" +#include "celt.c" +#include + +#if defined(OPUS_X86_MAY_HAVE_SSE) || defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) +# if defined(OPUS_X86_MAY_HAVE_SSE) +# include "x86/pitch_sse.c" +# endif +# if defined(OPUS_X86_MAY_HAVE_SSE2) +# include "x86/pitch_sse2.c" +# endif +# if defined(OPUS_X86_MAY_HAVE_SSE4_1) +# include "x86/pitch_sse4_1.c" +# include "x86/celt_lpc_sse.c" +# endif +# include "x86/x86_celt_map.c" +#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/armcpu.c" +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +# include "arm/celt_neon_intr.c" +# if defined(HAVE_ARM_NE10) +# include "kiss_fft.c" +# include "mdct.c" +# include "arm/celt_ne10_fft.c" +# include "arm/celt_ne10_mdct.c" +# endif +# endif +# include "arm/arm_celt_map.c" +#endif + +#define MAX_SIZE 100 + +int ret=0; +void test_rotation(int N, int K) +{ + int i; + double err = 0, ener = 0, snr, snr0; + opus_val16 x0[MAX_SIZE]; + opus_val16 x1[MAX_SIZE]; + for (i=0;i 20) + { + fprintf(stderr, "FAIL!\n"); + ret = 1; + } +} + +int main(void) +{ + ALLOC_STACK; + test_rotation(15, 3); + test_rotation(23, 5); + test_rotation(50, 3); + test_rotation(80, 1); + return ret; +} diff --git a/thirdparty/opus/celt/tests/test_unit_types.c b/thirdparty/opus/celt/tests/test_unit_types.c new file mode 100644 index 000000000000..67a0fb8ed3be --- /dev/null +++ b/thirdparty/opus/celt/tests/test_unit_types.c @@ -0,0 +1,50 @@ +/* Copyright (c) 2008-2011 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opus_types.h" +#include <stdio.h> + +int main(void) +{ + opus_int16 i = 1; + i <<= 14; + if (i>>14 != 1) + { + fprintf(stderr, "opus_int16 isn't 16 bits\n"); + return 1; + } + if (sizeof(opus_int16)*2 != sizeof(opus_int32)) + { + fprintf(stderr, "16*2 != 32\n"); + return 1; + } + return 0; +} diff --git a/thirdparty/opus/celt/vq.c b/thirdparty/opus/celt/vq.c new file mode 100644 index 000000000000..d29f38fd8ef7 --- /dev/null +++ b/thirdparty/opus/celt/vq.c @@ -0,0 +1,408 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "mathops.h" +#include "cwrs.h" +#include "vq.h" +#include "arch.h" +#include "os_support.h" +#include "bands.h" +#include "rate.h" +#include "pitch.h" + +#ifndef OVERRIDE_vq_exp_rotation1 +static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s) +{ + int i; + opus_val16 ms; + celt_norm *Xptr; + Xptr = X; + ms = NEG16(s); + for (i=0;i<len-stride;i++) + { + celt_norm x1, x2; + x1 = Xptr[0]; + x2 = Xptr[stride]; + Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2), s, x1), 15)); + *Xptr++ = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15)); + } + Xptr = &X[len-2*stride-1]; + for (i=len-2*stride-1;i>=0;i--) + { + celt_norm x1, x2; + x1 = Xptr[0]; + x2 = Xptr[stride]; + Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2), s, x1), 15)); + *Xptr-- = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15)); + } +} +#endif /* OVERRIDE_vq_exp_rotation1 */ + +static void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int spread) +{ + static const int SPREAD_FACTOR[3]={15,10,5}; + int i; + opus_val16 c, s; + opus_val16 gain, theta; + int stride2=0; + int factor; + + if (2*K>=len || spread==SPREAD_NONE) + return; + factor = SPREAD_FACTOR[spread-1]; + + gain = celt_div((opus_val32)MULT16_16(Q15_ONE,len),(opus_val32)(len+factor*K)); + theta = HALF16(MULT16_16_Q15(gain,gain)); + + c = celt_cos_norm(EXTEND32(theta)); + s = celt_cos_norm(EXTEND32(SUB16(Q15ONE,theta))); /* sin(theta) */ + + if (len>=8*stride) + { + stride2 = 1; + /* This is just a simple (equivalent) way of computing sqrt(len/stride) with rounding. + It's basically incrementing stride2 as long as (stride2+0.5)^2 < len/stride. */ + while ((stride2*stride2+stride2)*stride + (stride>>2) < len) + stride2++; + } + /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for + extract_collapse_mask().*/ + len = celt_udiv(len, stride); + for (i=0;i<stride;i++) + { + if (dir < 0) + { + if (stride2) + exp_rotation1(X+i*len, len, stride2, s, c); + exp_rotation1(X+i*len, len, 1, c, s); + } else { + exp_rotation1(X+i*len, len, 1, c, -s); + if (stride2) + exp_rotation1(X+i*len, len, stride2, s, -c); + } + } +} + +static void normalise_residual(int * OPUS_RESTRICT iy, celt_norm * OPUS_RESTRICT X, + int N, opus_val32 Ryy, opus_val16 gain) +{ + int i; +#ifdef FIXED_POINT + int k; +#endif + opus_val32 t; + opus_val16 g; + +#ifdef FIXED_POINT + k = celt_ilog2(Ryy)>>1; +#endif + t = VSHR32(Ryy, 2*(k-7)); + g = MULT16_16_P15(celt_rsqrt_norm(t),gain); + + i=0; + do + X[i] = EXTRACT16(PSHR32(MULT16_16(g, iy[i]), k+1)); + while (++i < N); +} + +static unsigned extract_collapse_mask(int *iy, int N, int B) +{ + unsigned collapse_mask; + int N0; + int i; + if (B<=1) + return 1; + /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for + exp_rotation().*/ + N0 = celt_udiv(N, B); + collapse_mask = 0; + i=0; do { + int j; + unsigned tmp=0; + j=0; do { + tmp |= iy[i*N0+j]; + } while (++j<N0); + collapse_mask |= (tmp!=0)<<i; + } while (++i<B); + return collapse_mask; +} + +unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc +#ifdef RESYNTH + , opus_val16 gain +#endif + ) +{ + VARDECL(celt_norm, y); + VARDECL(int, iy); + VARDECL(opus_val16, signx); + int i, j; + opus_val16 s; + int pulsesLeft; + opus_val32 sum; + opus_val32 xy; + opus_val16 yy; + unsigned collapse_mask; + SAVE_STACK; + + celt_assert2(K>0, "alg_quant() needs at least one pulse"); + celt_assert2(N>1, "alg_quant() needs at least two dimensions"); + + ALLOC(y, N, celt_norm); + ALLOC(iy, N, int); + ALLOC(signx, N, opus_val16); + + exp_rotation(X, N, 1, B, K, spread); + + /* Get rid of the sign */ + sum = 0; + j=0; do { + if (X[j]>0) + signx[j]=1; + else { + signx[j]=-1; + X[j]=-X[j]; + } + iy[j] = 0; + y[j] = 0; + } while (++j<N); + + xy = yy = 0; + + pulsesLeft = K; + + /* Do a pre-search by projecting on the pyramid */ + if (K > (N>>1)) + { + opus_val16 rcp; + j=0; do { + sum += X[j]; + } while (++j<N); +#ifdef FIXED_POINT + if (sum <= K) +#else + /* If X is too small, just replace it with a pulse at 0 */ + /* Prevents infinities and NaNs from causing too many pulses + to be allocated. 64 is an approximation of infinity here. */ + if (!(sum > EPSILON && sum < 64)) +#endif + { + X[0] = QCONST16(1.f,14); + j=1; do + X[j]=0; + while (++j<N); + sum = QCONST16(1.f,14); + } + rcp = EXTRACT16(MULT16_32_Q16(K-1, celt_rcp(sum))); + j=0; do { +#ifdef FIXED_POINT + /* It's really important to round *towards zero* here */ + iy[j] = MULT16_16_Q15(X[j],rcp); +#else + iy[j] = (int)floor(rcp*X[j]); +#endif + y[j] = (celt_norm)iy[j]; + yy = MAC16_16(yy, y[j],y[j]); + xy = MAC16_16(xy, X[j],y[j]); + y[j] *= 2; + pulsesLeft -= iy[j]; + } while (++j<N); + } + celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass"); + + /* This should never happen, but just in case it does (e.g. on silence) + we fill the first bin with pulses. 
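(dumping the excess into iy[0] also keeps the greedy search below from having to place a huge number of pulses one at a time)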
*/ +#ifdef FIXED_POINT_DEBUG + celt_assert2(pulsesLeft<=N+3, "Not enough pulses in the quick pass"); +#endif + if (pulsesLeft > N+3) + { + opus_val16 tmp = (opus_val16)pulsesLeft; + yy = MAC16_16(yy, tmp, tmp); + yy = MAC16_16(yy, tmp, y[0]); + iy[0] += pulsesLeft; + pulsesLeft=0; + } + + s = 1; + for (i=0;i= best_num/best_den, but that way + we can do it without any division */ + /* OPT: Make sure to use conditional moves here */ + if (MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num)) + { + best_den = Ryy; + best_num = Rxy; + best_id = j; + } + } while (++j0, "alg_unquant() needs at least one pulse"); + celt_assert2(N>1, "alg_unquant() needs at least two dimensions"); + ALLOC(iy, N, int); + Ryy = decode_pulses(iy, N, K, dec); + normalise_residual(iy, X, N, Ryy, gain); + exp_rotation(X, N, -1, B, K, spread); + collapse_mask = extract_collapse_mask(iy, N, B); + RESTORE_STACK; + return collapse_mask; +} + +#ifndef OVERRIDE_renormalise_vector +void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch) +{ + int i; +#ifdef FIXED_POINT + int k; +#endif + opus_val32 E; + opus_val16 g; + opus_val32 t; + celt_norm *xptr; + E = EPSILON + celt_inner_prod(X, X, N, arch); +#ifdef FIXED_POINT + k = celt_ilog2(E)>>1; +#endif + t = VSHR32(E, 2*(k-7)); + g = MULT16_16_P15(celt_rsqrt_norm(t),gain); + + xptr = X; + for (i=0;i header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +#if (!defined( _MSC_VER ) || ( _MSC_VER >= 1800 )) + +/* Define to 1 if you have the `lrint' function. */ +#define HAVE_LRINT 1 + +/* Define to 1 if you have the `lrintf' function. */ +#define HAVE_LRINTF 1 + +#endif + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to the sub-directory in which libtool stores uninstalled libraries. 
+ */ +#define LT_OBJDIR ".libs/" + +#ifdef OPUS_ARM_OPT +/* Make use of ARM asm optimization */ +#define OPUS_ARM_ASM 1 + +/* Use generic ARMv4 inline asm optimizations */ +#define OPUS_ARM_INLINE_ASM 1 + +/* Use ARMv5E inline asm optimizations */ +#define OPUS_ARM_INLINE_EDSP 1 + +/* Use ARMv6 inline asm optimizations */ +#define OPUS_ARM_INLINE_MEDIA 1 + +/* Use ARM NEON inline asm optimizations */ +#define OPUS_ARM_INLINE_NEON 1 + +/* Define if assembler supports EDSP instructions */ +#define OPUS_ARM_MAY_HAVE_EDSP 1 + +/* Define if assembler supports ARMv6 media instructions */ +#define OPUS_ARM_MAY_HAVE_MEDIA 1 + +/* Define if compiler supports NEON instructions */ +#define OPUS_ARM_MAY_HAVE_NEON 1 +#endif // OPUS_ARM_OPT + +#ifdef OPUS_ARM64_OPT +/* Make use of ARM asm optimization */ +#define OPUS_ARM_ASM 1 + +/* Use ARMv6 inline asm optimizations */ +#define OPUS_ARM_INLINE_MEDIA 1 // work + +/* Use ARM NEON inline asm optimizations */ +#define OPUS_ARM_INLINE_NEON 1 // work + +/* Define if assembler supports EDSP instructions */ +#define OPUS_ARM_MAY_HAVE_EDSP 1 // work + +/* Define if assembler supports ARMv6 media instructions */ +#define OPUS_ARM_MAY_HAVE_MEDIA 1 // work + +/* Define if compiler supports NEON instructions */ +#define OPUS_ARM_MAY_HAVE_NEON 1 + +#endif // OPUS_ARM64_OPT + +/* This is a build of OPUS */ +#define OPUS_BUILD /**/ + +#ifndef WIN32 + /* Use C99 variable-size arrays */ + #define VAR_ARRAYS 1 +#else + /* Fixes VS 2013 compile error */ + #define USE_ALLOCA 1 +#endif + +#ifndef OPUS_FIXED_POINT +#define FLOAT_APPROX 1 +#endif + + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#if (!defined( _MSC_VER ) || ( _MSC_VER >= 1800 )) +#define restrict __restrict +#else +#undef restrict +#endif +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif diff --git a/thirdparty/opus/info.c b/thirdparty/opus/info.c new file mode 100644 index 000000000000..c36f9a9ee10b --- /dev/null +++ b/thirdparty/opus/info.c @@ -0,0 +1,758 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 2012 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ********************************************************************/ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "internal.h" +#include <limits.h> +#include <string.h> + +static unsigned op_parse_uint16le(const unsigned char *_data){ + return _data[0]|_data[1]<<8; +} + +static int op_parse_int16le(const unsigned char *_data){ + int ret; + ret=_data[0]|_data[1]<<8; + return (ret^0x8000)-0x8000; +} + +static opus_uint32 op_parse_uint32le(const unsigned char *_data){ + return _data[0]|(opus_uint32)_data[1]<<8| + (opus_uint32)_data[2]<<16|(opus_uint32)_data[3]<<24; +} + +static opus_uint32 op_parse_uint32be(const unsigned char *_data){ + return _data[3]|(opus_uint32)_data[2]<<8| + (opus_uint32)_data[1]<<16|(opus_uint32)_data[0]<<24; +} + +int opus_head_parse(OpusHead *_head,const unsigned char *_data,size_t _len){ + OpusHead head; + if(_len<8)return OP_ENOTFORMAT; + if(memcmp(_data,"OpusHead",8)!=0)return OP_ENOTFORMAT; + if(_len<9)return OP_EBADHEADER; + head.version=_data[8]; + if(head.version>15)return OP_EVERSION; + if(_len<19)return OP_EBADHEADER; + head.channel_count=_data[9]; + head.pre_skip=op_parse_uint16le(_data+10); + head.input_sample_rate=op_parse_uint32le(_data+12); + head.output_gain=op_parse_int16le(_data+16); + head.mapping_family=_data[18]; + if(head.mapping_family==0){ + if(head.channel_count<1||head.channel_count>2)return OP_EBADHEADER; + if(head.version<=1&&_len>19)return OP_EBADHEADER; + head.stream_count=1; + head.coupled_count=head.channel_count-1; + if(_head!=NULL){ + _head->mapping[0]=0; + _head->mapping[1]=1; + } + } + else if(head.mapping_family==1){ + size_t size; + int ci; + if(head.channel_count<1||head.channel_count>8)return OP_EBADHEADER; + size=21+head.channel_count; + if(_len<size)return OP_EBADHEADER; + head.stream_count=_data[19]; + if(head.stream_count<1)return OP_EBADHEADER; + head.coupled_count=_data[20]; + if(head.coupled_count>head.stream_count)return OP_EBADHEADER; + for(ci=0;ci<head.channel_count;ci++){ + if(_data[21+ci]>=head.stream_count+head.coupled_count + &&_data[21+ci]!=255){ + return OP_EBADHEADER; + } + } + if(_head!=NULL)memcpy(_head->mapping,_data+21,head.channel_count); + } + /*General purpose players should not attempt to play back content with + channel mapping family 255.*/ + else if(head.mapping_family==255)return OP_EIMPL; + /*No other channel mapping families are currently defined.*/ + else return OP_EBADHEADER; + if(_head!=NULL)memcpy(_head,&head,head.mapping-(unsigned char *)&head); + return 0; +} + +void opus_tags_init(OpusTags *_tags){ + memset(_tags,0,sizeof(*_tags)); +} + +void opus_tags_clear(OpusTags *_tags){ + int ncomments; + int ci; + ncomments=_tags->comments; + if(_tags->user_comments!=NULL)ncomments++; + for(ci=ncomments;ci-->0;)_ogg_free(_tags->user_comments[ci]); + _ogg_free(_tags->user_comments); + _ogg_free(_tags->comment_lengths); + _ogg_free(_tags->vendor); +} + +/*Ensure there's room for up to _ncomments comments.*/ +static int op_tags_ensure_capacity(OpusTags *_tags,size_t _ncomments){ + char **user_comments; + int *comment_lengths; + int cur_ncomments; + char *binary_suffix_data; + int binary_suffix_len; + size_t size; + if(OP_UNLIKELY(_ncomments>=(size_t)INT_MAX))return OP_EFAULT; + size=sizeof(*_tags->comment_lengths)*(_ncomments+1); + if(size/sizeof(*_tags->comment_lengths)!=_ncomments+1)return OP_EFAULT; + cur_ncomments=_tags->comments; + comment_lengths=_tags->comment_lengths; + 
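/*The arrays keep one hidden slot past the last comment for the binary suffix; it has to be carried across the reallocation below.*/ +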
binary_suffix_len=comment_lengths==NULL?0:comment_lengths[cur_ncomments]; + comment_lengths=(int *)_ogg_realloc(_tags->comment_lengths,size); + if(OP_UNLIKELY(comment_lengths==NULL))return OP_EFAULT; + comment_lengths[_ncomments]=binary_suffix_len; + _tags->comment_lengths=comment_lengths; + size=sizeof(*_tags->user_comments)*(_ncomments+1); + if(size/sizeof(*_tags->user_comments)!=_ncomments+1)return OP_EFAULT; + user_comments=_tags->user_comments; + binary_suffix_data=user_comments==NULL?NULL:user_comments[cur_ncomments]; + user_comments=(char **)_ogg_realloc(_tags->user_comments,size); + if(OP_UNLIKELY(user_comments==NULL))return OP_EFAULT; + user_comments[_ncomments]=binary_suffix_data; + _tags->user_comments=user_comments; + return 0; +} + +/*Duplicate a (possibly non-NUL terminated) string with a known length.*/ +static char *op_strdup_with_len(const char *_s,size_t _len){ + size_t size; + char *ret; + size=sizeof(*ret)*(_len+1); + if(OP_UNLIKELY(size<_len))return NULL; + ret=(char *)_ogg_malloc(size); + if(OP_LIKELY(ret!=NULL)){ + ret=(char *)memcpy(ret,_s,sizeof(*ret)*_len); + ret[_len]='\0'; + } + return ret; +} + +/*The actual implementation of opus_tags_parse(). + Unlike the public API, this function requires _tags to already be + initialized, modifies its contents before success is guaranteed, and assumes + the caller will clear it on error.*/ +static int opus_tags_parse_impl(OpusTags *_tags, + const unsigned char *_data,size_t _len){ + opus_uint32 count; + size_t len; + int ncomments; + int ci; + len=_len; + if(len<8)return OP_ENOTFORMAT; + if(memcmp(_data,"OpusTags",8)!=0)return OP_ENOTFORMAT; + if(len<16)return OP_EBADHEADER; + _data+=8; + len-=8; + count=op_parse_uint32le(_data); + _data+=4; + len-=4; + if(count>len)return OP_EBADHEADER; + if(_tags!=NULL){ + _tags->vendor=op_strdup_with_len((char *)_data,count); + if(_tags->vendor==NULL)return OP_EFAULT; + } + _data+=count; + len-=count; + if(len<4)return OP_EBADHEADER; + count=op_parse_uint32le(_data); + _data+=4; + len-=4; + /*Check to make sure there's minimally sufficient data left in the packet.*/ + if(count>len>>2)return OP_EBADHEADER; + /*Check for overflow (the API limits this to an int).*/ + if(count>(opus_uint32)INT_MAX-1)return OP_EFAULT; + if(_tags!=NULL){ + int ret; + ret=op_tags_ensure_capacity(_tags,count); + if(ret<0)return ret; + } + ncomments=(int)count; + for(ci=0;cilen>>2)return OP_EBADHEADER; + count=op_parse_uint32le(_data); + _data+=4; + len-=4; + if(count>len)return OP_EBADHEADER; + /*Check for overflow (the API limits this to an int).*/ + if(count>(opus_uint32)INT_MAX)return OP_EFAULT; + if(_tags!=NULL){ + _tags->user_comments[ci]=op_strdup_with_len((char *)_data,count); + if(_tags->user_comments[ci]==NULL)return OP_EFAULT; + _tags->comment_lengths[ci]=(int)count; + _tags->comments=ci+1; + /*Needed by opus_tags_clear() if we fail before parsing the (optional) + binary metadata.*/ + _tags->user_comments[ci+1]=NULL; + } + _data+=count; + len-=count; + } + if(len>0&&(_data[0]&1)){ + if(len>(opus_uint32)INT_MAX)return OP_EFAULT; + if(_tags!=NULL){ + _tags->user_comments[ncomments]=(char *)_ogg_malloc(len); + if(OP_UNLIKELY(_tags->user_comments[ncomments]==NULL))return OP_EFAULT; + memcpy(_tags->user_comments[ncomments],_data,len); + _tags->comment_lengths[ncomments]=(int)len; + } + } + return 0; +} + +int opus_tags_parse(OpusTags *_tags,const unsigned char *_data,size_t _len){ + if(_tags!=NULL){ + OpusTags tags; + int ret; + opus_tags_init(&tags); + ret=opus_tags_parse_impl(&tags,_data,_len); + 
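/*Parsing goes through a local copy so the caller's struct is only written on success.*/ +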
if(ret<0)opus_tags_clear(&tags); + else *_tags=*&tags; + return ret; + } + else return opus_tags_parse_impl(NULL,_data,_len); +} + +/*The actual implementation of opus_tags_copy(). + Unlike the public API, this function requires _dst to already be + initialized, modifies its contents before success is guaranteed, and assumes + the caller will clear it on error.*/ +static int opus_tags_copy_impl(OpusTags *_dst,const OpusTags *_src){ + char *vendor; + int ncomments; + int ret; + int ci; + vendor=_src->vendor; + _dst->vendor=op_strdup_with_len(vendor,strlen(vendor)); + if(OP_UNLIKELY(_dst->vendor==NULL))return OP_EFAULT; + ncomments=_src->comments; + ret=op_tags_ensure_capacity(_dst,ncomments); + if(OP_UNLIKELY(ret<0))return ret; + for(ci=0;ci<ncomments;ci++){ + int len; + len=_src->comment_lengths[ci]; + OP_ASSERT(len>=0); + _dst->user_comments[ci]=op_strdup_with_len(_src->user_comments[ci],len); + if(OP_UNLIKELY(_dst->user_comments[ci]==NULL))return OP_EFAULT; + _dst->comment_lengths[ci]=len; + _dst->comments=ci+1; + } + if(_src->comment_lengths!=NULL){ + int len; + len=_src->comment_lengths[ncomments]; + if(len>0){ + _dst->user_comments[ncomments]=(char *)_ogg_malloc(len); + if(OP_UNLIKELY(_dst->user_comments[ncomments]==NULL))return OP_EFAULT; + memcpy(_dst->user_comments[ncomments],_src->user_comments[ncomments],len); + _dst->comment_lengths[ncomments]=len; + } + } + return 0; +} + +int opus_tags_copy(OpusTags *_dst,const OpusTags *_src){ + OpusTags dst; + int ret; + opus_tags_init(&dst); + ret=opus_tags_copy_impl(&dst,_src); + if(OP_UNLIKELY(ret<0))opus_tags_clear(&dst); + else *_dst=*&dst; + return 0; +} + +int opus_tags_add(OpusTags *_tags,const char *_tag,const char *_value){ + char *comment; + int tag_len; + int value_len; + int ncomments; + int ret; + ncomments=_tags->comments; + ret=op_tags_ensure_capacity(_tags,ncomments+1); + if(OP_UNLIKELY(ret<0))return ret; + tag_len=strlen(_tag); + value_len=strlen(_value); + /*+2 for '=' and '\0'.*/ + comment=(char *)_ogg_malloc(sizeof(*comment)*(tag_len+value_len+2)); + if(OP_UNLIKELY(comment==NULL))return OP_EFAULT; + memcpy(comment,_tag,sizeof(*comment)*tag_len); + comment[tag_len]='='; + memcpy(comment+tag_len+1,_value,sizeof(*comment)*(value_len+1)); + _tags->user_comments[ncomments]=comment; + _tags->comment_lengths[ncomments]=tag_len+value_len+1; + _tags->comments=ncomments+1; + return 0; +} + +int opus_tags_add_comment(OpusTags *_tags,const char *_comment){ + char *comment; + int comment_len; + int ncomments; + int ret; + ncomments=_tags->comments; + ret=op_tags_ensure_capacity(_tags,ncomments+1); + if(OP_UNLIKELY(ret<0))return ret; + comment_len=(int)strlen(_comment); + comment=op_strdup_with_len(_comment,comment_len); + if(OP_UNLIKELY(comment==NULL))return OP_EFAULT; + _tags->user_comments[ncomments]=comment; + _tags->comment_lengths[ncomments]=comment_len; + _tags->comments=ncomments+1; + return 0; +} + +int opus_tags_set_binary_suffix(OpusTags *_tags, + const unsigned char *_data,int _len){ + unsigned char *binary_suffix_data; + int ncomments; + int ret; + if(_len<0||_len>0&&(_data==NULL||!(_data[0]&1)))return OP_EINVAL; + ncomments=_tags->comments; + ret=op_tags_ensure_capacity(_tags,ncomments); + if(OP_UNLIKELY(ret<0))return ret; + binary_suffix_data= + (unsigned char *)_ogg_realloc(_tags->user_comments[ncomments],_len); + if(OP_UNLIKELY(binary_suffix_data==NULL))return OP_EFAULT; + memcpy(binary_suffix_data,_data,_len); + _tags->user_comments[ncomments]=(char *)binary_suffix_data; + _tags->comment_lengths[ncomments]=_len; + return 0; +} + +int opus_tagcompare(const 
char *_tag_name,const char *_comment){ + return opus_tagncompare(_tag_name,strlen(_tag_name),_comment); +} + +int opus_tagncompare(const char *_tag_name,int _tag_len,const char *_comment){ + int ret; + OP_ASSERT(_tag_len>=0); + ret=op_strncasecmp(_tag_name,_comment,_tag_len); + return ret?ret:'='-_comment[_tag_len]; +} + +const char *opus_tags_query(const OpusTags *_tags,const char *_tag,int _count){ + char **user_comments; + int tag_len; + int found; + int ncomments; + int ci; + tag_len=strlen(_tag); + ncomments=_tags->comments; + user_comments=_tags->user_comments; + found=0; + for(ci=0;cicomments; + user_comments=_tags->user_comments; + found=0; + for(ci=0;cicomments; + len=_tags->comment_lengths==NULL?0:_tags->comment_lengths[ncomments]; + *_len=len; + OP_ASSERT(len==0||_tags->user_comments!=NULL); + return len>0?(const unsigned char *)_tags->user_comments[ncomments]:NULL; +} + +static int opus_tags_get_gain(const OpusTags *_tags,int *_gain_q8, + const char *_tag_name,size_t _tag_len){ + char **comments; + int ncomments; + int ci; + comments=_tags->user_comments; + ncomments=_tags->comments; + /*Look for the first valid tag with the name _tag_name and use that.*/ + for(ci=0;ci='0'&&*p<='9'){ + gain_q8=10*gain_q8+*p-'0'; + if(gain_q8>32767-negative)break; + p++; + } + /*This didn't look like a signed 16-bit decimal integer. + Not a valid gain tag.*/ + if(*p!='\0')continue; + *_gain_q8=(int)(gain_q8+negative^negative); + return 0; + } + } + return OP_FALSE; +} + +int opus_tags_get_album_gain(const OpusTags *_tags,int *_gain_q8){ + return opus_tags_get_gain(_tags,_gain_q8,"R128_ALBUM_GAIN",15); +} + +int opus_tags_get_track_gain(const OpusTags *_tags,int *_gain_q8){ + return opus_tags_get_gain(_tags,_gain_q8,"R128_TRACK_GAIN",15); +} + +static int op_is_jpeg(const unsigned char *_buf,size_t _buf_sz){ + return _buf_sz>=11&&memcmp(_buf,"\xFF\xD8\xFF\xE0",4)==0 + &&(_buf[4]<<8|_buf[5])>=16&&memcmp(_buf+6,"JFIF",5)==0; +} + +/*Tries to extract the width, height, bits per pixel, and palette size of a + JPEG. + On failure, simply leaves its outputs unmodified.*/ +static void op_extract_jpeg_params(const unsigned char *_buf,size_t _buf_sz, + opus_uint32 *_width,opus_uint32 *_height, + opus_uint32 *_depth,opus_uint32 *_colors,int *_has_palette){ + if(op_is_jpeg(_buf,_buf_sz)){ + size_t offs; + offs=2; + for(;;){ + size_t segment_len; + int marker; + while(offs<_buf_sz&&_buf[offs]!=0xFF)offs++; + while(offs<_buf_sz&&_buf[offs]==0xFF)offs++; + marker=_buf[offs]; + offs++; + /*If we hit EOI* (end of image), or another SOI* (start of image), + or SOS (start of scan), then stop now.*/ + if(offs>=_buf_sz||(marker>=0xD8&&marker<=0xDA))break; + /*RST* (restart markers): skip (no segment length).*/ + else if(marker>=0xD0&&marker<=0xD7)continue; + /*Read the length of the marker segment.*/ + if(_buf_sz-offs<2)break; + segment_len=_buf[offs]<<8|_buf[offs+1]; + if(segment_len<2||_buf_sz-offs0xC0&&marker<0xD0&&(marker&3)!=0)){ + /*Found a SOFn (start of frame) marker segment:*/ + if(segment_len>=8){ + *_height=_buf[offs+3]<<8|_buf[offs+4]; + *_width=_buf[offs+5]<<8|_buf[offs+6]; + *_depth=_buf[offs+2]*_buf[offs+7]; + *_colors=0; + *_has_palette=0; + } + break; + } + /*Other markers: skip the whole marker segment.*/ + offs+=segment_len; + } + } +} + +static int op_is_png(const unsigned char *_buf,size_t _buf_sz){ + return _buf_sz>=8&&memcmp(_buf,"\x89PNG\x0D\x0A\x1A\x0A",8)==0; +} + +/*Tries to extract the width, height, bits per pixel, and palette size of a + PNG. 
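+ It scans the chunk stream for IHDR (dimensions and bit depth) and, for indexed-color images, for the PLTE chunk that gives the palette size.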
+ On failure, simply leaves its outputs unmodified.*/ +static void op_extract_png_params(const unsigned char *_buf,size_t _buf_sz, + opus_uint32 *_width,opus_uint32 *_height, + opus_uint32 *_depth,opus_uint32 *_colors,int *_has_palette){ + if(op_is_png(_buf,_buf_sz)){ + size_t offs; + offs=8; + while(_buf_sz-offs>=12){ + ogg_uint32_t chunk_len; + chunk_len=op_parse_uint32be(_buf+offs); + if(chunk_len>_buf_sz-(offs+12))break; + else if(chunk_len==13&&memcmp(_buf+offs+4,"IHDR",4)==0){ + int color_type; + *_width=op_parse_uint32be(_buf+offs+8); + *_height=op_parse_uint32be(_buf+offs+12); + color_type=_buf[offs+17]; + if(color_type==3){ + *_depth=24; + *_has_palette=1; + } + else{ + int sample_depth; + sample_depth=_buf[offs+16]; + if(color_type==0)*_depth=sample_depth; + else if(color_type==2)*_depth=sample_depth*3; + else if(color_type==4)*_depth=sample_depth*2; + else if(color_type==6)*_depth=sample_depth*4; + *_colors=0; + *_has_palette=0; + break; + } + } + else if(*_has_palette>0&&memcmp(_buf+offs+4,"PLTE",4)==0){ + *_colors=chunk_len/3; + break; + } + offs+=12+chunk_len; + } + } +} + +static int op_is_gif(const unsigned char *_buf,size_t _buf_sz){ + return _buf_sz>=6&&(memcmp(_buf,"GIF87a",6)==0||memcmp(_buf,"GIF89a",6)==0); +} + +/*Tries to extract the width, height, bits per pixel, and palette size of a + GIF. + On failure, simply leaves its outputs unmodified.*/ +static void op_extract_gif_params(const unsigned char *_buf,size_t _buf_sz, + opus_uint32 *_width,opus_uint32 *_height, + opus_uint32 *_depth,opus_uint32 *_colors,int *_has_palette){ + if(op_is_gif(_buf,_buf_sz)&&_buf_sz>=14){ + *_width=_buf[6]|_buf[7]<<8; + *_height=_buf[8]|_buf[9]<<8; + /*libFLAC hard-codes the depth to 24.*/ + *_depth=24; + *_colors=1<<((_buf[10]&7)+1); + *_has_palette=1; + } +} + +/*The actual implementation of opus_picture_tag_parse(). 
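+ The tag payload is a base64-encoded FLAC METADATA_BLOCK_PICTURE structure, decoded in place into _buf before the individual fields are read out.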
+ Unlike the public API, this function requires _pic to already be + initialized, modifies its contents before success is guaranteed, and assumes + the caller will clear it on error.*/ +static int opus_picture_tag_parse_impl(OpusPictureTag *_pic,const char *_tag, + unsigned char *_buf,size_t _buf_sz,size_t _base64_sz){ + opus_int32 picture_type; + opus_uint32 mime_type_length; + char *mime_type; + opus_uint32 description_length; + char *description; + opus_uint32 width; + opus_uint32 height; + opus_uint32 depth; + opus_uint32 colors; + opus_uint32 data_length; + opus_uint32 file_width; + opus_uint32 file_height; + opus_uint32 file_depth; + opus_uint32 file_colors; + int format; + int has_palette; + int colors_set; + size_t i; + /*Decode the BASE64 data.*/ + for(i=0;i<_base64_sz;i++){ + opus_uint32 value; + int j; + value=0; + for(j=0;j<4;j++){ + unsigned c; + unsigned d; + c=(unsigned char)_tag[4*i+j]; + if(c=='+')d=62; + else if(c=='/')d=63; + else if(c>='0'&&c<='9')d=52+c-'0'; + else if(c>='a'&&c<='z')d=26+c-'a'; + else if(c>='A'&&c<='Z')d=c-'A'; + else if(c=='='&&3*i+j>_buf_sz)d=0; + else return OP_ENOTFORMAT; + value=value<<6|d; + } + _buf[3*i]=(unsigned char)(value>>16); + if(3*i+1<_buf_sz){ + _buf[3*i+1]=(unsigned char)(value>>8); + if(3*i+2<_buf_sz)_buf[3*i+2]=(unsigned char)value; + } + } + i=0; + picture_type=op_parse_uint32be(_buf+i); + i+=4; + /*Extract the MIME type.*/ + mime_type_length=op_parse_uint32be(_buf+i); + i+=4; + if(mime_type_length>_buf_sz-32)return OP_ENOTFORMAT; + mime_type=(char *)_ogg_malloc(sizeof(*_pic->mime_type)*(mime_type_length+1)); + if(mime_type==NULL)return OP_EFAULT; + memcpy(mime_type,_buf+i,sizeof(*mime_type)*mime_type_length); + mime_type[mime_type_length]='\0'; + _pic->mime_type=mime_type; + i+=mime_type_length; + /*Extract the description string.*/ + description_length=op_parse_uint32be(_buf+i); + i+=4; + if(description_length>_buf_sz-mime_type_length-32)return OP_ENOTFORMAT; + description= + (char *)_ogg_malloc(sizeof(*_pic->mime_type)*(description_length+1)); + if(description==NULL)return OP_EFAULT; + memcpy(description,_buf+i,sizeof(*description)*description_length); + description[description_length]='\0'; + _pic->description=description; + i+=description_length; + /*Extract the remaining fields.*/ + width=op_parse_uint32be(_buf+i); + i+=4; + height=op_parse_uint32be(_buf+i); + i+=4; + depth=op_parse_uint32be(_buf+i); + i+=4; + colors=op_parse_uint32be(_buf+i); + i+=4; + /*If one of these is set, they all must be, but colors==0 is a valid value.*/ + colors_set=width!=0||height!=0||depth!=0||colors!=0; + if((width==0||height==0||depth==0)&&colors_set)return OP_ENOTFORMAT; + data_length=op_parse_uint32be(_buf+i); + i+=4; + if(data_length>_buf_sz-i)return OP_ENOTFORMAT; + /*Trim extraneous data so we don't copy it below.*/ + _buf_sz=i+data_length; + /*Attempt to determine the image format.*/ + format=OP_PIC_FORMAT_UNKNOWN; + if(mime_type_length==3&&strcmp(mime_type,"-->")==0){ + format=OP_PIC_FORMAT_URL; + /*Picture type 1 must be a 32x32 PNG.*/ + if(picture_type==1&&(width!=0||height!=0)&&(width!=32||height!=32)){ + return OP_ENOTFORMAT; + } + /*Append a terminating NUL for the convenience of our callers.*/ + _buf[_buf_sz++]='\0'; + } + else{ + if(mime_type_length==10 + &&op_strncasecmp(mime_type,"image/jpeg",mime_type_length)==0){ + if(op_is_jpeg(_buf+i,data_length))format=OP_PIC_FORMAT_JPEG; + } + else if(mime_type_length==9 + &&op_strncasecmp(mime_type,"image/png",mime_type_length)==0){ + 
if(op_is_png(_buf+i,data_length))format=OP_PIC_FORMAT_PNG; + } + else if(mime_type_length==9 + &&op_strncasecmp(mime_type,"image/gif",mime_type_length)==0){ + if(op_is_gif(_buf+i,data_length))format=OP_PIC_FORMAT_GIF; + } + else if(mime_type_length==0||(mime_type_length==6 + &&op_strncasecmp(mime_type,"image/",mime_type_length)==0)){ + if(op_is_jpeg(_buf+i,data_length))format=OP_PIC_FORMAT_JPEG; + else if(op_is_png(_buf+i,data_length))format=OP_PIC_FORMAT_PNG; + else if(op_is_gif(_buf+i,data_length))format=OP_PIC_FORMAT_GIF; + } + file_width=file_height=file_depth=file_colors=0; + has_palette=-1; + switch(format){ + case OP_PIC_FORMAT_JPEG:{ + op_extract_jpeg_params(_buf+i,data_length, + &file_width,&file_height,&file_depth,&file_colors,&has_palette); + }break; + case OP_PIC_FORMAT_PNG:{ + op_extract_png_params(_buf+i,data_length, + &file_width,&file_height,&file_depth,&file_colors,&has_palette); + }break; + case OP_PIC_FORMAT_GIF:{ + op_extract_gif_params(_buf+i,data_length, + &file_width,&file_height,&file_depth,&file_colors,&has_palette); + }break; + } + if(has_palette>=0){ + /*If we successfully extracted these parameters from the image, override + any declared values.*/ + width=file_width; + height=file_height; + depth=file_depth; + colors=file_colors; + } + /*Picture type 1 must be a 32x32 PNG.*/ + if(picture_type==1&&(format!=OP_PIC_FORMAT_PNG||width!=32||height!=32)){ + return OP_ENOTFORMAT; + } + } + /*Adjust _buf_sz instead of using data_length to capture the terminating NUL + for URLs.*/ + _buf_sz-=i; + memmove(_buf,_buf+i,sizeof(*_buf)*_buf_sz); + _buf=(unsigned char *)_ogg_realloc(_buf,_buf_sz); + if(_buf_sz>0&&_buf==NULL)return OP_EFAULT; + _pic->type=picture_type; + _pic->width=width; + _pic->height=height; + _pic->depth=depth; + _pic->colors=colors; + _pic->data_length=data_length; + _pic->data=_buf; + _pic->format=format; + return 0; +} + +int opus_picture_tag_parse(OpusPictureTag *_pic,const char *_tag){ + OpusPictureTag pic; + unsigned char *buf; + size_t base64_sz; + size_t buf_sz; + size_t tag_length; + int ret; + if(opus_tagncompare("METADATA_BLOCK_PICTURE",22,_tag)==0)_tag+=23; + /*Figure out how much BASE64-encoded data we have.*/ + tag_length=strlen(_tag); + if(tag_length&3)return OP_ENOTFORMAT; + base64_sz=tag_length>>2; + buf_sz=3*base64_sz; + if(buf_sz<32)return OP_ENOTFORMAT; + if(_tag[tag_length-1]=='=')buf_sz--; + if(_tag[tag_length-2]=='=')buf_sz--; + if(buf_sz<32)return OP_ENOTFORMAT; + /*Allocate an extra byte to allow appending a terminating NUL to URL data.*/ + buf=(unsigned char *)_ogg_malloc(sizeof(*buf)*(buf_sz+1)); + if(buf==NULL)return OP_EFAULT; + opus_picture_tag_init(&pic); + ret=opus_picture_tag_parse_impl(&pic,_tag,buf,buf_sz,base64_sz); + if(ret<0){ + opus_picture_tag_clear(&pic); + _ogg_free(buf); + } + else *_pic=*&pic; + return ret; +} + +void opus_picture_tag_init(OpusPictureTag *_pic){ + memset(_pic,0,sizeof(*_pic)); +} + +void opus_picture_tag_clear(OpusPictureTag *_pic){ + _ogg_free(_pic->description); + _ogg_free(_pic->mime_type); + _ogg_free(_pic->data); +} diff --git a/thirdparty/opus/internal.c b/thirdparty/opus/internal.c new file mode 100644 index 000000000000..96c80def82f2 --- /dev/null +++ b/thirdparty/opus/internal.c @@ -0,0 +1,42 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. 
*
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 2012                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "internal.h"
+
+#if defined(OP_ENABLE_ASSERTIONS)
+void op_fatal_impl(const char *_str,const char *_file,int _line){
+  fprintf(stderr,"Fatal (internal) error in %s, line %i: %s\n",
+   _file,_line,_str);
+  abort();
+}
+#endif
+
+/*A version of strncasecmp() that is guaranteed to only ignore the case of
+   ASCII characters.*/
+int op_strncasecmp(const char *_a,const char *_b,int _n){
+  int i;
+  for(i=0;i<_n;i++){
+    int a;
+    int b;
+    int d;
+    a=_a[i];
+    b=_b[i];
+    if(a>='a'&&a<='z')a-='a'-'A';
+    if(b>='a'&&b<='z')b-='a'-'A';
+    d=a-b;
+    if(d)return d;
+  }
+  return 0;
+}
diff --git a/thirdparty/opus/internal.h b/thirdparty/opus/internal.h
new file mode 100644
index 000000000000..ee48ea34c921
--- /dev/null
+++ b/thirdparty/opus/internal.h
@@ -0,0 +1,254 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 2012                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************/
+#if !defined(_opusfile_internal_h)
+# define _opusfile_internal_h (1)
+
+# if !defined(_REENTRANT)
+# define _REENTRANT
+# endif
+# if !defined(_GNU_SOURCE)
+# define _GNU_SOURCE
+# endif
+# if !defined(_LARGEFILE_SOURCE)
+# define _LARGEFILE_SOURCE
+# endif
+# if !defined(_LARGEFILE64_SOURCE)
+# define _LARGEFILE64_SOURCE
+# endif
+# if !defined(_FILE_OFFSET_BITS)
+# define _FILE_OFFSET_BITS 64
+# endif
+
+# include <stdlib.h>
+# include <opusfile.h>
+
+typedef struct OggOpusLink OggOpusLink;
+
+# if defined(OP_FIXED_POINT)
+
+typedef opus_int16 op_sample;
+
+# else
+
+typedef float op_sample;
+
+/*We're using this define to test for libopus 1.1 or later until libopus
+   provides a better mechanism.*/
+# if defined(OPUS_GET_EXPERT_FRAME_DURATION_REQUEST)
+/*Enable soft clipping prevention in 16-bit decodes.*/
+# define OP_SOFT_CLIP (1)
+# endif
+
+# endif
+
+# if OP_GNUC_PREREQ(4,2)
+/*Disable excessive warnings about the order of operations.*/
+# pragma GCC diagnostic ignored "-Wparentheses"
+# elif defined(_MSC_VER)
+/*Disable excessive warnings about the order of operations.*/
+# pragma warning(disable:4554)
+/*Disable warnings about "deprecated" POSIX functions.*/
+# pragma warning(disable:4996)
+# endif
+
+# if OP_GNUC_PREREQ(3,0)
+/*Another alternative is
+   (__builtin_constant_p(_x)?!!(_x):__builtin_expect(!!(_x),1))
+  but that evaluates _x multiple times, which may be bad.*/
+# define OP_LIKELY(_x) (__builtin_expect(!!(_x),1))
+# define OP_UNLIKELY(_x) (__builtin_expect(!!(_x),0))
+# else
+# define OP_LIKELY(_x) (!!(_x))
+# define OP_UNLIKELY(_x) (!!(_x))
+# endif
+
+# if defined(OP_ENABLE_ASSERTIONS)
+# if OP_GNUC_PREREQ(2,5)||__SUNPRO_C>=0x590
+__attribute__((noreturn))
+# endif
+void op_fatal_impl(const char *_str,const char *_file,int
_line); + +# define OP_FATAL(_str) (op_fatal_impl(_str,__FILE__,__LINE__)) + +# define OP_ASSERT(_cond) \ + do{ \ + if(OP_UNLIKELY(!(_cond)))OP_FATAL("assertion failed: " #_cond); \ + } \ + while(0) +# define OP_ALWAYS_TRUE(_cond) OP_ASSERT(_cond) + +# else +# define OP_FATAL(_str) abort() +# define OP_ASSERT(_cond) +# define OP_ALWAYS_TRUE(_cond) ((void)(_cond)) +# endif + +# define OP_INT64_MAX (2*(((ogg_int64_t)1<<62)-1)|1) +# define OP_INT64_MIN (-OP_INT64_MAX-1) +# define OP_INT32_MAX (2*(((ogg_int32_t)1<<30)-1)|1) +# define OP_INT32_MIN (-OP_INT32_MAX-1) + +# define OP_MIN(_a,_b) ((_a)<(_b)?(_a):(_b)) +# define OP_MAX(_a,_b) ((_a)>(_b)?(_a):(_b)) +# define OP_CLAMP(_lo,_x,_hi) (OP_MAX(_lo,OP_MIN(_x,_hi))) + +/*Advance a file offset by the given amount, clamping against OP_INT64_MAX. + This is used to advance a known offset by things like OP_CHUNK_SIZE or + OP_PAGE_SIZE_MAX, while making sure to avoid signed overflow. + It assumes that both _offset and _amount are non-negative.*/ +#define OP_ADV_OFFSET(_offset,_amount) \ + (OP_MIN(_offset,OP_INT64_MAX-(_amount))+(_amount)) + +/*The maximum channel count for any mapping we'll actually decode.*/ +# define OP_NCHANNELS_MAX (8) + +/*Initial state.*/ +# define OP_NOTOPEN (0) +/*We've found the first Opus stream in the first link.*/ +# define OP_PARTOPEN (1) +# define OP_OPENED (2) +/*We've found the first Opus stream in the current link.*/ +# define OP_STREAMSET (3) +/*We've initialized the decoder for the chosen Opus stream in the current + link.*/ +# define OP_INITSET (4) + +/*Information cached for a single link in a chained Ogg Opus file. + We choose the first Opus stream encountered in each link to play back (and + require at least one).*/ +struct OggOpusLink{ + /*The byte offset of the first header page in this link.*/ + opus_int64 offset; + /*The byte offset of the first data page from the chosen Opus stream in this + link (after the headers).*/ + opus_int64 data_offset; + /*The byte offset of the last page from the chosen Opus stream in this link. + This is used when seeking to ensure we find a page before the last one, so + that end-trimming calculations work properly. + This is only valid for seekable sources.*/ + opus_int64 end_offset; + /*The granule position of the last sample. + This is only valid for seekable sources.*/ + ogg_int64_t pcm_end; + /*The granule position before the first sample.*/ + ogg_int64_t pcm_start; + /*The serial number.*/ + ogg_uint32_t serialno; + /*The contents of the info header.*/ + OpusHead head; + /*The contents of the comment header.*/ + OpusTags tags; +}; + +struct OggOpusFile{ + /*The callbacks used to access the data source.*/ + OpusFileCallbacks callbacks; + /*A FILE *, memory bufer, etc.*/ + void *source; + /*Whether or not we can seek with this data source.*/ + int seekable; + /*The number of links in this chained Ogg Opus file.*/ + int nlinks; + /*The cached information from each link in a chained Ogg Opus file. + If source isn't seekable (e.g., it's a pipe), only the current link + appears.*/ + OggOpusLink *links; + /*The number of serial numbers from a single link.*/ + int nserialnos; + /*The capacity of the list of serial numbers from a single link.*/ + int cserialnos; + /*Storage for the list of serial numbers from a single link.*/ + ogg_uint32_t *serialnos; + /*This is the current offset of the data processed by the ogg_sync_state. + After a seek, this should be set to the target offset so that we can track + the byte offsets of subsequent pages. 
+ After a call to op_get_next_page(), this will point to the first byte after + that page.*/ + opus_int64 offset; + /*The total size of this data source, or -1 if it's unseekable.*/ + opus_int64 end; + /*Used to locate pages in the data source.*/ + ogg_sync_state oy; + /*One of OP_NOTOPEN, OP_PARTOPEN, OP_OPENED, OP_STREAMSET, OP_INITSET.*/ + int ready_state; + /*The current link being played back.*/ + int cur_link; + /*The number of decoded samples to discard from the start of decoding.*/ + opus_int32 cur_discard_count; + /*The granule position of the previous packet (current packet start time).*/ + ogg_int64_t prev_packet_gp; + /*The stream offset of the most recent page with completed packets, or -1. + This is only needed to recover continued packet data in the seeking logic, + when we use the current position as one of our bounds, only to later + discover it was the correct starting point.*/ + opus_int64 prev_page_offset; + /*The number of bytes read since the last bitrate query, including framing.*/ + opus_int64 bytes_tracked; + /*The number of samples decoded since the last bitrate query.*/ + ogg_int64_t samples_tracked; + /*Takes physical pages and welds them into a logical stream of packets.*/ + ogg_stream_state os; + /*Re-timestamped packets from a single page. + Buffering these relies on the undocumented libogg behavior that ogg_packet + pointers remain valid until the next page is submitted to the + ogg_stream_state they came from.*/ + ogg_packet op[255]; + /*The index of the next packet to return.*/ + int op_pos; + /*The total number of packets available.*/ + int op_count; + /*Central working state for the packet-to-PCM decoder.*/ + OpusMSDecoder *od; + /*The application-provided packet decode callback.*/ + op_decode_cb_func decode_cb; + /*The application-provided packet decode callback context.*/ + void *decode_cb_ctx; + /*The stream count used to initialize the decoder.*/ + int od_stream_count; + /*The coupled stream count used to initialize the decoder.*/ + int od_coupled_count; + /*The channel count used to initialize the decoder.*/ + int od_channel_count; + /*The channel mapping used to initialize the decoder.*/ + unsigned char od_mapping[OP_NCHANNELS_MAX]; + /*The buffered data for one decoded packet.*/ + op_sample *od_buffer; + /*The current position in the decoded buffer.*/ + int od_buffer_pos; + /*The number of valid samples in the decoded buffer.*/ + int od_buffer_size; + /*The type of gain offset to apply. + One of OP_HEADER_GAIN, OP_TRACK_GAIN, or OP_ABSOLUTE_GAIN.*/ + int gain_type; + /*The offset to apply to the gain.*/ + opus_int32 gain_offset_q8; + /*Internal state for soft clipping and dithering float->short output.*/ +#if !defined(OP_FIXED_POINT) +# if defined(OP_SOFT_CLIP) + float clip_state[OP_NCHANNELS_MAX]; +# endif + float dither_a[OP_NCHANNELS_MAX*4]; + float dither_b[OP_NCHANNELS_MAX*4]; + opus_uint32 dither_seed; + int dither_mute; + int dither_disabled; + /*The number of channels represented by the internal state. + This gets set to 0 whenever anything that would prevent state propagation + occurs (switching between the float/short APIs, or between the + stereo/multistream APIs).*/ + int state_channel_count; +#endif +}; + +int op_strncasecmp(const char *_a,const char *_b,int _n); + +#endif diff --git a/thirdparty/opus/mlp.c b/thirdparty/opus/mlp.c new file mode 100644 index 000000000000..ff9e50df4729 --- /dev/null +++ b/thirdparty/opus/mlp.c @@ -0,0 +1,145 @@ +/* Copyright (c) 2008-2011 Octasic Inc. 
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "opus_types.h"
+#include "opus_defines.h"
+
+#include <math.h>
+#include "mlp.h"
+#include "arch.h"
+#include "tansig_table.h"
+#define MAX_NEURONS 100
+
+#if 0
+static OPUS_INLINE opus_val16 tansig_approx(opus_val32 _x) /* Q19 */
+{
+   int i;
+   opus_val16 xx; /* Q11 */
+   /*double x, y;*/
+   opus_val16 dy, yy; /* Q14 */
+   /*x = 1.9073e-06*_x;*/
+   if (_x>=QCONST32(8,19))
+      return QCONST32(1.,14);
+   if (_x<=-QCONST32(8,19))
+      return -QCONST32(1.,14);
+   xx = EXTRACT16(SHR32(_x, 8));
+   /*i = lrint(25*x);*/
+   i = SHR32(ADD32(1024,MULT16_16(25, xx)),11);
+   /*x -= .04*i;*/
+   xx -= EXTRACT16(SHR32(MULT16_16(20972,i),8));
+   /*x = xx*(1./2048);*/
+   /*y = tansig_table[250+i];*/
+   yy = tansig_table[250+i];
+   /*y = yy*(1./16384);*/
+   dy = 16384-MULT16_16_Q14(yy,yy);
+   yy = yy + MULT16_16_Q14(MULT16_16_Q11(xx,dy),(16384 - MULT16_16_Q11(yy,xx)));
+   return yy;
+}
+#else
+/*extern const float tansig_table[501];*/
+static OPUS_INLINE float tansig_approx(float x)
+{
+   int i;
+   float y, dy;
+   float sign=1;
+   /* Tests are reversed to catch NaNs */
+   if (!(x<8))
+      return 1;
+   if (!(x>-8))
+      return -1;
+#ifndef FIXED_POINT
+   /* Another check in case of -ffast-math */
+   if (celt_isnan(x))
+      return 0;
+#endif
+   if (x<0)
+   {
+      x=-x;
+      sign=-1;
+   }
+   i = (int)floor(.5f+25*x);
+   x -= .04f*i;
+   y = tansig_table[i];
+   dy = 1-y*y;
+   y = y + x*dy*(1 - y*x);
+   return sign*y;
+}
+#endif
+
+#if 0
+void mlp_process(const MLP *m, const opus_val16 *in, opus_val16 *out)
+{
+   int j;
+   opus_val16 hidden[MAX_NEURONS];
+   const opus_val16 *W = m->weights;
+   /* Copy to tmp_in */
+   for (j=0;j<m->topo[1];j++)
+   {
+      int k;
+      opus_val32 sum = SHL32(EXTEND32(*W++),8);
+      for (k=0;k<m->topo[0];k++)
+         sum = MAC16_16(sum, in[k],*W++);
+      hidden[j] = tansig_approx(sum);
+   }
+   for (j=0;j<m->topo[2];j++)
+   {
+      int k;
+      opus_val32 sum = SHL32(EXTEND32(*W++),14);
+      for (k=0;k<m->topo[1];k++)
+         sum = MAC16_16(sum, hidden[k], *W++);
+      out[j] = tansig_approx(EXTRACT16(PSHR32(sum,17)));
+   }
+}
+#else
+void mlp_process(const MLP *m, const float *in, float *out)
+{
+   int j;
+   float hidden[MAX_NEURONS];
+   const float *W = m->weights;
+   /* Copy to tmp_in */
+   for (j=0;j<m->topo[1];j++)
+   {
+      int k;
+      float sum = *W++;
+      for (k=0;k<m->topo[0];k++)
+         sum = sum + in[k]**W++;
+      hidden[j] = tansig_approx(sum);
+   }
+   for (j=0;j<m->topo[2];j++)
+   {
+      int k;
+      float sum = *W++;
+      for (k=0;k<m->topo[1];k++)
+         sum = sum + hidden[k]**W++;
+      out[j] = tansig_approx(sum);
+   }
+}
+#endif
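/* [Illustrative sketch, not part of the patch.] The float mlp_process() above
   reads its weight array layer by layer: each neuron stores one bias followed
   by one weight per input, and both layers pass their sums through the tansig
   (i.e. tanh) non-linearity. The standalone toy below mirrors that layout with
   a made-up 2-2-1 network; all names here are hypothetical. As a consistency
   check, the real net in mlp_data.c uses topo = {25, 15, 2}, which needs
   15*(25+1) + 2*(15+1) = 422 weights, matching weights[422] there. */
#include <math.h>
#include <stdio.h>

static void toy_mlp_process(const int *topo, const float *W,
                            const float *in, float *out)
{
   float hidden[100];
   int j, k;
   for (j = 0; j < topo[1]; j++) {
      float sum = *W++;                 /* bias for hidden neuron j */
      for (k = 0; k < topo[0]; k++)
         sum += in[k] * *W++;           /* weighted inputs */
      hidden[j] = tanhf(sum);           /* tansig activation */
   }
   for (j = 0; j < topo[2]; j++) {
      float sum = *W++;                 /* bias for output neuron j */
      for (k = 0; k < topo[1]; k++)
         sum += hidden[k] * *W++;
      out[j] = tanhf(sum);
   }
}

int main(void)
{
   /* 2 inputs -> 2 hidden -> 1 output; 2*3 + 1*3 = 9 weights, chosen freely. */
   static const int   topo[3] = {2, 2, 1};
   static const float W[9]    = { 0.1f, 0.5f, -0.5f,   /* hidden neuron 0 */
                                 -0.2f, 0.3f,  0.7f,   /* hidden neuron 1 */
                                  0.0f, 1.0f, -1.0f }; /* output neuron   */
   const float in[2] = {1.0f, 2.0f};
   float out[1];
   toy_mlp_process(topo, W, in, out);
   printf("%f\n", out[0]);
   return 0;
}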
diff --git a/thirdparty/opus/mlp.h b/thirdparty/opus/mlp.h
new file mode 100644
index 000000000000..618e246e2c4b
--- /dev/null
+++ b/thirdparty/opus/mlp.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2008-2011 Octasic Inc.
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _MLP_H_
+#define _MLP_H_
+
+#include "arch.h"
+
+typedef struct {
+   int layers;
+   const int *topo;
+   const float *weights;
+} MLP;
+
+extern const MLP net;
+
+void mlp_process(const MLP *m, const float *in, float *out);
+
+#endif /* _MLP_H_ */
diff --git a/thirdparty/opus/mlp_data.c b/thirdparty/opus/mlp_data.c
new file mode 100644
index 000000000000..c2fda4e2e593
--- /dev/null
+++ b/thirdparty/opus/mlp_data.c
@@ -0,0 +1,109 @@
+/* The contents of this file was automatically generated by mlp_train.c
+   It contains multi-layer perceptron (MLP) weights.
*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "mlp.h" + +/* RMS error was 0.138320, seed was 1361535663 */ + +static const float weights[422] = { + +/* hidden layer */ +-0.0941125f, -0.302976f, -0.603555f, -0.19393f, -0.185983f, +-0.601617f, -0.0465317f, -0.114563f, -0.103599f, -0.618938f, +-0.317859f, -0.169949f, -0.0702885f, 0.148065f, 0.409524f, +0.548432f, 0.367649f, -0.494393f, 0.764306f, -1.83957f, +0.170849f, 12.786f, -1.08848f, -1.27284f, -16.2606f, +24.1773f, -5.57454f, -0.17276f, -0.163388f, -0.224421f, +-0.0948944f, -0.0728695f, -0.26557f, -0.100283f, -0.0515459f, +-0.146142f, -0.120674f, -0.180655f, 0.12857f, 0.442138f, +-0.493735f, 0.167767f, 0.206699f, -0.197567f, 0.417999f, +1.50364f, -0.773341f, -10.0401f, 0.401872f, 2.97966f, +15.2165f, -1.88905f, -1.19254f, 0.0285397f, -0.00405139f, +0.0707565f, 0.00825699f, -0.0927269f, -0.010393f, -0.00428882f, +-0.00489743f, -0.0709731f, -0.00255992f, 0.0395619f, 0.226424f, +0.0325231f, 0.162175f, -0.100118f, 0.485789f, 0.12697f, +0.285937f, 0.0155637f, 0.10546f, 3.05558f, 1.15059f, +-1.00904f, -1.83088f, 3.31766f, -3.42516f, -0.119135f, +-0.0405654f, 0.00690068f, 0.0179877f, -0.0382487f, 0.00597941f, +-0.0183611f, 0.00190395f, -0.144322f, -0.0435671f, 0.000990594f, +0.221087f, 0.142405f, 0.484066f, 0.404395f, 0.511955f, +-0.237255f, 0.241742f, 0.35045f, -0.699428f, 10.3993f, +2.6507f, -2.43459f, -4.18838f, 1.05928f, 1.71067f, +0.00667811f, -0.0721335f, -0.0397346f, 0.0362704f, -0.11496f, +-0.0235776f, 0.0082161f, -0.0141741f, -0.0329699f, -0.0354253f, +0.00277404f, -0.290654f, -1.14767f, -0.319157f, -0.686544f, +0.36897f, 0.478899f, 0.182579f, -0.411069f, 0.881104f, +-4.60683f, 1.4697f, 0.335845f, -1.81905f, -30.1699f, +5.55225f, 0.0019508f, -0.123576f, -0.0727332f, -0.0641597f, +-0.0534458f, -0.108166f, -0.0937368f, -0.0697883f, -0.0275475f, +-0.192309f, -0.110074f, 0.285375f, -0.405597f, 0.0926724f, +-0.287881f, -0.851193f, -0.099493f, -0.233764f, -1.2852f, +1.13611f, 3.12168f, -0.0699f, -1.86216f, 2.65292f, +-7.31036f, 2.44776f, -0.00111802f, -0.0632786f, -0.0376296f, +-0.149851f, 0.142963f, 0.184368f, 0.123433f, 0.0756158f, +0.117312f, 0.0933395f, 0.0692163f, 0.0842592f, 0.0704683f, +0.0589963f, 0.0942205f, -0.448862f, 0.0262677f, 0.270352f, +-0.262317f, 0.172586f, 2.00227f, -0.159216f, 0.038422f, +10.2073f, 4.15536f, -2.3407f, -0.0550265f, 0.00964792f, +-0.141336f, 0.0274501f, 0.0343921f, -0.0487428f, 0.0950172f, +-0.00775017f, -0.0372492f, -0.00548121f, -0.0663695f, 0.0960506f, +-0.200008f, -0.0412827f, 0.58728f, 0.0515787f, 0.337254f, +0.855024f, 0.668371f, -0.114904f, -3.62962f, -0.467477f, +-0.215472f, 2.61537f, 0.406117f, -1.36373f, 0.0425394f, +0.12208f, 0.0934502f, 0.123055f, 0.0340935f, -0.142466f, +0.035037f, -0.0490666f, 0.0733208f, 0.0576672f, 0.123984f, +-0.0517194f, -0.253018f, 0.590565f, 0.145849f, 0.315185f, +0.221534f, -0.149081f, 0.216161f, -0.349575f, 24.5664f, +-0.994196f, 0.614289f, -18.7905f, -2.83277f, -0.716801f, +-0.347201f, 0.479515f, -0.246027f, 0.0758683f, 0.137293f, +-0.17781f, 0.118751f, -0.00108329f, -0.237334f, 0.355732f, +-0.12991f, -0.0547627f, -0.318576f, -0.325524f, 0.180494f, +-0.0625604f, 0.141219f, 0.344064f, 0.37658f, -0.591772f, +5.8427f, -0.38075f, 0.221894f, -1.41934f, -1.87943e+06f, +1.34114f, 0.0283355f, -0.0447856f, -0.0211466f, -0.0256927f, +0.0139618f, 0.0207934f, -0.0107666f, 0.0110969f, 0.0586069f, +-0.0253545f, -0.0328433f, 0.11872f, -0.216943f, 0.145748f, +0.119808f, -0.0915211f, -0.120647f, -0.0787719f, -0.143644f, +-0.595116f, -1.152f, -1.25335f, 
-1.17092f, 4.34023f, +-975268.f, -1.37033f, -0.0401123f, 0.210602f, -0.136656f, +0.135962f, -0.0523293f, 0.0444604f, 0.0143928f, 0.00412666f, +-0.0193003f, 0.218452f, -0.110204f, -2.02563f, 0.918238f, +-2.45362f, 1.19542f, -0.061362f, -1.92243f, 0.308111f, +0.49764f, 0.912356f, 0.209272f, -2.34525f, 2.19326f, +-6.47121f, 1.69771f, -0.725123f, 0.0118929f, 0.0377944f, +0.0554003f, 0.0226452f, -0.0704421f, -0.0300309f, 0.0122978f, +-0.0041782f, -0.0686612f, 0.0313115f, 0.039111f, 0.364111f, +-0.0945548f, 0.0229876f, -0.17414f, 0.329795f, 0.114714f, +0.30022f, 0.106997f, 0.132355f, 5.79932f, 0.908058f, +-0.905324f, -3.3561f, 0.190647f, 0.184211f, -0.673648f, +0.231807f, -0.0586222f, 0.230752f, -0.438277f, 0.245857f, +-0.17215f, 0.0876383f, -0.720512f, 0.162515f, 0.0170571f, +0.101781f, 0.388477f, 1.32931f, 1.08548f, -0.936301f, +-2.36958f, -6.71988f, -3.44376f, 2.13818f, 14.2318f, +4.91459f, -3.09052f, -9.69191f, -0.768234f, 1.79604f, +0.0549653f, 0.163399f, 0.0797025f, 0.0343933f, -0.0555876f, +-0.00505673f, 0.0187258f, 0.0326628f, 0.0231486f, 0.15573f, +0.0476223f, -0.254824f, 1.60155f, -0.801221f, 2.55496f, +0.737629f, -1.36249f, -0.695463f, -2.44301f, -1.73188f, +3.95279f, 1.89068f, 0.486087f, -11.3343f, 3.9416e+06f, + +/* output layer */ +-0.381439f, 0.12115f, -0.906927f, 2.93878f, 1.6388f, +0.882811f, 0.874344f, 1.21726f, -0.874545f, 0.321706f, +0.785055f, 0.946558f, -0.575066f, -3.46553f, 0.884905f, +0.0924047f, -9.90712f, 0.391338f, 0.160103f, -2.04954f, +4.1455f, 0.0684029f, -0.144761f, -0.285282f, 0.379244f, +-1.1584f, -0.0277241f, -9.85f, -4.82386f, 3.71333f, +3.87308f, 3.52558f}; + +static const int topo[3] = {25, 15, 2}; + +const MLP net = { + 3, + topo, + weights +}; diff --git a/thirdparty/opus/opus.c b/thirdparty/opus/opus.c new file mode 100644 index 000000000000..f76f125cfa30 --- /dev/null +++ b/thirdparty/opus/opus.c @@ -0,0 +1,356 @@ +/* Copyright (c) 2011 Xiph.Org Foundation, Skype Limited + Written by Jean-Marc Valin and Koen Vos */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "opus.h"
+#include "opus_private.h"
+
+#ifndef DISABLE_FLOAT_API
+OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
+{
+   int c;
+   int i;
+   float *x;
+
+   if (C<1 || N<1 || !_x || !declip_mem) return;
+
+   /* First thing: saturate everything to +/- 2 which is the highest level our
+      non-linearity can handle. At the point where the signal reaches +/-2,
+      the derivative will be zero anyway, so this doesn't introduce any
+      discontinuity in the derivative. */
+   for (i=0;i<C*N;i++)
+      _x[i] = MAX16(-2.f, MIN16(2.f, _x[i]));
+   for (c=0;c<C;c++)
+   {
+      float a;
+      float x0;
+      int curr;
+
+      x = _x+c;
+      a = declip_mem[c];
+      /* Continue applying the non-linearity from the previous frame to avoid
+         any discontinuity. */
+      for (i=0;i<N;i++)
+      {
+         if (x[i*C]*a>=0)
+            break;
+         x[i*C] = x[i*C]+a*x[i*C]*x[i*C];
+      }
+
+      curr=0;
+      x0 = x[0];
+      while(1)
+      {
+         int start, end;
+         float maxval;
+         int special=0;
+         int peak_pos;
+         for (i=curr;i<N;i++)
+         {
+            if (x[i*C]>1 || x[i*C]<-1)
+               break;
+         }
+         if (i==N)
+         {
+            a=0;
+            break;
+         }
+         peak_pos = i;
+         start=end=i;
+         maxval=ABS16(x[i*C]);
+         /* Look for first zero crossing before clipping */
+         while (start>0 && x[i*C]*x[(start-1)*C]>=0)
+            start--;
+         /* Look for first zero crossing after clipping */
+         while (end<N && x[i*C]*x[end*C]>=0)
+         {
+            /* Look for other peaks until the next zero-crossing. */
+            if (ABS16(x[end*C])>maxval)
+            {
+               maxval = ABS16(x[end*C]);
+               peak_pos = end;
+            }
+            end++;
+         }
+         /* Detect the special case where we clip before the first zero crossing */
+         special = (start==0 && x[i*C]*x[0]>=0);
+
+         /* Compute a such that maxval + a*maxval^2 = 1 */
+         a=(maxval-1)/(maxval*maxval);
+         /* Slightly boost "a" by 2^-22. This is just enough to ensure -ffast-math
+            does not cause output values larger than +/-1, but small enough not
+            to matter even for 24-bit output. */
+         a += a*2.4e-7;
+         if (x[i*C]>0)
+            a = -a;
+         /* Apply soft clipping */
+         for (i=start;i<end;i++)
+            x[i*C] = x[i*C]+a*x[i*C]*x[i*C];
+
+         if (special && peak_pos>=2)
+         {
+            /* Add a linear ramp from the first sample to the signal peak.
+               This avoids a discontinuity at the beginning of the frame. */
+            float delta;
+            float offset = x0-x[0];
+            delta = offset / peak_pos;
+            for (i=curr;i<peak_pos;i++)
+            {
+               offset -= delta;
+               x[i*C] += offset;
+               x[i*C] = MAX16(-1.f, MIN16(1.f, x[i*C]));
+            }
+         }
+         curr = end;
+         if (curr==N)
+            break;
+      }
+      declip_mem[c] = a;
+   }
+}
+#endif
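/* [Illustrative sketch, not part of the patch.] How a caller would apply
   opus_pcm_soft_clip() above to keep float decoder output within [-1, 1]:
   declip_mem holds one float per channel, zero-initialized once and then
   reused across consecutive frames so the non-linearity stays continuous at
   frame boundaries. Frame and channel counts are assumptions for the demo. */
#include <string.h>

extern void opus_pcm_soft_clip(float *pcm, int frame_size, int channels,
                               float *softclip_mem);

#define SKETCH_CHANNELS 2
#define SKETCH_FRAME    960 /* 20 ms at 48 kHz, per channel */

void soft_clip_stream_sketch(float frames[][SKETCH_FRAME*SKETCH_CHANNELS],
                             int nframes)
{
   float declip_mem[SKETCH_CHANNELS];
   int f;
   memset(declip_mem, 0, sizeof(declip_mem)); /* init once per stream */
   for (f = 0; f < nframes; f++) {
      /* In-place on interleaved samples; state carries across frames. */
      opus_pcm_soft_clip(frames[f], SKETCH_FRAME, SKETCH_CHANNELS, declip_mem);
   }
}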
+int encode_size(int size, unsigned char *data)
+{
+   if (size < 252)
+   {
+      data[0] = size;
+      return 1;
+   } else {
+      data[0] = 252+(size&0x3);
+      data[1] = (size-(int)data[0])>>2;
+      return 2;
+   }
+}
+
+static int parse_size(const unsigned char *data, opus_int32 len, opus_int16 *size)
+{
+   if (len<1)
+   {
+      *size = -1;
+      return -1;
+   } else if (data[0]<252)
+   {
+      *size = data[0];
+      return 1;
+   } else if (len<2)
+   {
+      *size = -1;
+      return -1;
+   } else {
+      *size = 4*data[1] + data[0];
+      return 2;
+   }
+}
+
+int opus_packet_get_samples_per_frame(const unsigned char *data,
+      opus_int32 Fs)
+{
+   int audiosize;
+   if (data[0]&0x80)
+   {
+      audiosize = ((data[0]>>3)&0x3);
+      audiosize = (Fs<<audiosize)/400;
+   } else if ((data[0]&0x60) == 0x60)
+   {
+      audiosize = (data[0]&0x08) ? Fs/50 : Fs/100;
+   } else {
+      audiosize = ((data[0]>>3)&0x3);
+      if (audiosize == 3)
+         audiosize = Fs*60/1000;
+      else
+         audiosize = (Fs<<audiosize)/100;
+   }
+   return audiosize;
+}
+
+static int opus_packet_parse_impl(const unsigned char *data, opus_int32 len,
+      int self_delimited, unsigned char *out_toc,
+      const unsigned char *frames[48], opus_int16 size[48],
+      int *payload_offset, opus_int32 *packet_offset)
+{
+   int i, bytes;
+   int count;
+   int cbr;
+   unsigned char ch, toc;
+   int framesize;
+   opus_int32 last_size;
+   opus_int32 pad = 0;
+   const unsigned char *data0 = data;
+
+   if (size==NULL)
+      return OPUS_BAD_ARG;
+
+   framesize = opus_packet_get_samples_per_frame(data, 48000);
+
+   cbr = 0;
+   toc = *data++;
+   len--;
+   last_size = len;
+   switch (toc&0x3)
+   {
+   /* One frame */
+   case 0:
+      count=1;
+      break;
+   /* Two CBR frames */
+   case 1:
+      count=2;
+      cbr=1;
+      if (!self_delimited)
+      {
+         if (len&0x1)
+            return OPUS_INVALID_PACKET;
+         last_size = len/2;
+         /* If last_size doesn't fit in size[0], we'll catch it later */
+         size[0] = (opus_int16)last_size;
+      }
+      break;
+   /* Two VBR frames */
+   case 2:
+      count = 2;
+      bytes = parse_size(data, len, size);
+      len -= bytes;
+      if (size[0]<0 || size[0] > len)
+         return OPUS_INVALID_PACKET;
+      data += bytes;
+      last_size = len-size[0];
+      break;
+   /* Multiple CBR/VBR frames (from 0 to 120 ms) */
+   default: /*case 3:*/
+      if (len<1)
+         return OPUS_INVALID_PACKET;
+      /* Number of frames encoded in bits 0 to 5 */
+      ch = *data++;
+      count = ch&0x3F;
+      if (count <= 0 || framesize*count > 5760)
+         return OPUS_INVALID_PACKET;
+      len--;
+      /* Padding flag is bit 6 */
+      if (ch&0x40)
+      {
+         int p;
+         do {
+            int tmp;
+            if (len<=0)
+               return OPUS_INVALID_PACKET;
+            p = *data++;
+            len--;
+            tmp = p==255 ? 254: p;
+            len -= tmp;
+            pad += tmp;
+         } while (p==255);
+      }
+      if (len<0)
+         return OPUS_INVALID_PACKET;
+      /* VBR flag is bit 7 */
+      cbr = !(ch&0x80);
+      if (!cbr)
+      {
+         /* VBR case */
+         last_size = len;
+         for (i=0;i<count-1;i++)
+         {
+            bytes = parse_size(data, len, size+i);
+            len -= bytes;
+            if (size[i]<0 || size[i] > len)
+               return OPUS_INVALID_PACKET;
+            data += bytes;
+            last_size -= bytes+size[i];
+         }
+         if (last_size<0)
+            return OPUS_INVALID_PACKET;
+      } else if (!self_delimited)
+      {
+         /* CBR case */
+         last_size = len/count;
+         if (last_size*count!=len)
+            return OPUS_INVALID_PACKET;
+         for (i=0;i<count-1;i++)
+            size[i] = (opus_int16)last_size;
+      }
+      break;
+   }
+   /* Self-delimited framing has an extra size for the last frame. */
+   if (self_delimited)
+   {
+      bytes = parse_size(data, len, size+count-1);
+      len -= bytes;
+      if (size[count-1]<0 || size[count-1] > len)
+         return OPUS_INVALID_PACKET;
+      data += bytes;
+      /* For CBR packets, apply the size to all the frames. */
+      if (cbr)
+      {
+         if (size[count-1]*count > len)
+            return OPUS_INVALID_PACKET;
+         for (i=0;i<count-1;i++)
+            size[i] = size[count-1];
+      } else if (bytes+size[count-1] > last_size)
+         return OPUS_INVALID_PACKET;
+   } else
+   {
+      /* Because it's not encoded explicitly, it's possible the size of the
+         last packet (or all the packets, for the CBR case) is larger than
+         1275. Reject them here.*/
+      if (last_size > 1275)
+         return OPUS_INVALID_PACKET;
+      size[count-1] = (opus_int16)last_size;
+   }
+
+   if (payload_offset)
+      *payload_offset = (int)(data-data0);
+
+   for (i=0;i<count;i++)
+   {
+      if (frames)
+         frames[i] = data;
+      data += size[i];
+   }
+
+   if (packet_offset)
+      *packet_offset = pad+(opus_int32)(data-data0);
+
+   if (out_toc)
+      *out_toc = toc;
+
+   return count;
+}
+
+int opus_packet_parse(const unsigned char *data, opus_int32 len,
+      unsigned char *out_toc, const unsigned char *frames[48],
+      opus_int16 size[48], int *payload_offset)
+{
+   return opus_packet_parse_impl(data, len, 0, out_toc,
+                                 frames, size, payload_offset, NULL);
+}
diff --git a/thirdparty/opus/opus.h b/thirdparty/opus/opus.h
new file mode 100644
--- /dev/null
+++ b/thirdparty/opus/opus.h
+/* Copyright (c) 2010-2011 Xiph.Org Foundation, Skype Limited
+   Written by Jean-Marc Valin and Koen Vos */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+ * @file opus.h
+ * @brief Opus reference implementation API
+ */
+
+#ifndef OPUS_H
+#define OPUS_H
+
+#include "opus_types.h"
+#include "opus_defines.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @defgroup opus_encoder Opus Encoder
+ * @{
+ *
+ * @brief This page describes the process and functions used to encode Opus.
+ *
+ * Since Opus is a stateful codec, the encoding process starts with creating an encoder
+ * state. This can be done with:
+ *
+ * @code
+ * int error;
+ * OpusEncoder *enc;
+ * enc = opus_encoder_create(Fs, channels, application, &error);
+ * @endcode
+ *
+ * From this point, enc can be used for encoding an audio stream. An encoder state
+ * @b must @b not be used for more than one stream at the same time. Similarly, the encoder
+ * state @b must @b not be re-initialized for each frame.
+ *
+ * While opus_encoder_create() allocates memory for the state, it's also possible
+ * to initialize pre-allocated memory:
+ *
+ * @code
+ * int size;
+ * int error;
+ * OpusEncoder *enc;
+ * size = opus_encoder_get_size(channels);
+ * enc = malloc(size);
+ * error = opus_encoder_init(enc, Fs, channels, application);
+ * @endcode
+ *
+ * where opus_encoder_get_size() returns the required size for the encoder state. Note that
+ * future versions of this code may change the size, so no assumptions should be made about it.
+ *
+ * The encoder state is always continuous in memory and only a shallow copy is sufficient
+ * to copy it (e.g. memcpy())
+ *
+ * To encode a frame, opus_encode() or opus_encode_float() must be called with
+ * exactly one frame (2.5, 5, 10, 20, 40 or 60 ms) of audio data:
+ * @code
+ * len = opus_encode(enc, audio_frame, frame_size, packet, max_packet);
+ * @endcode
+ *
+ * where
+ * <ul>
+ * <li>audio_frame is the audio data in opus_int16 (or float for opus_encode_float())</li>
+ * <li>frame_size is the duration of the frame in samples (per channel)</li>
+ * <li>packet is the byte array to which the compressed data is written</li>
+ * <li>max_packet is the maximum number of bytes that can be written in the packet (4000 bytes is recommended).
+ *     Do not use max_packet to control VBR target bitrate, instead use the #OPUS_SET_BITRATE CTL.</li>
+ * </ul>
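/* [Illustrative sketch, not part of opus.h.] The four parameters above tied
   together: one 20 ms stereo frame at 48 kHz encoded into a 4000-byte packet
   buffer, with the bitrate set through the CTL interface rather than through
   max_packet. The function name and error handling are assumptions. */
#include <opus.h>

int encode_one_frame_sketch(OpusEncoder *enc,
                            const opus_int16 *audio_frame) /* 960*2 samples */
{
   unsigned char packet[4000];   /* max_packet = sizeof(packet)        */
   int frame_size = 960;         /* 20 ms at 48 kHz, per channel       */
   opus_int32 len;
   opus_encoder_ctl(enc, OPUS_SET_BITRATE(64000)); /* bitrate via CTL  */
   len = opus_encode(enc, audio_frame, frame_size, packet, sizeof(packet));
   if (len < 0)
      return (int)len;           /* negative return = error code       */
   /* len <= 2 indicates a DTX packet that need not be transmitted.    */
   return (int)len;
}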
+ *
+ * opus_encode() and opus_encode_float() return the number of bytes actually written to the packet.
+ * The return value can be negative, which indicates that an error has occurred. If the return value
+ * is 2 bytes or less, then the packet does not need to be transmitted (DTX).
+ *
+ * Once the encoder state is no longer needed, it can be destroyed with
+ *
+ * @code
+ * opus_encoder_destroy(enc);
+ * @endcode
+ *
+ * If the encoder was created with opus_encoder_init() rather than opus_encoder_create(),
+ * then no action is required aside from potentially freeing the memory that was manually
+ * allocated for it (calling free(enc) for the example above)
+ *
+ */
+
+/** Opus encoder state.
+ * This contains the complete state of an Opus encoder.
+ * It is position independent and can be freely copied.
+ * @see opus_encoder_create,opus_encoder_init
+ */
+typedef struct OpusEncoder OpusEncoder;
+
+/** Gets the size of an OpusEncoder structure.
+ * @param[in] channels int: Number of channels.
+ *                          This must be 1 or 2.
+ * @returns The size in bytes.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_encoder_get_size(int channels);
+
+/**
+ */
+
+/** Allocates and initializes an encoder state.
+ * There are three coding modes:
+ *
+ * @ref OPUS_APPLICATION_VOIP gives best quality at a given bitrate for voice
+ *    signals. It enhances the input signal by high-pass filtering and
+ *    emphasizing formants and harmonics. Optionally it includes in-band
+ *    forward error correction to protect against packet loss. Use this
+ *    mode for typical VoIP applications. Because of the enhancement,
+ *    even at high bitrates the output may sound different from the input.
+ *
+ * @ref OPUS_APPLICATION_AUDIO gives best quality at a given bitrate for most
+ *    non-voice signals like music. Use this mode for music and mixed
+ *    (music/voice) content, broadcast, and applications requiring less
+ *    than 15 ms of coding delay.
+ *
+ * @ref OPUS_APPLICATION_RESTRICTED_LOWDELAY configures low-delay mode that
+ *    disables the speech-optimized mode in exchange for slightly reduced delay.
+ *    This mode can only be set on a newly initialized or freshly reset encoder
+ *    because it changes the codec delay.
+ *
+ * This is useful when the caller knows that the speech-optimized modes will not be needed (use with caution).
+ * @param [in] Fs opus_int32: Sampling rate of input signal (Hz)
+ *                            This must be one of 8000, 12000, 16000,
+ *                            24000, or 48000.
+ * @param [in] channels int: Number of channels (1 or 2) in input signal
+ * @param [in] application int: Coding mode (@ref OPUS_APPLICATION_VOIP/@ref OPUS_APPLICATION_AUDIO/@ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+ * @param [out] error int*: @ref opus_errorcodes
+ * @note Regardless of the sampling rate and number of channels selected, the Opus encoder
+ * can switch to a lower audio bandwidth or number of channels if the bitrate
+ * selected is too low. This also means that it is safe to always use 48 kHz stereo input
+ * and let the encoder optimize the encoding.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusEncoder *opus_encoder_create(
+    opus_int32 Fs,
+    int channels,
+    int application,
+    int *error
+);
+
+/** Initializes a previously allocated encoder state
+  * The memory pointed to by st must be at least the size returned by opus_encoder_get_size().
+  * This is intended for applications which use their own allocator instead of malloc.
+ * @see opus_encoder_create(),opus_encoder_get_size() + * To reset a previously initialized state, use the #OPUS_RESET_STATE CTL. + * @param [in] st OpusEncoder*: Encoder state + * @param [in] Fs opus_int32: Sampling rate of input signal (Hz) + * This must be one of 8000, 12000, 16000, + * 24000, or 48000. + * @param [in] channels int: Number of channels (1 or 2) in input signal + * @param [in] application int: Coding mode (OPUS_APPLICATION_VOIP/OPUS_APPLICATION_AUDIO/OPUS_APPLICATION_RESTRICTED_LOWDELAY) + * @retval #OPUS_OK Success or @ref opus_errorcodes + */ +OPUS_EXPORT int opus_encoder_init( + OpusEncoder *st, + opus_int32 Fs, + int channels, + int application +) OPUS_ARG_NONNULL(1); + +/** Encodes an Opus frame. + * @param [in] st OpusEncoder*: Encoder state + * @param [in] pcm opus_int16*: Input signal (interleaved if 2 channels). length is frame_size*channels*sizeof(opus_int16) + * @param [in] frame_size int: Number of samples per channel in the + * input signal. + * This must be an Opus frame size for + * the encoder's sampling rate. + * For example, at 48 kHz the permitted + * values are 120, 240, 480, 960, 1920, + * and 2880. + * Passing in a duration of less than + * 10 ms (480 samples at 48 kHz) will + * prevent the encoder from using the LPC + * or hybrid modes. + * @param [out] data unsigned char*: Output payload. + * This must contain storage for at + * least \a max_data_bytes. + * @param [in] max_data_bytes opus_int32: Size of the allocated + * memory for the output + * payload. This may be + * used to impose an upper limit on + * the instant bitrate, but should + * not be used as the only bitrate + * control. Use #OPUS_SET_BITRATE to + * control the bitrate. + * @returns The length of the encoded packet (in bytes) on success or a + * negative error code (see @ref opus_errorcodes) on failure. + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_encode( + OpusEncoder *st, + const opus_int16 *pcm, + int frame_size, + unsigned char *data, + opus_int32 max_data_bytes +) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2) OPUS_ARG_NONNULL(4); + +/** Encodes an Opus frame from floating point input. + * @param [in] st OpusEncoder*: Encoder state + * @param [in] pcm float*: Input in float format (interleaved if 2 channels), with a normal range of +/-1.0. + * Samples with a range beyond +/-1.0 are supported but will + * be clipped by decoders using the integer API and should + * only be used if it is known that the far end supports + * extended dynamic range. + * length is frame_size*channels*sizeof(float) + * @param [in] frame_size int: Number of samples per channel in the + * input signal. + * This must be an Opus frame size for + * the encoder's sampling rate. + * For example, at 48 kHz the permitted + * values are 120, 240, 480, 960, 1920, + * and 2880. + * Passing in a duration of less than + * 10 ms (480 samples at 48 kHz) will + * prevent the encoder from using the LPC + * or hybrid modes. + * @param [out] data unsigned char*: Output payload. + * This must contain storage for at + * least \a max_data_bytes. + * @param [in] max_data_bytes opus_int32: Size of the allocated + * memory for the output + * payload. This may be + * used to impose an upper limit on + * the instant bitrate, but should + * not be used as the only bitrate + * control. Use #OPUS_SET_BITRATE to + * control the bitrate. + * @returns The length of the encoded packet (in bytes) on success or a + * negative error code (see @ref opus_errorcodes) on failure. 
+ */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_encode_float( + OpusEncoder *st, + const float *pcm, + int frame_size, + unsigned char *data, + opus_int32 max_data_bytes +) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2) OPUS_ARG_NONNULL(4); + +/** Frees an OpusEncoder allocated by opus_encoder_create(). + * @param[in] st OpusEncoder*: State to be freed. + */ +OPUS_EXPORT void opus_encoder_destroy(OpusEncoder *st); + +/** Perform a CTL function on an Opus encoder. + * + * Generally the request and subsequent arguments are generated + * by a convenience macro. + * @param st OpusEncoder*: Encoder state. + * @param request This and all remaining parameters should be replaced by one + * of the convenience macros in @ref opus_genericctls or + * @ref opus_encoderctls. + * @see opus_genericctls + * @see opus_encoderctls + */ +OPUS_EXPORT int opus_encoder_ctl(OpusEncoder *st, int request, ...) OPUS_ARG_NONNULL(1); +/**@}*/ + +/** @defgroup opus_decoder Opus Decoder + * @{ + * + * @brief This page describes the process and functions used to decode Opus. + * + * The decoding process also starts with creating a decoder + * state. This can be done with: + * @code + * int error; + * OpusDecoder *dec; + * dec = opus_decoder_create(Fs, channels, &error); + * @endcode + * where + * @li Fs is the sampling rate and must be 8000, 12000, 16000, 24000, or 48000 + * @li channels is the number of channels (1 or 2) + * @li error will hold the error code in case of failure (or #OPUS_OK on success) + * @li the return value is a newly created decoder state to be used for decoding + * + * While opus_decoder_create() allocates memory for the state, it's also possible + * to initialize pre-allocated memory: + * @code + * int size; + * int error; + * OpusDecoder *dec; + * size = opus_decoder_get_size(channels); + * dec = malloc(size); + * error = opus_decoder_init(dec, Fs, channels); + * @endcode + * where opus_decoder_get_size() returns the required size for the decoder state. Note that + * future versions of this code may change the size, so no assuptions should be made about it. + * + * The decoder state is always continuous in memory and only a shallow copy is sufficient + * to copy it (e.g. memcpy()) + * + * To decode a frame, opus_decode() or opus_decode_float() must be called with a packet of compressed audio data: + * @code + * frame_size = opus_decode(dec, packet, len, decoded, max_size, 0); + * @endcode + * where + * + * @li packet is the byte array containing the compressed data + * @li len is the exact number of bytes contained in the packet + * @li decoded is the decoded audio data in opus_int16 (or float for opus_decode_float()) + * @li max_size is the max duration of the frame in samples (per channel) that can fit into the decoded_frame array + * + * opus_decode() and opus_decode_float() return the number of samples (per channel) decoded from the packet. + * If that value is negative, then an error has occurred. This can occur if the packet is corrupted or if the audio + * buffer is too small to hold the decoded audio. + * + * Opus is a stateful codec with overlapping blocks and as a result Opus + * packets are not coded independently of each other. Packets must be + * passed into the decoder serially and in the correct order for a correct + * decode. Lost packets can be replaced with loss concealment by calling + * the decoder with a null pointer and zero length for the missing packet. 
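/* [Illustrative sketch, not part of opus.h.] The decode loop described above,
   including loss concealment for a missing packet: pass NULL data and let
   frame_size say how much audio to conceal. Assumes a 48 kHz stereo decoder;
   the function name and the fixed 20 ms concealment size are assumptions. */
#include <opus.h>

#define SKETCH_MAX_FRAME 5760 /* 120 ms at 48 kHz, the largest frame */

int decode_packet_sketch(OpusDecoder *dec, const unsigned char *packet,
                         opus_int32 len, opus_int16 pcm[SKETCH_MAX_FRAME*2])
{
   int samples;
   if (packet != NULL)
      samples = opus_decode(dec, packet, len, pcm, SKETCH_MAX_FRAME, 0);
   else /* lost packet: conceal exactly one 20 ms frame (960 samples) */
      samples = opus_decode(dec, NULL, 0, pcm, 960, 0);
   return samples; /* samples per channel, or a negative error code */
}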
+ * + * A single codec state may only be accessed from a single thread at + * a time and any required locking must be performed by the caller. Separate + * streams must be decoded with separate decoder states and can be decoded + * in parallel unless the library was compiled with NONTHREADSAFE_PSEUDOSTACK + * defined. + * + */ + +/** Opus decoder state. + * This contains the complete state of an Opus decoder. + * It is position independent and can be freely copied. + * @see opus_decoder_create,opus_decoder_init + */ +typedef struct OpusDecoder OpusDecoder; + +/** Gets the size of an OpusDecoder structure. + * @param [in] channels int: Number of channels. + * This must be 1 or 2. + * @returns The size in bytes. + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_decoder_get_size(int channels); + +/** Allocates and initializes a decoder state. + * @param [in] Fs opus_int32: Sample rate to decode at (Hz). + * This must be one of 8000, 12000, 16000, + * 24000, or 48000. + * @param [in] channels int: Number of channels (1 or 2) to decode + * @param [out] error int*: #OPUS_OK Success or @ref opus_errorcodes + * + * Internally Opus stores data at 48000 Hz, so that should be the default + * value for Fs. However, the decoder can efficiently decode to buffers + * at 8, 12, 16, and 24 kHz so if for some reason the caller cannot use + * data at the full sample rate, or knows the compressed data doesn't + * use the full frequency range, it can request decoding at a reduced + * rate. Likewise, the decoder is capable of filling in either mono or + * interleaved stereo pcm buffers, at the caller's request. + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusDecoder *opus_decoder_create( + opus_int32 Fs, + int channels, + int *error +); + +/** Initializes a previously allocated decoder state. + * The state must be at least the size returned by opus_decoder_get_size(). + * This is intended for applications which use their own allocator instead of malloc. @see opus_decoder_create,opus_decoder_get_size + * To reset a previously initialized state, use the #OPUS_RESET_STATE CTL. + * @param [in] st OpusDecoder*: Decoder state. + * @param [in] Fs opus_int32: Sampling rate to decode to (Hz). + * This must be one of 8000, 12000, 16000, + * 24000, or 48000. + * @param [in] channels int: Number of channels (1 or 2) to decode + * @retval #OPUS_OK Success or @ref opus_errorcodes + */ +OPUS_EXPORT int opus_decoder_init( + OpusDecoder *st, + opus_int32 Fs, + int channels +) OPUS_ARG_NONNULL(1); + +/** Decode an Opus packet. + * @param [in] st OpusDecoder*: Decoder state + * @param [in] data char*: Input payload. Use a NULL pointer to indicate packet loss + * @param [in] len opus_int32: Number of bytes in payload* + * @param [out] pcm opus_int16*: Output signal (interleaved if 2 channels). length + * is frame_size*channels*sizeof(opus_int16) + * @param [in] frame_size Number of samples per channel of available space in \a pcm. + * If this is less than the maximum packet duration (120ms; 5760 for 48kHz), this function will + * not be capable of decoding some packets. In the case of PLC (data==NULL) or FEC (decode_fec=1), + * then frame_size needs to be exactly the duration of audio that is missing, otherwise the + * decoder will not be in the optimal state to decode the next incoming packet. For the PLC and + * FEC cases, frame_size must be a multiple of 2.5 ms. + * @param [in] decode_fec int: Flag (0 or 1) to request that any in-band forward error correction data be + * decoded. 
If no such data is available, the frame is decoded as if it were lost. + * @returns Number of decoded samples or @ref opus_errorcodes + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_decode( + OpusDecoder *st, + const unsigned char *data, + opus_int32 len, + opus_int16 *pcm, + int frame_size, + int decode_fec +) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4); + +/** Decode an Opus packet with floating point output. + * @param [in] st OpusDecoder*: Decoder state + * @param [in] data char*: Input payload. Use a NULL pointer to indicate packet loss + * @param [in] len opus_int32: Number of bytes in payload + * @param [out] pcm float*: Output signal (interleaved if 2 channels). length + * is frame_size*channels*sizeof(float) + * @param [in] frame_size Number of samples per channel of available space in \a pcm. + * If this is less than the maximum packet duration (120ms; 5760 for 48kHz), this function will + * not be capable of decoding some packets. In the case of PLC (data==NULL) or FEC (decode_fec=1), + * then frame_size needs to be exactly the duration of audio that is missing, otherwise the + * decoder will not be in the optimal state to decode the next incoming packet. For the PLC and + * FEC cases, frame_size must be a multiple of 2.5 ms. + * @param [in] decode_fec int: Flag (0 or 1) to request that any in-band forward error correction data be + * decoded. If no such data is available the frame is decoded as if it were lost. + * @returns Number of decoded samples or @ref opus_errorcodes + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_decode_float( + OpusDecoder *st, + const unsigned char *data, + opus_int32 len, + float *pcm, + int frame_size, + int decode_fec +) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4); + +/** Perform a CTL function on an Opus decoder. + * + * Generally the request and subsequent arguments are generated + * by a convenience macro. + * @param st OpusDecoder*: Decoder state. + * @param request This and all remaining parameters should be replaced by one + * of the convenience macros in @ref opus_genericctls or + * @ref opus_decoderctls. + * @see opus_genericctls + * @see opus_decoderctls + */ +OPUS_EXPORT int opus_decoder_ctl(OpusDecoder *st, int request, ...) OPUS_ARG_NONNULL(1); + +/** Frees an OpusDecoder allocated by opus_decoder_create(). + * @param[in] st OpusDecoder*: State to be freed. + */ +OPUS_EXPORT void opus_decoder_destroy(OpusDecoder *st); + +/** Parse an opus packet into one or more frames. + * Opus_decode will perform this operation internally so most applications do + * not need to use this function. + * This function does not copy the frames, the returned pointers are pointers into + * the input packet. + * @param [in] data char*: Opus packet to be parsed + * @param [in] len opus_int32: size of data + * @param [out] out_toc char*: TOC pointer + * @param [out] frames char*[48] encapsulated frames + * @param [out] size opus_int16[48] sizes of the encapsulated frames + * @param [out] payload_offset int*: returns the position of the payload within the packet (in bytes) + * @returns number of frames + */ +OPUS_EXPORT int opus_packet_parse( + const unsigned char *data, + opus_int32 len, + unsigned char *out_toc, + const unsigned char *frames[48], + opus_int16 size[48], + int *payload_offset +) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4); + +/** Gets the bandwidth of an Opus packet. 
+ * @param [in] data char*: Opus packet + * @retval OPUS_BANDWIDTH_NARROWBAND Narrowband (4kHz bandpass) + * @retval OPUS_BANDWIDTH_MEDIUMBAND Mediumband (6kHz bandpass) + * @retval OPUS_BANDWIDTH_WIDEBAND Wideband (8kHz bandpass) + * @retval OPUS_BANDWIDTH_SUPERWIDEBAND Superwideband (12kHz bandpass) + * @retval OPUS_BANDWIDTH_FULLBAND Fullband (20kHz bandpass) + * @retval OPUS_INVALID_PACKET The compressed data passed is corrupted or of an unsupported type + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_get_bandwidth(const unsigned char *data) OPUS_ARG_NONNULL(1); + +/** Gets the number of samples per frame from an Opus packet. + * @param [in] data char*: Opus packet. + * This must contain at least one byte of + * data. + * @param [in] Fs opus_int32: Sampling rate in Hz. + * This must be a multiple of 400, or + * inaccurate results will be returned. + * @returns Number of samples per frame. + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_get_samples_per_frame(const unsigned char *data, opus_int32 Fs) OPUS_ARG_NONNULL(1); + +/** Gets the number of channels from an Opus packet. + * @param [in] data char*: Opus packet + * @returns Number of channels + * @retval OPUS_INVALID_PACKET The compressed data passed is corrupted or of an unsupported type + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_get_nb_channels(const unsigned char *data) OPUS_ARG_NONNULL(1); + +/** Gets the number of frames in an Opus packet. + * @param [in] packet char*: Opus packet + * @param [in] len opus_int32: Length of packet + * @returns Number of frames + * @retval OPUS_BAD_ARG Insufficient data was passed to the function + * @retval OPUS_INVALID_PACKET The compressed data passed is corrupted or of an unsupported type + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_get_nb_frames(const unsigned char packet[], opus_int32 len) OPUS_ARG_NONNULL(1); + +/** Gets the number of samples of an Opus packet. + * @param [in] packet char*: Opus packet + * @param [in] len opus_int32: Length of packet + * @param [in] Fs opus_int32: Sampling rate in Hz. + * This must be a multiple of 400, or + * inaccurate results will be returned. + * @returns Number of samples + * @retval OPUS_BAD_ARG Insufficient data was passed to the function + * @retval OPUS_INVALID_PACKET The compressed data passed is corrupted or of an unsupported type + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_packet_get_nb_samples(const unsigned char packet[], opus_int32 len, opus_int32 Fs) OPUS_ARG_NONNULL(1); + +/** Gets the number of samples of an Opus packet. + * @param [in] dec OpusDecoder*: Decoder state + * @param [in] packet char*: Opus packet + * @param [in] len opus_int32: Length of packet + * @returns Number of samples + * @retval OPUS_BAD_ARG Insufficient data was passed to the function + * @retval OPUS_INVALID_PACKET The compressed data passed is corrupted or of an unsupported type + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_decoder_get_nb_samples(const OpusDecoder *dec, const unsigned char packet[], opus_int32 len) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2); + +/** Applies soft-clipping to bring a float signal within the [-1,1] range. If + * the signal is already in that range, nothing is done. If there are values + * outside of [-1,1], then the signal is clipped as smoothly as possible to + * both fit in the range and avoid creating excessive distortion in the + * process. 
+ * @param [in,out] pcm float*: Input PCM and modified PCM + * @param [in] frame_size int Number of samples per channel to process + * @param [in] channels int: Number of channels + * @param [in,out] softclip_mem float*: State memory for the soft clipping process (one float per channel, initialized to zero) + */ +OPUS_EXPORT void opus_pcm_soft_clip(float *pcm, int frame_size, int channels, float *softclip_mem); + + +/**@}*/ + +/** @defgroup opus_repacketizer Repacketizer + * @{ + * + * The repacketizer can be used to merge multiple Opus packets into a single + * packet or alternatively to split Opus packets that have previously been + * merged. Splitting valid Opus packets is always guaranteed to succeed, + * whereas merging valid packets only succeeds if all frames have the same + * mode, bandwidth, and frame size, and when the total duration of the merged + * packet is no more than 120 ms. The 120 ms limit comes from the + * specification and limits decoder memory requirements at a point where + * framing overhead becomes negligible. + * + * The repacketizer currently only operates on elementary Opus + * streams. It will not manipualte multistream packets successfully, except in + * the degenerate case where they consist of data from a single stream. + * + * The repacketizing process starts with creating a repacketizer state, either + * by calling opus_repacketizer_create() or by allocating the memory yourself, + * e.g., + * @code + * OpusRepacketizer *rp; + * rp = (OpusRepacketizer*)malloc(opus_repacketizer_get_size()); + * if (rp != NULL) + * opus_repacketizer_init(rp); + * @endcode + * + * Then the application should submit packets with opus_repacketizer_cat(), + * extract new packets with opus_repacketizer_out() or + * opus_repacketizer_out_range(), and then reset the state for the next set of + * input packets via opus_repacketizer_init(). + * + * For example, to split a sequence of packets into individual frames: + * @code + * unsigned char *data; + * int len; + * while (get_next_packet(&data, &len)) + * { + * unsigned char out[1276]; + * opus_int32 out_len; + * int nb_frames; + * int err; + * int i; + * err = opus_repacketizer_cat(rp, data, len); + * if (err != OPUS_OK) + * { + * release_packet(data); + * return err; + * } + * nb_frames = opus_repacketizer_get_nb_frames(rp); + * for (i = 0; i < nb_frames; i++) + * { + * out_len = opus_repacketizer_out_range(rp, i, i+1, out, sizeof(out)); + * if (out_len < 0) + * { + * release_packet(data); + * return (int)out_len; + * } + * output_next_packet(out, out_len); + * } + * opus_repacketizer_init(rp); + * release_packet(data); + * } + * @endcode + * + * Alternatively, to combine a sequence of frames into packets that each + * contain up to TARGET_DURATION_MS milliseconds of data: + * @code + * // The maximum number of packets with duration TARGET_DURATION_MS occurs + * // when the frame size is 2.5 ms, for a total of (TARGET_DURATION_MS*2/5) + * // packets. 
+ * unsigned char *data[(TARGET_DURATION_MS*2/5)+1];
+ * opus_int32 len[(TARGET_DURATION_MS*2/5)+1];
+ * int nb_packets;
+ * unsigned char out[1277*(TARGET_DURATION_MS*2/5)];
+ * opus_int32 out_len;
+ * int prev_toc;
+ * nb_packets = 0;
+ * while (get_next_packet(data+nb_packets, len+nb_packets))
+ * {
+ *   int nb_frames;
+ *   int err;
+ *   nb_frames = opus_packet_get_nb_frames(data[nb_packets], len[nb_packets]);
+ *   if (nb_frames < 1)
+ *   {
+ *     release_packets(data, nb_packets+1);
+ *     return nb_frames;
+ *   }
+ *   nb_frames += opus_repacketizer_get_nb_frames(rp);
+ *   // If adding the next packet would exceed our target, or it has an
+ *   // incompatible TOC sequence, output the packets we already have before
+ *   // submitting it.
+ *   // N.B., The nb_packets > 0 check ensures we've submitted at least one
+ *   // packet since the last call to opus_repacketizer_init(). Otherwise a
+ *   // single packet longer than TARGET_DURATION_MS would cause us to try to
+ *   // output an (invalid) empty packet. It also ensures that prev_toc has
+ *   // been set to a valid value. Additionally, len[nb_packets] > 0 is
+ *   // guaranteed by the call to opus_packet_get_nb_frames() above, so the
+ *   // reference to data[nb_packets][0] should be valid.
+ *   if (nb_packets > 0 && (
+ *       ((prev_toc & 0xFC) != (data[nb_packets][0] & 0xFC)) ||
+ *       opus_packet_get_samples_per_frame(data[nb_packets], 48000)*nb_frames >
+ *       TARGET_DURATION_MS*48))
+ *   {
+ *     out_len = opus_repacketizer_out(rp, out, sizeof(out));
+ *     if (out_len < 0)
+ *     {
+ *        release_packets(data, nb_packets+1);
+ *        return (int)out_len;
+ *     }
+ *     output_next_packet(out, out_len);
+ *     opus_repacketizer_init(rp);
+ *     release_packets(data, nb_packets);
+ *     data[0] = data[nb_packets];
+ *     len[0] = len[nb_packets];
+ *     nb_packets = 0;
+ *   }
+ *   err = opus_repacketizer_cat(rp, data[nb_packets], len[nb_packets]);
+ *   if (err != OPUS_OK)
+ *   {
+ *     release_packets(data, nb_packets+1);
+ *     return err;
+ *   }
+ *   prev_toc = data[nb_packets][0];
+ *   nb_packets++;
+ * }
+ * // Output the final, partial packet.
+ * if (nb_packets > 0)
+ * {
+ *   out_len = opus_repacketizer_out(rp, out, sizeof(out));
+ *   release_packets(data, nb_packets);
+ *   if (out_len < 0)
+ *     return (int)out_len;
+ *   output_next_packet(out, out_len);
+ * }
+ * @endcode
+ *
+ * An alternate way of merging packets is to simply call opus_repacketizer_cat()
+ * unconditionally until it fails. At that point, the merged packet can be
+ * obtained with opus_repacketizer_out() and the input packet for which
+ * opus_repacketizer_cat() needs to be re-added to a newly reinitialized
+ * repacketizer state.
+ */
+
+typedef struct OpusRepacketizer OpusRepacketizer;
+
+/** Gets the size of an OpusRepacketizer structure.
+ * @returns The size in bytes.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_repacketizer_get_size(void);
+
+/** (Re)initializes a previously allocated repacketizer state.
+ * The state must be at least the size returned by opus_repacketizer_get_size().
+ * This can be used for applications which use their own allocator instead of
+ * malloc().
+ * It must also be called to reset the queue of packets waiting to be
+ * repacketized, which is necessary if the maximum packet duration of 120 ms
+ * is reached or if you wish to submit packets with a different Opus
+ * configuration (coding mode, audio bandwidth, frame size, or channel count).
+ * Failure to do so will prevent a new packet from being added with
+ * opus_repacketizer_cat().
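+ *
+ * A minimal sketch of the flush-reset-retry pattern described above (out and
+ * output_next_packet() are the illustrative names used in the examples
+ * earlier in this section):
+ * @code
+ * if (opus_repacketizer_cat(rp, data, len) != OPUS_OK)
+ * {
+ *   // The state is full or the packet is incompatible: flush, reset, retry.
+ *   opus_int32 out_len = opus_repacketizer_out(rp, out, sizeof(out));
+ *   if (out_len > 0)
+ *     output_next_packet(out, out_len);
+ *   opus_repacketizer_init(rp);
+ *   opus_repacketizer_cat(rp, data, len);
+ * }
+ * @endcode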
+ * @see opus_repacketizer_create + * @see opus_repacketizer_get_size + * @see opus_repacketizer_cat + * @param rp OpusRepacketizer*: The repacketizer state to + * (re)initialize. + * @returns A pointer to the same repacketizer state that was passed in. + */ +OPUS_EXPORT OpusRepacketizer *opus_repacketizer_init(OpusRepacketizer *rp) OPUS_ARG_NONNULL(1); + +/** Allocates memory and initializes the new repacketizer with + * opus_repacketizer_init(). + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusRepacketizer *opus_repacketizer_create(void); + +/** Frees an OpusRepacketizer allocated by + * opus_repacketizer_create(). + * @param[in] rp OpusRepacketizer*: State to be freed. + */ +OPUS_EXPORT void opus_repacketizer_destroy(OpusRepacketizer *rp); + +/** Add a packet to the current repacketizer state. + * This packet must match the configuration of any packets already submitted + * for repacketization since the last call to opus_repacketizer_init(). + * This means that it must have the same coding mode, audio bandwidth, frame + * size, and channel count. + * This can be checked in advance by examining the top 6 bits of the first + * byte of the packet, and ensuring they match the top 6 bits of the first + * byte of any previously submitted packet. + * The total duration of audio in the repacketizer state also must not exceed + * 120 ms, the maximum duration of a single packet, after adding this packet. + * + * The contents of the current repacketizer state can be extracted into new + * packets using opus_repacketizer_out() or opus_repacketizer_out_range(). + * + * In order to add a packet with a different configuration or to add more + * audio beyond 120 ms, you must clear the repacketizer state by calling + * opus_repacketizer_init(). + * If a packet is too large to add to the current repacketizer state, no part + * of it is added, even if it contains multiple frames, some of which might + * fit. + * If you wish to be able to add parts of such packets, you should first use + * another repacketizer to split the packet into pieces and add them + * individually. + * @see opus_repacketizer_out_range + * @see opus_repacketizer_out + * @see opus_repacketizer_init + * @param rp OpusRepacketizer*: The repacketizer state to which to + * add the packet. + * @param[in] data const unsigned char*: The packet data. + * The application must ensure + * this pointer remains valid + * until the next call to + * opus_repacketizer_init() or + * opus_repacketizer_destroy(). + * @param len opus_int32: The number of bytes in the packet data. + * @returns An error code indicating whether or not the operation succeeded. + * @retval #OPUS_OK The packet's contents have been added to the repacketizer + * state. + * @retval #OPUS_INVALID_PACKET The packet did not have a valid TOC sequence, + * the packet's TOC sequence was not compatible + * with previously submitted packets (because + * the coding mode, audio bandwidth, frame size, + * or channel count did not match), or adding + * this packet would increase the total amount of + * audio stored in the repacketizer state to more + * than 120 ms. + */ +OPUS_EXPORT int opus_repacketizer_cat(OpusRepacketizer *rp, const unsigned char *data, opus_int32 len) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2); + + +/** Construct a new packet from data previously submitted to the repacketizer + * state via opus_repacketizer_cat(). + * @param rp OpusRepacketizer*: The repacketizer state from which to + * construct the new packet. 
+ * @param begin int: The index of the first frame in the current + * repacketizer state to include in the output. + * @param end int: One past the index of the last frame in the + * current repacketizer state to include in the + * output. + * @param[out] data const unsigned char*: The buffer in which to + * store the output packet. + * @param maxlen opus_int32: The maximum number of bytes to store in + * the output buffer. In order to guarantee + * success, this should be at least + * 1276 for a single frame, + * or for multiple frames, + * 1277*(end-begin). + * However, 1*(end-begin) plus + * the size of all packet data submitted to + * the repacketizer since the last call to + * opus_repacketizer_init() or + * opus_repacketizer_create() is also + * sufficient, and possibly much smaller. + * @returns The total size of the output packet on success, or an error code + * on failure. + * @retval #OPUS_BAD_ARG [begin,end) was an invalid range of + * frames (begin < 0, begin >= end, or end > + * opus_repacketizer_get_nb_frames()). + * @retval #OPUS_BUFFER_TOO_SMALL \a maxlen was insufficient to contain the + * complete output packet. + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_repacketizer_out_range(OpusRepacketizer *rp, int begin, int end, unsigned char *data, opus_int32 maxlen) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4); + +/** Return the total number of frames contained in packet data submitted to + * the repacketizer state so far via opus_repacketizer_cat() since the last + * call to opus_repacketizer_init() or opus_repacketizer_create(). + * This defines the valid range of packets that can be extracted with + * opus_repacketizer_out_range() or opus_repacketizer_out(). + * @param rp OpusRepacketizer*: The repacketizer state containing the + * frames. + * @returns The total number of frames contained in the packet data submitted + * to the repacketizer state. + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_repacketizer_get_nb_frames(OpusRepacketizer *rp) OPUS_ARG_NONNULL(1); + +/** Construct a new packet from data previously submitted to the repacketizer + * state via opus_repacketizer_cat(). + * This is a convenience routine that returns all the data submitted so far + * in a single packet. + * It is equivalent to calling + * @code + * opus_repacketizer_out_range(rp, 0, opus_repacketizer_get_nb_frames(rp), + * data, maxlen) + * @endcode + * @param rp OpusRepacketizer*: The repacketizer state from which to + * construct the new packet. + * @param[out] data const unsigned char*: The buffer in which to + * store the output packet. + * @param maxlen opus_int32: The maximum number of bytes to store in + * the output buffer. In order to guarantee + * success, this should be at least + * 1277*opus_repacketizer_get_nb_frames(rp). + * However, + * 1*opus_repacketizer_get_nb_frames(rp) + * plus the size of all packet data + * submitted to the repacketizer since the + * last call to opus_repacketizer_init() or + * opus_repacketizer_create() is also + * sufficient, and possibly much smaller. + * @returns The total size of the output packet on success, or an error code + * on failure. + * @retval #OPUS_BUFFER_TOO_SMALL \a maxlen was insufficient to contain the + * complete output packet. + */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_repacketizer_out(OpusRepacketizer *rp, unsigned char *data, opus_int32 maxlen) OPUS_ARG_NONNULL(1); + +/** Pads a given Opus packet to a larger size (possibly changing the TOC sequence). 
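+ *
+ * For example (a sketch; assumes data points to a buffer with room for at
+ * least 255 bytes):
+ * @code
+ * if (opus_packet_pad(data, len, 255) == OPUS_OK)
+ *   len = 255;
+ * @endcode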
+ * @param[in,out] data const unsigned char*: The buffer containing the
+ *                                           packet to pad.
+ * @param len opus_int32: The size of the packet.
+ *                        This must be at least 1.
+ * @param new_len opus_int32: The desired size of the packet after padding.
+ *                            This must be at least as large as len.
+ * @returns an error code
+ * @retval #OPUS_OK on success.
+ * @retval #OPUS_BAD_ARG \a len was less than 1 or new_len was less than len.
+ * @retval #OPUS_INVALID_PACKET \a data did not contain a valid Opus packet.
+ */
+OPUS_EXPORT int opus_packet_pad(unsigned char *data, opus_int32 len, opus_int32 new_len);
+
+/** Remove all padding from a given Opus packet and rewrite the TOC sequence to
+ * minimize space usage.
+ * @param[in,out] data const unsigned char*: The buffer containing the
+ *                                           packet to strip.
+ * @param len opus_int32: The size of the packet.
+ *                        This must be at least 1.
+ * @returns The new size of the output packet on success, or an error code
+ *          on failure.
+ * @retval #OPUS_BAD_ARG \a len was less than 1.
+ * @retval #OPUS_INVALID_PACKET \a data did not contain a valid Opus packet.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_packet_unpad(unsigned char *data, opus_int32 len);
+
+/** Pads a given Opus multi-stream packet to a larger size (possibly changing the TOC sequence).
+ * @param[in,out] data const unsigned char*: The buffer containing the
+ *                                           packet to pad.
+ * @param len opus_int32: The size of the packet.
+ *                        This must be at least 1.
+ * @param new_len opus_int32: The desired size of the packet after padding.
+ *                            This must be at least as large as len.
+ * @param nb_streams opus_int32: The number of streams (not channels) in the packet.
+ *                               This must be at least 1.
+ * @returns an error code
+ * @retval #OPUS_OK on success.
+ * @retval #OPUS_BAD_ARG \a len was less than 1 or new_len was less than len.
+ * @retval #OPUS_INVALID_PACKET \a data did not contain a valid Opus packet.
+ */
+OPUS_EXPORT int opus_multistream_packet_pad(unsigned char *data, opus_int32 len, opus_int32 new_len, int nb_streams);
+
+/** Remove all padding from a given Opus multi-stream packet and rewrite the TOC sequence to
+ * minimize space usage.
+ * @param[in,out] data const unsigned char*: The buffer containing the
+ *                                           packet to strip.
+ * @param len opus_int32: The size of the packet.
+ *                        This must be at least 1.
+ * @param nb_streams opus_int32: The number of streams (not channels) in the packet.
+ *                               This must be at least 1.
+ * @returns The new size of the output packet on success, or an error code
+ *          on failure.
+ * @retval #OPUS_BAD_ARG \a len was less than 1.
+ * @retval #OPUS_INVALID_PACKET \a data did not contain a valid Opus packet.
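+ *
+ * For example (a sketch; shrinks the packet in place and keeps the returned
+ * size):
+ * @code
+ * opus_int32 new_size = opus_multistream_packet_unpad(data, len, nb_streams);
+ * if (new_size > 0)
+ *   len = new_size;
+ * @endcode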
+ */ +OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_multistream_packet_unpad(unsigned char *data, opus_int32 len, int nb_streams); + +/**@}*/ + +#ifdef __cplusplus +} +#endif + +#endif /* OPUS_H */ diff --git a/thirdparty/opus/opus/opus_custom.h b/thirdparty/opus/opus/opus_custom.h new file mode 100644 index 000000000000..41f36bf2fbc9 --- /dev/null +++ b/thirdparty/opus/opus/opus_custom.h @@ -0,0 +1,342 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2008-2012 Gregory Maxwell + Written by Jean-Marc Valin and Gregory Maxwell */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** + @file opus_custom.h + @brief Opus-Custom reference implementation API + */ + +#ifndef OPUS_CUSTOM_H +#define OPUS_CUSTOM_H + +#include "opus_defines.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef CUSTOM_MODES +# define OPUS_CUSTOM_EXPORT OPUS_EXPORT +# define OPUS_CUSTOM_EXPORT_STATIC OPUS_EXPORT +#else +# define OPUS_CUSTOM_EXPORT +# ifdef OPUS_BUILD +# define OPUS_CUSTOM_EXPORT_STATIC static OPUS_INLINE +# else +# define OPUS_CUSTOM_EXPORT_STATIC +# endif +#endif + +/** @defgroup opus_custom Opus Custom + * @{ + * Opus Custom is an optional part of the Opus specification and + * reference implementation which uses a distinct API from the regular + * API and supports frame sizes that are not normally supported.\ Use + * of Opus Custom is discouraged for all but very special applications + * for which a frame size different from 2.5, 5, 10, or 20 ms is needed + * (for either complexity or latency reasons) and where interoperability + * is less important. + * + * In addition to the interoperability limitations the use of Opus custom + * disables a substantial chunk of the codec and generally lowers the + * quality available at a given bitrate. Normally when an application needs + * a different frame size from the codec it should buffer to match the + * sizes but this adds a small amount of delay which may be important + * in some very low latency applications. Some transports (especially + * constant rate RF transports) may also work best with frames of + * particular durations. + * + * Libopus only supports custom modes if they are enabled at compile time. 
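+ *
+ * As a brief sketch (assuming a build where custom modes were enabled), a
+ * matching encoder/decoder pair might be set up as follows, using the
+ * functions declared later in this header:
+ * @code
+ * int err;
+ * OpusCustomMode *mode = opus_custom_mode_create(48000, 960, &err);
+ * OpusCustomEncoder *enc = opus_custom_encoder_create(mode, 2, &err);
+ * OpusCustomDecoder *dec = opus_custom_decoder_create(mode, 2, &err);
+ * ...
+ * opus_custom_encoder_destroy(enc);
+ * opus_custom_decoder_destroy(dec);
+ * opus_custom_mode_destroy(mode);
+ * @endcode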
+ *
+ * The Opus Custom API is similar to the regular API but the
+ * @ref opus_encoder_create and @ref opus_decoder_create calls take
+ * an additional mode parameter which is a structure produced by
+ * a call to @ref opus_custom_mode_create. Both the encoder and decoder
+ * must create a mode using the same sample rate (fs) and frame size
+ * (frame_size) so these parameters must either be signaled out of band
+ * or fixed in a particular implementation.
+ *
+ * Similar to regular Opus the custom modes support on the fly frame size
+ * switching, but the sizes available depend on the particular frame size in
+ * use. For some initial frame sizes only a single on the fly size is
+ * available.
+ */
+
+/** Contains the state of an encoder. One encoder state is needed
+    for each stream. It is initialized once at the beginning of the
+    stream. Do *not* re-initialize the state for every frame.
+   @brief Encoder state
+ */
+typedef struct OpusCustomEncoder OpusCustomEncoder;
+
+/** State of the decoder. One decoder state is needed for each stream.
+    It is initialized once at the beginning of the stream. Do *not*
+    re-initialize the state for every frame.
+   @brief Decoder state
+ */
+typedef struct OpusCustomDecoder OpusCustomDecoder;
+
+/** The mode contains all the information necessary to create an
+    encoder. Both the encoder and decoder need to be initialized
+    with exactly the same mode, otherwise the output will be
+    corrupted.
+   @brief Mode configuration
+ */
+typedef struct OpusCustomMode OpusCustomMode;
+
+/** Creates a new mode struct. This will be passed to an encoder or
+ * decoder. The mode MUST NOT BE DESTROYED until the encoders and
+ * decoders that use it are destroyed as well.
+ * @param [in] Fs int: Sampling rate (8000 to 96000 Hz)
+ * @param [in] frame_size int: Number of samples (per channel) to encode in each
+ *        packet (64 - 1024, prime factorization must contain zero or more 2s, 3s, or 5s and no other primes)
+ * @param [out] error int*: Returned error code (if NULL, no error will be returned)
+ * @return A newly created mode
+ */
+OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT OpusCustomMode *opus_custom_mode_create(opus_int32 Fs, int frame_size, int *error);
+
+/** Destroys a mode struct. Only call this after all encoders and
+ * decoders using this mode are destroyed as well.
+ * @param [in] mode OpusCustomMode*: Mode to be freed.
+ */
+OPUS_CUSTOM_EXPORT void opus_custom_mode_destroy(OpusCustomMode *mode);
+
+
+#if !defined(OPUS_BUILD) || defined(CELT_ENCODER_C)
+
+/* Encoder */
+/** Gets the size of an OpusCustomEncoder structure.
+ * @param [in] mode OpusCustomMode *: Mode configuration
+ * @param [in] channels int: Number of channels
+ * @returns size
+ */
+OPUS_CUSTOM_EXPORT_STATIC OPUS_WARN_UNUSED_RESULT int opus_custom_encoder_get_size(
+    const OpusCustomMode *mode,
+    int channels
+) OPUS_ARG_NONNULL(1);
+
+# ifdef CUSTOM_MODES
+/** Initializes a previously allocated encoder state
+ * The memory pointed to by st must be the size returned by opus_custom_encoder_get_size.
+ * This is intended for applications which use their own allocator instead of malloc.
+ * @see opus_custom_encoder_create(),opus_custom_encoder_get_size()
+ * To reset a previously initialized state use the OPUS_RESET_STATE CTL.
+ * @param [in] st OpusCustomEncoder*: Encoder state
+ * @param [in] mode OpusCustomMode *: Contains all the information about the characteristics of
+ *          the stream (must be the same characteristics as used for the
+ *          decoder)
+ * @param [in] channels int: Number of channels
+ * @return OPUS_OK Success or @ref opus_errorcodes
+ */
+OPUS_CUSTOM_EXPORT int opus_custom_encoder_init(
+    OpusCustomEncoder *st,
+    const OpusCustomMode *mode,
+    int channels
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2);
+# endif
+#endif
+
+
+/** Creates a new encoder state. Each stream needs its own encoder
+ * state (can't be shared across simultaneous streams).
+ * @param [in] mode OpusCustomMode*: Contains all the information about the characteristics of
+ *          the stream (must be the same characteristics as used for the
+ *          decoder)
+ * @param [in] channels int: Number of channels
+ * @param [out] error int*: Returns an error code
+ * @return Newly created encoder state.
+*/
+OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT OpusCustomEncoder *opus_custom_encoder_create(
+    const OpusCustomMode *mode,
+    int channels,
+    int *error
+) OPUS_ARG_NONNULL(1);
+
+
+/** Destroys an encoder state.
+ * @param[in] st OpusCustomEncoder*: State to be freed.
+ */
+OPUS_CUSTOM_EXPORT void opus_custom_encoder_destroy(OpusCustomEncoder *st);
+
+/** Encodes a frame of audio.
+ * @param [in] st OpusCustomEncoder*: Encoder state
+ * @param [in] pcm float*: PCM audio in float format, with a normal range of +/-1.0.
+ *          Samples with a range beyond +/-1.0 are supported but will
+ *          be clipped by decoders using the integer API and should
+ *          only be used if it is known that the far end supports
+ *          extended dynamic range. There must be exactly
+ *          frame_size samples per channel.
+ * @param [in] frame_size int: Number of samples per frame of input signal
+ * @param [out] compressed char *: The compressed data is written here. This may not alias pcm and must be at least maxCompressedBytes long.
+ * @param [in] maxCompressedBytes int: Maximum number of bytes to use for compressing the frame
+ *          (can change from one frame to another)
+ * @return Number of bytes written to "compressed".
+ *         If negative, an error has occurred (see error codes). It is IMPORTANT that
+ *         the length returned be somehow transmitted to the decoder. Otherwise, no
+ *         decoding is possible.
+ */
+OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT int opus_custom_encode_float(
+    OpusCustomEncoder *st,
+    const float *pcm,
+    int frame_size,
+    unsigned char *compressed,
+    int maxCompressedBytes
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2) OPUS_ARG_NONNULL(4);
+
+/** Encodes a frame of audio.
+ * @param [in] st OpusCustomEncoder*: Encoder state
+ * @param [in] pcm opus_int16*: PCM audio in signed 16-bit format (native endian).
+ *          There must be exactly frame_size samples per channel.
+ * @param [in] frame_size int: Number of samples per frame of input signal
+ * @param [out] compressed char *: The compressed data is written here. This may not alias pcm and must be at least maxCompressedBytes long.
+ * @param [in] maxCompressedBytes int: Maximum number of bytes to use for compressing the frame
+ *          (can change from one frame to another)
+ * @return Number of bytes written to "compressed".
+ *         If negative, an error has occurred (see error codes). It is IMPORTANT that
+ *         the length returned be somehow transmitted to the decoder. Otherwise, no
+ *         decoding is possible.
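+ *
+ * For example (a sketch; enc and pcm come from elsewhere, the frame size of
+ * 960 must match the mode's frame size, and handle_error() is an
+ * illustrative placeholder):
+ * @code
+ * unsigned char packet[1024];
+ * int bytes = opus_custom_encode(enc, pcm, 960, packet, sizeof(packet));
+ * if (bytes < 0)
+ *   handle_error(bytes);
+ * // "bytes" must reach the decoder along with the packet contents.
+ * @endcode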
+ */
+OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT int opus_custom_encode(
+    OpusCustomEncoder *st,
+    const opus_int16 *pcm,
+    int frame_size,
+    unsigned char *compressed,
+    int maxCompressedBytes
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2) OPUS_ARG_NONNULL(4);
+
+/** Perform a CTL function on an Opus custom encoder.
+ *
+ * Generally the request and subsequent arguments are generated
+ * by a convenience macro.
+ * @see opus_encoderctls
+ */
+OPUS_CUSTOM_EXPORT int opus_custom_encoder_ctl(OpusCustomEncoder * OPUS_RESTRICT st, int request, ...) OPUS_ARG_NONNULL(1);
+
+
+#if !defined(OPUS_BUILD) || defined(CELT_DECODER_C)
+/* Decoder */
+
+/** Gets the size of an OpusCustomDecoder structure.
+ * @param [in] mode OpusCustomMode *: Mode configuration
+ * @param [in] channels int: Number of channels
+ * @returns size
+ */
+OPUS_CUSTOM_EXPORT_STATIC OPUS_WARN_UNUSED_RESULT int opus_custom_decoder_get_size(
+    const OpusCustomMode *mode,
+    int channels
+) OPUS_ARG_NONNULL(1);
+
+/** Initializes a previously allocated decoder state
+ * The memory pointed to by st must be the size returned by opus_custom_decoder_get_size.
+ * This is intended for applications which use their own allocator instead of malloc.
+ * @see opus_custom_decoder_create(),opus_custom_decoder_get_size()
+ * To reset a previously initialized state use the OPUS_RESET_STATE CTL.
+ * @param [in] st OpusCustomDecoder*: Decoder state
+ * @param [in] mode OpusCustomMode *: Contains all the information about the characteristics of
+ *          the stream (must be the same characteristics as used for the
+ *          encoder)
+ * @param [in] channels int: Number of channels
+ * @return OPUS_OK Success or @ref opus_errorcodes
+ */
+OPUS_CUSTOM_EXPORT_STATIC int opus_custom_decoder_init(
+    OpusCustomDecoder *st,
+    const OpusCustomMode *mode,
+    int channels
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2);
+
+#endif
+
+
+/** Creates a new decoder state. Each stream needs its own decoder state (can't
+ * be shared across simultaneous streams).
+ * @param [in] mode OpusCustomMode*: Contains all the information about the characteristics of the
+ *          stream (must be the same characteristics as used for the encoder)
+ * @param [in] channels int: Number of channels
+ * @param [out] error int*: Returns an error code
+ * @return Newly created decoder state.
+ */
+OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT OpusCustomDecoder *opus_custom_decoder_create(
+    const OpusCustomMode *mode,
+    int channels,
+    int *error
+) OPUS_ARG_NONNULL(1);
+
+/** Destroys a decoder state.
+ * @param[in] st OpusCustomDecoder*: State to be freed.
+ */
+OPUS_CUSTOM_EXPORT void opus_custom_decoder_destroy(OpusCustomDecoder *st);
+
+/** Decode an opus custom frame with floating point output
+ * @param [in] st OpusCustomDecoder*: Decoder state
+ * @param [in] data char*: Input payload. Use a NULL pointer to indicate packet loss
+ * @param [in] len int: Number of bytes in payload
+ * @param [out] pcm float*: Output signal (interleaved if 2 channels). length
+ *          is frame_size*channels*sizeof(float)
+ * @param [in] frame_size Number of samples per channel of available space in *pcm.
+ * @returns Number of decoded samples or @ref opus_errorcodes
+ */
+OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT int opus_custom_decode_float(
+    OpusCustomDecoder *st,
+    const unsigned char *data,
+    int len,
+    float *pcm,
+    int frame_size
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4);
+
+/** Decode an opus custom frame
+ * @param [in] st OpusCustomDecoder*: Decoder state
+ * @param [in] data char*: Input payload.
Use a NULL pointer to indicate packet loss + * @param [in] len int: Number of bytes in payload + * @param [out] pcm opus_int16*: Output signal (interleaved if 2 channels). length + * is frame_size*channels*sizeof(opus_int16) + * @param [in] frame_size Number of samples per channel of available space in *pcm. + * @returns Number of decoded samples or @ref opus_errorcodes + */ +OPUS_CUSTOM_EXPORT OPUS_WARN_UNUSED_RESULT int opus_custom_decode( + OpusCustomDecoder *st, + const unsigned char *data, + int len, + opus_int16 *pcm, + int frame_size +) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4); + +/** Perform a CTL function on an Opus custom decoder. + * + * Generally the request and subsequent arguments are generated + * by a convenience macro. + * @see opus_genericctls + */ +OPUS_CUSTOM_EXPORT int opus_custom_decoder_ctl(OpusCustomDecoder * OPUS_RESTRICT st, int request, ...) OPUS_ARG_NONNULL(1); + +/**@}*/ + +#ifdef __cplusplus +} +#endif + +#endif /* OPUS_CUSTOM_H */ diff --git a/thirdparty/opus/opus/opus_defines.h b/thirdparty/opus/opus/opus_defines.h new file mode 100644 index 000000000000..315412dd1d23 --- /dev/null +++ b/thirdparty/opus/opus/opus_defines.h @@ -0,0 +1,753 @@ +/* Copyright (c) 2010-2011 Xiph.Org Foundation, Skype Limited + Written by Jean-Marc Valin and Koen Vos */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/** + * @file opus_defines.h + * @brief Opus reference implementation constants + */ + +#ifndef OPUS_DEFINES_H +#define OPUS_DEFINES_H + +#include "opus_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** @defgroup opus_errorcodes Error codes + * @{ + */ +/** No error @hideinitializer*/ +#define OPUS_OK 0 +/** One or more invalid/out of range arguments @hideinitializer*/ +#define OPUS_BAD_ARG -1 +/** Not enough bytes allocated in the buffer @hideinitializer*/ +#define OPUS_BUFFER_TOO_SMALL -2 +/** An internal error was detected @hideinitializer*/ +#define OPUS_INTERNAL_ERROR -3 +/** The compressed data passed is corrupted @hideinitializer*/ +#define OPUS_INVALID_PACKET -4 +/** Invalid/unsupported request number @hideinitializer*/ +#define OPUS_UNIMPLEMENTED -5 +/** An encoder or decoder structure is invalid or already freed @hideinitializer*/ +#define OPUS_INVALID_STATE -6 +/** Memory allocation has failed @hideinitializer*/ +#define OPUS_ALLOC_FAIL -7 +/**@}*/ + +/** @cond OPUS_INTERNAL_DOC */ +/**Export control for opus functions */ + +#ifndef OPUS_EXPORT +# if defined(WIN32) +# if defined(OPUS_BUILD) && defined(DLL_EXPORT) +# define OPUS_EXPORT __declspec(dllexport) +# else +# define OPUS_EXPORT +# endif +# elif defined(__GNUC__) && defined(OPUS_BUILD) +# define OPUS_EXPORT __attribute__ ((visibility ("default"))) +# else +# define OPUS_EXPORT +# endif +#endif + +# if !defined(OPUS_GNUC_PREREQ) +# if defined(__GNUC__)&&defined(__GNUC_MINOR__) +# define OPUS_GNUC_PREREQ(_maj,_min) \ + ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min)) +# else +# define OPUS_GNUC_PREREQ(_maj,_min) 0 +# endif +# endif + +#if (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 199901L) ) +# if OPUS_GNUC_PREREQ(3,0) +# define OPUS_RESTRICT __restrict__ +# elif (defined(_MSC_VER) && _MSC_VER >= 1400) +# define OPUS_RESTRICT __restrict +# else +# define OPUS_RESTRICT +# endif +#else +# define OPUS_RESTRICT restrict +#endif + +#if (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 199901L) ) +# if OPUS_GNUC_PREREQ(2,7) +# define OPUS_INLINE __inline__ +# elif (defined(_MSC_VER)) +# define OPUS_INLINE __inline +# else +# define OPUS_INLINE +# endif +#else +# define OPUS_INLINE inline +#endif + +/**Warning attributes for opus functions + * NONNULL is not used in OPUS_BUILD to avoid the compiler optimizing out + * some paranoid null checks. */ +#if defined(__GNUC__) && OPUS_GNUC_PREREQ(3, 4) +# define OPUS_WARN_UNUSED_RESULT __attribute__ ((__warn_unused_result__)) +#else +# define OPUS_WARN_UNUSED_RESULT +#endif +#if !defined(OPUS_BUILD) && defined(__GNUC__) && OPUS_GNUC_PREREQ(3, 4) +# define OPUS_ARG_NONNULL(_x) __attribute__ ((__nonnull__(_x))) +#else +# define OPUS_ARG_NONNULL(_x) +#endif + +/** These are the actual Encoder CTL ID numbers. + * They should not be used directly by applications. 
+ * In general, SETs should be even and GETs should be odd.*/
+#define OPUS_SET_APPLICATION_REQUEST         4000
+#define OPUS_GET_APPLICATION_REQUEST         4001
+#define OPUS_SET_BITRATE_REQUEST             4002
+#define OPUS_GET_BITRATE_REQUEST             4003
+#define OPUS_SET_MAX_BANDWIDTH_REQUEST       4004
+#define OPUS_GET_MAX_BANDWIDTH_REQUEST       4005
+#define OPUS_SET_VBR_REQUEST                 4006
+#define OPUS_GET_VBR_REQUEST                 4007
+#define OPUS_SET_BANDWIDTH_REQUEST           4008
+#define OPUS_GET_BANDWIDTH_REQUEST           4009
+#define OPUS_SET_COMPLEXITY_REQUEST          4010
+#define OPUS_GET_COMPLEXITY_REQUEST          4011
+#define OPUS_SET_INBAND_FEC_REQUEST          4012
+#define OPUS_GET_INBAND_FEC_REQUEST          4013
+#define OPUS_SET_PACKET_LOSS_PERC_REQUEST    4014
+#define OPUS_GET_PACKET_LOSS_PERC_REQUEST    4015
+#define OPUS_SET_DTX_REQUEST                 4016
+#define OPUS_GET_DTX_REQUEST                 4017
+#define OPUS_SET_VBR_CONSTRAINT_REQUEST      4020
+#define OPUS_GET_VBR_CONSTRAINT_REQUEST      4021
+#define OPUS_SET_FORCE_CHANNELS_REQUEST      4022
+#define OPUS_GET_FORCE_CHANNELS_REQUEST      4023
+#define OPUS_SET_SIGNAL_REQUEST              4024
+#define OPUS_GET_SIGNAL_REQUEST              4025
+#define OPUS_GET_LOOKAHEAD_REQUEST           4027
+/* #define OPUS_RESET_STATE 4028 */
+#define OPUS_GET_SAMPLE_RATE_REQUEST         4029
+#define OPUS_GET_FINAL_RANGE_REQUEST         4031
+#define OPUS_GET_PITCH_REQUEST               4033
+#define OPUS_SET_GAIN_REQUEST                4034
+#define OPUS_GET_GAIN_REQUEST                4045 /* Should have been 4035 */
+#define OPUS_SET_LSB_DEPTH_REQUEST           4036
+#define OPUS_GET_LSB_DEPTH_REQUEST           4037
+#define OPUS_GET_LAST_PACKET_DURATION_REQUEST 4039
+#define OPUS_SET_EXPERT_FRAME_DURATION_REQUEST 4040
+#define OPUS_GET_EXPERT_FRAME_DURATION_REQUEST 4041
+#define OPUS_SET_PREDICTION_DISABLED_REQUEST 4042
+#define OPUS_GET_PREDICTION_DISABLED_REQUEST 4043
+
+/* Don't use 4045, it's already taken by OPUS_GET_GAIN_REQUEST */
+
+/* Macros to trigger compilation errors when the wrong types are provided to a CTL */
+#define __opus_check_int(x) (((void)((x) == (opus_int32)0)), (opus_int32)(x))
+#define __opus_check_int_ptr(ptr) ((ptr) + ((ptr) - (opus_int32*)(ptr)))
+#define __opus_check_uint_ptr(ptr) ((ptr) + ((ptr) - (opus_uint32*)(ptr)))
+#define __opus_check_val16_ptr(ptr) ((ptr) + ((ptr) - (opus_val16*)(ptr)))
+/** @endcond */
+
+/** @defgroup opus_ctlvalues Pre-defined values for CTL interface
+ * @see opus_genericctls, opus_encoderctls
+ * @{
+ */
+/* Values for the various encoder CTLs */
+#define OPUS_AUTO                           -1000 /**<Auto/default setting @hideinitializer*/
+#define OPUS_BITRATE_MAX                       -1 /**<Maximum bitrate @hideinitializer*/
+
+/** Best for most VoIP/videoconference applications where listening quality
+ * and intelligibility matter most
+ * @hideinitializer */
+#define OPUS_APPLICATION_VOIP                2048
+/** Best for broadcast/high-fidelity application where the decoded audio
+ * should be as close as possible to the input
+ * @hideinitializer */
+#define OPUS_APPLICATION_AUDIO               2049
+/** Only use when lowest possible latency is required. Voice-optimized modes
+ * cannot be used.
+ * @hideinitializer */
+#define OPUS_APPLICATION_RESTRICTED_LOWDELAY 2051
+
+#define OPUS_SIGNAL_VOICE                    3001 /**< Signal being encoded is voice */
+#define OPUS_SIGNAL_MUSIC                    3002 /**< Signal being encoded is music */
+#define OPUS_BANDWIDTH_NARROWBAND            1101 /**< 4 kHz bandpass @hideinitializer*/
+#define OPUS_BANDWIDTH_MEDIUMBAND            1102 /**< 6 kHz bandpass @hideinitializer*/
+#define OPUS_BANDWIDTH_WIDEBAND              1103 /**< 8 kHz bandpass @hideinitializer*/
+#define OPUS_BANDWIDTH_SUPERWIDEBAND         1104 /**<12 kHz bandpass @hideinitializer*/
+#define OPUS_BANDWIDTH_FULLBAND              1105 /**<20 kHz bandpass @hideinitializer*/
+
+#define OPUS_FRAMESIZE_ARG                   5000 /**< Select frame size from the argument (default) */
+#define OPUS_FRAMESIZE_2_5_MS                5001 /**< Use 2.5 ms frames */
+#define OPUS_FRAMESIZE_5_MS                  5002 /**< Use 5 ms frames */
+#define OPUS_FRAMESIZE_10_MS                 5003 /**< Use 10 ms frames */
+#define OPUS_FRAMESIZE_20_MS                 5004 /**< Use 20 ms frames */
+#define OPUS_FRAMESIZE_40_MS                 5005 /**< Use 40 ms frames */
+#define OPUS_FRAMESIZE_60_MS                 5006 /**< Use 60 ms frames */
+#define OPUS_FRAMESIZE_VARIABLE              5010 /**< Optimize the frame size dynamically */
+
+/**@}*/
+
+/** @defgroup opus_encoderctls Encoder related CTLs
+ *
+ * These are convenience macros for use with the \c opus_encoder_ctl
+ * interface. They are used to generate the appropriate series of
+ * arguments for that call, passing the correct type, size and so
+ * on as expected for each particular request.
+ *
+ * @see opus_genericctls, opus_encoder
+ * @{
+ */
+
+/** Configures the encoder's computational complexity.
+ * The supported range is 0-10 inclusive with 10 representing the highest complexity.
+ * @see OPUS_GET_COMPLEXITY
+ * @param[in] x opus_int32: Allowed values: 0-10, inclusive.
+ *
+ * @hideinitializer */
+#define OPUS_SET_COMPLEXITY(x) OPUS_SET_COMPLEXITY_REQUEST, __opus_check_int(x)
+/** Gets the encoder's complexity configuration.
+ * @see OPUS_SET_COMPLEXITY
+ * @param[out] x opus_int32 *: Returns a value in the range 0-10,
+ *                             inclusive.
+ * @hideinitializer */
+#define OPUS_GET_COMPLEXITY(x) OPUS_GET_COMPLEXITY_REQUEST, __opus_check_int_ptr(x)
+
+/** Configures the bitrate in the encoder.
+ * Rates from 500 to 512000 bits per second are meaningful, as well as the
+ * special values #OPUS_AUTO and #OPUS_BITRATE_MAX.
+ * The value #OPUS_BITRATE_MAX can be used to cause the codec to use as much
+ * rate as it can, which is useful for controlling the rate by adjusting the
+ * output buffer size.
+ * @see OPUS_GET_BITRATE
+ * @param[in] x opus_int32: Bitrate in bits per second. The default
+ *                          is determined based on the number of
+ *                          channels and the input sampling rate.
+ * @hideinitializer */
+#define OPUS_SET_BITRATE(x) OPUS_SET_BITRATE_REQUEST, __opus_check_int(x)
+/** Gets the encoder's bitrate configuration.
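+ *
+ * For example (a sketch; enc is an OpusEncoder created elsewhere):
+ * @code
+ * opus_int32 bitrate;
+ * opus_encoder_ctl(enc, OPUS_GET_BITRATE(&bitrate));
+ * @endcode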
+ * @see OPUS_SET_BITRATE + * @param[out] x opus_int32 *: Returns the bitrate in bits per second. + * The default is determined based on the + * number of channels and the input + * sampling rate. + * @hideinitializer */ +#define OPUS_GET_BITRATE(x) OPUS_GET_BITRATE_REQUEST, __opus_check_int_ptr(x) + +/** Enables or disables variable bitrate (VBR) in the encoder. + * The configured bitrate may not be met exactly because frames must + * be an integer number of bytes in length. + * @see OPUS_GET_VBR + * @see OPUS_SET_VBR_CONSTRAINT + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>0</dt><dd>Hard CBR. For LPC/hybrid modes at very low bit-rate, this can
+ * cause noticeable quality degradation.</dd>
+ * <dt>1</dt><dd>VBR (default). The exact type of VBR is controlled by
+ * #OPUS_SET_VBR_CONSTRAINT.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_VBR(x) OPUS_SET_VBR_REQUEST, __opus_check_int(x) +/** Determine if variable bitrate (VBR) is enabled in the encoder. + * @see OPUS_SET_VBR + * @see OPUS_GET_VBR_CONSTRAINT + * @param[out] x opus_int32 *: Returns one of the following values: + *
+ * <dl>
+ * <dt>0</dt><dd>Hard CBR.</dd>
+ * <dt>1</dt><dd>VBR (default). The exact type of VBR may be retrieved via
+ * #OPUS_GET_VBR_CONSTRAINT.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_VBR(x) OPUS_GET_VBR_REQUEST, __opus_check_int_ptr(x) + +/** Enables or disables constrained VBR in the encoder. + * This setting is ignored when the encoder is in CBR mode. + * @warning Only the MDCT mode of Opus currently heeds the constraint. + * Speech mode ignores it completely, hybrid mode may fail to obey it + * if the LPC layer uses more bitrate than the constraint would have + * permitted. + * @see OPUS_GET_VBR_CONSTRAINT + * @see OPUS_SET_VBR + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>0</dt><dd>Unconstrained VBR.</dd>
+ * <dt>1</dt><dd>Constrained VBR (default). This creates a maximum of one
+ * frame of buffering delay assuming a transport with a
+ * serialization speed of the nominal bitrate.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_VBR_CONSTRAINT(x) OPUS_SET_VBR_CONSTRAINT_REQUEST, __opus_check_int(x) +/** Determine if constrained VBR is enabled in the encoder. + * @see OPUS_SET_VBR_CONSTRAINT + * @see OPUS_GET_VBR + * @param[out] x opus_int32 *: Returns one of the following values: + *
+ * <dl>
+ * <dt>0</dt><dd>Unconstrained VBR.</dd>
+ * <dt>1</dt><dd>Constrained VBR (default).</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_VBR_CONSTRAINT(x) OPUS_GET_VBR_CONSTRAINT_REQUEST, __opus_check_int_ptr(x) + +/** Configures mono/stereo forcing in the encoder. + * This can force the encoder to produce packets encoded as either mono or + * stereo, regardless of the format of the input audio. This is useful when + * the caller knows that the input signal is currently a mono source embedded + * in a stereo stream. + * @see OPUS_GET_FORCE_CHANNELS + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>#OPUS_AUTO</dt><dd>Not forced (default)</dd>
+ * <dt>1</dt><dd>Forced mono</dd>
+ * <dt>2</dt><dd>Forced stereo</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_FORCE_CHANNELS(x) OPUS_SET_FORCE_CHANNELS_REQUEST, __opus_check_int(x) +/** Gets the encoder's forced channel configuration. + * @see OPUS_SET_FORCE_CHANNELS + * @param[out] x opus_int32 *: + *
+ * <dl>
+ * <dt>#OPUS_AUTO</dt><dd>Not forced (default)</dd>
+ * <dt>1</dt><dd>Forced mono</dd>
+ * <dt>2</dt><dd>Forced stereo</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_FORCE_CHANNELS(x) OPUS_GET_FORCE_CHANNELS_REQUEST, __opus_check_int_ptr(x) + +/** Configures the maximum bandpass that the encoder will select automatically. + * Applications should normally use this instead of #OPUS_SET_BANDWIDTH + * (leaving that set to the default, #OPUS_AUTO). This allows the + * application to set an upper bound based on the type of input it is + * providing, but still gives the encoder the freedom to reduce the bandpass + * when the bitrate becomes too low, for better overall quality. + * @see OPUS_GET_MAX_BANDWIDTH + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>OPUS_BANDWIDTH_NARROWBAND</dt><dd>4 kHz passband</dd>
+ * <dt>OPUS_BANDWIDTH_MEDIUMBAND</dt><dd>6 kHz passband</dd>
+ * <dt>OPUS_BANDWIDTH_WIDEBAND</dt><dd>8 kHz passband</dd>
+ * <dt>OPUS_BANDWIDTH_SUPERWIDEBAND</dt><dd>12 kHz passband</dd>
+ * <dt>OPUS_BANDWIDTH_FULLBAND</dt><dd>20 kHz passband (default)</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_MAX_BANDWIDTH(x) OPUS_SET_MAX_BANDWIDTH_REQUEST, __opus_check_int(x) + +/** Gets the encoder's configured maximum allowed bandpass. + * @see OPUS_SET_MAX_BANDWIDTH + * @param[out] x opus_int32 *: Allowed values: + *
+ * <dl>
+ * <dt>#OPUS_BANDWIDTH_NARROWBAND</dt><dd>4 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_MEDIUMBAND</dt><dd>6 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_WIDEBAND</dt><dd>8 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_SUPERWIDEBAND</dt><dd>12 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_FULLBAND</dt><dd>20 kHz passband (default)</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_MAX_BANDWIDTH(x) OPUS_GET_MAX_BANDWIDTH_REQUEST, __opus_check_int_ptr(x) + +/** Sets the encoder's bandpass to a specific value. + * This prevents the encoder from automatically selecting the bandpass based + * on the available bitrate. If an application knows the bandpass of the input + * audio it is providing, it should normally use #OPUS_SET_MAX_BANDWIDTH + * instead, which still gives the encoder the freedom to reduce the bandpass + * when the bitrate becomes too low, for better overall quality. + * @see OPUS_GET_BANDWIDTH + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>#OPUS_AUTO</dt><dd>(default)</dd>
+ * <dt>#OPUS_BANDWIDTH_NARROWBAND</dt><dd>4 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_MEDIUMBAND</dt><dd>6 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_WIDEBAND</dt><dd>8 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_SUPERWIDEBAND</dt><dd>12 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_FULLBAND</dt><dd>20 kHz passband</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_BANDWIDTH(x) OPUS_SET_BANDWIDTH_REQUEST, __opus_check_int(x) + +/** Configures the type of signal being encoded. + * This is a hint which helps the encoder's mode selection. + * @see OPUS_GET_SIGNAL + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>#OPUS_AUTO</dt><dd>(default)</dd>
+ * <dt>#OPUS_SIGNAL_VOICE</dt><dd>Bias thresholds towards choosing LPC or Hybrid modes.</dd>
+ * <dt>#OPUS_SIGNAL_MUSIC</dt><dd>Bias thresholds towards choosing MDCT modes.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_SIGNAL(x) OPUS_SET_SIGNAL_REQUEST, __opus_check_int(x) +/** Gets the encoder's configured signal type. + * @see OPUS_SET_SIGNAL + * @param[out] x opus_int32 *: Returns one of the following values: + *
+ * <dl>
+ * <dt>#OPUS_AUTO</dt><dd>(default)</dd>
+ * <dt>#OPUS_SIGNAL_VOICE</dt><dd>Bias thresholds towards choosing LPC or Hybrid modes.</dd>
+ * <dt>#OPUS_SIGNAL_MUSIC</dt><dd>Bias thresholds towards choosing MDCT modes.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_SIGNAL(x) OPUS_GET_SIGNAL_REQUEST, __opus_check_int_ptr(x) + + +/** Configures the encoder's intended application. + * The initial value is a mandatory argument to the encoder_create function. + * @see OPUS_GET_APPLICATION + * @param[in] x opus_int32: Returns one of the following values: + *
+ * <dl>
+ * <dt>#OPUS_APPLICATION_VOIP</dt>
+ * <dd>Process signal for improved speech intelligibility.</dd>
+ * <dt>#OPUS_APPLICATION_AUDIO</dt>
+ * <dd>Favor faithfulness to the original input.</dd>
+ * <dt>#OPUS_APPLICATION_RESTRICTED_LOWDELAY</dt>
+ * <dd>Configure the minimum possible coding delay by disabling certain modes
+ * of operation.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_APPLICATION(x) OPUS_SET_APPLICATION_REQUEST, __opus_check_int(x) +/** Gets the encoder's configured application. + * @see OPUS_SET_APPLICATION + * @param[out] x opus_int32 *: Returns one of the following values: + *
+ * <dl>
+ * <dt>#OPUS_APPLICATION_VOIP</dt>
+ * <dd>Process signal for improved speech intelligibility.</dd>
+ * <dt>#OPUS_APPLICATION_AUDIO</dt>
+ * <dd>Favor faithfulness to the original input.</dd>
+ * <dt>#OPUS_APPLICATION_RESTRICTED_LOWDELAY</dt>
+ * <dd>Configure the minimum possible coding delay by disabling certain modes
+ * of operation.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_APPLICATION(x) OPUS_GET_APPLICATION_REQUEST, __opus_check_int_ptr(x) + +/** Gets the total samples of delay added by the entire codec. + * This can be queried by the encoder and then the provided number of samples can be + * skipped on from the start of the decoder's output to provide time aligned input + * and output. From the perspective of a decoding application the real data begins this many + * samples late. + * + * The decoder contribution to this delay is identical for all decoders, but the + * encoder portion of the delay may vary from implementation to implementation, + * version to version, or even depend on the encoder's initial configuration. + * Applications needing delay compensation should call this CTL rather than + * hard-coding a value. + * @param[out] x opus_int32 *: Number of lookahead samples + * @hideinitializer */ +#define OPUS_GET_LOOKAHEAD(x) OPUS_GET_LOOKAHEAD_REQUEST, __opus_check_int_ptr(x) + +/** Configures the encoder's use of inband forward error correction (FEC). + * @note This is only applicable to the LPC layer + * @see OPUS_GET_INBAND_FEC + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>0</dt><dd>Disable inband FEC (default).</dd>
+ * <dt>1</dt><dd>Enable inband FEC.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_INBAND_FEC(x) OPUS_SET_INBAND_FEC_REQUEST, __opus_check_int(x) +/** Gets encoder's configured use of inband forward error correction. + * @see OPUS_SET_INBAND_FEC + * @param[out] x opus_int32 *: Returns one of the following values: + *
+ * <dl>
+ * <dt>0</dt><dd>Inband FEC disabled (default).</dd>
+ * <dt>1</dt><dd>Inband FEC enabled.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_INBAND_FEC(x) OPUS_GET_INBAND_FEC_REQUEST, __opus_check_int_ptr(x) + +/** Configures the encoder's expected packet loss percentage. + * Higher values trigger progressively more loss resistant behavior in the encoder + * at the expense of quality at a given bitrate in the absence of packet loss, but + * greater quality under loss. + * @see OPUS_GET_PACKET_LOSS_PERC + * @param[in] x opus_int32: Loss percentage in the range 0-100, inclusive (default: 0). + * @hideinitializer */ +#define OPUS_SET_PACKET_LOSS_PERC(x) OPUS_SET_PACKET_LOSS_PERC_REQUEST, __opus_check_int(x) +/** Gets the encoder's configured packet loss percentage. + * @see OPUS_SET_PACKET_LOSS_PERC + * @param[out] x opus_int32 *: Returns the configured loss percentage + * in the range 0-100, inclusive (default: 0). + * @hideinitializer */ +#define OPUS_GET_PACKET_LOSS_PERC(x) OPUS_GET_PACKET_LOSS_PERC_REQUEST, __opus_check_int_ptr(x) + +/** Configures the encoder's use of discontinuous transmission (DTX). + * @note This is only applicable to the LPC layer + * @see OPUS_GET_DTX + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>0</dt><dd>Disable DTX (default).</dd>
+ * <dt>1</dt><dd>Enable DTX.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_DTX(x) OPUS_SET_DTX_REQUEST, __opus_check_int(x) +/** Gets encoder's configured use of discontinuous transmission. + * @see OPUS_SET_DTX + * @param[out] x opus_int32 *: Returns one of the following values: + *
+ * <dl>
+ * <dt>0</dt><dd>DTX disabled (default).</dd>
+ * <dt>1</dt><dd>DTX enabled.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_DTX(x) OPUS_GET_DTX_REQUEST, __opus_check_int_ptr(x) +/** Configures the depth of signal being encoded. + * + * This is a hint which helps the encoder identify silence and near-silence. + * It represents the number of significant bits of linear intensity below + * which the signal contains ignorable quantization or other noise. + * + * For example, OPUS_SET_LSB_DEPTH(14) would be an appropriate setting + * for G.711 u-law input. OPUS_SET_LSB_DEPTH(16) would be appropriate + * for 16-bit linear pcm input with opus_encode_float(). + * + * When using opus_encode() instead of opus_encode_float(), or when libopus + * is compiled for fixed-point, the encoder uses the minimum of the value + * set here and the value 16. + * + * @see OPUS_GET_LSB_DEPTH + * @param[in] x opus_int32: Input precision in bits, between 8 and 24 + * (default: 24). + * @hideinitializer */ +#define OPUS_SET_LSB_DEPTH(x) OPUS_SET_LSB_DEPTH_REQUEST, __opus_check_int(x) +/** Gets the encoder's configured signal depth. + * @see OPUS_SET_LSB_DEPTH + * @param[out] x opus_int32 *: Input precision in bits, between 8 and + * 24 (default: 24). + * @hideinitializer */ +#define OPUS_GET_LSB_DEPTH(x) OPUS_GET_LSB_DEPTH_REQUEST, __opus_check_int_ptr(x) + +/** Configures the encoder's use of variable duration frames. + * When variable duration is enabled, the encoder is free to use a shorter frame + * size than the one requested in the opus_encode*() call. + * It is then the user's responsibility + * to verify how much audio was encoded by checking the ToC byte of the encoded + * packet. The part of the audio that was not encoded needs to be resent to the + * encoder for the next call. Do not use this option unless you really + * know what you are doing. + * @see OPUS_GET_EXPERT_FRAME_DURATION + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>OPUS_FRAMESIZE_ARG</dt><dd>Select frame size from the argument (default).</dd>
+ * <dt>OPUS_FRAMESIZE_2_5_MS</dt><dd>Use 2.5 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_5_MS</dt><dd>Use 5 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_10_MS</dt><dd>Use 10 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_20_MS</dt><dd>Use 20 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_40_MS</dt><dd>Use 40 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_60_MS</dt><dd>Use 60 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_VARIABLE</dt><dd>Optimize the frame size dynamically.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_EXPERT_FRAME_DURATION(x) OPUS_SET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int(x) +/** Gets the encoder's configured use of variable duration frames. + * @see OPUS_SET_EXPERT_FRAME_DURATION + * @param[out] x opus_int32 *: Returns one of the following values: + *
+ * <dl>
+ * <dt>OPUS_FRAMESIZE_ARG</dt><dd>Select frame size from the argument (default).</dd>
+ * <dt>OPUS_FRAMESIZE_2_5_MS</dt><dd>Use 2.5 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_5_MS</dt><dd>Use 5 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_10_MS</dt><dd>Use 10 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_20_MS</dt><dd>Use 20 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_40_MS</dt><dd>Use 40 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_60_MS</dt><dd>Use 60 ms frames.</dd>
+ * <dt>OPUS_FRAMESIZE_VARIABLE</dt><dd>Optimize the frame size dynamically.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_EXPERT_FRAME_DURATION(x) OPUS_GET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int_ptr(x) + +/** If set to 1, disables almost all use of prediction, making frames almost + * completely independent. This reduces quality. + * @see OPUS_GET_PREDICTION_DISABLED + * @param[in] x opus_int32: Allowed values: + *
+ * <dl>
+ * <dt>0</dt><dd>Enable prediction (default).</dd>
+ * <dt>1</dt><dd>Disable prediction.</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_SET_PREDICTION_DISABLED(x) OPUS_SET_PREDICTION_DISABLED_REQUEST, __opus_check_int(x) +/** Gets the encoder's configured prediction status. + * @see OPUS_SET_PREDICTION_DISABLED + * @param[out] x opus_int32 *: Returns one of the following values: + *
+ * <dl>
+ * <dt>0</dt><dd>Prediction enabled (default).</dd>
+ * <dt>1</dt><dd>Prediction disabled.</dd>
+ * </dl>
+ * @hideinitializer */
+#define OPUS_GET_PREDICTION_DISABLED(x) OPUS_GET_PREDICTION_DISABLED_REQUEST, __opus_check_int_ptr(x)
+
+/**@}*/
+
+/** @defgroup opus_genericctls Generic CTLs
+ *
+ * These macros are used with the \c opus_decoder_ctl and
+ * \c opus_encoder_ctl calls to generate a particular
+ * request.
+ *
+ * When called on an \c OpusDecoder they apply to that
+ * particular decoder instance. When called on an
+ * \c OpusEncoder they apply to the corresponding setting
+ * on that encoder instance, if present.
+ *
+ * Some usage examples:
+ *
+ * @code
+ * int ret;
+ * opus_int32 pitch;
+ * ret = opus_decoder_ctl(dec_ctx, OPUS_GET_PITCH(&pitch));
+ * if (ret != OPUS_OK) return ret;
+ *
+ * opus_encoder_ctl(enc_ctx, OPUS_RESET_STATE);
+ * opus_decoder_ctl(dec_ctx, OPUS_RESET_STATE);
+ *
+ * opus_int32 enc_bw, dec_bw;
+ * opus_encoder_ctl(enc_ctx, OPUS_GET_BANDWIDTH(&enc_bw));
+ * opus_decoder_ctl(dec_ctx, OPUS_GET_BANDWIDTH(&dec_bw));
+ * if (enc_bw != dec_bw) {
+ *   printf("packet bandwidth mismatch!\n");
+ * }
+ * @endcode
+ *
+ * @see opus_encoder, opus_decoder_ctl, opus_encoder_ctl, opus_decoderctls, opus_encoderctls
+ * @{
+ */
+
+/** Resets the codec state to be equivalent to a freshly initialized state.
+ * This should be called when switching streams in order to prevent
+ * the back to back decoding from giving different results from
+ * one at a time decoding.
+ * @hideinitializer */
+#define OPUS_RESET_STATE 4028
+
+/** Gets the final state of the codec's entropy coder.
+ * This is used for testing purposes. The encoder and decoder state
+ * should be identical after coding a payload
+ * (assuming no data corruption or software bugs).
+ *
+ * @param[out] x opus_uint32 *: Entropy coder state
+ *
+ * @hideinitializer */
+#define OPUS_GET_FINAL_RANGE(x) OPUS_GET_FINAL_RANGE_REQUEST, __opus_check_uint_ptr(x)
+
+/** Gets the encoder's configured bandpass or the decoder's last bandpass.
+ * @see OPUS_SET_BANDWIDTH
+ * @param[out] x opus_int32 *: Returns one of the following values:
+ *
+ * <dl>
+ * <dt>#OPUS_AUTO</dt><dd>(default)</dd>
+ * <dt>#OPUS_BANDWIDTH_NARROWBAND</dt><dd>4 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_MEDIUMBAND</dt><dd>6 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_WIDEBAND</dt><dd>8 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_SUPERWIDEBAND</dt><dd>12 kHz passband</dd>
+ * <dt>#OPUS_BANDWIDTH_FULLBAND</dt><dd>20 kHz passband</dd>
+ * </dl>
    + * @hideinitializer */ +#define OPUS_GET_BANDWIDTH(x) OPUS_GET_BANDWIDTH_REQUEST, __opus_check_int_ptr(x) + +/** Gets the sampling rate the encoder or decoder was initialized with. + * This simply returns the Fs value passed to opus_encoder_init() + * or opus_decoder_init(). + * @param[out] x opus_int32 *: Sampling rate of encoder or decoder. + * @hideinitializer + */ +#define OPUS_GET_SAMPLE_RATE(x) OPUS_GET_SAMPLE_RATE_REQUEST, __opus_check_int_ptr(x) + +/**@}*/ + +/** @defgroup opus_decoderctls Decoder related CTLs + * @see opus_genericctls, opus_encoderctls, opus_decoder + * @{ + */ + +/** Configures decoder gain adjustment. + * Scales the decoded output by a factor specified in Q8 dB units. + * This has a maximum range of -32768 to 32767 inclusive, and returns + * OPUS_BAD_ARG otherwise. The default is zero indicating no adjustment. + * This setting survives decoder reset. + * + * gain = pow(10, x/(20.0*256)) + * + * @param[in] x opus_int32: Amount to scale PCM signal by in Q8 dB units. + * @hideinitializer */ +#define OPUS_SET_GAIN(x) OPUS_SET_GAIN_REQUEST, __opus_check_int(x) +/** Gets the decoder's configured gain adjustment. @see OPUS_SET_GAIN + * + * @param[out] x opus_int32 *: Amount to scale PCM signal by in Q8 dB units. + * @hideinitializer */ +#define OPUS_GET_GAIN(x) OPUS_GET_GAIN_REQUEST, __opus_check_int_ptr(x) + +/** Gets the duration (in samples) of the last packet successfully decoded or concealed. + * @param[out] x opus_int32 *: Number of samples (at current sampling rate). + * @hideinitializer */ +#define OPUS_GET_LAST_PACKET_DURATION(x) OPUS_GET_LAST_PACKET_DURATION_REQUEST, __opus_check_int_ptr(x) + +/** Gets the pitch of the last decoded frame, if available. + * This can be used for any post-processing algorithm requiring the use of pitch, + * e.g. time stretching/shortening. If the last frame was not voiced, or if the + * pitch was not coded in the frame, then zero is returned. + * + * This CTL is only implemented for decoder instances. + * + * @param[out] x opus_int32 *: pitch period at 48 kHz (or 0 if not available) + * + * @hideinitializer */ +#define OPUS_GET_PITCH(x) OPUS_GET_PITCH_REQUEST, __opus_check_int_ptr(x) + +/**@}*/ + +/** @defgroup opus_libinfo Opus library information functions + * @{ + */ + +/** Converts an opus error code into a human readable string. + * + * @param[in] error int: Error number + * @returns Error string + */ +OPUS_EXPORT const char *opus_strerror(int error); + +/** Gets the libopus version string. + * + * Applications may look for the substring "-fixed" in the version string to + * determine whether they have a fixed-point or floating-point build at + * runtime. + * + * @returns Version string + */ +OPUS_EXPORT const char *opus_get_version_string(void); +/**@}*/ + +#ifdef __cplusplus +} +#endif + +#endif /* OPUS_DEFINES_H */ diff --git a/thirdparty/opus/opus/opus_multistream.h b/thirdparty/opus/opus/opus_multistream.h new file mode 100644 index 000000000000..3622e009fb53 --- /dev/null +++ b/thirdparty/opus/opus/opus_multistream.h @@ -0,0 +1,660 @@ +/* Copyright (c) 2011 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
diff --git a/thirdparty/opus/opus/opus_multistream.h b/thirdparty/opus/opus/opus_multistream.h
new file mode 100644
index 000000000000..3622e009fb53
--- /dev/null
+++ b/thirdparty/opus/opus/opus_multistream.h
@@ -0,0 +1,660 @@
+/* Copyright (c) 2011 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/**
+ * @file opus_multistream.h
+ * @brief Opus reference implementation multistream API
+ */
+
+#ifndef OPUS_MULTISTREAM_H
+#define OPUS_MULTISTREAM_H
+
+#include "opus.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @cond OPUS_INTERNAL_DOC */
+
+/** Macros to trigger compilation errors when the wrong types are provided to a
+ * CTL. */
+/**@{*/
+#define __opus_check_encstate_ptr(ptr) ((ptr) + ((ptr) - (OpusEncoder**)(ptr)))
+#define __opus_check_decstate_ptr(ptr) ((ptr) + ((ptr) - (OpusDecoder**)(ptr)))
+/**@}*/
+
+/** These are the actual encoder and decoder CTL ID numbers.
+ * They should not be used directly by applications.
+ * In general, SETs should be even and GETs should be odd.*/
+/**@{*/
+#define OPUS_MULTISTREAM_GET_ENCODER_STATE_REQUEST 5120
+#define OPUS_MULTISTREAM_GET_DECODER_STATE_REQUEST 5122
+/**@}*/
+
+/** @endcond */
+
+/** @defgroup opus_multistream_ctls Multistream specific encoder and decoder CTLs
+ *
+ * These are convenience macros that are specific to the
+ * opus_multistream_encoder_ctl() and opus_multistream_decoder_ctl()
+ * interface.
+ * The CTLs from @ref opus_genericctls, @ref opus_encoderctls, and
+ * @ref opus_decoderctls may be applied to a multistream encoder or decoder as
+ * well.
+ * In addition, you may retrieve the encoder or decoder state for a specific
+ * stream via #OPUS_MULTISTREAM_GET_ENCODER_STATE or
+ * #OPUS_MULTISTREAM_GET_DECODER_STATE and apply CTLs to it individually.
+ */
+/**@{*/
+
+/** Gets the encoder state for an individual stream of a multistream encoder.
+ * @param[in] x opus_int32: The index of the stream whose encoder you
+ *              wish to retrieve.
+ *              This must be non-negative and less than the
+ *              streams parameter used to initialize the encoder.
+ * @param[out] y OpusEncoder**: Returns a pointer to the given
+ *              encoder state.
+ * @retval OPUS_BAD_ARG The index of the requested stream was out of range.
+ * @hideinitializer
+ */
+#define OPUS_MULTISTREAM_GET_ENCODER_STATE(x,y) OPUS_MULTISTREAM_GET_ENCODER_STATE_REQUEST, __opus_check_int(x), __opus_check_encstate_ptr(y)
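A minimal sketch (not part of this diff) of the per-stream CTL pattern this macro enables; the function name and the complexity value are illustrative:

```c
#include <opus_multistream.h>

/* Fetch the OpusEncoder backing elementary stream 0 of a multistream
 * encoder, then apply a generic encoder CTL to that stream alone. */
static int lower_first_stream_complexity(OpusMSEncoder *msenc)
{
    OpusEncoder *enc;
    int ret;

    ret = opus_multistream_encoder_ctl(msenc,
        OPUS_MULTISTREAM_GET_ENCODER_STATE(0, &enc));
    if (ret != OPUS_OK)
        return ret; /* OPUS_BAD_ARG if the stream index is out of range. */

    return opus_encoder_ctl(enc, OPUS_SET_COMPLEXITY(5));
}
```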
+
+/** Gets the decoder state for an individual stream of a multistream decoder.
+ * @param[in] x opus_int32: The index of the stream whose decoder you
+ *              wish to retrieve.
+ *              This must be non-negative and less than the
+ *              streams parameter used to initialize the decoder.
+ * @param[out] y OpusDecoder**: Returns a pointer to the given
+ *              decoder state.
+ * @retval OPUS_BAD_ARG The index of the requested stream was out of range.
+ * @hideinitializer
+ */
+#define OPUS_MULTISTREAM_GET_DECODER_STATE(x,y) OPUS_MULTISTREAM_GET_DECODER_STATE_REQUEST, __opus_check_int(x), __opus_check_decstate_ptr(y)
+
+/**@}*/
+
+/** @defgroup opus_multistream Opus Multistream API
+ * @{
+ *
+ * The multistream API allows individual Opus streams to be combined into a
+ * single packet, enabling support for up to 255 channels. Unlike an
+ * elementary Opus stream, the encoder and decoder must negotiate the channel
+ * configuration before the decoder can successfully interpret the data in the
+ * packets produced by the encoder. Some basic information, such as packet
+ * duration, can be computed without any special negotiation.
+ *
+ * The format for multistream Opus packets is defined in
+ * RFC 7845
+ * and is based on the self-delimited Opus framing described in Appendix B of
+ * RFC 6716.
+ * Normal Opus packets are just a degenerate case of multistream Opus packets,
+ * and can be encoded or decoded with the multistream API by setting
+ * streams to 1 when initializing the encoder or decoder.
+ *
+ * Multistream Opus streams can contain up to 255 elementary Opus streams.
+ * These may be either "uncoupled" or "coupled", indicating that the decoder
+ * is configured to decode them to either 1 or 2 channels, respectively.
+ * The streams are ordered so that all coupled streams appear at the
+ * beginning.
+ *
+ * A mapping table defines which decoded channel i
+ * should be used for each input/output (I/O) channel j. This table is
+ * typically provided as an unsigned char array.
+ * Let i = mapping[j] be the index for I/O channel j.
+ * If i < 2*coupled_streams, then I/O channel j is
+ * encoded as the left channel of stream (i/2) if i
+ * is even, or as the right channel of stream (i/2) if
+ * i is odd. Otherwise, I/O channel j is encoded as
+ * mono in stream (i - coupled_streams), unless it has the special
+ * value 255, in which case it is omitted from the encoding entirely (the
+ * decoder will reproduce it as silence). Each value i must either
+ * be the special value 255 or be less than streams + coupled_streams.
+ *
+ * The output channels specified by the encoder
+ * should use the
+ * Vorbis
+ * channel ordering. A decoder may wish to apply an additional permutation
+ * to the mapping the encoder used to achieve a different output channel
+ * order (e.g. for outputting in WAV order).
+ *
+ * Each multistream packet contains an Opus packet for each stream, and all of
+ * the Opus packets in a single multistream packet must have the same
+ * duration. Therefore the duration of a multistream packet can be extracted
+ * from the TOC sequence of the first stream, which is located at the
+ * beginning of the packet, just like an elementary Opus stream:
+ *
+ * @code
+ * int nb_samples;
+ * int nb_frames;
+ * nb_frames = opus_packet_get_nb_frames(data, len);
+ * if (nb_frames < 1)
+ *   return nb_frames;
+ * nb_samples = opus_packet_get_samples_per_frame(data, 48000) * nb_frames;
+ * @endcode
+ *
+ * The general encoding and decoding process proceeds exactly the same as in
+ * the normal @ref opus_encoder and @ref opus_decoder APIs.
+ * See their documentation for an overview of how to use the corresponding
+ * multistream functions.
+ */
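A minimal sketch of the mapping-table semantics described in the overview above; `describe_mapping` is an illustrative helper, not part of the API:

```c
#include <stdio.h>

/* For each I/O channel j, report which elementary stream carries it and
 * how, following the rules stated in the overview: coupled streams come
 * first (left for even indices, right for odd), then mono streams, and
 * the special value 255 means the channel is not coded at all. */
static void describe_mapping(const unsigned char *mapping, int channels,
                             int streams, int coupled_streams)
{
    int j;
    for (j = 0; j < channels; j++) {
        int i = mapping[j]; /* must be 255 or < streams + coupled_streams */
        if (i == 255)
            printf("channel %d: silence (not coded)\n", j);
        else if (i < 2 * coupled_streams)
            printf("channel %d: stream %d, %s channel\n",
                   j, i / 2, (i % 2) ? "right" : "left");
        else
            printf("channel %d: stream %d, mono\n", j, i - coupled_streams);
    }
    (void)streams;
}
```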
+
+/** Opus multistream encoder state.
+ * This contains the complete state of a multistream Opus encoder.
+ * It is position independent and can be freely copied.
+ * @see opus_multistream_encoder_create
+ * @see opus_multistream_encoder_init
+ */
+typedef struct OpusMSEncoder OpusMSEncoder;
+
+/** Opus multistream decoder state.
+ * This contains the complete state of a multistream Opus decoder.
+ * It is position independent and can be freely copied.
+ * @see opus_multistream_decoder_create
+ * @see opus_multistream_decoder_init
+ */
+typedef struct OpusMSDecoder OpusMSDecoder;
+
+/**\name Multistream encoder functions */
+/**@{*/
+
+/** Gets the size of an OpusMSEncoder structure.
+ * @param streams int: The total number of streams to encode from the input.
+ *                This must be no more than 255.
+ * @param coupled_streams int: Number of coupled (2 channel) streams
+ *                        to encode.
+ *                        This must be no larger than the total number of
+ *                        streams.
+ *                        Additionally, the total number of encoded channels
+ *                        (streams + coupled_streams) must be no
+ *                        more than 255.
+ * @returns The size in bytes on success, or a negative error code
+ *          (see @ref opus_errorcodes) on error.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_multistream_encoder_get_size(
+    int streams,
+    int coupled_streams
+);
+
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_multistream_surround_encoder_get_size(
+    int channels,
+    int mapping_family
+);
+
+
+/** Allocates and initializes a multistream encoder state.
+ * Call opus_multistream_encoder_destroy() to release
+ * this object when finished.
+ * @param Fs opus_int32: Sampling rate of the input signal (in Hz).
+ *           This must be one of 8000, 12000, 16000, 24000, or 48000.
+ * @param channels int: Number of channels in the input signal.
+ *                 This must be at most 255.
+ *                 It may be greater than the number of coded channels
+ *                 (streams + coupled_streams).
+ * @param streams int: The total number of streams to encode from the input.
+ *                This must be no more than the number of channels.
+ * @param coupled_streams int: Number of coupled (2 channel) streams
+ *                        to encode.
+ *                        This must be no larger than the total number of
+ *                        streams.
+ *                        Additionally, the total number of encoded channels
+ *                        (streams + coupled_streams) must be no
+ *                        more than the number of input channels.
+ * @param[in] mapping const unsigned char[channels]: Mapping from
+ *                    encoded channels to input channels, as described in
+ *                    @ref opus_multistream. As an extra constraint, the
+ *                    multistream encoder does not allow encoding coupled
+ *                    streams for which one channel is unused since this
+ *                    is never a good idea.
+ * @param application int: The target encoder application.
+ *                    This must be one of the following:
+ * <dl>
+ * <dt>#OPUS_APPLICATION_VOIP</dt>
+ * <dd>Process signal for improved speech intelligibility.</dd>
+ * <dt>#OPUS_APPLICATION_AUDIO</dt>
+ * <dd>Favor faithfulness to the original input.</dd>
+ * <dt>#OPUS_APPLICATION_RESTRICTED_LOWDELAY</dt>
+ * <dd>Configure the minimum possible coding delay by disabling certain modes
+ * of operation.</dd>
+ * </dl>
+ *
+ * @param[out] error int *: Returns #OPUS_OK on success, or an error
+ *                   code (see @ref opus_errorcodes) on failure.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusMSEncoder *opus_multistream_encoder_create(
+    opus_int32 Fs,
+    int channels,
+    int streams,
+    int coupled_streams,
+    const unsigned char *mapping,
+    int application,
+    int *error
+) OPUS_ARG_NONNULL(5);
+
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusMSEncoder *opus_multistream_surround_encoder_create(
+    opus_int32 Fs,
+    int channels,
+    int mapping_family,
+    int *streams,
+    int *coupled_streams,
+    unsigned char *mapping,
+    int application,
+    int *error
+) OPUS_ARG_NONNULL(5);
+
+/** Initialize a previously allocated multistream encoder state.
+ * The memory pointed to by \a st must be at least the size returned by
+ * opus_multistream_encoder_get_size().
+ * This is intended for applications which use their own allocator instead of
+ * malloc.
+ * To reset a previously initialized state, use the #OPUS_RESET_STATE CTL.
+ * @see opus_multistream_encoder_create
+ * @see opus_multistream_encoder_get_size
+ * @param st OpusMSEncoder*: Multistream encoder state to initialize.
+ * @param Fs opus_int32: Sampling rate of the input signal (in Hz).
+ *           This must be one of 8000, 12000, 16000, 24000, or 48000.
+ * @param channels int: Number of channels in the input signal.
+ *                 This must be at most 255.
+ *                 It may be greater than the number of coded channels
+ *                 (streams + coupled_streams).
+ * @param streams int: The total number of streams to encode from the input.
+ *                This must be no more than the number of channels.
+ * @param coupled_streams int: Number of coupled (2 channel) streams
+ *                        to encode.
+ *                        This must be no larger than the total number of
+ *                        streams.
+ *                        Additionally, the total number of encoded channels
+ *                        (streams + coupled_streams) must be no
+ *                        more than the number of input channels.
+ * @param[in] mapping const unsigned char[channels]: Mapping from
+ *                    encoded channels to input channels, as described in
+ *                    @ref opus_multistream. As an extra constraint, the
+ *                    multistream encoder does not allow encoding coupled
+ *                    streams for which one channel is unused since this
+ *                    is never a good idea.
+ * @param application int: The target encoder application.
+ *                    This must be one of the following:
+ * <dl>
+ * <dt>#OPUS_APPLICATION_VOIP</dt>
+ * <dd>Process signal for improved speech intelligibility.</dd>
+ * <dt>#OPUS_APPLICATION_AUDIO</dt>
+ * <dd>Favor faithfulness to the original input.</dd>
+ * <dt>#OPUS_APPLICATION_RESTRICTED_LOWDELAY</dt>
+ * <dd>Configure the minimum possible coding delay by disabling certain modes
+ * of operation.</dd>
+ * </dl>
+ *
+ * @returns #OPUS_OK on success, or an error code (see @ref opus_errorcodes)
+ *          on failure.
+ */
+OPUS_EXPORT int opus_multistream_encoder_init(
+    OpusMSEncoder *st,
+    opus_int32 Fs,
+    int channels,
+    int streams,
+    int coupled_streams,
+    const unsigned char *mapping,
+    int application
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(6);
+
+OPUS_EXPORT int opus_multistream_surround_encoder_init(
+    OpusMSEncoder *st,
+    opus_int32 Fs,
+    int channels,
+    int mapping_family,
+    int *streams,
+    int *coupled_streams,
+    unsigned char *mapping,
+    int application
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(6);
+
+/** Encodes a multistream Opus frame.
+ * @param st OpusMSEncoder*: Multistream encoder state.
+ * @param[in] pcm const opus_int16*: The input signal as interleaved samples.
+ *                This must contain frame_size*channels samples.
+ * @param frame_size int: Number of samples per channel in the input signal.
+ *                   This must be an Opus frame size for the encoder's
+ *                   sampling rate.
+ *                   For example, at 48 kHz the permitted values are 120,
+ *                   240, 480, 960, 1920, and 2880.
+ *                   Passing in a duration of less than 10 ms (480 samples
+ *                   at 48 kHz) will prevent the encoder from using the LPC
+ *                   or hybrid modes.
+ * @param[out] data unsigned char*: Output payload.
+ *                  This must contain storage for at least \a max_data_bytes.
+ * @param [in] max_data_bytes opus_int32: Size of the allocated memory for
+ *                  the output payload. This may be used to impose an upper
+ *                  limit on the instant bitrate, but should not be used as
+ *                  the only bitrate control. Use #OPUS_SET_BITRATE to
+ *                  control the bitrate.
+ * @returns The length of the encoded packet (in bytes) on success or a
+ *          negative error code (see @ref opus_errorcodes) on failure.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_multistream_encode(
+    OpusMSEncoder *st,
+    const opus_int16 *pcm,
+    int frame_size,
+    unsigned char *data,
+    opus_int32 max_data_bytes
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2) OPUS_ARG_NONNULL(4);
+
+/** Encodes a multistream Opus frame from floating point input.
+ * @param st OpusMSEncoder*: Multistream encoder state.
+ * @param[in] pcm const float*: The input signal as interleaved samples with
+ *                a normal range of +/-1.0.
+ *                Samples with a range beyond +/-1.0 are supported but will
+ *                be clipped by decoders using the integer API and should
+ *                only be used if it is known that the far end supports
+ *                extended dynamic range.
+ *                This must contain frame_size*channels samples.
+ * @param frame_size int: Number of samples per channel in the input signal.
+ *                   This must be an Opus frame size for the encoder's
+ *                   sampling rate.
+ *                   For example, at 48 kHz the permitted values are 120,
+ *                   240, 480, 960, 1920, and 2880.
+ *                   Passing in a duration of less than 10 ms (480 samples
+ *                   at 48 kHz) will prevent the encoder from using the LPC
+ *                   or hybrid modes.
+ * @param[out] data unsigned char*: Output payload.
+ *                  This must contain storage for at least \a max_data_bytes.
+ * @param [in] max_data_bytes opus_int32: Size of the allocated memory for
+ *                  the output payload. This may be used to impose an upper
+ *                  limit on the instant bitrate, but should not be used as
+ *                  the only bitrate control. Use #OPUS_SET_BITRATE to
+ *                  control the bitrate.
+ * @returns The length of the encoded packet (in bytes) on success or a
+ *          negative error code (see @ref opus_errorcodes) on failure.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_multistream_encode_float(
+    OpusMSEncoder *st,
+    const float *pcm,
+    int frame_size,
+    unsigned char *data,
+    opus_int32 max_data_bytes
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(2) OPUS_ARG_NONNULL(4);
+
+/** Frees an OpusMSEncoder allocated by
+ * opus_multistream_encoder_create().
+ * @param st OpusMSEncoder*: Multistream encoder state to be freed.
+ */
+OPUS_EXPORT void opus_multistream_encoder_destroy(OpusMSEncoder *st);
+
+/** Perform a CTL function on a multistream Opus encoder.
+ *
+ * Generally the request and subsequent arguments are generated by a
+ * convenience macro.
+ * @param st OpusMSEncoder*: Multistream encoder state.
+ * @param request This and all remaining parameters should be replaced by one
+ *                of the convenience macros in @ref opus_genericctls,
+ *                @ref opus_encoderctls, or @ref opus_multistream_ctls.
+ * @see opus_genericctls
+ * @see opus_encoderctls
+ * @see opus_multistream_ctls
+ */
+OPUS_EXPORT int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) OPUS_ARG_NONNULL(1);
+
+/**@}*/
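A minimal sketch (not part of this diff) of the encoder entry points declared above, using the degenerate stereo case (one coupled stream) mentioned in the overview; the helper name and the 10 ms frame size are illustrative:

```c
#include <opus_multistream.h>

/* Encode one 10 ms stereo frame (480 samples per channel at 48 kHz).
 * channels=2, streams=1, coupled_streams=1, mapping {0,1}: channel 0 is
 * the left and channel 1 the right side of the single coupled stream. */
int encode_stereo_frame(const opus_int16 *pcm_480x2,
                        unsigned char *out, opus_int32 out_max)
{
    const unsigned char mapping[2] = {0, 1};
    int err;
    OpusMSEncoder *enc = opus_multistream_encoder_create(
        48000, 2, 1, 1, mapping, OPUS_APPLICATION_AUDIO, &err);
    int len;

    if (err != OPUS_OK)
        return err;
    len = opus_multistream_encode(enc, pcm_480x2, 480, out, out_max);
    opus_multistream_encoder_destroy(enc);
    return len; /* packet length in bytes, or a negative error code */
}
```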
+
+/**\name Multistream decoder functions */
+/**@{*/
+
+/** Gets the size of an OpusMSDecoder structure.
+ * @param streams int: The total number of streams coded in the input.
+ *                This must be no more than 255.
+ * @param coupled_streams int: Number of streams to decode as coupled
+ *                        (2 channel) streams.
+ *                        This must be no larger than the total number of
+ *                        streams.
+ *                        Additionally, the total number of coded channels
+ *                        (streams + coupled_streams) must be no
+ *                        more than 255.
+ * @returns The size in bytes on success, or a negative error code
+ *          (see @ref opus_errorcodes) on error.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT opus_int32 opus_multistream_decoder_get_size(
+    int streams,
+    int coupled_streams
+);
+
+/** Allocates and initializes a multistream decoder state.
+ * Call opus_multistream_decoder_destroy() to release
+ * this object when finished.
+ * @param Fs opus_int32: Sampling rate to decode at (in Hz).
+ *           This must be one of 8000, 12000, 16000, 24000, or 48000.
+ * @param channels int: Number of channels to output.
+ *                 This must be at most 255.
+ *                 It may be different from the number of coded channels
+ *                 (streams + coupled_streams).
+ * @param streams int: The total number of streams coded in the input.
+ *                This must be no more than 255.
+ * @param coupled_streams int: Number of streams to decode as coupled
+ *                        (2 channel) streams.
+ *                        This must be no larger than the total number of
+ *                        streams.
+ *                        Additionally, the total number of coded channels
+ *                        (streams + coupled_streams) must be no
+ *                        more than 255.
+ * @param[in] mapping const unsigned char[channels]: Mapping from coded
+ *                    channels to output channels, as described in
+ *                    @ref opus_multistream.
+ * @param[out] error int *: Returns #OPUS_OK on success, or an error code
+ *                   (see @ref opus_errorcodes) on failure.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusMSDecoder *opus_multistream_decoder_create(
+    opus_int32 Fs,
+    int channels,
+    int streams,
+    int coupled_streams,
+    const unsigned char *mapping,
+    int *error
+) OPUS_ARG_NONNULL(5);
+
+/** Initialize a previously allocated decoder state object.
+ * The memory pointed to by \a st must be at least the size returned by
+ * opus_multistream_decoder_get_size().
+ * This is intended for applications which use their own allocator instead of
+ * malloc.
+ * To reset a previously initialized state, use the #OPUS_RESET_STATE CTL.
+ * @see opus_multistream_decoder_create
+ * @see opus_multistream_decoder_get_size
+ * @param st OpusMSDecoder*: Multistream decoder state to initialize.
+ * @param Fs opus_int32: Sampling rate to decode at (in Hz).
+ *           This must be one of 8000, 12000, 16000, 24000, or 48000.
+ * @param channels int: Number of channels to output.
+ *                 This must be at most 255.
+ *                 It may be different from the number of coded channels
+ *                 (streams + coupled_streams).
+ * @param streams int: The total number of streams coded in the input.
+ *                This must be no more than 255.
+ * @param coupled_streams int: Number of streams to decode as coupled
+ *                        (2 channel) streams.
+ *                        This must be no larger than the total number of
+ *                        streams.
+ *                        Additionally, the total number of coded channels
+ *                        (streams + coupled_streams) must be no
+ *                        more than 255.
+ * @param[in] mapping const unsigned char[channels]: Mapping from coded
+ *                    channels to output channels, as described in
+ *                    @ref opus_multistream.
+ * @returns #OPUS_OK on success, or an error code (see @ref opus_errorcodes)
+ *          on failure.
+ */
+OPUS_EXPORT int opus_multistream_decoder_init(
+    OpusMSDecoder *st,
+    opus_int32 Fs,
+    int channels,
+    int streams,
+    int coupled_streams,
+    const unsigned char *mapping
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(6);
+
+/** Decode a multistream Opus packet.
+ * @param st OpusMSDecoder*: Multistream decoder state.
+ * @param[in] data const unsigned char*: Input payload.
+ *                 Use a NULL pointer to indicate packet loss.
+ * @param len opus_int32: Number of bytes in payload.
+ * @param[out] pcm opus_int16*: Output signal, with interleaved samples.
+ *                 This must contain room for frame_size*channels samples.
+ * @param frame_size int: The number of samples per channel of available
+ *                   space in \a pcm.
+ *                   If this is less than the maximum packet duration
+ *                   (120 ms; 5760 for 48 kHz), this function will not be
+ *                   capable of decoding some packets. In the case of PLC
+ *                   (data==NULL) or FEC (decode_fec=1), then frame_size
+ *                   needs to be exactly the duration of audio that is
+ *                   missing, otherwise the decoder will not be in the
+ *                   optimal state to decode the next incoming packet. For
+ *                   the PLC and FEC cases, frame_size must be a multiple
+ *                   of 2.5 ms.
+ * @param decode_fec int: Flag (0 or 1) to request that any in-band forward
+ *                   error correction data be decoded.
+ *                   If no such data is available, the frame is decoded as
+ *                   if it were lost.
+ * @returns Number of samples decoded on success or a negative error code
+ *          (see @ref opus_errorcodes) on failure.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_multistream_decode(
+    OpusMSDecoder *st,
+    const unsigned char *data,
+    opus_int32 len,
+    opus_int16 *pcm,
+    int frame_size,
+    int decode_fec
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4);
+
+/** Decode a multistream Opus packet with floating point output.
+ * @param st OpusMSDecoder*: Multistream decoder state.
+ * @param[in] data const unsigned char*: Input payload.
+ *                 Use a NULL pointer to indicate packet loss.
+ * @param len opus_int32: Number of bytes in payload.
+ * @param[out] pcm float*: Output signal, with interleaved samples.
+ *                 This must contain room for frame_size*channels samples.
+ * @param frame_size int: The number of samples per channel of available
+ *                   space in \a pcm.
+ *                   If this is less than the maximum packet duration
+ *                   (120 ms; 5760 for 48 kHz), this function will not be
+ *                   capable of decoding some packets. In the case of PLC
+ *                   (data==NULL) or FEC (decode_fec=1), then frame_size
+ *                   needs to be exactly the duration of audio that is
+ *                   missing, otherwise the decoder will not be in the
+ *                   optimal state to decode the next incoming packet. For
+ *                   the PLC and FEC cases, frame_size must be a multiple
+ *                   of 2.5 ms.
+ * @param decode_fec int: Flag (0 or 1) to request that any in-band forward
+ *                   error correction data be decoded.
+ *                   If no such data is available, the frame is decoded as
+ *                   if it were lost.
+ * @returns Number of samples decoded on success or a negative error code
+ *          (see @ref opus_errorcodes) on failure.
+ */
+OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_multistream_decode_float(
+    OpusMSDecoder *st,
+    const unsigned char *data,
+    opus_int32 len,
+    float *pcm,
+    int frame_size,
+    int decode_fec
+) OPUS_ARG_NONNULL(1) OPUS_ARG_NONNULL(4);
+
+/** Perform a CTL function on a multistream Opus decoder.
+ *
+ * Generally the request and subsequent arguments are generated by a
+ * convenience macro.
+ * @param st OpusMSDecoder*: Multistream decoder state.
+ * @param request This and all remaining parameters should be replaced by one
+ *                of the convenience macros in @ref opus_genericctls,
+ *                @ref opus_decoderctls, or @ref opus_multistream_ctls.
+ * @see opus_genericctls
+ * @see opus_decoderctls
+ * @see opus_multistream_ctls
+ */
+OPUS_EXPORT int opus_multistream_decoder_ctl(OpusMSDecoder *st, int request, ...) OPUS_ARG_NONNULL(1);
+
+/** Frees an OpusMSDecoder allocated by
+ * opus_multistream_decoder_create().
+ * @param st OpusMSDecoder*: Multistream decoder state to be freed.
+ */
+OPUS_EXPORT void opus_multistream_decoder_destroy(OpusMSDecoder *st);
+
+/**@}*/
+
+/**@}*/
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* OPUS_MULTISTREAM_H */
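A minimal sketch (not part of this diff) of the decode path declared above, including the PLC convention of a NULL payload with the exact missing duration; `decode_one` and `lost_duration` are illustrative names:

```c
#include <stddef.h>
#include <opus_multistream.h>

int decode_one(OpusMSDecoder *dec, const unsigned char *data, opus_int32 len,
               opus_int16 *pcm, int lost_duration)
{
    if (data == NULL) {
        /* PLC: signal loss with a NULL payload; frame_size must be exactly
         * the missing duration, a multiple of 2.5 ms. */
        return opus_multistream_decode(dec, NULL, 0, pcm, lost_duration, 0);
    }
    /* 5760 samples per channel covers the maximum 120 ms packet at 48 kHz. */
    return opus_multistream_decode(dec, data, len, pcm, 5760, 0);
}
```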
diff --git a/thirdparty/opus/opus/opus_types.h b/thirdparty/opus/opus/opus_types.h
new file mode 100644
index 000000000000..b28e03aea202
--- /dev/null
+++ b/thirdparty/opus/opus/opus_types.h
@@ -0,0 +1,159 @@
+/* (C) COPYRIGHT 1994-2002 Xiph.Org Foundation */
+/* Modified by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* opus_types.h based on ogg_types.h from libogg */
+
+/**
+   @file opus_types.h
+   @brief Opus reference implementation types
+*/
+#ifndef OPUS_TYPES_H
+#define OPUS_TYPES_H
+
+/* Use the real stdint.h if it's there (taken from Paul Hsieh's pstdint.h) */
+#if (defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_)) || defined (HAVE_STDINT_H))
+#include <stdint.h>
+
+   typedef int16_t opus_int16;
+   typedef uint16_t opus_uint16;
+   typedef int32_t opus_int32;
+   typedef uint32_t opus_uint32;
+#elif defined(_WIN32)
+
+#  if defined(__CYGWIN__)
+#    include <_G_config.h>
+     typedef _G_int32_t opus_int32;
+     typedef _G_uint32_t opus_uint32;
+     typedef _G_int16 opus_int16;
+     typedef _G_uint16 opus_uint16;
+#  elif defined(__MINGW32__)
+     typedef short opus_int16;
+     typedef unsigned short opus_uint16;
+     typedef int opus_int32;
+     typedef unsigned int opus_uint32;
+#  elif defined(__MWERKS__)
+     typedef int opus_int32;
+     typedef unsigned int opus_uint32;
+     typedef short opus_int16;
+     typedef unsigned short opus_uint16;
+#  else
+     /* MSVC/Borland */
+     typedef __int32 opus_int32;
+     typedef unsigned __int32 opus_uint32;
+     typedef __int16 opus_int16;
+     typedef unsigned __int16 opus_uint16;
+#  endif
+
+#elif defined(__MACOS__)
+
+#  include <sys/types.h>
+   typedef SInt16 opus_int16;
+   typedef UInt16 opus_uint16;
+   typedef SInt32 opus_int32;
+   typedef UInt32 opus_uint32;
+
+#elif (defined(__APPLE__) && defined(__MACH__)) /* MacOS X Framework build */
+
+#  include <sys/types.h>
+   typedef int16_t opus_int16;
+   typedef u_int16_t opus_uint16;
+   typedef int32_t opus_int32;
+   typedef u_int32_t opus_uint32;
+
+#elif defined(__BEOS__)
+
+   /* Be */
+#  include <inttypes.h>
+   typedef int16 opus_int16;
+   typedef u_int16 opus_uint16;
+   typedef int32_t opus_int32;
+   typedef u_int32_t opus_uint32;
+
+#elif defined (__EMX__)
+
+   /* OS/2 GCC */
+   typedef short opus_int16;
+   typedef unsigned short opus_uint16;
+   typedef int opus_int32;
+   typedef unsigned int opus_uint32;
+
+#elif defined (DJGPP)
+
+   /* DJGPP */
+   typedef short opus_int16;
+   typedef unsigned short opus_uint16;
+   typedef int opus_int32;
+   typedef unsigned int opus_uint32;
+
+#elif defined(R5900)
+
+   /* PS2 EE */
+   typedef int opus_int32;
+   typedef unsigned opus_uint32;
+   typedef short opus_int16;
+   typedef unsigned short opus_uint16;
+
+#elif defined(__SYMBIAN32__)
+
+   /* Symbian GCC */
+   typedef signed short opus_int16;
+   typedef unsigned short opus_uint16;
+   typedef signed int opus_int32;
+   typedef unsigned int opus_uint32;
+
+#elif defined(CONFIG_TI_C54X) || defined (CONFIG_TI_C55X)
+
+   typedef short opus_int16;
+   typedef unsigned short opus_uint16;
+   typedef long opus_int32;
+   typedef unsigned long opus_uint32;
+
+#elif defined(CONFIG_TI_C6X)
+
+   typedef short opus_int16;
+   typedef unsigned short opus_uint16;
+   typedef int opus_int32;
+   typedef unsigned int opus_uint32;
+
+#else
+
+   /* Give up, take a reasonable guess */
+   typedef short opus_int16;
+   typedef unsigned short opus_uint16;
+   typedef int opus_int32;
+   typedef unsigned int opus_uint32;
+
+#endif
+
+#define opus_int   int     /* used for counters etc; at least 16 bits */
+#define opus_int64 long long
+#define opus_int8  signed char
+
+#define opus_uint  unsigned int /* used for counters etc; at least 16 bits */
+#define opus_uint64 unsigned long long
+#define opus_uint8 unsigned char
+
+#endif /* OPUS_TYPES_H */
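Because the final branch of opus_types.h falls back to a "reasonable guess", a build can sanity-check the resulting widths at compile time. This C89-style sketch is illustrative, not part of the header, and the include path is assumed to be relative to this directory:

```c
#include "opus_types.h"

/* If a typedef does not have the expected width, the array size becomes
 * negative and compilation fails with a clear error at this line. */
typedef char opus_int16_is_16_bits[sizeof(opus_int16) == 2 ? 1 : -1];
typedef char opus_int32_is_32_bits[sizeof(opus_int32) == 4 ? 1 : -1];
```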
diff --git a/thirdparty/opus/opus/opusfile.h b/thirdparty/opus/opus/opusfile.h
new file mode 100644
index 000000000000..4bf2fba926bf
--- /dev/null
+++ b/thirdparty/opus/opus/opusfile.h
@@ -0,0 +1,2157 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 1994-2012           *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ function: stdio-based convenience library for opening/seeking/decoding
+ last mod: $Id: vorbisfile.h 17182 2010-04-29 03:48:32Z xiphmont $
+
+ ********************************************************************/
+#if !defined(_opusfile_h)
+# define _opusfile_h (1)
+
+/**\mainpage
+   \section Introduction
+
+   This is the documentation for the libopusfile C API.
+
+   The libopusfile package provides a convenient high-level API for
+   decoding and basic manipulation of all Ogg Opus audio streams.
+   libopusfile is implemented as a layer on top of Xiph.Org's
+   reference libogg and libopus libraries.
+
+   libopusfile provides several sets of built-in routines for
+   file/stream access, and may also use custom stream I/O routines provided by
+   the embedded environment.
+   There are built-in I/O routines provided for ANSI-compliant
+   stdio (FILE *), memory buffers, and URLs
+   (including <file:> URLs, plus optionally <http:> and <https:> URLs).
+
+   \section Organization
+
+   The main API is divided into several sections:
+   - \ref stream_open_close
+   - \ref stream_info
+   - \ref stream_decoding
+   - \ref stream_seeking
+
+   Several additional sections are not tied to the main API.
+   - \ref stream_callbacks
+   - \ref header_info
+   - \ref error_codes
+
+   \section Overview
+
+   The libopusfile API always decodes files to 48 kHz.
+   The original sample rate is not preserved by the lossy compression, though
+   it is stored in the header to allow you to resample to it after decoding
+   (the libopusfile API does not currently provide a resampler, but
+   the Speex resampler is a good choice if you need one).
+   In general, if you are playing back the audio, you should leave it at
+   48 kHz, provided your audio hardware supports it.
+   When decoding to a file, it may be worth resampling back to the original
+   sample rate, so as not to surprise users who might not expect the sample
+   rate to change after encoding to Opus and decoding.
+
+   Opus files can contain anywhere from 1 to 255 channels of audio.
+   The channel mappings for up to 8 channels are the same as the
+   Vorbis mappings.
+   A special stereo API can convert everything to 2 channels, making it simple
+   to support multichannel files in an application which only has stereo
+   output.
+   Although the libopusfile ABI provides support for the theoretical
+   maximum number of channels, the current implementation does not support
+   files with more than 8 channels, as they do not have well-defined channel
+   mappings.
+
+   Like all Ogg files, Opus files may be "chained".
+   That is, multiple Opus files may be combined into a single, longer file just
+   by concatenating the original files.
+   This is commonly done in internet radio streaming, as it allows the title
+   and artist to be updated each time the song changes, since each link in the
+   chain includes its own set of metadata.
+
+   libopusfile fully supports chained files.
+   It will decode the first Opus stream found in each link of a chained file
+   (ignoring any other streams that might be concurrently multiplexed with it,
+   such as a video stream).
+
+   The channel count can also change between links.
+   If your application is not prepared to deal with this, it can use the stereo
+   API to ensure the audio from all links will always get decoded into a
+   common format.
+   Since libopusfile always decodes to 48 kHz, you do not have to
+   worry about the sample rate changing between links (as was possible with
+   Vorbis).
+   This makes application support for chained files with libopusfile
+   very easy.*/
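A minimal sketch (not part of this diff) of the high-level decode loop the mainpage describes, using the stereo API so chained links with differing channel counts all decode to one format; the helper name and buffer sizing are illustrative:

```c
#include <opusfile.h>

long total_samples_decoded(const char *path)
{
    int err;
    OggOpusFile *of = op_open_file(path, &err);
    /* Room for up to 120 ms of interleaved 48 kHz stereo (about 23 KB). */
    opus_int16 pcm[120 * 48 * 2];
    long total = 0;
    int n;

    if (of == NULL)
        return err;
    /* op_read_stereo() always yields interleaved 48 kHz stereo and
     * returns the number of samples read per channel (0 at end of file). */
    while ((n = op_read_stereo(of, pcm, (int)(sizeof(pcm) / sizeof(*pcm)))) > 0)
        total += n;
    op_free(of);
    return n < 0 ? n : total;
}
```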
+
+# if defined(__cplusplus)
+extern "C" {
+# endif
+
+# include <stdarg.h>
+# include <stdio.h>
+# include <ogg/ogg.h>
+# include <opus_multistream.h>
+
+/**@cond PRIVATE*/
+
+/*Enable special features for gcc and gcc-compatible compilers.*/
+# if !defined(OP_GNUC_PREREQ)
+#  if defined(__GNUC__)&&defined(__GNUC_MINOR__)
+#   define OP_GNUC_PREREQ(_maj,_min) \
+ ((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
+#  else
+#   define OP_GNUC_PREREQ(_maj,_min) 0
+#  endif
+# endif
+
+# if OP_GNUC_PREREQ(4,0)
+#  pragma GCC visibility push(default)
+# endif
+
+typedef struct OpusHead          OpusHead;
+typedef struct OpusTags          OpusTags;
+typedef struct OpusPictureTag    OpusPictureTag;
+typedef struct OpusServerInfo    OpusServerInfo;
+typedef struct OpusFileCallbacks OpusFileCallbacks;
+typedef struct OggOpusFile       OggOpusFile;
+
+/*Warning attributes for libopusfile functions.*/
+# if OP_GNUC_PREREQ(3,4)
+#  define OP_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+# else
+#  define OP_WARN_UNUSED_RESULT
+# endif
+# if OP_GNUC_PREREQ(3,4)
+#  define OP_ARG_NONNULL(_x) __attribute__((__nonnull__(_x)))
+# else
+#  define OP_ARG_NONNULL(_x)
+# endif
+
+/**@endcond*/
+
+/**\defgroup error_codes Error Codes*/
+/*@{*/
+/**\name List of possible error codes
+   Many of the functions in this library return a negative error code when a
+   function fails.
+   This list provides a brief explanation of the common errors.
+   See each individual function for more details on what a specific error code
+   means in that context.*/
+/*@{*/
+
+/**A request did not succeed.*/
+#define OP_FALSE         (-1)
+/*Currently not used externally.*/
+#define OP_EOF           (-2)
+/**There was a hole in the page sequence numbers (e.g., a page was corrupt or
+   missing).*/
+#define OP_HOLE          (-3)
+/**An underlying read, seek, or tell operation failed when it should have
+   succeeded.*/
+#define OP_EREAD         (-128)
+/**A NULL pointer was passed where one was unexpected, or an
+   internal memory allocation failed, or an internal library error was
+   encountered.*/
+#define OP_EFAULT        (-129)
+/**The stream used a feature that is not implemented, such as an unsupported
+   channel family.*/
+#define OP_EIMPL         (-130)
+/**One or more parameters to a function were invalid.*/
+#define OP_EINVAL        (-131)
+/**A purported Ogg Opus stream did not begin with an Ogg page, a purported
+   header packet did not start with one of the required strings, "OpusHead" or
+   "OpusTags", or a link in a chained file was encountered that did not
+   contain any logical Opus streams.*/
+#define OP_ENOTFORMAT    (-132)
+/**A required header packet was not properly formatted, contained illegal
+   values, or was missing altogether.*/
+#define OP_EBADHEADER    (-133)
+/**The ID header contained an unrecognized version number.*/
+#define OP_EVERSION      (-134)
+/*Currently not used at all.*/
+#define OP_ENOTAUDIO     (-135)
+/**An audio packet failed to decode properly.
+   This is usually caused by a multistream Ogg packet where the durations of
+   the individual Opus packets contained in it are not all the same.*/
+#define OP_EBADPACKET    (-136)
+/**We failed to find data we had seen before, or the bitstream structure was
+   sufficiently malformed that seeking to the target destination was
+   impossible.*/
+#define OP_EBADLINK      (-137)
+/**An operation that requires seeking was requested on an unseekable stream.*/
+#define OP_ENOSEEK       (-138)
+/**The first or last granule position of a link failed basic validity checks.*/
+#define OP_EBADTIMESTAMP (-139)
+
+/*@}*/
+/*@}*/
+
+/**\defgroup header_info Header Information*/
+/*@{*/
+
+/**The maximum number of channels in an Ogg Opus stream.*/
+#define OPUS_CHANNEL_COUNT_MAX (255)
+
+/**Ogg Opus bitstream information.
+   This contains the basic playback parameters for a stream, and corresponds to
+   the initial ID header packet of an Ogg Opus stream.*/
+struct OpusHead{
+  /**The Ogg Opus format version, in the range 0...255.
+     The top 4 bits represent a "major" version, and the bottom four bits
+     represent backwards-compatible "minor" revisions.
+     The current specification describes version 1.
+     This library will recognize versions up through 15 as backwards compatible
+     with the current specification.
+     An earlier draft of the specification described a version 0, but the only
+     difference between version 1 and version 0 is that version 0 did
+     not specify the semantics for handling the version field.*/
+  int           version;
+  /**The number of channels, in the range 1...255.*/
+  int           channel_count;
+  /**The number of samples that should be discarded from the beginning of the
+     stream.*/
+  unsigned      pre_skip;
+  /**The sampling rate of the original input.
+     All Opus audio is coded at 48 kHz, and should also be decoded at 48 kHz
+     for playback (unless the target hardware does not support this sampling
+     rate).
+     However, this field may be used to resample the audio back to the original
+     sampling rate, for example, when saving the output to a file.*/
+  opus_uint32   input_sample_rate;
+  /**The gain to apply to the decoded output, in dB, as a Q8 value in the range
+     -32768...32767.
+     The libopusfile API will automatically apply this gain to the
+     decoded output before returning it, scaling it by
+     pow(10,output_gain/(20.0*256)).*/
+  int           output_gain;
+  /**The channel mapping family, in the range 0...255.
+     Channel mapping family 0 covers mono or stereo in a single stream.
+     Channel mapping family 1 covers 1 to 8 channels in one or more streams,
+     using the Vorbis speaker assignments.
+     Channel mapping family 255 covers 1 to 255 channels in one or more
+     streams, but without any defined speaker assignment.*/
+  int           mapping_family;
+  /**The number of Opus streams in each Ogg packet, in the range 1...255.*/
+  int           stream_count;
+  /**The number of coupled Opus streams in each Ogg packet, in the range
+     0...127.
+     This must satisfy 0 <= coupled_count <= stream_count and
+     coupled_count + stream_count <= 255.
+     The coupled streams appear first, before all uncoupled streams, in an Ogg
+     Opus packet.*/
+  int           coupled_count;
+  /**The mapping from coded stream channels to output channels.
+     Let index=mapping[k] be the value for channel k.
+     If index<2*coupled_count, then it refers to the left channel
+     from stream (index/2) if even, and the right channel from
+     stream (index/2) if odd.
+     Otherwise, it refers to the output of the uncoupled stream
+     (index-coupled_count).*/
+  unsigned char mapping[OPUS_CHANNEL_COUNT_MAX];
+};
+
+/**The metadata from an Ogg Opus stream.
+
+   This structure holds the in-stream metadata corresponding to the 'comment'
+   header packet of an Ogg Opus stream.
+   The comment header is meant to be used much like someone jotting a quick
+   note on the label of a CD.
+   It should be a short, to the point text note that can be more than a couple
+   words, but not more than a short paragraph.
+
+   The metadata is stored as a series of (tag, value) pairs, in length-encoded
+   string vectors, using the same format as Vorbis (without the final "framing
+   bit"), Theora, and Speex, except for the packet header.
+   The first occurrence of the '=' character delimits the tag and value.
+   A particular tag may occur more than once, and order is significant.
+   The character set encoding for the strings is always UTF-8, but the tag
+   names are limited to ASCII, and treated as case-insensitive.
+   See the Vorbis
+   comment header specification for details.
+
+   In filling in this structure, libopusfile will null-terminate the
+   #user_comments strings for safety.
+   However, the bitstream format itself treats them as 8-bit clean vectors,
+   possibly containing NUL characters, so the #comment_lengths array should be
+   treated as their authoritative length.
+
+   This structure is binary and source-compatible with a
+   vorbis_comment, and pointers to it may be freely cast to
+   vorbis_comment pointers, and vice versa.
+   It is provided as a separate type to avoid introducing a compile-time
+   dependency on the libvorbis headers.*/
+struct OpusTags{
+  /**The array of comment string vectors.*/
+  char **user_comments;
+  /**An array of the corresponding length of each vector, in bytes.*/
+  int   *comment_lengths;
+  /**The total number of comment streams.*/
+  int    comments;
+  /**The null-terminated vendor string.
+     This identifies the software used to encode the stream.*/
+  char  *vendor;
+};
+
+/**\name Picture tag image formats*/
+/*@{*/
+
+/**The MIME type was not recognized, or the image data did not match the
+   declared MIME type.*/
+#define OP_PIC_FORMAT_UNKNOWN (-1)
+/**The MIME type indicates the image data is really a URL.*/
+#define OP_PIC_FORMAT_URL     (0)
+/**The image is a JPEG.*/
+#define OP_PIC_FORMAT_JPEG    (1)
+/**The image is a PNG.*/
+#define OP_PIC_FORMAT_PNG     (2)
+/**The image is a GIF.*/
+#define OP_PIC_FORMAT_GIF     (3)
+
+/*@}*/
+
+/**The contents of a METADATA_BLOCK_PICTURE tag.*/
+struct OpusPictureTag{
+  /**The picture type according to the ID3v2 APIC frame:
+     <ol start="0">
+     <li>Other</li>
+     <li>32x32 pixels 'file icon' (PNG only)</li>
+     <li>Other file icon</li>
+     <li>Cover (front)</li>
+     <li>Cover (back)</li>
+     <li>Leaflet page</li>
+     <li>Media (e.g. label side of CD)</li>
+     <li>Lead artist/lead performer/soloist</li>
+     <li>Artist/performer</li>
+     <li>Conductor</li>
+     <li>Band/Orchestra</li>
+     <li>Composer</li>
+     <li>Lyricist/text writer</li>
+     <li>Recording Location</li>
+     <li>During recording</li>
+     <li>During performance</li>
+     <li>Movie/video screen capture</li>
+     <li>A bright colored fish</li>
+     <li>Illustration</li>
+     <li>Band/artist logotype</li>
+     <li>Publisher/Studio logotype</li>
+     </ol>
+     Others are reserved and should not be used.
+     There may only be one each of picture type 1 and 2 in a file.*/
+  opus_int32     type;
+  /**The MIME type of the picture, in printable ASCII characters 0x20-0x7E.
+     The MIME type may also be "-->" to signify that the data part
+     is a URL pointing to the picture instead of the picture data itself.
+     In this case, a terminating NUL is appended to the URL string in #data,
+     but #data_length is set to the length of the string excluding that
+     terminating NUL.*/
+  char          *mime_type;
+  /**The description of the picture, in UTF-8.*/
+  char          *description;
+  /**The width of the picture in pixels.*/
+  opus_uint32    width;
+  /**The height of the picture in pixels.*/
+  opus_uint32    height;
+  /**The color depth of the picture in bits-per-pixel (not
+     bits-per-channel).*/
+  opus_uint32    depth;
+  /**For indexed-color pictures (e.g., GIF), the number of colors used, or 0
+     for non-indexed pictures.*/
+  opus_uint32    colors;
+  /**The length of the picture data in bytes.*/
+  opus_uint32    data_length;
+  /**The binary picture data.*/
+  unsigned char *data;
+  /**The format of the picture data, if known.
+     One of
+     <ul>
+     <li>#OP_PIC_FORMAT_UNKNOWN,</li>
+     <li>#OP_PIC_FORMAT_URL,</li>
+     <li>#OP_PIC_FORMAT_JPEG,</li>
+     <li>#OP_PIC_FORMAT_PNG, or</li>
+     <li>#OP_PIC_FORMAT_GIF.</li>
+     </ul>
+     */
+  int            format;
+};
+
+/**\name Functions for manipulating header data
+
+   These functions manipulate the #OpusHead and #OpusTags structures,
+   which describe the audio parameters and tag-value metadata, respectively.
+   These can be used to query the headers returned by libopusfile, or
+   to parse Opus headers from sources other than an Ogg Opus stream, provided
+   they use the same format.*/
+/*@{*/
+
+/**Parses the contents of the ID header packet of an Ogg Opus stream.
+   \param[out] _head Returns the contents of the parsed packet.
+                     The contents of this structure are untouched on error.
+                     This may be NULL to merely test the header
+                     for validity.
+   \param[in]  _data The contents of the ID header packet.
+   \param      _len  The number of bytes of data in the ID header packet.
+   \return 0 on success or a negative value on error.
+   \retval #OP_ENOTFORMAT If the data does not start with the "OpusHead"
+                          string.
+   \retval #OP_EVERSION   If the version field signaled a version this library
+                          does not know how to parse.
+   \retval #OP_EIMPL      If the channel mapping family was 255, which general
+                          purpose players should not attempt to play.
+   \retval #OP_EBADHEADER If the contents of the packet otherwise violate the
+                          Ogg Opus specification:
+     <ul>
+     <li>Insufficient data,</li>
+     <li>Too much data for the known minor versions,</li>
+     <li>An unrecognized channel mapping family,</li>
+     <li>Zero channels or too many channels,</li>
+     <li>Zero coded streams,</li>
+     <li>Too many coupled streams, or</li>
+     <li>An invalid channel mapping index.</li>
+     </ul>
+   */
+OP_WARN_UNUSED_RESULT int opus_head_parse(OpusHead *_head,
+ const unsigned char *_data,size_t _len) OP_ARG_NONNULL(2);
+
+/**Converts a granule position to a sample offset for a given Ogg Opus stream.
+   The sample offset is simply _gp-_head->pre_skip.
+   Granule position values smaller than OpusHead#pre_skip correspond to audio
+   that should never be played, and thus have no associated sample offset.
+   This function returns -1 for such values.
+   This function also correctly handles extremely large granule positions,
+   which may have wrapped around to a negative number when stored in a signed
+   ogg_int64_t value.
+   \param _head The #OpusHead information from the ID header of the stream.
+   \param _gp   The granule position to convert.
+   \return The sample offset associated with the given granule position
+           (counting at a 48 kHz sampling rate), or the special value -1 on
+           error (i.e., the granule position was smaller than the pre-skip
+           amount).*/
+ogg_int64_t opus_granule_sample(const OpusHead *_head,ogg_int64_t _gp)
+ OP_ARG_NONNULL(1);
+
+/**Parses the contents of the 'comment' header packet of an Ogg Opus stream.
+   \param[out] _tags An uninitialized #OpusTags structure.
+                     This returns the contents of the parsed packet.
+                     The contents of this structure are untouched on error.
+                     This may be NULL to merely test the header
+                     for validity.
+   \param[in]  _data The contents of the 'comment' header packet.
+   \param      _len  The number of bytes of data in the 'comment' header
+                     packet.
+   \retval 0 Success.
+   \retval #OP_ENOTFORMAT If the data does not start with the "OpusTags"
+                          string.
+   \retval #OP_EBADHEADER If the contents of the packet otherwise violate the
+                          Ogg Opus specification.
+   \retval #OP_EFAULT     If there wasn't enough memory to store the tags.*/
+OP_WARN_UNUSED_RESULT int opus_tags_parse(OpusTags *_tags,
+ const unsigned char *_data,size_t _len) OP_ARG_NONNULL(2);
+
+/**Performs a deep copy of an #OpusTags structure.
+   \param _dst The #OpusTags structure to copy into.
+               If this function fails, the contents of this structure remain
+               untouched.
+   \param _src The #OpusTags structure to copy from.
+   \retval 0 Success.
+   \retval #OP_EFAULT If there wasn't enough memory to copy the tags.*/
+int opus_tags_copy(OpusTags *_dst,const OpusTags *_src) OP_ARG_NONNULL(1);
+
+/**Initializes an #OpusTags structure.
+   This should be called on a freshly allocated #OpusTags structure before
+   attempting to use it.
+   \param _tags The #OpusTags structure to initialize.*/
+void opus_tags_init(OpusTags *_tags) OP_ARG_NONNULL(1);
+
+/**Add a (tag, value) pair to an initialized #OpusTags structure.
+   \note Neither opus_tags_add() nor opus_tags_add_comment() support values
+   containing embedded NULs, although the bitstream format does support them.
+   To add such tags, you will need to manipulate the #OpusTags structure
+   directly.
+   \param _tags  The #OpusTags structure to add the (tag, value) pair to.
+   \param _tag   A NUL-terminated, case-insensitive, ASCII string containing
+                 the tag to add (without an '=' character).
+   \param _value A NUL-terminated UTF-8 string containing the corresponding
+                 value.
+   \return 0 on success, or a negative value on failure.
+   \retval #OP_EFAULT An internal memory allocation failed.*/
+int opus_tags_add(OpusTags *_tags,const char *_tag,const char *_value)
+ OP_ARG_NONNULL(1) OP_ARG_NONNULL(2) OP_ARG_NONNULL(3);
+
+/**Add a comment to an initialized #OpusTags structure.
+   \note Neither opus_tags_add_comment() nor opus_tags_add() support comments
+   containing embedded NULs, although the bitstream format does support them.
+   To add such tags, you will need to manipulate the #OpusTags structure
+   directly.
+   \param _tags    The #OpusTags structure to add the comment to.
+   \param _comment A NUL-terminated UTF-8 string containing the comment in
+                   "TAG=value" form.
+   \return 0 on success, or a negative value on failure.
+   \retval #OP_EFAULT An internal memory allocation failed.*/
+int opus_tags_add_comment(OpusTags *_tags,const char *_comment)
+ OP_ARG_NONNULL(1) OP_ARG_NONNULL(2);
+
+/**Replace the binary suffix data at the end of the packet (if any).
+   \param _tags An initialized #OpusTags structure.
+   \param _data A buffer of binary data to append after the encoded user
+                comments.
+                The least significant bit of the first byte of this data must
+                be set (to ensure the data is preserved by other editors).
+   \param _len  The number of bytes of binary data to append.
+                This may be zero to remove any existing binary suffix data.
+   \return 0 on success, or a negative value on error.
+   \retval #OP_EINVAL \a _len was negative, or \a _len was positive but
+                      \a _data was NULL or the least significant
+                      bit of the first byte was not set.
+   \retval #OP_EFAULT An internal memory allocation failed.*/
+int opus_tags_set_binary_suffix(OpusTags *_tags,
+ const unsigned char *_data,int _len) OP_ARG_NONNULL(1);
+
+/**Look up a comment value by its tag.
+   \param _tags  An initialized #OpusTags structure.
+   \param _tag   The tag to look up.
+   \param _count The instance of the tag.
+                 The same tag can appear multiple times, each with a distinct
+                 value, so an index is required to retrieve them all.
+                 The order in which these values appear is significant and
+                 should be preserved.
+                 Use opus_tags_query_count() to get the legal range for the
+                 \a _count parameter.
+   \return A pointer to the queried tag's value.
+           This points directly to data in the #OpusTags structure.
+           It should not be modified or freed by the application, and
+           modifications to the structure may invalidate the pointer.
+   \retval NULL If no matching tag is found.*/
+const char *opus_tags_query(const OpusTags *_tags,const char *_tag,int _count)
+ OP_ARG_NONNULL(1) OP_ARG_NONNULL(2);
+
+/**Look up the number of instances of a tag.
+   Call this first when querying for a specific tag and then iterate over the
+   number of instances with separate calls to opus_tags_query() to retrieve
+   all the values for that tag in order.
+   \param _tags An initialized #OpusTags structure.
+   \param _tag  The tag to look up.
+   \return The number of instances of this particular tag.*/
+int opus_tags_query_count(const OpusTags *_tags,const char *_tag)
+ OP_ARG_NONNULL(1) OP_ARG_NONNULL(2);
+
+/**Retrieve the binary suffix data at the end of the packet (if any).
+   \param _tags     An initialized #OpusTags structure.
+   \param[out] _len Returns the number of bytes of binary suffix data returned.
+   \return A pointer to the binary suffix data, or NULL if none
+           was present.*/
+const unsigned char *opus_tags_get_binary_suffix(const OpusTags *_tags,
+ int *_len) OP_ARG_NONNULL(1) OP_ARG_NONNULL(2);
+
+/**Get the album gain from an R128_ALBUM_GAIN tag, if one was specified.
+   This searches for the first R128_ALBUM_GAIN tag with a valid signed,
+   16-bit decimal integer value and returns the value.
+   This routine is exposed merely for convenience for applications which wish
+   to do something special with the album gain (e.g., display it).
+   If you simply wish to apply the album gain instead of the header gain, you
+   can use op_set_gain_offset() with an #OP_ALBUM_GAIN type and no offset.
+   \param _tags     An initialized #OpusTags structure.
+   \param[out] _gain_q8 The album gain, in 1/256ths of a dB.
+                        This will lie in the range [-32768,32767], and should
+                        be applied in addition to the header gain.
+                        On error, no value is returned, and the previous
+                        contents remain unchanged.
+   \return 0 on success, or a negative value on error.
+   \retval #OP_FALSE There was no album gain available in the given tags.*/
+int opus_tags_get_album_gain(const OpusTags *_tags,int *_gain_q8)
+ OP_ARG_NONNULL(1) OP_ARG_NONNULL(2);
+
+/**Get the track gain from an R128_TRACK_GAIN tag, if one was specified.
+   This searches for the first R128_TRACK_GAIN tag with a valid signed,
+   16-bit decimal integer value and returns the value.
+   This routine is exposed merely for convenience for applications which wish
+   to do something special with the track gain (e.g., display it).
+   If you simply wish to apply the track gain instead of the header gain, you
+   can use op_set_gain_offset() with an #OP_TRACK_GAIN type and no offset.
+   \param _tags     An initialized #OpusTags structure.
+   \param[out] _gain_q8 The track gain, in 1/256ths of a dB.
+                        This will lie in the range [-32768,32767], and should
+                        be applied in addition to the header gain.
+                        On error, no value is returned, and the previous
+                        contents remain unchanged.
+   \return 0 on success, or a negative value on error.
+   \retval #OP_FALSE There was no track gain available in the given tags.*/
+int opus_tags_get_track_gain(const OpusTags *_tags,int *_gain_q8)
+ OP_ARG_NONNULL(1) OP_ARG_NONNULL(2);
+
+/**Clears the #OpusTags structure.
+   This should be called on an #OpusTags structure after it is no longer
+   needed.
+   It will free all memory used by the structure members.
+   \param _tags The #OpusTags structure to clear.*/
+void opus_tags_clear(OpusTags *_tags) OP_ARG_NONNULL(1);
+
+/**Check if \a _comment is an instance of a \a _tag_name tag.
+   \see opus_tagncompare
+   \param _tag_name A NUL-terminated, case-insensitive, ASCII string containing
+                    the name of the tag to check for (without the terminating
+                    '=' character).
+   \param _comment  The comment string to check.
+   \return An integer less than, equal to, or greater than zero if \a _comment
+           is found respectively, to be less than, to match, or be greater
+           than a "tag=value" string whose tag matches \a _tag_name.*/
+int opus_tagcompare(const char *_tag_name,const char *_comment);
+
+/**Check if \a _comment is an instance of a \a _tag_name tag.
+   This version is slightly more efficient than opus_tagcompare() if the length
+   of the tag name is already known (e.g., because it is a constant).
+   \see opus_tagcompare
+   \param _tag_name A case-insensitive ASCII string containing the name of the
+                    tag to check for (without the terminating '=' character).
+   \param _tag_len  The number of characters in the tag name.
+                    This must be non-negative.
+   \param _comment  The comment string to check.
+   \return An integer less than, equal to, or greater than zero if \a _comment
+           is found respectively, to be less than, to match, or be greater
+           than a "tag=value" string whose tag matches the first \a _tag_len
+           characters of \a _tag_name.*/
+int opus_tagncompare(const char *_tag_name,int _tag_len,const char *_comment);
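A minimal sketch of the #OpusTags lifecycle documented above (init, add, query, clear); `tags_roundtrip` is an illustrative name, not part of the API:

```c
#include <stdio.h>
#include <opusfile.h>

void tags_roundtrip(void)
{
    OpusTags tags;
    const char *title;

    opus_tags_init(&tags);
    if (opus_tags_add(&tags, "TITLE", "Example") == 0) {
        /* Instance 0 of the TITLE tag; see opus_tags_query_count() for
         * the number of instances of a given tag. */
        title = opus_tags_query(&tags, "TITLE", 0);
        if (title != NULL)
            printf("TITLE=%s\n", title);
    }
    /* Releases all memory owned by the structure members. */
    opus_tags_clear(&tags);
}
```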
+ This decodes the BASE64-encoded content of the tag and returns a structure
+ with the MIME type, description, image parameters (if known), and the
+ compressed image data.
+ If the MIME type indicates the presence of an image format we recognize
+ (JPEG, PNG, or GIF) and the actual image data contains the magic signature
+ associated with that format, then the OpusPictureTag::format field will be
+ set to the corresponding format.
+ This is provided as a convenience to avoid requiring applications to parse
+ the MIME type and/or do their own format detection for the commonly used
+ formats.
+ In this case, we also attempt to extract the image parameters directly from
+ the image data (overriding any that were present in the tag, which the
+ specification says applications are not meant to rely on).
+ The application must still provide its own support for actually decoding the
+ image data and, if applicable, retrieving that data from URLs.
+ \param[out] _pic Returns the parsed picture data.
+ No sanitation is done on the type, MIME type, or
+ description fields, so these might return invalid values.
+ The contents of this structure are left unmodified on
+ failure.
+ \param _tag The METADATA_BLOCK_PICTURE tag contents.
+ The leading "METADATA_BLOCK_PICTURE=" portion is optional,
+ to allow the function to be used either directly on the
+ values in OpusTags::user_comments or on the return value
+ of opus_tags_query().
+ \return 0 on success or a negative value on error.
+ \retval #OP_ENOTFORMAT The METADATA_BLOCK_PICTURE contents were not valid.
+ \retval #OP_EFAULT There was not enough memory to store the picture tag
+ contents.*/
+OP_WARN_UNUSED_RESULT int opus_picture_tag_parse(OpusPictureTag *_pic,
+ const char *_tag) OP_ARG_NONNULL(1) OP_ARG_NONNULL(2);
+
+/**Initializes an #OpusPictureTag structure.
+ This should be called on a freshly allocated #OpusPictureTag structure
+ before attempting to use it.
+ \param _pic The #OpusPictureTag structure to initialize.*/
+void opus_picture_tag_init(OpusPictureTag *_pic) OP_ARG_NONNULL(1);
+
+/**Clears the #OpusPictureTag structure.
+ This should be called on an #OpusPictureTag structure after it is no longer
+ needed.
+ It will free all memory used by the structure members.
+ \param _pic The #OpusPictureTag structure to clear.*/
+void opus_picture_tag_clear(OpusPictureTag *_pic) OP_ARG_NONNULL(1);
+
+/*@}*/
+
+/*@}*/
+
+/**\defgroup url_options URL Reading Options*/
+/*@{*/
+/**\name URL reading options
+ Options for op_url_stream_create() and associated functions.
+ These allow you to provide proxy configuration parameters, skip SSL
+ certificate checks, etc.
+ Options are processed in order, and if the same option is passed multiple
+ times, only the value specified by the last occurrence has an effect
+ (unless otherwise specified).
+ They may be expanded in the future.*/
+/*@{*/
+
+/**@cond PRIVATE*/
+
+/*These are the raw numbers used to define the request codes.
+ They should not be used directly.*/
+#define OP_SSL_SKIP_CERTIFICATE_CHECK_REQUEST (6464)
+#define OP_HTTP_PROXY_HOST_REQUEST (6528)
+#define OP_HTTP_PROXY_PORT_REQUEST (6592)
+#define OP_HTTP_PROXY_USER_REQUEST (6656)
+#define OP_HTTP_PROXY_PASS_REQUEST (6720)
+#define OP_GET_SERVER_INFO_REQUEST (6784)
+
+#define OP_URL_OPT(_request) ((_request)+(char *)0)
+
+/*These macros trigger compilation errors or warnings if the wrong types are
+ provided to one of the URL options.*/
+#define OP_CHECK_INT(_x) ((void)((_x)==(opus_int32)0),(opus_int32)(_x))
+#define OP_CHECK_CONST_CHAR_PTR(_x) ((_x)+((_x)-(const char *)(_x)))
+#define OP_CHECK_SERVER_INFO_PTR(_x) ((_x)+((_x)-(OpusServerInfo *)(_x)))
+
+/**@endcond*/
+
+/**HTTP/Shoutcast/Icecast server information associated with a URL.*/
+struct OpusServerInfo{
+ /**The name of the server (icy-name/ice-name).
+ This is NULL if there was no icy-name or
+ ice-name header.*/
+ char *name;
+ /**A short description of the server (icy-description/ice-description).
+ This is NULL if there was no icy-description or
+ ice-description header.*/
+ char *description;
+ /**The genre the server falls under (icy-genre/ice-genre).
+ This is NULL if there was no icy-genre or
+ ice-genre header.*/
+ char *genre;
+ /**The homepage for the server (icy-url/ice-url).
+ This is NULL if there was no icy-url or
+ ice-url header.*/
+ char *url;
+ /**The software used by the origin server (Server).
+ This is NULL if there was no Server header.*/
+ char *server;
+ /**The media type of the entity sent to the recipient (Content-Type).
+ This is NULL if there was no Content-Type
+ header.*/
+ char *content_type;
+ /**The nominal stream bitrate in kbps (icy-br/ice-bitrate).
+ This is -1 if there was no icy-br or
+ ice-bitrate header.*/
+ opus_int32 bitrate_kbps;
+ /**Flag indicating whether the server is public (1) or not
+ (0) (icy-pub/ice-public).
+ This is -1 if there was no icy-pub or
+ ice-public header.*/
+ int is_public;
+ /**Flag indicating whether the server is using HTTPS instead of HTTP.
+ This is 0 unless HTTPS is being used.
+ This may not match the protocol used in the original URL if there were
+ redirections.*/
+ int is_ssl;
+};
+
+/**Initializes an #OpusServerInfo structure.
+ All fields are set as if the corresponding header was not available.
+ \param _info The #OpusServerInfo structure to initialize.
+ \note If you use this function, you must link against libopusurl.*/
+void opus_server_info_init(OpusServerInfo *_info) OP_ARG_NONNULL(1);
+
+/**Clears the #OpusServerInfo structure.
+ This should be called on an #OpusServerInfo structure after it is no longer
+ needed.
+ It will free all memory used by the structure members.
+ \param _info The #OpusServerInfo structure to clear.
+ \note If you use this function, you must link against libopusurl.*/
+void opus_server_info_clear(OpusServerInfo *_info) OP_ARG_NONNULL(1);
+
+/**Skip the certificate check when connecting via TLS/SSL (https).
+ \param _b opus_int32: Whether or not to skip the certificate
+ check.
+ The check will be skipped if \a _b is non-zero, and will not be
+ skipped if \a _b is zero.
+ \hideinitializer*/
+#define OP_SSL_SKIP_CERTIFICATE_CHECK(_b) \
+ OP_URL_OPT(OP_SSL_SKIP_CERTIFICATE_CHECK_REQUEST),OP_CHECK_INT(_b)
+
+/**Proxy connections through the given host.
+ If no port is specified via #OP_HTTP_PROXY_PORT, the port number defaults
+ to 8080 (http-alt).
+ All proxy parameters are ignored for non-http and non-https URLs.
+ \param _host const char *: The proxy server hostname.
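For illustration, the options defined here are passed as a NULL-terminated variable-argument list to one of the URL functions; a minimal sketch (the proxy host, port, and helper name are placeholders; op_open_url() is declared further below and requires linking against libopusurl):

    #include <opusfile.h>

    /* Editor's sketch: open a URL through a proxy and capture server info. */
    static OggOpusFile *open_via_proxy(const char *url, OpusServerInfo *info) {
      int err;
      opus_server_info_init(info);  /* left unmodified if the open fails */
      return op_open_url(url, &err,
        OP_HTTP_PROXY_HOST("proxy.example.net"),
        OP_HTTP_PROXY_PORT(8080),
        OP_GET_SERVER_INFO(info),
        NULL);
    }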
+ This may be NULL to disable the use of a proxy + server. + \hideinitializer*/ +#define OP_HTTP_PROXY_HOST(_host) \ + OP_URL_OPT(OP_HTTP_PROXY_HOST_REQUEST),OP_CHECK_CONST_CHAR_PTR(_host) + +/**Use the given port when proxying connections. + This option only has an effect if #OP_HTTP_PROXY_HOST is specified with a + non-NULL \a _host. + If this option is not provided, the proxy port number defaults to 8080 + (http-alt). + All proxy parameters are ignored for non-http and non-https URLs. + \param _port opus_int32: The proxy server port. + This must be in the range 0...65535 (inclusive), or the + URL function this is passed to will fail. + \hideinitializer*/ +#define OP_HTTP_PROXY_PORT(_port) \ + OP_URL_OPT(OP_HTTP_PROXY_PORT_REQUEST),OP_CHECK_INT(_port) + +/**Use the given user name for authentication when proxying connections. + All proxy parameters are ignored for non-http and non-https URLs. + \param _user const char *: The proxy server user name. + This may be NULL to disable proxy + authentication. + A non-NULL value only has an effect + if #OP_HTTP_PROXY_HOST and #OP_HTTP_PROXY_PASS + are also specified with non-NULL + arguments. + \hideinitializer*/ +#define OP_HTTP_PROXY_USER(_user) \ + OP_URL_OPT(OP_HTTP_PROXY_USER_REQUEST),OP_CHECK_CONST_CHAR_PTR(_user) + +/**Use the given password for authentication when proxying connections. + All proxy parameters are ignored for non-http and non-https URLs. + \param _pass const char *: The proxy server password. + This may be NULL to disable proxy + authentication. + A non-NULL value only has an effect + if #OP_HTTP_PROXY_HOST and #OP_HTTP_PROXY_USER + are also specified with non-NULL + arguments. + \hideinitializer*/ +#define OP_HTTP_PROXY_PASS(_pass) \ + OP_URL_OPT(OP_HTTP_PROXY_PASS_REQUEST),OP_CHECK_CONST_CHAR_PTR(_pass) + +/**Parse information about the streaming server (if any) and return it. + Very little validation is done. + In particular, OpusServerInfo::url may not be a valid URL, + OpusServerInfo::bitrate_kbps may not really be in kbps, and + OpusServerInfo::content_type may not be a valid MIME type. + The character set of the string fields is not specified anywhere, and should + not be assumed to be valid UTF-8. + \param _info OpusServerInfo *: Returns information about the server. + If there is any error opening the stream, the + contents of this structure remain + unmodified. + On success, fills in the structure with the + server information that was available, if + any. + After a successful return, the contents of + this structure should be freed by calling + opus_server_info_clear(). + \hideinitializer*/ +#define OP_GET_SERVER_INFO(_info) \ + OP_URL_OPT(OP_GET_SERVER_INFO_REQUEST),OP_CHECK_SERVER_INFO_PTR(_info) + +/*@}*/ +/*@}*/ + +/**\defgroup stream_callbacks Abstract Stream Reading Interface*/ +/*@{*/ +/**\name Functions for reading from streams + These functions define the interface used to read from and seek in a stream + of data. + A stream does not need to implement seeking, but the decoder will not be + able to seek if it does not do so. + These functions also include some convenience routines for working with + standard FILE pointers, complete streams stored in a single + block of memory, or URLs.*/ +/*@{*/ + +/**Reads up to \a _nbytes bytes of data from \a _stream. + \param _stream The stream to read from. + \param[out] _ptr The buffer to store the data in. + \param _nbytes The maximum number of bytes to read. + This function may return fewer, though it will not + return zero unless it reaches end-of-file. 
+ \return The number of bytes successfully read, or a negative value on + error.*/ +typedef int (*op_read_func)(void *_stream,unsigned char *_ptr,int _nbytes); + +/**Sets the position indicator for \a _stream. + The new position, measured in bytes, is obtained by adding \a _offset + bytes to the position specified by \a _whence. + If \a _whence is set to SEEK_SET, SEEK_CUR, or + SEEK_END, the offset is relative to the start of the stream, + the current position indicator, or end-of-file, respectively. + \retval 0 Success. + \retval -1 Seeking is not supported or an error occurred. + errno need not be set.*/ +typedef int (*op_seek_func)(void *_stream,opus_int64 _offset,int _whence); + +/**Obtains the current value of the position indicator for \a _stream. + \return The current position indicator.*/ +typedef opus_int64 (*op_tell_func)(void *_stream); + +/**Closes the underlying stream. + \retval 0 Success. + \retval EOF An error occurred. + errno need not be set.*/ +typedef int (*op_close_func)(void *_stream); + +/**The callbacks used to access non-FILE stream resources. + The function prototypes are basically the same as for the stdio functions + fread(), fseek(), ftell(), and + fclose(). + The differences are that the FILE * arguments have been + replaced with a void *, which is to be used as a pointer to + whatever internal data these functions might need, that #seek and #tell + take and return 64-bit offsets, and that #seek must return -1 if + the stream is unseekable.*/ +struct OpusFileCallbacks{ + /**Used to read data from the stream. + This must not be NULL.*/ + op_read_func read; + /**Used to seek in the stream. + This may be NULL if seeking is not implemented.*/ + op_seek_func seek; + /**Used to return the current read position in the stream. + This may be NULL if seeking is not implemented.*/ + op_tell_func tell; + /**Used to close the stream when the decoder is freed. + This may be NULL to leave the stream open.*/ + op_close_func close; +}; + +/**Opens a stream with fopen() and fills in a set of callbacks + that can be used to access it. + This is useful to avoid writing your own portable 64-bit seeking wrappers, + and also avoids cross-module linking issues on Windows, where a + FILE * must be accessed by routines defined in the same module + that opened it. + \param[out] _cb The callbacks to use for this file. + If there is an error opening the file, nothing will be + filled in here. + \param _path The path to the file to open. + On Windows, this string must be UTF-8 (to allow access to + files whose names cannot be represented in the current + MBCS code page). + All other systems use the native character encoding. + \param _mode The mode to open the file in. + \return A stream handle to use with the callbacks, or NULL on + error.*/ +OP_WARN_UNUSED_RESULT void *op_fopen(OpusFileCallbacks *_cb, + const char *_path,const char *_mode) OP_ARG_NONNULL(1) OP_ARG_NONNULL(2) + OP_ARG_NONNULL(3); + +/**Opens a stream with fdopen() and fills in a set of callbacks + that can be used to access it. + This is useful to avoid writing your own portable 64-bit seeking wrappers, + and also avoids cross-module linking issues on Windows, where a + FILE * must be accessed by routines defined in the same module + that opened it. + \param[out] _cb The callbacks to use for this file. + If there is an error opening the file, nothing will be + filled in here. + \param _fd The file descriptor to open. + \param _mode The mode to open the file in. 
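To make the callback contract concrete, a minimal sketch of a read-only, unseekable source (error handling elided; the wrapper is hypothetical, since op_fopen() below already covers stdio FILEs):

    #include <stdio.h>
    #include <opusfile.h>

    /* Editor's sketch: only read is implemented, so the stream is treated
       as unseekable; the NULL close callback leaves the FILE open. */
    static int my_read(void *stream, unsigned char *ptr, int nbytes) {
      return (int)fread(ptr, 1, (size_t)nbytes, (FILE *)stream);
    }

    static const OpusFileCallbacks my_cb = { my_read, NULL, NULL, NULL };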
+ \return A stream handle to use with the callbacks, or NULL on
+ error.*/
+OP_WARN_UNUSED_RESULT void *op_fdopen(OpusFileCallbacks *_cb,
+ int _fd,const char *_mode) OP_ARG_NONNULL(1) OP_ARG_NONNULL(3);
+
+/**Opens a stream with freopen() and fills in a set of callbacks
+ that can be used to access it.
+ This is useful to avoid writing your own portable 64-bit seeking wrappers,
+ and also avoids cross-module linking issues on Windows, where a
+ FILE * must be accessed by routines defined in the same module
+ that opened it.
+ \param[out] _cb The callbacks to use for this file.
+ If there is an error opening the file, nothing will be
+ filled in here.
+ \param _path The path to the file to open.
+ On Windows, this string must be UTF-8 (to allow access
+ to files whose names cannot be represented in the
+ current MBCS code page).
+ All other systems use the native character encoding.
+ \param _mode The mode to open the file in.
+ \param _stream A stream previously returned by op_fopen(), op_fdopen(),
+ or op_freopen().
+ \return A stream handle to use with the callbacks, or NULL on
+ error.*/
+OP_WARN_UNUSED_RESULT void *op_freopen(OpusFileCallbacks *_cb,
+ const char *_path,const char *_mode,void *_stream) OP_ARG_NONNULL(1)
+ OP_ARG_NONNULL(2) OP_ARG_NONNULL(3) OP_ARG_NONNULL(4);
+
+/**Creates a stream that reads from the given block of memory.
+ This block of memory must contain the complete stream to decode.
+ This is useful for caching small streams (e.g., sound effects) in RAM.
+ \param[out] _cb The callbacks to use for this stream.
+ If there is an error creating the stream, nothing will be
+ filled in here.
+ \param _data The block of memory to read from.
+ \param _size The size of the block of memory.
+ \return A stream handle to use with the callbacks, or NULL on
+ error.*/
+OP_WARN_UNUSED_RESULT void *op_mem_stream_create(OpusFileCallbacks *_cb,
+ const unsigned char *_data,size_t _size) OP_ARG_NONNULL(1);
+
+/**Creates a stream that reads from the given URL.
+ This function behaves identically to op_url_stream_create(), except that it
+ takes a va_list instead of a variable number of arguments.
+ It does not call the va_end macro, and because it invokes the
+ va_arg macro, the value of \a _ap is undefined after the call.
+ \note If you use this function, you must link against libopusurl.
+ \param[out] _cb The callbacks to use for this stream.
+ If there is an error creating the stream, nothing will
+ be filled in here.
+ \param _url The URL to read from.
+ Currently only the <file:>, <http:>, and <https:>
+ schemes are supported.
+ Both <http:> and <https:> may be disabled at compile
+ time, in which case opening such URLs will always fail.
+ Currently this only supports URIs.
+ IRIs should be converted to UTF-8 and URL-escaped, with
+ internationalized domain names encoded in punycode,
+ before passing them to this function.
+ \param[in,out] _ap A list of the \ref url_options "optional flags" to use.
+ This is a variable-length list of options terminated
+ with NULL.
+ \return A stream handle to use with the callbacks, or NULL on
+ error.*/
+OP_WARN_UNUSED_RESULT void *op_url_stream_vcreate(OpusFileCallbacks *_cb,
+ const char *_url,va_list _ap) OP_ARG_NONNULL(1) OP_ARG_NONNULL(2);
+
+/**Creates a stream that reads from the given URL.
+ \note If you use this function, you must link against libopusurl.
+ \param[out] _cb The callbacks to use for this stream.
+ If there is an error creating the stream, nothing will be
+ filled in here.
+ \param _url The URL to read from.
+ Currently only the <file:>, <http:>, and <https:> schemes
+ are supported.
+ Both <http:> and <https:> may be disabled at compile time,
+ in which case opening such URLs will always fail.
+ Currently this only supports URIs.
+ IRIs should be converted to UTF-8 and URL-escaped, with
+ internationalized domain names encoded in punycode, before
+ passing them to this function.
+ \param ... The \ref url_options "optional flags" to use.
+ This is a variable-length list of options terminated with
+ NULL.
+ \return A stream handle to use with the callbacks, or NULL on
+ error.*/
+OP_WARN_UNUSED_RESULT void *op_url_stream_create(OpusFileCallbacks *_cb,
+ const char *_url,...) OP_ARG_NONNULL(1) OP_ARG_NONNULL(2);
+
+/*@}*/
+/*@}*/
+
+/**\defgroup stream_open_close Opening and Closing*/
+/*@{*/
+/**\name Functions for opening and closing streams
+
+ These functions allow you to test a stream to see if it is Opus, open it,
+ and close it.
+ Several flavors are provided for each of the built-in stream types, plus a
+ more general version which takes a set of application-provided callbacks.*/
+/*@{*/
+
+/**Test to see if this is an Opus stream.
+ For good results, you will need at least 57 bytes (for a pure Opus-only
+ stream).
+ Something like 512 bytes will give more reliable results for multiplexed
+ streams.
+ This function is meant to be a quick-rejection filter.
+ Its purpose is not to guarantee that a stream is a valid Opus stream, but to
+ ensure that it looks enough like Opus that it isn't going to be recognized
+ as some other format (except possibly an Opus stream that is also
+ multiplexed with other codecs, such as video).
+ \param[out] _head The parsed ID header contents.
+ You may pass NULL if you do not need
+ this information.
+ If the function fails, the contents of this structure
+ remain untouched.
+ \param _initial_data An initial buffer of data from the start of the
+ stream.
+ \param _initial_bytes The number of bytes in \a _initial_data.
+ \return 0 if the data appears to be Opus, or a negative value on error.
+ \retval #OP_FALSE There was not enough data to tell if this was an Opus
+ stream or not.
+ \retval #OP_EFAULT An internal memory allocation failed.
+ \retval #OP_EIMPL The stream used a feature that is not implemented,
+ such as an unsupported channel family.
+ \retval #OP_ENOTFORMAT If the data did not contain a recognizable ID
+ header for an Opus stream.
+ \retval #OP_EVERSION If the version field signaled a version this library
+ does not know how to parse.
+ \retval #OP_EBADHEADER The ID header was not properly formatted or contained
+ illegal values.*/
+int op_test(OpusHead *_head,
+ const unsigned char *_initial_data,size_t _initial_bytes);
+
+/**Open a stream from the given file path.
+ \param _path The path to the file to open.
+ \param[out] _error Returns 0 on success, or a failure code on error.
+ You may pass in NULL if you don't want the
+ failure code.
+ The failure code will be #OP_EFAULT if the file could not
+ be opened, or one of the other failure codes from
+ op_open_callbacks() otherwise.
+ \return A freshly opened \c OggOpusFile, or NULL on error.*/
+OP_WARN_UNUSED_RESULT OggOpusFile *op_open_file(const char *_path,int *_error)
+ OP_ARG_NONNULL(1);
+
+/**Open a stream from a memory buffer.
+ \param _data The memory buffer to open.
+ \param _size The number of bytes in the buffer.
+ \param[out] _error Returns 0 on success, or a failure code on error.
+ You may pass in NULL if you don't want the
+ failure code.
+ See op_open_callbacks() for a full list of failure codes.
+ \return A freshly opened \c OggOpusFile, or NULL on error.*/
+OP_WARN_UNUSED_RESULT OggOpusFile *op_open_memory(const unsigned char *_data,
+ size_t _size,int *_error);
+
+/**Open a stream from a URL.
+ This function behaves identically to op_open_url(), except that it
+ takes a va_list instead of a variable number of arguments.
+ It does not call the va_end macro, and because it invokes the
+ va_arg macro, the value of \a _ap is undefined after the call.
+ \note If you use this function, you must link against libopusurl.
+ \param _url The URL to open.
+ Currently only the <file:>, <http:>, and <https:>
+ schemes are supported.
+ Both <http:> and <https:> may be disabled at compile
+ time, in which case opening such URLs will always
+ fail.
+ Currently this only supports URIs.
+ IRIs should be converted to UTF-8 and URL-escaped,
+ with internationalized domain names encoded in
+ punycode, before passing them to this function.
+ \param[out] _error Returns 0 on success, or a failure code on error.
+ You may pass in NULL if you don't want
+ the failure code.
+ See op_open_callbacks() for a full list of failure
+ codes.
+ \param[in,out] _ap A list of the \ref url_options "optional flags" to
+ use.
+ This is a variable-length list of options terminated
+ with NULL.
+ \return A freshly opened \c OggOpusFile, or NULL on error.*/
+OP_WARN_UNUSED_RESULT OggOpusFile *op_vopen_url(const char *_url,
+ int *_error,va_list _ap) OP_ARG_NONNULL(1);
+
+/**Open a stream from a URL.
+ \note If you use this function, you must link against libopusurl.
+ \param _url The URL to open.
+ Currently only the <file:>, <http:>, and <https:> schemes
+ are supported.
+ Both <http:> and <https:> may be disabled at compile
+ time, in which case opening such URLs will always fail.
+ Currently this only supports URIs.
+ IRIs should be converted to UTF-8 and URL-escaped, with
+ internationalized domain names encoded in punycode,
+ before passing them to this function.
+ \param[out] _error Returns 0 on success, or a failure code on error.
+ You may pass in NULL if you don't want the
+ failure code.
+ See op_open_callbacks() for a full list of failure codes.
+ \param ... The \ref url_options "optional flags" to use.
+ This is a variable-length list of options terminated with
+ NULL.
+ \return A freshly opened \c OggOpusFile, or NULL on error.*/
+OP_WARN_UNUSED_RESULT OggOpusFile *op_open_url(const char *_url,
+ int *_error,...) OP_ARG_NONNULL(1);
+
+/**Open a stream using the given set of callbacks to access it.
+ \param _source The stream to read from (e.g., a FILE *).
+ \param _cb The callbacks with which to access the stream.
+ read() must be implemented.
+ seek() and tell() may be NULL, or may always return -1
+ to indicate a source is unseekable, but if seek() is
+ implemented and succeeds on a particular source, then
+ tell() must also.
+ close() may be NULL, but if it is not, it will be
+ called when the \c OggOpusFile is destroyed by
+ op_free().
+ It will not be called if op_open_callbacks() fails
+ with an error.
+ \param _initial_data An initial buffer of data from the start of the
+ stream.
+ Applications can read some number of bytes from the
+ start of the stream to help identify this as an Opus
+ stream, and then provide them here to allow the
+ stream to be opened, even if it is unseekable.
+ \param _initial_bytes The number of bytes in \a _initial_data.
+ If the stream is seekable, its current position (as
+ reported by tell() at the start of this function)
+ must be equal to \a _initial_bytes.
+ Otherwise, seeking to absolute positions will + generate inconsistent results. + \param[out] _error Returns 0 on success, or a failure code on error. + You may pass in NULL if you don't want + the failure code. + The failure code will be one of +
+   #OP_EREAD: An underlying read, seek, or tell operation failed when it
+    should have succeeded, or we failed to find data in the stream we had
+    seen before.
+   #OP_EFAULT: There was a memory allocation failure, or an internal
+    library error.
+   #OP_EIMPL: The stream used a feature that is not implemented, such as
+    an unsupported channel family.
+   #OP_EINVAL: seek() was implemented and succeeded on this source, but
+    tell() did not, or the starting position indicator was not equal to
+    \a _initial_bytes.
+   #OP_ENOTFORMAT: The stream contained a link that did not have any
+    logical Opus streams in it.
+   #OP_EBADHEADER: A required header packet was not properly formatted,
+    contained illegal values, or was missing altogether.
+   #OP_EVERSION: An ID header contained an unrecognized version number.
+   #OP_EBADLINK: We failed to find data we had seen before after seeking.
+   #OP_EBADTIMESTAMP: The first or last timestamp in a link failed basic
+    validity checks.
+ \return A freshly opened \c OggOpusFile, or NULL on error.
+ libopusfile does not take ownership of the source
+ if the call fails.
+ The calling application is responsible for closing the source if
+ this call returns an error.*/
+OP_WARN_UNUSED_RESULT OggOpusFile *op_open_callbacks(void *_source,
+ const OpusFileCallbacks *_cb,const unsigned char *_initial_data,
+ size_t _initial_bytes,int *_error) OP_ARG_NONNULL(2);
+
+/**Partially open a stream from the given file path.
+ \see op_test_callbacks
+ \param _path The path to the file to open.
+ \param[out] _error Returns 0 on success, or a failure code on error.
+ You may pass in NULL if you don't want the
+ failure code.
+ The failure code will be #OP_EFAULT if the file could not
+ be opened, or one of the other failure codes from
+ op_open_callbacks() otherwise.
+ \return A partially opened \c OggOpusFile, or NULL on error.*/
+OP_WARN_UNUSED_RESULT OggOpusFile *op_test_file(const char *_path,int *_error)
+ OP_ARG_NONNULL(1);
+
+/**Partially open a stream from a memory buffer.
+ \see op_test_callbacks
+ \param _data The memory buffer to open.
+ \param _size The number of bytes in the buffer.
+ \param[out] _error Returns 0 on success, or a failure code on error.
+ You may pass in NULL if you don't want the
+ failure code.
+ See op_open_callbacks() for a full list of failure codes.
+ \return A partially opened \c OggOpusFile, or NULL on error.*/
+OP_WARN_UNUSED_RESULT OggOpusFile *op_test_memory(const unsigned char *_data,
+ size_t _size,int *_error);
+
+/**Partially open a stream from a URL.
+ This function behaves identically to op_test_url(), except that it
+ takes a va_list instead of a variable number of arguments.
+ It does not call the va_end macro, and because it invokes the
+ va_arg macro, the value of \a _ap is undefined after the call.
+ \note If you use this function, you must link against libopusurl.
+ \see op_test_url
+ \see op_test_callbacks
+ \param _url The URL to open.
+ Currently only the <file:>, <http:>, and <https:>
+ schemes are supported.
+ Both <http:> and <https:> may be disabled at compile
+ time, in which case opening such URLs will always
+ fail.
+ Currently this only supports URIs.
+ IRIs should be converted to UTF-8 and URL-escaped,
+ with internationalized domain names encoded in
+ punycode, before passing them to this function.
+ \param[out] _error Returns 0 on success, or a failure code on error.
+ You may pass in NULL if you don't want
+ the failure code.
+ See op_open_callbacks() for a full list of failure
+ codes.
+ \param[in,out] _ap A list of the \ref url_options "optional flags" to
+ use.
+ This is a variable-length list of options terminated
+ with NULL.
+ \return A partially opened \c OggOpusFile, or NULL on error.*/
+OP_WARN_UNUSED_RESULT OggOpusFile *op_vtest_url(const char *_url,
+ int *_error,va_list _ap) OP_ARG_NONNULL(1);
+
+/**Partially open a stream from a URL.
+ \note If you use this function, you must link against libopusurl.
+ \see op_test_callbacks
+ \param _url The URL to open.
+ Currently only the <file:>, <http:>, and <https:>
+ schemes are supported.
+ Both <http:> and <https:> may be disabled at compile
+ time, in which case opening such URLs will always fail.
+ Currently this only supports URIs.
+ IRIs should be converted to UTF-8 and URL-escaped, with
+ internationalized domain names encoded in punycode,
+ before passing them to this function.
+ \param[out] _error Returns 0 on success, or a failure code on error.
+ You may pass in NULL if you don't want the
+ failure code.
+ See op_open_callbacks() for a full list of failure
+ codes.
+ \param ...
The \ref url_options "optional flags" to use. + This is a variable-length list of options terminated + with NULL. + \return A partially opened \c OggOpusFile, or NULL on error.*/ +OP_WARN_UNUSED_RESULT OggOpusFile *op_test_url(const char *_url, + int *_error,...) OP_ARG_NONNULL(1); + +/**Partially open a stream using the given set of callbacks to access it. + This tests for Opusness and loads the headers for the first link. + It does not seek (although it tests for seekability). + You can query a partially open stream for the few pieces of basic + information returned by op_serialno(), op_channel_count(), op_head(), and + op_tags() (but only for the first link). + You may also determine if it is seekable via a call to op_seekable(). + You cannot read audio from the stream, seek, get the size or duration, + get information from links other than the first one, or even get the total + number of links until you finish opening the stream with op_test_open(). + If you do not need to do any of these things, you can dispose of it with + op_free() instead. + + This function is provided mostly to simplify porting existing code that used + libvorbisfile. + For new code, you are likely better off using op_test() instead, which + is less resource-intensive, requires less data to succeed, and imposes a + hard limit on the amount of data it examines (important for unseekable + sources, where all such data must be buffered until you are sure of the + stream type). + \param _source The stream to read from (e.g., a FILE *). + \param _cb The callbacks with which to access the stream. + read() must + be implemented. + seek() and + tell() may + be NULL, or may always return -1 to + indicate a source is unseekable, but if + seek() is + implemented and succeeds on a particular source, then + tell() must + also. + close() may + be NULL, but if it is not, it will be + called when the \c OggOpusFile is destroyed by + op_free(). + It will not be called if op_open_callbacks() fails + with an error. + \param _initial_data An initial buffer of data from the start of the + stream. + Applications can read some number of bytes from the + start of the stream to help identify this as an Opus + stream, and then provide them here to allow the + stream to be tested more thoroughly, even if it is + unseekable. + \param _initial_bytes The number of bytes in \a _initial_data. + If the stream is seekable, its current position (as + reported by + tell() + at the start of this function) must be equal to + \a _initial_bytes. + Otherwise, seeking to absolute positions will + generate inconsistent results. + \param[out] _error Returns 0 on success, or a failure code on error. + You may pass in NULL if you don't want + the failure code. + See op_open_callbacks() for a full list of failure + codes. + \return A partially opened \c OggOpusFile, or NULL on error. + libopusfile does not take ownership of the source + if the call fails. + The calling application is responsible for closing the source if + this call returns an error.*/ +OP_WARN_UNUSED_RESULT OggOpusFile *op_test_callbacks(void *_source, + const OpusFileCallbacks *_cb,const unsigned char *_initial_data, + size_t _initial_bytes,int *_error) OP_ARG_NONNULL(2); + +/**Finish opening a stream partially opened with op_test_callbacks() or one of + the associated convenience functions. + If this function fails, you are still responsible for freeing the + \c OggOpusFile with op_free(). + \param _of The \c OggOpusFile to finish opening. 
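A short sketch of this two-stage open (the stereo predicate is arbitrary; op_channel_count() is one of the queries permitted on a partially open stream):

    #include <opusfile.h>

    /* Editor's sketch: partially open, inspect the first link, then
       either finish opening the stream or discard the handle. */
    static OggOpusFile *open_if_stereo(const char *path) {
      int err;
      OggOpusFile *of = op_test_file(path, &err);
      if (of == NULL) return NULL;
      if (op_channel_count(of, 0) != 2 || op_test_open(of) < 0) {
        op_free(of);  /* freeing is still our job after a failed op_test_open() */
        return NULL;
      }
      return of;
    }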
+ \return 0 on success, or a negative value on error.
+ \retval #OP_EREAD An underlying read, seek, or tell operation failed
+ when it should have succeeded.
+ \retval #OP_EFAULT There was a memory allocation failure, or an
+ internal library error.
+ \retval #OP_EIMPL The stream used a feature that is not implemented,
+ such as an unsupported channel family.
+ \retval #OP_EINVAL The stream was not partially opened with
+ op_test_callbacks() or one of the associated
+ convenience functions.
+ \retval #OP_ENOTFORMAT The stream contained a link that did not have any
+ logical Opus streams in it.
+ \retval #OP_EBADHEADER A required header packet was not properly
+ formatted, contained illegal values, or was
+ missing altogether.
+ \retval #OP_EVERSION An ID header contained an unrecognized version
+ number.
+ \retval #OP_EBADLINK We failed to find data we had seen before after
+ seeking.
+ \retval #OP_EBADTIMESTAMP The first or last timestamp in a link failed basic
+ validity checks.*/
+int op_test_open(OggOpusFile *_of) OP_ARG_NONNULL(1);
+
+/**Release all memory used by an \c OggOpusFile.
+ \param _of The \c OggOpusFile to free.*/
+void op_free(OggOpusFile *_of);
+
+/*@}*/
+/*@}*/
+
+/**\defgroup stream_info Stream Information*/
+/*@{*/
+/**\name Functions for obtaining information about streams
+
+ These functions allow you to get basic information about a stream, including
+ seekability, the number of links (for chained streams), plus the size,
+ duration, bitrate, header parameters, and meta information for each link
+ (or, where available, the stream as a whole).
+ Some of these (size, duration) are only available for seekable streams.
+ You can also query the current stream position, link, and playback time,
+ and instantaneous bitrate during playback.
+
+ Some of these functions may be used successfully on the partially open
+ streams returned by op_test_callbacks() or one of the associated
+ convenience functions.
+ Their documentation will indicate so explicitly.*/
+/*@{*/
+
+/**Returns whether or not the data source being read is seekable.
+ This is true if
+   1. The seek() and tell() callbacks are both non-NULL,
+   2. The seek() callback was successfully executed at least once, and
+   3. The tell() callback was successfully able to report the position
+      indicator afterwards.
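As a sketch of how this check is typically used to gate seek-dependent logic (op_pcm_seek() is declared in the seeking section further below; OP_ENOSEEK is the library's "not seekable" error code):

    #include <opusfile.h>

    /* Editor's sketch: only attempt a seek when all three conditions held. */
    static int try_seek(OggOpusFile *of, ogg_int64_t pcm_offset) {
      return op_seekable(of) ? op_pcm_seek(of, pcm_offset) : OP_ENOSEEK;
    }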
    + This function may be called on partially-opened streams. + \param _of The \c OggOpusFile whose seekable status is to be returned. + \return A non-zero value if seekable, and 0 if unseekable.*/ +int op_seekable(const OggOpusFile *_of) OP_ARG_NONNULL(1); + +/**Returns the number of links in this chained stream. + This function may be called on partially-opened streams, but it will always + return 1. + The actual number of links is not known until the stream is fully opened. + \param _of The \c OggOpusFile from which to retrieve the link count. + \return For fully-open seekable sources, this returns the total number of + links in the whole stream, which will be at least 1. + For partially-open or unseekable sources, this always returns 1.*/ +int op_link_count(const OggOpusFile *_of) OP_ARG_NONNULL(1); + +/**Get the serial number of the given link in a (possibly-chained) Ogg Opus + stream. + This function may be called on partially-opened streams, but it will always + return the serial number of the Opus stream in the first link. + \param _of The \c OggOpusFile from which to retrieve the serial number. + \param _li The index of the link whose serial number should be retrieved. + Use a negative number to get the serial number of the current + link. + \return The serial number of the given link. + If \a _li is greater than the total number of links, this returns + the serial number of the last link. + If the source is not seekable, this always returns the serial number + of the current link.*/ +opus_uint32 op_serialno(const OggOpusFile *_of,int _li) OP_ARG_NONNULL(1); + +/**Get the channel count of the given link in a (possibly-chained) Ogg Opus + stream. + This is equivalent to op_head(_of,_li)->channel_count, but + is provided for convenience. + This function may be called on partially-opened streams, but it will always + return the channel count of the Opus stream in the first link. + \param _of The \c OggOpusFile from which to retrieve the channel count. + \param _li The index of the link whose channel count should be retrieved. + Use a negative number to get the channel count of the current + link. + \return The channel count of the given link. + If \a _li is greater than the total number of links, this returns + the channel count of the last link. + If the source is not seekable, this always returns the channel count + of the current link.*/ +int op_channel_count(const OggOpusFile *_of,int _li) OP_ARG_NONNULL(1); + +/**Get the total (compressed) size of the stream, or of an individual link in + a (possibly-chained) Ogg Opus stream, including all headers and Ogg muxing + overhead. + \warning If the Opus stream (or link) is concurrently multiplexed with other + logical streams (e.g., video), this returns the size of the entire stream + (or link), not just the number of bytes in the first logical Opus stream. + Returning the latter would require scanning the entire file. + \param _of The \c OggOpusFile from which to retrieve the compressed size. + \param _li The index of the link whose compressed size should be computed. + Use a negative number to get the compressed size of the entire + stream. + \return The compressed size of the entire stream if \a _li is negative, the + compressed size of link \a _li if it is non-negative, or a negative + value on error. + The compressed size of the entire stream may be smaller than that + of the underlying source if trailing garbage was detected in the + file. 
+ \retval #OP_EINVAL The source is not seekable (so we can't know the length), + \a _li wasn't less than the total number of links in + the stream, or the stream was only partially open.*/ +opus_int64 op_raw_total(const OggOpusFile *_of,int _li) OP_ARG_NONNULL(1); + +/**Get the total PCM length (number of samples at 48 kHz) of the stream, or of + an individual link in a (possibly-chained) Ogg Opus stream. + Users looking for op_time_total() should use op_pcm_total() + instead. + Because timestamps in Opus are fixed at 48 kHz, there is no need for a + separate function to convert this to seconds (and leaving it out avoids + introducing floating point to the API, for those that wish to avoid it). + \param _of The \c OggOpusFile from which to retrieve the PCM offset. + \param _li The index of the link whose PCM length should be computed. + Use a negative number to get the PCM length of the entire stream. + \return The PCM length of the entire stream if \a _li is negative, the PCM + length of link \a _li if it is non-negative, or a negative value on + error. + \retval #OP_EINVAL The source is not seekable (so we can't know the length), + \a _li wasn't less than the total number of links in + the stream, or the stream was only partially open.*/ +ogg_int64_t op_pcm_total(const OggOpusFile *_of,int _li) OP_ARG_NONNULL(1); + +/**Get the ID header information for the given link in a (possibly chained) Ogg + Opus stream. + This function may be called on partially-opened streams, but it will always + return the ID header information of the Opus stream in the first link. + \param _of The \c OggOpusFile from which to retrieve the ID header + information. + \param _li The index of the link whose ID header information should be + retrieved. + Use a negative number to get the ID header information of the + current link. + For an unseekable stream, \a _li is ignored, and the ID header + information for the current link is always returned, if + available. + \return The contents of the ID header for the given link.*/ +const OpusHead *op_head(const OggOpusFile *_of,int _li) OP_ARG_NONNULL(1); + +/**Get the comment header information for the given link in a (possibly + chained) Ogg Opus stream. + This function may be called on partially-opened streams, but it will always + return the tags from the Opus stream in the first link. + \param _of The \c OggOpusFile from which to retrieve the comment header + information. + \param _li The index of the link whose comment header information should be + retrieved. + Use a negative number to get the comment header information of + the current link. + For an unseekable stream, \a _li is ignored, and the comment + header information for the current link is always returned, if + available. + \return The contents of the comment header for the given link, or + NULL if this is an unseekable stream that encountered + an invalid link.*/ +const OpusTags *op_tags(const OggOpusFile *_of,int _li) OP_ARG_NONNULL(1); + +/**Retrieve the index of the current link. + This is the link that produced the data most recently read by + op_read_float() or its associated functions, or, after a seek, the link + that the seek target landed in. + Reading more data may advance the link index (even on the first read after a + seek). + \param _of The \c OggOpusFile from which to retrieve the current link index. + \return The index of the current link on success, or a negative value on + failure. + For seekable streams, this is a number between 0 and the value + returned by op_link_count(). 
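Because Opus timestamps are fixed at 48 kHz, converting the PCM total to seconds is a single division by 48000; a small sketch of op_pcm_total() in use (the path is a placeholder):

    #include <stdio.h>
    #include <opusfile.h>

    /* Editor's sketch: open a file and print its duration in seconds. */
    static void print_duration(const char *path) {
      int err;
      OggOpusFile *of = op_open_file(path, &err);
      if (of == NULL) return;
      ogg_int64_t total = op_pcm_total(of, -1);  /* fails if unseekable */
      if (total >= 0) printf("%.2f s\n", (double)total / 48000.0);
      op_free(of);
    }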
+ For unseekable streams, this value starts at 0 and increments by one
+ each time a new link is encountered (even though op_link_count()
+ always returns 1).
+ \retval #OP_EINVAL The stream was only partially open.*/
+int op_current_link(const OggOpusFile *_of) OP_ARG_NONNULL(1);
+
+/**Computes the bitrate of the stream, or of an individual link in a
+ (possibly-chained) Ogg Opus stream.
+ The stream must be seekable to compute the bitrate.
+ For unseekable streams, use op_bitrate_instant() to get periodic estimates.
+ \warning If the Opus stream (or link) is concurrently multiplexed with other
+ logical streams (e.g., video), this uses the size of the entire stream (or
+ link) to compute the bitrate, not just the number of bytes in the first
+ logical Opus stream.
+ Returning the latter requires scanning the entire file, but this may be done
+ by decoding the whole file and calling op_bitrate_instant() once at the
+ end.
+ Install a trivial decoding callback with op_set_decode_callback() if you
+ wish to skip actual decoding during this process.
+ \param _of The \c OggOpusFile from which to retrieve the bitrate.
+ \param _li The index of the link whose bitrate should be computed.
+ Use a negative number to get the bitrate of the whole stream.
+ \return The bitrate on success, or a negative value on error.
+ \retval #OP_EINVAL The stream was only partially open, the stream was not
+ seekable, or \a _li was larger than the number of
+ links.*/
+opus_int32 op_bitrate(const OggOpusFile *_of,int _li) OP_ARG_NONNULL(1);
+
+/**Compute the instantaneous bitrate, measured as the ratio of bits to playable
+ samples decoded since a) the last call to op_bitrate_instant(), b) the last
+ seek, or c) the start of playback, whichever was most recent.
+ This will spike somewhat after a seek or at the start/end of a chain
+ boundary, as pre-skip, pre-roll, and end-trimming causes samples to be
+ decoded but not played.
+ \param _of The \c OggOpusFile from which to retrieve the bitrate.
+ \return The bitrate, in bits per second, or a negative value on error.
+ \retval #OP_FALSE No data has been decoded since any of the events
+ described above.
+ \retval #OP_EINVAL The stream was only partially open.*/
+opus_int32 op_bitrate_instant(OggOpusFile *_of) OP_ARG_NONNULL(1);
+
+/**Obtain the current value of the position indicator for \a _of.
+ \param _of The \c OggOpusFile from which to retrieve the position indicator.
+ \return The byte position that is currently being read from.
+ \retval #OP_EINVAL The stream was only partially open.*/
+opus_int64 op_raw_tell(const OggOpusFile *_of) OP_ARG_NONNULL(1);
+
+/**Obtain the PCM offset of the next sample to be read.
+ If the stream is not properly timestamped, this might not increment by the
+ proper amount between reads, or even return monotonically increasing
+ values.
+ \param _of The \c OggOpusFile from which to retrieve the PCM offset.
+ \return The PCM offset of the next sample to be read.
+ \retval #OP_EINVAL The stream was only partially open.*/
+ogg_int64_t op_pcm_tell(const OggOpusFile *_of) OP_ARG_NONNULL(1);
+
+/*@}*/
+/*@}*/
+
+/**\defgroup stream_seeking Seeking*/
+/*@{*/
+/**\name Functions for seeking in Opus streams
+
+ These functions let you seek in Opus streams, if the underlying source
+ supports it.
+ Seeking is implemented for all built-in stream I/O routines, though some
+ individual sources may not be seekable (pipes, live HTTP streams, or HTTP
+ streams from a server that does not support Range requests).
+ + op_raw_seek() is the fastest: it is guaranteed to perform at most one + physical seek, but, since the target is a byte position, makes no guarantee + how close to a given time it will come. + op_pcm_seek() provides sample-accurate seeking. + The number of physical seeks it requires is still quite small (often 1 or + 2, even in highly variable bitrate streams). + + Seeking in Opus requires decoding some pre-roll amount before playback to + allow the internal state to converge (as if recovering from packet loss). + This is handled internally by libopusfile, but means there is + little extra overhead for decoding up to the exact position requested + (since it must decode some amount of audio anyway). + It also means that decoding after seeking may not return exactly the same + values as would be obtained by decoding the stream straight through. + However, such differences are expected to be smaller than the loss + introduced by Opus's lossy compression.*/ +/*@{*/ + +/**Seek to a byte offset relative to the compressed data. + This also scans packets to update the PCM cursor. + It will cross a logical bitstream boundary, but only if it can't get any + packets out of the tail of the link to which it seeks. + \param _of The \c OggOpusFile in which to seek. + \param _byte_offset The byte position to seek to. + \return 0 on success, or a negative error code on failure. + \retval #OP_EREAD The underlying seek operation failed. + \retval #OP_EINVAL The stream was only partially open, or the target was + outside the valid range for the stream. + \retval #OP_ENOSEEK This stream is not seekable. + \retval #OP_EBADLINK Failed to initialize a decoder for a stream for an + unknown reason.*/ +int op_raw_seek(OggOpusFile *_of,opus_int64 _byte_offset) OP_ARG_NONNULL(1); + +/**Seek to the specified PCM offset, such that decoding will begin at exactly + the requested position. + \param _of The \c OggOpusFile in which to seek. + \param _pcm_offset The PCM offset to seek to. + This is in samples at 48 kHz relative to the start of the + stream. + \return 0 on success, or a negative value on error. + \retval #OP_EREAD An underlying read or seek operation failed. + \retval #OP_EINVAL The stream was only partially open, or the target was + outside the valid range for the stream. + \retval #OP_ENOSEEK This stream is not seekable. + \retval #OP_EBADLINK We failed to find data we had seen before, or the + bitstream structure was sufficiently malformed that + seeking to the target destination was impossible.*/ +int op_pcm_seek(OggOpusFile *_of,ogg_int64_t _pcm_offset) OP_ARG_NONNULL(1); + +/*@}*/ +/*@}*/ + +/**\defgroup stream_decoding Decoding*/ +/*@{*/ +/**\name Functions for decoding audio data + + These functions retrieve actual decoded audio data from the stream. + The general functions, op_read() and op_read_float() return 16-bit or + floating-point output, both using native endian ordering. + The number of channels returned can change from link to link in a chained + stream. + There are special functions, op_read_stereo() and op_read_float_stereo(), + which always output two channels, to simplify applications which do not + wish to handle multichannel audio. + These downmix multichannel files to two channels, so they can always return + samples in the same format for every link in a chained file. 
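Putting the seeking and decoding calls together, a minimal sketch (assumes a fully opened, seekable OggOpusFile; op_read_float_stereo() is declared later in this header, and the playback hand-off is left out):

    #include <opusfile.h>

    /* Editor's sketch: seek to 10 s, then drain downmixed stereo floats. */
    static void drain_from_ten_seconds(OggOpusFile *of) {
      float pcm[120 * 48 * 2];  /* 120 ms at 48 kHz, 2 channels */
      if (op_pcm_seek(of, 10 * 48000) < 0) return;
      for (;;) {
        int n = op_read_float_stereo(of, pcm, 120 * 48 * 2);
        if (n <= 0) break;  /* 0 at end-of-file; negative is an error code */
        /* n is samples per channel: 2*n floats are now valid in pcm. */
      }
    }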
+
+ If the rest of your audio processing chain can handle floating point, the
+ floating-point routines should be preferred, as they prevent clipping and
+ other issues which might be avoided entirely if, e.g., you scale down the
+ volume at some other stage.
+ However, if you intend to consume 16-bit samples directly, the conversion in
+ libopusfile provides noise-shaping dithering and, if compiled
+ against libopus 1.1 or later, soft-clipping prevention.
+
+ libopusfile can also be configured at compile time to use the
+ fixed-point libopus API.
+ If so, libopusfile's floating-point API may also be disabled.
+ In that configuration, nothing in libopusfile will use any
+ floating-point operations, to simplify support on devices without an
+ adequate FPU.
+
+ \warning HTTPS streams may be vulnerable to truncation attacks if you do
+ not check the error return code from op_read_float() or its associated
+ functions.
+ If the remote peer does not close the connection gracefully (with a TLS
+ "close notify" message), these functions will return #OP_EREAD instead of 0
+ when they reach the end of the file.
+ If you are reading from an <https:> URL (particularly if seeking is not
+ supported), you should make sure to check for this error and warn the user
+ appropriately.*/
+/*@{*/
+
+/**Indicates that the decoding callback should produce signed 16-bit
+ native-endian output samples.*/
+#define OP_DEC_FORMAT_SHORT (7008)
+/**Indicates that the decoding callback should produce 32-bit native-endian
+ float samples.*/
+#define OP_DEC_FORMAT_FLOAT (7040)
+
+/**Indicates that the decoding callback did not decode anything, and that
+ libopusfile should decode normally instead.*/
+#define OP_DEC_USE_DEFAULT (6720)
+
+/**Called to decode an Opus packet.
+ This should invoke the functional equivalent of opus_multistream_decode() or
+ opus_multistream_decode_float(), except that it returns 0 on success
+ instead of the number of decoded samples (which is known a priori).
+ \param _ctx The application-provided callback context.
+ \param _decoder The decoder to use to decode the packet.
+ \param[out] _pcm The buffer to decode into.
+ This will always have enough room for \a _nchannels of
+ \a _nsamples samples, which should be placed into this
+ buffer interleaved.
+ \param _op The packet to decode.
+ This will always have its granule position set to a valid
+ value.
+ \param _nsamples The number of samples expected from the packet.
+ \param _nchannels The number of channels expected from the packet.
+ \param _format The desired sample output format.
+ This is either #OP_DEC_FORMAT_SHORT or
+ #OP_DEC_FORMAT_FLOAT.
+ \param _li The index of the link from which this packet was decoded.
+ \return A non-negative value on success, or a negative value on error.
+ Any error codes should be the same as those returned by
+ opus_multistream_decode() or opus_multistream_decode_float().
+ Success codes are as follows:
+ \retval 0 Decoding was successful.
+ The application has filled the buffer with
+ exactly \a _nsamples*\a _nchannels samples
+ in the requested format.
+ \retval #OP_DEC_USE_DEFAULT No decoding was done.
+ libopusfile should do the decoding
+ by itself instead.*/
+typedef int (*op_decode_cb_func)(void *_ctx,OpusMSDecoder *_decoder,void *_pcm,
+ const ogg_packet *_op,int _nsamples,int _nchannels,int _format,int _li);
+
+/**Sets the packet decode callback function.
+ If set, this is called once for each packet that needs to be decoded.
+ This can be used by advanced applications to do additional processing on the
+ compressed or uncompressed data.
+ For example, an application might save the final entropy coder state for
+ debugging and testing purposes, or it might apply additional filters
+ before the downmixing, dithering, or soft-clipping performed by
+ libopusfile, so long as these filters do not introduce any
+ latency.
+
+ A call to this function is no guarantee that the audio will eventually be
+ delivered to the application.
+ libopusfile may discard some or all of the decoded audio data
+ (i.e., at the beginning or end of a link, or after a seek); however, the
+ callback is still required to provide all of it.
+ \param _of The \c OggOpusFile on which to set the decode callback.
+ \param _decode_cb The callback function to call.
+ This may be NULL to disable calling the
+ callback.
+ \param _ctx The application-provided context pointer to pass to the
+ callback on each call.*/
+void op_set_decode_callback(OggOpusFile *_of,
+ op_decode_cb_func _decode_cb,void *_ctx) OP_ARG_NONNULL(1);
+
+/**Gain offset type that indicates that the provided offset is relative to the
+ header gain.
+ This is the default.*/
+#define OP_HEADER_GAIN (0)
+
+/**Gain offset type that indicates that the provided offset is relative to the
+ R128_ALBUM_GAIN value (if any), in addition to the header gain.*/
+#define OP_ALBUM_GAIN (3007)
+
+/**Gain offset type that indicates that the provided offset is relative to the
+ R128_TRACK_GAIN value (if any), in addition to the header gain.*/
+#define OP_TRACK_GAIN (3008)
+
+/**Gain offset type that indicates that the provided offset should be used as
+ the gain directly, without applying any of the header or track gains.*/
+#define OP_ABSOLUTE_GAIN (3009)
+
+/**Sets the gain to be used for decoded output.
+ By default, the gain in the header is applied with no additional offset.
+ The total gain (including header gain and/or track gain, if applicable, and
+ this offset), will be clamped to [-32768,32767]/256 dB.
+ This is more than enough to saturate or underflow 16-bit PCM.
+ \note The new gain will not be applied to any already buffered, decoded
+ output.
+ This means you cannot change it sample-by-sample, as at best it will be
+ updated packet-by-packet.
+ It is meant for setting a target volume level, rather than applying smooth
+ fades, etc.
+ \param _of The \c OggOpusFile on which to set the gain offset.
+ \param _gain_type One of #OP_HEADER_GAIN, #OP_ALBUM_GAIN,
+ #OP_TRACK_GAIN, or #OP_ABSOLUTE_GAIN.
+ \param _gain_offset_q8 The gain offset to apply, in 1/256ths of a dB.
+ \return 0 on success or a negative value on error.
+ \retval #OP_EINVAL The \a _gain_type was unrecognized.*/
+int op_set_gain_offset(OggOpusFile *_of,
+ int _gain_type,opus_int32 _gain_offset_q8) OP_ARG_NONNULL(1);
+
+/**Sets whether or not dithering is enabled for 16-bit decoding.
+ By default, when libopusfile is compiled to use floating-point
+ internally, calling op_read() or op_read_stereo() will first decode to
+ float, and then convert to fixed-point using noise-shaping dithering.
+ This flag can be used to disable that dithering.
+ When the application uses op_read_float() or op_read_float_stereo(), or when
+ the library has been compiled to decode directly to fixed point, this flag
+ has no effect.
+ \param _of The \c OggOpusFile on which to enable or disable dithering.
+ \param _enabled A non-zero value to enable dithering, or 0 to disable it.*/ +void op_set_dither_enabled(OggOpusFile *_of,int _enabled) OP_ARG_NONNULL(1); + +/**Reads more samples from the stream. + \note Although \a _buf_size must indicate the total number of values that + can be stored in \a _pcm, the return value is the number of samples + per channel. + This is done because +
+         1. The channel count cannot be known a priori (reading more samples
+            might advance us into the next link, with a different channel
+            count), so \a _buf_size cannot also be in units of samples per
+            channel,
+         2. Returning the samples per channel matches the libopus API
+            as closely as we're able,
+         3. Returning the total number of values instead of samples per
+            channel would mean the caller would need a division to compute
+            the samples per channel, and might worry about the possibility of
+            getting back samples for some channels and not others, and
+         4. This approach is relatively fool-proof: if an application passes
+            too small a value to \a _buf_size, they will simply get fewer
+            samples back, and if they assume the return value is the total
+            number of values, then they will simply read too few (rather than
+            reading too many and going off the end of the buffer).
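+
+         A minimal read loop illustrating this convention (a sketch only,
+          not part of the libopusfile API: it assumes "of" is a successfully
+          opened OggOpusFile, and process() is a hypothetical consumer of
+          nsamples samples per channel of interleaved PCM):
+
+            opus_int16 pcm[5760*2];
+            int        li;
+            for(;;){
+              int nsamples;
+              nsamples=op_read(of,pcm,sizeof(pcm)/sizeof(*pcm),&li);
+              if(nsamples<=0)break;
+              process(pcm,nsamples,op_head(of,li)->channel_count);
+            }
+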
+   \param      _of       The \c OggOpusFile from which to read.
+   \param[out] _pcm      A buffer in which to store the output PCM samples, as
+                          signed native-endian 16-bit values at 48 kHz
+                          with a nominal range of [-32768,32767).
+                         Multiple channels are interleaved using the
+                          Vorbis channel ordering.
+                         This must have room for at least \a _buf_size values.
+   \param      _buf_size The number of values that can be stored in \a _pcm.
+                         It is recommended that this be large enough for at
+                          least 120 ms of data at 48 kHz per channel (5760
+                          values per channel).
+                         Smaller buffers will simply return less data, possibly
+                          consuming more memory to buffer the data internally.
+                         libopusfile may return less data than requested.
+                         If so, there is no guarantee that the remaining data
+                          in \a _pcm will be unmodified.
+   \param[out] _li       The index of the link this data was decoded from.
+                         You may pass NULL if you do not need this
+                          information.
+                         If this function fails (returning a negative value),
+                          this parameter is left unset.
+   \return The number of samples read per channel on success, or a negative
+            value on failure.
+           The channel count can be retrieved on success by calling
+            op_head(_of,*_li).
+           The number of samples returned may be 0 if the buffer was too small
+            to store even a single sample for all channels, or if end-of-file
+            was reached.
+           The list of possible failure codes follows.
+           Most of them can only be returned by unseekable, chained streams
+            that encounter a new link.
+   \retval #OP_HOLE          There was a hole in the data, and some samples
+                              may have been skipped.
+                             Call this function again to continue decoding
+                              past the hole.
+   \retval #OP_EREAD         An underlying read operation failed.
+                             This may signal a truncation attack from an
+                              <https:> source.
+   \retval #OP_EFAULT        An internal memory allocation failed.
+   \retval #OP_EIMPL         An unseekable stream encountered a new link that
+                              used a feature that is not implemented, such as
+                              an unsupported channel family.
+   \retval #OP_EINVAL        The stream was only partially open.
+   \retval #OP_ENOTFORMAT    An unseekable stream encountered a new link that
+                              did not have any logical Opus streams in it.
+   \retval #OP_EBADHEADER    An unseekable stream encountered a new link with a
+                              required header packet that was not properly
+                              formatted, contained illegal values, or was
+                              missing altogether.
+   \retval #OP_EVERSION      An unseekable stream encountered a new link with
+                              an ID header that contained an unrecognized
+                              version number.
+   \retval #OP_EBADPACKET    Failed to properly decode the next packet.
+   \retval #OP_EBADLINK      We failed to find data we had seen before.
+   \retval #OP_EBADTIMESTAMP An unseekable stream encountered a new link with
+                              a starting timestamp that failed basic validity
+                              checks.*/
+OP_WARN_UNUSED_RESULT int op_read(OggOpusFile *_of,
+ opus_int16 *_pcm,int _buf_size,int *_li) OP_ARG_NONNULL(1);
+
+/**Reads more samples from the stream.
+   \note Although \a _buf_size must indicate the total number of values that
+          can be stored in \a _pcm, the return value is the number of samples
+          per channel.
+         This is done because
+         1. The channel count cannot be known a priori (reading more samples
+            might advance us into the next link, with a different channel
+            count), so \a _buf_size cannot also be in units of samples per
+            channel,
+         2. Returning the samples per channel matches the libopus API
+            as closely as we're able,
+         3. Returning the total number of values instead of samples per
+            channel would mean the caller would need a division to compute
+            the samples per channel, and might worry about the possibility of
+            getting back samples for some channels and not others, and
+         4. This approach is relatively fool-proof: if an application passes
+            too small a value to \a _buf_size, they will simply get fewer
+            samples back, and if they assume the return value is the total
+            number of values, then they will simply read too few (rather than
+            reading too many and going off the end of the buffer).
+   \param      _of       The \c OggOpusFile from which to read.
+   \param[out] _pcm      A buffer in which to store the output PCM samples as
+                          signed floats at 48 kHz with a nominal range of
+                          [-1.0,1.0].
+                         Multiple channels are interleaved using the
+                          Vorbis channel ordering.
+                         This must have room for at least \a _buf_size floats.
+   \param      _buf_size The number of floats that can be stored in \a _pcm.
+                         It is recommended that this be large enough for at
+                          least 120 ms of data at 48 kHz per channel (5760
+                          samples per channel).
+                         Smaller buffers will simply return less data, possibly
+                          consuming more memory to buffer the data internally.
+                         If less than \a _buf_size values are returned,
+                          libopusfile makes no guarantee that the
+                          remaining data in \a _pcm will be unmodified.
+   \param[out] _li       The index of the link this data was decoded from.
+                         You may pass NULL if you do not need this
+                          information.
+                         If this function fails (returning a negative value),
+                          this parameter is left unset.
+   \return The number of samples read per channel on success, or a negative
+            value on failure.
+           The channel count can be retrieved on success by calling
+            op_head(_of,*_li).
+           The number of samples returned may be 0 if the buffer was too small
+            to store even a single sample for all channels, or if end-of-file
+            was reached.
+           The list of possible failure codes follows.
+           Most of them can only be returned by unseekable, chained streams
+            that encounter a new link.
+   \retval #OP_HOLE          There was a hole in the data, and some samples
+                              may have been skipped.
+                             Call this function again to continue decoding
+                              past the hole.
+   \retval #OP_EREAD         An underlying read operation failed.
+                             This may signal a truncation attack from an
+                              <https:> source.
+   \retval #OP_EFAULT        An internal memory allocation failed.
+   \retval #OP_EIMPL         An unseekable stream encountered a new link that
+                              used a feature that is not implemented, such as
+                              an unsupported channel family.
+   \retval #OP_EINVAL        The stream was only partially open.
+   \retval #OP_ENOTFORMAT    An unseekable stream encountered a new link that
+                              did not have any logical Opus streams in it.
+   \retval #OP_EBADHEADER    An unseekable stream encountered a new link with a
+                              required header packet that was not properly
+                              formatted, contained illegal values, or was
+                              missing altogether.
+   \retval #OP_EVERSION      An unseekable stream encountered a new link with
+                              an ID header that contained an unrecognized
+                              version number.
+   \retval #OP_EBADPACKET    Failed to properly decode the next packet.
+   \retval #OP_EBADLINK      We failed to find data we had seen before.
+   \retval #OP_EBADTIMESTAMP An unseekable stream encountered a new link with
+                              a starting timestamp that failed basic validity
+                              checks.*/
+OP_WARN_UNUSED_RESULT int op_read_float(OggOpusFile *_of,
+ float *_pcm,int _buf_size,int *_li) OP_ARG_NONNULL(1);
+
+/**Reads more samples from the stream and downmixes to stereo, if necessary.
+   This function is intended for simple players that want a uniform output
+    format, even if the channel count changes between links in a chained
+    stream.
+   \note \a _buf_size indicates the total number of values that can be stored
+          in \a _pcm, while the return value is the number of samples per
+          channel, even though the channel count is known, for consistency with
+          op_read().
+   \param      _of       The \c OggOpusFile from which to read.
+   \param[out] _pcm      A buffer in which to store the output PCM samples, as
+                          signed native-endian 16-bit values at 48 kHz
+                          with a nominal range of [-32768,32767).
+                         The left and right channels are interleaved in the
+                          buffer.
+                         This must have room for at least \a _buf_size values.
+   \param      _buf_size The number of values that can be stored in \a _pcm.
+                         It is recommended that this be large enough for at
+                          least 120 ms of data at 48 kHz per channel (11520
+                          values total).
+                         Smaller buffers will simply return less data, possibly
+                          consuming more memory to buffer the data internally.
+                         If less than \a _buf_size values are returned,
+                          libopusfile makes no guarantee that the
+                          remaining data in \a _pcm will be unmodified.
+   \return The number of samples read per channel on success, or a negative
+            value on failure.
+           The number of samples returned may be 0 if the buffer was too small
+            to store even a single sample for both channels, or if end-of-file
+            was reached.
+           The list of possible failure codes follows.
+           Most of them can only be returned by unseekable, chained streams
+            that encounter a new link.
+   \retval #OP_HOLE          There was a hole in the data, and some samples
+                              may have been skipped.
+                             Call this function again to continue decoding
+                              past the hole.
+   \retval #OP_EREAD         An underlying read operation failed.
+                             This may signal a truncation attack from an
+                              <https:> source.
+   \retval #OP_EFAULT        An internal memory allocation failed.
+   \retval #OP_EIMPL         An unseekable stream encountered a new link that
+                              used a feature that is not implemented, such as
+                              an unsupported channel family.
+   \retval #OP_EINVAL        The stream was only partially open.
+   \retval #OP_ENOTFORMAT    An unseekable stream encountered a new link that
+                              did not have any logical Opus streams in it.
+   \retval #OP_EBADHEADER    An unseekable stream encountered a new link with a
+                              required header packet that was not properly
+                              formatted, contained illegal values, or was
+                              missing altogether.
+   \retval #OP_EVERSION      An unseekable stream encountered a new link with
+                              an ID header that contained an unrecognized
+                              version number.
+   \retval #OP_EBADPACKET    Failed to properly decode the next packet.
+   \retval #OP_EBADLINK      We failed to find data we had seen before.
+   \retval #OP_EBADTIMESTAMP An unseekable stream encountered a new link with
+                              a starting timestamp that failed basic validity
+                              checks.*/
+OP_WARN_UNUSED_RESULT int op_read_stereo(OggOpusFile *_of,
+ opus_int16 *_pcm,int _buf_size) OP_ARG_NONNULL(1);
+
+/**Reads more samples from the stream and downmixes to stereo, if necessary.
+   This function is intended for simple players that want a uniform output
+    format, even if the channel count changes between links in a chained
+    stream.
+   \note \a _buf_size indicates the total number of values that can be stored
+          in \a _pcm, while the return value is the number of samples per
+          channel, even though the channel count is known, for consistency with
+          op_read_float().
+   \param      _of       The \c OggOpusFile from which to read.
+   \param[out] _pcm      A buffer in which to store the output PCM samples, as
+                          signed floats at 48 kHz with a nominal range of
+                          [-1.0,1.0].
+                         The left and right channels are interleaved in the
+                          buffer.
+                         This must have room for at least \a _buf_size values.
+   \param      _buf_size The number of values that can be stored in \a _pcm.
+                         It is recommended that this be large enough for at
+                          least 120 ms of data at 48 kHz per channel (11520
+                          values total).
+                         Smaller buffers will simply return less data, possibly
+                          consuming more memory to buffer the data internally.
+                         If less than \a _buf_size values are returned,
+                          libopusfile makes no guarantee that the
+                          remaining data in \a _pcm will be unmodified.
+   \return The number of samples read per channel on success, or a negative
+            value on failure.
+           The number of samples returned may be 0 if the buffer was too small
+            to store even a single sample for both channels, or if end-of-file
+            was reached.
+           The list of possible failure codes follows.
+           Most of them can only be returned by unseekable, chained streams
+            that encounter a new link.
+   \retval #OP_HOLE          There was a hole in the data, and some samples
+                              may have been skipped.
+                             Call this function again to continue decoding
+                              past the hole.
+   \retval #OP_EREAD         An underlying read operation failed.
+                             This may signal a truncation attack from an
+                              <https:> source.
+   \retval #OP_EFAULT        An internal memory allocation failed.
+   \retval #OP_EIMPL         An unseekable stream encountered a new link that
+                              used a feature that is not implemented, such as
+                              an unsupported channel family.
+   \retval #OP_EINVAL        The stream was only partially open.
+   \retval #OP_ENOTFORMAT    An unseekable stream encountered a new link that
+                              did not have any logical Opus streams in it.
+   \retval #OP_EBADHEADER    An unseekable stream encountered a new link with a
+                              required header packet that was not properly
+                              formatted, contained illegal values, or was
+                              missing altogether.
+   \retval #OP_EVERSION      An unseekable stream encountered a new link with
+                              an ID header that contained an unrecognized
+                              version number.
+   \retval #OP_EBADPACKET    Failed to properly decode the next packet.
+   \retval #OP_EBADLINK      We failed to find data we had seen before.
+   \retval #OP_EBADTIMESTAMP An unseekable stream encountered a new link with
+                              a starting timestamp that failed basic validity
+                              checks.*/
+OP_WARN_UNUSED_RESULT int op_read_float_stereo(OggOpusFile *_of,
+ float *_pcm,int _buf_size) OP_ARG_NONNULL(1);
+
+/*@}*/
+/*@}*/
+
+# if OP_GNUC_PREREQ(4,0)
+#  pragma GCC visibility pop
+# endif
+
+# if defined(__cplusplus)
+}
+# endif
+
+#endif
diff --git a/thirdparty/opus/opus_compare.c b/thirdparty/opus/opus_compare.c
new file mode 100644
index 000000000000..06c67d752f71
--- /dev/null
+++ b/thirdparty/opus/opus_compare.c
@@ -0,0 +1,379 @@
+/* Copyright (c) 2011-2012 Xiph.Org Foundation, Mozilla Corporation
+   Written by Jean-Marc Valin and Timothy B. Terriberry */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/ + +#include +#include +#include +#include + +#define OPUS_PI (3.14159265F) + +#define OPUS_COSF(_x) ((float)cos(_x)) +#define OPUS_SINF(_x) ((float)sin(_x)) + +static void *check_alloc(void *_ptr){ + if(_ptr==NULL){ + fprintf(stderr,"Out of memory.\n"); + exit(EXIT_FAILURE); + } + return _ptr; +} + +static void *opus_malloc(size_t _size){ + return check_alloc(malloc(_size)); +} + +static void *opus_realloc(void *_ptr,size_t _size){ + return check_alloc(realloc(_ptr,_size)); +} + +static size_t read_pcm16(float **_samples,FILE *_fin,int _nchannels){ + unsigned char buf[1024]; + float *samples; + size_t nsamples; + size_t csamples; + size_t xi; + size_t nread; + samples=NULL; + nsamples=csamples=0; + for(;;){ + nread=fread(buf,2*_nchannels,1024/(2*_nchannels),_fin); + if(nread<=0)break; + if(nsamples+nread>csamples){ + do csamples=csamples<<1|1; + while(nsamples+nread>csamples); + samples=(float *)opus_realloc(samples, + _nchannels*csamples*sizeof(*samples)); + } + for(xi=0;xi=_window_sz)ti-=_window_sz; + } + re*=_downsample; + im*=_downsample; + _ps[(xi*ps_sz+xj)*_nchannels+ci]=re*re+im*im+100000; + p[ci]+=_ps[(xi*ps_sz+xj)*_nchannels+ci]; + } + } + if(_out){ + _out[(xi*_nbands+bi)*_nchannels]=p[0]/(_bands[bi+1]-_bands[bi]); + if(_nchannels==2){ + _out[(xi*_nbands+bi)*_nchannels+1]=p[1]/(_bands[bi+1]-_bands[bi]); + } + } + } + } + free(window); +} + +#define NBANDS (21) +#define NFREQS (240) + +/*Bands on which we compute the pseudo-NMR (Bark-derived + CELT bands).*/ +static const int BANDS[NBANDS+1]={ + 0,2,4,6,8,10,12,14,16,20,24,28,32,40,48,56,68,80,96,120,156,200 +}; + +#define TEST_WIN_SIZE (480) +#define TEST_WIN_STEP (120) + +int main(int _argc,const char **_argv){ + FILE *fin1; + FILE *fin2; + float *x; + float *y; + float *xb; + float *X; + float *Y; + double err; + float Q; + size_t xlength; + size_t ylength; + size_t nframes; + size_t xi; + int ci; + int xj; + int bi; + int nchannels; + unsigned rate; + int downsample; + int ybands; + int yfreqs; + int max_compare; + if(_argc<3||_argc>6){ + fprintf(stderr,"Usage: %s [-s] [-r rate2] \n", + _argv[0]); + return EXIT_FAILURE; + } + nchannels=1; + if(strcmp(_argv[1],"-s")==0){ + nchannels=2; + _argv++; + } + rate=48000; + ybands=NBANDS; + yfreqs=NFREQS; + downsample=1; + if(strcmp(_argv[1],"-r")==0){ + rate=atoi(_argv[2]); + if(rate!=8000&&rate!=12000&&rate!=16000&&rate!=24000&&rate!=48000){ + fprintf(stderr, + "Sampling rate must be 8000, 12000, 16000, 24000, or 48000\n"); + return EXIT_FAILURE; + } + downsample=48000/rate; + switch(rate){ + case 8000:ybands=13;break; + case 12000:ybands=15;break; + case 16000:ybands=17;break; + case 24000:ybands=19;break; + } + yfreqs=NFREQS/downsample; + _argv+=2; + } + fin1=fopen(_argv[1],"rb"); + if(fin1==NULL){ + fprintf(stderr,"Error opening '%s'.\n",_argv[1]); + return EXIT_FAILURE; + } + fin2=fopen(_argv[2],"rb"); + if(fin2==NULL){ + fprintf(stderr,"Error opening '%s'.\n",_argv[2]); + fclose(fin1); + return EXIT_FAILURE; + } + /*Read in the data and allocate scratch space.*/ + xlength=read_pcm16(&x,fin1,2); + if(nchannels==1){ + for(xi=0;xi0;){ + for(ci=0;ci0){ + /*Temporal masking: -3 dB/2.5ms slope.*/ + for(bi=0;bi=79&&xj<=81)im*=0.1F; + if(xj==80)im*=0.1F; + Eb+=im; + } + } + Eb /= (BANDS[bi+1]-BANDS[bi])*nchannels; + Ef += Eb*Eb; + } + /*Using a fixed normalization value means we're willing to accept slightly + lower quality for lower sampling rates.*/ + Ef/=NBANDS; + Ef*=Ef; + err+=Ef*Ef; + } + err=pow(err/nframes,1.0/16); + Q=100*(1-0.5*log(1+err)/log(1.13)); + if(Q<0){ + 
fprintf(stderr,"Test vector FAILS\n"); + fprintf(stderr,"Internal weighted error is %f\n",err); + return EXIT_FAILURE; + } + else{ + fprintf(stderr,"Test vector PASSES\n"); + fprintf(stderr, + "Opus quality metric: %.1f %% (internal weighted error is %f)\n",Q,err); + return EXIT_SUCCESS; + } +} diff --git a/thirdparty/opus/opus_decoder.c b/thirdparty/opus/opus_decoder.c new file mode 100644 index 000000000000..080bec5072a1 --- /dev/null +++ b/thirdparty/opus/opus_decoder.c @@ -0,0 +1,981 @@ +/* Copyright (c) 2010 Xiph.Org Foundation, Skype Limited + Written by Jean-Marc Valin and Koen Vos */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#ifndef OPUS_BUILD +# error "OPUS_BUILD _MUST_ be defined to build Opus. This probably means you need other defines as well, as in a config.h. See the included build files for details." +#endif + +#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__) && !defined(OPUS_WILL_BE_SLOW) +# pragma message "You appear to be compiling without optimization, if so opus will be very slow." 
+#endif + +#include +#include "celt.h" +#include "opus.h" +#include "entdec.h" +#include "modes.h" +#include "API.h" +#include "stack_alloc.h" +#include "float_cast.h" +#include "opus_private.h" +#include "os_support.h" +#include "structs.h" +#include "define.h" +#include "mathops.h" +#include "cpu_support.h" + +struct OpusDecoder { + int celt_dec_offset; + int silk_dec_offset; + int channels; + opus_int32 Fs; /** Sampling rate (at the API level) */ + silk_DecControlStruct DecControl; + int decode_gain; + int arch; + + /* Everything beyond this point gets cleared on a reset */ +#define OPUS_DECODER_RESET_START stream_channels + int stream_channels; + + int bandwidth; + int mode; + int prev_mode; + int frame_size; + int prev_redundancy; + int last_packet_duration; +#ifndef FIXED_POINT + opus_val16 softclip_mem[2]; +#endif + + opus_uint32 rangeFinal; +}; + + +int opus_decoder_get_size(int channels) +{ + int silkDecSizeBytes, celtDecSizeBytes; + int ret; + if (channels<1 || channels > 2) + return 0; + ret = silk_Get_Decoder_Size( &silkDecSizeBytes ); + if(ret) + return 0; + silkDecSizeBytes = align(silkDecSizeBytes); + celtDecSizeBytes = celt_decoder_get_size(channels); + return align(sizeof(OpusDecoder))+silkDecSizeBytes+celtDecSizeBytes; +} + +int opus_decoder_init(OpusDecoder *st, opus_int32 Fs, int channels) +{ + void *silk_dec; + CELTDecoder *celt_dec; + int ret, silkDecSizeBytes; + + if ((Fs!=48000&&Fs!=24000&&Fs!=16000&&Fs!=12000&&Fs!=8000) + || (channels!=1&&channels!=2)) + return OPUS_BAD_ARG; + + OPUS_CLEAR((char*)st, opus_decoder_get_size(channels)); + /* Initialize SILK encoder */ + ret = silk_Get_Decoder_Size(&silkDecSizeBytes); + if (ret) + return OPUS_INTERNAL_ERROR; + + silkDecSizeBytes = align(silkDecSizeBytes); + st->silk_dec_offset = align(sizeof(OpusDecoder)); + st->celt_dec_offset = st->silk_dec_offset+silkDecSizeBytes; + silk_dec = (char*)st+st->silk_dec_offset; + celt_dec = (CELTDecoder*)((char*)st+st->celt_dec_offset); + st->stream_channels = st->channels = channels; + + st->Fs = Fs; + st->DecControl.API_sampleRate = st->Fs; + st->DecControl.nChannelsAPI = st->channels; + + /* Reset decoder */ + ret = silk_InitDecoder( silk_dec ); + if(ret)return OPUS_INTERNAL_ERROR; + + /* Initialize CELT decoder */ + ret = celt_decoder_init(celt_dec, Fs, channels); + if(ret!=OPUS_OK)return OPUS_INTERNAL_ERROR; + + celt_decoder_ctl(celt_dec, CELT_SET_SIGNALLING(0)); + + st->prev_mode = 0; + st->frame_size = Fs/400; + st->arch = opus_select_arch(); + return OPUS_OK; +} + +OpusDecoder *opus_decoder_create(opus_int32 Fs, int channels, int *error) +{ + int ret; + OpusDecoder *st; + if ((Fs!=48000&&Fs!=24000&&Fs!=16000&&Fs!=12000&&Fs!=8000) + || (channels!=1&&channels!=2)) + { + if (error) + *error = OPUS_BAD_ARG; + return NULL; + } + st = (OpusDecoder *)opus_alloc(opus_decoder_get_size(channels)); + if (st == NULL) + { + if (error) + *error = OPUS_ALLOC_FAIL; + return NULL; + } + ret = opus_decoder_init(st, Fs, channels); + if (error) + *error = ret; + if (ret != OPUS_OK) + { + opus_free(st); + st = NULL; + } + return st; +} + +static void smooth_fade(const opus_val16 *in1, const opus_val16 *in2, + opus_val16 *out, int overlap, int channels, + const opus_val16 *window, opus_int32 Fs) +{ + int i, c; + int inc = 48000/Fs; + for (c=0;csilk_dec_offset; + celt_dec = (CELTDecoder*)((char*)st+st->celt_dec_offset); + F20 = st->Fs/50; + F10 = F20>>1; + F5 = F10>>1; + F2_5 = F5>>1; + if (frame_size < F2_5) + { + RESTORE_STACK; + return OPUS_BUFFER_TOO_SMALL; + } + /* Limit frame_size to avoid 
excessive stack allocations. */ + frame_size = IMIN(frame_size, st->Fs/25*3); + /* Payloads of 1 (2 including ToC) or 0 trigger the PLC/DTX */ + if (len<=1) + { + data = NULL; + /* In that case, don't conceal more than what the ToC says */ + frame_size = IMIN(frame_size, st->frame_size); + } + if (data != NULL) + { + audiosize = st->frame_size; + mode = st->mode; + ec_dec_init(&dec,(unsigned char*)data,len); + } else { + audiosize = frame_size; + mode = st->prev_mode; + + if (mode == 0) + { + /* If we haven't got any packet yet, all we can do is return zeros */ + for (i=0;ichannels;i++) + pcm[i] = 0; + RESTORE_STACK; + return audiosize; + } + + /* Avoids trying to run the PLC on sizes other than 2.5 (CELT), 5 (CELT), + 10, or 20 (e.g. 12.5 or 30 ms). */ + if (audiosize > F20) + { + do { + int ret = opus_decode_frame(st, NULL, 0, pcm, IMIN(audiosize, F20), 0); + if (ret<0) + { + RESTORE_STACK; + return ret; + } + pcm += ret*st->channels; + audiosize -= ret; + } while (audiosize > 0); + RESTORE_STACK; + return frame_size; + } else if (audiosize < F20) + { + if (audiosize > F10) + audiosize = F10; + else if (mode != MODE_SILK_ONLY && audiosize > F5 && audiosize < F10) + audiosize = F5; + } + } + + /* In fixed-point, we can tell CELT to do the accumulation on top of the + SILK PCM buffer. This saves some stack space. */ +#ifdef FIXED_POINT + celt_accum = (mode != MODE_CELT_ONLY) && (frame_size >= F10); +#else + celt_accum = 0; +#endif + + pcm_transition_silk_size = ALLOC_NONE; + pcm_transition_celt_size = ALLOC_NONE; + if (data!=NULL && st->prev_mode > 0 && ( + (mode == MODE_CELT_ONLY && st->prev_mode != MODE_CELT_ONLY && !st->prev_redundancy) + || (mode != MODE_CELT_ONLY && st->prev_mode == MODE_CELT_ONLY) ) + ) + { + transition = 1; + /* Decide where to allocate the stack memory for pcm_transition */ + if (mode == MODE_CELT_ONLY) + pcm_transition_celt_size = F5*st->channels; + else + pcm_transition_silk_size = F5*st->channels; + } + ALLOC(pcm_transition_celt, pcm_transition_celt_size, opus_val16); + if (transition && mode == MODE_CELT_ONLY) + { + pcm_transition = pcm_transition_celt; + opus_decode_frame(st, NULL, 0, pcm_transition, IMIN(F5, audiosize), 0); + } + if (audiosize > frame_size) + { + /*fprintf(stderr, "PCM buffer too small: %d vs %d (mode = %d)\n", audiosize, frame_size, mode);*/ + RESTORE_STACK; + return OPUS_BAD_ARG; + } else { + frame_size = audiosize; + } + + /* Don't allocate any memory when in CELT-only mode */ + pcm_silk_size = (mode != MODE_CELT_ONLY && !celt_accum) ? 
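+                  /* The SILK path below never produces less than 10 ms
+                     (F10 samples) at a time, so the staging buffer must
+                     hold at least that much per channel. */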
IMAX(F10, frame_size)*st->channels : ALLOC_NONE; + ALLOC(pcm_silk, pcm_silk_size, opus_int16); + + /* SILK processing */ + if (mode != MODE_CELT_ONLY) + { + int lost_flag, decoded_samples; + opus_int16 *pcm_ptr; +#ifdef FIXED_POINT + if (celt_accum) + pcm_ptr = pcm; + else +#endif + pcm_ptr = pcm_silk; + + if (st->prev_mode==MODE_CELT_ONLY) + silk_InitDecoder( silk_dec ); + + /* The SILK PLC cannot produce frames of less than 10 ms */ + st->DecControl.payloadSize_ms = IMAX(10, 1000 * audiosize / st->Fs); + + if (data != NULL) + { + st->DecControl.nChannelsInternal = st->stream_channels; + if( mode == MODE_SILK_ONLY ) { + if( st->bandwidth == OPUS_BANDWIDTH_NARROWBAND ) { + st->DecControl.internalSampleRate = 8000; + } else if( st->bandwidth == OPUS_BANDWIDTH_MEDIUMBAND ) { + st->DecControl.internalSampleRate = 12000; + } else if( st->bandwidth == OPUS_BANDWIDTH_WIDEBAND ) { + st->DecControl.internalSampleRate = 16000; + } else { + st->DecControl.internalSampleRate = 16000; + silk_assert( 0 ); + } + } else { + /* Hybrid mode */ + st->DecControl.internalSampleRate = 16000; + } + } + + lost_flag = data == NULL ? 1 : 2 * decode_fec; + decoded_samples = 0; + do { + /* Call SILK decoder */ + int first_frame = decoded_samples == 0; + silk_ret = silk_Decode( silk_dec, &st->DecControl, + lost_flag, first_frame, &dec, pcm_ptr, &silk_frame_size, st->arch ); + if( silk_ret ) { + if (lost_flag) { + /* PLC failure should not be fatal */ + silk_frame_size = frame_size; + for (i=0;ichannels;i++) + pcm_ptr[i] = 0; + } else { + RESTORE_STACK; + return OPUS_INTERNAL_ERROR; + } + } + pcm_ptr += silk_frame_size * st->channels; + decoded_samples += silk_frame_size; + } while( decoded_samples < frame_size ); + } + + start_band = 0; + if (!decode_fec && mode != MODE_CELT_ONLY && data != NULL + && ec_tell(&dec)+17+20*(st->mode == MODE_HYBRID) <= 8*len) + { + /* Check if we have a redundant 0-8 kHz band */ + if (mode == MODE_HYBRID) + redundancy = ec_dec_bit_logp(&dec, 12); + else + redundancy = 1; + if (redundancy) + { + celt_to_silk = ec_dec_bit_logp(&dec, 1); + /* redundancy_bytes will be at least two, in the non-hybrid + case due to the ec_tell() check above */ + redundancy_bytes = mode==MODE_HYBRID ? + (opus_int32)ec_dec_uint(&dec, 256)+2 : + len-((ec_tell(&dec)+7)>>3); + len -= redundancy_bytes; + /* This is a sanity check. It should never happen for a valid + packet, so the exact behaviour is not normative. */ + if (len*8 < ec_tell(&dec)) + { + len = 0; + redundancy_bytes = 0; + redundancy = 0; + } + /* Shrink decoder because of raw bits */ + dec.storage -= redundancy_bytes; + } + } + if (mode != MODE_CELT_ONLY) + start_band = 17; + + { + int endband=21; + + switch(st->bandwidth) + { + case OPUS_BANDWIDTH_NARROWBAND: + endband = 13; + break; + case OPUS_BANDWIDTH_MEDIUMBAND: + case OPUS_BANDWIDTH_WIDEBAND: + endband = 17; + break; + case OPUS_BANDWIDTH_SUPERWIDEBAND: + endband = 19; + break; + case OPUS_BANDWIDTH_FULLBAND: + endband = 21; + break; + } + celt_decoder_ctl(celt_dec, CELT_SET_END_BAND(endband)); + celt_decoder_ctl(celt_dec, CELT_SET_CHANNELS(st->stream_channels)); + } + + if (redundancy) + { + transition = 0; + pcm_transition_silk_size=ALLOC_NONE; + } + + ALLOC(pcm_transition_silk, pcm_transition_silk_size, opus_val16); + + if (transition && mode != MODE_CELT_ONLY) + { + pcm_transition = pcm_transition_silk; + opus_decode_frame(st, NULL, 0, pcm_transition, IMIN(F5, audiosize), 0); + } + + /* Only allocation memory for redundancy if/when needed */ + redundant_audio_size = redundancy ? 
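+                  /* Mode transitions are smoothed by decoding an extra 5 ms
+                     (F5 samples per channel) of redundant CELT audio, which
+                     is cross-faded with the main output below. */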
F5*st->channels : ALLOC_NONE; + ALLOC(redundant_audio, redundant_audio_size, opus_val16); + + /* 5 ms redundant frame for CELT->SILK*/ + if (redundancy && celt_to_silk) + { + celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0)); + celt_decode_with_ec(celt_dec, data+len, redundancy_bytes, + redundant_audio, F5, NULL, 0); + celt_decoder_ctl(celt_dec, OPUS_GET_FINAL_RANGE(&redundant_rng)); + } + + /* MUST be after PLC */ + celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(start_band)); + + if (mode != MODE_SILK_ONLY) + { + int celt_frame_size = IMIN(F20, frame_size); + /* Make sure to discard any previous CELT state */ + if (mode != st->prev_mode && st->prev_mode > 0 && !st->prev_redundancy) + celt_decoder_ctl(celt_dec, OPUS_RESET_STATE); + /* Decode CELT */ + celt_ret = celt_decode_with_ec(celt_dec, decode_fec ? NULL : data, + len, pcm, celt_frame_size, &dec, celt_accum); + } else { + unsigned char silence[2] = {0xFF, 0xFF}; + if (!celt_accum) + { + for (i=0;ichannels;i++) + pcm[i] = 0; + } + /* For hybrid -> SILK transitions, we let the CELT MDCT + do a fade-out by decoding a silence frame */ + if (st->prev_mode == MODE_HYBRID && !(redundancy && celt_to_silk && st->prev_redundancy) ) + { + celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0)); + celt_decode_with_ec(celt_dec, silence, 2, pcm, F2_5, NULL, celt_accum); + } + } + + if (mode != MODE_CELT_ONLY && !celt_accum) + { +#ifdef FIXED_POINT + for (i=0;ichannels;i++) + pcm[i] = SAT16(ADD32(pcm[i], pcm_silk[i])); +#else + for (i=0;ichannels;i++) + pcm[i] = pcm[i] + (opus_val16)((1.f/32768.f)*pcm_silk[i]); +#endif + } + + { + const CELTMode *celt_mode; + celt_decoder_ctl(celt_dec, CELT_GET_MODE(&celt_mode)); + window = celt_mode->window; + } + + /* 5 ms redundant frame for SILK->CELT */ + if (redundancy && !celt_to_silk) + { + celt_decoder_ctl(celt_dec, OPUS_RESET_STATE); + celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0)); + + celt_decode_with_ec(celt_dec, data+len, redundancy_bytes, redundant_audio, F5, NULL, 0); + celt_decoder_ctl(celt_dec, OPUS_GET_FINAL_RANGE(&redundant_rng)); + smooth_fade(pcm+st->channels*(frame_size-F2_5), redundant_audio+st->channels*F2_5, + pcm+st->channels*(frame_size-F2_5), F2_5, st->channels, window, st->Fs); + } + if (redundancy && celt_to_silk) + { + for (c=0;cchannels;c++) + { + for (i=0;ichannels*i+c] = redundant_audio[st->channels*i+c]; + } + smooth_fade(redundant_audio+st->channels*F2_5, pcm+st->channels*F2_5, + pcm+st->channels*F2_5, F2_5, st->channels, window, st->Fs); + } + if (transition) + { + if (audiosize >= F5) + { + for (i=0;ichannels*F2_5;i++) + pcm[i] = pcm_transition[i]; + smooth_fade(pcm_transition+st->channels*F2_5, pcm+st->channels*F2_5, + pcm+st->channels*F2_5, F2_5, + st->channels, window, st->Fs); + } else { + /* Not enough time to do a clean transition, but we do it anyway + This will not preserve amplitude perfectly and may introduce + a bit of temporal aliasing, but it shouldn't be too bad and + that's pretty much the best we can do. 
In any case, generating this
+         transition is pretty silly in the first place */
+         smooth_fade(pcm_transition, pcm,
+               pcm, F2_5,
+               st->channels, window, st->Fs);
+      }
+   }
+
+   if(st->decode_gain)
+   {
+      opus_val32 gain;
+      gain = celt_exp2(MULT16_16_P15(QCONST16(6.48814081e-4f, 25), st->decode_gain));
+      for (i=0;i<frame_size*st->channels;i++)
+      {
+         opus_val32 x;
+         x = MULT16_32_P16(pcm[i],gain);
+         pcm[i] = SATURATE(x, 32767);
+      }
+   }
+
+   if (len <= 1)
+      st->rangeFinal = 0;
+   else
+      st->rangeFinal = dec.rng ^ redundant_rng;
+
+   st->prev_mode = mode;
+   st->prev_redundancy = redundancy && !celt_to_silk;
+
+   if (celt_ret>=0)
+   {
+      if (OPUS_CHECK_ARRAY(pcm, audiosize*st->channels))
+         OPUS_PRINT_INT(audiosize);
+   }
+
+   RESTORE_STACK;
+   return celt_ret < 0 ? celt_ret : audiosize;
+
+}
+
+int opus_decode_native(OpusDecoder *st, const unsigned char *data,
+      opus_int32 len, opus_val16 *pcm, int frame_size, int decode_fec,
+      int self_delimited, opus_int32 *packet_offset, int soft_clip)
+{
+   int i, nb_samples;
+   int count, offset;
+   unsigned char toc;
+   int packet_frame_size, packet_bandwidth, packet_mode, packet_stream_channels;
+   /* 48 x 2.5 ms = 120 ms */
+   opus_int16 size[48];
+   if (decode_fec<0 || decode_fec>1)
+      return OPUS_BAD_ARG;
+   /* For FEC/PLC, frame_size has to be a multiple of 2.5 ms */
+   if ((decode_fec || len==0 || data==NULL) && frame_size%(st->Fs/400)!=0)
+      return OPUS_BAD_ARG;
+   if (len==0 || data==NULL)
+   {
+      int pcm_count=0;
+      do {
+         int ret;
+         ret = opus_decode_frame(st, NULL, 0, pcm+pcm_count*st->channels, frame_size-pcm_count, 0);
+         if (ret<0)
+            return ret;
+         pcm_count += ret;
+      } while (pcm_count < frame_size);
+      celt_assert(pcm_count == frame_size);
+      if (OPUS_CHECK_ARRAY(pcm, pcm_count*st->channels))
+         OPUS_PRINT_INT(pcm_count);
+      st->last_packet_duration = pcm_count;
+      return pcm_count;
+   } else if (len<0)
+      return OPUS_BAD_ARG;
+
+   packet_mode = opus_packet_get_mode(data);
+   packet_bandwidth = opus_packet_get_bandwidth(data);
+   packet_frame_size = opus_packet_get_samples_per_frame(data, st->Fs);
+   packet_stream_channels = opus_packet_get_nb_channels(data);
+
+   count = opus_packet_parse_impl(data, len, self_delimited, &toc, NULL,
+                                  size, &offset, packet_offset);
+   if (count<0)
+      return count;
+
+   data += offset;
+
+   if (decode_fec)
+   {
+      int duration_copy;
+      int ret;
+      /* If no FEC can be present, run the PLC (recursive call) */
+      if (frame_size < packet_frame_size || packet_mode == MODE_CELT_ONLY || st->mode == MODE_CELT_ONLY)
+         return opus_decode_native(st, NULL, 0, pcm, frame_size, 0, 0, NULL, soft_clip);
+      /* Otherwise, run the PLC on everything except the size for which we might have FEC */
+      duration_copy = st->last_packet_duration;
+      if (frame_size-packet_frame_size!=0)
+      {
+         ret = opus_decode_native(st, NULL, 0, pcm, frame_size-packet_frame_size, 0, 0, NULL, soft_clip);
+         if (ret<0)
+         {
+            st->last_packet_duration = duration_copy;
+            return ret;
+         }
+         celt_assert(ret==frame_size-packet_frame_size);
+      }
+      /* Complete with FEC */
+      st->mode = packet_mode;
+      st->bandwidth = packet_bandwidth;
+      st->frame_size = packet_frame_size;
+      st->stream_channels = packet_stream_channels;
+      ret = opus_decode_frame(st, data, size[0], pcm+st->channels*(frame_size-packet_frame_size),
+            packet_frame_size, 1);
+      if (ret<0)
+         return ret;
+      else {
+         if (OPUS_CHECK_ARRAY(pcm, frame_size*st->channels))
+            OPUS_PRINT_INT(frame_size);
+         st->last_packet_duration = frame_size;
+         return frame_size;
+      }
+   }
+
+   if (count*packet_frame_size > frame_size)
+      return OPUS_BUFFER_TOO_SMALL;
+
+   /* Update the state as
the last step to avoid updating it on an invalid packet */ + st->mode = packet_mode; + st->bandwidth = packet_bandwidth; + st->frame_size = packet_frame_size; + st->stream_channels = packet_stream_channels; + + nb_samples=0; + for (i=0;ichannels, frame_size-nb_samples, 0); + if (ret<0) + return ret; + celt_assert(ret==packet_frame_size); + data += size[i]; + nb_samples += ret; + } + st->last_packet_duration = nb_samples; + if (OPUS_CHECK_ARRAY(pcm, nb_samples*st->channels)) + OPUS_PRINT_INT(nb_samples); +#ifndef FIXED_POINT + if (soft_clip) + opus_pcm_soft_clip(pcm, nb_samples, st->channels, st->softclip_mem); + else + st->softclip_mem[0]=st->softclip_mem[1]=0; +#endif + return nb_samples; +} + +#ifdef FIXED_POINT + +int opus_decode(OpusDecoder *st, const unsigned char *data, + opus_int32 len, opus_val16 *pcm, int frame_size, int decode_fec) +{ + if(frame_size<=0) + return OPUS_BAD_ARG; + return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0); +} + +#ifndef DISABLE_FLOAT_API +int opus_decode_float(OpusDecoder *st, const unsigned char *data, + opus_int32 len, float *pcm, int frame_size, int decode_fec) +{ + VARDECL(opus_int16, out); + int ret, i; + int nb_samples; + ALLOC_STACK; + + if(frame_size<=0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + if (data != NULL && len > 0 && !decode_fec) + { + nb_samples = opus_decoder_get_nb_samples(st, data, len); + if (nb_samples>0) + frame_size = IMIN(frame_size, nb_samples); + else + return OPUS_INVALID_PACKET; + } + ALLOC(out, frame_size*st->channels, opus_int16); + + ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 0); + if (ret > 0) + { + for (i=0;ichannels;i++) + pcm[i] = (1.f/32768.f)*(out[i]); + } + RESTORE_STACK; + return ret; +} +#endif + + +#else +int opus_decode(OpusDecoder *st, const unsigned char *data, + opus_int32 len, opus_int16 *pcm, int frame_size, int decode_fec) +{ + VARDECL(float, out); + int ret, i; + int nb_samples; + ALLOC_STACK; + + if(frame_size<=0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + + if (data != NULL && len > 0 && !decode_fec) + { + nb_samples = opus_decoder_get_nb_samples(st, data, len); + if (nb_samples>0) + frame_size = IMIN(frame_size, nb_samples); + else + return OPUS_INVALID_PACKET; + } + ALLOC(out, frame_size*st->channels, float); + + ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, 1); + if (ret > 0) + { + for (i=0;ichannels;i++) + pcm[i] = FLOAT2INT16(out[i]); + } + RESTORE_STACK; + return ret; +} + +int opus_decode_float(OpusDecoder *st, const unsigned char *data, + opus_int32 len, opus_val16 *pcm, int frame_size, int decode_fec) +{ + if(frame_size<=0) + return OPUS_BAD_ARG; + return opus_decode_native(st, data, len, pcm, frame_size, decode_fec, 0, NULL, 0); +} + +#endif + +int opus_decoder_ctl(OpusDecoder *st, int request, ...) 
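+/* A typical use of the CTL interface implemented below (illustrative
+   sketch; "dec" is assumed to be a decoder obtained from
+   opus_decoder_create()):
+
+      opus_int32 gain;
+      opus_decoder_ctl(dec, OPUS_SET_GAIN(256));
+      opus_decoder_ctl(dec, OPUS_GET_GAIN(&gain));
+      opus_decoder_ctl(dec, OPUS_RESET_STATE);
+
+   OPUS_SET_GAIN takes the gain in Q8 dB, so 256 requests +1 dB. Each
+   request macro packs its arguments, which are unpacked here with
+   va_arg(); getters take a pointer and fail with OPUS_BAD_ARG when it
+   is NULL. */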
+{ + int ret = OPUS_OK; + va_list ap; + void *silk_dec; + CELTDecoder *celt_dec; + + silk_dec = (char*)st+st->silk_dec_offset; + celt_dec = (CELTDecoder*)((char*)st+st->celt_dec_offset); + + + va_start(ap, request); + + switch (request) + { + case OPUS_GET_BANDWIDTH_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->bandwidth; + } + break; + case OPUS_GET_FINAL_RANGE_REQUEST: + { + opus_uint32 *value = va_arg(ap, opus_uint32*); + if (!value) + { + goto bad_arg; + } + *value = st->rangeFinal; + } + break; + case OPUS_RESET_STATE: + { + OPUS_CLEAR((char*)&st->OPUS_DECODER_RESET_START, + sizeof(OpusDecoder)- + ((char*)&st->OPUS_DECODER_RESET_START - (char*)st)); + + celt_decoder_ctl(celt_dec, OPUS_RESET_STATE); + silk_InitDecoder( silk_dec ); + st->stream_channels = st->channels; + st->frame_size = st->Fs/400; + } + break; + case OPUS_GET_SAMPLE_RATE_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->Fs; + } + break; + case OPUS_GET_PITCH_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + if (st->prev_mode == MODE_CELT_ONLY) + celt_decoder_ctl(celt_dec, OPUS_GET_PITCH(value)); + else + *value = st->DecControl.prevPitchLag; + } + break; + case OPUS_GET_GAIN_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->decode_gain; + } + break; + case OPUS_SET_GAIN_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<-32768 || value>32767) + { + goto bad_arg; + } + st->decode_gain = value; + } + break; + case OPUS_GET_LAST_PACKET_DURATION_REQUEST: + { + opus_uint32 *value = va_arg(ap, opus_uint32*); + if (!value) + { + goto bad_arg; + } + *value = st->last_packet_duration; + } + break; + default: + /*fprintf(stderr, "unknown opus_decoder_ctl() request: %d", request);*/ + ret = OPUS_UNIMPLEMENTED; + break; + } + + va_end(ap); + return ret; +bad_arg: + va_end(ap); + return OPUS_BAD_ARG; +} + +void opus_decoder_destroy(OpusDecoder *st) +{ + opus_free(st); +} + + +int opus_packet_get_bandwidth(const unsigned char *data) +{ + int bandwidth; + if (data[0]&0x80) + { + bandwidth = OPUS_BANDWIDTH_MEDIUMBAND + ((data[0]>>5)&0x3); + if (bandwidth == OPUS_BANDWIDTH_MEDIUMBAND) + bandwidth = OPUS_BANDWIDTH_NARROWBAND; + } else if ((data[0]&0x60) == 0x60) + { + bandwidth = (data[0]&0x10) ? OPUS_BANDWIDTH_FULLBAND : + OPUS_BANDWIDTH_SUPERWIDEBAND; + } else { + bandwidth = OPUS_BANDWIDTH_NARROWBAND + ((data[0]>>5)&0x3); + } + return bandwidth; +} + +int opus_packet_get_nb_channels(const unsigned char *data) +{ + return (data[0]&0x4) ? 
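+      /* bit 0x4 of the TOC byte is the stereo flag */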
2 : 1; +} + +int opus_packet_get_nb_frames(const unsigned char packet[], opus_int32 len) +{ + int count; + if (len<1) + return OPUS_BAD_ARG; + count = packet[0]&0x3; + if (count==0) + return 1; + else if (count!=3) + return 2; + else if (len<2) + return OPUS_INVALID_PACKET; + else + return packet[1]&0x3F; +} + +int opus_packet_get_nb_samples(const unsigned char packet[], opus_int32 len, + opus_int32 Fs) +{ + int samples; + int count = opus_packet_get_nb_frames(packet, len); + + if (count<0) + return count; + + samples = count*opus_packet_get_samples_per_frame(packet, Fs); + /* Can't have more than 120 ms */ + if (samples*25 > Fs*3) + return OPUS_INVALID_PACKET; + else + return samples; +} + +int opus_decoder_get_nb_samples(const OpusDecoder *dec, + const unsigned char packet[], opus_int32 len) +{ + return opus_packet_get_nb_samples(packet, len, dec->Fs); +} diff --git a/thirdparty/opus/opus_encoder.c b/thirdparty/opus/opus_encoder.c new file mode 100644 index 000000000000..9a516a884aa7 --- /dev/null +++ b/thirdparty/opus/opus_encoder.c @@ -0,0 +1,2536 @@ +/* Copyright (c) 2010-2011 Xiph.Org Foundation, Skype Limited + Written by Jean-Marc Valin and Koen Vos */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include "celt.h" +#include "entenc.h" +#include "modes.h" +#include "API.h" +#include "stack_alloc.h" +#include "float_cast.h" +#include "opus.h" +#include "arch.h" +#include "pitch.h" +#include "opus_private.h" +#include "os_support.h" +#include "cpu_support.h" +#include "analysis.h" +#include "mathops.h" +#include "tuning_parameters.h" +#ifdef FIXED_POINT +#include "fixed/structs_FIX.h" +#else +#include "float/structs_FLP.h" +#endif + +#define MAX_ENCODER_BUFFER 480 + +typedef struct { + opus_val32 XX, XY, YY; + opus_val16 smoothed_width; + opus_val16 max_follower; +} StereoWidthState; + +struct OpusEncoder { + int celt_enc_offset; + int silk_enc_offset; + silk_EncControlStruct silk_mode; + int application; + int channels; + int delay_compensation; + int force_channels; + int signal_type; + int user_bandwidth; + int max_bandwidth; + int user_forced_mode; + int voice_ratio; + opus_int32 Fs; + int use_vbr; + int vbr_constraint; + int variable_duration; + opus_int32 bitrate_bps; + opus_int32 user_bitrate_bps; + int lsb_depth; + int encoder_buffer; + int lfe; + int arch; +#ifndef DISABLE_FLOAT_API + TonalityAnalysisState analysis; +#endif + +#define OPUS_ENCODER_RESET_START stream_channels + int stream_channels; + opus_int16 hybrid_stereo_width_Q14; + opus_int32 variable_HP_smth2_Q15; + opus_val16 prev_HB_gain; + opus_val32 hp_mem[4]; + int mode; + int prev_mode; + int prev_channels; + int prev_framesize; + int bandwidth; + int silk_bw_switch; + /* Sampling rate (at the API level) */ + int first; + opus_val16 * energy_masking; + StereoWidthState width_mem; + opus_val16 delay_buffer[MAX_ENCODER_BUFFER*2]; +#ifndef DISABLE_FLOAT_API + int detected_bandwidth; +#endif + opus_uint32 rangeFinal; +}; + +/* Transition tables for the voice and music. First column is the + middle (memoriless) threshold. 
The second column is the hysteresis + (difference with the middle) */ +static const opus_int32 mono_voice_bandwidth_thresholds[8] = { + 11000, 1000, /* NB<->MB */ + 14000, 1000, /* MB<->WB */ + 17000, 1000, /* WB<->SWB */ + 21000, 2000, /* SWB<->FB */ +}; +static const opus_int32 mono_music_bandwidth_thresholds[8] = { + 12000, 1000, /* NB<->MB */ + 15000, 1000, /* MB<->WB */ + 18000, 2000, /* WB<->SWB */ + 22000, 2000, /* SWB<->FB */ +}; +static const opus_int32 stereo_voice_bandwidth_thresholds[8] = { + 11000, 1000, /* NB<->MB */ + 14000, 1000, /* MB<->WB */ + 21000, 2000, /* WB<->SWB */ + 28000, 2000, /* SWB<->FB */ +}; +static const opus_int32 stereo_music_bandwidth_thresholds[8] = { + 12000, 1000, /* NB<->MB */ + 18000, 2000, /* MB<->WB */ + 21000, 2000, /* WB<->SWB */ + 30000, 2000, /* SWB<->FB */ +}; +/* Threshold bit-rates for switching between mono and stereo */ +static const opus_int32 stereo_voice_threshold = 30000; +static const opus_int32 stereo_music_threshold = 30000; + +/* Threshold bit-rate for switching between SILK/hybrid and CELT-only */ +static const opus_int32 mode_thresholds[2][2] = { + /* voice */ /* music */ + { 64000, 16000}, /* mono */ + { 36000, 16000}, /* stereo */ +}; + +int opus_encoder_get_size(int channels) +{ + int silkEncSizeBytes, celtEncSizeBytes; + int ret; + if (channels<1 || channels > 2) + return 0; + ret = silk_Get_Encoder_Size( &silkEncSizeBytes ); + if (ret) + return 0; + silkEncSizeBytes = align(silkEncSizeBytes); + celtEncSizeBytes = celt_encoder_get_size(channels); + return align(sizeof(OpusEncoder))+silkEncSizeBytes+celtEncSizeBytes; +} + +int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int application) +{ + void *silk_enc; + CELTEncoder *celt_enc; + int err; + int ret, silkEncSizeBytes; + + if((Fs!=48000&&Fs!=24000&&Fs!=16000&&Fs!=12000&&Fs!=8000)||(channels!=1&&channels!=2)|| + (application != OPUS_APPLICATION_VOIP && application != OPUS_APPLICATION_AUDIO + && application != OPUS_APPLICATION_RESTRICTED_LOWDELAY)) + return OPUS_BAD_ARG; + + OPUS_CLEAR((char*)st, opus_encoder_get_size(channels)); + /* Create SILK encoder */ + ret = silk_Get_Encoder_Size( &silkEncSizeBytes ); + if (ret) + return OPUS_BAD_ARG; + silkEncSizeBytes = align(silkEncSizeBytes); + st->silk_enc_offset = align(sizeof(OpusEncoder)); + st->celt_enc_offset = st->silk_enc_offset+silkEncSizeBytes; + silk_enc = (char*)st+st->silk_enc_offset; + celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset); + + st->stream_channels = st->channels = channels; + + st->Fs = Fs; + + st->arch = opus_select_arch(); + + ret = silk_InitEncoder( silk_enc, st->arch, &st->silk_mode ); + if(ret)return OPUS_INTERNAL_ERROR; + + /* default SILK parameters */ + st->silk_mode.nChannelsAPI = channels; + st->silk_mode.nChannelsInternal = channels; + st->silk_mode.API_sampleRate = st->Fs; + st->silk_mode.maxInternalSampleRate = 16000; + st->silk_mode.minInternalSampleRate = 8000; + st->silk_mode.desiredInternalSampleRate = 16000; + st->silk_mode.payloadSize_ms = 20; + st->silk_mode.bitRate = 25000; + st->silk_mode.packetLossPercentage = 0; + st->silk_mode.complexity = 9; + st->silk_mode.useInBandFEC = 0; + st->silk_mode.useDTX = 0; + st->silk_mode.useCBR = 0; + st->silk_mode.reducedDependency = 0; + + /* Create CELT encoder */ + /* Initialize CELT encoder */ + err = celt_encoder_init(celt_enc, Fs, channels, st->arch); + if(err!=OPUS_OK)return OPUS_INTERNAL_ERROR; + + celt_encoder_ctl(celt_enc, CELT_SET_SIGNALLING(0)); + celt_encoder_ctl(celt_enc, 
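+                    /* mirror SILK's default complexity on the CELT side */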
OPUS_SET_COMPLEXITY(st->silk_mode.complexity)); + + st->use_vbr = 1; + /* Makes constrained VBR the default (safer for real-time use) */ + st->vbr_constraint = 1; + st->user_bitrate_bps = OPUS_AUTO; + st->bitrate_bps = 3000+Fs*channels; + st->application = application; + st->signal_type = OPUS_AUTO; + st->user_bandwidth = OPUS_AUTO; + st->max_bandwidth = OPUS_BANDWIDTH_FULLBAND; + st->force_channels = OPUS_AUTO; + st->user_forced_mode = OPUS_AUTO; + st->voice_ratio = -1; + st->encoder_buffer = st->Fs/100; + st->lsb_depth = 24; + st->variable_duration = OPUS_FRAMESIZE_ARG; + + /* Delay compensation of 4 ms (2.5 ms for SILK's extra look-ahead + + 1.5 ms for SILK resamplers and stereo prediction) */ + st->delay_compensation = st->Fs/250; + + st->hybrid_stereo_width_Q14 = 1 << 14; + st->prev_HB_gain = Q15ONE; + st->variable_HP_smth2_Q15 = silk_LSHIFT( silk_lin2log( VARIABLE_HP_MIN_CUTOFF_HZ ), 8 ); + st->first = 1; + st->mode = MODE_HYBRID; + st->bandwidth = OPUS_BANDWIDTH_FULLBAND; + +#ifndef DISABLE_FLOAT_API + tonality_analysis_init(&st->analysis); +#endif + + return OPUS_OK; +} + +static unsigned char gen_toc(int mode, int framerate, int bandwidth, int channels) +{ + int period; + unsigned char toc; + period = 0; + while (framerate < 400) + { + framerate <<= 1; + period++; + } + if (mode == MODE_SILK_ONLY) + { + toc = (bandwidth-OPUS_BANDWIDTH_NARROWBAND)<<5; + toc |= (period-2)<<3; + } else if (mode == MODE_CELT_ONLY) + { + int tmp = bandwidth-OPUS_BANDWIDTH_MEDIUMBAND; + if (tmp < 0) + tmp = 0; + toc = 0x80; + toc |= tmp << 5; + toc |= period<<3; + } else /* Hybrid */ + { + toc = 0x60; + toc |= (bandwidth-OPUS_BANDWIDTH_SUPERWIDEBAND)<<4; + toc |= (period-2)<<3; + } + toc |= (channels==2)<<2; + return toc; +} + +#ifndef FIXED_POINT +static void silk_biquad_float( + const opus_val16 *in, /* I: Input signal */ + const opus_int32 *B_Q28, /* I: MA coefficients [3] */ + const opus_int32 *A_Q28, /* I: AR coefficients [2] */ + opus_val32 *S, /* I/O: State vector [2] */ + opus_val16 *out, /* O: Output signal */ + const opus_int32 len, /* I: Signal length (must be even) */ + int stride +) +{ + /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */ + opus_int k; + opus_val32 vout; + opus_val32 inval; + opus_val32 A[2], B[3]; + + A[0] = (opus_val32)(A_Q28[0] * (1.f/((opus_int32)1<<28))); + A[1] = (opus_val32)(A_Q28[1] * (1.f/((opus_int32)1<<28))); + B[0] = (opus_val32)(B_Q28[0] * (1.f/((opus_int32)1<<28))); + B[1] = (opus_val32)(B_Q28[1] * (1.f/((opus_int32)1<<28))); + B[2] = (opus_val32)(B_Q28[2] * (1.f/((opus_int32)1<<28))); + + /* Negate A_Q28 values and split in two parts */ + + for( k = 0; k < len; k++ ) { + /* S[ 0 ], S[ 1 ]: Q12 */ + inval = in[ k*stride ]; + vout = S[ 0 ] + B[0]*inval; + + S[ 0 ] = S[1] - vout*A[0] + B[1]*inval; + + S[ 1 ] = - vout*A[1] + B[2]*inval + VERY_SMALL; + + /* Scale back to Q0 and saturate */ + out[ k*stride ] = vout; + } +} +#endif + +static void hp_cutoff(const opus_val16 *in, opus_int32 cutoff_Hz, opus_val16 *out, opus_val32 *hp_mem, int len, int channels, opus_int32 Fs) +{ + opus_int32 B_Q28[ 3 ], A_Q28[ 2 ]; + opus_int32 Fc_Q19, r_Q28, r_Q22; + + silk_assert( cutoff_Hz <= silk_int32_MAX / SILK_FIX_CONST( 1.5 * 3.14159 / 1000, 19 ) ); + Fc_Q19 = silk_DIV32_16( silk_SMULBB( SILK_FIX_CONST( 1.5 * 3.14159 / 1000, 19 ), cutoff_Hz ), Fs/1000 ); + silk_assert( Fc_Q19 > 0 && Fc_Q19 < 32768 ); + + r_Q28 = SILK_FIX_CONST( 1.0, 28 ) - silk_MUL( SILK_FIX_CONST( 0.92, 9 ), Fc_Q19 ); + + /* b = r * [ 1; -2; 1 ]; */ + /* a = [ 1; -2 * r * ( 1 - 0.5 * Fc^2 ); r^2 ]; 
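+      (Here 1 - 0.5*Fc^2 approximates cos(Fc), so the denominator places a
+       complex pole pair at radius r and angle Fc, while the r*[ 1; -2; 1 ]
+       numerator puts a double zero at DC: a second-order high-pass whose
+       cutoff tracks cutoff_Hz.)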
*/ + B_Q28[ 0 ] = r_Q28; + B_Q28[ 1 ] = silk_LSHIFT( -r_Q28, 1 ); + B_Q28[ 2 ] = r_Q28; + + /* -r * ( 2 - Fc * Fc ); */ + r_Q22 = silk_RSHIFT( r_Q28, 6 ); + A_Q28[ 0 ] = silk_SMULWW( r_Q22, silk_SMULWW( Fc_Q19, Fc_Q19 ) - SILK_FIX_CONST( 2.0, 22 ) ); + A_Q28[ 1 ] = silk_SMULWW( r_Q22, r_Q22 ); + +#ifdef FIXED_POINT + silk_biquad_alt( in, B_Q28, A_Q28, hp_mem, out, len, channels ); + if( channels == 2 ) { + silk_biquad_alt( in+1, B_Q28, A_Q28, hp_mem+2, out+1, len, channels ); + } +#else + silk_biquad_float( in, B_Q28, A_Q28, hp_mem, out, len, channels ); + if( channels == 2 ) { + silk_biquad_float( in+1, B_Q28, A_Q28, hp_mem+2, out+1, len, channels ); + } +#endif +} + +#ifdef FIXED_POINT +static void dc_reject(const opus_val16 *in, opus_int32 cutoff_Hz, opus_val16 *out, opus_val32 *hp_mem, int len, int channels, opus_int32 Fs) +{ + int c, i; + int shift; + + /* Approximates -round(log2(4.*cutoff_Hz/Fs)) */ + shift=celt_ilog2(Fs/(cutoff_Hz*3)); + for (c=0;cFs/400; + if (st->user_bitrate_bps==OPUS_AUTO) + return 60*st->Fs/frame_size + st->Fs*st->channels; + else if (st->user_bitrate_bps==OPUS_BITRATE_MAX) + return max_data_bytes*8*st->Fs/frame_size; + else + return st->user_bitrate_bps; +} + +#ifndef DISABLE_FLOAT_API +/* Don't use more than 60 ms for the frame size analysis */ +#define MAX_DYNAMIC_FRAMESIZE 24 +/* Estimates how much the bitrate will be boosted based on the sub-frame energy */ +static float transient_boost(const float *E, const float *E_1, int LM, int maxM) +{ + int i; + int M; + float sumE=0, sumE_1=0; + float metric; + + M = IMIN(maxM, (1<10 ? 1 : 0;*/ + /*return MAX16(0,1-exp(-.25*(metric-2.)));*/ + return MIN16(1,(float)sqrt(MAX16(0,.05f*(metric-2)))); +} + +/* Viterbi decoding trying to find the best frame size combination using look-ahead + + State numbering: + 0: unused + 1: 2.5 ms + 2: 5 ms (#1) + 3: 5 ms (#2) + 4: 10 ms (#1) + 5: 10 ms (#2) + 6: 10 ms (#3) + 7: 10 ms (#4) + 8: 20 ms (#1) + 9: 20 ms (#2) + 10: 20 ms (#3) + 11: 20 ms (#4) + 12: 20 ms (#5) + 13: 20 ms (#6) + 14: 20 ms (#7) + 15: 20 ms (#8) +*/ +static int transient_viterbi(const float *E, const float *E_1, int N, int frame_cost, int rate) +{ + int i; + float cost[MAX_DYNAMIC_FRAMESIZE][16]; + int states[MAX_DYNAMIC_FRAMESIZE][16]; + float best_cost; + int best_state; + float factor; + /* Take into account that we damp VBR in the 32 kb/s to 64 kb/s range. 
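+      (The rate argument appears to be in units of 400 bit/s, i.e. one byte
+       per 20 ms, so the 80 and 160 thresholds below correspond to 32 kb/s
+       and 64 kb/s.)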
*/ + if (rate<80) + factor=0; + else if (rate>160) + factor=1; + else + factor = (rate-80.f)/80.f; + /* Makes variable framesize less aggressive at lower bitrates, but I can't + find any valid theoretical justification for this (other than it seems + to help) */ + for (i=0;i<16;i++) + { + /* Impossible state */ + states[0][i] = -1; + cost[0][i] = 1e10; + } + for (i=0;i<4;i++) + { + cost[0][1<=0;i--) + { + /*printf("%d ", best_state);*/ + best_state = states[i][best_state]; + } + /*printf("%d\n", best_state);*/ + return best_state; +} + +static int optimize_framesize(const void *x, int len, int C, opus_int32 Fs, + int bitrate, opus_val16 tonality, float *mem, int buffering, + downmix_func downmix) +{ + int N; + int i; + float e[MAX_DYNAMIC_FRAMESIZE+4]; + float e_1[MAX_DYNAMIC_FRAMESIZE+3]; + opus_val32 memx; + int bestLM=0; + int subframe; + int pos; + int offset; + VARDECL(opus_val32, sub); + + subframe = Fs/400; + ALLOC(sub, subframe, opus_val32); + e[0]=mem[0]; + e_1[0]=1.f/(EPSILON+mem[0]); + if (buffering) + { + /* Consider the CELT delay when not in restricted-lowdelay */ + /* We assume the buffering is between 2.5 and 5 ms */ + offset = 2*subframe - buffering; + celt_assert(offset>=0 && offset <= subframe); + len -= offset; + e[1]=mem[1]; + e_1[1]=1.f/(EPSILON+mem[1]); + e[2]=mem[2]; + e_1[2]=1.f/(EPSILON+mem[2]); + pos = 3; + } else { + pos=1; + offset=0; + } + N=IMIN(len/subframe, MAX_DYNAMIC_FRAMESIZE); + /* Just silencing a warning, it's really initialized later */ + memx = 0; + for (i=0;i-1) + { + for (j=0;j-1) + { + for (j=0;j= OPUS_FRAMESIZE_2_5_MS && variable_duration <= OPUS_FRAMESIZE_60_MS) + new_size = IMIN(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS)); + else + return -1; + if (new_size>frame_size) + return -1; + if (400*new_size!=Fs && 200*new_size!=Fs && 100*new_size!=Fs && + 50*new_size!=Fs && 25*new_size!=Fs && 50*new_size!=3*Fs) + return -1; + return new_size; +} + +opus_int32 compute_frame_size(const void *analysis_pcm, int frame_size, + int variable_duration, int C, opus_int32 Fs, int bitrate_bps, + int delay_compensation, downmix_func downmix +#ifndef DISABLE_FLOAT_API + , float *subframe_mem +#endif + ) +{ +#ifndef DISABLE_FLOAT_API + if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200) + { + int LM = 3; + LM = optimize_framesize(analysis_pcm, frame_size, C, Fs, bitrate_bps, + 0, subframe_mem, delay_compensation, downmix); + while ((Fs/400<frame_size) + LM--; + frame_size = (Fs/400<XX += MULT16_32_Q15(short_alpha, xx-mem->XX); + mem->XY += MULT16_32_Q15(short_alpha, xy-mem->XY); + mem->YY += MULT16_32_Q15(short_alpha, yy-mem->YY); + mem->XX = MAX32(0, mem->XX); + mem->XY = MAX32(0, mem->XY); + mem->YY = MAX32(0, mem->YY); + if (MAX32(mem->XX, mem->YY)>QCONST16(8e-4f, 18)) + { + opus_val16 corr; + opus_val16 ldiff; + opus_val16 width; + sqrt_xx = celt_sqrt(mem->XX); + sqrt_yy = celt_sqrt(mem->YY); + qrrt_xx = celt_sqrt(sqrt_xx); + qrrt_yy = celt_sqrt(sqrt_yy); + /* Inter-channel correlation */ + mem->XY = MIN32(mem->XY, sqrt_xx*sqrt_yy); + corr = SHR32(frac_div32(mem->XY,EPSILON+MULT16_16(sqrt_xx,sqrt_yy)),16); + /* Approximate loudness difference */ + ldiff = MULT16_16(Q15ONE, ABS16(qrrt_xx-qrrt_yy))/(EPSILON+qrrt_xx+qrrt_yy); + width = MULT16_16_Q15(celt_sqrt(QCONST32(1.f,30)-MULT16_16(corr,corr)), ldiff); + /* Smoothing over one second */ + mem->smoothed_width += (width-mem->smoothed_width)/frame_rate; + /* Peak follower */ + mem->max_follower = MAX16(mem->max_follower-QCONST16(.02f,15)/frame_rate, mem->smoothed_width); + } 
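+   /* width = sqrt(1 - corr^2)*ldiff above is 0 for perfectly correlated
+      channels of equal loudness and grows as the channels decorrelate or
+      diverge in level; max_follower tracks its recent peak, decaying by
+      0.02/frame_rate per frame, so the image stays wide for a while after
+      a genuinely wide moment. */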
+ /*printf("%f %f %f %f %f ", corr/(float)Q15ONE, ldiff/(float)Q15ONE, width/(float)Q15ONE, mem->smoothed_width/(float)Q15ONE, mem->max_follower/(float)Q15ONE);*/ + return EXTRACT16(MIN32(Q15ONE, MULT16_16(20, mem->max_follower))); +} + +opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, + unsigned char *data, opus_int32 out_data_bytes, int lsb_depth, + const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2, + int analysis_channels, downmix_func downmix, int float_api) +{ + void *silk_enc; + CELTEncoder *celt_enc; + int i; + int ret=0; + opus_int32 nBytes; + ec_enc enc; + int bytes_target; + int prefill=0; + int start_band = 0; + int redundancy = 0; + int redundancy_bytes = 0; /* Number of bytes to use for redundancy frame */ + int celt_to_silk = 0; + VARDECL(opus_val16, pcm_buf); + int nb_compr_bytes; + int to_celt = 0; + opus_uint32 redundant_rng = 0; + int cutoff_Hz, hp_freq_smth1; + int voice_est; /* Probability of voice in Q7 */ + opus_int32 equiv_rate; + int delay_compensation; + int frame_rate; + opus_int32 max_rate; /* Max bitrate we're allowed to use */ + int curr_bandwidth; + opus_val16 HB_gain; + opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */ + int total_buffer; + opus_val16 stereo_width; + const CELTMode *celt_mode; +#ifndef DISABLE_FLOAT_API + AnalysisInfo analysis_info; + int analysis_read_pos_bak=-1; + int analysis_read_subframe_bak=-1; +#endif + VARDECL(opus_val16, tmp_prefill); + + ALLOC_STACK; + + max_data_bytes = IMIN(1276, out_data_bytes); + + st->rangeFinal = 0; + if ((!st->variable_duration && 400*frame_size != st->Fs && 200*frame_size != st->Fs && 100*frame_size != st->Fs && + 50*frame_size != st->Fs && 25*frame_size != st->Fs && 50*frame_size != 3*st->Fs) + || (400*frame_size < st->Fs) + || max_data_bytes<=0 + ) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + silk_enc = (char*)st+st->silk_enc_offset; + celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset); + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + delay_compensation = 0; + else + delay_compensation = st->delay_compensation; + + lsb_depth = IMIN(lsb_depth, st->lsb_depth); + + celt_encoder_ctl(celt_enc, CELT_GET_MODE(&celt_mode)); +#ifndef DISABLE_FLOAT_API + analysis_info.valid = 0; +#ifdef FIXED_POINT + if (st->silk_mode.complexity >= 10 && st->Fs==48000) +#else + if (st->silk_mode.complexity >= 7 && st->Fs==48000) +#endif + { + analysis_read_pos_bak = st->analysis.read_pos; + analysis_read_subframe_bak = st->analysis.read_subframe; + run_analysis(&st->analysis, celt_mode, analysis_pcm, analysis_size, frame_size, + c1, c2, analysis_channels, st->Fs, + lsb_depth, downmix, &analysis_info); + } +#else + (void)analysis_pcm; + (void)analysis_size; +#endif + + st->voice_ratio = -1; + +#ifndef DISABLE_FLOAT_API + st->detected_bandwidth = 0; + if (analysis_info.valid) + { + int analysis_bandwidth; + if (st->signal_type == OPUS_AUTO) + st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); + + analysis_bandwidth = analysis_info.bandwidth; + if (analysis_bandwidth<=12) + st->detected_bandwidth = OPUS_BANDWIDTH_NARROWBAND; + else if (analysis_bandwidth<=14) + st->detected_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND; + else if (analysis_bandwidth<=16) + st->detected_bandwidth = OPUS_BANDWIDTH_WIDEBAND; + else if (analysis_bandwidth<=18) + st->detected_bandwidth = OPUS_BANDWIDTH_SUPERWIDEBAND; + else + st->detected_bandwidth = OPUS_BANDWIDTH_FULLBAND; + } +#endif + + if (st->channels==2 && st->force_channels!=1) + 
stereo_width = compute_stereo_width(pcm, frame_size, st->Fs, &st->width_mem); + else + stereo_width = 0; + total_buffer = delay_compensation; + st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes); + + frame_rate = st->Fs/frame_size; + if (!st->use_vbr) + { + int cbrBytes; + /* Multiply by 3 to make sure the division is exact. */ + int frame_rate3 = 3*st->Fs/frame_size; + /* We need to make sure that "int" values always fit in 16 bits. */ + cbrBytes = IMIN( (3*st->bitrate_bps/8 + frame_rate3/2)/frame_rate3, max_data_bytes); + st->bitrate_bps = cbrBytes*(opus_int32)frame_rate3*8/3; + max_data_bytes = cbrBytes; + } + if (max_data_bytes<3 || st->bitrate_bps < 3*frame_rate*8 + || (frame_rate<50 && (max_data_bytes*frame_rate<300 || st->bitrate_bps < 2400))) + { + /*If the space is too low to do something useful, emit 'PLC' frames.*/ + int tocmode = st->mode; + int bw = st->bandwidth == 0 ? OPUS_BANDWIDTH_NARROWBAND : st->bandwidth; + if (tocmode==0) + tocmode = MODE_SILK_ONLY; + if (frame_rate>100) + tocmode = MODE_CELT_ONLY; + if (frame_rate < 50) + tocmode = MODE_SILK_ONLY; + if(tocmode==MODE_SILK_ONLY&&bw>OPUS_BANDWIDTH_WIDEBAND) + bw=OPUS_BANDWIDTH_WIDEBAND; + else if (tocmode==MODE_CELT_ONLY&&bw==OPUS_BANDWIDTH_MEDIUMBAND) + bw=OPUS_BANDWIDTH_NARROWBAND; + else if (tocmode==MODE_HYBRID&&bw<=OPUS_BANDWIDTH_SUPERWIDEBAND) + bw=OPUS_BANDWIDTH_SUPERWIDEBAND; + data[0] = gen_toc(tocmode, frame_rate, bw, st->stream_channels); + ret = 1; + if (!st->use_vbr) + { + ret = opus_packet_pad(data, ret, max_data_bytes); + if (ret == OPUS_OK) + ret = max_data_bytes; + } + RESTORE_STACK; + return ret; + } + max_rate = frame_rate*max_data_bytes*8; + + /* Equivalent 20-ms rate for mode/channel/bandwidth decisions */ + equiv_rate = st->bitrate_bps - (40*st->channels+20)*(st->Fs/frame_size - 50); + + if (st->signal_type == OPUS_SIGNAL_VOICE) + voice_est = 127; + else if (st->signal_type == OPUS_SIGNAL_MUSIC) + voice_est = 0; + else if (st->voice_ratio >= 0) + { + voice_est = st->voice_ratio*327>>8; + /* For AUDIO, never be more than 90% confident of having speech */ + if (st->application == OPUS_APPLICATION_AUDIO) + voice_est = IMIN(voice_est, 115); + } else if (st->application == OPUS_APPLICATION_VOIP) + voice_est = 115; + else + voice_est = 48; + + if (st->force_channels!=OPUS_AUTO && st->channels == 2) + { + st->stream_channels = st->force_channels; + } else { +#ifdef FUZZING + /* Random mono/stereo decision */ + if (st->channels == 2 && (rand()&0x1F)==0) + st->stream_channels = 3-st->stream_channels; +#else + /* Rate-dependent mono-stereo decision */ + if (st->channels == 2) + { + opus_int32 stereo_threshold; + stereo_threshold = stereo_music_threshold + ((voice_est*voice_est*(stereo_voice_threshold-stereo_music_threshold))>>14); + if (st->stream_channels == 2) + stereo_threshold -= 1000; + else + stereo_threshold += 1000; + st->stream_channels = (equiv_rate > stereo_threshold) ? 
2 : 1; + } else { + st->stream_channels = st->channels; + } +#endif + } + equiv_rate = st->bitrate_bps - (40*st->stream_channels+20)*(st->Fs/frame_size - 50); + + /* Mode selection depending on application and signal type */ + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + { + st->mode = MODE_CELT_ONLY; + } else if (st->user_forced_mode == OPUS_AUTO) + { +#ifdef FUZZING + /* Random mode switching */ + if ((rand()&0xF)==0) + { + if ((rand()&0x1)==0) + st->mode = MODE_CELT_ONLY; + else + st->mode = MODE_SILK_ONLY; + } else { + if (st->prev_mode==MODE_CELT_ONLY) + st->mode = MODE_CELT_ONLY; + else + st->mode = MODE_SILK_ONLY; + } +#else + opus_int32 mode_voice, mode_music; + opus_int32 threshold; + + /* Interpolate based on stereo width */ + mode_voice = (opus_int32)(MULT16_32_Q15(Q15ONE-stereo_width,mode_thresholds[0][0]) + + MULT16_32_Q15(stereo_width,mode_thresholds[1][0])); + mode_music = (opus_int32)(MULT16_32_Q15(Q15ONE-stereo_width,mode_thresholds[1][1]) + + MULT16_32_Q15(stereo_width,mode_thresholds[1][1])); + /* Interpolate based on speech/music probability */ + threshold = mode_music + ((voice_est*voice_est*(mode_voice-mode_music))>>14); + /* Bias towards SILK for VoIP because of some useful features */ + if (st->application == OPUS_APPLICATION_VOIP) + threshold += 8000; + + /*printf("%f %d\n", stereo_width/(float)Q15ONE, threshold);*/ + /* Hysteresis */ + if (st->prev_mode == MODE_CELT_ONLY) + threshold -= 4000; + else if (st->prev_mode>0) + threshold += 4000; + + st->mode = (equiv_rate >= threshold) ? MODE_CELT_ONLY: MODE_SILK_ONLY; + + /* When FEC is enabled and there's enough packet loss, use SILK */ + if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4) + st->mode = MODE_SILK_ONLY; + /* When encoding voice and DTX is enabled, set the encoder to SILK mode (at least for now) */ + if (st->silk_mode.useDTX && voice_est > 100) + st->mode = MODE_SILK_ONLY; +#endif + } else { + st->mode = st->user_forced_mode; + } + + /* Override the chosen mode to make sure we meet the requested frame size */ + if (st->mode != MODE_CELT_ONLY && frame_size < st->Fs/100) + st->mode = MODE_CELT_ONLY; + if (st->lfe) + st->mode = MODE_CELT_ONLY; + /* If max_data_bytes represents less than 8 kb/s, switch to CELT-only mode */ + if (max_data_bytes < (frame_rate > 50 ? 
12000 : 8000)*frame_size / (st->Fs * 8)) + st->mode = MODE_CELT_ONLY; + + if (st->stream_channels == 1 && st->prev_channels ==2 && st->silk_mode.toMono==0 + && st->mode != MODE_CELT_ONLY && st->prev_mode != MODE_CELT_ONLY) + { + /* Delay stereo->mono transition by two frames so that SILK can do a smooth downmix */ + st->silk_mode.toMono = 1; + st->stream_channels = 2; + } else { + st->silk_mode.toMono = 0; + } + + if (st->prev_mode > 0 && + ((st->mode != MODE_CELT_ONLY && st->prev_mode == MODE_CELT_ONLY) || + (st->mode == MODE_CELT_ONLY && st->prev_mode != MODE_CELT_ONLY))) + { + redundancy = 1; + celt_to_silk = (st->mode != MODE_CELT_ONLY); + if (!celt_to_silk) + { + /* Switch to SILK/hybrid if frame size is 10 ms or more*/ + if (frame_size >= st->Fs/100) + { + st->mode = st->prev_mode; + to_celt = 1; + } else { + redundancy=0; + } + } + } + /* For the first frame at a new SILK bandwidth */ + if (st->silk_bw_switch) + { + redundancy = 1; + celt_to_silk = 1; + st->silk_bw_switch = 0; + prefill=1; + } + + if (redundancy) + { + /* Fair share of the max size allowed */ + redundancy_bytes = IMIN(257, max_data_bytes*(opus_int32)(st->Fs/200)/(frame_size+st->Fs/200)); + /* For VBR, target the actual bitrate (subject to the limit above) */ + if (st->use_vbr) + redundancy_bytes = IMIN(redundancy_bytes, st->bitrate_bps/1600); + } + + if (st->mode != MODE_CELT_ONLY && st->prev_mode == MODE_CELT_ONLY) + { + silk_EncControlStruct dummy; + silk_InitEncoder( silk_enc, st->arch, &dummy); + prefill=1; + } + + /* Automatic (rate-dependent) bandwidth selection */ + if (st->mode == MODE_CELT_ONLY || st->first || st->silk_mode.allowBandwidthSwitch) + { + const opus_int32 *voice_bandwidth_thresholds, *music_bandwidth_thresholds; + opus_int32 bandwidth_thresholds[8]; + int bandwidth = OPUS_BANDWIDTH_FULLBAND; + opus_int32 equiv_rate2; + + equiv_rate2 = equiv_rate; + if (st->mode != MODE_CELT_ONLY) + { + /* Adjust the threshold +/- 10% depending on complexity */ + equiv_rate2 = equiv_rate2 * (45+st->silk_mode.complexity)/50; + /* CBR is less efficient by ~1 kb/s */ + if (!st->use_vbr) + equiv_rate2 -= 1000; + } + if (st->channels==2 && st->force_channels!=1) + { + voice_bandwidth_thresholds = stereo_voice_bandwidth_thresholds; + music_bandwidth_thresholds = stereo_music_bandwidth_thresholds; + } else { + voice_bandwidth_thresholds = mono_voice_bandwidth_thresholds; + music_bandwidth_thresholds = mono_music_bandwidth_thresholds; + } + /* Interpolate bandwidth thresholds depending on voice estimation */ + for (i=0;i<8;i++) + { + bandwidth_thresholds[i] = music_bandwidth_thresholds[i] + + ((voice_est*voice_est*(voice_bandwidth_thresholds[i]-music_bandwidth_thresholds[i]))>>14); + } + do { + int threshold, hysteresis; + threshold = bandwidth_thresholds[2*(bandwidth-OPUS_BANDWIDTH_MEDIUMBAND)]; + hysteresis = bandwidth_thresholds[2*(bandwidth-OPUS_BANDWIDTH_MEDIUMBAND)+1]; + if (!st->first) + { + if (st->bandwidth >= bandwidth) + threshold -= hysteresis; + else + threshold += hysteresis; + } + if (equiv_rate2 >= threshold) + break; + } while (--bandwidth>OPUS_BANDWIDTH_NARROWBAND); + st->bandwidth = bandwidth; + /* Prevents any transition to SWB/FB until the SILK layer has fully + switched to WB mode and turned the variable LP filter off */ + if (!st->first && st->mode != MODE_CELT_ONLY && !st->silk_mode.inWBmodeWithoutVariableLP && st->bandwidth > OPUS_BANDWIDTH_WIDEBAND) + st->bandwidth = OPUS_BANDWIDTH_WIDEBAND; + } + + if (st->bandwidth>st->max_bandwidth) + st->bandwidth = st->max_bandwidth; + + if 
(st->user_bandwidth != OPUS_AUTO) + st->bandwidth = st->user_bandwidth; + + /* This prevents us from using hybrid at unsafe CBR/max rates */ + if (st->mode != MODE_CELT_ONLY && max_rate < 15000) + { + st->bandwidth = IMIN(st->bandwidth, OPUS_BANDWIDTH_WIDEBAND); + } + + /* Prevents Opus from wasting bits on frequencies that are above + the Nyquist rate of the input signal */ + if (st->Fs <= 24000 && st->bandwidth > OPUS_BANDWIDTH_SUPERWIDEBAND) + st->bandwidth = OPUS_BANDWIDTH_SUPERWIDEBAND; + if (st->Fs <= 16000 && st->bandwidth > OPUS_BANDWIDTH_WIDEBAND) + st->bandwidth = OPUS_BANDWIDTH_WIDEBAND; + if (st->Fs <= 12000 && st->bandwidth > OPUS_BANDWIDTH_MEDIUMBAND) + st->bandwidth = OPUS_BANDWIDTH_MEDIUMBAND; + if (st->Fs <= 8000 && st->bandwidth > OPUS_BANDWIDTH_NARROWBAND) + st->bandwidth = OPUS_BANDWIDTH_NARROWBAND; +#ifndef DISABLE_FLOAT_API + /* Use detected bandwidth to reduce the encoded bandwidth. */ + if (st->detected_bandwidth && st->user_bandwidth == OPUS_AUTO) + { + int min_detected_bandwidth; + /* Makes bandwidth detection more conservative just in case the detector + gets it wrong when we could have coded a high bandwidth transparently. + When operating in SILK/hybrid mode, we don't go below wideband to avoid + more complicated switches that require redundancy. */ + if (equiv_rate <= 18000*st->stream_channels && st->mode == MODE_CELT_ONLY) + min_detected_bandwidth = OPUS_BANDWIDTH_NARROWBAND; + else if (equiv_rate <= 24000*st->stream_channels && st->mode == MODE_CELT_ONLY) + min_detected_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND; + else if (equiv_rate <= 30000*st->stream_channels) + min_detected_bandwidth = OPUS_BANDWIDTH_WIDEBAND; + else if (equiv_rate <= 44000*st->stream_channels) + min_detected_bandwidth = OPUS_BANDWIDTH_SUPERWIDEBAND; + else + min_detected_bandwidth = OPUS_BANDWIDTH_FULLBAND; + + st->detected_bandwidth = IMAX(st->detected_bandwidth, min_detected_bandwidth); + st->bandwidth = IMIN(st->bandwidth, st->detected_bandwidth); + } +#endif + celt_encoder_ctl(celt_enc, OPUS_SET_LSB_DEPTH(lsb_depth)); + + /* CELT mode doesn't support mediumband, use wideband instead */ + if (st->mode == MODE_CELT_ONLY && st->bandwidth == OPUS_BANDWIDTH_MEDIUMBAND) + st->bandwidth = OPUS_BANDWIDTH_WIDEBAND; + if (st->lfe) + st->bandwidth = OPUS_BANDWIDTH_NARROWBAND; + + /* Can't support higher than wideband for >20 ms frames */ + if (frame_size > st->Fs/50 && (st->mode == MODE_CELT_ONLY || st->bandwidth > OPUS_BANDWIDTH_WIDEBAND)) + { + VARDECL(unsigned char, tmp_data); + int nb_frames; + int bak_mode, bak_bandwidth, bak_channels, bak_to_mono; + VARDECL(OpusRepacketizer, rp); + opus_int32 bytes_per_frame; + opus_int32 repacketize_len; + +#ifndef DISABLE_FLOAT_API + if (analysis_read_pos_bak!= -1) + { + st->analysis.read_pos = analysis_read_pos_bak; + st->analysis.read_subframe = analysis_read_subframe_bak; + } +#endif + + nb_frames = frame_size > st->Fs/25 ? 
3 : 2;
+      bytes_per_frame = IMIN(1276,(out_data_bytes-3)/nb_frames);
+
+      ALLOC(tmp_data, nb_frames*bytes_per_frame, unsigned char);
+
+      ALLOC(rp, 1, OpusRepacketizer);
+      opus_repacketizer_init(rp);
+
+      bak_mode = st->user_forced_mode;
+      bak_bandwidth = st->user_bandwidth;
+      bak_channels = st->force_channels;
+
+      st->user_forced_mode = st->mode;
+      st->user_bandwidth = st->bandwidth;
+      st->force_channels = st->stream_channels;
+      bak_to_mono = st->silk_mode.toMono;
+
+      if (bak_to_mono)
+         st->force_channels = 1;
+      else
+         st->prev_channels = st->stream_channels;
+      for (i=0;i<nb_frames;i++)
+      {
+         int tmp_len;
+         st->silk_mode.toMono = 0;
+         /* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */
+         if (to_celt && i==nb_frames-1)
+            st->user_forced_mode = MODE_CELT_ONLY;
+         tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50,
+               tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth,
+               NULL, 0, c1, c2, analysis_channels, downmix, float_api);
+         if (tmp_len<0)
+         {
+            RESTORE_STACK;
+            return OPUS_INTERNAL_ERROR;
+         }
+         ret = opus_repacketizer_cat(rp, tmp_data+i*bytes_per_frame, tmp_len);
+         if (ret<0)
+         {
+            RESTORE_STACK;
+            return OPUS_INTERNAL_ERROR;
+         }
+      }
+      if (st->use_vbr)
+         repacketize_len = out_data_bytes;
+      else
+         repacketize_len = IMIN(3*st->bitrate_bps/(3*8*50/nb_frames), out_data_bytes);
+      ret = opus_repacketizer_out_range_impl(rp, 0, nb_frames, data, repacketize_len, 0, !st->use_vbr);
+      if (ret<0)
+      {
+         RESTORE_STACK;
+         return OPUS_INTERNAL_ERROR;
+      }
+      st->user_forced_mode = bak_mode;
+      st->user_bandwidth = bak_bandwidth;
+      st->force_channels = bak_channels;
+      st->silk_mode.toMono = bak_to_mono;
+      RESTORE_STACK;
+      return ret;
+   }
+   curr_bandwidth = st->bandwidth;
+
+   /* Chooses the appropriate mode for speech
+      *NEVER* switch to/from CELT-only mode here as this will invalidate some assumptions */
+   if (st->mode == MODE_SILK_ONLY && curr_bandwidth > OPUS_BANDWIDTH_WIDEBAND)
+      st->mode = MODE_HYBRID;
+   if (st->mode == MODE_HYBRID && curr_bandwidth <= OPUS_BANDWIDTH_WIDEBAND)
+      st->mode = MODE_SILK_ONLY;
+
+   /* printf("%d %d %d %d\n", st->bitrate_bps, st->stream_channels, st->mode, curr_bandwidth); */
+   bytes_target = IMIN(max_data_bytes-redundancy_bytes, st->bitrate_bps * frame_size / (st->Fs * 8)) - 1;
+
+   data += 1;
+
+   ec_enc_init(&enc, data, max_data_bytes-1);
+
+   ALLOC(pcm_buf, (total_buffer+frame_size)*st->channels, opus_val16);
+   OPUS_COPY(pcm_buf, &st->delay_buffer[(st->encoder_buffer-total_buffer)*st->channels], total_buffer*st->channels);
+
+   if (st->mode == MODE_CELT_ONLY)
+      hp_freq_smth1 = silk_LSHIFT( silk_lin2log( VARIABLE_HP_MIN_CUTOFF_HZ ), 8 );
+   else
+      hp_freq_smth1 = ((silk_encoder*)silk_enc)->state_Fxx[0].sCmn.variable_HP_smth1_Q15;
+
+   st->variable_HP_smth2_Q15 = silk_SMLAWB( st->variable_HP_smth2_Q15,
+         hp_freq_smth1 - st->variable_HP_smth2_Q15, SILK_FIX_CONST( VARIABLE_HP_SMTH_COEF2, 16 ) );
+
+   /* convert from log scale to Hertz */
+   cutoff_Hz = silk_log2lin( silk_RSHIFT( st->variable_HP_smth2_Q15, 8 ) );
+
+   if (st->application == OPUS_APPLICATION_VOIP)
+   {
+      hp_cutoff(pcm, cutoff_Hz, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs);
+   } else {
+      dc_reject(pcm, 3, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs);
+   }
+#ifndef FIXED_POINT
+   if (float_api)
+   {
+      opus_val32 sum;
+      sum = celt_inner_prod(&pcm_buf[total_buffer*st->channels], &pcm_buf[total_buffer*st->channels], frame_size*st->channels, st->arch);
+      /* This should filter out both NaNs and ridiculous signals that could
+         cause NaNs further down. */
+      if (!(sum < 1e9f) || celt_isnan(sum))
+      {
+         OPUS_CLEAR(&pcm_buf[total_buffer*st->channels], frame_size*st->channels);
+         st->hp_mem[0] = st->hp_mem[1] = st->hp_mem[2] = st->hp_mem[3] = 0;
+      }
+   }
+#endif
+
+
+   /* SILK processing */
+   HB_gain = Q15ONE;
+   if (st->mode != MODE_CELT_ONLY)
+   {
+      opus_int32 total_bitRate, celt_rate;
+#ifdef FIXED_POINT
+      const opus_int16 *pcm_silk;
+#else
+      VARDECL(opus_int16, pcm_silk);
+      ALLOC(pcm_silk, st->channels*frame_size, opus_int16);
+#endif
+
+      /* Distribute bits between SILK and CELT */
+      total_bitRate = 8 * bytes_target * frame_rate;
+      if( st->mode == MODE_HYBRID ) {
+         int HB_gain_ref;
+         /* Base rate for SILK */
+         st->silk_mode.bitRate = st->stream_channels * ( 5000 + 1000 * ( st->Fs == 100 * frame_size ) );
+         if( curr_bandwidth == OPUS_BANDWIDTH_SUPERWIDEBAND ) {
+            /* SILK gets 2/3 of the remaining bits */
+            st->silk_mode.bitRate += ( total_bitRate - st->silk_mode.bitRate ) * 2 / 3;
+         } else { /* FULLBAND */
+            /* SILK gets 3/5 of the remaining bits */
+            st->silk_mode.bitRate += ( total_bitRate - st->silk_mode.bitRate ) * 3 / 5;
+         }
+         /* Don't let SILK use more than 80% */
+         if( st->silk_mode.bitRate > total_bitRate * 4/5 ) {
+            st->silk_mode.bitRate = total_bitRate * 4/5;
+         }
+         if (!st->energy_masking)
+         {
+            /* Increasingly attenuate high band when it gets allocated fewer bits */
+            celt_rate = total_bitRate - st->silk_mode.bitRate;
+            HB_gain_ref = (curr_bandwidth == OPUS_BANDWIDTH_SUPERWIDEBAND) ? 3000 : 3600;
+            HB_gain = SHL32((opus_val32)celt_rate, 9) / SHR32((opus_val32)celt_rate + st->stream_channels * HB_gain_ref, 6);
+            HB_gain = HB_gain < (opus_val32)Q15ONE*6/7 ? HB_gain + Q15ONE/7 : Q15ONE;
+         }
+      } else {
+         /* SILK gets all bits */
+         st->silk_mode.bitRate = total_bitRate;
+      }
+
+      /* Surround masking for SILK */
+      if (st->energy_masking && st->use_vbr && !st->lfe)
+      {
+         opus_val32 mask_sum=0;
+         opus_val16 masking_depth;
+         opus_int32 rate_offset;
+         int c;
+         int end = 17;
+         opus_int16 srate = 16000;
+         if (st->bandwidth == OPUS_BANDWIDTH_NARROWBAND)
+         {
+            end = 13;
+            srate = 8000;
+         } else if (st->bandwidth == OPUS_BANDWIDTH_MEDIUMBAND)
+         {
+            end = 15;
+            srate = 12000;
+         }
+         for (c=0;c<st->channels;c++)
+         {
+            for(i=0;i<end;i++)
+            {
+               opus_val16 mask;
+               mask = MAX16(MIN16(st->energy_masking[21*c+i],
+                      QCONST16(.5f, DB_SHIFT)), -QCONST16(2.0f, DB_SHIFT));
+               if (mask > 0)
+                  mask = HALF16(mask);
+               mask_sum += mask;
+            }
+         }
+         /* Conservative rate reduction, we cut the masking in half */
+         masking_depth = mask_sum / end*st->channels;
+         masking_depth += QCONST16(.2f, DB_SHIFT);
+         rate_offset = (opus_int32)PSHR32(MULT16_16(srate, masking_depth), DB_SHIFT);
+         rate_offset = MAX32(rate_offset, -2*st->silk_mode.bitRate/3);
+         /* Split the rate change between the SILK and CELT part for hybrid.
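+            In SWB/FB hybrid only 3/5 of the offset is taken out of the SILK
+            share (e.g. rate_offset = -5000 bps lowers silk_mode.bitRate by
+            3000 bps); the remainder comes out of the CELT budget through
+            the bytes_target adjustment below.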
*/ + if (st->bandwidth==OPUS_BANDWIDTH_SUPERWIDEBAND || st->bandwidth==OPUS_BANDWIDTH_FULLBAND) + st->silk_mode.bitRate += 3*rate_offset/5; + else + st->silk_mode.bitRate += rate_offset; + bytes_target += rate_offset * frame_size / (8 * st->Fs); + } + + st->silk_mode.payloadSize_ms = 1000 * frame_size / st->Fs; + st->silk_mode.nChannelsAPI = st->channels; + st->silk_mode.nChannelsInternal = st->stream_channels; + if (curr_bandwidth == OPUS_BANDWIDTH_NARROWBAND) { + st->silk_mode.desiredInternalSampleRate = 8000; + } else if (curr_bandwidth == OPUS_BANDWIDTH_MEDIUMBAND) { + st->silk_mode.desiredInternalSampleRate = 12000; + } else { + silk_assert( st->mode == MODE_HYBRID || curr_bandwidth == OPUS_BANDWIDTH_WIDEBAND ); + st->silk_mode.desiredInternalSampleRate = 16000; + } + if( st->mode == MODE_HYBRID ) { + /* Don't allow bandwidth reduction at lowest bitrates in hybrid mode */ + st->silk_mode.minInternalSampleRate = 16000; + } else { + st->silk_mode.minInternalSampleRate = 8000; + } + + if (st->mode == MODE_SILK_ONLY) + { + opus_int32 effective_max_rate = max_rate; + st->silk_mode.maxInternalSampleRate = 16000; + if (frame_rate > 50) + effective_max_rate = effective_max_rate*2/3; + if (effective_max_rate < 13000) + { + st->silk_mode.maxInternalSampleRate = 12000; + st->silk_mode.desiredInternalSampleRate = IMIN(12000, st->silk_mode.desiredInternalSampleRate); + } + if (effective_max_rate < 9600) + { + st->silk_mode.maxInternalSampleRate = 8000; + st->silk_mode.desiredInternalSampleRate = IMIN(8000, st->silk_mode.desiredInternalSampleRate); + } + } else { + st->silk_mode.maxInternalSampleRate = 16000; + } + + st->silk_mode.useCBR = !st->use_vbr; + + /* Call SILK encoder for the low band */ + nBytes = IMIN(1275, max_data_bytes-1-redundancy_bytes); + + st->silk_mode.maxBits = nBytes*8; + /* Only allow up to 90% of the bits for hybrid mode*/ + if (st->mode == MODE_HYBRID) + st->silk_mode.maxBits = (opus_int32)st->silk_mode.maxBits*9/10; + if (st->silk_mode.useCBR) + { + st->silk_mode.maxBits = (st->silk_mode.bitRate * frame_size / (st->Fs * 8))*8; + /* Reduce the initial target to make it easier to reach the CBR rate */ + st->silk_mode.bitRate = IMAX(1, st->silk_mode.bitRate-2000); + } + + if (prefill) + { + opus_int32 zero=0; + int prefill_offset; + /* Use a smooth onset for the SILK prefill to avoid the encoder trying to encode + a discontinuity. The exact location is what we need to avoid leaving any "gap" + in the audio when mixing with the redundant CELT frame. 
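+         (Without the smooth onset, the step from the cleared part of the
+         delay buffer would itself look like a transient to be coded.)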
Here we can afford to
+         overwrite st->delay_buffer because the only thing that uses it before it gets
+         rewritten is tmp_prefill[] and even then only the part after the ramp really
+         gets used (rather than sent to the encoder and discarded) */
+         prefill_offset = st->channels*(st->encoder_buffer-st->delay_compensation-st->Fs/400);
+         gain_fade(st->delay_buffer+prefill_offset, st->delay_buffer+prefill_offset,
+               0, Q15ONE, celt_mode->overlap, st->Fs/400, st->channels, celt_mode->window, st->Fs);
+         OPUS_CLEAR(st->delay_buffer, prefill_offset);
+#ifdef FIXED_POINT
+         pcm_silk = st->delay_buffer;
+#else
+         for (i=0;i<st->encoder_buffer*st->channels;i++)
+            pcm_silk[i] = FLOAT2INT16(st->delay_buffer[i]);
+#endif
+         silk_Encode( silk_enc, &st->silk_mode, pcm_silk, st->encoder_buffer, NULL, &zero, 1 );
+      }
+
+#ifdef FIXED_POINT
+      pcm_silk = pcm_buf+total_buffer*st->channels;
+#else
+      for (i=0;i<frame_size*st->channels;i++)
+         pcm_silk[i] = FLOAT2INT16(pcm_buf[total_buffer*st->channels + i]);
+#endif
+      ret = silk_Encode( silk_enc, &st->silk_mode, pcm_silk, frame_size, &enc, &nBytes, 0 );
+      if( ret ) {
+         /*fprintf (stderr, "SILK encode error: %d\n", ret);*/
+         /* Handle error */
+         RESTORE_STACK;
+         return OPUS_INTERNAL_ERROR;
+      }
+      if (nBytes==0)
+      {
+         st->rangeFinal = 0;
+         data[-1] = gen_toc(st->mode, st->Fs/frame_size, curr_bandwidth, st->stream_channels);
+         RESTORE_STACK;
+         return 1;
+      }
+      /* Extract SILK internal bandwidth for signaling in first byte */
+      if( st->mode == MODE_SILK_ONLY ) {
+         if( st->silk_mode.internalSampleRate == 8000 ) {
+            curr_bandwidth = OPUS_BANDWIDTH_NARROWBAND;
+         } else if( st->silk_mode.internalSampleRate == 12000 ) {
+            curr_bandwidth = OPUS_BANDWIDTH_MEDIUMBAND;
+         } else if( st->silk_mode.internalSampleRate == 16000 ) {
+            curr_bandwidth = OPUS_BANDWIDTH_WIDEBAND;
+         }
+      } else {
+         silk_assert( st->silk_mode.internalSampleRate == 16000 );
+      }
+
+      st->silk_mode.opusCanSwitch = st->silk_mode.switchReady;
+      /* FIXME: How do we allocate the redundancy for CBR? */
+      if (st->silk_mode.opusCanSwitch)
+      {
+         redundancy = 1;
+         celt_to_silk = 0;
+         st->silk_bw_switch = 1;
+      }
+   }
+
+   /* CELT processing */
+   {
+      int endband=21;
+
+      switch(curr_bandwidth)
+      {
+         case OPUS_BANDWIDTH_NARROWBAND:
+            endband = 13;
+            break;
+         case OPUS_BANDWIDTH_MEDIUMBAND:
+         case OPUS_BANDWIDTH_WIDEBAND:
+            endband = 17;
+            break;
+         case OPUS_BANDWIDTH_SUPERWIDEBAND:
+            endband = 19;
+            break;
+         case OPUS_BANDWIDTH_FULLBAND:
+            endband = 21;
+            break;
+      }
+      celt_encoder_ctl(celt_enc, CELT_SET_END_BAND(endband));
+      celt_encoder_ctl(celt_enc, CELT_SET_CHANNELS(st->stream_channels));
+   }
+   celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(OPUS_BITRATE_MAX));
+   if (st->mode != MODE_SILK_ONLY)
+   {
+      opus_val32 celt_pred=2;
+      celt_encoder_ctl(celt_enc, OPUS_SET_VBR(0));
+      /* We may still decide to disable prediction later */
+      if (st->silk_mode.reducedDependency)
+         celt_pred = 0;
+      celt_encoder_ctl(celt_enc, CELT_SET_PREDICTION(celt_pred));
+
+      if (st->mode == MODE_HYBRID)
+      {
+         int len;
+
+         len = (ec_tell(&enc)+7)>>3;
+         if (redundancy)
+            len += st->mode == MODE_HYBRID ? 3 : 1;
+         if( st->use_vbr ) {
+            nb_compr_bytes = len + bytes_target - (st->silk_mode.bitRate * frame_size) / (8 * st->Fs);
+         } else {
+            /* check if SILK used up too much */
+            nb_compr_bytes = len > bytes_target ?
len : bytes_target; + } + } else { + if (st->use_vbr) + { + opus_int32 bonus=0; +#ifndef DISABLE_FLOAT_API + if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != st->Fs/50) + { + bonus = (60*st->stream_channels+40)*(st->Fs/frame_size-50); + if (analysis_info.valid) + bonus = (opus_int32)(bonus*(1.f+.5f*analysis_info.tonality)); + } +#endif + celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1)); + celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(st->vbr_constraint)); + celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps+bonus)); + nb_compr_bytes = max_data_bytes-1-redundancy_bytes; + } else { + nb_compr_bytes = bytes_target; + } + } + + } else { + nb_compr_bytes = 0; + } + + ALLOC(tmp_prefill, st->channels*st->Fs/400, opus_val16); + if (st->mode != MODE_SILK_ONLY && st->mode != st->prev_mode && st->prev_mode > 0) + { + OPUS_COPY(tmp_prefill, &st->delay_buffer[(st->encoder_buffer-total_buffer-st->Fs/400)*st->channels], st->channels*st->Fs/400); + } + + if (st->channels*(st->encoder_buffer-(frame_size+total_buffer)) > 0) + { + OPUS_MOVE(st->delay_buffer, &st->delay_buffer[st->channels*frame_size], st->channels*(st->encoder_buffer-frame_size-total_buffer)); + OPUS_COPY(&st->delay_buffer[st->channels*(st->encoder_buffer-frame_size-total_buffer)], + &pcm_buf[0], + (frame_size+total_buffer)*st->channels); + } else { + OPUS_COPY(st->delay_buffer, &pcm_buf[(frame_size+total_buffer-st->encoder_buffer)*st->channels], st->encoder_buffer*st->channels); + } + /* gain_fade() and stereo_fade() need to be after the buffer copying + because we don't want any of this to affect the SILK part */ + if( st->prev_HB_gain < Q15ONE || HB_gain < Q15ONE ) { + gain_fade(pcm_buf, pcm_buf, + st->prev_HB_gain, HB_gain, celt_mode->overlap, frame_size, st->channels, celt_mode->window, st->Fs); + } + st->prev_HB_gain = HB_gain; + if (st->mode != MODE_HYBRID || st->stream_channels==1) + st->silk_mode.stereoWidth_Q14 = IMIN((1<<14),2*IMAX(0,equiv_rate-30000)); + if( !st->energy_masking && st->channels == 2 ) { + /* Apply stereo width reduction (at low bitrates) */ + if( st->hybrid_stereo_width_Q14 < (1 << 14) || st->silk_mode.stereoWidth_Q14 < (1 << 14) ) { + opus_val16 g1, g2; + g1 = st->hybrid_stereo_width_Q14; + g2 = (opus_val16)(st->silk_mode.stereoWidth_Q14); +#ifdef FIXED_POINT + g1 = g1==16384 ? Q15ONE : SHL16(g1,1); + g2 = g2==16384 ? 
Q15ONE : SHL16(g2,1); +#else + g1 *= (1.f/16384); + g2 *= (1.f/16384); +#endif + stereo_fade(pcm_buf, pcm_buf, g1, g2, celt_mode->overlap, + frame_size, st->channels, celt_mode->window, st->Fs); + st->hybrid_stereo_width_Q14 = st->silk_mode.stereoWidth_Q14; + } + } + + if ( st->mode != MODE_CELT_ONLY && ec_tell(&enc)+17+20*(st->mode == MODE_HYBRID) <= 8*(max_data_bytes-1)) + { + /* For SILK mode, the redundancy is inferred from the length */ + if (st->mode == MODE_HYBRID && (redundancy || ec_tell(&enc)+37 <= 8*nb_compr_bytes)) + ec_enc_bit_logp(&enc, redundancy, 12); + if (redundancy) + { + int max_redundancy; + ec_enc_bit_logp(&enc, celt_to_silk, 1); + if (st->mode == MODE_HYBRID) + max_redundancy = (max_data_bytes-1)-nb_compr_bytes; + else + max_redundancy = (max_data_bytes-1)-((ec_tell(&enc)+7)>>3); + /* Target the same bit-rate for redundancy as for the rest, + up to a max of 257 bytes */ + redundancy_bytes = IMIN(max_redundancy, st->bitrate_bps/1600); + redundancy_bytes = IMIN(257, IMAX(2, redundancy_bytes)); + if (st->mode == MODE_HYBRID) + ec_enc_uint(&enc, redundancy_bytes-2, 256); + } + } else { + redundancy = 0; + } + + if (!redundancy) + { + st->silk_bw_switch = 0; + redundancy_bytes = 0; + } + if (st->mode != MODE_CELT_ONLY)start_band=17; + + if (st->mode == MODE_SILK_ONLY) + { + ret = (ec_tell(&enc)+7)>>3; + ec_enc_done(&enc); + nb_compr_bytes = ret; + } else { + nb_compr_bytes = IMIN((max_data_bytes-1)-redundancy_bytes, nb_compr_bytes); + ec_enc_shrink(&enc, nb_compr_bytes); + } + +#ifndef DISABLE_FLOAT_API + if (redundancy || st->mode != MODE_SILK_ONLY) + celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info)); +#endif + + /* 5 ms redundant frame for CELT->SILK */ + if (redundancy && celt_to_silk) + { + int err; + celt_encoder_ctl(celt_enc, CELT_SET_START_BAND(0)); + celt_encoder_ctl(celt_enc, OPUS_SET_VBR(0)); + err = celt_encode_with_ec(celt_enc, pcm_buf, st->Fs/200, data+nb_compr_bytes, redundancy_bytes, NULL); + if (err < 0) + { + RESTORE_STACK; + return OPUS_INTERNAL_ERROR; + } + celt_encoder_ctl(celt_enc, OPUS_GET_FINAL_RANGE(&redundant_rng)); + celt_encoder_ctl(celt_enc, OPUS_RESET_STATE); + } + + celt_encoder_ctl(celt_enc, CELT_SET_START_BAND(start_band)); + + if (st->mode != MODE_SILK_ONLY) + { + if (st->mode != st->prev_mode && st->prev_mode > 0) + { + unsigned char dummy[2]; + celt_encoder_ctl(celt_enc, OPUS_RESET_STATE); + + /* Prefilling */ + celt_encode_with_ec(celt_enc, tmp_prefill, st->Fs/400, dummy, 2, NULL); + celt_encoder_ctl(celt_enc, CELT_SET_PREDICTION(0)); + } + /* If false, we already busted the budget and we'll end up with a "PLC packet" */ + if (ec_tell(&enc) <= 8*nb_compr_bytes) + { + ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc); + if (ret < 0) + { + RESTORE_STACK; + return OPUS_INTERNAL_ERROR; + } + } + } + + /* 5 ms redundant frame for SILK->CELT */ + if (redundancy && !celt_to_silk) + { + int err; + unsigned char dummy[2]; + int N2, N4; + N2 = st->Fs/200; + N4 = st->Fs/400; + + celt_encoder_ctl(celt_enc, OPUS_RESET_STATE); + celt_encoder_ctl(celt_enc, CELT_SET_START_BAND(0)); + celt_encoder_ctl(celt_enc, CELT_SET_PREDICTION(0)); + + /* NOTE: We could speed this up slightly (at the expense of code size) by just adding a function that prefills the buffer */ + celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2-N4), N4, dummy, 2, NULL); + + err = celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2), N2, data+nb_compr_bytes, redundancy_bytes, NULL); + if (err < 0) + { + 
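+         /* At this point the main frame has already been committed to the
+            range coder, so a failure on the redundancy frame cannot be
+            patched up; give up on the whole packet instead. */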
RESTORE_STACK;
+         return OPUS_INTERNAL_ERROR;
+      }
+      celt_encoder_ctl(celt_enc, OPUS_GET_FINAL_RANGE(&redundant_rng));
+   }
+
+
+   /* Signalling the mode in the first byte */
+   data--;
+   data[0] = gen_toc(st->mode, st->Fs/frame_size, curr_bandwidth, st->stream_channels);
+
+   st->rangeFinal = enc.rng ^ redundant_rng;
+
+   if (to_celt)
+      st->prev_mode = MODE_CELT_ONLY;
+   else
+      st->prev_mode = st->mode;
+   st->prev_channels = st->stream_channels;
+   st->prev_framesize = frame_size;
+
+   st->first = 0;
+
+   /* In the unlikely case that the SILK encoder busted its target, tell
+      the decoder to call the PLC */
+   if (ec_tell(&enc) > (max_data_bytes-1)*8)
+   {
+      if (max_data_bytes < 2)
+      {
+         RESTORE_STACK;
+         return OPUS_BUFFER_TOO_SMALL;
+      }
+      data[1] = 0;
+      ret = 1;
+      st->rangeFinal = 0;
+   } else if (st->mode==MODE_SILK_ONLY&&!redundancy)
+   {
+      /*When in LPC only mode it's perfectly
+        reasonable to strip off trailing zero bytes as
+        the required range decoder behavior is to
+        fill these in. This can't be done when the MDCT
+        modes are used because the decoder needs to know
+        the actual length for allocation purposes.*/
+      while(ret>2&&data[ret]==0)ret--;
+   }
+   /* Count ToC and redundancy */
+   ret += 1+redundancy_bytes;
+   if (!st->use_vbr)
+   {
+      if (opus_packet_pad(data, ret, max_data_bytes) != OPUS_OK)
+      {
+         RESTORE_STACK;
+         return OPUS_INTERNAL_ERROR;
+      }
+      ret = max_data_bytes;
+   }
+   RESTORE_STACK;
+   return ret;
+}
+
+#ifdef FIXED_POINT
+
+#ifndef DISABLE_FLOAT_API
+opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int analysis_frame_size,
+      unsigned char *data, opus_int32 max_data_bytes)
+{
+   int i, ret;
+   int frame_size;
+   int delay_compensation;
+   VARDECL(opus_int16, in);
+   ALLOC_STACK;
+
+   if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+      delay_compensation = 0;
+   else
+      delay_compensation = st->delay_compensation;
+   frame_size = compute_frame_size(pcm, analysis_frame_size,
+         st->variable_duration, st->channels, st->Fs, st->bitrate_bps,
+         delay_compensation, downmix_float, st->analysis.subframe_mem);
+
+   ALLOC(in, frame_size*st->channels, opus_int16);
+
+   for (i=0;i<frame_size*st->channels;i++)
+      in[i] = FLOAT2INT16(pcm[i]);
+   ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16,
+         pcm, analysis_frame_size, 0, -2, st->channels, downmix_float, 1);
+   RESTORE_STACK;
+   return ret;
+}
+#endif
+
+opus_int32 opus_encode(OpusEncoder *st, const opus_int16 *pcm, int analysis_frame_size,
+      unsigned char *data, opus_int32 out_data_bytes)
+{
+   int frame_size;
+   int delay_compensation;
+   if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+      delay_compensation = 0;
+   else
+      delay_compensation = st->delay_compensation;
+   frame_size = compute_frame_size(pcm, analysis_frame_size,
+         st->variable_duration, st->channels, st->Fs, st->bitrate_bps,
+         delay_compensation, downmix_int
+#ifndef DISABLE_FLOAT_API
+         , st->analysis.subframe_mem
+#endif
+         );
+   return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 16,
+         pcm, analysis_frame_size, 0, -2, st->channels, downmix_int, 0);
+}
+
+#else
+opus_int32 opus_encode(OpusEncoder *st, const opus_int16 *pcm, int analysis_frame_size,
+      unsigned char *data, opus_int32 max_data_bytes)
+{
+   int i, ret;
+   int frame_size;
+   int delay_compensation;
+   VARDECL(float, in);
+   ALLOC_STACK;
+
+   if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+      delay_compensation = 0;
+   else
+      delay_compensation = st->delay_compensation;
+   frame_size = compute_frame_size(pcm, analysis_frame_size,
+         st->variable_duration, st->channels, st->Fs, st->bitrate_bps,
+         delay_compensation, downmix_int, st->analysis.subframe_mem);
+
+   ALLOC(in, frame_size*st->channels, float);
+
+   for (i=0;i<frame_size*st->channels;i++)
+      in[i] = (1.0f/32768)*pcm[i];
+   ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16,
+         pcm, analysis_frame_size, 0, -2, st->channels, downmix_int, 0);
+   RESTORE_STACK;
+   return ret;
+}
+opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int analysis_frame_size,
+      unsigned char *data, opus_int32 out_data_bytes)
+{
+   int frame_size;
+   int delay_compensation;
+   if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+      delay_compensation = 0;
+   else
+      delay_compensation = st->delay_compensation;
+   frame_size = compute_frame_size(pcm, analysis_frame_size,
+         st->variable_duration, st->channels, st->Fs, st->bitrate_bps,
+         delay_compensation, downmix_float, st->analysis.subframe_mem);
+   return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24,
+         pcm, analysis_frame_size, 0, -2, st->channels, downmix_float, 1);
+}
+#endif
+
+
+int opus_encoder_ctl(OpusEncoder *st, int request, ...)
+{
+   int ret;
+   CELTEncoder *celt_enc;
+   va_list ap;
+
+   ret = OPUS_OK;
+   va_start(ap, request);
+
+   celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset);
+
+   switch (request)
+   {
+      case OPUS_SET_APPLICATION_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (   (value != OPUS_APPLICATION_VOIP && value != OPUS_APPLICATION_AUDIO
+              && value != OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+             || (!st->first && st->application != value))
+         {
+            ret = OPUS_BAD_ARG;
+            break;
+         }
+         st->application = value;
+      }
+      break;
+      case OPUS_GET_APPLICATION_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (!value)
+         {
+            goto bad_arg;
+         }
+         *value = st->application;
+      }
+      break;
+      case OPUS_SET_BITRATE_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value != OPUS_AUTO && value != OPUS_BITRATE_MAX)
+         {
+            if (value <= 0)
+               goto bad_arg;
+            else if (value <= 500)
+               value = 500;
+            else if (value > (opus_int32)300000*st->channels)
+               value = (opus_int32)300000*st->channels;
+         }
+         st->user_bitrate_bps = value;
+      }
+      break;
+      case OPUS_GET_BITRATE_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (!value)
+         {
+            goto bad_arg;
+         }
+         *value = user_bitrate_to_bitrate(st, st->prev_framesize, 1276);
+      }
+      break;
+      case OPUS_SET_FORCE_CHANNELS_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if((value<1 || value>st->channels) && value != OPUS_AUTO)
+         {
+            goto bad_arg;
+         }
+         st->force_channels = value;
+      }
+      break;
+      case OPUS_GET_FORCE_CHANNELS_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (!value)
+         {
+            goto bad_arg;
+         }
+         *value = st->force_channels;
+      }
+      break;
+      case OPUS_SET_MAX_BANDWIDTH_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value < OPUS_BANDWIDTH_NARROWBAND || value > OPUS_BANDWIDTH_FULLBAND)
+         {
+            goto bad_arg;
+         }
+         st->max_bandwidth = value;
+         if (st->max_bandwidth == OPUS_BANDWIDTH_NARROWBAND) {
+            st->silk_mode.maxInternalSampleRate = 8000;
+         } else if (st->max_bandwidth == OPUS_BANDWIDTH_MEDIUMBAND) {
+            st->silk_mode.maxInternalSampleRate = 12000;
+         } else {
+            st->silk_mode.maxInternalSampleRate = 16000;
+         }
+      }
+      break;
+      case OPUS_GET_MAX_BANDWIDTH_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (!value)
+         {
+            goto bad_arg;
+         }
+         *value = st->max_bandwidth;
+      }
+      break;
+      case OPUS_SET_BANDWIDTH_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if ((value < OPUS_BANDWIDTH_NARROWBAND || value >
OPUS_BANDWIDTH_FULLBAND) && value != OPUS_AUTO) + { + goto bad_arg; + } + st->user_bandwidth = value; + if (st->user_bandwidth == OPUS_BANDWIDTH_NARROWBAND) { + st->silk_mode.maxInternalSampleRate = 8000; + } else if (st->user_bandwidth == OPUS_BANDWIDTH_MEDIUMBAND) { + st->silk_mode.maxInternalSampleRate = 12000; + } else { + st->silk_mode.maxInternalSampleRate = 16000; + } + } + break; + case OPUS_GET_BANDWIDTH_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->bandwidth; + } + break; + case OPUS_SET_DTX_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>1) + { + goto bad_arg; + } + st->silk_mode.useDTX = value; + } + break; + case OPUS_GET_DTX_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->silk_mode.useDTX; + } + break; + case OPUS_SET_COMPLEXITY_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>10) + { + goto bad_arg; + } + st->silk_mode.complexity = value; + celt_encoder_ctl(celt_enc, OPUS_SET_COMPLEXITY(value)); + } + break; + case OPUS_GET_COMPLEXITY_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->silk_mode.complexity; + } + break; + case OPUS_SET_INBAND_FEC_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>1) + { + goto bad_arg; + } + st->silk_mode.useInBandFEC = value; + } + break; + case OPUS_GET_INBAND_FEC_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->silk_mode.useInBandFEC; + } + break; + case OPUS_SET_PACKET_LOSS_PERC_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value < 0 || value > 100) + { + goto bad_arg; + } + st->silk_mode.packetLossPercentage = value; + celt_encoder_ctl(celt_enc, OPUS_SET_PACKET_LOSS_PERC(value)); + } + break; + case OPUS_GET_PACKET_LOSS_PERC_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->silk_mode.packetLossPercentage; + } + break; + case OPUS_SET_VBR_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>1) + { + goto bad_arg; + } + st->use_vbr = value; + st->silk_mode.useCBR = 1-value; + } + break; + case OPUS_GET_VBR_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->use_vbr; + } + break; + case OPUS_SET_VOICE_RATIO_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<-1 || value>100) + { + goto bad_arg; + } + st->voice_ratio = value; + } + break; + case OPUS_GET_VOICE_RATIO_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->voice_ratio; + } + break; + case OPUS_SET_VBR_CONSTRAINT_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value<0 || value>1) + { + goto bad_arg; + } + st->vbr_constraint = value; + } + break; + case OPUS_GET_VBR_CONSTRAINT_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->vbr_constraint; + } + break; + case OPUS_SET_SIGNAL_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if(value!=OPUS_AUTO && value!=OPUS_SIGNAL_VOICE && value!=OPUS_SIGNAL_MUSIC) + { + goto bad_arg; + } + st->signal_type = value; + } + break; + case OPUS_GET_SIGNAL_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->signal_type; 
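+         /* Every request goes through this one varargs entry point; callers
+            use the wrapper macros from opus_defines.h, e.g. (assuming an
+            OpusEncoder *enc created earlier):
+               opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
+               opus_int32 sig;
+               opus_encoder_ctl(enc, OPUS_GET_SIGNAL(&sig));
+         */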
+ } + break; + case OPUS_GET_LOOKAHEAD_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->Fs/400; + if (st->application != OPUS_APPLICATION_RESTRICTED_LOWDELAY) + *value += st->delay_compensation; + } + break; + case OPUS_GET_SAMPLE_RATE_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->Fs; + } + break; + case OPUS_GET_FINAL_RANGE_REQUEST: + { + opus_uint32 *value = va_arg(ap, opus_uint32*); + if (!value) + { + goto bad_arg; + } + *value = st->rangeFinal; + } + break; + case OPUS_SET_LSB_DEPTH_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<8 || value>24) + { + goto bad_arg; + } + st->lsb_depth=value; + } + break; + case OPUS_GET_LSB_DEPTH_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->lsb_depth; + } + break; + case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value != OPUS_FRAMESIZE_ARG && value != OPUS_FRAMESIZE_2_5_MS && + value != OPUS_FRAMESIZE_5_MS && value != OPUS_FRAMESIZE_10_MS && + value != OPUS_FRAMESIZE_20_MS && value != OPUS_FRAMESIZE_40_MS && + value != OPUS_FRAMESIZE_60_MS && value != OPUS_FRAMESIZE_VARIABLE) + { + goto bad_arg; + } + st->variable_duration = value; + celt_encoder_ctl(celt_enc, OPUS_SET_EXPERT_FRAME_DURATION(value)); + } + break; + case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->variable_duration; + } + break; + case OPUS_SET_PREDICTION_DISABLED_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value > 1 || value < 0) + goto bad_arg; + st->silk_mode.reducedDependency = value; + } + break; + case OPUS_GET_PREDICTION_DISABLED_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + goto bad_arg; + *value = st->silk_mode.reducedDependency; + } + break; + case OPUS_RESET_STATE: + { + void *silk_enc; + silk_EncControlStruct dummy; + char *start; + silk_enc = (char*)st+st->silk_enc_offset; +#ifndef DISABLE_FLOAT_API + tonality_analysis_reset(&st->analysis); +#endif + + start = (char*)&st->OPUS_ENCODER_RESET_START; + OPUS_CLEAR(start, sizeof(OpusEncoder) - (start - (char*)st)); + + celt_encoder_ctl(celt_enc, OPUS_RESET_STATE); + silk_InitEncoder( silk_enc, st->arch, &dummy ); + st->stream_channels = st->channels; + st->hybrid_stereo_width_Q14 = 1 << 14; + st->prev_HB_gain = Q15ONE; + st->first = 1; + st->mode = MODE_HYBRID; + st->bandwidth = OPUS_BANDWIDTH_FULLBAND; + st->variable_HP_smth2_Q15 = silk_LSHIFT( silk_lin2log( VARIABLE_HP_MIN_CUTOFF_HZ ), 8 ); + } + break; + case OPUS_SET_FORCE_MODE_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if ((value < MODE_SILK_ONLY || value > MODE_CELT_ONLY) && value != OPUS_AUTO) + { + goto bad_arg; + } + st->user_forced_mode = value; + } + break; + case OPUS_SET_LFE_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->lfe = value; + ret = celt_encoder_ctl(celt_enc, OPUS_SET_LFE(value)); + } + break; + case OPUS_SET_ENERGY_MASK_REQUEST: + { + opus_val16 *value = va_arg(ap, opus_val16*); + st->energy_masking = value; + ret = celt_encoder_ctl(celt_enc, OPUS_SET_ENERGY_MASK(value)); + } + break; + + case CELT_GET_MODE_REQUEST: + { + const CELTMode ** value = va_arg(ap, const CELTMode**); + if (!value) + { + goto bad_arg; + } + ret = celt_encoder_ctl(celt_enc, CELT_GET_MODE(value)); + } + break; + default: + /* 
fprintf(stderr, "unknown opus_encoder_ctl() request: %d", request);*/ + ret = OPUS_UNIMPLEMENTED; + break; + } + va_end(ap); + return ret; +bad_arg: + va_end(ap); + return OPUS_BAD_ARG; +} + +void opus_encoder_destroy(OpusEncoder *st) +{ + opus_free(st); +} diff --git a/thirdparty/opus/opus_multistream.c b/thirdparty/opus/opus_multistream.c new file mode 100644 index 000000000000..09c3639b7f0d --- /dev/null +++ b/thirdparty/opus/opus_multistream.c @@ -0,0 +1,92 @@ +/* Copyright (c) 2011 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opus_multistream.h" +#include "opus.h" +#include "opus_private.h" +#include "stack_alloc.h" +#include +#include "float_cast.h" +#include "os_support.h" + + +int validate_layout(const ChannelLayout *layout) +{ + int i, max_channel; + + max_channel = layout->nb_streams+layout->nb_coupled_streams; + if (max_channel>255) + return 0; + for (i=0;inb_channels;i++) + { + if (layout->mapping[i] >= max_channel && layout->mapping[i] != 255) + return 0; + } + return 1; +} + + +int get_left_channel(const ChannelLayout *layout, int stream_id, int prev) +{ + int i; + i = (prev<0) ? 0 : prev+1; + for (;inb_channels;i++) + { + if (layout->mapping[i]==stream_id*2) + return i; + } + return -1; +} + +int get_right_channel(const ChannelLayout *layout, int stream_id, int prev) +{ + int i; + i = (prev<0) ? 0 : prev+1; + for (;inb_channels;i++) + { + if (layout->mapping[i]==stream_id*2+1) + return i; + } + return -1; +} + +int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev) +{ + int i; + i = (prev<0) ? 
0 : prev+1; + for (;inb_channels;i++) + { + if (layout->mapping[i]==stream_id+layout->nb_coupled_streams) + return i; + } + return -1; +} + diff --git a/thirdparty/opus/opus_multistream_decoder.c b/thirdparty/opus/opus_multistream_decoder.c new file mode 100644 index 000000000000..b95eaa6eac12 --- /dev/null +++ b/thirdparty/opus/opus_multistream_decoder.c @@ -0,0 +1,537 @@ +/* Copyright (c) 2011 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opus_multistream.h" +#include "opus.h" +#include "opus_private.h" +#include "stack_alloc.h" +#include +#include "float_cast.h" +#include "os_support.h" + +struct OpusMSDecoder { + ChannelLayout layout; + /* Decoder states go here */ +}; + + + + +/* DECODER */ + +opus_int32 opus_multistream_decoder_get_size(int nb_streams, int nb_coupled_streams) +{ + int coupled_size; + int mono_size; + + if(nb_streams<1||nb_coupled_streams>nb_streams||nb_coupled_streams<0)return 0; + coupled_size = opus_decoder_get_size(2); + mono_size = opus_decoder_get_size(1); + return align(sizeof(OpusMSDecoder)) + + nb_coupled_streams * align(coupled_size) + + (nb_streams-nb_coupled_streams) * align(mono_size); +} + +int opus_multistream_decoder_init( + OpusMSDecoder *st, + opus_int32 Fs, + int channels, + int streams, + int coupled_streams, + const unsigned char *mapping +) +{ + int coupled_size; + int mono_size; + int i, ret; + char *ptr; + + if ((channels>255) || (channels<1) || (coupled_streams>streams) || + (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams)) + return OPUS_BAD_ARG; + + st->layout.nb_channels = channels; + st->layout.nb_streams = streams; + st->layout.nb_coupled_streams = coupled_streams; + + for (i=0;ilayout.nb_channels;i++) + st->layout.mapping[i] = mapping[i]; + if (!validate_layout(&st->layout)) + return OPUS_BAD_ARG; + + ptr = (char*)st + align(sizeof(OpusMSDecoder)); + coupled_size = opus_decoder_get_size(2); + mono_size = opus_decoder_get_size(1); + + for (i=0;ilayout.nb_coupled_streams;i++) + { + ret=opus_decoder_init((OpusDecoder*)ptr, Fs, 2); + if(ret!=OPUS_OK)return ret; + ptr += align(coupled_size); + } + for (;ilayout.nb_streams;i++) + { + ret=opus_decoder_init((OpusDecoder*)ptr, Fs, 1); + 
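+      /* Decoder states are packed back to back after the OpusMSDecoder
+         header: coupled (stereo) decoders first, then the mono ones, each
+         align()ed, mirroring opus_multistream_decoder_get_size() above. */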
if(ret!=OPUS_OK)return ret; + ptr += align(mono_size); + } + return OPUS_OK; +} + + +OpusMSDecoder *opus_multistream_decoder_create( + opus_int32 Fs, + int channels, + int streams, + int coupled_streams, + const unsigned char *mapping, + int *error +) +{ + int ret; + OpusMSDecoder *st; + if ((channels>255) || (channels<1) || (coupled_streams>streams) || + (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams)) + { + if (error) + *error = OPUS_BAD_ARG; + return NULL; + } + st = (OpusMSDecoder *)opus_alloc(opus_multistream_decoder_get_size(streams, coupled_streams)); + if (st==NULL) + { + if (error) + *error = OPUS_ALLOC_FAIL; + return NULL; + } + ret = opus_multistream_decoder_init(st, Fs, channels, streams, coupled_streams, mapping); + if (error) + *error = ret; + if (ret != OPUS_OK) + { + opus_free(st); + st = NULL; + } + return st; +} + +typedef void (*opus_copy_channel_out_func)( + void *dst, + int dst_stride, + int dst_channel, + const opus_val16 *src, + int src_stride, + int frame_size +); + +static int opus_multistream_packet_validate(const unsigned char *data, + opus_int32 len, int nb_streams, opus_int32 Fs) +{ + int s; + int count; + unsigned char toc; + opus_int16 size[48]; + int samples=0; + opus_int32 packet_offset; + + for (s=0;slayout.nb_streams-1) + { + RESTORE_STACK; + return OPUS_INVALID_PACKET; + } + if (!do_plc) + { + int ret = opus_multistream_packet_validate(data, len, st->layout.nb_streams, Fs); + if (ret < 0) + { + RESTORE_STACK; + return ret; + } else if (ret > frame_size) + { + RESTORE_STACK; + return OPUS_BUFFER_TOO_SMALL; + } + } + for (s=0;slayout.nb_streams;s++) + { + OpusDecoder *dec; + int packet_offset, ret; + + dec = (OpusDecoder*)ptr; + ptr += (s < st->layout.nb_coupled_streams) ? align(coupled_size) : align(mono_size); + + if (!do_plc && len<=0) + { + RESTORE_STACK; + return OPUS_INTERNAL_ERROR; + } + packet_offset = 0; + ret = opus_decode_native(dec, data, len, buf, frame_size, decode_fec, s!=st->layout.nb_streams-1, &packet_offset, soft_clip); + data += packet_offset; + len -= packet_offset; + if (ret <= 0) + { + RESTORE_STACK; + return ret; + } + frame_size = ret; + if (s < st->layout.nb_coupled_streams) + { + int chan, prev; + prev = -1; + /* Copy "left" audio to the channel(s) where it belongs */ + while ( (chan = get_left_channel(&st->layout, s, prev)) != -1) + { + (*copy_channel_out)(pcm, st->layout.nb_channels, chan, + buf, 2, frame_size); + prev = chan; + } + prev = -1; + /* Copy "right" audio to the channel(s) where it belongs */ + while ( (chan = get_right_channel(&st->layout, s, prev)) != -1) + { + (*copy_channel_out)(pcm, st->layout.nb_channels, chan, + buf+1, 2, frame_size); + prev = chan; + } + } else { + int chan, prev; + prev = -1; + /* Copy audio to the channel(s) where it belongs */ + while ( (chan = get_mono_channel(&st->layout, s, prev)) != -1) + { + (*copy_channel_out)(pcm, st->layout.nb_channels, chan, + buf, 1, frame_size); + prev = chan; + } + } + } + /* Handle muted channels */ + for (c=0;clayout.nb_channels;c++) + { + if (st->layout.mapping[c] == 255) + { + (*copy_channel_out)(pcm, st->layout.nb_channels, c, + NULL, 0, frame_size); + } + } + RESTORE_STACK; + return frame_size; +} + +#if !defined(DISABLE_FLOAT_API) +static void opus_copy_channel_out_float( + void *dst, + int dst_stride, + int dst_channel, + const opus_val16 *src, + int src_stride, + int frame_size +) +{ + float *float_dst; + opus_int32 i; + float_dst = (float*)dst; + if (src != NULL) + { + for (i=0;ilayout.nb_streams;s++) + { + OpusDecoder *dec; + dec 
= (OpusDecoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + ret = opus_decoder_ctl(dec, request, &tmp); + if (ret != OPUS_OK) break; + *value ^= tmp; + } + } + break; + case OPUS_RESET_STATE: + { + int s; + for (s=0;slayout.nb_streams;s++) + { + OpusDecoder *dec; + + dec = (OpusDecoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + ret = opus_decoder_ctl(dec, OPUS_RESET_STATE); + if (ret != OPUS_OK) + break; + } + } + break; + case OPUS_MULTISTREAM_GET_DECODER_STATE_REQUEST: + { + int s; + opus_int32 stream_id; + OpusDecoder **value; + stream_id = va_arg(ap, opus_int32); + if (stream_id<0 || stream_id >= st->layout.nb_streams) + ret = OPUS_BAD_ARG; + value = va_arg(ap, OpusDecoder**); + if (!value) + { + goto bad_arg; + } + for (s=0;slayout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + } + *value = (OpusDecoder*)ptr; + } + break; + case OPUS_SET_GAIN_REQUEST: + { + int s; + /* This works for int32 params */ + opus_int32 value = va_arg(ap, opus_int32); + for (s=0;slayout.nb_streams;s++) + { + OpusDecoder *dec; + + dec = (OpusDecoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + ret = opus_decoder_ctl(dec, request, value); + if (ret != OPUS_OK) + break; + } + } + break; + default: + ret = OPUS_UNIMPLEMENTED; + break; + } + + va_end(ap); + return ret; +bad_arg: + va_end(ap); + return OPUS_BAD_ARG; +} + + +void opus_multistream_decoder_destroy(OpusMSDecoder *st) +{ + opus_free(st); +} diff --git a/thirdparty/opus/opus_multistream_encoder.c b/thirdparty/opus/opus_multistream_encoder.c new file mode 100644 index 000000000000..1698223a16a4 --- /dev/null +++ b/thirdparty/opus/opus_multistream_encoder.c @@ -0,0 +1,1351 @@ +/* Copyright (c) 2011 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "opus_multistream.h"
+#include "opus.h"
+#include "opus_private.h"
+#include "stack_alloc.h"
+#include <stdarg.h>
+#include "float_cast.h"
+#include "os_support.h"
+#include "mathops.h"
+#include "mdct.h"
+#include "modes.h"
+#include "bands.h"
+#include "quant_bands.h"
+#include "pitch.h"
+
+typedef struct {
+   int nb_streams;
+   int nb_coupled_streams;
+   unsigned char mapping[8];
+} VorbisLayout;
+
+/* Index is nb_channel-1*/
+static const VorbisLayout vorbis_mappings[8] = {
+      {1, 0, {0}},                      /* 1: mono */
+      {1, 1, {0, 1}},                   /* 2: stereo */
+      {2, 1, {0, 2, 1}},                /* 3: 1-d surround */
+      {2, 2, {0, 1, 2, 3}},             /* 4: quadraphonic surround */
+      {3, 2, {0, 4, 1, 2, 3}},          /* 5: 5-channel surround */
+      {4, 2, {0, 4, 1, 2, 3, 5}},       /* 6: 5.1 surround */
+      {4, 3, {0, 4, 1, 2, 3, 5, 6}},    /* 7: 6.1 surround */
+      {5, 3, {0, 6, 1, 2, 3, 4, 5, 7}}, /* 8: 7.1 surround */
+};
+
+typedef void (*opus_copy_channel_in_func)(
+  opus_val16 *dst,
+  int dst_stride,
+  const void *src,
+  int src_stride,
+  int src_channel,
+  int frame_size
+);
+
+typedef enum {
+  MAPPING_TYPE_NONE,
+  MAPPING_TYPE_SURROUND
+#ifdef ENABLE_EXPERIMENTAL_AMBISONICS
+  , /* Do not include comma at end of enumerator list */
+  MAPPING_TYPE_AMBISONICS
+#endif
+} MappingType;
+
+struct OpusMSEncoder {
+   ChannelLayout layout;
+   int arch;
+   int lfe_stream;
+   int application;
+   int variable_duration;
+   MappingType mapping_type;
+   opus_int32 bitrate_bps;
+   float subframe_mem[3];
+   /* Encoder states go here */
+   /* then opus_val32 window_mem[channels*120]; */
+   /* then opus_val32 preemph_mem[channels]; */
+};
+
+static opus_val32 *ms_get_preemph_mem(OpusMSEncoder *st)
+{
+   int s;
+   char *ptr;
+   int coupled_size, mono_size;
+
+   coupled_size = opus_encoder_get_size(2);
+   mono_size = opus_encoder_get_size(1);
+   ptr = (char*)st + align(sizeof(OpusMSEncoder));
+   for (s=0;s<st->layout.nb_streams;s++)
+   {
+      if (s < st->layout.nb_coupled_streams)
+         ptr += align(coupled_size);
+      else
+         ptr += align(mono_size);
+   }
+   /* void* cast avoids clang -Wcast-align warning */
+   return (opus_val32*)(void*)(ptr+st->layout.nb_channels*120*sizeof(opus_val32));
+}
+
+static opus_val32 *ms_get_window_mem(OpusMSEncoder *st)
+{
+   int s;
+   char *ptr;
+   int coupled_size, mono_size;
+
+   coupled_size = opus_encoder_get_size(2);
+   mono_size = opus_encoder_get_size(1);
+   ptr = (char*)st + align(sizeof(OpusMSEncoder));
+   for (s=0;s<st->layout.nb_streams;s++)
+   {
+      if (s < st->layout.nb_coupled_streams)
+         ptr += align(coupled_size);
+      else
+         ptr += align(mono_size);
+   }
+   /* void* cast avoids clang -Wcast-align warning */
+   return (opus_val32*)(void*)ptr;
+}
+
+static int validate_encoder_layout(const ChannelLayout *layout)
+{
+   int s;
+   for (s=0;s<layout->nb_streams;s++)
+   {
+      if (s < layout->nb_coupled_streams)
+      {
+         if (get_left_channel(layout, s, -1)==-1)
+            return 0;
+         if (get_right_channel(layout, s, -1)==-1)
+            return 0;
+      } else {
+         if (get_mono_channel(layout, s, -1)==-1)
+            return 0;
+      }
+   }
+   return 1;
+}
+
+static void channel_pos(int channels, int pos[8])
+{
+   /* Position in the mix: 0 don't mix, 1: left, 2: center, 3:right */
+   if (channels==4)
+   {
+      pos[0]=1;
+      pos[1]=3;
+      pos[2]=1;
+      pos[3]=3;
+   } else if (channels==3||channels==5||channels==6)
+   {
+      pos[0]=1;
+      pos[1]=2;
+      pos[2]=3;
+      pos[3]=1;
+      pos[4]=3;
+      pos[5]=0;
+   } else if (channels==7)
+   {
+      pos[0]=1;
+      pos[1]=2;
+      pos[2]=3;
+      pos[3]=1;
+      pos[4]=3;
+      pos[5]=2;
+      pos[6]=0;
+   } else if (channels==8)
+   {
+      pos[0]=1;
+      pos[1]=2;
+      pos[2]=3;
+      pos[3]=1;
+      pos[4]=3;
+      pos[5]=1;
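+      /* (Annotation, not from upstream: in the Vorbis 8-channel order the
+         last channel is the LFE; it is assigned position 0 below, i.e. it is
+         excluded from the left/center/right downmix used for masking.) */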
pos[6]=3; + pos[7]=0; + } +} + +#if 1 +/* Computes a rough approximation of log2(2^a + 2^b) */ +static opus_val16 logSum(opus_val16 a, opus_val16 b) +{ + opus_val16 max; + opus_val32 diff; + opus_val16 frac; + static const opus_val16 diff_table[17] = { + QCONST16(0.5000000f, DB_SHIFT), QCONST16(0.2924813f, DB_SHIFT), QCONST16(0.1609640f, DB_SHIFT), QCONST16(0.0849625f, DB_SHIFT), + QCONST16(0.0437314f, DB_SHIFT), QCONST16(0.0221971f, DB_SHIFT), QCONST16(0.0111839f, DB_SHIFT), QCONST16(0.0056136f, DB_SHIFT), + QCONST16(0.0028123f, DB_SHIFT) + }; + int low; + if (a>b) + { + max = a; + diff = SUB32(EXTEND32(a),EXTEND32(b)); + } else { + max = b; + diff = SUB32(EXTEND32(b),EXTEND32(a)); + } + if (!(diff < QCONST16(8.f, DB_SHIFT))) /* inverted to catch NaNs */ + return max; +#ifdef FIXED_POINT + low = SHR32(diff, DB_SHIFT-1); + frac = SHL16(diff - SHL16(low, DB_SHIFT-1), 16-DB_SHIFT); +#else + low = (int)floor(2*diff); + frac = 2*diff - low; +#endif + return max + diff_table[low] + MULT16_16_Q15(frac, SUB16(diff_table[low+1], diff_table[low])); +} +#else +opus_val16 logSum(opus_val16 a, opus_val16 b) +{ + return log2(pow(4, a)+ pow(4, b))/2; +} +#endif + +void surround_analysis(const CELTMode *celt_mode, const void *pcm, opus_val16 *bandLogE, opus_val32 *mem, opus_val32 *preemph_mem, + int len, int overlap, int channels, int rate, opus_copy_channel_in_func copy_channel_in, int arch +) +{ + int c; + int i; + int LM; + int pos[8] = {0}; + int upsample; + int frame_size; + opus_val16 channel_offset; + opus_val32 bandE[21]; + opus_val16 maskLogE[3][21]; + VARDECL(opus_val32, in); + VARDECL(opus_val16, x); + VARDECL(opus_val32, freq); + SAVE_STACK; + + upsample = resampling_factor(rate); + frame_size = len*upsample; + + /* LM = log2(frame_size / 120) */ + for (LM=0;LMmaxLM;LM++) + if (celt_mode->shortMdctSize<preemph, preemph_mem+c, 0); +#ifndef FIXED_POINT + { + opus_val32 sum; + sum = celt_inner_prod(in, in, frame_size+overlap, 0); + /* This should filter out both NaNs and ridiculous signals that could + cause NaNs further down. 
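+         (Annotation: the test is written as !(sum < 1e18f) rather than
+         sum >= 1e18f so that a NaN sum, for which every ordered comparison
+         is false, also takes the reset path, in addition to the explicit
+         celt_isnan() check.)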
*/ + if (!(sum < 1e18f) || celt_isnan(sum)) + { + OPUS_CLEAR(in, frame_size+overlap); + preemph_mem[c] = 0; + } + } +#endif + clt_mdct_forward(&celt_mode->mdct, in, freq, celt_mode->window, + overlap, celt_mode->maxLM-LM, 1, arch); + if (upsample != 1) + { + int bound = len; + for (i=0;i=0;i--) + bandLogE[21*c+i] = MAX16(bandLogE[21*c+i], bandLogE[21*c+i+1]-QCONST16(2.f, DB_SHIFT)); + if (pos[c]==1) + { + for (i=0;i<21;i++) + maskLogE[0][i] = logSum(maskLogE[0][i], bandLogE[21*c+i]); + } else if (pos[c]==3) + { + for (i=0;i<21;i++) + maskLogE[2][i] = logSum(maskLogE[2][i], bandLogE[21*c+i]); + } else if (pos[c]==2) + { + for (i=0;i<21;i++) + { + maskLogE[0][i] = logSum(maskLogE[0][i], bandLogE[21*c+i]-QCONST16(.5f, DB_SHIFT)); + maskLogE[2][i] = logSum(maskLogE[2][i], bandLogE[21*c+i]-QCONST16(.5f, DB_SHIFT)); + } + } +#if 0 + for (i=0;i<21;i++) + printf("%f ", bandLogE[21*c+i]); + float sum=0; + for (i=0;i<21;i++) + sum += bandLogE[21*c+i]; + printf("%f ", sum/21); +#endif + OPUS_COPY(mem+c*overlap, in+frame_size, overlap); + } + for (i=0;i<21;i++) + maskLogE[1][i] = MIN32(maskLogE[0][i],maskLogE[2][i]); + channel_offset = HALF16(celt_log2(QCONST32(2.f,14)/(channels-1))); + for (c=0;c<3;c++) + for (i=0;i<21;i++) + maskLogE[c][i] += channel_offset; +#if 0 + for (c=0;c<3;c++) + { + for (i=0;i<21;i++) + printf("%f ", maskLogE[c][i]); + } +#endif + for (c=0;cnb_streams||nb_coupled_streams<0)return 0; + coupled_size = opus_encoder_get_size(2); + mono_size = opus_encoder_get_size(1); + return align(sizeof(OpusMSEncoder)) + + nb_coupled_streams * align(coupled_size) + + (nb_streams-nb_coupled_streams) * align(mono_size); +} + +opus_int32 opus_multistream_surround_encoder_get_size(int channels, int mapping_family) +{ + int nb_streams; + int nb_coupled_streams; + opus_int32 size; + + if (mapping_family==0) + { + if (channels==1) + { + nb_streams=1; + nb_coupled_streams=0; + } else if (channels==2) + { + nb_streams=1; + nb_coupled_streams=1; + } else + return 0; + } else if (mapping_family==1 && channels<=8 && channels>=1) + { + nb_streams=vorbis_mappings[channels-1].nb_streams; + nb_coupled_streams=vorbis_mappings[channels-1].nb_coupled_streams; + } else if (mapping_family==255) + { + nb_streams=channels; + nb_coupled_streams=0; +#ifdef ENABLE_EXPERIMENTAL_AMBISONICS + } else if (mapping_family==254) + { + nb_streams=channels; + nb_coupled_streams=0; +#endif + } else + return 0; + size = opus_multistream_encoder_get_size(nb_streams, nb_coupled_streams); + if (channels>2) + { + size += channels*(120*sizeof(opus_val32) + sizeof(opus_val32)); + } + return size; +} + +static int opus_multistream_encoder_init_impl( + OpusMSEncoder *st, + opus_int32 Fs, + int channels, + int streams, + int coupled_streams, + const unsigned char *mapping, + int application, + MappingType mapping_type +) +{ + int coupled_size; + int mono_size; + int i, ret; + char *ptr; + + if ((channels>255) || (channels<1) || (coupled_streams>streams) || + (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams)) + return OPUS_BAD_ARG; + + st->arch = opus_select_arch(); + st->layout.nb_channels = channels; + st->layout.nb_streams = streams; + st->layout.nb_coupled_streams = coupled_streams; + st->subframe_mem[0]=st->subframe_mem[1]=st->subframe_mem[2]=0; + if (mapping_type != MAPPING_TYPE_SURROUND) + st->lfe_stream = -1; + st->bitrate_bps = OPUS_AUTO; + st->application = application; + st->variable_duration = OPUS_FRAMESIZE_ARG; + for (i=0;ilayout.nb_channels;i++) + st->layout.mapping[i] = mapping[i]; + if 
(!validate_layout(&st->layout) || !validate_encoder_layout(&st->layout)) + return OPUS_BAD_ARG; + ptr = (char*)st + align(sizeof(OpusMSEncoder)); + coupled_size = opus_encoder_get_size(2); + mono_size = opus_encoder_get_size(1); + + for (i=0;ilayout.nb_coupled_streams;i++) + { + ret = opus_encoder_init((OpusEncoder*)ptr, Fs, 2, application); + if(ret!=OPUS_OK)return ret; + if (i==st->lfe_stream) + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_SET_LFE(1)); + ptr += align(coupled_size); + } + for (;ilayout.nb_streams;i++) + { + ret = opus_encoder_init((OpusEncoder*)ptr, Fs, 1, application); + if (i==st->lfe_stream) + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_SET_LFE(1)); + if(ret!=OPUS_OK)return ret; + ptr += align(mono_size); + } + if (mapping_type == MAPPING_TYPE_SURROUND) + { + OPUS_CLEAR(ms_get_preemph_mem(st), channels); + OPUS_CLEAR(ms_get_window_mem(st), channels*120); + } + st->mapping_type = mapping_type; + return OPUS_OK; +} + +int opus_multistream_encoder_init( + OpusMSEncoder *st, + opus_int32 Fs, + int channels, + int streams, + int coupled_streams, + const unsigned char *mapping, + int application +) +{ + return opus_multistream_encoder_init_impl(st, Fs, channels, streams, + coupled_streams, mapping, + application, MAPPING_TYPE_NONE); +} + +int opus_multistream_surround_encoder_init( + OpusMSEncoder *st, + opus_int32 Fs, + int channels, + int mapping_family, + int *streams, + int *coupled_streams, + unsigned char *mapping, + int application +) +{ + MappingType mapping_type; + + if ((channels>255) || (channels<1)) + return OPUS_BAD_ARG; + st->lfe_stream = -1; + if (mapping_family==0) + { + if (channels==1) + { + *streams=1; + *coupled_streams=0; + mapping[0]=0; + } else if (channels==2) + { + *streams=1; + *coupled_streams=1; + mapping[0]=0; + mapping[1]=1; + } else + return OPUS_UNIMPLEMENTED; + } else if (mapping_family==1 && channels<=8 && channels>=1) + { + int i; + *streams=vorbis_mappings[channels-1].nb_streams; + *coupled_streams=vorbis_mappings[channels-1].nb_coupled_streams; + for (i=0;i=6) + st->lfe_stream = *streams-1; + } else if (mapping_family==255) + { + int i; + *streams=channels; + *coupled_streams=0; + for(i=0;i2 && mapping_family==1) { + mapping_type = MAPPING_TYPE_SURROUND; +#ifdef ENABLE_EXPERIMENTAL_AMBISONICS + } else if (mapping_family==254) + { + mapping_type = MAPPING_TYPE_AMBISONICS; +#endif + } else + { + mapping_type = MAPPING_TYPE_NONE; + } + return opus_multistream_encoder_init_impl(st, Fs, channels, *streams, + *coupled_streams, mapping, + application, mapping_type); +} + +OpusMSEncoder *opus_multistream_encoder_create( + opus_int32 Fs, + int channels, + int streams, + int coupled_streams, + const unsigned char *mapping, + int application, + int *error +) +{ + int ret; + OpusMSEncoder *st; + if ((channels>255) || (channels<1) || (coupled_streams>streams) || + (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams)) + { + if (error) + *error = OPUS_BAD_ARG; + return NULL; + } + st = (OpusMSEncoder *)opus_alloc(opus_multistream_encoder_get_size(streams, coupled_streams)); + if (st==NULL) + { + if (error) + *error = OPUS_ALLOC_FAIL; + return NULL; + } + ret = opus_multistream_encoder_init(st, Fs, channels, streams, coupled_streams, mapping, application); + if (ret != OPUS_OK) + { + opus_free(st); + st = NULL; + } + if (error) + *error = ret; + return st; +} + +OpusMSEncoder *opus_multistream_surround_encoder_create( + opus_int32 Fs, + int channels, + int mapping_family, + int *streams, + int *coupled_streams, + unsigned char *mapping, + int 
application,
+  int *error
+)
+{
+   int ret;
+   opus_int32 size;
+   OpusMSEncoder *st;
+   if ((channels>255) || (channels<1))
+   {
+      if (error)
+         *error = OPUS_BAD_ARG;
+      return NULL;
+   }
+   size = opus_multistream_surround_encoder_get_size(channels, mapping_family);
+   if (!size)
+   {
+      if (error)
+         *error = OPUS_UNIMPLEMENTED;
+      return NULL;
+   }
+   st = (OpusMSEncoder *)opus_alloc(size);
+   if (st==NULL)
+   {
+      if (error)
+         *error = OPUS_ALLOC_FAIL;
+      return NULL;
+   }
+   ret = opus_multistream_surround_encoder_init(st, Fs, channels, mapping_family, streams, coupled_streams, mapping, application);
+   if (ret != OPUS_OK)
+   {
+      opus_free(st);
+      st = NULL;
+   }
+   if (error)
+      *error = ret;
+   return st;
+}
+
+static void surround_rate_allocation(
+      OpusMSEncoder *st,
+      opus_int32 *rate,
+      int frame_size,
+      opus_int32 Fs
+      )
+{
+   int i;
+   opus_int32 channel_rate;
+   int stream_offset;
+   int lfe_offset;
+   int coupled_ratio; /* Q8 */
+   int lfe_ratio; /* Q8 */
+
+   if (st->bitrate_bps > st->layout.nb_channels*40000)
+      stream_offset = 20000;
+   else
+      stream_offset = st->bitrate_bps/st->layout.nb_channels/2;
+   stream_offset += 60*(Fs/frame_size-50);
+   /* We start by giving each stream (coupled or uncoupled) the same bitrate.
+      This models the main saving of coupled channels over uncoupled. */
+   /* The LFE stream is an exception to the above and gets fewer bits. */
+   lfe_offset = 3500 + 60*(Fs/frame_size-50);
+   /* Coupled streams get twice the mono rate after the first 20 kb/s. */
+   coupled_ratio = 512;
+   /* Should depend on the bitrate, for now we assume LFE gets 1/8 the bits of mono */
+   lfe_ratio = 32;
+
+   /* Compute bitrate allocation between streams */
+   if (st->bitrate_bps==OPUS_AUTO)
+   {
+      channel_rate = Fs+60*Fs/frame_size;
+   } else if (st->bitrate_bps==OPUS_BITRATE_MAX)
+   {
+      channel_rate = 300000;
+   } else {
+      int nb_lfe;
+      int nb_uncoupled;
+      int nb_coupled;
+      int total;
+      nb_lfe = (st->lfe_stream!=-1);
+      nb_coupled = st->layout.nb_coupled_streams;
+      nb_uncoupled = st->layout.nb_streams-nb_coupled-nb_lfe;
+      total = (nb_uncoupled<<8) /* mono */
+            + coupled_ratio*nb_coupled /* stereo */
+            + nb_lfe*lfe_ratio;
+      channel_rate = 256*(st->bitrate_bps-lfe_offset*nb_lfe-stream_offset*(nb_coupled+nb_uncoupled))/total;
+   }
+#ifndef FIXED_POINT
+   if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != Fs/50)
+   {
+      opus_int32 bonus;
+      bonus = 60*(Fs/frame_size-50);
+      channel_rate += bonus;
+   }
+#endif
+
+   for (i=0;i<st->layout.nb_streams;i++)
+   {
+      if (i<st->layout.nb_coupled_streams)
+         rate[i] = stream_offset+(channel_rate*coupled_ratio>>8);
+      else if (i!=st->lfe_stream)
+         rate[i] = stream_offset+channel_rate;
+      else
+         rate[i] = lfe_offset+(channel_rate*lfe_ratio>>8);
+   }
+}
+
+#ifdef ENABLE_EXPERIMENTAL_AMBISONICS
+static void ambisonics_rate_allocation(
+      OpusMSEncoder *st,
+      opus_int32 *rate,
+      int frame_size,
+      opus_int32 Fs
+      )
+{
+   int i;
+   int non_mono_rate;
+   int total_rate;
+
+   /* The mono channel gets (rate_ratio_num / rate_ratio_den) times as many bits
+    * as all other channels */
+   const int rate_ratio_num = 4;
+   const int rate_ratio_den = 3;
+   const int num_channels = st->layout.nb_streams;
+
+   if (st->bitrate_bps==OPUS_AUTO)
+   {
+      total_rate = num_channels * (20000 + st->layout.nb_streams*(Fs+60*Fs/frame_size));
+   } else if (st->bitrate_bps==OPUS_BITRATE_MAX)
+   {
+      total_rate = num_channels * 320000;
+   } else {
+      total_rate = st->bitrate_bps;
+   }
+
+   /* Let y be the non-mono rate and let p, q be integers such that the mono
+    * channel rate is (p/q) * y.
+    * Also let T be the total bitrate to allocate.
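+    * (Annotation: in this file p/q is rate_ratio_num/rate_ratio_den = 4/3 and
+    * n is num_channels. As a worked instance of the solution below, n = 4 and
+    * T = 130000 b/s give y = 130000*3/(3*4 - 3 + 4) = 30000 b/s per non-mono
+    * channel, leaving 130000 - 3*30000 = 40000 b/s = (4/3)*y for channel 0.)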
Then + * (n - 1) y + (p/q) y = T + * y = (T q) / (qn - q + p) + */ + non_mono_rate = + total_rate * rate_ratio_den + / (rate_ratio_den*num_channels + rate_ratio_num - rate_ratio_den); + +#ifndef FIXED_POINT + if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != Fs/50) + { + opus_int32 bonus = 60*(Fs/frame_size-50); + non_mono_rate += bonus; + } +#endif + + rate[0] = total_rate - (num_channels - 1) * non_mono_rate; + for (i=1;ilayout.nb_streams;i++) + { + rate[i] = non_mono_rate; + } +} +#endif /* ENABLE_EXPERIMENTAL_AMBISONICS */ + +static opus_int32 rate_allocation( + OpusMSEncoder *st, + opus_int32 *rate, + int frame_size + ) +{ + int i; + opus_int32 rate_sum=0; + opus_int32 Fs; + char *ptr; + + ptr = (char*)st + align(sizeof(OpusMSEncoder)); + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_SAMPLE_RATE(&Fs)); + +#ifdef ENABLE_EXPERIMENTAL_AMBISONICS + if (st->mapping_type == MAPPING_TYPE_AMBISONICS) { + ambisonics_rate_allocation(st, rate, frame_size, Fs); + } else +#endif + { + surround_rate_allocation(st, rate, frame_size, Fs); + } + + for (i=0;ilayout.nb_streams;i++) + { + rate[i] = IMAX(rate[i], 500); + rate_sum += rate[i]; + } + return rate_sum; +} + +/* Max size in case the encoder decides to return three frames */ +#define MS_FRAME_TMP (3*1275+7) +static int opus_multistream_encode_native +( + OpusMSEncoder *st, + opus_copy_channel_in_func copy_channel_in, + const void *pcm, + int analysis_frame_size, + unsigned char *data, + opus_int32 max_data_bytes, + int lsb_depth, + downmix_func downmix, + int float_api +) +{ + opus_int32 Fs; + int coupled_size; + int mono_size; + int s; + char *ptr; + int tot_size; + VARDECL(opus_val16, buf); + VARDECL(opus_val16, bandSMR); + unsigned char tmp_data[MS_FRAME_TMP]; + OpusRepacketizer rp; + opus_int32 vbr; + const CELTMode *celt_mode; + opus_int32 bitrates[256]; + opus_val16 bandLogE[42]; + opus_val32 *mem = NULL; + opus_val32 *preemph_mem=NULL; + int frame_size; + opus_int32 rate_sum; + opus_int32 smallest_packet; + ALLOC_STACK; + + if (st->mapping_type == MAPPING_TYPE_SURROUND) + { + preemph_mem = ms_get_preemph_mem(st); + mem = ms_get_window_mem(st); + } + + ptr = (char*)st + align(sizeof(OpusMSEncoder)); + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_SAMPLE_RATE(&Fs)); + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_VBR(&vbr)); + opus_encoder_ctl((OpusEncoder*)ptr, CELT_GET_MODE(&celt_mode)); + + { + opus_int32 delay_compensation; + int channels; + + channels = st->layout.nb_streams + st->layout.nb_coupled_streams; + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_LOOKAHEAD(&delay_compensation)); + delay_compensation -= Fs/400; + frame_size = compute_frame_size(pcm, analysis_frame_size, + st->variable_duration, channels, Fs, st->bitrate_bps, + delay_compensation, downmix +#ifndef DISABLE_FLOAT_API + , st->subframe_mem +#endif + ); + } + + if (400*frame_size < Fs) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + /* Validate frame_size before using it to allocate stack space. + This mirrors the checks in opus_encode[_float](). */ + if (400*frame_size != Fs && 200*frame_size != Fs && + 100*frame_size != Fs && 50*frame_size != Fs && + 25*frame_size != Fs && 50*frame_size != 3*Fs) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + + /* Smallest packet the encoder can produce. 
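+      (Annotation: each stream contributes at least a 1-byte TOC, and every
+      stream except the last also needs a 1-byte self-delimiting length, so
+      e.g. 3 streams need at least 3 + 2 = 5 bytes, i.e. nb_streams*2 - 1.)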
*/ + smallest_packet = st->layout.nb_streams*2-1; + if (max_data_bytes < smallest_packet) + { + RESTORE_STACK; + return OPUS_BUFFER_TOO_SMALL; + } + ALLOC(buf, 2*frame_size, opus_val16); + coupled_size = opus_encoder_get_size(2); + mono_size = opus_encoder_get_size(1); + + ALLOC(bandSMR, 21*st->layout.nb_channels, opus_val16); + if (st->mapping_type == MAPPING_TYPE_SURROUND) + { + surround_analysis(celt_mode, pcm, bandSMR, mem, preemph_mem, frame_size, 120, st->layout.nb_channels, Fs, copy_channel_in, st->arch); + } + + /* Compute bitrate allocation between streams (this could be a lot better) */ + rate_sum = rate_allocation(st, bitrates, frame_size); + + if (!vbr) + { + if (st->bitrate_bps == OPUS_AUTO) + { + max_data_bytes = IMIN(max_data_bytes, 3*rate_sum/(3*8*Fs/frame_size)); + } else if (st->bitrate_bps != OPUS_BITRATE_MAX) + { + max_data_bytes = IMIN(max_data_bytes, IMAX(smallest_packet, + 3*st->bitrate_bps/(3*8*Fs/frame_size))); + } + } + ptr = (char*)st + align(sizeof(OpusMSEncoder)); + for (s=0;slayout.nb_streams;s++) + { + OpusEncoder *enc; + enc = (OpusEncoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + opus_encoder_ctl(enc, OPUS_SET_BITRATE(bitrates[s])); + if (st->mapping_type == MAPPING_TYPE_SURROUND) + { + opus_int32 equiv_rate; + equiv_rate = st->bitrate_bps; + if (frame_size*50 < Fs) + equiv_rate -= 60*(Fs/frame_size - 50)*st->layout.nb_channels; + if (equiv_rate > 10000*st->layout.nb_channels) + opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(OPUS_BANDWIDTH_FULLBAND)); + else if (equiv_rate > 7000*st->layout.nb_channels) + opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(OPUS_BANDWIDTH_SUPERWIDEBAND)); + else if (equiv_rate > 5000*st->layout.nb_channels) + opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(OPUS_BANDWIDTH_WIDEBAND)); + else + opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(OPUS_BANDWIDTH_NARROWBAND)); + if (s < st->layout.nb_coupled_streams) + { + /* To preserve the spatial image, force stereo CELT on coupled streams */ + opus_encoder_ctl(enc, OPUS_SET_FORCE_MODE(MODE_CELT_ONLY)); + opus_encoder_ctl(enc, OPUS_SET_FORCE_CHANNELS(2)); + } + } +#ifdef ENABLE_EXPERIMENTAL_AMBISONICS + else if (st->mapping_type == MAPPING_TYPE_AMBISONICS) { + opus_encoder_ctl(enc, OPUS_SET_FORCE_MODE(MODE_CELT_ONLY)); + } +#endif + } + + ptr = (char*)st + align(sizeof(OpusMSEncoder)); + /* Counting ToC */ + tot_size = 0; + for (s=0;slayout.nb_streams;s++) + { + OpusEncoder *enc; + int len; + int curr_max; + int c1, c2; + int ret; + + opus_repacketizer_init(&rp); + enc = (OpusEncoder*)ptr; + if (s < st->layout.nb_coupled_streams) + { + int i; + int left, right; + left = get_left_channel(&st->layout, s, -1); + right = get_right_channel(&st->layout, s, -1); + (*copy_channel_in)(buf, 2, + pcm, st->layout.nb_channels, left, frame_size); + (*copy_channel_in)(buf+1, 2, + pcm, st->layout.nb_channels, right, frame_size); + ptr += align(coupled_size); + if (st->mapping_type == MAPPING_TYPE_SURROUND) + { + for (i=0;i<21;i++) + { + bandLogE[i] = bandSMR[21*left+i]; + bandLogE[21+i] = bandSMR[21*right+i]; + } + } + c1 = left; + c2 = right; + } else { + int i; + int chan = get_mono_channel(&st->layout, s, -1); + (*copy_channel_in)(buf, 1, + pcm, st->layout.nb_channels, chan, frame_size); + ptr += align(mono_size); + if (st->mapping_type == MAPPING_TYPE_SURROUND) + { + for (i=0;i<21;i++) + bandLogE[i] = bandSMR[21*chan+i]; + } + c1 = chan; + c2 = -1; + } + if (st->mapping_type == MAPPING_TYPE_SURROUND) + opus_encoder_ctl(enc, 
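+         /* bandLogE carries the per-band SMR masks computed by
+            surround_analysis(): 21 values for a mono stream, 42 (left then
+            right) for a coupled pair. */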
OPUS_SET_ENERGY_MASK(bandLogE)); + /* number of bytes left (+Toc) */ + curr_max = max_data_bytes - tot_size; + /* Reserve one byte for the last stream and two for the others */ + curr_max -= IMAX(0,2*(st->layout.nb_streams-s-1)-1); + curr_max = IMIN(curr_max,MS_FRAME_TMP); + /* Repacketizer will add one or two bytes for self-delimited frames */ + if (s != st->layout.nb_streams-1) curr_max -= curr_max>253 ? 2 : 1; + if (!vbr && s == st->layout.nb_streams-1) + opus_encoder_ctl(enc, OPUS_SET_BITRATE(curr_max*(8*Fs/frame_size))); + len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth, + pcm, analysis_frame_size, c1, c2, st->layout.nb_channels, downmix, float_api); + if (len<0) + { + RESTORE_STACK; + return len; + } + /* We need to use the repacketizer to add the self-delimiting lengths + while taking into account the fact that the encoder can now return + more than one frame at a time (e.g. 60 ms CELT-only) */ + ret = opus_repacketizer_cat(&rp, tmp_data, len); + /* If the opus_repacketizer_cat() fails, then something's seriously wrong + with the encoder. */ + if (ret != OPUS_OK) + { + RESTORE_STACK; + return OPUS_INTERNAL_ERROR; + } + len = opus_repacketizer_out_range_impl(&rp, 0, opus_repacketizer_get_nb_frames(&rp), + data, max_data_bytes-tot_size, s != st->layout.nb_streams-1, !vbr && s == st->layout.nb_streams-1); + data += len; + tot_size += len; + } + /*printf("\n");*/ + RESTORE_STACK; + return tot_size; +} + +#if !defined(DISABLE_FLOAT_API) +static void opus_copy_channel_in_float( + opus_val16 *dst, + int dst_stride, + const void *src, + int src_stride, + int src_channel, + int frame_size +) +{ + const float *float_src; + opus_int32 i; + float_src = (const float *)src; + for (i=0;ibitrate_bps = value; + } + break; + case OPUS_GET_BITRATE_REQUEST: + { + int s; + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = 0; + for (s=0;slayout.nb_streams;s++) + { + opus_int32 rate; + OpusEncoder *enc; + enc = (OpusEncoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + opus_encoder_ctl(enc, request, &rate); + *value += rate; + } + } + break; + case OPUS_GET_LSB_DEPTH_REQUEST: + case OPUS_GET_VBR_REQUEST: + case OPUS_GET_APPLICATION_REQUEST: + case OPUS_GET_BANDWIDTH_REQUEST: + case OPUS_GET_COMPLEXITY_REQUEST: + case OPUS_GET_PACKET_LOSS_PERC_REQUEST: + case OPUS_GET_DTX_REQUEST: + case OPUS_GET_VOICE_RATIO_REQUEST: + case OPUS_GET_VBR_CONSTRAINT_REQUEST: + case OPUS_GET_SIGNAL_REQUEST: + case OPUS_GET_LOOKAHEAD_REQUEST: + case OPUS_GET_SAMPLE_RATE_REQUEST: + case OPUS_GET_INBAND_FEC_REQUEST: + case OPUS_GET_FORCE_CHANNELS_REQUEST: + case OPUS_GET_PREDICTION_DISABLED_REQUEST: + { + OpusEncoder *enc; + /* For int32* GET params, just query the first stream */ + opus_int32 *value = va_arg(ap, opus_int32*); + enc = (OpusEncoder*)ptr; + ret = opus_encoder_ctl(enc, request, value); + } + break; + case OPUS_GET_FINAL_RANGE_REQUEST: + { + int s; + opus_uint32 *value = va_arg(ap, opus_uint32*); + opus_uint32 tmp; + if (!value) + { + goto bad_arg; + } + *value=0; + for (s=0;slayout.nb_streams;s++) + { + OpusEncoder *enc; + enc = (OpusEncoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + ret = opus_encoder_ctl(enc, request, &tmp); + if (ret != OPUS_OK) break; + *value ^= tmp; + } + } + break; + case OPUS_SET_LSB_DEPTH_REQUEST: + case OPUS_SET_COMPLEXITY_REQUEST: + case OPUS_SET_VBR_REQUEST: + case 
OPUS_SET_VBR_CONSTRAINT_REQUEST: + case OPUS_SET_MAX_BANDWIDTH_REQUEST: + case OPUS_SET_BANDWIDTH_REQUEST: + case OPUS_SET_SIGNAL_REQUEST: + case OPUS_SET_APPLICATION_REQUEST: + case OPUS_SET_INBAND_FEC_REQUEST: + case OPUS_SET_PACKET_LOSS_PERC_REQUEST: + case OPUS_SET_DTX_REQUEST: + case OPUS_SET_FORCE_MODE_REQUEST: + case OPUS_SET_FORCE_CHANNELS_REQUEST: + case OPUS_SET_PREDICTION_DISABLED_REQUEST: + { + int s; + /* This works for int32 params */ + opus_int32 value = va_arg(ap, opus_int32); + for (s=0;slayout.nb_streams;s++) + { + OpusEncoder *enc; + + enc = (OpusEncoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + ret = opus_encoder_ctl(enc, request, value); + if (ret != OPUS_OK) + break; + } + } + break; + case OPUS_MULTISTREAM_GET_ENCODER_STATE_REQUEST: + { + int s; + opus_int32 stream_id; + OpusEncoder **value; + stream_id = va_arg(ap, opus_int32); + if (stream_id<0 || stream_id >= st->layout.nb_streams) + ret = OPUS_BAD_ARG; + value = va_arg(ap, OpusEncoder**); + if (!value) + { + goto bad_arg; + } + for (s=0;slayout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + } + *value = (OpusEncoder*)ptr; + } + break; + case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->variable_duration = value; + } + break; + case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + if (!value) + { + goto bad_arg; + } + *value = st->variable_duration; + } + break; + case OPUS_RESET_STATE: + { + int s; + st->subframe_mem[0] = st->subframe_mem[1] = st->subframe_mem[2] = 0; + if (st->mapping_type == MAPPING_TYPE_SURROUND) + { + OPUS_CLEAR(ms_get_preemph_mem(st), st->layout.nb_channels); + OPUS_CLEAR(ms_get_window_mem(st), st->layout.nb_channels*120); + } + for (s=0;slayout.nb_streams;s++) + { + OpusEncoder *enc; + enc = (OpusEncoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + ret = opus_encoder_ctl(enc, OPUS_RESET_STATE); + if (ret != OPUS_OK) + break; + } + } + break; + default: + ret = OPUS_UNIMPLEMENTED; + break; + } + + va_end(ap); + return ret; +bad_arg: + va_end(ap); + return OPUS_BAD_ARG; +} + +void opus_multistream_encoder_destroy(OpusMSEncoder *st) +{ + opus_free(st); +} diff --git a/thirdparty/opus/opus_private.h b/thirdparty/opus/opus_private.h new file mode 100644 index 000000000000..3b62eed0964f --- /dev/null +++ b/thirdparty/opus/opus_private.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2012 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef OPUS_PRIVATE_H
+#define OPUS_PRIVATE_H
+
+#include "arch.h"
+#include "opus.h"
+#include "celt.h"
+
+#include <stddef.h> /* offsetof */
+
+struct OpusRepacketizer {
+   unsigned char toc;
+   int nb_frames;
+   const unsigned char *frames[48];
+   opus_int16 len[48];
+   int framesize;
+};
+
+typedef struct ChannelLayout {
+   int nb_channels;
+   int nb_streams;
+   int nb_coupled_streams;
+   unsigned char mapping[256];
+} ChannelLayout;
+
+int validate_layout(const ChannelLayout *layout);
+int get_left_channel(const ChannelLayout *layout, int stream_id, int prev);
+int get_right_channel(const ChannelLayout *layout, int stream_id, int prev);
+int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev);
+
+
+
+#define MODE_SILK_ONLY 1000
+#define MODE_HYBRID 1001
+#define MODE_CELT_ONLY 1002
+
+#define OPUS_SET_VOICE_RATIO_REQUEST 11018
+#define OPUS_GET_VOICE_RATIO_REQUEST 11019
+
+/** Configures the encoder's expected percentage of voice
+  * opposed to music or other signals.
+  *
+  * @note This interface is currently more aspiration than actuality. It's
+  * ultimately expected to bias an automatic signal classifier, but it currently
+  * just shifts the static bitrate to mode mapping around a little bit.
+  *
+  * @param[in] x <tt>int</tt>: Voice percentage in the range 0-100, inclusive.
+  * @hideinitializer */
+#define OPUS_SET_VOICE_RATIO(x) OPUS_SET_VOICE_RATIO_REQUEST, __opus_check_int(x)
+/** Gets the encoder's configured voice ratio value, @see OPUS_SET_VOICE_RATIO
+  *
+  * @param[out] x <tt>int*</tt>: Voice percentage in the range 0-100, inclusive.
+  * @hideinitializer */
+#define OPUS_GET_VOICE_RATIO(x) OPUS_GET_VOICE_RATIO_REQUEST, __opus_check_int_ptr(x)
+
+
+#define OPUS_SET_FORCE_MODE_REQUEST 11002
+#define OPUS_SET_FORCE_MODE(x) OPUS_SET_FORCE_MODE_REQUEST, __opus_check_int(x)
+
+typedef void (*downmix_func)(const void *, opus_val32 *, int, int, int, int, int);
+void downmix_float(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
+void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
+
+int encode_size(int size, unsigned char *data);
+
+opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs);
+
+opus_int32 compute_frame_size(const void *analysis_pcm, int frame_size,
+      int variable_duration, int C, opus_int32 Fs, int bitrate_bps,
+      int delay_compensation, downmix_func downmix
+#ifndef DISABLE_FLOAT_API
+      , float *subframe_mem
+#endif
+      );
+
+opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
+      unsigned char *data, opus_int32 out_data_bytes, int lsb_depth,
+      const void *analysis_pcm, opus_int32 analysis_size, int c1, int c2,
+      int analysis_channels, downmix_func downmix, int float_api);
+
+int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len,
+      opus_val16 *pcm, int frame_size, int decode_fec, int self_delimited,
+      opus_int32 *packet_offset, int soft_clip);
+
+/* Make sure everything is properly aligned.
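+   (Annotation: `alignment` below is offsetof(struct foo, u), i.e. the
+   strictest alignment requirement among void*, opus_int32 and opus_val32 on
+   this platform. With alignment == 8, for example, align(49) evaluates to
+   ((49 + 7)/8)*8 == 56.)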
*/ +static OPUS_INLINE int align(int i) +{ + struct foo {char c; union { void* p; opus_int32 i; opus_val32 v; } u;}; + + unsigned int alignment = offsetof(struct foo, u); + + /* Optimizing compilers should optimize div and multiply into and + for all sensible alignment values. */ + return ((i + alignment - 1) / alignment) * alignment; +} + +int opus_packet_parse_impl(const unsigned char *data, opus_int32 len, + int self_delimited, unsigned char *out_toc, + const unsigned char *frames[48], opus_int16 size[48], + int *payload_offset, opus_int32 *packet_offset); + +opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int end, + unsigned char *data, opus_int32 maxlen, int self_delimited, int pad); + +int pad_frame(unsigned char *data, opus_int32 len, opus_int32 new_len); + +#endif /* OPUS_PRIVATE_H */ diff --git a/thirdparty/opus/opusfile.c b/thirdparty/opus/opusfile.c new file mode 100644 index 000000000000..b8b3a354cfbb --- /dev/null +++ b/thirdparty/opus/opusfile.c @@ -0,0 +1,3266 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 1994-2012 * + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * + * * + ******************************************************************** + + function: stdio-based convenience library for opening/seeking/decoding + last mod: $Id: vorbisfile.c 17573 2010-10-27 14:53:59Z xiphmont $ + + ********************************************************************/ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "internal.h" +#include +#include +#include +#include +#include +#include + +#include "opusfile.h" + +/*This implementation is largely based off of libvorbisfile. + All of the Ogg bits work roughly the same, though I have made some + "improvements" that have not been folded back there, yet.*/ + +/*A 'chained bitstream' is an Ogg Opus bitstream that contains more than one + logical bitstream arranged end to end (the only form of Ogg multiplexing + supported by this library. + Grouping (parallel multiplexing) is not supported, except to the extent that + if there are multiple logical Ogg streams in a single link of the chain, we + will ignore all but the first Opus stream we find.*/ + +/*An Ogg Opus file can be played beginning to end (streamed) without worrying + ahead of time about chaining (see opusdec from the opus-tools package). + If we have the whole file, however, and want random access + (seeking/scrubbing) or desire to know the total length/time of a file, we + need to account for the possibility of chaining.*/ + +/*We can handle things a number of ways. + We can determine the entire bitstream structure right off the bat, or find + pieces on demand. + This library determines and caches structure for the entire bitstream, but + builds a virtual decoder on the fly when moving between links in the chain.*/ + +/*There are also different ways to implement seeking. + Enough information exists in an Ogg bitstream to seek to sample-granularity + positions in the output. + Or, one can seek by picking some portion of the stream roughly in the desired + area if we only want coarse navigation through the stream. 
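+  (In the public API, sample-exact positioning surfaces as op_pcm_seek() and
+  coarse byte-offset positioning as op_raw_seek().)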
+ We implement and expose both strategies.*/ + +/*The maximum number of bytes in a page (including the page headers).*/ +#define OP_PAGE_SIZE_MAX (65307) +/*The default amount to seek backwards per step when trying to find the + previous page. + This must be at least as large as the maximum size of a page.*/ +#define OP_CHUNK_SIZE (65536) +/*The maximum amount to seek backwards per step when trying to find the + previous page.*/ +#define OP_CHUNK_SIZE_MAX (1024*(opus_int32)1024) +/*A smaller read size is needed for low-rate streaming.*/ +#define OP_READ_SIZE (2048) + +int op_test(OpusHead *_head, + const unsigned char *_initial_data,size_t _initial_bytes){ + ogg_sync_state oy; + char *data; + int err; + /*The first page of a normal Opus file will be at most 57 bytes (27 Ogg + page header bytes + 1 lacing value + 21 Opus header bytes + 8 channel + mapping bytes). + It will be at least 47 bytes (27 Ogg page header bytes + 1 lacing value + + 19 Opus header bytes using channel mapping family 0). + If we don't have at least that much data, give up now.*/ + if(_initial_bytes<47)return OP_FALSE; + /*Only proceed if we start with the magic OggS string. + This is to prevent us spending a lot of time allocating memory and looking + for Ogg pages in non-Ogg files.*/ + if(memcmp(_initial_data,"OggS",4)!=0)return OP_ENOTFORMAT; + ogg_sync_init(&oy); + data=ogg_sync_buffer(&oy,_initial_bytes); + if(data!=NULL){ + ogg_stream_state os; + ogg_page og; + int ret; + memcpy(data,_initial_data,_initial_bytes); + ogg_sync_wrote(&oy,_initial_bytes); + ogg_stream_init(&os,-1); + err=OP_FALSE; + do{ + ogg_packet op; + ret=ogg_sync_pageout(&oy,&og); + /*Ignore holes.*/ + if(ret<0)continue; + /*Stop if we run out of data.*/ + if(!ret)break; + ogg_stream_reset_serialno(&os,ogg_page_serialno(&og)); + ogg_stream_pagein(&os,&og); + /*Only process the first packet on this page (if it's a BOS packet, + it's required to be the only one).*/ + if(ogg_stream_packetout(&os,&op)==1){ + if(op.b_o_s){ + ret=opus_head_parse(_head,op.packet,op.bytes); + /*If this didn't look like Opus, keep going.*/ + if(ret==OP_ENOTFORMAT)continue; + /*Otherwise we're done, one way or another.*/ + err=ret; + } + /*We finished parsing the headers. + There is no Opus to be found.*/ + else err=OP_ENOTFORMAT; + } + } + while(err==OP_FALSE); + ogg_stream_clear(&os); + } + else err=OP_EFAULT; + ogg_sync_clear(&oy); + return err; +} + +/*Many, many internal helpers. + The intention is not to be confusing. + Rampant duplication and monolithic function implementation (though we do have + some large, omnibus functions still) would be harder to understand anyway. + The high level functions are last. + Begin grokking near the end of the file if you prefer to read things + top-down.*/ + +/*The read/seek functions track absolute position within the stream.*/ + +/*Read a little more data from the file/pipe into the ogg_sync framer. + _nbytes: The maximum number of bytes to read. 
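+            (Callers bound this by OP_READ_SIZE, or by the remaining distance
+            to a known page boundary, so reads stay small enough for low-rate
+            streaming.)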
+ Return: A positive number of bytes read on success, 0 on end-of-file, or a + negative value on failure.*/ +static int op_get_data(OggOpusFile *_of,int _nbytes){ + unsigned char *buffer; + int nbytes; + OP_ASSERT(_nbytes>0); + buffer=(unsigned char *)ogg_sync_buffer(&_of->oy,_nbytes); + nbytes=(int)(*_of->callbacks.read)(_of->source,buffer,_nbytes); + OP_ASSERT(nbytes<=_nbytes); + if(OP_LIKELY(nbytes>0))ogg_sync_wrote(&_of->oy,nbytes); + return nbytes; +} + +/*Save a tiny smidge of verbosity to make the code more readable.*/ +static int op_seek_helper(OggOpusFile *_of,opus_int64 _offset){ + if(_offset==_of->offset)return 0; + if(_of->callbacks.seek==NULL + ||(*_of->callbacks.seek)(_of->source,_offset,SEEK_SET)){ + return OP_EREAD; + } + _of->offset=_offset; + ogg_sync_reset(&_of->oy); + return 0; +} + +/*Get the current position indicator of the underlying source. + This should be the same as the value reported by tell().*/ +static opus_int64 op_position(const OggOpusFile *_of){ + /*The current position indicator is _not_ simply offset. + We may also have unprocessed, buffered data in the sync state.*/ + return _of->offset+_of->oy.fill-_of->oy.returned; +} + +/*From the head of the stream, get the next page. + _boundary specifies if the function is allowed to fetch more data from the + stream (and how much) or only use internally buffered data. + _boundary: -1: Unbounded search. + 0: Read no additional data. + Use only cached data. + n: Search for the start of a new page up to file position n. + Return: n>=0: Found a page at absolute offset n. + OP_FALSE: Hit the _boundary limit. + OP_EREAD: An underlying read operation failed. + OP_BADLINK: We hit end-of-file before reaching _boundary.*/ +static opus_int64 op_get_next_page(OggOpusFile *_of,ogg_page *_og, + opus_int64 _boundary){ + while(_boundary<=0||_of->offset<_boundary){ + int more; + more=ogg_sync_pageseek(&_of->oy,_og); + /*Skipped (-more) bytes.*/ + if(OP_UNLIKELY(more<0))_of->offset-=more; + else if(more==0){ + int read_nbytes; + int ret; + /*Send more paramedics.*/ + if(!_boundary)return OP_FALSE; + if(_boundary<0)read_nbytes=OP_READ_SIZE; + else{ + opus_int64 position; + position=op_position(_of); + if(position>=_boundary)return OP_FALSE; + read_nbytes=(int)OP_MIN(_boundary-position,OP_READ_SIZE); + } + ret=op_get_data(_of,read_nbytes); + if(OP_UNLIKELY(ret<0))return OP_EREAD; + if(OP_UNLIKELY(ret==0)){ + /*Only fail cleanly on EOF if we didn't have a known boundary. + Otherwise, we should have been able to reach that boundary, and this + is a fatal error.*/ + return OP_UNLIKELY(_boundary<0)?OP_FALSE:OP_EBADLINK; + } + } + else{ + /*Got a page. 
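+        (Here `more` is the total size of the page in bytes, as returned by
+        ogg_sync_pageseek() on success.)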
+ Return the page start offset and advance the internal offset past the + page end.*/ + opus_int64 page_offset; + page_offset=_of->offset; + _of->offset+=more; + OP_ASSERT(page_offset>=0); + return page_offset; + } + } + return OP_FALSE; +} + +static int op_add_serialno(const ogg_page *_og, + ogg_uint32_t **_serialnos,int *_nserialnos,int *_cserialnos){ + ogg_uint32_t *serialnos; + int nserialnos; + int cserialnos; + ogg_uint32_t s; + s=ogg_page_serialno(_og); + serialnos=*_serialnos; + nserialnos=*_nserialnos; + cserialnos=*_cserialnos; + if(OP_UNLIKELY(nserialnos>=cserialnos)){ + if(OP_UNLIKELY(cserialnos>INT_MAX/(int)sizeof(*serialnos)-1>>1)){ + return OP_EFAULT; + } + cserialnos=2*cserialnos+1; + OP_ASSERT(nserialnos=OP_PAGE_SIZE_MAX); + begin=OP_MAX(begin-chunk_size,0); + ret=op_seek_helper(_of,begin); + if(OP_UNLIKELY(ret<0))return ret; + search_start=begin; + while(_of->offsetsearch_start=search_start; + _sr->offset=_offset=llret; + _sr->serialno=serialno; + OP_ASSERT(_of->offset-_offset>=0); + OP_ASSERT(_of->offset-_offset<=OP_PAGE_SIZE_MAX); + _sr->size=(opus_int32)(_of->offset-_offset); + _sr->gp=ogg_page_granulepos(&og); + /*If this page is from the stream we're looking for, remember it.*/ + if(serialno==_serialno){ + preferred_found=1; + *&preferred_sr=*_sr; + } + if(!op_lookup_serialno(serialno,_serialnos,_nserialnos)){ + /*We fell off the end of the link, which means we seeked back too far + and shouldn't have been looking in that link to begin with. + If we found the preferred serial number, forget that we saw it.*/ + preferred_found=0; + } + search_start=llret+1; + } + /*We started from the beginning of the stream and found nothing. + This should be impossible unless the contents of the source changed out + from under us after we read from it.*/ + if(OP_UNLIKELY(!begin)&&OP_UNLIKELY(_offset<0))return OP_EBADLINK; + /*Bump up the chunk size. + This is mildly helpful when seeks are very expensive (http).*/ + chunk_size=OP_MIN(2*chunk_size,OP_CHUNK_SIZE_MAX); + /*Avoid quadratic complexity if we hit an invalid patch of the file.*/ + end=OP_MIN(begin+OP_PAGE_SIZE_MAX-1,original_end); + } + while(_offset<0); + if(preferred_found)*_sr=*&preferred_sr; + return 0; +} + +/*Find the last page beginning before _offset with the given serial number and + a valid granule position. + Unlike the above search, this continues until it finds such a page, but does + not stray outside the current link. + We could implement it (inefficiently) by calling op_get_prev_page_serial() + repeatedly until it returned a page that had both our preferred serial + number and a valid granule position, but doing it with a separate function + allows us to avoid repeatedly re-scanning valid pages from other streams as + we seek-back-and-read-forward. + [out] _gp: Returns the granule position of the page that was found on + success. + _offset: The _offset before which to find a page. + Any page returned will consist of data entirely before _offset. + _serialno: The target serial number. + _serialnos: The list of serial numbers in the link that contains the + preferred serial number. + _nserialnos: The number of serial numbers in the current link. + Return: The offset of the page on success, or a negative value on failure. + OP_EREAD: Failed to read more data (error or EOF). 
+ OP_EBADLINK: We couldn't find a page even after seeking back past the + beginning of the link.*/ +static opus_int64 op_get_last_page(OggOpusFile *_of,ogg_int64_t *_gp, + opus_int64 _offset,ogg_uint32_t _serialno, + const ogg_uint32_t *_serialnos,int _nserialnos){ + ogg_page og; + ogg_int64_t gp; + opus_int64 begin; + opus_int64 end; + opus_int64 original_end; + opus_int32 chunk_size; + /*The target serial number must belong to the current link.*/ + OP_ASSERT(op_lookup_serialno(_serialno,_serialnos,_nserialnos)); + original_end=end=begin=_offset; + _offset=-1; + /*We shouldn't have to initialize gp, but gcc is too dumb to figure out that + ret>=0 implies we entered the if(page_gp!=-1) block at least once.*/ + gp=-1; + chunk_size=OP_CHUNK_SIZE; + do{ + int left_link; + int ret; + OP_ASSERT(chunk_size>=OP_PAGE_SIZE_MAX); + begin=OP_MAX(begin-chunk_size,0); + ret=op_seek_helper(_of,begin); + if(OP_UNLIKELY(ret<0))return ret; + left_link=0; + while(_of->offsetready_stateos,ogg_page_serialno(_og)); + ogg_stream_pagein(&_of->os,_og); + if(OP_LIKELY(ogg_stream_packetout(&_of->os,&op)>0)){ + ret=opus_head_parse(_head,op.packet,op.bytes); + /*Found a valid Opus header. + Continue setup.*/ + if(OP_LIKELY(ret>=0))_of->ready_state=OP_STREAMSET; + /*If it's just a stream type we don't recognize, ignore it. + Everything else is fatal.*/ + else if(ret!=OP_ENOTFORMAT)return ret; + } + /*TODO: Should a BOS page with no packets be an error?*/ + } + /*Get the next page. + No need to clamp the boundary offset against _of->end, as all errors + become OP_ENOTFORMAT or OP_EBADHEADER.*/ + if(OP_UNLIKELY(op_get_next_page(_of,_og, + OP_ADV_OFFSET(_of->offset,OP_CHUNK_SIZE))<0)){ + return _of->ready_stateready_state!=OP_STREAMSET))return OP_ENOTFORMAT; + /*If the first non-header page belonged to our Opus stream, submit it.*/ + if(_of->os.serialno==ogg_page_serialno(_og))ogg_stream_pagein(&_of->os,_og); + /*Loop getting packets.*/ + for(;;){ + switch(ogg_stream_packetout(&_of->os,&op)){ + case 0:{ + /*Loop getting pages.*/ + for(;;){ + /*No need to clamp the boundary offset against _of->end, as all + errors become OP_EBADHEADER.*/ + if(OP_UNLIKELY(op_get_next_page(_of,_og, + OP_ADV_OFFSET(_of->offset,OP_CHUNK_SIZE))<0)){ + return OP_EBADHEADER; + } + /*If this page belongs to the correct stream, go parse it.*/ + if(_of->os.serialno==ogg_page_serialno(_og)){ + ogg_stream_pagein(&_of->os,_og); + break; + } + /*If the link ends before we see the Opus comment header, abort.*/ + if(OP_UNLIKELY(ogg_page_bos(_og)))return OP_EBADHEADER; + /*Otherwise, keep looking.*/ + } + }break; + /*We shouldn't get a hole in the headers!*/ + case -1:return OP_EBADHEADER; + default:{ + /*Got a packet. + It should be the comment header.*/ + ret=opus_tags_parse(_tags,op.packet,op.bytes); + if(OP_UNLIKELY(ret<0))return ret; + /*Make sure the page terminated at the end of the comment header. + If there is another packet on the page, or part of a packet, then + reject the stream. 
+ Otherwise seekable sources won't be able to seek back to the start + properly.*/ + ret=ogg_stream_packetout(&_of->os,&op); + if(OP_UNLIKELY(ret!=0) + ||OP_UNLIKELY(_og->header[_og->header_len-1]==255)){ + /*If we fail, the caller assumes our tags are uninitialized.*/ + opus_tags_clear(_tags); + return OP_EBADHEADER; + } + return 0; + } + } + } +} + +static int op_fetch_headers(OggOpusFile *_of,OpusHead *_head, + OpusTags *_tags,ogg_uint32_t **_serialnos,int *_nserialnos, + int *_cserialnos,ogg_page *_og){ + ogg_page og; + int ret; + if(!_og){ + /*No need to clamp the boundary offset against _of->end, as all errors + become OP_ENOTFORMAT.*/ + if(OP_UNLIKELY(op_get_next_page(_of,&og, + OP_ADV_OFFSET(_of->offset,OP_CHUNK_SIZE))<0)){ + return OP_ENOTFORMAT; + } + _og=&og; + } + _of->ready_state=OP_OPENED; + ret=op_fetch_headers_impl(_of,_head,_tags,_serialnos,_nserialnos, + _cserialnos,_og); + /*Revert back from OP_STREAMSET to OP_OPENED on failure, to prevent + double-free of the tags in an unseekable stream.*/ + if(OP_UNLIKELY(ret<0))_of->ready_state=OP_OPENED; + return ret; +} + +/*Granule position manipulation routines. + A granule position is defined to be an unsigned 64-bit integer, with the + special value -1 in two's complement indicating an unset or invalid granule + position. + We are not guaranteed to have an unsigned 64-bit type, so we construct the + following routines that + a) Properly order negative numbers as larger than positive numbers, and + b) Check for underflow or overflow past the special -1 value. + This lets us operate on the full, valid range of granule positions in a + consistent and safe manner. + This full range is organized into distinct regions: + [ -1 (invalid) ][ 0 ... OP_INT64_MAX ][ OP_INT64_MIN ... -2 ][-1 (invalid) ] + + No one should actually use granule positions so large that they're negative, + even if they are technically valid, as very little software handles them + correctly (including most of Xiph.Org's). + This library also refuses to support durations so large they won't fit in a + signed 64-bit integer (to avoid exposing this mess to the application, and + to simplify a good deal of internal arithmetic), so the only way to use them + successfully is if pcm_start is very large. + This means there isn't anything you can do with negative granule positions + that you couldn't have done with purely non-negative ones. + The main purpose of these routines is to allow us to think very explicitly + about the possible failure cases of all granule position manipulations.*/ + +/*Safely adds a small signed integer to a valid (not -1) granule position. + The result can use the full 64-bit range of values (both positive and + negative), but will fail on overflow (wrapping past -1; wrapping past + OP_INT64_MAX is explicitly okay). + [out] _dst_gp: The resulting granule position. + Only modified on success. + _src_gp: The granule position to add to. + This must not be -1. + _delta: The amount to add. + This is allowed to be up to 32 bits to support the maximum + duration of a single Ogg page (255 packets * 120 ms per + packet == 1,468,800 samples at 48 kHz). 
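+           (Example, per the rules above: adding 2 to OP_INT64_MAX wraps to
+           OP_INT64_MIN+1, which is allowed, while adding 1 to -2 would land on
+           the invalid value -1 and fails with OP_EINVAL.)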
+ Return: 0 on success, or OP_EINVAL if the result would wrap around past -1.*/ +static int op_granpos_add(ogg_int64_t *_dst_gp,ogg_int64_t _src_gp, + opus_int32 _delta){ + /*The code below handles this case correctly, but there's no reason we + should ever be called with these values, so make sure we aren't.*/ + OP_ASSERT(_src_gp!=-1); + if(_delta>0){ + /*Adding this amount to the granule position would overflow its 64-bit + range.*/ + if(OP_UNLIKELY(_src_gp<0)&&OP_UNLIKELY(_src_gp>=-1-_delta))return OP_EINVAL; + if(OP_UNLIKELY(_src_gp>OP_INT64_MAX-_delta)){ + /*Adding this amount to the granule position would overflow the positive + half of its 64-bit range. + Since signed overflow is undefined in C, do it in a way the compiler + isn't allowed to screw up.*/ + _delta-=(opus_int32)(OP_INT64_MAX-_src_gp)+1; + _src_gp=OP_INT64_MIN; + } + } + else if(_delta<0){ + /*Subtracting this amount from the granule position would underflow its + 64-bit range.*/ + if(_src_gp>=0&&OP_UNLIKELY(_src_gp<-_delta))return OP_EINVAL; + if(OP_UNLIKELY(_src_gp da < 0.*/ + da=(OP_INT64_MIN-_gp_a)-1; + /*_gp_b >= 0 => db >= 0.*/ + db=OP_INT64_MAX-_gp_b; + /*Step 2: Check for overflow.*/ + if(OP_UNLIKELY(OP_INT64_MAX+da= 0 => da <= 0*/ + da=_gp_a+OP_INT64_MIN; + /*_gp_b < 0 => db <= 0*/ + db=OP_INT64_MIN-_gp_b; + /*Step 2: Check for overflow.*/ + if(OP_UNLIKELY(da=0)return 1; + /*Else fall through.*/ + } + else if(OP_UNLIKELY(_gp_b<0))return -1; + /*No wrapping case.*/ + return (_gp_a>_gp_b)-(_gp_b>_gp_a); +} + +/*Returns the duration of the packet (in samples at 48 kHz), or a negative + value on error.*/ +static int op_get_packet_duration(const unsigned char *_data,int _len){ + int nframes; + int frame_size; + int nsamples; + nframes=opus_packet_get_nb_frames(_data,_len); + if(OP_UNLIKELY(nframes<0))return OP_EBADPACKET; + frame_size=opus_packet_get_samples_per_frame(_data,48000); + nsamples=nframes*frame_size; + if(OP_UNLIKELY(nsamples>120*48))return OP_EBADPACKET; + return nsamples; +} + +/*This function more properly belongs in info.c, but we define it here to allow + the static granule position manipulation functions to remain static.*/ +ogg_int64_t opus_granule_sample(const OpusHead *_head,ogg_int64_t _gp){ + opus_int32 pre_skip; + pre_skip=_head->pre_skip; + if(_gp!=-1&&op_granpos_add(&_gp,_gp,-pre_skip))_gp=-1; + return _gp; +} + +/*Grab all the packets currently in the stream state, and compute their + durations. + _of->op_count is set to the number of packets collected. + [out] _durations: Returns the durations of the individual packets. + Return: The total duration of all packets, or OP_HOLE if there was a hole.*/ +static opus_int32 op_collect_audio_packets(OggOpusFile *_of, + int _durations[255]){ + opus_int32 total_duration; + int op_count; + /*Count the durations of all packets in the page.*/ + op_count=0; + total_duration=0; + for(;;){ + int ret; + /*This takes advantage of undocumented libogg behavior that returned + ogg_packet buffers are valid at least until the next page is + submitted. + Relying on this is not too terrible, as _none_ of the Ogg memory + ownership/lifetime rules are well-documented. + But I can read its code and know this will work.*/ + ret=ogg_stream_packetout(&_of->os,_of->op+op_count); + if(!ret)break; + if(OP_UNLIKELY(ret<0)){ + /*We shouldn't get holes in the middle of pages.*/ + OP_ASSERT(op_count==0); + /*Set the return value and break out of the loop. 
+ We want to make sure op_count gets set to 0, because we've ingested a + page, so any previously loaded packets are now invalid.*/ + total_duration=OP_HOLE; + break; + } + /*Unless libogg is broken, we can't get more than 255 packets from a + single page.*/ + OP_ASSERT(op_count<255); + _durations[op_count]=op_get_packet_duration(_of->op[op_count].packet, + _of->op[op_count].bytes); + if(OP_LIKELY(_durations[op_count]>0)){ + /*With at most 255 packets on a page, this can't overflow.*/ + total_duration+=_durations[op_count++]; + } + /*Ignore packets with an invalid TOC sequence.*/ + else if(op_count>0){ + /*But save the granule position, if there was one.*/ + _of->op[op_count-1].granulepos=_of->op[op_count].granulepos; + } + } + _of->op_pos=0; + _of->op_count=op_count; + return total_duration; +} + +/*Starting from current cursor position, get the initial PCM offset of the next + page. + This also validates the granule position on the first page with a completed + audio data packet, as required by the spec. + If this link is completely empty (no pages with completed packets), then this + function sets pcm_start=pcm_end=0 and returns the BOS page of the next link + (if any). + In the seekable case, we initialize pcm_end=-1 before calling this function, + so that later we can detect that the link was empty before calling + op_find_final_pcm_offset(). + [inout] _link: The link for which to find pcm_start. + [out] _og: Returns the BOS page of the next link if this link was empty. + In the unseekable case, we can then feed this to + op_fetch_headers() to start the next link. + The caller may pass NULL (e.g., for seekable streams), in + which case this page will be discarded. + Return: 0 on success, 1 if there is a buffered BOS page available, or a + negative value on unrecoverable error.*/ +static int op_find_initial_pcm_offset(OggOpusFile *_of, + OggOpusLink *_link,ogg_page *_og){ + ogg_page og; + opus_int64 page_offset; + ogg_int64_t pcm_start; + ogg_int64_t prev_packet_gp; + ogg_int64_t cur_page_gp; + ogg_uint32_t serialno; + opus_int32 total_duration; + int durations[255]; + int cur_page_eos; + int op_count; + int pi; + if(_og==NULL)_og=&og; + serialno=_of->os.serialno; + op_count=0; + /*We shouldn't have to initialize total_duration, but gcc is too dumb to + figure out that op_count>0 implies we've been through the whole loop at + least once.*/ + total_duration=0; + do{ + page_offset=op_get_next_page(_of,_og,_of->end); + /*We should get a page unless the file is truncated or mangled. 
+      Otherwise there are no audio data packets in the whole logical stream.*/
+    if(OP_UNLIKELY(page_offset<0)){
+      /*Fail if there was a read error.*/
+      if(page_offset<OP_FALSE)return (int)page_offset;
+      /*Fail if the pre-skip is non-zero, since it's asking us to skip more
+         samples than exist.*/
+      if(_link->head.pre_skip>0)return OP_EBADTIMESTAMP;
+      /*Set pcm_end and end_offset so we can skip the call to
+         op_find_final_pcm_offset().*/
+      _link->pcm_start=_link->pcm_end=0;
+      _link->end_offset=_link->data_offset;
+      return 0;
+    }
+    /*Similarly, if we hit the next link in the chain, we've gone too far.*/
+    if(OP_UNLIKELY(ogg_page_bos(_og))){
+      if(_link->head.pre_skip>0)return OP_EBADTIMESTAMP;
+      /*Set pcm_end and end_offset so we can skip the call to
+         op_find_final_pcm_offset().*/
+      _link->pcm_end=_link->pcm_start=0;
+      _link->end_offset=_link->data_offset;
+      /*Tell the caller we've got a buffered page for them.*/
+      return 1;
+    }
+    /*Ignore pages from other streams (not strictly necessary, because of the
+       checks in ogg_stream_pagein(), but saves some work).*/
+    if(serialno!=(ogg_uint32_t)ogg_page_serialno(_og))continue;
+    ogg_stream_pagein(&_of->os,_og);
+    /*Bitrate tracking: add the header's bytes here.
+      The body bytes are counted when we consume the packets.*/
+    _of->bytes_tracked+=_og->header_len;
+    /*Count the durations of all packets in the page.*/
+    do total_duration=op_collect_audio_packets(_of,durations);
+    /*Ignore holes.*/
+    while(OP_UNLIKELY(total_duration<0));
+    op_count=_of->op_count;
+  }
+  while(op_count<=0);
+  /*We found the first page with a completed audio data packet: actually look
+     at the granule position.
+    RFC 3533 says, "A special value of -1 (in two's complement) indicates that
+     no packets finish on this page," which does not say that a granule
+     position that is NOT -1 indicates that some packets DO finish on that
+     page (even though this was the intention, libogg itself violated this
+     intention for years before we fixed it).
+    The Ogg Opus specification only imposes its start-time requirements on the
+     granule position of the first page with completed packets, so we ignore
+     any set granule positions until then.*/
+  cur_page_gp=_of->op[op_count-1].granulepos;
+  /*But getting a packet without a valid granule position on the page is not
+     okay.*/
+  if(cur_page_gp==-1)return OP_EBADTIMESTAMP;
+  cur_page_eos=_of->op[op_count-1].e_o_s;
+  if(OP_LIKELY(!cur_page_eos)){
+    /*The EOS flag wasn't set.
+      Work backwards from the provided granule position to get the starting
+       PCM offset.*/
+    if(OP_UNLIKELY(op_granpos_add(&pcm_start,cur_page_gp,-total_duration)<0)){
+      /*The starting granule position MUST not be smaller than the amount of
+         audio on the first page with completed packets.*/
+      return OP_EBADTIMESTAMP;
+    }
+  }
+  else{
+    /*The first page with completed packets was also the last.*/
+    if(OP_LIKELY(op_granpos_add(&pcm_start,cur_page_gp,-total_duration)<0)){
+      /*If there's less audio on the page than indicated by the granule
+         position, then we're doing end-trimming, and the starting PCM offset
+         is zero by spec mandate.*/
+      pcm_start=0;
+      /*However, the end-trimming MUST not ask us to trim more samples than
+         exist after applying the pre-skip.*/
+      if(OP_UNLIKELY(op_granpos_cmp(cur_page_gp,_link->head.pre_skip)<0)){
+        return OP_EBADTIMESTAMP;
+      }
+    }
+  }
+  /*Timestamp the individual packets.*/
+  prev_packet_gp=pcm_start;
+  for(pi=0;pi<op_count;pi++){
+    if(cur_page_eos){
+      ogg_int64_t diff;
+      OP_ALWAYS_TRUE(!op_granpos_diff(&diff,cur_page_gp,prev_packet_gp));
+      diff=durations[pi]-diff;
+      /*If we have samples to trim...*/
+      if(diff>0){
+        /*If we trimmed the entire packet, stop (the spec says encoders
+           shouldn't do this, but we support it anyway).*/
+        if(OP_UNLIKELY(diff>durations[pi]))break;
+        _of->op[pi].granulepos=prev_packet_gp=cur_page_gp;
+        /*Move the EOS flag to this packet, if necessary, so we'll trim the
+           samples.*/
+        _of->op[pi].e_o_s=1;
+        continue;
+      }
+    }
+    /*Update the granule position as normal.*/
+    OP_ALWAYS_TRUE(!op_granpos_add(&_of->op[pi].granulepos,
+     prev_packet_gp,durations[pi]));
+    prev_packet_gp=_of->op[pi].granulepos;
+  }
+  /*Update the packet count after end-trimming.*/
+  _of->op_count=pi;
+  _of->cur_discard_count=_link->head.pre_skip;
+  _of->prev_packet_gp=_link->pcm_start=pcm_start;
+  _of->prev_page_offset=page_offset;
+  return 0;
+}
+
+/*Starting from current cursor position, get the final PCM offset of the
+   previous page.
+  This also validates the duration of the link: while not strictly required by
+   the spec, we need it to ensure duration calculations don't overflow.
+  This is only done for seekable sources.
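To make the packet timestamping above concrete, here is a minimal standalone sketch (hypothetical values; the wrap-aware op_granpos_*() helpers are replaced with plain 64-bit arithmetic, which is only safe away from the wrap region):

    #include <assert.h>
    #include <stdint.h>

    int main(void){
      int     durations[2]={480,480}; /*Two packets complete on the page.*/
      int64_t cur_page_gp=960;        /*Granule position of that page.*/
      int64_t total_duration=durations[0]+durations[1];
      int64_t pcm_start;
      int64_t gp0;
      /*Work backwards from the page's granule position.*/
      pcm_start=cur_page_gp-total_duration;
      assert(pcm_start==0);
      /*Each packet is then stamped with a running granule position.*/
      gp0=pcm_start+durations[0];
      assert(gp0+durations[1]==cur_page_gp);
      return 0;
    }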
+  We must validate that op_find_initial_pcm_offset() succeeded for this link
+   before calling this function, otherwise it will scan the entire stream
+   backwards until it reaches the start, and then fail.*/
+static int op_find_final_pcm_offset(OggOpusFile *_of,
+ const ogg_uint32_t *_serialnos,int _nserialnos,OggOpusLink *_link,
+ opus_int64 _offset,ogg_uint32_t _end_serialno,ogg_int64_t _end_gp,
+ ogg_int64_t *_total_duration){
+  ogg_int64_t  total_duration;
+  ogg_int64_t  duration;
+  ogg_uint32_t cur_serialno;
+  /*For the time being, fetch end PCM offset the simple way.*/
+  cur_serialno=_link->serialno;
+  if(_end_serialno!=cur_serialno||_end_gp==-1){
+    _offset=op_get_last_page(_of,&_end_gp,_offset,
+     cur_serialno,_serialnos,_nserialnos);
+    if(OP_UNLIKELY(_offset<0))return (int)_offset;
+  }
+  /*At worst we should have found the first page with completed packets.*/
+  if(OP_UNLIKELY(_offset<_link->data_offset))return OP_EBADLINK;
+  /*This implementation requires that the difference between the first and
+     last granule positions in each link be representable in a signed, 64-bit
+     number, and that each link also have at least as many samples as the
+     pre-skip requires.*/
+  if(OP_UNLIKELY(op_granpos_diff(&duration,_end_gp,_link->pcm_start)<0)
+   ||OP_UNLIKELY(duration<_link->head.pre_skip)){
+    return OP_EBADTIMESTAMP;
+  }
+  /*We also require that the total duration be representable in a signed,
+     64-bit number.*/
+  duration-=_link->head.pre_skip;
+  total_duration=*_total_duration;
+  if(OP_UNLIKELY(OP_INT64_MAX-duration<total_duration))return OP_EBADTIMESTAMP;
+  *_total_duration=total_duration+duration;
+  _link->pcm_end=_end_gp;
+  _link->end_offset=_offset;
+  return 0;
+}
+
+/*Rescale the number _x from the range [0,_from] to [0,_to].
+  _from and _to must be positive.*/
+static opus_int64 op_rescale64(opus_int64 _x,opus_int64 _from,opus_int64 _to){
+  opus_int64 frac;
+  opus_int64 ret;
+  int        i;
+  if(_x>=_from)return _to;
+  if(_x<=0)return 0;
+  frac=0;
+  for(i=0;i<63;i++){
+    frac<<=1;
+    OP_ASSERT(_x<=_from);
+    if(_x>=_from>>1){
+      _x-=_from-_x;
+      frac|=1;
+    }
+    else _x<<=1;
+  }
+  ret=0;
+  for(i=0;i<63;i++){
+    if(frac&1)ret=(ret&_to&1)+(ret>>1)+(_to>>1);
+    else ret>>=1;
+    frac>>=1;
+  }
+  return ret;
+}
+
+/*The minimum granule position spacing allowed for making predictions.
+  This corresponds to about 1 second of audio at 48 kHz for both Opus and
+   Vorbis, or one keyframe interval in Theora with the default keyframe
+   spacing of 256.*/
+#define OP_GP_SPACING_MIN (48000)
+
+/*Try to estimate the location of the next link using the current seek
+   records, assuming the initial granule position of any streams we've found
+   is 0.*/
+static opus_int64 op_predict_link_start(const OpusSeekRecord *_sr,int _nsr,
+ opus_int64 _searched,opus_int64 _end_searched,opus_int32 _bias){
+  opus_int64 bisect;
+  int        sri;
+  int        srj;
+  /*Require that we be at least OP_CHUNK_SIZE from the end.
+    We don't require that we be at least OP_CHUNK_SIZE from the beginning,
+     because if we are we'll just scan forward without seeking.*/
+  _end_searched-=OP_CHUNK_SIZE;
+  if(_searched>=_end_searched)return -1;
+  bisect=_end_searched;
+  for(sri=0;sri<_nsr;sri++){
+    ogg_int64_t  gp1;
+    ogg_int64_t  gp2_min;
+    ogg_uint32_t serialno1;
+    opus_int64   offset1;
+    /*If the granule position is negative, either it's invalid or we'd cause
+       overflow.*/
+    gp1=_sr[sri].gp;
+    if(gp1<0)continue;
+    /*We require some minimum distance between granule positions to make an
+       estimate.
+      We don't actually know what granule position scheme is being used,
+       because we have no idea what kind of stream these came from.
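As a cross-check on op_rescale64() above: it computes _x*_to/_from without overflowing 64 bits, so for inputs small enough that the naive product fits, the two agree to within rounding. A hypothetical test harness, not part of the library:

    #include <assert.h>
    #include <stdint.h>

    /*Safe only while x*to fits in 64 bits; op_rescale64() has no such limit.*/
    static int64_t naive_rescale(int64_t x,int64_t from,int64_t to){
      if(x>=from)return to;
      if(x<=0)return 0;
      return x*to/from;
    }

    int main(void){
      /*Map a granule position 3 s into a 10 s link onto a 1 MB byte range.*/
      assert(naive_rescale(3*48000,10*48000,1000000)==300000);
      return 0;
    }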
+ Therefore we require a minimum spacing between them, with the + expectation that while bitrates and granule position increments might + vary locally in quite complex ways, they are globally smooth.*/ + if(OP_UNLIKELY(op_granpos_add(&gp2_min,gp1,OP_GP_SPACING_MIN)<0)){ + /*No granule position would satisfy us.*/ + continue; + } + offset1=_sr[sri].offset; + serialno1=_sr[sri].serialno; + for(srj=sri;srj-->0;){ + ogg_int64_t gp2; + opus_int64 offset2; + opus_int64 num; + ogg_int64_t den; + ogg_int64_t ipart; + gp2=_sr[srj].gp; + if(gp20); + if(ipart>0&&(offset2-_searched)/ipart=_end_searched?-1:bisect; +} + +/*Finds each bitstream link, one at a time, using a bisection search. + This has to begin by knowing the offset of the first link's initial page.*/ +static int op_bisect_forward_serialno(OggOpusFile *_of, + opus_int64 _searched,OpusSeekRecord *_sr,int _csr, + ogg_uint32_t **_serialnos,int *_nserialnos,int *_cserialnos){ + ogg_page og; + OggOpusLink *links; + int nlinks; + int clinks; + ogg_uint32_t *serialnos; + int nserialnos; + ogg_int64_t total_duration; + int nsr; + int ret; + links=_of->links; + nlinks=clinks=_of->nlinks; + total_duration=0; + /*We start with one seek record, for the last page in the file. + We build up a list of records for places we seek to during link + enumeration. + This list is kept sorted in reverse order. + We only care about seek locations that were _not_ in the current link, + therefore we can add them one at a time to the end of the list as we + improve the lower bound on the location where the next link starts.*/ + nsr=1; + for(;;){ + opus_int64 end_searched; + opus_int64 bisect; + opus_int64 next; + opus_int64 last; + ogg_int64_t end_offset; + ogg_int64_t end_gp; + int sri; + serialnos=*_serialnos; + nserialnos=*_nserialnos; + if(OP_UNLIKELY(nlinks>=clinks)){ + if(OP_UNLIKELY(clinks>INT_MAX-1>>1))return OP_EFAULT; + clinks=2*clinks+1; + OP_ASSERT(nlinkslinks=links; + } + /*Invariants: + We have the headers and serial numbers for the link beginning at 'begin'. + We have the offset and granule position of the last page in the file + (potentially not a page we care about).*/ + /*Scan the seek records we already have to save us some bisection.*/ + for(sri=0;sri1){ + opus_int64 last_offset; + opus_int64 avg_link_size; + opus_int64 upper_limit; + last_offset=links[nlinks-1].offset; + avg_link_size=last_offset/(nlinks-1); + upper_limit=end_searched-OP_CHUNK_SIZE-avg_link_size; + if(OP_LIKELY(last_offset>_searched-avg_link_size) + &&OP_LIKELY(last_offset>1); + /*If we're within OP_CHUNK_SIZE of the start, scan forward.*/ + if(bisect-_searchedoffset-last>=0); + OP_ASSERT(_of->offset-last<=OP_PAGE_SIZE_MAX); + _sr[nsr].size=(opus_int32)(_of->offset-last); + _sr[nsr].serialno=serialno; + _sr[nsr].gp=gp; + nsr++; + } + } + else{ + _searched=_of->offset; + next_bias=OP_CHUNK_SIZE; + if(serialno==links[nlinks-1].serialno){ + /*This page was from the stream we want, remember it. + If it's the last such page in the link, we won't have to go back + looking for it later.*/ + end_gp=gp; + end_offset=last; + } + } + } + bisect=op_predict_link_start(_sr,nsr,_searched,end_searched,next_bias); + } + /*Bisection point found. 
+ Get the final granule position of the previous link, assuming + op_find_initial_pcm_offset() didn't already determine the link was + empty.*/ + if(OP_LIKELY(links[nlinks-1].pcm_end==-1)){ + if(end_gp==-1){ + /*If we don't know where the end page is, we'll have to seek back and + look for it, starting from the end of the link.*/ + end_offset=next; + /*Also forget the last page we read. + It won't be available after the seek.*/ + last=-1; + } + ret=op_find_final_pcm_offset(_of,serialnos,nserialnos, + links+nlinks-1,end_offset,links[nlinks-1].serialno,end_gp, + &total_duration); + if(OP_UNLIKELY(ret<0))return ret; + } + if(last!=next){ + /*The last page we read was not the first page the next link. + Move the cursor position to the offset of that first page. + This only performs an actual seek if the first page of the next link + does not start at the end of the last page from the current Opus + stream with a valid granule position.*/ + ret=op_seek_helper(_of,next); + if(OP_UNLIKELY(ret<0))return ret; + } + ret=op_fetch_headers(_of,&links[nlinks].head,&links[nlinks].tags, + _serialnos,_nserialnos,_cserialnos,last!=next?NULL:&og); + if(OP_UNLIKELY(ret<0))return ret; + links[nlinks].offset=next; + links[nlinks].data_offset=_of->offset; + links[nlinks].serialno=_of->os.serialno; + links[nlinks].pcm_end=-1; + /*This might consume a page from the next link, however the next bisection + always starts with a seek.*/ + ret=op_find_initial_pcm_offset(_of,links+nlinks,NULL); + if(OP_UNLIKELY(ret<0))return ret; + _searched=_of->offset; + /*Mark the current link count so it can be cleaned up on error.*/ + _of->nlinks=++nlinks; + } + /*Last page is in the starting serialno list, so we've reached the last link. + Now find the last granule position for it (if we didn't the first time we + looked at the end of the stream, and if op_find_initial_pcm_offset() + didn't already determine the link was empty).*/ + if(OP_LIKELY(links[nlinks-1].pcm_end==-1)){ + ret=op_find_final_pcm_offset(_of,serialnos,nserialnos, + links+nlinks-1,_sr[0].offset,_sr[0].serialno,_sr[0].gp,&total_duration); + if(OP_UNLIKELY(ret<0))return ret; + } + /*Trim back the links array if necessary.*/ + links=(OggOpusLink *)_ogg_realloc(links,sizeof(*links)*nlinks); + if(OP_LIKELY(links!=NULL))_of->links=links; + /*We also don't need these anymore.*/ + _ogg_free(*_serialnos); + *_serialnos=NULL; + *_cserialnos=*_nserialnos=0; + return 0; +} + +static void op_update_gain(OggOpusFile *_of){ + OpusHead *head; + opus_int32 gain_q8; + int li; + /*If decode isn't ready, then we'll apply the gain when we initialize the + decoder.*/ + if(_of->ready_stategain_offset_q8; + li=_of->seekable?_of->cur_link:0; + head=&_of->links[li].head; + /*We don't have to worry about overflow here because the header gain and + track gain must lie in the range [-32768,32767], and the user-supplied + offset has been pre-clamped to [-98302,98303].*/ + switch(_of->gain_type){ + case OP_ALBUM_GAIN:{ + int album_gain_q8; + album_gain_q8=0; + opus_tags_get_album_gain(&_of->links[li].tags,&album_gain_q8); + gain_q8+=album_gain_q8; + gain_q8+=head->output_gain; + }break; + case OP_TRACK_GAIN:{ + int track_gain_q8; + track_gain_q8=0; + opus_tags_get_track_gain(&_of->links[li].tags,&track_gain_q8); + gain_q8+=track_gain_q8; + gain_q8+=head->output_gain; + }break; + case OP_HEADER_GAIN:gain_q8+=head->output_gain;break; + case OP_ABSOLUTE_GAIN:break; + default:OP_ASSERT(0); + } + gain_q8=OP_CLAMP(-32768,gain_q8,32767); + OP_ASSERT(_of->od!=NULL); +#if defined(OPUS_SET_GAIN) + 
opus_multistream_decoder_ctl(_of->od,OPUS_SET_GAIN(gain_q8)); +#else +/*A fallback that works with both float and fixed-point is a bunch of work, + so just force people to use a sufficiently new version. + This is deployed well enough at this point that this shouldn't be a burden.*/ +# error "libopus 1.0.1 or later required" +#endif +} + +static int op_make_decode_ready(OggOpusFile *_of){ + const OpusHead *head; + int li; + int stream_count; + int coupled_count; + int channel_count; + if(_of->ready_state>OP_STREAMSET)return 0; + if(OP_UNLIKELY(_of->ready_stateseekable?_of->cur_link:0; + head=&_of->links[li].head; + stream_count=head->stream_count; + coupled_count=head->coupled_count; + channel_count=head->channel_count; + /*Check to see if the current decoder is compatible with the current link.*/ + if(_of->od!=NULL&&_of->od_stream_count==stream_count + &&_of->od_coupled_count==coupled_count&&_of->od_channel_count==channel_count + &&memcmp(_of->od_mapping,head->mapping, + sizeof(*head->mapping)*channel_count)==0){ + opus_multistream_decoder_ctl(_of->od,OPUS_RESET_STATE); + } + else{ + int err; + opus_multistream_decoder_destroy(_of->od); + _of->od=opus_multistream_decoder_create(48000,channel_count, + stream_count,coupled_count,head->mapping,&err); + if(_of->od==NULL)return OP_EFAULT; + _of->od_stream_count=stream_count; + _of->od_coupled_count=coupled_count; + _of->od_channel_count=channel_count; + memcpy(_of->od_mapping,head->mapping,sizeof(*head->mapping)*channel_count); + } + _of->ready_state=OP_INITSET; + _of->bytes_tracked=0; + _of->samples_tracked=0; +#if !defined(OP_FIXED_POINT) + _of->state_channel_count=0; + /*Use the serial number for the PRNG seed to get repeatable output for + straight play-throughs.*/ + _of->dither_seed=_of->links[li].serialno; +#endif + op_update_gain(_of); + return 0; +} + +static int op_open_seekable2_impl(OggOpusFile *_of){ + /*64 seek records should be enough for anybody. + Actually, with a bisection search in a 63-bit range down to OP_CHUNK_SIZE + granularity, much more than enough.*/ + OpusSeekRecord sr[64]; + opus_int64 data_offset; + int ret; + /*We can seek, so set out learning all about this file.*/ + (*_of->callbacks.seek)(_of->source,0,SEEK_END); + _of->offset=_of->end=(*_of->callbacks.tell)(_of->source); + if(OP_UNLIKELY(_of->end<0))return OP_EREAD; + data_offset=_of->links[0].data_offset; + if(OP_UNLIKELY(_of->endend, + _of->links[0].serialno,_of->serialnos,_of->nserialnos); + if(OP_UNLIKELY(ret<0))return ret; + /*If there's any trailing junk, forget about it.*/ + _of->end=sr[0].offset+sr[0].size; + if(OP_UNLIKELY(_of->endserialnos,&_of->nserialnos,&_of->cserialnos); +} + +static int op_open_seekable2(OggOpusFile *_of){ + ogg_sync_state oy_start; + ogg_stream_state os_start; + ogg_packet *op_start; + opus_int64 prev_page_offset; + opus_int64 start_offset; + int start_op_count; + int ret; + /*We're partially open and have a first link header state in storage in _of. + Save off that stream state so we can come back to it. + It would be simpler to just dump all this state and seek back to + links[0].data_offset when we're done. + But we do the extra work to allow us to seek back to _exactly_ the same + stream position we're at now. + This allows, e.g., the HTTP backend to continue reading from the original + connection (if it's still available), instead of opening a new one. 
+ This means we can open and start playing a normal Opus file with a single + link and reasonable packet sizes using only two HTTP requests.*/ + start_op_count=_of->op_count; + /*This is a bit too large to put on the stack unconditionally.*/ + op_start=(ogg_packet *)_ogg_malloc(sizeof(*op_start)*start_op_count); + if(op_start==NULL)return OP_EFAULT; + *&oy_start=_of->oy; + *&os_start=_of->os; + prev_page_offset=_of->prev_page_offset; + start_offset=_of->offset; + memcpy(op_start,_of->op,sizeof(*op_start)*start_op_count); + OP_ASSERT((*_of->callbacks.tell)(_of->source)==op_position(_of)); + ogg_sync_init(&_of->oy); + ogg_stream_init(&_of->os,-1); + ret=op_open_seekable2_impl(_of); + /*Restore the old stream state.*/ + ogg_stream_clear(&_of->os); + ogg_sync_clear(&_of->oy); + *&_of->oy=*&oy_start; + *&_of->os=*&os_start; + _of->offset=start_offset; + _of->op_count=start_op_count; + memcpy(_of->op,op_start,sizeof(*_of->op)*start_op_count); + _ogg_free(op_start); + _of->prev_packet_gp=_of->links[0].pcm_start; + _of->prev_page_offset=prev_page_offset; + _of->cur_discard_count=_of->links[0].head.pre_skip; + if(OP_UNLIKELY(ret<0))return ret; + /*And restore the position indicator.*/ + ret=(*_of->callbacks.seek)(_of->source,op_position(_of),SEEK_SET); + return OP_UNLIKELY(ret<0)?OP_EREAD:0; +} + +/*Clear out the current logical bitstream decoder.*/ +static void op_decode_clear(OggOpusFile *_of){ + /*We don't actually free the decoder. + We might be able to re-use it for the next link.*/ + _of->op_count=0; + _of->od_buffer_size=0; + _of->prev_packet_gp=-1; + _of->prev_page_offset=-1; + if(!_of->seekable){ + OP_ASSERT(_of->ready_state>=OP_INITSET); + opus_tags_clear(&_of->links[0].tags); + } + _of->ready_state=OP_OPENED; +} + +static void op_clear(OggOpusFile *_of){ + OggOpusLink *links; + _ogg_free(_of->od_buffer); + if(_of->od!=NULL)opus_multistream_decoder_destroy(_of->od); + links=_of->links; + if(!_of->seekable){ + if(_of->ready_state>OP_OPENED||_of->ready_state==OP_PARTOPEN){ + opus_tags_clear(&links[0].tags); + } + } + else if(OP_LIKELY(links!=NULL)){ + int nlinks; + int link; + nlinks=_of->nlinks; + for(link=0;linkserialnos); + ogg_stream_clear(&_of->os); + ogg_sync_clear(&_of->oy); + if(_of->callbacks.close!=NULL)(*_of->callbacks.close)(_of->source); +} + +static int op_open1(OggOpusFile *_of, + void *_source,const OpusFileCallbacks *_cb, + const unsigned char *_initial_data,size_t _initial_bytes){ + ogg_page og; + ogg_page *pog; + int seekable; + int ret; + memset(_of,0,sizeof(*_of)); + _of->end=-1; + _of->source=_source; + *&_of->callbacks=*_cb; + /*At a minimum, we need to be able to read data.*/ + if(OP_UNLIKELY(_of->callbacks.read==NULL))return OP_EREAD; + /*Initialize the framing state.*/ + ogg_sync_init(&_of->oy); + /*Perhaps some data was previously read into a buffer for testing against + other stream types. + Allow initialization from this previously read data (especially as we may + be reading from a non-seekable stream). + This requires copying it into a buffer allocated by ogg_sync_buffer() and + doesn't support seeking, so this is not a good mechanism to use for + decoding entire files from RAM.*/ + if(_initial_bytes>0){ + char *buffer; + buffer=ogg_sync_buffer(&_of->oy,_initial_bytes); + memcpy(buffer,_initial_data,_initial_bytes*sizeof(*buffer)); + ogg_sync_wrote(&_of->oy,_initial_bytes); + } + /*Can we seek? 
+ Stevens suggests the seek test is portable.*/ + seekable=_cb->seek!=NULL&&(*_cb->seek)(_source,0,SEEK_CUR)!=-1; + /*If seek is implemented, tell must also be implemented.*/ + if(seekable){ + opus_int64 pos; + if(OP_UNLIKELY(_of->callbacks.tell==NULL))return OP_EINVAL; + pos=(*_of->callbacks.tell)(_of->source); + /*If the current position is not equal to the initial bytes consumed, + absolute seeking will not work.*/ + if(OP_UNLIKELY(pos!=(opus_int64)_initial_bytes))return OP_EINVAL; + } + _of->seekable=seekable; + /*Don't seek yet. + Set up a 'single' (current) logical bitstream entry for partial open.*/ + _of->links=(OggOpusLink *)_ogg_malloc(sizeof(*_of->links)); + /*The serialno gets filled in later by op_fetch_headers().*/ + ogg_stream_init(&_of->os,-1); + pog=NULL; + for(;;){ + /*Fetch all BOS pages, store the Opus header and all seen serial numbers, + and load subsequent Opus setup headers.*/ + ret=op_fetch_headers(_of,&_of->links[0].head,&_of->links[0].tags, + &_of->serialnos,&_of->nserialnos,&_of->cserialnos,pog); + if(OP_UNLIKELY(ret<0))break; + _of->nlinks=1; + _of->links[0].offset=0; + _of->links[0].data_offset=_of->offset; + _of->links[0].pcm_end=-1; + _of->links[0].serialno=_of->os.serialno; + /*Fetch the initial PCM offset.*/ + ret=op_find_initial_pcm_offset(_of,_of->links,&og); + if(seekable||OP_LIKELY(ret<=0))break; + /*This link was empty, but we already have the BOS page for the next one in + og. + We can't seek, so start processing the next link right now.*/ + opus_tags_clear(&_of->links[0].tags); + _of->nlinks=0; + if(!seekable)_of->cur_link++; + pog=&og; + } + if(OP_LIKELY(ret>=0))_of->ready_state=OP_PARTOPEN; + return ret; +} + +static int op_open2(OggOpusFile *_of){ + int ret; + OP_ASSERT(_of->ready_state==OP_PARTOPEN); + if(_of->seekable){ + _of->ready_state=OP_OPENED; + ret=op_open_seekable2(_of); + } + else ret=0; + if(OP_LIKELY(ret>=0)){ + /*We have buffered packets from op_find_initial_pcm_offset(). 
+ Move to OP_INITSET so we can use them.*/ + _of->ready_state=OP_STREAMSET; + ret=op_make_decode_ready(_of); + if(OP_LIKELY(ret>=0))return 0; + } + /*Don't auto-close the stream on failure.*/ + _of->callbacks.close=NULL; + op_clear(_of); + return ret; +} + +OggOpusFile *op_test_callbacks(void *_source,const OpusFileCallbacks *_cb, + const unsigned char *_initial_data,size_t _initial_bytes,int *_error){ + OggOpusFile *of; + int ret; + of=(OggOpusFile *)_ogg_malloc(sizeof(*of)); + ret=OP_EFAULT; + if(OP_LIKELY(of!=NULL)){ + ret=op_open1(of,_source,_cb,_initial_data,_initial_bytes); + if(OP_LIKELY(ret>=0)){ + if(_error!=NULL)*_error=0; + return of; + } + /*Don't auto-close the stream on failure.*/ + of->callbacks.close=NULL; + op_clear(of); + _ogg_free(of); + } + if(_error!=NULL)*_error=ret; + return NULL; +} + +OggOpusFile *op_open_callbacks(void *_source,const OpusFileCallbacks *_cb, + const unsigned char *_initial_data,size_t _initial_bytes,int *_error){ + OggOpusFile *of; + of=op_test_callbacks(_source,_cb,_initial_data,_initial_bytes,_error); + if(OP_LIKELY(of!=NULL)){ + int ret; + ret=op_open2(of); + if(OP_LIKELY(ret>=0))return of; + if(_error!=NULL)*_error=ret; + _ogg_free(of); + } + return NULL; +} + +/*Convenience routine to clean up from failure for the open functions that + create their own streams.*/ +static OggOpusFile *op_open_close_on_failure(void *_source, + const OpusFileCallbacks *_cb,int *_error){ + OggOpusFile *of; + if(OP_UNLIKELY(_source==NULL)){ + if(_error!=NULL)*_error=OP_EFAULT; + return NULL; + } + of=op_open_callbacks(_source,_cb,NULL,0,_error); + if(OP_UNLIKELY(of==NULL))(*_cb->close)(_source); + return of; +} + +OggOpusFile *op_open_file(const char *_path,int *_error){ + OpusFileCallbacks cb; + return op_open_close_on_failure(op_fopen(&cb,_path,"rb"),&cb,_error); +} + +OggOpusFile *op_open_memory(const unsigned char *_data,size_t _size, + int *_error){ + OpusFileCallbacks cb; + return op_open_close_on_failure(op_mem_stream_create(&cb,_data,_size),&cb, + _error); +} + +/*Convenience routine to clean up from failure for the open functions that + create their own streams.*/ +static OggOpusFile *op_test_close_on_failure(void *_source, + const OpusFileCallbacks *_cb,int *_error){ + OggOpusFile *of; + if(OP_UNLIKELY(_source==NULL)){ + if(_error!=NULL)*_error=OP_EFAULT; + return NULL; + } + of=op_test_callbacks(_source,_cb,NULL,0,_error); + if(OP_UNLIKELY(of==NULL))(*_cb->close)(_source); + return of; +} + +OggOpusFile *op_test_file(const char *_path,int *_error){ + OpusFileCallbacks cb; + return op_test_close_on_failure(op_fopen(&cb,_path,"rb"),&cb,_error); +} + +OggOpusFile *op_test_memory(const unsigned char *_data,size_t _size, + int *_error){ + OpusFileCallbacks cb; + return op_test_close_on_failure(op_mem_stream_create(&cb,_data,_size),&cb, + _error); +} + +int op_test_open(OggOpusFile *_of){ + int ret; + if(OP_UNLIKELY(_of->ready_state!=OP_PARTOPEN))return OP_EINVAL; + ret=op_open2(_of); + /*op_open2() will clear this structure on failure. 
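A usage sketch for the open/test API defined above (the file name is hypothetical, and the prototypes are assumed to come from the library's public opusfile.h header):

    #include <stdio.h>
    #include <opusfile.h>

    int main(void){
      int          err;
      OggOpusFile *of;
      of=op_open_file("example.opus",&err); /*Hypothetical input file.*/
      if(of==NULL){
        fprintf(stderr,"open failed: %d\n",err); /*One of the OP_E* codes.*/
        return 1;
      }
      printf("links: %d, seekable: %d\n",op_link_count(of),op_seekable(of));
      op_free(of);
      return 0;
    }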
+  Reset its contents to prevent double-frees in op_free().*/
+  if(OP_UNLIKELY(ret<0))memset(_of,0,sizeof(*_of));
+  return ret;
+}
+
+void op_free(OggOpusFile *_of){
+  if(OP_LIKELY(_of!=NULL)){
+    op_clear(_of);
+    _ogg_free(_of);
+  }
+}
+
+int op_seekable(const OggOpusFile *_of){
+  return _of->seekable;
+}
+
+int op_link_count(const OggOpusFile *_of){
+  return _of->nlinks;
+}
+
+ogg_uint32_t op_serialno(const OggOpusFile *_of,int _li){
+  if(OP_UNLIKELY(_li>=_of->nlinks))_li=_of->nlinks-1;
+  if(!_of->seekable)_li=0;
+  return _of->links[_li<0?_of->cur_link:_li].serialno;
+}
+
+int op_channel_count(const OggOpusFile *_of,int _li){
+  return op_head(_of,_li)->channel_count;
+}
+
+opus_int64 op_raw_total(const OggOpusFile *_of,int _li){
+  if(OP_UNLIKELY(_of->ready_state<OP_OPENED)||OP_UNLIKELY(!_of->seekable)
+   ||OP_UNLIKELY(_li>=_of->nlinks)){
+    return OP_EINVAL;
+  }
+  if(_li<0)return _of->end-_of->links[0].offset;
+  return (_li+1>=_of->nlinks?_of->end:_of->links[_li+1].offset)
+   -_of->links[_li].offset;
+}
+
+ogg_int64_t op_pcm_total(const OggOpusFile *_of,int _li){
+  OggOpusLink *links;
+  ogg_int64_t  diff;
+  int          nlinks;
+  nlinks=_of->nlinks;
+  if(OP_UNLIKELY(_of->ready_state<OP_OPENED)||OP_UNLIKELY(!_of->seekable)
+   ||OP_UNLIKELY(_li>=nlinks)){
+    return OP_EINVAL;
+  }
+  links=_of->links;
+  /*We verify that the granule position differences are larger than the
+     pre-skip and that the total duration does not overflow during link
+     enumeration, so we don't have to check here.*/
+  if(_li<0){
+    ogg_int64_t pcm_total;
+    int         li;
+    pcm_total=0;
+    for(li=0;li<nlinks;li++){
+      OP_ALWAYS_TRUE(!op_granpos_diff(&diff,
+       links[li].pcm_end,links[li].pcm_start));
+      pcm_total+=diff-links[li].head.pre_skip;
+    }
+    return pcm_total;
+  }
+  OP_ALWAYS_TRUE(!op_granpos_diff(&diff,
+   links[_li].pcm_end,links[_li].pcm_start));
+  return diff-links[_li].head.pre_skip;
+}
+
+const OpusHead *op_head(const OggOpusFile *_of,int _li){
+  if(OP_UNLIKELY(_li>=_of->nlinks))_li=_of->nlinks-1;
+  if(!_of->seekable)_li=0;
+  return &_of->links[_li<0?_of->cur_link:_li].head;
+}
+
+const OpusTags *op_tags(const OggOpusFile *_of,int _li){
+  if(OP_UNLIKELY(_li>=_of->nlinks))_li=_of->nlinks-1;
+  if(!_of->seekable){
+    if(_of->ready_state<OP_STREAMSET&&_of->ready_state!=OP_PARTOPEN){
+      return NULL;
+    }
+    _li=0;
+  }
+  else if(_li<0)_li=_of->ready_state>=OP_STREAMSET?_of->cur_link:0;
+  return &_of->links[_li].tags;
+}
+
+int op_current_link(const OggOpusFile *_of){
+  if(OP_UNLIKELY(_of->ready_state<OP_OPENED))return OP_EINVAL;
+  return _of->cur_link;
+}
+
+/*Compute an average bitrate given a byte and sample count.
+  Return: The bitrate in bits per second.*/
+static opus_int32 op_calc_bitrate(opus_int64 _bytes,ogg_int64_t _samples){
+  /*These rates are absurd, but let's handle them anyway.*/
+  if(OP_UNLIKELY(_bytes>(OP_INT64_MAX-(_samples>>1))/(48000*8))){
+    ogg_int64_t den;
+    if(OP_UNLIKELY(_bytes/(OP_INT32_MAX/(48000*8))>=_samples)){
+      return OP_INT32_MAX;
+    }
+    den=_samples/(48000*8);
+    return (opus_int32)((_bytes+(den>>1))/den);
+  }
+  if(OP_UNLIKELY(_samples<=0))return OP_INT32_MAX;
+  /*This can't actually overflow in normal operation: even with a pre-skip of
+     545 2.5 ms frames with 8 streams running at 1282*8+1 bytes per packet
+     (1275 byte frames + Opus framing overhead + Ogg lacing values), that all
+     produce a single sample of decoded output, we still don't top 45 Mbps.
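The arithmetic in op_calc_bitrate() reduces to total bits over total seconds at the format's fixed 48 kHz rate. A small worked example with hypothetical numbers:

    #include <assert.h>
    #include <stdint.h>

    int main(void){
      int64_t bytes=240000;     /*Compressed payload size.*/
      int64_t samples=10*48000; /*Ten seconds of output at 48 kHz.*/
      /*bits/seconds == bytes*8/(samples/48000), rounded to nearest.*/
      int64_t bps=(bytes*8*48000+(samples>>1))/samples;
      assert(bps==192000);      /*240000 bytes over 10 s = 192 kbps.*/
      return 0;
    }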
+ The only way to get bitrates larger than that is with excessive Opus + padding, more encoded streams than output channels, or lots and lots of + Ogg pages with no packets on them.*/ + return (opus_int32)OP_MIN((_bytes*48000*8+(_samples>>1))/_samples, + OP_INT32_MAX); +} + +opus_int32 op_bitrate(const OggOpusFile *_of,int _li){ + if(OP_UNLIKELY(_of->ready_stateseekable) + ||OP_UNLIKELY(_li>=_of->nlinks)){ + return OP_EINVAL; + } + return op_calc_bitrate(op_raw_total(_of,_li),op_pcm_total(_of,_li)); +} + +opus_int32 op_bitrate_instant(OggOpusFile *_of){ + ogg_int64_t samples_tracked; + opus_int32 ret; + if(OP_UNLIKELY(_of->ready_statesamples_tracked; + if(OP_UNLIKELY(samples_tracked==0))return OP_FALSE; + ret=op_calc_bitrate(_of->bytes_tracked,samples_tracked); + _of->bytes_tracked=0; + _of->samples_tracked=0; + return ret; +} + +/*Fetch and process a page. + This handles the case where we're at a bitstream boundary and dumps the + decoding machine. + If the decoding machine is unloaded, it loads it. + It also keeps prev_packet_gp up to date (seek and read both use this). + Return: <0) Error, OP_HOLE (lost packet), or OP_EOF. + 0) Got at least one audio data packet.*/ +static int op_fetch_and_process_page(OggOpusFile *_of, + ogg_page *_og,opus_int64 _page_offset,int _spanp,int _ignore_holes){ + OggOpusLink *links; + ogg_uint32_t cur_serialno; + int seekable; + int cur_link; + int ret; + /*We shouldn't get here if we have unprocessed packets.*/ + OP_ASSERT(_of->ready_stateop_pos>=_of->op_count); + seekable=_of->seekable; + links=_of->links; + cur_link=seekable?_of->cur_link:0; + cur_serialno=links[cur_link].serialno; + /*Handle one page.*/ + for(;;){ + ogg_page og; + OP_ASSERT(_of->ready_state>=OP_OPENED); + /*If we were given a page to use, use it.*/ + if(_og!=NULL){ + *&og=*_og; + _og=NULL; + } + /*Keep reading until we get a page with the correct serialno.*/ + else _page_offset=op_get_next_page(_of,&og,_of->end); + /*EOF: Leave uninitialized.*/ + if(_page_offset<0)return _page_offsetready_state>=OP_STREAMSET) + &&cur_serialno!=(ogg_uint32_t)ogg_page_serialno(&og)){ + /*Two possibilities: + 1) Another stream is multiplexed into this logical section, or*/ + if(OP_LIKELY(!ogg_page_bos(&og)))continue; + /* 2) Our decoding just traversed a bitstream boundary.*/ + if(!_spanp)return OP_EOF; + if(OP_LIKELY(_of->ready_state>=OP_INITSET))op_decode_clear(_of); + } + /*Bitrate tracking: add the header's bytes here. + The body bytes are counted when we consume the packets.*/ + else _of->bytes_tracked+=og.header_len; + /*Do we need to load a new machine before submitting the page? + This is different in the seekable and non-seekable cases. + In the seekable case, we already have all the header information loaded + and cached. + We just initialize the machine with it and continue on our merry way. 
+ In the non-seekable (streaming) case, we'll only be at a boundary if we + just left the previous logical bitstream, and we're now nominally at the + header of the next bitstream.*/ + if(OP_UNLIKELY(_of->ready_statenlinks; + for(li=0;li=nlinks)continue; + cur_serialno=serialno; + _of->cur_link=cur_link=li; + ogg_stream_reset_serialno(&_of->os,serialno); + _of->ready_state=OP_STREAMSET; + /*If we're at the start of this link, initialize the granule position + and pre-skip tracking.*/ + if(_page_offset<=links[cur_link].data_offset){ + _of->prev_packet_gp=links[cur_link].pcm_start; + _of->prev_page_offset=-1; + _of->cur_discard_count=links[cur_link].head.pre_skip; + /*Ignore a hole at the start of a new link (this is common for + streams joined in the middle) or after seeking.*/ + _ignore_holes=1; + } + } + else{ + do{ + /*We're streaming. + Fetch the two header packets, build the info struct.*/ + ret=op_fetch_headers(_of,&links[0].head,&links[0].tags, + NULL,NULL,NULL,&og); + if(OP_UNLIKELY(ret<0))return ret; + /*op_find_initial_pcm_offset() will suppress any initial hole for us, + so no need to set _ignore_holes.*/ + ret=op_find_initial_pcm_offset(_of,links,&og); + if(OP_UNLIKELY(ret<0))return ret; + _of->links[0].serialno=cur_serialno=_of->os.serialno; + _of->cur_link++; + } + /*If the link was empty, keep going, because we already have the + BOS page of the next one in og.*/ + while(OP_UNLIKELY(ret>0)); + /*If we didn't get any packets out of op_find_initial_pcm_offset(), + keep going (this is possible if end-trimming trimmed them all).*/ + if(_of->op_count<=0)continue; + /*Otherwise, we're done. + TODO: This resets bytes_tracked, which misses the header bytes + already processed by op_find_initial_pcm_offset().*/ + ret=op_make_decode_ready(_of); + if(OP_UNLIKELY(ret<0))return ret; + return 0; + } + } + /*The buffered page is the data we want, and we're ready for it. + Add it to the stream state.*/ + if(OP_UNLIKELY(_of->ready_state==OP_STREAMSET)){ + ret=op_make_decode_ready(_of); + if(OP_UNLIKELY(ret<0))return ret; + } + /*Extract all the packets from the current page.*/ + ogg_stream_pagein(&_of->os,&og); + if(OP_LIKELY(_of->ready_state>=OP_INITSET)){ + opus_int32 total_duration; + int durations[255]; + int op_count; + total_duration=op_collect_audio_packets(_of,durations); + if(OP_UNLIKELY(total_duration<0)){ + /*Drain the packets from the page anyway.*/ + total_duration=op_collect_audio_packets(_of,durations); + OP_ASSERT(total_duration>=0); + /*Report holes to the caller.*/ + if(!_ignore_holes)return OP_HOLE; + } + op_count=_of->op_count; + /*If we found at least one audio data packet, compute per-packet granule + positions for them.*/ + if(op_count>0){ + ogg_int64_t diff; + ogg_int64_t prev_packet_gp; + ogg_int64_t cur_packet_gp; + ogg_int64_t cur_page_gp; + int cur_page_eos; + int pi; + cur_page_gp=_of->op[op_count-1].granulepos; + cur_page_eos=_of->op[op_count-1].e_o_s; + prev_packet_gp=_of->prev_packet_gp; + if(OP_UNLIKELY(prev_packet_gp==-1)){ + opus_int32 cur_discard_count; + /*This is the first call after a raw seek. + Try to reconstruct prev_packet_gp from scratch.*/ + OP_ASSERT(seekable); + if(OP_UNLIKELY(cur_page_eos)){ + /*If the first page we hit after our seek was the EOS page, and + we didn't start from data_offset or before, we don't have + enough information to do end-trimming. 
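The interaction between the default 80 ms post-seek discard and the pre-skip region, sketched with hypothetical values (all counts are samples at 48 kHz):

    #include <assert.h>
    #include <stdint.h>

    int main(void){
      int32_t pre_skip=312;  /*A typical encoder delay.*/
      int64_t diff=0;        /*We landed exactly at the start of the link.*/
      int32_t discard=80*48; /*Default pre-roll: 80 ms == 3840 samples.*/
      int32_t lim=pre_skip-80*48>0?pre_skip-80*48:0;
      /*Starting at the beginning of the pre-skip region (or at least 80 ms
         before its end) discards only to the end of the pre-skip.*/
      if(diff>=0&&diff<=lim)discard=pre_skip-(int32_t)diff;
      assert(discard==pre_skip);
      return 0;
    }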
+ Proceed to the next link, rather than risk playing back some + samples that shouldn't have been played.*/ + _of->op_count=0; + continue; + } + /*By default discard 80 ms of data after a seek, unless we seek + into the pre-skip region.*/ + cur_discard_count=80*48; + cur_page_gp=_of->op[op_count-1].granulepos; + /*Try to initialize prev_packet_gp. + If the current page had packets but didn't have a granule + position, or the granule position it had was too small (both + illegal), just use the starting granule position for the link.*/ + prev_packet_gp=links[cur_link].pcm_start; + if(OP_LIKELY(cur_page_gp!=-1)){ + op_granpos_add(&prev_packet_gp,cur_page_gp,-total_duration); + } + if(OP_LIKELY(!op_granpos_diff(&diff, + prev_packet_gp,links[cur_link].pcm_start))){ + opus_int32 pre_skip; + /*If we start at the beginning of the pre-skip region, or we're + at least 80 ms from the end of the pre-skip region, we discard + to the end of the pre-skip region. + Otherwise, we still use the 80 ms default, which will discard + past the end of the pre-skip region.*/ + pre_skip=links[cur_link].head.pre_skip; + if(diff>=0&&diff<=OP_MAX(0,pre_skip-80*48)){ + cur_discard_count=pre_skip-(int)diff; + } + } + _of->cur_discard_count=cur_discard_count; + } + if(OP_UNLIKELY(cur_page_gp==-1)){ + /*This page had completed packets but didn't have a valid granule + position. + This is illegal, but we'll try to handle it by continuing to count + forwards from the previous page.*/ + if(op_granpos_add(&cur_page_gp,prev_packet_gp,total_duration)<0){ + /*The timestamp for this page overflowed.*/ + cur_page_gp=links[cur_link].pcm_end; + } + } + /*If we hit the last page, handle end-trimming.*/ + if(OP_UNLIKELY(cur_page_eos) + &&OP_LIKELY(!op_granpos_diff(&diff,cur_page_gp,prev_packet_gp)) + &&OP_LIKELY(diff0){ + /*If we trimmed the entire packet, stop (the spec says encoders + shouldn't do this, but we support it anyway).*/ + if(OP_UNLIKELY(diff>durations[pi]))break; + cur_packet_gp=cur_page_gp; + /*Move the EOS flag to this packet, if necessary, so we'll trim + the samples during decode.*/ + _of->op[pi].e_o_s=1; + } + else{ + /*Update the granule position as normal.*/ + OP_ALWAYS_TRUE(!op_granpos_add(&cur_packet_gp, + cur_packet_gp,durations[pi])); + } + _of->op[pi].granulepos=cur_packet_gp; + OP_ALWAYS_TRUE(!op_granpos_diff(&diff,cur_page_gp,cur_packet_gp)); + } + } + else{ + /*Propagate timestamps to earlier packets. + op_granpos_add(&prev_packet_gp,prev_packet_gp,total_duration) + should succeed and give prev_packet_gp==cur_page_gp. + But we don't bother to check that, as there isn't much we can do + if it's not true, and it actually will not be true on the first + page after a seek, if there was a continued packet. + The only thing we guarantee is that the start and end granule + positions of the packets are valid, and that they are monotonic + within a page. + They might be completely out of range for this link (we'll check + that elsewhere), or non-monotonic between pages.*/ + if(OP_UNLIKELY(op_granpos_add(&prev_packet_gp, + cur_page_gp,-total_duration)<0)){ + /*The starting timestamp for the first packet on this page + underflowed. 
+ This is illegal, but we ignore it.*/ + prev_packet_gp=0; + } + for(pi=0;pi=0); + OP_ALWAYS_TRUE(!op_granpos_add(&cur_packet_gp, + cur_packet_gp,durations[pi])); + _of->op[pi].granulepos=cur_packet_gp; + } + OP_ASSERT(total_duration==0); + } + _of->prev_packet_gp=prev_packet_gp; + _of->prev_page_offset=_page_offset; + _of->op_count=pi; + /*If end-trimming didn't trim all the packets, we're done.*/ + if(OP_LIKELY(pi>0))return 0; + } + } + } +} + +int op_raw_seek(OggOpusFile *_of,opus_int64 _pos){ + int ret; + if(OP_UNLIKELY(_of->ready_stateseekable))return OP_ENOSEEK; + if(OP_UNLIKELY(_pos<0)||OP_UNLIKELY(_pos>_of->end))return OP_EINVAL; + /*Clear out any buffered, decoded data.*/ + op_decode_clear(_of); + _of->bytes_tracked=0; + _of->samples_tracked=0; + ret=op_seek_helper(_of,_pos); + if(OP_UNLIKELY(ret<0))return OP_EREAD; + ret=op_fetch_and_process_page(_of,NULL,-1,1,1); + /*If we hit EOF, op_fetch_and_process_page() leaves us uninitialized. + Instead, jump to the end.*/ + if(ret==OP_EOF){ + int cur_link; + op_decode_clear(_of); + cur_link=_of->nlinks-1; + _of->cur_link=cur_link; + _of->prev_packet_gp=_of->links[cur_link].pcm_end; + _of->cur_discard_count=0; + ret=0; + } + return ret; +} + +/*Convert a PCM offset relative to the start of the whole stream to a granule + position in an individual link.*/ +static ogg_int64_t op_get_granulepos(const OggOpusFile *_of, + ogg_int64_t _pcm_offset,int *_li){ + const OggOpusLink *links; + ogg_int64_t duration; + int nlinks; + int li; + OP_ASSERT(_pcm_offset>=0); + nlinks=_of->nlinks; + links=_of->links; + for(li=0;OP_LIKELY(liOP_INT64_MAX-_pcm_offset)){ + /*Adding this amount to the granule position would overflow the positive + half of its 64-bit range. + Since signed overflow is undefined in C, do it in a way the compiler + isn't allowed to screw up.*/ + _pcm_offset-=OP_INT64_MAX-pcm_start+1; + pcm_start=OP_INT64_MIN; + } + pcm_start+=_pcm_offset; + *_li=li; + return pcm_start; + } + _pcm_offset-=duration; + } + return -1; +} + +/*A small helper to determine if an Ogg page contains data that continues onto + a subsequent page.*/ +static int op_page_continues(const ogg_page *_og){ + int nlacing; + OP_ASSERT(_og->header_len>=27); + nlacing=_og->header[26]; + OP_ASSERT(_og->header_len>=27+nlacing); + /*This also correctly handles the (unlikely) case of nlacing==0, because + 0!=255.*/ + return _og->header[27+nlacing-1]==255; +} + +/*A small helper to buffer the continued packet data from a page.*/ +static void op_buffer_continued_data(OggOpusFile *_of,ogg_page *_og){ + ogg_packet op; + ogg_stream_pagein(&_of->os,_og); + /*Drain any packets that did end on this page (and ignore holes). + We only care about the continued packet data.*/ + while(ogg_stream_packetout(&_of->os,&op)); +} + +/*This controls how close the target has to be to use the current stream + position to subdivide the initial range. + Two minutes seems to be a good default.*/ +#define OP_CUR_TIME_THRESH (120*48*(opus_int32)1000) + +/*Note: The OP_SMALL_FOOTPRINT #define doesn't (currently) save much code size, + but it's meant to serve as documentation for portions of the seeking + algorithm that are purely optional, to aid others learning from/porting this + code to other contexts.*/ +/*#define OP_SMALL_FOOTPRINT (1)*/ + +/*Search within link _li for the page with the highest granule position + preceding (or equal to) _target_gp. + There is a danger here: missing pages or incorrect frame number information + in the bitstream could make our task impossible. 
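For chained streams, a whole-stream PCM offset must first be re-based into its target link before it can be compared with that link's granule positions; a simplified sketch with hypothetical link parameters (wrap handling omitted):

    #include <assert.h>
    #include <stdint.h>

    int main(void){
      int64_t dur0=480000;       /*Usable samples in link 0, after pre-skip.*/
      int64_t pcm_start1=1000;   /*Starting granule position of link 1.*/
      int64_t pre_skip1=312;     /*Pre-skip of link 1.*/
      int64_t pcm_offset=500000; /*Target offset in the whole stream.*/
      assert(pcm_offset>=dur0);  /*The target lies past link 0...*/
      pcm_offset-=dur0;          /*...so re-base it into link 1.*/
      /*Within a link, granule positions include the pre-skip.*/
      assert(pcm_start1+pre_skip1+pcm_offset==21312);
      return 0;
    }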
+ Account for that (and report it as an error condition).*/ +static int op_pcm_seek_page(OggOpusFile *_of, + ogg_int64_t _target_gp,int _li){ + const OggOpusLink *link; + ogg_page og; + ogg_int64_t pcm_pre_skip; + ogg_int64_t pcm_start; + ogg_int64_t pcm_end; + ogg_int64_t best_gp; + ogg_int64_t diff; + ogg_uint32_t serialno; + opus_int32 pre_skip; + opus_int64 begin; + opus_int64 end; + opus_int64 boundary; + opus_int64 best; + opus_int64 best_start; + opus_int64 page_offset; + opus_int64 d0; + opus_int64 d1; + opus_int64 d2; + int force_bisect; + int buffering; + int ret; + _of->bytes_tracked=0; + _of->samples_tracked=0; + link=_of->links+_li; + best_gp=pcm_start=link->pcm_start; + pcm_end=link->pcm_end; + serialno=link->serialno; + best=best_start=begin=link->data_offset; + page_offset=-1; + buffering=0; + /*We discard the first 80 ms of data after a seek, so seek back that much + farther. + If we can't, simply seek to the beginning of the link.*/ + if(OP_UNLIKELY(op_granpos_add(&_target_gp,_target_gp,-80*48)<0) + ||OP_UNLIKELY(op_granpos_cmp(_target_gp,pcm_start)<0)){ + _target_gp=pcm_start; + } + /*Special case seeking to the start of the link.*/ + pre_skip=link->head.pre_skip; + OP_ALWAYS_TRUE(!op_granpos_add(&pcm_pre_skip,pcm_start,pre_skip)); + if(op_granpos_cmp(_target_gp,pcm_pre_skip)<0)end=boundary=begin; + else{ + end=boundary=link->end_offset; +#if !defined(OP_SMALL_FOOTPRINT) + /*If we were decoding from this link, we can narrow the range a bit.*/ + if(_li==_of->cur_link&&_of->ready_state>=OP_INITSET){ + opus_int64 offset; + int op_count; + op_count=_of->op_count; + /*The only way the offset can be invalid _and_ we can fail the granule + position checks below is if someone changed the contents of the last + page since we read it. + We'd be within our rights to just return OP_EBADLINK in that case, but + we'll simply ignore the current position instead.*/ + offset=_of->offset; + if(op_count>0&&OP_LIKELY(offset<=end)){ + ogg_int64_t gp; + /*Make sure the timestamp is valid. + The granule position might be -1 if we collected the packets from a + page without a granule position after reporting a hole.*/ + gp=_of->op[op_count-1].granulepos; + if(OP_LIKELY(gp!=-1)&&OP_LIKELY(op_granpos_cmp(pcm_start,gp)<0) + &&OP_LIKELY(op_granpos_cmp(pcm_end,gp)>0)){ + OP_ALWAYS_TRUE(!op_granpos_diff(&diff,gp,_target_gp)); + /*We only actually use the current time if either + a) We can cut off at least half the range, or + b) We're seeking sufficiently close to the current position that + it's likely to be informative. + Otherwise it appears using the whole link range to estimate the + first seek location gives better results, on average.*/ + if(diff<0){ + OP_ASSERT(offset>=begin); + if(offset-begin>=end-begin>>1||diff>-OP_CUR_TIME_THRESH){ + best=begin=offset; + best_gp=pcm_start=gp; + /*If we have buffered data from a continued packet, remember the + offset of the previous page's start, so that if we do wind up + having to seek back here later, we can prime the stream with + the continued packet data. + With no continued packet, we remember the end of the page.*/ + best_start=_of->os.body_returned<_of->os.body_fill? + _of->prev_page_offset:best; + /*If there's completed packets and data in the stream state, + prev_page_offset should always be set.*/ + OP_ASSERT(best_start>=0); + /*Buffer any continued packet data starting from here.*/ + buffering=1; + } + } + else{ + ogg_int64_t prev_page_gp; + /*We might get lucky and already have the packet with the target + buffered. + Worth checking. 
+ For very small files (with all of the data in a single page, + generally 1 second or less), we can loop them continuously + without seeking at all.*/ + OP_ALWAYS_TRUE(!op_granpos_add(&prev_page_gp,_of->op[0].granulepos, + -op_get_packet_duration(_of->op[0].packet,_of->op[0].bytes))); + if(op_granpos_cmp(prev_page_gp,_target_gp)<=0){ + /*Don't call op_decode_clear(), because it will dump our + packets.*/ + _of->op_pos=0; + _of->od_buffer_size=0; + _of->prev_packet_gp=prev_page_gp; + /*_of->prev_page_offset already points to the right place.*/ + _of->ready_state=OP_STREAMSET; + return op_make_decode_ready(_of); + } + /*No such luck. + Check if we can cut off at least half the range, though.*/ + if(offset-begin<=end-begin>>1||diffos,serialno); + _of->cur_link=_li; + _of->ready_state=OP_STREAMSET; + /*Initialize the interval size history.*/ + d2=d1=d0=end-begin; + force_bisect=0; + while(begin>1; + d1=d2>>1; + d2=end-begin>>1; + if(force_bisect)bisect=begin+(end-begin>>1); + else{ + ogg_int64_t diff2; + OP_ALWAYS_TRUE(!op_granpos_diff(&diff,_target_gp,pcm_start)); + OP_ALWAYS_TRUE(!op_granpos_diff(&diff2,pcm_end,pcm_start)); + /*Take a (pretty decent) guess.*/ + bisect=begin+op_rescale64(diff,diff2,end-begin)-OP_CHUNK_SIZE; + } + if(bisect-OP_CHUNK_SIZEoffset){ + /*Discard any buffered continued packet data.*/ + if(buffering)ogg_stream_reset(&_of->os); + buffering=0; + page_offset=-1; + ret=op_seek_helper(_of,bisect); + if(OP_UNLIKELY(ret<0))return ret; + } + chunk_size=OP_CHUNK_SIZE; + next_boundary=boundary; + /*Now scan forward and figure out where we landed. + In the ideal case, we will see a page with a granule position at or + before our target, followed by a page with a granule position after our + target (or the end of the search interval). + Then we can just drop out and will have all of the data we need with no + additional seeking. + If we landed too far before, or after, we'll break out and do another + bisection.*/ + while(beginos); + buffering=0; + bisect=OP_MAX(bisect-chunk_size,begin); + ret=op_seek_helper(_of,bisect); + if(OP_UNLIKELY(ret<0))return ret; + /*Bump up the chunk size.*/ + chunk_size=OP_MIN(2*chunk_size,OP_CHUNK_SIZE_MAX); + /*If we did find a page from another stream or without a timestamp, + don't read past it.*/ + boundary=next_boundary; + } + } + else{ + ogg_int64_t gp; + int has_packets; + /*Save the offset of the first page we found after the seek, regardless + of the stream it came from or whether or not it has a timestamp.*/ + next_boundary=OP_MIN(page_offset,next_boundary); + if(serialno!=(ogg_uint32_t)ogg_page_serialno(&og))continue; + has_packets=ogg_page_packets(&og)>0; + /*Force the gp to -1 (as it should be per spec) if no packets end on + this page. + Otherwise we might get confused when we try to pull out a packet + with that timestamp and can't find it.*/ + gp=has_packets?ogg_page_granulepos(&og):-1; + if(gp==-1){ + if(buffering){ + if(OP_LIKELY(!has_packets))ogg_stream_pagein(&_of->os,&og); + else{ + /*If packets did end on this page, but we still didn't have a + valid granule position (in violation of the spec!), stop + buffering continued packet data. + Otherwise we might continue past the packet we actually + wanted.*/ + ogg_stream_reset(&_of->os); + buffering=0; + } + } + continue; + } + if(op_granpos_cmp(gp,_target_gp)<0){ + /*We found a page that ends before our target. 
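The bisection above does not blindly split the interval: it first guesses proportionally from the surrounding granule positions and then backs off one chunk so it tends to land just before the target. A simplified sketch with hypothetical search state (the real code uses op_rescale64() to avoid the overflow this naive product risks):

    #include <assert.h>
    #include <stdint.h>

    #define CHUNK_SIZE (1024*64) /*Stand-in for OP_CHUNK_SIZE.*/

    int main(void){
      int64_t begin=0,end=64000000;      /*Byte range left to search.*/
      int64_t pcm_lo=0,pcm_hi=48000*600; /*Granule range it spans.*/
      int64_t target=48000*150;          /*Seek target, 25% of the way in.*/
      int64_t bisect=
       begin+(target-pcm_lo)*(end-begin)/(pcm_hi-pcm_lo)-CHUNK_SIZE;
      assert(bisect==16000000-CHUNK_SIZE);
      /*Within one chunk of the start we would just scan forward instead.*/
      if(bisect-begin<CHUNK_SIZE)bisect=begin;
      return 0;
    }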
+ Advance to the raw offset of the next page.*/ + begin=_of->offset; + if(OP_UNLIKELY(op_granpos_cmp(pcm_start,gp)>0) + ||OP_UNLIKELY(op_granpos_cmp(pcm_end,gp)<0)){ + /*Don't let pcm_start get out of range! + That could happen with an invalid timestamp.*/ + break; + } + /*Save the byte offset of the end of the page with this granule + position.*/ + best=best_start=begin; + /*Buffer any data from a continued packet, if necessary. + This avoids the need to seek back here if the next timestamp we + encounter while scanning forward lies after our target.*/ + if(buffering)ogg_stream_reset(&_of->os); + if(op_page_continues(&og)){ + op_buffer_continued_data(_of,&og); + /*If we have a continued packet, remember the offset of this + page's start, so that if we do wind up having to seek back here + later, we can prime the stream with the continued packet data. + With no continued packet, we remember the end of the page.*/ + best_start=page_offset; + } + /*Then force buffering on, so that if a packet starts (but does not + end) on the next page, we still avoid the extra seek back.*/ + buffering=1; + best_gp=pcm_start=gp; + OP_ALWAYS_TRUE(!op_granpos_diff(&diff,_target_gp,pcm_start)); + /*If we're more than a second away from our target, break out and + do another bisection.*/ + if(diff>48000)break; + /*Otherwise, keep scanning forward (do NOT use begin+1).*/ + bisect=begin; + } + else{ + /*We found a page that ends after our target.*/ + /*If we scanned the whole interval before we found it, we're done.*/ + if(bisect<=begin+1)end=begin; + else{ + end=bisect; + /*In later iterations, don't read past the first page we found.*/ + boundary=next_boundary; + /*If we're not making much progress shrinking the interval size, + start forcing straight bisection to limit the worst case.*/ + force_bisect=end-begin>d0*2; + /*Don't let pcm_end get out of range! + That could happen with an invalid timestamp.*/ + if(OP_LIKELY(op_granpos_cmp(pcm_end,gp)>0) + &&OP_LIKELY(op_granpos_cmp(pcm_start,gp)<=0)){ + pcm_end=gp; + } + break; + } + } + } + } + } + /*Found our page.*/ + OP_ASSERT(op_granpos_cmp(best_gp,pcm_start)>=0); + /*Seek, if necessary. + If we were buffering data from a continued packet, we should be able to + continue to scan forward to get the rest of the data (even if + page_offset==-1). 
+ Otherwise, we need to seek back to best_start.*/ + if(!buffering){ + if(best_start!=page_offset){ + page_offset=-1; + ret=op_seek_helper(_of,best_start); + if(OP_UNLIKELY(ret<0))return ret; + } + if(best_startend_offset); + if(OP_UNLIKELY(page_offsetprev_packet_gp=best_gp; + _of->prev_page_offset=best_start; + ret=op_fetch_and_process_page(_of,page_offset<0?NULL:&og,page_offset,0,1); + if(OP_UNLIKELY(ret<0))return OP_EBADLINK; + /*Verify result.*/ + if(OP_UNLIKELY(op_granpos_cmp(_of->prev_packet_gp,_target_gp)>0)){ + return OP_EBADLINK; + } + /*Our caller will set cur_discard_count to handle pre-roll.*/ + return 0; +} + +int op_pcm_seek(OggOpusFile *_of,ogg_int64_t _pcm_offset){ + const OggOpusLink *link; + ogg_int64_t pcm_start; + ogg_int64_t target_gp; + ogg_int64_t prev_packet_gp; + ogg_int64_t skip; + ogg_int64_t diff; + int op_count; + int op_pos; + int ret; + int li; + if(OP_UNLIKELY(_of->ready_stateseekable))return OP_ENOSEEK; + if(OP_UNLIKELY(_pcm_offset<0))return OP_EINVAL; + target_gp=op_get_granulepos(_of,_pcm_offset,&li); + if(OP_UNLIKELY(target_gp==-1))return OP_EINVAL; + link=_of->links+li; + pcm_start=link->pcm_start; + OP_ALWAYS_TRUE(!op_granpos_diff(&_pcm_offset,target_gp,pcm_start)); +#if !defined(OP_SMALL_FOOTPRINT) + /*For small (90 ms or less) forward seeks within the same link, just decode + forward. + This also optimizes the case of seeking to the current position.*/ + if(li==_of->cur_link&&_of->ready_state>=OP_INITSET){ + ogg_int64_t gp; + gp=_of->prev_packet_gp; + if(OP_LIKELY(gp!=-1)){ + int nbuffered; + nbuffered=OP_MAX(_of->od_buffer_size-_of->od_buffer_pos,0); + OP_ALWAYS_TRUE(!op_granpos_add(&gp,gp,-nbuffered)); + /*We do _not_ add cur_discard_count to gp. + Otherwise the total amount to discard could grow without bound, and it + would be better just to do a full seek.*/ + if(OP_LIKELY(!op_granpos_diff(&diff,gp,pcm_start))){ + ogg_int64_t discard_count; + discard_count=_pcm_offset-diff; + /*We use a threshold of 90 ms instead of 80, since 80 ms is the + _minimum_ we would have discarded after a full seek. + Assuming 20 ms frames (the default), we'd discard 90 ms on average.*/ + if(discard_count>=0&&OP_UNLIKELY(discard_count<90*48)){ + _of->cur_discard_count=(opus_int32)discard_count; + return 0; + } + } + } + } +#endif + ret=op_pcm_seek_page(_of,target_gp,li); + if(OP_UNLIKELY(ret<0))return ret; + /*Now skip samples until we actually get to our target.*/ + /*Figure out where we should skip to.*/ + if(_pcm_offset<=link->head.pre_skip)skip=0; + else skip=OP_MAX(_pcm_offset-80*48,0); + OP_ASSERT(_pcm_offset-skip>=0); + OP_ASSERT(_pcm_offset-skipop_count; + prev_packet_gp=_of->prev_packet_gp; + for(op_pos=_of->op_pos;op_posop[op_pos].granulepos; + if(OP_LIKELY(!op_granpos_diff(&diff,cur_packet_gp,pcm_start)) + &&diff>skip){ + break; + } + prev_packet_gp=cur_packet_gp; + } + _of->prev_packet_gp=prev_packet_gp; + _of->op_pos=op_pos; + if(op_posskip)return OP_EBADLINK; + OP_ASSERT(_pcm_offset-diffcur_discard_count=(opus_int32)(_pcm_offset-diff); + return 0; +} + +opus_int64 op_raw_tell(const OggOpusFile *_of){ + if(OP_UNLIKELY(_of->ready_stateoffset; +} + +/*Convert a granule position from a given link to a PCM offset relative to the + start of the whole stream. 
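A usage sketch for the seeking API above: Ogg Opus always runs at 48 kHz, so converting a time in seconds into the PCM offset op_pcm_seek() expects is a single multiply (the helper name is hypothetical):

    #include <opusfile.h>

    static int seek_to_seconds(OggOpusFile *of,double seconds){
      ogg_int64_t pcm_offset=(ogg_int64_t)(seconds*48000.0);
      /*Returns 0 on success or a negative OP_E* code on failure.*/
      return op_pcm_seek(of,pcm_offset);
    }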
+
+/*Convert a granule position from a given link to a PCM offset relative to the
+   start of the whole stream.
+  For unseekable sources, this gets reset to 0 at the beginning of each link.*/
+static ogg_int64_t op_get_pcm_offset(const OggOpusFile *_of,
+ ogg_int64_t _gp,int _li){
+  const OggOpusLink *links;
+  ogg_int64_t pcm_offset;
+  ogg_int64_t delta;
+  int li;
+  links=_of->links;
+  pcm_offset=0;
+  OP_ASSERT(_li<_of->nlinks);
+  for(li=0;li<_li;li++){
+    OP_ALWAYS_TRUE(!op_granpos_diff(&delta,
+     links[li].pcm_end,links[li].pcm_start));
+    delta-=links[li].head.pre_skip;
+    pcm_offset+=delta;
+  }
+  OP_ASSERT(_li>=0);
+  if(_of->seekable&&OP_UNLIKELY(op_granpos_cmp(_gp,links[_li].pcm_end)>0)){
+    _gp=links[_li].pcm_end;
+  }
+  if(OP_LIKELY(op_granpos_cmp(_gp,links[_li].pcm_start)>0)){
+    if(OP_UNLIKELY(op_granpos_diff(&delta,_gp,links[_li].pcm_start)<0)){
+      /*This means an unseekable stream claimed to have a page from more than
+         2 billion days after we joined.*/
+      OP_ASSERT(!_of->seekable);
+      return OP_INT64_MAX;
+    }
+    if(delta<links[_li].head.pre_skip)delta=0;
+    else delta-=links[_li].head.pre_skip;
+    pcm_offset+=delta;
+  }
+  OP_ASSERT(pcm_offset>=0);
+  return pcm_offset;
+}
+
+ogg_int64_t op_pcm_tell(const OggOpusFile *_of){
+  ogg_int64_t gp;
+  int nbuffered;
+  int li;
+  if(OP_UNLIKELY(_of->ready_state<OP_OPENED))return OP_EINVAL;
+  gp=_of->prev_packet_gp;
+  if(gp==-1)return 0;
+  nbuffered=OP_MAX(_of->od_buffer_size-_of->od_buffer_pos,0);
+  OP_ALWAYS_TRUE(!op_granpos_add(&gp,gp,-nbuffered));
+  li=_of->seekable?_of->cur_link:0;
+  if(op_granpos_add(&gp,gp,_of->cur_discard_count)<0){
+    gp=_of->links[li].pcm_end;
+  }
+  return op_get_pcm_offset(_of,gp,li);
+}
+
+void op_set_decode_callback(OggOpusFile *_of,
+ op_decode_cb_func _decode_cb,void *_ctx){
+  _of->decode_cb=_decode_cb;
+  _of->decode_cb_ctx=_ctx;
+}
+
+int op_set_gain_offset(OggOpusFile *_of,
+ int _gain_type,opus_int32 _gain_offset_q8){
+  if(_gain_type!=OP_HEADER_GAIN&&_gain_type!=OP_ALBUM_GAIN
+   &&_gain_type!=OP_TRACK_GAIN&&_gain_type!=OP_ABSOLUTE_GAIN){
+    return OP_EINVAL;
+  }
+  _of->gain_type=_gain_type;
+  /*The sum of header gain and track gain lies in the range [-65536,65534].
+    These bounds allow the offset to set the final value to anywhere in the
+     range [-32768,32767], which is what we'll clamp it to before applying.*/
+  _of->gain_offset_q8=OP_CLAMP(-98302,_gain_offset_q8,98303);
+  op_update_gain(_of);
+  return 0;
+}
+
+void op_set_dither_enabled(OggOpusFile *_of,int _enabled){
+#if !defined(OP_FIXED_POINT)
+  _of->dither_disabled=!_enabled;
+  if(!_enabled)_of->dither_mute=65;
+#endif
+}
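The gain offset taken by op_set_gain_offset() above is in Q7.8 fixed point: one unit is 1/256 dB. A minimal sketch of the conversion (editor's illustration, not part of the patch; the -6 dB figure is arbitrary):

#include <stdio.h>

int main(void){
   double gain_db = -6.0;              /* desired extra attenuation */
   int gain_q8 = (int)(gain_db*256);   /* Q7.8: 1/256 dB per unit -> -1536 */
   /* The stored offset is clamped to [-98302,98303] so the summed gain always
      fits the Q7.8 range [-32768,32767] (about +/-128 dB). */
   printf("op_set_gain_offset(of, OP_TRACK_GAIN, %d);\n", gain_q8);
   return 0;
}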
+
+/*Allocate the decoder scratch buffer.
+  This is done lazily, since if the user provides large enough buffers, we'll
+   never need it.*/
+static int op_init_buffer(OggOpusFile *_of){
+  int nchannels_max;
+  if(_of->seekable){
+    const OggOpusLink *links;
+    int nlinks;
+    int li;
+    links=_of->links;
+    nlinks=_of->nlinks;
+    nchannels_max=1;
+    for(li=0;li<nlinks;li++){
+      nchannels_max=OP_MAX(nchannels_max,links[li].head.channel_count);
+    }
+  }
+  else nchannels_max=OP_NCHANNELS_MAX;
+  _of->od_buffer=(op_sample *)_ogg_malloc(
+   sizeof(*_of->od_buffer)*nchannels_max*120*48);
+  if(_of->od_buffer==NULL)return OP_EFAULT;
+  return 0;
+}
+
+/*Decode a single packet into the target buffer.*/
+static int op_decode(OggOpusFile *_of,op_sample *_pcm,
+ const ogg_packet *_op,int _nsamples,int _nchannels){
+  int ret;
+  /*First we try using the application-provided decode callback.*/
+  if(_of->decode_cb!=NULL){
+#if defined(OP_FIXED_POINT)
+    ret=(*_of->decode_cb)(_of->decode_cb_ctx,_of->od,_pcm,_op,
+     _nsamples,_nchannels,OP_DEC_FORMAT_SHORT,_of->cur_link);
+#else
+    ret=(*_of->decode_cb)(_of->decode_cb_ctx,_of->od,_pcm,_op,
+     _nsamples,_nchannels,OP_DEC_FORMAT_FLOAT,_of->cur_link);
+#endif
+  }
+  else ret=OP_DEC_USE_DEFAULT;
+  /*If the application didn't want to handle decoding, do it ourselves.*/
+  if(ret==OP_DEC_USE_DEFAULT){
+#if defined(OP_FIXED_POINT)
+    ret=opus_multistream_decode(_of->od,
+     _op->packet,_op->bytes,_pcm,_nsamples,0);
+#else
+    ret=opus_multistream_decode_float(_of->od,
+     _op->packet,_op->bytes,_pcm,_nsamples,0);
+#endif
+    OP_ASSERT(ret<0||ret==_nsamples);
+  }
+  /*If the application returned a positive value other than 0 or
+     OP_DEC_USE_DEFAULT, fail.*/
+  else if(OP_UNLIKELY(ret>0))return OP_EBADPACKET;
+  if(OP_UNLIKELY(ret<0))return OP_EBADPACKET;
+  return ret;
+}
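op_decode() first offers each packet to an application callback and only falls back to the built-in multistream decoder when the callback returns OP_DEC_USE_DEFAULT. A hypothetical pass-through callback, written against the eight-argument shape of the call above (sketch only, not part of the patch; the header path is an assumption):

/* Assumes opusfile's public header is on the include path. */
#include "opusfile.h"

static int my_decode_cb(void *ctx,OpusMSDecoder *decoder,void *pcm,
 const ogg_packet *op,int nsamples,int nchannels,int format,int li){
  (void)ctx;(void)decoder;(void)pcm;(void)nsamples;(void)nchannels;
  (void)format;(void)li;
  /* A real callback could decode op->packet itself and return 0 on success;
     returning OP_DEC_USE_DEFAULT defers to opus_multistream_decode*(). */
  return OP_DEC_USE_DEFAULT;
}

/* Installed with: op_set_decode_callback(of, my_decode_cb, NULL); */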
+
+/*Read more samples from the stream, using the same API as op_read() or
+   op_read_float().*/
+static int op_read_native(OggOpusFile *_of,
+ op_sample *_pcm,int _buf_size,int *_li){
+  if(OP_UNLIKELY(_of->ready_state<OP_OPENED))return OP_EINVAL;
+  for(;;){
+    int ret;
+    if(OP_LIKELY(_of->ready_state>=OP_INITSET)){
+      int nchannels;
+      int od_buffer_pos;
+      int nsamples;
+      int op_pos;
+      nchannels=_of->links[_of->seekable?_of->cur_link:0].head.channel_count;
+      od_buffer_pos=_of->od_buffer_pos;
+      nsamples=_of->od_buffer_size-od_buffer_pos;
+      /*If we have buffered samples, return them.*/
+      if(nsamples>0){
+        if(nsamples*nchannels>_buf_size)nsamples=_buf_size/nchannels;
+        memcpy(_pcm,_of->od_buffer+nchannels*od_buffer_pos,
+         sizeof(*_pcm)*nchannels*nsamples);
+        od_buffer_pos+=nsamples;
+        _of->od_buffer_pos=od_buffer_pos;
+        if(_li!=NULL)*_li=_of->cur_link;
+        return nsamples;
+      }
+      /*If we have buffered packets, decode one.*/
+      op_pos=_of->op_pos;
+      if(OP_LIKELY(op_pos<_of->op_count)){
+        const ogg_packet *pop;
+        ogg_int64_t diff;
+        opus_int32 cur_discard_count;
+        int duration;
+        int trimmed_duration;
+        pop=_of->op+op_pos++;
+        _of->op_pos=op_pos;
+        cur_discard_count=_of->cur_discard_count;
+        duration=op_get_packet_duration(pop->packet,pop->bytes);
+        /*We don't buffer packets with an invalid TOC sequence.*/
+        OP_ASSERT(duration>0);
+        trimmed_duration=duration;
+        /*Perform end-trimming.*/
+        if(OP_UNLIKELY(pop->e_o_s)){
+          if(OP_UNLIKELY(op_granpos_cmp(pop->granulepos,
+           _of->prev_packet_gp)<=0)){
+            trimmed_duration=0;
+          }
+          else if(OP_LIKELY(!op_granpos_diff(&diff,
+           pop->granulepos,_of->prev_packet_gp))){
+            trimmed_duration=(int)OP_MIN(diff,trimmed_duration);
+          }
+        }
+        _of->prev_packet_gp=pop->granulepos;
+        if(OP_UNLIKELY(duration*nchannels>_buf_size)){
+          op_sample *buf;
+          /*If the user's buffer is too small, decode into a scratch buffer.*/
+          buf=_of->od_buffer;
+          if(OP_UNLIKELY(buf==NULL)){
+            ret=op_init_buffer(_of);
+            if(OP_UNLIKELY(ret<0))return ret;
+            buf=_of->od_buffer;
+          }
+          ret=op_decode(_of,buf,pop,duration,nchannels);
+          if(OP_UNLIKELY(ret<0))return ret;
+          /*Perform pre-skip/pre-roll.*/
+          od_buffer_pos=(int)OP_MIN(trimmed_duration,cur_discard_count);
+          cur_discard_count-=od_buffer_pos;
+          _of->cur_discard_count=cur_discard_count;
+          _of->od_buffer_pos=od_buffer_pos;
+          _of->od_buffer_size=trimmed_duration;
+          /*Update bitrate tracking based on the actual samples we used from
+             what was decoded.*/
+          _of->bytes_tracked+=pop->bytes;
+          _of->samples_tracked+=trimmed_duration-od_buffer_pos;
+        }
+        else{
+          /*Otherwise decode directly into the user's buffer.*/
+          ret=op_decode(_of,_pcm,pop,duration,nchannels);
+          if(OP_UNLIKELY(ret<0))return ret;
+          if(OP_LIKELY(trimmed_duration>0)){
+            /*Perform pre-skip/pre-roll.*/
+            od_buffer_pos=(int)OP_MIN(trimmed_duration,cur_discard_count);
+            cur_discard_count-=od_buffer_pos;
+            _of->cur_discard_count=cur_discard_count;
+            trimmed_duration-=od_buffer_pos;
+            if(OP_LIKELY(trimmed_duration>0)
+             &&OP_UNLIKELY(od_buffer_pos>0)){
+              memmove(_pcm,_pcm+od_buffer_pos*nchannels,
+               sizeof(*_pcm)*trimmed_duration*nchannels);
+            }
+            /*Update bitrate tracking based on the actual samples we used from
+               what was decoded.*/
+            _of->bytes_tracked+=pop->bytes;
+            _of->samples_tracked+=trimmed_duration;
+            if(OP_LIKELY(trimmed_duration>0)){
+              if(_li!=NULL)*_li=_of->cur_link;
+              return trimmed_duration;
+            }
+          }
+        }
+        /*Don't grab another page yet.
+          This one might have more packets, or might have buffered data now.*/
+        continue;
+      }
+    }
+    /*Suck in another page.*/
+    ret=op_fetch_and_process_page(_of,NULL,-1,1,0);
+    if(OP_UNLIKELY(ret==OP_EOF)){
+      if(_li!=NULL)*_li=_of->cur_link;
+      return 0;
+    }
+    if(OP_UNLIKELY(ret<0))return ret;
+  }
+}
+
+/*A generic filter to apply to the decoded audio data.
+  _src is non-const because we will destructively modify the contents of the
+   source buffer that we consume in some cases.*/
+typedef int (*op_read_filter_func)(OggOpusFile *_of,void *_dst,int _dst_sz,
+ op_sample *_src,int _nsamples,int _nchannels);
+
+/*Decode some samples and then apply a custom filter to them.
+  This is used to convert to different output formats.*/
+static int op_filter_read_native(OggOpusFile *_of,void *_dst,int _dst_sz,
+ op_read_filter_func _filter,int *_li){
+  int ret;
+  /*Ensure we have some decoded samples in our buffer.*/
+  ret=op_read_native(_of,NULL,0,_li);
+  /*Now apply the filter to them.*/
+  if(OP_LIKELY(ret>=0)&&OP_LIKELY(_of->ready_state>=OP_INITSET)){
+    int od_buffer_pos;
+    od_buffer_pos=_of->od_buffer_pos;
+    ret=_of->od_buffer_size-od_buffer_pos;
+    if(OP_LIKELY(ret>0)){
+      int nchannels;
+      nchannels=_of->links[_of->seekable?_of->cur_link:0].head.channel_count;
+      ret=(*_filter)(_of,_dst,_dst_sz,
+       _of->od_buffer+nchannels*od_buffer_pos,ret,nchannels);
+      OP_ASSERT(ret>=0);
+      OP_ASSERT(ret<=_of->od_buffer_size-od_buffer_pos);
+      od_buffer_pos+=ret;
+      _of->od_buffer_pos=od_buffer_pos;
+    }
+  }
+  return ret;
+}
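op_filter_read_native() relies on a simple contract: a filter consumes up to _nsamples decoded samples, writes at most _dst_sz output values, and returns the number of input samples it consumed so od_buffer_pos can advance by that amount. A trivial filter satisfying that contract (editor's sketch; op_peak_filter-style names are hypothetical and this is not part of the patch):

/* op_sample is opusfile's internal sample type (opus_int16 in fixed-point
   builds, float otherwise); op_copy_filter is illustrative. */
static int op_copy_filter(OggOpusFile *_of,void *_dst,int _dst_sz,
 op_sample *_src,int _nsamples,int _nchannels){
  op_sample *dst;
  int i;
  (void)_of;
  if(_nsamples*_nchannels>_dst_sz)_nsamples=_dst_sz/_nchannels;
  dst=(op_sample *)_dst;
  for(i=0;i<_nsamples*_nchannels;i++)dst[i]=_src[i];  /* plain copy */
  /*Return value: input samples consumed, so od_buffer_pos advances by it.*/
  return _nsamples;
}

All of the public op_read*() variants below are built by pairing op_read_native() with one such conversion filter.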
+
+#if !defined(OP_FIXED_POINT)||!defined(OP_DISABLE_FLOAT_API)
+
+/*Matrices for downmixing from the supported channel counts to stereo.
+  The matrices with 5 or more channels are normalized to a total volume of 2.0,
+   since most mixes sound too quiet if normalized to 1.0 (as there is generally
+   little volume in the side/rear channels).*/
+static const float OP_STEREO_DOWNMIX[OP_NCHANNELS_MAX-2][OP_NCHANNELS_MAX][2]={
+  /*3.0*/
+  {
+    {0.5858F,0.0F},{0.4142F,0.4142F},{0.0F,0.5858F}
+  },
+  /*quadrophonic*/
+  {
+    {0.4226F,0.0F},{0.0F,0.4226F},{0.366F,0.2114F},{0.2114F,0.366F}
+  },
+  /*5.0*/
+  {
+    {0.651F,0.0F},{0.46F,0.46F},{0.0F,0.651F},{0.5636F,0.3254F},
+    {0.3254F,0.5636F}
+  },
+  /*5.1*/
+  {
+    {0.529F,0.0F},{0.3741F,0.3741F},{0.0F,0.529F},{0.4582F,0.2645F},
+    {0.2645F,0.4582F},{0.3741F,0.3741F}
+  },
+  /*6.1*/
+  {
+    {0.4553F,0.0F},{0.322F,0.322F},{0.0F,0.4553F},{0.3943F,0.2277F},
+    {0.2277F,0.3943F},{0.2788F,0.2788F},{0.322F,0.322F}
+  },
+  /*7.1*/
+  {
+    {0.3886F,0.0F},{0.2748F,0.2748F},{0.0F,0.3886F},{0.3366F,0.1943F},
+    {0.1943F,0.3366F},{0.3366F,0.1943F},{0.1943F,0.3366F},{0.2748F,0.2748F}
+  }
+};
+
+#endif
+
+#if defined(OP_FIXED_POINT)
+
+/*Matrices for downmixing from the supported channel counts to stereo.
+  The matrices with 5 or more channels are normalized to a total volume of 2.0,
+   since most mixes sound too quiet if normalized to 1.0 (as there is generally
+   little volume in the side/rear channels).
+  Hence we keep the coefficients in Q14, so the downmix values won't overflow a
+   32-bit number.*/
+static const opus_int16 OP_STEREO_DOWNMIX_Q14
+ [OP_NCHANNELS_MAX-2][OP_NCHANNELS_MAX][2]={
+  /*3.0*/
+  {
+    {9598,0},{6786,6786},{0,9598}
+  },
+  /*quadrophonic*/
+  {
+    {6924,0},{0,6924},{5996,3464},{3464,5996}
+  },
+  /*5.0*/
+  {
+    {10666,0},{7537,7537},{0,10666},{9234,5331},{5331,9234}
+  },
+  /*5.1*/
+  {
+    {8668,0},{6129,6129},{0,8668},{7507,4335},{4335,7507},{6129,6129}
+  },
+  /*6.1*/
+  {
+    {7459,0},{5275,5275},{0,7459},{6460,3731},{3731,6460},{4568,4568},
+    {5275,5275}
+  },
+  /*7.1*/
+  {
+    {6368,0},{4502,4502},{0,6368},{5515,3183},{3183,5515},{5515,3183},
+    {3183,5515},{4502,4502}
+  }
+};
+
+int op_read(OggOpusFile *_of,opus_int16 *_pcm,int _buf_size,int *_li){
+  return op_read_native(_of,_pcm,_buf_size,_li);
+}
+
+static int op_stereo_filter(OggOpusFile *_of,void *_dst,int _dst_sz,
+ op_sample *_src,int _nsamples,int _nchannels){
+  (void)_of;
+  _nsamples=OP_MIN(_nsamples,_dst_sz>>1);
+  if(_nchannels==2)memcpy(_dst,_src,_nsamples*2*sizeof(*_src));
+  else{
+    opus_int16 *dst;
+    int i;
+    dst=(opus_int16 *)_dst;
+    if(_nchannels==1){
+      for(i=0;i<_nsamples;i++)dst[2*i+0]=dst[2*i+1]=_src[i];
+    }
+    else{
+      for(i=0;i<_nsamples;i++){
+        opus_int32 l;
+        opus_int32 r;
+        int ci;
+        l=r=0;
+        for(ci=0;ci<_nchannels;ci++){
+          opus_int32 s;
+          s=_src[_nchannels*i+ci];
+          l+=OP_STEREO_DOWNMIX_Q14[_nchannels-3][ci][0]*s;
+          r+=OP_STEREO_DOWNMIX_Q14[_nchannels-3][ci][1]*s;
+        }
+        /*TODO: For 5 or more channels, we should do soft clipping here.*/
+        dst[2*i+0]=(opus_int16)OP_CLAMP(-32768,l+8192>>14,32767);
+        dst[2*i+1]=(opus_int16)OP_CLAMP(-32768,r+8192>>14,32767);
+      }
+    }
+  }
+  return _nsamples;
+}
+
+int op_read_stereo(OggOpusFile *_of,opus_int16 *_pcm,int _buf_size){
+  return op_filter_read_native(_of,_pcm,_buf_size,op_stereo_filter,NULL);
+}
+
+# if !defined(OP_DISABLE_FLOAT_API)
+
+static int op_short2float_filter(OggOpusFile *_of,void *_dst,int _dst_sz,
+ op_sample *_src,int _nsamples,int _nchannels){
+  float *dst;
+  int i;
+  (void)_of;
+  dst=(float *)_dst;
+  if(OP_UNLIKELY(_nsamples*_nchannels>_dst_sz))_nsamples=_dst_sz/_nchannels;
+  _dst_sz=_nsamples*_nchannels;
+  for(i=0;i<_dst_sz;i++)dst[i]=(1.0F/32768)*_src[i];
+  return _nsamples;
+}
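The Q14 tables encode the same mixes as the float tables; note, for instance, that the 3.0 row's left-channel weights sum to 9598+6786=16384, exactly 1.0 in Q14. A standalone sketch (editor's illustration, not part of the patch) of the rounding arithmetic op_stereo_filter() applies:

#include <stdio.h>

int main(void){
   /* 3.0 (left, center, right) row of OP_STEREO_DOWNMIX_Q14 above. */
   static const short w[3][2] = {{9598,0},{6786,6786},{0,9598}};
   short src[3] = {0, 32767, 0};   /* full-scale center channel only */
   long l = 0, r = 0;
   int ci;
   for (ci = 0; ci < 3; ci++){
      l += w[ci][0]*(long)src[ci];
      r += w[ci][1]*(long)src[ci];
   }
   /* Same rounding as op_stereo_filter(): add half a Q14 step, shift down. */
   printf("L=%ld R=%ld\n", (l+8192)>>14, (r+8192)>>14);   /* L=13571 R=13571 */
   return 0;
}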
+
+int op_read_float(OggOpusFile *_of,float *_pcm,int _buf_size,int *_li){
+  return op_filter_read_native(_of,_pcm,_buf_size,op_short2float_filter,_li);
+}
+
+static int op_short2float_stereo_filter(OggOpusFile *_of,
+ void *_dst,int _dst_sz,op_sample *_src,int _nsamples,int _nchannels){
+  float *dst;
+  int i;
+  dst=(float *)_dst;
+  _nsamples=OP_MIN(_nsamples,_dst_sz>>1);
+  if(_nchannels==1){
+    _nsamples=op_short2float_filter(_of,dst,_nsamples,_src,_nsamples,1);
+    for(i=_nsamples;i-->0;)dst[2*i+0]=dst[2*i+1]=dst[i];
+  }
+  else if(_nchannels<5){
+    /*For 3 or 4 channels, we can downmix in fixed point without risk of
+       clipping.*/
+    if(_nchannels>2){
+      _nsamples=op_stereo_filter(_of,_src,_nsamples*2,
+       _src,_nsamples,_nchannels);
+    }
+    return op_short2float_filter(_of,dst,_dst_sz,_src,_nsamples,2);
+  }
+  else{
+    /*For 5 or more channels, we convert to floats and then downmix (so that we
+       don't risk clipping).*/
+    for(i=0;i<_nsamples;i++){
+      float l;
+      float r;
+      int ci;
+      l=r=0;
+      for(ci=0;ci<_nchannels;ci++){
+        float s;
+        s=(1.0F/32768)*_src[_nchannels*i+ci];
+        l+=OP_STEREO_DOWNMIX[_nchannels-3][ci][0]*s;
+        r+=OP_STEREO_DOWNMIX[_nchannels-3][ci][1]*s;
+      }
+      dst[2*i+0]=l;
+      dst[2*i+1]=r;
+    }
+  }
+  return _nsamples;
+}
+
+int op_read_float_stereo(OggOpusFile *_of,float *_pcm,int _buf_size){
+  return op_filter_read_native(_of,_pcm,_buf_size,
+   op_short2float_stereo_filter,NULL);
+}
+
+# endif
+
+#else
+
+# if defined(OP_HAVE_LRINTF)
+# include <math.h>
+# define op_float2int(_x) (lrintf(_x))
+# else
+# define op_float2int(_x) ((int)((_x)+((_x)<0?-0.5F:0.5F)))
+# endif
+
+/*The dithering code here is adapted from opusdec, part of opus-tools.
+  It was originally written by Greg Maxwell.*/
+
+static opus_uint32 op_rand(opus_uint32 _seed){
+  return _seed*96314165+907633515&0xFFFFFFFFU;
+}
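op_rand() is a plain 32-bit linear congruential generator. The quantizer below draws two uniform variates from it and subtracts one from the other, which yields triangular-PDF (TPDF) dither of roughly one quantization step peak-to-peak. A self-contained sketch of that construction (editor's illustration, not part of the patch):

#include <stdio.h>

/* Same multiply-add constants as op_rand() above. */
static unsigned int demo_rand(unsigned int seed){
   return (seed*96314165u + 907633515u) & 0xFFFFFFFFu;
}

int main(void){
   unsigned int seed = 42;
   float r;
   seed = demo_rand(seed);
   r  = seed*(1.0F/0xFFFFFFFFu);   /* uniform in [0,1] */
   seed = demo_rand(seed);
   r -= seed*(1.0F/0xFFFFFFFFu);   /* minus a second uniform: triangular PDF */
   /* r is TPDF dither in (-1,1) quantization steps, added before rounding. */
   printf("dither sample: %f\n", r);
   return 0;
}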
+
+/*This implements 16-bit quantization with full triangular dither and IIR noise
+   shaping.
+  The noise shaping filters were designed by Sebastian Gesemann, and are based
+   on the LAME ATH curves with flattening to limit their peak gain to 20 dB.
+  Everyone else's noise shaping filters are mildly crazy.
+  The 48 kHz version of this filter is just a warped version of the 44.1 kHz
+   filter and probably could be improved by shifting the HF shelf up in
+   frequency a little bit, since 48 kHz has a bit more room and being more
+   conservative against bat-ears is probably more important than more noise
+   suppression.
+  This process can increase the peak level of the signal (in theory by the peak
+   error of 1.5 +20 dB, though that is unobservably rare).
+  To avoid clipping, the signal is attenuated by a couple thousandths of a dB.
+  Initially, the approach taken here was to only attenuate by the 99.9th
+   percentile, making clipping rare but not impossible (like SoX), but the
+   limited gain of the filter means that the worst case was only two
+   thousandths of a dB more, so this just uses the worst case.
+  The attenuation is probably also helpful to prevent clipping in the DAC
+   reconstruction filters or downstream resampling, in any case.*/
+
+# define OP_GAIN (32753.0F)
+
+# define OP_PRNG_GAIN (1.0F/0xFFFFFFFF)
+
+/*48 kHz noise shaping filter, sd=2.34.*/
+
+static const float OP_FCOEF_B[4]={
+  2.2374F,-0.7339F,-0.1251F,-0.6033F
+};
+
+static const float OP_FCOEF_A[4]={
+  0.9030F,0.0116F,-0.5853F,-0.2571F
+};
+
+static int op_float2short_filter(OggOpusFile *_of,void *_dst,int _dst_sz,
+ float *_src,int _nsamples,int _nchannels){
+  opus_int16 *dst;
+  int ci;
+  int i;
+  dst=(opus_int16 *)_dst;
+  if(OP_UNLIKELY(_nsamples*_nchannels>_dst_sz))_nsamples=_dst_sz/_nchannels;
+# if defined(OP_SOFT_CLIP)
+  if(_of->state_channel_count!=_nchannels){
+    for(ci=0;ci<_nchannels;ci++)_of->clip_state[ci]=0;
+  }
+  opus_pcm_soft_clip(_src,_nsamples,_nchannels,_of->clip_state);
+# endif
+  if(_of->dither_disabled){
+    for(i=0;i<_nchannels*_nsamples;i++){
+      dst[i]=op_float2int(OP_CLAMP(-32768,32768.0F*_src[i],32767));
+    }
+  }
+  else{
+    opus_uint32 seed;
+    int mute;
+    seed=_of->dither_seed;
+    mute=_of->dither_mute;
+    if(_of->state_channel_count!=_nchannels)mute=65;
+    /*In order to avoid replacing digital silence with quiet dither noise, we
+       mute if the output has been silent for a while.*/
+    if(mute>64)memset(_of->dither_a,0,sizeof(*_of->dither_a)*4*_nchannels);
+    for(i=0;i<_nsamples;i++){
+      int silent;
+      silent=1;
+      for(ci=0;ci<_nchannels;ci++){
+        float r;
+        float s;
+        float err;
+        int si;
+        int j;
+        s=_src[_nchannels*i+ci];
+        silent&=s==0;
+        s*=OP_GAIN;
+        err=0;
+        for(j=0;j<4;j++){
+          err+=OP_FCOEF_B[j]*_of->dither_b[ci*4+j]
+           -OP_FCOEF_A[j]*_of->dither_a[ci*4+j];
+        }
+        for(j=3;j-->0;)_of->dither_a[ci*4+j+1]=_of->dither_a[ci*4+j];
+        for(j=3;j-->0;)_of->dither_b[ci*4+j+1]=_of->dither_b[ci*4+j];
+        _of->dither_a[ci*4]=err;
+        s-=err;
+        if(mute>16)r=0;
+        else{
+          seed=op_rand(seed);
+          r=seed*OP_PRNG_GAIN;
+          seed=op_rand(seed);
+          r-=seed*OP_PRNG_GAIN;
+        }
+        /*Clamp in float out of paranoia that the input will be > 96 dBFS and
+           wrap if the integer is clamped.*/
+        si=op_float2int(OP_CLAMP(-32768,s+r,32767));
+        dst[_nchannels*i+ci]=(opus_int16)si;
+        /*Including clipping in the noise shaping is generally disastrous: the
+           futile effort to restore the clipped energy results in more clipping.
+ However, small amounts---at the level which could normally be created + by dither and rounding---are harmless and can even reduce clipping + somewhat due to the clipping sometimes reducing the dither + rounding + error.*/ + _of->dither_b[ci*4]=mute>16?0:OP_CLAMP(-1.5F,si-s,1.5F); + } + mute++; + if(!silent)mute=0; + } + _of->dither_mute=OP_MIN(mute,65); + _of->dither_seed=seed; + } + _of->state_channel_count=_nchannels; + return _nsamples; +} + +int op_read(OggOpusFile *_of,opus_int16 *_pcm,int _buf_size,int *_li){ + return op_filter_read_native(_of,_pcm,_buf_size,op_float2short_filter,_li); +} + +int op_read_float(OggOpusFile *_of,float *_pcm,int _buf_size,int *_li){ + _of->state_channel_count=0; + return op_read_native(_of,_pcm,_buf_size,_li); +} + +static int op_stereo_filter(OggOpusFile *_of,void *_dst,int _dst_sz, + op_sample *_src,int _nsamples,int _nchannels){ + (void)_of; + _nsamples=OP_MIN(_nsamples,_dst_sz>>1); + if(_nchannels==2)memcpy(_dst,_src,_nsamples*2*sizeof(*_src)); + else{ + float *dst; + int i; + dst=(float *)_dst; + if(_nchannels==1){ + for(i=0;i<_nsamples;i++)dst[2*i+0]=dst[2*i+1]=_src[i]; + } + else{ + for(i=0;i<_nsamples;i++){ + float l; + float r; + int ci; + l=r=0; + for(ci=0;ci<_nchannels;ci++){ + l+=OP_STEREO_DOWNMIX[_nchannels-3][ci][0]*_src[_nchannels*i+ci]; + r+=OP_STEREO_DOWNMIX[_nchannels-3][ci][1]*_src[_nchannels*i+ci]; + } + dst[2*i+0]=l; + dst[2*i+1]=r; + } + } + } + return _nsamples; +} + +static int op_float2short_stereo_filter(OggOpusFile *_of, + void *_dst,int _dst_sz,op_sample *_src,int _nsamples,int _nchannels){ + opus_int16 *dst; + dst=(opus_int16 *)_dst; + if(_nchannels==1){ + int i; + _nsamples=op_float2short_filter(_of,dst,_dst_sz>>1,_src,_nsamples,1); + for(i=_nsamples;i-->0;)dst[2*i+0]=dst[2*i+1]=dst[i]; + } + else{ + if(_nchannels>2){ + _nsamples=OP_MIN(_nsamples,_dst_sz>>1); + _nsamples=op_stereo_filter(_of,_src,_nsamples*2, + _src,_nsamples,_nchannels); + } + _nsamples=op_float2short_filter(_of,dst,_dst_sz,_src,_nsamples,2); + } + return _nsamples; +} + +int op_read_stereo(OggOpusFile *_of,opus_int16 *_pcm,int _buf_size){ + return op_filter_read_native(_of,_pcm,_buf_size, + op_float2short_stereo_filter,NULL); +} + +int op_read_float_stereo(OggOpusFile *_of,float *_pcm,int _buf_size){ + _of->state_channel_count=0; + return op_filter_read_native(_of,_pcm,_buf_size,op_stereo_filter,NULL); +} + +#endif diff --git a/thirdparty/opus/repacketizer.c b/thirdparty/opus/repacketizer.c new file mode 100644 index 000000000000..c80ee7f00148 --- /dev/null +++ b/thirdparty/opus/repacketizer.c @@ -0,0 +1,348 @@ +/* Copyright (c) 2011 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opus.h" +#include "opus_private.h" +#include "os_support.h" + + +int opus_repacketizer_get_size(void) +{ + return sizeof(OpusRepacketizer); +} + +OpusRepacketizer *opus_repacketizer_init(OpusRepacketizer *rp) +{ + rp->nb_frames = 0; + return rp; +} + +OpusRepacketizer *opus_repacketizer_create(void) +{ + OpusRepacketizer *rp; + rp=(OpusRepacketizer *)opus_alloc(opus_repacketizer_get_size()); + if(rp==NULL)return NULL; + return opus_repacketizer_init(rp); +} + +void opus_repacketizer_destroy(OpusRepacketizer *rp) +{ + opus_free(rp); +} + +static int opus_repacketizer_cat_impl(OpusRepacketizer *rp, const unsigned char *data, opus_int32 len, int self_delimited) +{ + unsigned char tmp_toc; + int curr_nb_frames,ret; + /* Set of check ToC */ + if (len<1) return OPUS_INVALID_PACKET; + if (rp->nb_frames == 0) + { + rp->toc = data[0]; + rp->framesize = opus_packet_get_samples_per_frame(data, 8000); + } else if ((rp->toc&0xFC) != (data[0]&0xFC)) + { + /*fprintf(stderr, "toc mismatch: 0x%x vs 0x%x\n", rp->toc, data[0]);*/ + return OPUS_INVALID_PACKET; + } + curr_nb_frames = opus_packet_get_nb_frames(data, len); + if(curr_nb_frames<1) return OPUS_INVALID_PACKET; + + /* Check the 120 ms maximum packet size */ + if ((curr_nb_frames+rp->nb_frames)*rp->framesize > 960) + { + return OPUS_INVALID_PACKET; + } + + ret=opus_packet_parse_impl(data, len, self_delimited, &tmp_toc, &rp->frames[rp->nb_frames], &rp->len[rp->nb_frames], NULL, NULL); + if(ret<1)return ret; + + rp->nb_frames += curr_nb_frames; + return OPUS_OK; +} + +int opus_repacketizer_cat(OpusRepacketizer *rp, const unsigned char *data, opus_int32 len) +{ + return opus_repacketizer_cat_impl(rp, data, len, 0); +} + +int opus_repacketizer_get_nb_frames(OpusRepacketizer *rp) +{ + return rp->nb_frames; +} + +opus_int32 opus_repacketizer_out_range_impl(OpusRepacketizer *rp, int begin, int end, + unsigned char *data, opus_int32 maxlen, int self_delimited, int pad) +{ + int i, count; + opus_int32 tot_size; + opus_int16 *len; + const unsigned char **frames; + unsigned char * ptr; + + if (begin<0 || begin>=end || end>rp->nb_frames) + { + /*fprintf(stderr, "%d %d %d\n", begin, end, rp->nb_frames);*/ + return OPUS_BAD_ARG; + } + count = end-begin; + + len = rp->len+begin; + frames = rp->frames+begin; + if (self_delimited) + tot_size = 1 + (len[count-1]>=252); + else + tot_size = 0; + + ptr = data; + if (count==1) + { + /* Code 0 */ + tot_size += len[0]+1; + if (tot_size > maxlen) + return OPUS_BUFFER_TOO_SMALL; + *ptr++ = rp->toc&0xFC; + } else if (count==2) + { + if (len[1] == len[0]) + { + /* Code 1 */ + tot_size += 2*len[0]+1; + if (tot_size > maxlen) + return OPUS_BUFFER_TOO_SMALL; + *ptr++ = (rp->toc&0xFC) | 0x1; + } else { + /* Code 2 */ + tot_size += len[0]+len[1]+2+(len[0]>=252); + if (tot_size > maxlen) + return OPUS_BUFFER_TOO_SMALL; + *ptr++ = (rp->toc&0xFC) | 0x2; + ptr += encode_size(len[0], ptr); + } + } + if (count > 2 || (pad && tot_size < maxlen)) + { + /* 
Code 3 */
+      int vbr;
+      int pad_amount=0;
+
+      /* Restart the process for the padding case */
+      ptr = data;
+      if (self_delimited)
+         tot_size = 1 + (len[count-1]>=252);
+      else
+         tot_size = 0;
+      vbr = 0;
+      for (i=1;i<count;i++)
+      {
+         if (len[i] != len[0])
+         {
+            vbr=1;
+            break;
+         }
+      }
+      if (vbr)
+      {
+         tot_size += 2;
+         for (i=0;i<count-1;i++)
+            tot_size += 1 + (len[i]>=252) + len[i];
+         tot_size += len[count-1];
+
+         if (tot_size > maxlen)
+            return OPUS_BUFFER_TOO_SMALL;
+         *ptr++ = (rp->toc&0xFC) | 0x3;
+         *ptr++ = count | 0x80;
+      } else {
+         tot_size += count*len[0]+2;
+         if (tot_size > maxlen)
+            return OPUS_BUFFER_TOO_SMALL;
+         *ptr++ = (rp->toc&0xFC) | 0x3;
+         *ptr++ = count;
+      }
+      pad_amount = pad ? (maxlen-tot_size) : 0;
+      if (pad_amount != 0)
+      {
+         int nb_255s;
+         data[1] |= 0x40;
+         nb_255s = (pad_amount-1)/255;
+         for (i=0;i<nb_255s;i++)
+            *ptr++ = 255;
+         *ptr++ = pad_amount-255*nb_255s-1;
+         tot_size += pad_amount;
+      }
+      if (vbr)
+      {
+         for (i=0;i<count-1;i++)
+            ptr += encode_size(len[i], ptr);
+      }
+   }
+   if (self_delimited) {
+      int sdlen = encode_size(len[count-1], ptr);
+      ptr += sdlen;
+   }
+   /* Copy the actual data */
+   for (i=begin;i<end;i++)
+   {
+      /* Using OPUS_MOVE() instead of OPUS_COPY() in case we're doing in-place
+         padding from opus_packet_pad or opus_packet_unpad(). */
+      OPUS_MOVE(ptr, frames[i], len[i]);
+      ptr += len[i];
+   }
+   if (pad)
+   {
+      /* Fill padding with zeros. */
+      while (ptr<data+maxlen)
+         *ptr++=0;
+   }
+   return tot_size;
+}
+
+opus_int32 opus_repacketizer_out_range(OpusRepacketizer *rp, int begin, int end, unsigned char *data, opus_int32 maxlen)
+{
+   return opus_repacketizer_out_range_impl(rp, begin, end, data, maxlen, 0, 0);
+}
+
+opus_int32 opus_repacketizer_out(OpusRepacketizer *rp, unsigned char *data, opus_int32 maxlen)
+{
+   return opus_repacketizer_out_range_impl(rp, 0, rp->nb_frames, data, maxlen, 0, 0);
+}
+
+int opus_packet_pad(unsigned char *data, opus_int32 len, opus_int32 new_len)
+{
+   OpusRepacketizer rp;
+   opus_int32 ret;
+   if (len < 1)
+      return OPUS_BAD_ARG;
+   if (len==new_len)
+      return OPUS_OK;
+   else if (len > new_len)
+      return OPUS_BAD_ARG;
+   opus_repacketizer_init(&rp);
+   /* Moving payload to the end of the packet so we can do in-place padding */
+   OPUS_MOVE(data+new_len-len, data, len);
+   ret = opus_repacketizer_cat(&rp, data+new_len-len, len);
+   if (ret != OPUS_OK)
+      return ret;
+   ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, data, new_len, 0, 1);
+   if (ret > 0)
+      return OPUS_OK;
+   else
+      return ret;
+}
+
+opus_int32 opus_packet_unpad(unsigned char *data, opus_int32 len)
+{
+   OpusRepacketizer rp;
+   opus_int32 ret;
+   if (len < 1)
+      return OPUS_BAD_ARG;
+   opus_repacketizer_init(&rp);
+   ret = opus_repacketizer_cat(&rp, data, len);
+   if (ret < 0)
+      return ret;
+   ret = opus_repacketizer_out_range_impl(&rp, 0, rp.nb_frames, data, len, 0, 0);
+   celt_assert(ret > 0 && ret <= len);
+   return ret;
+}
+
+int opus_multistream_packet_pad(unsigned char *data, opus_int32 len, opus_int32 new_len, int nb_streams)
+{
+   int s;
+   int count;
+   unsigned char toc;
+   opus_int16 size[48];
+   opus_int32 packet_offset;
+   opus_int32 amount;
+
+   if (len < 1)
+      return OPUS_BAD_ARG;
+   if (len==new_len)
+      return OPUS_OK;
+   else if (len > new_len)
+      return OPUS_BAD_ARG;
+   amount = new_len - len;
+   /* Seek to last stream */
+   for (s=0;s<nb_streams-1;s++)
+   {
+      if (len<=0)
+         return OPUS_INVALID_PACKET;
+      count = opus_packet_parse_impl(data, len, 1, &toc, NULL,
+                                     size, NULL, &packet_offset);
+      if (count<0)
+         return count;
+      data += packet_offset;
+      len -= packet_offset;
+   }
+   return opus_packet_pad(data, len, len+amount);
+}
diff --git a/thirdparty/opus/repacketizer_demo.c b/thirdparty/opus/repacketizer_demo.c
new file mode 100644
--- /dev/null
+++ b/thirdparty/opus/repacketizer_demo.c
+#include "opus.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define MAX_PACKETOUT 32000
+
+void usage(char *argv0)
+{
+   fprintf(stderr, "usage: %s [options] input_file output_file\n", argv0);
+}
+
+static void int_to_char(opus_uint32 i, unsigned char ch[4])
+{
+   ch[0] = i>>24;
+   ch[1] = (i>>16)&0xFF;
+   ch[2] = (i>>8)&0xFF;
+   ch[3] = i&0xFF;
+}
+
+static opus_uint32 char_to_int(unsigned char ch[4])
+{
+   return ((opus_uint32)ch[0]<<24) | ((opus_uint32)ch[1]<<16)
+        | ((opus_uint32)ch[2]<< 8) |  (opus_uint32)ch[3];
+}
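The demo's main() below drives the basic init/cat/out pattern. For application code using only the public API (heap-allocated state rather than the stack OpusRepacketizer, which needs opus_private.h), merging two packets looks roughly like this sketch (merge_two and its buffers are hypothetical, not part of the patch):

#include "opus.h"

static opus_int32 merge_two(const unsigned char *p1, opus_int32 n1,
                            const unsigned char *p2, opus_int32 n2,
                            unsigned char *out, opus_int32 maxlen){
   OpusRepacketizer *rp;
   opus_int32 ret;
   rp = opus_repacketizer_create();          /* heap-allocated state */
   if (rp == NULL)
      return OPUS_ALLOC_FAIL;
   ret = opus_repacketizer_cat(rp, p1, n1);  /* packets must share a TOC config */
   if (ret == OPUS_OK)
      ret = opus_repacketizer_cat(rp, p2, n2);
   if (ret == OPUS_OK)
      ret = opus_repacketizer_out(rp, out, maxlen);  /* >0: merged length */
   opus_repacketizer_destroy(rp);
   return ret;
}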
+
+int main(int argc, char *argv[])
+{
+   int i, eof=0;
+   FILE *fin, *fout;
+   unsigned char packets[48][1500];
+   int len[48];
+   int rng[48];
+   OpusRepacketizer *rp;
+   unsigned char output_packet[MAX_PACKETOUT];
+   int merge = 1, split=0;
+
+   if (argc < 3)
+   {
+      usage(argv[0]);
+      return EXIT_FAILURE;
+   }
+   for (i=1;i<argc-2;i++)
+   {
+      if (strcmp(argv[i], "-merge")==0)
+      {
+         merge = atoi(argv[i+1]);
+         if(merge<1)
+         {
+            fprintf(stderr, "-merge parameter must be at least 1.\n");
+            return EXIT_FAILURE;
+         }
+         if(merge>48)
+         {
+            fprintf(stderr, "-merge parameter must be less than 48.\n");
+            return EXIT_FAILURE;
+         }
+         i++;
+      } else if (strcmp(argv[i], "-split")==0)
+         split = 1;
+      else
+      {
+         fprintf(stderr, "Unknown option: %s\n", argv[i]);
+         usage(argv[0]);
+         return EXIT_FAILURE;
+      }
+   }
+   fin = fopen(argv[argc-2], "r");
+   if(fin==NULL)
+   {
+      fprintf(stderr, "Error opening input file: %s\n", argv[argc-2]);
+      return EXIT_FAILURE;
+   }
+   fout = fopen(argv[argc-1], "w");
+   if(fout==NULL)
+   {
+      fprintf(stderr, "Error opening output file: %s\n", argv[argc-1]);
+      fclose(fin);
+      return EXIT_FAILURE;
+   }
+
+   rp = opus_repacketizer_create();
+   while (!eof)
+   {
+      int err;
+      int nb_packets=merge;
+      opus_repacketizer_init(rp);
+      for (i=0;i<nb_packets;i++)
+      {
+         unsigned char ch[4];
+         err = fread(ch, 1, 4, fin);
+         len[i] = char_to_int(ch);
+         if (len[i]>1500 || len[i]<0)
+         {
+            if (feof(fin))
+            {
+               eof = 1;
+            } else {
+               fprintf(stderr, "Invalid payload length\n");
+               fclose(fin);
+               fclose(fout);
+               return EXIT_FAILURE;
+            }
+            break;
+         }
+         err = fread(ch, 1, 4, fin);
+         rng[i] = char_to_int(ch);
+         err = fread(packets[i], 1, len[i], fin);
+         if (feof(fin))
+         {
+            eof = 1;
+            break;
+         }
+         err = opus_repacketizer_cat(rp, packets[i], len[i]);
+         if (err!=OPUS_OK)
+         {
+            fprintf(stderr, "opus_repacketizer_cat() failed: %s\n", opus_strerror(err));
+            break;
+         }
+      }
+      nb_packets = i;
+
+      if (eof)
+         break;
+
+      if (!split)
+      {
+         err = opus_repacketizer_out(rp, output_packet, MAX_PACKETOUT);
+         if (err>0) {
+            unsigned char int_field[4];
+            int_to_char(err, int_field);
+            if(fwrite(int_field, 1, 4, fout)!=4){
+               fprintf(stderr, "Error writing.\n");
+               return EXIT_FAILURE;
+            }
+            int_to_char(rng[nb_packets-1], int_field);
+            if (fwrite(int_field, 1, 4, fout)!=4) {
+               fprintf(stderr, "Error writing.\n");
+               return EXIT_FAILURE;
+            }
+            if (fwrite(output_packet, 1, err, fout)!=(unsigned)err) {
+               fprintf(stderr, "Error writing.\n");
+               return EXIT_FAILURE;
+            }
+            /*fprintf(stderr, "out len = %d\n", err);*/
+         } else {
+            fprintf(stderr, "opus_repacketizer_out() failed: %s\n", opus_strerror(err));
+         }
+      } else {
+         int nb_frames = opus_repacketizer_get_nb_frames(rp);
+         for (i=0;i<nb_frames;i++)
+         {
+            err = opus_repacketizer_out_range(rp, i, i+1, output_packet, MAX_PACKETOUT);
+            if (err>0) {
+               unsigned char int_field[4];
+               int_to_char(err, int_field);
+               if (fwrite(int_field, 1, 4, fout)!=4) {
+                  fprintf(stderr, "Error writing.\n");
+                  return EXIT_FAILURE;
+               }
+               if (i==nb_frames-1)
+                  int_to_char(rng[nb_packets-1], int_field);
+               else
+                  int_to_char(0, int_field);
+               if (fwrite(int_field, 1, 4, fout)!=4) {
+                  fprintf(stderr, "Error writing.\n");
+                  return EXIT_FAILURE;
+               }
+               if (fwrite(output_packet, 1, err, fout)!=(unsigned)err) {
+                  fprintf(stderr, "Error writing.\n");
+                  return EXIT_FAILURE;
+               }
+               /*fprintf(stderr, "out len = %d\n", err);*/
+            } else {
+               fprintf(stderr, "opus_repacketizer_out() failed: %s\n", opus_strerror(err));
+            }
+         }
+      }
+   }
+
+   fclose(fin);
+   fclose(fout);
+   return EXIT_SUCCESS;
+}
diff --git a/thirdparty/opus/silk/A2NLSF.c b/thirdparty/opus/silk/A2NLSF.c
new file mode 100644
index 000000000000..b6e9e5ffccaa
--- /dev/null
+++ b/thirdparty/opus/silk/A2NLSF.c
@@ -0,0 +1,267 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +/* Conversion between prediction filter coefficients and NLSFs */ +/* Requires the order to be an even number */ +/* A piecewise linear approximation maps LSF <-> cos(LSF) */ +/* Therefore the result is not accurate NLSFs, but the two */ +/* functions are accurate inverses of each other */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "tables.h" + +/* Number of binary divisions, when not in low complexity mode */ +#define BIN_DIV_STEPS_A2NLSF_FIX 3 /* must be no higher than 16 - log2( LSF_COS_TAB_SZ_FIX ) */ +#define MAX_ITERATIONS_A2NLSF_FIX 30 + +/* Helper function for A2NLSF(..) */ +/* Transforms polynomials from cos(n*f) to cos(f)^n */ +static OPUS_INLINE void silk_A2NLSF_trans_poly( + opus_int32 *p, /* I/O Polynomial */ + const opus_int dd /* I Polynomial order (= filter order / 2 ) */ +) +{ + opus_int k, n; + + for( k = 2; k <= dd; k++ ) { + for( n = dd; n > k; n-- ) { + p[ n - 2 ] -= p[ n ]; + } + p[ k - 2 ] -= silk_LSHIFT( p[ k ], 1 ); + } +} +/* Helper function for A2NLSF(..) */ +/* Polynomial evaluation */ +static OPUS_INLINE opus_int32 silk_A2NLSF_eval_poly( /* return the polynomial evaluation, in Q16 */ + opus_int32 *p, /* I Polynomial, Q16 */ + const opus_int32 x, /* I Evaluation point, Q12 */ + const opus_int dd /* I Order */ +) +{ + opus_int n; + opus_int32 x_Q16, y32; + + y32 = p[ dd ]; /* Q16 */ + x_Q16 = silk_LSHIFT( x, 4 ); + + if ( opus_likely( 8 == dd ) ) + { + y32 = silk_SMLAWW( p[ 7 ], y32, x_Q16 ); + y32 = silk_SMLAWW( p[ 6 ], y32, x_Q16 ); + y32 = silk_SMLAWW( p[ 5 ], y32, x_Q16 ); + y32 = silk_SMLAWW( p[ 4 ], y32, x_Q16 ); + y32 = silk_SMLAWW( p[ 3 ], y32, x_Q16 ); + y32 = silk_SMLAWW( p[ 2 ], y32, x_Q16 ); + y32 = silk_SMLAWW( p[ 1 ], y32, x_Q16 ); + y32 = silk_SMLAWW( p[ 0 ], y32, x_Q16 ); + } + else + { + for( n = dd - 1; n >= 0; n-- ) { + y32 = silk_SMLAWW( p[ n ], y32, x_Q16 ); /* Q16 */ + } + } + return y32; +} + +static OPUS_INLINE void silk_A2NLSF_init( + const opus_int32 *a_Q16, + opus_int32 *P, + opus_int32 *Q, + const opus_int dd +) +{ + opus_int k; + + /* Convert filter coefs to even and odd polynomials */ + P[dd] = silk_LSHIFT( 1, 16 ); + Q[dd] = silk_LSHIFT( 1, 16 ); + for( k = 0; k < dd; k++ ) { + P[ k ] = -a_Q16[ dd - k - 1 ] - a_Q16[ dd + k ]; /* Q16 */ + Q[ k ] = -a_Q16[ dd - k - 1 ] + a_Q16[ dd + k ]; /* Q16 */ + } + + /* Divide out zeros as we have that for even filter orders, */ + /* z = 1 is always a root in Q, and */ + /* z = -1 is always a root in P */ + for( k = dd; k > 0; k-- ) { + P[ k - 1 ] -= P[ k ]; + Q[ k - 1 ] += Q[ k ]; + } + + /* Transform polynomials from cos(n*f) to cos(f)^n */ + silk_A2NLSF_trans_poly( P, dd ); + silk_A2NLSF_trans_poly( Q, dd ); +} + +/* Compute Normalized Line Spectral Frequencies (NLSFs) from whitening filter coefficients */ +/* If not all roots are found, the a_Q16 coefficients are bandwidth expanded until convergence. 
*/ +void silk_A2NLSF( + opus_int16 *NLSF, /* O Normalized Line Spectral Frequencies in Q15 (0..2^15-1) [d] */ + opus_int32 *a_Q16, /* I/O Monic whitening filter coefficients in Q16 [d] */ + const opus_int d /* I Filter order (must be even) */ +) +{ + opus_int i, k, m, dd, root_ix, ffrac; + opus_int32 xlo, xhi, xmid; + opus_int32 ylo, yhi, ymid, thr; + opus_int32 nom, den; + opus_int32 P[ SILK_MAX_ORDER_LPC / 2 + 1 ]; + opus_int32 Q[ SILK_MAX_ORDER_LPC / 2 + 1 ]; + opus_int32 *PQ[ 2 ]; + opus_int32 *p; + + /* Store pointers to array */ + PQ[ 0 ] = P; + PQ[ 1 ] = Q; + + dd = silk_RSHIFT( d, 1 ); + + silk_A2NLSF_init( a_Q16, P, Q, dd ); + + /* Find roots, alternating between P and Q */ + p = P; /* Pointer to polynomial */ + + xlo = silk_LSFCosTab_FIX_Q12[ 0 ]; /* Q12*/ + ylo = silk_A2NLSF_eval_poly( p, xlo, dd ); + + if( ylo < 0 ) { + /* Set the first NLSF to zero and move on to the next */ + NLSF[ 0 ] = 0; + p = Q; /* Pointer to polynomial */ + ylo = silk_A2NLSF_eval_poly( p, xlo, dd ); + root_ix = 1; /* Index of current root */ + } else { + root_ix = 0; /* Index of current root */ + } + k = 1; /* Loop counter */ + i = 0; /* Counter for bandwidth expansions applied */ + thr = 0; + while( 1 ) { + /* Evaluate polynomial */ + xhi = silk_LSFCosTab_FIX_Q12[ k ]; /* Q12 */ + yhi = silk_A2NLSF_eval_poly( p, xhi, dd ); + + /* Detect zero crossing */ + if( ( ylo <= 0 && yhi >= thr ) || ( ylo >= 0 && yhi <= -thr ) ) { + if( yhi == 0 ) { + /* If the root lies exactly at the end of the current */ + /* interval, look for the next root in the next interval */ + thr = 1; + } else { + thr = 0; + } + /* Binary division */ + ffrac = -256; + for( m = 0; m < BIN_DIV_STEPS_A2NLSF_FIX; m++ ) { + /* Evaluate polynomial */ + xmid = silk_RSHIFT_ROUND( xlo + xhi, 1 ); + ymid = silk_A2NLSF_eval_poly( p, xmid, dd ); + + /* Detect zero crossing */ + if( ( ylo <= 0 && ymid >= 0 ) || ( ylo >= 0 && ymid <= 0 ) ) { + /* Reduce frequency */ + xhi = xmid; + yhi = ymid; + } else { + /* Increase frequency */ + xlo = xmid; + ylo = ymid; + ffrac = silk_ADD_RSHIFT( ffrac, 128, m ); + } + } + + /* Interpolate */ + if( silk_abs( ylo ) < 65536 ) { + /* Avoid dividing by zero */ + den = ylo - yhi; + nom = silk_LSHIFT( ylo, 8 - BIN_DIV_STEPS_A2NLSF_FIX ) + silk_RSHIFT( den, 1 ); + if( den != 0 ) { + ffrac += silk_DIV32( nom, den ); + } + } else { + /* No risk of dividing by zero because abs(ylo - yhi) >= abs(ylo) >= 65536 */ + ffrac += silk_DIV32( ylo, silk_RSHIFT( ylo - yhi, 8 - BIN_DIV_STEPS_A2NLSF_FIX ) ); + } + NLSF[ root_ix ] = (opus_int16)silk_min_32( silk_LSHIFT( (opus_int32)k, 8 ) + ffrac, silk_int16_MAX ); + + silk_assert( NLSF[ root_ix ] >= 0 ); + + root_ix++; /* Next root */ + if( root_ix >= d ) { + /* Found all roots */ + break; + } + /* Alternate pointer to polynomial */ + p = PQ[ root_ix & 1 ]; + + /* Evaluate polynomial */ + xlo = silk_LSFCosTab_FIX_Q12[ k - 1 ]; /* Q12*/ + ylo = silk_LSHIFT( 1 - ( root_ix & 2 ), 12 ); + } else { + /* Increment loop counter */ + k++; + xlo = xhi; + ylo = yhi; + thr = 0; + + if( k > LSF_COS_TAB_SZ_FIX ) { + i++; + if( i > MAX_ITERATIONS_A2NLSF_FIX ) { + /* Set NLSFs to white spectrum and exit */ + NLSF[ 0 ] = (opus_int16)silk_DIV32_16( 1 << 15, d + 1 ); + for( k = 1; k < d; k++ ) { + NLSF[ k ] = (opus_int16)silk_SMULBB( k + 1, NLSF[ 0 ] ); + } + return; + } + + /* Error: Apply progressively more bandwidth expansion and run again */ + silk_bwexpander_32( a_Q16, d, 65536 - silk_SMULBB( 10 + i, i ) ); /* 10_Q16 = 0.00015*/ + + silk_A2NLSF_init( a_Q16, P, Q, dd ); + p = P; /* Pointer to 
polynomial */ + xlo = silk_LSFCosTab_FIX_Q12[ 0 ]; /* Q12*/ + ylo = silk_A2NLSF_eval_poly( p, xlo, dd ); + if( ylo < 0 ) { + /* Set the first NLSF to zero and move on to the next */ + NLSF[ 0 ] = 0; + p = Q; /* Pointer to polynomial */ + ylo = silk_A2NLSF_eval_poly( p, xlo, dd ); + root_ix = 1; /* Index of current root */ + } else { + root_ix = 0; /* Index of current root */ + } + k = 1; /* Reset loop counter */ + } + } + } +} diff --git a/thirdparty/opus/silk/API.h b/thirdparty/opus/silk/API.h new file mode 100644 index 000000000000..0131acbb08fb --- /dev/null +++ b/thirdparty/opus/silk/API.h @@ -0,0 +1,134 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifndef SILK_API_H +#define SILK_API_H + +#include "control.h" +#include "typedef.h" +#include "errors.h" +#include "entenc.h" +#include "entdec.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define SILK_MAX_FRAMES_PER_PACKET 3 + +/* Struct for TOC (Table of Contents) */ +typedef struct { + opus_int VADFlag; /* Voice activity for packet */ + opus_int VADFlags[ SILK_MAX_FRAMES_PER_PACKET ]; /* Voice activity for each frame in packet */ + opus_int inbandFECFlag; /* Flag indicating if packet contains in-band FEC */ +} silk_TOC_struct; + +/****************************************/ +/* Encoder functions */ +/****************************************/ + +/***********************************************/ +/* Get size in bytes of the Silk encoder state */ +/***********************************************/ +opus_int silk_Get_Encoder_Size( /* O Returns error code */ + opus_int *encSizeBytes /* O Number of bytes in SILK encoder state */ +); + +/*************************/ +/* Init or reset encoder */ +/*************************/ +opus_int silk_InitEncoder( /* O Returns error code */ + void *encState, /* I/O State */ + int arch, /* I Run-time architecture */ + silk_EncControlStruct *encStatus /* O Encoder Status */ +); + +/**************************/ +/* Encode frame with Silk */ +/**************************/ +/* Note: if prefillFlag is set, the input must contain 10 ms of audio, irrespective of what */ +/* encControl->payloadSize_ms is set to */ +opus_int silk_Encode( /* O Returns error code */ + void *encState, /* I/O State */ + silk_EncControlStruct *encControl, /* I Control status */ + const opus_int16 *samplesIn, /* I Speech sample input vector */ + opus_int nSamplesIn, /* I Number of samples in input vector */ + ec_enc *psRangeEnc, /* I/O Compressor data structure */ + opus_int32 *nBytesOut, /* I/O Number of bytes in payload (input: Max bytes) */ + const opus_int prefillFlag /* I Flag to indicate prefilling buffers no coding */ +); + +/****************************************/ +/* Decoder functions */ +/****************************************/ + +/***********************************************/ +/* Get size in bytes of the Silk decoder state */ +/***********************************************/ +opus_int silk_Get_Decoder_Size( /* O Returns error code */ + opus_int *decSizeBytes /* O Number of bytes in SILK decoder state */ +); + +/*************************/ +/* Init or Reset decoder */ +/*************************/ +opus_int silk_InitDecoder( /* O Returns error code */ + void *decState /* I/O State */ +); + +/******************/ +/* Decode a frame */ +/******************/ +opus_int silk_Decode( /* O Returns error code */ + void* decState, /* I/O State */ + silk_DecControlStruct* decControl, /* I/O Control Structure */ + opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ + opus_int newPacketFlag, /* I Indicates first decoder call for this packet */ + ec_dec *psRangeDec, /* I/O Compressor data structure */ + opus_int16 *samplesOut, /* O Decoded output speech vector */ + opus_int32 *nSamplesOut, /* O Number of samples decoded */ + int arch /* I Run-time architecture */ +); + +#if 0 +/**************************************/ +/* Get table of contents for a packet */ +/**************************************/ +opus_int silk_get_TOC( + const opus_uint8 *payload, /* I Payload data */ + const opus_int nBytesIn, /* I Number of input bytes */ + const opus_int nFramesPerPayload, /* I Number of SILK frames per payload */ 
+ silk_TOC_struct *Silk_TOC /* O Type of content */ +); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/silk/CNG.c b/thirdparty/opus/silk/CNG.c new file mode 100644 index 000000000000..8443ad63bb13 --- /dev/null +++ b/thirdparty/opus/silk/CNG.c @@ -0,0 +1,184 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "stack_alloc.h" + +/* Generates excitation for CNG LPC synthesis */ +static OPUS_INLINE void silk_CNG_exc( + opus_int32 exc_Q14[], /* O CNG excitation signal Q10 */ + opus_int32 exc_buf_Q14[], /* I Random samples buffer Q10 */ + opus_int length, /* I Length */ + opus_int32 *rand_seed /* I/O Seed to random index generator */ +) +{ + opus_int32 seed; + opus_int i, idx, exc_mask; + + exc_mask = CNG_BUF_MASK_MAX; + while( exc_mask > length ) { + exc_mask = silk_RSHIFT( exc_mask, 1 ); + } + + seed = *rand_seed; + for( i = 0; i < length; i++ ) { + seed = silk_RAND( seed ); + idx = (opus_int)( silk_RSHIFT( seed, 24 ) & exc_mask ); + silk_assert( idx >= 0 ); + silk_assert( idx <= CNG_BUF_MASK_MAX ); + exc_Q14[ i ] = exc_buf_Q14[ idx ]; + } + *rand_seed = seed; +} + +void silk_CNG_Reset( + silk_decoder_state *psDec /* I/O Decoder state */ +) +{ + opus_int i, NLSF_step_Q15, NLSF_acc_Q15; + + NLSF_step_Q15 = silk_DIV32_16( silk_int16_MAX, psDec->LPC_order + 1 ); + NLSF_acc_Q15 = 0; + for( i = 0; i < psDec->LPC_order; i++ ) { + NLSF_acc_Q15 += NLSF_step_Q15; + psDec->sCNG.CNG_smth_NLSF_Q15[ i ] = NLSF_acc_Q15; + } + psDec->sCNG.CNG_smth_Gain_Q16 = 0; + psDec->sCNG.rand_seed = 3176576; +} + +/* Updates CNG estimate, and applies the CNG when packet was lost */ +void silk_CNG( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl, /* I/O Decoder control */ + opus_int16 frame[], /* I/O Signal */ + opus_int length /* I Length of residual */ +) +{ + opus_int i, subfr; + opus_int32 LPC_pred_Q10, max_Gain_Q16, gain_Q16, gain_Q10; + opus_int16 A_Q12[ MAX_LPC_ORDER ]; + silk_CNG_struct *psCNG = &psDec->sCNG; + SAVE_STACK; + + if( psDec->fs_kHz != psCNG->fs_kHz ) { + /* Reset state */ + silk_CNG_Reset( psDec ); + + psCNG->fs_kHz = psDec->fs_kHz; + } + if( psDec->lossCnt == 0 && psDec->prevSignalType == TYPE_NO_VOICE_ACTIVITY ) { + /* Update CNG parameters */ + + /* Smoothing of LSF's */ + for( i = 0; i < psDec->LPC_order; i++ ) { + psCNG->CNG_smth_NLSF_Q15[ i ] += silk_SMULWB( (opus_int32)psDec->prevNLSF_Q15[ i ] - (opus_int32)psCNG->CNG_smth_NLSF_Q15[ i ], CNG_NLSF_SMTH_Q16 ); + } + /* Find the subframe with the highest gain */ + max_Gain_Q16 = 0; + subfr = 0; + for( i = 0; i < psDec->nb_subfr; i++ ) { + if( psDecCtrl->Gains_Q16[ i ] > max_Gain_Q16 ) { + max_Gain_Q16 = psDecCtrl->Gains_Q16[ i ]; + subfr = i; + } + } + /* Update CNG excitation buffer with excitation from this subframe */ + silk_memmove( &psCNG->CNG_exc_buf_Q14[ psDec->subfr_length ], psCNG->CNG_exc_buf_Q14, ( psDec->nb_subfr - 1 ) * psDec->subfr_length * sizeof( opus_int32 ) ); + silk_memcpy( psCNG->CNG_exc_buf_Q14, &psDec->exc_Q14[ subfr * psDec->subfr_length ], psDec->subfr_length * sizeof( opus_int32 ) ); + + /* Smooth gains */ + for( i = 0; i < psDec->nb_subfr; i++ ) { + psCNG->CNG_smth_Gain_Q16 += silk_SMULWB( psDecCtrl->Gains_Q16[ i ] - psCNG->CNG_smth_Gain_Q16, CNG_GAIN_SMTH_Q16 ); + } + } + + /* Add CNG when packet is lost or during DTX */ + if( psDec->lossCnt ) { + VARDECL( opus_int32, CNG_sig_Q14 ); + ALLOC( CNG_sig_Q14, length + MAX_LPC_ORDER, opus_int32 ); + + /* Generate CNG excitation */ + gain_Q16 = silk_SMULWW( psDec->sPLC.randScale_Q14, psDec->sPLC.prevGain_Q16[1] ); + if( gain_Q16 >= (1 << 21) || psCNG->CNG_smth_Gain_Q16 > (1 << 23) ) { + gain_Q16 = silk_SMULTT( gain_Q16, gain_Q16 ); + gain_Q16 = silk_SUB_LSHIFT32(silk_SMULTT( 
psCNG->CNG_smth_Gain_Q16, psCNG->CNG_smth_Gain_Q16 ), gain_Q16, 5 ); + gain_Q16 = silk_LSHIFT32( silk_SQRT_APPROX( gain_Q16 ), 16 ); + } else { + gain_Q16 = silk_SMULWW( gain_Q16, gain_Q16 ); + gain_Q16 = silk_SUB_LSHIFT32(silk_SMULWW( psCNG->CNG_smth_Gain_Q16, psCNG->CNG_smth_Gain_Q16 ), gain_Q16, 5 ); + gain_Q16 = silk_LSHIFT32( silk_SQRT_APPROX( gain_Q16 ), 8 ); + } + gain_Q10 = silk_RSHIFT( gain_Q16, 6 ); + + silk_CNG_exc( CNG_sig_Q14 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, length, &psCNG->rand_seed ); + + /* Convert CNG NLSF to filter representation */ + silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order ); + + /* Generate CNG signal, by synthesis filtering */ + silk_memcpy( CNG_sig_Q14, psCNG->CNG_synth_state, MAX_LPC_ORDER * sizeof( opus_int32 ) ); + for( i = 0; i < length; i++ ) { + silk_assert( psDec->LPC_order == 10 || psDec->LPC_order == 16 ); + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LPC_pred_Q10 = silk_RSHIFT( psDec->LPC_order, 1 ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 1 ], A_Q12[ 0 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 2 ], A_Q12[ 1 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 3 ], A_Q12[ 2 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 4 ], A_Q12[ 3 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 5 ], A_Q12[ 4 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 6 ], A_Q12[ 5 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 7 ], A_Q12[ 6 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 8 ], A_Q12[ 7 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 9 ], A_Q12[ 8 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 10 ], A_Q12[ 9 ] ); + if( psDec->LPC_order == 16 ) { + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 11 ], A_Q12[ 10 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 12 ], A_Q12[ 11 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 13 ], A_Q12[ 12 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 14 ], A_Q12[ 13 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 15 ], A_Q12[ 14 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, CNG_sig_Q14[ MAX_LPC_ORDER + i - 16 ], A_Q12[ 15 ] ); + } + + /* Update states */ + CNG_sig_Q14[ MAX_LPC_ORDER + i ] = silk_ADD_LSHIFT( CNG_sig_Q14[ MAX_LPC_ORDER + i ], LPC_pred_Q10, 4 ); + + /* Scale with Gain and add to input signal */ + frame[ i ] = (opus_int16)silk_ADD_SAT16( frame[ i ], silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( CNG_sig_Q14[ MAX_LPC_ORDER + i ], gain_Q10 ), 8 ) ) ); + + } + silk_memcpy( psCNG->CNG_synth_state, &CNG_sig_Q14[ length ], MAX_LPC_ORDER * sizeof( opus_int32 ) ); + } else { + silk_memset( psCNG->CNG_synth_state, 0, psDec->LPC_order * sizeof( opus_int32 ) ); + } + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/HP_variable_cutoff.c b/thirdparty/opus/silk/HP_variable_cutoff.c new file mode 100644 index 000000000000..bbe10f04cefa --- /dev/null +++ b/thirdparty/opus/silk/HP_variable_cutoff.c @@ -0,0 +1,77 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#ifdef FIXED_POINT +#include "main_FIX.h" +#else +#include "main_FLP.h" +#endif +#include "tuning_parameters.h" + +/* High-pass filter with cutoff frequency adaptation based on pitch lag statistics */ +void silk_HP_variable_cutoff( + silk_encoder_state_Fxx state_Fxx[] /* I/O Encoder states */ +) +{ + opus_int quality_Q15; + opus_int32 pitch_freq_Hz_Q16, pitch_freq_log_Q7, delta_freq_Q7; + silk_encoder_state *psEncC1 = &state_Fxx[ 0 ].sCmn; + + /* Adaptive cutoff frequency: estimate low end of pitch frequency range */ + if( psEncC1->prevSignalType == TYPE_VOICED ) { + /* difference, in log domain */ + pitch_freq_Hz_Q16 = silk_DIV32_16( silk_LSHIFT( silk_MUL( psEncC1->fs_kHz, 1000 ), 16 ), psEncC1->prevLag ); + pitch_freq_log_Q7 = silk_lin2log( pitch_freq_Hz_Q16 ) - ( 16 << 7 ); + + /* adjustment based on quality */ + quality_Q15 = psEncC1->input_quality_bands_Q15[ 0 ]; + pitch_freq_log_Q7 = silk_SMLAWB( pitch_freq_log_Q7, silk_SMULWB( silk_LSHIFT( -quality_Q15, 2 ), quality_Q15 ), + pitch_freq_log_Q7 - ( silk_lin2log( SILK_FIX_CONST( VARIABLE_HP_MIN_CUTOFF_HZ, 16 ) ) - ( 16 << 7 ) ) ); + + /* delta_freq = pitch_freq_log - psEnc->variable_HP_smth1; */ + delta_freq_Q7 = pitch_freq_log_Q7 - silk_RSHIFT( psEncC1->variable_HP_smth1_Q15, 8 ); + if( delta_freq_Q7 < 0 ) { + /* less smoothing for decreasing pitch frequency, to track something close to the minimum */ + delta_freq_Q7 = silk_MUL( delta_freq_Q7, 3 ); + } + + /* limit delta, to reduce impact of outliers in pitch estimation */ + delta_freq_Q7 = silk_LIMIT_32( delta_freq_Q7, -SILK_FIX_CONST( VARIABLE_HP_MAX_DELTA_FREQ, 7 ), SILK_FIX_CONST( VARIABLE_HP_MAX_DELTA_FREQ, 7 ) ); + + /* update smoother */ + psEncC1->variable_HP_smth1_Q15 = silk_SMLAWB( psEncC1->variable_HP_smth1_Q15, + silk_SMULBB( psEncC1->speech_activity_Q8, delta_freq_Q7 ), SILK_FIX_CONST( VARIABLE_HP_SMTH_COEF1, 16 ) ); + + /* limit frequency range */ + psEncC1->variable_HP_smth1_Q15 = silk_LIMIT_32( 
psEncC1->variable_HP_smth1_Q15, + silk_LSHIFT( silk_lin2log( VARIABLE_HP_MIN_CUTOFF_HZ ), 8 ), + silk_LSHIFT( silk_lin2log( VARIABLE_HP_MAX_CUTOFF_HZ ), 8 ) ); + } +} diff --git a/thirdparty/opus/silk/Inlines.h b/thirdparty/opus/silk/Inlines.h new file mode 100644 index 000000000000..ec986cdfddf4 --- /dev/null +++ b/thirdparty/opus/silk/Inlines.h @@ -0,0 +1,188 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +/*! \file silk_Inlines.h + * \brief silk_Inlines.h defines OPUS_INLINE signal processing functions. 
+ */ + +#ifndef SILK_FIX_INLINES_H +#define SILK_FIX_INLINES_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* count leading zeros of opus_int64 */ +static OPUS_INLINE opus_int32 silk_CLZ64( opus_int64 in ) +{ + opus_int32 in_upper; + + in_upper = (opus_int32)silk_RSHIFT64(in, 32); + if (in_upper == 0) { + /* Search in the lower 32 bits */ + return 32 + silk_CLZ32( (opus_int32) in ); + } else { + /* Search in the upper 32 bits */ + return silk_CLZ32( in_upper ); + } +} + +/* get number of leading zeros and fractional part (the bits right after the leading one */ +static OPUS_INLINE void silk_CLZ_FRAC( + opus_int32 in, /* I input */ + opus_int32 *lz, /* O number of leading zeros */ + opus_int32 *frac_Q7 /* O the 7 bits right after the leading one */ +) +{ + opus_int32 lzeros = silk_CLZ32(in); + + * lz = lzeros; + * frac_Q7 = silk_ROR32(in, 24 - lzeros) & 0x7f; +} + +/* Approximation of square root */ +/* Accuracy: < +/- 10% for output values > 15 */ +/* < +/- 2.5% for output values > 120 */ +static OPUS_INLINE opus_int32 silk_SQRT_APPROX( opus_int32 x ) +{ + opus_int32 y, lz, frac_Q7; + + if( x <= 0 ) { + return 0; + } + + silk_CLZ_FRAC(x, &lz, &frac_Q7); + + if( lz & 1 ) { + y = 32768; + } else { + y = 46214; /* 46214 = sqrt(2) * 32768 */ + } + + /* get scaling right */ + y >>= silk_RSHIFT(lz, 1); + + /* increment using fractional part of input */ + y = silk_SMLAWB(y, y, silk_SMULBB(213, frac_Q7)); + + return y; +} + +/* Divide two int32 values and return result as int32 in a given Q-domain */ +static OPUS_INLINE opus_int32 silk_DIV32_varQ( /* O returns a good approximation of "(a32 << Qres) / b32" */ + const opus_int32 a32, /* I numerator (Q0) */ + const opus_int32 b32, /* I denominator (Q0) */ + const opus_int Qres /* I Q-domain of result (>= 0) */ +) +{ + opus_int a_headrm, b_headrm, lshift; + opus_int32 b32_inv, a32_nrm, b32_nrm, result; + + silk_assert( b32 != 0 ); + silk_assert( Qres >= 0 ); + + /* Compute number of bits head room and normalize inputs */ + a_headrm = silk_CLZ32( silk_abs(a32) ) - 1; + a32_nrm = silk_LSHIFT(a32, a_headrm); /* Q: a_headrm */ + b_headrm = silk_CLZ32( silk_abs(b32) ) - 1; + b32_nrm = silk_LSHIFT(b32, b_headrm); /* Q: b_headrm */ + + /* Inverse of b32, with 14 bits of precision */ + b32_inv = silk_DIV32_16( silk_int32_MAX >> 2, silk_RSHIFT(b32_nrm, 16) ); /* Q: 29 + 16 - b_headrm */ + + /* First approximation */ + result = silk_SMULWB(a32_nrm, b32_inv); /* Q: 29 + a_headrm - b_headrm */ + + /* Compute residual by subtracting product of denominator and first approximation */ + /* It's OK to overflow because the final value of a32_nrm should always be small */ + a32_nrm = silk_SUB32_ovflw(a32_nrm, silk_LSHIFT_ovflw( silk_SMMUL(b32_nrm, result), 3 )); /* Q: a_headrm */ + + /* Refinement */ + result = silk_SMLAWB(result, a32_nrm, b32_inv); /* Q: 29 + a_headrm - b_headrm */ + + /* Convert to Qres domain */ + lshift = 29 + a_headrm - b_headrm - Qres; + if( lshift < 0 ) { + return silk_LSHIFT_SAT32(result, -lshift); + } else { + if( lshift < 32){ + return silk_RSHIFT(result, lshift); + } else { + /* Avoid undefined result */ + return 0; + } + } +} + +/* Invert int32 value and return result as int32 in a given Q-domain */ +static OPUS_INLINE opus_int32 silk_INVERSE32_varQ( /* O returns a good approximation of "(1 << Qres) / b32" */ + const opus_int32 b32, /* I denominator (Q0) */ + const opus_int Qres /* I Q-domain of result (> 0) */ +) +{ + opus_int b_headrm, lshift; + opus_int32 b32_inv, b32_nrm, err_Q32, result; + + silk_assert( b32 != 0 ); + silk_assert( 
Qres > 0 ); + + /* Compute number of bits head room and normalize input */ + b_headrm = silk_CLZ32( silk_abs(b32) ) - 1; + b32_nrm = silk_LSHIFT(b32, b_headrm); /* Q: b_headrm */ + + /* Inverse of b32, with 14 bits of precision */ + b32_inv = silk_DIV32_16( silk_int32_MAX >> 2, silk_RSHIFT(b32_nrm, 16) ); /* Q: 29 + 16 - b_headrm */ + + /* First approximation */ + result = silk_LSHIFT(b32_inv, 16); /* Q: 61 - b_headrm */ + + /* Compute residual by subtracting product of denominator and first approximation from one */ + err_Q32 = silk_LSHIFT( ((opus_int32)1<<29) - silk_SMULWB(b32_nrm, b32_inv), 3 ); /* Q32 */ + + /* Refinement */ + result = silk_SMLAWW(result, err_Q32, b32_inv); /* Q: 61 - b_headrm */ + + /* Convert to Qres domain */ + lshift = 61 - b_headrm - Qres; + if( lshift <= 0 ) { + return silk_LSHIFT_SAT32(result, -lshift); + } else { + if( lshift < 32){ + return silk_RSHIFT(result, lshift); + }else{ + /* Avoid undefined result */ + return 0; + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif /* SILK_FIX_INLINES_H */ diff --git a/thirdparty/opus/silk/LPC_analysis_filter.c b/thirdparty/opus/silk/LPC_analysis_filter.c new file mode 100644 index 000000000000..20906673ff12 --- /dev/null +++ b/thirdparty/opus/silk/LPC_analysis_filter.c @@ -0,0 +1,108 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "celt_lpc.h" + +/*******************************************/ +/* LPC analysis filter */ +/* NB! 
State is kept internally and the */ +/* filter always starts with zero state */ +/* first d output samples are set to zero */ +/*******************************************/ + +void silk_LPC_analysis_filter( + opus_int16 *out, /* O Output signal */ + const opus_int16 *in, /* I Input signal */ + const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ + const opus_int32 len, /* I Signal length */ + const opus_int32 d, /* I Filter order */ + int arch /* I Run-time architecture */ +) +{ + opus_int j; +#ifdef FIXED_POINT + opus_int16 mem[SILK_MAX_ORDER_LPC]; + opus_int16 num[SILK_MAX_ORDER_LPC]; +#else + int ix; + opus_int32 out32_Q12, out32; + const opus_int16 *in_ptr; +#endif + + silk_assert( d >= 6 ); + silk_assert( (d & 1) == 0 ); + silk_assert( d <= len ); + +#ifdef FIXED_POINT + silk_assert( d <= SILK_MAX_ORDER_LPC ); + for ( j = 0; j < d; j++ ) { + num[ j ] = -B[ j ]; + } + for (j=0;j 0; k-- ) { + /* Check for stability */ + if( ( Anew_QA[ k ] > A_LIMIT ) || ( Anew_QA[ k ] < -A_LIMIT ) ) { + return 0; + } + + /* Set RC equal to negated AR coef */ + rc_Q31 = -silk_LSHIFT( Anew_QA[ k ], 31 - QA ); + + /* rc_mult1_Q30 range: [ 1 : 2^30 ] */ + rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 ); + silk_assert( rc_mult1_Q30 > ( 1 << 15 ) ); /* reduce A_LIMIT if fails */ + silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) ); + + /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */ + mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) ); + rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 ); + + /* Update inverse gain */ + /* invGain_Q30 range: [ 0 : 2^30 ] */ + invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 ); + silk_assert( invGain_Q30 >= 0 ); + silk_assert( invGain_Q30 <= ( 1 << 30 ) ); + + /* Swap pointers */ + Aold_QA = Anew_QA; + Anew_QA = A_QA[ k & 1 ]; + + /* Update AR coefficient */ + for( n = 0; n < k; n++ ) { + tmp_QA = Aold_QA[ n ] - MUL32_FRAC_Q( Aold_QA[ k - n - 1 ], rc_Q31, 31 ); + Anew_QA[ n ] = MUL32_FRAC_Q( tmp_QA, rc_mult2 , mult2Q ); + } + } + + /* Check for stability */ + if( ( Anew_QA[ 0 ] > A_LIMIT ) || ( Anew_QA[ 0 ] < -A_LIMIT ) ) { + return 0; + } + + /* Set RC equal to negated AR coef */ + rc_Q31 = -silk_LSHIFT( Anew_QA[ 0 ], 31 - QA ); + + /* Range: [ 1 : 2^30 ] */ + rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 ); + + /* Update inverse gain */ + /* Range: [ 0 : 2^30 ] */ + invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 ); + silk_assert( invGain_Q30 >= 0 ); + silk_assert( invGain_Q30 <= 1<<30 ); + + return invGain_Q30; +} + +/* For input in Q12 domain */ +opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ + const opus_int order /* I Prediction order */ +) +{ + opus_int k; + opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ]; + opus_int32 *Anew_QA; + opus_int32 DC_resp = 0; + + Anew_QA = Atmp_QA[ order & 1 ]; + + /* Increase Q domain of the AR coefficients */ + for( k = 0; k < order; k++ ) { + DC_resp += (opus_int32)A_Q12[ k ]; + Anew_QA[ k ] = silk_LSHIFT32( (opus_int32)A_Q12[ k ], QA - 12 ); + } + /* If the DC is unstable, we don't even need to do the full calculations */ + if( DC_resp >= 4096 ) { + return 0; + } + return LPC_inverse_pred_gain_QA( Atmp_QA, order ); +} + +#ifdef FIXED_POINT + +/* For input in Q24 domain */ +opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int32 *A_Q24, /* I 
Prediction coefficients [order] */ + const opus_int order /* I Prediction order */ +) +{ + opus_int k; + opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ]; + opus_int32 *Anew_QA; + + Anew_QA = Atmp_QA[ order & 1 ]; + + /* Increase Q domain of the AR coefficients */ + for( k = 0; k < order; k++ ) { + Anew_QA[ k ] = silk_RSHIFT32( A_Q24[ k ], 24 - QA ); + } + + return LPC_inverse_pred_gain_QA( Atmp_QA, order ); +} +#endif diff --git a/thirdparty/opus/silk/LP_variable_cutoff.c b/thirdparty/opus/silk/LP_variable_cutoff.c new file mode 100644 index 000000000000..f639e1f899a9 --- /dev/null +++ b/thirdparty/opus/silk/LP_variable_cutoff.c @@ -0,0 +1,135 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/* + Elliptic/Cauer filters designed with 0.1 dB passband ripple, + 80 dB minimum stopband attenuation, and + [0.95 : 0.15 : 0.35] normalized cut off frequencies. 
+*/ + +#include "main.h" + +/* Helper function, interpolates the filter taps */ +static OPUS_INLINE void silk_LP_interpolate_filter_taps( + opus_int32 B_Q28[ TRANSITION_NB ], + opus_int32 A_Q28[ TRANSITION_NA ], + const opus_int ind, + const opus_int32 fac_Q16 +) +{ + opus_int nb, na; + + if( ind < TRANSITION_INT_NUM - 1 ) { + if( fac_Q16 > 0 ) { + if( fac_Q16 < 32768 ) { /* fac_Q16 is in range of a 16-bit int */ + /* Piece-wise linear interpolation of B and A */ + for( nb = 0; nb < TRANSITION_NB; nb++ ) { + B_Q28[ nb ] = silk_SMLAWB( + silk_Transition_LP_B_Q28[ ind ][ nb ], + silk_Transition_LP_B_Q28[ ind + 1 ][ nb ] - + silk_Transition_LP_B_Q28[ ind ][ nb ], + fac_Q16 ); + } + for( na = 0; na < TRANSITION_NA; na++ ) { + A_Q28[ na ] = silk_SMLAWB( + silk_Transition_LP_A_Q28[ ind ][ na ], + silk_Transition_LP_A_Q28[ ind + 1 ][ na ] - + silk_Transition_LP_A_Q28[ ind ][ na ], + fac_Q16 ); + } + } else { /* ( fac_Q16 - ( 1 << 16 ) ) is in range of a 16-bit int */ + silk_assert( fac_Q16 - ( 1 << 16 ) == silk_SAT16( fac_Q16 - ( 1 << 16 ) ) ); + /* Piece-wise linear interpolation of B and A */ + for( nb = 0; nb < TRANSITION_NB; nb++ ) { + B_Q28[ nb ] = silk_SMLAWB( + silk_Transition_LP_B_Q28[ ind + 1 ][ nb ], + silk_Transition_LP_B_Q28[ ind + 1 ][ nb ] - + silk_Transition_LP_B_Q28[ ind ][ nb ], + fac_Q16 - ( (opus_int32)1 << 16 ) ); + } + for( na = 0; na < TRANSITION_NA; na++ ) { + A_Q28[ na ] = silk_SMLAWB( + silk_Transition_LP_A_Q28[ ind + 1 ][ na ], + silk_Transition_LP_A_Q28[ ind + 1 ][ na ] - + silk_Transition_LP_A_Q28[ ind ][ na ], + fac_Q16 - ( (opus_int32)1 << 16 ) ); + } + } + } else { + silk_memcpy( B_Q28, silk_Transition_LP_B_Q28[ ind ], TRANSITION_NB * sizeof( opus_int32 ) ); + silk_memcpy( A_Q28, silk_Transition_LP_A_Q28[ ind ], TRANSITION_NA * sizeof( opus_int32 ) ); + } + } else { + silk_memcpy( B_Q28, silk_Transition_LP_B_Q28[ TRANSITION_INT_NUM - 1 ], TRANSITION_NB * sizeof( opus_int32 ) ); + silk_memcpy( A_Q28, silk_Transition_LP_A_Q28[ TRANSITION_INT_NUM - 1 ], TRANSITION_NA * sizeof( opus_int32 ) ); + } +} + +/* Low-pass filter with variable cutoff frequency based on */ +/* piece-wise linear interpolation between elliptic filters */ +/* Start by setting psEncC->mode <> 0; */ +/* Deactivate by setting psEncC->mode = 0; */ +void silk_LP_variable_cutoff( + silk_LP_state *psLP, /* I/O LP filter state */ + opus_int16 *frame, /* I/O Low-pass filtered output signal */ + const opus_int frame_length /* I Frame length */ +) +{ + opus_int32 B_Q28[ TRANSITION_NB ], A_Q28[ TRANSITION_NA ], fac_Q16 = 0; + opus_int ind = 0; + + silk_assert( psLP->transition_frame_no >= 0 && psLP->transition_frame_no <= TRANSITION_FRAMES ); + + /* Run filter if needed */ + if( psLP->mode != 0 ) { + /* Calculate index and interpolation factor for interpolation */ +#if( TRANSITION_INT_STEPS == 64 ) + fac_Q16 = silk_LSHIFT( TRANSITION_FRAMES - psLP->transition_frame_no, 16 - 6 ); +#else + fac_Q16 = silk_DIV32_16( silk_LSHIFT( TRANSITION_FRAMES - psLP->transition_frame_no, 16 ), TRANSITION_FRAMES ); +#endif + ind = silk_RSHIFT( fac_Q16, 16 ); + fac_Q16 -= silk_LSHIFT( ind, 16 ); + + silk_assert( ind >= 0 ); + silk_assert( ind < TRANSITION_INT_NUM ); + + /* Interpolate filter coefficients */ + silk_LP_interpolate_filter_taps( B_Q28, A_Q28, ind, fac_Q16 ); + + /* Update transition frame number for next frame */ + psLP->transition_frame_no = silk_LIMIT( psLP->transition_frame_no + psLP->mode, 0, TRANSITION_FRAMES ); + + /* ARMA low-pass filtering */ + silk_assert( TRANSITION_NB == 3 && TRANSITION_NA == 2 ); + 
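+        /* Illustrative aside, not in the upstream source: with TRANSITION_NB == 3
+           and TRANSITION_NA == 2 this is a single biquad section. In floating-point
+           terms it roughly evaluates
+               y[n] = b0*x[n] + b1*x[n-1] + b2*x[n-2] - a1*y[n-1] - a2*y[n-2]
+           with bi = B_Q28[i] / 2^28 and ai = A_Q28[i] / 2^28. */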
silk_biquad_alt( frame, B_Q28, A_Q28, psLP->In_LP_State, frame, frame_length, 1); + } +} diff --git a/thirdparty/opus/silk/MacroCount.h b/thirdparty/opus/silk/MacroCount.h new file mode 100644 index 000000000000..834817d058b9 --- /dev/null +++ b/thirdparty/opus/silk/MacroCount.h @@ -0,0 +1,718 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/
+
+#ifndef SIGPROCFIX_API_MACROCOUNT_H
+#define SIGPROCFIX_API_MACROCOUNT_H
+#include <stdio.h>
+
+#ifdef silk_MACRO_COUNT
+#define varDefine opus_int64 ops_count = 0;
+
+extern opus_int64 ops_count;
+
+static OPUS_INLINE opus_int64 silk_SaveCount(){
+    return(ops_count);
+}
+
+static OPUS_INLINE opus_int64 silk_SaveResetCount(){
+    opus_int64 ret;
+
+    ret = ops_count;
+    ops_count = 0;
+    return(ret);
+}
+
+static OPUS_INLINE silk_PrintCount(){
+    printf("ops_count = %d \n ", (opus_int32)ops_count);
+}
+
+#undef silk_MUL
+static OPUS_INLINE opus_int32 silk_MUL(opus_int32 a32, opus_int32 b32){
+    opus_int32 ret;
+    ops_count += 4;
+    ret = a32 * b32;
+    return ret;
+}
+
+#undef silk_MUL_uint
+static OPUS_INLINE opus_uint32 silk_MUL_uint(opus_uint32 a32, opus_uint32 b32){
+    opus_uint32 ret;
+    ops_count += 4;
+    ret = a32 * b32;
+    return ret;
+}
+#undef silk_MLA
+static OPUS_INLINE opus_int32 silk_MLA(opus_int32 a32, opus_int32 b32, opus_int32 c32){
+    opus_int32 ret;
+    ops_count += 4;
+    ret = a32 + b32 * c32;
+    return ret;
+}
+
+#undef silk_MLA_uint
+static OPUS_INLINE opus_int32 silk_MLA_uint(opus_uint32 a32, opus_uint32 b32, opus_uint32 c32){
+    opus_uint32 ret;
+    ops_count += 4;
+    ret = a32 + b32 * c32;
+    return ret;
+}
+
+#undef silk_SMULWB
+static OPUS_INLINE opus_int32 silk_SMULWB(opus_int32 a32, opus_int32 b32){
+    opus_int32 ret;
+    ops_count += 5;
+    ret = (a32 >> 16) * (opus_int32)((opus_int16)b32) + (((a32 & 0x0000FFFF) * (opus_int32)((opus_int16)b32)) >> 16);
+    return ret;
+}
+#undef silk_SMLAWB
+static OPUS_INLINE opus_int32 silk_SMLAWB(opus_int32 a32, opus_int32 b32, opus_int32 c32){
+    opus_int32 ret;
+    ops_count += 5;
+    ret = ((a32) + ((((b32) >> 16) * (opus_int32)((opus_int16)(c32))) + ((((b32) & 0x0000FFFF) * (opus_int32)((opus_int16)(c32))) >> 16)));
+    return ret;
+}
+
+#undef silk_SMULWT
+static OPUS_INLINE opus_int32 silk_SMULWT(opus_int32 a32, opus_int32 b32){
+    opus_int32 ret;
+    ops_count += 4;
+    ret = (a32 >> 16) * (b32 >> 16) + (((a32 & 0x0000FFFF) * (b32 >> 16)) >> 16);
+    return ret;
+}
+#undef silk_SMLAWT
+static OPUS_INLINE opus_int32 silk_SMLAWT(opus_int32 a32, opus_int32 b32, opus_int32 c32){
+    opus_int32 ret;
+    ops_count += 4;
+    ret = a32 + ((b32 >> 16) * (c32 >> 16)) + (((b32 & 0x0000FFFF) * ((c32 >> 16)) >> 16));
+    return ret;
+}
+
+#undef silk_SMULBB
+static OPUS_INLINE opus_int32 silk_SMULBB(opus_int32 a32, opus_int32 b32){
+    opus_int32 ret;
+    ops_count += 1;
+    ret = (opus_int32)((opus_int16)a32) * (opus_int32)((opus_int16)b32);
+    return ret;
+}
+#undef silk_SMLABB
+static OPUS_INLINE opus_int32 silk_SMLABB(opus_int32 a32, opus_int32 b32, opus_int32 c32){
+    opus_int32 ret;
+    ops_count += 1;
+    ret = a32 + (opus_int32)((opus_int16)b32) * (opus_int32)((opus_int16)c32);
+    return ret;
+}
+
+#undef silk_SMULBT
+static OPUS_INLINE opus_int32 silk_SMULBT(opus_int32 a32, opus_int32 b32 ){
+    opus_int32 ret;
+    ops_count += 4;
+    ret = ((opus_int32)((opus_int16)a32)) * (b32 >> 16);
+    return ret;
+}
+
+#undef silk_SMLABT
+static OPUS_INLINE opus_int32 silk_SMLABT(opus_int32 a32, opus_int32 b32, opus_int32 c32){
+    opus_int32 ret;
+    ops_count += 1;
+    ret = a32 + ((opus_int32)((opus_int16)b32)) * (c32 >> 16);
+    return ret;
+}
+
+#undef silk_SMULTT
+static OPUS_INLINE opus_int32 silk_SMULTT(opus_int32 a32, opus_int32 b32){
+    opus_int32 ret;
+    ops_count += 1;
+    ret = (a32 >> 16) * (b32 >> 16);
+    return ret;
+}
+
+#undef silk_SMLATT
+static OPUS_INLINE opus_int32 silk_SMLATT(opus_int32 a32, opus_int32 b32, opus_int32 c32){
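+    /* Illustrative aside, not in the upstream source: SMLATT is a
+       multiply-accumulate of the Top 16 bits of both operands, i.e.
+       a32 + (b32 >> 16) * (c32 >> 16). For example, b32 = 0x00030000
+       and c32 = 0x00020000 add 3 * 2 = 6 to a32. */
+    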
opus_int32 ret; + ops_count += 1; + ret = a32 + (b32 >> 16) * (c32 >> 16); + return ret; +} + + +/* multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode)*/ +#undef silk_MLA_ovflw +#define silk_MLA_ovflw silk_MLA + +#undef silk_SMLABB_ovflw +#define silk_SMLABB_ovflw silk_SMLABB + +#undef silk_SMLABT_ovflw +#define silk_SMLABT_ovflw silk_SMLABT + +#undef silk_SMLATT_ovflw +#define silk_SMLATT_ovflw silk_SMLATT + +#undef silk_SMLAWB_ovflw +#define silk_SMLAWB_ovflw silk_SMLAWB + +#undef silk_SMLAWT_ovflw +#define silk_SMLAWT_ovflw silk_SMLAWT + +#undef silk_SMULL +static OPUS_INLINE opus_int64 silk_SMULL(opus_int32 a32, opus_int32 b32){ + opus_int64 ret; + ops_count += 8; + ret = ((opus_int64)(a32) * /*(opus_int64)*/(b32)); + return ret; +} + +#undef silk_SMLAL +static OPUS_INLINE opus_int64 silk_SMLAL(opus_int64 a64, opus_int32 b32, opus_int32 c32){ + opus_int64 ret; + ops_count += 8; + ret = a64 + ((opus_int64)(b32) * /*(opus_int64)*/(c32)); + return ret; +} +#undef silk_SMLALBB +static OPUS_INLINE opus_int64 silk_SMLALBB(opus_int64 a64, opus_int16 b16, opus_int16 c16){ + opus_int64 ret; + ops_count += 4; + ret = a64 + ((opus_int64)(b16) * /*(opus_int64)*/(c16)); + return ret; +} + +#undef SigProcFIX_CLZ16 +static OPUS_INLINE opus_int32 SigProcFIX_CLZ16(opus_int16 in16) +{ + opus_int32 out32 = 0; + ops_count += 10; + if( in16 == 0 ) { + return 16; + } + /* test nibbles */ + if( in16 & 0xFF00 ) { + if( in16 & 0xF000 ) { + in16 >>= 12; + } else { + out32 += 4; + in16 >>= 8; + } + } else { + if( in16 & 0xFFF0 ) { + out32 += 8; + in16 >>= 4; + } else { + out32 += 12; + } + } + /* test bits and return */ + if( in16 & 0xC ) { + if( in16 & 0x8 ) + return out32 + 0; + else + return out32 + 1; + } else { + if( in16 & 0xE ) + return out32 + 2; + else + return out32 + 3; + } +} + +#undef SigProcFIX_CLZ32 +static OPUS_INLINE opus_int32 SigProcFIX_CLZ32(opus_int32 in32) +{ + /* test highest 16 bits and convert to opus_int16 */ + ops_count += 2; + if( in32 & 0xFFFF0000 ) { + return SigProcFIX_CLZ16((opus_int16)(in32 >> 16)); + } else { + return SigProcFIX_CLZ16((opus_int16)in32) + 16; + } +} + +#undef silk_DIV32 +static OPUS_INLINE opus_int32 silk_DIV32(opus_int32 a32, opus_int32 b32){ + ops_count += 64; + return a32 / b32; +} + +#undef silk_DIV32_16 +static OPUS_INLINE opus_int32 silk_DIV32_16(opus_int32 a32, opus_int32 b32){ + ops_count += 32; + return a32 / b32; +} + +#undef silk_SAT8 +static OPUS_INLINE opus_int8 silk_SAT8(opus_int64 a){ + opus_int8 tmp; + ops_count += 1; + tmp = (opus_int8)((a) > silk_int8_MAX ? silk_int8_MAX : \ + ((a) < silk_int8_MIN ? silk_int8_MIN : (a))); + return(tmp); +} + +#undef silk_SAT16 +static OPUS_INLINE opus_int16 silk_SAT16(opus_int64 a){ + opus_int16 tmp; + ops_count += 1; + tmp = (opus_int16)((a) > silk_int16_MAX ? silk_int16_MAX : \ + ((a) < silk_int16_MIN ? silk_int16_MIN : (a))); + return(tmp); +} +#undef silk_SAT32 +static OPUS_INLINE opus_int32 silk_SAT32(opus_int64 a){ + opus_int32 tmp; + ops_count += 1; + tmp = (opus_int32)((a) > silk_int32_MAX ? silk_int32_MAX : \ + ((a) < silk_int32_MIN ? silk_int32_MIN : (a))); + return(tmp); +} +#undef silk_POS_SAT32 +static OPUS_INLINE opus_int32 silk_POS_SAT32(opus_int64 a){ + opus_int32 tmp; + ops_count += 1; + tmp = (opus_int32)((a) > silk_int32_MAX ? silk_int32_MAX : (a)); + return(tmp); +} + +#undef silk_ADD_POS_SAT8 +static OPUS_INLINE opus_int8 silk_ADD_POS_SAT8(opus_int64 a, opus_int64 b){ + opus_int8 tmp; + ops_count += 1; + tmp = (opus_int8)((((a)+(b)) & 0x80) ? 
silk_int8_MAX : ((a)+(b))); + return(tmp); +} +#undef silk_ADD_POS_SAT16 +static OPUS_INLINE opus_int16 silk_ADD_POS_SAT16(opus_int64 a, opus_int64 b){ + opus_int16 tmp; + ops_count += 1; + tmp = (opus_int16)((((a)+(b)) & 0x8000) ? silk_int16_MAX : ((a)+(b))); + return(tmp); +} + +#undef silk_ADD_POS_SAT32 +static OPUS_INLINE opus_int32 silk_ADD_POS_SAT32(opus_int64 a, opus_int64 b){ + opus_int32 tmp; + ops_count += 1; + tmp = (opus_int32)((((a)+(b)) & 0x80000000) ? silk_int32_MAX : ((a)+(b))); + return(tmp); +} + +#undef silk_ADD_POS_SAT64 +static OPUS_INLINE opus_int64 silk_ADD_POS_SAT64(opus_int64 a, opus_int64 b){ + opus_int64 tmp; + ops_count += 1; + tmp = ((((a)+(b)) & 0x8000000000000000LL) ? silk_int64_MAX : ((a)+(b))); + return(tmp); +} + +#undef silk_LSHIFT8 +static OPUS_INLINE opus_int8 silk_LSHIFT8(opus_int8 a, opus_int32 shift){ + opus_int8 ret; + ops_count += 1; + ret = a << shift; + return ret; +} +#undef silk_LSHIFT16 +static OPUS_INLINE opus_int16 silk_LSHIFT16(opus_int16 a, opus_int32 shift){ + opus_int16 ret; + ops_count += 1; + ret = a << shift; + return ret; +} +#undef silk_LSHIFT32 +static OPUS_INLINE opus_int32 silk_LSHIFT32(opus_int32 a, opus_int32 shift){ + opus_int32 ret; + ops_count += 1; + ret = a << shift; + return ret; +} +#undef silk_LSHIFT64 +static OPUS_INLINE opus_int64 silk_LSHIFT64(opus_int64 a, opus_int shift){ + ops_count += 1; + return a << shift; +} + +#undef silk_LSHIFT_ovflw +static OPUS_INLINE opus_int32 silk_LSHIFT_ovflw(opus_int32 a, opus_int32 shift){ + ops_count += 1; + return a << shift; +} + +#undef silk_LSHIFT_uint +static OPUS_INLINE opus_uint32 silk_LSHIFT_uint(opus_uint32 a, opus_int32 shift){ + opus_uint32 ret; + ops_count += 1; + ret = a << shift; + return ret; +} + +#undef silk_RSHIFT8 +static OPUS_INLINE opus_int8 silk_RSHIFT8(opus_int8 a, opus_int32 shift){ + ops_count += 1; + return a >> shift; +} +#undef silk_RSHIFT16 +static OPUS_INLINE opus_int16 silk_RSHIFT16(opus_int16 a, opus_int32 shift){ + ops_count += 1; + return a >> shift; +} +#undef silk_RSHIFT32 +static OPUS_INLINE opus_int32 silk_RSHIFT32(opus_int32 a, opus_int32 shift){ + ops_count += 1; + return a >> shift; +} +#undef silk_RSHIFT64 +static OPUS_INLINE opus_int64 silk_RSHIFT64(opus_int64 a, opus_int64 shift){ + ops_count += 1; + return a >> shift; +} + +#undef silk_RSHIFT_uint +static OPUS_INLINE opus_uint32 silk_RSHIFT_uint(opus_uint32 a, opus_int32 shift){ + ops_count += 1; + return a >> shift; +} + +#undef silk_ADD_LSHIFT +static OPUS_INLINE opus_int32 silk_ADD_LSHIFT(opus_int32 a, opus_int32 b, opus_int32 shift){ + opus_int32 ret; + ops_count += 1; + ret = a + (b << shift); + return ret; /* shift >= 0*/ +} +#undef silk_ADD_LSHIFT32 +static OPUS_INLINE opus_int32 silk_ADD_LSHIFT32(opus_int32 a, opus_int32 b, opus_int32 shift){ + opus_int32 ret; + ops_count += 1; + ret = a + (b << shift); + return ret; /* shift >= 0*/ +} +#undef silk_ADD_LSHIFT_uint +static OPUS_INLINE opus_uint32 silk_ADD_LSHIFT_uint(opus_uint32 a, opus_uint32 b, opus_int32 shift){ + opus_uint32 ret; + ops_count += 1; + ret = a + (b << shift); + return ret; /* shift >= 0*/ +} +#undef silk_ADD_RSHIFT +static OPUS_INLINE opus_int32 silk_ADD_RSHIFT(opus_int32 a, opus_int32 b, opus_int32 shift){ + opus_int32 ret; + ops_count += 1; + ret = a + (b >> shift); + return ret; /* shift > 0*/ +} +#undef silk_ADD_RSHIFT32 +static OPUS_INLINE opus_int32 silk_ADD_RSHIFT32(opus_int32 a, opus_int32 b, opus_int32 shift){ + opus_int32 ret; + ops_count += 1; + ret = a + (b >> shift); + return ret; /* shift > 0*/ +} 
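+/* Illustrative aside, not in the upstream source: the ADD_*SHIFT helpers
+   apply the shift to the second operand only, so e.g.
+   silk_ADD_RSHIFT32( 100, 64, 3 ) == 100 + ( 64 >> 3 ) == 108, at a cost
+   of one counted operation in ops_count. */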
+#undef silk_ADD_RSHIFT_uint +static OPUS_INLINE opus_uint32 silk_ADD_RSHIFT_uint(opus_uint32 a, opus_uint32 b, opus_int32 shift){ + opus_uint32 ret; + ops_count += 1; + ret = a + (b >> shift); + return ret; /* shift > 0*/ +} +#undef silk_SUB_LSHIFT32 +static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32(opus_int32 a, opus_int32 b, opus_int32 shift){ + opus_int32 ret; + ops_count += 1; + ret = a - (b << shift); + return ret; /* shift >= 0*/ +} +#undef silk_SUB_RSHIFT32 +static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32(opus_int32 a, opus_int32 b, opus_int32 shift){ + opus_int32 ret; + ops_count += 1; + ret = a - (b >> shift); + return ret; /* shift > 0*/ +} + +#undef silk_RSHIFT_ROUND +static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND(opus_int32 a, opus_int32 shift){ + opus_int32 ret; + ops_count += 3; + ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1; + return ret; +} + +#undef silk_RSHIFT_ROUND64 +static OPUS_INLINE opus_int64 silk_RSHIFT_ROUND64(opus_int64 a, opus_int32 shift){ + opus_int64 ret; + ops_count += 6; + ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1; + return ret; +} + +#undef silk_abs_int64 +static OPUS_INLINE opus_int64 silk_abs_int64(opus_int64 a){ + ops_count += 1; + return (((a) > 0) ? (a) : -(a)); /* Be careful, silk_abs returns wrong when input equals to silk_intXX_MIN*/ +} + +#undef silk_abs_int32 +static OPUS_INLINE opus_int32 silk_abs_int32(opus_int32 a){ + ops_count += 1; + return silk_abs(a); +} + + +#undef silk_min +static silk_min(a, b){ + ops_count += 1; + return (((a) < (b)) ? (a) : (b)); +} +#undef silk_max +static silk_max(a, b){ + ops_count += 1; + return (((a) > (b)) ? (a) : (b)); +} +#undef silk_sign +static silk_sign(a){ + ops_count += 1; + return ((a) > 0 ? 1 : ( (a) < 0 ? -1 : 0 )); +} + +#undef silk_ADD16 +static OPUS_INLINE opus_int16 silk_ADD16(opus_int16 a, opus_int16 b){ + opus_int16 ret; + ops_count += 1; + ret = a + b; + return ret; +} + +#undef silk_ADD32 +static OPUS_INLINE opus_int32 silk_ADD32(opus_int32 a, opus_int32 b){ + opus_int32 ret; + ops_count += 1; + ret = a + b; + return ret; +} + +#undef silk_ADD64 +static OPUS_INLINE opus_int64 silk_ADD64(opus_int64 a, opus_int64 b){ + opus_int64 ret; + ops_count += 2; + ret = a + b; + return ret; +} + +#undef silk_SUB16 +static OPUS_INLINE opus_int16 silk_SUB16(opus_int16 a, opus_int16 b){ + opus_int16 ret; + ops_count += 1; + ret = a - b; + return ret; +} + +#undef silk_SUB32 +static OPUS_INLINE opus_int32 silk_SUB32(opus_int32 a, opus_int32 b){ + opus_int32 ret; + ops_count += 1; + ret = a - b; + return ret; +} + +#undef silk_SUB64 +static OPUS_INLINE opus_int64 silk_SUB64(opus_int64 a, opus_int64 b){ + opus_int64 ret; + ops_count += 2; + ret = a - b; + return ret; +} + +#undef silk_ADD_SAT16 +static OPUS_INLINE opus_int16 silk_ADD_SAT16( opus_int16 a16, opus_int16 b16 ) { + opus_int16 res; + /* Nb will be counted in AKP_add32 and silk_SAT16*/ + res = (opus_int16)silk_SAT16( silk_ADD32( (opus_int32)(a16), (b16) ) ); + return res; +} + +#undef silk_ADD_SAT32 +static OPUS_INLINE opus_int32 silk_ADD_SAT32(opus_int32 a32, opus_int32 b32){ + opus_int32 res; + ops_count += 1; + res = ((((a32) + (b32)) & 0x80000000) == 0 ? \ + ((((a32) & (b32)) & 0x80000000) != 0 ? silk_int32_MIN : (a32)+(b32)) : \ + ((((a32) | (b32)) & 0x80000000) == 0 ? 
silk_int32_MAX : (a32)+(b32)) ); + return res; +} + +#undef silk_ADD_SAT64 +static OPUS_INLINE opus_int64 silk_ADD_SAT64( opus_int64 a64, opus_int64 b64 ) { + opus_int64 res; + ops_count += 1; + res = ((((a64) + (b64)) & 0x8000000000000000LL) == 0 ? \ + ((((a64) & (b64)) & 0x8000000000000000LL) != 0 ? silk_int64_MIN : (a64)+(b64)) : \ + ((((a64) | (b64)) & 0x8000000000000000LL) == 0 ? silk_int64_MAX : (a64)+(b64)) ); + return res; +} + +#undef silk_SUB_SAT16 +static OPUS_INLINE opus_int16 silk_SUB_SAT16( opus_int16 a16, opus_int16 b16 ) { + opus_int16 res; + silk_assert(0); + /* Nb will be counted in sub-macros*/ + res = (opus_int16)silk_SAT16( silk_SUB32( (opus_int32)(a16), (b16) ) ); + return res; +} + +#undef silk_SUB_SAT32 +static OPUS_INLINE opus_int32 silk_SUB_SAT32( opus_int32 a32, opus_int32 b32 ) { + opus_int32 res; + ops_count += 1; + res = ((((a32)-(b32)) & 0x80000000) == 0 ? \ + (( (a32) & ((b32)^0x80000000) & 0x80000000) ? silk_int32_MIN : (a32)-(b32)) : \ + ((((a32)^0x80000000) & (b32) & 0x80000000) ? silk_int32_MAX : (a32)-(b32)) ); + return res; +} + +#undef silk_SUB_SAT64 +static OPUS_INLINE opus_int64 silk_SUB_SAT64( opus_int64 a64, opus_int64 b64 ) { + opus_int64 res; + ops_count += 1; + res = ((((a64)-(b64)) & 0x8000000000000000LL) == 0 ? \ + (( (a64) & ((b64)^0x8000000000000000LL) & 0x8000000000000000LL) ? silk_int64_MIN : (a64)-(b64)) : \ + ((((a64)^0x8000000000000000LL) & (b64) & 0x8000000000000000LL) ? silk_int64_MAX : (a64)-(b64)) ); + + return res; +} + +#undef silk_SMULWW +static OPUS_INLINE opus_int32 silk_SMULWW(opus_int32 a32, opus_int32 b32){ + opus_int32 ret; + /* Nb will be counted in sub-macros*/ + ret = silk_MLA(silk_SMULWB((a32), (b32)), (a32), silk_RSHIFT_ROUND((b32), 16)); + return ret; +} + +#undef silk_SMLAWW +static OPUS_INLINE opus_int32 silk_SMLAWW(opus_int32 a32, opus_int32 b32, opus_int32 c32){ + opus_int32 ret; + /* Nb will be counted in sub-macros*/ + ret = silk_MLA(silk_SMLAWB((a32), (b32), (c32)), (b32), silk_RSHIFT_ROUND((c32), 16)); + return ret; +} + +#undef silk_min_int +static OPUS_INLINE opus_int silk_min_int(opus_int a, opus_int b) +{ + ops_count += 1; + return (((a) < (b)) ? (a) : (b)); +} + +#undef silk_min_16 +static OPUS_INLINE opus_int16 silk_min_16(opus_int16 a, opus_int16 b) +{ + ops_count += 1; + return (((a) < (b)) ? (a) : (b)); +} +#undef silk_min_32 +static OPUS_INLINE opus_int32 silk_min_32(opus_int32 a, opus_int32 b) +{ + ops_count += 1; + return (((a) < (b)) ? (a) : (b)); +} +#undef silk_min_64 +static OPUS_INLINE opus_int64 silk_min_64(opus_int64 a, opus_int64 b) +{ + ops_count += 1; + return (((a) < (b)) ? (a) : (b)); +} + +/* silk_min() versions with typecast in the function call */ +#undef silk_max_int +static OPUS_INLINE opus_int silk_max_int(opus_int a, opus_int b) +{ + ops_count += 1; + return (((a) > (b)) ? (a) : (b)); +} +#undef silk_max_16 +static OPUS_INLINE opus_int16 silk_max_16(opus_int16 a, opus_int16 b) +{ + ops_count += 1; + return (((a) > (b)) ? (a) : (b)); +} +#undef silk_max_32 +static OPUS_INLINE opus_int32 silk_max_32(opus_int32 a, opus_int32 b) +{ + ops_count += 1; + return (((a) > (b)) ? (a) : (b)); +} + +#undef silk_max_64 +static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b) +{ + ops_count += 1; + return (((a) > (b)) ? (a) : (b)); +} + + +#undef silk_LIMIT_int +static OPUS_INLINE opus_int silk_LIMIT_int(opus_int a, opus_int limit1, opus_int limit2) +{ + opus_int ret; + ops_count += 6; + + ret = ((limit1) > (limit2) ? ((a) > (limit1) ? (limit1) : ((a) < (limit2) ? 
(limit2) : (a))) \ + : ((a) > (limit2) ? (limit2) : ((a) < (limit1) ? (limit1) : (a)))); + + return(ret); +} + +#undef silk_LIMIT_16 +static OPUS_INLINE opus_int16 silk_LIMIT_16(opus_int16 a, opus_int16 limit1, opus_int16 limit2) +{ + opus_int16 ret; + ops_count += 6; + + ret = ((limit1) > (limit2) ? ((a) > (limit1) ? (limit1) : ((a) < (limit2) ? (limit2) : (a))) \ + : ((a) > (limit2) ? (limit2) : ((a) < (limit1) ? (limit1) : (a)))); + +return(ret); +} + + +#undef silk_LIMIT_32 +static OPUS_INLINE opus_int silk_LIMIT_32(opus_int32 a, opus_int32 limit1, opus_int32 limit2) +{ + opus_int32 ret; + ops_count += 6; + + ret = ((limit1) > (limit2) ? ((a) > (limit1) ? (limit1) : ((a) < (limit2) ? (limit2) : (a))) \ + : ((a) > (limit2) ? (limit2) : ((a) < (limit1) ? (limit1) : (a)))); + return(ret); +} + +#else +#define varDefine +#define silk_SaveCount() + +#endif +#endif + diff --git a/thirdparty/opus/silk/MacroDebug.h b/thirdparty/opus/silk/MacroDebug.h new file mode 100644 index 000000000000..35aedc5c5fa6 --- /dev/null +++ b/thirdparty/opus/silk/MacroDebug.h @@ -0,0 +1,952 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Copyright (C) 2012 Xiph.Org Foundation +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef MACRO_DEBUG_H +#define MACRO_DEBUG_H + +/* Redefine macro functions with extensive assertion in DEBUG mode. 
+ As functions can't be undefined, this file can't work with SigProcFIX_MacroCount.h */ + +#if ( defined (FIXED_DEBUG) || ( 0 && defined (_DEBUG) ) ) && !defined (silk_MACRO_COUNT) + +#undef silk_ADD16 +#define silk_ADD16(a,b) silk_ADD16_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int16 silk_ADD16_(opus_int16 a, opus_int16 b, char *file, int line){ + opus_int16 ret; + + ret = a + b; + if ( ret != silk_ADD_SAT16( a, b ) ) + { + fprintf (stderr, "silk_ADD16(%d, %d) in %s: line %d\n", a, b, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_ADD32 +#define silk_ADD32(a,b) silk_ADD32_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_ADD32_(opus_int32 a, opus_int32 b, char *file, int line){ + opus_int32 ret; + + ret = a + b; + if ( ret != silk_ADD_SAT32( a, b ) ) + { + fprintf (stderr, "silk_ADD32(%d, %d) in %s: line %d\n", a, b, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_ADD64 +#define silk_ADD64(a,b) silk_ADD64_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int64 silk_ADD64_(opus_int64 a, opus_int64 b, char *file, int line){ + opus_int64 ret; + + ret = a + b; + if ( ret != silk_ADD_SAT64( a, b ) ) + { + fprintf (stderr, "silk_ADD64(%lld, %lld) in %s: line %d\n", (long long)a, (long long)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_SUB16 +#define silk_SUB16(a,b) silk_SUB16_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int16 silk_SUB16_(opus_int16 a, opus_int16 b, char *file, int line){ + opus_int16 ret; + + ret = a - b; + if ( ret != silk_SUB_SAT16( a, b ) ) + { + fprintf (stderr, "silk_SUB16(%d, %d) in %s: line %d\n", a, b, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_SUB32 +#define silk_SUB32(a,b) silk_SUB32_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SUB32_(opus_int32 a, opus_int32 b, char *file, int line){ + opus_int32 ret; + + ret = a - b; + if ( ret != silk_SUB_SAT32( a, b ) ) + { + fprintf (stderr, "silk_SUB32(%d, %d) in %s: line %d\n", a, b, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_SUB64 +#define silk_SUB64(a,b) silk_SUB64_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int64 silk_SUB64_(opus_int64 a, opus_int64 b, char *file, int line){ + opus_int64 ret; + + ret = a - b; + if ( ret != silk_SUB_SAT64( a, b ) ) + { + fprintf (stderr, "silk_SUB64(%lld, %lld) in %s: line %d\n", (long long)a, (long long)b, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_ADD_SAT16 +#define silk_ADD_SAT16(a,b) silk_ADD_SAT16_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int16 silk_ADD_SAT16_( opus_int16 a16, opus_int16 b16, char *file, int line) { + opus_int16 res; + res = (opus_int16)silk_SAT16( silk_ADD32( (opus_int32)(a16), (b16) ) ); + if ( res != silk_SAT16( (opus_int32)a16 + (opus_int32)b16 ) ) + { + fprintf (stderr, "silk_ADD_SAT16(%d, %d) in %s: line %d\n", a16, b16, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return res; +} + +#undef silk_ADD_SAT32 +#define silk_ADD_SAT32(a,b) silk_ADD_SAT32_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_ADD_SAT32_(opus_int32 a32, opus_int32 b32, char *file, int line){ + opus_int32 res; + res = ((((opus_uint32)(a32) + (opus_uint32)(b32)) & 0x80000000) == 0 ? 
\ + ((((a32) & (b32)) & 0x80000000) != 0 ? silk_int32_MIN : (a32)+(b32)) : \ + ((((a32) | (b32)) & 0x80000000) == 0 ? silk_int32_MAX : (a32)+(b32)) ); + if ( res != silk_SAT32( (opus_int64)a32 + (opus_int64)b32 ) ) + { + fprintf (stderr, "silk_ADD_SAT32(%d, %d) in %s: line %d\n", a32, b32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return res; +} + +#undef silk_ADD_SAT64 +#define silk_ADD_SAT64(a,b) silk_ADD_SAT64_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int64 silk_ADD_SAT64_( opus_int64 a64, opus_int64 b64, char *file, int line) { + opus_int64 res; + int fail = 0; + res = ((((a64) + (b64)) & 0x8000000000000000LL) == 0 ? \ + ((((a64) & (b64)) & 0x8000000000000000LL) != 0 ? silk_int64_MIN : (a64)+(b64)) : \ + ((((a64) | (b64)) & 0x8000000000000000LL) == 0 ? silk_int64_MAX : (a64)+(b64)) ); + if( res != a64 + b64 ) { + /* Check that we saturated to the correct extreme value */ + if ( !(( res == silk_int64_MAX && ( ( a64 >> 1 ) + ( b64 >> 1 ) > ( silk_int64_MAX >> 3 ) ) ) || + ( res == silk_int64_MIN && ( ( a64 >> 1 ) + ( b64 >> 1 ) < ( silk_int64_MIN >> 3 ) ) ) ) ) + { + fail = 1; + } + } else { + /* Saturation not necessary */ + fail = res != a64 + b64; + } + if ( fail ) + { + fprintf (stderr, "silk_ADD_SAT64(%lld, %lld) in %s: line %d\n", (long long)a64, (long long)b64, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return res; +} + +#undef silk_SUB_SAT16 +#define silk_SUB_SAT16(a,b) silk_SUB_SAT16_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int16 silk_SUB_SAT16_( opus_int16 a16, opus_int16 b16, char *file, int line ) { + opus_int16 res; + res = (opus_int16)silk_SAT16( silk_SUB32( (opus_int32)(a16), (b16) ) ); + if ( res != silk_SAT16( (opus_int32)a16 - (opus_int32)b16 ) ) + { + fprintf (stderr, "silk_SUB_SAT16(%d, %d) in %s: line %d\n", a16, b16, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return res; +} + +#undef silk_SUB_SAT32 +#define silk_SUB_SAT32(a,b) silk_SUB_SAT32_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SUB_SAT32_( opus_int32 a32, opus_int32 b32, char *file, int line ) { + opus_int32 res; + res = ((((opus_uint32)(a32)-(opus_uint32)(b32)) & 0x80000000) == 0 ? \ + (( (a32) & ((b32)^0x80000000) & 0x80000000) ? silk_int32_MIN : (a32)-(b32)) : \ + ((((a32)^0x80000000) & (b32) & 0x80000000) ? silk_int32_MAX : (a32)-(b32)) ); + if ( res != silk_SAT32( (opus_int64)a32 - (opus_int64)b32 ) ) + { + fprintf (stderr, "silk_SUB_SAT32(%d, %d) in %s: line %d\n", a32, b32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return res; +} + +#undef silk_SUB_SAT64 +#define silk_SUB_SAT64(a,b) silk_SUB_SAT64_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int64 silk_SUB_SAT64_( opus_int64 a64, opus_int64 b64, char *file, int line ) { + opus_int64 res; + int fail = 0; + res = ((((a64)-(b64)) & 0x8000000000000000LL) == 0 ? \ + (( (a64) & ((b64)^0x8000000000000000LL) & 0x8000000000000000LL) ? silk_int64_MIN : (a64)-(b64)) : \ + ((((a64)^0x8000000000000000LL) & (b64) & 0x8000000000000000LL) ? 
silk_int64_MAX : (a64)-(b64)) ); + if( res != a64 - b64 ) { + /* Check that we saturated to the correct extreme value */ + if( !(( res == silk_int64_MAX && ( ( a64 >> 1 ) + ( b64 >> 1 ) > ( silk_int64_MAX >> 3 ) ) ) || + ( res == silk_int64_MIN && ( ( a64 >> 1 ) + ( b64 >> 1 ) < ( silk_int64_MIN >> 3 ) ) ) )) + { + fail = 1; + } + } else { + /* Saturation not necessary */ + fail = res != a64 - b64; + } + if ( fail ) + { + fprintf (stderr, "silk_SUB_SAT64(%lld, %lld) in %s: line %d\n", (long long)a64, (long long)b64, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return res; +} + +#undef silk_MUL +#define silk_MUL(a,b) silk_MUL_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_MUL_(opus_int32 a32, opus_int32 b32, char *file, int line){ + opus_int32 ret; + opus_int64 ret64; + ret = a32 * b32; + ret64 = (opus_int64)a32 * (opus_int64)b32; + if ( (opus_int64)ret != ret64 ) + { + fprintf (stderr, "silk_MUL(%d, %d) in %s: line %d\n", a32, b32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_MUL_uint +#define silk_MUL_uint(a,b) silk_MUL_uint_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_uint32 silk_MUL_uint_(opus_uint32 a32, opus_uint32 b32, char *file, int line){ + opus_uint32 ret; + ret = a32 * b32; + if ( (opus_uint64)ret != (opus_uint64)a32 * (opus_uint64)b32 ) + { + fprintf (stderr, "silk_MUL_uint(%u, %u) in %s: line %d\n", a32, b32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_MLA +#define silk_MLA(a,b,c) silk_MLA_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_MLA_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){ + opus_int32 ret; + ret = a32 + b32 * c32; + if ( (opus_int64)ret != (opus_int64)a32 + (opus_int64)b32 * (opus_int64)c32 ) + { + fprintf (stderr, "silk_MLA(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_MLA_uint +#define silk_MLA_uint(a,b,c) silk_MLA_uint_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_MLA_uint_(opus_uint32 a32, opus_uint32 b32, opus_uint32 c32, char *file, int line){ + opus_uint32 ret; + ret = a32 + b32 * c32; + if ( (opus_int64)ret != (opus_int64)a32 + (opus_int64)b32 * (opus_int64)c32 ) + { + fprintf (stderr, "silk_MLA_uint(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_SMULWB +#define silk_SMULWB(a,b) silk_SMULWB_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SMULWB_(opus_int32 a32, opus_int32 b32, char *file, int line){ + opus_int32 ret; + ret = (a32 >> 16) * (opus_int32)((opus_int16)b32) + (((a32 & 0x0000FFFF) * (opus_int32)((opus_int16)b32)) >> 16); + if ( (opus_int64)ret != ((opus_int64)a32 * (opus_int16)b32) >> 16 ) + { + fprintf (stderr, "silk_SMULWB(%d, %d) in %s: line %d\n", a32, b32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_SMLAWB +#define silk_SMLAWB(a,b,c) silk_SMLAWB_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SMLAWB_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){ + opus_int32 ret; + ret = silk_ADD32( a32, silk_SMULWB( b32, c32 ) ); + if ( silk_ADD32( a32, silk_SMULWB( b32, c32 ) ) != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) ) + { + fprintf (stderr, "silk_SMLAWB(%d, %d, %d) 
in %s: line %d\n", a32, b32, c32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_SMULWT +#define silk_SMULWT(a,b) silk_SMULWT_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SMULWT_(opus_int32 a32, opus_int32 b32, char *file, int line){ + opus_int32 ret; + ret = (a32 >> 16) * (b32 >> 16) + (((a32 & 0x0000FFFF) * (b32 >> 16)) >> 16); + if ( (opus_int64)ret != ((opus_int64)a32 * (b32 >> 16)) >> 16 ) + { + fprintf (stderr, "silk_SMULWT(%d, %d) in %s: line %d\n", a32, b32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_SMLAWT +#define silk_SMLAWT(a,b,c) silk_SMLAWT_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SMLAWT_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){ + opus_int32 ret; + ret = a32 + ((b32 >> 16) * (c32 >> 16)) + (((b32 & 0x0000FFFF) * ((c32 >> 16)) >> 16)); + if ( (opus_int64)ret != (opus_int64)a32 + (((opus_int64)b32 * (c32 >> 16)) >> 16) ) + { + fprintf (stderr, "silk_SMLAWT(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_SMULL +#define silk_SMULL(a,b) silk_SMULL_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int64 silk_SMULL_(opus_int64 a64, opus_int64 b64, char *file, int line){ + opus_int64 ret64; + int fail = 0; + ret64 = a64 * b64; + if( b64 != 0 ) { + fail = a64 != (ret64 / b64); + } else if( a64 != 0 ) { + fail = b64 != (ret64 / a64); + } + if ( fail ) + { + fprintf (stderr, "silk_SMULL(%lld, %lld) in %s: line %d\n", (long long)a64, (long long)b64, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret64; +} + +/* no checking needed for silk_SMULBB */ +#undef silk_SMLABB +#define silk_SMLABB(a,b,c) silk_SMLABB_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SMLABB_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){ + opus_int32 ret; + ret = a32 + (opus_int32)((opus_int16)b32) * (opus_int32)((opus_int16)c32); + if ( (opus_int64)ret != (opus_int64)a32 + (opus_int64)b32 * (opus_int16)c32 ) + { + fprintf (stderr, "silk_SMLABB(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +/* no checking needed for silk_SMULBT */ +#undef silk_SMLABT +#define silk_SMLABT(a,b,c) silk_SMLABT_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SMLABT_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){ + opus_int32 ret; + ret = a32 + ((opus_int32)((opus_int16)b32)) * (c32 >> 16); + if ( (opus_int64)ret != (opus_int64)a32 + (opus_int64)b32 * (c32 >> 16) ) + { + fprintf (stderr, "silk_SMLABT(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +/* no checking needed for silk_SMULTT */ +#undef silk_SMLATT +#define silk_SMLATT(a,b,c) silk_SMLATT_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SMLATT_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){ + opus_int32 ret; + ret = a32 + (b32 >> 16) * (c32 >> 16); + if ( (opus_int64)ret != (opus_int64)a32 + (b32 >> 16) * (c32 >> 16) ) + { + fprintf (stderr, "silk_SMLATT(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_SMULWW +#define 
silk_SMULWW(a,b) silk_SMULWW_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SMULWW_(opus_int32 a32, opus_int32 b32, char *file, int line){ + opus_int32 ret, tmp1, tmp2; + opus_int64 ret64; + int fail = 0; + + ret = silk_SMULWB( a32, b32 ); + tmp1 = silk_RSHIFT_ROUND( b32, 16 ); + tmp2 = silk_MUL( a32, tmp1 ); + + fail |= (opus_int64)tmp2 != (opus_int64) a32 * (opus_int64) tmp1; + + tmp1 = ret; + ret = silk_ADD32( tmp1, tmp2 ); + fail |= silk_ADD32( tmp1, tmp2 ) != silk_ADD_SAT32( tmp1, tmp2 ); + + ret64 = silk_RSHIFT64( silk_SMULL( a32, b32 ), 16 ); + fail |= (opus_int64)ret != ret64; + + if ( fail ) + { + fprintf (stderr, "silk_SMULWT(%d, %d) in %s: line %d\n", a32, b32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + + return ret; +} + +#undef silk_SMLAWW +#define silk_SMLAWW(a,b,c) silk_SMLAWW_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SMLAWW_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){ + opus_int32 ret, tmp; + + tmp = silk_SMULWW( b32, c32 ); + ret = silk_ADD32( a32, tmp ); + if ( ret != silk_ADD_SAT32( a32, tmp ) ) + { + fprintf (stderr, "silk_SMLAWW(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +/* Multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode) */ +#undef silk_MLA_ovflw +#define silk_MLA_ovflw(a32, b32, c32) ((a32) + ((b32) * (c32))) +#undef silk_SMLABB_ovflw +#define silk_SMLABB_ovflw(a32, b32, c32) ((a32) + ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32))) + +/* no checking needed for silk_SMULL + no checking needed for silk_SMLAL + no checking needed for silk_SMLALBB + no checking needed for SigProcFIX_CLZ16 + no checking needed for SigProcFIX_CLZ32*/ + +#undef silk_DIV32 +#define silk_DIV32(a,b) silk_DIV32_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_DIV32_(opus_int32 a32, opus_int32 b32, char *file, int line){ + if ( b32 == 0 ) + { + fprintf (stderr, "silk_DIV32(%d, %d) in %s: line %d\n", a32, b32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return a32 / b32; +} + +#undef silk_DIV32_16 +#define silk_DIV32_16(a,b) silk_DIV32_16_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_DIV32_16_(opus_int32 a32, opus_int32 b32, char *file, int line){ + int fail = 0; + fail |= b32 == 0; + fail |= b32 > silk_int16_MAX; + fail |= b32 < silk_int16_MIN; + if ( fail ) + { + fprintf (stderr, "silk_DIV32_16(%d, %d) in %s: line %d\n", a32, b32, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return a32 / b32; +} + +/* no checking needed for silk_SAT8 + no checking needed for silk_SAT16 + no checking needed for silk_SAT32 + no checking needed for silk_POS_SAT32 + no checking needed for silk_ADD_POS_SAT8 + no checking needed for silk_ADD_POS_SAT16 + no checking needed for silk_ADD_POS_SAT32 + no checking needed for silk_ADD_POS_SAT64 */ + +#undef silk_LSHIFT8 +#define silk_LSHIFT8(a,b) silk_LSHIFT8_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *file, int line){ + opus_int8 ret; + int fail = 0; + ret = a << shift; + fail |= shift < 0; + fail |= shift >= 8; + fail |= (opus_int64)ret != ((opus_int64)a) << shift; + if ( fail ) + { + fprintf (stderr, "silk_LSHIFT8(%d, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + 
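+/* Illustrative aside, not in the upstream source: the checked shifts that
+   follow use the same pattern as silk_LSHIFT8_ above: do the shift at the
+   native width, redo it in 64 bits, and print file/line on any mismatch.
+   E.g. silk_LSHIFT16( 0x4000, 2 ) is flagged, because 0x10000 does not
+   fit in an opus_int16. */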
+#undef silk_LSHIFT16 +#define silk_LSHIFT16(a,b) silk_LSHIFT16_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, char *file, int line){ + opus_int16 ret; + int fail = 0; + ret = a << shift; + fail |= shift < 0; + fail |= shift >= 16; + fail |= (opus_int64)ret != ((opus_int64)a) << shift; + if ( fail ) + { + fprintf (stderr, "silk_LSHIFT16(%d, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_LSHIFT32 +#define silk_LSHIFT32(a,b) silk_LSHIFT32_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, char *file, int line){ + opus_int32 ret; + int fail = 0; + ret = a << shift; + fail |= shift < 0; + fail |= shift >= 32; + fail |= (opus_int64)ret != ((opus_int64)a) << shift; + if ( fail ) + { + fprintf (stderr, "silk_LSHIFT32(%d, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_LSHIFT64 +#define silk_LSHIFT64(a,b) silk_LSHIFT64_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int64 silk_LSHIFT64_(opus_int64 a, opus_int shift, char *file, int line){ + opus_int64 ret; + int fail = 0; + ret = a << shift; + fail |= shift < 0; + fail |= shift >= 64; + fail |= (ret>>shift) != ((opus_int64)a); + if ( fail ) + { + fprintf (stderr, "silk_LSHIFT64(%lld, %d) in %s: line %d\n", (long long)a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_LSHIFT_ovflw +#define silk_LSHIFT_ovflw(a,b) silk_LSHIFT_ovflw_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_LSHIFT_ovflw_(opus_int32 a, opus_int32 shift, char *file, int line){ + if ( (shift < 0) || (shift >= 32) ) /* no check for overflow */ + { + fprintf (stderr, "silk_LSHIFT_ovflw(%d, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return a << shift; +} + +#undef silk_LSHIFT_uint +#define silk_LSHIFT_uint(a,b) silk_LSHIFT_uint_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_uint32 silk_LSHIFT_uint_(opus_uint32 a, opus_int32 shift, char *file, int line){ + opus_uint32 ret; + ret = a << shift; + if ( (shift < 0) || ((opus_int64)ret != ((opus_int64)a) << shift)) + { + fprintf (stderr, "silk_LSHIFT_uint(%u, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_RSHIFT8 +#define silk_RSHIFT8(a,b) silk_RSHIFT8_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int8 silk_RSHIFT8_(opus_int8 a, opus_int32 shift, char *file, int line){ + if ( (shift < 0) || (shift>=8) ) + { + fprintf (stderr, "silk_RSHIFT8(%d, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return a >> shift; +} + +#undef silk_RSHIFT16 +#define silk_RSHIFT16(a,b) silk_RSHIFT16_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int16 silk_RSHIFT16_(opus_int16 a, opus_int32 shift, char *file, int line){ + if ( (shift < 0) || (shift>=16) ) + { + fprintf (stderr, "silk_RSHIFT16(%d, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return a >> shift; +} + +#undef silk_RSHIFT32 +#define silk_RSHIFT32(a,b) silk_RSHIFT32_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_RSHIFT32_(opus_int32 a, opus_int32 shift, char *file, int line){ + if ( (shift < 0) ||
(shift>=32) ) + { + fprintf (stderr, "silk_RSHIFT32(%d, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return a >> shift; +} + +#undef silk_RSHIFT64 +#define silk_RSHIFT64(a,b) silk_RSHIFT64_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int64 silk_RSHIFT64_(opus_int64 a, opus_int64 shift, char *file, int line){ + if ( (shift < 0) || (shift>=64) ) + { + fprintf (stderr, "silk_RSHIFT64(%lld, %lld) in %s: line %d\n", (long long)a, (long long)shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return a >> shift; +} + +#undef silk_RSHIFT_uint +#define silk_RSHIFT_uint(a,b) silk_RSHIFT_uint_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_uint32 silk_RSHIFT_uint_(opus_uint32 a, opus_int32 shift, char *file, int line){ + if ( (shift < 0) || (shift>32) ) + { + fprintf (stderr, "silk_RSHIFT_uint(%u, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return a >> shift; +} + +#undef silk_ADD_LSHIFT +#define silk_ADD_LSHIFT(a,b,c) silk_ADD_LSHIFT_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int line){ + opus_int16 ret; + ret = a + (b << shift); + if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) ) + { + fprintf (stderr, "silk_ADD_LSHIFT(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; /* shift >= 0 */ +} + +#undef silk_ADD_LSHIFT32 +#define silk_ADD_LSHIFT32(a,b,c) silk_ADD_LSHIFT32_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_ADD_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ + opus_int32 ret; + ret = a + (b << shift); + if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) ) + { + fprintf (stderr, "silk_ADD_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; /* shift >= 0 */ +} + +#undef silk_ADD_LSHIFT_uint +#define silk_ADD_LSHIFT_uint(a,b,c) silk_ADD_LSHIFT_uint_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_uint32 silk_ADD_LSHIFT_uint_(opus_uint32 a, opus_uint32 b, opus_int32 shift, char *file, int line){ + opus_uint32 ret; + ret = a + (b << shift); + if ( (shift < 0) || (shift>32) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) ) + { + fprintf (stderr, "silk_ADD_LSHIFT_uint(%u, %u, %d) in %s: line %d\n", a, b, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; /* shift >= 0 */ +} + +#undef silk_ADD_RSHIFT +#define silk_ADD_RSHIFT(a,b,c) silk_ADD_RSHIFT_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE int silk_ADD_RSHIFT_(int a, int b, int shift, char *file, int line){ + opus_int16 ret; + ret = a + (b >> shift); + if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) >> shift)) ) + { + fprintf (stderr, "silk_ADD_RSHIFT(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; /* shift > 0 */ +} + +#undef silk_ADD_RSHIFT32 +#define silk_ADD_RSHIFT32(a,b,c) silk_ADD_RSHIFT32_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_ADD_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ + opus_int32 ret; + ret = a + (b >> shift); + if
( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) >> shift)) ) + { + fprintf (stderr, "silk_ADD_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; /* shift > 0 */ +} + +#undef silk_ADD_RSHIFT_uint +#define silk_ADD_RSHIFT_uint(a,b,c) silk_ADD_RSHIFT_uint_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_uint32 silk_ADD_RSHIFT_uint_(opus_uint32 a, opus_uint32 b, opus_int32 shift, char *file, int line){ + opus_uint32 ret; + ret = a + (b >> shift); + if ( (shift < 0) || (shift>32) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) >> shift)) ) + { + fprintf (stderr, "silk_ADD_RSHIFT_uint(%u, %u, %d) in %s: line %d\n", a, b, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; /* shift > 0 */ +} + +#undef silk_SUB_LSHIFT32 +#define silk_SUB_LSHIFT32(a,b,c) silk_SUB_LSHIFT32_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ + opus_int32 ret; + ret = a - (b << shift); + if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) << shift)) ) + { + fprintf (stderr, "silk_SUB_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; /* shift >= 0 */ +} + +#undef silk_SUB_RSHIFT32 +#define silk_SUB_RSHIFT32(a,b,c) silk_SUB_RSHIFT32_((a), (b), (c), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){ + opus_int32 ret; + ret = a - (b >> shift); + if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) >> shift)) ) + { + fprintf (stderr, "silk_SUB_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; /* shift > 0 */ +} + +#undef silk_RSHIFT_ROUND +#define silk_RSHIFT_ROUND(a,b) silk_RSHIFT_ROUND_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift, char *file, int line){ + opus_int32 ret; + ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1; + /* the macro definition can't handle a shift of zero */ + if ( (shift <= 0) || (shift>31) || ((opus_int64)ret != ((opus_int64)a + ((opus_int64)1 << (shift - 1))) >> shift) ) + { + fprintf (stderr, "silk_RSHIFT_ROUND(%d, %d) in %s: line %d\n", a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return ret; +} + +#undef silk_RSHIFT_ROUND64 +#define silk_RSHIFT_ROUND64(a,b) silk_RSHIFT_ROUND64_((a), (b), __FILE__, __LINE__) +static OPUS_INLINE opus_int64 silk_RSHIFT_ROUND64_(opus_int64 a, opus_int32 shift, char *file, int line){ + opus_int64 ret; + /* the macro definition can't handle a shift of zero */ + if ( (shift <= 0) || (shift>=64) ) + { + fprintf (stderr, "silk_RSHIFT_ROUND64(%lld, %d) in %s: line %d\n", (long long)a, shift, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1; + return ret; +} + 
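+/* Both rounding shifts implement round-to-nearest division by 2^shift, conceptually (a + (1 << (shift - 1))) >> shift, with ties rounded towards +infinity; the two-step form used here shifts before adding the rounding constant, which avoids overflowing the intermediate. Worked examples (added for illustration, not from the Opus sources): silk_RSHIFT_ROUND( 5, 1 ) == 3 (2.5 rounds to 3); silk_RSHIFT_ROUND( -5, 1 ) == -2 (-2.5 rounds to -2); silk_RSHIFT_ROUND( 6, 2 ) == 2 (1.5 rounds to 2). */ +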
+/* silk_abs is used on floats also, so doesn't work... */ +/*#undef silk_abs +static OPUS_INLINE opus_int32 silk_abs(opus_int32 a){ + silk_assert(a != 0x80000000); + return (((a) > 0) ? (a) : -(a)); // Be careful, silk_abs returns the wrong result when the input equals silk_intXX_MIN +}*/ + +#undef silk_abs_int64 +#define silk_abs_int64(a) silk_abs_int64_((a), __FILE__, __LINE__) +static OPUS_INLINE opus_int64 silk_abs_int64_(opus_int64 a, char *file, int line){ + if ( a == silk_int64_MIN ) + { + fprintf (stderr, "silk_abs_int64(%lld) in %s: line %d\n", (long long)a, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return (((a) > 0) ? (a) : -(a)); /* Be careful, silk_abs returns the wrong result when the input equals silk_intXX_MIN */ +} + +#undef silk_abs_int32 +#define silk_abs_int32(a) silk_abs_int32_((a), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_abs_int32_(opus_int32 a, char *file, int line){ + if ( a == silk_int32_MIN ) + { + fprintf (stderr, "silk_abs_int32(%d) in %s: line %d\n", a, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return silk_abs(a); +} + +#undef silk_CHECK_FIT8 +#define silk_CHECK_FIT8(a) silk_CHECK_FIT8_((a), __FILE__, __LINE__) +static OPUS_INLINE opus_int8 silk_CHECK_FIT8_( opus_int64 a, char *file, int line ){ + opus_int8 ret; + ret = (opus_int8)a; + if ( (opus_int64)ret != a ) + { + fprintf (stderr, "silk_CHECK_FIT8(%lld) in %s: line %d\n", (long long)a, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return( ret ); +} + +#undef silk_CHECK_FIT16 +#define silk_CHECK_FIT16(a) silk_CHECK_FIT16_((a), __FILE__, __LINE__) +static OPUS_INLINE opus_int16 silk_CHECK_FIT16_( opus_int64 a, char *file, int line ){ + opus_int16 ret; + ret = (opus_int16)a; + if ( (opus_int64)ret != a ) + { + fprintf (stderr, "silk_CHECK_FIT16(%lld) in %s: line %d\n", (long long)a, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return( ret ); +} + +#undef silk_CHECK_FIT32 +#define silk_CHECK_FIT32(a) silk_CHECK_FIT32_((a), __FILE__, __LINE__) +static OPUS_INLINE opus_int32 silk_CHECK_FIT32_( opus_int64 a, char *file, int line ){ + opus_int32 ret; + ret = (opus_int32)a; + if ( (opus_int64)ret != a ) + { + fprintf (stderr, "silk_CHECK_FIT32(%lld) in %s: line %d\n", (long long)a, file, line); +#ifdef FIXED_DEBUG_ASSERT + silk_assert( 0 ); +#endif + } + return( ret ); +} + +/* no checking for silk_NSHIFT_MUL_32_32 + no checking for silk_NSHIFT_MUL_16_16 + no checking needed for silk_min + no checking needed for silk_max + no checking needed for silk_sign +*/ + +#endif +#endif /* MACRO_DEBUG_H */ diff --git a/thirdparty/opus/silk/NLSF2A.c b/thirdparty/opus/silk/NLSF2A.c new file mode 100644 index 000000000000..b1c559ea6821 --- /dev/null +++ b/thirdparty/opus/silk/NLSF2A.c @@ -0,0 +1,178 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/* conversion between prediction filter coefficients and LSFs */ +/* order should be even */ +/* a piecewise linear approximation maps LSF <-> cos(LSF) */ +/* therefore the result is not accurate LSFs, but the two */ +/* functions are accurate inverses of each other */ + +#include "SigProc_FIX.h" +#include "tables.h" + +#define QA 16 + +/* helper function for NLSF2A(..) */ +static OPUS_INLINE void silk_NLSF2A_find_poly( + opus_int32 *out, /* O intermediate polynomial, QA [dd+1] */ + const opus_int32 *cLSF, /* I vector of interleaved 2*cos(LSFs), QA [d] */ + opus_int dd /* I polynomial order (= 1/2 * filter order) */ +) +{ + opus_int k, n; + opus_int32 ftmp; + + out[0] = silk_LSHIFT( 1, QA ); + out[1] = -cLSF[0]; + for( k = 1; k < dd; k++ ) { + ftmp = cLSF[2*k]; /* QA*/ + out[k+1] = silk_LSHIFT( out[k-1], 1 ) - (opus_int32)silk_RSHIFT_ROUND64( silk_SMULL( ftmp, out[k] ), QA ); + for( n = k; n > 1; n-- ) { + out[n] += out[n-2] - (opus_int32)silk_RSHIFT_ROUND64( silk_SMULL( ftmp, out[n-1] ), QA ); + } + out[1] -= ftmp; + } +} + +/* compute whitening filter coefficients from normalized line spectral frequencies */ +void silk_NLSF2A( + opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */ + const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */ + const opus_int d /* I filter order (should be even) */ +) +{ + /* This ordering was found to maximize quality. It improves numerical accuracy of + silk_NLSF2A_find_poly() compared to "standard" ordering. */ + static const unsigned char ordering16[16] = { + 0, 15, 8, 7, 4, 11, 12, 3, 2, 13, 10, 5, 6, 9, 14, 1 + }; + static const unsigned char ordering10[10] = { + 0, 9, 6, 3, 4, 5, 8, 1, 2, 7 + }; + const unsigned char *ordering; + opus_int k, i, dd; + opus_int32 cos_LSF_QA[ SILK_MAX_ORDER_LPC ]; + opus_int32 P[ SILK_MAX_ORDER_LPC / 2 + 1 ], Q[ SILK_MAX_ORDER_LPC / 2 + 1 ]; + opus_int32 Ptmp, Qtmp, f_int, f_frac, cos_val, delta; + opus_int32 a32_QA1[ SILK_MAX_ORDER_LPC ]; + opus_int32 maxabs, absval, idx=0, sc_Q16; + + silk_assert( LSF_COS_TAB_SZ_FIX == 128 ); + silk_assert( d==10||d==16 ); + + /* convert LSFs to 2*cos(LSF), using piecewise linear curve from table */ + ordering = d == 16 ? 
ordering16 : ordering10; + for( k = 0; k < d; k++ ) { + silk_assert(NLSF[k] >= 0 ); + + /* f_int on a scale 0-127 (rounded down) */ + f_int = silk_RSHIFT( NLSF[k], 15 - 7 ); + + /* f_frac, range: 0..255 */ + f_frac = NLSF[k] - silk_LSHIFT( f_int, 15 - 7 ); + + silk_assert(f_int >= 0); + silk_assert(f_int < LSF_COS_TAB_SZ_FIX ); + + /* Read start and end value from table */ + cos_val = silk_LSFCosTab_FIX_Q12[ f_int ]; /* Q12 */ + delta = silk_LSFCosTab_FIX_Q12[ f_int + 1 ] - cos_val; /* Q12, with a range of 0..200 */ + + /* Linear interpolation */ + cos_LSF_QA[ordering[k]] = silk_RSHIFT_ROUND( silk_LSHIFT( cos_val, 8 ) + silk_MUL( delta, f_frac ), 20 - QA ); /* QA */ + } + + dd = silk_RSHIFT( d, 1 ); + + /* generate even and odd polynomials using convolution */ + silk_NLSF2A_find_poly( P, &cos_LSF_QA[ 0 ], dd ); + silk_NLSF2A_find_poly( Q, &cos_LSF_QA[ 1 ], dd ); + + /* convert even and odd polynomials to opus_int32 Q12 filter coefs */ + for( k = 0; k < dd; k++ ) { + Ptmp = P[ k+1 ] + P[ k ]; + Qtmp = Q[ k+1 ] - Q[ k ]; + + /* the Ptmp and Qtmp values at this stage need to fit in int32 */ + a32_QA1[ k ] = -Qtmp - Ptmp; /* QA+1 */ + a32_QA1[ d-k-1 ] = Qtmp - Ptmp; /* QA+1 */ + } + + /* Limit the maximum absolute value of the prediction coefficients, so that they'll fit in int16 */ + for( i = 0; i < 10; i++ ) { + /* Find maximum absolute value and its index */ + maxabs = 0; + for( k = 0; k < d; k++ ) { + absval = silk_abs( a32_QA1[k] ); + if( absval > maxabs ) { + maxabs = absval; + idx = k; + } + } + maxabs = silk_RSHIFT_ROUND( maxabs, QA + 1 - 12 ); /* QA+1 -> Q12 */ + + if( maxabs > silk_int16_MAX ) { + /* Reduce magnitude of prediction coefficients */ + maxabs = silk_min( maxabs, 163838 ); /* ( silk_int32_MAX >> 14 ) + silk_int16_MAX = 163838 */ + sc_Q16 = SILK_FIX_CONST( 0.999, 16 ) - silk_DIV32( silk_LSHIFT( maxabs - silk_int16_MAX, 14 ), + silk_RSHIFT32( silk_MUL( maxabs, idx + 1), 2 ) ); + silk_bwexpander_32( a32_QA1, d, sc_Q16 ); + } else { + break; + } + } + + if( i == 10 ) { + /* Reached the last iteration, clip the coefficients */ + for( k = 0; k < d; k++ ) { + a_Q12[ k ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( a32_QA1[ k ], QA + 1 - 12 ) ); /* QA+1 -> Q12 */ + a32_QA1[ k ] = silk_LSHIFT( (opus_int32)a_Q12[ k ], QA + 1 - 12 ); + } + } else { + for( k = 0; k < d; k++ ) { + a_Q12[ k ] = (opus_int16)silk_RSHIFT_ROUND( a32_QA1[ k ], QA + 1 - 12 ); /* QA+1 -> Q12 */ + } + } + + for( i = 0; i < MAX_LPC_STABILIZE_ITERATIONS; i++ ) { + if( silk_LPC_inverse_pred_gain( a_Q12, d ) < SILK_FIX_CONST( 1.0 / MAX_PREDICTION_POWER_GAIN, 30 ) ) { + /* Prediction coefficients are (too close to) unstable; apply bandwidth expansion */ + /* on the unscaled coefficients, convert to Q12 and measure again */ + silk_bwexpander_32( a32_QA1, d, 65536 - silk_LSHIFT( 2, i ) ); + for( k = 0; k < d; k++ ) { + a_Q12[ k ] = (opus_int16)silk_RSHIFT_ROUND( a32_QA1[ k ], QA + 1 - 12 ); /* QA+1 -> Q12 */ + } + } else { + break; + } + } +} + diff --git a/thirdparty/opus/silk/NLSF_VQ.c b/thirdparty/opus/silk/NLSF_VQ.c new file mode 100644 index 000000000000..69b6e22e1896 --- /dev/null +++ b/thirdparty/opus/silk/NLSF_VQ.c @@ -0,0 +1,68 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Compute quantization errors for an LPC_order element input vector for a VQ codebook */ +void silk_NLSF_VQ( + opus_int32 err_Q26[], /* O Quantization errors [K] */ + const opus_int16 in_Q15[], /* I Input vectors to be quantized [LPC_order] */ + const opus_uint8 pCB_Q8[], /* I Codebook vectors [K*LPC_order] */ + const opus_int K, /* I Number of codebook vectors */ + const opus_int LPC_order /* I Number of LPCs */ +) +{ + opus_int i, m; + opus_int32 diff_Q15, sum_error_Q30, sum_error_Q26; + + silk_assert( LPC_order <= 16 ); + silk_assert( ( LPC_order & 1 ) == 0 ); + + /* Loop over codebook */ + for( i = 0; i < K; i++ ) { + sum_error_Q26 = 0; + for( m = 0; m < LPC_order; m += 2 ) { + /* Compute weighted squared quantization error for index m */ + diff_Q15 = silk_SUB_LSHIFT32( in_Q15[ m ], (opus_int32)*pCB_Q8++, 7 ); /* range: [ -32767 : 32767 ]*/ + sum_error_Q30 = silk_SMULBB( diff_Q15, diff_Q15 ); + + /* Compute weighted squared quantization error for index m + 1 */ + diff_Q15 = silk_SUB_LSHIFT32( in_Q15[m + 1], (opus_int32)*pCB_Q8++, 7 ); /* range: [ -32767 : 32767 ]*/ + sum_error_Q30 = silk_SMLABB( sum_error_Q30, diff_Q15, diff_Q15 ); + + sum_error_Q26 = silk_ADD_RSHIFT32( sum_error_Q26, sum_error_Q30, 4 ); + + silk_assert( sum_error_Q26 >= 0 ); + silk_assert( sum_error_Q30 >= 0 ); + } + err_Q26[ i ] = sum_error_Q26; + } +} diff --git a/thirdparty/opus/silk/NLSF_VQ_weights_laroia.c b/thirdparty/opus/silk/NLSF_VQ_weights_laroia.c new file mode 100644 index 000000000000..04894c59ab7e --- /dev/null +++ b/thirdparty/opus/silk/NLSF_VQ_weights_laroia.c @@ -0,0 +1,80 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "define.h" +#include "SigProc_FIX.h" + +/* +R. Laroia, N. Phamdo and N. Farvardin, "Robust and Efficient Quantization of Speech LSP +Parameters Using Structured Vector Quantization", Proc. IEEE Int. Conf. Acoust., Speech, +Signal Processing, pp. 641-644, 1991. 
+*/ + +/* Laroia low complexity NLSF weights */ +void silk_NLSF_VQ_weights_laroia( + opus_int16 *pNLSFW_Q_OUT, /* O Pointer to input vector weights [D] */ + const opus_int16 *pNLSF_Q15, /* I Pointer to input vector [D] */ + const opus_int D /* I Input vector dimension (even) */ +) +{ + opus_int k; + opus_int32 tmp1_int, tmp2_int; + + silk_assert( D > 0 ); + silk_assert( ( D & 1 ) == 0 ); + + /* First value */ + tmp1_int = silk_max_int( pNLSF_Q15[ 0 ], 1 ); + tmp1_int = silk_DIV32_16( (opus_int32)1 << ( 15 + NLSF_W_Q ), tmp1_int ); + tmp2_int = silk_max_int( pNLSF_Q15[ 1 ] - pNLSF_Q15[ 0 ], 1 ); + tmp2_int = silk_DIV32_16( (opus_int32)1 << ( 15 + NLSF_W_Q ), tmp2_int ); + pNLSFW_Q_OUT[ 0 ] = (opus_int16)silk_min_int( tmp1_int + tmp2_int, silk_int16_MAX ); + silk_assert( pNLSFW_Q_OUT[ 0 ] > 0 ); + + /* Main loop */ + for( k = 1; k < D - 1; k += 2 ) { + tmp1_int = silk_max_int( pNLSF_Q15[ k + 1 ] - pNLSF_Q15[ k ], 1 ); + tmp1_int = silk_DIV32_16( (opus_int32)1 << ( 15 + NLSF_W_Q ), tmp1_int ); + pNLSFW_Q_OUT[ k ] = (opus_int16)silk_min_int( tmp1_int + tmp2_int, silk_int16_MAX ); + silk_assert( pNLSFW_Q_OUT[ k ] > 0 ); + + tmp2_int = silk_max_int( pNLSF_Q15[ k + 2 ] - pNLSF_Q15[ k + 1 ], 1 ); + tmp2_int = silk_DIV32_16( (opus_int32)1 << ( 15 + NLSF_W_Q ), tmp2_int ); + pNLSFW_Q_OUT[ k + 1 ] = (opus_int16)silk_min_int( tmp1_int + tmp2_int, silk_int16_MAX ); + silk_assert( pNLSFW_Q_OUT[ k + 1 ] > 0 ); + } + + /* Last value */ + tmp1_int = silk_max_int( ( 1 << 15 ) - pNLSF_Q15[ D - 1 ], 1 ); + tmp1_int = silk_DIV32_16( (opus_int32)1 << ( 15 + NLSF_W_Q ), tmp1_int ); + pNLSFW_Q_OUT[ D - 1 ] = (opus_int16)silk_min_int( tmp1_int + tmp2_int, silk_int16_MAX ); + silk_assert( pNLSFW_Q_OUT[ D - 1 ] > 0 ); +} diff --git a/thirdparty/opus/silk/NLSF_decode.c b/thirdparty/opus/silk/NLSF_decode.c new file mode 100644 index 000000000000..9f715060b8ef --- /dev/null +++ b/thirdparty/opus/silk/NLSF_decode.c @@ -0,0 +1,101 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Predictive dequantizer for NLSF residuals */ +static OPUS_INLINE void silk_NLSF_residual_dequant( + opus_int16 x_Q10[], /* O Output [ order ] */ + const opus_int8 indices[], /* I Quantization indices [ order ] */ + const opus_uint8 pred_coef_Q8[], /* I Backward predictor coefs [ order ] */ + const opus_int quant_step_size_Q16, /* I Quantization step size */ + const opus_int16 order /* I Number of input values */ +) +{ + opus_int i, out_Q10, pred_Q10; + + out_Q10 = 0; + for( i = order-1; i >= 0; i-- ) { + pred_Q10 = silk_RSHIFT( silk_SMULBB( out_Q10, (opus_int16)pred_coef_Q8[ i ] ), 8 ); + out_Q10 = silk_LSHIFT( indices[ i ], 10 ); + if( out_Q10 > 0 ) { + out_Q10 = silk_SUB16( out_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) ); + } else if( out_Q10 < 0 ) { + out_Q10 = silk_ADD16( out_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) ); + } + out_Q10 = silk_SMLAWB( pred_Q10, (opus_int32)out_Q10, quant_step_size_Q16 ); + x_Q10[ i ] = out_Q10; + } +} + + +/***********************/ +/* NLSF vector decoder */ +/***********************/ +void silk_NLSF_decode( + opus_int16 *pNLSF_Q15, /* O Quantized NLSF vector [ LPC_ORDER ] */ + opus_int8 *NLSFIndices, /* I Codebook path vector [ LPC_ORDER + 1 ] */ + const silk_NLSF_CB_struct *psNLSF_CB /* I Codebook object */ +) +{ + opus_int i; + opus_uint8 pred_Q8[ MAX_LPC_ORDER ]; + opus_int16 ec_ix[ MAX_LPC_ORDER ]; + opus_int16 res_Q10[ MAX_LPC_ORDER ]; + opus_int16 W_tmp_QW[ MAX_LPC_ORDER ]; + opus_int32 W_tmp_Q9, NLSF_Q15_tmp; + const opus_uint8 *pCB_element; + + /* Decode first stage */ + pCB_element = &psNLSF_CB->CB1_NLSF_Q8[ NLSFIndices[ 0 ] * psNLSF_CB->order ]; + for( i = 0; i < psNLSF_CB->order; i++ ) { + pNLSF_Q15[ i ] = silk_LSHIFT( (opus_int16)pCB_element[ i ], 7 ); + } + + /* Unpack entropy table indices and predictor for current CB1 index */ + silk_NLSF_unpack( ec_ix, pred_Q8, psNLSF_CB, NLSFIndices[ 0 ] ); + + /* Predictive residual dequantizer */ + silk_NLSF_residual_dequant( res_Q10, &NLSFIndices[ 1 ], pred_Q8, psNLSF_CB->quantStepSize_Q16, psNLSF_CB->order ); + + /* Weights from codebook vector */ + silk_NLSF_VQ_weights_laroia( W_tmp_QW, pNLSF_Q15, psNLSF_CB->order ); + + /* Apply inverse square-rooted weights and add to output */ + for( i = 0; i < psNLSF_CB->order; i++ ) { + W_tmp_Q9 = silk_SQRT_APPROX( silk_LSHIFT( (opus_int32)W_tmp_QW[ i ], 18 - NLSF_W_Q ) ); + NLSF_Q15_tmp = silk_ADD32( pNLSF_Q15[ i ], silk_DIV32_16( silk_LSHIFT( (opus_int32)res_Q10[ i ], 14 ), W_tmp_Q9 ) ); + pNLSF_Q15[ i ] = (opus_int16)silk_LIMIT( NLSF_Q15_tmp, 0, 32767 ); + } + + /* NLSF stabilization */ + silk_NLSF_stabilize( pNLSF_Q15, psNLSF_CB->deltaMin_Q15, psNLSF_CB->order ); +} diff --git a/thirdparty/opus/silk/NLSF_del_dec_quant.c b/thirdparty/opus/silk/NLSF_del_dec_quant.c new file mode 100644 index 000000000000..de88fee060e3 --- /dev/null +++ b/thirdparty/opus/silk/NLSF_del_dec_quant.c @@ -0,0 +1,217 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Delayed-decision quantizer for NLSF residuals */ +opus_int32 silk_NLSF_del_dec_quant( /* O Returns RD value in Q25 */ + opus_int8 indices[], /* O Quantization indices [ order ] */ + const opus_int16 x_Q10[], /* I Input [ order ] */ + const opus_int16 w_Q5[], /* I Weights [ order ] */ + const opus_uint8 pred_coef_Q8[], /* I Backward predictor coefs [ order ] */ + const opus_int16 ec_ix[], /* I Indices to entropy coding tables [ order ] */ + const opus_uint8 ec_rates_Q5[], /* I Rates [] */ + const opus_int quant_step_size_Q16, /* I Quantization step size */ + const opus_int16 inv_quant_step_size_Q6, /* I Inverse quantization step size */ + const opus_int32 mu_Q20, /* I R/D tradeoff */ + const opus_int16 order /* I Number of input values */ +) +{ + opus_int i, j, nStates, ind_tmp, ind_min_max, ind_max_min, in_Q10, res_Q10; + opus_int pred_Q10, diff_Q10, rate0_Q5, rate1_Q5; + opus_int16 out0_Q10, out1_Q10; + opus_int32 RD_tmp_Q25, min_Q25, min_max_Q25, max_min_Q25; + opus_int ind_sort[ NLSF_QUANT_DEL_DEC_STATES ]; + opus_int8 ind[ NLSF_QUANT_DEL_DEC_STATES ][ MAX_LPC_ORDER ]; + opus_int16 prev_out_Q10[ 2 * NLSF_QUANT_DEL_DEC_STATES ]; + opus_int32 RD_Q25[ 2 * NLSF_QUANT_DEL_DEC_STATES ]; + opus_int32 RD_min_Q25[ NLSF_QUANT_DEL_DEC_STATES ]; + opus_int32 RD_max_Q25[ NLSF_QUANT_DEL_DEC_STATES ]; + const opus_uint8 *rates_Q5; + + opus_int out0_Q10_table[2 * NLSF_QUANT_MAX_AMPLITUDE_EXT]; + opus_int out1_Q10_table[2 * NLSF_QUANT_MAX_AMPLITUDE_EXT]; + + for (i = -NLSF_QUANT_MAX_AMPLITUDE_EXT; i <= NLSF_QUANT_MAX_AMPLITUDE_EXT-1; i++) + { + out0_Q10 = silk_LSHIFT( i, 10 ); + out1_Q10 = silk_ADD16( out0_Q10, 1024 ); + if( i > 0 ) { + out0_Q10 = silk_SUB16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) ); + out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) ); + } else if( i == 0 ) { + out1_Q10 = silk_SUB16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) ); + } else if( i == -1 ) { + out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) ); + } else { + out0_Q10 = silk_ADD16( out0_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) ); + out1_Q10 = silk_ADD16( out1_Q10, SILK_FIX_CONST( NLSF_QUANT_LEVEL_ADJ, 10 ) ); + } + 
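/* Scale the two level-adjusted outputs by the quantization step size and cache them, so the search loop below can dequantize any candidate index with a single table lookup. */ +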
out0_Q10_table[ i + NLSF_QUANT_MAX_AMPLITUDE_EXT ] = silk_RSHIFT( silk_SMULBB( out0_Q10, quant_step_size_Q16 ), 16 ); + out1_Q10_table[ i + NLSF_QUANT_MAX_AMPLITUDE_EXT ] = silk_RSHIFT( silk_SMULBB( out1_Q10, quant_step_size_Q16 ), 16 ); + } + + silk_assert( (NLSF_QUANT_DEL_DEC_STATES & (NLSF_QUANT_DEL_DEC_STATES-1)) == 0 ); /* must be power of two */ + + nStates = 1; + RD_Q25[ 0 ] = 0; + prev_out_Q10[ 0 ] = 0; + for( i = order - 1; ; i-- ) { + rates_Q5 = &ec_rates_Q5[ ec_ix[ i ] ]; + in_Q10 = x_Q10[ i ]; + for( j = 0; j < nStates; j++ ) { + pred_Q10 = silk_RSHIFT( silk_SMULBB( (opus_int16)pred_coef_Q8[ i ], prev_out_Q10[ j ] ), 8 ); + res_Q10 = silk_SUB16( in_Q10, pred_Q10 ); + ind_tmp = silk_RSHIFT( silk_SMULBB( inv_quant_step_size_Q6, res_Q10 ), 16 ); + ind_tmp = silk_LIMIT( ind_tmp, -NLSF_QUANT_MAX_AMPLITUDE_EXT, NLSF_QUANT_MAX_AMPLITUDE_EXT-1 ); + ind[ j ][ i ] = (opus_int8)ind_tmp; + + /* compute outputs for ind_tmp and ind_tmp + 1 */ + out0_Q10 = out0_Q10_table[ ind_tmp + NLSF_QUANT_MAX_AMPLITUDE_EXT ]; + out1_Q10 = out1_Q10_table[ ind_tmp + NLSF_QUANT_MAX_AMPLITUDE_EXT ]; + + out0_Q10 = silk_ADD16( out0_Q10, pred_Q10 ); + out1_Q10 = silk_ADD16( out1_Q10, pred_Q10 ); + prev_out_Q10[ j ] = out0_Q10; + prev_out_Q10[ j + nStates ] = out1_Q10; + + /* compute RD for ind_tmp and ind_tmp + 1 */ + if( ind_tmp + 1 >= NLSF_QUANT_MAX_AMPLITUDE ) { + if( ind_tmp + 1 == NLSF_QUANT_MAX_AMPLITUDE ) { + rate0_Q5 = rates_Q5[ ind_tmp + NLSF_QUANT_MAX_AMPLITUDE ]; + rate1_Q5 = 280; + } else { + rate0_Q5 = silk_SMLABB( 280 - 43 * NLSF_QUANT_MAX_AMPLITUDE, 43, ind_tmp ); + rate1_Q5 = silk_ADD16( rate0_Q5, 43 ); + } + } else if( ind_tmp <= -NLSF_QUANT_MAX_AMPLITUDE ) { + if( ind_tmp == -NLSF_QUANT_MAX_AMPLITUDE ) { + rate0_Q5 = 280; + rate1_Q5 = rates_Q5[ ind_tmp + 1 + NLSF_QUANT_MAX_AMPLITUDE ]; + } else { + rate0_Q5 = silk_SMLABB( 280 - 43 * NLSF_QUANT_MAX_AMPLITUDE, -43, ind_tmp ); + rate1_Q5 = silk_SUB16( rate0_Q5, 43 ); + } + } else { + rate0_Q5 = rates_Q5[ ind_tmp + NLSF_QUANT_MAX_AMPLITUDE ]; + rate1_Q5 = rates_Q5[ ind_tmp + 1 + NLSF_QUANT_MAX_AMPLITUDE ]; + } + RD_tmp_Q25 = RD_Q25[ j ]; + diff_Q10 = silk_SUB16( in_Q10, out0_Q10 ); + RD_Q25[ j ] = silk_SMLABB( silk_MLA( RD_tmp_Q25, silk_SMULBB( diff_Q10, diff_Q10 ), w_Q5[ i ] ), mu_Q20, rate0_Q5 ); + diff_Q10 = silk_SUB16( in_Q10, out1_Q10 ); + RD_Q25[ j + nStates ] = silk_SMLABB( silk_MLA( RD_tmp_Q25, silk_SMULBB( diff_Q10, diff_Q10 ), w_Q5[ i ] ), mu_Q20, rate1_Q5 ); + } + + if( nStates <= ( NLSF_QUANT_DEL_DEC_STATES >> 1 ) ) { + /* double number of states and copy */ + for( j = 0; j < nStates; j++ ) { + ind[ j + nStates ][ i ] = ind[ j ][ i ] + 1; + } + nStates = silk_LSHIFT( nStates, 1 ); + for( j = nStates; j < NLSF_QUANT_DEL_DEC_STATES; j++ ) { + ind[ j ][ i ] = ind[ j - nStates ][ i ]; + } + } else if( i > 0 ) { + /* sort lower and upper half of RD_Q25, pairwise */ + for( j = 0; j < NLSF_QUANT_DEL_DEC_STATES; j++ ) { + if( RD_Q25[ j ] > RD_Q25[ j + NLSF_QUANT_DEL_DEC_STATES ] ) { + RD_max_Q25[ j ] = RD_Q25[ j ]; + RD_min_Q25[ j ] = RD_Q25[ j + NLSF_QUANT_DEL_DEC_STATES ]; + RD_Q25[ j ] = RD_min_Q25[ j ]; + RD_Q25[ j + NLSF_QUANT_DEL_DEC_STATES ] = RD_max_Q25[ j ]; + /* swap prev_out values */ + out0_Q10 = prev_out_Q10[ j ]; + prev_out_Q10[ j ] = prev_out_Q10[ j + NLSF_QUANT_DEL_DEC_STATES ]; + prev_out_Q10[ j + NLSF_QUANT_DEL_DEC_STATES ] = out0_Q10; + ind_sort[ j ] = j + NLSF_QUANT_DEL_DEC_STATES; + } else { + RD_min_Q25[ j ] = RD_Q25[ j ]; + RD_max_Q25[ j ] = RD_Q25[ j + NLSF_QUANT_DEL_DEC_STATES ]; + ind_sort[ j ] = j; + } + } + /* 
compare the highest RD values of the winning half with the lowest one in the losing half, and copy if necessary */ + /* afterwards ind_sort[] will contain the indices of the NLSF_QUANT_DEL_DEC_STATES winning RD values */ + while( 1 ) { + min_max_Q25 = silk_int32_MAX; + max_min_Q25 = 0; + ind_min_max = 0; + ind_max_min = 0; + for( j = 0; j < NLSF_QUANT_DEL_DEC_STATES; j++ ) { + if( min_max_Q25 > RD_max_Q25[ j ] ) { + min_max_Q25 = RD_max_Q25[ j ]; + ind_min_max = j; + } + if( max_min_Q25 < RD_min_Q25[ j ] ) { + max_min_Q25 = RD_min_Q25[ j ]; + ind_max_min = j; + } + } + if( min_max_Q25 >= max_min_Q25 ) { + break; + } + /* copy ind_min_max to ind_max_min */ + ind_sort[ ind_max_min ] = ind_sort[ ind_min_max ] ^ NLSF_QUANT_DEL_DEC_STATES; + RD_Q25[ ind_max_min ] = RD_Q25[ ind_min_max + NLSF_QUANT_DEL_DEC_STATES ]; + prev_out_Q10[ ind_max_min ] = prev_out_Q10[ ind_min_max + NLSF_QUANT_DEL_DEC_STATES ]; + RD_min_Q25[ ind_max_min ] = 0; + RD_max_Q25[ ind_min_max ] = silk_int32_MAX; + silk_memcpy( ind[ ind_max_min ], ind[ ind_min_max ], MAX_LPC_ORDER * sizeof( opus_int8 ) ); + } + /* increment index if it comes from the upper half */ + for( j = 0; j < NLSF_QUANT_DEL_DEC_STATES; j++ ) { + ind[ j ][ i ] += silk_RSHIFT( ind_sort[ j ], NLSF_QUANT_DEL_DEC_STATES_LOG2 ); + } + } else { /* i == 0 */ + break; + } + } + + /* last sample: find winner, copy indices and return RD value */ + ind_tmp = 0; + min_Q25 = silk_int32_MAX; + for( j = 0; j < 2 * NLSF_QUANT_DEL_DEC_STATES; j++ ) { + if( min_Q25 > RD_Q25[ j ] ) { + min_Q25 = RD_Q25[ j ]; + ind_tmp = j; + } + } + for( j = 0; j < order; j++ ) { + indices[ j ] = ind[ ind_tmp & ( NLSF_QUANT_DEL_DEC_STATES - 1 ) ][ j ]; + silk_assert( indices[ j ] >= -NLSF_QUANT_MAX_AMPLITUDE_EXT ); + silk_assert( indices[ j ] <= NLSF_QUANT_MAX_AMPLITUDE_EXT ); + } + indices[ 0 ] += silk_RSHIFT( ind_tmp, NLSF_QUANT_DEL_DEC_STATES_LOG2 ); + silk_assert( indices[ 0 ] <= NLSF_QUANT_MAX_AMPLITUDE_EXT ); + silk_assert( min_Q25 >= 0 ); + return min_Q25; +} diff --git a/thirdparty/opus/silk/NLSF_encode.c b/thirdparty/opus/silk/NLSF_encode.c new file mode 100644 index 000000000000..f03c3f1c3549 --- /dev/null +++ b/thirdparty/opus/silk/NLSF_encode.c @@ -0,0 +1,137 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "stack_alloc.h" + +/***********************/ +/* NLSF vector encoder */ +/***********************/ +opus_int32 silk_NLSF_encode( /* O Returns RD value in Q25 */ + opus_int8 *NLSFIndices, /* I Codebook path vector [ LPC_ORDER + 1 ] */ + opus_int16 *pNLSF_Q15, /* I/O Quantized NLSF vector [ LPC_ORDER ] */ + const silk_NLSF_CB_struct *psNLSF_CB, /* I Codebook object */ + const opus_int16 *pW_QW, /* I NLSF weight vector [ LPC_ORDER ] */ + const opus_int NLSF_mu_Q20, /* I Rate weight for the RD optimization */ + const opus_int nSurvivors, /* I Max survivors after first stage */ + const opus_int signalType /* I Signal type: 0/1/2 */ +) +{ + opus_int i, s, ind1, bestIndex, prob_Q8, bits_q7; + opus_int32 W_tmp_Q9, ret; + VARDECL( opus_int32, err_Q26 ); + VARDECL( opus_int32, RD_Q25 ); + VARDECL( opus_int, tempIndices1 ); + VARDECL( opus_int8, tempIndices2 ); + opus_int16 res_Q15[ MAX_LPC_ORDER ]; + opus_int16 res_Q10[ MAX_LPC_ORDER ]; + opus_int16 NLSF_tmp_Q15[ MAX_LPC_ORDER ]; + opus_int16 W_tmp_QW[ MAX_LPC_ORDER ]; + opus_int16 W_adj_Q5[ MAX_LPC_ORDER ]; + opus_uint8 pred_Q8[ MAX_LPC_ORDER ]; + opus_int16 ec_ix[ MAX_LPC_ORDER ]; + const opus_uint8 *pCB_element, *iCDF_ptr; + SAVE_STACK; + + silk_assert( nSurvivors <= NLSF_VQ_MAX_SURVIVORS ); + silk_assert( signalType >= 0 && signalType <= 2 ); + silk_assert( NLSF_mu_Q20 <= 32767 && NLSF_mu_Q20 >= 0 ); + + /* NLSF stabilization */ + silk_NLSF_stabilize( pNLSF_Q15, psNLSF_CB->deltaMin_Q15, psNLSF_CB->order ); + + /* First stage: VQ */ + ALLOC( err_Q26, psNLSF_CB->nVectors, opus_int32 ); + silk_NLSF_VQ( err_Q26, pNLSF_Q15, psNLSF_CB->CB1_NLSF_Q8, psNLSF_CB->nVectors, psNLSF_CB->order ); + + /* Sort the quantization errors */ + ALLOC( tempIndices1, nSurvivors, opus_int ); + silk_insertion_sort_increasing( err_Q26, tempIndices1, psNLSF_CB->nVectors, nSurvivors ); + + ALLOC( RD_Q25, nSurvivors, opus_int32 ); + ALLOC( tempIndices2, nSurvivors * MAX_LPC_ORDER, opus_int8 ); + + /* Loop over survivors */ + for( s = 0; s < nSurvivors; s++ ) { + ind1 = tempIndices1[ s ]; + + /* Residual after first stage */ + pCB_element = &psNLSF_CB->CB1_NLSF_Q8[ ind1 * psNLSF_CB->order ]; + for( i = 0; i < psNLSF_CB->order; i++ ) { + NLSF_tmp_Q15[ i ] = silk_LSHIFT16( (opus_int16)pCB_element[ i ], 7 ); + res_Q15[ i ] = pNLSF_Q15[ i ] - NLSF_tmp_Q15[ i ]; + } + + /* Weights from codebook vector */ + silk_NLSF_VQ_weights_laroia( W_tmp_QW, NLSF_tmp_Q15, psNLSF_CB->order ); + + /* Apply square-rooted weights */ + for( i = 0; i < psNLSF_CB->order; i++ ) { + W_tmp_Q9 = silk_SQRT_APPROX( silk_LSHIFT( (opus_int32)W_tmp_QW[ i ], 18 - NLSF_W_Q ) ); + res_Q10[ i ] = (opus_int16)silk_RSHIFT( silk_SMULBB( res_Q15[ i ], W_tmp_Q9 ), 14 ); + } + + /* Modify input weights accordingly */ + for( i = 0; i < psNLSF_CB->order; i++ ) { + W_adj_Q5[ i ] = silk_DIV32_16( silk_LSHIFT( (opus_int32)pW_QW[ i ], 5 ), W_tmp_QW[ i ] ); + } 
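+ + /* Note: res_Q10 above already carries one factor of sqrt( W_tmp_QW ), so dividing the input weights by W_tmp_QW makes the trellis cost diff^2 * W_adj_Q5 track err^2 * pW_QW, i.e. the original input weighting. */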
+ + /* Unpack entropy table indices and predictor for current CB1 index */ + silk_NLSF_unpack( ec_ix, pred_Q8, psNLSF_CB, ind1 ); + + /* Trellis quantizer */ + RD_Q25[ s ] = silk_NLSF_del_dec_quant( &tempIndices2[ s * MAX_LPC_ORDER ], res_Q10, W_adj_Q5, pred_Q8, ec_ix, + psNLSF_CB->ec_Rates_Q5, psNLSF_CB->quantStepSize_Q16, psNLSF_CB->invQuantStepSize_Q6, NLSF_mu_Q20, psNLSF_CB->order ); + + /* Add rate for first stage */ + iCDF_ptr = &psNLSF_CB->CB1_iCDF[ ( signalType >> 1 ) * psNLSF_CB->nVectors ]; + if( ind1 == 0 ) { + prob_Q8 = 256 - iCDF_ptr[ ind1 ]; + } else { + prob_Q8 = iCDF_ptr[ ind1 - 1 ] - iCDF_ptr[ ind1 ]; + } + bits_q7 = ( 8 << 7 ) - silk_lin2log( prob_Q8 ); + RD_Q25[ s ] = silk_SMLABB( RD_Q25[ s ], bits_q7, silk_RSHIFT( NLSF_mu_Q20, 2 ) ); + } + + /* Find the lowest rate-distortion error */ + silk_insertion_sort_increasing( RD_Q25, &bestIndex, nSurvivors, 1 ); + + NLSFIndices[ 0 ] = (opus_int8)tempIndices1[ bestIndex ]; + silk_memcpy( &NLSFIndices[ 1 ], &tempIndices2[ bestIndex * MAX_LPC_ORDER ], psNLSF_CB->order * sizeof( opus_int8 ) ); + + /* Decode */ + silk_NLSF_decode( pNLSF_Q15, NLSFIndices, psNLSF_CB ); + + ret = RD_Q25[ 0 ]; + RESTORE_STACK; + return ret; +} diff --git a/thirdparty/opus/silk/NLSF_stabilize.c b/thirdparty/opus/silk/NLSF_stabilize.c new file mode 100644 index 000000000000..8f3426b91e68 --- /dev/null +++ b/thirdparty/opus/silk/NLSF_stabilize.c @@ -0,0 +1,142 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/* NLSF stabilizer: */ +/* */ +/* - Moves NLSFs further apart if they are too close */ +/* - Moves NLSFs away from borders if they are too close */ +/* - High effort to achieve a modification with minimum */ +/* Euclidean distance to input vector */ +/* - Output is a sorted vector of NLSF coefficients */ +/* */ + +#include "SigProc_FIX.h" + +/* Constant Definitions */ +#define MAX_LOOPS 20 + +/* NLSF stabilizer, for a single input data vector */ +void silk_NLSF_stabilize( + opus_int16 *NLSF_Q15, /* I/O Unstable/stabilized normalized LSF vector in Q15 [L] */ + const opus_int16 *NDeltaMin_Q15, /* I Min distance vector, NDeltaMin_Q15[L] must be >= 1 [L+1] */ + const opus_int L /* I Number of NLSF parameters in the input vector */ +) +{ + opus_int i, I=0, k, loops; + opus_int16 center_freq_Q15; + opus_int32 diff_Q15, min_diff_Q15, min_center_Q15, max_center_Q15; + + /* This is necessary to ensure an output within range of an opus_int16 */ + silk_assert( NDeltaMin_Q15[L] >= 1 ); + + for( loops = 0; loops < MAX_LOOPS; loops++ ) { + /**************************/ + /* Find smallest distance */ + /**************************/ + /* First element */ + min_diff_Q15 = NLSF_Q15[0] - NDeltaMin_Q15[0]; + I = 0; + /* Middle elements */ + for( i = 1; i <= L-1; i++ ) { + diff_Q15 = NLSF_Q15[i] - ( NLSF_Q15[i-1] + NDeltaMin_Q15[i] ); + if( diff_Q15 < min_diff_Q15 ) { + min_diff_Q15 = diff_Q15; + I = i; + } + } + /* Last element */ + diff_Q15 = ( 1 << 15 ) - ( NLSF_Q15[L-1] + NDeltaMin_Q15[L] ); + if( diff_Q15 < min_diff_Q15 ) { + min_diff_Q15 = diff_Q15; + I = L; + } + + /******************************************************/ + /* Now check if the smallest distance is non-negative */ + /******************************************************/ + if( min_diff_Q15 >= 0 ) { + return; + } + + if( I == 0 ) { + /* Move away from lower limit */ + NLSF_Q15[0] = NDeltaMin_Q15[0]; + + } else if( I == L) { + /* Move away from higher limit */ + NLSF_Q15[L-1] = ( 1 << 15 ) - NDeltaMin_Q15[L]; + + } else { + /* Find the lower extreme for the location of the current center frequency */ + min_center_Q15 = 0; + for( k = 0; k < I; k++ ) { + min_center_Q15 += NDeltaMin_Q15[k]; + } + min_center_Q15 += silk_RSHIFT( NDeltaMin_Q15[I], 1 ); + + /* Find the upper extreme for the location of the current center frequency */ + max_center_Q15 = 1 << 15; + for( k = L; k > I; k-- ) { + max_center_Q15 -= NDeltaMin_Q15[k]; + } + max_center_Q15 -= silk_RSHIFT( NDeltaMin_Q15[I], 1 ); + + /* Move apart, sorted by value, keeping the same center frequency */ + center_freq_Q15 = (opus_int16)silk_LIMIT_32( silk_RSHIFT_ROUND( (opus_int32)NLSF_Q15[I-1] + (opus_int32)NLSF_Q15[I], 1 ), + min_center_Q15, max_center_Q15 ); + NLSF_Q15[I-1] = center_freq_Q15 - silk_RSHIFT( NDeltaMin_Q15[I], 1 ); + NLSF_Q15[I] = NLSF_Q15[I-1] + NDeltaMin_Q15[I]; + } + } + + /* Safe and simple fallback method, which is less ideal than the above */ + if( loops == MAX_LOOPS ) + { + /* Insertion sort (fast for already almost sorted arrays): */ + /* Best case: O(n) for an already sorted array */ + /* Worst case: O(n^2) for an inversely sorted array */ + silk_insertion_sort_increasing_all_values_int16( &NLSF_Q15[0], L ); + + /* First NLSF should be no less than NDeltaMin[0] */ + NLSF_Q15[0] = silk_max_int( NLSF_Q15[0], NDeltaMin_Q15[0] ); + + /* Keep delta_min distance between the NLSFs */ + for( i = 1; i < L; i++ ) + NLSF_Q15[i] = silk_max_int( NLSF_Q15[i], silk_ADD_SAT16( 
NLSF_Q15[i-1], NDeltaMin_Q15[i] ) ); + + /* Last NLSF should be no higher than 1 - NDeltaMin[L] */ + NLSF_Q15[L-1] = silk_min_int( NLSF_Q15[L-1], (1<<15) - NDeltaMin_Q15[L] ); + + /* Keep NDeltaMin distance between the NLSFs */ + for( i = L-2; i >= 0; i-- ) + NLSF_Q15[i] = silk_min_int( NLSF_Q15[i], NLSF_Q15[i+1] - NDeltaMin_Q15[i+1] ); + } +} diff --git a/thirdparty/opus/silk/NLSF_unpack.c b/thirdparty/opus/silk/NLSF_unpack.c new file mode 100644 index 000000000000..17bd23f752f3 --- /dev/null +++ b/thirdparty/opus/silk/NLSF_unpack.c @@ -0,0 +1,55 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Unpack predictor values and indices for entropy coding tables */ +void silk_NLSF_unpack( + opus_int16 ec_ix[], /* O Indices to entropy tables [ LPC_ORDER ] */ + opus_uint8 pred_Q8[], /* O LSF predictor [ LPC_ORDER ] */ + const silk_NLSF_CB_struct *psNLSF_CB, /* I Codebook object */ + const opus_int CB1_index /* I Index of vector in first LSF codebook */ +) +{ + opus_int i; + opus_uint8 entry; + const opus_uint8 *ec_sel_ptr; + + ec_sel_ptr = &psNLSF_CB->ec_sel[ CB1_index * psNLSF_CB->order / 2 ]; + for( i = 0; i < psNLSF_CB->order; i += 2 ) { + entry = *ec_sel_ptr++; + ec_ix [ i ] = silk_SMULBB( silk_RSHIFT( entry, 1 ) & 7, 2 * NLSF_QUANT_MAX_AMPLITUDE + 1 ); + pred_Q8[ i ] = psNLSF_CB->pred_Q8[ i + ( entry & 1 ) * ( psNLSF_CB->order - 1 ) ]; + ec_ix [ i + 1 ] = silk_SMULBB( silk_RSHIFT( entry, 5 ) & 7, 2 * NLSF_QUANT_MAX_AMPLITUDE + 1 ); + pred_Q8[ i + 1 ] = psNLSF_CB->pred_Q8[ i + ( silk_RSHIFT( entry, 4 ) & 1 ) * ( psNLSF_CB->order - 1 ) + 1 ]; + } +} + diff --git a/thirdparty/opus/silk/NSQ.c b/thirdparty/opus/silk/NSQ.c new file mode 100644 index 000000000000..43e3fee7e0ad --- /dev/null +++ b/thirdparty/opus/silk/NSQ.c @@ -0,0 +1,429 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "stack_alloc.h" +#include "NSQ.h" + + +static OPUS_INLINE void silk_nsq_scale_states( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + const opus_int32 x_Q3[], /* I input in Q3 */ + opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ + const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I subframe number */ + const opus_int LTP_scale_Q14, /* I */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type /* I Signal type */ +); + +#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +static OPUS_INLINE void silk_noise_shape_quantizer( + silk_nsq_state *NSQ, /* I/O NSQ state */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_sc_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP state */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */ + opus_int lag, /* I Pitch lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int shapingLPCOrder, /* I Noise shaping AR filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + int arch /* I Architecture */ +); +#endif + +void silk_NSQ_c +( + const silk_encoder_state *psEncC, /* I/O Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int32 x_Q3[], /* I Prefiltered input signal */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +) +{ + opus_int k, lag, start_idx, LSF_interpolation_flag; + const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; + opus_int16 *pxq; + VARDECL( opus_int32, sLTP_Q15 ); + VARDECL( opus_int16, sLTP ); + opus_int32 HarmShapeFIRPacked_Q14; + opus_int offset_Q10; + VARDECL( opus_int32, x_sc_Q10 ); + SAVE_STACK; + + NSQ->rand_seed = psIndices->Seed; + + /* Set unvoiced lag to the previous one, overwrite later for voiced */ + lag = NSQ->lagPrev; + + silk_assert( NSQ->prev_gain_Q16 != 0 ); + + offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ]; + + if( psIndices->NLSFInterpCoef_Q2 == 4 ) { + LSF_interpolation_flag = 0; + } else { + 
LSF_interpolation_flag = 1; + } + + ALLOC( sLTP_Q15, + psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); + ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); + ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); + /* Set up pointers to start of sub frame */ + NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + pxq = &NSQ->xq[ psEncC->ltp_mem_length ]; + for( k = 0; k < psEncC->nb_subfr; k++ ) { + A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ]; + B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ]; + AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; + + /* Noise shape parameters */ + silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); + HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); + HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); + + NSQ->rewhite_flag = 0; + if( psIndices->signalType == TYPE_VOICED ) { + /* Voiced */ + lag = pitchL[ k ]; + + /* Re-whitening */ + if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) { + /* Rewhiten with new A coefs */ + start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; + silk_assert( start_idx > 0 ); + + silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ], + A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch ); + + NSQ->rewhite_flag = 1; + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + } + } + + silk_nsq_scale_states( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType ); + + silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14, + AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10, + offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch ); + + x_Q3 += psEncC->subfr_length; + pulses += psEncC->subfr_length; + pxq += psEncC->subfr_length; + } + + /* Update lagPrev for next frame */ + NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; + + /* Save quantized speech and noise shaping signals */ + /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[ psEncC->ltp_mem_length ], psEncC->frame_length * sizeof( opus_int16 ) ) */ + silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); + silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); + RESTORE_STACK; +} + +/***********************************/ +/* silk_noise_shape_quantizer */ +/***********************************/ + +#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +static OPUS_INLINE +#endif +void silk_noise_shape_quantizer( + silk_nsq_state *NSQ, /* I/O NSQ state */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_sc_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP state */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */ + opus_int lag, /* I Pitch lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int shapingLPCOrder, /*
I Noise shaping AR filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + int arch /* I Architecture */ +) +{ + opus_int i; + opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13; + opus_int32 n_LF_Q12, r_Q10, rr_Q10, q1_Q0, q1_Q10, q2_Q10, rd1_Q20, rd2_Q20; + opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; + opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; + opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr; +#ifdef silk_short_prediction_create_arch_coef + opus_int32 a_Q12_arch[MAX_LPC_ORDER]; +#endif + + shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; + pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; + Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); + + /* Set up short term AR state */ + psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ]; + +#ifdef silk_short_prediction_create_arch_coef + silk_short_prediction_create_arch_coef(a_Q12_arch, a_Q12, predictLPCOrder); +#endif + + for( i = 0; i < length; i++ ) { + /* Generate dither */ + NSQ->rand_seed = silk_RAND( NSQ->rand_seed ); + + /* Short-term prediction */ + LPC_pred_Q10 = silk_noise_shape_quantizer_short_prediction(psLPC_Q14, a_Q12, a_Q12_arch, predictLPCOrder, arch); + + /* Long-term prediction */ + if( signalType == TYPE_VOICED ) { + /* Unrolled loop */ + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LTP_pred_Q13 = 2; + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ 0 ], b_Q14[ 0 ] ); + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -1 ], b_Q14[ 1 ] ); + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -2 ], b_Q14[ 2 ] ); + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -3 ], b_Q14[ 3 ] ); + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] ); + pred_lag_ptr++; + } else { + LTP_pred_Q13 = 0; + } + + /* Noise shape feedback */ + silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ + n_AR_Q12 = silk_NSQ_noise_shape_feedback_loop(psLPC_Q14, NSQ->sAR2_Q14, AR_shp_Q13, shapingLPCOrder, arch); + + n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sLF_AR_shp_Q14, Tilt_Q14 ); + + n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 ); + n_LF_Q12 = silk_SMLAWT( n_LF_Q12, NSQ->sLF_AR_shp_Q14, LF_shp_Q14 ); + + silk_assert( lag > 0 || signalType != TYPE_VOICED ); + + /* Combine prediction and noise shaping signals */ + tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */ + tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */ + if( lag > 0 ) { + /* Symmetric, packed FIR coefficients */ + n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); + n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 ); + shp_lag_ptr++; + + tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 ); /* Q13 */ + tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 ); /* Q13 */ + tmp1 = silk_RSHIFT_ROUND( tmp1, 3 ); /* Q10 */ + } else { + tmp1 = silk_RSHIFT_ROUND( tmp1, 2 ); /* Q10 */ + } + + r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 ); /* residual error Q10 */ + + /* Flip sign depending on dither */ + if ( NSQ->rand_seed < 0 ) { + r_Q10 = -r_Q10; + } + r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 ); + + /* Find two quantization level candidates and measure their rate-distortion */ + q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); + q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); + if( q1_Q0 > 0 ) { + q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); 
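+ /* q1 was pulled QUANT_LEVEL_ADJUST_Q10 toward zero above; the quantization offset is added back next, and q2 = q1 + 1024 (one whole level in Q10) becomes the second candidate for the rate-distortion comparison. */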
+ q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); + q2_Q10 = silk_ADD32( q1_Q10, 1024 ); + rd1_Q20 = silk_SMULBB( q1_Q10, Lambda_Q10 ); + rd2_Q20 = silk_SMULBB( q2_Q10, Lambda_Q10 ); + } else if( q1_Q0 == 0 ) { + q1_Q10 = offset_Q10; + q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); + rd1_Q20 = silk_SMULBB( q1_Q10, Lambda_Q10 ); + rd2_Q20 = silk_SMULBB( q2_Q10, Lambda_Q10 ); + } else if( q1_Q0 == -1 ) { + q2_Q10 = offset_Q10; + q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); + rd1_Q20 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); + rd2_Q20 = silk_SMULBB( q2_Q10, Lambda_Q10 ); + } else { /* Q1_Q0 < -1 */ + q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); + q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); + q2_Q10 = silk_ADD32( q1_Q10, 1024 ); + rd1_Q20 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); + rd2_Q20 = silk_SMULBB( -q2_Q10, Lambda_Q10 ); + } + rr_Q10 = silk_SUB32( r_Q10, q1_Q10 ); + rd1_Q20 = silk_SMLABB( rd1_Q20, rr_Q10, rr_Q10 ); + rr_Q10 = silk_SUB32( r_Q10, q2_Q10 ); + rd2_Q20 = silk_SMLABB( rd2_Q20, rr_Q10, rr_Q10 ); + + if( rd2_Q20 < rd1_Q20 ) { + q1_Q10 = q2_Q10; + } + + pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 ); + + /* Excitation */ + exc_Q14 = silk_LSHIFT( q1_Q10, 4 ); + if ( NSQ->rand_seed < 0 ) { + exc_Q14 = -exc_Q14; + } + + /* Add predictions */ + LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 ); + xq_Q14 = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 ); + + /* Scale XQ back to normal level before saving */ + xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( xq_Q14, Gain_Q10 ), 8 ) ); + + /* Update states */ + psLPC_Q14++; + *psLPC_Q14 = xq_Q14; + sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 ); + NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14; + + NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 ); + sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 ); + NSQ->sLTP_shp_buf_idx++; + NSQ->sLTP_buf_idx++; + + /* Make dither dependent on quantized signal */ + NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] ); + } + + /* Update LPC synth buffer */ + silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); +} + +static OPUS_INLINE void silk_nsq_scale_states( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + const opus_int32 x_Q3[], /* I input in Q3 */ + opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ + const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I subframe number */ + const opus_int LTP_scale_Q14, /* I */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type /* I Signal type */ +) +{ + opus_int i, lag; + opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; + + lag = pitchL[ subfr ]; + inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); + silk_assert( inv_gain_Q31 != 0 ); + + /* Calculate gain adjustment factor */ + if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { + gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); + } else { + gain_adj_Q16 = (opus_int32)1 << 16; + } + + /* Scale input */ + inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); + for( i = 0; i < psEncC->subfr_length; i++ ) { + x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); + } + + /* Save inverse gain */ + NSQ->prev_gain_Q16 = 
Gains_Q16[ subfr ]; + + /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ + if( NSQ->rewhite_flag ) { + if( subfr == 0 ) { + /* Do LTP downscaling */ + inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 ); + } + for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { + silk_assert( i < MAX_FRAME_LENGTH ); + sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] ); + } + } + + /* Adjust for changing gain */ + if( gain_adj_Q16 != (opus_int32)1 << 16 ) { + /* Scale long-term shaping state */ + for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i++ ) { + NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); + } + + /* Scale long-term prediction state */ + if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { + for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { + sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); + } + } + + NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 ); + + /* Scale short-term prediction and shaping states */ + for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { + NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] ); + } + for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { + NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] ); + } + } +} diff --git a/thirdparty/opus/silk/NSQ.h b/thirdparty/opus/silk/NSQ.h new file mode 100644 index 000000000000..971832f660e2 --- /dev/null +++ b/thirdparty/opus/silk/NSQ.h @@ -0,0 +1,101 @@ +/*********************************************************************** +Copyright (c) 2014 Vidyo. +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ +#ifndef SILK_NSQ_H +#define SILK_NSQ_H + +#include "SigProc_FIX.h" + +#undef silk_short_prediction_create_arch_coef + +static OPUS_INLINE opus_int32 silk_noise_shape_quantizer_short_prediction_c(const opus_int32 *buf32, const opus_int16 *coef16, opus_int order) +{ + opus_int32 out; + silk_assert( order == 10 || order == 16 ); + + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + out = silk_RSHIFT( order, 1 ); + out = silk_SMLAWB( out, buf32[ 0 ], coef16[ 0 ] ); + out = silk_SMLAWB( out, buf32[ -1 ], coef16[ 1 ] ); + out = silk_SMLAWB( out, buf32[ -2 ], coef16[ 2 ] ); + out = silk_SMLAWB( out, buf32[ -3 ], coef16[ 3 ] ); + out = silk_SMLAWB( out, buf32[ -4 ], coef16[ 4 ] ); + out = silk_SMLAWB( out, buf32[ -5 ], coef16[ 5 ] ); + out = silk_SMLAWB( out, buf32[ -6 ], coef16[ 6 ] ); + out = silk_SMLAWB( out, buf32[ -7 ], coef16[ 7 ] ); + out = silk_SMLAWB( out, buf32[ -8 ], coef16[ 8 ] ); + out = silk_SMLAWB( out, buf32[ -9 ], coef16[ 9 ] ); + + if( order == 16 ) + { + out = silk_SMLAWB( out, buf32[ -10 ], coef16[ 10 ] ); + out = silk_SMLAWB( out, buf32[ -11 ], coef16[ 11 ] ); + out = silk_SMLAWB( out, buf32[ -12 ], coef16[ 12 ] ); + out = silk_SMLAWB( out, buf32[ -13 ], coef16[ 13 ] ); + out = silk_SMLAWB( out, buf32[ -14 ], coef16[ 14 ] ); + out = silk_SMLAWB( out, buf32[ -15 ], coef16[ 15 ] ); + } + return out; +} + +#define silk_noise_shape_quantizer_short_prediction(in, coef, coefRev, order, arch) ((void)arch,silk_noise_shape_quantizer_short_prediction_c(in, coef, order)) + +static OPUS_INLINE opus_int32 silk_NSQ_noise_shape_feedback_loop_c(const opus_int32 *data0, opus_int32 *data1, const opus_int16 *coef, opus_int order) +{ + opus_int32 out; + opus_int32 tmp1, tmp2; + opus_int j; + + tmp2 = data0[0]; + tmp1 = data1[0]; + data1[0] = tmp2; + + out = silk_RSHIFT(order, 1); + out = silk_SMLAWB(out, tmp2, coef[0]); + + for (j = 2; j < order; j += 2) { + tmp2 = data1[j - 1]; + data1[j - 1] = tmp1; + out = silk_SMLAWB(out, tmp1, coef[j - 1]); + tmp1 = data1[j + 0]; + data1[j + 0] = tmp2; + out = silk_SMLAWB(out, tmp2, coef[j]); + } + data1[order - 1] = tmp1; + out = silk_SMLAWB(out, tmp1, coef[order - 1]); + /* Q11 -> Q12 */ + out = silk_LSHIFT32( out, 1 ); + return out; +} + +#define silk_NSQ_noise_shape_feedback_loop(data0, data1, coef, order, arch) ((void)arch,silk_NSQ_noise_shape_feedback_loop_c(data0, data1, coef, order)) + +#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +#include "arm/NSQ_neon.h" +#endif + +#endif /* SILK_NSQ_H */ diff --git a/thirdparty/opus/silk/NSQ_del_dec.c b/thirdparty/opus/silk/NSQ_del_dec.c new file mode 100644 index 000000000000..ab6feeac9870 --- /dev/null +++ b/thirdparty/opus/silk/NSQ_del_dec.c @@ -0,0 +1,716 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. 
+- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "stack_alloc.h" +#include "NSQ.h" + + +typedef struct { + opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ]; + opus_int32 RandState[ DECISION_DELAY ]; + opus_int32 Q_Q10[ DECISION_DELAY ]; + opus_int32 Xq_Q14[ DECISION_DELAY ]; + opus_int32 Pred_Q15[ DECISION_DELAY ]; + opus_int32 Shape_Q14[ DECISION_DELAY ]; + opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ]; + opus_int32 LF_AR_Q14; + opus_int32 Seed; + opus_int32 SeedInit; + opus_int32 RD_Q10; +} NSQ_del_dec_struct; + +typedef struct { + opus_int32 Q_Q10; + opus_int32 RD_Q10; + opus_int32 xq_Q14; + opus_int32 LF_AR_Q14; + opus_int32 sLTP_shp_Q14; + opus_int32 LPC_exc_Q14; +} NSQ_sample_struct; + +typedef NSQ_sample_struct NSQ_sample_pair[ 2 ]; + +#if defined(MIPSr1_ASM) +#include "mips/NSQ_del_dec_mipsr1.h" +#endif +static OPUS_INLINE void silk_nsq_del_dec_scale_states( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ + const opus_int32 x_Q3[], /* I Input in Q3 */ + opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ + const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I Subframe number */ + opus_int nStatesDelayedDecision, /* I Number of del dec states */ + const opus_int LTP_scale_Q14, /* I LTP state scaling */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type, /* I Signal type */ + const opus_int decisionDelay /* I Decision delay */ +); + +/******************************************/ +/* Noise shape quantizer for one subframe */ +/******************************************/ +static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP filter state */ + opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ + opus_int lag, /* I Pitch 
lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int subfr, /* I Subframe number */ + opus_int shapingLPCOrder, /* I Shaping LPC filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + opus_int warping_Q16, /* I */ + opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ + opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */ + opus_int decisionDelay, /* I */ + int arch /* I */ +); + +void silk_NSQ_del_dec_c( + const silk_encoder_state *psEncC, /* I/O Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int32 x_Q3[], /* I Prefiltered input signal */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +) +{ + opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; + opus_int last_smple_idx, smpl_buf_idx, decisionDelay; + const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; + opus_int16 *pxq; + VARDECL( opus_int32, sLTP_Q15 ); + VARDECL( opus_int16, sLTP ); + opus_int32 HarmShapeFIRPacked_Q14; + opus_int offset_Q10; + opus_int32 RDmin_Q10, Gain_Q10; + VARDECL( opus_int32, x_sc_Q10 ); + VARDECL( opus_int32, delayedGain_Q10 ); + VARDECL( NSQ_del_dec_struct, psDelDec ); + NSQ_del_dec_struct *psDD; + SAVE_STACK; + + /* Set unvoiced lag to the previous one, overwrite later for voiced */ + lag = NSQ->lagPrev; + + silk_assert( NSQ->prev_gain_Q16 != 0 ); + + /* Initialize delayed decision states */ + ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct ); + silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) ); + for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) { + psDD = &psDelDec[ k ]; + psDD->Seed = ( k + psIndices->Seed ) & 3; + psDD->SeedInit = psDD->Seed; + psDD->RD_Q10 = 0; + psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14; + psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ]; + silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); + silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) ); + } + + offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ]; + smpl_buf_idx = 0; /* index of oldest samples */ + + decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length ); + + /* For voiced frames limit the decision delay to lower than the pitch lag */ + if( psIndices->signalType == TYPE_VOICED ) { + for( k = 0; k < psEncC->nb_subfr; k++ ) { + decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 
); + } + } else { + if( lag > 0 ) { + decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 ); + } + } + + if( psIndices->NLSFInterpCoef_Q2 == 4 ) { + LSF_interpolation_flag = 0; + } else { + LSF_interpolation_flag = 1; + } + + ALLOC( sLTP_Q15, + psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); + ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); + ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); + ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 ); + /* Set up pointers to start of sub frame */ + pxq = &NSQ->xq[ psEncC->ltp_mem_length ]; + NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + subfr = 0; + for( k = 0; k < psEncC->nb_subfr; k++ ) { + A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ]; + B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ]; + AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; + + /* Noise shape parameters */ + silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); + HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); + HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); + + NSQ->rewhite_flag = 0; + if( psIndices->signalType == TYPE_VOICED ) { + /* Voiced */ + lag = pitchL[ k ]; + + /* Re-whitening */ + if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) { + if( k == 2 ) { + /* RESET DELAYED DECISIONS */ + /* Find winner */ + RDmin_Q10 = psDelDec[ 0 ].RD_Q10; + Winner_ind = 0; + for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) { + if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) { + RDmin_Q10 = psDelDec[ i ].RD_Q10; + Winner_ind = i; + } + } + for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) { + if( i != Winner_ind ) { + psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 ); + silk_assert( psDelDec[ i ].RD_Q10 >= 0 ); + } + } + + /* Copy final part of signals from winner state to output and long-term filter states */ + psDD = &psDelDec[ Winner_ind ]; + last_smple_idx = smpl_buf_idx + decisionDelay; + for( i = 0; i < decisionDelay; i++ ) { + last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK; + pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); + pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( + silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) ); + NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ]; + } + + subfr = 0; + } + + /* Rewhiten with new A coefs */ + start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; + silk_assert( start_idx > 0 ); + + silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ], + A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch ); + + NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; + NSQ->rewhite_flag = 1; + } + } + + silk_nsq_del_dec_scale_states( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, + psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay ); + + silk_noise_shape_quantizer_del_dec( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, + delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], + Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, + psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx,
decisionDelay, psEncC->arch ); + + x_Q3 += psEncC->subfr_length; + pulses += psEncC->subfr_length; + pxq += psEncC->subfr_length; + } + + /* Find winner */ + RDmin_Q10 = psDelDec[ 0 ].RD_Q10; + Winner_ind = 0; + for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) { + if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) { + RDmin_Q10 = psDelDec[ k ].RD_Q10; + Winner_ind = k; + } + } + + /* Copy final part of signals from winner state to output and long-term filter states */ + psDD = &psDelDec[ Winner_ind ]; + psIndices->Seed = psDD->SeedInit; + last_smple_idx = smpl_buf_idx + decisionDelay; + Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 ); + for( i = 0; i < decisionDelay; i++ ) { + last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK; + pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); + pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( + silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) ); + NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ]; + } + silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); + silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) ); + + /* Update states */ + NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14; + NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; + + /* Save quantized speech signal */ + /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[psEncC->ltp_mem_length], psEncC->frame_length * sizeof( opus_int16 ) ) */ + silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); + silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); + RESTORE_STACK; +} + +/******************************************/ +/* Noise shape quantizer for one subframe */ +/******************************************/ +#ifndef OVERRIDE_silk_noise_shape_quantizer_del_dec +static OPUS_INLINE void silk_noise_shape_quantizer_del_dec( + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ + opus_int signalType, /* I Signal type */ + const opus_int32 x_Q10[], /* I */ + opus_int8 pulses[], /* O */ + opus_int16 xq[], /* O */ + opus_int32 sLTP_Q15[], /* I/O LTP filter state */ + opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */ + const opus_int16 a_Q12[], /* I Short term prediction coefs */ + const opus_int16 b_Q14[], /* I Long term prediction coefs */ + const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ + opus_int lag, /* I Pitch lag */ + opus_int32 HarmShapeFIRPacked_Q14, /* I */ + opus_int Tilt_Q14, /* I Spectral tilt */ + opus_int32 LF_shp_Q14, /* I */ + opus_int32 Gain_Q16, /* I */ + opus_int Lambda_Q10, /* I */ + opus_int offset_Q10, /* I */ + opus_int length, /* I Input length */ + opus_int subfr, /* I Subframe number */ + opus_int shapingLPCOrder, /* I Shaping LPC filter order */ + opus_int predictLPCOrder, /* I Prediction filter order */ + opus_int warping_Q16, /* I */ + opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ + opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */ + opus_int decisionDelay, /* I */ + int arch /* I */ +) +{ + opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx; + opus_int32 Winner_rand_state; + opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14; + opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10; + 
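+ /* rd1_Q10/rd2_Q10 below hold the per-candidate rate-distortion costs in Q10; RDmin_Q10/RDmax_Q10 are reused to pick the best and worst surviving delayed-decision states. */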
opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; + opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; + opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14; +#ifdef silk_short_prediction_create_arch_coef + opus_int32 a_Q12_arch[MAX_LPC_ORDER]; +#endif + + VARDECL( NSQ_sample_pair, psSampleState ); + NSQ_del_dec_struct *psDD; + NSQ_sample_struct *psSS; + SAVE_STACK; + + silk_assert( nStatesDelayedDecision > 0 ); + ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair ); + + shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; + pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; + Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); + +#ifdef silk_short_prediction_create_arch_coef + silk_short_prediction_create_arch_coef(a_Q12_arch, a_Q12, predictLPCOrder); +#endif + + for( i = 0; i < length; i++ ) { + /* Perform common calculations used in all states */ + + /* Long-term prediction */ + if( signalType == TYPE_VOICED ) { + /* Unrolled loop */ + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LTP_pred_Q14 = 2; + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ 0 ], b_Q14[ 0 ] ); + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] ); + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] ); + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] ); + LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] ); + LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */ + pred_lag_ptr++; + } else { + LTP_pred_Q14 = 0; + } + + /* Long-term shaping */ + if( lag > 0 ) { + /* Symmetric, packed FIR coefficients */ + n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */ + shp_lag_ptr++; + } else { + n_LTP_Q14 = 0; + } + + for( k = 0; k < nStatesDelayedDecision; k++ ) { + /* Delayed decision state */ + psDD = &psDelDec[ k ]; + + /* Sample state */ + psSS = psSampleState[ k ]; + + /* Generate dither */ + psDD->Seed = silk_RAND( psDD->Seed ); + + /* Pointer used in short term prediction and shaping */ + psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ]; + /* Short-term prediction */ + LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction(psLPC_Q14, a_Q12, a_Q12_arch, predictLPCOrder, arch); + LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */ + + /* Noise shape feedback */ + silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ + /* Output of lowpass section */ + tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 ); + /* Output of allpass section */ + tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 ); + psDD->sAR2_Q14[ 0 ] = tmp2; + n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 ); + n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] ); + /* Loop over allpass sections */ + for( j = 2; j < shapingLPCOrder; j += 2 ) { + /* Output of allpass section */ + tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 ); + psDD->sAR2_Q14[ j - 1 ] = tmp1; + n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] ); + /* Output of allpass section */ + tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 ); + psDD->sAR2_Q14[ j + 0 ] = tmp2; + n_AR_Q14 = 
silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] ); + } + psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1; + n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] ); + + n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 ); /* Q11 -> Q12 */ + n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 ); /* Q12 */ + n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 ); /* Q12 -> Q14 */ + + n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 ); /* Q12 */ + n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 ); /* Q12 */ + n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 ); /* Q12 -> Q14 */ + + /* Input minus prediction plus noise feedback */ + /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ + tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ + tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */ + tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */ + tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */ + + r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */ + + /* Flip sign depending on dither */ + if ( psDD->Seed < 0 ) { + r_Q10 = -r_Q10; + } + r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 ); + + /* Find two quantization level candidates and measure their rate-distortion */ + q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); + q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); + if( q1_Q0 > 0 ) { + q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); + q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); + q2_Q10 = silk_ADD32( q1_Q10, 1024 ); + rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); + rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); + } else if( q1_Q0 == 0 ) { + q1_Q10 = offset_Q10; + q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); + rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); + rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); + } else if( q1_Q0 == -1 ) { + q2_Q10 = offset_Q10; + q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); + rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); + rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); + } else { /* q1_Q0 < -1 */ + q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); + q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); + q2_Q10 = silk_ADD32( q1_Q10, 1024 ); + rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); + rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 ); + } + rr_Q10 = silk_SUB32( r_Q10, q1_Q10 ); + rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 ); + rr_Q10 = silk_SUB32( r_Q10, q2_Q10 ); + rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 ); + + if( rd1_Q10 < rd2_Q10 ) { + psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); + psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); + psSS[ 0 ].Q_Q10 = q1_Q10; + psSS[ 1 ].Q_Q10 = q2_Q10; + } else { + psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); + psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); + psSS[ 0 ].Q_Q10 = q2_Q10; + psSS[ 1 ].Q_Q10 = q1_Q10; + } + + /* Update states for best quantization */ + + /* Quantized excitation */ + exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 ); + if ( psDD->Seed < 0 ) { + exc_Q14 = -exc_Q14; + } + + /* Add predictions */ + LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); + xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); + + /* Update states */ + sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); + psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14; + psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14; + psSS[ 0 ].xq_Q14 = xq_Q14; + + /* Update states for second best quantization */ + + /* Quantized excitation */ + exc_Q14 = 
silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 ); + if ( psDD->Seed < 0 ) { + exc_Q14 = -exc_Q14; + } + + + /* Add predictions */ + LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); + xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); + + /* Update states */ + sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); + psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14; + psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14; + psSS[ 1 ].xq_Q14 = xq_Q14; + } + + *smpl_buf_idx = ( *smpl_buf_idx - 1 ) & DECISION_DELAY_MASK; /* Index to newest samples */ + last_smple_idx = ( *smpl_buf_idx + decisionDelay ) & DECISION_DELAY_MASK; /* Index to decisionDelay old samples */ + + /* Find winner */ + RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; + Winner_ind = 0; + for( k = 1; k < nStatesDelayedDecision; k++ ) { + if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) { + RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10; + Winner_ind = k; + } + } + + /* Increase RD values of expired states */ + Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ]; + for( k = 0; k < nStatesDelayedDecision; k++ ) { + if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) { + psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 ); + psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 ); + silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 ); + } + } + + /* Find worst in first set and best in second set */ + RDmax_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; + RDmin_Q10 = psSampleState[ 0 ][ 1 ].RD_Q10; + RDmax_ind = 0; + RDmin_ind = 0; + for( k = 1; k < nStatesDelayedDecision; k++ ) { + /* find worst in first set */ + if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) { + RDmax_Q10 = psSampleState[ k ][ 0 ].RD_Q10; + RDmax_ind = k; + } + /* find best in second set */ + if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) { + RDmin_Q10 = psSampleState[ k ][ 1 ].RD_Q10; + RDmin_ind = k; + } + } + + /* Replace a state if best from second set outperforms worst in first set */ + if( RDmin_Q10 < RDmax_Q10 ) { + silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i, + ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) ); + silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) ); + } + + /* Write samples from winner to output and long-term filter states */ + psDD = &psDelDec[ Winner_ind ]; + if( subfr > 0 || i >= decisionDelay ) { + pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); + xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( + silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) ); + NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ]; + sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDD->Pred_Q15[ last_smple_idx ]; + } + NSQ->sLTP_shp_buf_idx++; + NSQ->sLTP_buf_idx++; + + /* Update states */ + for( k = 0; k < nStatesDelayedDecision; k++ ) { + psDD = &psDelDec[ k ]; + psSS = &psSampleState[ k ][ 0 ]; + psDD->LF_AR_Q14 = psSS->LF_AR_Q14; + psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14; + psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14; + psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10; + psDD->Pred_Q15[ *smpl_buf_idx ] = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 ); + psDD->Shape_Q14[ *smpl_buf_idx ] = psSS->sLTP_shp_Q14; + psDD->Seed = silk_ADD32_ovflw( psDD->Seed, 
silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) ); + psDD->RandState[ *smpl_buf_idx ] = psDD->Seed; + psDD->RD_Q10 = psSS->RD_Q10; + } + delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10; + } + /* Update LPC states */ + for( k = 0; k < nStatesDelayedDecision; k++ ) { + psDD = &psDelDec[ k ]; + silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); + } + RESTORE_STACK; +} +#endif /* OVERRIDE_silk_noise_shape_quantizer_del_dec */ + +static OPUS_INLINE void silk_nsq_del_dec_scale_states( + const silk_encoder_state *psEncC, /* I Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ + const opus_int32 x_Q3[], /* I Input in Q3 */ + opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ + const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ + opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ + opus_int subfr, /* I Subframe number */ + opus_int nStatesDelayedDecision, /* I Number of del dec states */ + const opus_int LTP_scale_Q14, /* I LTP state scaling */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ + const opus_int signal_type, /* I Signal type */ + const opus_int decisionDelay /* I Decision delay */ +) +{ + opus_int i, k, lag; + opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; + NSQ_del_dec_struct *psDD; + + lag = pitchL[ subfr ]; + inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); + silk_assert( inv_gain_Q31 != 0 ); + + /* Calculate gain adjustment factor */ + if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { + gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); + } else { + gain_adj_Q16 = (opus_int32)1 << 16; + } + + /* Scale input */ + inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); + for( i = 0; i < psEncC->subfr_length; i++ ) { + x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); + } + + /* Save inverse gain */ + NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; + + /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ + if( NSQ->rewhite_flag ) { + if( subfr == 0 ) { + /* Do LTP downscaling */ + inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 ); + } + for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { + silk_assert( i < MAX_FRAME_LENGTH ); + sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] ); + } + } + + /* Adjust for changing gain */ + if( gain_adj_Q16 != (opus_int32)1 << 16 ) { + /* Scale long-term shaping state */ + for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i++ ) { + NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); + } + + /* Scale long-term prediction state */ + if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { + for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) { + sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); + } + } + + for( k = 0; k < nStatesDelayedDecision; k++ ) { + psDD = &psDelDec[ k ]; + + /* Scale scalar states */ + psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 ); + + /* Scale short-term prediction and shaping states */ + for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { + psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] ); + } + for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { + psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] ); + } + for( i = 0; i < 
DECISION_DELAY; i++ ) { + psDD->Pred_Q15[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[ i ] ); + psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] ); + } + } + } +} diff --git a/thirdparty/opus/silk/PLC.c b/thirdparty/opus/silk/PLC.c new file mode 100644 index 000000000000..fb6ea887b7ee --- /dev/null +++ b/thirdparty/opus/silk/PLC.c @@ -0,0 +1,446 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "stack_alloc.h" +#include "PLC.h" + +#define NB_ATT 2 +static const opus_int16 HARM_ATT_Q15[NB_ATT] = { 32440, 31130 }; /* 0.99, 0.95 */ +static const opus_int16 PLC_RAND_ATTENUATE_V_Q15[NB_ATT] = { 31130, 26214 }; /* 0.95, 0.8 */ +static const opus_int16 PLC_RAND_ATTENUATE_UV_Q15[NB_ATT] = { 32440, 29491 }; /* 0.99, 0.9 */ + +static OPUS_INLINE void silk_PLC_update( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl /* I/O Decoder control */ +); + +static OPUS_INLINE void silk_PLC_conceal( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl, /* I/O Decoder control */ + opus_int16 frame[], /* O LPC residual signal */ + int arch /* I Run-time architecture */ +); + + +void silk_PLC_Reset( + silk_decoder_state *psDec /* I/O Decoder state */ +) +{ + psDec->sPLC.pitchL_Q8 = silk_LSHIFT( psDec->frame_length, 8 - 1 ); + psDec->sPLC.prevGain_Q16[ 0 ] = SILK_FIX_CONST( 1, 16 ); + psDec->sPLC.prevGain_Q16[ 1 ] = SILK_FIX_CONST( 1, 16 ); + psDec->sPLC.subfr_length = 20; + psDec->sPLC.nb_subfr = 2; +} + +void silk_PLC( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl, /* I/O Decoder control */ + opus_int16 frame[], /* I/O signal */ + opus_int lost, /* I Loss flag */ + int arch /* I Run-time architecture */ +) +{ + /* PLC control function */ + if( psDec->fs_kHz != psDec->sPLC.fs_kHz ) { + silk_PLC_Reset( psDec ); + psDec->sPLC.fs_kHz = psDec->fs_kHz; + } + + if( lost ) { + /****************************/ + /* Generate Signal */ + /****************************/ + silk_PLC_conceal( psDec, psDecCtrl, frame, arch ); + + psDec->lossCnt++; + } else { + /****************************/ + /* Update state */ + /****************************/ + silk_PLC_update( psDec, psDecCtrl ); + } +} + +/**************************************************/ +/* Update state of PLC */ +/**************************************************/ +static OPUS_INLINE void silk_PLC_update( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl /* I/O Decoder control */ +) +{ + opus_int32 LTP_Gain_Q14, temp_LTP_Gain_Q14; + opus_int i, j; + silk_PLC_struct *psPLC; + + psPLC = &psDec->sPLC; + + /* Update parameters used in case of packet loss */ + psDec->prevSignalType = psDec->indices.signalType; + LTP_Gain_Q14 = 0; + if( psDec->indices.signalType == TYPE_VOICED ) { + /* Find the parameters for the last subframe which contains a pitch pulse */ + for( j = 0; j * psDec->subfr_length < psDecCtrl->pitchL[ psDec->nb_subfr - 1 ]; j++ ) { + if( j == psDec->nb_subfr ) { + break; + } + temp_LTP_Gain_Q14 = 0; + for( i = 0; i < LTP_ORDER; i++ ) { + temp_LTP_Gain_Q14 += psDecCtrl->LTPCoef_Q14[ ( psDec->nb_subfr - 1 - j ) * LTP_ORDER + i ]; + } + if( temp_LTP_Gain_Q14 > LTP_Gain_Q14 ) { + LTP_Gain_Q14 = temp_LTP_Gain_Q14; + silk_memcpy( psPLC->LTPCoef_Q14, + &psDecCtrl->LTPCoef_Q14[ silk_SMULBB( psDec->nb_subfr - 1 - j, LTP_ORDER ) ], + LTP_ORDER * sizeof( opus_int16 ) ); + + psPLC->pitchL_Q8 = silk_LSHIFT( psDecCtrl->pitchL[ psDec->nb_subfr - 1 - j ], 8 ); + } + } + + silk_memset( psPLC->LTPCoef_Q14, 0, LTP_ORDER * sizeof( opus_int16 ) ); + psPLC->LTPCoef_Q14[ LTP_ORDER / 2 ] = LTP_Gain_Q14; + + /* Limit LT coefs */ + if( LTP_Gain_Q14 < V_PITCH_GAIN_START_MIN_Q14 ) { + opus_int scale_Q10; + opus_int32 tmp; + + tmp = silk_LSHIFT( V_PITCH_GAIN_START_MIN_Q14, 10 ); 
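+ /* scale_Q10 approximates V_PITCH_GAIN_START_MIN_Q14 / LTP_Gain_Q14 in Q10; the loop below scales the taps up so the LTP start gain reaches the minimum. */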
+ scale_Q10 = silk_DIV32( tmp, silk_max( LTP_Gain_Q14, 1 ) ); + for( i = 0; i < LTP_ORDER; i++ ) { + psPLC->LTPCoef_Q14[ i ] = silk_RSHIFT( silk_SMULBB( psPLC->LTPCoef_Q14[ i ], scale_Q10 ), 10 ); + } + } else if( LTP_Gain_Q14 > V_PITCH_GAIN_START_MAX_Q14 ) { + opus_int scale_Q14; + opus_int32 tmp; + + tmp = silk_LSHIFT( V_PITCH_GAIN_START_MAX_Q14, 14 ); + scale_Q14 = silk_DIV32( tmp, silk_max( LTP_Gain_Q14, 1 ) ); + for( i = 0; i < LTP_ORDER; i++ ) { + psPLC->LTPCoef_Q14[ i ] = silk_RSHIFT( silk_SMULBB( psPLC->LTPCoef_Q14[ i ], scale_Q14 ), 14 ); + } + } + } else { + psPLC->pitchL_Q8 = silk_LSHIFT( silk_SMULBB( psDec->fs_kHz, 18 ), 8 ); + silk_memset( psPLC->LTPCoef_Q14, 0, LTP_ORDER * sizeof( opus_int16 )); + } + + /* Save LPC coeficients */ + silk_memcpy( psPLC->prevLPC_Q12, psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order * sizeof( opus_int16 ) ); + psPLC->prevLTP_scale_Q14 = psDecCtrl->LTP_scale_Q14; + + /* Save last two gains */ + silk_memcpy( psPLC->prevGain_Q16, &psDecCtrl->Gains_Q16[ psDec->nb_subfr - 2 ], 2 * sizeof( opus_int32 ) ); + + psPLC->subfr_length = psDec->subfr_length; + psPLC->nb_subfr = psDec->nb_subfr; +} + +static OPUS_INLINE void silk_PLC_energy(opus_int32 *energy1, opus_int *shift1, opus_int32 *energy2, opus_int *shift2, + const opus_int32 *exc_Q14, const opus_int32 *prevGain_Q10, int subfr_length, int nb_subfr) +{ + int i, k; + VARDECL( opus_int16, exc_buf ); + opus_int16 *exc_buf_ptr; + SAVE_STACK; + ALLOC( exc_buf, 2*subfr_length, opus_int16 ); + /* Find random noise component */ + /* Scale previous excitation signal */ + exc_buf_ptr = exc_buf; + for( k = 0; k < 2; k++ ) { + for( i = 0; i < subfr_length; i++ ) { + exc_buf_ptr[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT( + silk_SMULWW( exc_Q14[ i + ( k + nb_subfr - 2 ) * subfr_length ], prevGain_Q10[ k ] ), 8 ) ); + } + exc_buf_ptr += subfr_length; + } + /* Find the subframe with lowest energy of the last two and use that as random noise generator */ + silk_sum_sqr_shift( energy1, shift1, exc_buf, subfr_length ); + silk_sum_sqr_shift( energy2, shift2, &exc_buf[ subfr_length ], subfr_length ); + RESTORE_STACK; +} + +static OPUS_INLINE void silk_PLC_conceal( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl, /* I/O Decoder control */ + opus_int16 frame[], /* O LPC residual signal */ + int arch /* I Run-time architecture */ +) +{ + opus_int i, j, k; + opus_int lag, idx, sLTP_buf_idx, shift1, shift2; + opus_int32 rand_seed, harm_Gain_Q15, rand_Gain_Q15, inv_gain_Q30; + opus_int32 energy1, energy2, *rand_ptr, *pred_lag_ptr; + opus_int32 LPC_pred_Q10, LTP_pred_Q12; + opus_int16 rand_scale_Q14; + opus_int16 *B_Q14; + opus_int32 *sLPC_Q14_ptr; + opus_int16 A_Q12[ MAX_LPC_ORDER ]; +#ifdef SMALL_FOOTPRINT + opus_int16 *sLTP; +#else + VARDECL( opus_int16, sLTP ); +#endif + VARDECL( opus_int32, sLTP_Q14 ); + silk_PLC_struct *psPLC = &psDec->sPLC; + opus_int32 prevGain_Q10[2]; + SAVE_STACK; + + ALLOC( sLTP_Q14, psDec->ltp_mem_length + psDec->frame_length, opus_int32 ); +#ifdef SMALL_FOOTPRINT + /* Ugly hack that breaks aliasing rules to save stack: put sLTP at the very end of sLTP_Q14. 
 */
+    sLTP = ((opus_int16*)&sLTP_Q14[psDec->ltp_mem_length + psDec->frame_length])-psDec->ltp_mem_length;
+#else
+    ALLOC( sLTP, psDec->ltp_mem_length, opus_int16 );
+#endif
+
+    prevGain_Q10[0] = silk_RSHIFT( psPLC->prevGain_Q16[ 0 ], 6);
+    prevGain_Q10[1] = silk_RSHIFT( psPLC->prevGain_Q16[ 1 ], 6);
+
+    if( psDec->first_frame_after_reset ) {
+       silk_memset( psPLC->prevLPC_Q12, 0, sizeof( psPLC->prevLPC_Q12 ) );
+    }
+
+    silk_PLC_energy(&energy1, &shift1, &energy2, &shift2, psDec->exc_Q14, prevGain_Q10, psDec->subfr_length, psDec->nb_subfr);
+
+    if( silk_RSHIFT( energy1, shift2 ) < silk_RSHIFT( energy2, shift1 ) ) {
+        /* First sub-frame has lowest energy */
+        rand_ptr = &psDec->exc_Q14[ silk_max_int( 0, ( psPLC->nb_subfr - 1 ) * psPLC->subfr_length - RAND_BUF_SIZE ) ];
+    } else {
+        /* Second sub-frame has lowest energy */
+        rand_ptr = &psDec->exc_Q14[ silk_max_int( 0, psPLC->nb_subfr * psPLC->subfr_length - RAND_BUF_SIZE ) ];
+    }
+
+    /* Set up Gain to random noise component */
+    B_Q14          = psPLC->LTPCoef_Q14;
+    rand_scale_Q14 = psPLC->randScale_Q14;
+
+    /* Set up attenuation gains */
+    harm_Gain_Q15 = HARM_ATT_Q15[ silk_min_int( NB_ATT - 1, psDec->lossCnt ) ];
+    if( psDec->prevSignalType == TYPE_VOICED ) {
+        rand_Gain_Q15 = PLC_RAND_ATTENUATE_V_Q15[  silk_min_int( NB_ATT - 1, psDec->lossCnt ) ];
+    } else {
+        rand_Gain_Q15 = PLC_RAND_ATTENUATE_UV_Q15[ silk_min_int( NB_ATT - 1, psDec->lossCnt ) ];
+    }
+
+    /* LPC concealment. Apply BWE to previous LPC */
+    silk_bwexpander( psPLC->prevLPC_Q12, psDec->LPC_order, SILK_FIX_CONST( BWE_COEF, 16 ) );
+
+    /* Preload LPC coefficients to array on stack. Gives small performance gain */
+    silk_memcpy( A_Q12, psPLC->prevLPC_Q12, psDec->LPC_order * sizeof( opus_int16 ) );
+
+    /* First Lost frame */
+    if( psDec->lossCnt == 0 ) {
+        rand_scale_Q14 = 1 << 14;
+
+        /* Reduce random noise Gain for voiced frames */
+        if( psDec->prevSignalType == TYPE_VOICED ) {
+            for( i = 0; i < LTP_ORDER; i++ ) {
+                rand_scale_Q14 -= B_Q14[ i ];
+            }
+            rand_scale_Q14 = silk_max_16( 3277, rand_scale_Q14 ); /* 0.2 */
+            rand_scale_Q14 = (opus_int16)silk_RSHIFT( silk_SMULBB( rand_scale_Q14, psPLC->prevLTP_scale_Q14 ), 14 );
+        } else {
+            /* Reduce random noise for unvoiced frames with high LPC gain */
+            opus_int32 invGain_Q30, down_scale_Q30;
+
+            invGain_Q30 = silk_LPC_inverse_pred_gain( psPLC->prevLPC_Q12, psDec->LPC_order );
+
+            down_scale_Q30 = silk_min_32( silk_RSHIFT( (opus_int32)1 << 30, LOG2_INV_LPC_GAIN_HIGH_THRES ), invGain_Q30 );
+            down_scale_Q30 = silk_max_32( silk_RSHIFT( (opus_int32)1 << 30, LOG2_INV_LPC_GAIN_LOW_THRES ), down_scale_Q30 );
+            down_scale_Q30 = silk_LSHIFT( down_scale_Q30, LOG2_INV_LPC_GAIN_HIGH_THRES );
+
+            rand_Gain_Q15 = silk_RSHIFT( silk_SMULWB( down_scale_Q30, rand_Gain_Q15 ), 14 );
+        }
+    }
+
+    rand_seed    = psPLC->rand_seed;
+    lag          = silk_RSHIFT_ROUND( psPLC->pitchL_Q8, 8 );
+    sLTP_buf_idx = psDec->ltp_mem_length;
+
+    /* Rewhiten LTP state */
+    idx = psDec->ltp_mem_length - lag - psDec->LPC_order - LTP_ORDER / 2;
+    silk_assert( idx > 0 );
+    silk_LPC_analysis_filter( &sLTP[ idx ], &psDec->outBuf[ idx ], A_Q12, psDec->ltp_mem_length - idx, psDec->LPC_order, arch );
+    /* Scale LTP state */
+    inv_gain_Q30 = silk_INVERSE32_varQ( psPLC->prevGain_Q16[ 1 ], 46 );
+    inv_gain_Q30 = silk_min( inv_gain_Q30, silk_int32_MAX >> 1 );
+    for( i = idx + psDec->LPC_order; i < psDec->ltp_mem_length; i++ ) {
+        sLTP_Q14[ i ] = silk_SMULWB( inv_gain_Q30, sLTP[ i ] );
+    }
+
+    /***************************/
+    /* LTP synthesis filtering */
+    /***************************/
+    for( k = 0; k <
psDec->nb_subfr; k++ ) { + /* Set up pointer */ + pred_lag_ptr = &sLTP_Q14[ sLTP_buf_idx - lag + LTP_ORDER / 2 ]; + for( i = 0; i < psDec->subfr_length; i++ ) { + /* Unrolled loop */ + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LTP_pred_Q12 = 2; + LTP_pred_Q12 = silk_SMLAWB( LTP_pred_Q12, pred_lag_ptr[ 0 ], B_Q14[ 0 ] ); + LTP_pred_Q12 = silk_SMLAWB( LTP_pred_Q12, pred_lag_ptr[ -1 ], B_Q14[ 1 ] ); + LTP_pred_Q12 = silk_SMLAWB( LTP_pred_Q12, pred_lag_ptr[ -2 ], B_Q14[ 2 ] ); + LTP_pred_Q12 = silk_SMLAWB( LTP_pred_Q12, pred_lag_ptr[ -3 ], B_Q14[ 3 ] ); + LTP_pred_Q12 = silk_SMLAWB( LTP_pred_Q12, pred_lag_ptr[ -4 ], B_Q14[ 4 ] ); + pred_lag_ptr++; + + /* Generate LPC excitation */ + rand_seed = silk_RAND( rand_seed ); + idx = silk_RSHIFT( rand_seed, 25 ) & RAND_BUF_MASK; + sLTP_Q14[ sLTP_buf_idx ] = silk_LSHIFT32( silk_SMLAWB( LTP_pred_Q12, rand_ptr[ idx ], rand_scale_Q14 ), 2 ); + sLTP_buf_idx++; + } + + /* Gradually reduce LTP gain */ + for( j = 0; j < LTP_ORDER; j++ ) { + B_Q14[ j ] = silk_RSHIFT( silk_SMULBB( harm_Gain_Q15, B_Q14[ j ] ), 15 ); + } + /* Gradually reduce excitation gain */ + rand_scale_Q14 = silk_RSHIFT( silk_SMULBB( rand_scale_Q14, rand_Gain_Q15 ), 15 ); + + /* Slowly increase pitch lag */ + psPLC->pitchL_Q8 = silk_SMLAWB( psPLC->pitchL_Q8, psPLC->pitchL_Q8, PITCH_DRIFT_FAC_Q16 ); + psPLC->pitchL_Q8 = silk_min_32( psPLC->pitchL_Q8, silk_LSHIFT( silk_SMULBB( MAX_PITCH_LAG_MS, psDec->fs_kHz ), 8 ) ); + lag = silk_RSHIFT_ROUND( psPLC->pitchL_Q8, 8 ); + } + + /***************************/ + /* LPC synthesis filtering */ + /***************************/ + sLPC_Q14_ptr = &sLTP_Q14[ psDec->ltp_mem_length - MAX_LPC_ORDER ]; + + /* Copy LPC state */ + silk_memcpy( sLPC_Q14_ptr, psDec->sLPC_Q14_buf, MAX_LPC_ORDER * sizeof( opus_int32 ) ); + + silk_assert( psDec->LPC_order >= 10 ); /* check that unrolling works */ + for( i = 0; i < psDec->frame_length; i++ ) { + /* partly unrolled */ + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LPC_pred_Q10 = silk_RSHIFT( psDec->LPC_order, 1 ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 1 ], A_Q12[ 0 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 2 ], A_Q12[ 1 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 3 ], A_Q12[ 2 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 4 ], A_Q12[ 3 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 5 ], A_Q12[ 4 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 6 ], A_Q12[ 5 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 7 ], A_Q12[ 6 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 8 ], A_Q12[ 7 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 9 ], A_Q12[ 8 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - 10 ], A_Q12[ 9 ] ); + for( j = 10; j < psDec->LPC_order; j++ ) { + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14_ptr[ MAX_LPC_ORDER + i - j - 1 ], A_Q12[ j ] ); + } + + /* Add prediction to LPC excitation */ + sLPC_Q14_ptr[ MAX_LPC_ORDER + i ] = silk_ADD_SAT32( sLPC_Q14_ptr[ MAX_LPC_ORDER + i ], + silk_LSHIFT_SAT32( LPC_pred_Q10, 4 )); + + /* Scale with Gain */ + frame[ i ] = (opus_int16)silk_SAT16( silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( sLPC_Q14_ptr[ MAX_LPC_ORDER + i ], prevGain_Q10[ 1 ] ), 8 ) ) 
); + } + + /* Save LPC state */ + silk_memcpy( psDec->sLPC_Q14_buf, &sLPC_Q14_ptr[ psDec->frame_length ], MAX_LPC_ORDER * sizeof( opus_int32 ) ); + + /**************************************/ + /* Update states */ + /**************************************/ + psPLC->rand_seed = rand_seed; + psPLC->randScale_Q14 = rand_scale_Q14; + for( i = 0; i < MAX_NB_SUBFR; i++ ) { + psDecCtrl->pitchL[ i ] = lag; + } + RESTORE_STACK; +} + +/* Glues concealed frames with new good received frames */ +void silk_PLC_glue_frames( + silk_decoder_state *psDec, /* I/O decoder state */ + opus_int16 frame[], /* I/O signal */ + opus_int length /* I length of signal */ +) +{ + opus_int i, energy_shift; + opus_int32 energy; + silk_PLC_struct *psPLC; + psPLC = &psDec->sPLC; + + if( psDec->lossCnt ) { + /* Calculate energy in concealed residual */ + silk_sum_sqr_shift( &psPLC->conc_energy, &psPLC->conc_energy_shift, frame, length ); + + psPLC->last_frame_lost = 1; + } else { + if( psDec->sPLC.last_frame_lost ) { + /* Calculate residual in decoded signal if last frame was lost */ + silk_sum_sqr_shift( &energy, &energy_shift, frame, length ); + + /* Normalize energies */ + if( energy_shift > psPLC->conc_energy_shift ) { + psPLC->conc_energy = silk_RSHIFT( psPLC->conc_energy, energy_shift - psPLC->conc_energy_shift ); + } else if( energy_shift < psPLC->conc_energy_shift ) { + energy = silk_RSHIFT( energy, psPLC->conc_energy_shift - energy_shift ); + } + + /* Fade in the energy difference */ + if( energy > psPLC->conc_energy ) { + opus_int32 frac_Q24, LZ; + opus_int32 gain_Q16, slope_Q16; + + LZ = silk_CLZ32( psPLC->conc_energy ); + LZ = LZ - 1; + psPLC->conc_energy = silk_LSHIFT( psPLC->conc_energy, LZ ); + energy = silk_RSHIFT( energy, silk_max_32( 24 - LZ, 0 ) ); + + frac_Q24 = silk_DIV32( psPLC->conc_energy, silk_max( energy, 1 ) ); + + gain_Q16 = silk_LSHIFT( silk_SQRT_APPROX( frac_Q24 ), 4 ); + slope_Q16 = silk_DIV32_16( ( (opus_int32)1 << 16 ) - gain_Q16, length ); + /* Make slope 4x steeper to avoid missing onsets after DTX */ + slope_Q16 = silk_LSHIFT( slope_Q16, 2 ); + + for( i = 0; i < length; i++ ) { + frame[ i ] = silk_SMULWB( gain_Q16, frame[ i ] ); + gain_Q16 += slope_Q16; + if( gain_Q16 > (opus_int32)1 << 16 ) { + break; + } + } + } + } + psPLC->last_frame_lost = 0; + } +} diff --git a/thirdparty/opus/silk/PLC.h b/thirdparty/opus/silk/PLC.h new file mode 100644 index 000000000000..6438f516330e --- /dev/null +++ b/thirdparty/opus/silk/PLC.h @@ -0,0 +1,62 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_PLC_H +#define SILK_PLC_H + +#include "main.h" + +#define BWE_COEF 0.99 +#define V_PITCH_GAIN_START_MIN_Q14 11469 /* 0.7 in Q14 */ +#define V_PITCH_GAIN_START_MAX_Q14 15565 /* 0.95 in Q14 */ +#define MAX_PITCH_LAG_MS 18 +#define RAND_BUF_SIZE 128 +#define RAND_BUF_MASK ( RAND_BUF_SIZE - 1 ) +#define LOG2_INV_LPC_GAIN_HIGH_THRES 3 /* 2^3 = 8 dB LPC gain */ +#define LOG2_INV_LPC_GAIN_LOW_THRES 8 /* 2^8 = 24 dB LPC gain */ +#define PITCH_DRIFT_FAC_Q16 655 /* 0.01 in Q16 */ + +void silk_PLC_Reset( + silk_decoder_state *psDec /* I/O Decoder state */ +); + +void silk_PLC( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl, /* I/O Decoder control */ + opus_int16 frame[], /* I/O signal */ + opus_int lost, /* I Loss flag */ + int arch /* I Run-time architecture */ +); + +void silk_PLC_glue_frames( + silk_decoder_state *psDec, /* I/O decoder state */ + opus_int16 frame[], /* I/O signal */ + opus_int length /* I length of signal */ +); + +#endif + diff --git a/thirdparty/opus/silk/SigProc_FIX.h b/thirdparty/opus/silk/SigProc_FIX.h new file mode 100644 index 000000000000..b63299441e93 --- /dev/null +++ b/thirdparty/opus/silk/SigProc_FIX.h @@ -0,0 +1,615 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_SIGPROC_FIX_H
+#define SILK_SIGPROC_FIX_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/*#define silk_MACRO_COUNT */          /* Used to enable WMOPS counting */
+
+#define SILK_MAX_ORDER_LPC            16            /* max order of the LPC analysis in schur() and k2a() */
+
+#include <string.h>                                 /* for memset(), memcpy(), memmove() */
+#include "typedef.h"
+#include "resampler_structs.h"
+#include "macros.h"
+#include "cpu_support.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/SigProc_FIX_sse.h"
+#endif
+
+/********************************************************************/
+/*                    SIGNAL PROCESSING FUNCTIONS                   */
+/********************************************************************/
+
+/*!
+ * Initialize/reset the resampler state for a given pair of input/output sampling rates
+*/
+opus_int silk_resampler_init(
+    silk_resampler_state_struct *S,                 /* I/O  Resampler state                 */
+    opus_int32                  Fs_Hz_in,           /* I    Input sampling rate (Hz)        */
+    opus_int32                  Fs_Hz_out,          /* I    Output sampling rate (Hz)       */
+    opus_int                    forEnc              /* I    If 1: encoder; if 0: decoder    */
+);
+
+/*!
+ * Resampler: convert from one sampling rate to another
+ */
+opus_int silk_resampler(
+    silk_resampler_state_struct *S,                 /* I/O  Resampler state                 */
+    opus_int16                  out[],              /* O    Output signal                   */
+    const opus_int16            in[],               /* I    Input signal                    */
+    opus_int32                  inLen               /* I    Number of input samples         */
+);
+
+/*!
+* Downsample 2x, mediocre quality
+*/
+void silk_resampler_down2(
+    opus_int32                  *S,                 /* I/O  State vector [ 2 ]              */
+    opus_int16                  *out,               /* O    Output signal [ len ]           */
+    const opus_int16            *in,                /* I    Input signal [ floor(len/2) ]   */
+    opus_int32                  inLen               /* I    Number of input samples         */
+);
+
+/*!
+ * Downsample by a factor 2/3, low quality
+*/
+void silk_resampler_down2_3(
+    opus_int32                  *S,                 /* I/O  State vector [ 6 ]                  */
+    opus_int16                  *out,               /* O    Output signal [ floor(2*inLen/3) ]  */
+    const opus_int16            *in,                /* I    Input signal [ inLen ]              */
+    opus_int32                  inLen               /* I    Number of input samples             */
+);
+
+/*!
+ * second order ARMA filter;
+ * slower than biquad() but uses more precise coefficients
+ * can handle (slowly) varying coefficients
+ */
+void silk_biquad_alt(
+    const opus_int16            *in,                /* I     input signal                           */
+    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                    */
+    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                    */
+    opus_int32                  *S,                 /* I/O   State vector [2]                       */
+    opus_int16                  *out,               /* O     output signal                          */
+    const opus_int32            len,                /* I     signal length (must be even)           */
+    opus_int                    stride              /* I     Operate on interleaved signal if > 1   */
+);
+
+/* Variable order MA prediction error filter.
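+   (Editor's note, not upstream text: this computes the short-term prediction
+   residual, i.e. the input minus its Q12 LPC prediction from the d previous
+   samples, with the first d output samples set to zero where no history exists.)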
*/ +void silk_LPC_analysis_filter( + opus_int16 *out, /* O Output signal */ + const opus_int16 *in, /* I Input signal */ + const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ + const opus_int32 len, /* I Signal length */ + const opus_int32 d, /* I Filter order */ + int arch /* I Run-time architecture */ +); + +/* Chirp (bandwidth expand) LP AR filter */ +void silk_bwexpander( + opus_int16 *ar, /* I/O AR filter to be expanded (without leading 1) */ + const opus_int d, /* I Length of ar */ + opus_int32 chirp_Q16 /* I Chirp factor (typically in the range 0 to 1) */ +); + +/* Chirp (bandwidth expand) LP AR filter */ +void silk_bwexpander_32( + opus_int32 *ar, /* I/O AR filter to be expanded (without leading 1) */ + const opus_int d, /* I Length of ar */ + opus_int32 chirp_Q16 /* I Chirp factor in Q16 */ +); + +/* Compute inverse of LPC prediction gain, and */ +/* test if LPC coefficients are stable (all poles within unit circle) */ +opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ + const opus_int order /* I Prediction order */ +); + +/* For input in Q24 domain */ +opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int32 *A_Q24, /* I Prediction coefficients [order] */ + const opus_int order /* I Prediction order */ +); + +/* Split signal in two decimated bands using first-order allpass filters */ +void silk_ana_filt_bank_1( + const opus_int16 *in, /* I Input signal [N] */ + opus_int32 *S, /* I/O State vector [2] */ + opus_int16 *outL, /* O Low band [N/2] */ + opus_int16 *outH, /* O High band [N/2] */ + const opus_int32 N /* I Number of input samples */ +); + +/********************************************************************/ +/* SCALAR FUNCTIONS */ +/********************************************************************/ + +/* Approximation of 128 * log2() (exact inverse of approx 2^() below) */ +/* Convert input to a log scale */ +opus_int32 silk_lin2log( + const opus_int32 inLin /* I input in linear scale */ +); + +/* Approximation of a sigmoid function */ +opus_int silk_sigm_Q15( + opus_int in_Q5 /* I */ +); + +/* Approximation of 2^() (exact inverse of approx log2() above) */ +/* Convert input to a linear scale */ +opus_int32 silk_log2lin( + const opus_int32 inLog_Q7 /* I input on log scale */ +); + +/* Compute number of bits to right shift the sum of squares of a vector */ +/* of int16s to make it fit in an int32 */ +void silk_sum_sqr_shift( + opus_int32 *energy, /* O Energy of x, after shifting to the right */ + opus_int *shift, /* O Number of bits right shift applied to energy */ + const opus_int16 *x, /* I Input vector */ + opus_int len /* I Length of input vector */ +); + +/* Calculates the reflection coefficients from the correlation sequence */ +/* Faster than schur64(), but much less accurate. */ +/* uses SMLAWB(), requiring armv5E and higher. */ +opus_int32 silk_schur( /* O Returns residual energy */ + opus_int16 *rc_Q15, /* O reflection coefficients [order] Q15 */ + const opus_int32 *c, /* I correlations [order+1] */ + const opus_int32 order /* I prediction order */ +); + +/* Calculates the reflection coefficients from the correlation sequence */ +/* Slower than schur(), but more accurate. 
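+   (Editor's note, not upstream text: it uses full 64-bit products, via the
+   SMULL() mentioned below, and returns Q16 rather than Q15 reflection
+   coefficients, which is where both the extra accuracy and the extra cost come from.)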
*/ +/* Uses SMULL(), available on armv4 */ +opus_int32 silk_schur64( /* O returns residual energy */ + opus_int32 rc_Q16[], /* O Reflection coefficients [order] Q16 */ + const opus_int32 c[], /* I Correlations [order+1] */ + opus_int32 order /* I Prediction order */ +); + +/* Step up function, converts reflection coefficients to prediction coefficients */ +void silk_k2a( + opus_int32 *A_Q24, /* O Prediction coefficients [order] Q24 */ + const opus_int16 *rc_Q15, /* I Reflection coefficients [order] Q15 */ + const opus_int32 order /* I Prediction order */ +); + +/* Step up function, converts reflection coefficients to prediction coefficients */ +void silk_k2a_Q16( + opus_int32 *A_Q24, /* O Prediction coefficients [order] Q24 */ + const opus_int32 *rc_Q16, /* I Reflection coefficients [order] Q16 */ + const opus_int32 order /* I Prediction order */ +); + +/* Apply sine window to signal vector. */ +/* Window types: */ +/* 1 -> sine window from 0 to pi/2 */ +/* 2 -> sine window from pi/2 to pi */ +/* every other sample of window is linearly interpolated, for speed */ +void silk_apply_sine_window( + opus_int16 px_win[], /* O Pointer to windowed signal */ + const opus_int16 px[], /* I Pointer to input signal */ + const opus_int win_type, /* I Selects a window type */ + const opus_int length /* I Window length, multiple of 4 */ +); + +/* Compute autocorrelation */ +void silk_autocorr( + opus_int32 *results, /* O Result (length correlationCount) */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *inputData, /* I Input data to correlate */ + const opus_int inputDataSize, /* I Length of input */ + const opus_int correlationCount, /* I Number of correlation taps to compute */ + int arch /* I Run-time architecture */ +); + +void silk_decode_pitch( + opus_int16 lagIndex, /* I */ + opus_int8 contourIndex, /* O */ + opus_int pitch_lags[], /* O 4 pitch values */ + const opus_int Fs_kHz, /* I sampling frequency (kHz) */ + const opus_int nb_subfr /* I number of sub frames */ +); + +opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0 voiced, 1 unvoiced */ + const opus_int16 *frame, /* I Signal of length PE_FRAME_LENGTH_MS*Fs_kHz */ + opus_int *pitch_out, /* O 4 pitch lag values */ + opus_int16 *lagIndex, /* O Lag Index */ + opus_int8 *contourIndex, /* O Pitch contour Index */ + opus_int *LTPCorr_Q15, /* I/O Normalized correlation; input: value from previous frame */ + opus_int prevLag, /* I Last lag of previous frame; set to zero is unvoiced */ + const opus_int32 search_thres1_Q16, /* I First stage threshold for lag candidates 0 - 1 */ + const opus_int search_thres2_Q13, /* I Final threshold for lag candidates 0 - 1 */ + const opus_int Fs_kHz, /* I Sample frequency (kHz) */ + const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */ + const opus_int nb_subfr, /* I number of 5 ms subframes */ + int arch /* I Run-time architecture */ +); + +/* Compute Normalized Line Spectral Frequencies (NLSFs) from whitening filter coefficients */ +/* If not all roots are found, the a_Q16 coefficients are bandwidth expanded until convergence. 
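+   (Editor's note, not upstream text: each bandwidth expansion pass scales
+   coefficient k by chirp^(k+1) for a chirp factor slightly below one, as in
+   silk_bwexpander_32() above, moving the roots inward so the root search can succeed.)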
*/ +void silk_A2NLSF( + opus_int16 *NLSF, /* O Normalized Line Spectral Frequencies in Q15 (0..2^15-1) [d] */ + opus_int32 *a_Q16, /* I/O Monic whitening filter coefficients in Q16 [d] */ + const opus_int d /* I Filter order (must be even) */ +); + +/* compute whitening filter coefficients from normalized line spectral frequencies */ +void silk_NLSF2A( + opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */ + const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */ + const opus_int d /* I filter order (should be even) */ +); + +void silk_insertion_sort_increasing( + opus_int32 *a, /* I/O Unsorted / Sorted vector */ + opus_int *idx, /* O Index vector for the sorted elements */ + const opus_int L, /* I Vector length */ + const opus_int K /* I Number of correctly sorted positions */ +); + +void silk_insertion_sort_decreasing_int16( + opus_int16 *a, /* I/O Unsorted / Sorted vector */ + opus_int *idx, /* O Index vector for the sorted elements */ + const opus_int L, /* I Vector length */ + const opus_int K /* I Number of correctly sorted positions */ +); + +void silk_insertion_sort_increasing_all_values_int16( + opus_int16 *a, /* I/O Unsorted / Sorted vector */ + const opus_int L /* I Vector length */ +); + +/* NLSF stabilizer, for a single input data vector */ +void silk_NLSF_stabilize( + opus_int16 *NLSF_Q15, /* I/O Unstable/stabilized normalized LSF vector in Q15 [L] */ + const opus_int16 *NDeltaMin_Q15, /* I Min distance vector, NDeltaMin_Q15[L] must be >= 1 [L+1] */ + const opus_int L /* I Number of NLSF parameters in the input vector */ +); + +/* Laroia low complexity NLSF weights */ +void silk_NLSF_VQ_weights_laroia( + opus_int16 *pNLSFW_Q_OUT, /* O Pointer to input vector weights [D] */ + const opus_int16 *pNLSF_Q15, /* I Pointer to input vector [D] */ + const opus_int D /* I Input vector dimension (even) */ +); + +/* Compute reflection coefficients from input signal */ +void silk_burg_modified_c( + opus_int32 *res_nrg, /* O Residual energy */ + opus_int *res_nrg_Q, /* O Residual energy Q value */ + opus_int32 A_Q16[], /* O Prediction coefficients (length order) */ + const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */ + const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ + const opus_int subfr_length, /* I Input signal subframe length (incl. 
D preceding samples) */
+    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x        */
+    const opus_int              D,                  /* I    Order                                   */
+    int                         arch                /* I    Run-time architecture                   */
+);
+
+/* Copy and multiply a vector by a constant */
+void silk_scale_copy_vector16(
+    opus_int16                  *data_out,
+    const opus_int16            *data_in,
+    opus_int32                  gain_Q16,           /* I    Gain in Q16     */
+    const opus_int              dataSize            /* I    Length          */
+);
+
+/* Some of the LTP related functions require Q26 to work. */
+void silk_scale_vector32_Q26_lshift_18(
+    opus_int32                  *data1,             /* I/O  Q0/Q18          */
+    opus_int32                  gain_Q26,           /* I    Q26             */
+    opus_int                    dataSize            /* I    length          */
+);
+
+/********************************************************************/
+/*                        INLINE ARM MATH                           */
+/********************************************************************/
+
+/*    return sum( inVec1[i] * inVec2[i] ) */
+
+opus_int32 silk_inner_prod_aligned(
+    const opus_int16 *const     inVec1,             /*    I input vector 1              */
+    const opus_int16 *const     inVec2,             /*    I input vector 2              */
+    const opus_int              len,                /*    I vector lengths              */
+    int                         arch                /*    I Run-time architecture       */
+);
+
+
+opus_int32 silk_inner_prod_aligned_scale(
+    const opus_int16 *const     inVec1,             /*    I input vector 1              */
+    const opus_int16 *const     inVec2,             /*    I input vector 2              */
+    const opus_int              scale,              /*    I number of bits to shift     */
+    const opus_int              len                 /*    I vector lengths              */
+);
+
+opus_int64 silk_inner_prod16_aligned_64_c(
+    const opus_int16            *inVec1,            /*    I input vector 1              */
+    const opus_int16            *inVec2,            /*    I input vector 2              */
+    const opus_int              len                 /*    I vector lengths              */
+);
+
+/********************************************************************/
+/*                                MACROS                            */
+/********************************************************************/
+
+/* Rotate a32 right by 'rot' bits. Negative rot values result in rotating
+   left. Output is 32bit int.
+   Note: contemporary compilers recognize the C expression below and
+   compile it into a 'ror' instruction if available. No need for OPUS_INLINE ASM!
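+   Worked example (editor's note, not upstream text): silk_ROR32( 0x00000001, 1 )
+   returns 0x80000000, and silk_ROR32( 0x00000001, -1 ) rotates left, returning
+   0x00000002.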
 */
+static OPUS_INLINE opus_int32 silk_ROR32( opus_int32 a32, opus_int rot )
+{
+    opus_uint32 x = (opus_uint32) a32;
+    opus_uint32 r = (opus_uint32) rot;
+    opus_uint32 m = (opus_uint32) -rot;
+    if( rot == 0 ) {
+        return a32;
+    } else if( rot < 0 ) {
+        return (opus_int32) ((x << m) | (x >> (32 - m)));
+    } else {
+        return (opus_int32) ((x << (32 - r)) | (x >> r));
+    }
+}
+
+/* Allocate opus_int16 aligned to 4-byte memory address */
+#if EMBEDDED_ARM
+#define silk_DWORD_ALIGN __attribute__((aligned(4)))
+#else
+#define silk_DWORD_ALIGN
+#endif
+
+/* Useful Macros that can be adjusted to other platforms */
+#define silk_memcpy(dest, src, size)        memcpy((dest), (src), (size))
+#define silk_memset(dest, src, size)        memset((dest), (src), (size))
+#define silk_memmove(dest, src, size)       memmove((dest), (src), (size))
+
+/* Fixed point macros */
+
+/* (a32 * b32) output have to be 32bit int */
+#define silk_MUL(a32, b32)                  ((a32) * (b32))
+
+/* (a32 * b32) output have to be 32bit uint */
+#define silk_MUL_uint(a32, b32)             silk_MUL(a32, b32)
+
+/* a32 + (b32 * c32) output have to be 32bit int */
+#define silk_MLA(a32, b32, c32)             silk_ADD32((a32),((b32) * (c32)))
+
+/* a32 + (b32 * c32) output have to be 32bit uint */
+#define silk_MLA_uint(a32, b32, c32)        silk_MLA(a32, b32, c32)
+
+/* ((a32 >> 16) * (b32 >> 16)) output have to be 32bit int */
+#define silk_SMULTT(a32, b32)               (((a32) >> 16) * ((b32) >> 16))
+
+/* a32 + ((b32 >> 16) * (c32 >> 16)) output have to be 32bit int */
+#define silk_SMLATT(a32, b32, c32)          silk_ADD32((a32),((b32) >> 16) * ((c32) >> 16))
+
+#define silk_SMLALBB(a64, b16, c16)         silk_ADD64((a64),(opus_int64)((opus_int32)(b16) * (opus_int32)(c16)))
+
+/* (a32 * b32) */
+#define silk_SMULL(a32, b32)                ((opus_int64)(a32) * /*(opus_int64)*/(b32))
+
+/* Adds two signed 32-bit values in a way that can overflow, while not relying on undefined behaviour
+   (just standard two's complement implementation-specific behaviour) */
+#define silk_ADD32_ovflw(a, b)              ((opus_int32)((opus_uint32)(a) + (opus_uint32)(b)))
+/* Subtracts two signed 32-bit values in a way that can overflow, while not relying on undefined behaviour
+   (just standard two's complement implementation-specific behaviour) */
+#define silk_SUB32_ovflw(a, b)              ((opus_int32)((opus_uint32)(a) - (opus_uint32)(b)))
+
+/* Multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode) */
+#define silk_MLA_ovflw(a32, b32, c32)       silk_ADD32_ovflw((a32), (opus_uint32)(b32) * (opus_uint32)(c32))
+#define silk_SMLABB_ovflw(a32, b32, c32)    (silk_ADD32_ovflw((a32) , ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32))))
+
+#define silk_DIV32_16(a32, b16)             ((opus_int32)((a32) / (b16)))
+#define silk_DIV32(a32, b32)                ((opus_int32)((a32) / (b32)))
+
+/* These macros enable checking for overflow in silk_API_Debug.h */
+#define silk_ADD16(a, b)                    ((a) + (b))
+#define silk_ADD32(a, b)                    ((a) + (b))
+#define silk_ADD64(a, b)                    ((a) + (b))
+
+#define silk_SUB16(a, b)                    ((a) - (b))
+#define silk_SUB32(a, b)                    ((a) - (b))
+#define silk_SUB64(a, b)                    ((a) - (b))
+
+#define silk_SAT8(a)                        ((a) > silk_int8_MAX ? silk_int8_MAX  :      \
+                                            ((a) < silk_int8_MIN ? silk_int8_MIN  : (a)))
+#define silk_SAT16(a)                       ((a) > silk_int16_MAX ? silk_int16_MAX :     \
+                                            ((a) < silk_int16_MIN ? silk_int16_MIN : (a)))
+#define silk_SAT32(a)                       ((a) > silk_int32_MAX ? silk_int32_MAX :     \
+                                            ((a) < silk_int32_MIN ?
silk_int32_MIN : (a))) + +#define silk_CHECK_FIT8(a) (a) +#define silk_CHECK_FIT16(a) (a) +#define silk_CHECK_FIT32(a) (a) + +#define silk_ADD_SAT16(a, b) (opus_int16)silk_SAT16( silk_ADD32( (opus_int32)(a), (b) ) ) +#define silk_ADD_SAT64(a, b) ((((a) + (b)) & 0x8000000000000000LL) == 0 ? \ + ((((a) & (b)) & 0x8000000000000000LL) != 0 ? silk_int64_MIN : (a)+(b)) : \ + ((((a) | (b)) & 0x8000000000000000LL) == 0 ? silk_int64_MAX : (a)+(b)) ) + +#define silk_SUB_SAT16(a, b) (opus_int16)silk_SAT16( silk_SUB32( (opus_int32)(a), (b) ) ) +#define silk_SUB_SAT64(a, b) ((((a)-(b)) & 0x8000000000000000LL) == 0 ? \ + (( (a) & ((b)^0x8000000000000000LL) & 0x8000000000000000LL) ? silk_int64_MIN : (a)-(b)) : \ + ((((a)^0x8000000000000000LL) & (b) & 0x8000000000000000LL) ? silk_int64_MAX : (a)-(b)) ) + +/* Saturation for positive input values */ +#define silk_POS_SAT32(a) ((a) > silk_int32_MAX ? silk_int32_MAX : (a)) + +/* Add with saturation for positive input values */ +#define silk_ADD_POS_SAT8(a, b) ((((a)+(b)) & 0x80) ? silk_int8_MAX : ((a)+(b))) +#define silk_ADD_POS_SAT16(a, b) ((((a)+(b)) & 0x8000) ? silk_int16_MAX : ((a)+(b))) +#define silk_ADD_POS_SAT32(a, b) ((((a)+(b)) & 0x80000000) ? silk_int32_MAX : ((a)+(b))) +#define silk_ADD_POS_SAT64(a, b) ((((a)+(b)) & 0x8000000000000000LL) ? silk_int64_MAX : ((a)+(b))) + +#define silk_LSHIFT8(a, shift) ((opus_int8)((opus_uint8)(a)<<(shift))) /* shift >= 0, shift < 8 */ +#define silk_LSHIFT16(a, shift) ((opus_int16)((opus_uint16)(a)<<(shift))) /* shift >= 0, shift < 16 */ +#define silk_LSHIFT32(a, shift) ((opus_int32)((opus_uint32)(a)<<(shift))) /* shift >= 0, shift < 32 */ +#define silk_LSHIFT64(a, shift) ((opus_int64)((opus_uint64)(a)<<(shift))) /* shift >= 0, shift < 64 */ +#define silk_LSHIFT(a, shift) silk_LSHIFT32(a, shift) /* shift >= 0, shift < 32 */ + +#define silk_RSHIFT8(a, shift) ((a)>>(shift)) /* shift >= 0, shift < 8 */ +#define silk_RSHIFT16(a, shift) ((a)>>(shift)) /* shift >= 0, shift < 16 */ +#define silk_RSHIFT32(a, shift) ((a)>>(shift)) /* shift >= 0, shift < 32 */ +#define silk_RSHIFT64(a, shift) ((a)>>(shift)) /* shift >= 0, shift < 64 */ +#define silk_RSHIFT(a, shift) silk_RSHIFT32(a, shift) /* shift >= 0, shift < 32 */ + +/* saturates before shifting */ +#define silk_LSHIFT_SAT32(a, shift) (silk_LSHIFT32( silk_LIMIT( (a), silk_RSHIFT32( silk_int32_MIN, (shift) ), \ + silk_RSHIFT32( silk_int32_MAX, (shift) ) ), (shift) )) + +#define silk_LSHIFT_ovflw(a, shift) ((opus_int32)((opus_uint32)(a) << (shift))) /* shift >= 0, allowed to overflow */ +#define silk_LSHIFT_uint(a, shift) ((a) << (shift)) /* shift >= 0 */ +#define silk_RSHIFT_uint(a, shift) ((a) >> (shift)) /* shift >= 0 */ + +#define silk_ADD_LSHIFT(a, b, shift) ((a) + silk_LSHIFT((b), (shift))) /* shift >= 0 */ +#define silk_ADD_LSHIFT32(a, b, shift) silk_ADD32((a), silk_LSHIFT32((b), (shift))) /* shift >= 0 */ +#define silk_ADD_LSHIFT_uint(a, b, shift) ((a) + silk_LSHIFT_uint((b), (shift))) /* shift >= 0 */ +#define silk_ADD_RSHIFT(a, b, shift) ((a) + silk_RSHIFT((b), (shift))) /* shift >= 0 */ +#define silk_ADD_RSHIFT32(a, b, shift) silk_ADD32((a), silk_RSHIFT32((b), (shift))) /* shift >= 0 */ +#define silk_ADD_RSHIFT_uint(a, b, shift) ((a) + silk_RSHIFT_uint((b), (shift))) /* shift >= 0 */ +#define silk_SUB_LSHIFT32(a, b, shift) silk_SUB32((a), silk_LSHIFT32((b), (shift))) /* shift >= 0 */ +#define silk_SUB_RSHIFT32(a, b, shift) silk_SUB32((a), silk_RSHIFT32((b), (shift))) /* shift >= 0 */ + +/* Requires that shift > 0 */ +#define silk_RSHIFT_ROUND(a, shift) ((shift) == 
1 ? ((a) >> 1) + ((a) & 1) : (((a) >> ((shift) - 1)) + 1) >> 1) +#define silk_RSHIFT_ROUND64(a, shift) ((shift) == 1 ? ((a) >> 1) + ((a) & 1) : (((a) >> ((shift) - 1)) + 1) >> 1) + +/* Number of rightshift required to fit the multiplication */ +#define silk_NSHIFT_MUL_32_32(a, b) ( -(31- (32-silk_CLZ32(silk_abs(a)) + (32-silk_CLZ32(silk_abs(b))))) ) +#define silk_NSHIFT_MUL_16_16(a, b) ( -(15- (16-silk_CLZ16(silk_abs(a)) + (16-silk_CLZ16(silk_abs(b))))) ) + + +#define silk_min(a, b) (((a) < (b)) ? (a) : (b)) +#define silk_max(a, b) (((a) > (b)) ? (a) : (b)) + +/* Macro to convert floating-point constants to fixed-point */ +#define SILK_FIX_CONST( C, Q ) ((opus_int32)((C) * ((opus_int64)1 << (Q)) + 0.5)) + +/* silk_min() versions with typecast in the function call */ +static OPUS_INLINE opus_int silk_min_int(opus_int a, opus_int b) +{ + return (((a) < (b)) ? (a) : (b)); +} +static OPUS_INLINE opus_int16 silk_min_16(opus_int16 a, opus_int16 b) +{ + return (((a) < (b)) ? (a) : (b)); +} +static OPUS_INLINE opus_int32 silk_min_32(opus_int32 a, opus_int32 b) +{ + return (((a) < (b)) ? (a) : (b)); +} +static OPUS_INLINE opus_int64 silk_min_64(opus_int64 a, opus_int64 b) +{ + return (((a) < (b)) ? (a) : (b)); +} + +/* silk_min() versions with typecast in the function call */ +static OPUS_INLINE opus_int silk_max_int(opus_int a, opus_int b) +{ + return (((a) > (b)) ? (a) : (b)); +} +static OPUS_INLINE opus_int16 silk_max_16(opus_int16 a, opus_int16 b) +{ + return (((a) > (b)) ? (a) : (b)); +} +static OPUS_INLINE opus_int32 silk_max_32(opus_int32 a, opus_int32 b) +{ + return (((a) > (b)) ? (a) : (b)); +} +static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b) +{ + return (((a) > (b)) ? (a) : (b)); +} + +#define silk_LIMIT( a, limit1, limit2) ((limit1) > (limit2) ? ((a) > (limit1) ? (limit1) : ((a) < (limit2) ? (limit2) : (a))) \ + : ((a) > (limit2) ? (limit2) : ((a) < (limit1) ? (limit1) : (a)))) + +#define silk_LIMIT_int silk_LIMIT +#define silk_LIMIT_16 silk_LIMIT +#define silk_LIMIT_32 silk_LIMIT + +#define silk_abs(a) (((a) > 0) ? (a) : -(a)) /* Be careful, silk_abs returns wrong when input equals to silk_intXX_MIN */ +#define silk_abs_int(a) (((a) ^ ((a) >> (8 * sizeof(a) - 1))) - ((a) >> (8 * sizeof(a) - 1))) +#define silk_abs_int32(a) (((a) ^ ((a) >> 31)) - ((a) >> 31)) +#define silk_abs_int64(a) (((a) > 0) ? (a) : -(a)) + +#define silk_sign(a) ((a) > 0 ? 1 : ( (a) < 0 ? -1 : 0 )) + +/* PSEUDO-RANDOM GENERATOR */ +/* Make sure to store the result as the seed for the next call (also in between */ +/* frames), otherwise result won't be random at all. When only using some of the */ +/* bits, take the most significant bits by right-shifting. */ +#define silk_RAND(seed) (silk_MLA_ovflw(907633515, (seed), 196314165)) + +/* Add some multiplication functions that can be easily mapped to ARM. */ + +/* silk_SMMUL: Signed top word multiply. + ARMv6 2 instruction cycles. + ARMv3M+ 3 instruction cycles. 
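+   (Editor's note, not upstream text: on every platform the result equals the
+   top 32 bits of the signed 64-bit product, i.e. (a32 * b32) >> 32.)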
use SMULL and ignore LSB registers.(except xM)*/ +/*#define silk_SMMUL(a32, b32) (opus_int32)silk_RSHIFT(silk_SMLAL(silk_SMULWB((a32), (b32)), (a32), silk_RSHIFT_ROUND((b32), 16)), 16)*/ +/* the following seems faster on x86 */ +#define silk_SMMUL(a32, b32) (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32) + +#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ + ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) + +#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ + ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len)) +#endif + +#include "Inlines.h" +#include "MacroCount.h" +#include "MacroDebug.h" + +#ifdef OPUS_ARM_INLINE_ASM +#include "arm/SigProc_FIX_armv4.h" +#endif + +#ifdef OPUS_ARM_INLINE_EDSP +#include "arm/SigProc_FIX_armv5e.h" +#endif + +#if defined(MIPSr1_ASM) +#include "mips/sigproc_fix_mipsr1.h" +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* SILK_SIGPROC_FIX_H */ diff --git a/thirdparty/opus/silk/VAD.c b/thirdparty/opus/silk/VAD.c new file mode 100644 index 000000000000..0a782af2f132 --- /dev/null +++ b/thirdparty/opus/silk/VAD.c @@ -0,0 +1,362 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "stack_alloc.h" + +/* Silk VAD noise level estimation */ +# if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +static OPUS_INLINE void silk_VAD_GetNoiseLevels( + const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */ + silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ +); +#endif + +/**********************************/ +/* Initialization of the Silk VAD */ +/**********************************/ +opus_int silk_VAD_Init( /* O Return value, 0 if success */ + silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ +) +{ + opus_int b, ret = 0; + + /* reset state memory */ + silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) ); + + /* init noise levels */ + /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */ + for( b = 0; b < VAD_N_BANDS; b++ ) { + psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 ); + } + + /* Initialize state */ + for( b = 0; b < VAD_N_BANDS; b++ ) { + psSilk_VAD->NL[ b ] = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] ); + psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] ); + } + psSilk_VAD->counter = 15; + + /* init smoothed energy-to-noise ratio*/ + for( b = 0; b < VAD_N_BANDS; b++ ) { + psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256; /* 100 * 256 --> 20 dB SNR */ + } + + return( ret ); +} + +/* Weighting factors for tilt measure */ +static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 }; + +/***************************************/ +/* Get the speech activity level in Q8 */ +/***************************************/ +opus_int silk_VAD_GetSA_Q8_c( /* O Return value, 0 if success */ + silk_encoder_state *psEncC, /* I/O Encoder state */ + const opus_int16 pIn[] /* I PCM input */ +) +{ + opus_int SA_Q15, pSNR_dB_Q7, input_tilt; + opus_int decimated_framelength1, decimated_framelength2; + opus_int decimated_framelength; + opus_int dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s; + opus_int32 sumSquared, smooth_coef_Q16; + opus_int16 HPstateTmp; + VARDECL( opus_int16, X ); + opus_int32 Xnrg[ VAD_N_BANDS ]; + opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ]; + opus_int32 speech_nrg, x_tmp; + opus_int X_offset[ VAD_N_BANDS ]; + opus_int ret = 0; + silk_VAD_state *psSilk_VAD = &psEncC->sVAD; + SAVE_STACK; + + /* Safety checks */ + silk_assert( VAD_N_BANDS == 4 ); + silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length ); + silk_assert( psEncC->frame_length <= 512 ); + silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) ); + + /***********************/ + /* Filter and Decimate */ + /***********************/ + decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 ); + decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 ); + decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 ); + /* Decimate into 4 bands: + 0 L 3L L 3L 5L + - -- - -- -- + 8 8 2 4 4 + + [0-1 kHz| temp. 
|1-2 kHz| 2-4 kHz | 4-8 kHz | + + They're arranged to allow the minimal ( frame_length / 4 ) extra + scratch space during the downsampling process */ + X_offset[ 0 ] = 0; + X_offset[ 1 ] = decimated_framelength + decimated_framelength2; + X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength; + X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2; + ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 ); + + /* 0-8 kHz to 0-4 kHz and 4-8 kHz */ + silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[ 0 ], + X, &X[ X_offset[ 3 ] ], psEncC->frame_length ); + + /* 0-4 kHz to 0-2 kHz and 2-4 kHz */ + silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ], + X, &X[ X_offset[ 2 ] ], decimated_framelength1 ); + + /* 0-2 kHz to 0-1 kHz and 1-2 kHz */ + silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ], + X, &X[ X_offset[ 1 ] ], decimated_framelength2 ); + + /*********************************************/ + /* HP filter on lowest band (differentiator) */ + /*********************************************/ + X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 ); + HPstateTmp = X[ decimated_framelength - 1 ]; + for( i = decimated_framelength - 1; i > 0; i-- ) { + X[ i - 1 ] = silk_RSHIFT( X[ i - 1 ], 1 ); + X[ i ] -= X[ i - 1 ]; + } + X[ 0 ] -= psSilk_VAD->HPstate; + psSilk_VAD->HPstate = HPstateTmp; + + /*************************************/ + /* Calculate the energy in each band */ + /*************************************/ + for( b = 0; b < VAD_N_BANDS; b++ ) { + /* Find the decimated framelength in the non-uniformly divided bands */ + decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) ); + + /* Split length into subframe lengths */ + dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 ); + dec_subframe_offset = 0; + + /* Compute energy per sub-frame */ + /* initialize with summed energy of last subframe */ + Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ]; + for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) { + sumSquared = 0; + for( i = 0; i < dec_subframe_length; i++ ) { + /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. 
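+                   (Editor's note, not upstream text: ( 32768 / 8 )^2 = 2^24 per
+                   sample, so 128 samples can sum to about 2^31.)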
*/ + /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */ + x_tmp = silk_RSHIFT( + X[ X_offset[ b ] + i + dec_subframe_offset ], 3 ); + sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp ); + + /* Safety check */ + silk_assert( sumSquared >= 0 ); + } + + /* Add/saturate summed energy of current subframe */ + if( s < VAD_INTERNAL_SUBFRAMES - 1 ) { + Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared ); + } else { + /* Look-ahead subframe */ + Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) ); + } + + dec_subframe_offset += dec_subframe_length; + } + psSilk_VAD->XnrgSubfr[ b ] = sumSquared; + } + + /********************/ + /* Noise estimation */ + /********************/ + silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD ); + + /***********************************************/ + /* Signal-plus-noise to noise ratio estimation */ + /***********************************************/ + sumSquared = 0; + input_tilt = 0; + for( b = 0; b < VAD_N_BANDS; b++ ) { + speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ]; + if( speech_nrg > 0 ) { + /* Divide, with sufficient resolution */ + if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) { + NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 ); + } else { + NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 ); + } + + /* Convert to log domain */ + SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128; + + /* Sum-of-squares */ + sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 ); /* Q14 */ + + /* Tilt measure */ + if( speech_nrg < ( (opus_int32)1 << 20 ) ) { + /* Scale down SNR value for small subband speech energies */ + SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 ); + } + input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 ); + } else { + NrgToNoiseRatio_Q8[ b ] = 256; + } + } + + /* Mean-of-squares */ + sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */ + + /* Root-mean-square approximation, scale to dBs, and write to output pointer */ + pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */ + + /*********************************/ + /* Speech Probability Estimation */ + /*********************************/ + SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 ); + + /**************************/ + /* Frequency Tilt Measure */ + /**************************/ + psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 ); + + /**************************************************/ + /* Scale the sigmoid output based on power levels */ + /**************************************************/ + speech_nrg = 0; + for( b = 0; b < VAD_N_BANDS; b++ ) { + /* Accumulate signal-without-noise energies, higher frequency bands have more weight */ + speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 ); + } + + /* Power scaling */ + if( speech_nrg <= 0 ) { + SA_Q15 = silk_RSHIFT( SA_Q15, 1 ); + } else if( speech_nrg < 32768 ) { + if( psEncC->frame_length == 10 * psEncC->fs_kHz ) { + speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 ); + } else { + speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 ); + } + + /* square-root */ + speech_nrg = silk_SQRT_APPROX( speech_nrg ); + SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 ); + } + + /* Copy the resulting speech activity in Q8 */ + psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX ); + + 
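+    /* Editor's note, not upstream text: the activity decision above is final; the
+       code below only refreshes the smoothed per-band SNR that feeds
+       input_quality_bands_Q15. */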
/***********************************/ + /* Energy Level and SNR estimation */ + /***********************************/ + /* Smoothing coefficient */ + smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) ); + + if( psEncC->frame_length == 10 * psEncC->fs_kHz ) { + smooth_coef_Q16 >>= 1; + } + + for( b = 0; b < VAD_N_BANDS; b++ ) { + /* compute smoothed energy-to-noise ratio per band */ + psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ], + NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 ); + + /* signal to noise ratio in dB per band */ + SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 ); + /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */ + psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) ); + } + + RESTORE_STACK; + return( ret ); +} + +/**************************/ +/* Noise level estimation */ +/**************************/ +# if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +static OPUS_INLINE +#endif +void silk_VAD_GetNoiseLevels( + const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */ + silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ +) +{ + opus_int k; + opus_int32 nl, nrg, inv_nrg; + opus_int coef, min_coef; + + /* Initially faster smoothing */ + if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */ + min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 ); + } else { + min_coef = 0; + } + + for( k = 0; k < VAD_N_BANDS; k++ ) { + /* Get old noise level estimate for current band */ + nl = psSilk_VAD->NL[ k ]; + silk_assert( nl >= 0 ); + + /* Add bias */ + nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] ); + silk_assert( nrg > 0 ); + + /* Invert energies */ + inv_nrg = silk_DIV32( silk_int32_MAX, nrg ); + silk_assert( inv_nrg >= 0 ); + + /* Less update when subband energy is high */ + if( nrg > silk_LSHIFT( nl, 3 ) ) { + coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3; + } else if( nrg < nl ) { + coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16; + } else { + coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 ); + } + + /* Initially faster smoothing */ + coef = silk_max_int( coef, min_coef ); + + /* Smooth inverse energies */ + psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef ); + silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 ); + + /* Compute noise level by inverting again */ + nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] ); + silk_assert( nl >= 0 ); + + /* Limit noise levels (guarantee 7 bits of head room) */ + nl = silk_min( nl, 0x00FFFFFF ); + + /* Store as part of state */ + psSilk_VAD->NL[ k ] = nl; + } + + /* Increment frame counter */ + psSilk_VAD->counter++; +} diff --git a/thirdparty/opus/silk/VQ_WMat_EC.c b/thirdparty/opus/silk/VQ_WMat_EC.c new file mode 100644 index 000000000000..7983f1db80cc --- /dev/null +++ b/thirdparty/opus/silk/VQ_WMat_EC.c @@ -0,0 +1,120 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */ +void silk_VQ_WMat_EC_c( + opus_int8 *ind, /* O index of best codebook vector */ + opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ + const opus_int16 *in_Q14, /* I input vector to be quantized */ + const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int8 *cb_Q7, /* I codebook */ + const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ + const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ + const opus_int mu_Q9, /* I tradeoff betw. 
weighted error and rate */ + const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ + opus_int L /* I number of vectors in codebook */ +) +{ + opus_int k, gain_tmp_Q7; + const opus_int8 *cb_row_Q7; + opus_int16 diff_Q14[ 5 ]; + opus_int32 sum1_Q14, sum2_Q16; + + /* Loop over codebook */ + *rate_dist_Q14 = silk_int32_MAX; + cb_row_Q7 = cb_Q7; + for( k = 0; k < L; k++ ) { + gain_tmp_Q7 = cb_gain_Q7[k]; + + diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 ); + diff_Q14[ 1 ] = in_Q14[ 1 ] - silk_LSHIFT( cb_row_Q7[ 1 ], 7 ); + diff_Q14[ 2 ] = in_Q14[ 2 ] - silk_LSHIFT( cb_row_Q7[ 2 ], 7 ); + diff_Q14[ 3 ] = in_Q14[ 3 ] - silk_LSHIFT( cb_row_Q7[ 3 ], 7 ); + diff_Q14[ 4 ] = in_Q14[ 4 ] - silk_LSHIFT( cb_row_Q7[ 4 ], 7 ); + + /* Weighted rate */ + sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] ); + + /* Penalty for too large gain */ + sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 ); + + silk_assert( sum1_Q14 >= 0 ); + + /* first row of W_Q18 */ + sum2_Q16 = silk_SMULWB( W_Q18[ 1 ], diff_Q14[ 1 ] ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 2 ], diff_Q14[ 2 ] ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 3 ], diff_Q14[ 3 ] ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 4 ], diff_Q14[ 4 ] ); + sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] ); + sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] ); + + /* second row of W_Q18 */ + sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] ); + sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] ); + sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] ); + + /* third row of W_Q18 */ + sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] ); + sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] ); + sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] ); + + /* fourth row of W_Q18 */ + sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] ); + sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 ); + sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] ); + sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] ); + + /* last row of W_Q18 */ + sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] ); + sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] ); + + silk_assert( sum1_Q14 >= 0 ); + + /* find best */ + if( sum1_Q14 < *rate_dist_Q14 ) { + *rate_dist_Q14 = sum1_Q14; + *ind = (opus_int8)k; + *gain_Q7 = gain_tmp_Q7; + } + + /* Go to next cbk vector */ + cb_row_Q7 += LTP_ORDER; + } +} diff --git a/thirdparty/opus/silk/ana_filt_bank_1.c b/thirdparty/opus/silk/ana_filt_bank_1.c new file mode 100644 index 000000000000..24cfb03fdb2a --- /dev/null +++ b/thirdparty/opus/silk/ana_filt_bank_1.c @@ -0,0 +1,74 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Coefficients for 2-band filter bank based on first-order allpass filters */ +static opus_int16 A_fb1_20 = 5394 << 1; +static opus_int16 A_fb1_21 = -24290; /* (opus_int16)(20623 << 1) */ + +/* Split signal into two decimated bands using first-order allpass filters */ +void silk_ana_filt_bank_1( + const opus_int16 *in, /* I Input signal [N] */ + opus_int32 *S, /* I/O State vector [2] */ + opus_int16 *outL, /* O Low band [N/2] */ + opus_int16 *outH, /* O High band [N/2] */ + const opus_int32 N /* I Number of input samples */ +) +{ + opus_int k, N2 = silk_RSHIFT( N, 1 ); + opus_int32 in32, X, Y, out_1, out_2; + + /* Internal variables and state are in Q10 format */ + for( k = 0; k < N2; k++ ) { + /* Convert to Q10 */ + in32 = silk_LSHIFT( (opus_int32)in[ 2 * k ], 10 ); + + /* All-pass section for even input sample */ + Y = silk_SUB32( in32, S[ 0 ] ); + X = silk_SMLAWB( Y, Y, A_fb1_21 ); + out_1 = silk_ADD32( S[ 0 ], X ); + S[ 0 ] = silk_ADD32( in32, X ); + + /* Convert to Q10 */ + in32 = silk_LSHIFT( (opus_int32)in[ 2 * k + 1 ], 10 ); + + /* All-pass section for odd input sample, and add to output of previous section */ + Y = silk_SUB32( in32, S[ 1 ] ); + X = silk_SMULWB( Y, A_fb1_20 ); + out_2 = silk_ADD32( S[ 1 ], X ); + S[ 1 ] = silk_ADD32( in32, X ); + + /* Add/subtract, convert back to int16 and store to output */ + outL[ k ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_ADD32( out_2, out_1 ), 11 ) ); + outH[ k ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SUB32( out_2, out_1 ), 11 ) ); + } +} diff --git a/thirdparty/opus/silk/biquad_alt.c b/thirdparty/opus/silk/biquad_alt.c new file mode 100644 index 000000000000..d55f5ee92ec2 --- /dev/null +++ b/thirdparty/opus/silk/biquad_alt.c @@ -0,0 +1,78 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +/* * + * silk_biquad_alt.c * + * * + * Second order ARMA filter * + * Can handle slowly varying filter coefficients * + * */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Second order ARMA filter, alternative implementation */ +void silk_biquad_alt( + const opus_int16 *in, /* I input signal */ + const opus_int32 *B_Q28, /* I MA coefficients [3] */ + const opus_int32 *A_Q28, /* I AR coefficients [2] */ + opus_int32 *S, /* I/O State vector [2] */ + opus_int16 *out, /* O output signal */ + const opus_int32 len, /* I signal length (must be even) */ + opus_int stride /* I Operate on interleaved signal if > 1 */ +) +{ + /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */ + opus_int k; + opus_int32 inval, A0_U_Q28, A0_L_Q28, A1_U_Q28, A1_L_Q28, out32_Q14; + + /* Negate A_Q28 values and split in two parts */ + A0_L_Q28 = ( -A_Q28[ 0 ] ) & 0x00003FFF; /* lower part */ + A0_U_Q28 = silk_RSHIFT( -A_Q28[ 0 ], 14 ); /* upper part */ + A1_L_Q28 = ( -A_Q28[ 1 ] ) & 0x00003FFF; /* lower part */ + A1_U_Q28 = silk_RSHIFT( -A_Q28[ 1 ], 14 ); /* upper part */ + + for( k = 0; k < len; k++ ) { + /* S[ 0 ], S[ 1 ]: Q12 */ + inval = in[ k * stride ]; + out32_Q14 = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], inval ), 2 ); + + S[ 0 ] = S[1] + silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14, A0_L_Q28 ), 14 ); + S[ 0 ] = silk_SMLAWB( S[ 0 ], out32_Q14, A0_U_Q28 ); + S[ 0 ] = silk_SMLAWB( S[ 0 ], B_Q28[ 1 ], inval); + + S[ 1 ] = silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14, A1_L_Q28 ), 14 ); + S[ 1 ] = silk_SMLAWB( S[ 1 ], out32_Q14, A1_U_Q28 ); + S[ 1 ] = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], inval ); + + /* Scale back to Q0 and saturate */ + out[ k * stride ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14 + (1<<14) - 1, 14 ) ); + } +} diff --git a/thirdparty/opus/silk/bwexpander.c b/thirdparty/opus/silk/bwexpander.c new file mode 100644 index 000000000000..2eb4456695e7 --- /dev/null +++ b/thirdparty/opus/silk/bwexpander.c @@ -0,0 +1,51 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "SigProc_FIX.h"
+
+/* Chirp (bandwidth expand) LP AR filter */
+void silk_bwexpander(
+    opus_int16          *ar,            /* I/O  AR filter to be expanded (without leading 1)    */
+    const opus_int      d,              /* I    Length of ar                                    */
+    opus_int32          chirp_Q16       /* I    Chirp factor (typically in the range 0 to 1)    */
+)
+{
+    opus_int   i;
+    opus_int32 chirp_minus_one_Q16 = chirp_Q16 - 65536;
+
+    /* NB: Don't use silk_SMULWB instead of silk_RSHIFT_ROUND( silk_MUL(), 16 ) below. */
+    /* Bias in silk_SMULWB can lead to unstable filters */
+    for( i = 0; i < d - 1; i++ ) {
+        ar[ i ]    = (opus_int16)silk_RSHIFT_ROUND( silk_MUL( chirp_Q16, ar[ i ] ), 16 );
+        chirp_Q16 += silk_RSHIFT_ROUND( silk_MUL( chirp_Q16, chirp_minus_one_Q16 ), 16 );
+    }
+    ar[ d - 1 ] = (opus_int16)silk_RSHIFT_ROUND( silk_MUL( chirp_Q16, ar[ d - 1 ] ), 16 );
+}
diff --git a/thirdparty/opus/silk/bwexpander_32.c b/thirdparty/opus/silk/bwexpander_32.c
new file mode 100644
index 000000000000..d0010f73dfd5
--- /dev/null
+++ b/thirdparty/opus/silk/bwexpander_32.c
@@ -0,0 +1,50 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Chirp (bandwidth expand) LP AR filter */ +void silk_bwexpander_32( + opus_int32 *ar, /* I/O AR filter to be expanded (without leading 1) */ + const opus_int d, /* I Length of ar */ + opus_int32 chirp_Q16 /* I Chirp factor in Q16 */ +) +{ + opus_int i; + opus_int32 chirp_minus_one_Q16 = chirp_Q16 - 65536; + + for( i = 0; i < d - 1; i++ ) { + ar[ i ] = silk_SMULWW( chirp_Q16, ar[ i ] ); + chirp_Q16 += silk_RSHIFT_ROUND( silk_MUL( chirp_Q16, chirp_minus_one_Q16 ), 16 ); + } + ar[ d - 1 ] = silk_SMULWW( chirp_Q16, ar[ d - 1 ] ); +} + diff --git a/thirdparty/opus/silk/check_control_input.c b/thirdparty/opus/silk/check_control_input.c new file mode 100644 index 000000000000..b5de9ce48d69 --- /dev/null +++ b/thirdparty/opus/silk/check_control_input.c @@ -0,0 +1,106 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "control.h" +#include "errors.h" + +/* Check encoder control struct */ +opus_int check_control_input( + silk_EncControlStruct *encControl /* I Control structure */ +) +{ + silk_assert( encControl != NULL ); + + if( ( ( encControl->API_sampleRate != 8000 ) && + ( encControl->API_sampleRate != 12000 ) && + ( encControl->API_sampleRate != 16000 ) && + ( encControl->API_sampleRate != 24000 ) && + ( encControl->API_sampleRate != 32000 ) && + ( encControl->API_sampleRate != 44100 ) && + ( encControl->API_sampleRate != 48000 ) ) || + ( ( encControl->desiredInternalSampleRate != 8000 ) && + ( encControl->desiredInternalSampleRate != 12000 ) && + ( encControl->desiredInternalSampleRate != 16000 ) ) || + ( ( encControl->maxInternalSampleRate != 8000 ) && + ( encControl->maxInternalSampleRate != 12000 ) && + ( encControl->maxInternalSampleRate != 16000 ) ) || + ( ( encControl->minInternalSampleRate != 8000 ) && + ( encControl->minInternalSampleRate != 12000 ) && + ( encControl->minInternalSampleRate != 16000 ) ) || + ( encControl->minInternalSampleRate > encControl->desiredInternalSampleRate ) || + ( encControl->maxInternalSampleRate < encControl->desiredInternalSampleRate ) || + ( encControl->minInternalSampleRate > encControl->maxInternalSampleRate ) ) { + silk_assert( 0 ); + return SILK_ENC_FS_NOT_SUPPORTED; + } + if( encControl->payloadSize_ms != 10 && + encControl->payloadSize_ms != 20 && + encControl->payloadSize_ms != 40 && + encControl->payloadSize_ms != 60 ) { + silk_assert( 0 ); + return SILK_ENC_PACKET_SIZE_NOT_SUPPORTED; + } + if( encControl->packetLossPercentage < 0 || encControl->packetLossPercentage > 100 ) { + silk_assert( 0 ); + return SILK_ENC_INVALID_LOSS_RATE; + } + if( encControl->useDTX < 0 || encControl->useDTX > 1 ) { + silk_assert( 0 ); + return SILK_ENC_INVALID_DTX_SETTING; + } + if( encControl->useCBR < 0 || encControl->useCBR > 1 ) { + silk_assert( 0 ); + return SILK_ENC_INVALID_CBR_SETTING; + } + if( encControl->useInBandFEC < 0 || encControl->useInBandFEC > 1 ) { + silk_assert( 0 ); + return SILK_ENC_INVALID_INBAND_FEC_SETTING; + } + if( encControl->nChannelsAPI < 1 || encControl->nChannelsAPI > ENCODER_NUM_CHANNELS ) { + silk_assert( 0 ); + return SILK_ENC_INVALID_NUMBER_OF_CHANNELS_ERROR; + } + if( encControl->nChannelsInternal < 1 || encControl->nChannelsInternal > ENCODER_NUM_CHANNELS ) { + silk_assert( 0 ); + return SILK_ENC_INVALID_NUMBER_OF_CHANNELS_ERROR; + } + if( encControl->nChannelsInternal > encControl->nChannelsAPI ) { + silk_assert( 0 ); + return SILK_ENC_INVALID_NUMBER_OF_CHANNELS_ERROR; + } + if( encControl->complexity < 0 || encControl->complexity > 10 ) { + silk_assert( 0 ); + return SILK_ENC_INVALID_COMPLEXITY_SETTING; + } + + return SILK_NO_ERROR; +} diff --git a/thirdparty/opus/silk/code_signs.c b/thirdparty/opus/silk/code_signs.c new file mode 100644 index 000000000000..dfd1dca9a18f --- /dev/null +++ b/thirdparty/opus/silk/code_signs.c @@ -0,0 +1,115 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/*#define silk_enc_map(a) ((a) > 0 ? 1 : 0)*/ +/*#define silk_dec_map(a) ((a) > 0 ? 1 : -1)*/ +/* shifting avoids if-statement */ +#define silk_enc_map(a) ( silk_RSHIFT( (a), 15 ) + 1 ) +#define silk_dec_map(a) ( silk_LSHIFT( (a), 1 ) - 1 ) + +/* Encodes signs of excitation */ +void silk_encode_signs( + ec_enc *psRangeEnc, /* I/O Compressor data structure */ + const opus_int8 pulses[], /* I pulse signal */ + opus_int length, /* I length of input */ + const opus_int signalType, /* I Signal type */ + const opus_int quantOffsetType, /* I Quantization offset type */ + const opus_int sum_pulses[ MAX_NB_SHELL_BLOCKS ] /* I Sum of absolute pulses per block */ +) +{ + opus_int i, j, p; + opus_uint8 icdf[ 2 ]; + const opus_int8 *q_ptr; + const opus_uint8 *icdf_ptr; + + icdf[ 1 ] = 0; + q_ptr = pulses; + i = silk_SMULBB( 7, silk_ADD_LSHIFT( quantOffsetType, signalType, 1 ) ); + icdf_ptr = &silk_sign_iCDF[ i ]; + length = silk_RSHIFT( length + SHELL_CODEC_FRAME_LENGTH/2, LOG2_SHELL_CODEC_FRAME_LENGTH ); + for( i = 0; i < length; i++ ) { + p = sum_pulses[ i ]; + if( p > 0 ) { + icdf[ 0 ] = icdf_ptr[ silk_min( p & 0x1F, 6 ) ]; + for( j = 0; j < SHELL_CODEC_FRAME_LENGTH; j++ ) { + if( q_ptr[ j ] != 0 ) { + ec_enc_icdf( psRangeEnc, silk_enc_map( q_ptr[ j ]), icdf, 8 ); + } + } + } + q_ptr += SHELL_CODEC_FRAME_LENGTH; + } +} + +/* Decodes signs of excitation */ +void silk_decode_signs( + ec_dec *psRangeDec, /* I/O Compressor data structure */ + opus_int16 pulses[], /* I/O pulse signal */ + opus_int length, /* I length of input */ + const opus_int signalType, /* I Signal type */ + const opus_int quantOffsetType, /* I Quantization offset type */ + const opus_int sum_pulses[ MAX_NB_SHELL_BLOCKS ] /* I Sum of absolute pulses per block */ +) +{ + opus_int i, j, p; + opus_uint8 icdf[ 2 ]; + opus_int16 *q_ptr; + const opus_uint8 *icdf_ptr; + + icdf[ 1 ] = 0; + q_ptr = pulses; + i = silk_SMULBB( 7, silk_ADD_LSHIFT( quantOffsetType, signalType, 1 ) ); + icdf_ptr = &silk_sign_iCDF[ i ]; + length = silk_RSHIFT( length + SHELL_CODEC_FRAME_LENGTH/2, LOG2_SHELL_CODEC_FRAME_LENGTH ); + for( i = 0; i < length; i++ ) { + p = sum_pulses[ i ]; + if( p > 0 ) { + icdf[ 0 ] = icdf_ptr[ silk_min( p & 0x1F, 6 ) 
]; + for( j = 0; j < SHELL_CODEC_FRAME_LENGTH; j++ ) { + if( q_ptr[ j ] > 0 ) { + /* attach sign */ +#if 0 + /* conditional implementation */ + if( ec_dec_icdf( psRangeDec, icdf, 8 ) == 0 ) { + q_ptr[ j ] = -q_ptr[ j ]; + } +#else + /* implementation with shift, subtraction, multiplication */ + q_ptr[ j ] *= silk_dec_map( ec_dec_icdf( psRangeDec, icdf, 8 ) ); +#endif + } + } + } + q_ptr += SHELL_CODEC_FRAME_LENGTH; + } +} diff --git a/thirdparty/opus/silk/control.h b/thirdparty/opus/silk/control.h new file mode 100644 index 000000000000..747e5426a0c3 --- /dev/null +++ b/thirdparty/opus/silk/control.h @@ -0,0 +1,142 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifndef SILK_CONTROL_H +#define SILK_CONTROL_H + +#include "typedef.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* Decoder API flags */ +#define FLAG_DECODE_NORMAL 0 +#define FLAG_PACKET_LOST 1 +#define FLAG_DECODE_LBRR 2 + +/***********************************************/ +/* Structure for controlling encoder operation */ +/***********************************************/ +typedef struct { + /* I: Number of channels; 1/2 */ + opus_int32 nChannelsAPI; + + /* I: Number of channels; 1/2 */ + opus_int32 nChannelsInternal; + + /* I: Input signal sampling rate in Hertz; 8000/12000/16000/24000/32000/44100/48000 */ + opus_int32 API_sampleRate; + + /* I: Maximum internal sampling rate in Hertz; 8000/12000/16000 */ + opus_int32 maxInternalSampleRate; + + /* I: Minimum internal sampling rate in Hertz; 8000/12000/16000 */ + opus_int32 minInternalSampleRate; + + /* I: Soft request for internal sampling rate in Hertz; 8000/12000/16000 */ + opus_int32 desiredInternalSampleRate; + + /* I: Number of samples per packet in milliseconds; 10/20/40/60 */ + opus_int payloadSize_ms; + + /* I: Bitrate during active speech in bits/second; internally limited */ + opus_int32 bitRate; + + /* I: Uplink packet loss in percent (0-100) */ + opus_int packetLossPercentage; + + /* I: Complexity mode; 0 is lowest, 10 is highest complexity */ + opus_int complexity; + + /* I: Flag to enable in-band Forward Error Correction (FEC); 0/1 */ + opus_int useInBandFEC; + + /* I: Flag to enable discontinuous transmission (DTX); 0/1 */ + opus_int useDTX; + + /* I: Flag to use constant bitrate */ + opus_int useCBR; + + /* I: Maximum number of bits allowed for the frame */ + opus_int maxBits; + + /* I: Causes a smooth downmix to mono */ + opus_int toMono; + + /* I: Opus encoder is allowing us to switch bandwidth */ + opus_int opusCanSwitch; + + /* I: Make frames as independent as possible (but still use LPC) */ + opus_int reducedDependency; + + /* O: Internal sampling rate used, in Hertz; 8000/12000/16000 */ + opus_int32 internalSampleRate; + + /* O: Flag that bandwidth switching is allowed (because low voice activity) */ + opus_int allowBandwidthSwitch; + + /* O: Flag that SILK runs in WB mode without variable LP filter (use for switching between WB/SWB/FB) */ + opus_int inWBmodeWithoutVariableLP; + + /* O: Stereo width */ + opus_int stereoWidth_Q14; + + /* O: Tells the Opus encoder we're ready to switch */ + opus_int switchReady; + +} silk_EncControlStruct; + +/**************************************************************************/ +/* Structure for controlling decoder operation and reading decoder status */ +/**************************************************************************/ +typedef struct { + /* I: Number of channels; 1/2 */ + opus_int32 nChannelsAPI; + + /* I: Number of channels; 1/2 */ + opus_int32 nChannelsInternal; + + /* I: Output signal sampling rate in Hertz; 8000/12000/16000/24000/32000/44100/48000 */ + opus_int32 API_sampleRate; + + /* I: Internal sampling rate used, in Hertz; 8000/12000/16000 */ + opus_int32 internalSampleRate; + + /* I: Number of samples per packet in milliseconds; 10/20/40/60 */ + opus_int payloadSize_ms; + + /* O: Pitch lag of previous frame (0 if unvoiced), measured in samples at 48 kHz */ + opus_int prevPitchLag; +} silk_DecControlStruct; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/silk/control_SNR.c b/thirdparty/opus/silk/control_SNR.c new file mode 100644 index 
000000000000..cee87eb0d8b3
--- /dev/null
+++ b/thirdparty/opus/silk/control_SNR.c
@@ -0,0 +1,76 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+#include "tuning_parameters.h"
+
+/* Control SNR of residual quantizer */
+opus_int silk_control_SNR(
+    silk_encoder_state          *psEncC,            /* I/O  Pointer to Silk encoder state   */
+    opus_int32                  TargetRate_bps      /* I    Target max bitrate (bps)        */
+)
+{
+    opus_int k, ret = SILK_NO_ERROR;
+    opus_int32 frac_Q6;
+    const opus_int32 *rateTable;
+
+    /* Set bitrate/coding quality */
+    TargetRate_bps = silk_LIMIT( TargetRate_bps, MIN_TARGET_RATE_BPS, MAX_TARGET_RATE_BPS );
+    if( TargetRate_bps != psEncC->TargetRate_bps ) {
+        psEncC->TargetRate_bps = TargetRate_bps;
+
+        /* If new TargetRate_bps, translate to SNR_dB value */
+        if( psEncC->fs_kHz == 8 ) {
+            rateTable = silk_TargetRate_table_NB;
+        } else if( psEncC->fs_kHz == 12 ) {
+            rateTable = silk_TargetRate_table_MB;
+        } else {
+            rateTable = silk_TargetRate_table_WB;
+        }
+
+        /* Reduce bitrate for 10 ms modes in these calculations */
+        if( psEncC->nb_subfr == 2 ) {
+            TargetRate_bps -= REDUCE_BITRATE_10_MS_BPS;
+        }
+
+        /* Find bitrate interval in table and interpolate */
+        for( k = 1; k < TARGET_RATE_TAB_SZ; k++ ) {
+            if( TargetRate_bps <= rateTable[ k ] ) {
+                frac_Q6 = silk_DIV32( silk_LSHIFT( TargetRate_bps - rateTable[ k - 1 ], 6 ),
+                    rateTable[ k ] - rateTable[ k - 1 ] );
+                psEncC->SNR_dB_Q7 = silk_LSHIFT( silk_SNR_table_Q1[ k - 1 ], 6 ) + silk_MUL( frac_Q6, silk_SNR_table_Q1[ k ] - silk_SNR_table_Q1[ k - 1 ] );
+                break;
+            }
+        }
+    }
+
+    return ret;
+}
diff --git a/thirdparty/opus/silk/control_audio_bandwidth.c b/thirdparty/opus/silk/control_audio_bandwidth.c
new file mode 100644
index 000000000000..4f9bc5cbdaa9
--- /dev/null
+++ b/thirdparty/opus/silk/control_audio_bandwidth.c
@@ -0,0 +1,126 @@
+/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "tuning_parameters.h" + +/* Control internal sampling rate */ +opus_int silk_control_audio_bandwidth( + silk_encoder_state *psEncC, /* I/O Pointer to Silk encoder state */ + silk_EncControlStruct *encControl /* I Control structure */ +) +{ + opus_int fs_kHz; + opus_int32 fs_Hz; + + fs_kHz = psEncC->fs_kHz; + fs_Hz = silk_SMULBB( fs_kHz, 1000 ); + if( fs_Hz == 0 ) { + /* Encoder has just been initialized */ + fs_Hz = silk_min( psEncC->desiredInternal_fs_Hz, psEncC->API_fs_Hz ); + fs_kHz = silk_DIV32_16( fs_Hz, 1000 ); + } else if( fs_Hz > psEncC->API_fs_Hz || fs_Hz > psEncC->maxInternal_fs_Hz || fs_Hz < psEncC->minInternal_fs_Hz ) { + /* Make sure internal rate is not higher than external rate or maximum allowed, or lower than minimum allowed */ + fs_Hz = psEncC->API_fs_Hz; + fs_Hz = silk_min( fs_Hz, psEncC->maxInternal_fs_Hz ); + fs_Hz = silk_max( fs_Hz, psEncC->minInternal_fs_Hz ); + fs_kHz = silk_DIV32_16( fs_Hz, 1000 ); + } else { + /* State machine for the internal sampling rate switching */ + if( psEncC->sLP.transition_frame_no >= TRANSITION_FRAMES ) { + /* Stop transition phase */ + psEncC->sLP.mode = 0; + } + if( psEncC->allow_bandwidth_switch || encControl->opusCanSwitch ) { + /* Check if we should switch down */ + if( silk_SMULBB( psEncC->fs_kHz, 1000 ) > psEncC->desiredInternal_fs_Hz ) + { + /* Switch down */ + if( psEncC->sLP.mode == 0 ) { + /* New transition */ + psEncC->sLP.transition_frame_no = TRANSITION_FRAMES; + + /* Reset transition filter state */ + silk_memset( psEncC->sLP.In_LP_State, 0, sizeof( psEncC->sLP.In_LP_State ) ); + } + if( encControl->opusCanSwitch ) { + /* Stop transition phase */ + psEncC->sLP.mode = 0; + + /* Switch to a lower sample frequency */ + fs_kHz = psEncC->fs_kHz == 16 ? 
12 : 8; + } else { + if( psEncC->sLP.transition_frame_no <= 0 ) { + encControl->switchReady = 1; + /* Make room for redundancy */ + encControl->maxBits -= encControl->maxBits * 5 / ( encControl->payloadSize_ms + 5 ); + } else { + /* Direction: down (at double speed) */ + psEncC->sLP.mode = -2; + } + } + } + else + /* Check if we should switch up */ + if( silk_SMULBB( psEncC->fs_kHz, 1000 ) < psEncC->desiredInternal_fs_Hz ) + { + /* Switch up */ + if( encControl->opusCanSwitch ) { + /* Switch to a higher sample frequency */ + fs_kHz = psEncC->fs_kHz == 8 ? 12 : 16; + + /* New transition */ + psEncC->sLP.transition_frame_no = 0; + + /* Reset transition filter state */ + silk_memset( psEncC->sLP.In_LP_State, 0, sizeof( psEncC->sLP.In_LP_State ) ); + + /* Direction: up */ + psEncC->sLP.mode = 1; + } else { + if( psEncC->sLP.mode == 0 ) { + encControl->switchReady = 1; + /* Make room for redundancy */ + encControl->maxBits -= encControl->maxBits * 5 / ( encControl->payloadSize_ms + 5 ); + } else { + /* Direction: up */ + psEncC->sLP.mode = 1; + } + } + } else { + if (psEncC->sLP.mode<0) + psEncC->sLP.mode = 1; + } + } + } + + return fs_kHz; +} diff --git a/thirdparty/opus/silk/control_codec.c b/thirdparty/opus/silk/control_codec.c new file mode 100644 index 000000000000..044eea3f2a84 --- /dev/null +++ b/thirdparty/opus/silk/control_codec.c @@ -0,0 +1,428 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#ifdef FIXED_POINT +#include "main_FIX.h" +#define silk_encoder_state_Fxx silk_encoder_state_FIX +#else +#include "main_FLP.h" +#define silk_encoder_state_Fxx silk_encoder_state_FLP +#endif +#include "stack_alloc.h" +#include "tuning_parameters.h" +#include "pitch_est_defines.h" + +static opus_int silk_setup_resamplers( + silk_encoder_state_Fxx *psEnc, /* I/O */ + opus_int fs_kHz /* I */ +); + +static opus_int silk_setup_fs( + silk_encoder_state_Fxx *psEnc, /* I/O */ + opus_int fs_kHz, /* I */ + opus_int PacketSize_ms /* I */ +); + +static opus_int silk_setup_complexity( + silk_encoder_state *psEncC, /* I/O */ + opus_int Complexity /* I */ +); + +static OPUS_INLINE opus_int silk_setup_LBRR( + silk_encoder_state *psEncC, /* I/O */ + const opus_int32 TargetRate_bps /* I */ +); + + +/* Control encoder */ +opus_int silk_control_encoder( + silk_encoder_state_Fxx *psEnc, /* I/O Pointer to Silk encoder state */ + silk_EncControlStruct *encControl, /* I Control structure */ + const opus_int32 TargetRate_bps, /* I Target max bitrate (bps) */ + const opus_int allow_bw_switch, /* I Flag to allow switching audio bandwidth */ + const opus_int channelNb, /* I Channel number */ + const opus_int force_fs_kHz +) +{ + opus_int fs_kHz, ret = 0; + + psEnc->sCmn.useDTX = encControl->useDTX; + psEnc->sCmn.useCBR = encControl->useCBR; + psEnc->sCmn.API_fs_Hz = encControl->API_sampleRate; + psEnc->sCmn.maxInternal_fs_Hz = encControl->maxInternalSampleRate; + psEnc->sCmn.minInternal_fs_Hz = encControl->minInternalSampleRate; + psEnc->sCmn.desiredInternal_fs_Hz = encControl->desiredInternalSampleRate; + psEnc->sCmn.useInBandFEC = encControl->useInBandFEC; + psEnc->sCmn.nChannelsAPI = encControl->nChannelsAPI; + psEnc->sCmn.nChannelsInternal = encControl->nChannelsInternal; + psEnc->sCmn.allow_bandwidth_switch = allow_bw_switch; + psEnc->sCmn.channelNb = channelNb; + + if( psEnc->sCmn.controlled_since_last_payload != 0 && psEnc->sCmn.prefillFlag == 0 ) { + if( psEnc->sCmn.API_fs_Hz != psEnc->sCmn.prev_API_fs_Hz && psEnc->sCmn.fs_kHz > 0 ) { + /* Change in API sampling rate in the middle of encoding a packet */ + ret += silk_setup_resamplers( psEnc, psEnc->sCmn.fs_kHz ); + } + return ret; + } + + /* Beyond this point we know that there are no previously coded frames in the payload buffer */ + + /********************************************/ + /* Determine internal sampling rate */ + /********************************************/ + fs_kHz = silk_control_audio_bandwidth( &psEnc->sCmn, encControl ); + if( force_fs_kHz ) { + fs_kHz = force_fs_kHz; + } + /********************************************/ + /* Prepare resampler and buffered data */ + /********************************************/ + ret += silk_setup_resamplers( psEnc, fs_kHz ); + + /********************************************/ + /* Set internal sampling frequency */ + /********************************************/ + ret += silk_setup_fs( psEnc, fs_kHz, encControl->payloadSize_ms ); + + /********************************************/ + /* Set encoding complexity */ + /********************************************/ + ret += silk_setup_complexity( &psEnc->sCmn, encControl->complexity ); + + /********************************************/ + /* Set packet loss rate measured by farend */ + /********************************************/ + psEnc->sCmn.PacketLoss_perc = encControl->packetLossPercentage; + + 
/********************************************/ + /* Set LBRR usage */ + /********************************************/ + ret += silk_setup_LBRR( &psEnc->sCmn, TargetRate_bps ); + + psEnc->sCmn.controlled_since_last_payload = 1; + + return ret; +} + +static opus_int silk_setup_resamplers( + silk_encoder_state_Fxx *psEnc, /* I/O */ + opus_int fs_kHz /* I */ +) +{ + opus_int ret = SILK_NO_ERROR; + SAVE_STACK; + + if( psEnc->sCmn.fs_kHz != fs_kHz || psEnc->sCmn.prev_API_fs_Hz != psEnc->sCmn.API_fs_Hz ) + { + if( psEnc->sCmn.fs_kHz == 0 ) { + /* Initialize the resampler for enc_API.c preparing resampling from API_fs_Hz to fs_kHz */ + ret += silk_resampler_init( &psEnc->sCmn.resampler_state, psEnc->sCmn.API_fs_Hz, fs_kHz * 1000, 1 ); + } else { + VARDECL( opus_int16, x_buf_API_fs_Hz ); + VARDECL( silk_resampler_state_struct, temp_resampler_state ); +#ifdef FIXED_POINT + opus_int16 *x_bufFIX = psEnc->x_buf; +#else + VARDECL( opus_int16, x_bufFIX ); + opus_int32 new_buf_samples; +#endif + opus_int32 api_buf_samples; + opus_int32 old_buf_samples; + opus_int32 buf_length_ms; + + buf_length_ms = silk_LSHIFT( psEnc->sCmn.nb_subfr * 5, 1 ) + LA_SHAPE_MS; + old_buf_samples = buf_length_ms * psEnc->sCmn.fs_kHz; + +#ifndef FIXED_POINT + new_buf_samples = buf_length_ms * fs_kHz; + ALLOC( x_bufFIX, silk_max( old_buf_samples, new_buf_samples ), + opus_int16 ); + silk_float2short_array( x_bufFIX, psEnc->x_buf, old_buf_samples ); +#endif + + /* Initialize resampler for temporary resampling of x_buf data to API_fs_Hz */ + ALLOC( temp_resampler_state, 1, silk_resampler_state_struct ); + ret += silk_resampler_init( temp_resampler_state, silk_SMULBB( psEnc->sCmn.fs_kHz, 1000 ), psEnc->sCmn.API_fs_Hz, 0 ); + + /* Calculate number of samples to temporarily upsample */ + api_buf_samples = buf_length_ms * silk_DIV32_16( psEnc->sCmn.API_fs_Hz, 1000 ); + + /* Temporary resampling of x_buf data to API_fs_Hz */ + ALLOC( x_buf_API_fs_Hz, api_buf_samples, opus_int16 ); + ret += silk_resampler( temp_resampler_state, x_buf_API_fs_Hz, x_bufFIX, old_buf_samples ); + + /* Initialize the resampler for enc_API.c preparing resampling from API_fs_Hz to fs_kHz */ + ret += silk_resampler_init( &psEnc->sCmn.resampler_state, psEnc->sCmn.API_fs_Hz, silk_SMULBB( fs_kHz, 1000 ), 1 ); + + /* Correct resampler state by resampling buffered data from API_fs_Hz to fs_kHz */ + ret += silk_resampler( &psEnc->sCmn.resampler_state, x_bufFIX, x_buf_API_fs_Hz, api_buf_samples ); + +#ifndef FIXED_POINT + silk_short2float_array( psEnc->x_buf, x_bufFIX, new_buf_samples); +#endif + } + } + + psEnc->sCmn.prev_API_fs_Hz = psEnc->sCmn.API_fs_Hz; + + RESTORE_STACK; + return ret; +} + +static opus_int silk_setup_fs( + silk_encoder_state_Fxx *psEnc, /* I/O */ + opus_int fs_kHz, /* I */ + opus_int PacketSize_ms /* I */ +) +{ + opus_int ret = SILK_NO_ERROR; + + /* Set packet size */ + if( PacketSize_ms != psEnc->sCmn.PacketSize_ms ) { + if( ( PacketSize_ms != 10 ) && + ( PacketSize_ms != 20 ) && + ( PacketSize_ms != 40 ) && + ( PacketSize_ms != 60 ) ) { + ret = SILK_ENC_PACKET_SIZE_NOT_SUPPORTED; + } + if( PacketSize_ms <= 10 ) { + psEnc->sCmn.nFramesPerPacket = 1; + psEnc->sCmn.nb_subfr = PacketSize_ms == 10 ? 
2 : 1; + psEnc->sCmn.frame_length = silk_SMULBB( PacketSize_ms, fs_kHz ); + psEnc->sCmn.pitch_LPC_win_length = silk_SMULBB( FIND_PITCH_LPC_WIN_MS_2_SF, fs_kHz ); + if( psEnc->sCmn.fs_kHz == 8 ) { + psEnc->sCmn.pitch_contour_iCDF = silk_pitch_contour_10_ms_NB_iCDF; + } else { + psEnc->sCmn.pitch_contour_iCDF = silk_pitch_contour_10_ms_iCDF; + } + } else { + psEnc->sCmn.nFramesPerPacket = silk_DIV32_16( PacketSize_ms, MAX_FRAME_LENGTH_MS ); + psEnc->sCmn.nb_subfr = MAX_NB_SUBFR; + psEnc->sCmn.frame_length = silk_SMULBB( 20, fs_kHz ); + psEnc->sCmn.pitch_LPC_win_length = silk_SMULBB( FIND_PITCH_LPC_WIN_MS, fs_kHz ); + if( psEnc->sCmn.fs_kHz == 8 ) { + psEnc->sCmn.pitch_contour_iCDF = silk_pitch_contour_NB_iCDF; + } else { + psEnc->sCmn.pitch_contour_iCDF = silk_pitch_contour_iCDF; + } + } + psEnc->sCmn.PacketSize_ms = PacketSize_ms; + psEnc->sCmn.TargetRate_bps = 0; /* trigger new SNR computation */ + } + + /* Set internal sampling frequency */ + silk_assert( fs_kHz == 8 || fs_kHz == 12 || fs_kHz == 16 ); + silk_assert( psEnc->sCmn.nb_subfr == 2 || psEnc->sCmn.nb_subfr == 4 ); + if( psEnc->sCmn.fs_kHz != fs_kHz ) { + /* reset part of the state */ + silk_memset( &psEnc->sShape, 0, sizeof( psEnc->sShape ) ); + silk_memset( &psEnc->sPrefilt, 0, sizeof( psEnc->sPrefilt ) ); + silk_memset( &psEnc->sCmn.sNSQ, 0, sizeof( psEnc->sCmn.sNSQ ) ); + silk_memset( psEnc->sCmn.prev_NLSFq_Q15, 0, sizeof( psEnc->sCmn.prev_NLSFq_Q15 ) ); + silk_memset( &psEnc->sCmn.sLP.In_LP_State, 0, sizeof( psEnc->sCmn.sLP.In_LP_State ) ); + psEnc->sCmn.inputBufIx = 0; + psEnc->sCmn.nFramesEncoded = 0; + psEnc->sCmn.TargetRate_bps = 0; /* trigger new SNR computation */ + + /* Initialize non-zero parameters */ + psEnc->sCmn.prevLag = 100; + psEnc->sCmn.first_frame_after_reset = 1; + psEnc->sPrefilt.lagPrev = 100; + psEnc->sShape.LastGainIndex = 10; + psEnc->sCmn.sNSQ.lagPrev = 100; + psEnc->sCmn.sNSQ.prev_gain_Q16 = 65536; + psEnc->sCmn.prevSignalType = TYPE_NO_VOICE_ACTIVITY; + + psEnc->sCmn.fs_kHz = fs_kHz; + if( psEnc->sCmn.fs_kHz == 8 ) { + if( psEnc->sCmn.nb_subfr == MAX_NB_SUBFR ) { + psEnc->sCmn.pitch_contour_iCDF = silk_pitch_contour_NB_iCDF; + } else { + psEnc->sCmn.pitch_contour_iCDF = silk_pitch_contour_10_ms_NB_iCDF; + } + } else { + if( psEnc->sCmn.nb_subfr == MAX_NB_SUBFR ) { + psEnc->sCmn.pitch_contour_iCDF = silk_pitch_contour_iCDF; + } else { + psEnc->sCmn.pitch_contour_iCDF = silk_pitch_contour_10_ms_iCDF; + } + } + if( psEnc->sCmn.fs_kHz == 8 || psEnc->sCmn.fs_kHz == 12 ) { + psEnc->sCmn.predictLPCOrder = MIN_LPC_ORDER; + psEnc->sCmn.psNLSF_CB = &silk_NLSF_CB_NB_MB; + } else { + psEnc->sCmn.predictLPCOrder = MAX_LPC_ORDER; + psEnc->sCmn.psNLSF_CB = &silk_NLSF_CB_WB; + } + psEnc->sCmn.subfr_length = SUB_FRAME_LENGTH_MS * fs_kHz; + psEnc->sCmn.frame_length = silk_SMULBB( psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr ); + psEnc->sCmn.ltp_mem_length = silk_SMULBB( LTP_MEM_LENGTH_MS, fs_kHz ); + psEnc->sCmn.la_pitch = silk_SMULBB( LA_PITCH_MS, fs_kHz ); + psEnc->sCmn.max_pitch_lag = silk_SMULBB( 18, fs_kHz ); + if( psEnc->sCmn.nb_subfr == MAX_NB_SUBFR ) { + psEnc->sCmn.pitch_LPC_win_length = silk_SMULBB( FIND_PITCH_LPC_WIN_MS, fs_kHz ); + } else { + psEnc->sCmn.pitch_LPC_win_length = silk_SMULBB( FIND_PITCH_LPC_WIN_MS_2_SF, fs_kHz ); + } + if( psEnc->sCmn.fs_kHz == 16 ) { + psEnc->sCmn.mu_LTP_Q9 = SILK_FIX_CONST( MU_LTP_QUANT_WB, 9 ); + psEnc->sCmn.pitch_lag_low_bits_iCDF = silk_uniform8_iCDF; + } else if( psEnc->sCmn.fs_kHz == 12 ) { + psEnc->sCmn.mu_LTP_Q9 = SILK_FIX_CONST( MU_LTP_QUANT_MB, 9 ); + 
psEnc->sCmn.pitch_lag_low_bits_iCDF = silk_uniform6_iCDF; + } else { + psEnc->sCmn.mu_LTP_Q9 = SILK_FIX_CONST( MU_LTP_QUANT_NB, 9 ); + psEnc->sCmn.pitch_lag_low_bits_iCDF = silk_uniform4_iCDF; + } + } + + /* Check that settings are valid */ + silk_assert( ( psEnc->sCmn.subfr_length * psEnc->sCmn.nb_subfr ) == psEnc->sCmn.frame_length ); + + return ret; +} + +static opus_int silk_setup_complexity( + silk_encoder_state *psEncC, /* I/O */ + opus_int Complexity /* I */ +) +{ + opus_int ret = 0; + + /* Set encoding complexity */ + silk_assert( Complexity >= 0 && Complexity <= 10 ); + if( Complexity < 2 ) { + psEncC->pitchEstimationComplexity = SILK_PE_MIN_COMPLEX; + psEncC->pitchEstimationThreshold_Q16 = SILK_FIX_CONST( 0.8, 16 ); + psEncC->pitchEstimationLPCOrder = 6; + psEncC->shapingLPCOrder = 8; + psEncC->la_shape = 3 * psEncC->fs_kHz; + psEncC->nStatesDelayedDecision = 1; + psEncC->useInterpolatedNLSFs = 0; + psEncC->LTPQuantLowComplexity = 1; + psEncC->NLSF_MSVQ_Survivors = 2; + psEncC->warping_Q16 = 0; + } else if( Complexity < 4 ) { + psEncC->pitchEstimationComplexity = SILK_PE_MID_COMPLEX; + psEncC->pitchEstimationThreshold_Q16 = SILK_FIX_CONST( 0.76, 16 ); + psEncC->pitchEstimationLPCOrder = 8; + psEncC->shapingLPCOrder = 10; + psEncC->la_shape = 5 * psEncC->fs_kHz; + psEncC->nStatesDelayedDecision = 1; + psEncC->useInterpolatedNLSFs = 0; + psEncC->LTPQuantLowComplexity = 0; + psEncC->NLSF_MSVQ_Survivors = 4; + psEncC->warping_Q16 = 0; + } else if( Complexity < 6 ) { + psEncC->pitchEstimationComplexity = SILK_PE_MID_COMPLEX; + psEncC->pitchEstimationThreshold_Q16 = SILK_FIX_CONST( 0.74, 16 ); + psEncC->pitchEstimationLPCOrder = 10; + psEncC->shapingLPCOrder = 12; + psEncC->la_shape = 5 * psEncC->fs_kHz; + psEncC->nStatesDelayedDecision = 2; + psEncC->useInterpolatedNLSFs = 1; + psEncC->LTPQuantLowComplexity = 0; + psEncC->NLSF_MSVQ_Survivors = 8; + psEncC->warping_Q16 = psEncC->fs_kHz * SILK_FIX_CONST( WARPING_MULTIPLIER, 16 ); + } else if( Complexity < 8 ) { + psEncC->pitchEstimationComplexity = SILK_PE_MID_COMPLEX; + psEncC->pitchEstimationThreshold_Q16 = SILK_FIX_CONST( 0.72, 16 ); + psEncC->pitchEstimationLPCOrder = 12; + psEncC->shapingLPCOrder = 14; + psEncC->la_shape = 5 * psEncC->fs_kHz; + psEncC->nStatesDelayedDecision = 3; + psEncC->useInterpolatedNLSFs = 1; + psEncC->LTPQuantLowComplexity = 0; + psEncC->NLSF_MSVQ_Survivors = 16; + psEncC->warping_Q16 = psEncC->fs_kHz * SILK_FIX_CONST( WARPING_MULTIPLIER, 16 ); + } else { + psEncC->pitchEstimationComplexity = SILK_PE_MAX_COMPLEX; + psEncC->pitchEstimationThreshold_Q16 = SILK_FIX_CONST( 0.7, 16 ); + psEncC->pitchEstimationLPCOrder = 16; + psEncC->shapingLPCOrder = 16; + psEncC->la_shape = 5 * psEncC->fs_kHz; + psEncC->nStatesDelayedDecision = MAX_DEL_DEC_STATES; + psEncC->useInterpolatedNLSFs = 1; + psEncC->LTPQuantLowComplexity = 0; + psEncC->NLSF_MSVQ_Survivors = 32; + psEncC->warping_Q16 = psEncC->fs_kHz * SILK_FIX_CONST( WARPING_MULTIPLIER, 16 ); + } + + /* Do not allow higher pitch estimation LPC order than predict LPC order */ + psEncC->pitchEstimationLPCOrder = silk_min_int( psEncC->pitchEstimationLPCOrder, psEncC->predictLPCOrder ); + psEncC->shapeWinLength = SUB_FRAME_LENGTH_MS * psEncC->fs_kHz + 2 * psEncC->la_shape; + psEncC->Complexity = Complexity; + + silk_assert( psEncC->pitchEstimationLPCOrder <= MAX_FIND_PITCH_LPC_ORDER ); + silk_assert( psEncC->shapingLPCOrder <= MAX_SHAPE_LPC_ORDER ); + silk_assert( psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES ); + silk_assert( psEncC->warping_Q16 <= 32767 ); + 
silk_assert( psEncC->la_shape <= LA_SHAPE_MAX ); + silk_assert( psEncC->shapeWinLength <= SHAPE_LPC_WIN_MAX ); + silk_assert( psEncC->NLSF_MSVQ_Survivors <= NLSF_VQ_MAX_SURVIVORS ); + + return ret; +} + +static OPUS_INLINE opus_int silk_setup_LBRR( + silk_encoder_state *psEncC, /* I/O */ + const opus_int32 TargetRate_bps /* I */ +) +{ + opus_int LBRR_in_previous_packet, ret = SILK_NO_ERROR; + opus_int32 LBRR_rate_thres_bps; + + LBRR_in_previous_packet = psEncC->LBRR_enabled; + psEncC->LBRR_enabled = 0; + if( psEncC->useInBandFEC && psEncC->PacketLoss_perc > 0 ) { + if( psEncC->fs_kHz == 8 ) { + LBRR_rate_thres_bps = LBRR_NB_MIN_RATE_BPS; + } else if( psEncC->fs_kHz == 12 ) { + LBRR_rate_thres_bps = LBRR_MB_MIN_RATE_BPS; + } else { + LBRR_rate_thres_bps = LBRR_WB_MIN_RATE_BPS; + } + LBRR_rate_thres_bps = silk_SMULWB( silk_MUL( LBRR_rate_thres_bps, 125 - silk_min( psEncC->PacketLoss_perc, 25 ) ), SILK_FIX_CONST( 0.01, 16 ) ); + + if( TargetRate_bps > LBRR_rate_thres_bps ) { + /* Set gain increase for coding LBRR excitation */ + if( LBRR_in_previous_packet == 0 ) { + /* Previous packet did not have LBRR, and was therefore coded at a higher bitrate */ + psEncC->LBRR_GainIncreases = 7; + } else { + psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.4, 16 ) ), 2 ); + } + psEncC->LBRR_enabled = 1; + } + } + + return ret; +} diff --git a/thirdparty/opus/silk/debug.c b/thirdparty/opus/silk/debug.c new file mode 100644 index 000000000000..9253faf71bfe --- /dev/null +++ b/thirdparty/opus/silk/debug.c @@ -0,0 +1,170 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "debug.h"
+#include "SigProc_FIX.h"
+
+#if SILK_TIC_TOC
+
+#ifdef _WIN32
+
+#if (defined(_WIN32) || defined(_WINCE))
+#include <windows.h>    /* timer */
+#else   /* Linux or Mac*/
+#include <sys/time.h>
+#endif
+
+unsigned long silk_GetHighResolutionTime(void) /* O  time in usec*/
+{
+    /* Returns a time counter in microsec   */
+    /* the resolution is platform dependent */
+    /* but is typically 1.62 us resolution  */
+    LARGE_INTEGER lpPerformanceCount;
+    LARGE_INTEGER lpFrequency;
+    QueryPerformanceCounter(&lpPerformanceCount);
+    QueryPerformanceFrequency(&lpFrequency);
+    return (unsigned long)((1000000*(lpPerformanceCount.QuadPart)) / lpFrequency.QuadPart);
+}
+#else   /* Linux or Mac*/
+unsigned long GetHighResolutionTime(void) /* O  time in usec*/
+{
+    struct timeval tv;
+    gettimeofday(&tv, 0);
+    return((tv.tv_sec*1000000)+(tv.tv_usec));
+}
+#endif
+
+int silk_Timer_nTimers = 0;
+int silk_Timer_depth_ctr = 0;
+char silk_Timer_tags[silk_NUM_TIMERS_MAX][silk_NUM_TIMERS_MAX_TAG_LEN];
+#ifdef WIN32
+LARGE_INTEGER silk_Timer_start[silk_NUM_TIMERS_MAX];
+#else
+unsigned long silk_Timer_start[silk_NUM_TIMERS_MAX];
+#endif
+unsigned int silk_Timer_cnt[silk_NUM_TIMERS_MAX];
+opus_int64 silk_Timer_min[silk_NUM_TIMERS_MAX];
+opus_int64 silk_Timer_sum[silk_NUM_TIMERS_MAX];
+opus_int64 silk_Timer_max[silk_NUM_TIMERS_MAX];
+opus_int64 silk_Timer_depth[silk_NUM_TIMERS_MAX];
+
+#ifdef WIN32
+void silk_TimerSave(char *file_name)
+{
+    if( silk_Timer_nTimers > 0 )
+    {
+        int k;
+        FILE *fp;
+        LARGE_INTEGER lpFrequency;
+        LARGE_INTEGER lpPerformanceCount1, lpPerformanceCount2;
+        int del = 0x7FFFFFFF;
+        double avg, sum_avg;
+        /* estimate overhead of calling performance counters */
+        for( k = 0; k < 1000; k++ ) {
+            QueryPerformanceCounter(&lpPerformanceCount1);
+            QueryPerformanceCounter(&lpPerformanceCount2);
+            lpPerformanceCount2.QuadPart -= lpPerformanceCount1.QuadPart;
+            if( (int)lpPerformanceCount2.LowPart < del )
+                del = lpPerformanceCount2.LowPart;
+        }
+        QueryPerformanceFrequency(&lpFrequency);
+        /* print results to file */
+        sum_avg = 0.0f;
+        for( k = 0; k < silk_Timer_nTimers; k++ ) {
+            if (silk_Timer_depth[k] == 0) {
+                sum_avg += (1e6 * silk_Timer_sum[k] / silk_Timer_cnt[k] - del) / lpFrequency.QuadPart * silk_Timer_cnt[k];
+            }
+        }
+        fp = fopen(file_name, "w");
+        fprintf(fp, "                                min         avg     %%         max      count\n");
+        for( k = 0; k < silk_Timer_nTimers; k++ ) {
+            if (silk_Timer_depth[k] == 0) {
+                fprintf(fp, "%-28s", silk_Timer_tags[k]);
+            } else if (silk_Timer_depth[k] == 1) {
+                fprintf(fp, " %-27s", silk_Timer_tags[k]);
+            } else if (silk_Timer_depth[k] == 2) {
+                fprintf(fp, "  %-26s", silk_Timer_tags[k]);
+            } else if (silk_Timer_depth[k] == 3) {
+                fprintf(fp, "   %-25s", silk_Timer_tags[k]);
+            } else {
+                fprintf(fp, "    %-24s", silk_Timer_tags[k]);
+            }
+            avg = (1e6 * silk_Timer_sum[k] / silk_Timer_cnt[k] - del) / lpFrequency.QuadPart;
+            fprintf(fp, "%8.2f", (1e6 * (silk_max_64(silk_Timer_min[k] - del, 0))) / lpFrequency.QuadPart);
+            fprintf(fp, "%12.2f %6.2f", avg, 100.0 * avg / sum_avg * silk_Timer_cnt[k]);
+            fprintf(fp, "%12.2f", (1e6 * (silk_max_64(silk_Timer_max[k] - del, 0))) / lpFrequency.QuadPart);
+            fprintf(fp, "%10d\n", silk_Timer_cnt[k]);
+        }
+        fprintf(fp, "                                microseconds\n");
+        fclose(fp);
+    }
+}
+#else
+void silk_TimerSave(char *file_name)
+{
+    if( silk_Timer_nTimers > 0 )
+    {
+        int k;
+        FILE *fp;
+        /* print results to file */
+        fp = fopen(file_name, "w");
+        fprintf(fp, "                                min         avg         max      count\n");
+        for( k =
0; k < silk_Timer_nTimers; k++ ) + { + if (silk_Timer_depth[k] == 0) { + fprintf(fp, "%-28s", silk_Timer_tags[k]); + } else if (silk_Timer_depth[k] == 1) { + fprintf(fp, " %-27s", silk_Timer_tags[k]); + } else if (silk_Timer_depth[k] == 2) { + fprintf(fp, " %-26s", silk_Timer_tags[k]); + } else if (silk_Timer_depth[k] == 3) { + fprintf(fp, " %-25s", silk_Timer_tags[k]); + } else { + fprintf(fp, " %-24s", silk_Timer_tags[k]); + } + fprintf(fp, "%d ", silk_Timer_min[k]); + fprintf(fp, "%f ", (double)silk_Timer_sum[k] / (double)silk_Timer_cnt[k]); + fprintf(fp, "%d ", silk_Timer_max[k]); + fprintf(fp, "%10d\n", silk_Timer_cnt[k]); + } + fprintf(fp, " microseconds\n"); + fclose(fp); + } +} +#endif + +#endif /* SILK_TIC_TOC */ + +#if SILK_DEBUG +FILE *silk_debug_store_fp[ silk_NUM_STORES_MAX ]; +int silk_debug_store_count = 0; +#endif /* SILK_DEBUG */ + diff --git a/thirdparty/opus/silk/debug.h b/thirdparty/opus/silk/debug.h new file mode 100644 index 000000000000..efb6d3e99e7f --- /dev/null +++ b/thirdparty/opus/silk/debug.h @@ -0,0 +1,279 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifndef SILK_DEBUG_H +#define SILK_DEBUG_H + +#include "typedef.h" +#include <stdio.h> /* file writing */ +#include <string.h> /* strcpy, strcmp */ + +#ifdef __cplusplus +extern "C" +{ +#endif + +unsigned long GetHighResolutionTime(void); /* O time in usec*/ + +/* make SILK_DEBUG dependent on compiler's _DEBUG */ +#if defined _WIN32 + #ifdef _DEBUG + #define SILK_DEBUG 1 + #else + #define SILK_DEBUG 0 + #endif + + /* overrule the above */ + #if 0 + /* #define NO_ASSERTS*/ + #undef SILK_DEBUG + #define SILK_DEBUG 1 + #endif +#else + #define SILK_DEBUG 0 +#endif + +/* Flag for using timers */ +#define SILK_TIC_TOC 0 + + +#if SILK_TIC_TOC + +#if (defined(_WIN32) || defined(_WINCE)) +#include <windows.h> /* timer */ +#else /* Linux or Mac*/ +#include <sys/time.h> +#endif + +/*********************************/ +/* timer functions for profiling */ +/*********************************/ +/* example: */ +/* */ +/* TIC(LPC) */ +/* do_LPC(in_vec, order, acoef); // do LPC analysis */ +/* TOC(LPC) */ +/* */ +/* and call the following just before exiting (from main) */ +/* */ +/* silk_TimerSave("silk_TimingData.txt"); */ +/* */ +/* results are now in silk_TimingData.txt */ + +void silk_TimerSave(char *file_name); + +/* max number of timers (in different locations) */ +#define silk_NUM_TIMERS_MAX 50 +/* max length of name tags in TIC(..), TOC(..) */ +#define silk_NUM_TIMERS_MAX_TAG_LEN 30 + +extern int silk_Timer_nTimers; +extern int silk_Timer_depth_ctr; +extern char silk_Timer_tags[silk_NUM_TIMERS_MAX][silk_NUM_TIMERS_MAX_TAG_LEN]; +#ifdef _WIN32 +extern LARGE_INTEGER silk_Timer_start[silk_NUM_TIMERS_MAX]; +#else +extern unsigned long silk_Timer_start[silk_NUM_TIMERS_MAX]; +#endif +extern unsigned int silk_Timer_cnt[silk_NUM_TIMERS_MAX]; +extern opus_int64 silk_Timer_sum[silk_NUM_TIMERS_MAX]; +extern opus_int64 silk_Timer_max[silk_NUM_TIMERS_MAX]; +extern opus_int64 silk_Timer_min[silk_NUM_TIMERS_MAX]; +extern opus_int64 silk_Timer_depth[silk_NUM_TIMERS_MAX]; + +/* WARNING: TIC()/TOC can measure only up to 0.1 seconds at a time */ +#ifdef _WIN32 +#define TIC(TAG_NAME) { \ + static int init = 0; \ + static int ID = -1; \ + if( init == 0 ) \ + { \ + int k; \ + init = 1; \ + for( k = 0; k < silk_Timer_nTimers; k++ ) { \ + if( strcmp(silk_Timer_tags[k], #TAG_NAME) == 0 ) { \ + ID = k; \ + break; \ + } \ + } \ + if (ID == -1) { \ + ID = silk_Timer_nTimers; \ + silk_Timer_nTimers++; \ + silk_Timer_depth[ID] = silk_Timer_depth_ctr; \ + strcpy(silk_Timer_tags[ID], #TAG_NAME); \ + silk_Timer_cnt[ID] = 0; \ + silk_Timer_sum[ID] = 0; \ + silk_Timer_min[ID] = 0xFFFFFFFF; \ + silk_Timer_max[ID] = 0; \ + } \ + } \ + silk_Timer_depth_ctr++; \ + QueryPerformanceCounter(&silk_Timer_start[ID]); \ +} +#else +#define TIC(TAG_NAME) { \ + static int init = 0; \ + static int ID = -1; \ + if( init == 0 ) \ + { \ + int k; \ + init = 1; \ + for( k = 0; k < silk_Timer_nTimers; k++ ) { \ + if( strcmp(silk_Timer_tags[k], #TAG_NAME) == 0 ) { \ + ID = k; \ + break; \ + } \ + } \ + if (ID == -1) { \ + ID = silk_Timer_nTimers; \ + silk_Timer_nTimers++; \ + silk_Timer_depth[ID] = silk_Timer_depth_ctr; \ + strcpy(silk_Timer_tags[ID], #TAG_NAME); \ + silk_Timer_cnt[ID] = 0; \ + silk_Timer_sum[ID] = 0; \ + silk_Timer_min[ID] = 0xFFFFFFFF; \ + silk_Timer_max[ID] = 0; \ + } \ + } \ + silk_Timer_depth_ctr++; \ + silk_Timer_start[ID] = GetHighResolutionTime(); \ +} +#endif + +#ifdef _WIN32 +#define TOC(TAG_NAME) { \ + LARGE_INTEGER lpPerformanceCount; \ + static int init = 0; \ + static int ID = 0; \ + if( init 
== 0 ) \ + { \ + int k; \ + init = 1; \ + for( k = 0; k < silk_Timer_nTimers; k++ ) { \ + if( strcmp(silk_Timer_tags[k], #TAG_NAME) == 0 ) { \ + ID = k; \ + break; \ + } \ + } \ + } \ + QueryPerformanceCounter(&lpPerformanceCount); \ + lpPerformanceCount.QuadPart -= silk_Timer_start[ID].QuadPart; \ + if((lpPerformanceCount.QuadPart < 100000000) && \ + (lpPerformanceCount.QuadPart >= 0)) { \ + silk_Timer_cnt[ID]++; \ + silk_Timer_sum[ID] += lpPerformanceCount.QuadPart; \ + if( lpPerformanceCount.QuadPart > silk_Timer_max[ID] ) \ + silk_Timer_max[ID] = lpPerformanceCount.QuadPart; \ + if( lpPerformanceCount.QuadPart < silk_Timer_min[ID] ) \ + silk_Timer_min[ID] = lpPerformanceCount.QuadPart; \ + } \ + silk_Timer_depth_ctr--; \ +} +#else +#define TOC(TAG_NAME) { \ + unsigned long endTime; \ + static int init = 0; \ + static int ID = 0; \ + if( init == 0 ) \ + { \ + int k; \ + init = 1; \ + for( k = 0; k < silk_Timer_nTimers; k++ ) { \ + if( strcmp(silk_Timer_tags[k], #TAG_NAME) == 0 ) { \ + ID = k; \ + break; \ + } \ + } \ + } \ + endTime = GetHighResolutionTime(); \ + endTime -= silk_Timer_start[ID]; \ + if((endTime < 100000000) && \ + (endTime >= 0)) { \ + silk_Timer_cnt[ID]++; \ + silk_Timer_sum[ID] += endTime; \ + if( endTime > silk_Timer_max[ID] ) \ + silk_Timer_max[ID] = endTime; \ + if( endTime < silk_Timer_min[ID] ) \ + silk_Timer_min[ID] = endTime; \ + } \ + silk_Timer_depth_ctr--; \ +} +#endif + +#else /* SILK_TIC_TOC */ + +/* define macros as empty strings */ +#define TIC(TAG_NAME) +#define TOC(TAG_NAME) +#define silk_TimerSave(FILE_NAME) + +#endif /* SILK_TIC_TOC */ + + +#if SILK_DEBUG +/************************************/ +/* write data to file for debugging */ +/************************************/ +/* Example: DEBUG_STORE_DATA(testfile.pcm, &RIN[0], 160*sizeof(opus_int16)); */ + +#define silk_NUM_STORES_MAX 100 +extern FILE *silk_debug_store_fp[ silk_NUM_STORES_MAX ]; +extern int silk_debug_store_count; + +/* Faster way of storing the data */ +#define DEBUG_STORE_DATA( FILE_NAME, DATA_PTR, N_BYTES ) { \ + static opus_int init = 0, cnt = 0; \ + static FILE **fp; \ + if (init == 0) { \ + init = 1; \ + cnt = silk_debug_store_count++; \ + silk_debug_store_fp[ cnt ] = fopen(#FILE_NAME, "wb"); \ + } \ + fwrite((DATA_PTR), (N_BYTES), 1, silk_debug_store_fp[ cnt ]); \ +} + +/* Call this at the end of main() */ +#define SILK_DEBUG_STORE_CLOSE_FILES { \ + opus_int i; \ + for( i = 0; i < silk_debug_store_count; i++ ) { \ + fclose( silk_debug_store_fp[ i ] ); \ + } \ +} + +#else /* SILK_DEBUG */ + +/* define macros as empty strings */ +#define DEBUG_STORE_DATA(FILE_NAME, DATA_PTR, N_BYTES) +#define SILK_DEBUG_STORE_CLOSE_FILES + +#endif /* SILK_DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* SILK_DEBUG_H */ diff --git a/thirdparty/opus/silk/dec_API.c b/thirdparty/opus/silk/dec_API.c new file mode 100644 index 000000000000..b7d8ed48d88f --- /dev/null +++ b/thirdparty/opus/silk/dec_API.c @@ -0,0 +1,419 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include "API.h" +#include "main.h" +#include "stack_alloc.h" +#include "os_support.h" + +/************************/ +/* Decoder Super Struct */ +/************************/ +typedef struct { + silk_decoder_state channel_state[ DECODER_NUM_CHANNELS ]; + stereo_dec_state sStereo; + opus_int nChannelsAPI; + opus_int nChannelsInternal; + opus_int prev_decode_only_middle; +} silk_decoder; + +/*********************/ +/* Decoder functions */ +/*********************/ + +opus_int silk_Get_Decoder_Size( /* O Returns error code */ + opus_int *decSizeBytes /* O Number of bytes in SILK decoder state */ +) +{ + opus_int ret = SILK_NO_ERROR; + + *decSizeBytes = sizeof( silk_decoder ); + + return ret; +} + +/* Reset decoder state */ +opus_int silk_InitDecoder( /* O Returns error code */ + void *decState /* I/O State */ +) +{ + opus_int n, ret = SILK_NO_ERROR; + silk_decoder_state *channel_state = ((silk_decoder *)decState)->channel_state; + + for( n = 0; n < DECODER_NUM_CHANNELS; n++ ) { + ret = silk_init_decoder( &channel_state[ n ] ); + } + silk_memset(&((silk_decoder *)decState)->sStereo, 0, sizeof(((silk_decoder *)decState)->sStereo)); + /* Not strictly needed, but it's cleaner that way */ + ((silk_decoder *)decState)->prev_decode_only_middle = 0; + + return ret; +} + +/* Decode a frame */ +opus_int silk_Decode( /* O Returns error code */ + void* decState, /* I/O State */ + silk_DecControlStruct* decControl, /* I/O Control Structure */ + opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ + opus_int newPacketFlag, /* I Indicates first decoder call for this packet */ + ec_dec *psRangeDec, /* I/O Compressor data structure */ + opus_int16 *samplesOut, /* O Decoded output speech vector */ + opus_int32 *nSamplesOut, /* O Number of samples decoded */ + int arch /* I Run-time architecture */ +) +{ + opus_int i, n, decode_only_middle = 0, ret = SILK_NO_ERROR; + opus_int32 nSamplesOutDec, LBRR_symbol; + opus_int16 *samplesOut1_tmp[ 2 ]; + VARDECL( opus_int16, samplesOut1_tmp_storage1 ); + VARDECL( opus_int16, samplesOut1_tmp_storage2 ); + VARDECL( opus_int16, samplesOut2_tmp ); + opus_int32 MS_pred_Q13[ 2 ] = { 0 }; + opus_int16 *resample_out_ptr; + silk_decoder *psDec = ( silk_decoder * )decState; + 
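/* Usage sketch (illustrative only, not from the upstream sources; the
   variable names below are invented and the real call sequence lives in
   Opus' opus_decoder.c). Minimal lifecycle of the opaque decState blob
   that is cast to silk_decoder above:

       opus_int dec_size;
       void    *dec_state;
       silk_Get_Decoder_Size( &dec_size );      // query state size in bytes
       dec_state = malloc( dec_size );          // caller owns the allocation
       silk_InitDecoder( dec_state );           // reset all channel states
       // ...then, once per incoming packet/frame:
       silk_Decode( dec_state, &decControl, lostFlag, newPacketFlag,
                    psRangeDec, samplesOut, &nSamplesOut, arch );
*/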
silk_decoder_state *channel_state = psDec->channel_state; + opus_int has_side; + opus_int stereo_to_mono; + int delay_stack_alloc; + SAVE_STACK; + + silk_assert( decControl->nChannelsInternal == 1 || decControl->nChannelsInternal == 2 ); + + /**********************************/ + /* Test if first frame in payload */ + /**********************************/ + if( newPacketFlag ) { + for( n = 0; n < decControl->nChannelsInternal; n++ ) { + channel_state[ n ].nFramesDecoded = 0; /* Used to count frames in packet */ + } + } + + /* If Mono -> Stereo transition in bitstream: init state of second channel */ + if( decControl->nChannelsInternal > psDec->nChannelsInternal ) { + ret += silk_init_decoder( &channel_state[ 1 ] ); + } + + stereo_to_mono = decControl->nChannelsInternal == 1 && psDec->nChannelsInternal == 2 && + ( decControl->internalSampleRate == 1000*channel_state[ 0 ].fs_kHz ); + + if( channel_state[ 0 ].nFramesDecoded == 0 ) { + for( n = 0; n < decControl->nChannelsInternal; n++ ) { + opus_int fs_kHz_dec; + if( decControl->payloadSize_ms == 0 ) { + /* Assuming packet loss, use 10 ms */ + channel_state[ n ].nFramesPerPacket = 1; + channel_state[ n ].nb_subfr = 2; + } else if( decControl->payloadSize_ms == 10 ) { + channel_state[ n ].nFramesPerPacket = 1; + channel_state[ n ].nb_subfr = 2; + } else if( decControl->payloadSize_ms == 20 ) { + channel_state[ n ].nFramesPerPacket = 1; + channel_state[ n ].nb_subfr = 4; + } else if( decControl->payloadSize_ms == 40 ) { + channel_state[ n ].nFramesPerPacket = 2; + channel_state[ n ].nb_subfr = 4; + } else if( decControl->payloadSize_ms == 60 ) { + channel_state[ n ].nFramesPerPacket = 3; + channel_state[ n ].nb_subfr = 4; + } else { + silk_assert( 0 ); + RESTORE_STACK; + return SILK_DEC_INVALID_FRAME_SIZE; + } + fs_kHz_dec = ( decControl->internalSampleRate >> 10 ) + 1; + if( fs_kHz_dec != 8 && fs_kHz_dec != 12 && fs_kHz_dec != 16 ) { + silk_assert( 0 ); + RESTORE_STACK; + return SILK_DEC_INVALID_SAMPLING_FREQUENCY; + } + ret += silk_decoder_set_fs( &channel_state[ n ], fs_kHz_dec, decControl->API_sampleRate ); + } + } + + if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 && ( psDec->nChannelsAPI == 1 || psDec->nChannelsInternal == 1 ) ) { + silk_memset( psDec->sStereo.pred_prev_Q13, 0, sizeof( psDec->sStereo.pred_prev_Q13 ) ); + silk_memset( psDec->sStereo.sSide, 0, sizeof( psDec->sStereo.sSide ) ); + silk_memcpy( &channel_state[ 1 ].resampler_state, &channel_state[ 0 ].resampler_state, sizeof( silk_resampler_state_struct ) ); + } + psDec->nChannelsAPI = decControl->nChannelsAPI; + psDec->nChannelsInternal = decControl->nChannelsInternal; + + if( decControl->API_sampleRate > (opus_int32)MAX_API_FS_KHZ * 1000 || decControl->API_sampleRate < 8000 ) { + ret = SILK_DEC_INVALID_SAMPLING_FREQUENCY; + RESTORE_STACK; + return( ret ); + } + + if( lostFlag != FLAG_PACKET_LOST && channel_state[ 0 ].nFramesDecoded == 0 ) { + /* First decoder call for this payload */ + /* Decode VAD flags and LBRR flag */ + for( n = 0; n < decControl->nChannelsInternal; n++ ) { + for( i = 0; i < channel_state[ n ].nFramesPerPacket; i++ ) { + channel_state[ n ].VAD_flags[ i ] = ec_dec_bit_logp(psRangeDec, 1); + } + channel_state[ n ].LBRR_flag = ec_dec_bit_logp(psRangeDec, 1); + } + /* Decode LBRR flags */ + for( n = 0; n < decControl->nChannelsInternal; n++ ) { + silk_memset( channel_state[ n ].LBRR_flags, 0, sizeof( channel_state[ n ].LBRR_flags ) ); + if( channel_state[ n ].LBRR_flag ) { + if( channel_state[ n ].nFramesPerPacket == 1 ) { + 
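/* A single-frame packet needs no extra symbol here: the LBRR_flag decoded
   above already marks that one frame. For 2- and 3-frame packets (the else
   branch below) the per-frame flags arrive as one joint symbol whose bits
   are the flags. Illustrative worked example: with nFramesPerPacket == 3,
   a decoded LBRR_symbol of 5 (binary 101) yields LBRR_flags[0] = 1,
   LBRR_flags[1] = 0, LBRR_flags[2] = 1 via silk_RSHIFT( LBRR_symbol, i ) & 1. */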
channel_state[ n ].LBRR_flags[ 0 ] = 1; + } else { + LBRR_symbol = ec_dec_icdf( psRangeDec, silk_LBRR_flags_iCDF_ptr[ channel_state[ n ].nFramesPerPacket - 2 ], 8 ) + 1; + for( i = 0; i < channel_state[ n ].nFramesPerPacket; i++ ) { + channel_state[ n ].LBRR_flags[ i ] = silk_RSHIFT( LBRR_symbol, i ) & 1; + } + } + } + } + + if( lostFlag == FLAG_DECODE_NORMAL ) { + /* Regular decoding: skip all LBRR data */ + for( i = 0; i < channel_state[ 0 ].nFramesPerPacket; i++ ) { + for( n = 0; n < decControl->nChannelsInternal; n++ ) { + if( channel_state[ n ].LBRR_flags[ i ] ) { + opus_int16 pulses[ MAX_FRAME_LENGTH ]; + opus_int condCoding; + + if( decControl->nChannelsInternal == 2 && n == 0 ) { + silk_stereo_decode_pred( psRangeDec, MS_pred_Q13 ); + if( channel_state[ 1 ].LBRR_flags[ i ] == 0 ) { + silk_stereo_decode_mid_only( psRangeDec, &decode_only_middle ); + } + } + /* Use conditional coding if previous frame available */ + if( i > 0 && channel_state[ n ].LBRR_flags[ i - 1 ] ) { + condCoding = CODE_CONDITIONALLY; + } else { + condCoding = CODE_INDEPENDENTLY; + } + silk_decode_indices( &channel_state[ n ], psRangeDec, i, 1, condCoding ); + silk_decode_pulses( psRangeDec, pulses, channel_state[ n ].indices.signalType, + channel_state[ n ].indices.quantOffsetType, channel_state[ n ].frame_length ); + } + } + } + } + } + + /* Get MS predictor index */ + if( decControl->nChannelsInternal == 2 ) { + if( lostFlag == FLAG_DECODE_NORMAL || + ( lostFlag == FLAG_DECODE_LBRR && channel_state[ 0 ].LBRR_flags[ channel_state[ 0 ].nFramesDecoded ] == 1 ) ) + { + silk_stereo_decode_pred( psRangeDec, MS_pred_Q13 ); + /* For LBRR data, decode mid-only flag only if side-channel's LBRR flag is false */ + if( ( lostFlag == FLAG_DECODE_NORMAL && channel_state[ 1 ].VAD_flags[ channel_state[ 0 ].nFramesDecoded ] == 0 ) || + ( lostFlag == FLAG_DECODE_LBRR && channel_state[ 1 ].LBRR_flags[ channel_state[ 0 ].nFramesDecoded ] == 0 ) ) + { + silk_stereo_decode_mid_only( psRangeDec, &decode_only_middle ); + } else { + decode_only_middle = 0; + } + } else { + for( n = 0; n < 2; n++ ) { + MS_pred_Q13[ n ] = psDec->sStereo.pred_prev_Q13[ n ]; + } + } + } + + /* Reset side channel decoder prediction memory for first frame with side coding */ + if( decControl->nChannelsInternal == 2 && decode_only_middle == 0 && psDec->prev_decode_only_middle == 1 ) { + silk_memset( psDec->channel_state[ 1 ].outBuf, 0, sizeof(psDec->channel_state[ 1 ].outBuf) ); + silk_memset( psDec->channel_state[ 1 ].sLPC_Q14_buf, 0, sizeof(psDec->channel_state[ 1 ].sLPC_Q14_buf) ); + psDec->channel_state[ 1 ].lagPrev = 100; + psDec->channel_state[ 1 ].LastGainIndex = 10; + psDec->channel_state[ 1 ].prevSignalType = TYPE_NO_VOICE_ACTIVITY; + psDec->channel_state[ 1 ].first_frame_after_reset = 1; + } + + /* Check if the temp buffer fits into the output PCM buffer. If it fits, + we can delay allocating the temp buffer until after the SILK peak stack + usage. We need to use a < and not a <= because of the two extra samples. */ + delay_stack_alloc = decControl->internalSampleRate*decControl->nChannelsInternal + < decControl->API_sampleRate*decControl->nChannelsAPI; + ALLOC( samplesOut1_tmp_storage1, delay_stack_alloc ? 
ALLOC_NONE + : decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 ), + opus_int16 ); + if ( delay_stack_alloc ) + { + samplesOut1_tmp[ 0 ] = samplesOut; + samplesOut1_tmp[ 1 ] = samplesOut + channel_state[ 0 ].frame_length + 2; + } else { + samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage1; + samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage1 + channel_state[ 0 ].frame_length + 2; + } + + if( lostFlag == FLAG_DECODE_NORMAL ) { + has_side = !decode_only_middle; + } else { + has_side = !psDec->prev_decode_only_middle + || (decControl->nChannelsInternal == 2 && lostFlag == FLAG_DECODE_LBRR && channel_state[1].LBRR_flags[ channel_state[1].nFramesDecoded ] == 1 ); + } + /* Call decoder for one frame */ + for( n = 0; n < decControl->nChannelsInternal; n++ ) { + if( n == 0 || has_side ) { + opus_int FrameIndex; + opus_int condCoding; + + FrameIndex = channel_state[ 0 ].nFramesDecoded - n; + /* Use independent coding if no previous frame available */ + if( FrameIndex <= 0 ) { + condCoding = CODE_INDEPENDENTLY; + } else if( lostFlag == FLAG_DECODE_LBRR ) { + condCoding = channel_state[ n ].LBRR_flags[ FrameIndex - 1 ] ? CODE_CONDITIONALLY : CODE_INDEPENDENTLY; + } else if( n > 0 && psDec->prev_decode_only_middle ) { + /* If we skipped a side frame in this packet, we don't + need LTP scaling; the LTP state is well-defined. */ + condCoding = CODE_INDEPENDENTLY_NO_LTP_SCALING; + } else { + condCoding = CODE_CONDITIONALLY; + } + ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag, condCoding, arch); + } else { + silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) ); + } + channel_state[ n ].nFramesDecoded++; + } + + if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) { + /* Convert Mid/Side to Left/Right */ + silk_stereo_MS_to_LR( &psDec->sStereo, samplesOut1_tmp[ 0 ], samplesOut1_tmp[ 1 ], MS_pred_Q13, channel_state[ 0 ].fs_kHz, nSamplesOutDec ); + } else { + /* Buffering */ + silk_memcpy( samplesOut1_tmp[ 0 ], psDec->sStereo.sMid, 2 * sizeof( opus_int16 ) ); + silk_memcpy( psDec->sStereo.sMid, &samplesOut1_tmp[ 0 ][ nSamplesOutDec ], 2 * sizeof( opus_int16 ) ); + } + + /* Number of output samples */ + *nSamplesOut = silk_DIV32( nSamplesOutDec * decControl->API_sampleRate, silk_SMULBB( channel_state[ 0 ].fs_kHz, 1000 ) ); + + /* Set up pointers to temp buffers */ + ALLOC( samplesOut2_tmp, + decControl->nChannelsAPI == 2 ? *nSamplesOut : ALLOC_NONE, opus_int16 ); + if( decControl->nChannelsAPI == 2 ) { + resample_out_ptr = samplesOut2_tmp; + } else { + resample_out_ptr = samplesOut; + } + + ALLOC( samplesOut1_tmp_storage2, delay_stack_alloc + ? 
decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 ) + : ALLOC_NONE, + opus_int16 ); + if ( delay_stack_alloc ) { + OPUS_COPY(samplesOut1_tmp_storage2, samplesOut, decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2)); + samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage2; + samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage2 + channel_state[ 0 ].frame_length + 2; + } + for( n = 0; n < silk_min( decControl->nChannelsAPI, decControl->nChannelsInternal ); n++ ) { + + /* Resample decoded signal to API_sampleRate */ + ret += silk_resampler( &channel_state[ n ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ n ][ 1 ], nSamplesOutDec ); + + /* Interleave if stereo output and stereo stream */ + if( decControl->nChannelsAPI == 2 ) { + for( i = 0; i < *nSamplesOut; i++ ) { + samplesOut[ n + 2 * i ] = resample_out_ptr[ i ]; + } + } + } + + /* Create two channel output from mono stream */ + if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 1 ) { + if ( stereo_to_mono ){ + /* Resample right channel for newly collapsed stereo just in case + we weren't doing collapsing when switching to mono */ + ret += silk_resampler( &channel_state[ 1 ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ 0 ][ 1 ], nSamplesOutDec ); + + for( i = 0; i < *nSamplesOut; i++ ) { + samplesOut[ 1 + 2 * i ] = resample_out_ptr[ i ]; + } + } else { + for( i = 0; i < *nSamplesOut; i++ ) { + samplesOut[ 1 + 2 * i ] = samplesOut[ 0 + 2 * i ]; + } + } + } + + /* Export pitch lag, measured at 48 kHz sampling rate */ + if( channel_state[ 0 ].prevSignalType == TYPE_VOICED ) { + int mult_tab[ 3 ] = { 6, 4, 3 }; + decControl->prevPitchLag = channel_state[ 0 ].lagPrev * mult_tab[ ( channel_state[ 0 ].fs_kHz - 8 ) >> 2 ]; + } else { + decControl->prevPitchLag = 0; + } + + if( lostFlag == FLAG_PACKET_LOST ) { + /* On packet loss, remove the gain clamping to prevent having the energy "bounce back" + if we lose packets when the energy is going down */ + for ( i = 0; i < psDec->nChannelsInternal; i++ ) + psDec->channel_state[ i ].LastGainIndex = 10; + } else { + psDec->prev_decode_only_middle = decode_only_middle; + } + RESTORE_STACK; + return ret; +} + +#if 0 +/* Getting table of contents for a packet */ +opus_int silk_get_TOC( + const opus_uint8 *payload, /* I Payload data */ + const opus_int nBytesIn, /* I Number of input bytes */ + const opus_int nFramesPerPayload, /* I Number of SILK frames per payload */ + silk_TOC_struct *Silk_TOC /* O Type of content */ +) +{ + opus_int i, flags, ret = SILK_NO_ERROR; + + if( nBytesIn < 1 ) { + return -1; + } + if( nFramesPerPayload < 0 || nFramesPerPayload > 3 ) { + return -1; + } + + silk_memset( Silk_TOC, 0, sizeof( *Silk_TOC ) ); + + /* For stereo, extract the flags for the mid channel */ + flags = silk_RSHIFT( payload[ 0 ], 7 - nFramesPerPayload ) & ( silk_LSHIFT( 1, nFramesPerPayload + 1 ) - 1 ); + + Silk_TOC->inbandFECFlag = flags & 1; + for( i = nFramesPerPayload - 1; i >= 0 ; i-- ) { + flags = silk_RSHIFT( flags, 1 ); + Silk_TOC->VADFlags[ i ] = flags & 1; + Silk_TOC->VADFlag |= flags & 1; + } + + return ret; +} +#endif diff --git a/thirdparty/opus/silk/decode_core.c b/thirdparty/opus/silk/decode_core.c new file mode 100644 index 000000000000..e569c0e72b3e --- /dev/null +++ b/thirdparty/opus/silk/decode_core.c @@ -0,0 +1,239 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "stack_alloc.h" + +/**********************************************************/ +/* Core decoder. Performs inverse NSQ operation LTP + LPC */ +/**********************************************************/ +void silk_decode_core( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl, /* I Decoder control */ + opus_int16 xq[], /* O Decoded speech */ + const opus_int16 pulses[ MAX_FRAME_LENGTH ], /* I Pulse signal */ + int arch /* I Run-time architecture */ +) +{ + opus_int i, k, lag = 0, start_idx, sLTP_buf_idx, NLSF_interpolation_flag, signalType; + opus_int16 *A_Q12, *B_Q14, *pxq, A_Q12_tmp[ MAX_LPC_ORDER ]; + VARDECL( opus_int16, sLTP ); + VARDECL( opus_int32, sLTP_Q15 ); + opus_int32 LTP_pred_Q13, LPC_pred_Q10, Gain_Q10, inv_gain_Q31, gain_adj_Q16, rand_seed, offset_Q10; + opus_int32 *pred_lag_ptr, *pexc_Q14, *pres_Q14; + VARDECL( opus_int32, res_Q14 ); + VARDECL( opus_int32, sLPC_Q14 ); + SAVE_STACK; + + silk_assert( psDec->prev_gain_Q16 != 0 ); + + ALLOC( sLTP, psDec->ltp_mem_length, opus_int16 ); + ALLOC( sLTP_Q15, psDec->ltp_mem_length + psDec->frame_length, opus_int32 ); + ALLOC( res_Q14, psDec->subfr_length, opus_int32 ); + ALLOC( sLPC_Q14, psDec->subfr_length + MAX_LPC_ORDER, opus_int32 ); + + offset_Q10 = silk_Quantization_Offsets_Q10[ psDec->indices.signalType >> 1 ][ psDec->indices.quantOffsetType ]; + + if( psDec->indices.NLSFInterpCoef_Q2 < 1 << 2 ) { + NLSF_interpolation_flag = 1; + } else { + NLSF_interpolation_flag = 0; + } + + /* Decode excitation */ + rand_seed = psDec->indices.Seed; + for( i = 0; i < psDec->frame_length; i++ ) { + rand_seed = silk_RAND( rand_seed ); + psDec->exc_Q14[ i ] = silk_LSHIFT( (opus_int32)pulses[ i ], 14 ); + if( psDec->exc_Q14[ i ] > 0 ) { + psDec->exc_Q14[ i ] -= QUANT_LEVEL_ADJUST_Q10 << 4; + } else + if( psDec->exc_Q14[ i ] < 0 ) { + psDec->exc_Q14[ i ] += QUANT_LEVEL_ADJUST_Q10 << 4; + } + psDec->exc_Q14[ i ] += 
offset_Q10 << 4; + if( rand_seed < 0 ) { + psDec->exc_Q14[ i ] = -psDec->exc_Q14[ i ]; + } + + rand_seed = silk_ADD32_ovflw( rand_seed, pulses[ i ] ); + } + + /* Copy LPC state */ + silk_memcpy( sLPC_Q14, psDec->sLPC_Q14_buf, MAX_LPC_ORDER * sizeof( opus_int32 ) ); + + pexc_Q14 = psDec->exc_Q14; + pxq = xq; + sLTP_buf_idx = psDec->ltp_mem_length; + /* Loop over subframes */ + for( k = 0; k < psDec->nb_subfr; k++ ) { + pres_Q14 = res_Q14; + A_Q12 = psDecCtrl->PredCoef_Q12[ k >> 1 ]; + + /* Preload LPC coeficients to array on stack. Gives small performance gain */ + silk_memcpy( A_Q12_tmp, A_Q12, psDec->LPC_order * sizeof( opus_int16 ) ); + B_Q14 = &psDecCtrl->LTPCoef_Q14[ k * LTP_ORDER ]; + signalType = psDec->indices.signalType; + + Gain_Q10 = silk_RSHIFT( psDecCtrl->Gains_Q16[ k ], 6 ); + inv_gain_Q31 = silk_INVERSE32_varQ( psDecCtrl->Gains_Q16[ k ], 47 ); + + /* Calculate gain adjustment factor */ + if( psDecCtrl->Gains_Q16[ k ] != psDec->prev_gain_Q16 ) { + gain_adj_Q16 = silk_DIV32_varQ( psDec->prev_gain_Q16, psDecCtrl->Gains_Q16[ k ], 16 ); + + /* Scale short term state */ + for( i = 0; i < MAX_LPC_ORDER; i++ ) { + sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, sLPC_Q14[ i ] ); + } + } else { + gain_adj_Q16 = (opus_int32)1 << 16; + } + + /* Save inv_gain */ + silk_assert( inv_gain_Q31 != 0 ); + psDec->prev_gain_Q16 = psDecCtrl->Gains_Q16[ k ]; + + /* Avoid abrupt transition from voiced PLC to unvoiced normal decoding */ + if( psDec->lossCnt && psDec->prevSignalType == TYPE_VOICED && + psDec->indices.signalType != TYPE_VOICED && k < MAX_NB_SUBFR/2 ) { + + silk_memset( B_Q14, 0, LTP_ORDER * sizeof( opus_int16 ) ); + B_Q14[ LTP_ORDER/2 ] = SILK_FIX_CONST( 0.25, 14 ); + + signalType = TYPE_VOICED; + psDecCtrl->pitchL[ k ] = psDec->lagPrev; + } + + if( signalType == TYPE_VOICED ) { + /* Voiced */ + lag = psDecCtrl->pitchL[ k ]; + + /* Re-whitening */ + if( k == 0 || ( k == 2 && NLSF_interpolation_flag ) ) { + /* Rewhiten with new A coefs */ + start_idx = psDec->ltp_mem_length - lag - psDec->LPC_order - LTP_ORDER / 2; + silk_assert( start_idx > 0 ); + + if( k == 2 ) { + silk_memcpy( &psDec->outBuf[ psDec->ltp_mem_length ], xq, 2 * psDec->subfr_length * sizeof( opus_int16 ) ); + } + + silk_LPC_analysis_filter( &sLTP[ start_idx ], &psDec->outBuf[ start_idx + k * psDec->subfr_length ], + A_Q12, psDec->ltp_mem_length - start_idx, psDec->LPC_order, arch ); + + /* After rewhitening the LTP state is unscaled */ + if( k == 0 ) { + /* Do LTP downscaling to reduce inter-packet dependency */ + inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, psDecCtrl->LTP_scale_Q14 ), 2 ); + } + for( i = 0; i < lag + LTP_ORDER/2; i++ ) { + sLTP_Q15[ sLTP_buf_idx - i - 1 ] = silk_SMULWB( inv_gain_Q31, sLTP[ psDec->ltp_mem_length - i - 1 ] ); + } + } else { + /* Update LTP state when Gain changes */ + if( gain_adj_Q16 != (opus_int32)1 << 16 ) { + for( i = 0; i < lag + LTP_ORDER/2; i++ ) { + sLTP_Q15[ sLTP_buf_idx - i - 1 ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ sLTP_buf_idx - i - 1 ] ); + } + } + } + } + + /* Long-term prediction */ + if( signalType == TYPE_VOICED ) { + /* Set up pointer */ + pred_lag_ptr = &sLTP_Q15[ sLTP_buf_idx - lag + LTP_ORDER / 2 ]; + for( i = 0; i < psDec->subfr_length; i++ ) { + /* Unrolled loop */ + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LTP_pred_Q13 = 2; + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ 0 ], B_Q14[ 0 ] ); + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -1 ], B_Q14[ 1 ] ); + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, 
pred_lag_ptr[ -2 ], B_Q14[ 2 ] ); + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -3 ], B_Q14[ 3 ] ); + LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], B_Q14[ 4 ] ); + pred_lag_ptr++; + + /* Generate LPC excitation */ + pres_Q14[ i ] = silk_ADD_LSHIFT32( pexc_Q14[ i ], LTP_pred_Q13, 1 ); + + /* Update states */ + sLTP_Q15[ sLTP_buf_idx ] = silk_LSHIFT( pres_Q14[ i ], 1 ); + sLTP_buf_idx++; + } + } else { + pres_Q14 = pexc_Q14; + } + + for( i = 0; i < psDec->subfr_length; i++ ) { + /* Short-term prediction */ + silk_assert( psDec->LPC_order == 10 || psDec->LPC_order == 16 ); + /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ + LPC_pred_Q10 = silk_RSHIFT( psDec->LPC_order, 1 ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 1 ], A_Q12_tmp[ 0 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 2 ], A_Q12_tmp[ 1 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 3 ], A_Q12_tmp[ 2 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 4 ], A_Q12_tmp[ 3 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 5 ], A_Q12_tmp[ 4 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 6 ], A_Q12_tmp[ 5 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 7 ], A_Q12_tmp[ 6 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 8 ], A_Q12_tmp[ 7 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 9 ], A_Q12_tmp[ 8 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 10 ], A_Q12_tmp[ 9 ] ); + if( psDec->LPC_order == 16 ) { + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 11 ], A_Q12_tmp[ 10 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 12 ], A_Q12_tmp[ 11 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 13 ], A_Q12_tmp[ 12 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 14 ], A_Q12_tmp[ 13 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 15 ], A_Q12_tmp[ 14 ] ); + LPC_pred_Q10 = silk_SMLAWB( LPC_pred_Q10, sLPC_Q14[ MAX_LPC_ORDER + i - 16 ], A_Q12_tmp[ 15 ] ); + } + + /* Add prediction to LPC excitation */ + sLPC_Q14[ MAX_LPC_ORDER + i ] = silk_ADD_SAT32( pres_Q14[ i ], silk_LSHIFT_SAT32( LPC_pred_Q10, 4 ) ); + + /* Scale with gain */ + pxq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( sLPC_Q14[ MAX_LPC_ORDER + i ], Gain_Q10 ), 8 ) ); + } + + /* DEBUG_STORE_DATA( dec.pcm, pxq, psDec->subfr_length * sizeof( opus_int16 ) ) */ + + /* Update LPC filter state */ + silk_memcpy( sLPC_Q14, &sLPC_Q14[ psDec->subfr_length ], MAX_LPC_ORDER * sizeof( opus_int32 ) ); + pexc_Q14 += psDec->subfr_length; + pxq += psDec->subfr_length; + } + + /* Save LPC state */ + silk_memcpy( psDec->sLPC_Q14_buf, sLPC_Q14, MAX_LPC_ORDER * sizeof( opus_int32 ) ); + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/decode_frame.c b/thirdparty/opus/silk/decode_frame.c new file mode 100644 index 000000000000..a605d95ac6ab --- /dev/null +++ b/thirdparty/opus/silk/decode_frame.c @@ -0,0 +1,129 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "stack_alloc.h" +#include "PLC.h" + +/****************/ +/* Decode frame */ +/****************/ +opus_int silk_decode_frame( + silk_decoder_state *psDec, /* I/O Pointer to Silk decoder state */ + ec_dec *psRangeDec, /* I/O Compressor data structure */ + opus_int16 pOut[], /* O Pointer to output speech frame */ + opus_int32 *pN, /* O Pointer to size of output frame */ + opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ + opus_int condCoding, /* I The type of conditional coding to use */ + int arch /* I Run-time architecture */ +) +{ + VARDECL( silk_decoder_control, psDecCtrl ); + opus_int L, mv_len, ret = 0; + SAVE_STACK; + + L = psDec->frame_length; + ALLOC( psDecCtrl, 1, silk_decoder_control ); + psDecCtrl->LTP_scale_Q14 = 0; + + /* Safety checks */ + silk_assert( L > 0 && L <= MAX_FRAME_LENGTH ); + + if( lostFlag == FLAG_DECODE_NORMAL || + ( lostFlag == FLAG_DECODE_LBRR && psDec->LBRR_flags[ psDec->nFramesDecoded ] == 1 ) ) + { + VARDECL( opus_int16, pulses ); + ALLOC( pulses, (L + SHELL_CODEC_FRAME_LENGTH - 1) & + ~(SHELL_CODEC_FRAME_LENGTH - 1), opus_int16 ); + /*********************************************/ + /* Decode quantization indices of side info */ + /*********************************************/ + silk_decode_indices( psDec, psRangeDec, psDec->nFramesDecoded, lostFlag, condCoding ); + + /*********************************************/ + /* Decode quantization indices of excitation */ + /*********************************************/ + silk_decode_pulses( psRangeDec, pulses, psDec->indices.signalType, + psDec->indices.quantOffsetType, psDec->frame_length ); + + /********************************************/ + /* Decode parameters and pulse signal */ + /********************************************/ + silk_decode_parameters( psDec, psDecCtrl, condCoding ); + + /********************************************************/ + /* Run inverse NSQ */ + 
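/* (Here "inverse NSQ" is the LTP + LPC synthesis performed by */
/* silk_decode_core() below, which turns the decoded pulse signal */
/* back into speech-domain samples.) */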
/********************************************************/ + silk_decode_core( psDec, psDecCtrl, pOut, pulses, arch ); + + /********************************************************/ + /* Update PLC state */ + /********************************************************/ + silk_PLC( psDec, psDecCtrl, pOut, 0, arch ); + + psDec->lossCnt = 0; + psDec->prevSignalType = psDec->indices.signalType; + silk_assert( psDec->prevSignalType >= 0 && psDec->prevSignalType <= 2 ); + + /* A frame has been decoded without errors */ + psDec->first_frame_after_reset = 0; + } else { + /* Handle packet loss by extrapolation */ + silk_PLC( psDec, psDecCtrl, pOut, 1, arch ); + } + + /*************************/ + /* Update output buffer. */ + /*************************/ + silk_assert( psDec->ltp_mem_length >= psDec->frame_length ); + mv_len = psDec->ltp_mem_length - psDec->frame_length; + silk_memmove( psDec->outBuf, &psDec->outBuf[ psDec->frame_length ], mv_len * sizeof(opus_int16) ); + silk_memcpy( &psDec->outBuf[ mv_len ], pOut, psDec->frame_length * sizeof( opus_int16 ) ); + + /************************************************/ + /* Comfort noise generation / estimation */ + /************************************************/ + silk_CNG( psDec, psDecCtrl, pOut, L ); + + /****************************************************************/ + /* Ensure smooth connection of extrapolated and good frames */ + /****************************************************************/ + silk_PLC_glue_frames( psDec, pOut, L ); + + /* Update some decoder state variables */ + psDec->lagPrev = psDecCtrl->pitchL[ psDec->nb_subfr - 1 ]; + + /* Set output frame length */ + *pN = L; + + RESTORE_STACK; + return ret; +} diff --git a/thirdparty/opus/silk/decode_indices.c b/thirdparty/opus/silk/decode_indices.c new file mode 100644 index 000000000000..7afe5c26c192 --- /dev/null +++ b/thirdparty/opus/silk/decode_indices.c @@ -0,0 +1,151 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Decode side-information parameters from payload */ +void silk_decode_indices( + silk_decoder_state *psDec, /* I/O State */ + ec_dec *psRangeDec, /* I/O Compressor data structure */ + opus_int FrameIndex, /* I Frame number */ + opus_int decode_LBRR, /* I Flag indicating LBRR data is being decoded */ + opus_int condCoding /* I The type of conditional coding to use */ +) +{ + opus_int i, k, Ix; + opus_int decode_absolute_lagIndex, delta_lagIndex; + opus_int16 ec_ix[ MAX_LPC_ORDER ]; + opus_uint8 pred_Q8[ MAX_LPC_ORDER ]; + + /*******************************************/ + /* Decode signal type and quantizer offset */ + /*******************************************/ + if( decode_LBRR || psDec->VAD_flags[ FrameIndex ] ) { + Ix = ec_dec_icdf( psRangeDec, silk_type_offset_VAD_iCDF, 8 ) + 2; + } else { + Ix = ec_dec_icdf( psRangeDec, silk_type_offset_no_VAD_iCDF, 8 ); + } + psDec->indices.signalType = (opus_int8)silk_RSHIFT( Ix, 1 ); + psDec->indices.quantOffsetType = (opus_int8)( Ix & 1 ); + + /****************/ + /* Decode gains */ + /****************/ + /* First subframe */ + if( condCoding == CODE_CONDITIONALLY ) { + /* Conditional coding */ + psDec->indices.GainsIndices[ 0 ] = (opus_int8)ec_dec_icdf( psRangeDec, silk_delta_gain_iCDF, 8 ); + } else { + /* Independent coding, in two stages: MSB bits followed by 3 LSBs */ + psDec->indices.GainsIndices[ 0 ] = (opus_int8)silk_LSHIFT( ec_dec_icdf( psRangeDec, silk_gain_iCDF[ psDec->indices.signalType ], 8 ), 3 ); + psDec->indices.GainsIndices[ 0 ] += (opus_int8)ec_dec_icdf( psRangeDec, silk_uniform8_iCDF, 8 ); + } + + /* Remaining subframes */ + for( i = 1; i < psDec->nb_subfr; i++ ) { + psDec->indices.GainsIndices[ i ] = (opus_int8)ec_dec_icdf( psRangeDec, silk_delta_gain_iCDF, 8 ); + } + + /**********************/ + /* Decode LSF Indices */ + /**********************/ + psDec->indices.NLSFIndices[ 0 ] = (opus_int8)ec_dec_icdf( psRangeDec, &psDec->psNLSF_CB->CB1_iCDF[ ( psDec->indices.signalType >> 1 ) * psDec->psNLSF_CB->nVectors ], 8 ); + silk_NLSF_unpack( ec_ix, pred_Q8, psDec->psNLSF_CB, psDec->indices.NLSFIndices[ 0 ] ); + silk_assert( psDec->psNLSF_CB->order == psDec->LPC_order ); + for( i = 0; i < psDec->psNLSF_CB->order; i++ ) { + Ix = ec_dec_icdf( psRangeDec, &psDec->psNLSF_CB->ec_iCDF[ ec_ix[ i ] ], 8 ); + if( Ix == 0 ) { + Ix -= ec_dec_icdf( psRangeDec, silk_NLSF_EXT_iCDF, 8 ); + } else if( Ix == 2 * NLSF_QUANT_MAX_AMPLITUDE ) { + Ix += ec_dec_icdf( psRangeDec, silk_NLSF_EXT_iCDF, 8 ); + } + psDec->indices.NLSFIndices[ i+1 ] = (opus_int8)( Ix - NLSF_QUANT_MAX_AMPLITUDE ); + } + + /* Decode LSF interpolation factor */ + if( psDec->nb_subfr == MAX_NB_SUBFR ) { + psDec->indices.NLSFInterpCoef_Q2 = (opus_int8)ec_dec_icdf( psRangeDec, silk_NLSF_interpolation_factor_iCDF, 8 ); + } else { + psDec->indices.NLSFInterpCoef_Q2 = 4; + } + + if( psDec->indices.signalType == TYPE_VOICED ) + { + /*********************/ + /* Decode pitch lags */ + /*********************/ + /* Get lag index */ + decode_absolute_lagIndex = 1; + if( condCoding == CODE_CONDITIONALLY && psDec->ec_prevSignalType == TYPE_VOICED ) { + /* Decode Delta index */ + delta_lagIndex = (opus_int16)ec_dec_icdf( psRangeDec, silk_pitch_delta_iCDF, 8 ); + if( delta_lagIndex > 0 ) { + delta_lagIndex = delta_lagIndex - 9; + psDec->indices.lagIndex = (opus_int16)( psDec->ec_prevLagIndex + delta_lagIndex ); + decode_absolute_lagIndex = 0; + } 
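/* Worked example of the delta path above (numbers illustrative): a decoded
   delta symbol of 9 gives delta_lagIndex = 9 - 9 = 0, so the previous
   frame's lag index is reused; a symbol of 1 gives -8. Symbol 0 leaves
   decode_absolute_lagIndex set, falling through to the absolute decoding
   below, where the index is rebuilt as high_symbol * ( fs_kHz / 2 ) +
   low_bits, i.e. at 16 kHz the high symbol moves in steps of 8. */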
+ } + if( decode_absolute_lagIndex ) { + /* Absolute decoding */ + psDec->indices.lagIndex = (opus_int16)ec_dec_icdf( psRangeDec, silk_pitch_lag_iCDF, 8 ) * silk_RSHIFT( psDec->fs_kHz, 1 ); + psDec->indices.lagIndex += (opus_int16)ec_dec_icdf( psRangeDec, psDec->pitch_lag_low_bits_iCDF, 8 ); + } + psDec->ec_prevLagIndex = psDec->indices.lagIndex; + + /* Get countour index */ + psDec->indices.contourIndex = (opus_int8)ec_dec_icdf( psRangeDec, psDec->pitch_contour_iCDF, 8 ); + + /********************/ + /* Decode LTP gains */ + /********************/ + /* Decode PERIndex value */ + psDec->indices.PERIndex = (opus_int8)ec_dec_icdf( psRangeDec, silk_LTP_per_index_iCDF, 8 ); + + for( k = 0; k < psDec->nb_subfr; k++ ) { + psDec->indices.LTPIndex[ k ] = (opus_int8)ec_dec_icdf( psRangeDec, silk_LTP_gain_iCDF_ptrs[ psDec->indices.PERIndex ], 8 ); + } + + /**********************/ + /* Decode LTP scaling */ + /**********************/ + if( condCoding == CODE_INDEPENDENTLY ) { + psDec->indices.LTP_scaleIndex = (opus_int8)ec_dec_icdf( psRangeDec, silk_LTPscale_iCDF, 8 ); + } else { + psDec->indices.LTP_scaleIndex = 0; + } + } + psDec->ec_prevSignalType = psDec->indices.signalType; + + /***************/ + /* Decode seed */ + /***************/ + psDec->indices.Seed = (opus_int8)ec_dec_icdf( psRangeDec, silk_uniform4_iCDF, 8 ); +} diff --git a/thirdparty/opus/silk/decode_parameters.c b/thirdparty/opus/silk/decode_parameters.c new file mode 100644 index 000000000000..e345b1dcefb9 --- /dev/null +++ b/thirdparty/opus/silk/decode_parameters.c @@ -0,0 +1,115 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Decode parameters from payload */ +void silk_decode_parameters( + silk_decoder_state *psDec, /* I/O State */ + silk_decoder_control *psDecCtrl, /* I/O Decoder control */ + opus_int condCoding /* I The type of conditional coding to use */ +) +{ + opus_int i, k, Ix; + opus_int16 pNLSF_Q15[ MAX_LPC_ORDER ], pNLSF0_Q15[ MAX_LPC_ORDER ]; + const opus_int8 *cbk_ptr_Q7; + + /* Dequant Gains */ + silk_gains_dequant( psDecCtrl->Gains_Q16, psDec->indices.GainsIndices, + &psDec->LastGainIndex, condCoding == CODE_CONDITIONALLY, psDec->nb_subfr ); + + /****************/ + /* Decode NLSFs */ + /****************/ + silk_NLSF_decode( pNLSF_Q15, psDec->indices.NLSFIndices, psDec->psNLSF_CB ); + + /* Convert NLSF parameters to AR prediction filter coefficients */ + silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order ); + + /* If just reset, e.g., because internal Fs changed, do not allow interpolation */ + /* improves the case of packet loss in the first frame after a switch */ + if( psDec->first_frame_after_reset == 1 ) { + psDec->indices.NLSFInterpCoef_Q2 = 4; + } + + if( psDec->indices.NLSFInterpCoef_Q2 < 4 ) { + /* Calculation of the interpolated NLSF0 vector from the interpolation factor, */ + /* the previous NLSF1, and the current NLSF1 */ + for( i = 0; i < psDec->LPC_order; i++ ) { + pNLSF0_Q15[ i ] = psDec->prevNLSF_Q15[ i ] + silk_RSHIFT( silk_MUL( psDec->indices.NLSFInterpCoef_Q2, + pNLSF_Q15[ i ] - psDec->prevNLSF_Q15[ i ] ), 2 ); + } + + /* Convert NLSF parameters to AR prediction filter coefficients */ + silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order ); + } else { + /* Copy LPC coefficients for first half from second half */ + silk_memcpy( psDecCtrl->PredCoef_Q12[ 0 ], psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order * sizeof( opus_int16 ) ); + } + + silk_memcpy( psDec->prevNLSF_Q15, pNLSF_Q15, psDec->LPC_order * sizeof( opus_int16 ) ); + + /* After a packet loss do BWE of LPC coefs */ + if( psDec->lossCnt ) { + silk_bwexpander( psDecCtrl->PredCoef_Q12[ 0 ], psDec->LPC_order, BWE_AFTER_LOSS_Q16 ); + silk_bwexpander( psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order, BWE_AFTER_LOSS_Q16 ); + } + + if( psDec->indices.signalType == TYPE_VOICED ) { + /*********************/ + /* Decode pitch lags */ + /*********************/ + + /* Decode pitch values */ + silk_decode_pitch( psDec->indices.lagIndex, psDec->indices.contourIndex, psDecCtrl->pitchL, psDec->fs_kHz, psDec->nb_subfr ); + + /* Decode Codebook Index */ + cbk_ptr_Q7 = silk_LTP_vq_ptrs_Q7[ psDec->indices.PERIndex ]; /* set pointer to start of codebook */ + + for( k = 0; k < psDec->nb_subfr; k++ ) { + Ix = psDec->indices.LTPIndex[ k ]; + for( i = 0; i < LTP_ORDER; i++ ) { + psDecCtrl->LTPCoef_Q14[ k * LTP_ORDER + i ] = silk_LSHIFT( cbk_ptr_Q7[ Ix * LTP_ORDER + i ], 7 ); + } + } + + /**********************/ + /* Decode LTP scaling */ + /**********************/ + Ix = psDec->indices.LTP_scaleIndex; + psDecCtrl->LTP_scale_Q14 = silk_LTPScales_table_Q14[ Ix ]; + } else { + silk_memset( psDecCtrl->pitchL, 0, psDec->nb_subfr * sizeof( opus_int ) ); + silk_memset( psDecCtrl->LTPCoef_Q14, 0, LTP_ORDER * psDec->nb_subfr * sizeof( opus_int16 ) ); + psDec->indices.PERIndex = 0; + psDecCtrl->LTP_scale_Q14 = 0; + } +} diff --git a/thirdparty/opus/silk/decode_pitch.c b/thirdparty/opus/silk/decode_pitch.c new file mode 100644 index 000000000000..fedbc6a52557 --- 
/dev/null +++ b/thirdparty/opus/silk/decode_pitch.c @@ -0,0 +1,77 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/*********************************************************** +* Pitch analyser function +********************************************************** */ +#include "SigProc_FIX.h" +#include "pitch_est_defines.h" + +void silk_decode_pitch( + opus_int16 lagIndex, /* I */ + opus_int8 contourIndex, /* O */ + opus_int pitch_lags[], /* O 4 pitch values */ + const opus_int Fs_kHz, /* I sampling frequency (kHz) */ + const opus_int nb_subfr /* I number of sub frames */ +) +{ + opus_int lag, k, min_lag, max_lag, cbk_size; + const opus_int8 *Lag_CB_ptr; + + if( Fs_kHz == 8 ) { + if( nb_subfr == PE_MAX_NB_SUBFR ) { + Lag_CB_ptr = &silk_CB_lags_stage2[ 0 ][ 0 ]; + cbk_size = PE_NB_CBKS_STAGE2_EXT; + } else { + silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1 ); + Lag_CB_ptr = &silk_CB_lags_stage2_10_ms[ 0 ][ 0 ]; + cbk_size = PE_NB_CBKS_STAGE2_10MS; + } + } else { + if( nb_subfr == PE_MAX_NB_SUBFR ) { + Lag_CB_ptr = &silk_CB_lags_stage3[ 0 ][ 0 ]; + cbk_size = PE_NB_CBKS_STAGE3_MAX; + } else { + silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1 ); + Lag_CB_ptr = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ]; + cbk_size = PE_NB_CBKS_STAGE3_10MS; + } + } + + min_lag = silk_SMULBB( PE_MIN_LAG_MS, Fs_kHz ); + max_lag = silk_SMULBB( PE_MAX_LAG_MS, Fs_kHz ); + lag = min_lag + lagIndex; + + for( k = 0; k < nb_subfr; k++ ) { + pitch_lags[ k ] = lag + matrix_ptr( Lag_CB_ptr, k, contourIndex, cbk_size ); + pitch_lags[ k ] = silk_LIMIT( pitch_lags[ k ], min_lag, max_lag ); + } +} diff --git a/thirdparty/opus/silk/decode_pulses.c b/thirdparty/opus/silk/decode_pulses.c new file mode 100644 index 000000000000..d6bbec922520 --- /dev/null +++ b/thirdparty/opus/silk/decode_pulses.c @@ -0,0 +1,115 @@ +/*********************************************************************** 
+Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/*********************************************/ +/* Decode quantization indices of excitation */ +/*********************************************/ +void silk_decode_pulses( + ec_dec *psRangeDec, /* I/O Compressor data structure */ + opus_int16 pulses[], /* O Excitation signal */ + const opus_int signalType, /* I Sigtype */ + const opus_int quantOffsetType, /* I quantOffsetType */ + const opus_int frame_length /* I Frame length */ +) +{ + opus_int i, j, k, iter, abs_q, nLS, RateLevelIndex; + opus_int sum_pulses[ MAX_NB_SHELL_BLOCKS ], nLshifts[ MAX_NB_SHELL_BLOCKS ]; + opus_int16 *pulses_ptr; + const opus_uint8 *cdf_ptr; + + /*********************/ + /* Decode rate level */ + /*********************/ + RateLevelIndex = ec_dec_icdf( psRangeDec, silk_rate_levels_iCDF[ signalType >> 1 ], 8 ); + + /* Calculate number of shell blocks */ + silk_assert( 1 << LOG2_SHELL_CODEC_FRAME_LENGTH == SHELL_CODEC_FRAME_LENGTH ); + iter = silk_RSHIFT( frame_length, LOG2_SHELL_CODEC_FRAME_LENGTH ); + if( iter * SHELL_CODEC_FRAME_LENGTH < frame_length ) { + silk_assert( frame_length == 12 * 10 ); /* Make sure only happens for 10 ms @ 12 kHz */ + iter++; + } + + /***************************************************/ + /* Sum-Weighted-Pulses Decoding */ + /***************************************************/ + cdf_ptr = silk_pulses_per_block_iCDF[ RateLevelIndex ]; + for( i = 0; i < iter; i++ ) { + nLshifts[ i ] = 0; + sum_pulses[ i ] = ec_dec_icdf( psRangeDec, cdf_ptr, 8 ); + + /* LSB indication */ + while( sum_pulses[ i ] == SILK_MAX_PULSES + 1 ) { + nLshifts[ i ]++; + /* When we've already got 10 LSBs, we shift the table to not allow (SILK_MAX_PULSES + 1) */ + sum_pulses[ i ] = ec_dec_icdf( psRangeDec, + silk_pulses_per_block_iCDF[ N_RATE_LEVELS - 1] + ( nLshifts[ i ] == 10 ), 8 ); + } + } + + /***************************************************/ + /* Shell decoding 
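(the pulse total of each 16-sample block is recursively split in halves down to per-sample counts)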
*/ + /***************************************************/ + for( i = 0; i < iter; i++ ) { + if( sum_pulses[ i ] > 0 ) { + silk_shell_decoder( &pulses[ silk_SMULBB( i, SHELL_CODEC_FRAME_LENGTH ) ], psRangeDec, sum_pulses[ i ] ); + } else { + silk_memset( &pulses[ silk_SMULBB( i, SHELL_CODEC_FRAME_LENGTH ) ], 0, SHELL_CODEC_FRAME_LENGTH * sizeof( pulses[0] ) ); + } + } + + /***************************************************/ + /* LSB Decoding */ + /***************************************************/ + for( i = 0; i < iter; i++ ) { + if( nLshifts[ i ] > 0 ) { + nLS = nLshifts[ i ]; + pulses_ptr = &pulses[ silk_SMULBB( i, SHELL_CODEC_FRAME_LENGTH ) ]; + for( k = 0; k < SHELL_CODEC_FRAME_LENGTH; k++ ) { + abs_q = pulses_ptr[ k ]; + for( j = 0; j < nLS; j++ ) { + abs_q = silk_LSHIFT( abs_q, 1 ); + abs_q += ec_dec_icdf( psRangeDec, silk_lsb_iCDF, 8 ); + } + pulses_ptr[ k ] = abs_q; + } + /* Mark the number of pulses non-zero for sign decoding. */ + sum_pulses[ i ] |= nLS << 5; + } + } + + /****************************************/ + /* Decode and add signs to pulse signal */ + /****************************************/ + silk_decode_signs( psRangeDec, pulses, frame_length, signalType, quantOffsetType, sum_pulses ); +} diff --git a/thirdparty/opus/silk/decoder_set_fs.c b/thirdparty/opus/silk/decoder_set_fs.c new file mode 100644 index 000000000000..eef0fd25e183 --- /dev/null +++ b/thirdparty/opus/silk/decoder_set_fs.c @@ -0,0 +1,108 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Set decoder sampling rate */ +opus_int silk_decoder_set_fs( + silk_decoder_state *psDec, /* I/O Decoder state pointer */ + opus_int fs_kHz, /* I Sampling frequency (kHz) */ + opus_int32 fs_API_Hz /* I API Sampling frequency (Hz) */ +) +{ + opus_int frame_length, ret = 0; + + silk_assert( fs_kHz == 8 || fs_kHz == 12 || fs_kHz == 16 ); + silk_assert( psDec->nb_subfr == MAX_NB_SUBFR || psDec->nb_subfr == MAX_NB_SUBFR/2 ); + + /* New (sub)frame length */ + psDec->subfr_length = silk_SMULBB( SUB_FRAME_LENGTH_MS, fs_kHz ); + frame_length = silk_SMULBB( psDec->nb_subfr, psDec->subfr_length ); + + /* Initialize resampler when switching internal or external sampling frequency */ + if( psDec->fs_kHz != fs_kHz || psDec->fs_API_hz != fs_API_Hz ) { + /* Initialize the resampler for dec_API.c preparing resampling from fs_kHz to API_fs_Hz */ + ret += silk_resampler_init( &psDec->resampler_state, silk_SMULBB( fs_kHz, 1000 ), fs_API_Hz, 0 ); + + psDec->fs_API_hz = fs_API_Hz; + } + + if( psDec->fs_kHz != fs_kHz || frame_length != psDec->frame_length ) { + if( fs_kHz == 8 ) { + if( psDec->nb_subfr == MAX_NB_SUBFR ) { + psDec->pitch_contour_iCDF = silk_pitch_contour_NB_iCDF; + } else { + psDec->pitch_contour_iCDF = silk_pitch_contour_10_ms_NB_iCDF; + } + } else { + if( psDec->nb_subfr == MAX_NB_SUBFR ) { + psDec->pitch_contour_iCDF = silk_pitch_contour_iCDF; + } else { + psDec->pitch_contour_iCDF = silk_pitch_contour_10_ms_iCDF; + } + } + if( psDec->fs_kHz != fs_kHz ) { + psDec->ltp_mem_length = silk_SMULBB( LTP_MEM_LENGTH_MS, fs_kHz ); + if( fs_kHz == 8 || fs_kHz == 12 ) { + psDec->LPC_order = MIN_LPC_ORDER; + psDec->psNLSF_CB = &silk_NLSF_CB_NB_MB; + } else { + psDec->LPC_order = MAX_LPC_ORDER; + psDec->psNLSF_CB = &silk_NLSF_CB_WB; + } + if( fs_kHz == 16 ) { + psDec->pitch_lag_low_bits_iCDF = silk_uniform8_iCDF; + } else if( fs_kHz == 12 ) { + psDec->pitch_lag_low_bits_iCDF = silk_uniform6_iCDF; + } else if( fs_kHz == 8 ) { + psDec->pitch_lag_low_bits_iCDF = silk_uniform4_iCDF; + } else { + /* unsupported sampling rate */ + silk_assert( 0 ); + } + psDec->first_frame_after_reset = 1; + psDec->lagPrev = 100; + psDec->LastGainIndex = 10; + psDec->prevSignalType = TYPE_NO_VOICE_ACTIVITY; + silk_memset( psDec->outBuf, 0, sizeof(psDec->outBuf)); + silk_memset( psDec->sLPC_Q14_buf, 0, sizeof(psDec->sLPC_Q14_buf) ); + } + + psDec->fs_kHz = fs_kHz; + psDec->frame_length = frame_length; + } + + /* Check that settings are valid */ + silk_assert( psDec->frame_length > 0 && psDec->frame_length <= MAX_FRAME_LENGTH ); + + return ret; +} + diff --git a/thirdparty/opus/silk/define.h b/thirdparty/opus/silk/define.h new file mode 100644 index 000000000000..19c9b00e259e --- /dev/null +++ b/thirdparty/opus/silk/define.h @@ -0,0 +1,235 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. 
+- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_DEFINE_H +#define SILK_DEFINE_H + +#include "errors.h" +#include "typedef.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* Max number of encoder channels (1/2) */ +#define ENCODER_NUM_CHANNELS 2 +/* Number of decoder channels (1/2) */ +#define DECODER_NUM_CHANNELS 2 + +#define MAX_FRAMES_PER_PACKET 3 + +/* Limits on bitrate */ +#define MIN_TARGET_RATE_BPS 5000 +#define MAX_TARGET_RATE_BPS 80000 +#define TARGET_RATE_TAB_SZ 8 + +/* LBRR thresholds */ +#define LBRR_NB_MIN_RATE_BPS 12000 +#define LBRR_MB_MIN_RATE_BPS 14000 +#define LBRR_WB_MIN_RATE_BPS 16000 + +/* DTX settings */ +#define NB_SPEECH_FRAMES_BEFORE_DTX 10 /* eq 200 ms */ +#define MAX_CONSECUTIVE_DTX 20 /* eq 400 ms */ + +/* Maximum sampling frequency */ +#define MAX_FS_KHZ 16 +#define MAX_API_FS_KHZ 48 + +/* Signal types */ +#define TYPE_NO_VOICE_ACTIVITY 0 +#define TYPE_UNVOICED 1 +#define TYPE_VOICED 2 + +/* Conditional coding types */ +#define CODE_INDEPENDENTLY 0 +#define CODE_INDEPENDENTLY_NO_LTP_SCALING 1 +#define CODE_CONDITIONALLY 2 + +/* Settings for stereo processing */ +#define STEREO_QUANT_TAB_SIZE 16 +#define STEREO_QUANT_SUB_STEPS 5 +#define STEREO_INTERP_LEN_MS 8 /* must be even */ +#define STEREO_RATIO_SMOOTH_COEF 0.01 /* smoothing coef for signal norms and stereo width */ + +/* Range of pitch lag estimates */ +#define PITCH_EST_MIN_LAG_MS 2 /* 2 ms -> 500 Hz */ +#define PITCH_EST_MAX_LAG_MS 18 /* 18 ms -> 56 Hz */ + +/* Maximum number of subframes */ +#define MAX_NB_SUBFR 4 + +/* Number of samples per frame */ +#define LTP_MEM_LENGTH_MS 20 +#define SUB_FRAME_LENGTH_MS 5 +#define MAX_SUB_FRAME_LENGTH ( SUB_FRAME_LENGTH_MS * MAX_FS_KHZ ) +#define MAX_FRAME_LENGTH_MS ( SUB_FRAME_LENGTH_MS * MAX_NB_SUBFR ) +#define MAX_FRAME_LENGTH ( MAX_FRAME_LENGTH_MS * MAX_FS_KHZ ) + +/* Milliseconds of lookahead for pitch analysis */ +#define LA_PITCH_MS 2 +#define LA_PITCH_MAX ( LA_PITCH_MS * MAX_FS_KHZ ) + +/* Order of LPC used in find pitch */ +#define MAX_FIND_PITCH_LPC_ORDER 16 + +/* Length of LPC window used in find pitch */ +#define FIND_PITCH_LPC_WIN_MS ( 20 + (LA_PITCH_MS << 1) ) +#define FIND_PITCH_LPC_WIN_MS_2_SF ( 10 + (LA_PITCH_MS << 1) ) +#define FIND_PITCH_LPC_WIN_MAX ( FIND_PITCH_LPC_WIN_MS * MAX_FS_KHZ ) + +/* Milliseconds of lookahead for noise shape analysis */ +#define LA_SHAPE_MS 5 +#define LA_SHAPE_MAX ( LA_SHAPE_MS * MAX_FS_KHZ ) + +/* Maximum length of LPC window used in noise shape analysis */ +#define SHAPE_LPC_WIN_MAX ( 15 * MAX_FS_KHZ ) + +/* dB 
level of lowest gain quantization level */ +#define MIN_QGAIN_DB 2 +/* dB level of highest gain quantization level */ +#define MAX_QGAIN_DB 88 +/* Number of gain quantization levels */ +#define N_LEVELS_QGAIN 64 +/* Max increase in gain quantization index */ +#define MAX_DELTA_GAIN_QUANT 36 +/* Max decrease in gain quantization index */ +#define MIN_DELTA_GAIN_QUANT -4 + +/* Quantization offsets (multiples of 4) */ +#define OFFSET_VL_Q10 32 +#define OFFSET_VH_Q10 100 +#define OFFSET_UVL_Q10 100 +#define OFFSET_UVH_Q10 240 + +#define QUANT_LEVEL_ADJUST_Q10 80 + +/* Maximum numbers of iterations used to stabilize an LPC vector */ +#define MAX_LPC_STABILIZE_ITERATIONS 16 +#define MAX_PREDICTION_POWER_GAIN 1e4f +#define MAX_PREDICTION_POWER_GAIN_AFTER_RESET 1e2f + +#define MAX_LPC_ORDER 16 +#define MIN_LPC_ORDER 10 + +/* Find Pred Coef defines */ +#define LTP_ORDER 5 + +/* LTP quantization settings */ +#define NB_LTP_CBKS 3 + +/* Flag to use harmonic noise shaping */ +#define USE_HARM_SHAPING 1 + +/* Max LPC order of noise shaping filters */ +#define MAX_SHAPE_LPC_ORDER 16 + +#define HARM_SHAPE_FIR_TAPS 3 + +/* Maximum number of delayed decision states */ +#define MAX_DEL_DEC_STATES 4 + +#define LTP_BUF_LENGTH 512 +#define LTP_MASK ( LTP_BUF_LENGTH - 1 ) + +#define DECISION_DELAY 32 +#define DECISION_DELAY_MASK ( DECISION_DELAY - 1 ) + +/* Number of subframes for excitation entropy coding */ +#define SHELL_CODEC_FRAME_LENGTH 16 +#define LOG2_SHELL_CODEC_FRAME_LENGTH 4 +#define MAX_NB_SHELL_BLOCKS ( MAX_FRAME_LENGTH / SHELL_CODEC_FRAME_LENGTH ) + +/* Number of rate levels, for entropy coding of excitation */ +#define N_RATE_LEVELS 10 + +/* Maximum sum of pulses per shell coding frame */ +#define SILK_MAX_PULSES 16 + +#define MAX_MATRIX_SIZE MAX_LPC_ORDER /* Max of LPC Order and LTP order */ + +#if( MAX_LPC_ORDER > DECISION_DELAY ) +# define NSQ_LPC_BUF_LENGTH MAX_LPC_ORDER +#else +# define NSQ_LPC_BUF_LENGTH DECISION_DELAY +#endif + +/***************************/ +/* Voice activity detector */ +/***************************/ +#define VAD_N_BANDS 4 + +#define VAD_INTERNAL_SUBFRAMES_LOG2 2 +#define VAD_INTERNAL_SUBFRAMES ( 1 << VAD_INTERNAL_SUBFRAMES_LOG2 ) + +#define VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 1024 /* Must be < 4096 */ +#define VAD_NOISE_LEVELS_BIAS 50 + +/* Sigmoid settings */ +#define VAD_NEGATIVE_OFFSET_Q5 128 /* sigmoid is 0 at -128 */ +#define VAD_SNR_FACTOR_Q16 45000 + +/* smoothing for SNR measurement */ +#define VAD_SNR_SMOOTH_COEF_Q18 4096 + +/* Size of the piecewise linear cosine approximation table for the LSFs */ +#define LSF_COS_TAB_SZ_FIX 128 + +/******************/ +/* NLSF quantizer */ +/******************/ +#define NLSF_W_Q 2 +#define NLSF_VQ_MAX_VECTORS 32 +#define NLSF_VQ_MAX_SURVIVORS 32 +#define NLSF_QUANT_MAX_AMPLITUDE 4 +#define NLSF_QUANT_MAX_AMPLITUDE_EXT 10 +#define NLSF_QUANT_LEVEL_ADJ 0.1 +#define NLSF_QUANT_DEL_DEC_STATES_LOG2 2 +#define NLSF_QUANT_DEL_DEC_STATES ( 1 << NLSF_QUANT_DEL_DEC_STATES_LOG2 ) + +/* Transition filtering for mode switching */ +#define TRANSITION_TIME_MS 5120 /* 5120 = 64 * FRAME_LENGTH_MS * ( TRANSITION_INT_NUM - 1 ) = 64*(20*4)*/ +#define TRANSITION_NB 3 /* Hardcoded in tables */ +#define TRANSITION_NA 2 /* Hardcoded in tables */ +#define TRANSITION_INT_NUM 5 /* Hardcoded in tables */ +#define TRANSITION_FRAMES ( TRANSITION_TIME_MS / MAX_FRAME_LENGTH_MS ) +#define TRANSITION_INT_STEPS ( TRANSITION_FRAMES / ( TRANSITION_INT_NUM - 1 ) ) + +/* BWE factors to apply after packet loss */ +#define BWE_AFTER_LOSS_Q16 63570 + +/* Defines for CN 
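(comfort noise)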
generation */ +#define CNG_BUF_MASK_MAX 255 /* 2^floor(log2(MAX_FRAME_LENGTH))-1 */ +#define CNG_GAIN_SMTH_Q16 4634 /* 0.25^(1/4) */ +#define CNG_NLSF_SMTH_Q16 16348 /* 0.25 */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/silk/enc_API.c b/thirdparty/opus/silk/enc_API.c new file mode 100644 index 000000000000..f8060286dbaf --- /dev/null +++ b/thirdparty/opus/silk/enc_API.c @@ -0,0 +1,563 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include "define.h" +#include "API.h" +#include "control.h" +#include "typedef.h" +#include "stack_alloc.h" +#include "structs.h" +#include "tuning_parameters.h" +#ifdef FIXED_POINT +#include "main_FIX.h" +#else +#include "main_FLP.h" +#endif + +/***************************************/ +/* Read control structure from encoder */ +/***************************************/ +static opus_int silk_QueryEncoder( /* O Returns error code */ + const void *encState, /* I State */ + silk_EncControlStruct *encStatus /* O Encoder Status */ +); + +/****************************************/ +/* Encoder functions */ +/****************************************/ + +opus_int silk_Get_Encoder_Size( /* O Returns error code */ + opus_int *encSizeBytes /* O Number of bytes in SILK encoder state */ +) +{ + opus_int ret = SILK_NO_ERROR; + + *encSizeBytes = sizeof( silk_encoder ); + + return ret; +} + +/*************************/ +/* Init or Reset encoder */ +/*************************/ +opus_int silk_InitEncoder( /* O Returns error code */ + void *encState, /* I/O State */ + int arch, /* I Run-time architecture */ + silk_EncControlStruct *encStatus /* O Encoder Status */ +) +{ + silk_encoder *psEnc; + opus_int n, ret = SILK_NO_ERROR; + + psEnc = (silk_encoder *)encState; + + /* Reset encoder */ + silk_memset( psEnc, 0, sizeof( silk_encoder ) ); + for( n = 0; n < ENCODER_NUM_CHANNELS; n++ ) { + if( ret += silk_init_encoder( &psEnc->state_Fxx[ n ], arch ) ) { + silk_assert( 0 ); + } + } + + psEnc->nChannelsAPI = 1; + psEnc->nChannelsInternal = 1; + + /* Read control structure */ + if( ret += silk_QueryEncoder( encState, encStatus ) ) { + silk_assert( 0 ); + } + + return ret; +} + +/***************************************/ +/* Read control structure from encoder */ +/***************************************/ +static opus_int silk_QueryEncoder( /* O Returns error code */ + const void *encState, /* I State */ + silk_EncControlStruct *encStatus /* O Encoder Status */ +) +{ + opus_int ret = SILK_NO_ERROR; + silk_encoder_state_Fxx *state_Fxx; + silk_encoder *psEnc = (silk_encoder *)encState; + + state_Fxx = psEnc->state_Fxx; + + encStatus->nChannelsAPI = psEnc->nChannelsAPI; + encStatus->nChannelsInternal = psEnc->nChannelsInternal; + encStatus->API_sampleRate = state_Fxx[ 0 ].sCmn.API_fs_Hz; + encStatus->maxInternalSampleRate = state_Fxx[ 0 ].sCmn.maxInternal_fs_Hz; + encStatus->minInternalSampleRate = state_Fxx[ 0 ].sCmn.minInternal_fs_Hz; + encStatus->desiredInternalSampleRate = state_Fxx[ 0 ].sCmn.desiredInternal_fs_Hz; + encStatus->payloadSize_ms = state_Fxx[ 0 ].sCmn.PacketSize_ms; + encStatus->bitRate = state_Fxx[ 0 ].sCmn.TargetRate_bps; + encStatus->packetLossPercentage = state_Fxx[ 0 ].sCmn.PacketLoss_perc; + encStatus->complexity = state_Fxx[ 0 ].sCmn.Complexity; + encStatus->useInBandFEC = state_Fxx[ 0 ].sCmn.useInBandFEC; + encStatus->useDTX = state_Fxx[ 0 ].sCmn.useDTX; + encStatus->useCBR = state_Fxx[ 0 ].sCmn.useCBR; + encStatus->internalSampleRate = silk_SMULBB( state_Fxx[ 0 ].sCmn.fs_kHz, 1000 ); + encStatus->allowBandwidthSwitch = state_Fxx[ 0 ].sCmn.allow_bandwidth_switch; + encStatus->inWBmodeWithoutVariableLP = state_Fxx[ 0 ].sCmn.fs_kHz == 16 && state_Fxx[ 0 ].sCmn.sLP.mode == 0; + + return ret; +} + + +/**************************/ +/* Encode frame with Silk */ +/**************************/ +/* Note: if prefillFlag is set, the input must contain 10 ms of audio, irrespective 
of what */ +/* encControl->payloadSize_ms is set to */ +opus_int silk_Encode( /* O Returns error code */ + void *encState, /* I/O State */ + silk_EncControlStruct *encControl, /* I Control status */ + const opus_int16 *samplesIn, /* I Speech sample input vector */ + opus_int nSamplesIn, /* I Number of samples in input vector */ + ec_enc *psRangeEnc, /* I/O Compressor data structure */ + opus_int32 *nBytesOut, /* I/O Number of bytes in payload (input: Max bytes) */ + const opus_int prefillFlag /* I Flag to indicate prefilling buffers no coding */ +) +{ + opus_int n, i, nBits, flags, tmp_payloadSize_ms = 0, tmp_complexity = 0, ret = 0; + opus_int nSamplesToBuffer, nSamplesToBufferMax, nBlocksOf10ms; + opus_int nSamplesFromInput = 0, nSamplesFromInputMax; + opus_int speech_act_thr_for_switch_Q8; + opus_int32 TargetRate_bps, MStargetRates_bps[ 2 ], channelRate_bps, LBRR_symbol, sum; + silk_encoder *psEnc = ( silk_encoder * )encState; + VARDECL( opus_int16, buf ); + opus_int transition, curr_block, tot_blocks; + SAVE_STACK; + + if (encControl->reducedDependency) + { + psEnc->state_Fxx[0].sCmn.first_frame_after_reset = 1; + psEnc->state_Fxx[1].sCmn.first_frame_after_reset = 1; + } + psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded = psEnc->state_Fxx[ 1 ].sCmn.nFramesEncoded = 0; + + /* Check values in encoder control structure */ + if( ( ret = check_control_input( encControl ) ) != 0 ) { + silk_assert( 0 ); + RESTORE_STACK; + return ret; + } + + encControl->switchReady = 0; + + if( encControl->nChannelsInternal > psEnc->nChannelsInternal ) { + /* Mono -> Stereo transition: init state of second channel and stereo state */ + ret += silk_init_encoder( &psEnc->state_Fxx[ 1 ], psEnc->state_Fxx[ 0 ].sCmn.arch ); + silk_memset( psEnc->sStereo.pred_prev_Q13, 0, sizeof( psEnc->sStereo.pred_prev_Q13 ) ); + silk_memset( psEnc->sStereo.sSide, 0, sizeof( psEnc->sStereo.sSide ) ); + psEnc->sStereo.mid_side_amp_Q0[ 0 ] = 0; + psEnc->sStereo.mid_side_amp_Q0[ 1 ] = 1; + psEnc->sStereo.mid_side_amp_Q0[ 2 ] = 0; + psEnc->sStereo.mid_side_amp_Q0[ 3 ] = 1; + psEnc->sStereo.width_prev_Q14 = 0; + psEnc->sStereo.smth_width_Q14 = SILK_FIX_CONST( 1, 14 ); + if( psEnc->nChannelsAPI == 2 ) { + silk_memcpy( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state, &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, sizeof( silk_resampler_state_struct ) ); + silk_memcpy( &psEnc->state_Fxx[ 1 ].sCmn.In_HP_State, &psEnc->state_Fxx[ 0 ].sCmn.In_HP_State, sizeof( psEnc->state_Fxx[ 1 ].sCmn.In_HP_State ) ); + } + } + + transition = (encControl->payloadSize_ms != psEnc->state_Fxx[ 0 ].sCmn.PacketSize_ms) || (psEnc->nChannelsInternal != encControl->nChannelsInternal); + + psEnc->nChannelsAPI = encControl->nChannelsAPI; + psEnc->nChannelsInternal = encControl->nChannelsInternal; + + nBlocksOf10ms = silk_DIV32( 100 * nSamplesIn, encControl->API_sampleRate ); + tot_blocks = ( nBlocksOf10ms > 1 ) ? 
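/* editor's note: tot_blocks is the number of 20 ms frames in the packet, at least one; e.g. a 60 ms packet gives nBlocksOf10ms = 6 -> tot_blocks = 3 */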
nBlocksOf10ms >> 1 : 1; + curr_block = 0; + if( prefillFlag ) { + /* Only accept input length of 10 ms */ + if( nBlocksOf10ms != 1 ) { + silk_assert( 0 ); + RESTORE_STACK; + return SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES; + } + /* Reset Encoder */ + for( n = 0; n < encControl->nChannelsInternal; n++ ) { + ret = silk_init_encoder( &psEnc->state_Fxx[ n ], psEnc->state_Fxx[ n ].sCmn.arch ); + silk_assert( !ret ); + } + tmp_payloadSize_ms = encControl->payloadSize_ms; + encControl->payloadSize_ms = 10; + tmp_complexity = encControl->complexity; + encControl->complexity = 0; + for( n = 0; n < encControl->nChannelsInternal; n++ ) { + psEnc->state_Fxx[ n ].sCmn.controlled_since_last_payload = 0; + psEnc->state_Fxx[ n ].sCmn.prefillFlag = 1; + } + } else { + /* Only accept input lengths that are a multiple of 10 ms */ + if( nBlocksOf10ms * encControl->API_sampleRate != 100 * nSamplesIn || nSamplesIn < 0 ) { + silk_assert( 0 ); + RESTORE_STACK; + return SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES; + } + /* Make sure no more than one packet can be produced */ + if( 1000 * (opus_int32)nSamplesIn > encControl->payloadSize_ms * encControl->API_sampleRate ) { + silk_assert( 0 ); + RESTORE_STACK; + return SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES; + } + } + + TargetRate_bps = silk_RSHIFT32( encControl->bitRate, encControl->nChannelsInternal - 1 ); + for( n = 0; n < encControl->nChannelsInternal; n++ ) { + /* Force the side channel to the same rate as the mid */ + opus_int force_fs_kHz = (n==1) ? psEnc->state_Fxx[0].sCmn.fs_kHz : 0; + if( ( ret = silk_control_encoder( &psEnc->state_Fxx[ n ], encControl, TargetRate_bps, psEnc->allowBandwidthSwitch, n, force_fs_kHz ) ) != 0 ) { + silk_assert( 0 ); + RESTORE_STACK; + return ret; + } + if( psEnc->state_Fxx[n].sCmn.first_frame_after_reset || transition ) { + for( i = 0; i < psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket; i++ ) { + psEnc->state_Fxx[ n ].sCmn.LBRR_flags[ i ] = 0; + } + } + psEnc->state_Fxx[ n ].sCmn.inDTX = psEnc->state_Fxx[ n ].sCmn.useDTX; + } + silk_assert( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 0 ].sCmn.fs_kHz == psEnc->state_Fxx[ 1 ].sCmn.fs_kHz ); + + /* Input buffering/resampling and encoding */ + nSamplesToBufferMax = + 10 * nBlocksOf10ms * psEnc->state_Fxx[ 0 ].sCmn.fs_kHz; + nSamplesFromInputMax = + silk_DIV32_16( nSamplesToBufferMax * + psEnc->state_Fxx[ 0 ].sCmn.API_fs_Hz, + psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 ); + ALLOC( buf, nSamplesFromInputMax, opus_int16 ); + while( 1 ) { + nSamplesToBuffer = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx; + nSamplesToBuffer = silk_min( nSamplesToBuffer, nSamplesToBufferMax ); + nSamplesFromInput = silk_DIV32_16( nSamplesToBuffer * psEnc->state_Fxx[ 0 ].sCmn.API_fs_Hz, psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 ); + /* Resample and write to buffer */ + if( encControl->nChannelsAPI == 2 && encControl->nChannelsInternal == 2 ) { + opus_int id = psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded; + for( n = 0; n < nSamplesFromInput; n++ ) { + buf[ n ] = samplesIn[ 2 * n ]; + } + /* Making sure to start both resamplers from the same state when switching from mono to stereo */ + if( psEnc->nPrevChannelsInternal == 1 && id==0 ) { + silk_memcpy( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state, &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, sizeof(psEnc->state_Fxx[ 1 ].sCmn.resampler_state)); + } + + ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, + &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput ); + 
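/* inputBuf keeps two samples of stereo history (sMid/sSide, copied in by the buffering code below) at indices 0..1, so the resampler output starts at inputBufIx + 2 */ +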
psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer; + + nSamplesToBuffer = psEnc->state_Fxx[ 1 ].sCmn.frame_length - psEnc->state_Fxx[ 1 ].sCmn.inputBufIx; + nSamplesToBuffer = silk_min( nSamplesToBuffer, 10 * nBlocksOf10ms * psEnc->state_Fxx[ 1 ].sCmn.fs_kHz ); + for( n = 0; n < nSamplesFromInput; n++ ) { + buf[ n ] = samplesIn[ 2 * n + 1 ]; + } + ret += silk_resampler( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state, + &psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ psEnc->state_Fxx[ 1 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput ); + + psEnc->state_Fxx[ 1 ].sCmn.inputBufIx += nSamplesToBuffer; + } else if( encControl->nChannelsAPI == 2 && encControl->nChannelsInternal == 1 ) { + /* Combine left and right channels before resampling */ + for( n = 0; n < nSamplesFromInput; n++ ) { + sum = samplesIn[ 2 * n ] + samplesIn[ 2 * n + 1 ]; + buf[ n ] = (opus_int16)silk_RSHIFT_ROUND( sum, 1 ); + } + ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, + &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput ); + /* On the first mono frame, average the results for the two resampler states */ + if( psEnc->nPrevChannelsInternal == 2 && psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded == 0 ) { + ret += silk_resampler( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state, + &psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ psEnc->state_Fxx[ 1 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput ); + for( n = 0; n < psEnc->state_Fxx[ 0 ].sCmn.frame_length; n++ ) { + psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx+n+2 ] = + silk_RSHIFT(psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx+n+2 ] + + psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ psEnc->state_Fxx[ 1 ].sCmn.inputBufIx+n+2 ], 1); + } + } + psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer; + } else { + silk_assert( encControl->nChannelsAPI == 1 && encControl->nChannelsInternal == 1 ); + silk_memcpy(buf, samplesIn, nSamplesFromInput*sizeof(opus_int16)); + ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, + &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput ); + psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer; + } + + samplesIn += nSamplesFromInput * encControl->nChannelsAPI; + nSamplesIn -= nSamplesFromInput; + + /* Default */ + psEnc->allowBandwidthSwitch = 0; + + /* Silk encoder */ + if( psEnc->state_Fxx[ 0 ].sCmn.inputBufIx >= psEnc->state_Fxx[ 0 ].sCmn.frame_length ) { + /* Enough data in input buffer, so encode */ + silk_assert( psEnc->state_Fxx[ 0 ].sCmn.inputBufIx == psEnc->state_Fxx[ 0 ].sCmn.frame_length ); + silk_assert( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 1 ].sCmn.inputBufIx == psEnc->state_Fxx[ 1 ].sCmn.frame_length ); + + /* Deal with LBRR data */ + if( psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded == 0 && !prefillFlag ) { + /* Create space at start of payload for VAD and FEC flags */ + opus_uint8 iCDF[ 2 ] = { 0, 0 }; + iCDF[ 0 ] = 256 - silk_RSHIFT( 256, ( psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket + 1 ) * encControl->nChannelsInternal ); + ec_enc_icdf( psRangeEnc, 0, iCDF, 8 ); + + /* Encode any LBRR data from previous packet */ + /* Encode LBRR flags */ + for( n = 0; n < encControl->nChannelsInternal; n++ ) { + LBRR_symbol = 0; + for( i = 0; i < psEnc->state_Fxx[ n ].sCmn.nFramesPerPacket; i++ ) { + LBRR_symbol |= silk_LSHIFT( psEnc->state_Fxx[ n ].sCmn.LBRR_flags[ i ], i ); + } + psEnc->state_Fxx[ n ].sCmn.LBRR_flag = LBRR_symbol > 0 ? 
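/* LBRR: low bit-rate redundancy, a lower-rate re-encoding of recent frames carried as in-band FEC */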
1 : 0; + if( LBRR_symbol && psEnc->state_Fxx[ n ].sCmn.nFramesPerPacket > 1 ) { + ec_enc_icdf( psRangeEnc, LBRR_symbol - 1, silk_LBRR_flags_iCDF_ptr[ psEnc->state_Fxx[ n ].sCmn.nFramesPerPacket - 2 ], 8 ); + } + } + + /* Code LBRR indices and excitation signals */ + for( i = 0; i < psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket; i++ ) { + for( n = 0; n < encControl->nChannelsInternal; n++ ) { + if( psEnc->state_Fxx[ n ].sCmn.LBRR_flags[ i ] ) { + opus_int condCoding; + + if( encControl->nChannelsInternal == 2 && n == 0 ) { + silk_stereo_encode_pred( psRangeEnc, psEnc->sStereo.predIx[ i ] ); + /* For LBRR data there's no need to code the mid-only flag if the side-channel LBRR flag is set */ + if( psEnc->state_Fxx[ 1 ].sCmn.LBRR_flags[ i ] == 0 ) { + silk_stereo_encode_mid_only( psRangeEnc, psEnc->sStereo.mid_only_flags[ i ] ); + } + } + /* Use conditional coding if previous frame available */ + if( i > 0 && psEnc->state_Fxx[ n ].sCmn.LBRR_flags[ i - 1 ] ) { + condCoding = CODE_CONDITIONALLY; + } else { + condCoding = CODE_INDEPENDENTLY; + } + silk_encode_indices( &psEnc->state_Fxx[ n ].sCmn, psRangeEnc, i, 1, condCoding ); + silk_encode_pulses( psRangeEnc, psEnc->state_Fxx[ n ].sCmn.indices_LBRR[i].signalType, psEnc->state_Fxx[ n ].sCmn.indices_LBRR[i].quantOffsetType, + psEnc->state_Fxx[ n ].sCmn.pulses_LBRR[ i ], psEnc->state_Fxx[ n ].sCmn.frame_length ); + } + } + } + + /* Reset LBRR flags */ + for( n = 0; n < encControl->nChannelsInternal; n++ ) { + silk_memset( psEnc->state_Fxx[ n ].sCmn.LBRR_flags, 0, sizeof( psEnc->state_Fxx[ n ].sCmn.LBRR_flags ) ); + } + + psEnc->nBitsUsedLBRR = ec_tell( psRangeEnc ); + } + + silk_HP_variable_cutoff( psEnc->state_Fxx ); + + /* Total target bits for packet */ + nBits = silk_DIV32_16( silk_MUL( encControl->bitRate, encControl->payloadSize_ms ), 1000 ); + /* Subtract bits used for LBRR */ + if( !prefillFlag ) { + nBits -= psEnc->nBitsUsedLBRR; + } + /* Divide by number of uncoded frames left in packet */ + nBits = silk_DIV32_16( nBits, psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket ); + /* Convert to bits/second */ + if( encControl->payloadSize_ms == 10 ) { + TargetRate_bps = silk_SMULBB( nBits, 100 ); + } else { + TargetRate_bps = silk_SMULBB( nBits, 50 ); + } + /* Subtract fraction of bits in excess of target in previous frames and packets */ + TargetRate_bps -= silk_DIV32_16( silk_MUL( psEnc->nBitsExceeded, 1000 ), BITRESERVOIR_DECAY_TIME_MS ); + if( !prefillFlag && psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded > 0 ) { + /* Compare actual vs target bits so far in this packet */ + opus_int32 bitsBalance = ec_tell( psRangeEnc ) - psEnc->nBitsUsedLBRR - nBits * psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded; + TargetRate_bps -= silk_DIV32_16( silk_MUL( bitsBalance, 1000 ), BITRESERVOIR_DECAY_TIME_MS ); + } + /* Never exceed input bitrate */ + TargetRate_bps = silk_LIMIT( TargetRate_bps, encControl->bitRate, 5000 ); + + /* Convert Left/Right to Mid/Side */ + if( encControl->nChannelsInternal == 2 ) { + silk_stereo_LR_to_MS( &psEnc->sStereo, &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ 2 ], &psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ 2 ], + psEnc->sStereo.predIx[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ], &psEnc->sStereo.mid_only_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ], + MStargetRates_bps, TargetRate_bps, psEnc->state_Fxx[ 0 ].sCmn.speech_activity_Q8, encControl->toMono, + psEnc->state_Fxx[ 0 ].sCmn.fs_kHz, psEnc->state_Fxx[ 0 ].sCmn.frame_length ); + if( psEnc->sStereo.mid_only_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] == 0 ) { + /* Reset side channel 
encoder memory for first frame with side coding */ + if( psEnc->prev_decode_only_middle == 1 ) { + silk_memset( &psEnc->state_Fxx[ 1 ].sShape, 0, sizeof( psEnc->state_Fxx[ 1 ].sShape ) ); + silk_memset( &psEnc->state_Fxx[ 1 ].sPrefilt, 0, sizeof( psEnc->state_Fxx[ 1 ].sPrefilt ) ); + silk_memset( &psEnc->state_Fxx[ 1 ].sCmn.sNSQ, 0, sizeof( psEnc->state_Fxx[ 1 ].sCmn.sNSQ ) ); + silk_memset( psEnc->state_Fxx[ 1 ].sCmn.prev_NLSFq_Q15, 0, sizeof( psEnc->state_Fxx[ 1 ].sCmn.prev_NLSFq_Q15 ) ); + silk_memset( &psEnc->state_Fxx[ 1 ].sCmn.sLP.In_LP_State, 0, sizeof( psEnc->state_Fxx[ 1 ].sCmn.sLP.In_LP_State ) ); + psEnc->state_Fxx[ 1 ].sCmn.prevLag = 100; + psEnc->state_Fxx[ 1 ].sCmn.sNSQ.lagPrev = 100; + psEnc->state_Fxx[ 1 ].sShape.LastGainIndex = 10; + psEnc->state_Fxx[ 1 ].sCmn.prevSignalType = TYPE_NO_VOICE_ACTIVITY; + psEnc->state_Fxx[ 1 ].sCmn.sNSQ.prev_gain_Q16 = 65536; + psEnc->state_Fxx[ 1 ].sCmn.first_frame_after_reset = 1; + } + silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 1 ] ); + } else { + psEnc->state_Fxx[ 1 ].sCmn.VAD_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] = 0; + } + if( !prefillFlag ) { + silk_stereo_encode_pred( psRangeEnc, psEnc->sStereo.predIx[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] ); + if( psEnc->state_Fxx[ 1 ].sCmn.VAD_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] == 0 ) { + silk_stereo_encode_mid_only( psRangeEnc, psEnc->sStereo.mid_only_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] ); + } + } + } else { + /* Buffering */ + silk_memcpy( psEnc->state_Fxx[ 0 ].sCmn.inputBuf, psEnc->sStereo.sMid, 2 * sizeof( opus_int16 ) ); + silk_memcpy( psEnc->sStereo.sMid, &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.frame_length ], 2 * sizeof( opus_int16 ) ); + } + silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 0 ] ); + + /* Encode */ + for( n = 0; n < encControl->nChannelsInternal; n++ ) { + opus_int maxBits, useCBR; + + /* Handling rate constraints */ + maxBits = encControl->maxBits; + if( tot_blocks == 2 && curr_block == 0 ) { + maxBits = maxBits * 3 / 5; + } else if( tot_blocks == 3 ) { + if( curr_block == 0 ) { + maxBits = maxBits * 2 / 5; + } else if( curr_block == 1 ) { + maxBits = maxBits * 3 / 4; + } + } + useCBR = encControl->useCBR && curr_block == tot_blocks - 1; + + if( encControl->nChannelsInternal == 1 ) { + channelRate_bps = TargetRate_bps; + } else { + channelRate_bps = MStargetRates_bps[ n ]; + if( n == 0 && MStargetRates_bps[ 1 ] > 0 ) { + useCBR = 0; + /* Give mid up to 1/2 of the max bits for that frame */ + maxBits -= encControl->maxBits / ( tot_blocks * 2 ); + } + } + + if( channelRate_bps > 0 ) { + opus_int condCoding; + + silk_control_SNR( &psEnc->state_Fxx[ n ].sCmn, channelRate_bps ); + + /* Use independent coding if no previous frame available */ + if( psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded - n <= 0 ) { + condCoding = CODE_INDEPENDENTLY; + } else if( n > 0 && psEnc->prev_decode_only_middle ) { + /* If we skipped a side frame in this packet, we don't + need LTP scaling; the LTP state is well-defined. 
*/ + condCoding = CODE_INDEPENDENTLY_NO_LTP_SCALING; + } else { + condCoding = CODE_CONDITIONALLY; + } + if( ( ret = silk_encode_frame_Fxx( &psEnc->state_Fxx[ n ], nBytesOut, psRangeEnc, condCoding, maxBits, useCBR ) ) != 0 ) { + silk_assert( 0 ); + } + } + psEnc->state_Fxx[ n ].sCmn.controlled_since_last_payload = 0; + psEnc->state_Fxx[ n ].sCmn.inputBufIx = 0; + psEnc->state_Fxx[ n ].sCmn.nFramesEncoded++; + } + psEnc->prev_decode_only_middle = psEnc->sStereo.mid_only_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded - 1 ]; + + /* Insert VAD and FEC flags at beginning of bitstream */ + if( *nBytesOut > 0 && psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded == psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket) { + flags = 0; + for( n = 0; n < encControl->nChannelsInternal; n++ ) { + for( i = 0; i < psEnc->state_Fxx[ n ].sCmn.nFramesPerPacket; i++ ) { + flags = silk_LSHIFT( flags, 1 ); + flags |= psEnc->state_Fxx[ n ].sCmn.VAD_flags[ i ]; + } + flags = silk_LSHIFT( flags, 1 ); + flags |= psEnc->state_Fxx[ n ].sCmn.LBRR_flag; + } + if( !prefillFlag ) { + ec_enc_patch_initial_bits( psRangeEnc, flags, ( psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket + 1 ) * encControl->nChannelsInternal ); + } + + /* Return zero bytes if all channels DTXed */ + if( psEnc->state_Fxx[ 0 ].sCmn.inDTX && ( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 1 ].sCmn.inDTX ) ) { + *nBytesOut = 0; + } + + psEnc->nBitsExceeded += *nBytesOut * 8; + psEnc->nBitsExceeded -= silk_DIV32_16( silk_MUL( encControl->bitRate, encControl->payloadSize_ms ), 1000 ); + psEnc->nBitsExceeded = silk_LIMIT( psEnc->nBitsExceeded, 0, 10000 ); + + /* Update flag indicating if bandwidth switching is allowed */ + speech_act_thr_for_switch_Q8 = silk_SMLAWB( SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ), + SILK_FIX_CONST( ( 1 - SPEECH_ACTIVITY_DTX_THRES ) / MAX_BANDWIDTH_SWITCH_DELAY_MS, 16 + 8 ), psEnc->timeSinceSwitchAllowed_ms ); + if( psEnc->state_Fxx[ 0 ].sCmn.speech_activity_Q8 < speech_act_thr_for_switch_Q8 ) { + psEnc->allowBandwidthSwitch = 1; + psEnc->timeSinceSwitchAllowed_ms = 0; + } else { + psEnc->allowBandwidthSwitch = 0; + psEnc->timeSinceSwitchAllowed_ms += encControl->payloadSize_ms; + } + } + + if( nSamplesIn == 0 ) { + break; + } + } else { + break; + } + curr_block++; + } + + psEnc->nPrevChannelsInternal = encControl->nChannelsInternal; + + encControl->allowBandwidthSwitch = psEnc->allowBandwidthSwitch; + encControl->inWBmodeWithoutVariableLP = psEnc->state_Fxx[ 0 ].sCmn.fs_kHz == 16 && psEnc->state_Fxx[ 0 ].sCmn.sLP.mode == 0; + encControl->internalSampleRate = silk_SMULBB( psEnc->state_Fxx[ 0 ].sCmn.fs_kHz, 1000 ); + encControl->stereoWidth_Q14 = encControl->toMono ? 0 : psEnc->sStereo.smth_width_Q14; + if( prefillFlag ) { + encControl->payloadSize_ms = tmp_payloadSize_ms; + encControl->complexity = tmp_complexity; + for( n = 0; n < encControl->nChannelsInternal; n++ ) { + psEnc->state_Fxx[ n ].sCmn.controlled_since_last_payload = 0; + psEnc->state_Fxx[ n ].sCmn.prefillFlag = 0; + } + } + + RESTORE_STACK; + return ret; +} + diff --git a/thirdparty/opus/silk/encode_indices.c b/thirdparty/opus/silk/encode_indices.c new file mode 100644 index 000000000000..666c8c0b13e4 --- /dev/null +++ b/thirdparty/opus/silk/encode_indices.c @@ -0,0 +1,181 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Encode side-information parameters to payload */ +void silk_encode_indices( + silk_encoder_state *psEncC, /* I/O Encoder state */ + ec_enc *psRangeEnc, /* I/O Compressor data structure */ + opus_int FrameIndex, /* I Frame number */ + opus_int encode_LBRR, /* I Flag indicating LBRR data is being encoded */ + opus_int condCoding /* I The type of conditional coding to use */ +) +{ + opus_int i, k, typeOffset; + opus_int encode_absolute_lagIndex, delta_lagIndex; + opus_int16 ec_ix[ MAX_LPC_ORDER ]; + opus_uint8 pred_Q8[ MAX_LPC_ORDER ]; + const SideInfoIndices *psIndices; + + if( encode_LBRR ) { + psIndices = &psEncC->indices_LBRR[ FrameIndex ]; + } else { + psIndices = &psEncC->indices; + } + + /*******************************************/ + /* Encode signal type and quantizer offset */ + /*******************************************/ + typeOffset = 2 * psIndices->signalType + psIndices->quantOffsetType; + silk_assert( typeOffset >= 0 && typeOffset < 6 ); + silk_assert( encode_LBRR == 0 || typeOffset >= 2 ); + if( encode_LBRR || typeOffset >= 2 ) { + ec_enc_icdf( psRangeEnc, typeOffset - 2, silk_type_offset_VAD_iCDF, 8 ); + } else { + ec_enc_icdf( psRangeEnc, typeOffset, silk_type_offset_no_VAD_iCDF, 8 ); + } + + /****************/ + /* Encode gains */ + /****************/ + /* first subframe */ + if( condCoding == CODE_CONDITIONALLY ) { + /* conditional coding */ + silk_assert( psIndices->GainsIndices[ 0 ] >= 0 && psIndices->GainsIndices[ 0 ] < MAX_DELTA_GAIN_QUANT - MIN_DELTA_GAIN_QUANT + 1 ); + ec_enc_icdf( psRangeEnc, psIndices->GainsIndices[ 0 ], silk_delta_gain_iCDF, 8 ); + } else { + /* independent coding, in two stages: MSB bits followed by 3 LSBs */ + silk_assert( psIndices->GainsIndices[ 0 ] >= 0 && psIndices->GainsIndices[ 0 ] < N_LEVELS_QGAIN ); + ec_enc_icdf( psRangeEnc, silk_RSHIFT( psIndices->GainsIndices[ 0 ], 3 ), silk_gain_iCDF[ psIndices->signalType ], 8 ); + ec_enc_icdf( psRangeEnc, 
psIndices->GainsIndices[ 0 ] & 7, silk_uniform8_iCDF, 8 ); + } + + /* remaining subframes */ + for( i = 1; i < psEncC->nb_subfr; i++ ) { + silk_assert( psIndices->GainsIndices[ i ] >= 0 && psIndices->GainsIndices[ i ] < MAX_DELTA_GAIN_QUANT - MIN_DELTA_GAIN_QUANT + 1 ); + ec_enc_icdf( psRangeEnc, psIndices->GainsIndices[ i ], silk_delta_gain_iCDF, 8 ); + } + + /****************/ + /* Encode NLSFs */ + /****************/ + ec_enc_icdf( psRangeEnc, psIndices->NLSFIndices[ 0 ], &psEncC->psNLSF_CB->CB1_iCDF[ ( psIndices->signalType >> 1 ) * psEncC->psNLSF_CB->nVectors ], 8 ); + silk_NLSF_unpack( ec_ix, pred_Q8, psEncC->psNLSF_CB, psIndices->NLSFIndices[ 0 ] ); + silk_assert( psEncC->psNLSF_CB->order == psEncC->predictLPCOrder ); + for( i = 0; i < psEncC->psNLSF_CB->order; i++ ) { + if( psIndices->NLSFIndices[ i+1 ] >= NLSF_QUANT_MAX_AMPLITUDE ) { + ec_enc_icdf( psRangeEnc, 2 * NLSF_QUANT_MAX_AMPLITUDE, &psEncC->psNLSF_CB->ec_iCDF[ ec_ix[ i ] ], 8 ); + ec_enc_icdf( psRangeEnc, psIndices->NLSFIndices[ i+1 ] - NLSF_QUANT_MAX_AMPLITUDE, silk_NLSF_EXT_iCDF, 8 ); + } else if( psIndices->NLSFIndices[ i+1 ] <= -NLSF_QUANT_MAX_AMPLITUDE ) { + ec_enc_icdf( psRangeEnc, 0, &psEncC->psNLSF_CB->ec_iCDF[ ec_ix[ i ] ], 8 ); + ec_enc_icdf( psRangeEnc, -psIndices->NLSFIndices[ i+1 ] - NLSF_QUANT_MAX_AMPLITUDE, silk_NLSF_EXT_iCDF, 8 ); + } else { + ec_enc_icdf( psRangeEnc, psIndices->NLSFIndices[ i+1 ] + NLSF_QUANT_MAX_AMPLITUDE, &psEncC->psNLSF_CB->ec_iCDF[ ec_ix[ i ] ], 8 ); + } + } + + /* Encode NLSF interpolation factor */ + if( psEncC->nb_subfr == MAX_NB_SUBFR ) { + silk_assert( psIndices->NLSFInterpCoef_Q2 >= 0 && psIndices->NLSFInterpCoef_Q2 < 5 ); + ec_enc_icdf( psRangeEnc, psIndices->NLSFInterpCoef_Q2, silk_NLSF_interpolation_factor_iCDF, 8 ); + } + + if( psIndices->signalType == TYPE_VOICED ) + { + /*********************/ + /* Encode pitch lags */ + /*********************/ + /* lag index */ + encode_absolute_lagIndex = 1; + if( condCoding == CODE_CONDITIONALLY && psEncC->ec_prevSignalType == TYPE_VOICED ) { + /* Delta Encoding */ + delta_lagIndex = psIndices->lagIndex - psEncC->ec_prevLagIndex; + if( delta_lagIndex < -8 || delta_lagIndex > 11 ) { + delta_lagIndex = 0; + } else { + delta_lagIndex = delta_lagIndex + 9; + encode_absolute_lagIndex = 0; /* Only use delta */ + } + silk_assert( delta_lagIndex >= 0 && delta_lagIndex < 21 ); + ec_enc_icdf( psRangeEnc, delta_lagIndex, silk_pitch_delta_iCDF, 8 ); + } + if( encode_absolute_lagIndex ) { + /* Absolute encoding */ + opus_int32 pitch_high_bits, pitch_low_bits; + pitch_high_bits = silk_DIV32_16( psIndices->lagIndex, silk_RSHIFT( psEncC->fs_kHz, 1 ) ); + pitch_low_bits = psIndices->lagIndex - silk_SMULBB( pitch_high_bits, silk_RSHIFT( psEncC->fs_kHz, 1 ) ); + silk_assert( pitch_low_bits < psEncC->fs_kHz / 2 ); + silk_assert( pitch_high_bits < 32 ); + ec_enc_icdf( psRangeEnc, pitch_high_bits, silk_pitch_lag_iCDF, 8 ); + ec_enc_icdf( psRangeEnc, pitch_low_bits, psEncC->pitch_lag_low_bits_iCDF, 8 ); + } + psEncC->ec_prevLagIndex = psIndices->lagIndex; + + /* Countour index */ + silk_assert( psIndices->contourIndex >= 0 ); + silk_assert( ( psIndices->contourIndex < 34 && psEncC->fs_kHz > 8 && psEncC->nb_subfr == 4 ) || + ( psIndices->contourIndex < 11 && psEncC->fs_kHz == 8 && psEncC->nb_subfr == 4 ) || + ( psIndices->contourIndex < 12 && psEncC->fs_kHz > 8 && psEncC->nb_subfr == 2 ) || + ( psIndices->contourIndex < 3 && psEncC->fs_kHz == 8 && psEncC->nb_subfr == 2 ) ); + ec_enc_icdf( psRangeEnc, psIndices->contourIndex, psEncC->pitch_contour_iCDF, 8 ); + + 
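/* encoder/decoder symmetry: contourIndex selects a row of the silk_CB_lags_stage2/3 codebooks, which silk_decode_pitch() adds to the base lag for each subframe */ +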
/********************/ + /* Encode LTP gains */ + /********************/ + /* PERIndex value */ + silk_assert( psIndices->PERIndex >= 0 && psIndices->PERIndex < 3 ); + ec_enc_icdf( psRangeEnc, psIndices->PERIndex, silk_LTP_per_index_iCDF, 8 ); + + /* Codebook Indices */ + for( k = 0; k < psEncC->nb_subfr; k++ ) { + silk_assert( psIndices->LTPIndex[ k ] >= 0 && psIndices->LTPIndex[ k ] < ( 8 << psIndices->PERIndex ) ); + ec_enc_icdf( psRangeEnc, psIndices->LTPIndex[ k ], silk_LTP_gain_iCDF_ptrs[ psIndices->PERIndex ], 8 ); + } + + /**********************/ + /* Encode LTP scaling */ + /**********************/ + if( condCoding == CODE_INDEPENDENTLY ) { + silk_assert( psIndices->LTP_scaleIndex >= 0 && psIndices->LTP_scaleIndex < 3 ); + ec_enc_icdf( psRangeEnc, psIndices->LTP_scaleIndex, silk_LTPscale_iCDF, 8 ); + } + silk_assert( !condCoding || psIndices->LTP_scaleIndex == 0 ); + } + + psEncC->ec_prevSignalType = psIndices->signalType; + + /***************/ + /* Encode seed */ + /***************/ + silk_assert( psIndices->Seed >= 0 && psIndices->Seed < 4 ); + ec_enc_icdf( psRangeEnc, psIndices->Seed, silk_uniform4_iCDF, 8 ); +} diff --git a/thirdparty/opus/silk/encode_pulses.c b/thirdparty/opus/silk/encode_pulses.c new file mode 100644 index 000000000000..ab00264f9914 --- /dev/null +++ b/thirdparty/opus/silk/encode_pulses.c @@ -0,0 +1,206 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "stack_alloc.h" + +/*********************************************/ +/* Encode quantization indices of excitation */ +/*********************************************/ + +static OPUS_INLINE opus_int combine_and_check( /* return ok */ + opus_int *pulses_comb, /* O */ + const opus_int *pulses_in, /* I */ + opus_int max_pulses, /* I max value for sum of pulses */ + opus_int len /* I number of output values */ +) +{ + opus_int k, sum; + + for( k = 0; k < len; k++ ) { + sum = pulses_in[ 2 * k ] + pulses_in[ 2 * k + 1 ]; + if( sum > max_pulses ) { + return 1; + } + pulses_comb[ k ] = sum; + } + + return 0; +} + +/* Encode quantization indices of excitation */ +void silk_encode_pulses( + ec_enc *psRangeEnc, /* I/O compressor data structure */ + const opus_int signalType, /* I Signal type */ + const opus_int quantOffsetType, /* I quantOffsetType */ + opus_int8 pulses[], /* I quantization indices */ + const opus_int frame_length /* I Frame length */ +) +{ + opus_int i, k, j, iter, bit, nLS, scale_down, RateLevelIndex = 0; + opus_int32 abs_q, minSumBits_Q5, sumBits_Q5; + VARDECL( opus_int, abs_pulses ); + VARDECL( opus_int, sum_pulses ); + VARDECL( opus_int, nRshifts ); + opus_int pulses_comb[ 8 ]; + opus_int *abs_pulses_ptr; + const opus_int8 *pulses_ptr; + const opus_uint8 *cdf_ptr; + const opus_uint8 *nBits_ptr; + SAVE_STACK; + + silk_memset( pulses_comb, 0, 8 * sizeof( opus_int ) ); /* Fixing Valgrind reported problem*/ + + /****************************/ + /* Prepare for shell coding */ + /****************************/ + /* Calculate number of shell blocks */ + silk_assert( 1 << LOG2_SHELL_CODEC_FRAME_LENGTH == SHELL_CODEC_FRAME_LENGTH ); + iter = silk_RSHIFT( frame_length, LOG2_SHELL_CODEC_FRAME_LENGTH ); + if( iter * SHELL_CODEC_FRAME_LENGTH < frame_length ) { + silk_assert( frame_length == 12 * 10 ); /* Make sure only happens for 10 ms @ 12 kHz */ + iter++; + silk_memset( &pulses[ frame_length ], 0, SHELL_CODEC_FRAME_LENGTH * sizeof(opus_int8)); + } + + /* Take the absolute value of the pulses */ + ALLOC( abs_pulses, iter * SHELL_CODEC_FRAME_LENGTH, opus_int ); + silk_assert( !( SHELL_CODEC_FRAME_LENGTH & 3 ) ); + for( i = 0; i < iter * SHELL_CODEC_FRAME_LENGTH; i+=4 ) { + abs_pulses[i+0] = ( opus_int )silk_abs( pulses[ i + 0 ] ); + abs_pulses[i+1] = ( opus_int )silk_abs( pulses[ i + 1 ] ); + abs_pulses[i+2] = ( opus_int )silk_abs( pulses[ i + 2 ] ); + abs_pulses[i+3] = ( opus_int )silk_abs( pulses[ i + 3 ] ); + } + + /* Calc sum pulses per shell code frame */ + ALLOC( sum_pulses, iter, opus_int ); + ALLOC( nRshifts, iter, opus_int ); + abs_pulses_ptr = abs_pulses; + for( i = 0; i < iter; i++ ) { + nRshifts[ i ] = 0; + + while( 1 ) { + /* 1+1 -> 2 */ + scale_down = combine_and_check( pulses_comb, abs_pulses_ptr, silk_max_pulses_table[ 0 ], 8 ); + /* 2+2 -> 4 */ + scale_down += combine_and_check( pulses_comb, pulses_comb, silk_max_pulses_table[ 1 ], 4 ); + /* 4+4 -> 8 */ + scale_down += combine_and_check( pulses_comb, pulses_comb, silk_max_pulses_table[ 2 ], 2 ); + /* 8+8 -> 16 */ + scale_down += combine_and_check( &sum_pulses[ i ], pulses_comb, silk_max_pulses_table[ 3 ], 1 ); + + if( scale_down ) { + /* We need to downscale the quantization signal */ + nRshifts[ i ]++; + for( k = 0; k < SHELL_CODEC_FRAME_LENGTH; k++ ) { + abs_pulses_ptr[ k ] = silk_RSHIFT( abs_pulses_ptr[ k ], 1 ); + } + } else { + /* Jump out of while(1) loop and go to 
next shell coding frame */ + break; + } + } + abs_pulses_ptr += SHELL_CODEC_FRAME_LENGTH; + } + + /**************/ + /* Rate level */ + /**************/ + /* find rate level that leads to fewest bits for coding of pulses per block info */ + minSumBits_Q5 = silk_int32_MAX; + for( k = 0; k < N_RATE_LEVELS - 1; k++ ) { + nBits_ptr = silk_pulses_per_block_BITS_Q5[ k ]; + sumBits_Q5 = silk_rate_levels_BITS_Q5[ signalType >> 1 ][ k ]; + for( i = 0; i < iter; i++ ) { + if( nRshifts[ i ] > 0 ) { + sumBits_Q5 += nBits_ptr[ SILK_MAX_PULSES + 1 ]; + } else { + sumBits_Q5 += nBits_ptr[ sum_pulses[ i ] ]; + } + } + if( sumBits_Q5 < minSumBits_Q5 ) { + minSumBits_Q5 = sumBits_Q5; + RateLevelIndex = k; + } + } + ec_enc_icdf( psRangeEnc, RateLevelIndex, silk_rate_levels_iCDF[ signalType >> 1 ], 8 ); + + /***************************************************/ + /* Sum-Weighted-Pulses Encoding */ + /***************************************************/ + cdf_ptr = silk_pulses_per_block_iCDF[ RateLevelIndex ]; + for( i = 0; i < iter; i++ ) { + if( nRshifts[ i ] == 0 ) { + ec_enc_icdf( psRangeEnc, sum_pulses[ i ], cdf_ptr, 8 ); + } else { + ec_enc_icdf( psRangeEnc, SILK_MAX_PULSES + 1, cdf_ptr, 8 ); + for( k = 0; k < nRshifts[ i ] - 1; k++ ) { + ec_enc_icdf( psRangeEnc, SILK_MAX_PULSES + 1, silk_pulses_per_block_iCDF[ N_RATE_LEVELS - 1 ], 8 ); + } + ec_enc_icdf( psRangeEnc, sum_pulses[ i ], silk_pulses_per_block_iCDF[ N_RATE_LEVELS - 1 ], 8 ); + } + } + + /******************/ + /* Shell Encoding */ + /******************/ + for( i = 0; i < iter; i++ ) { + if( sum_pulses[ i ] > 0 ) { + silk_shell_encoder( psRangeEnc, &abs_pulses[ i * SHELL_CODEC_FRAME_LENGTH ] ); + } + } + + /****************/ + /* LSB Encoding */ + /****************/ + for( i = 0; i < iter; i++ ) { + if( nRshifts[ i ] > 0 ) { + pulses_ptr = &pulses[ i * SHELL_CODEC_FRAME_LENGTH ]; + nLS = nRshifts[ i ] - 1; + for( k = 0; k < SHELL_CODEC_FRAME_LENGTH; k++ ) { + abs_q = (opus_int8)silk_abs( pulses_ptr[ k ] ); + for( j = nLS; j > 0; j-- ) { + bit = silk_RSHIFT( abs_q, j ) & 1; + ec_enc_icdf( psRangeEnc, bit, silk_lsb_iCDF, 8 ); + } + bit = abs_q & 1; + ec_enc_icdf( psRangeEnc, bit, silk_lsb_iCDF, 8 ); + } + } + } + + /****************/ + /* Encode signs */ + /****************/ + silk_encode_signs( psRangeEnc, pulses, frame_length, signalType, quantOffsetType, sum_pulses ); + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/errors.h b/thirdparty/opus/silk/errors.h new file mode 100644 index 000000000000..45070800f212 --- /dev/null +++ b/thirdparty/opus/silk/errors.h @@ -0,0 +1,98 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_ERRORS_H +#define SILK_ERRORS_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +/******************/ +/* Error messages */ +/******************/ +#define SILK_NO_ERROR 0 + +/**************************/ +/* Encoder error messages */ +/**************************/ + +/* Input length is not a multiple of 10 ms, or length is longer than the packet length */ +#define SILK_ENC_INPUT_INVALID_NO_OF_SAMPLES -101 + +/* Sampling frequency not 8000, 12000 or 16000 Hertz */ +#define SILK_ENC_FS_NOT_SUPPORTED -102 + +/* Packet size not 10, 20, 40, or 60 ms */ +#define SILK_ENC_PACKET_SIZE_NOT_SUPPORTED -103 + +/* Allocated payload buffer too short */ +#define SILK_ENC_PAYLOAD_BUF_TOO_SHORT -104 + +/* Loss rate not between 0 and 100 percent */ +#define SILK_ENC_INVALID_LOSS_RATE -105 + +/* Complexity setting not valid, use 0...10 */ +#define SILK_ENC_INVALID_COMPLEXITY_SETTING -106 + +/* Inband FEC setting not valid, use 0 or 1 */ +#define SILK_ENC_INVALID_INBAND_FEC_SETTING -107 + +/* DTX setting not valid, use 0 or 1 */ +#define SILK_ENC_INVALID_DTX_SETTING -108 + +/* CBR setting not valid, use 0 or 1 */ +#define SILK_ENC_INVALID_CBR_SETTING -109 + +/* Internal encoder error */ +#define SILK_ENC_INTERNAL_ERROR -110 + +/* Internal encoder error */ +#define SILK_ENC_INVALID_NUMBER_OF_CHANNELS_ERROR -111 + +/**************************/ +/* Decoder error messages */ +/**************************/ + +/* Output sampling frequency lower than internal decoded sampling frequency */ +#define SILK_DEC_INVALID_SAMPLING_FREQUENCY -200 + +/* Payload size exceeded the maximum allowed 1024 bytes */ +#define SILK_DEC_PAYLOAD_TOO_LARGE -201 + +/* Payload has bit errors */ +#define SILK_DEC_PAYLOAD_ERROR -202 + +/* Payload has bit errors */ +#define SILK_DEC_INVALID_FRAME_SIZE -203 + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/silk/fixed/LTP_analysis_filter_FIX.c b/thirdparty/opus/silk/fixed/LTP_analysis_filter_FIX.c new file mode 100644 index 000000000000..5574e7069fe8 --- /dev/null +++ b/thirdparty/opus/silk/fixed/LTP_analysis_filter_FIX.c @@ -0,0 +1,90 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" + +void silk_LTP_analysis_filter_FIX( + opus_int16 *LTP_res, /* O LTP residual signal of length MAX_NB_SUBFR * ( pre_length + subfr_length ) */ + const opus_int16 *x, /* I Pointer to input signal with at least max( pitchL ) preceding samples */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],/* I LTP_ORDER LTP coefficients for each MAX_NB_SUBFR subframe */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag, one for each subframe */ + const opus_int32 invGains_Q16[ MAX_NB_SUBFR ], /* I Inverse quantization gains, one for each subframe */ + const opus_int subfr_length, /* I Length of each subframe */ + const opus_int nb_subfr, /* I Number of subframes */ + const opus_int pre_length /* I Length of the preceding samples starting at &x[0] for each subframe */ +) +{ + const opus_int16 *x_ptr, *x_lag_ptr; + opus_int16 Btmp_Q14[ LTP_ORDER ]; + opus_int16 *LTP_res_ptr; + opus_int k, i; + opus_int32 LTP_est; + + x_ptr = x; + LTP_res_ptr = LTP_res; + for( k = 0; k < nb_subfr; k++ ) { + + x_lag_ptr = x_ptr - pitchL[ k ]; + + Btmp_Q14[ 0 ] = LTPCoef_Q14[ k * LTP_ORDER ]; + Btmp_Q14[ 1 ] = LTPCoef_Q14[ k * LTP_ORDER + 1 ]; + Btmp_Q14[ 2 ] = LTPCoef_Q14[ k * LTP_ORDER + 2 ]; + Btmp_Q14[ 3 ] = LTPCoef_Q14[ k * LTP_ORDER + 3 ]; + Btmp_Q14[ 4 ] = LTPCoef_Q14[ k * LTP_ORDER + 4 ]; + + /* LTP analysis FIR filter */ + for( i = 0; i < subfr_length + pre_length; i++ ) { + LTP_res_ptr[ i ] = x_ptr[ i ]; + + /* Long-term prediction */ + LTP_est = silk_SMULBB( x_lag_ptr[ LTP_ORDER / 2 ], Btmp_Q14[ 0 ] ); + LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ 1 ], Btmp_Q14[ 1 ] ); + LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ 0 ], Btmp_Q14[ 2 ] ); + LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ -1 ], Btmp_Q14[ 3 ] ); + LTP_est = silk_SMLABB_ovflw( LTP_est, x_lag_ptr[ -2 ], Btmp_Q14[ 4 ] ); + + LTP_est = silk_RSHIFT_ROUND( LTP_est, 14 ); /* round and -> Q0*/ + + /* Subtract long-term prediction */ + LTP_res_ptr[ i ] = (opus_int16)silk_SAT16( (opus_int32)x_ptr[ i ] - LTP_est ); + + /* Scale residual */ + LTP_res_ptr[ i ] = silk_SMULWB( invGains_Q16[ k ], LTP_res_ptr[ i ] ); + + x_lag_ptr++; + } + + /* Update pointers */ + LTP_res_ptr += subfr_length + pre_length; + x_ptr += 
subfr_length; + } +} + diff --git a/thirdparty/opus/silk/fixed/LTP_scale_ctrl_FIX.c b/thirdparty/opus/silk/fixed/LTP_scale_ctrl_FIX.c new file mode 100644 index 000000000000..3dcedef89189 --- /dev/null +++ b/thirdparty/opus/silk/fixed/LTP_scale_ctrl_FIX.c @@ -0,0 +1,53 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" + +/* Calculation of LTP state scaling */ +void silk_LTP_scale_ctrl_FIX( + silk_encoder_state_FIX *psEnc, /* I/O encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */ + opus_int condCoding /* I The type of conditional coding to use */ +) +{ + opus_int round_loss; + + if( condCoding == CODE_INDEPENDENTLY ) { + /* Only scale if first frame in packet */ + round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket; + psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( + silk_SMULWB( silk_SMULBB( round_loss, psEncCtrl->LTPredCodGain_Q7 ), SILK_FIX_CONST( 0.1, 9 ) ), 0, 2 ); + } else { + /* Default is minimum scaling */ + psEnc->sCmn.indices.LTP_scaleIndex = 0; + } + psEncCtrl->LTP_scale_Q14 = silk_LTPScales_table_Q14[ psEnc->sCmn.indices.LTP_scaleIndex ]; +} diff --git a/thirdparty/opus/silk/fixed/apply_sine_window_FIX.c b/thirdparty/opus/silk/fixed/apply_sine_window_FIX.c new file mode 100644 index 000000000000..4502b7130e97 --- /dev/null +++ b/thirdparty/opus/silk/fixed/apply_sine_window_FIX.c @@ -0,0 +1,101 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Apply sine window to signal vector. */ +/* Window types: */ +/* 1 -> sine window from 0 to pi/2 */ +/* 2 -> sine window from pi/2 to pi */ +/* Every other sample is linearly interpolated, for speed. */ +/* Window length must be between 16 and 120 (incl) and a multiple of 4. */ + +/* Matlab code for table: + for k=16:9*4:16+2*9*4, fprintf(' %7.d,', -round(65536*pi ./ (k:4:k+8*4))); fprintf('\n'); end +*/ +static const opus_int16 freq_table_Q16[ 27 ] = { + 12111, 9804, 8235, 7100, 6239, 5565, 5022, 4575, 4202, + 3885, 3612, 3375, 3167, 2984, 2820, 2674, 2542, 2422, + 2313, 2214, 2123, 2038, 1961, 1889, 1822, 1760, 1702, +}; + +void silk_apply_sine_window( + opus_int16 px_win[], /* O Pointer to windowed signal */ + const opus_int16 px[], /* I Pointer to input signal */ + const opus_int win_type, /* I Selects a window type */ + const opus_int length /* I Window length, multiple of 4 */ +) +{ + opus_int k, f_Q16, c_Q16; + opus_int32 S0_Q16, S1_Q16; + + silk_assert( win_type == 1 || win_type == 2 ); + + /* Length must be in a range from 16 to 120 and a multiple of 4 */ + silk_assert( length >= 16 && length <= 120 ); + silk_assert( ( length & 3 ) == 0 ); + + /* Frequency */ + k = ( length >> 2 ) - 4; + silk_assert( k >= 0 && k <= 26 ); + f_Q16 = (opus_int)freq_table_Q16[ k ]; + + /* Factor used for cosine approximation */ + c_Q16 = silk_SMULWB( (opus_int32)f_Q16, -f_Q16 ); + silk_assert( c_Q16 >= -32768 ); + + /* initialize state */ + if( win_type == 1 ) { + /* start from 0 */ + S0_Q16 = 0; + /* approximation of sin(f) */ + S1_Q16 = f_Q16 + silk_RSHIFT( length, 3 ); + } else { + /* start from 1 */ + S0_Q16 = ( (opus_int32)1 << 16 ); + /* approximation of cos(f) */ + S1_Q16 = ( (opus_int32)1 << 16 ) + silk_RSHIFT( c_Q16, 1 ) + silk_RSHIFT( length, 4 ); + } + + /* Uses the recursive equation: sin(n*f) = 2 * cos(f) * sin((n-1)*f) - sin((n-2)*f) */ + /* 4 samples at a time */ + for( k = 0; k < length; k += 4 ) { + px_win[ k ] = (opus_int16)silk_SMULWB( silk_RSHIFT( S0_Q16 + S1_Q16, 1 ), px[ k ] ); + px_win[ k + 1 ] = (opus_int16)silk_SMULWB( S1_Q16, px[ k + 1] ); + S0_Q16 = silk_SMULWB( S1_Q16, c_Q16 ) + silk_LSHIFT( S1_Q16, 1 ) - S0_Q16 + 1; + S0_Q16 = silk_min( S0_Q16, ( 
(opus_int32)1 << 16 ) ); + + px_win[ k + 2 ] = (opus_int16)silk_SMULWB( silk_RSHIFT( S0_Q16 + S1_Q16, 1 ), px[ k + 2] ); + px_win[ k + 3 ] = (opus_int16)silk_SMULWB( S0_Q16, px[ k + 3 ] ); + S1_Q16 = silk_SMULWB( S0_Q16, c_Q16 ) + silk_LSHIFT( S0_Q16, 1 ) - S1_Q16; + S1_Q16 = silk_min( S1_Q16, ( (opus_int32)1 << 16 ) ); + } +} diff --git a/thirdparty/opus/silk/fixed/autocorr_FIX.c b/thirdparty/opus/silk/fixed/autocorr_FIX.c new file mode 100644 index 000000000000..de95c9869360 --- /dev/null +++ b/thirdparty/opus/silk/fixed/autocorr_FIX.c @@ -0,0 +1,48 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "celt_lpc.h" + +/* Compute autocorrelation */ +void silk_autocorr( + opus_int32 *results, /* O Result (length correlationCount) */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *inputData, /* I Input data to correlate */ + const opus_int inputDataSize, /* I Length of input */ + const opus_int correlationCount, /* I Number of correlation taps to compute */ + int arch /* I Run-time architecture */ +) +{ + opus_int corrCount; + corrCount = silk_min_int( inputDataSize, correlationCount ); + *scale = _celt_autocorr(inputData, results, NULL, 0, corrCount-1, inputDataSize, arch); +} diff --git a/thirdparty/opus/silk/fixed/burg_modified_FIX.c b/thirdparty/opus/silk/fixed/burg_modified_FIX.c new file mode 100644 index 000000000000..17d0e0993cf5 --- /dev/null +++ b/thirdparty/opus/silk/fixed/burg_modified_FIX.c @@ -0,0 +1,280 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "define.h" +#include "tuning_parameters.h" +#include "pitch.h" + +#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */ + +#define QA 25 +#define N_BITS_HEAD_ROOM 2 +#define MIN_RSHIFTS -16 +#define MAX_RSHIFTS (32 - QA) + +/* Compute reflection coefficients from input signal */ +void silk_burg_modified_c( + opus_int32 *res_nrg, /* O Residual energy */ + opus_int *res_nrg_Q, /* O Residual energy Q value */ + opus_int32 A_Q16[], /* O Prediction coefficients (length order) */ + const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */ + const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ + const opus_int subfr_length, /* I Input signal subframe length (incl. 
D preceding samples) */ + const opus_int nb_subfr, /* I Number of subframes stacked in x */ + const opus_int D, /* I Order */ + int arch /* I Run-time architecture */ +) +{ + opus_int k, n, s, lz, rshifts, reached_max_gain; + opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; + const opus_int16 *x_ptr; + opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; + opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ]; + opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ]; + opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; + opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; + opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; + opus_int64 C0_64; + + silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); + + /* Compute autocorrelations, added over subframes */ + C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch ); + lz = silk_CLZ64(C0_64); + rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz; + if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS; + if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS; + + if (rshifts > 0) { + C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts ); + } else { + C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts ); + } + + CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ + silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); + if( rshifts > 0 ) { + for( s = 0; s < nb_subfr; s++ ) { + x_ptr = x + s * subfr_length; + for( n = 1; n < D + 1; n++ ) { + C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( + silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); + } + } + } else { + for( s = 0; s < nb_subfr; s++ ) { + int i; + opus_int32 d; + x_ptr = x + s * subfr_length; + celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch ); + for( n = 1; n < D + 1; n++ ) { + for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ ) + d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] ); + xcorr[ n - 1 ] += d; + } + for( n = 1; n < D + 1; n++ ) { + C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts ); + } + } + } + silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); + + /* Initialize */ + CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ + + invGain_Q30 = (opus_int32)1 << 30; + reached_max_gain = 0; + for( n = 0; n < D; n++ ) { + /* Update first row of correlation matrix (without first element) */ + /* Update last row of correlation matrix (without last element, stored in reversed order) */ + /* Update C * Af */ + /* Update C * flipud(Af) (stored in reversed order) */ + if( rshifts > -2 ) { + for( s = 0; s < nb_subfr; s++ ) { + x_ptr = x + s * subfr_length; + x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], 16 - rshifts ); /* Q(16-rshifts) */ + x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts ); /* Q(16-rshifts) */ + tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], QA - 16 ); /* Q(QA-16) */ + tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 ); /* Q(QA-16) */ + for( k = 0; k < n; k++ ) { + C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ + C_last_row[ k ] = silk_SMLAWB( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ + Atmp_QA = Af_QA[ k ]; + tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */ + tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] ); /* Q(QA-16) */ + } + tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts ); /* Q(16-rshifts) */ + tmp2 = 
silk_LSHIFT32( -tmp2, 32 - QA - rshifts ); /* Q(16-rshifts) */
+                for( k = 0; k <= n; k++ ) {
+                    CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q( -rshift ) */
+                    CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] ); /* Q( -rshift ) */
+                }
+            }
+        } else {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts ); /* Q( -rshifts ) */
+                x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */
+                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */
+                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */
+                for( k = 0; k < n; k++ ) {
+                    C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
+                    C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
+                    Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
+                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
+                       but they cancel each other and the real result seems to always fit in a 32-bit
+                       signed integer. This was determined experimentally, not theoretically (unfortunately). */
+                    tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
+                    tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
+                }
+                tmp1 = -tmp1; /* Q17 */
+                tmp2 = -tmp2; /* Q17 */
+                for( k = 0; k <= n; k++ ) {
+                    CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
+                        silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */
+                    CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
+                        silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
+                }
+            }
+        }
+
+        /* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
+        tmp1 = C_first_row[ n ]; /* Q( -rshifts ) */
+        tmp2 = C_last_row[ n ]; /* Q( -rshifts ) */
+        num = 0; /* Q( -rshifts ) */
+        nrg = silk_ADD32( CAb[ 0 ], CAf[ 0 ] ); /* Q( 1-rshifts ) */
+        for( k = 0; k < n; k++ ) {
+            Atmp_QA = Af_QA[ k ];
+            lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
+            lz = silk_min( 32 - QA, lz );
+            Atmp1 = silk_LSHIFT32( Atmp_QA, lz ); /* Q( QA + lz ) */
+
+            tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
+            tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
+            num = silk_ADD_LSHIFT32( num, silk_SMMUL( CAb[ n - k ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
+            nrg = silk_ADD_LSHIFT32( nrg, silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
+                Atmp1 ), 32 - QA - lz ); /* Q( 1-rshifts ) */
+        }
+        CAf[ n + 1 ] = tmp1; /* Q( -rshifts ) */
+        CAb[ n + 1 ] = tmp2; /* Q( -rshifts ) */
+        num = silk_ADD32( num, tmp2 ); /* Q( -rshifts ) */
+        num = silk_LSHIFT32( -num, 1 ); /* Q( 1-rshifts ) */
+
+        /* Calculate the next order reflection (parcor) coefficient */
+        if( silk_abs( num ) < nrg ) {
+            rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
+        } else {
+            rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
+        }
+
+        /* Update inverse prediction gain */
+        tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
+        tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
+        if( tmp1 <= minInvGain_Q30 ) {
+            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
+            tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 ); /* Q30 */
+            rc_Q31 = silk_SQRT_APPROX( tmp2 ); /* Q15 */
+            if( rc_Q31 > 0 ) {
+                /* Newton-Raphson iteration */
+                rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 ); /* Q15 */
+                rc_Q31 = silk_LSHIFT32( rc_Q31, 16 ); /* Q31 */
+                if( num < 0 ) {
+                    /* Ensure the adjusted reflection coefficient has the original sign */
+                    rc_Q31 = -rc_Q31;
+                }
+            }
+            invGain_Q30 = minInvGain_Q30;
+            reached_max_gain = 1;
+        } else {
+            invGain_Q30 = tmp1;
+        }
+
+        /* Update the AR coefficients */
+        for( k = 0; k < (n + 1) >> 1; k++ ) {
+            tmp1 = Af_QA[ k ]; /* QA */
+            tmp2 = Af_QA[ n - k - 1 ]; /* QA */
+            Af_QA[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* QA */
+            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* QA */
+        }
+        Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA ); /* QA */
+
+        if( reached_max_gain ) {
+            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
+            for( k = n + 1; k < D; k++ ) {
+                Af_QA[ k ] = 0;
+            }
+            break;
+        }
+
+        /* Update C * Af and C * Ab */
+        for( k = 0; k <= n + 1; k++ ) {
+            tmp1 = CAf[ k ]; /* Q( -rshifts ) */
+            tmp2 = CAb[ n - k + 1 ]; /* Q( -rshifts ) */
+            CAf[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* Q( -rshifts ) */
+            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* Q( -rshifts ) */
+        }
+    }
+
+    if( reached_max_gain ) {
+        for( k = 0; k < D; k++ ) {
+            /* Scale coefficients */
+            A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
+        }
+        /* Subtract energy of preceding samples from C0 */
+        if( rshifts > 0 ) {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+            }
+        } else {
+            for( s = 0; s < nb_subfr; s++ ) {
+                x_ptr = x + s * subfr_length;
+                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
+            }
+        }
+        /* Approximate residual energy */
+        *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
+        *res_nrg_Q = -rshifts;
+    } else {
+        /* Return residual energy */
+        nrg = CAf[ 0 ]; /* Q( -rshifts ) */
+        tmp1 = (opus_int32)1 << 16; /* Q16 */
+        for( k = 0; k < D; k++ ) {
+            Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); /* Q16 */
+            nrg = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 ); /* Q( -rshifts ) */
+            tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 ); /* Q16 */
+            A_Q16[ k ] = -Atmp1;
+        }
+        *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 ); /* Q( -rshifts ) */
+        *res_nrg_Q = -rshifts;
+    }
+}
diff --git a/thirdparty/opus/silk/fixed/corrMatrix_FIX.c b/thirdparty/opus/silk/fixed/corrMatrix_FIX.c
new file mode 100644
index 000000000000..c1d437c7859e
--- /dev/null
+++ b/thirdparty/opus/silk/fixed/corrMatrix_FIX.c
@@ -0,0 +1,158 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/********************************************************************** + * Correlation Matrix Computations for LS estimate. + **********************************************************************/ + +#include "main_FIX.h" + +/* Calculates correlation vector X'*t */ +void silk_corrVector_FIX( + const opus_int16 *x, /* I x vector [L + order - 1] used to form data matrix X */ + const opus_int16 *t, /* I Target vector [L] */ + const opus_int L, /* I Length of vectors */ + const opus_int order, /* I Max lag for correlation */ + opus_int32 *Xt, /* O Pointer to X'*t correlation vector [order] */ + const opus_int rshifts, /* I Right shifts of correlations */ + int arch /* I Run-time architecture */ +) +{ + opus_int lag, i; + const opus_int16 *ptr1, *ptr2; + opus_int32 inner_prod; + + ptr1 = &x[ order - 1 ]; /* Points to first sample of column 0 of X: X[:,0] */ + ptr2 = t; + /* Calculate X'*t */ + if( rshifts > 0 ) { + /* Right shifting used */ + for( lag = 0; lag < order; lag++ ) { + inner_prod = 0; + for( i = 0; i < L; i++ ) { + inner_prod += silk_RSHIFT32( silk_SMULBB( ptr1[ i ], ptr2[i] ), rshifts ); + } + Xt[ lag ] = inner_prod; /* X[:,lag]'*t */ + ptr1--; /* Go to next column of X */ + } + } else { + silk_assert( rshifts == 0 ); + for( lag = 0; lag < order; lag++ ) { + Xt[ lag ] = silk_inner_prod_aligned( ptr1, ptr2, L, arch ); /* X[:,lag]'*t */ + ptr1--; /* Go to next column of X */ + } + } +} + +/* Calculates correlation matrix X'*X */ +void silk_corrMatrix_FIX( + const opus_int16 *x, /* I x vector [L + order - 1] used to form data matrix X */ + const opus_int L, /* I Length of vectors */ + const opus_int order, /* I Max lag for correlation */ + const opus_int head_room, /* I Desired headroom */ + opus_int32 *XX, /* O Pointer to X'*X correlation matrix [ order x order ] */ + opus_int *rshifts, /* I/O Right shifts of correlations */ + int arch /* I Run-time architecture */ +) +{ + opus_int i, j, lag, rshifts_local, 
head_room_rshifts; + opus_int32 energy; + const opus_int16 *ptr1, *ptr2; + + /* Calculate energy to find shift used to fit in 32 bits */ + silk_sum_sqr_shift( &energy, &rshifts_local, x, L + order - 1 ); + /* Add shifts to get the desired head room */ + head_room_rshifts = silk_max( head_room - silk_CLZ32( energy ), 0 ); + + energy = silk_RSHIFT32( energy, head_room_rshifts ); + rshifts_local += head_room_rshifts; + + /* Calculate energy of first column (0) of X: X[:,0]'*X[:,0] */ + /* Remove contribution of first order - 1 samples */ + for( i = 0; i < order - 1; i++ ) { + energy -= silk_RSHIFT32( silk_SMULBB( x[ i ], x[ i ] ), rshifts_local ); + } + if( rshifts_local < *rshifts ) { + /* Adjust energy */ + energy = silk_RSHIFT32( energy, *rshifts - rshifts_local ); + rshifts_local = *rshifts; + } + + /* Calculate energy of remaining columns of X: X[:,j]'*X[:,j] */ + /* Fill out the diagonal of the correlation matrix */ + matrix_ptr( XX, 0, 0, order ) = energy; + ptr1 = &x[ order - 1 ]; /* First sample of column 0 of X */ + for( j = 1; j < order; j++ ) { + energy = silk_SUB32( energy, silk_RSHIFT32( silk_SMULBB( ptr1[ L - j ], ptr1[ L - j ] ), rshifts_local ) ); + energy = silk_ADD32( energy, silk_RSHIFT32( silk_SMULBB( ptr1[ -j ], ptr1[ -j ] ), rshifts_local ) ); + matrix_ptr( XX, j, j, order ) = energy; + } + + ptr2 = &x[ order - 2 ]; /* First sample of column 1 of X */ + /* Calculate the remaining elements of the correlation matrix */ + if( rshifts_local > 0 ) { + /* Right shifting used */ + for( lag = 1; lag < order; lag++ ) { + /* Inner product of column 0 and column lag: X[:,0]'*X[:,lag] */ + energy = 0; + for( i = 0; i < L; i++ ) { + energy += silk_RSHIFT32( silk_SMULBB( ptr1[ i ], ptr2[i] ), rshifts_local ); + } + /* Calculate remaining off diagonal: X[:,j]'*X[:,j + lag] */ + matrix_ptr( XX, lag, 0, order ) = energy; + matrix_ptr( XX, 0, lag, order ) = energy; + for( j = 1; j < ( order - lag ); j++ ) { + energy = silk_SUB32( energy, silk_RSHIFT32( silk_SMULBB( ptr1[ L - j ], ptr2[ L - j ] ), rshifts_local ) ); + energy = silk_ADD32( energy, silk_RSHIFT32( silk_SMULBB( ptr1[ -j ], ptr2[ -j ] ), rshifts_local ) ); + matrix_ptr( XX, lag + j, j, order ) = energy; + matrix_ptr( XX, j, lag + j, order ) = energy; + } + ptr2--; /* Update pointer to first sample of next column (lag) in X */ + } + } else { + for( lag = 1; lag < order; lag++ ) { + /* Inner product of column 0 and column lag: X[:,0]'*X[:,lag] */ + energy = silk_inner_prod_aligned( ptr1, ptr2, L, arch ); + matrix_ptr( XX, lag, 0, order ) = energy; + matrix_ptr( XX, 0, lag, order ) = energy; + /* Calculate remaining off diagonal: X[:,j]'*X[:,j + lag] */ + for( j = 1; j < ( order - lag ); j++ ) { + energy = silk_SUB32( energy, silk_SMULBB( ptr1[ L - j ], ptr2[ L - j ] ) ); + energy = silk_SMLABB( energy, ptr1[ -j ], ptr2[ -j ] ); + matrix_ptr( XX, lag + j, j, order ) = energy; + matrix_ptr( XX, j, lag + j, order ) = energy; + } + ptr2--;/* Update pointer to first sample of next column (lag) in X */ + } + } + *rshifts = rshifts_local; +} + diff --git a/thirdparty/opus/silk/fixed/encode_frame_FIX.c b/thirdparty/opus/silk/fixed/encode_frame_FIX.c new file mode 100644 index 000000000000..5ef44b03fc4c --- /dev/null +++ b/thirdparty/opus/silk/fixed/encode_frame_FIX.c @@ -0,0 +1,387 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "stack_alloc.h" +#include "tuning_parameters.h" + +/* Low Bitrate Redundancy (LBRR) encoding. Reuse all parameters but encode with lower bitrate */ +static OPUS_INLINE void silk_LBRR_encode_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Pointer to Silk FIX encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O Pointer to Silk FIX encoder control struct */ + const opus_int32 xfw_Q3[], /* I Input signal */ + opus_int condCoding /* I The type of conditional coding used so far for this frame */ +); + +void silk_encode_do_VAD_FIX( + silk_encoder_state_FIX *psEnc /* I/O Pointer to Silk FIX encoder state */ +) +{ + /****************************/ + /* Voice Activity Detection */ + /****************************/ + silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch ); + + /**************************************************/ + /* Convert speech activity into VAD and DTX flags */ + /**************************************************/ + if( psEnc->sCmn.speech_activity_Q8 < SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ) ) { + psEnc->sCmn.indices.signalType = TYPE_NO_VOICE_ACTIVITY; + psEnc->sCmn.noSpeechCounter++; + if( psEnc->sCmn.noSpeechCounter < NB_SPEECH_FRAMES_BEFORE_DTX ) { + psEnc->sCmn.inDTX = 0; + } else if( psEnc->sCmn.noSpeechCounter > MAX_CONSECUTIVE_DTX + NB_SPEECH_FRAMES_BEFORE_DTX ) { + psEnc->sCmn.noSpeechCounter = NB_SPEECH_FRAMES_BEFORE_DTX; + psEnc->sCmn.inDTX = 0; + } + psEnc->sCmn.VAD_flags[ psEnc->sCmn.nFramesEncoded ] = 0; + } else { + psEnc->sCmn.noSpeechCounter = 0; + psEnc->sCmn.inDTX = 0; + psEnc->sCmn.indices.signalType = TYPE_UNVOICED; + psEnc->sCmn.VAD_flags[ psEnc->sCmn.nFramesEncoded ] = 1; + } +} + +/****************/ +/* Encode frame */ +/****************/ +opus_int silk_encode_frame_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Pointer to Silk FIX encoder state */ + opus_int32 *pnBytesOut, /* O Pointer to number of payload bytes; */ + ec_enc 
*psRangeEnc, /* I/O compressor data structure */ + opus_int condCoding, /* I The type of conditional coding to use */ + opus_int maxBits, /* I If > 0: maximum number of output bits */ + opus_int useCBR /* I Flag to force constant-bitrate operation */ +) +{ + silk_encoder_control_FIX sEncCtrl; + opus_int i, iter, maxIter, found_upper, found_lower, ret = 0; + opus_int16 *x_frame; + ec_enc sRangeEnc_copy, sRangeEnc_copy2; + silk_nsq_state sNSQ_copy, sNSQ_copy2; + opus_int32 seed_copy, nBits, nBits_lower, nBits_upper, gainMult_lower, gainMult_upper; + opus_int32 gainsID, gainsID_lower, gainsID_upper; + opus_int16 gainMult_Q8; + opus_int16 ec_prevLagIndex_copy; + opus_int ec_prevSignalType_copy; + opus_int8 LastGainIndex_copy2; + SAVE_STACK; + + /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */ + LastGainIndex_copy2 = nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0; + + psEnc->sCmn.indices.Seed = psEnc->sCmn.frameCounter++ & 3; + + /**************************************************************/ + /* Set up Input Pointers, and insert frame in input buffer */ + /*************************************************************/ + /* start of frame to encode */ + x_frame = psEnc->x_buf + psEnc->sCmn.ltp_mem_length; + + /***************************************/ + /* Ensure smooth bandwidth transitions */ + /***************************************/ + silk_LP_variable_cutoff( &psEnc->sCmn.sLP, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.frame_length ); + + /*******************************************/ + /* Copy new frame to front of input buffer */ + /*******************************************/ + silk_memcpy( x_frame + LA_SHAPE_MS * psEnc->sCmn.fs_kHz, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.frame_length * sizeof( opus_int16 ) ); + + if( !psEnc->sCmn.prefillFlag ) { + VARDECL( opus_int32, xfw_Q3 ); + VARDECL( opus_int16, res_pitch ); + VARDECL( opus_uint8, ec_buf_copy ); + opus_int16 *res_pitch_frame; + + ALLOC( res_pitch, + psEnc->sCmn.la_pitch + psEnc->sCmn.frame_length + + psEnc->sCmn.ltp_mem_length, opus_int16 ); + /* start of pitch LPC residual frame */ + res_pitch_frame = res_pitch + psEnc->sCmn.ltp_mem_length; + + /*****************************************/ + /* Find pitch lags, initial LPC analysis */ + /*****************************************/ + silk_find_pitch_lags_FIX( psEnc, &sEncCtrl, res_pitch, x_frame, psEnc->sCmn.arch ); + + /************************/ + /* Noise shape analysis */ + /************************/ + silk_noise_shape_analysis_FIX( psEnc, &sEncCtrl, res_pitch_frame, x_frame, psEnc->sCmn.arch ); + + /***************************************************/ + /* Find linear prediction coefficients (LPC + LTP) */ + /***************************************************/ + silk_find_pred_coefs_FIX( psEnc, &sEncCtrl, res_pitch, x_frame, condCoding ); + + /****************************************/ + /* Process gains */ + /****************************************/ + silk_process_gains_FIX( psEnc, &sEncCtrl, condCoding ); + + /*****************************************/ + /* Prefiltering for noise shaper */ + /*****************************************/ + ALLOC( xfw_Q3, psEnc->sCmn.frame_length, opus_int32 ); + silk_prefilter_FIX( psEnc, &sEncCtrl, xfw_Q3, x_frame ); + + /****************************************/ + /* Low Bitrate Redundant Encoding */ + /****************************************/ + silk_LBRR_encode_FIX( psEnc, &sEncCtrl, xfw_Q3, condCoding ); + + /* Loop over quantizer and entropy coding to control bitrate */ + maxIter = 
6; + gainMult_Q8 = SILK_FIX_CONST( 1, 8 ); + found_lower = 0; + found_upper = 0; + gainsID = silk_gains_ID( psEnc->sCmn.indices.GainsIndices, psEnc->sCmn.nb_subfr ); + gainsID_lower = -1; + gainsID_upper = -1; + /* Copy part of the input state */ + silk_memcpy( &sRangeEnc_copy, psRangeEnc, sizeof( ec_enc ) ); + silk_memcpy( &sNSQ_copy, &psEnc->sCmn.sNSQ, sizeof( silk_nsq_state ) ); + seed_copy = psEnc->sCmn.indices.Seed; + ec_prevLagIndex_copy = psEnc->sCmn.ec_prevLagIndex; + ec_prevSignalType_copy = psEnc->sCmn.ec_prevSignalType; + ALLOC( ec_buf_copy, 1275, opus_uint8 ); + for( iter = 0; ; iter++ ) { + if( gainsID == gainsID_lower ) { + nBits = nBits_lower; + } else if( gainsID == gainsID_upper ) { + nBits = nBits_upper; + } else { + /* Restore part of the input state */ + if( iter > 0 ) { + silk_memcpy( psRangeEnc, &sRangeEnc_copy, sizeof( ec_enc ) ); + silk_memcpy( &psEnc->sCmn.sNSQ, &sNSQ_copy, sizeof( silk_nsq_state ) ); + psEnc->sCmn.indices.Seed = seed_copy; + psEnc->sCmn.ec_prevLagIndex = ec_prevLagIndex_copy; + psEnc->sCmn.ec_prevSignalType = ec_prevSignalType_copy; + } + + /*****************************************/ + /* Noise shaping quantization */ + /*****************************************/ + if( psEnc->sCmn.nStatesDelayedDecision > 1 || psEnc->sCmn.warping_Q16 > 0 ) { + silk_NSQ_del_dec( &psEnc->sCmn, &psEnc->sCmn.sNSQ, &psEnc->sCmn.indices, xfw_Q3, psEnc->sCmn.pulses, + sEncCtrl.PredCoef_Q12[ 0 ], sEncCtrl.LTPCoef_Q14, sEncCtrl.AR2_Q13, sEncCtrl.HarmShapeGain_Q14, + sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14, + psEnc->sCmn.arch ); + } else { + silk_NSQ( &psEnc->sCmn, &psEnc->sCmn.sNSQ, &psEnc->sCmn.indices, xfw_Q3, psEnc->sCmn.pulses, + sEncCtrl.PredCoef_Q12[ 0 ], sEncCtrl.LTPCoef_Q14, sEncCtrl.AR2_Q13, sEncCtrl.HarmShapeGain_Q14, + sEncCtrl.Tilt_Q14, sEncCtrl.LF_shp_Q14, sEncCtrl.Gains_Q16, sEncCtrl.pitchL, sEncCtrl.Lambda_Q10, sEncCtrl.LTP_scale_Q14, + psEnc->sCmn.arch); + } + + /****************************************/ + /* Encode Parameters */ + /****************************************/ + silk_encode_indices( &psEnc->sCmn, psRangeEnc, psEnc->sCmn.nFramesEncoded, 0, condCoding ); + + /****************************************/ + /* Encode Excitation Signal */ + /****************************************/ + silk_encode_pulses( psRangeEnc, psEnc->sCmn.indices.signalType, psEnc->sCmn.indices.quantOffsetType, + psEnc->sCmn.pulses, psEnc->sCmn.frame_length ); + + nBits = ec_tell( psRangeEnc ); + + if( useCBR == 0 && iter == 0 && nBits <= maxBits ) { + break; + } + } + + if( iter == maxIter ) { + if( found_lower && ( gainsID == gainsID_lower || nBits > maxBits ) ) { + /* Restore output state from earlier iteration that did meet the bitrate budget */ + silk_memcpy( psRangeEnc, &sRangeEnc_copy2, sizeof( ec_enc ) ); + silk_assert( sRangeEnc_copy2.offs <= 1275 ); + silk_memcpy( psRangeEnc->buf, ec_buf_copy, sRangeEnc_copy2.offs ); + silk_memcpy( &psEnc->sCmn.sNSQ, &sNSQ_copy2, sizeof( silk_nsq_state ) ); + psEnc->sShape.LastGainIndex = LastGainIndex_copy2; + } + break; + } + + if( nBits > maxBits ) { + if( found_lower == 0 && iter >= 2 ) { + /* Adjust the quantizer's rate/distortion tradeoff and discard previous "upper" results */ + sEncCtrl.Lambda_Q10 = silk_ADD_RSHIFT32( sEncCtrl.Lambda_Q10, sEncCtrl.Lambda_Q10, 1 ); + found_upper = 0; + gainsID_upper = -1; + } else { + found_upper = 1; + nBits_upper = nBits; + gainMult_upper = gainMult_Q8; + gainsID_upper = gainsID; + } + } else if( nBits < maxBits - 
5 ) {
+                found_lower = 1;
+                nBits_lower = nBits;
+                gainMult_lower = gainMult_Q8;
+                if( gainsID != gainsID_lower ) {
+                    gainsID_lower = gainsID;
+                    /* Copy part of the output state */
+                    silk_memcpy( &sRangeEnc_copy2, psRangeEnc, sizeof( ec_enc ) );
+                    silk_assert( psRangeEnc->offs <= 1275 );
+                    silk_memcpy( ec_buf_copy, psRangeEnc->buf, psRangeEnc->offs );
+                    silk_memcpy( &sNSQ_copy2, &psEnc->sCmn.sNSQ, sizeof( silk_nsq_state ) );
+                    LastGainIndex_copy2 = psEnc->sShape.LastGainIndex;
+                }
+            } else {
+                /* Within 5 bits of budget: close enough */
+                break;
+            }
+
+            if( ( found_lower & found_upper ) == 0 ) {
+                /* Adjust gain according to high-rate rate/distortion curve */
+                opus_int32 gain_factor_Q16;
+                gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) );
+                gain_factor_Q16 = silk_min_32( gain_factor_Q16, SILK_FIX_CONST( 2, 16 ) );
+                if( nBits > maxBits ) {
+                    gain_factor_Q16 = silk_max_32( gain_factor_Q16, SILK_FIX_CONST( 1.3, 16 ) );
+                }
+                gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 );
+            } else {
+                /* Adjust gain by interpolating */
+                gainMult_Q8 = gainMult_lower + silk_DIV32_16( silk_MUL( gainMult_upper - gainMult_lower, maxBits - nBits_lower ), nBits_upper - nBits_lower );
+                /* New gain multiplier must be between 25% and 75% of old range (note that gainMult_upper < gainMult_lower) */
+                if( gainMult_Q8 > silk_ADD_RSHIFT32( gainMult_lower, gainMult_upper - gainMult_lower, 2 ) ) {
+                    gainMult_Q8 = silk_ADD_RSHIFT32( gainMult_lower, gainMult_upper - gainMult_lower, 2 );
+                } else
+                if( gainMult_Q8 < silk_SUB_RSHIFT32( gainMult_upper, gainMult_upper - gainMult_lower, 2 ) ) {
+                    gainMult_Q8 = silk_SUB_RSHIFT32( gainMult_upper, gainMult_upper - gainMult_lower, 2 );
+                }
+            }
+
+            for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) {
+                sEncCtrl.Gains_Q16[ i ] = silk_LSHIFT_SAT32( silk_SMULWB( sEncCtrl.GainsUnq_Q16[ i ], gainMult_Q8 ), 8 );
+            }
+
+            /* Quantize gains */
+            psEnc->sShape.LastGainIndex = sEncCtrl.lastGainIndexPrev;
+            silk_gains_quant( psEnc->sCmn.indices.GainsIndices, sEncCtrl.Gains_Q16,
+                &psEnc->sShape.LastGainIndex, condCoding == CODE_CONDITIONALLY, psEnc->sCmn.nb_subfr );
+
+            /* Unique identifier of gains vector */
+            gainsID = silk_gains_ID( psEnc->sCmn.indices.GainsIndices, psEnc->sCmn.nb_subfr );
+        }
+    }
+
+    /* Update input buffer */
+    silk_memmove( psEnc->x_buf, &psEnc->x_buf[ psEnc->sCmn.frame_length ],
+        ( psEnc->sCmn.ltp_mem_length + LA_SHAPE_MS * psEnc->sCmn.fs_kHz ) * sizeof( opus_int16 ) );
+
+    /* Exit without entropy coding */
+    if( psEnc->sCmn.prefillFlag ) {
+        /* No payload */
+        *pnBytesOut = 0;
+        RESTORE_STACK;
+        return ret;
+    }
+
+    /* Parameters needed for next frame */
+    psEnc->sCmn.prevLag = sEncCtrl.pitchL[ psEnc->sCmn.nb_subfr - 1 ];
+    psEnc->sCmn.prevSignalType = psEnc->sCmn.indices.signalType;
+
+    /****************************************/
+    /* Finalize payload                     */
+    /****************************************/
+    psEnc->sCmn.first_frame_after_reset = 0;
+    /* Payload size */
+    *pnBytesOut = silk_RSHIFT( ec_tell( psRangeEnc ) + 7, 3 );
+
+    RESTORE_STACK;
+    return ret;
+}
+
+/* Low-Bitrate Redundancy (LBRR) encoding.
Reuse all parameters but encode excitation at lower bitrate */ +static OPUS_INLINE void silk_LBRR_encode_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Pointer to Silk FIX encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O Pointer to Silk FIX encoder control struct */ + const opus_int32 xfw_Q3[], /* I Input signal */ + opus_int condCoding /* I The type of conditional coding used so far for this frame */ +) +{ + opus_int32 TempGains_Q16[ MAX_NB_SUBFR ]; + SideInfoIndices *psIndices_LBRR = &psEnc->sCmn.indices_LBRR[ psEnc->sCmn.nFramesEncoded ]; + silk_nsq_state sNSQ_LBRR; + + /*******************************************/ + /* Control use of inband LBRR */ + /*******************************************/ + if( psEnc->sCmn.LBRR_enabled && psEnc->sCmn.speech_activity_Q8 > SILK_FIX_CONST( LBRR_SPEECH_ACTIVITY_THRES, 8 ) ) { + psEnc->sCmn.LBRR_flags[ psEnc->sCmn.nFramesEncoded ] = 1; + + /* Copy noise shaping quantizer state and quantization indices from regular encoding */ + silk_memcpy( &sNSQ_LBRR, &psEnc->sCmn.sNSQ, sizeof( silk_nsq_state ) ); + silk_memcpy( psIndices_LBRR, &psEnc->sCmn.indices, sizeof( SideInfoIndices ) ); + + /* Save original gains */ + silk_memcpy( TempGains_Q16, psEncCtrl->Gains_Q16, psEnc->sCmn.nb_subfr * sizeof( opus_int32 ) ); + + if( psEnc->sCmn.nFramesEncoded == 0 || psEnc->sCmn.LBRR_flags[ psEnc->sCmn.nFramesEncoded - 1 ] == 0 ) { + /* First frame in packet or previous frame not LBRR coded */ + psEnc->sCmn.LBRRprevLastGainIndex = psEnc->sShape.LastGainIndex; + + /* Increase Gains to get target LBRR rate */ + psIndices_LBRR->GainsIndices[ 0 ] = psIndices_LBRR->GainsIndices[ 0 ] + psEnc->sCmn.LBRR_GainIncreases; + psIndices_LBRR->GainsIndices[ 0 ] = silk_min_int( psIndices_LBRR->GainsIndices[ 0 ], N_LEVELS_QGAIN - 1 ); + } + + /* Decode to get gains in sync with decoder */ + /* Overwrite unquantized gains with quantized gains */ + silk_gains_dequant( psEncCtrl->Gains_Q16, psIndices_LBRR->GainsIndices, + &psEnc->sCmn.LBRRprevLastGainIndex, condCoding == CODE_CONDITIONALLY, psEnc->sCmn.nb_subfr ); + + /*****************************************/ + /* Noise shaping quantization */ + /*****************************************/ + if( psEnc->sCmn.nStatesDelayedDecision > 1 || psEnc->sCmn.warping_Q16 > 0 ) { + silk_NSQ_del_dec( &psEnc->sCmn, &sNSQ_LBRR, psIndices_LBRR, xfw_Q3, + psEnc->sCmn.pulses_LBRR[ psEnc->sCmn.nFramesEncoded ], psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->LTPCoef_Q14, + psEncCtrl->AR2_Q13, psEncCtrl->HarmShapeGain_Q14, psEncCtrl->Tilt_Q14, psEncCtrl->LF_shp_Q14, + psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14, psEnc->sCmn.arch ); + } else { + silk_NSQ( &psEnc->sCmn, &sNSQ_LBRR, psIndices_LBRR, xfw_Q3, + psEnc->sCmn.pulses_LBRR[ psEnc->sCmn.nFramesEncoded ], psEncCtrl->PredCoef_Q12[ 0 ], psEncCtrl->LTPCoef_Q14, + psEncCtrl->AR2_Q13, psEncCtrl->HarmShapeGain_Q14, psEncCtrl->Tilt_Q14, psEncCtrl->LF_shp_Q14, + psEncCtrl->Gains_Q16, psEncCtrl->pitchL, psEncCtrl->Lambda_Q10, psEncCtrl->LTP_scale_Q14, psEnc->sCmn.arch ); + } + + /* Restore original gains */ + silk_memcpy( psEncCtrl->Gains_Q16, TempGains_Q16, psEnc->sCmn.nb_subfr * sizeof( opus_int32 ) ); + } +} diff --git a/thirdparty/opus/silk/fixed/find_LPC_FIX.c b/thirdparty/opus/silk/fixed/find_LPC_FIX.c new file mode 100644 index 000000000000..e11cdc86e67d --- /dev/null +++ b/thirdparty/opus/silk/fixed/find_LPC_FIX.c @@ -0,0 +1,151 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. 
All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "stack_alloc.h" +#include "tuning_parameters.h" + +/* Finds LPC vector from correlations, and converts to NLSF */ +void silk_find_LPC_FIX( + silk_encoder_state *psEncC, /* I/O Encoder state */ + opus_int16 NLSF_Q15[], /* O NLSFs */ + const opus_int16 x[], /* I Input signal */ + const opus_int32 minInvGain_Q30 /* I Inverse of max prediction gain */ +) +{ + opus_int k, subfr_length; + opus_int32 a_Q16[ MAX_LPC_ORDER ]; + opus_int isInterpLower, shift; + opus_int32 res_nrg0, res_nrg1; + opus_int rshift0, rshift1; + + /* Used only for LSF interpolation */ + opus_int32 a_tmp_Q16[ MAX_LPC_ORDER ], res_nrg_interp, res_nrg, res_tmp_nrg; + opus_int res_nrg_interp_Q, res_nrg_Q, res_tmp_nrg_Q; + opus_int16 a_tmp_Q12[ MAX_LPC_ORDER ]; + opus_int16 NLSF0_Q15[ MAX_LPC_ORDER ]; + SAVE_STACK; + + subfr_length = psEncC->subfr_length + psEncC->predictLPCOrder; + + /* Default: no interpolation */ + psEncC->indices.NLSFInterpCoef_Q2 = 4; + + /* Burg AR analysis for the full frame */ + silk_burg_modified( &res_nrg, &res_nrg_Q, a_Q16, x, minInvGain_Q30, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, psEncC->arch ); + + if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) { + VARDECL( opus_int16, LPC_res ); + + /* Optimal solution for last 10 ms */ + silk_burg_modified( &res_tmp_nrg, &res_tmp_nrg_Q, a_tmp_Q16, x + 2 * subfr_length, minInvGain_Q30, subfr_length, 2, psEncC->predictLPCOrder, psEncC->arch ); + + /* subtract residual energy here, as that's easier than adding it to the */ + /* residual energy of the first 10 ms in each iteration of the search below */ + shift = res_tmp_nrg_Q - res_nrg_Q; + if( shift >= 0 ) { + if( shift < 32 ) { + res_nrg = res_nrg - silk_RSHIFT( res_tmp_nrg, shift ); + } + } else { + silk_assert( shift > -32 ); + res_nrg = silk_RSHIFT( res_nrg, -shift ) - res_tmp_nrg; + res_nrg_Q = res_tmp_nrg_Q; + } + + 
/* Convert to NLSFs */ + silk_A2NLSF( NLSF_Q15, a_tmp_Q16, psEncC->predictLPCOrder ); + + ALLOC( LPC_res, 2 * subfr_length, opus_int16 ); + + /* Search over interpolation indices to find the one with lowest residual energy */ + for( k = 3; k >= 0; k-- ) { + /* Interpolate NLSFs for first half */ + silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder ); + + /* Convert to LPC for residual energy evaluation */ + silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder ); + + /* Calculate residual energy with NLSF interpolation */ + silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch ); + + silk_sum_sqr_shift( &res_nrg0, &rshift0, LPC_res + psEncC->predictLPCOrder, subfr_length - psEncC->predictLPCOrder ); + silk_sum_sqr_shift( &res_nrg1, &rshift1, LPC_res + psEncC->predictLPCOrder + subfr_length, subfr_length - psEncC->predictLPCOrder ); + + /* Add subframe energies from first half frame */ + shift = rshift0 - rshift1; + if( shift >= 0 ) { + res_nrg1 = silk_RSHIFT( res_nrg1, shift ); + res_nrg_interp_Q = -rshift0; + } else { + res_nrg0 = silk_RSHIFT( res_nrg0, -shift ); + res_nrg_interp_Q = -rshift1; + } + res_nrg_interp = silk_ADD32( res_nrg0, res_nrg1 ); + + /* Compare with first half energy without NLSF interpolation, or best interpolated value so far */ + shift = res_nrg_interp_Q - res_nrg_Q; + if( shift >= 0 ) { + if( silk_RSHIFT( res_nrg_interp, shift ) < res_nrg ) { + isInterpLower = silk_TRUE; + } else { + isInterpLower = silk_FALSE; + } + } else { + if( -shift < 32 ) { + if( res_nrg_interp < silk_RSHIFT( res_nrg, -shift ) ) { + isInterpLower = silk_TRUE; + } else { + isInterpLower = silk_FALSE; + } + } else { + isInterpLower = silk_FALSE; + } + } + + /* Determine whether current interpolated NLSFs are best so far */ + if( isInterpLower == silk_TRUE ) { + /* Interpolation has lower residual energy */ + res_nrg = res_nrg_interp; + res_nrg_Q = res_nrg_interp_Q; + psEncC->indices.NLSFInterpCoef_Q2 = (opus_int8)k; + } + } + } + + if( psEncC->indices.NLSFInterpCoef_Q2 == 4 ) { + /* NLSF interpolation is currently inactive, calculate NLSFs from full frame AR coefficients */ + silk_A2NLSF( NLSF_Q15, a_Q16, psEncC->predictLPCOrder ); + } + + silk_assert( psEncC->indices.NLSFInterpCoef_Q2 == 4 || ( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) ); + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/fixed/find_LTP_FIX.c b/thirdparty/opus/silk/fixed/find_LTP_FIX.c new file mode 100644 index 000000000000..1314a281375b --- /dev/null +++ b/thirdparty/opus/silk/fixed/find_LTP_FIX.c @@ -0,0 +1,245 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "tuning_parameters.h" + +/* Head room for correlations */ +#define LTP_CORRS_HEAD_ROOM 2 + +void silk_fit_LTP( + opus_int32 LTP_coefs_Q16[ LTP_ORDER ], + opus_int16 LTP_coefs_Q14[ LTP_ORDER ] +); + +void silk_find_LTP_FIX( + opus_int16 b_Q14[ MAX_NB_SUBFR * LTP_ORDER ], /* O LTP coefs */ + opus_int32 WLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* O Weight for LTP quantization */ + opus_int *LTPredCodGain_Q7, /* O LTP coding gain */ + const opus_int16 r_lpc[], /* I residual signal after LPC signal + state for first 10 ms */ + const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ + const opus_int32 Wght_Q15[ MAX_NB_SUBFR ], /* I weights */ + const opus_int subfr_length, /* I subframe length */ + const opus_int nb_subfr, /* I number of subframes */ + const opus_int mem_offset, /* I number of samples in LTP memory */ + opus_int corr_rshifts[ MAX_NB_SUBFR ], /* O right shifts applied to correlations */ + int arch /* I Run-time architecture */ +) +{ + opus_int i, k, lshift; + const opus_int16 *r_ptr, *lag_ptr; + opus_int16 *b_Q14_ptr; + + opus_int32 regu; + opus_int32 *WLTP_ptr; + opus_int32 b_Q16[ LTP_ORDER ], delta_b_Q14[ LTP_ORDER ], d_Q14[ MAX_NB_SUBFR ], nrg[ MAX_NB_SUBFR ], g_Q26; + opus_int32 w[ MAX_NB_SUBFR ], WLTP_max, max_abs_d_Q14, max_w_bits; + + opus_int32 temp32, denom32; + opus_int extra_shifts; + opus_int rr_shifts, maxRshifts, maxRshifts_wxtra, LZs; + opus_int32 LPC_res_nrg, LPC_LTP_res_nrg, div_Q16; + opus_int32 Rr[ LTP_ORDER ], rr[ MAX_NB_SUBFR ]; + opus_int32 wd, m_Q12; + + b_Q14_ptr = b_Q14; + WLTP_ptr = WLTP; + r_ptr = &r_lpc[ mem_offset ]; + for( k = 0; k < nb_subfr; k++ ) { + lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 ); + + silk_sum_sqr_shift( &rr[ k ], &rr_shifts, r_ptr, subfr_length ); /* rr[ k ] in Q( -rr_shifts ) */ + + /* Assure headroom */ + LZs = silk_CLZ32( rr[k] ); + if( LZs < LTP_CORRS_HEAD_ROOM ) { + rr[ k ] = silk_RSHIFT_ROUND( rr[ k ], LTP_CORRS_HEAD_ROOM - LZs ); + rr_shifts += ( LTP_CORRS_HEAD_ROOM - LZs ); + } + corr_rshifts[ k ] = rr_shifts; + silk_corrMatrix_FIX( lag_ptr, subfr_length, LTP_ORDER, LTP_CORRS_HEAD_ROOM, WLTP_ptr, &corr_rshifts[ k ], arch ); /* WLTP_fix_ptr in Q( -corr_rshifts[ k ] ) */ + + /* The correlation vector always has lower max abs value than rr and/or RR so head room is assured */ + silk_corrVector_FIX( lag_ptr, r_ptr, subfr_length, LTP_ORDER, Rr, corr_rshifts[ k ], arch ); /* Rr_fix_ptr in Q( -corr_rshifts[ k ] ) */ + if( corr_rshifts[ k ] > rr_shifts ) { + rr[ k ] = silk_RSHIFT( rr[ k ], corr_rshifts[ k ] - rr_shifts ); /* rr[ k ] in Q( -corr_rshifts[ k ] ) */ + } + silk_assert( rr[ k ] >= 0 ); + + regu = 1; + regu 
= silk_SMLAWB( regu, rr[ k ], SILK_FIX_CONST( LTP_DAMPING/3, 16 ) ); + regu = silk_SMLAWB( regu, matrix_ptr( WLTP_ptr, 0, 0, LTP_ORDER ), SILK_FIX_CONST( LTP_DAMPING/3, 16 ) ); + regu = silk_SMLAWB( regu, matrix_ptr( WLTP_ptr, LTP_ORDER-1, LTP_ORDER-1, LTP_ORDER ), SILK_FIX_CONST( LTP_DAMPING/3, 16 ) ); + silk_regularize_correlations_FIX( WLTP_ptr, &rr[k], regu, LTP_ORDER ); + + silk_solve_LDL_FIX( WLTP_ptr, LTP_ORDER, Rr, b_Q16 ); /* WLTP_fix_ptr and Rr_fix_ptr both in Q(-corr_rshifts[k]) */ + + /* Limit and store in Q14 */ + silk_fit_LTP( b_Q16, b_Q14_ptr ); + + /* Calculate residual energy */ + nrg[ k ] = silk_residual_energy16_covar_FIX( b_Q14_ptr, WLTP_ptr, Rr, rr[ k ], LTP_ORDER, 14 ); /* nrg_fix in Q( -corr_rshifts[ k ] ) */ + + /* temp = Wght[ k ] / ( nrg[ k ] * Wght[ k ] + 0.01f * subfr_length ); */ + extra_shifts = silk_min_int( corr_rshifts[ k ], LTP_CORRS_HEAD_ROOM ); + denom32 = silk_LSHIFT_SAT32( silk_SMULWB( nrg[ k ], Wght_Q15[ k ] ), 1 + extra_shifts ) + /* Q( -corr_rshifts[ k ] + extra_shifts ) */ + silk_RSHIFT( silk_SMULWB( (opus_int32)subfr_length, 655 ), corr_rshifts[ k ] - extra_shifts ); /* Q( -corr_rshifts[ k ] + extra_shifts ) */ + denom32 = silk_max( denom32, 1 ); + silk_assert( ((opus_int64)Wght_Q15[ k ] << 16 ) < silk_int32_MAX ); /* Wght always < 0.5 in Q0 */ + temp32 = silk_DIV32( silk_LSHIFT( (opus_int32)Wght_Q15[ k ], 16 ), denom32 ); /* Q( 15 + 16 + corr_rshifts[k] - extra_shifts ) */ + temp32 = silk_RSHIFT( temp32, 31 + corr_rshifts[ k ] - extra_shifts - 26 ); /* Q26 */ + + /* Limit temp such that the below scaling never wraps around */ + WLTP_max = 0; + for( i = 0; i < LTP_ORDER * LTP_ORDER; i++ ) { + WLTP_max = silk_max( WLTP_ptr[ i ], WLTP_max ); + } + lshift = silk_CLZ32( WLTP_max ) - 1 - 3; /* keep 3 bits free for vq_nearest_neighbor_fix */ + silk_assert( 26 - 18 + lshift >= 0 ); + if( 26 - 18 + lshift < 31 ) { + temp32 = silk_min_32( temp32, silk_LSHIFT( (opus_int32)1, 26 - 18 + lshift ) ); + } + + silk_scale_vector32_Q26_lshift_18( WLTP_ptr, temp32, LTP_ORDER * LTP_ORDER ); /* WLTP_ptr in Q( 18 - corr_rshifts[ k ] ) */ + + w[ k ] = matrix_ptr( WLTP_ptr, LTP_ORDER/2, LTP_ORDER/2, LTP_ORDER ); /* w in Q( 18 - corr_rshifts[ k ] ) */ + silk_assert( w[k] >= 0 ); + + r_ptr += subfr_length; + b_Q14_ptr += LTP_ORDER; + WLTP_ptr += LTP_ORDER * LTP_ORDER; + } + + maxRshifts = 0; + for( k = 0; k < nb_subfr; k++ ) { + maxRshifts = silk_max_int( corr_rshifts[ k ], maxRshifts ); + } + + /* Compute LTP coding gain */ + if( LTPredCodGain_Q7 != NULL ) { + LPC_LTP_res_nrg = 0; + LPC_res_nrg = 0; + silk_assert( LTP_CORRS_HEAD_ROOM >= 2 ); /* Check that no overflow will happen when adding */ + for( k = 0; k < nb_subfr; k++ ) { + LPC_res_nrg = silk_ADD32( LPC_res_nrg, silk_RSHIFT( silk_ADD32( silk_SMULWB( rr[ k ], Wght_Q15[ k ] ), 1 ), 1 + ( maxRshifts - corr_rshifts[ k ] ) ) ); /* Q( -maxRshifts ) */ + LPC_LTP_res_nrg = silk_ADD32( LPC_LTP_res_nrg, silk_RSHIFT( silk_ADD32( silk_SMULWB( nrg[ k ], Wght_Q15[ k ] ), 1 ), 1 + ( maxRshifts - corr_rshifts[ k ] ) ) ); /* Q( -maxRshifts ) */ + } + LPC_LTP_res_nrg = silk_max( LPC_LTP_res_nrg, 1 ); /* avoid division by zero */ + + div_Q16 = silk_DIV32_varQ( LPC_res_nrg, LPC_LTP_res_nrg, 16 ); + *LTPredCodGain_Q7 = ( opus_int )silk_SMULBB( 3, silk_lin2log( div_Q16 ) - ( 16 << 7 ) ); + + silk_assert( *LTPredCodGain_Q7 == ( opus_int )silk_SAT16( silk_MUL( 3, silk_lin2log( div_Q16 ) - ( 16 << 7 ) ) ) ); + } + + /* smoothing */ + /* d = sum( B, 1 ); */ + b_Q14_ptr = b_Q14; + for( k = 0; k < nb_subfr; k++ ) { + d_Q14[ k ] = 0; + for( i 
= 0; i < LTP_ORDER; i++ ) { + d_Q14[ k ] += b_Q14_ptr[ i ]; + } + b_Q14_ptr += LTP_ORDER; + } + + /* m = ( w * d' ) / ( sum( w ) + 1e-3 ); */ + + /* Find maximum absolute value of d_Q14 and the bits used by w in Q0 */ + max_abs_d_Q14 = 0; + max_w_bits = 0; + for( k = 0; k < nb_subfr; k++ ) { + max_abs_d_Q14 = silk_max_32( max_abs_d_Q14, silk_abs( d_Q14[ k ] ) ); + /* w[ k ] is in Q( 18 - corr_rshifts[ k ] ) */ + /* Find bits needed in Q( 18 - maxRshifts ) */ + max_w_bits = silk_max_32( max_w_bits, 32 - silk_CLZ32( w[ k ] ) + corr_rshifts[ k ] - maxRshifts ); + } + + /* max_abs_d_Q14 = (5 << 15); worst case, i.e. LTP_ORDER * -silk_int16_MIN */ + silk_assert( max_abs_d_Q14 <= ( 5 << 15 ) ); + + /* How many bits is needed for w*d' in Q( 18 - maxRshifts ) in the worst case, of all d_Q14's being equal to max_abs_d_Q14 */ + extra_shifts = max_w_bits + 32 - silk_CLZ32( max_abs_d_Q14 ) - 14; + + /* Subtract what we got available; bits in output var plus maxRshifts */ + extra_shifts -= ( 32 - 1 - 2 + maxRshifts ); /* Keep sign bit free as well as 2 bits for accumulation */ + extra_shifts = silk_max_int( extra_shifts, 0 ); + + maxRshifts_wxtra = maxRshifts + extra_shifts; + + temp32 = silk_RSHIFT( 262, maxRshifts + extra_shifts ) + 1; /* 1e-3f in Q( 18 - (maxRshifts + extra_shifts) ) */ + wd = 0; + for( k = 0; k < nb_subfr; k++ ) { + /* w has at least 2 bits of headroom so no overflow should happen */ + temp32 = silk_ADD32( temp32, silk_RSHIFT( w[ k ], maxRshifts_wxtra - corr_rshifts[ k ] ) ); /* Q( 18 - maxRshifts_wxtra ) */ + wd = silk_ADD32( wd, silk_LSHIFT( silk_SMULWW( silk_RSHIFT( w[ k ], maxRshifts_wxtra - corr_rshifts[ k ] ), d_Q14[ k ] ), 2 ) ); /* Q( 18 - maxRshifts_wxtra ) */ + } + m_Q12 = silk_DIV32_varQ( wd, temp32, 12 ); + + b_Q14_ptr = b_Q14; + for( k = 0; k < nb_subfr; k++ ) { + /* w_fix[ k ] from Q( 18 - corr_rshifts[ k ] ) to Q( 16 ) */ + if( 2 - corr_rshifts[k] > 0 ) { + temp32 = silk_RSHIFT( w[ k ], 2 - corr_rshifts[ k ] ); + } else { + temp32 = silk_LSHIFT_SAT32( w[ k ], corr_rshifts[ k ] - 2 ); + } + + g_Q26 = silk_MUL( + silk_DIV32( + SILK_FIX_CONST( LTP_SMOOTHING, 26 ), + silk_RSHIFT( SILK_FIX_CONST( LTP_SMOOTHING, 26 ), 10 ) + temp32 ), /* Q10 */ + silk_LSHIFT_SAT32( silk_SUB_SAT32( (opus_int32)m_Q12, silk_RSHIFT( d_Q14[ k ], 2 ) ), 4 ) ); /* Q16 */ + + temp32 = 0; + for( i = 0; i < LTP_ORDER; i++ ) { + delta_b_Q14[ i ] = silk_max_16( b_Q14_ptr[ i ], 1638 ); /* 1638_Q14 = 0.1_Q0 */ + temp32 += delta_b_Q14[ i ]; /* Q14 */ + } + temp32 = silk_DIV32( g_Q26, temp32 ); /* Q14 -> Q12 */ + for( i = 0; i < LTP_ORDER; i++ ) { + b_Q14_ptr[ i ] = silk_LIMIT_32( (opus_int32)b_Q14_ptr[ i ] + silk_SMULWB( silk_LSHIFT_SAT32( temp32, 4 ), delta_b_Q14[ i ] ), -16000, 28000 ); + } + b_Q14_ptr += LTP_ORDER; + } +} + +void silk_fit_LTP( + opus_int32 LTP_coefs_Q16[ LTP_ORDER ], + opus_int16 LTP_coefs_Q14[ LTP_ORDER ] +) +{ + opus_int i; + + for( i = 0; i < LTP_ORDER; i++ ) { + LTP_coefs_Q14[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( LTP_coefs_Q16[ i ], 2 ) ); + } +} diff --git a/thirdparty/opus/silk/fixed/find_pitch_lags_FIX.c b/thirdparty/opus/silk/fixed/find_pitch_lags_FIX.c new file mode 100644 index 000000000000..b8440a8247a3 --- /dev/null +++ b/thirdparty/opus/silk/fixed/find_pitch_lags_FIX.c @@ -0,0 +1,145 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "stack_alloc.h" +#include "tuning_parameters.h" + +/* Find pitch lags */ +void silk_find_pitch_lags_FIX( + silk_encoder_state_FIX *psEnc, /* I/O encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */ + opus_int16 res[], /* O residual */ + const opus_int16 x[], /* I Speech signal */ + int arch /* I Run-time architecture */ +) +{ + opus_int buf_len, i, scale; + opus_int32 thrhld_Q13, res_nrg; + const opus_int16 *x_buf, *x_buf_ptr; + VARDECL( opus_int16, Wsig ); + opus_int16 *Wsig_ptr; + opus_int32 auto_corr[ MAX_FIND_PITCH_LPC_ORDER + 1 ]; + opus_int16 rc_Q15[ MAX_FIND_PITCH_LPC_ORDER ]; + opus_int32 A_Q24[ MAX_FIND_PITCH_LPC_ORDER ]; + opus_int16 A_Q12[ MAX_FIND_PITCH_LPC_ORDER ]; + SAVE_STACK; + + /******************************************/ + /* Set up buffer lengths etc based on Fs */ + /******************************************/ + buf_len = psEnc->sCmn.la_pitch + psEnc->sCmn.frame_length + psEnc->sCmn.ltp_mem_length; + + /* Safety check */ + silk_assert( buf_len >= psEnc->sCmn.pitch_LPC_win_length ); + + x_buf = x - psEnc->sCmn.ltp_mem_length; + + /*************************************/ + /* Estimate LPC AR coefficients */ + /*************************************/ + + /* Calculate windowed signal */ + + ALLOC( Wsig, psEnc->sCmn.pitch_LPC_win_length, opus_int16 ); + + /* First LA_LTP samples */ + x_buf_ptr = x_buf + buf_len - psEnc->sCmn.pitch_LPC_win_length; + Wsig_ptr = Wsig; + silk_apply_sine_window( Wsig_ptr, x_buf_ptr, 1, psEnc->sCmn.la_pitch ); + + /* Middle un - windowed samples */ + Wsig_ptr += psEnc->sCmn.la_pitch; + x_buf_ptr += psEnc->sCmn.la_pitch; + silk_memcpy( Wsig_ptr, x_buf_ptr, ( psEnc->sCmn.pitch_LPC_win_length - silk_LSHIFT( psEnc->sCmn.la_pitch, 1 ) ) * sizeof( opus_int16 ) ); + + /* Last LA_LTP samples */ + Wsig_ptr += psEnc->sCmn.pitch_LPC_win_length - silk_LSHIFT( psEnc->sCmn.la_pitch, 1 ); + x_buf_ptr += psEnc->sCmn.pitch_LPC_win_length - 
silk_LSHIFT( psEnc->sCmn.la_pitch, 1 ); + silk_apply_sine_window( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch ); + + /* Calculate autocorrelation sequence */ + silk_autocorr( auto_corr, &scale, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch ); + + /* Add white noise, as fraction of energy */ + auto_corr[ 0 ] = silk_SMLAWB( auto_corr[ 0 ], auto_corr[ 0 ], SILK_FIX_CONST( FIND_PITCH_WHITE_NOISE_FRACTION, 16 ) ) + 1; + + /* Calculate the reflection coefficients using schur */ + res_nrg = silk_schur( rc_Q15, auto_corr, psEnc->sCmn.pitchEstimationLPCOrder ); + + /* Prediction gain */ + psEncCtrl->predGain_Q16 = silk_DIV32_varQ( auto_corr[ 0 ], silk_max_int( res_nrg, 1 ), 16 ); + + /* Convert reflection coefficients to prediction coefficients */ + silk_k2a( A_Q24, rc_Q15, psEnc->sCmn.pitchEstimationLPCOrder ); + + /* Convert From 32 bit Q24 to 16 bit Q12 coefs */ + for( i = 0; i < psEnc->sCmn.pitchEstimationLPCOrder; i++ ) { + A_Q12[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT( A_Q24[ i ], 12 ) ); + } + + /* Do BWE */ + silk_bwexpander( A_Q12, psEnc->sCmn.pitchEstimationLPCOrder, SILK_FIX_CONST( FIND_PITCH_BANDWIDTH_EXPANSION, 16 ) ); + + /*****************************************/ + /* LPC analysis filtering */ + /*****************************************/ + silk_LPC_analysis_filter( res, x_buf, A_Q12, buf_len, psEnc->sCmn.pitchEstimationLPCOrder, psEnc->sCmn.arch ); + + if( psEnc->sCmn.indices.signalType != TYPE_NO_VOICE_ACTIVITY && psEnc->sCmn.first_frame_after_reset == 0 ) { + /* Threshold for pitch estimator */ + thrhld_Q13 = SILK_FIX_CONST( 0.6, 13 ); + thrhld_Q13 = silk_SMLABB( thrhld_Q13, SILK_FIX_CONST( -0.004, 13 ), psEnc->sCmn.pitchEstimationLPCOrder ); + thrhld_Q13 = silk_SMLAWB( thrhld_Q13, SILK_FIX_CONST( -0.1, 21 ), psEnc->sCmn.speech_activity_Q8 ); + thrhld_Q13 = silk_SMLABB( thrhld_Q13, SILK_FIX_CONST( -0.15, 13 ), silk_RSHIFT( psEnc->sCmn.prevSignalType, 1 ) ); + thrhld_Q13 = silk_SMLAWB( thrhld_Q13, SILK_FIX_CONST( -0.1, 14 ), psEnc->sCmn.input_tilt_Q15 ); + thrhld_Q13 = silk_SAT16( thrhld_Q13 ); + + /*****************************************/ + /* Call pitch estimator */ + /*****************************************/ + if( silk_pitch_analysis_core( res, psEncCtrl->pitchL, &psEnc->sCmn.indices.lagIndex, &psEnc->sCmn.indices.contourIndex, + &psEnc->LTPCorr_Q15, psEnc->sCmn.prevLag, psEnc->sCmn.pitchEstimationThreshold_Q16, + (opus_int)thrhld_Q13, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr, + psEnc->sCmn.arch) == 0 ) + { + psEnc->sCmn.indices.signalType = TYPE_VOICED; + } else { + psEnc->sCmn.indices.signalType = TYPE_UNVOICED; + } + } else { + silk_memset( psEncCtrl->pitchL, 0, sizeof( psEncCtrl->pitchL ) ); + psEnc->sCmn.indices.lagIndex = 0; + psEnc->sCmn.indices.contourIndex = 0; + psEnc->LTPCorr_Q15 = 0; + } + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/fixed/find_pred_coefs_FIX.c b/thirdparty/opus/silk/fixed/find_pred_coefs_FIX.c new file mode 100644 index 000000000000..d308e9cf5fe6 --- /dev/null +++ b/thirdparty/opus/silk/fixed/find_pred_coefs_FIX.c @@ -0,0 +1,148 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "stack_alloc.h" + +void silk_find_pred_coefs_FIX( + silk_encoder_state_FIX *psEnc, /* I/O encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */ + const opus_int16 res_pitch[], /* I Residual from pitch analysis */ + const opus_int16 x[], /* I Speech signal */ + opus_int condCoding /* I The type of conditional coding to use */ +) +{ + opus_int i; + opus_int32 invGains_Q16[ MAX_NB_SUBFR ], local_gains[ MAX_NB_SUBFR ], Wght_Q15[ MAX_NB_SUBFR ]; + opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]; + const opus_int16 *x_ptr; + opus_int16 *x_pre_ptr; + VARDECL( opus_int16, LPC_in_pre ); + opus_int32 tmp, min_gain_Q16, minInvGain_Q30; + opus_int LTP_corrs_rshift[ MAX_NB_SUBFR ]; + SAVE_STACK; + + /* weighting for weighted least squares */ + min_gain_Q16 = silk_int32_MAX >> 6; + for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { + min_gain_Q16 = silk_min( min_gain_Q16, psEncCtrl->Gains_Q16[ i ] ); + } + for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { + /* Divide to Q16 */ + silk_assert( psEncCtrl->Gains_Q16[ i ] > 0 ); + /* Invert and normalize gains, and ensure that maximum invGains_Q16 is within range of a 16 bit int */ + invGains_Q16[ i ] = silk_DIV32_varQ( min_gain_Q16, psEncCtrl->Gains_Q16[ i ], 16 - 2 ); + + /* Ensure Wght_Q15 a minimum value 1 */ + invGains_Q16[ i ] = silk_max( invGains_Q16[ i ], 363 ); + + /* Square the inverted gains */ + silk_assert( invGains_Q16[ i ] == silk_SAT16( invGains_Q16[ i ] ) ); + tmp = silk_SMULWB( invGains_Q16[ i ], invGains_Q16[ i ] ); + Wght_Q15[ i ] = silk_RSHIFT( tmp, 1 ); + + /* Invert the inverted and normalized gains */ + local_gains[ i ] = silk_DIV32( ( (opus_int32)1 << 16 ), invGains_Q16[ i ] ); + } + + ALLOC( LPC_in_pre, + psEnc->sCmn.nb_subfr * psEnc->sCmn.predictLPCOrder + + psEnc->sCmn.frame_length, opus_int16 ); + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + VARDECL( opus_int32, WLTP ); + + /**********/ + /* VOICED */ + /**********/ + silk_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 ); + + ALLOC( WLTP, psEnc->sCmn.nb_subfr * LTP_ORDER * LTP_ORDER, opus_int32 ); + + /* LTP analysis */ + silk_find_LTP_FIX( psEncCtrl->LTPCoef_Q14, WLTP, 
&psEncCtrl->LTPredCodGain_Q7, + res_pitch, psEncCtrl->pitchL, Wght_Q15, psEnc->sCmn.subfr_length, + psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length, LTP_corrs_rshift, psEnc->sCmn.arch ); + + /* Quantize LTP gain parameters */ + silk_quant_LTP_gains( psEncCtrl->LTPCoef_Q14, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex, + &psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr, + psEnc->sCmn.arch); + + /* Control LTP scaling */ + silk_LTP_scale_ctrl_FIX( psEnc, psEncCtrl, condCoding ); + + /* Create LTP residual */ + silk_LTP_analysis_filter_FIX( LPC_in_pre, x - psEnc->sCmn.predictLPCOrder, psEncCtrl->LTPCoef_Q14, + psEncCtrl->pitchL, invGains_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder ); + + } else { + /************/ + /* UNVOICED */ + /************/ + /* Create signal with prepended subframes, scaled by inverse gains */ + x_ptr = x - psEnc->sCmn.predictLPCOrder; + x_pre_ptr = LPC_in_pre; + for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { + silk_scale_copy_vector16( x_pre_ptr, x_ptr, invGains_Q16[ i ], + psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder ); + x_pre_ptr += psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder; + x_ptr += psEnc->sCmn.subfr_length; + } + + silk_memset( psEncCtrl->LTPCoef_Q14, 0, psEnc->sCmn.nb_subfr * LTP_ORDER * sizeof( opus_int16 ) ); + psEncCtrl->LTPredCodGain_Q7 = 0; + psEnc->sCmn.sum_log_gain_Q7 = 0; + } + + /* Limit on total predictive coding gain */ + if( psEnc->sCmn.first_frame_after_reset ) { + minInvGain_Q30 = SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN_AFTER_RESET, 30 ); + } else { + minInvGain_Q30 = silk_log2lin( silk_SMLAWB( 16 << 7, (opus_int32)psEncCtrl->LTPredCodGain_Q7, SILK_FIX_CONST( 1.0 / 3, 16 ) ) ); /* Q16 */ + minInvGain_Q30 = silk_DIV32_varQ( minInvGain_Q30, + silk_SMULWW( SILK_FIX_CONST( MAX_PREDICTION_POWER_GAIN, 0 ), + silk_SMLAWB( SILK_FIX_CONST( 0.25, 18 ), SILK_FIX_CONST( 0.75, 18 ), psEncCtrl->coding_quality_Q14 ) ), 14 ); + } + + /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */ + silk_find_LPC_FIX( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain_Q30 ); + + /* Quantize LSFs */ + silk_process_NLSFs( &psEnc->sCmn, psEncCtrl->PredCoef_Q12, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 ); + + /* Calculate residual energy using quantized LPC coefficients */ + silk_residual_energy_FIX( psEncCtrl->ResNrg, psEncCtrl->ResNrgQ, LPC_in_pre, psEncCtrl->PredCoef_Q12, local_gains, + psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder, psEnc->sCmn.arch ); + + /* Copy to prediction struct for use in next frame for interpolation */ + silk_memcpy( psEnc->sCmn.prev_NLSFq_Q15, NLSF_Q15, sizeof( psEnc->sCmn.prev_NLSFq_Q15 ) ); + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/fixed/k2a_FIX.c b/thirdparty/opus/silk/fixed/k2a_FIX.c new file mode 100644 index 000000000000..5fee599bcb74 --- /dev/null +++ b/thirdparty/opus/silk/fixed/k2a_FIX.c @@ -0,0 +1,53 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Step up function, converts reflection coefficients to prediction coefficients */ +void silk_k2a( + opus_int32 *A_Q24, /* O Prediction coefficients [order] Q24 */ + const opus_int16 *rc_Q15, /* I Reflection coefficients [order] Q15 */ + const opus_int32 order /* I Prediction order */ +) +{ + opus_int k, n; + opus_int32 Atmp[ SILK_MAX_ORDER_LPC ]; + + for( k = 0; k < order; k++ ) { + for( n = 0; n < k; n++ ) { + Atmp[ n ] = A_Q24[ n ]; + } + for( n = 0; n < k; n++ ) { + A_Q24[ n ] = silk_SMLAWB( A_Q24[ n ], silk_LSHIFT( Atmp[ k - n - 1 ], 1 ), rc_Q15[ k ] ); + } + A_Q24[ k ] = -silk_LSHIFT( (opus_int32)rc_Q15[ k ], 9 ); + } +} diff --git a/thirdparty/opus/silk/fixed/k2a_Q16_FIX.c b/thirdparty/opus/silk/fixed/k2a_Q16_FIX.c new file mode 100644 index 000000000000..3b039875446a --- /dev/null +++ b/thirdparty/opus/silk/fixed/k2a_Q16_FIX.c @@ -0,0 +1,53 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Step up function, converts reflection coefficients to prediction coefficients */ +void silk_k2a_Q16( + opus_int32 *A_Q24, /* O Prediction coefficients [order] Q24 */ + const opus_int32 *rc_Q16, /* I Reflection coefficients [order] Q16 */ + const opus_int32 order /* I Prediction order */ +) +{ + opus_int k, n; + opus_int32 Atmp[ SILK_MAX_ORDER_LPC ]; + + for( k = 0; k < order; k++ ) { + for( n = 0; n < k; n++ ) { + Atmp[ n ] = A_Q24[ n ]; + } + for( n = 0; n < k; n++ ) { + A_Q24[ n ] = silk_SMLAWW( A_Q24[ n ], Atmp[ k - n - 1 ], rc_Q16[ k ] ); + } + A_Q24[ k ] = -silk_LSHIFT( rc_Q16[ k ], 8 ); + } +} diff --git a/thirdparty/opus/silk/fixed/main_FIX.h b/thirdparty/opus/silk/fixed/main_FIX.h new file mode 100644 index 000000000000..375b5eb32ee6 --- /dev/null +++ b/thirdparty/opus/silk/fixed/main_FIX.h @@ -0,0 +1,272 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifndef SILK_MAIN_FIX_H +#define SILK_MAIN_FIX_H + +#include "SigProc_FIX.h" +#include "structs_FIX.h" +#include "control.h" +#include "main.h" +#include "PLC.h" +#include "debug.h" +#include "entenc.h" + +#ifndef FORCE_CPP_BUILD +#ifdef __cplusplus +extern "C" +{ +#endif +#endif + +#define silk_encoder_state_Fxx silk_encoder_state_FIX +#define silk_encode_do_VAD_Fxx silk_encode_do_VAD_FIX +#define silk_encode_frame_Fxx silk_encode_frame_FIX + +/*********************/ +/* Encoder Functions */ +/*********************/ + +/* High-pass filter with cutoff frequency adaptation based on pitch lag statistics */ +void silk_HP_variable_cutoff( + silk_encoder_state_Fxx state_Fxx[] /* I/O Encoder states */ +); + +/* Encoder main function */ +void silk_encode_do_VAD_FIX( + silk_encoder_state_FIX *psEnc /* I/O Pointer to Silk FIX encoder state */ +); + +/* Encoder main function */ +opus_int silk_encode_frame_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Pointer to Silk FIX encoder state */ + opus_int32 *pnBytesOut, /* O Pointer to number of payload bytes; */ + ec_enc *psRangeEnc, /* I/O compressor data structure */ + opus_int condCoding, /* I The type of conditional coding to use */ + opus_int maxBits, /* I If > 0: maximum number of output bits */ + opus_int useCBR /* I Flag to force constant-bitrate operation */ +); + +/* Initializes the Silk encoder state */ +opus_int silk_init_encoder( + silk_encoder_state_Fxx *psEnc, /* I/O Pointer to Silk FIX encoder state */ + int arch /* I Run-time architecture */ +); + +/* Control the Silk encoder */ +opus_int silk_control_encoder( + silk_encoder_state_Fxx *psEnc, /* I/O Pointer to Silk encoder state */ + silk_EncControlStruct *encControl, /* I Control structure */ + const opus_int32 TargetRate_bps, /* I Target max bitrate (bps) */ + const opus_int allow_bw_switch, /* I Flag to allow switching audio bandwidth */ + const opus_int channelNb, /* I Channel number */ + const opus_int force_fs_kHz +); + +/****************/ +/* Prefiltering */ +/****************/ +void silk_prefilter_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Encoder state */ + const silk_encoder_control_FIX *psEncCtrl, /* I Encoder control */ + opus_int32 xw_Q10[], /* O Weighted signal */ + const opus_int16 x[] /* I Speech signal */ +); + +void silk_warped_LPC_analysis_filter_FIX_c( + opus_int32 state[], /* I/O State [order + 1] */ + opus_int32 res_Q2[], /* O Residual signal [length] */ + const opus_int16 coef_Q13[], /* I Coefficients [order] */ + const opus_int16 input[], /* I Input signal [length] */ + const opus_int16 lambda_Q16, /* I Warping factor */ + const opus_int length, /* I Length of input signal */ + const opus_int order /* I Filter order (even) */ +); + + +/**************************/ +/* Noise shaping analysis */ +/**************************/ +/* Compute noise shaping coefficients and initial gain values */ +void silk_noise_shape_analysis_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Encoder state FIX */ + silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control FIX */ + const opus_int16 *pitch_res, /* I LPC residual from pitch analysis */ + const opus_int16 *x, /* I Input signal [ frame_length + la_shape ] */ + int arch /* I Run-time architecture */ +); + +/* Autocorrelations for a warped frequency axis */ +void silk_warped_autocorrelation_FIX( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate 
*/ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +); + +/* Calculation of LTP state scaling */ +void silk_LTP_scale_ctrl_FIX( + silk_encoder_state_FIX *psEnc, /* I/O encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */ + opus_int condCoding /* I The type of conditional coding to use */ +); + +/**********************************************/ +/* Prediction Analysis */ +/**********************************************/ +/* Find pitch lags */ +void silk_find_pitch_lags_FIX( + silk_encoder_state_FIX *psEnc, /* I/O encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */ + opus_int16 res[], /* O residual */ + const opus_int16 x[], /* I Speech signal */ + int arch /* I Run-time architecture */ +); + +/* Find LPC and LTP coefficients */ +void silk_find_pred_coefs_FIX( + silk_encoder_state_FIX *psEnc, /* I/O encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O encoder control */ + const opus_int16 res_pitch[], /* I Residual from pitch analysis */ + const opus_int16 x[], /* I Speech signal */ + opus_int condCoding /* I The type of conditional coding to use */ +); + +/* LPC analysis */ +void silk_find_LPC_FIX( + silk_encoder_state *psEncC, /* I/O Encoder state */ + opus_int16 NLSF_Q15[], /* O NLSFs */ + const opus_int16 x[], /* I Input signal */ + const opus_int32 minInvGain_Q30 /* I Inverse of max prediction gain */ +); + +/* LTP analysis */ +void silk_find_LTP_FIX( + opus_int16 b_Q14[ MAX_NB_SUBFR * LTP_ORDER ], /* O LTP coefs */ + opus_int32 WLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* O Weight for LTP quantization */ + opus_int *LTPredCodGain_Q7, /* O LTP coding gain */ + const opus_int16 r_lpc[], /* I residual signal after LPC signal + state for first 10 ms */ + const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ + const opus_int32 Wght_Q15[ MAX_NB_SUBFR ], /* I weights */ + const opus_int subfr_length, /* I subframe length */ + const opus_int nb_subfr, /* I number of subframes */ + const opus_int mem_offset, /* I number of samples in LTP memory */ + opus_int corr_rshifts[ MAX_NB_SUBFR ], /* O right shifts applied to correlations */ + int arch /* I Run-time architecture */ +); + +void silk_LTP_analysis_filter_FIX( + opus_int16 *LTP_res, /* O LTP residual signal of length MAX_NB_SUBFR * ( pre_length + subfr_length ) */ + const opus_int16 *x, /* I Pointer to input signal with at least max( pitchL ) preceding samples */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],/* I LTP_ORDER LTP coefficients for each MAX_NB_SUBFR subframe */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag, one for each subframe */ + const opus_int32 invGains_Q16[ MAX_NB_SUBFR ], /* I Inverse quantization gains, one for each subframe */ + const opus_int subfr_length, /* I Length of each subframe */ + const opus_int nb_subfr, /* I Number of subframes */ + const opus_int pre_length /* I Length of the preceding samples starting at &x[0] for each subframe */ +); + +/* Calculates residual energies of input subframes where all subframes have LPC_order */ +/* of preceding samples */ +void silk_residual_energy_FIX( + opus_int32 nrgs[ MAX_NB_SUBFR ], /* O Residual energy per subframe */ + opus_int nrgsQ[ MAX_NB_SUBFR ], /* O Q value per subframe */ + const opus_int16 x[], /* I Input signal */ + opus_int16 a_Q12[ 2 ][ MAX_LPC_ORDER ], /* I AR coefs for each frame half */ + const opus_int32 gains[ MAX_NB_SUBFR ], /* I Quantization 
gains */ + const opus_int subfr_length, /* I Subframe length */ + const opus_int nb_subfr, /* I Number of subframes */ + const opus_int LPC_order, /* I LPC order */ + int arch /* I Run-time architecture */ +); + +/* Residual energy: nrg = wxx - 2 * wXx * c + c' * wXX * c */ +opus_int32 silk_residual_energy16_covar_FIX( + const opus_int16 *c, /* I Prediction vector */ + const opus_int32 *wXX, /* I Correlation matrix */ + const opus_int32 *wXx, /* I Correlation vector */ + opus_int32 wxx, /* I Signal energy */ + opus_int D, /* I Dimension */ + opus_int cQ /* I Q value for c vector 0 - 15 */ +); + +/* Processing of gains */ +void silk_process_gains_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control */ + opus_int condCoding /* I The type of conditional coding to use */ +); + +/******************/ +/* Linear Algebra */ +/******************/ +/* Calculates correlation matrix X'*X */ +void silk_corrMatrix_FIX( + const opus_int16 *x, /* I x vector [L + order - 1] used to form data matrix X */ + const opus_int L, /* I Length of vectors */ + const opus_int order, /* I Max lag for correlation */ + const opus_int head_room, /* I Desired headroom */ + opus_int32 *XX, /* O Pointer to X'*X correlation matrix [ order x order ] */ + opus_int *rshifts, /* I/O Right shifts of correlations */ + int arch /* I Run-time architecture */ +); + +/* Calculates correlation vector X'*t */ +void silk_corrVector_FIX( + const opus_int16 *x, /* I x vector [L + order - 1] used to form data matrix X */ + const opus_int16 *t, /* I Target vector [L] */ + const opus_int L, /* I Length of vectors */ + const opus_int order, /* I Max lag for correlation */ + opus_int32 *Xt, /* O Pointer to X'*t correlation vector [order] */ + const opus_int rshifts, /* I Right shifts of correlations */ + int arch /* I Run-time architecture */ +); + +/* Add noise to matrix diagonal */ +void silk_regularize_correlations_FIX( + opus_int32 *XX, /* I/O Correlation matrices */ + opus_int32 *xx, /* I/O Correlation values */ + opus_int32 noise, /* I Noise to add */ + opus_int D /* I Dimension of XX */ +); + +/* Solves Ax = b, assuming A is symmetric */ +void silk_solve_LDL_FIX( + opus_int32 *A, /* I Pointer to symetric square matrix A */ + opus_int M, /* I Size of matrix */ + const opus_int32 *b, /* I Pointer to b vector */ + opus_int32 *x_Q16 /* O Pointer to x solution vector */ +); + +#ifndef FORCE_CPP_BUILD +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* FORCE_CPP_BUILD */ +#endif /* SILK_MAIN_FIX_H */ diff --git a/thirdparty/opus/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h b/thirdparty/opus/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h new file mode 100644 index 000000000000..c30481e43751 --- /dev/null +++ b/thirdparty/opus/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h @@ -0,0 +1,336 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. 
+- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + + +/**************************************************************/ +/* Compute noise shaping coefficients and initial gain values */ +/**************************************************************/ +#define OVERRIDE_silk_noise_shape_analysis_FIX + +void silk_noise_shape_analysis_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Encoder state FIX */ + silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control FIX */ + const opus_int16 *pitch_res, /* I LPC residual from pitch analysis */ + const opus_int16 *x, /* I Input signal [ frame_length + la_shape ] */ + int arch /* I Run-time architecture */ +) +{ + silk_shape_state_FIX *psShapeSt = &psEnc->sShape; + opus_int k, i, nSamples, Qnrg, b_Q14, warping_Q16, scale = 0; + opus_int32 SNR_adj_dB_Q7, HarmBoost_Q16, HarmShapeGain_Q16, Tilt_Q16, tmp32; + opus_int32 nrg, pre_nrg_Q30, log_energy_Q7, log_energy_prev_Q7, energy_variation_Q7; + opus_int32 delta_Q16, BWExp1_Q16, BWExp2_Q16, gain_mult_Q16, gain_add_Q16, strength_Q16, b_Q8; + opus_int32 auto_corr[ MAX_SHAPE_LPC_ORDER + 1 ]; + opus_int32 refl_coef_Q16[ MAX_SHAPE_LPC_ORDER ]; + opus_int32 AR1_Q24[ MAX_SHAPE_LPC_ORDER ]; + opus_int32 AR2_Q24[ MAX_SHAPE_LPC_ORDER ]; + VARDECL( opus_int16, x_windowed ); + const opus_int16 *x_ptr, *pitch_res_ptr; + SAVE_STACK; + + /* Point to start of first LPC analysis block */ + x_ptr = x - psEnc->sCmn.la_shape; + + /****************/ + /* GAIN CONTROL */ + /****************/ + SNR_adj_dB_Q7 = psEnc->sCmn.SNR_dB_Q7; + + /* Input quality is the average of the quality in the lowest two VAD bands */ + psEncCtrl->input_quality_Q14 = ( opus_int )silk_RSHIFT( (opus_int32)psEnc->sCmn.input_quality_bands_Q15[ 0 ] + + psEnc->sCmn.input_quality_bands_Q15[ 1 ], 2 ); + + /* Coding quality level, between 0.0_Q0 and 1.0_Q0, but in Q14 */ + psEncCtrl->coding_quality_Q14 = silk_RSHIFT( silk_sigm_Q15( silk_RSHIFT_ROUND( SNR_adj_dB_Q7 - + SILK_FIX_CONST( 20.0, 7 ), 4 ) ), 1 ); + + /* Reduce coding SNR during low speech activity */ + if( psEnc->sCmn.useCBR == 0 ) { + b_Q8 = SILK_FIX_CONST( 1.0, 8 ) - psEnc->sCmn.speech_activity_Q8; + b_Q8 = silk_SMULWB( silk_LSHIFT( b_Q8, 8 ), b_Q8 ); + SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, + silk_SMULBB( SILK_FIX_CONST( -BG_SNR_DECR_dB, 7 ) >> ( 4 + 1 ), b_Q8 ), /* Q11*/ + silk_SMULWB( SILK_FIX_CONST( 1.0, 14 ) + psEncCtrl->input_quality_Q14, psEncCtrl->coding_quality_Q14 ) ); /* Q12*/ + } + + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Reduce gains for periodic signals */ + 
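+        /* SNR_adj_dB_Q7 is raised by HARM_SNR_INCR_dB weighted with the LTP
+           correlation (LTPCorr_Q15): the more periodic the signal, the higher
+           the target coding SNR, which translates into smaller quantization
+           gains in the gain tweaking stage further down. */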
SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, SILK_FIX_CONST( HARM_SNR_INCR_dB, 8 ), psEnc->LTPCorr_Q15 ); + } else { + /* For unvoiced signals and low-quality input, adjust the quality slower than SNR_dB setting */ + SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, + silk_SMLAWB( SILK_FIX_CONST( 6.0, 9 ), -SILK_FIX_CONST( 0.4, 18 ), psEnc->sCmn.SNR_dB_Q7 ), + SILK_FIX_CONST( 1.0, 14 ) - psEncCtrl->input_quality_Q14 ); + } + + /*************************/ + /* SPARSENESS PROCESSING */ + /*************************/ + /* Set quantizer offset */ + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Initially set to 0; may be overruled in process_gains(..) */ + psEnc->sCmn.indices.quantOffsetType = 0; + psEncCtrl->sparseness_Q8 = 0; + } else { + /* Sparseness measure, based on relative fluctuations of energy per 2 milliseconds */ + nSamples = silk_LSHIFT( psEnc->sCmn.fs_kHz, 1 ); + energy_variation_Q7 = 0; + log_energy_prev_Q7 = 0; + pitch_res_ptr = pitch_res; + for( k = 0; k < silk_SMULBB( SUB_FRAME_LENGTH_MS, psEnc->sCmn.nb_subfr ) / 2; k++ ) { + silk_sum_sqr_shift( &nrg, &scale, pitch_res_ptr, nSamples ); + nrg += silk_RSHIFT( nSamples, scale ); /* Q(-scale)*/ + + log_energy_Q7 = silk_lin2log( nrg ); + if( k > 0 ) { + energy_variation_Q7 += silk_abs( log_energy_Q7 - log_energy_prev_Q7 ); + } + log_energy_prev_Q7 = log_energy_Q7; + pitch_res_ptr += nSamples; + } + + psEncCtrl->sparseness_Q8 = silk_RSHIFT( silk_sigm_Q15( silk_SMULWB( energy_variation_Q7 - + SILK_FIX_CONST( 5.0, 7 ), SILK_FIX_CONST( 0.1, 16 ) ) ), 7 ); + + /* Set quantization offset depending on sparseness measure */ + if( psEncCtrl->sparseness_Q8 > SILK_FIX_CONST( SPARSENESS_THRESHOLD_QNT_OFFSET, 8 ) ) { + psEnc->sCmn.indices.quantOffsetType = 0; + } else { + psEnc->sCmn.indices.quantOffsetType = 1; + } + + /* Increase coding SNR for sparse signals */ + SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, SILK_FIX_CONST( SPARSE_SNR_INCR_dB, 15 ), psEncCtrl->sparseness_Q8 - SILK_FIX_CONST( 0.5, 8 ) ); + } + + /*******************************/ + /* Control bandwidth expansion */ + /*******************************/ + /* More BWE for signals with high prediction gain */ + strength_Q16 = silk_SMULWB( psEncCtrl->predGain_Q16, SILK_FIX_CONST( FIND_PITCH_WHITE_NOISE_FRACTION, 16 ) ); + BWExp1_Q16 = BWExp2_Q16 = silk_DIV32_varQ( SILK_FIX_CONST( BANDWIDTH_EXPANSION, 16 ), + silk_SMLAWW( SILK_FIX_CONST( 1.0, 16 ), strength_Q16, strength_Q16 ), 16 ); + delta_Q16 = silk_SMULWB( SILK_FIX_CONST( 1.0, 16 ) - silk_SMULBB( 3, psEncCtrl->coding_quality_Q14 ), + SILK_FIX_CONST( LOW_RATE_BANDWIDTH_EXPANSION_DELTA, 16 ) ); + BWExp1_Q16 = silk_SUB32( BWExp1_Q16, delta_Q16 ); + BWExp2_Q16 = silk_ADD32( BWExp2_Q16, delta_Q16 ); + /* BWExp1 will be applied after BWExp2, so make it relative */ + BWExp1_Q16 = silk_DIV32_16( silk_LSHIFT( BWExp1_Q16, 14 ), silk_RSHIFT( BWExp2_Q16, 2 ) ); + + if( psEnc->sCmn.warping_Q16 > 0 ) { + /* Slightly more warping in analysis will move quantization noise up in frequency, where it's better masked */ + warping_Q16 = silk_SMLAWB( psEnc->sCmn.warping_Q16, (opus_int32)psEncCtrl->coding_quality_Q14, SILK_FIX_CONST( 0.01, 18 ) ); + } else { + warping_Q16 = 0; + } + + /********************************************/ + /* Compute noise shaping AR coefs and gains */ + /********************************************/ + ALLOC( x_windowed, psEnc->sCmn.shapeWinLength, opus_int16 ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + /* Apply window: sine slope followed by flat part followed by cosine slope */ + opus_int shift, slope_part, 
flat_part; + flat_part = psEnc->sCmn.fs_kHz * 3; + slope_part = silk_RSHIFT( psEnc->sCmn.shapeWinLength - flat_part, 1 ); + + silk_apply_sine_window( x_windowed, x_ptr, 1, slope_part ); + shift = slope_part; + silk_memcpy( x_windowed + shift, x_ptr + shift, flat_part * sizeof(opus_int16) ); + shift += flat_part; + silk_apply_sine_window( x_windowed + shift, x_ptr + shift, 2, slope_part ); + + /* Update pointer: next LPC analysis block */ + x_ptr += psEnc->sCmn.subfr_length; + + if( psEnc->sCmn.warping_Q16 > 0 ) { + /* Calculate warped auto correlation */ + silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder ); + } else { + /* Calculate regular auto correlation */ + silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, arch ); + } + + /* Add white noise, as a fraction of energy */ + auto_corr[0] = silk_ADD32( auto_corr[0], silk_max_32( silk_SMULWB( silk_RSHIFT( auto_corr[ 0 ], 4 ), + SILK_FIX_CONST( SHAPE_WHITE_NOISE_FRACTION, 20 ) ), 1 ) ); + + /* Calculate the reflection coefficients using schur */ + nrg = silk_schur64( refl_coef_Q16, auto_corr, psEnc->sCmn.shapingLPCOrder ); + silk_assert( nrg >= 0 ); + + /* Convert reflection coefficients to prediction coefficients */ + silk_k2a_Q16( AR2_Q24, refl_coef_Q16, psEnc->sCmn.shapingLPCOrder ); + + Qnrg = -scale; /* range: -12...30*/ + silk_assert( Qnrg >= -12 ); + silk_assert( Qnrg <= 30 ); + + /* Make sure that Qnrg is an even number */ + if( Qnrg & 1 ) { + Qnrg -= 1; + nrg >>= 1; + } + + tmp32 = silk_SQRT_APPROX( nrg ); + Qnrg >>= 1; /* range: -6...15*/ + + psEncCtrl->Gains_Q16[ k ] = (silk_LSHIFT32( silk_LIMIT( (tmp32), silk_RSHIFT32( silk_int32_MIN, (16 - Qnrg) ), \ + silk_RSHIFT32( silk_int32_MAX, (16 - Qnrg) ) ), (16 - Qnrg) )); + + if( psEnc->sCmn.warping_Q16 > 0 ) { + /* Adjust gain for warping */ + gain_mult_Q16 = warped_gain( AR2_Q24, warping_Q16, psEnc->sCmn.shapingLPCOrder ); + silk_assert( psEncCtrl->Gains_Q16[ k ] >= 0 ); + if ( silk_SMULWW( silk_RSHIFT_ROUND( psEncCtrl->Gains_Q16[ k ], 1 ), gain_mult_Q16 ) >= ( silk_int32_MAX >> 1 ) ) { + psEncCtrl->Gains_Q16[ k ] = silk_int32_MAX; + } else { + psEncCtrl->Gains_Q16[ k ] = silk_SMULWW( psEncCtrl->Gains_Q16[ k ], gain_mult_Q16 ); + } + } + + /* Bandwidth expansion for synthesis filter shaping */ + silk_bwexpander_32( AR2_Q24, psEnc->sCmn.shapingLPCOrder, BWExp2_Q16 ); + + /* Compute noise shaping filter coefficients */ + silk_memcpy( AR1_Q24, AR2_Q24, psEnc->sCmn.shapingLPCOrder * sizeof( opus_int32 ) ); + + /* Bandwidth expansion for analysis filter shaping */ + silk_assert( BWExp1_Q16 <= SILK_FIX_CONST( 1.0, 16 ) ); + silk_bwexpander_32( AR1_Q24, psEnc->sCmn.shapingLPCOrder, BWExp1_Q16 ); + + /* Ratio of prediction gains, in energy domain */ + pre_nrg_Q30 = silk_LPC_inverse_pred_gain_Q24( AR2_Q24, psEnc->sCmn.shapingLPCOrder ); + nrg = silk_LPC_inverse_pred_gain_Q24( AR1_Q24, psEnc->sCmn.shapingLPCOrder ); + + /*psEncCtrl->GainsPre[ k ] = 1.0f - 0.7f * ( 1.0f - pre_nrg / nrg ) = 0.3f + 0.7f * pre_nrg / nrg;*/ + pre_nrg_Q30 = silk_LSHIFT32( silk_SMULWB( pre_nrg_Q30, SILK_FIX_CONST( 0.7, 15 ) ), 1 ); + psEncCtrl->GainsPre_Q14[ k ] = ( opus_int ) SILK_FIX_CONST( 0.3, 14 ) + silk_DIV32_varQ( pre_nrg_Q30, nrg, 14 ); + + /* Convert to monic warped prediction coefficients and limit absolute values */ + limit_warped_coefs( AR2_Q24, AR1_Q24, warping_Q16, SILK_FIX_CONST( 3.999, 24 ), psEnc->sCmn.shapingLPCOrder ); + + /* Convert from Q24 to Q13 and store 
in int16 */ + for( i = 0; i < psEnc->sCmn.shapingLPCOrder; i++ ) { + psEncCtrl->AR1_Q13[ k * MAX_SHAPE_LPC_ORDER + i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( AR1_Q24[ i ], 11 ) ); + psEncCtrl->AR2_Q13[ k * MAX_SHAPE_LPC_ORDER + i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( AR2_Q24[ i ], 11 ) ); + } + } + + /*****************/ + /* Gain tweaking */ + /*****************/ + /* Increase gains during low speech activity and put lower limit on gains */ + gain_mult_Q16 = silk_log2lin( -silk_SMLAWB( -SILK_FIX_CONST( 16.0, 7 ), SNR_adj_dB_Q7, SILK_FIX_CONST( 0.16, 16 ) ) ); + gain_add_Q16 = silk_log2lin( silk_SMLAWB( SILK_FIX_CONST( 16.0, 7 ), SILK_FIX_CONST( MIN_QGAIN_DB, 7 ), SILK_FIX_CONST( 0.16, 16 ) ) ); + silk_assert( gain_mult_Q16 > 0 ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->Gains_Q16[ k ] = silk_SMULWW( psEncCtrl->Gains_Q16[ k ], gain_mult_Q16 ); + silk_assert( psEncCtrl->Gains_Q16[ k ] >= 0 ); + psEncCtrl->Gains_Q16[ k ] = silk_ADD_POS_SAT32( psEncCtrl->Gains_Q16[ k ], gain_add_Q16 ); + } + + gain_mult_Q16 = SILK_FIX_CONST( 1.0, 16 ) + silk_RSHIFT_ROUND( silk_MLA( SILK_FIX_CONST( INPUT_TILT, 26 ), + psEncCtrl->coding_quality_Q14, SILK_FIX_CONST( HIGH_RATE_INPUT_TILT, 12 ) ), 10 ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->GainsPre_Q14[ k ] = silk_SMULWB( gain_mult_Q16, psEncCtrl->GainsPre_Q14[ k ] ); + } + + /************************************************/ + /* Control low-frequency shaping and noise tilt */ + /************************************************/ + /* Less low frequency shaping for noisy inputs */ + strength_Q16 = silk_MUL( SILK_FIX_CONST( LOW_FREQ_SHAPING, 4 ), silk_SMLAWB( SILK_FIX_CONST( 1.0, 12 ), + SILK_FIX_CONST( LOW_QUALITY_LOW_FREQ_SHAPING_DECR, 13 ), psEnc->sCmn.input_quality_bands_Q15[ 0 ] - SILK_FIX_CONST( 1.0, 15 ) ) ); + strength_Q16 = silk_RSHIFT( silk_MUL( strength_Q16, psEnc->sCmn.speech_activity_Q8 ), 8 ); + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Reduce low frequencies quantization noise for periodic signals, depending on pitch lag */ + /*f = 400; freqz([1, -0.98 + 2e-4 * f], [1, -0.97 + 7e-4 * f], 2^12, Fs); axis([0, 1000, -10, 1])*/ + opus_int fs_kHz_inv = silk_DIV32_16( SILK_FIX_CONST( 0.2, 14 ), psEnc->sCmn.fs_kHz ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + b_Q14 = fs_kHz_inv + silk_DIV32_16( SILK_FIX_CONST( 3.0, 14 ), psEncCtrl->pitchL[ k ] ); + /* Pack two coefficients in one int32 */ + psEncCtrl->LF_shp_Q14[ k ] = silk_LSHIFT( SILK_FIX_CONST( 1.0, 14 ) - b_Q14 - silk_SMULWB( strength_Q16, b_Q14 ), 16 ); + psEncCtrl->LF_shp_Q14[ k ] |= (opus_uint16)( b_Q14 - SILK_FIX_CONST( 1.0, 14 ) ); + } + silk_assert( SILK_FIX_CONST( HARM_HP_NOISE_COEF, 24 ) < SILK_FIX_CONST( 0.5, 24 ) ); /* Guarantees that second argument to SMULWB() is within range of an opus_int16*/ + Tilt_Q16 = - SILK_FIX_CONST( HP_NOISE_COEF, 16 ) - + silk_SMULWB( SILK_FIX_CONST( 1.0, 16 ) - SILK_FIX_CONST( HP_NOISE_COEF, 16 ), + silk_SMULWB( SILK_FIX_CONST( HARM_HP_NOISE_COEF, 24 ), psEnc->sCmn.speech_activity_Q8 ) ); + } else { + b_Q14 = silk_DIV32_16( 21299, psEnc->sCmn.fs_kHz ); /* 1.3_Q0 = 21299_Q14*/ + /* Pack two coefficients in one int32 */ + psEncCtrl->LF_shp_Q14[ 0 ] = silk_LSHIFT( SILK_FIX_CONST( 1.0, 14 ) - b_Q14 - + silk_SMULWB( strength_Q16, silk_SMULWB( SILK_FIX_CONST( 0.6, 16 ), b_Q14 ) ), 16 ); + psEncCtrl->LF_shp_Q14[ 0 ] |= (opus_uint16)( b_Q14 - SILK_FIX_CONST( 1.0, 14 ) ); + for( k = 1; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->LF_shp_Q14[ k ] = psEncCtrl->LF_shp_Q14[ 0 ]; + } + Tilt_Q16 
= -SILK_FIX_CONST( HP_NOISE_COEF, 16 ); + } + + /****************************/ + /* HARMONIC SHAPING CONTROL */ + /****************************/ + /* Control boosting of harmonic frequencies */ + HarmBoost_Q16 = silk_SMULWB( silk_SMULWB( SILK_FIX_CONST( 1.0, 17 ) - silk_LSHIFT( psEncCtrl->coding_quality_Q14, 3 ), + psEnc->LTPCorr_Q15 ), SILK_FIX_CONST( LOW_RATE_HARMONIC_BOOST, 16 ) ); + + /* More harmonic boost for noisy input signals */ + HarmBoost_Q16 = silk_SMLAWB( HarmBoost_Q16, + SILK_FIX_CONST( 1.0, 16 ) - silk_LSHIFT( psEncCtrl->input_quality_Q14, 2 ), SILK_FIX_CONST( LOW_INPUT_QUALITY_HARMONIC_BOOST, 16 ) ); + + if( USE_HARM_SHAPING && psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* More harmonic noise shaping for high bitrates or noisy input */ + HarmShapeGain_Q16 = silk_SMLAWB( SILK_FIX_CONST( HARMONIC_SHAPING, 16 ), + SILK_FIX_CONST( 1.0, 16 ) - silk_SMULWB( SILK_FIX_CONST( 1.0, 18 ) - silk_LSHIFT( psEncCtrl->coding_quality_Q14, 4 ), + psEncCtrl->input_quality_Q14 ), SILK_FIX_CONST( HIGH_RATE_OR_LOW_QUALITY_HARMONIC_SHAPING, 16 ) ); + + /* Less harmonic noise shaping for less periodic signals */ + HarmShapeGain_Q16 = silk_SMULWB( silk_LSHIFT( HarmShapeGain_Q16, 1 ), + silk_SQRT_APPROX( silk_LSHIFT( psEnc->LTPCorr_Q15, 15 ) ) ); + } else { + HarmShapeGain_Q16 = 0; + } + + /*************************/ + /* Smooth over subframes */ + /*************************/ + for( k = 0; k < MAX_NB_SUBFR; k++ ) { + psShapeSt->HarmBoost_smth_Q16 = + silk_SMLAWB( psShapeSt->HarmBoost_smth_Q16, HarmBoost_Q16 - psShapeSt->HarmBoost_smth_Q16, SILK_FIX_CONST( SUBFR_SMTH_COEF, 16 ) ); + psShapeSt->HarmShapeGain_smth_Q16 = + silk_SMLAWB( psShapeSt->HarmShapeGain_smth_Q16, HarmShapeGain_Q16 - psShapeSt->HarmShapeGain_smth_Q16, SILK_FIX_CONST( SUBFR_SMTH_COEF, 16 ) ); + psShapeSt->Tilt_smth_Q16 = + silk_SMLAWB( psShapeSt->Tilt_smth_Q16, Tilt_Q16 - psShapeSt->Tilt_smth_Q16, SILK_FIX_CONST( SUBFR_SMTH_COEF, 16 ) ); + + psEncCtrl->HarmBoost_Q14[ k ] = ( opus_int )silk_RSHIFT_ROUND( psShapeSt->HarmBoost_smth_Q16, 2 ); + psEncCtrl->HarmShapeGain_Q14[ k ] = ( opus_int )silk_RSHIFT_ROUND( psShapeSt->HarmShapeGain_smth_Q16, 2 ); + psEncCtrl->Tilt_Q14[ k ] = ( opus_int )silk_RSHIFT_ROUND( psShapeSt->Tilt_smth_Q16, 2 ); + } + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h b/thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h new file mode 100644 index 000000000000..21b256885f34 --- /dev/null +++ b/thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h @@ -0,0 +1,184 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ +#ifndef __PREFILTER_FIX_MIPSR1_H__ +#define __PREFILTER_FIX_MIPSR1_H__ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "stack_alloc.h" +#include "tuning_parameters.h" + +#define OVERRIDE_silk_warped_LPC_analysis_filter_FIX +void silk_warped_LPC_analysis_filter_FIX( + opus_int32 state[], /* I/O State [order + 1] */ + opus_int32 res_Q2[], /* O Residual signal [length] */ + const opus_int16 coef_Q13[], /* I Coefficients [order] */ + const opus_int16 input[], /* I Input signal [length] */ + const opus_int16 lambda_Q16, /* I Warping factor */ + const opus_int length, /* I Length of input signal */ + const opus_int order, /* I Filter order (even) */ + int arch +) +{ + opus_int n, i; + opus_int32 acc_Q11, acc_Q22, tmp1, tmp2, tmp3, tmp4; + opus_int32 state_cur, state_next; + + (void)arch; + + /* Order must be even */ + /* Length must be even */ + + silk_assert( ( order & 1 ) == 0 ); + silk_assert( ( length & 1 ) == 0 ); + + for( n = 0; n < length; n+=2 ) { + /* Output of lowpass section */ + tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 ); + state_cur = silk_LSHIFT( input[ n ], 14 ); + /* Output of allpass section */ + tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 ); + state_next = tmp2; + acc_Q11 = silk_RSHIFT( order, 1 ); + acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] ); + + + /* Output of lowpass section */ + tmp4 = silk_SMLAWB( state_cur, state_next, lambda_Q16 ); + state[ 0 ] = silk_LSHIFT( input[ n+1 ], 14 ); + /* Output of allpass section */ + tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 ); + state[ 1 ] = tmp4; + acc_Q22 = silk_RSHIFT( order, 1 ); + acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ 0 ] ); + + /* Loop over allpass sections */ + for( i = 2; i < order; i += 2 ) { + /* Output of allpass section */ + tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 ); + state_cur = tmp1; + acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] ); + /* Output of allpass section */ + tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 ); + state_next = tmp2; + acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] ); + + + /* Output of allpass section */ + tmp4 = silk_SMLAWB( state_cur, state_next - tmp3, lambda_Q16 ); + state[ i ] = tmp3; + acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ i - 1 ] ); + /* Output of allpass section */ + tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 ); + state[ i + 1 ] = tmp4; + acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ i ] ); + } + acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] ); + res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 ); + + state[ order ] = tmp3; + 
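/* Second sample of the pair: add the final coefficient tap to acc_Q22 and write the n+1 residual */ +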
acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ order - 1 ] ); + res_Q2[ n+1 ] = silk_LSHIFT( (opus_int32)input[ n+1 ], 2 ) - silk_RSHIFT_ROUND( acc_Q22, 9 ); + } +} + + + +/* Prefilter for finding Quantizer input signal */ +#define OVERRIDE_silk_prefilt_FIX +static inline void silk_prefilt_FIX( + silk_prefilter_state_FIX *P, /* I/O state */ + opus_int32 st_res_Q12[], /* I short term residual signal */ + opus_int32 xw_Q3[], /* O prefiltered signal */ + opus_int32 HarmShapeFIRPacked_Q12, /* I Harmonic shaping coefficients */ + opus_int Tilt_Q14, /* I Tilt shaping coefficient */ + opus_int32 LF_shp_Q14, /* I Low-frequency shaping coefficients */ + opus_int lag, /* I Lag for harmonic shaping */ + opus_int length /* I Length of signals */ +) +{ + opus_int i, idx, LTP_shp_buf_idx; + opus_int32 n_LTP_Q12, n_Tilt_Q10, n_LF_Q10; + opus_int32 sLF_MA_shp_Q12, sLF_AR_shp_Q12; + opus_int16 *LTP_shp_buf; + + /* To speed things up, use temp variables instead of the struct */ + LTP_shp_buf = P->sLTP_shp; + LTP_shp_buf_idx = P->sLTP_shp_buf_idx; + sLF_AR_shp_Q12 = P->sLF_AR_shp_Q12; + sLF_MA_shp_Q12 = P->sLF_MA_shp_Q12; + + if( lag > 0 ) { + for( i = 0; i < length; i++ ) { + /* unrolled loop */ + silk_assert( HARM_SHAPE_FIR_TAPS == 3 ); + idx = lag + LTP_shp_buf_idx; + n_LTP_Q12 = silk_SMULBB( LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 - 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 ); + n_LTP_Q12 = silk_SMLABT( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 ) & LTP_MASK ], HarmShapeFIRPacked_Q12 ); + n_LTP_Q12 = silk_SMLABB( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 + 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 ); + + n_Tilt_Q10 = silk_SMULWB( sLF_AR_shp_Q12, Tilt_Q14 ); + n_LF_Q10 = silk_SMLAWB( silk_SMULWT( sLF_AR_shp_Q12, LF_shp_Q14 ), sLF_MA_shp_Q12, LF_shp_Q14 ); + + sLF_AR_shp_Q12 = silk_SUB32( st_res_Q12[ i ], silk_LSHIFT( n_Tilt_Q10, 2 ) ); + sLF_MA_shp_Q12 = silk_SUB32( sLF_AR_shp_Q12, silk_LSHIFT( n_LF_Q10, 2 ) ); + + LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK; + LTP_shp_buf[ LTP_shp_buf_idx ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 12 ) ); + + xw_Q3[i] = silk_RSHIFT_ROUND( silk_SUB32( sLF_MA_shp_Q12, n_LTP_Q12 ), 9 ); + } + } + else + { + for( i = 0; i < length; i++ ) { + + n_LTP_Q12 = 0; + + n_Tilt_Q10 = silk_SMULWB( sLF_AR_shp_Q12, Tilt_Q14 ); + n_LF_Q10 = silk_SMLAWB( silk_SMULWT( sLF_AR_shp_Q12, LF_shp_Q14 ), sLF_MA_shp_Q12, LF_shp_Q14 ); + + sLF_AR_shp_Q12 = silk_SUB32( st_res_Q12[ i ], silk_LSHIFT( n_Tilt_Q10, 2 ) ); + sLF_MA_shp_Q12 = silk_SUB32( sLF_AR_shp_Q12, silk_LSHIFT( n_LF_Q10, 2 ) ); + + LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK; + LTP_shp_buf[ LTP_shp_buf_idx ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 12 ) ); + + xw_Q3[i] = silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 9 ); + } + } + + /* Copy temp variables back to state */ + P->sLF_AR_shp_Q12 = sLF_AR_shp_Q12; + P->sLF_MA_shp_Q12 = sLF_MA_shp_Q12; + P->sLTP_shp_buf_idx = LTP_shp_buf_idx; +} + +#endif /* __PREFILTER_FIX_MIPSR1_H__ */ diff --git a/thirdparty/opus/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h b/thirdparty/opus/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h new file mode 100644 index 000000000000..e803ef0fceb0 --- /dev/null +++ b/thirdparty/opus/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h @@ -0,0 +1,165 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef __WARPED_AUTOCORRELATION_FIX_MIPSR1_H__ +#define __WARPED_AUTOCORRELATION_FIX_MIPSR1_H__ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" + +#undef QC +#define QC 10 + +#undef QS +#define QS 14 + +/* Autocorrelations for a warped frequency axis */ +#define OVERRIDE_silk_warped_autocorrelation_FIX +void silk_warped_autocorrelation_FIX( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate */ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +) +{ + opus_int n, i, lsh; + opus_int32 tmp1_QS=0, tmp2_QS=0, tmp3_QS=0, tmp4_QS=0, tmp5_QS=0, tmp6_QS=0, tmp7_QS=0, tmp8_QS=0, start_1=0, start_2=0, start_3=0; + opus_int32 state_QS[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; + opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; + opus_int64 temp64; + + opus_int32 val; + val = 2 * QS - QC; + + /* Order must be even */ + silk_assert( ( order & 1 ) == 0 ); + silk_assert( 2 * QS - QC >= 0 ); + + /* Loop over samples */ + for( n = 0; n < length; n=n+4 ) { + + tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS ); + start_1 = tmp1_QS; + tmp3_QS = silk_LSHIFT32( (opus_int32)input[ n+1], QS ); + start_2 = tmp3_QS; + tmp5_QS = silk_LSHIFT32( (opus_int32)input[ n+2], QS ); + start_3 = tmp5_QS; + tmp7_QS = silk_LSHIFT32( (opus_int32)input[ n+3], QS ); + + /* Loop over allpass sections */ + for( i = 0; i < order; i += 2 ) { + /* Output of allpass section */ + tmp2_QS = silk_SMLAWB( state_QS[ i ], state_QS[ i + 1 ] - tmp1_QS, warping_Q16 ); + corr_QC[ i ] = __builtin_mips_madd( corr_QC[ i ], tmp1_QS, start_1); + + tmp4_QS = silk_SMLAWB( tmp1_QS, tmp2_QS - tmp3_QS, warping_Q16 ); + corr_QC[ i ] = __builtin_mips_madd( corr_QC[ i ], tmp3_QS, start_2); + + tmp6_QS = silk_SMLAWB( tmp3_QS, tmp4_QS - tmp5_QS, warping_Q16 ); + corr_QC[ i ] = 
__builtin_mips_madd( corr_QC[ i ], tmp5_QS, start_3); + + tmp8_QS = silk_SMLAWB( tmp5_QS, tmp6_QS - tmp7_QS, warping_Q16 ); + state_QS[ i ] = tmp7_QS; + corr_QC[ i ] = __builtin_mips_madd( corr_QC[ i ], tmp7_QS, state_QS[0]); + + /* Output of allpass section */ + tmp1_QS = silk_SMLAWB( state_QS[ i + 1 ], state_QS[ i + 2 ] - tmp2_QS, warping_Q16 ); + corr_QC[ i+1 ] = __builtin_mips_madd( corr_QC[ i+1 ], tmp2_QS, start_1); + + tmp3_QS = silk_SMLAWB( tmp2_QS, tmp1_QS - tmp4_QS, warping_Q16 ); + corr_QC[ i+1 ] = __builtin_mips_madd( corr_QC[ i+1 ], tmp4_QS, start_2); + + tmp5_QS = silk_SMLAWB( tmp4_QS, tmp3_QS - tmp6_QS, warping_Q16 ); + corr_QC[ i+1 ] = __builtin_mips_madd( corr_QC[ i+1 ], tmp6_QS, start_3); + + tmp7_QS = silk_SMLAWB( tmp6_QS, tmp5_QS - tmp8_QS, warping_Q16 ); + state_QS[ i + 1 ] = tmp8_QS; + corr_QC[ i+1 ] = __builtin_mips_madd( corr_QC[ i+1 ], tmp8_QS, state_QS[ 0 ]); + + } + state_QS[ order ] = tmp7_QS; + + corr_QC[ order ] = __builtin_mips_madd( corr_QC[ order ], tmp1_QS, start_1); + corr_QC[ order ] = __builtin_mips_madd( corr_QC[ order ], tmp3_QS, start_2); + corr_QC[ order ] = __builtin_mips_madd( corr_QC[ order ], tmp5_QS, start_3); + corr_QC[ order ] = __builtin_mips_madd( corr_QC[ order ], tmp7_QS, state_QS[ 0 ]); + } + + for(;n< length; n++ ) { + + tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS ); + + /* Loop over allpass sections */ + for( i = 0; i < order; i += 2 ) { + + /* Output of allpass section */ + tmp2_QS = silk_SMLAWB( state_QS[ i ], state_QS[ i + 1 ] - tmp1_QS, warping_Q16 ); + state_QS[ i ] = tmp1_QS; + corr_QC[ i ] = __builtin_mips_madd( corr_QC[ i ], tmp1_QS, state_QS[ 0 ]); + + /* Output of allpass section */ + tmp1_QS = silk_SMLAWB( state_QS[ i + 1 ], state_QS[ i + 2 ] - tmp2_QS, warping_Q16 ); + state_QS[ i + 1 ] = tmp2_QS; + corr_QC[ i+1 ] = __builtin_mips_madd( corr_QC[ i+1 ], tmp2_QS, state_QS[ 0 ]); + } + state_QS[ order ] = tmp1_QS; + corr_QC[ order ] = __builtin_mips_madd( corr_QC[ order ], tmp1_QS, state_QS[ 0 ]); + } + + temp64 = corr_QC[ 0 ]; + temp64 = __builtin_mips_shilo(temp64, val); + + lsh = silk_CLZ64( temp64 ) - 35; + lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); + *scale = -( QC + lsh ); + silk_assert( *scale >= -30 && *scale <= 12 ); + if( lsh >= 0 ) { + for( i = 0; i < order + 1; i++ ) { + temp64 = corr_QC[ i ]; + //temp64 = __builtin_mips_shilo(temp64, val); + temp64 = (val >= 0) ? (temp64 >> val) : (temp64 << -val); + corr[ i ] = (opus_int32)silk_CHECK_FIT32( __builtin_mips_shilo( temp64, -lsh ) ); + } + } else { + for( i = 0; i < order + 1; i++ ) { + temp64 = corr_QC[ i ]; + //temp64 = __builtin_mips_shilo(temp64, val); + temp64 = (val >= 0) ? (temp64 >> val) : (temp64 << -val); + corr[ i ] = (opus_int32)silk_CHECK_FIT32( __builtin_mips_shilo( temp64, -lsh ) ); + } + } + + corr_QC[ 0 ] = __builtin_mips_shilo(corr_QC[ 0 ], val); + + silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/ +} +#endif /* __WARPED_AUTOCORRELATION_FIX_MIPSR1_H__ */ diff --git a/thirdparty/opus/silk/fixed/noise_shape_analysis_FIX.c b/thirdparty/opus/silk/fixed/noise_shape_analysis_FIX.c new file mode 100644 index 000000000000..22a89f75aec0 --- /dev/null +++ b/thirdparty/opus/silk/fixed/noise_shape_analysis_FIX.c @@ -0,0 +1,451 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "stack_alloc.h" +#include "tuning_parameters.h" + +/* Compute gain to make warped filter coefficients have a zero mean log frequency response on a */ +/* non-warped frequency scale. (So that it can be implemented with a minimum-phase monic filter.) */ +/* Note: A monic filter is one with the first coefficient equal to 1.0. In Silk we omit the first */ +/* coefficient in an array of coefficients, for monic filters. 
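The warped_gain() helper below evaluates the coefficient polynomial with a Horner-style recursion in the warping parameter and inverts the result via silk_INVERSE32_varQ() to return the gain in Q16.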
*/ +static OPUS_INLINE opus_int32 warped_gain( /* gain in Q16*/ + const opus_int32 *coefs_Q24, + opus_int lambda_Q16, + opus_int order +) { + opus_int i; + opus_int32 gain_Q24; + + lambda_Q16 = -lambda_Q16; + gain_Q24 = coefs_Q24[ order - 1 ]; + for( i = order - 2; i >= 0; i-- ) { + gain_Q24 = silk_SMLAWB( coefs_Q24[ i ], gain_Q24, lambda_Q16 ); + } + gain_Q24 = silk_SMLAWB( SILK_FIX_CONST( 1.0, 24 ), gain_Q24, -lambda_Q16 ); + return silk_INVERSE32_varQ( gain_Q24, 40 ); +} + +/* Convert warped filter coefficients to monic pseudo-warped coefficients and limit maximum */ +/* amplitude of monic warped coefficients by using bandwidth expansion on the true coefficients */ +static OPUS_INLINE void limit_warped_coefs( + opus_int32 *coefs_syn_Q24, + opus_int32 *coefs_ana_Q24, + opus_int lambda_Q16, + opus_int32 limit_Q24, + opus_int order +) { + opus_int i, iter, ind = 0; + opus_int32 tmp, maxabs_Q24, chirp_Q16, gain_syn_Q16, gain_ana_Q16; + opus_int32 nom_Q16, den_Q24; + + /* Convert to monic coefficients */ + lambda_Q16 = -lambda_Q16; + for( i = order - 1; i > 0; i-- ) { + coefs_syn_Q24[ i - 1 ] = silk_SMLAWB( coefs_syn_Q24[ i - 1 ], coefs_syn_Q24[ i ], lambda_Q16 ); + coefs_ana_Q24[ i - 1 ] = silk_SMLAWB( coefs_ana_Q24[ i - 1 ], coefs_ana_Q24[ i ], lambda_Q16 ); + } + lambda_Q16 = -lambda_Q16; + nom_Q16 = silk_SMLAWB( SILK_FIX_CONST( 1.0, 16 ), -(opus_int32)lambda_Q16, lambda_Q16 ); + den_Q24 = silk_SMLAWB( SILK_FIX_CONST( 1.0, 24 ), coefs_syn_Q24[ 0 ], lambda_Q16 ); + gain_syn_Q16 = silk_DIV32_varQ( nom_Q16, den_Q24, 24 ); + den_Q24 = silk_SMLAWB( SILK_FIX_CONST( 1.0, 24 ), coefs_ana_Q24[ 0 ], lambda_Q16 ); + gain_ana_Q16 = silk_DIV32_varQ( nom_Q16, den_Q24, 24 ); + for( i = 0; i < order; i++ ) { + coefs_syn_Q24[ i ] = silk_SMULWW( gain_syn_Q16, coefs_syn_Q24[ i ] ); + coefs_ana_Q24[ i ] = silk_SMULWW( gain_ana_Q16, coefs_ana_Q24[ i ] ); + } + + for( iter = 0; iter < 10; iter++ ) { + /* Find maximum absolute value */ + maxabs_Q24 = -1; + for( i = 0; i < order; i++ ) { + tmp = silk_max( silk_abs_int32( coefs_syn_Q24[ i ] ), silk_abs_int32( coefs_ana_Q24[ i ] ) ); + if( tmp > maxabs_Q24 ) { + maxabs_Q24 = tmp; + ind = i; + } + } + if( maxabs_Q24 <= limit_Q24 ) { + /* Coefficients are within range - done */ + return; + } + + /* Convert back to true warped coefficients */ + for( i = 1; i < order; i++ ) { + coefs_syn_Q24[ i - 1 ] = silk_SMLAWB( coefs_syn_Q24[ i - 1 ], coefs_syn_Q24[ i ], lambda_Q16 ); + coefs_ana_Q24[ i - 1 ] = silk_SMLAWB( coefs_ana_Q24[ i - 1 ], coefs_ana_Q24[ i ], lambda_Q16 ); + } + gain_syn_Q16 = silk_INVERSE32_varQ( gain_syn_Q16, 32 ); + gain_ana_Q16 = silk_INVERSE32_varQ( gain_ana_Q16, 32 ); + for( i = 0; i < order; i++ ) { + coefs_syn_Q24[ i ] = silk_SMULWW( gain_syn_Q16, coefs_syn_Q24[ i ] ); + coefs_ana_Q24[ i ] = silk_SMULWW( gain_ana_Q16, coefs_ana_Q24[ i ] ); + } + + /* Apply bandwidth expansion */ + chirp_Q16 = SILK_FIX_CONST( 0.99, 16 ) - silk_DIV32_varQ( + silk_SMULWB( maxabs_Q24 - limit_Q24, silk_SMLABB( SILK_FIX_CONST( 0.8, 10 ), SILK_FIX_CONST( 0.1, 10 ), iter ) ), + silk_MUL( maxabs_Q24, ind + 1 ), 22 ); + silk_bwexpander_32( coefs_syn_Q24, order, chirp_Q16 ); + silk_bwexpander_32( coefs_ana_Q24, order, chirp_Q16 ); + + /* Convert to monic warped coefficients */ + lambda_Q16 = -lambda_Q16; + for( i = order - 1; i > 0; i-- ) { + coefs_syn_Q24[ i - 1 ] = silk_SMLAWB( coefs_syn_Q24[ i - 1 ], coefs_syn_Q24[ i ], lambda_Q16 ); + coefs_ana_Q24[ i - 1 ] = silk_SMLAWB( coefs_ana_Q24[ i - 1 ], coefs_ana_Q24[ i ], lambda_Q16 ); + } + lambda_Q16 = -lambda_Q16; + nom_Q16 = 
silk_SMLAWB( SILK_FIX_CONST( 1.0, 16 ), -(opus_int32)lambda_Q16, lambda_Q16 ); + den_Q24 = silk_SMLAWB( SILK_FIX_CONST( 1.0, 24 ), coefs_syn_Q24[ 0 ], lambda_Q16 ); + gain_syn_Q16 = silk_DIV32_varQ( nom_Q16, den_Q24, 24 ); + den_Q24 = silk_SMLAWB( SILK_FIX_CONST( 1.0, 24 ), coefs_ana_Q24[ 0 ], lambda_Q16 ); + gain_ana_Q16 = silk_DIV32_varQ( nom_Q16, den_Q24, 24 ); + for( i = 0; i < order; i++ ) { + coefs_syn_Q24[ i ] = silk_SMULWW( gain_syn_Q16, coefs_syn_Q24[ i ] ); + coefs_ana_Q24[ i ] = silk_SMULWW( gain_ana_Q16, coefs_ana_Q24[ i ] ); + } + } + silk_assert( 0 ); +} + +#if defined(MIPSr1_ASM) +#include "mips/noise_shape_analysis_FIX_mipsr1.h" +#endif + +/**************************************************************/ +/* Compute noise shaping coefficients and initial gain values */ +/**************************************************************/ +#ifndef OVERRIDE_silk_noise_shape_analysis_FIX +void silk_noise_shape_analysis_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Encoder state FIX */ + silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control FIX */ + const opus_int16 *pitch_res, /* I LPC residual from pitch analysis */ + const opus_int16 *x, /* I Input signal [ frame_length + la_shape ] */ + int arch /* I Run-time architecture */ +) +{ + silk_shape_state_FIX *psShapeSt = &psEnc->sShape; + opus_int k, i, nSamples, Qnrg, b_Q14, warping_Q16, scale = 0; + opus_int32 SNR_adj_dB_Q7, HarmBoost_Q16, HarmShapeGain_Q16, Tilt_Q16, tmp32; + opus_int32 nrg, pre_nrg_Q30, log_energy_Q7, log_energy_prev_Q7, energy_variation_Q7; + opus_int32 delta_Q16, BWExp1_Q16, BWExp2_Q16, gain_mult_Q16, gain_add_Q16, strength_Q16, b_Q8; + opus_int32 auto_corr[ MAX_SHAPE_LPC_ORDER + 1 ]; + opus_int32 refl_coef_Q16[ MAX_SHAPE_LPC_ORDER ]; + opus_int32 AR1_Q24[ MAX_SHAPE_LPC_ORDER ]; + opus_int32 AR2_Q24[ MAX_SHAPE_LPC_ORDER ]; + VARDECL( opus_int16, x_windowed ); + const opus_int16 *x_ptr, *pitch_res_ptr; + SAVE_STACK; + + /* Point to start of first LPC analysis block */ + x_ptr = x - psEnc->sCmn.la_shape; + + /****************/ + /* GAIN CONTROL */ + /****************/ + SNR_adj_dB_Q7 = psEnc->sCmn.SNR_dB_Q7; + + /* Input quality is the average of the quality in the lowest two VAD bands */ + psEncCtrl->input_quality_Q14 = ( opus_int )silk_RSHIFT( (opus_int32)psEnc->sCmn.input_quality_bands_Q15[ 0 ] + + psEnc->sCmn.input_quality_bands_Q15[ 1 ], 2 ); + + /* Coding quality level, between 0.0_Q0 and 1.0_Q0, but in Q14 */ + psEncCtrl->coding_quality_Q14 = silk_RSHIFT( silk_sigm_Q15( silk_RSHIFT_ROUND( SNR_adj_dB_Q7 - + SILK_FIX_CONST( 20.0, 7 ), 4 ) ), 1 ); + + /* Reduce coding SNR during low speech activity */ + if( psEnc->sCmn.useCBR == 0 ) { + b_Q8 = SILK_FIX_CONST( 1.0, 8 ) - psEnc->sCmn.speech_activity_Q8; + b_Q8 = silk_SMULWB( silk_LSHIFT( b_Q8, 8 ), b_Q8 ); + SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, + silk_SMULBB( SILK_FIX_CONST( -BG_SNR_DECR_dB, 7 ) >> ( 4 + 1 ), b_Q8 ), /* Q11*/ + silk_SMULWB( SILK_FIX_CONST( 1.0, 14 ) + psEncCtrl->input_quality_Q14, psEncCtrl->coding_quality_Q14 ) ); /* Q12*/ + } + + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Reduce gains for periodic signals */ + SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, SILK_FIX_CONST( HARM_SNR_INCR_dB, 8 ), psEnc->LTPCorr_Q15 ); + } else { + /* For unvoiced signals and low-quality input, adjust the quality slower than SNR_dB setting */ + SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, + silk_SMLAWB( SILK_FIX_CONST( 6.0, 9 ), -SILK_FIX_CONST( 0.4, 18 ), psEnc->sCmn.SNR_dB_Q7 ), + SILK_FIX_CONST( 1.0, 14 ) - psEncCtrl->input_quality_Q14 
); + } + + /*************************/ + /* SPARSENESS PROCESSING */ + /*************************/ + /* Set quantizer offset */ + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Initially set to 0; may be overruled in process_gains(..) */ + psEnc->sCmn.indices.quantOffsetType = 0; + psEncCtrl->sparseness_Q8 = 0; + } else { + /* Sparseness measure, based on relative fluctuations of energy per 2 milliseconds */ + nSamples = silk_LSHIFT( psEnc->sCmn.fs_kHz, 1 ); + energy_variation_Q7 = 0; + log_energy_prev_Q7 = 0; + pitch_res_ptr = pitch_res; + for( k = 0; k < silk_SMULBB( SUB_FRAME_LENGTH_MS, psEnc->sCmn.nb_subfr ) / 2; k++ ) { + silk_sum_sqr_shift( &nrg, &scale, pitch_res_ptr, nSamples ); + nrg += silk_RSHIFT( nSamples, scale ); /* Q(-scale)*/ + + log_energy_Q7 = silk_lin2log( nrg ); + if( k > 0 ) { + energy_variation_Q7 += silk_abs( log_energy_Q7 - log_energy_prev_Q7 ); + } + log_energy_prev_Q7 = log_energy_Q7; + pitch_res_ptr += nSamples; + } + + psEncCtrl->sparseness_Q8 = silk_RSHIFT( silk_sigm_Q15( silk_SMULWB( energy_variation_Q7 - + SILK_FIX_CONST( 5.0, 7 ), SILK_FIX_CONST( 0.1, 16 ) ) ), 7 ); + + /* Set quantization offset depending on sparseness measure */ + if( psEncCtrl->sparseness_Q8 > SILK_FIX_CONST( SPARSENESS_THRESHOLD_QNT_OFFSET, 8 ) ) { + psEnc->sCmn.indices.quantOffsetType = 0; + } else { + psEnc->sCmn.indices.quantOffsetType = 1; + } + + /* Increase coding SNR for sparse signals */ + SNR_adj_dB_Q7 = silk_SMLAWB( SNR_adj_dB_Q7, SILK_FIX_CONST( SPARSE_SNR_INCR_dB, 15 ), psEncCtrl->sparseness_Q8 - SILK_FIX_CONST( 0.5, 8 ) ); + } + + /*******************************/ + /* Control bandwidth expansion */ + /*******************************/ + /* More BWE for signals with high prediction gain */ + strength_Q16 = silk_SMULWB( psEncCtrl->predGain_Q16, SILK_FIX_CONST( FIND_PITCH_WHITE_NOISE_FRACTION, 16 ) ); + BWExp1_Q16 = BWExp2_Q16 = silk_DIV32_varQ( SILK_FIX_CONST( BANDWIDTH_EXPANSION, 16 ), + silk_SMLAWW( SILK_FIX_CONST( 1.0, 16 ), strength_Q16, strength_Q16 ), 16 ); + delta_Q16 = silk_SMULWB( SILK_FIX_CONST( 1.0, 16 ) - silk_SMULBB( 3, psEncCtrl->coding_quality_Q14 ), + SILK_FIX_CONST( LOW_RATE_BANDWIDTH_EXPANSION_DELTA, 16 ) ); + BWExp1_Q16 = silk_SUB32( BWExp1_Q16, delta_Q16 ); + BWExp2_Q16 = silk_ADD32( BWExp2_Q16, delta_Q16 ); + /* BWExp1 will be applied after BWExp2, so make it relative */ + BWExp1_Q16 = silk_DIV32_16( silk_LSHIFT( BWExp1_Q16, 14 ), silk_RSHIFT( BWExp2_Q16, 2 ) ); + + if( psEnc->sCmn.warping_Q16 > 0 ) { + /* Slightly more warping in analysis will move quantization noise up in frequency, where it's better masked */ + warping_Q16 = silk_SMLAWB( psEnc->sCmn.warping_Q16, (opus_int32)psEncCtrl->coding_quality_Q14, SILK_FIX_CONST( 0.01, 18 ) ); + } else { + warping_Q16 = 0; + } + + /********************************************/ + /* Compute noise shaping AR coefs and gains */ + /********************************************/ + ALLOC( x_windowed, psEnc->sCmn.shapeWinLength, opus_int16 ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + /* Apply window: sine slope followed by flat part followed by cosine slope */ + opus_int shift, slope_part, flat_part; + flat_part = psEnc->sCmn.fs_kHz * 3; + slope_part = silk_RSHIFT( psEnc->sCmn.shapeWinLength - flat_part, 1 ); + + silk_apply_sine_window( x_windowed, x_ptr, 1, slope_part ); + shift = slope_part; + silk_memcpy( x_windowed + shift, x_ptr + shift, flat_part * sizeof(opus_int16) ); + shift += flat_part; + silk_apply_sine_window( x_windowed + shift, x_ptr + shift, 2, slope_part ); + + /* Update pointer: next 
LPC analysis block */ + x_ptr += psEnc->sCmn.subfr_length; + + if( psEnc->sCmn.warping_Q16 > 0 ) { + /* Calculate warped auto correlation */ + silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder ); + } else { + /* Calculate regular auto correlation */ + silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, arch ); + } + + /* Add white noise, as a fraction of energy */ + auto_corr[0] = silk_ADD32( auto_corr[0], silk_max_32( silk_SMULWB( silk_RSHIFT( auto_corr[ 0 ], 4 ), + SILK_FIX_CONST( SHAPE_WHITE_NOISE_FRACTION, 20 ) ), 1 ) ); + + /* Calculate the reflection coefficients using schur */ + nrg = silk_schur64( refl_coef_Q16, auto_corr, psEnc->sCmn.shapingLPCOrder ); + silk_assert( nrg >= 0 ); + + /* Convert reflection coefficients to prediction coefficients */ + silk_k2a_Q16( AR2_Q24, refl_coef_Q16, psEnc->sCmn.shapingLPCOrder ); + + Qnrg = -scale; /* range: -12...30*/ + silk_assert( Qnrg >= -12 ); + silk_assert( Qnrg <= 30 ); + + /* Make sure that Qnrg is an even number */ + if( Qnrg & 1 ) { + Qnrg -= 1; + nrg >>= 1; + } + + tmp32 = silk_SQRT_APPROX( nrg ); + Qnrg >>= 1; /* range: -6...15*/ + + psEncCtrl->Gains_Q16[ k ] = silk_LSHIFT_SAT32( tmp32, 16 - Qnrg ); + + if( psEnc->sCmn.warping_Q16 > 0 ) { + /* Adjust gain for warping */ + gain_mult_Q16 = warped_gain( AR2_Q24, warping_Q16, psEnc->sCmn.shapingLPCOrder ); + silk_assert( psEncCtrl->Gains_Q16[ k ] >= 0 ); + if ( silk_SMULWW( silk_RSHIFT_ROUND( psEncCtrl->Gains_Q16[ k ], 1 ), gain_mult_Q16 ) >= ( silk_int32_MAX >> 1 ) ) { + psEncCtrl->Gains_Q16[ k ] = silk_int32_MAX; + } else { + psEncCtrl->Gains_Q16[ k ] = silk_SMULWW( psEncCtrl->Gains_Q16[ k ], gain_mult_Q16 ); + } + } + + /* Bandwidth expansion for synthesis filter shaping */ + silk_bwexpander_32( AR2_Q24, psEnc->sCmn.shapingLPCOrder, BWExp2_Q16 ); + + /* Compute noise shaping filter coefficients */ + silk_memcpy( AR1_Q24, AR2_Q24, psEnc->sCmn.shapingLPCOrder * sizeof( opus_int32 ) ); + + /* Bandwidth expansion for analysis filter shaping */ + silk_assert( BWExp1_Q16 <= SILK_FIX_CONST( 1.0, 16 ) ); + silk_bwexpander_32( AR1_Q24, psEnc->sCmn.shapingLPCOrder, BWExp1_Q16 ); + + /* Ratio of prediction gains, in energy domain */ + pre_nrg_Q30 = silk_LPC_inverse_pred_gain_Q24( AR2_Q24, psEnc->sCmn.shapingLPCOrder ); + nrg = silk_LPC_inverse_pred_gain_Q24( AR1_Q24, psEnc->sCmn.shapingLPCOrder ); + + /*psEncCtrl->GainsPre[ k ] = 1.0f - 0.7f * ( 1.0f - pre_nrg / nrg ) = 0.3f + 0.7f * pre_nrg / nrg;*/ + pre_nrg_Q30 = silk_LSHIFT32( silk_SMULWB( pre_nrg_Q30, SILK_FIX_CONST( 0.7, 15 ) ), 1 ); + psEncCtrl->GainsPre_Q14[ k ] = ( opus_int ) SILK_FIX_CONST( 0.3, 14 ) + silk_DIV32_varQ( pre_nrg_Q30, nrg, 14 ); + + /* Convert to monic warped prediction coefficients and limit absolute values */ + limit_warped_coefs( AR2_Q24, AR1_Q24, warping_Q16, SILK_FIX_CONST( 3.999, 24 ), psEnc->sCmn.shapingLPCOrder ); + + /* Convert from Q24 to Q13 and store in int16 */ + for( i = 0; i < psEnc->sCmn.shapingLPCOrder; i++ ) { + psEncCtrl->AR1_Q13[ k * MAX_SHAPE_LPC_ORDER + i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( AR1_Q24[ i ], 11 ) ); + psEncCtrl->AR2_Q13[ k * MAX_SHAPE_LPC_ORDER + i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( AR2_Q24[ i ], 11 ) ); + } + } + + /*****************/ + /* Gain tweaking */ + /*****************/ + /* Increase gains during low speech activity and put lower limit on gains */ + gain_mult_Q16 = silk_log2lin( -silk_SMLAWB( -SILK_FIX_CONST( 16.0, 7 ), 
SNR_adj_dB_Q7, SILK_FIX_CONST( 0.16, 16 ) ) ); + gain_add_Q16 = silk_log2lin( silk_SMLAWB( SILK_FIX_CONST( 16.0, 7 ), SILK_FIX_CONST( MIN_QGAIN_DB, 7 ), SILK_FIX_CONST( 0.16, 16 ) ) ); + silk_assert( gain_mult_Q16 > 0 ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->Gains_Q16[ k ] = silk_SMULWW( psEncCtrl->Gains_Q16[ k ], gain_mult_Q16 ); + silk_assert( psEncCtrl->Gains_Q16[ k ] >= 0 ); + psEncCtrl->Gains_Q16[ k ] = silk_ADD_POS_SAT32( psEncCtrl->Gains_Q16[ k ], gain_add_Q16 ); + } + + gain_mult_Q16 = SILK_FIX_CONST( 1.0, 16 ) + silk_RSHIFT_ROUND( silk_MLA( SILK_FIX_CONST( INPUT_TILT, 26 ), + psEncCtrl->coding_quality_Q14, SILK_FIX_CONST( HIGH_RATE_INPUT_TILT, 12 ) ), 10 ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->GainsPre_Q14[ k ] = silk_SMULWB( gain_mult_Q16, psEncCtrl->GainsPre_Q14[ k ] ); + } + + /************************************************/ + /* Control low-frequency shaping and noise tilt */ + /************************************************/ + /* Less low frequency shaping for noisy inputs */ + strength_Q16 = silk_MUL( SILK_FIX_CONST( LOW_FREQ_SHAPING, 4 ), silk_SMLAWB( SILK_FIX_CONST( 1.0, 12 ), + SILK_FIX_CONST( LOW_QUALITY_LOW_FREQ_SHAPING_DECR, 13 ), psEnc->sCmn.input_quality_bands_Q15[ 0 ] - SILK_FIX_CONST( 1.0, 15 ) ) ); + strength_Q16 = silk_RSHIFT( silk_MUL( strength_Q16, psEnc->sCmn.speech_activity_Q8 ), 8 ); + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Reduce low frequencies quantization noise for periodic signals, depending on pitch lag */ + /*f = 400; freqz([1, -0.98 + 2e-4 * f], [1, -0.97 + 7e-4 * f], 2^12, Fs); axis([0, 1000, -10, 1])*/ + opus_int fs_kHz_inv = silk_DIV32_16( SILK_FIX_CONST( 0.2, 14 ), psEnc->sCmn.fs_kHz ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + b_Q14 = fs_kHz_inv + silk_DIV32_16( SILK_FIX_CONST( 3.0, 14 ), psEncCtrl->pitchL[ k ] ); + /* Pack two coefficients in one int32 */ + psEncCtrl->LF_shp_Q14[ k ] = silk_LSHIFT( SILK_FIX_CONST( 1.0, 14 ) - b_Q14 - silk_SMULWB( strength_Q16, b_Q14 ), 16 ); + psEncCtrl->LF_shp_Q14[ k ] |= (opus_uint16)( b_Q14 - SILK_FIX_CONST( 1.0, 14 ) ); + } + silk_assert( SILK_FIX_CONST( HARM_HP_NOISE_COEF, 24 ) < SILK_FIX_CONST( 0.5, 24 ) ); /* Guarantees that second argument to SMULWB() is within range of an opus_int16*/ + Tilt_Q16 = - SILK_FIX_CONST( HP_NOISE_COEF, 16 ) - + silk_SMULWB( SILK_FIX_CONST( 1.0, 16 ) - SILK_FIX_CONST( HP_NOISE_COEF, 16 ), + silk_SMULWB( SILK_FIX_CONST( HARM_HP_NOISE_COEF, 24 ), psEnc->sCmn.speech_activity_Q8 ) ); + } else { + b_Q14 = silk_DIV32_16( 21299, psEnc->sCmn.fs_kHz ); /* 1.3_Q0 = 21299_Q14*/ + /* Pack two coefficients in one int32 */ + psEncCtrl->LF_shp_Q14[ 0 ] = silk_LSHIFT( SILK_FIX_CONST( 1.0, 14 ) - b_Q14 - + silk_SMULWB( strength_Q16, silk_SMULWB( SILK_FIX_CONST( 0.6, 16 ), b_Q14 ) ), 16 ); + psEncCtrl->LF_shp_Q14[ 0 ] |= (opus_uint16)( b_Q14 - SILK_FIX_CONST( 1.0, 14 ) ); + for( k = 1; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->LF_shp_Q14[ k ] = psEncCtrl->LF_shp_Q14[ 0 ]; + } + Tilt_Q16 = -SILK_FIX_CONST( HP_NOISE_COEF, 16 ); + } + + /****************************/ + /* HARMONIC SHAPING CONTROL */ + /****************************/ + /* Control boosting of harmonic frequencies */ + HarmBoost_Q16 = silk_SMULWB( silk_SMULWB( SILK_FIX_CONST( 1.0, 17 ) - silk_LSHIFT( psEncCtrl->coding_quality_Q14, 3 ), + psEnc->LTPCorr_Q15 ), SILK_FIX_CONST( LOW_RATE_HARMONIC_BOOST, 16 ) ); + + /* More harmonic boost for noisy input signals */ + HarmBoost_Q16 = silk_SMLAWB( HarmBoost_Q16, + SILK_FIX_CONST( 1.0, 16 ) - silk_LSHIFT( 
psEncCtrl->input_quality_Q14, 2 ), SILK_FIX_CONST( LOW_INPUT_QUALITY_HARMONIC_BOOST, 16 ) ); + + if( USE_HARM_SHAPING && psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* More harmonic noise shaping for high bitrates or noisy input */ + HarmShapeGain_Q16 = silk_SMLAWB( SILK_FIX_CONST( HARMONIC_SHAPING, 16 ), + SILK_FIX_CONST( 1.0, 16 ) - silk_SMULWB( SILK_FIX_CONST( 1.0, 18 ) - silk_LSHIFT( psEncCtrl->coding_quality_Q14, 4 ), + psEncCtrl->input_quality_Q14 ), SILK_FIX_CONST( HIGH_RATE_OR_LOW_QUALITY_HARMONIC_SHAPING, 16 ) ); + + /* Less harmonic noise shaping for less periodic signals */ + HarmShapeGain_Q16 = silk_SMULWB( silk_LSHIFT( HarmShapeGain_Q16, 1 ), + silk_SQRT_APPROX( silk_LSHIFT( psEnc->LTPCorr_Q15, 15 ) ) ); + } else { + HarmShapeGain_Q16 = 0; + } + + /*************************/ + /* Smooth over subframes */ + /*************************/ + for( k = 0; k < MAX_NB_SUBFR; k++ ) { + psShapeSt->HarmBoost_smth_Q16 = + silk_SMLAWB( psShapeSt->HarmBoost_smth_Q16, HarmBoost_Q16 - psShapeSt->HarmBoost_smth_Q16, SILK_FIX_CONST( SUBFR_SMTH_COEF, 16 ) ); + psShapeSt->HarmShapeGain_smth_Q16 = + silk_SMLAWB( psShapeSt->HarmShapeGain_smth_Q16, HarmShapeGain_Q16 - psShapeSt->HarmShapeGain_smth_Q16, SILK_FIX_CONST( SUBFR_SMTH_COEF, 16 ) ); + psShapeSt->Tilt_smth_Q16 = + silk_SMLAWB( psShapeSt->Tilt_smth_Q16, Tilt_Q16 - psShapeSt->Tilt_smth_Q16, SILK_FIX_CONST( SUBFR_SMTH_COEF, 16 ) ); + + psEncCtrl->HarmBoost_Q14[ k ] = ( opus_int )silk_RSHIFT_ROUND( psShapeSt->HarmBoost_smth_Q16, 2 ); + psEncCtrl->HarmShapeGain_Q14[ k ] = ( opus_int )silk_RSHIFT_ROUND( psShapeSt->HarmShapeGain_smth_Q16, 2 ); + psEncCtrl->Tilt_Q14[ k ] = ( opus_int )silk_RSHIFT_ROUND( psShapeSt->Tilt_smth_Q16, 2 ); + } + RESTORE_STACK; +} +#endif /* OVERRIDE_silk_noise_shape_analysis_FIX */ diff --git a/thirdparty/opus/silk/fixed/pitch_analysis_core_FIX.c b/thirdparty/opus/silk/fixed/pitch_analysis_core_FIX.c new file mode 100644 index 000000000000..01bb9fc0a837 --- /dev/null +++ b/thirdparty/opus/silk/fixed/pitch_analysis_core_FIX.c @@ -0,0 +1,746 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/*********************************************************** +* Pitch analyser function +********************************************************** */ +#include "SigProc_FIX.h" +#include "pitch_est_defines.h" +#include "stack_alloc.h" +#include "debug.h" +#include "pitch.h" + +#define SCRATCH_SIZE 22 +#define SF_LENGTH_4KHZ ( PE_SUBFR_LENGTH_MS * 4 ) +#define SF_LENGTH_8KHZ ( PE_SUBFR_LENGTH_MS * 8 ) +#define MIN_LAG_4KHZ ( PE_MIN_LAG_MS * 4 ) +#define MIN_LAG_8KHZ ( PE_MIN_LAG_MS * 8 ) +#define MAX_LAG_4KHZ ( PE_MAX_LAG_MS * 4 ) +#define MAX_LAG_8KHZ ( PE_MAX_LAG_MS * 8 - 1 ) +#define CSTRIDE_4KHZ ( MAX_LAG_4KHZ + 1 - MIN_LAG_4KHZ ) +#define CSTRIDE_8KHZ ( MAX_LAG_8KHZ + 3 - ( MIN_LAG_8KHZ - 2 ) ) +#define D_COMP_MIN ( MIN_LAG_8KHZ - 3 ) +#define D_COMP_MAX ( MAX_LAG_8KHZ + 4 ) +#define D_COMP_STRIDE ( D_COMP_MAX - D_COMP_MIN ) + +typedef opus_int32 silk_pe_stage3_vals[ PE_NB_STAGE3_LAGS ]; + +/************************************************************/ +/* Internally used functions */ +/************************************************************/ +static void silk_P_Ana_calc_corr_st3( + silk_pe_stage3_vals cross_corr_st3[], /* O 3 DIM correlation array */ + const opus_int16 frame[], /* I vector to correlate */ + opus_int start_lag, /* I lag offset to search around */ + opus_int sf_length, /* I length of a 5 ms subframe */ + opus_int nb_subfr, /* I number of subframes */ + opus_int complexity, /* I Complexity setting */ + int arch /* I Run-time architecture */ +); + +static void silk_P_Ana_calc_energy_st3( + silk_pe_stage3_vals energies_st3[], /* O 3 DIM energy array */ + const opus_int16 frame[], /* I vector to calc energy in */ + opus_int start_lag, /* I lag offset to search around */ + opus_int sf_length, /* I length of one 5 ms subframe */ + opus_int nb_subfr, /* I number of subframes */ + opus_int complexity, /* I Complexity setting */ + int arch /* I Run-time architecture */ +); + +/*************************************************************/ +/* FIXED POINT CORE PITCH ANALYSIS FUNCTION */ +/*************************************************************/ +opus_int silk_pitch_analysis_core( /* O Voicing estimate: 0 voiced, 1 unvoiced */ + const opus_int16 *frame, /* I Signal of length PE_FRAME_LENGTH_MS*Fs_kHz */ + opus_int *pitch_out, /* O 4 pitch lag values */ + opus_int16 *lagIndex, /* O Lag Index */ + opus_int8 *contourIndex, /* O Pitch contour Index */ + opus_int *LTPCorr_Q15, /* I/O Normalized correlation; input: value from previous frame */ + opus_int prevLag, /* I Last lag of previous frame; set to zero if unvoiced */ + const opus_int32 search_thres1_Q16, /* I First stage threshold for lag candidates 0 - 1 */ + const opus_int search_thres2_Q13, /* I Final threshold for lag candidates 0 - 1 */ + const opus_int Fs_kHz, /* I Sample frequency (kHz) */ + const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */ + 
const opus_int nb_subfr, /* I number of 5 ms subframes */ + int arch /* I Run-time architecture */ +) +{ + VARDECL( opus_int16, frame_8kHz ); + VARDECL( opus_int16, frame_4kHz ); + opus_int32 filt_state[ 6 ]; + const opus_int16 *input_frame_ptr; + opus_int i, k, d, j; + VARDECL( opus_int16, C ); + VARDECL( opus_int32, xcorr32 ); + const opus_int16 *target_ptr, *basis_ptr; + opus_int32 cross_corr, normalizer, energy, shift, energy_basis, energy_target; + opus_int d_srch[ PE_D_SRCH_LENGTH ], Cmax, length_d_srch, length_d_comp; + VARDECL( opus_int16, d_comp ); + opus_int32 sum, threshold, lag_counter; + opus_int CBimax, CBimax_new, CBimax_old, lag, start_lag, end_lag, lag_new; + opus_int32 CC[ PE_NB_CBKS_STAGE2_EXT ], CCmax, CCmax_b, CCmax_new_b, CCmax_new; + VARDECL( silk_pe_stage3_vals, energies_st3 ); + VARDECL( silk_pe_stage3_vals, cross_corr_st3 ); + opus_int frame_length, frame_length_8kHz, frame_length_4kHz; + opus_int sf_length; + opus_int min_lag; + opus_int max_lag; + opus_int32 contour_bias_Q15, diff; + opus_int nb_cbk_search, cbk_size; + opus_int32 delta_lag_log2_sqr_Q7, lag_log2_Q7, prevLag_log2_Q7, prev_lag_bias_Q13; + const opus_int8 *Lag_CB_ptr; + SAVE_STACK; + /* Check for valid sampling frequency */ + silk_assert( Fs_kHz == 8 || Fs_kHz == 12 || Fs_kHz == 16 ); + + /* Check for valid complexity setting */ + silk_assert( complexity >= SILK_PE_MIN_COMPLEX ); + silk_assert( complexity <= SILK_PE_MAX_COMPLEX ); + + silk_assert( search_thres1_Q16 >= 0 && search_thres1_Q16 <= (1<<16) ); + silk_assert( search_thres2_Q13 >= 0 && search_thres2_Q13 <= (1<<13) ); + + /* Set up frame lengths max / min lag for the sampling frequency */ + frame_length = ( PE_LTP_MEM_LENGTH_MS + nb_subfr * PE_SUBFR_LENGTH_MS ) * Fs_kHz; + frame_length_4kHz = ( PE_LTP_MEM_LENGTH_MS + nb_subfr * PE_SUBFR_LENGTH_MS ) * 4; + frame_length_8kHz = ( PE_LTP_MEM_LENGTH_MS + nb_subfr * PE_SUBFR_LENGTH_MS ) * 8; + sf_length = PE_SUBFR_LENGTH_MS * Fs_kHz; + min_lag = PE_MIN_LAG_MS * Fs_kHz; + max_lag = PE_MAX_LAG_MS * Fs_kHz - 1; + + /* Resample from input sampled at Fs_kHz to 8 kHz */ + ALLOC( frame_8kHz, frame_length_8kHz, opus_int16 ); + if( Fs_kHz == 16 ) { + silk_memset( filt_state, 0, 2 * sizeof( opus_int32 ) ); + silk_resampler_down2( filt_state, frame_8kHz, frame, frame_length ); + } else if( Fs_kHz == 12 ) { + silk_memset( filt_state, 0, 6 * sizeof( opus_int32 ) ); + silk_resampler_down2_3( filt_state, frame_8kHz, frame, frame_length ); + } else { + silk_assert( Fs_kHz == 8 ); + silk_memcpy( frame_8kHz, frame, frame_length_8kHz * sizeof(opus_int16) ); + } + + /* Decimate again to 4 kHz */ + silk_memset( filt_state, 0, 2 * sizeof( opus_int32 ) );/* Set state to zero */ + ALLOC( frame_4kHz, frame_length_4kHz, opus_int16 ); + silk_resampler_down2( filt_state, frame_4kHz, frame_8kHz, frame_length_8kHz ); + + /* Low-pass filter */ + for( i = frame_length_4kHz - 1; i > 0; i-- ) { + frame_4kHz[ i ] = silk_ADD_SAT16( frame_4kHz[ i ], frame_4kHz[ i - 1 ] ); + } + + /******************************************************************************* + ** Scale 4 kHz signal down to prevent correlation measures from overflowing + ** find scaling as max scaling for each 8kHz(?) 
subframe + *******************************************************************************/ + + /* Inner product is calculated with different lengths, so scale for the worst case */ + silk_sum_sqr_shift( &energy, &shift, frame_4kHz, frame_length_4kHz ); + if( shift > 0 ) { + shift = silk_RSHIFT( shift, 1 ); + for( i = 0; i < frame_length_4kHz; i++ ) { + frame_4kHz[ i ] = silk_RSHIFT( frame_4kHz[ i ], shift ); + } + } + + /****************************************************************************** + * FIRST STAGE, operating in 4 kHz + ******************************************************************************/ + ALLOC( C, nb_subfr * CSTRIDE_8KHZ, opus_int16 ); + ALLOC( xcorr32, MAX_LAG_4KHZ-MIN_LAG_4KHZ+1, opus_int32 ); + silk_memset( C, 0, (nb_subfr >> 1) * CSTRIDE_4KHZ * sizeof( opus_int16 ) ); + target_ptr = &frame_4kHz[ silk_LSHIFT( SF_LENGTH_4KHZ, 2 ) ]; + for( k = 0; k < nb_subfr >> 1; k++ ) { + /* Check that we are within range of the array */ + silk_assert( target_ptr >= frame_4kHz ); + silk_assert( target_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz ); + + basis_ptr = target_ptr - MIN_LAG_4KHZ; + + /* Check that we are within range of the array */ + silk_assert( basis_ptr >= frame_4kHz ); + silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz ); + + celt_pitch_xcorr( target_ptr, target_ptr - MAX_LAG_4KHZ, xcorr32, SF_LENGTH_8KHZ, MAX_LAG_4KHZ - MIN_LAG_4KHZ + 1, arch ); + + /* Calculate first vector products before loop */ + cross_corr = xcorr32[ MAX_LAG_4KHZ - MIN_LAG_4KHZ ]; + normalizer = silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ, arch ); + normalizer = silk_ADD32( normalizer, silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ, arch ) ); + normalizer = silk_ADD32( normalizer, silk_SMULBB( SF_LENGTH_8KHZ, 4000 ) ); + + matrix_ptr( C, k, 0, CSTRIDE_4KHZ ) = + (opus_int16)silk_DIV32_varQ( cross_corr, normalizer, 13 + 1 ); /* Q13 */ + + /* From now on normalizer is computed recursively */ + for( d = MIN_LAG_4KHZ + 1; d <= MAX_LAG_4KHZ; d++ ) { + basis_ptr--; + + /* Check that we are within range of the array */ + silk_assert( basis_ptr >= frame_4kHz ); + silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_4kHz + frame_length_4kHz ); + + cross_corr = xcorr32[ MAX_LAG_4KHZ - d ]; + + /* Add contribution of new sample and remove contribution from oldest sample */ + normalizer = silk_ADD32( normalizer, + silk_SMULBB( basis_ptr[ 0 ], basis_ptr[ 0 ] ) - + silk_SMULBB( basis_ptr[ SF_LENGTH_8KHZ ], basis_ptr[ SF_LENGTH_8KHZ ] ) ); + + matrix_ptr( C, k, d - MIN_LAG_4KHZ, CSTRIDE_4KHZ) = + (opus_int16)silk_DIV32_varQ( cross_corr, normalizer, 13 + 1 ); /* Q13 */ + } + /* Update target pointer */ + target_ptr += SF_LENGTH_8KHZ; + } + + /* Combine two subframes into single correlation measure and apply short-lag bias */ + if( nb_subfr == PE_MAX_NB_SUBFR ) { + for( i = MAX_LAG_4KHZ; i >= MIN_LAG_4KHZ; i-- ) { + sum = (opus_int32)matrix_ptr( C, 0, i - MIN_LAG_4KHZ, CSTRIDE_4KHZ ) + + (opus_int32)matrix_ptr( C, 1, i - MIN_LAG_4KHZ, CSTRIDE_4KHZ ); /* Q14 */ + sum = silk_SMLAWB( sum, sum, silk_LSHIFT( -i, 4 ) ); /* Q14 */ + C[ i - MIN_LAG_4KHZ ] = (opus_int16)sum; /* Q14 */ + } + } else { + /* Only short-lag bias */ + for( i = MAX_LAG_4KHZ; i >= MIN_LAG_4KHZ; i-- ) { + sum = silk_LSHIFT( (opus_int32)C[ i - MIN_LAG_4KHZ ], 1 ); /* Q14 */ + sum = silk_SMLAWB( sum, sum, silk_LSHIFT( -i, 4 ) ); /* Q14 */ + C[ i - MIN_LAG_4KHZ ] = (opus_int16)sum; /* Q14 */ + } + } + + /* Sort */ + length_d_srch = silk_ADD_LSHIFT32( 4, complexity, 1 ); + 
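/* silk_ADD_LSHIFT32( 4, complexity, 1 ) = 4 + ( complexity << 1 ), i.e. 4, 6 or 8 candidate lags for complexity settings 0-2 */ +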
silk_assert( 3 * length_d_srch <= PE_D_SRCH_LENGTH ); + silk_insertion_sort_decreasing_int16( C, d_srch, CSTRIDE_4KHZ, + length_d_srch ); + + /* Escape early if the correlation is already very low */ + Cmax = (opus_int)C[ 0 ]; /* Q14 */ + if( Cmax < SILK_FIX_CONST( 0.2, 14 ) ) { + silk_memset( pitch_out, 0, nb_subfr * sizeof( opus_int ) ); + *LTPCorr_Q15 = 0; + *lagIndex = 0; + *contourIndex = 0; + RESTORE_STACK; + return 1; + } + + threshold = silk_SMULWB( search_thres1_Q16, Cmax ); + for( i = 0; i < length_d_srch; i++ ) { + /* Convert to 8 kHz indices for the sorted correlation that exceeds the threshold */ + if( C[ i ] > threshold ) { + d_srch[ i ] = silk_LSHIFT( d_srch[ i ] + MIN_LAG_4KHZ, 1 ); + } else { + length_d_srch = i; + break; + } + } + silk_assert( length_d_srch > 0 ); + + ALLOC( d_comp, D_COMP_STRIDE, opus_int16 ); + for( i = D_COMP_MIN; i < D_COMP_MAX; i++ ) { + d_comp[ i - D_COMP_MIN ] = 0; + } + for( i = 0; i < length_d_srch; i++ ) { + d_comp[ d_srch[ i ] - D_COMP_MIN ] = 1; + } + + /* Convolution */ + for( i = D_COMP_MAX - 1; i >= MIN_LAG_8KHZ; i-- ) { + d_comp[ i - D_COMP_MIN ] += + d_comp[ i - 1 - D_COMP_MIN ] + d_comp[ i - 2 - D_COMP_MIN ]; + } + + length_d_srch = 0; + for( i = MIN_LAG_8KHZ; i < MAX_LAG_8KHZ + 1; i++ ) { + if( d_comp[ i + 1 - D_COMP_MIN ] > 0 ) { + d_srch[ length_d_srch ] = i; + length_d_srch++; + } + } + + /* Convolution */ + for( i = D_COMP_MAX - 1; i >= MIN_LAG_8KHZ; i-- ) { + d_comp[ i - D_COMP_MIN ] += d_comp[ i - 1 - D_COMP_MIN ] + + d_comp[ i - 2 - D_COMP_MIN ] + d_comp[ i - 3 - D_COMP_MIN ]; + } + + length_d_comp = 0; + for( i = MIN_LAG_8KHZ; i < D_COMP_MAX; i++ ) { + if( d_comp[ i - D_COMP_MIN ] > 0 ) { + d_comp[ length_d_comp ] = i - 2; + length_d_comp++; + } + } + + /********************************************************************************** + ** SECOND STAGE, operating at 8 kHz, on lag sections with high correlation + *************************************************************************************/ + + /****************************************************************************** + ** Scale signal down to prevent correlation measures from overflowing + *******************************************************************************/ + /* find scaling as max scaling for each subframe */ + silk_sum_sqr_shift( &energy, &shift, frame_8kHz, frame_length_8kHz ); + if( shift > 0 ) { + shift = silk_RSHIFT( shift, 1 ); + for( i = 0; i < frame_length_8kHz; i++ ) { + frame_8kHz[ i ] = silk_RSHIFT( frame_8kHz[ i ], shift ); + } + } + + /********************************************************************************* + * Find energy of each subframe projected onto its history, for a range of delays + *********************************************************************************/ + silk_memset( C, 0, nb_subfr * CSTRIDE_8KHZ * sizeof( opus_int16 ) ); + + target_ptr = &frame_8kHz[ PE_LTP_MEM_LENGTH_MS * 8 ]; + for( k = 0; k < nb_subfr; k++ ) { + + /* Check that we are within range of the array */ + silk_assert( target_ptr >= frame_8kHz ); + silk_assert( target_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz ); + + energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, SF_LENGTH_8KHZ, arch ), 1 ); + for( j = 0; j < length_d_comp; j++ ) { + d = d_comp[ j ]; + basis_ptr = target_ptr - d; + + /* Check that we are within range of the array */ + silk_assert( basis_ptr >= frame_8kHz ); + silk_assert( basis_ptr + SF_LENGTH_8KHZ <= frame_8kHz + frame_length_8kHz ); + + cross_corr = silk_inner_prod_aligned( target_ptr, 
basis_ptr, SF_LENGTH_8KHZ, arch ); + if( cross_corr > 0 ) { + energy_basis = silk_inner_prod_aligned( basis_ptr, basis_ptr, SF_LENGTH_8KHZ, arch ); + matrix_ptr( C, k, d - ( MIN_LAG_8KHZ - 2 ), CSTRIDE_8KHZ ) = + (opus_int16)silk_DIV32_varQ( cross_corr, + silk_ADD32( energy_target, + energy_basis ), + 13 + 1 ); /* Q13 */ + } else { + matrix_ptr( C, k, d - ( MIN_LAG_8KHZ - 2 ), CSTRIDE_8KHZ ) = 0; + } + } + target_ptr += SF_LENGTH_8KHZ; + } + + /* search over lag range and lags codebook */ + /* scale factor for lag codebook, as a function of center lag */ + + CCmax = silk_int32_MIN; + CCmax_b = silk_int32_MIN; + + CBimax = 0; /* To avoid returning undefined lag values */ + lag = -1; /* To check if lag with strong enough correlation has been found */ + + if( prevLag > 0 ) { + if( Fs_kHz == 12 ) { + prevLag = silk_DIV32_16( silk_LSHIFT( prevLag, 1 ), 3 ); + } else if( Fs_kHz == 16 ) { + prevLag = silk_RSHIFT( prevLag, 1 ); + } + prevLag_log2_Q7 = silk_lin2log( (opus_int32)prevLag ); + } else { + prevLag_log2_Q7 = 0; + } + silk_assert( search_thres2_Q13 == silk_SAT16( search_thres2_Q13 ) ); + /* Set up stage 2 codebook based on number of subframes */ + if( nb_subfr == PE_MAX_NB_SUBFR ) { + cbk_size = PE_NB_CBKS_STAGE2_EXT; + Lag_CB_ptr = &silk_CB_lags_stage2[ 0 ][ 0 ]; + if( Fs_kHz == 8 && complexity > SILK_PE_MIN_COMPLEX ) { + /* If input is 8 kHz, use a larger codebook here because it is the last stage */ + nb_cbk_search = PE_NB_CBKS_STAGE2_EXT; + } else { + nb_cbk_search = PE_NB_CBKS_STAGE2; + } + } else { + cbk_size = PE_NB_CBKS_STAGE2_10MS; + Lag_CB_ptr = &silk_CB_lags_stage2_10_ms[ 0 ][ 0 ]; + nb_cbk_search = PE_NB_CBKS_STAGE2_10MS; + } + + for( k = 0; k < length_d_srch; k++ ) { + d = d_srch[ k ]; + for( j = 0; j < nb_cbk_search; j++ ) { + CC[ j ] = 0; + for( i = 0; i < nb_subfr; i++ ) { + opus_int d_subfr; + /* Try all codebooks */ + d_subfr = d + matrix_ptr( Lag_CB_ptr, i, j, cbk_size ); + CC[ j ] = CC[ j ] + + (opus_int32)matrix_ptr( C, i, + d_subfr - ( MIN_LAG_8KHZ - 2 ), + CSTRIDE_8KHZ ); + } + } + /* Find best codebook */ + CCmax_new = silk_int32_MIN; + CBimax_new = 0; + for( i = 0; i < nb_cbk_search; i++ ) { + if( CC[ i ] > CCmax_new ) { + CCmax_new = CC[ i ]; + CBimax_new = i; + } + } + + /* Bias towards shorter lags */ + lag_log2_Q7 = silk_lin2log( d ); /* Q7 */ + silk_assert( lag_log2_Q7 == silk_SAT16( lag_log2_Q7 ) ); + silk_assert( nb_subfr * SILK_FIX_CONST( PE_SHORTLAG_BIAS, 13 ) == silk_SAT16( nb_subfr * SILK_FIX_CONST( PE_SHORTLAG_BIAS, 13 ) ) ); + CCmax_new_b = CCmax_new - silk_RSHIFT( silk_SMULBB( nb_subfr * SILK_FIX_CONST( PE_SHORTLAG_BIAS, 13 ), lag_log2_Q7 ), 7 ); /* Q13 */ + + /* Bias towards previous lag */ + silk_assert( nb_subfr * SILK_FIX_CONST( PE_PREVLAG_BIAS, 13 ) == silk_SAT16( nb_subfr * SILK_FIX_CONST( PE_PREVLAG_BIAS, 13 ) ) ); + if( prevLag > 0 ) { + delta_lag_log2_sqr_Q7 = lag_log2_Q7 - prevLag_log2_Q7; + silk_assert( delta_lag_log2_sqr_Q7 == silk_SAT16( delta_lag_log2_sqr_Q7 ) ); + delta_lag_log2_sqr_Q7 = silk_RSHIFT( silk_SMULBB( delta_lag_log2_sqr_Q7, delta_lag_log2_sqr_Q7 ), 7 ); + prev_lag_bias_Q13 = silk_RSHIFT( silk_SMULBB( nb_subfr * SILK_FIX_CONST( PE_PREVLAG_BIAS, 13 ), *LTPCorr_Q15 ), 15 ); /* Q13 */ + prev_lag_bias_Q13 = silk_DIV32( silk_MUL( prev_lag_bias_Q13, delta_lag_log2_sqr_Q7 ), delta_lag_log2_sqr_Q7 + SILK_FIX_CONST( 0.5, 7 ) ); + CCmax_new_b -= prev_lag_bias_Q13; /* Q13 */ + } + + if( CCmax_new_b > CCmax_b && /* Find maximum biased correlation */ + CCmax_new > silk_SMULBB( nb_subfr, search_thres2_Q13 ) && /* Correlation needs to be high 
enough to be voiced */ + silk_CB_lags_stage2[ 0 ][ CBimax_new ] <= MIN_LAG_8KHZ /* Lag must be in range */ + ) { + CCmax_b = CCmax_new_b; + CCmax = CCmax_new; + lag = d; + CBimax = CBimax_new; + } + } + + if( lag == -1 ) { + /* No suitable candidate found */ + silk_memset( pitch_out, 0, nb_subfr * sizeof( opus_int ) ); + *LTPCorr_Q15 = 0; + *lagIndex = 0; + *contourIndex = 0; + RESTORE_STACK; + return 1; + } + + /* Output normalized correlation */ + *LTPCorr_Q15 = (opus_int)silk_LSHIFT( silk_DIV32_16( CCmax, nb_subfr ), 2 ); + silk_assert( *LTPCorr_Q15 >= 0 ); + + if( Fs_kHz > 8 ) { + VARDECL( opus_int16, scratch_mem ); + /***************************************************************************/ + /* Scale input signal down to avoid correlations measures from overflowing */ + /***************************************************************************/ + /* find scaling as max scaling for each subframe */ + silk_sum_sqr_shift( &energy, &shift, frame, frame_length ); + ALLOC( scratch_mem, shift > 0 ? frame_length : ALLOC_NONE, opus_int16 ); + if( shift > 0 ) { + /* Move signal to scratch mem because the input signal should be unchanged */ + shift = silk_RSHIFT( shift, 1 ); + for( i = 0; i < frame_length; i++ ) { + scratch_mem[ i ] = silk_RSHIFT( frame[ i ], shift ); + } + input_frame_ptr = scratch_mem; + } else { + input_frame_ptr = frame; + } + + /* Search in original signal */ + + CBimax_old = CBimax; + /* Compensate for decimation */ + silk_assert( lag == silk_SAT16( lag ) ); + if( Fs_kHz == 12 ) { + lag = silk_RSHIFT( silk_SMULBB( lag, 3 ), 1 ); + } else if( Fs_kHz == 16 ) { + lag = silk_LSHIFT( lag, 1 ); + } else { + lag = silk_SMULBB( lag, 3 ); + } + + lag = silk_LIMIT_int( lag, min_lag, max_lag ); + start_lag = silk_max_int( lag - 2, min_lag ); + end_lag = silk_min_int( lag + 2, max_lag ); + lag_new = lag; /* to avoid undefined lag */ + CBimax = 0; /* to avoid undefined lag */ + + CCmax = silk_int32_MIN; + /* pitch lags according to second stage */ + for( k = 0; k < nb_subfr; k++ ) { + pitch_out[ k ] = lag + 2 * silk_CB_lags_stage2[ k ][ CBimax_old ]; + } + + /* Set up codebook parameters according to complexity setting and frame length */ + if( nb_subfr == PE_MAX_NB_SUBFR ) { + nb_cbk_search = (opus_int)silk_nb_cbk_searchs_stage3[ complexity ]; + cbk_size = PE_NB_CBKS_STAGE3_MAX; + Lag_CB_ptr = &silk_CB_lags_stage3[ 0 ][ 0 ]; + } else { + nb_cbk_search = PE_NB_CBKS_STAGE3_10MS; + cbk_size = PE_NB_CBKS_STAGE3_10MS; + Lag_CB_ptr = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ]; + } + + /* Calculate the correlations and energies needed in stage 3 */ + ALLOC( energies_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals ); + ALLOC( cross_corr_st3, nb_subfr * nb_cbk_search, silk_pe_stage3_vals ); + silk_P_Ana_calc_corr_st3( cross_corr_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch ); + silk_P_Ana_calc_energy_st3( energies_st3, input_frame_ptr, start_lag, sf_length, nb_subfr, complexity, arch ); + + lag_counter = 0; + silk_assert( lag == silk_SAT16( lag ) ); + contour_bias_Q15 = silk_DIV32_16( SILK_FIX_CONST( PE_FLATCONTOUR_BIAS, 15 ), lag ); + + target_ptr = &input_frame_ptr[ PE_LTP_MEM_LENGTH_MS * Fs_kHz ]; + energy_target = silk_ADD32( silk_inner_prod_aligned( target_ptr, target_ptr, nb_subfr * sf_length, arch ), 1 ); + for( d = start_lag; d <= end_lag; d++ ) { + for( j = 0; j < nb_cbk_search; j++ ) { + cross_corr = 0; + energy = energy_target; + for( k = 0; k < nb_subfr; k++ ) { + cross_corr = silk_ADD32( cross_corr, + matrix_ptr( cross_corr_st3, k, j, + nb_cbk_search 
)[ lag_counter ] );
+                energy = silk_ADD32( energy,
+                    matrix_ptr( energies_st3, k, j,
+                        nb_cbk_search )[ lag_counter ] );
+                silk_assert( energy >= 0 );
+            }
+            if( cross_corr > 0 ) {
+                CCmax_new = silk_DIV32_varQ( cross_corr, energy, 13 + 1 );            /* Q13 */
+                /* Reduce depending on flatness of contour */
+                diff = silk_int16_MAX - silk_MUL( contour_bias_Q15, j );              /* Q15 */
+                silk_assert( diff == silk_SAT16( diff ) );
+                CCmax_new = silk_SMULWB( CCmax_new, diff );                           /* Q14 */
+            } else {
+                CCmax_new = 0;
+            }
+
+            if( CCmax_new > CCmax && ( d + silk_CB_lags_stage3[ 0 ][ j ] ) <= max_lag ) {
+                CCmax   = CCmax_new;
+                lag_new = d;
+                CBimax  = j;
+            }
+        }
+        lag_counter++;
+    }
+
+    for( k = 0; k < nb_subfr; k++ ) {
+        pitch_out[ k ] = lag_new + matrix_ptr( Lag_CB_ptr, k, CBimax, cbk_size );
+        pitch_out[ k ] = silk_LIMIT( pitch_out[ k ], min_lag, PE_MAX_LAG_MS * Fs_kHz );
+    }
+    *lagIndex = (opus_int16)( lag_new - min_lag );
+    *contourIndex = (opus_int8)CBimax;
+    } else {        /* Fs_kHz == 8 */
+        /* Save Lags */
+        for( k = 0; k < nb_subfr; k++ ) {
+            pitch_out[ k ] = lag + matrix_ptr( Lag_CB_ptr, k, CBimax, cbk_size );
+            pitch_out[ k ] = silk_LIMIT( pitch_out[ k ], MIN_LAG_8KHZ, PE_MAX_LAG_MS * 8 );
+        }
+        *lagIndex = (opus_int16)( lag - MIN_LAG_8KHZ );
+        *contourIndex = (opus_int8)CBimax;
+    }
+    silk_assert( *lagIndex >= 0 );
+    /* return as voiced */
+    RESTORE_STACK;
+    return 0;
+}
+
+/***********************************************************************
+ * Calculates the correlations used in stage 3 search. In order to cover
+ * the whole lag codebook for all the searched offset lags (lag +- 2),
+ * the following correlations are needed in each sub frame:
+ *
+ * sf1: lag range [-8,...,7] total 16 correlations
+ * sf2: lag range [-4,...,4] total 9 correlations
+ * sf3: lag range [-3,...,4] total 8 correlations
+ * sf4: lag range [-6,...,8] total 15 correlations
+ *
+ * In total 48 correlations. The direct implementation computes in the
+ * worst case 4*12*5 = 240 correlations, but more likely around 120.
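That table is the whole point of this helper: the correlations for the union of lag offsets are computed once per subframe, and each codebook contour is then scored by table lookup instead of new inner products. A minimal float sketch of the same precompute-then-look-up pattern, with hypothetical names and sizes (`precompute_corr`, `score_contour`, `N_LAGS` are not part of SILK):

```c
#include <stddef.h>

#define N_LAGS 16   /* lags covered per subframe row (assumed size) */

static float inner_prod(const float *a, const float *b, size_t n)
{
    float s = 0.0f;
    for (size_t i = 0; i < n; i++) {
        s += a[i] * b[i];
    }
    return s;
}

/* One row per subframe: correlate the target against every basis lag in
 * [d_low, d_low + N_LAGS) once. target is assumed to point into a longer
 * buffer with at least d_low + N_LAGS samples of history before index 0. */
static void precompute_corr(float corr[N_LAGS], const float *target,
                            size_t sf_len, int d_low)
{
    for (int d = 0; d < N_LAGS; d++) {
        corr[d] = inner_prod(target, target - (d_low + d), sf_len);
    }
}

/* Scoring a contour is then pure table lookup; offsets[k] must fall
 * inside [d_low, d_low + N_LAGS) for every subframe. */
static float score_contour(const float corr[][N_LAGS], const int offsets[],
                           int n_subfr, int d_low)
{
    float s = 0.0f;
    for (int k = 0; k < n_subfr; k++) {
        s += corr[k][offsets[k] - d_low];
    }
    return s;
}
```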
+ ***********************************************************************/ +static void silk_P_Ana_calc_corr_st3( + silk_pe_stage3_vals cross_corr_st3[], /* O 3 DIM correlation array */ + const opus_int16 frame[], /* I vector to correlate */ + opus_int start_lag, /* I lag offset to search around */ + opus_int sf_length, /* I length of a 5 ms subframe */ + opus_int nb_subfr, /* I number of subframes */ + opus_int complexity, /* I Complexity setting */ + int arch /* I Run-time architecture */ +) +{ + const opus_int16 *target_ptr; + opus_int i, j, k, lag_counter, lag_low, lag_high; + opus_int nb_cbk_search, delta, idx, cbk_size; + VARDECL( opus_int32, scratch_mem ); + VARDECL( opus_int32, xcorr32 ); + const opus_int8 *Lag_range_ptr, *Lag_CB_ptr; + SAVE_STACK; + + silk_assert( complexity >= SILK_PE_MIN_COMPLEX ); + silk_assert( complexity <= SILK_PE_MAX_COMPLEX ); + + if( nb_subfr == PE_MAX_NB_SUBFR ) { + Lag_range_ptr = &silk_Lag_range_stage3[ complexity ][ 0 ][ 0 ]; + Lag_CB_ptr = &silk_CB_lags_stage3[ 0 ][ 0 ]; + nb_cbk_search = silk_nb_cbk_searchs_stage3[ complexity ]; + cbk_size = PE_NB_CBKS_STAGE3_MAX; + } else { + silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1); + Lag_range_ptr = &silk_Lag_range_stage3_10_ms[ 0 ][ 0 ]; + Lag_CB_ptr = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ]; + nb_cbk_search = PE_NB_CBKS_STAGE3_10MS; + cbk_size = PE_NB_CBKS_STAGE3_10MS; + } + ALLOC( scratch_mem, SCRATCH_SIZE, opus_int32 ); + ALLOC( xcorr32, SCRATCH_SIZE, opus_int32 ); + + target_ptr = &frame[ silk_LSHIFT( sf_length, 2 ) ]; /* Pointer to middle of frame */ + for( k = 0; k < nb_subfr; k++ ) { + lag_counter = 0; + + /* Calculate the correlations for each subframe */ + lag_low = matrix_ptr( Lag_range_ptr, k, 0, 2 ); + lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 ); + silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE); + celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr32, sf_length, lag_high - lag_low + 1, arch ); + for( j = lag_low; j <= lag_high; j++ ) { + silk_assert( lag_counter < SCRATCH_SIZE ); + scratch_mem[ lag_counter ] = xcorr32[ lag_high - j ]; + lag_counter++; + } + + delta = matrix_ptr( Lag_range_ptr, k, 0, 2 ); + for( i = 0; i < nb_cbk_search; i++ ) { + /* Fill out the 3 dim array that stores the correlations for */ + /* each code_book vector for each start lag */ + idx = matrix_ptr( Lag_CB_ptr, k, i, cbk_size ) - delta; + for( j = 0; j < PE_NB_STAGE3_LAGS; j++ ) { + silk_assert( idx + j < SCRATCH_SIZE ); + silk_assert( idx + j < lag_counter ); + matrix_ptr( cross_corr_st3, k, i, nb_cbk_search )[ j ] = + scratch_mem[ idx + j ]; + } + } + target_ptr += sf_length; + } + RESTORE_STACK; +} + +/********************************************************************/ +/* Calculate the energies for first two subframes. The energies are */ +/* calculated recursively. 
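The recursion that sentence refers to: only the first lag's energy needs a full inner product; each further lag shifts the analysis window one sample into the past, so one sample's square leaves the sum and one enters. A float sketch under the assumption that `basis` points at least `n_lags - 1` samples into a longer history buffer (`sliding_energies` is a hypothetical name):

```c
static void sliding_energies(float *nrg_out, const float *basis,
                             int sf_len, int n_lags)
{
    /* Full inner product only for the first lag */
    float nrg = 0.0f;
    for (int i = 0; i < sf_len; i++) {
        nrg += basis[i] * basis[i];
    }
    nrg_out[0] = nrg;

    /* Each further lag moves the window one sample towards the past:
     * O(1) per lag instead of O(sf_len). */
    for (int d = 1; d < n_lags; d++) {
        nrg -= basis[sf_len - d] * basis[sf_len - d]; /* sample leaving  */
        nrg += basis[-d] * basis[-d];                 /* sample entering */
        nrg_out[d] = nrg;
    }
}
```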
*/ +/********************************************************************/ +static void silk_P_Ana_calc_energy_st3( + silk_pe_stage3_vals energies_st3[], /* O 3 DIM energy array */ + const opus_int16 frame[], /* I vector to calc energy in */ + opus_int start_lag, /* I lag offset to search around */ + opus_int sf_length, /* I length of one 5 ms subframe */ + opus_int nb_subfr, /* I number of subframes */ + opus_int complexity, /* I Complexity setting */ + int arch /* I Run-time architecture */ +) +{ + const opus_int16 *target_ptr, *basis_ptr; + opus_int32 energy; + opus_int k, i, j, lag_counter; + opus_int nb_cbk_search, delta, idx, cbk_size, lag_diff; + VARDECL( opus_int32, scratch_mem ); + const opus_int8 *Lag_range_ptr, *Lag_CB_ptr; + SAVE_STACK; + + silk_assert( complexity >= SILK_PE_MIN_COMPLEX ); + silk_assert( complexity <= SILK_PE_MAX_COMPLEX ); + + if( nb_subfr == PE_MAX_NB_SUBFR ) { + Lag_range_ptr = &silk_Lag_range_stage3[ complexity ][ 0 ][ 0 ]; + Lag_CB_ptr = &silk_CB_lags_stage3[ 0 ][ 0 ]; + nb_cbk_search = silk_nb_cbk_searchs_stage3[ complexity ]; + cbk_size = PE_NB_CBKS_STAGE3_MAX; + } else { + silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1); + Lag_range_ptr = &silk_Lag_range_stage3_10_ms[ 0 ][ 0 ]; + Lag_CB_ptr = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ]; + nb_cbk_search = PE_NB_CBKS_STAGE3_10MS; + cbk_size = PE_NB_CBKS_STAGE3_10MS; + } + ALLOC( scratch_mem, SCRATCH_SIZE, opus_int32 ); + + target_ptr = &frame[ silk_LSHIFT( sf_length, 2 ) ]; + for( k = 0; k < nb_subfr; k++ ) { + lag_counter = 0; + + /* Calculate the energy for first lag */ + basis_ptr = target_ptr - ( start_lag + matrix_ptr( Lag_range_ptr, k, 0, 2 ) ); + energy = silk_inner_prod_aligned( basis_ptr, basis_ptr, sf_length, arch ); + silk_assert( energy >= 0 ); + scratch_mem[ lag_counter ] = energy; + lag_counter++; + + lag_diff = ( matrix_ptr( Lag_range_ptr, k, 1, 2 ) - matrix_ptr( Lag_range_ptr, k, 0, 2 ) + 1 ); + for( i = 1; i < lag_diff; i++ ) { + /* remove part outside new window */ + energy -= silk_SMULBB( basis_ptr[ sf_length - i ], basis_ptr[ sf_length - i ] ); + silk_assert( energy >= 0 ); + + /* add part that comes into window */ + energy = silk_ADD_SAT32( energy, silk_SMULBB( basis_ptr[ -i ], basis_ptr[ -i ] ) ); + silk_assert( energy >= 0 ); + silk_assert( lag_counter < SCRATCH_SIZE ); + scratch_mem[ lag_counter ] = energy; + lag_counter++; + } + + delta = matrix_ptr( Lag_range_ptr, k, 0, 2 ); + for( i = 0; i < nb_cbk_search; i++ ) { + /* Fill out the 3 dim array that stores the correlations for */ + /* each code_book vector for each start lag */ + idx = matrix_ptr( Lag_CB_ptr, k, i, cbk_size ) - delta; + for( j = 0; j < PE_NB_STAGE3_LAGS; j++ ) { + silk_assert( idx + j < SCRATCH_SIZE ); + silk_assert( idx + j < lag_counter ); + matrix_ptr( energies_st3, k, i, nb_cbk_search )[ j ] = + scratch_mem[ idx + j ]; + silk_assert( + matrix_ptr( energies_st3, k, i, nb_cbk_search )[ j ] >= 0 ); + } + } + target_ptr += sf_length; + } + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/fixed/prefilter_FIX.c b/thirdparty/opus/silk/fixed/prefilter_FIX.c new file mode 100644 index 000000000000..6a8e35152efb --- /dev/null +++ b/thirdparty/opus/silk/fixed/prefilter_FIX.c @@ -0,0 +1,221 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "stack_alloc.h" +#include "tuning_parameters.h" + +#if defined(MIPSr1_ASM) +#include "mips/prefilter_FIX_mipsr1.h" +#endif + + +#if !defined(OVERRIDE_silk_warped_LPC_analysis_filter_FIX) +#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \ + ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) +#endif + +/* Prefilter for finding Quantizer input signal */ +static OPUS_INLINE void silk_prefilt_FIX( + silk_prefilter_state_FIX *P, /* I/O state */ + opus_int32 st_res_Q12[], /* I short term residual signal */ + opus_int32 xw_Q3[], /* O prefiltered signal */ + opus_int32 HarmShapeFIRPacked_Q12, /* I Harmonic shaping coeficients */ + opus_int Tilt_Q14, /* I Tilt shaping coeficient */ + opus_int32 LF_shp_Q14, /* I Low-frequancy shaping coeficients */ + opus_int lag, /* I Lag for harmonic shaping */ + opus_int length /* I Length of signals */ +); + +void silk_warped_LPC_analysis_filter_FIX_c( + opus_int32 state[], /* I/O State [order + 1] */ + opus_int32 res_Q2[], /* O Residual signal [length] */ + const opus_int16 coef_Q13[], /* I Coefficients [order] */ + const opus_int16 input[], /* I Input signal [length] */ + const opus_int16 lambda_Q16, /* I Warping factor */ + const opus_int length, /* I Length of input signal */ + const opus_int order /* I Filter order (even) */ +) +{ + opus_int n, i; + opus_int32 acc_Q11, tmp1, tmp2; + + /* Order must be even */ + silk_assert( ( order & 1 ) == 0 ); + + for( n = 0; n < length; n++ ) { + /* Output of lowpass section */ + tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 ); + state[ 0 ] = silk_LSHIFT( input[ n ], 14 ); + /* Output of allpass section */ + tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 ); + state[ 1 ] = tmp2; + acc_Q11 = silk_RSHIFT( order, 1 ); + acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] ); + /* Loop 
over allpass sections */ + for( i = 2; i < order; i += 2 ) { + /* Output of allpass section */ + tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 ); + state[ i ] = tmp1; + acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] ); + /* Output of allpass section */ + tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 ); + state[ i + 1 ] = tmp2; + acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] ); + } + state[ order ] = tmp1; + acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] ); + res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 ); + } +} + +void silk_prefilter_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Encoder state */ + const silk_encoder_control_FIX *psEncCtrl, /* I Encoder control */ + opus_int32 xw_Q3[], /* O Weighted signal */ + const opus_int16 x[] /* I Speech signal */ +) +{ + silk_prefilter_state_FIX *P = &psEnc->sPrefilt; + opus_int j, k, lag; + opus_int32 tmp_32; + const opus_int16 *AR1_shp_Q13; + const opus_int16 *px; + opus_int32 *pxw_Q3; + opus_int HarmShapeGain_Q12, Tilt_Q14; + opus_int32 HarmShapeFIRPacked_Q12, LF_shp_Q14; + VARDECL( opus_int32, x_filt_Q12 ); + VARDECL( opus_int32, st_res_Q2 ); + opus_int16 B_Q10[ 2 ]; + SAVE_STACK; + + /* Set up pointers */ + px = x; + pxw_Q3 = xw_Q3; + lag = P->lagPrev; + ALLOC( x_filt_Q12, psEnc->sCmn.subfr_length, opus_int32 ); + ALLOC( st_res_Q2, psEnc->sCmn.subfr_length, opus_int32 ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + /* Update Variables that change per sub frame */ + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + lag = psEncCtrl->pitchL[ k ]; + } + + /* Noise shape parameters */ + HarmShapeGain_Q12 = silk_SMULWB( (opus_int32)psEncCtrl->HarmShapeGain_Q14[ k ], 16384 - psEncCtrl->HarmBoost_Q14[ k ] ); + silk_assert( HarmShapeGain_Q12 >= 0 ); + HarmShapeFIRPacked_Q12 = silk_RSHIFT( HarmShapeGain_Q12, 2 ); + HarmShapeFIRPacked_Q12 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q12, 1 ), 16 ); + Tilt_Q14 = psEncCtrl->Tilt_Q14[ k ]; + LF_shp_Q14 = psEncCtrl->LF_shp_Q14[ k ]; + AR1_shp_Q13 = &psEncCtrl->AR1_Q13[ k * MAX_SHAPE_LPC_ORDER ]; + + /* Short term FIR filtering*/ + silk_warped_LPC_analysis_filter_FIX( P->sAR_shp, st_res_Q2, AR1_shp_Q13, px, + psEnc->sCmn.warping_Q16, psEnc->sCmn.subfr_length, psEnc->sCmn.shapingLPCOrder, psEnc->sCmn.arch ); + + /* Reduce (mainly) low frequencies during harmonic emphasis */ + B_Q10[ 0 ] = silk_RSHIFT_ROUND( psEncCtrl->GainsPre_Q14[ k ], 4 ); + tmp_32 = silk_SMLABB( SILK_FIX_CONST( INPUT_TILT, 26 ), psEncCtrl->HarmBoost_Q14[ k ], HarmShapeGain_Q12 ); /* Q26 */ + tmp_32 = silk_SMLABB( tmp_32, psEncCtrl->coding_quality_Q14, SILK_FIX_CONST( HIGH_RATE_INPUT_TILT, 12 ) ); /* Q26 */ + tmp_32 = silk_SMULWB( tmp_32, -psEncCtrl->GainsPre_Q14[ k ] ); /* Q24 */ + tmp_32 = silk_RSHIFT_ROUND( tmp_32, 14 ); /* Q10 */ + B_Q10[ 1 ]= silk_SAT16( tmp_32 ); + x_filt_Q12[ 0 ] = silk_MLA( silk_MUL( st_res_Q2[ 0 ], B_Q10[ 0 ] ), P->sHarmHP_Q2, B_Q10[ 1 ] ); + for( j = 1; j < psEnc->sCmn.subfr_length; j++ ) { + x_filt_Q12[ j ] = silk_MLA( silk_MUL( st_res_Q2[ j ], B_Q10[ 0 ] ), st_res_Q2[ j - 1 ], B_Q10[ 1 ] ); + } + P->sHarmHP_Q2 = st_res_Q2[ psEnc->sCmn.subfr_length - 1 ]; + + silk_prefilt_FIX( P, x_filt_Q12, pxw_Q3, HarmShapeFIRPacked_Q12, Tilt_Q14, LF_shp_Q14, lag, psEnc->sCmn.subfr_length ); + + px += psEnc->sCmn.subfr_length; + pxw_Q3 += psEnc->sCmn.subfr_length; + } + + P->lagPrev = psEncCtrl->pitchL[ psEnc->sCmn.nb_subfr - 1 ]; + RESTORE_STACK; +} + +#ifndef OVERRIDE_silk_prefilt_FIX +/* 
Prefilter for finding Quantizer input signal */ +static OPUS_INLINE void silk_prefilt_FIX( + silk_prefilter_state_FIX *P, /* I/O state */ + opus_int32 st_res_Q12[], /* I short term residual signal */ + opus_int32 xw_Q3[], /* O prefiltered signal */ + opus_int32 HarmShapeFIRPacked_Q12, /* I Harmonic shaping coeficients */ + opus_int Tilt_Q14, /* I Tilt shaping coeficient */ + opus_int32 LF_shp_Q14, /* I Low-frequancy shaping coeficients */ + opus_int lag, /* I Lag for harmonic shaping */ + opus_int length /* I Length of signals */ +) +{ + opus_int i, idx, LTP_shp_buf_idx; + opus_int32 n_LTP_Q12, n_Tilt_Q10, n_LF_Q10; + opus_int32 sLF_MA_shp_Q12, sLF_AR_shp_Q12; + opus_int16 *LTP_shp_buf; + + /* To speed up use temp variables instead of using the struct */ + LTP_shp_buf = P->sLTP_shp; + LTP_shp_buf_idx = P->sLTP_shp_buf_idx; + sLF_AR_shp_Q12 = P->sLF_AR_shp_Q12; + sLF_MA_shp_Q12 = P->sLF_MA_shp_Q12; + + for( i = 0; i < length; i++ ) { + if( lag > 0 ) { + /* unrolled loop */ + silk_assert( HARM_SHAPE_FIR_TAPS == 3 ); + idx = lag + LTP_shp_buf_idx; + n_LTP_Q12 = silk_SMULBB( LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 - 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 ); + n_LTP_Q12 = silk_SMLABT( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 ) & LTP_MASK ], HarmShapeFIRPacked_Q12 ); + n_LTP_Q12 = silk_SMLABB( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 + 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 ); + } else { + n_LTP_Q12 = 0; + } + + n_Tilt_Q10 = silk_SMULWB( sLF_AR_shp_Q12, Tilt_Q14 ); + n_LF_Q10 = silk_SMLAWB( silk_SMULWT( sLF_AR_shp_Q12, LF_shp_Q14 ), sLF_MA_shp_Q12, LF_shp_Q14 ); + + sLF_AR_shp_Q12 = silk_SUB32( st_res_Q12[ i ], silk_LSHIFT( n_Tilt_Q10, 2 ) ); + sLF_MA_shp_Q12 = silk_SUB32( sLF_AR_shp_Q12, silk_LSHIFT( n_LF_Q10, 2 ) ); + + LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK; + LTP_shp_buf[ LTP_shp_buf_idx ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 12 ) ); + + xw_Q3[i] = silk_RSHIFT_ROUND( silk_SUB32( sLF_MA_shp_Q12, n_LTP_Q12 ), 9 ); + } + + /* Copy temp variable back to state */ + P->sLF_AR_shp_Q12 = sLF_AR_shp_Q12; + P->sLF_MA_shp_Q12 = sLF_MA_shp_Q12; + P->sLTP_shp_buf_idx = LTP_shp_buf_idx; +} +#endif /* OVERRIDE_silk_prefilt_FIX */ diff --git a/thirdparty/opus/silk/fixed/process_gains_FIX.c b/thirdparty/opus/silk/fixed/process_gains_FIX.c new file mode 100644 index 000000000000..05aba317889b --- /dev/null +++ b/thirdparty/opus/silk/fixed/process_gains_FIX.c @@ -0,0 +1,117 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "tuning_parameters.h" + +/* Processing of gains */ +void silk_process_gains_FIX( + silk_encoder_state_FIX *psEnc, /* I/O Encoder state */ + silk_encoder_control_FIX *psEncCtrl, /* I/O Encoder control */ + opus_int condCoding /* I The type of conditional coding to use */ +) +{ + silk_shape_state_FIX *psShapeSt = &psEnc->sShape; + opus_int k; + opus_int32 s_Q16, InvMaxSqrVal_Q16, gain, gain_squared, ResNrg, ResNrgPart, quant_offset_Q10; + + /* Gain reduction when LTP coding gain is high */ + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /*s = -0.5f * silk_sigmoid( 0.25f * ( psEncCtrl->LTPredCodGain - 12.0f ) ); */ + s_Q16 = -silk_sigm_Q15( silk_RSHIFT_ROUND( psEncCtrl->LTPredCodGain_Q7 - SILK_FIX_CONST( 12.0, 7 ), 4 ) ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->Gains_Q16[ k ] = silk_SMLAWB( psEncCtrl->Gains_Q16[ k ], psEncCtrl->Gains_Q16[ k ], s_Q16 ); + } + } + + /* Limit the quantized signal */ + /* InvMaxSqrVal = pow( 2.0f, 0.33f * ( 21.0f - SNR_dB ) ) / subfr_length; */ + InvMaxSqrVal_Q16 = silk_DIV32_16( silk_log2lin( + silk_SMULWB( SILK_FIX_CONST( 21 + 16 / 0.33, 7 ) - psEnc->sCmn.SNR_dB_Q7, SILK_FIX_CONST( 0.33, 16 ) ) ), psEnc->sCmn.subfr_length ); + + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + /* Soft limit on ratio residual energy and squared gains */ + ResNrg = psEncCtrl->ResNrg[ k ]; + ResNrgPart = silk_SMULWW( ResNrg, InvMaxSqrVal_Q16 ); + if( psEncCtrl->ResNrgQ[ k ] > 0 ) { + ResNrgPart = silk_RSHIFT_ROUND( ResNrgPart, psEncCtrl->ResNrgQ[ k ] ); + } else { + if( ResNrgPart >= silk_RSHIFT( silk_int32_MAX, -psEncCtrl->ResNrgQ[ k ] ) ) { + ResNrgPart = silk_int32_MAX; + } else { + ResNrgPart = silk_LSHIFT( ResNrgPart, -psEncCtrl->ResNrgQ[ k ] ); + } + } + gain = psEncCtrl->Gains_Q16[ k ]; + gain_squared = silk_ADD_SAT32( ResNrgPart, silk_SMMUL( gain, gain ) ); + if( gain_squared < silk_int16_MAX ) { + /* recalculate with higher precision */ + gain_squared = silk_SMLAWW( silk_LSHIFT( ResNrgPart, 16 ), gain, gain ); + silk_assert( gain_squared > 0 ); + gain = silk_SQRT_APPROX( gain_squared ); /* Q8 */ + gain = silk_min( gain, silk_int32_MAX >> 8 ); + psEncCtrl->Gains_Q16[ k ] = silk_LSHIFT_SAT32( gain, 8 ); /* Q16 */ + } else { + gain = silk_SQRT_APPROX( gain_squared ); /* Q0 */ + gain = silk_min( gain, silk_int32_MAX >> 16 ); + psEncCtrl->Gains_Q16[ k ] = silk_LSHIFT_SAT32( gain, 16 ); /* Q16 */ + } + } + + /* Save unquantized gains and gain Index */ + silk_memcpy( psEncCtrl->GainsUnq_Q16, psEncCtrl->Gains_Q16, psEnc->sCmn.nb_subfr * sizeof( opus_int32 ) ); + psEncCtrl->lastGainIndexPrev = psShapeSt->LastGainIndex; + + /* Quantize gains */ + silk_gains_quant( psEnc->sCmn.indices.GainsIndices, psEncCtrl->Gains_Q16, + &psShapeSt->LastGainIndex, condCoding == CODE_CONDITIONALLY, psEnc->sCmn.nb_subfr ); + + /* Set quantizer offset for voiced signals. 
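A float rendering of the gain limit computed above, following the `InvMaxSqrVal` comment in the code itself; `limit_gain` is a hypothetical name and all Q-format bookkeeping of the fixed-point version is dropped:

```c
#include <math.h>

/* Soft limit on the ratio of residual energy to squared gain: the
 * returned gain satisfies gain'^2 = gain^2 + res_nrg * InvMaxSqrVal,
 * with InvMaxSqrVal = 2^(0.33*(21 - SNR_dB)) / subfr_length as in the
 * comment above. */
static float limit_gain(float gain, float res_nrg, float snr_dB,
                        int subfr_length)
{
    float inv_max_sqr_val =
        powf(2.0f, 0.33f * (21.0f - snr_dB)) / (float)subfr_length;
    return sqrtf(gain * gain + res_nrg * inv_max_sqr_val);
}
```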
Larger offset when LTP coding gain is low or tilt is high (ie low-pass) */ + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + if( psEncCtrl->LTPredCodGain_Q7 + silk_RSHIFT( psEnc->sCmn.input_tilt_Q15, 8 ) > SILK_FIX_CONST( 1.0, 7 ) ) { + psEnc->sCmn.indices.quantOffsetType = 0; + } else { + psEnc->sCmn.indices.quantOffsetType = 1; + } + } + + /* Quantizer boundary adjustment */ + quant_offset_Q10 = silk_Quantization_Offsets_Q10[ psEnc->sCmn.indices.signalType >> 1 ][ psEnc->sCmn.indices.quantOffsetType ]; + psEncCtrl->Lambda_Q10 = SILK_FIX_CONST( LAMBDA_OFFSET, 10 ) + + silk_SMULBB( SILK_FIX_CONST( LAMBDA_DELAYED_DECISIONS, 10 ), psEnc->sCmn.nStatesDelayedDecision ) + + silk_SMULWB( SILK_FIX_CONST( LAMBDA_SPEECH_ACT, 18 ), psEnc->sCmn.speech_activity_Q8 ) + + silk_SMULWB( SILK_FIX_CONST( LAMBDA_INPUT_QUALITY, 12 ), psEncCtrl->input_quality_Q14 ) + + silk_SMULWB( SILK_FIX_CONST( LAMBDA_CODING_QUALITY, 12 ), psEncCtrl->coding_quality_Q14 ) + + silk_SMULWB( SILK_FIX_CONST( LAMBDA_QUANT_OFFSET, 16 ), quant_offset_Q10 ); + + silk_assert( psEncCtrl->Lambda_Q10 > 0 ); + silk_assert( psEncCtrl->Lambda_Q10 < SILK_FIX_CONST( 2, 10 ) ); +} diff --git a/thirdparty/opus/silk/fixed/regularize_correlations_FIX.c b/thirdparty/opus/silk/fixed/regularize_correlations_FIX.c new file mode 100644 index 000000000000..a2836b05f4f8 --- /dev/null +++ b/thirdparty/opus/silk/fixed/regularize_correlations_FIX.c @@ -0,0 +1,47 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
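The `Lambda_Q10` sum above mixes constants and variables in different Q domains; it works because `silk_SMULWB( a, b )` computes `(a * (opus_int16)b) >> 16`, so a QA value times a QB value lands in Q(A+B-16). A self-contained sketch of that rule (`smulwb` and `example_lambda_term` are hypothetical names):

```c
#include <stdint.h>

/* Same result as the silk_SMULWB macro, via a 64-bit intermediate */
static int32_t smulwb(int32_t a32, int32_t b32)
{
    return (int32_t)(((int64_t)a32 * (int16_t)b32) >> 16);
}

/* E.g. the Q18 LAMBDA_SPEECH_ACT constant times the Q8 speech activity
 * yields Q(18 + 8 - 16) = Q10, matching the Q10 target of Lambda. */
static int32_t example_lambda_term(int32_t const_Q18, int32_t act_Q8)
{
    return smulwb(const_Q18, act_Q8);   /* Q10 */
}
```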
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" + +/* Add noise to matrix diagonal */ +void silk_regularize_correlations_FIX( + opus_int32 *XX, /* I/O Correlation matrices */ + opus_int32 *xx, /* I/O Correlation values */ + opus_int32 noise, /* I Noise to add */ + opus_int D /* I Dimension of XX */ +) +{ + opus_int i; + for( i = 0; i < D; i++ ) { + matrix_ptr( &XX[ 0 ], i, i, D ) = silk_ADD32( matrix_ptr( &XX[ 0 ], i, i, D ), noise ); + } + xx[ 0 ] += noise; +} diff --git a/thirdparty/opus/silk/fixed/residual_energy16_FIX.c b/thirdparty/opus/silk/fixed/residual_energy16_FIX.c new file mode 100644 index 000000000000..ebffb2a66f9a --- /dev/null +++ b/thirdparty/opus/silk/fixed/residual_energy16_FIX.c @@ -0,0 +1,103 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
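`silk_regularize_correlations_FIX` above is plain diagonal loading: adding a small positive `noise` to every diagonal element keeps the correlation matrix positive definite, so the LDL solver later in this patch stays well conditioned. The same operation in float, row-major (`regularize` is a hypothetical name):

```c
static void regularize(float *XX, float *xx, float noise, int D)
{
    for (int i = 0; i < D; i++) {
        XX[i * D + i] += noise;   /* row-major element (i, i) */
    }
    xx[0] += noise;
}
```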
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" + +/* Residual energy: nrg = wxx - 2 * wXx * c + c' * wXX * c */ +opus_int32 silk_residual_energy16_covar_FIX( + const opus_int16 *c, /* I Prediction vector */ + const opus_int32 *wXX, /* I Correlation matrix */ + const opus_int32 *wXx, /* I Correlation vector */ + opus_int32 wxx, /* I Signal energy */ + opus_int D, /* I Dimension */ + opus_int cQ /* I Q value for c vector 0 - 15 */ +) +{ + opus_int i, j, lshifts, Qxtra; + opus_int32 c_max, w_max, tmp, tmp2, nrg; + opus_int cn[ MAX_MATRIX_SIZE ]; + const opus_int32 *pRow; + + /* Safety checks */ + silk_assert( D >= 0 ); + silk_assert( D <= 16 ); + silk_assert( cQ > 0 ); + silk_assert( cQ < 16 ); + + lshifts = 16 - cQ; + Qxtra = lshifts; + + c_max = 0; + for( i = 0; i < D; i++ ) { + c_max = silk_max_32( c_max, silk_abs( (opus_int32)c[ i ] ) ); + } + Qxtra = silk_min_int( Qxtra, silk_CLZ32( c_max ) - 17 ); + + w_max = silk_max_32( wXX[ 0 ], wXX[ D * D - 1 ] ); + Qxtra = silk_min_int( Qxtra, silk_CLZ32( silk_MUL( D, silk_RSHIFT( silk_SMULWB( w_max, c_max ), 4 ) ) ) - 5 ); + Qxtra = silk_max_int( Qxtra, 0 ); + for( i = 0; i < D; i++ ) { + cn[ i ] = silk_LSHIFT( ( opus_int )c[ i ], Qxtra ); + silk_assert( silk_abs(cn[i]) <= ( silk_int16_MAX + 1 ) ); /* Check that silk_SMLAWB can be used */ + } + lshifts -= Qxtra; + + /* Compute wxx - 2 * wXx * c */ + tmp = 0; + for( i = 0; i < D; i++ ) { + tmp = silk_SMLAWB( tmp, wXx[ i ], cn[ i ] ); + } + nrg = silk_RSHIFT( wxx, 1 + lshifts ) - tmp; /* Q: -lshifts - 1 */ + + /* Add c' * wXX * c, assuming wXX is symmetric */ + tmp2 = 0; + for( i = 0; i < D; i++ ) { + tmp = 0; + pRow = &wXX[ i * D ]; + for( j = i + 1; j < D; j++ ) { + tmp = silk_SMLAWB( tmp, pRow[ j ], cn[ j ] ); + } + tmp = silk_SMLAWB( tmp, silk_RSHIFT( pRow[ i ], 1 ), cn[ i ] ); + tmp2 = silk_SMLAWB( tmp2, tmp, cn[ i ] ); + } + nrg = silk_ADD_LSHIFT32( nrg, tmp2, lshifts ); /* Q: -lshifts - 1 */ + + /* Keep one bit free always, because we add them for LSF interpolation */ + if( nrg < 1 ) { + nrg = 1; + } else if( nrg > silk_RSHIFT( silk_int32_MAX, lshifts + 2 ) ) { + nrg = silk_int32_MAX >> 1; + } else { + nrg = silk_LSHIFT( nrg, lshifts + 1 ); /* Q0 */ + } + return nrg; + +} diff --git a/thirdparty/opus/silk/fixed/residual_energy_FIX.c b/thirdparty/opus/silk/fixed/residual_energy_FIX.c new file mode 100644 index 000000000000..41f74778e82e --- /dev/null +++ b/thirdparty/opus/silk/fixed/residual_energy_FIX.c @@ -0,0 +1,98 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. 
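A float reference for `silk_residual_energy16_covar_FIX` above: it evaluates `nrg = wxx - 2*wXx'*c + c'*wXX*c`, visiting each symmetric off-diagonal pair of `wXX` only once, exactly as the fixed-point loop does with its halved diagonal term (`residual_energy` is a hypothetical name):

```c
static float residual_energy(const float *c, const float *wXX,
                             const float *wXx, float wxx, int D)
{
    float nrg = wxx;
    for (int i = 0; i < D; i++) {
        nrg -= 2.0f * wXx[i] * c[i];
    }
    /* c'*wXX*c for symmetric wXX: half the diagonal plus the upper
     * triangle, doubled back in at the end */
    for (int i = 0; i < D; i++) {
        float row = 0.5f * wXX[i * D + i] * c[i];
        for (int j = i + 1; j < D; j++) {
            row += wXX[i * D + j] * c[j];
        }
        nrg += 2.0f * row * c[i];
    }
    return nrg;
}
```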
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "stack_alloc.h" + +/* Calculates residual energies of input subframes where all subframes have LPC_order */ +/* of preceding samples */ +void silk_residual_energy_FIX( + opus_int32 nrgs[ MAX_NB_SUBFR ], /* O Residual energy per subframe */ + opus_int nrgsQ[ MAX_NB_SUBFR ], /* O Q value per subframe */ + const opus_int16 x[], /* I Input signal */ + opus_int16 a_Q12[ 2 ][ MAX_LPC_ORDER ], /* I AR coefs for each frame half */ + const opus_int32 gains[ MAX_NB_SUBFR ], /* I Quantization gains */ + const opus_int subfr_length, /* I Subframe length */ + const opus_int nb_subfr, /* I Number of subframes */ + const opus_int LPC_order, /* I LPC order */ + int arch /* I Run-time architecture */ +) +{ + opus_int offset, i, j, rshift, lz1, lz2; + opus_int16 *LPC_res_ptr; + VARDECL( opus_int16, LPC_res ); + const opus_int16 *x_ptr; + opus_int32 tmp32; + SAVE_STACK; + + x_ptr = x; + offset = LPC_order + subfr_length; + + /* Filter input to create the LPC residual for each frame half, and measure subframe energies */ + ALLOC( LPC_res, ( MAX_NB_SUBFR >> 1 ) * offset, opus_int16 ); + silk_assert( ( nb_subfr >> 1 ) * ( MAX_NB_SUBFR >> 1 ) == nb_subfr ); + for( i = 0; i < nb_subfr >> 1; i++ ) { + /* Calculate half frame LPC residual signal including preceding samples */ + silk_LPC_analysis_filter( LPC_res, x_ptr, a_Q12[ i ], ( MAX_NB_SUBFR >> 1 ) * offset, LPC_order, arch ); + + /* Point to first subframe of the just calculated LPC residual signal */ + LPC_res_ptr = LPC_res + LPC_order; + for( j = 0; j < ( MAX_NB_SUBFR >> 1 ); j++ ) { + /* Measure subframe energy */ + silk_sum_sqr_shift( &nrgs[ i * ( MAX_NB_SUBFR >> 1 ) + j ], &rshift, LPC_res_ptr, subfr_length ); + + /* Set Q values for the measured energy */ + nrgsQ[ i * ( MAX_NB_SUBFR >> 1 ) + j ] = -rshift; + + /* Move to next subframe */ + LPC_res_ptr += offset; + } + /* Move to next frame half */ + x_ptr += ( MAX_NB_SUBFR >> 1 ) * offset; + } + + /* Apply the squared subframe gains */ + for( i = 0; i < nb_subfr; i++ ) { + /* Fully upscale gains and energies */ + lz1 = silk_CLZ32( nrgs[ i ] ) - 1; + lz2 = silk_CLZ32( gains[ i ] ) - 1; + + tmp32 = silk_LSHIFT32( gains[ i ], lz2 ); + + /* Find squared gains */ + tmp32 = silk_SMMUL( tmp32, tmp32 ); /* Q( 2 * lz2 - 32 )*/ + + /* Scale energies */ + nrgs[ i ] = silk_SMMUL( tmp32, silk_LSHIFT32( nrgs[ i ], lz1 ) ); /* Q( nrgsQ[ i ] + lz1 + 2 * lz2 - 32 - 32 )*/ + nrgsQ[ i ] += lz1 + 2 * lz2 - 32 - 32; + } + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/fixed/schur64_FIX.c b/thirdparty/opus/silk/fixed/schur64_FIX.c new file mode 100644 index 000000000000..764a10ef3ea5 --- /dev/null +++ 
b/thirdparty/opus/silk/fixed/schur64_FIX.c @@ -0,0 +1,92 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Slower than schur(), but more accurate. */ +/* Uses SMULL(), available on armv4 */ +opus_int32 silk_schur64( /* O returns residual energy */ + opus_int32 rc_Q16[], /* O Reflection coefficients [order] Q16 */ + const opus_int32 c[], /* I Correlations [order+1] */ + opus_int32 order /* I Prediction order */ +) +{ + opus_int k, n; + opus_int32 C[ SILK_MAX_ORDER_LPC + 1 ][ 2 ]; + opus_int32 Ctmp1_Q30, Ctmp2_Q30, rc_tmp_Q31; + + silk_assert( order==6||order==8||order==10||order==12||order==14||order==16 ); + + /* Check for invalid input */ + if( c[ 0 ] <= 0 ) { + silk_memset( rc_Q16, 0, order * sizeof( opus_int32 ) ); + return 0; + } + + for( k = 0; k < order + 1; k++ ) { + C[ k ][ 0 ] = C[ k ][ 1 ] = c[ k ]; + } + + for( k = 0; k < order; k++ ) { + /* Check that we won't be getting an unstable rc, otherwise stop here. 
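Both Schur variants in this patch run the same lattice recursion; stripped of Q-format scaling it looks as below. The guard mirrors the early-out being set up right here: once `|C[k+1]| >= C[0]` the next reflection coefficient would leave (-1, 1), so it is clamped to ±0.99 and the remaining coefficients are zeroed (`schur_float` is a hypothetical name):

```c
#include <math.h>

static float schur_float(float *rc, const float *c, int order)
{
    float C0[17], C1[17];                /* assumes order <= 16, as in SILK */

    for (int k = 0; k < order + 1; k++) {
        C0[k] = C1[k] = c[k];
    }
    for (int k = 0; k < order; k++) {
        if (fabsf(C0[k + 1]) >= C1[0]) { /* would give |rc| >= 1: unstable */
            rc[k] = (C0[k + 1] > 0.0f) ? -0.99f : 0.99f;
            for (k++; k < order; k++) {
                rc[k] = 0.0f;
            }
            break;
        }
        float r = -C0[k + 1] / C1[0];    /* reflection coefficient */
        rc[k] = r;
        /* Update the two correlation tracks, as in the fixed-point loop */
        for (int n = 0; n < order - k; n++) {
            float t0 = C0[n + k + 1];
            float t1 = C1[n];
            C0[n + k + 1] = t0 + r * t1;
            C1[n]         = t1 + r * t0;
        }
    }
    return C1[0];                        /* residual energy */
}
```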
*/ + if (silk_abs_int32(C[ k + 1 ][ 0 ]) >= C[ 0 ][ 1 ]) { + if ( C[ k + 1 ][ 0 ] > 0 ) { + rc_Q16[ k ] = -SILK_FIX_CONST( .99f, 16 ); + } else { + rc_Q16[ k ] = SILK_FIX_CONST( .99f, 16 ); + } + k++; + break; + } + + /* Get reflection coefficient: divide two Q30 values and get result in Q31 */ + rc_tmp_Q31 = silk_DIV32_varQ( -C[ k + 1 ][ 0 ], C[ 0 ][ 1 ], 31 ); + + /* Save the output */ + rc_Q16[ k ] = silk_RSHIFT_ROUND( rc_tmp_Q31, 15 ); + + /* Update correlations */ + for( n = 0; n < order - k; n++ ) { + Ctmp1_Q30 = C[ n + k + 1 ][ 0 ]; + Ctmp2_Q30 = C[ n ][ 1 ]; + + /* Multiply and add the highest int32 */ + C[ n + k + 1 ][ 0 ] = Ctmp1_Q30 + silk_SMMUL( silk_LSHIFT( Ctmp2_Q30, 1 ), rc_tmp_Q31 ); + C[ n ][ 1 ] = Ctmp2_Q30 + silk_SMMUL( silk_LSHIFT( Ctmp1_Q30, 1 ), rc_tmp_Q31 ); + } + } + + for(; k < order; k++ ) { + rc_Q16[ k ] = 0; + } + + return silk_max_32( 1, C[ 0 ][ 1 ] ); +} diff --git a/thirdparty/opus/silk/fixed/schur_FIX.c b/thirdparty/opus/silk/fixed/schur_FIX.c new file mode 100644 index 000000000000..c4c0ef23b478 --- /dev/null +++ b/thirdparty/opus/silk/fixed/schur_FIX.c @@ -0,0 +1,106 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Faster than schur64(), but much less accurate. */ +/* uses SMLAWB(), requiring armv5E and higher. 
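Before this recursion starts (directly below), the correlations are rescaled so that `c[0]` keeps exactly two leading zero bits, i.e. sits just under 2^30; that maximizes the precision of the Q15 divisions without risking overflow. A portable sketch of that normalization step, assuming `c[0] > 0` as the real code guarantees (`clz32` and `normalize_corr` are hypothetical names):

```c
#include <stdint.h>

static int clz32(uint32_t x)            /* portable CLZ; assumes x != 0 */
{
    int n = 0;
    while (!(x & 0x80000000u)) {
        x <<= 1;
        n++;
    }
    return n;
}

static void normalize_corr(int32_t *dst, const int32_t *c, int order)
{
    int lz = clz32((uint32_t)c[0]);     /* c[0] > 0, so lz >= 1 */
    for (int k = 0; k < order + 1; k++) {
        /* Two's complement arithmetic shifts assumed, as in silk_LSHIFT
         * and silk_RSHIFT */
        if (lz < 2) {
            dst[k] = c[k] >> (2 - lz);  /* too big: shift down */
        } else if (lz > 2) {
            dst[k] = c[k] << (lz - 2);  /* headroom: shift up  */
        } else {
            dst[k] = c[k];
        }
    }
}
```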
*/ +opus_int32 silk_schur( /* O Returns residual energy */ + opus_int16 *rc_Q15, /* O reflection coefficients [order] Q15 */ + const opus_int32 *c, /* I correlations [order+1] */ + const opus_int32 order /* I prediction order */ +) +{ + opus_int k, n, lz; + opus_int32 C[ SILK_MAX_ORDER_LPC + 1 ][ 2 ]; + opus_int32 Ctmp1, Ctmp2, rc_tmp_Q15; + + silk_assert( order==6||order==8||order==10||order==12||order==14||order==16 ); + + /* Get number of leading zeros */ + lz = silk_CLZ32( c[ 0 ] ); + + /* Copy correlations and adjust level to Q30 */ + if( lz < 2 ) { + /* lz must be 1, so shift one to the right */ + for( k = 0; k < order + 1; k++ ) { + C[ k ][ 0 ] = C[ k ][ 1 ] = silk_RSHIFT( c[ k ], 1 ); + } + } else if( lz > 2 ) { + /* Shift to the left */ + lz -= 2; + for( k = 0; k < order + 1; k++ ) { + C[ k ][ 0 ] = C[ k ][ 1 ] = silk_LSHIFT( c[ k ], lz ); + } + } else { + /* No need to shift */ + for( k = 0; k < order + 1; k++ ) { + C[ k ][ 0 ] = C[ k ][ 1 ] = c[ k ]; + } + } + + for( k = 0; k < order; k++ ) { + /* Check that we won't be getting an unstable rc, otherwise stop here. */ + if (silk_abs_int32(C[ k + 1 ][ 0 ]) >= C[ 0 ][ 1 ]) { + if ( C[ k + 1 ][ 0 ] > 0 ) { + rc_Q15[ k ] = -SILK_FIX_CONST( .99f, 15 ); + } else { + rc_Q15[ k ] = SILK_FIX_CONST( .99f, 15 ); + } + k++; + break; + } + + /* Get reflection coefficient */ + rc_tmp_Q15 = -silk_DIV32_16( C[ k + 1 ][ 0 ], silk_max_32( silk_RSHIFT( C[ 0 ][ 1 ], 15 ), 1 ) ); + + /* Clip (shouldn't happen for properly conditioned inputs) */ + rc_tmp_Q15 = silk_SAT16( rc_tmp_Q15 ); + + /* Store */ + rc_Q15[ k ] = (opus_int16)rc_tmp_Q15; + + /* Update correlations */ + for( n = 0; n < order - k; n++ ) { + Ctmp1 = C[ n + k + 1 ][ 0 ]; + Ctmp2 = C[ n ][ 1 ]; + C[ n + k + 1 ][ 0 ] = silk_SMLAWB( Ctmp1, silk_LSHIFT( Ctmp2, 1 ), rc_tmp_Q15 ); + C[ n ][ 1 ] = silk_SMLAWB( Ctmp2, silk_LSHIFT( Ctmp1, 1 ), rc_tmp_Q15 ); + } + } + + for(; k < order; k++ ) { + rc_Q15[ k ] = 0; + } + + /* return residual energy */ + return silk_max_32( 1, C[ 0 ][ 1 ] ); +} diff --git a/thirdparty/opus/silk/fixed/solve_LS_FIX.c b/thirdparty/opus/silk/fixed/solve_LS_FIX.c new file mode 100644 index 000000000000..51d7d49d02a5 --- /dev/null +++ b/thirdparty/opus/silk/fixed/solve_LS_FIX.c @@ -0,0 +1,249 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" +#include "stack_alloc.h" +#include "tuning_parameters.h" + +/*****************************/ +/* Internal function headers */ +/*****************************/ + +typedef struct { + opus_int32 Q36_part; + opus_int32 Q48_part; +} inv_D_t; + +/* Factorize square matrix A into LDL form */ +static OPUS_INLINE void silk_LDL_factorize_FIX( + opus_int32 *A, /* I/O Pointer to Symetric Square Matrix */ + opus_int M, /* I Size of Matrix */ + opus_int32 *L_Q16, /* I/O Pointer to Square Upper triangular Matrix */ + inv_D_t *inv_D /* I/O Pointer to vector holding inverted diagonal elements of D */ +); + +/* Solve Lx = b, when L is lower triangular and has ones on the diagonal */ +static OPUS_INLINE void silk_LS_SolveFirst_FIX( + const opus_int32 *L_Q16, /* I Pointer to Lower Triangular Matrix */ + opus_int M, /* I Dim of Matrix equation */ + const opus_int32 *b, /* I b Vector */ + opus_int32 *x_Q16 /* O x Vector */ +); + +/* Solve L^t*x = b, where L is lower triangular with ones on the diagonal */ +static OPUS_INLINE void silk_LS_SolveLast_FIX( + const opus_int32 *L_Q16, /* I Pointer to Lower Triangular Matrix */ + const opus_int M, /* I Dim of Matrix equation */ + const opus_int32 *b, /* I b Vector */ + opus_int32 *x_Q16 /* O x Vector */ +); + +static OPUS_INLINE void silk_LS_divide_Q16_FIX( + opus_int32 T[], /* I/O Numenator vector */ + inv_D_t *inv_D, /* I 1 / D vector */ + opus_int M /* I dimension */ +); + +/* Solves Ax = b, assuming A is symmetric */ +void silk_solve_LDL_FIX( + opus_int32 *A, /* I Pointer to symetric square matrix A */ + opus_int M, /* I Size of matrix */ + const opus_int32 *b, /* I Pointer to b vector */ + opus_int32 *x_Q16 /* O Pointer to x solution vector */ +) +{ + VARDECL( opus_int32, L_Q16 ); + opus_int32 Y[ MAX_MATRIX_SIZE ]; + inv_D_t inv_D[ MAX_MATRIX_SIZE ]; + SAVE_STACK; + + silk_assert( M <= MAX_MATRIX_SIZE ); + ALLOC( L_Q16, M * M, opus_int32 ); + + /*************************************************** + Factorize A by LDL such that A = L*D*L', + where L is lower triangular with ones on diagonal + ****************************************************/ + silk_LDL_factorize_FIX( A, M, L_Q16, inv_D ); + + /**************************************************** + * substitute D*L'*x = Y. 
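The comment blocks in `silk_solve_LDL_FIX` describe a standard LDL solve; in float, and without the split Q36/Q48 reciprocals, it reduces to forward substitution, a diagonal divide, and back substitution (`solve_ldl` is a hypothetical name; `inv_D[i]` holds `1/d_i`, and `L` is unit lower triangular, stored row-major):

```c
static void solve_ldl(const float *L, const float *inv_D,
                      const float *b, float *x, int M)
{
    float Y[16];                       /* assumes M <= 16 (MAX_MATRIX_SIZE) */

    /* 1) L*Y = b  (forward substitution; diagonal of L is 1) */
    for (int i = 0; i < M; i++) {
        float s = 0.0f;
        for (int j = 0; j < i; j++) {
            s += L[i * M + j] * Y[j];
        }
        Y[i] = b[i] - s;
    }
    /* 2) D*Z = Y  (D is diagonal, so just multiply by 1/d_i in place) */
    for (int i = 0; i < M; i++) {
        Y[i] *= inv_D[i];
    }
    /* 3) L'*x = Z (back substitution on the transpose) */
    for (int i = M - 1; i >= 0; i--) {
        float s = 0.0f;
        for (int j = i + 1; j < M; j++) {
            s += L[j * M + i] * x[j];  /* column i of L = row i of L' */
        }
        x[i] = Y[i] - s;
    }
}
```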
ie: + L*D*L'*x = b => L*Y = b <=> Y = inv(L)*b + ******************************************************/ + silk_LS_SolveFirst_FIX( L_Q16, M, b, Y ); + + /**************************************************** + D*L'*x = Y <=> L'*x = inv(D)*Y, because D is + diagonal just multiply with 1/d_i + ****************************************************/ + silk_LS_divide_Q16_FIX( Y, inv_D, M ); + + /**************************************************** + x = inv(L') * inv(D) * Y + *****************************************************/ + silk_LS_SolveLast_FIX( L_Q16, M, Y, x_Q16 ); + RESTORE_STACK; +} + +static OPUS_INLINE void silk_LDL_factorize_FIX( + opus_int32 *A, /* I/O Pointer to Symetric Square Matrix */ + opus_int M, /* I Size of Matrix */ + opus_int32 *L_Q16, /* I/O Pointer to Square Upper triangular Matrix */ + inv_D_t *inv_D /* I/O Pointer to vector holding inverted diagonal elements of D */ +) +{ + opus_int i, j, k, status, loop_count; + const opus_int32 *ptr1, *ptr2; + opus_int32 diag_min_value, tmp_32, err; + opus_int32 v_Q0[ MAX_MATRIX_SIZE ], D_Q0[ MAX_MATRIX_SIZE ]; + opus_int32 one_div_diag_Q36, one_div_diag_Q40, one_div_diag_Q48; + + silk_assert( M <= MAX_MATRIX_SIZE ); + + status = 1; + diag_min_value = silk_max_32( silk_SMMUL( silk_ADD_SAT32( A[ 0 ], A[ silk_SMULBB( M, M ) - 1 ] ), SILK_FIX_CONST( FIND_LTP_COND_FAC, 31 ) ), 1 << 9 ); + for( loop_count = 0; loop_count < M && status == 1; loop_count++ ) { + status = 0; + for( j = 0; j < M; j++ ) { + ptr1 = matrix_adr( L_Q16, j, 0, M ); + tmp_32 = 0; + for( i = 0; i < j; i++ ) { + v_Q0[ i ] = silk_SMULWW( D_Q0[ i ], ptr1[ i ] ); /* Q0 */ + tmp_32 = silk_SMLAWW( tmp_32, v_Q0[ i ], ptr1[ i ] ); /* Q0 */ + } + tmp_32 = silk_SUB32( matrix_ptr( A, j, j, M ), tmp_32 ); + + if( tmp_32 < diag_min_value ) { + tmp_32 = silk_SUB32( silk_SMULBB( loop_count + 1, diag_min_value ), tmp_32 ); + /* Matrix not positive semi-definite, or ill conditioned */ + for( i = 0; i < M; i++ ) { + matrix_ptr( A, i, i, M ) = silk_ADD32( matrix_ptr( A, i, i, M ), tmp_32 ); + } + status = 1; + break; + } + D_Q0[ j ] = tmp_32; /* always < max(Correlation) */ + + /* two-step division */ + one_div_diag_Q36 = silk_INVERSE32_varQ( tmp_32, 36 ); /* Q36 */ + one_div_diag_Q40 = silk_LSHIFT( one_div_diag_Q36, 4 ); /* Q40 */ + err = silk_SUB32( (opus_int32)1 << 24, silk_SMULWW( tmp_32, one_div_diag_Q40 ) ); /* Q24 */ + one_div_diag_Q48 = silk_SMULWW( err, one_div_diag_Q40 ); /* Q48 */ + + /* Save 1/Ds */ + inv_D[ j ].Q36_part = one_div_diag_Q36; + inv_D[ j ].Q48_part = one_div_diag_Q48; + + matrix_ptr( L_Q16, j, j, M ) = 65536; /* 1.0 in Q16 */ + ptr1 = matrix_adr( A, j, 0, M ); + ptr2 = matrix_adr( L_Q16, j + 1, 0, M ); + for( i = j + 1; i < M; i++ ) { + tmp_32 = 0; + for( k = 0; k < j; k++ ) { + tmp_32 = silk_SMLAWW( tmp_32, v_Q0[ k ], ptr2[ k ] ); /* Q0 */ + } + tmp_32 = silk_SUB32( ptr1[ i ], tmp_32 ); /* always < max(Correlation) */ + + /* tmp_32 / D_Q0[j] : Divide to Q16 */ + matrix_ptr( L_Q16, i, j, M ) = silk_ADD32( silk_SMMUL( tmp_32, one_div_diag_Q48 ), + silk_RSHIFT( silk_SMULWW( tmp_32, one_div_diag_Q36 ), 4 ) ); + + /* go to next column */ + ptr2 += M; + } + } + } + + silk_assert( status == 0 ); +} + +static OPUS_INLINE void silk_LS_divide_Q16_FIX( + opus_int32 T[], /* I/O Numenator vector */ + inv_D_t *inv_D, /* I 1 / D vector */ + opus_int M /* I dimension */ +) +{ + opus_int i; + opus_int32 tmp_32; + opus_int32 one_div_diag_Q36, one_div_diag_Q48; + + for( i = 0; i < M; i++ ) { + one_div_diag_Q36 = inv_D[ i ].Q36_part; + one_div_diag_Q48 = inv_D[ i ].Q48_part; + 
+ tmp_32 = T[ i ]; + T[ i ] = silk_ADD32( silk_SMMUL( tmp_32, one_div_diag_Q48 ), silk_RSHIFT( silk_SMULWW( tmp_32, one_div_diag_Q36 ), 4 ) ); + } +} + +/* Solve Lx = b, when L is lower triangular and has ones on the diagonal */ +static OPUS_INLINE void silk_LS_SolveFirst_FIX( + const opus_int32 *L_Q16, /* I Pointer to Lower Triangular Matrix */ + opus_int M, /* I Dim of Matrix equation */ + const opus_int32 *b, /* I b Vector */ + opus_int32 *x_Q16 /* O x Vector */ +) +{ + opus_int i, j; + const opus_int32 *ptr32; + opus_int32 tmp_32; + + for( i = 0; i < M; i++ ) { + ptr32 = matrix_adr( L_Q16, i, 0, M ); + tmp_32 = 0; + for( j = 0; j < i; j++ ) { + tmp_32 = silk_SMLAWW( tmp_32, ptr32[ j ], x_Q16[ j ] ); + } + x_Q16[ i ] = silk_SUB32( b[ i ], tmp_32 ); + } +} + +/* Solve L^t*x = b, where L is lower triangular with ones on the diagonal */ +static OPUS_INLINE void silk_LS_SolveLast_FIX( + const opus_int32 *L_Q16, /* I Pointer to Lower Triangular Matrix */ + const opus_int M, /* I Dim of Matrix equation */ + const opus_int32 *b, /* I b Vector */ + opus_int32 *x_Q16 /* O x Vector */ +) +{ + opus_int i, j; + const opus_int32 *ptr32; + opus_int32 tmp_32; + + for( i = M - 1; i >= 0; i-- ) { + ptr32 = matrix_adr( L_Q16, 0, i, M ); + tmp_32 = 0; + for( j = M - 1; j > i; j-- ) { + tmp_32 = silk_SMLAWW( tmp_32, ptr32[ silk_SMULBB( j, M ) ], x_Q16[ j ] ); + } + x_Q16[ i ] = silk_SUB32( b[ i ], tmp_32 ); + } +} diff --git a/thirdparty/opus/silk/fixed/structs_FIX.h b/thirdparty/opus/silk/fixed/structs_FIX.h new file mode 100644 index 000000000000..3294b251281e --- /dev/null +++ b/thirdparty/opus/silk/fixed/structs_FIX.h @@ -0,0 +1,134 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
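All the `matrix_ptr()`/`matrix_adr()` accesses in this file, including the `silk_SMULBB( j, M )` stride in `silk_LS_SolveLast` above, are row-major flat-array indexing: element (row, col) of an M-column matrix lives at `row * M + col`, so walking a column means striding by M. A minimal illustration (`MTX_PTR` and `column_sum` are hypothetical names):

```c
#include <stdint.h>

#define MTX_PTR(A, row, col, N)  ((A)[(row) * (N) + (col)])

static int32_t column_sum(const int32_t *A, int col, int M)
{
    int32_t s = 0;
    for (int row = 0; row < M; row++) {
        s += MTX_PTR(A, row, col, M);  /* consecutive elements are M apart */
    }
    return s;
}
```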
+***********************************************************************/ + +#ifndef SILK_STRUCTS_FIX_H +#define SILK_STRUCTS_FIX_H + +#include "typedef.h" +#include "main.h" +#include "structs.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/********************************/ +/* Noise shaping analysis state */ +/********************************/ +typedef struct { + opus_int8 LastGainIndex; + opus_int32 HarmBoost_smth_Q16; + opus_int32 HarmShapeGain_smth_Q16; + opus_int32 Tilt_smth_Q16; +} silk_shape_state_FIX; + +/********************************/ +/* Prefilter state */ +/********************************/ +typedef struct { + opus_int16 sLTP_shp[ LTP_BUF_LENGTH ]; + opus_int32 sAR_shp[ MAX_SHAPE_LPC_ORDER + 1 ]; + opus_int sLTP_shp_buf_idx; + opus_int32 sLF_AR_shp_Q12; + opus_int32 sLF_MA_shp_Q12; + opus_int32 sHarmHP_Q2; + opus_int32 rand_seed; + opus_int lagPrev; +} silk_prefilter_state_FIX; + +/********************************/ +/* Encoder state FIX */ +/********************************/ +typedef struct { + silk_encoder_state sCmn; /* Common struct, shared with floating-point code */ + silk_shape_state_FIX sShape; /* Shape state */ + silk_prefilter_state_FIX sPrefilt; /* Prefilter State */ + + /* Buffer for find pitch and noise shape analysis */ + silk_DWORD_ALIGN opus_int16 x_buf[ 2 * MAX_FRAME_LENGTH + LA_SHAPE_MAX ];/* Buffer for find pitch and noise shape analysis */ + opus_int LTPCorr_Q15; /* Normalized correlation from pitch lag estimator */ +} silk_encoder_state_FIX; + +/************************/ +/* Encoder control FIX */ +/************************/ +typedef struct { + /* Prediction and coding parameters */ + opus_int32 Gains_Q16[ MAX_NB_SUBFR ]; + silk_DWORD_ALIGN opus_int16 PredCoef_Q12[ 2 ][ MAX_LPC_ORDER ]; + opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ]; + opus_int LTP_scale_Q14; + opus_int pitchL[ MAX_NB_SUBFR ]; + + /* Noise shaping parameters */ + /* Testing */ + silk_DWORD_ALIGN opus_int16 AR1_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ]; + silk_DWORD_ALIGN opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ]; + opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ]; /* Packs two int16 coefficients per int32 value */ + opus_int GainsPre_Q14[ MAX_NB_SUBFR ]; + opus_int HarmBoost_Q14[ MAX_NB_SUBFR ]; + opus_int Tilt_Q14[ MAX_NB_SUBFR ]; + opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ]; + opus_int Lambda_Q10; + opus_int input_quality_Q14; + opus_int coding_quality_Q14; + + /* measures */ + opus_int sparseness_Q8; + opus_int32 predGain_Q16; + opus_int LTPredCodGain_Q7; + opus_int32 ResNrg[ MAX_NB_SUBFR ]; /* Residual energy per subframe */ + opus_int ResNrgQ[ MAX_NB_SUBFR ]; /* Q domain for the residual energy > 0 */ + + /* Parameters for CBR mode */ + opus_int32 GainsUnq_Q16[ MAX_NB_SUBFR ]; + opus_int8 lastGainIndexPrev; +} silk_encoder_control_FIX; + +/************************/ +/* Encoder Super Struct */ +/************************/ +typedef struct { + silk_encoder_state_FIX state_Fxx[ ENCODER_NUM_CHANNELS ]; + stereo_enc_state sStereo; + opus_int32 nBitsUsedLBRR; + opus_int32 nBitsExceeded; + opus_int nChannelsAPI; + opus_int nChannelsInternal; + opus_int nPrevChannelsInternal; + opus_int timeSinceSwitchAllowed_ms; + opus_int allowBandwidthSwitch; + opus_int prev_decode_only_middle; +} silk_encoder; + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/silk/fixed/vector_ops_FIX.c b/thirdparty/opus/silk/fixed/vector_ops_FIX.c new file mode 100644 index 000000000000..d94980014f62 --- /dev/null +++ b/thirdparty/opus/silk/fixed/vector_ops_FIX.c @@ -0,0 +1,102 @@ 
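The new file below is SILK's fixed-point vector kernel. These fixed-point sources all lean on the same Q-format convention: a QN variable stores the real value scaled by 2^N in a plain integer, and products are renormalized with right shifts. A minimal standalone sketch of that arithmetic, assuming only that silk_SMULWW( a, b ) behaves like ( a * b ) >> 16 and silk_SMMUL( a, b ) like ( a * b ) >> 32 on 64-bit intermediates; the smulww/smmul helpers are local stand-ins, not the upstream macros:

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for the SILK macros, written portably with 64-bit intermediates. */
static int32_t smulww( int32_t a, int32_t b ) { return (int32_t)( ( (int64_t)a * b ) >> 16 ); }
static int32_t smmul ( int32_t a, int32_t b ) { return (int32_t)( ( (int64_t)a * b ) >> 32 ); }

int main( void )
{
    int32_t x_Q16    = (int32_t)( 1.50 * 65536 );   /* 1.5  in Q16 */
    int32_t gain_Q16 = (int32_t)( 0.25 * 65536 );   /* 0.25 in Q16 */

    /* Q16 * Q16 >> 16 stays in Q16: prints 0.375000 */
    printf( "%f\n", smulww( x_Q16, gain_Q16 ) / 65536.0 );

    /* Q16 * Q16 >> 32 lands in Q0: prints 0 (0.375 truncated) */
    printf( "%d\n", smmul( x_Q16, gain_Q16 ) );
    return 0;
}

The same bookkeeping explains silk_scale_vector32_Q26_lshift_18() in the file that follows: a Q0 input times a Q26 gain, shifted right by 8, leaves the result in Q18.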
+/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "pitch.h" + +/* Copy and multiply a vector by a constant */ +void silk_scale_copy_vector16( + opus_int16 *data_out, + const opus_int16 *data_in, + opus_int32 gain_Q16, /* I Gain in Q16 */ + const opus_int dataSize /* I Length */ +) +{ + opus_int i; + opus_int32 tmp32; + + for( i = 0; i < dataSize; i++ ) { + tmp32 = silk_SMULWB( gain_Q16, data_in[ i ] ); + data_out[ i ] = (opus_int16)silk_CHECK_FIT16( tmp32 ); + } +} + +/* Multiply a vector by a constant */ +void silk_scale_vector32_Q26_lshift_18( + opus_int32 *data1, /* I/O Q0/Q18 */ + opus_int32 gain_Q26, /* I Q26 */ + opus_int dataSize /* I length */ +) +{ + opus_int i; + + for( i = 0; i < dataSize; i++ ) { + data1[ i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( silk_SMULL( data1[ i ], gain_Q26 ), 8 ) ); /* OUTPUT: Q18 */ + } +} + +/* sum = for(i=0;i<len;i++)inVec1[i]*inVec2[i]; ---- inner product */ +/* Note: if len>6, memory access can be reduced by half. 
*/ +opus_int32 silk_inner_prod_aligned( + const opus_int16 *const inVec1, /* I input vector 1 */ + const opus_int16 *const inVec2, /* I input vector 2 */ + const opus_int len, /* I vector lengths */ + int arch /* I Run-time architecture */ +) +{ +#ifdef FIXED_POINT + return celt_inner_prod(inVec1, inVec2, len, arch); +#else + opus_int i; + opus_int32 sum = 0; + for( i = 0; i < len; i++ ) { + sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] ); + } + return sum; +#endif +} + +opus_int64 silk_inner_prod16_aligned_64_c( + const opus_int16 *inVec1, /* I input vector 1 */ + const opus_int16 *inVec2, /* I input vector 2 */ + const opus_int len /* I vector lengths */ +) +{ + opus_int i; + opus_int64 sum = 0; + for( i = 0; i < len; i++ ) { + sum = silk_SMLALBB( sum, inVec1[ i ], inVec2[ i ] ); + } + return sum; +} diff --git a/thirdparty/opus/silk/fixed/warped_autocorrelation_FIX.c b/thirdparty/opus/silk/fixed/warped_autocorrelation_FIX.c new file mode 100644 index 000000000000..6ca6c1184d86 --- /dev/null +++ b/thirdparty/opus/silk/fixed/warped_autocorrelation_FIX.c @@ -0,0 +1,95 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FIX.h" + +#define QC 10 +#define QS 14 + +#if defined(MIPSr1_ASM) +#include "mips/warped_autocorrelation_FIX_mipsr1.h" +#endif + + +#ifndef OVERRIDE_silk_warped_autocorrelation_FIX +/* Autocorrelations for a warped frequency axis */ +void silk_warped_autocorrelation_FIX( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate */ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +) +{ + opus_int n, i, lsh; + opus_int32 tmp1_QS, tmp2_QS; + opus_int32 state_QS[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; + opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; + + /* Order must be even */ + silk_assert( ( order & 1 ) == 0 ); + silk_assert( 2 * QS - QC >= 0 ); + + /* Loop over samples */ + for( n = 0; n < length; n++ ) { + tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS ); + /* Loop over allpass sections */ + for( i = 0; i < order; i += 2 ) { + /* Output of allpass section */ + tmp2_QS = silk_SMLAWB( state_QS[ i ], state_QS[ i + 1 ] - tmp1_QS, warping_Q16 ); + state_QS[ i ] = tmp1_QS; + corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS, state_QS[ 0 ] ), 2 * QS - QC ); + /* Output of allpass section */ + tmp1_QS = silk_SMLAWB( state_QS[ i + 1 ], state_QS[ i + 2 ] - tmp2_QS, warping_Q16 ); + state_QS[ i + 1 ] = tmp2_QS; + corr_QC[ i + 1 ] += silk_RSHIFT64( silk_SMULL( tmp2_QS, state_QS[ 0 ] ), 2 * QS - QC ); + } + state_QS[ order ] = tmp1_QS; + corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS, state_QS[ 0 ] ), 2 * QS - QC ); + } + + lsh = silk_CLZ64( corr_QC[ 0 ] ) - 35; + lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); + *scale = -( QC + lsh ); + silk_assert( *scale >= -30 && *scale <= 12 ); + if( lsh >= 0 ) { + for( i = 0; i < order + 1; i++ ) { + corr[ i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) ); + } + } else { + for( i = 0; i < order + 1; i++ ) { + corr[ i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) ); + } + } + silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/ +} +#endif /* OVERRIDE_silk_warped_autocorrelation_FIX */ diff --git a/thirdparty/opus/silk/float/LPC_analysis_filter_FLP.c b/thirdparty/opus/silk/float/LPC_analysis_filter_FLP.c new file mode 100644 index 000000000000..cae89a0a182c --- /dev/null +++ b/thirdparty/opus/silk/float/LPC_analysis_filter_FLP.c @@ -0,0 +1,249 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdlib.h> +#include "main_FLP.h" + +/************************************************/ +/* LPC analysis filter */ +/* NB! State is kept internally and the */ +/* filter always starts with zero state */ +/* first Order output samples are set to zero */ +/************************************************/ + +/* 16th order LPC analysis filter, does not write first 16 samples */ +static OPUS_INLINE void silk_LPC_analysis_filter16_FLP( + silk_float r_LPC[], /* O LPC residual signal */ + const silk_float PredCoef[], /* I LPC coefficients */ + const silk_float s[], /* I Input signal */ + const opus_int length /* I Length of input signal */ +) +{ + opus_int ix; + silk_float LPC_pred; + const silk_float *s_ptr; + + for( ix = 16; ix < length; ix++ ) { + s_ptr = &s[ix - 1]; + + /* short-term prediction */ + LPC_pred = s_ptr[ 0 ] * PredCoef[ 0 ] + + s_ptr[ -1 ] * PredCoef[ 1 ] + + s_ptr[ -2 ] * PredCoef[ 2 ] + + s_ptr[ -3 ] * PredCoef[ 3 ] + + s_ptr[ -4 ] * PredCoef[ 4 ] + + s_ptr[ -5 ] * PredCoef[ 5 ] + + s_ptr[ -6 ] * PredCoef[ 6 ] + + s_ptr[ -7 ] * PredCoef[ 7 ] + + s_ptr[ -8 ] * PredCoef[ 8 ] + + s_ptr[ -9 ] * PredCoef[ 9 ] + + s_ptr[ -10 ] * PredCoef[ 10 ] + + s_ptr[ -11 ] * PredCoef[ 11 ] + + s_ptr[ -12 ] * PredCoef[ 12 ] + + s_ptr[ -13 ] * PredCoef[ 13 ] + + s_ptr[ -14 ] * PredCoef[ 14 ] + + s_ptr[ -15 ] * PredCoef[ 15 ]; + + /* prediction error */ + r_LPC[ix] = s_ptr[ 1 ] - LPC_pred; + } +} + +/* 12th order LPC analysis filter, does not write first 12 samples */ +static OPUS_INLINE void silk_LPC_analysis_filter12_FLP( + silk_float r_LPC[], /* O LPC residual signal */ + const silk_float PredCoef[], /* I LPC coefficients */ + const silk_float s[], /* I Input signal */ + const opus_int length /* I Length of input signal */ +) +{ + opus_int ix; + silk_float LPC_pred; + const silk_float *s_ptr; + + for( ix = 12; ix < length; ix++ ) { + s_ptr = &s[ix - 1]; + + /* short-term prediction */ + LPC_pred = s_ptr[ 0 ] * PredCoef[ 0 ] + + s_ptr[ -1 ] * PredCoef[ 1 ] + + s_ptr[ -2 ] * PredCoef[ 2 ] + + s_ptr[ -3 ] * PredCoef[ 3 ] + + s_ptr[ -4 ] * PredCoef[ 4 ] + + s_ptr[ -5 ] * PredCoef[ 5 ] + + s_ptr[ -6 ] * PredCoef[ 6 ] + + s_ptr[ -7 ] * PredCoef[ 7 ] + + s_ptr[ -8 ] * PredCoef[ 8 ] + + s_ptr[ -9 ] * PredCoef[ 9 ] + + s_ptr[ -10 ] * PredCoef[ 10 ] + + s_ptr[ -11 ] * PredCoef[ 11 ]; + + /* prediction error */ + r_LPC[ix] = s_ptr[ 1 ] - LPC_pred; + } +} + +/* 10th order LPC analysis filter, does not write first 10 samples */ +static OPUS_INLINE void silk_LPC_analysis_filter10_FLP( + silk_float r_LPC[], /* O LPC residual signal */ + const silk_float PredCoef[], /* I LPC coefficients */ + const silk_float s[], /* I Input signal */ 
+ const opus_int length /* I Length of input signal */ +) +{ + opus_int ix; + silk_float LPC_pred; + const silk_float *s_ptr; + + for( ix = 10; ix < length; ix++ ) { + s_ptr = &s[ix - 1]; + + /* short-term prediction */ + LPC_pred = s_ptr[ 0 ] * PredCoef[ 0 ] + + s_ptr[ -1 ] * PredCoef[ 1 ] + + s_ptr[ -2 ] * PredCoef[ 2 ] + + s_ptr[ -3 ] * PredCoef[ 3 ] + + s_ptr[ -4 ] * PredCoef[ 4 ] + + s_ptr[ -5 ] * PredCoef[ 5 ] + + s_ptr[ -6 ] * PredCoef[ 6 ] + + s_ptr[ -7 ] * PredCoef[ 7 ] + + s_ptr[ -8 ] * PredCoef[ 8 ] + + s_ptr[ -9 ] * PredCoef[ 9 ]; + + /* prediction error */ + r_LPC[ix] = s_ptr[ 1 ] - LPC_pred; + } +} + +/* 8th order LPC analysis filter, does not write first 8 samples */ +static OPUS_INLINE void silk_LPC_analysis_filter8_FLP( + silk_float r_LPC[], /* O LPC residual signal */ + const silk_float PredCoef[], /* I LPC coefficients */ + const silk_float s[], /* I Input signal */ + const opus_int length /* I Length of input signal */ +) +{ + opus_int ix; + silk_float LPC_pred; + const silk_float *s_ptr; + + for( ix = 8; ix < length; ix++ ) { + s_ptr = &s[ix - 1]; + + /* short-term prediction */ + LPC_pred = s_ptr[ 0 ] * PredCoef[ 0 ] + + s_ptr[ -1 ] * PredCoef[ 1 ] + + s_ptr[ -2 ] * PredCoef[ 2 ] + + s_ptr[ -3 ] * PredCoef[ 3 ] + + s_ptr[ -4 ] * PredCoef[ 4 ] + + s_ptr[ -5 ] * PredCoef[ 5 ] + + s_ptr[ -6 ] * PredCoef[ 6 ] + + s_ptr[ -7 ] * PredCoef[ 7 ]; + + /* prediction error */ + r_LPC[ix] = s_ptr[ 1 ] - LPC_pred; + } +} + +/* 6th order LPC analysis filter, does not write first 6 samples */ +static OPUS_INLINE void silk_LPC_analysis_filter6_FLP( + silk_float r_LPC[], /* O LPC residual signal */ + const silk_float PredCoef[], /* I LPC coefficients */ + const silk_float s[], /* I Input signal */ + const opus_int length /* I Length of input signal */ +) +{ + opus_int ix; + silk_float LPC_pred; + const silk_float *s_ptr; + + for( ix = 6; ix < length; ix++ ) { + s_ptr = &s[ix - 1]; + + /* short-term prediction */ + LPC_pred = s_ptr[ 0 ] * PredCoef[ 0 ] + + s_ptr[ -1 ] * PredCoef[ 1 ] + + s_ptr[ -2 ] * PredCoef[ 2 ] + + s_ptr[ -3 ] * PredCoef[ 3 ] + + s_ptr[ -4 ] * PredCoef[ 4 ] + + s_ptr[ -5 ] * PredCoef[ 5 ]; + + /* prediction error */ + r_LPC[ix] = s_ptr[ 1 ] - LPC_pred; + } +} + +/************************************************/ +/* LPC analysis filter */ +/* NB! 
State is kept internally and the */ +/* filter always starts with zero state */ +/* first Order output samples are set to zero */ +/************************************************/ +void silk_LPC_analysis_filter_FLP( + silk_float r_LPC[], /* O LPC residual signal */ + const silk_float PredCoef[], /* I LPC coefficients */ + const silk_float s[], /* I Input signal */ + const opus_int length, /* I Length of input signal */ + const opus_int Order /* I LPC order */ +) +{ + silk_assert( Order <= length ); + + switch( Order ) { + case 6: + silk_LPC_analysis_filter6_FLP( r_LPC, PredCoef, s, length ); + break; + + case 8: + silk_LPC_analysis_filter8_FLP( r_LPC, PredCoef, s, length ); + break; + + case 10: + silk_LPC_analysis_filter10_FLP( r_LPC, PredCoef, s, length ); + break; + + case 12: + silk_LPC_analysis_filter12_FLP( r_LPC, PredCoef, s, length ); + break; + + case 16: + silk_LPC_analysis_filter16_FLP( r_LPC, PredCoef, s, length ); + break; + + default: + silk_assert( 0 ); + break; + } + + /* Set first Order output samples to zero */ + silk_memset( r_LPC, 0, Order * sizeof( silk_float ) ); +} + diff --git a/thirdparty/opus/silk/float/LPC_inv_pred_gain_FLP.c b/thirdparty/opus/silk/float/LPC_inv_pred_gain_FLP.c new file mode 100644 index 000000000000..25178bacdde4 --- /dev/null +++ b/thirdparty/opus/silk/float/LPC_inv_pred_gain_FLP.c @@ -0,0 +1,76 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "SigProc_FLP.h" + +#define RC_THRESHOLD 0.9999f + +/* compute inverse of LPC prediction gain, and */ +/* test if LPC coefficients are stable (all poles within unit circle) */ +/* this code is based on silk_a2k_FLP() */ +silk_float silk_LPC_inverse_pred_gain_FLP( /* O return inverse prediction gain, energy domain */ + const silk_float *A, /* I prediction coefficients [order] */ + opus_int32 order /* I prediction order */ +) +{ + opus_int k, n; + double invGain, rc, rc_mult1, rc_mult2; + silk_float Atmp[ 2 ][ SILK_MAX_ORDER_LPC ]; + silk_float *Aold, *Anew; + + Anew = Atmp[ order & 1 ]; + silk_memcpy( Anew, A, order * sizeof(silk_float) ); + + invGain = 1.0; + for( k = order - 1; k > 0; k-- ) { + rc = -Anew[ k ]; + if( rc > RC_THRESHOLD || rc < -RC_THRESHOLD ) { + return 0.0f; + } + rc_mult1 = 1.0f - rc * rc; + rc_mult2 = 1.0f / rc_mult1; + invGain *= rc_mult1; + /* swap pointers */ + Aold = Anew; + Anew = Atmp[ k & 1 ]; + for( n = 0; n < k; n++ ) { + Anew[ n ] = (silk_float)( ( Aold[ n ] - Aold[ k - n - 1 ] * rc ) * rc_mult2 ); + } + } + rc = -Anew[ 0 ]; + if( rc > RC_THRESHOLD || rc < -RC_THRESHOLD ) { + return 0.0f; + } + rc_mult1 = 1.0f - rc * rc; + invGain *= rc_mult1; + return (silk_float)invGain; +} diff --git a/thirdparty/opus/silk/float/LTP_analysis_filter_FLP.c b/thirdparty/opus/silk/float/LTP_analysis_filter_FLP.c new file mode 100644 index 000000000000..849b7c1c528b --- /dev/null +++ b/thirdparty/opus/silk/float/LTP_analysis_filter_FLP.c @@ -0,0 +1,75 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" + +void silk_LTP_analysis_filter_FLP( + silk_float *LTP_res, /* O LTP res MAX_NB_SUBFR*(pre_lgth+subfr_lngth) */ + const silk_float *x, /* I Input signal, with preceding samples */ + const silk_float B[ LTP_ORDER * MAX_NB_SUBFR ], /* I LTP coefficients for each subframe */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const silk_float invGains[ MAX_NB_SUBFR ], /* I Inverse quantization gains */ + const opus_int subfr_length, /* I Length of each subframe */ + const opus_int nb_subfr, /* I number of subframes */ + const opus_int pre_length /* I Preceding samples for each subframe */ +) +{ + const silk_float *x_ptr, *x_lag_ptr; + silk_float Btmp[ LTP_ORDER ]; + silk_float *LTP_res_ptr; + silk_float inv_gain; + opus_int k, i, j; + + x_ptr = x; + LTP_res_ptr = LTP_res; + for( k = 0; k < nb_subfr; k++ ) { + x_lag_ptr = x_ptr - pitchL[ k ]; + inv_gain = invGains[ k ]; + for( i = 0; i < LTP_ORDER; i++ ) { + Btmp[ i ] = B[ k * LTP_ORDER + i ]; + } + + /* LTP analysis FIR filter */ + for( i = 0; i < subfr_length + pre_length; i++ ) { + LTP_res_ptr[ i ] = x_ptr[ i ]; + /* Subtract long-term prediction */ + for( j = 0; j < LTP_ORDER; j++ ) { + LTP_res_ptr[ i ] -= Btmp[ j ] * x_lag_ptr[ LTP_ORDER / 2 - j ]; + } + LTP_res_ptr[ i ] *= inv_gain; + x_lag_ptr++; + } + + /* Update pointers */ + LTP_res_ptr += subfr_length + pre_length; + x_ptr += subfr_length; + } +} diff --git a/thirdparty/opus/silk/float/LTP_scale_ctrl_FLP.c b/thirdparty/opus/silk/float/LTP_scale_ctrl_FLP.c new file mode 100644 index 000000000000..8dbe29d0fa6b --- /dev/null +++ b/thirdparty/opus/silk/float/LTP_scale_ctrl_FLP.c @@ -0,0 +1,52 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" + +void silk_LTP_scale_ctrl_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + opus_int condCoding /* I The type of conditional coding to use */ +) +{ + opus_int round_loss; + + if( condCoding == CODE_INDEPENDENTLY ) { + /* Only scale if first frame in packet */ + round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket; + psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( round_loss * psEncCtrl->LTPredCodGain * 0.1f, 0.0f, 2.0f ); + } else { + /* Default is minimum scaling */ + psEnc->sCmn.indices.LTP_scaleIndex = 0; + } + + psEncCtrl->LTP_scale = (silk_float)silk_LTPScales_table_Q14[ psEnc->sCmn.indices.LTP_scaleIndex ] / 16384.0f; +} diff --git a/thirdparty/opus/silk/float/SigProc_FLP.h b/thirdparty/opus/silk/float/SigProc_FLP.h new file mode 100644 index 000000000000..f0cb3733be6c --- /dev/null +++ b/thirdparty/opus/silk/float/SigProc_FLP.h @@ -0,0 +1,204 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifndef SILK_SIGPROC_FLP_H +#define SILK_SIGPROC_FLP_H + +#include "SigProc_FIX.h" +#include "float_cast.h" +#include <math.h> + +#ifdef __cplusplus +extern "C" +{ +#endif + +/********************************************************************/ +/* SIGNAL PROCESSING FUNCTIONS */ +/********************************************************************/ + +/* Chirp (bw expand) LP AR filter */ +void silk_bwexpander_FLP( + silk_float *ar, /* I/O AR filter to be expanded (without leading 1) */ + const opus_int d, /* I length of ar */ + const silk_float chirp /* I chirp factor (typically in range (0..1) ) */ +); + +/* compute inverse of LPC prediction gain, and */ +/* test if LPC coefficients are stable (all poles within unit circle) */ +/* this code is based on silk_FLP_a2k() */ +silk_float silk_LPC_inverse_pred_gain_FLP( /* O return inverse prediction gain, energy domain */ + const silk_float *A, /* I prediction coefficients [order] */ + opus_int32 order /* I prediction order */ +); + +silk_float silk_schur_FLP( /* O returns residual energy */ + silk_float refl_coef[], /* O reflection coefficients (length order) */ + const silk_float auto_corr[], /* I autocorrelation sequence (length order+1) */ + opus_int order /* I order */ +); + +void silk_k2a_FLP( + silk_float *A, /* O prediction coefficients [order] */ + const silk_float *rc, /* I reflection coefficients [order] */ + opus_int32 order /* I prediction order */ +); + +/* Solve the normal equations using the Levinson-Durbin recursion */ +silk_float silk_levinsondurbin_FLP( /* O prediction error energy */ + silk_float A[], /* O prediction coefficients [order] */ + const silk_float corr[], /* I input auto-correlations [order + 1] */ + const opus_int order /* I prediction order */ +); + +/* compute autocorrelation */ +void silk_autocorrelation_FLP( + silk_float *results, /* O result (length correlationCount) */ + const silk_float *inputData, /* I input data to correlate */ + opus_int inputDataSize, /* I length of input */ + opus_int correlationCount /* I number of correlation taps to compute */ +); + +opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, 1 unvoiced */ + const silk_float *frame, /* I Signal of length PE_FRAME_LENGTH_MS*Fs_kHz */ + opus_int *pitch_out, /* O Pitch lag values [nb_subfr] */ + opus_int16 *lagIndex, /* O Lag Index */ + opus_int8 *contourIndex, /* O Pitch contour Index */ + silk_float *LTPCorr, /* I/O Normalized correlation; input: value from previous frame */ + opus_int prevLag, /* I Last lag of previous frame; set to zero is unvoiced */ + const silk_float search_thres1, /* I First stage threshold for lag candidates 0 - 1 */ + const silk_float search_thres2, /* I Final threshold for lag candidates 0 - 1 */ + const opus_int Fs_kHz, /* I sample frequency (kHz) */ + const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */ + const opus_int nb_subfr, /* I Number of 5 ms subframes */ + int arch /* I Run-time architecture */ +); + +void silk_insertion_sort_decreasing_FLP( + silk_float *a, /* I/O Unsorted / Sorted vector */ + opus_int *idx, /* O Index vector for the sorted elements */ + const opus_int L, /* I Vector length */ + const opus_int K /* I Number of correctly sorted positions */ +); + +/* Compute reflection coefficients from input signal */ +silk_float silk_burg_modified_FLP( /* O returns residual energy */ + silk_float A[], /* O prediction coefficients (length order) */ + const silk_float x[], /* I input 
signal, length: nb_subfr*(D+L_sub) */ + const silk_float minInvGain, /* I minimum inverse prediction gain */ + const opus_int subfr_length, /* I input signal subframe length (incl. D preceding samples) */ + const opus_int nb_subfr, /* I number of subframes stacked in x */ + const opus_int D /* I order */ +); + +/* multiply a vector by a constant */ +void silk_scale_vector_FLP( + silk_float *data1, + silk_float gain, + opus_int dataSize +); + +/* copy and multiply a vector by a constant */ +void silk_scale_copy_vector_FLP( + silk_float *data_out, + const silk_float *data_in, + silk_float gain, + opus_int dataSize +); + +/* inner product of two silk_float arrays, with result as double */ +double silk_inner_product_FLP( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +); + +/* sum of squares of a silk_float array, with result as double */ +double silk_energy_FLP( + const silk_float *data, + opus_int dataSize +); + +/********************************************************************/ +/* MACROS */ +/********************************************************************/ + +#define PI (3.1415926536f) + +#define silk_min_float( a, b ) (((a) < (b)) ? (a) : (b)) +#define silk_max_float( a, b ) (((a) > (b)) ? (a) : (b)) +#define silk_abs_float( a ) ((silk_float)fabs(a)) + +/* sigmoid function */ +static OPUS_INLINE silk_float silk_sigmoid( silk_float x ) +{ + return (silk_float)(1.0 / (1.0 + exp(-x))); +} + +/* floating-point to integer conversion (rounding) */ +static OPUS_INLINE opus_int32 silk_float2int( silk_float x ) +{ + return (opus_int32)float2int( x ); +} + +/* floating-point to integer conversion (rounding) */ +static OPUS_INLINE void silk_float2short_array( + opus_int16 *out, + const silk_float *in, + opus_int32 length +) +{ + opus_int32 k; + for( k = length - 1; k >= 0; k-- ) { + out[k] = silk_SAT16( (opus_int32)float2int( in[k] ) ); + } +} + +/* integer to floating-point conversion */ +static OPUS_INLINE void silk_short2float_array( + silk_float *out, + const opus_int16 *in, + opus_int32 length +) +{ + opus_int32 k; + for( k = length - 1; k >= 0; k-- ) { + out[k] = (silk_float)in[k]; + } +} + +/* using log2() helps the fixed-point conversion */ +static OPUS_INLINE silk_float silk_log2( double x ) +{ + return ( silk_float )( 3.32192809488736 * log10( x ) ); +} + +#ifdef __cplusplus +} +#endif + +#endif /* SILK_SIGPROC_FLP_H */ diff --git a/thirdparty/opus/silk/float/apply_sine_window_FLP.c b/thirdparty/opus/silk/float/apply_sine_window_FLP.c new file mode 100644 index 000000000000..6aae57c0ab99 --- /dev/null +++ b/thirdparty/opus/silk/float/apply_sine_window_FLP.c @@ -0,0 +1,81 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" + +/* Apply sine window to signal vector */ +/* Window types: */ +/* 1 -> sine window from 0 to pi/2 */ +/* 2 -> sine window from pi/2 to pi */ +void silk_apply_sine_window_FLP( + silk_float px_win[], /* O Pointer to windowed signal */ + const silk_float px[], /* I Pointer to input signal */ + const opus_int win_type, /* I Selects a window type */ + const opus_int length /* I Window length, multiple of 4 */ +) +{ + opus_int k; + silk_float freq, c, S0, S1; + + silk_assert( win_type == 1 || win_type == 2 ); + + /* Length must be multiple of 4 */ + silk_assert( ( length & 3 ) == 0 ); + + freq = PI / ( length + 1 ); + + /* Approximation of 2 * cos(f) */ + c = 2.0f - freq * freq; + + /* Initialize state */ + if( win_type < 2 ) { + /* Start from 0 */ + S0 = 0.0f; + /* Approximation of sin(f) */ + S1 = freq; + } else { + /* Start from 1 */ + S0 = 1.0f; + /* Approximation of cos(f) */ + S1 = 0.5f * c; + } + + /* Uses the recursive equation: sin(n*f) = 2 * cos(f) * sin((n-1)*f) - sin((n-2)*f) */ + /* 4 samples at a time */ + for( k = 0; k < length; k += 4 ) { + px_win[ k + 0 ] = px[ k + 0 ] * 0.5f * ( S0 + S1 ); + px_win[ k + 1 ] = px[ k + 1 ] * S1; + S0 = c * S1 - S0; + px_win[ k + 2 ] = px[ k + 2 ] * 0.5f * ( S1 + S0 ); + px_win[ k + 3 ] = px[ k + 3 ] * S0; + S1 = c * S0 - S1; + } +} diff --git a/thirdparty/opus/silk/float/autocorrelation_FLP.c b/thirdparty/opus/silk/float/autocorrelation_FLP.c new file mode 100644 index 000000000000..8b8a9e659aae --- /dev/null +++ b/thirdparty/opus/silk/float/autocorrelation_FLP.c @@ -0,0 +1,52 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "typedef.h" +#include "SigProc_FLP.h" + +/* compute autocorrelation */ +void silk_autocorrelation_FLP( + silk_float *results, /* O result (length correlationCount) */ + const silk_float *inputData, /* I input data to correlate */ + opus_int inputDataSize, /* I length of input */ + opus_int correlationCount /* I number of correlation taps to compute */ +) +{ + opus_int i; + + if( correlationCount > inputDataSize ) { + correlationCount = inputDataSize; + } + + for( i = 0; i < correlationCount; i++ ) { + results[ i ] = (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i ); + } +} diff --git a/thirdparty/opus/silk/float/burg_modified_FLP.c b/thirdparty/opus/silk/float/burg_modified_FLP.c new file mode 100644 index 000000000000..ea5dc25a93a7 --- /dev/null +++ b/thirdparty/opus/silk/float/burg_modified_FLP.c @@ -0,0 +1,186 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" +#include "tuning_parameters.h" +#include "define.h" + +#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384*/ + +/* Compute reflection coefficients from input signal */ +silk_float silk_burg_modified_FLP( /* O returns residual energy */ + silk_float A[], /* O prediction coefficients (length order) */ + const silk_float x[], /* I input signal, length: nb_subfr*(D+L_sub) */ + const silk_float minInvGain, /* I minimum inverse prediction gain */ + const opus_int subfr_length, /* I input signal subframe length (incl. D preceding samples) */ + const opus_int nb_subfr, /* I number of subframes stacked in x */ + const opus_int D /* I order */ +) +{ + opus_int k, n, s, reached_max_gain; + double C0, invGain, num, nrg_f, nrg_b, rc, Atmp, tmp1, tmp2; + const silk_float *x_ptr; + double C_first_row[ SILK_MAX_ORDER_LPC ], C_last_row[ SILK_MAX_ORDER_LPC ]; + double CAf[ SILK_MAX_ORDER_LPC + 1 ], CAb[ SILK_MAX_ORDER_LPC + 1 ]; + double Af[ SILK_MAX_ORDER_LPC ]; + + silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); + + /* Compute autocorrelations, added over subframes */ + C0 = silk_energy_FLP( x, nb_subfr * subfr_length ); + silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( double ) ); + for( s = 0; s < nb_subfr; s++ ) { + x_ptr = x + s * subfr_length; + for( n = 1; n < D + 1; n++ ) { + C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n ); + } + } + silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) ); + + /* Initialize */ + CAb[ 0 ] = CAf[ 0 ] = C0 + FIND_LPC_COND_FAC * C0 + 1e-9f; + invGain = 1.0f; + reached_max_gain = 0; + for( n = 0; n < D; n++ ) { + /* Update first row of correlation matrix (without first element) */ + /* Update last row of correlation matrix (without last element, stored in reversed order) */ + /* Update C * Af */ + /* Update C * flipud(Af) (stored in reversed order) */ + for( s = 0; s < nb_subfr; s++ ) { + x_ptr = x + s * subfr_length; + tmp1 = x_ptr[ n ]; + tmp2 = x_ptr[ subfr_length - n - 1 ]; + for( k = 0; k < n; k++ ) { + C_first_row[ k ] -= x_ptr[ n ] * x_ptr[ n - k - 1 ]; + C_last_row[ k ] -= x_ptr[ subfr_length - n - 1 ] * x_ptr[ subfr_length - n + k ]; + Atmp = Af[ k ]; + tmp1 += x_ptr[ n - k - 1 ] * Atmp; + tmp2 += x_ptr[ subfr_length - n + k ] * Atmp; + } + for( k = 0; k <= n; k++ ) { + CAf[ k ] -= tmp1 * x_ptr[ n - k ]; + CAb[ k ] -= tmp2 * x_ptr[ subfr_length - n + k - 1 ]; + } + } + tmp1 = C_first_row[ n ]; + tmp2 = C_last_row[ n ]; + for( k = 0; k < n; k++ ) { + Atmp = Af[ k ]; + tmp1 += C_last_row[ n - k - 1 ] * Atmp; + tmp2 += C_first_row[ n - k - 1 ] * Atmp; + } + CAf[ n + 1 ] = tmp1; + CAb[ n + 1 ] = tmp2; + + /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */ + num = CAb[ n + 1 ]; + nrg_b = CAb[ 0 ]; + nrg_f = CAf[ 0 ]; + for( k = 0; k < n; k++ ) { + Atmp = Af[ k ]; + num += CAb[ n - k ] * Atmp; + nrg_b += CAb[ k + 1 ] * Atmp; + nrg_f += CAf[ k + 1 ] * Atmp; + } + silk_assert( nrg_f > 0.0 ); + silk_assert( nrg_b > 0.0 ); + + /* Calculate the next order reflection (parcor) coefficient */ + rc = -2.0 * num / ( nrg_f + nrg_b ); + silk_assert( rc > -1.0 && rc < 1.0 ); + + /* Update inverse prediction gain */ + tmp1 = invGain * ( 1.0 - rc * rc ); + if( tmp1 <= minInvGain ) { + /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain 
is exactly hit */ + rc = sqrt( 1.0 - minInvGain / invGain ); + if( num > 0 ) { + /* Ensure adjusted reflection coefficients has the original sign */ + rc = -rc; + } + invGain = minInvGain; + reached_max_gain = 1; + } else { + invGain = tmp1; + } + + /* Update the AR coefficients */ + for( k = 0; k < (n + 1) >> 1; k++ ) { + tmp1 = Af[ k ]; + tmp2 = Af[ n - k - 1 ]; + Af[ k ] = tmp1 + rc * tmp2; + Af[ n - k - 1 ] = tmp2 + rc * tmp1; + } + Af[ n ] = rc; + + if( reached_max_gain ) { + /* Reached max prediction gain; set remaining coefficients to zero and exit loop */ + for( k = n + 1; k < D; k++ ) { + Af[ k ] = 0.0; + } + break; + } + + /* Update C * Af and C * Ab */ + for( k = 0; k <= n + 1; k++ ) { + tmp1 = CAf[ k ]; + CAf[ k ] += rc * CAb[ n - k + 1 ]; + CAb[ n - k + 1 ] += rc * tmp1; + } + } + + if( reached_max_gain ) { + /* Convert to silk_float */ + for( k = 0; k < D; k++ ) { + A[ k ] = (silk_float)( -Af[ k ] ); + } + /* Subtract energy of preceding samples from C0 */ + for( s = 0; s < nb_subfr; s++ ) { + C0 -= silk_energy_FLP( x + s * subfr_length, D ); + } + /* Approximate residual energy */ + nrg_f = C0 * invGain; + } else { + /* Compute residual energy and store coefficients as silk_float */ + nrg_f = CAf[ 0 ]; + tmp1 = 1.0; + for( k = 0; k < D; k++ ) { + Atmp = Af[ k ]; + nrg_f += CAf[ k + 1 ] * Atmp; + tmp1 += Atmp * Atmp; + A[ k ] = (silk_float)(-Atmp); + } + nrg_f -= FIND_LPC_COND_FAC * C0 * tmp1; + } + + /* Return residual energy */ + return (silk_float)nrg_f; +} diff --git a/thirdparty/opus/silk/float/bwexpander_FLP.c b/thirdparty/opus/silk/float/bwexpander_FLP.c new file mode 100644 index 000000000000..d55a4d79ab63 --- /dev/null +++ b/thirdparty/opus/silk/float/bwexpander_FLP.c @@ -0,0 +1,49 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" + +/* Chirp (bw expand) LP AR filter */ +void silk_bwexpander_FLP( + silk_float *ar, /* I/O AR filter to be expanded (without leading 1) */ + const opus_int d, /* I length of ar */ + const silk_float chirp /* I chirp factor (typically in range (0..1) ) */ +) +{ + opus_int i; + silk_float cfac = chirp; + + for( i = 0; i < d - 1; i++ ) { + ar[ i ] *= cfac; + cfac *= chirp; + } + ar[ d - 1 ] *= cfac; +} diff --git a/thirdparty/opus/silk/float/corrMatrix_FLP.c b/thirdparty/opus/silk/float/corrMatrix_FLP.c new file mode 100644 index 000000000000..eae6a1cfca29 --- /dev/null +++ b/thirdparty/opus/silk/float/corrMatrix_FLP.c @@ -0,0 +1,93 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/********************************************************************** + * Correlation matrix computations for LS estimate. 
+ **********************************************************************/ + +#include "main_FLP.h" + +/* Calculates correlation vector X'*t */ +void silk_corrVector_FLP( + const silk_float *x, /* I x vector [L+order-1] used to create X */ + const silk_float *t, /* I Target vector [L] */ + const opus_int L, /* I Length of vecors */ + const opus_int Order, /* I Max lag for correlation */ + silk_float *Xt /* O X'*t correlation vector [order] */ +) +{ + opus_int lag; + const silk_float *ptr1; + + ptr1 = &x[ Order - 1 ]; /* Points to first sample of column 0 of X: X[:,0] */ + for( lag = 0; lag < Order; lag++ ) { + /* Calculate X[:,lag]'*t */ + Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L ); + ptr1--; /* Next column of X */ + } +} + +/* Calculates correlation matrix X'*X */ +void silk_corrMatrix_FLP( + const silk_float *x, /* I x vector [ L+order-1 ] used to create X */ + const opus_int L, /* I Length of vectors */ + const opus_int Order, /* I Max lag for correlation */ + silk_float *XX /* O X'*X correlation matrix [order x order] */ +) +{ + opus_int j, lag; + double energy; + const silk_float *ptr1, *ptr2; + + ptr1 = &x[ Order - 1 ]; /* First sample of column 0 of X */ + energy = silk_energy_FLP( ptr1, L ); /* X[:,0]'*X[:,0] */ + matrix_ptr( XX, 0, 0, Order ) = ( silk_float )energy; + for( j = 1; j < Order; j++ ) { + /* Calculate X[:,j]'*X[:,j] */ + energy += ptr1[ -j ] * ptr1[ -j ] - ptr1[ L - j ] * ptr1[ L - j ]; + matrix_ptr( XX, j, j, Order ) = ( silk_float )energy; + } + + ptr2 = &x[ Order - 2 ]; /* First sample of column 1 of X */ + for( lag = 1; lag < Order; lag++ ) { + /* Calculate X[:,0]'*X[:,lag] */ + energy = silk_inner_product_FLP( ptr1, ptr2, L ); + matrix_ptr( XX, lag, 0, Order ) = ( silk_float )energy; + matrix_ptr( XX, 0, lag, Order ) = ( silk_float )energy; + /* Calculate X[:,j]'*X[:,j + lag] */ + for( j = 1; j < ( Order - lag ); j++ ) { + energy += ptr1[ -j ] * ptr2[ -j ] - ptr1[ L - j ] * ptr2[ L - j ]; + matrix_ptr( XX, lag + j, j, Order ) = ( silk_float )energy; + matrix_ptr( XX, j, lag + j, Order ) = ( silk_float )energy; + } + ptr2--; /* Next column of X */ + } +} diff --git a/thirdparty/opus/silk/float/encode_frame_FLP.c b/thirdparty/opus/silk/float/encode_frame_FLP.c new file mode 100644 index 000000000000..2092a4d9e248 --- /dev/null +++ b/thirdparty/opus/silk/float/encode_frame_FLP.c @@ -0,0 +1,372 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" +#include "tuning_parameters.h" + +/* Low Bitrate Redundancy (LBRR) encoding. Reuse all parameters but encode with lower bitrate */ +static OPUS_INLINE void silk_LBRR_encode_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + const silk_float xfw[], /* I Input signal */ + opus_int condCoding /* I The type of conditional coding used so far for this frame */ +); + +void silk_encode_do_VAD_FLP( + silk_encoder_state_FLP *psEnc /* I/O Encoder state FLP */ +) +{ + /****************************/ + /* Voice Activity Detection */ + /****************************/ + silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch ); + + /**************************************************/ + /* Convert speech activity into VAD and DTX flags */ + /**************************************************/ + if( psEnc->sCmn.speech_activity_Q8 < SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ) ) { + psEnc->sCmn.indices.signalType = TYPE_NO_VOICE_ACTIVITY; + psEnc->sCmn.noSpeechCounter++; + if( psEnc->sCmn.noSpeechCounter < NB_SPEECH_FRAMES_BEFORE_DTX ) { + psEnc->sCmn.inDTX = 0; + } else if( psEnc->sCmn.noSpeechCounter > MAX_CONSECUTIVE_DTX + NB_SPEECH_FRAMES_BEFORE_DTX ) { + psEnc->sCmn.noSpeechCounter = NB_SPEECH_FRAMES_BEFORE_DTX; + psEnc->sCmn.inDTX = 0; + } + psEnc->sCmn.VAD_flags[ psEnc->sCmn.nFramesEncoded ] = 0; + } else { + psEnc->sCmn.noSpeechCounter = 0; + psEnc->sCmn.inDTX = 0; + psEnc->sCmn.indices.signalType = TYPE_UNVOICED; + psEnc->sCmn.VAD_flags[ psEnc->sCmn.nFramesEncoded ] = 1; + } +} + +/****************/ +/* Encode frame */ +/****************/ +opus_int silk_encode_frame_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + opus_int32 *pnBytesOut, /* O Number of payload bytes; */ + ec_enc *psRangeEnc, /* I/O compressor data structure */ + opus_int condCoding, /* I The type of conditional coding to use */ + opus_int maxBits, /* I If > 0: maximum number of output bits */ + opus_int useCBR /* I Flag to force constant-bitrate operation */ +) +{ + silk_encoder_control_FLP sEncCtrl; + opus_int i, iter, maxIter, found_upper, found_lower, ret = 0; + silk_float *x_frame, *res_pitch_frame; + silk_float xfw[ MAX_FRAME_LENGTH ]; + silk_float res_pitch[ 2 * MAX_FRAME_LENGTH + LA_PITCH_MAX ]; + ec_enc sRangeEnc_copy, sRangeEnc_copy2; + silk_nsq_state sNSQ_copy, sNSQ_copy2; + opus_int32 seed_copy, nBits, nBits_lower, nBits_upper, gainMult_lower, gainMult_upper; + opus_int32 gainsID, gainsID_lower, gainsID_upper; + opus_int16 gainMult_Q8; + opus_int16 ec_prevLagIndex_copy; + opus_int ec_prevSignalType_copy; + opus_int8 LastGainIndex_copy2; + opus_int32 pGains_Q16[ MAX_NB_SUBFR ]; + opus_uint8 ec_buf_copy[ 1275 ]; + + /* This is totally unnecessary but many compilers (including gcc) are too dumb to realise it */ + LastGainIndex_copy2 = 
nBits_lower = nBits_upper = gainMult_lower = gainMult_upper = 0; + + psEnc->sCmn.indices.Seed = psEnc->sCmn.frameCounter++ & 3; + + /**************************************************************/ + /* Set up Input Pointers, and insert frame in input buffer */ + /**************************************************************/ + /* pointers aligned with start of frame to encode */ + x_frame = psEnc->x_buf + psEnc->sCmn.ltp_mem_length; /* start of frame to encode */ + res_pitch_frame = res_pitch + psEnc->sCmn.ltp_mem_length; /* start of pitch LPC residual frame */ + + /***************************************/ + /* Ensure smooth bandwidth transitions */ + /***************************************/ + silk_LP_variable_cutoff( &psEnc->sCmn.sLP, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.frame_length ); + + /*******************************************/ + /* Copy new frame to front of input buffer */ + /*******************************************/ + silk_short2float_array( x_frame + LA_SHAPE_MS * psEnc->sCmn.fs_kHz, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.frame_length ); + + /* Add tiny signal to avoid high CPU load from denormalized floating point numbers */ + for( i = 0; i < 8; i++ ) { + x_frame[ LA_SHAPE_MS * psEnc->sCmn.fs_kHz + i * ( psEnc->sCmn.frame_length >> 3 ) ] += ( 1 - ( i & 2 ) ) * 1e-6f; + } + + if( !psEnc->sCmn.prefillFlag ) { + /*****************************************/ + /* Find pitch lags, initial LPC analysis */ + /*****************************************/ + silk_find_pitch_lags_FLP( psEnc, &sEncCtrl, res_pitch, x_frame, psEnc->sCmn.arch ); + + /************************/ + /* Noise shape analysis */ + /************************/ + silk_noise_shape_analysis_FLP( psEnc, &sEncCtrl, res_pitch_frame, x_frame ); + + /***************************************************/ + /* Find linear prediction coefficients (LPC + LTP) */ + /***************************************************/ + silk_find_pred_coefs_FLP( psEnc, &sEncCtrl, res_pitch, x_frame, condCoding ); + + /****************************************/ + /* Process gains */ + /****************************************/ + silk_process_gains_FLP( psEnc, &sEncCtrl, condCoding ); + + /*****************************************/ + /* Prefiltering for noise shaper */ + /*****************************************/ + silk_prefilter_FLP( psEnc, &sEncCtrl, xfw, x_frame ); + + /****************************************/ + /* Low Bitrate Redundant Encoding */ + /****************************************/ + silk_LBRR_encode_FLP( psEnc, &sEncCtrl, xfw, condCoding ); + + /* Loop over quantizer and entropy coding to control bitrate */ + maxIter = 6; + gainMult_Q8 = SILK_FIX_CONST( 1, 8 ); + found_lower = 0; + found_upper = 0; + gainsID = silk_gains_ID( psEnc->sCmn.indices.GainsIndices, psEnc->sCmn.nb_subfr ); + gainsID_lower = -1; + gainsID_upper = -1; + /* Copy part of the input state */ + silk_memcpy( &sRangeEnc_copy, psRangeEnc, sizeof( ec_enc ) ); + silk_memcpy( &sNSQ_copy, &psEnc->sCmn.sNSQ, sizeof( silk_nsq_state ) ); + seed_copy = psEnc->sCmn.indices.Seed; + ec_prevLagIndex_copy = psEnc->sCmn.ec_prevLagIndex; + ec_prevSignalType_copy = psEnc->sCmn.ec_prevSignalType; + for( iter = 0; ; iter++ ) { + if( gainsID == gainsID_lower ) { + nBits = nBits_lower; + } else if( gainsID == gainsID_upper ) { + nBits = nBits_upper; + } else { + /* Restore part of the input state */ + if( iter > 0 ) { + silk_memcpy( psRangeEnc, &sRangeEnc_copy, sizeof( ec_enc ) ); + silk_memcpy( &psEnc->sCmn.sNSQ, &sNSQ_copy, sizeof( silk_nsq_state ) ); + psEnc->sCmn.indices.Seed = 
seed_copy; + psEnc->sCmn.ec_prevLagIndex = ec_prevLagIndex_copy; + psEnc->sCmn.ec_prevSignalType = ec_prevSignalType_copy; + } + + /*****************************************/ + /* Noise shaping quantization */ + /*****************************************/ + silk_NSQ_wrapper_FLP( psEnc, &sEncCtrl, &psEnc->sCmn.indices, &psEnc->sCmn.sNSQ, psEnc->sCmn.pulses, xfw ); + + /****************************************/ + /* Encode Parameters */ + /****************************************/ + silk_encode_indices( &psEnc->sCmn, psRangeEnc, psEnc->sCmn.nFramesEncoded, 0, condCoding ); + + /****************************************/ + /* Encode Excitation Signal */ + /****************************************/ + silk_encode_pulses( psRangeEnc, psEnc->sCmn.indices.signalType, psEnc->sCmn.indices.quantOffsetType, + psEnc->sCmn.pulses, psEnc->sCmn.frame_length ); + + nBits = ec_tell( psRangeEnc ); + + if( useCBR == 0 && iter == 0 && nBits <= maxBits ) { + break; + } + } + + if( iter == maxIter ) { + if( found_lower && ( gainsID == gainsID_lower || nBits > maxBits ) ) { + /* Restore output state from earlier iteration that did meet the bitrate budget */ + silk_memcpy( psRangeEnc, &sRangeEnc_copy2, sizeof( ec_enc ) ); + silk_assert( sRangeEnc_copy2.offs <= 1275 ); + silk_memcpy( psRangeEnc->buf, ec_buf_copy, sRangeEnc_copy2.offs ); + silk_memcpy( &psEnc->sCmn.sNSQ, &sNSQ_copy2, sizeof( silk_nsq_state ) ); + psEnc->sShape.LastGainIndex = LastGainIndex_copy2; + } + break; + } + + if( nBits > maxBits ) { + if( found_lower == 0 && iter >= 2 ) { + /* Adjust the quantizer's rate/distortion tradeoff and discard previous "upper" results */ + sEncCtrl.Lambda *= 1.5f; + found_upper = 0; + gainsID_upper = -1; + } else { + found_upper = 1; + nBits_upper = nBits; + gainMult_upper = gainMult_Q8; + gainsID_upper = gainsID; + } + } else if( nBits < maxBits - 5 ) { + found_lower = 1; + nBits_lower = nBits; + gainMult_lower = gainMult_Q8; + if( gainsID != gainsID_lower ) { + gainsID_lower = gainsID; + /* Copy part of the output state */ + silk_memcpy( &sRangeEnc_copy2, psRangeEnc, sizeof( ec_enc ) ); + silk_assert( psRangeEnc->offs <= 1275 ); + silk_memcpy( ec_buf_copy, psRangeEnc->buf, psRangeEnc->offs ); + silk_memcpy( &sNSQ_copy2, &psEnc->sCmn.sNSQ, sizeof( silk_nsq_state ) ); + LastGainIndex_copy2 = psEnc->sShape.LastGainIndex; + } + } else { + /* Within 5 bits of budget: close enough */ + break; + } + + if( ( found_lower & found_upper ) == 0 ) { + /* Adjust gain according to high-rate rate/distortion curve */ + opus_int32 gain_factor_Q16; + gain_factor_Q16 = silk_log2lin( silk_LSHIFT( nBits - maxBits, 7 ) / psEnc->sCmn.frame_length + SILK_FIX_CONST( 16, 7 ) ); + gain_factor_Q16 = silk_min_32( gain_factor_Q16, SILK_FIX_CONST( 2, 16 ) ); + if( nBits > maxBits ) { + gain_factor_Q16 = silk_max_32( gain_factor_Q16, SILK_FIX_CONST( 1.3, 16 ) ); + } + gainMult_Q8 = silk_SMULWB( gain_factor_Q16, gainMult_Q8 ); + } else { + /* Adjust gain by interpolating */ + gainMult_Q8 = gainMult_lower + ( ( gainMult_upper - gainMult_lower ) * ( maxBits - nBits_lower ) ) / ( nBits_upper - nBits_lower ); + /* New gain multiplier must be between 25% and 75% of old range (note that gainMult_upper < gainMult_lower) */ + if( gainMult_Q8 > silk_ADD_RSHIFT32( gainMult_lower, gainMult_upper - gainMult_lower, 2 ) ) { + gainMult_Q8 = silk_ADD_RSHIFT32( gainMult_lower, gainMult_upper - gainMult_lower, 2 ); + } else + if( gainMult_Q8 < silk_SUB_RSHIFT32( gainMult_upper, gainMult_upper - gainMult_lower, 2 ) ) { + gainMult_Q8 = silk_SUB_RSHIFT32( 
gainMult_upper, gainMult_upper - gainMult_lower, 2 ); + } + } + + for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { + pGains_Q16[ i ] = silk_LSHIFT_SAT32( silk_SMULWB( sEncCtrl.GainsUnq_Q16[ i ], gainMult_Q8 ), 8 ); + } + + /* Quantize gains */ + psEnc->sShape.LastGainIndex = sEncCtrl.lastGainIndexPrev; + silk_gains_quant( psEnc->sCmn.indices.GainsIndices, pGains_Q16, + &psEnc->sShape.LastGainIndex, condCoding == CODE_CONDITIONALLY, psEnc->sCmn.nb_subfr ); + + /* Unique identifier of gains vector */ + gainsID = silk_gains_ID( psEnc->sCmn.indices.GainsIndices, psEnc->sCmn.nb_subfr ); + + /* Overwrite unquantized gains with quantized gains and convert back to Q0 from Q16 */ + for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { + sEncCtrl.Gains[ i ] = pGains_Q16[ i ] / 65536.0f; + } + } + } + + /* Update input buffer */ + silk_memmove( psEnc->x_buf, &psEnc->x_buf[ psEnc->sCmn.frame_length ], + ( psEnc->sCmn.ltp_mem_length + LA_SHAPE_MS * psEnc->sCmn.fs_kHz ) * sizeof( silk_float ) ); + + /* Exit without entropy coding */ + if( psEnc->sCmn.prefillFlag ) { + /* No payload */ + *pnBytesOut = 0; + return ret; + } + + /* Parameters needed for next frame */ + psEnc->sCmn.prevLag = sEncCtrl.pitchL[ psEnc->sCmn.nb_subfr - 1 ]; + psEnc->sCmn.prevSignalType = psEnc->sCmn.indices.signalType; + + /****************************************/ + /* Finalize payload */ + /****************************************/ + psEnc->sCmn.first_frame_after_reset = 0; + /* Payload size */ + *pnBytesOut = silk_RSHIFT( ec_tell( psRangeEnc ) + 7, 3 ); + + return ret; +} + +/* Low-Bitrate Redundancy (LBRR) encoding. Reuse all parameters but encode excitation at lower bitrate */ +static OPUS_INLINE void silk_LBRR_encode_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + const silk_float xfw[], /* I Input signal */ + opus_int condCoding /* I The type of conditional coding used so far for this frame */ +) +{ + opus_int k; + opus_int32 Gains_Q16[ MAX_NB_SUBFR ]; + silk_float TempGains[ MAX_NB_SUBFR ]; + SideInfoIndices *psIndices_LBRR = &psEnc->sCmn.indices_LBRR[ psEnc->sCmn.nFramesEncoded ]; + silk_nsq_state sNSQ_LBRR; + + /*******************************************/ + /* Control use of inband LBRR */ + /*******************************************/ + if( psEnc->sCmn.LBRR_enabled && psEnc->sCmn.speech_activity_Q8 > SILK_FIX_CONST( LBRR_SPEECH_ACTIVITY_THRES, 8 ) ) { + psEnc->sCmn.LBRR_flags[ psEnc->sCmn.nFramesEncoded ] = 1; + + /* Copy noise shaping quantizer state and quantization indices from regular encoding */ + silk_memcpy( &sNSQ_LBRR, &psEnc->sCmn.sNSQ, sizeof( silk_nsq_state ) ); + silk_memcpy( psIndices_LBRR, &psEnc->sCmn.indices, sizeof( SideInfoIndices ) ); + + /* Save original gains */ + silk_memcpy( TempGains, psEncCtrl->Gains, psEnc->sCmn.nb_subfr * sizeof( silk_float ) ); + + if( psEnc->sCmn.nFramesEncoded == 0 || psEnc->sCmn.LBRR_flags[ psEnc->sCmn.nFramesEncoded - 1 ] == 0 ) { + /* First frame in packet or previous frame not LBRR coded */ + psEnc->sCmn.LBRRprevLastGainIndex = psEnc->sShape.LastGainIndex; + + /* Increase Gains to get target LBRR rate */ + psIndices_LBRR->GainsIndices[ 0 ] += psEnc->sCmn.LBRR_GainIncreases; + psIndices_LBRR->GainsIndices[ 0 ] = silk_min_int( psIndices_LBRR->GainsIndices[ 0 ], N_LEVELS_QGAIN - 1 ); + } + + /* Decode to get gains in sync with decoder */ + silk_gains_dequant( Gains_Q16, psIndices_LBRR->GainsIndices, + &psEnc->sCmn.LBRRprevLastGainIndex, condCoding == CODE_CONDITIONALLY, psEnc->sCmn.nb_subfr ); 
+ + /* Overwrite unquantized gains with quantized gains and convert back to Q0 from Q16 */ + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->Gains[ k ] = Gains_Q16[ k ] * ( 1.0f / 65536.0f ); + } + + /*****************************************/ + /* Noise shaping quantization */ + /*****************************************/ + silk_NSQ_wrapper_FLP( psEnc, psEncCtrl, psIndices_LBRR, &sNSQ_LBRR, + psEnc->sCmn.pulses_LBRR[ psEnc->sCmn.nFramesEncoded ], xfw ); + + /* Restore original gains */ + silk_memcpy( psEncCtrl->Gains, TempGains, psEnc->sCmn.nb_subfr * sizeof( silk_float ) ); + } +} diff --git a/thirdparty/opus/silk/float/energy_FLP.c b/thirdparty/opus/silk/float/energy_FLP.c new file mode 100644 index 000000000000..24b8179f9e3b --- /dev/null +++ b/thirdparty/opus/silk/float/energy_FLP.c @@ -0,0 +1,60 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" + +/* sum of squares of a silk_float array, with result as double */ +double silk_energy_FLP( + const silk_float *data, + opus_int dataSize +) +{ + opus_int i, dataSize4; + double result; + + /* 4x unrolled loop */ + result = 0.0; + dataSize4 = dataSize & 0xFFFC; + for( i = 0; i < dataSize4; i += 4 ) { + result += data[ i + 0 ] * (double)data[ i + 0 ] + + data[ i + 1 ] * (double)data[ i + 1 ] + + data[ i + 2 ] * (double)data[ i + 2 ] + + data[ i + 3 ] * (double)data[ i + 3 ]; + } + + /* add any remaining products */ + for( ; i < dataSize; i++ ) { + result += data[ i ] * (double)data[ i ]; + } + + silk_assert( result >= 0.0 ); + return result; +} diff --git a/thirdparty/opus/silk/float/find_LPC_FLP.c b/thirdparty/opus/silk/float/find_LPC_FLP.c new file mode 100644 index 000000000000..fcfe1c3681eb --- /dev/null +++ b/thirdparty/opus/silk/float/find_LPC_FLP.c @@ -0,0 +1,104 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "define.h" +#include "main_FLP.h" +#include "tuning_parameters.h" + +/* LPC analysis */ +void silk_find_LPC_FLP( + silk_encoder_state *psEncC, /* I/O Encoder state */ + opus_int16 NLSF_Q15[], /* O NLSFs */ + const silk_float x[], /* I Input signal */ + const silk_float minInvGain /* I Inverse of max prediction gain */ +) +{ + opus_int k, subfr_length; + silk_float a[ MAX_LPC_ORDER ]; + + /* Used only for NLSF interpolation */ + silk_float res_nrg, res_nrg_2nd, res_nrg_interp; + opus_int16 NLSF0_Q15[ MAX_LPC_ORDER ]; + silk_float a_tmp[ MAX_LPC_ORDER ]; + silk_float LPC_res[ MAX_FRAME_LENGTH + MAX_NB_SUBFR * MAX_LPC_ORDER ]; + + subfr_length = psEncC->subfr_length + psEncC->predictLPCOrder; + + /* Default: No interpolation */ + psEncC->indices.NLSFInterpCoef_Q2 = 4; + + /* Burg AR analysis for the full frame */ + res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder ); + + if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) { + /* Optimal solution for last 10 ms; subtract residual energy here, as that's easier than */ + /* adding it to the residual energy of the first 10 ms in each iteration of the search below */ + res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder ); + + /* Convert to NLSFs */ + silk_A2NLSF_FLP( NLSF_Q15, a_tmp, psEncC->predictLPCOrder ); + + /* Search over interpolation indices to find the one with lowest residual energy */ + res_nrg_2nd = silk_float_MAX; + for( k = 3; k >= 0; k-- ) { + /* Interpolate NLSFs for first half */ + silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder ); + + /* Convert to LPC for residual energy evaluation */ + silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder ); + + /* Calculate residual energy with LSF interpolation */ + silk_LPC_analysis_filter_FLP( LPC_res, a_tmp, x, 2 * subfr_length, psEncC->predictLPCOrder ); + res_nrg_interp = (silk_float)( + silk_energy_FLP( LPC_res + psEncC->predictLPCOrder, subfr_length - psEncC->predictLPCOrder ) + + silk_energy_FLP( LPC_res + psEncC->predictLPCOrder + subfr_length, subfr_length - psEncC->predictLPCOrder ) ); + + /* Determine whether current interpolated NLSFs are best so far */ + if( res_nrg_interp < res_nrg ) { + /* Interpolation has lower residual energy */ + res_nrg = res_nrg_interp; + psEncC->indices.NLSFInterpCoef_Q2 = (opus_int8)k; + } else if( res_nrg_interp > res_nrg_2nd ) { + /* No reason to continue iterating - residual energies will continue to climb */ + break; + } + res_nrg_2nd = res_nrg_interp; + } + } + + if( psEncC->indices.NLSFInterpCoef_Q2 == 4 ) { + /* NLSF interpolation is currently inactive, calculate NLSFs from full frame AR coefficients */ + silk_A2NLSF_FLP( NLSF_Q15, a, psEncC->predictLPCOrder ); + } + + silk_assert( psEncC->indices.NLSFInterpCoef_Q2 == 4 || + ( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) ); +} diff --git a/thirdparty/opus/silk/float/find_LTP_FLP.c b/thirdparty/opus/silk/float/find_LTP_FLP.c new file mode 100644 index 000000000000..722999601409 --- /dev/null +++ b/thirdparty/opus/silk/float/find_LTP_FLP.c @@ -0,0 +1,132 @@ +/*********************************************************************** +Copyright (c) 
2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" +#include "tuning_parameters.h" + +void silk_find_LTP_FLP( + silk_float b[ MAX_NB_SUBFR * LTP_ORDER ], /* O LTP coefs */ + silk_float WLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* O Weight for LTP quantization */ + silk_float *LTPredCodGain, /* O LTP coding gain */ + const silk_float r_lpc[], /* I LPC residual */ + const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ + const silk_float Wght[ MAX_NB_SUBFR ], /* I Weights */ + const opus_int subfr_length, /* I Subframe length */ + const opus_int nb_subfr, /* I number of subframes */ + const opus_int mem_offset /* I Number of samples in LTP memory */ +) +{ + opus_int i, k; + silk_float *b_ptr, temp, *WLTP_ptr; + silk_float LPC_res_nrg, LPC_LTP_res_nrg; + silk_float d[ MAX_NB_SUBFR ], m, g, delta_b[ LTP_ORDER ]; + silk_float w[ MAX_NB_SUBFR ], nrg[ MAX_NB_SUBFR ], regu; + silk_float Rr[ LTP_ORDER ], rr[ MAX_NB_SUBFR ]; + const silk_float *r_ptr, *lag_ptr; + + b_ptr = b; + WLTP_ptr = WLTP; + r_ptr = &r_lpc[ mem_offset ]; + for( k = 0; k < nb_subfr; k++ ) { + lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 ); + + silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, WLTP_ptr ); + silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, Rr ); + + rr[ k ] = ( silk_float )silk_energy_FLP( r_ptr, subfr_length ); + regu = 1.0f + rr[ k ] + + matrix_ptr( WLTP_ptr, 0, 0, LTP_ORDER ) + + matrix_ptr( WLTP_ptr, LTP_ORDER-1, LTP_ORDER-1, LTP_ORDER ); + regu *= LTP_DAMPING / 3; + silk_regularize_correlations_FLP( WLTP_ptr, &rr[ k ], regu, LTP_ORDER ); + silk_solve_LDL_FLP( WLTP_ptr, LTP_ORDER, Rr, b_ptr ); + + /* Calculate residual energy */ + nrg[ k ] = silk_residual_energy_covar_FLP( b_ptr, WLTP_ptr, Rr, rr[ k ], LTP_ORDER ); + + temp = Wght[ k ] / ( nrg[ k ] * Wght[ k ] + 0.01f * subfr_length ); + silk_scale_vector_FLP( WLTP_ptr, temp, LTP_ORDER * LTP_ORDER ); + w[ k ] = matrix_ptr( WLTP_ptr, LTP_ORDER / 2, LTP_ORDER 
/ 2, LTP_ORDER ); + + r_ptr += subfr_length; + b_ptr += LTP_ORDER; + WLTP_ptr += LTP_ORDER * LTP_ORDER; + } + + /* Compute LTP coding gain */ + if( LTPredCodGain != NULL ) { + LPC_LTP_res_nrg = 1e-6f; + LPC_res_nrg = 0.0f; + for( k = 0; k < nb_subfr; k++ ) { + LPC_res_nrg += rr[ k ] * Wght[ k ]; + LPC_LTP_res_nrg += nrg[ k ] * Wght[ k ]; + } + + silk_assert( LPC_LTP_res_nrg > 0 ); + *LTPredCodGain = 3.0f * silk_log2( LPC_res_nrg / LPC_LTP_res_nrg ); + } + + /* Smoothing */ + /* d = sum( B, 1 ); */ + b_ptr = b; + for( k = 0; k < nb_subfr; k++ ) { + d[ k ] = 0; + for( i = 0; i < LTP_ORDER; i++ ) { + d[ k ] += b_ptr[ i ]; + } + b_ptr += LTP_ORDER; + } + /* m = ( w * d' ) / ( sum( w ) + 1e-3 ); */ + temp = 1e-3f; + for( k = 0; k < nb_subfr; k++ ) { + temp += w[ k ]; + } + m = 0; + for( k = 0; k < nb_subfr; k++ ) { + m += d[ k ] * w[ k ]; + } + m = m / temp; + + b_ptr = b; + for( k = 0; k < nb_subfr; k++ ) { + g = LTP_SMOOTHING / ( LTP_SMOOTHING + w[ k ] ) * ( m - d[ k ] ); + temp = 0; + for( i = 0; i < LTP_ORDER; i++ ) { + delta_b[ i ] = silk_max_float( b_ptr[ i ], 0.1f ); + temp += delta_b[ i ]; + } + temp = g / temp; + for( i = 0; i < LTP_ORDER; i++ ) { + b_ptr[ i ] = b_ptr[ i ] + delta_b[ i ] * temp; + } + b_ptr += LTP_ORDER; + } +} diff --git a/thirdparty/opus/silk/float/find_pitch_lags_FLP.c b/thirdparty/opus/silk/float/find_pitch_lags_FLP.c new file mode 100644 index 000000000000..f3b22d25ce66 --- /dev/null +++ b/thirdparty/opus/silk/float/find_pitch_lags_FLP.c @@ -0,0 +1,132 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdlib.h> +#include "main_FLP.h" +#include "tuning_parameters.h" + +void silk_find_pitch_lags_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + silk_float res[], /* O Residual */ + const silk_float x[], /* I Speech signal */ + int arch /* I Run-time architecture */ +) +{ + opus_int buf_len; + silk_float thrhld, res_nrg; + const silk_float *x_buf_ptr, *x_buf; + silk_float auto_corr[ MAX_FIND_PITCH_LPC_ORDER + 1 ]; + silk_float A[ MAX_FIND_PITCH_LPC_ORDER ]; + silk_float refl_coef[ MAX_FIND_PITCH_LPC_ORDER ]; + silk_float Wsig[ FIND_PITCH_LPC_WIN_MAX ]; + silk_float *Wsig_ptr; + + /******************************************/ + /* Set up buffer lengths etc based on Fs */ + /******************************************/ + buf_len = psEnc->sCmn.la_pitch + psEnc->sCmn.frame_length + psEnc->sCmn.ltp_mem_length; + + /* Safety check */ + silk_assert( buf_len >= psEnc->sCmn.pitch_LPC_win_length ); + + x_buf = x - psEnc->sCmn.ltp_mem_length; + + /******************************************/ + /* Estimate LPC AR coefficients */ + /******************************************/ + + /* Calculate windowed signal */ + + /* First LA_LTP samples */ + x_buf_ptr = x_buf + buf_len - psEnc->sCmn.pitch_LPC_win_length; + Wsig_ptr = Wsig; + silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 1, psEnc->sCmn.la_pitch ); + + /* Middle non-windowed samples */ + Wsig_ptr += psEnc->sCmn.la_pitch; + x_buf_ptr += psEnc->sCmn.la_pitch; + silk_memcpy( Wsig_ptr, x_buf_ptr, ( psEnc->sCmn.pitch_LPC_win_length - ( psEnc->sCmn.la_pitch << 1 ) ) * sizeof( silk_float ) ); + + /* Last LA_LTP samples */ + Wsig_ptr += psEnc->sCmn.pitch_LPC_win_length - ( psEnc->sCmn.la_pitch << 1 ); + x_buf_ptr += psEnc->sCmn.pitch_LPC_win_length - ( psEnc->sCmn.la_pitch << 1 ); + silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch ); + + /* Calculate autocorrelation sequence */ + silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 ); + + /* Add white noise, as a fraction of the energy */ + auto_corr[ 0 ] += auto_corr[ 0 ] * FIND_PITCH_WHITE_NOISE_FRACTION + 1; + + /* Calculate the reflection coefficients using Schur */ + res_nrg = silk_schur_FLP( refl_coef, auto_corr, psEnc->sCmn.pitchEstimationLPCOrder ); + + /* Prediction gain */ + psEncCtrl->predGain = auto_corr[ 0 ] / silk_max_float( res_nrg, 1.0f ); + + /* Convert reflection coefficients to prediction coefficients */ + silk_k2a_FLP( A, refl_coef, psEnc->sCmn.pitchEstimationLPCOrder ); + + /* Bandwidth expansion */ + silk_bwexpander_FLP( A, psEnc->sCmn.pitchEstimationLPCOrder, FIND_PITCH_BANDWIDTH_EXPANSION ); + + /*****************************************/ + /* LPC analysis filtering */ + /*****************************************/ + silk_LPC_analysis_filter_FLP( res, A, x_buf, buf_len, psEnc->sCmn.pitchEstimationLPCOrder ); + + if( psEnc->sCmn.indices.signalType != TYPE_NO_VOICE_ACTIVITY && psEnc->sCmn.first_frame_after_reset == 0 ) { + /* Threshold for pitch estimator */ + thrhld = 0.6f; + thrhld -= 0.004f * psEnc->sCmn.pitchEstimationLPCOrder; + thrhld -= 0.1f * psEnc->sCmn.speech_activity_Q8 * ( 1.0f / 256.0f ); + thrhld -= 0.15f * (psEnc->sCmn.prevSignalType >> 1); + thrhld -= 0.1f * psEnc->sCmn.input_tilt_Q15 * ( 1.0f / 32768.0f ); + + /*****************************************/ + /* Call Pitch 
estimator */ + /*****************************************/ + if( silk_pitch_analysis_core_FLP( res, psEncCtrl->pitchL, &psEnc->sCmn.indices.lagIndex, + &psEnc->sCmn.indices.contourIndex, &psEnc->LTPCorr, psEnc->sCmn.prevLag, psEnc->sCmn.pitchEstimationThreshold_Q16 / 65536.0f, + thrhld, psEnc->sCmn.fs_kHz, psEnc->sCmn.pitchEstimationComplexity, psEnc->sCmn.nb_subfr, arch ) == 0 ) + { + psEnc->sCmn.indices.signalType = TYPE_VOICED; + } else { + psEnc->sCmn.indices.signalType = TYPE_UNVOICED; + } + } else { + silk_memset( psEncCtrl->pitchL, 0, sizeof( psEncCtrl->pitchL ) ); + psEnc->sCmn.indices.lagIndex = 0; + psEnc->sCmn.indices.contourIndex = 0; + psEnc->LTPCorr = 0; + } +} diff --git a/thirdparty/opus/silk/float/find_pred_coefs_FLP.c b/thirdparty/opus/silk/float/find_pred_coefs_FLP.c new file mode 100644 index 000000000000..1af4fe5f1b2d --- /dev/null +++ b/thirdparty/opus/silk/float/find_pred_coefs_FLP.c @@ -0,0 +1,118 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" + +/* Find LPC and LTP coefficients */ +void silk_find_pred_coefs_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + const silk_float res_pitch[], /* I Residual from pitch analysis */ + const silk_float x[], /* I Speech signal */ + opus_int condCoding /* I The type of conditional coding to use */ +) +{ + opus_int i; + silk_float WLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ]; + silk_float invGains[ MAX_NB_SUBFR ], Wght[ MAX_NB_SUBFR ]; + opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]; + const silk_float *x_ptr; + silk_float *x_pre_ptr, LPC_in_pre[ MAX_NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH ]; + silk_float minInvGain; + + /* Weighting for weighted least squares */ + for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { + silk_assert( psEncCtrl->Gains[ i ] > 0.0f ); + invGains[ i ] = 1.0f / psEncCtrl->Gains[ i ]; + Wght[ i ] = invGains[ i ] * invGains[ i ]; + } + + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /**********/ + /* VOICED */ + /**********/ + silk_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 ); + + /* LTP analysis */ + silk_find_LTP_FLP( psEncCtrl->LTPCoef, WLTP, &psEncCtrl->LTPredCodGain, res_pitch, + psEncCtrl->pitchL, Wght, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.ltp_mem_length ); + + /* Quantize LTP gain parameters */ + silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex, + &psEnc->sCmn.sum_log_gain_Q7, WLTP, psEnc->sCmn.mu_LTP_Q9, psEnc->sCmn.LTPQuantLowComplexity, psEnc->sCmn.nb_subfr, + psEnc->sCmn.arch ); + + /* Control LTP scaling */ + silk_LTP_scale_ctrl_FLP( psEnc, psEncCtrl, condCoding ); + + /* Create LTP residual */ + silk_LTP_analysis_filter_FLP( LPC_in_pre, x - psEnc->sCmn.predictLPCOrder, psEncCtrl->LTPCoef, + psEncCtrl->pitchL, invGains, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder ); + } else { + /************/ + /* UNVOICED */ + /************/ + /* Create signal with prepended subframes, scaled by inverse gains */ + x_ptr = x - psEnc->sCmn.predictLPCOrder; + x_pre_ptr = LPC_in_pre; + for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) { + silk_scale_copy_vector_FLP( x_pre_ptr, x_ptr, invGains[ i ], + psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder ); + x_pre_ptr += psEnc->sCmn.subfr_length + psEnc->sCmn.predictLPCOrder; + x_ptr += psEnc->sCmn.subfr_length; + } + silk_memset( psEncCtrl->LTPCoef, 0, psEnc->sCmn.nb_subfr * LTP_ORDER * sizeof( silk_float ) ); + psEncCtrl->LTPredCodGain = 0.0f; + psEnc->sCmn.sum_log_gain_Q7 = 0; + } + + /* Limit on total predictive coding gain */ + if( psEnc->sCmn.first_frame_after_reset ) { + minInvGain = 1.0f / MAX_PREDICTION_POWER_GAIN_AFTER_RESET; + } else { + minInvGain = (silk_float)pow( 2, psEncCtrl->LTPredCodGain / 3 ) / MAX_PREDICTION_POWER_GAIN; + minInvGain /= 0.25f + 0.75f * psEncCtrl->coding_quality; + } + + /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */ + silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain ); + + /* Quantize LSFs */ + silk_process_NLSFs_FLP( &psEnc->sCmn, psEncCtrl->PredCoef, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 ); + + /* Calculate residual energy using quantized LPC coefficients */ + silk_residual_energy_FLP( psEncCtrl->ResNrg, LPC_in_pre, 
psEncCtrl->PredCoef, psEncCtrl->Gains, + psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.predictLPCOrder ); + + /* Copy to prediction struct for use in next frame for interpolation */ + silk_memcpy( psEnc->sCmn.prev_NLSFq_Q15, NLSF_Q15, sizeof( psEnc->sCmn.prev_NLSFq_Q15 ) ); +} + diff --git a/thirdparty/opus/silk/float/inner_product_FLP.c b/thirdparty/opus/silk/float/inner_product_FLP.c new file mode 100644 index 000000000000..029c012911da --- /dev/null +++ b/thirdparty/opus/silk/float/inner_product_FLP.c @@ -0,0 +1,60 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" + +/* inner product of two silk_float arrays, with result as double */ +double silk_inner_product_FLP( + const silk_float *data1, + const silk_float *data2, + opus_int dataSize +) +{ + opus_int i, dataSize4; + double result; + + /* 4x unrolled loop */ + result = 0.0; + dataSize4 = dataSize & 0xFFFC; + for( i = 0; i < dataSize4; i += 4 ) { + result += data1[ i + 0 ] * (double)data2[ i + 0 ] + + data1[ i + 1 ] * (double)data2[ i + 1 ] + + data1[ i + 2 ] * (double)data2[ i + 2 ] + + data1[ i + 3 ] * (double)data2[ i + 3 ]; + } + + /* add any remaining products */ + for( ; i < dataSize; i++ ) { + result += data1[ i ] * (double)data2[ i ]; + } + + return result; +} diff --git a/thirdparty/opus/silk/float/k2a_FLP.c b/thirdparty/opus/silk/float/k2a_FLP.c new file mode 100644 index 000000000000..12af4e76697a --- /dev/null +++ b/thirdparty/opus/silk/float/k2a_FLP.c @@ -0,0 +1,53 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" + +/* step up function, converts reflection coefficients to prediction coefficients */ +void silk_k2a_FLP( + silk_float *A, /* O prediction coefficients [order] */ + const silk_float *rc, /* I reflection coefficients [order] */ + opus_int32 order /* I prediction order */ +) +{ + opus_int k, n; + silk_float Atmp[ SILK_MAX_ORDER_LPC ]; + + for( k = 0; k < order; k++ ) { + for( n = 0; n < k; n++ ) { + Atmp[ n ] = A[ n ]; + } + for( n = 0; n < k; n++ ) { + A[ n ] += Atmp[ k - n - 1 ] * rc[ k ]; + } + A[ k ] = -rc[ k ]; + } +} diff --git a/thirdparty/opus/silk/float/levinsondurbin_FLP.c b/thirdparty/opus/silk/float/levinsondurbin_FLP.c new file mode 100644 index 000000000000..f0ba6069812f --- /dev/null +++ b/thirdparty/opus/silk/float/levinsondurbin_FLP.c @@ -0,0 +1,81 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" + +/* Solve the normal equations using the Levinson-Durbin recursion */ +silk_float silk_levinsondurbin_FLP( /* O prediction error energy */ + silk_float A[], /* O prediction coefficients [order] */ + const silk_float corr[], /* I input auto-correlations [order + 1] */ + const opus_int order /* I prediction order */ +) +{ + opus_int i, mHalf, m; + silk_float min_nrg, nrg, t, km, Atmp1, Atmp2; + + min_nrg = 1e-12f * corr[ 0 ] + 1e-9f; + nrg = corr[ 0 ]; + nrg = silk_max_float(min_nrg, nrg); + A[ 0 ] = corr[ 1 ] / nrg; + nrg -= A[ 0 ] * corr[ 1 ]; + nrg = silk_max_float(min_nrg, nrg); + + for( m = 1; m < order; m++ ) + { + t = corr[ m + 1 ]; + for( i = 0; i < m; i++ ) { + t -= A[ i ] * corr[ m - i ]; + } + + /* reflection coefficient */ + km = t / nrg; + + /* residual energy */ + nrg -= km * t; + nrg = silk_max_float(min_nrg, nrg); + + mHalf = m >> 1; + for( i = 0; i < mHalf; i++ ) { + Atmp1 = A[ i ]; + Atmp2 = A[ m - i - 1 ]; + A[ m - i - 1 ] -= km * Atmp1; + A[ i ] -= km * Atmp2; + } + if( m & 1 ) { + A[ mHalf ] -= km * A[ mHalf ]; + } + A[ m ] = km; + } + + /* return the residual energy */ + return nrg; +} + diff --git a/thirdparty/opus/silk/float/main_FLP.h b/thirdparty/opus/silk/float/main_FLP.h new file mode 100644 index 000000000000..e5a75972e5f8 --- /dev/null +++ b/thirdparty/opus/silk/float/main_FLP.h @@ -0,0 +1,313 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_MAIN_FLP_H +#define SILK_MAIN_FLP_H + +#include "SigProc_FLP.h" +#include "SigProc_FIX.h" +#include "structs_FLP.h" +#include "main.h" +#include "define.h" +#include "debug.h" +#include "entenc.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define silk_encoder_state_Fxx silk_encoder_state_FLP +#define silk_encode_do_VAD_Fxx silk_encode_do_VAD_FLP +#define silk_encode_frame_Fxx silk_encode_frame_FLP + +/*********************/ +/* Encoder Functions */ +/*********************/ + +/* High-pass filter with cutoff frequency adaptation based on pitch lag statistics */ +void silk_HP_variable_cutoff( + silk_encoder_state_Fxx state_Fxx[] /* I/O Encoder states */ +); + +/* Encoder main function */ +void silk_encode_do_VAD_FLP( + silk_encoder_state_FLP *psEnc /* I/O Encoder state FLP */ +); + +/* Encoder main function */ +opus_int silk_encode_frame_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + opus_int32 *pnBytesOut, /* O Number of payload bytes; */ + ec_enc *psRangeEnc, /* I/O compressor data structure */ + opus_int condCoding, /* I The type of conditional coding to use */ + opus_int maxBits, /* I If > 0: maximum number of output bits */ + opus_int useCBR /* I Flag to force constant-bitrate operation */ +); + +/* Initializes the Silk encoder state */ +opus_int silk_init_encoder( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + int arch /* I Run-time architecture */ +); + +/* Control the Silk encoder */ +opus_int silk_control_encoder( + silk_encoder_state_FLP *psEnc, /* I/O Pointer to Silk encoder state FLP */ + silk_EncControlStruct *encControl, /* I Control structure */ + const opus_int32 TargetRate_bps, /* I Target max bitrate (bps) */ + const opus_int allow_bw_switch, /* I Flag to allow switching audio bandwidth */ + const opus_int channelNb, /* I Channel number */ + const opus_int force_fs_kHz +); + +/****************/ +/* Prefiltering */ +/****************/ +void silk_prefilter_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + const silk_encoder_control_FLP *psEncCtrl, /* I Encoder control FLP */ + silk_float xw[], /* O Weighted signal */ + const silk_float x[] /* I Speech signal */ +); + +/**************************/ +/* Noise shaping analysis */ +/**************************/ +/* Compute noise shaping coefficients and initial gain values */ +void silk_noise_shape_analysis_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + const silk_float *pitch_res, /* I LPC residual from pitch analysis */ + const silk_float *x /* I Input signal [frame_length + la_shape] */ +); + +/* Autocorrelations for a warped frequency axis */ +void silk_warped_autocorrelation_FLP( + silk_float *corr, /* O Result [order + 1] */ + const silk_float *input, /* I Input data to correlate */ + const silk_float warping, /* I Warping coefficient */ + const opus_int length, /* I Length of 
input */ + const opus_int order /* I Correlation order (even) */ +); + +/* Calculation of LTP state scaling */ +void silk_LTP_scale_ctrl_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + opus_int condCoding /* I The type of conditional coding to use */ +); + +/**********************************************/ +/* Prediction Analysis */ +/**********************************************/ +/* Find pitch lags */ +void silk_find_pitch_lags_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + silk_float res[], /* O Residual */ + const silk_float x[], /* I Speech signal */ + int arch /* I Run-time architecture */ +); + +/* Find LPC and LTP coefficients */ +void silk_find_pred_coefs_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + const silk_float res_pitch[], /* I Residual from pitch analysis */ + const silk_float x[], /* I Speech signal */ + opus_int condCoding /* I The type of conditional coding to use */ +); + +/* LPC analysis */ +void silk_find_LPC_FLP( + silk_encoder_state *psEncC, /* I/O Encoder state */ + opus_int16 NLSF_Q15[], /* O NLSFs */ + const silk_float x[], /* I Input signal */ + const silk_float minInvGain /* I Prediction gain from LTP (dB) */ +); + +/* LTP analysis */ +void silk_find_LTP_FLP( + silk_float b[ MAX_NB_SUBFR * LTP_ORDER ], /* O LTP coefs */ + silk_float WLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* O Weight for LTP quantization */ + silk_float *LTPredCodGain, /* O LTP coding gain */ + const silk_float r_lpc[], /* I LPC residual */ + const opus_int lag[ MAX_NB_SUBFR ], /* I LTP lags */ + const silk_float Wght[ MAX_NB_SUBFR ], /* I Weights */ + const opus_int subfr_length, /* I Subframe length */ + const opus_int nb_subfr, /* I number of subframes */ + const opus_int mem_offset /* I Number of samples in LTP memory */ +); + +void silk_LTP_analysis_filter_FLP( + silk_float *LTP_res, /* O LTP res MAX_NB_SUBFR*(pre_lgth+subfr_lngth) */ + const silk_float *x, /* I Input signal, with preceding samples */ + const silk_float B[ LTP_ORDER * MAX_NB_SUBFR ], /* I LTP coefficients for each subframe */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const silk_float invGains[ MAX_NB_SUBFR ], /* I Inverse quantization gains */ + const opus_int subfr_length, /* I Length of each subframe */ + const opus_int nb_subfr, /* I number of subframes */ + const opus_int pre_length /* I Preceding samples for each subframe */ +); + +/* Calculates residual energies of input subframes where all subframes have LPC_order */ +/* of preceding samples */ +void silk_residual_energy_FLP( + silk_float nrgs[ MAX_NB_SUBFR ], /* O Residual energy per subframe */ + const silk_float x[], /* I Input signal */ + silk_float a[ 2 ][ MAX_LPC_ORDER ], /* I AR coefs for each frame half */ + const silk_float gains[], /* I Quantization gains */ + const opus_int subfr_length, /* I Subframe length */ + const opus_int nb_subfr, /* I number of subframes */ + const opus_int LPC_order /* I LPC order */ +); + +/* 16th order LPC analysis filter */ +void silk_LPC_analysis_filter_FLP( + silk_float r_LPC[], /* O LPC residual signal */ + const silk_float PredCoef[], /* I LPC coefficients */ + const silk_float s[], /* I Input signal */ + const opus_int length, /* I Length of input signal */ + const opus_int Order /* I LPC order */ +); + +/* LTP tap quantizer */ 
+void silk_quant_LTP_gains_FLP( + silk_float B[ MAX_NB_SUBFR * LTP_ORDER ], /* I/O (Un-)quantized LTP gains */ + opus_int8 cbk_index[ MAX_NB_SUBFR ], /* O Codebook index */ + opus_int8 *periodicity_index, /* O Periodicity index */ + opus_int32 *sum_log_gain_Q7, /* I/O Cumulative max prediction gain */ + const silk_float W[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* I Error weights */ + const opus_int mu_Q10, /* I Mu value (R/D tradeoff) */ + const opus_int lowComplexity, /* I Flag for low complexity */ + const opus_int nb_subfr, /* I number of subframes */ + int arch /* I Run-time architecture */ +); + +/* Residual energy: nrg = wxx - 2 * wXx * c + c' * wXX * c */ +silk_float silk_residual_energy_covar_FLP( /* O Weighted residual energy */ + const silk_float *c, /* I Filter coefficients */ + silk_float *wXX, /* I/O Weighted correlation matrix, reg. out */ + const silk_float *wXx, /* I Weighted correlation vector */ + const silk_float wxx, /* I Weighted correlation value */ + const opus_int D /* I Dimension */ +); + +/* Processing of gains */ +void silk_process_gains_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + opus_int condCoding /* I The type of conditional coding to use */ +); + +/******************/ +/* Linear Algebra */ +/******************/ +/* Calculates correlation matrix X'*X */ +void silk_corrMatrix_FLP( + const silk_float *x, /* I x vector [ L+order-1 ] used to create X */ + const opus_int L, /* I Length of vectors */ + const opus_int Order, /* I Max lag for correlation */ + silk_float *XX /* O X'*X correlation matrix [order x order] */ +); + +/* Calculates correlation vector X'*t */ +void silk_corrVector_FLP( + const silk_float *x, /* I x vector [L+order-1] used to create X */ + const silk_float *t, /* I Target vector [L] */ + const opus_int L, /* I Length of vectors */ + const opus_int Order, /* I Max lag for correlation */ + silk_float *Xt /* O X'*t correlation vector [order] */ +); + +/* Add noise to matrix diagonal */ +void silk_regularize_correlations_FLP( + silk_float *XX, /* I/O Correlation matrices */ + silk_float *xx, /* I/O Correlation values */ + const silk_float noise, /* I Noise energy to add */ + const opus_int D /* I Dimension of XX */ +); + +/* Function to solve linear equation Ax = b, where A is an MxM symmetric matrix */ +void silk_solve_LDL_FLP( + silk_float *A, /* I/O Symmetric square matrix, out: reg. */ + const opus_int M, /* I Size of matrix */ + const silk_float *b, /* I Pointer to b vector */ + silk_float *x /* O Pointer to x solution vector */ +); + +/* Apply sine window to signal vector. */ +/* Window types: */ +/* 1 -> sine window from 0 to pi/2 */ +/* 2 -> sine window from pi/2 to pi */ +void silk_apply_sine_window_FLP( + silk_float px_win[], /* O Pointer to windowed signal */ + const silk_float px[], /* I Pointer to input signal */ + const opus_int win_type, /* I Selects a window type */ + const opus_int length /* I Window length, multiple of 4 */ +); + +/* Wrapper functions. 
Call flp / fix code */ + +/* Convert AR filter coefficients to NLSF parameters */ +void silk_A2NLSF_FLP( + opus_int16 *NLSF_Q15, /* O NLSF vector [ LPC_order ] */ + const silk_float *pAR, /* I LPC coefficients [ LPC_order ] */ + const opus_int LPC_order /* I LPC order */ +); + +/* Convert NLSF parameters to AR prediction filter coefficients */ +void silk_NLSF2A_FLP( + silk_float *pAR, /* O LPC coefficients [ LPC_order ] */ + const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */ + const opus_int LPC_order /* I LPC order */ +); + +/* Limit, stabilize, and quantize NLSFs */ +void silk_process_NLSFs_FLP( + silk_encoder_state *psEncC, /* I/O Encoder state */ + silk_float PredCoef[ 2 ][ MAX_LPC_ORDER ], /* O Prediction coefficients */ + opus_int16 NLSF_Q15[ MAX_LPC_ORDER ], /* I/O Normalized LSFs (quant out) (0 - (2^15-1)) */ + const opus_int16 prev_NLSF_Q15[ MAX_LPC_ORDER ] /* I Previous Normalized LSFs (0 - (2^15-1)) */ +); + +/* Floating-point Silk NSQ wrapper */ +void silk_NSQ_wrapper_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + SideInfoIndices *psIndices, /* I/O Quantization indices */ + silk_nsq_state *psNSQ, /* I/O Noise Shaping Quantzation state */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const silk_float x[] /* I Prefiltered input signal */ +); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/silk/float/noise_shape_analysis_FLP.c b/thirdparty/opus/silk/float/noise_shape_analysis_FLP.c new file mode 100644 index 000000000000..65f6ea587053 --- /dev/null +++ b/thirdparty/opus/silk/float/noise_shape_analysis_FLP.c @@ -0,0 +1,365 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" +#include "tuning_parameters.h" + +/* Compute gain to make warped filter coefficients have a zero mean log frequency response on a */ +/* non-warped frequency scale. (So that it can be implemented with a minimum-phase monic filter.) */ +/* Note: A monic filter is one with the first coefficient equal to 1.0. In Silk we omit the first */ +/* coefficient in an array of coefficients, for monic filters. */ +static OPUS_INLINE silk_float warped_gain( + const silk_float *coefs, + silk_float lambda, + opus_int order +) { + opus_int i; + silk_float gain; + + lambda = -lambda; + gain = coefs[ order - 1 ]; + for( i = order - 2; i >= 0; i-- ) { + gain = lambda * gain + coefs[ i ]; + } + return (silk_float)( 1.0f / ( 1.0f - lambda * gain ) ); +} + +/* Convert warped filter coefficients to monic pseudo-warped coefficients and limit maximum */ +/* amplitude of monic warped coefficients by using bandwidth expansion on the true coefficients */ +static OPUS_INLINE void warped_true2monic_coefs( + silk_float *coefs_syn, + silk_float *coefs_ana, + silk_float lambda, + silk_float limit, + opus_int order +) { + opus_int i, iter, ind = 0; + silk_float tmp, maxabs, chirp, gain_syn, gain_ana; + + /* Convert to monic coefficients */ + for( i = order - 1; i > 0; i-- ) { + coefs_syn[ i - 1 ] -= lambda * coefs_syn[ i ]; + coefs_ana[ i - 1 ] -= lambda * coefs_ana[ i ]; + } + gain_syn = ( 1.0f - lambda * lambda ) / ( 1.0f + lambda * coefs_syn[ 0 ] ); + gain_ana = ( 1.0f - lambda * lambda ) / ( 1.0f + lambda * coefs_ana[ 0 ] ); + for( i = 0; i < order; i++ ) { + coefs_syn[ i ] *= gain_syn; + coefs_ana[ i ] *= gain_ana; + } + + /* Limit */ + for( iter = 0; iter < 10; iter++ ) { + /* Find maximum absolute value */ + maxabs = -1.0f; + for( i = 0; i < order; i++ ) { + tmp = silk_max( silk_abs_float( coefs_syn[ i ] ), silk_abs_float( coefs_ana[ i ] ) ); + if( tmp > maxabs ) { + maxabs = tmp; + ind = i; + } + } + if( maxabs <= limit ) { + /* Coefficients are within range - done */ + return; + } + + /* Convert back to true warped coefficients */ + for( i = 1; i < order; i++ ) { + coefs_syn[ i - 1 ] += lambda * coefs_syn[ i ]; + coefs_ana[ i - 1 ] += lambda * coefs_ana[ i ]; + } + gain_syn = 1.0f / gain_syn; + gain_ana = 1.0f / gain_ana; + for( i = 0; i < order; i++ ) { + coefs_syn[ i ] *= gain_syn; + coefs_ana[ i ] *= gain_ana; + } + + /* Apply bandwidth expansion */ + chirp = 0.99f - ( 0.8f + 0.1f * iter ) * ( maxabs - limit ) / ( maxabs * ( ind + 1 ) ); + silk_bwexpander_FLP( coefs_syn, order, chirp ); + silk_bwexpander_FLP( coefs_ana, order, chirp ); + + /* Convert to monic warped coefficients */ + for( i = order - 1; i > 0; i-- ) { + coefs_syn[ i - 1 ] -= lambda * coefs_syn[ i ]; + coefs_ana[ i - 1 ] -= lambda * coefs_ana[ i ]; + } + gain_syn = ( 1.0f - lambda * lambda ) / ( 1.0f + lambda * coefs_syn[ 0 ] ); + gain_ana = ( 1.0f - lambda * lambda ) / ( 1.0f + lambda * coefs_ana[ 0 ] ); + for( i = 0; i < order; i++ ) { + coefs_syn[ i ] *= gain_syn; + coefs_ana[ i ] *= gain_ana; + } + } + silk_assert( 0 ); +} + +/* Compute noise shaping coefficients and initial gain values */ +void silk_noise_shape_analysis_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + const silk_float *pitch_res, /* I LPC residual from pitch analysis */ + const silk_float *x /* I Input signal [frame_length + la_shape] */ 
+) +{ + silk_shape_state_FLP *psShapeSt = &psEnc->sShape; + opus_int k, nSamples; + silk_float SNR_adj_dB, HarmBoost, HarmShapeGain, Tilt; + silk_float nrg, pre_nrg, log_energy, log_energy_prev, energy_variation; + silk_float delta, BWExp1, BWExp2, gain_mult, gain_add, strength, b, warping; + silk_float x_windowed[ SHAPE_LPC_WIN_MAX ]; + silk_float auto_corr[ MAX_SHAPE_LPC_ORDER + 1 ]; + const silk_float *x_ptr, *pitch_res_ptr; + + /* Point to start of first LPC analysis block */ + x_ptr = x - psEnc->sCmn.la_shape; + + /****************/ + /* GAIN CONTROL */ + /****************/ + SNR_adj_dB = psEnc->sCmn.SNR_dB_Q7 * ( 1 / 128.0f ); + + /* Input quality is the average of the quality in the lowest two VAD bands */ + psEncCtrl->input_quality = 0.5f * ( psEnc->sCmn.input_quality_bands_Q15[ 0 ] + psEnc->sCmn.input_quality_bands_Q15[ 1 ] ) * ( 1.0f / 32768.0f ); + + /* Coding quality level, between 0.0 and 1.0 */ + psEncCtrl->coding_quality = silk_sigmoid( 0.25f * ( SNR_adj_dB - 20.0f ) ); + + if( psEnc->sCmn.useCBR == 0 ) { + /* Reduce coding SNR during low speech activity */ + b = 1.0f - psEnc->sCmn.speech_activity_Q8 * ( 1.0f / 256.0f ); + SNR_adj_dB -= BG_SNR_DECR_dB * psEncCtrl->coding_quality * ( 0.5f + 0.5f * psEncCtrl->input_quality ) * b * b; + } + + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Reduce gains for periodic signals */ + SNR_adj_dB += HARM_SNR_INCR_dB * psEnc->LTPCorr; + } else { + /* For unvoiced signals and low-quality input, adjust the quality slower than SNR_dB setting */ + SNR_adj_dB += ( -0.4f * psEnc->sCmn.SNR_dB_Q7 * ( 1 / 128.0f ) + 6.0f ) * ( 1.0f - psEncCtrl->input_quality ); + } + + /*************************/ + /* SPARSENESS PROCESSING */ + /*************************/ + /* Set quantizer offset */ + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Initially set to 0; may be overruled in process_gains(..) 
*/ + psEnc->sCmn.indices.quantOffsetType = 0; + psEncCtrl->sparseness = 0.0f; + } else { + /* Sparseness measure, based on relative fluctuations of energy per 2 milliseconds */ + nSamples = 2 * psEnc->sCmn.fs_kHz; + energy_variation = 0.0f; + log_energy_prev = 0.0f; + pitch_res_ptr = pitch_res; + for( k = 0; k < silk_SMULBB( SUB_FRAME_LENGTH_MS, psEnc->sCmn.nb_subfr ) / 2; k++ ) { + nrg = ( silk_float )nSamples + ( silk_float )silk_energy_FLP( pitch_res_ptr, nSamples ); + log_energy = silk_log2( nrg ); + if( k > 0 ) { + energy_variation += silk_abs_float( log_energy - log_energy_prev ); + } + log_energy_prev = log_energy; + pitch_res_ptr += nSamples; + } + psEncCtrl->sparseness = silk_sigmoid( 0.4f * ( energy_variation - 5.0f ) ); + + /* Set quantization offset depending on sparseness measure */ + if( psEncCtrl->sparseness > SPARSENESS_THRESHOLD_QNT_OFFSET ) { + psEnc->sCmn.indices.quantOffsetType = 0; + } else { + psEnc->sCmn.indices.quantOffsetType = 1; + } + + /* Increase coding SNR for sparse signals */ + SNR_adj_dB += SPARSE_SNR_INCR_dB * ( psEncCtrl->sparseness - 0.5f ); + } + + /*******************************/ + /* Control bandwidth expansion */ + /*******************************/ + /* More BWE for signals with high prediction gain */ + strength = FIND_PITCH_WHITE_NOISE_FRACTION * psEncCtrl->predGain; /* between 0.0 and 1.0 */ + BWExp1 = BWExp2 = BANDWIDTH_EXPANSION / ( 1.0f + strength * strength ); + delta = LOW_RATE_BANDWIDTH_EXPANSION_DELTA * ( 1.0f - 0.75f * psEncCtrl->coding_quality ); + BWExp1 -= delta; + BWExp2 += delta; + /* BWExp1 will be applied after BWExp2, so make it relative */ + BWExp1 /= BWExp2; + + if( psEnc->sCmn.warping_Q16 > 0 ) { + /* Slightly more warping in analysis will move quantization noise up in frequency, where it's better masked */ + warping = (silk_float)psEnc->sCmn.warping_Q16 / 65536.0f + 0.01f * psEncCtrl->coding_quality; + } else { + warping = 0.0f; + } + + /********************************************/ + /* Compute noise shaping AR coefs and gains */ + /********************************************/ + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + /* Apply window: sine slope followed by flat part followed by cosine slope */ + opus_int shift, slope_part, flat_part; + flat_part = psEnc->sCmn.fs_kHz * 3; + slope_part = ( psEnc->sCmn.shapeWinLength - flat_part ) / 2; + + silk_apply_sine_window_FLP( x_windowed, x_ptr, 1, slope_part ); + shift = slope_part; + silk_memcpy( x_windowed + shift, x_ptr + shift, flat_part * sizeof(silk_float) ); + shift += flat_part; + silk_apply_sine_window_FLP( x_windowed + shift, x_ptr + shift, 2, slope_part ); + + /* Update pointer: next LPC analysis block */ + x_ptr += psEnc->sCmn.subfr_length; + + if( psEnc->sCmn.warping_Q16 > 0 ) { + /* Calculate warped auto correlation */ + silk_warped_autocorrelation_FLP( auto_corr, x_windowed, warping, + psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder ); + } else { + /* Calculate regular auto correlation */ + silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 ); + } + + /* Add white noise, as a fraction of energy */ + auto_corr[ 0 ] += auto_corr[ 0 ] * SHAPE_WHITE_NOISE_FRACTION; + + /* Convert correlations to prediction coefficients, and compute residual energy */ + nrg = silk_levinsondurbin_FLP( &psEncCtrl->AR2[ k * MAX_SHAPE_LPC_ORDER ], auto_corr, psEnc->sCmn.shapingLPCOrder ); + psEncCtrl->Gains[ k ] = ( silk_float )sqrt( nrg ); + + if( psEnc->sCmn.warping_Q16 > 0 ) { + /* Adjust gain for warping */ + 
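+            /* warped_gain() rescales the subframe gain so the warped shaping */
+            /* filter keeps a zero-mean log frequency response on the normal  */
+            /* (non-warped) frequency axis; see the helper atop this file.    */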
psEncCtrl->Gains[ k ] *= warped_gain( &psEncCtrl->AR2[ k * MAX_SHAPE_LPC_ORDER ], warping, psEnc->sCmn.shapingLPCOrder ); + } + + /* Bandwidth expansion for synthesis filter shaping */ + silk_bwexpander_FLP( &psEncCtrl->AR2[ k * MAX_SHAPE_LPC_ORDER ], psEnc->sCmn.shapingLPCOrder, BWExp2 ); + + /* Compute noise shaping filter coefficients */ + silk_memcpy( + &psEncCtrl->AR1[ k * MAX_SHAPE_LPC_ORDER ], + &psEncCtrl->AR2[ k * MAX_SHAPE_LPC_ORDER ], + psEnc->sCmn.shapingLPCOrder * sizeof( silk_float ) ); + + /* Bandwidth expansion for analysis filter shaping */ + silk_bwexpander_FLP( &psEncCtrl->AR1[ k * MAX_SHAPE_LPC_ORDER ], psEnc->sCmn.shapingLPCOrder, BWExp1 ); + + /* Ratio of prediction gains, in energy domain */ + pre_nrg = silk_LPC_inverse_pred_gain_FLP( &psEncCtrl->AR2[ k * MAX_SHAPE_LPC_ORDER ], psEnc->sCmn.shapingLPCOrder ); + nrg = silk_LPC_inverse_pred_gain_FLP( &psEncCtrl->AR1[ k * MAX_SHAPE_LPC_ORDER ], psEnc->sCmn.shapingLPCOrder ); + psEncCtrl->GainsPre[ k ] = 1.0f - 0.7f * ( 1.0f - pre_nrg / nrg ); + + /* Convert to monic warped prediction coefficients and limit absolute values */ + warped_true2monic_coefs( &psEncCtrl->AR2[ k * MAX_SHAPE_LPC_ORDER ], &psEncCtrl->AR1[ k * MAX_SHAPE_LPC_ORDER ], + warping, 3.999f, psEnc->sCmn.shapingLPCOrder ); + } + + /*****************/ + /* Gain tweaking */ + /*****************/ + /* Increase gains during low speech activity */ + gain_mult = (silk_float)pow( 2.0f, -0.16f * SNR_adj_dB ); + gain_add = (silk_float)pow( 2.0f, 0.16f * MIN_QGAIN_DB ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->Gains[ k ] *= gain_mult; + psEncCtrl->Gains[ k ] += gain_add; + } + + gain_mult = 1.0f + INPUT_TILT + psEncCtrl->coding_quality * HIGH_RATE_INPUT_TILT; + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->GainsPre[ k ] *= gain_mult; + } + + /************************************************/ + /* Control low-frequency shaping and noise tilt */ + /************************************************/ + /* Less low frequency shaping for noisy inputs */ + strength = LOW_FREQ_SHAPING * ( 1.0f + LOW_QUALITY_LOW_FREQ_SHAPING_DECR * ( psEnc->sCmn.input_quality_bands_Q15[ 0 ] * ( 1.0f / 32768.0f ) - 1.0f ) ); + strength *= psEnc->sCmn.speech_activity_Q8 * ( 1.0f / 256.0f ); + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Reduce low frequencies quantization noise for periodic signals, depending on pitch lag */ + /*f = 400; freqz([1, -0.98 + 2e-4 * f], [1, -0.97 + 7e-4 * f], 2^12, Fs); axis([0, 1000, -10, 1])*/ + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + b = 0.2f / psEnc->sCmn.fs_kHz + 3.0f / psEncCtrl->pitchL[ k ]; + psEncCtrl->LF_MA_shp[ k ] = -1.0f + b; + psEncCtrl->LF_AR_shp[ k ] = 1.0f - b - b * strength; + } + Tilt = - HP_NOISE_COEF - + (1 - HP_NOISE_COEF) * HARM_HP_NOISE_COEF * psEnc->sCmn.speech_activity_Q8 * ( 1.0f / 256.0f ); + } else { + b = 1.3f / psEnc->sCmn.fs_kHz; + psEncCtrl->LF_MA_shp[ 0 ] = -1.0f + b; + psEncCtrl->LF_AR_shp[ 0 ] = 1.0f - b - b * strength * 0.6f; + for( k = 1; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->LF_MA_shp[ k ] = psEncCtrl->LF_MA_shp[ 0 ]; + psEncCtrl->LF_AR_shp[ k ] = psEncCtrl->LF_AR_shp[ 0 ]; + } + Tilt = -HP_NOISE_COEF; + } + + /****************************/ + /* HARMONIC SHAPING CONTROL */ + /****************************/ + /* Control boosting of harmonic frequencies */ + HarmBoost = LOW_RATE_HARMONIC_BOOST * ( 1.0f - psEncCtrl->coding_quality ) * psEnc->LTPCorr; + + /* More harmonic boost for noisy input signals */ + HarmBoost += LOW_INPUT_QUALITY_HARMONIC_BOOST * ( 1.0f - 
psEncCtrl->input_quality ); + + if( USE_HARM_SHAPING && psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + /* Harmonic noise shaping */ + HarmShapeGain = HARMONIC_SHAPING; + + /* More harmonic noise shaping for high bitrates or noisy input */ + HarmShapeGain += HIGH_RATE_OR_LOW_QUALITY_HARMONIC_SHAPING * + ( 1.0f - ( 1.0f - psEncCtrl->coding_quality ) * psEncCtrl->input_quality ); + + /* Less harmonic noise shaping for less periodic signals */ + HarmShapeGain *= ( silk_float )sqrt( psEnc->LTPCorr ); + } else { + HarmShapeGain = 0.0f; + } + + /*************************/ + /* Smooth over subframes */ + /*************************/ + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psShapeSt->HarmBoost_smth += SUBFR_SMTH_COEF * ( HarmBoost - psShapeSt->HarmBoost_smth ); + psEncCtrl->HarmBoost[ k ] = psShapeSt->HarmBoost_smth; + psShapeSt->HarmShapeGain_smth += SUBFR_SMTH_COEF * ( HarmShapeGain - psShapeSt->HarmShapeGain_smth ); + psEncCtrl->HarmShapeGain[ k ] = psShapeSt->HarmShapeGain_smth; + psShapeSt->Tilt_smth += SUBFR_SMTH_COEF * ( Tilt - psShapeSt->Tilt_smth ); + psEncCtrl->Tilt[ k ] = psShapeSt->Tilt_smth; + } +} diff --git a/thirdparty/opus/silk/float/pitch_analysis_core_FLP.c b/thirdparty/opus/silk/float/pitch_analysis_core_FLP.c new file mode 100644 index 000000000000..d0e637a29dc9 --- /dev/null +++ b/thirdparty/opus/silk/float/pitch_analysis_core_FLP.c @@ -0,0 +1,630 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+/*****************************************************************************
+* Pitch analyser function
+******************************************************************************/
+#include "SigProc_FLP.h"
+#include "SigProc_FIX.h"
+#include "pitch_est_defines.h"
+#include "pitch.h"
+
+#define SCRATCH_SIZE 22
+
+/************************************************************/
+/* Internally used functions */
+/************************************************************/
+static void silk_P_Ana_calc_corr_st3(
+    silk_float cross_corr_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ], /* O 3 DIM correlation array */
+    const silk_float frame[], /* I vector to correlate */
+    opus_int start_lag, /* I start lag */
+    opus_int sf_length, /* I sub frame length */
+    opus_int nb_subfr, /* I number of subframes */
+    opus_int complexity, /* I Complexity setting */
+    int arch /* I Run-time architecture */
+);
+
+static void silk_P_Ana_calc_energy_st3(
+    silk_float energies_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ], /* O 3 DIM correlation array */
+    const silk_float frame[], /* I vector to correlate */
+    opus_int start_lag, /* I start lag */
+    opus_int sf_length, /* I sub frame length */
+    opus_int nb_subfr, /* I number of subframes */
+    opus_int complexity /* I Complexity setting */
+);
+
+/************************************************************/
+/* CORE PITCH ANALYSIS FUNCTION */
+/************************************************************/
+opus_int silk_pitch_analysis_core_FLP( /* O Voicing estimate: 0 voiced, 1 unvoiced */
+    const silk_float *frame, /* I Signal of length PE_FRAME_LENGTH_MS*Fs_kHz */
+    opus_int *pitch_out, /* O Pitch lag values [nb_subfr] */
+    opus_int16 *lagIndex, /* O Lag Index */
+    opus_int8 *contourIndex, /* O Pitch contour Index */
+    silk_float *LTPCorr, /* I/O Normalized correlation; input: value from previous frame */
+    opus_int prevLag, /* I Last lag of previous frame; set to zero if unvoiced */
+    const silk_float search_thres1, /* I First stage threshold for lag candidates 0 - 1 */
+    const silk_float search_thres2, /* I Final threshold for lag candidates 0 - 1 */
+    const opus_int Fs_kHz, /* I sample frequency (kHz) */
+    const opus_int complexity, /* I Complexity setting, 0-2, where 2 is highest */
+    const opus_int nb_subfr, /* I Number of 5 ms subframes */
+    int arch /* I Run-time architecture */
+)
+{
+    opus_int i, k, d, j;
+    silk_float frame_8kHz[ PE_MAX_FRAME_LENGTH_MS * 8 ];
+    silk_float frame_4kHz[ PE_MAX_FRAME_LENGTH_MS * 4 ];
+    opus_int16 frame_8_FIX[ PE_MAX_FRAME_LENGTH_MS * 8 ];
+    opus_int16 frame_4_FIX[ PE_MAX_FRAME_LENGTH_MS * 4 ];
+    opus_int32 filt_state[ 6 ];
+    silk_float threshold, contour_bias;
+    silk_float C[ PE_MAX_NB_SUBFR][ (PE_MAX_LAG >> 1) + 5 ];
+    opus_val32 xcorr[ PE_MAX_LAG_MS * 4 - PE_MIN_LAG_MS * 4 + 1 ];
+    silk_float CC[ PE_NB_CBKS_STAGE2_EXT ];
+    const silk_float *target_ptr, *basis_ptr;
+    double cross_corr, normalizer, energy, energy_tmp;
+    opus_int d_srch[ PE_D_SRCH_LENGTH ];
+    opus_int16 d_comp[ (PE_MAX_LAG >> 1) + 5 ];
+    opus_int length_d_srch, length_d_comp;
+    silk_float Cmax, CCmax, CCmax_b, CCmax_new_b, CCmax_new;
+    opus_int CBimax, CBimax_new, lag, start_lag, end_lag, lag_new;
+    opus_int cbk_size;
+    silk_float lag_log2, prevLag_log2, delta_lag_log2_sqr;
+    silk_float energies_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ];
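+    /* Scratch for stage 3 (this array and cross_corr_st3 below): one entry per (subframe, codebook vector, candidate lag) */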
silk_float cross_corr_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ]; + opus_int lag_counter; + opus_int frame_length, frame_length_8kHz, frame_length_4kHz; + opus_int sf_length, sf_length_8kHz, sf_length_4kHz; + opus_int min_lag, min_lag_8kHz, min_lag_4kHz; + opus_int max_lag, max_lag_8kHz, max_lag_4kHz; + opus_int nb_cbk_search; + const opus_int8 *Lag_CB_ptr; + + /* Check for valid sampling frequency */ + silk_assert( Fs_kHz == 8 || Fs_kHz == 12 || Fs_kHz == 16 ); + + /* Check for valid complexity setting */ + silk_assert( complexity >= SILK_PE_MIN_COMPLEX ); + silk_assert( complexity <= SILK_PE_MAX_COMPLEX ); + + silk_assert( search_thres1 >= 0.0f && search_thres1 <= 1.0f ); + silk_assert( search_thres2 >= 0.0f && search_thres2 <= 1.0f ); + + /* Set up frame lengths max / min lag for the sampling frequency */ + frame_length = ( PE_LTP_MEM_LENGTH_MS + nb_subfr * PE_SUBFR_LENGTH_MS ) * Fs_kHz; + frame_length_4kHz = ( PE_LTP_MEM_LENGTH_MS + nb_subfr * PE_SUBFR_LENGTH_MS ) * 4; + frame_length_8kHz = ( PE_LTP_MEM_LENGTH_MS + nb_subfr * PE_SUBFR_LENGTH_MS ) * 8; + sf_length = PE_SUBFR_LENGTH_MS * Fs_kHz; + sf_length_4kHz = PE_SUBFR_LENGTH_MS * 4; + sf_length_8kHz = PE_SUBFR_LENGTH_MS * 8; + min_lag = PE_MIN_LAG_MS * Fs_kHz; + min_lag_4kHz = PE_MIN_LAG_MS * 4; + min_lag_8kHz = PE_MIN_LAG_MS * 8; + max_lag = PE_MAX_LAG_MS * Fs_kHz - 1; + max_lag_4kHz = PE_MAX_LAG_MS * 4; + max_lag_8kHz = PE_MAX_LAG_MS * 8 - 1; + + /* Resample from input sampled at Fs_kHz to 8 kHz */ + if( Fs_kHz == 16 ) { + /* Resample to 16 -> 8 khz */ + opus_int16 frame_16_FIX[ 16 * PE_MAX_FRAME_LENGTH_MS ]; + silk_float2short_array( frame_16_FIX, frame, frame_length ); + silk_memset( filt_state, 0, 2 * sizeof( opus_int32 ) ); + silk_resampler_down2( filt_state, frame_8_FIX, frame_16_FIX, frame_length ); + silk_short2float_array( frame_8kHz, frame_8_FIX, frame_length_8kHz ); + } else if( Fs_kHz == 12 ) { + /* Resample to 12 -> 8 khz */ + opus_int16 frame_12_FIX[ 12 * PE_MAX_FRAME_LENGTH_MS ]; + silk_float2short_array( frame_12_FIX, frame, frame_length ); + silk_memset( filt_state, 0, 6 * sizeof( opus_int32 ) ); + silk_resampler_down2_3( filt_state, frame_8_FIX, frame_12_FIX, frame_length ); + silk_short2float_array( frame_8kHz, frame_8_FIX, frame_length_8kHz ); + } else { + silk_assert( Fs_kHz == 8 ); + silk_float2short_array( frame_8_FIX, frame, frame_length_8kHz ); + } + + /* Decimate again to 4 kHz */ + silk_memset( filt_state, 0, 2 * sizeof( opus_int32 ) ); + silk_resampler_down2( filt_state, frame_4_FIX, frame_8_FIX, frame_length_8kHz ); + silk_short2float_array( frame_4kHz, frame_4_FIX, frame_length_4kHz ); + + /* Low-pass filter */ + for( i = frame_length_4kHz - 1; i > 0; i-- ) { + frame_4kHz[ i ] += frame_4kHz[ i - 1 ]; + } + + /****************************************************************************** + * FIRST STAGE, operating in 4 khz + ******************************************************************************/ + silk_memset(C, 0, sizeof(silk_float) * nb_subfr * ((PE_MAX_LAG >> 1) + 5)); + target_ptr = &frame_4kHz[ silk_LSHIFT( sf_length_4kHz, 2 ) ]; + for( k = 0; k < nb_subfr >> 1; k++ ) { + /* Check that we are within range of the array */ + silk_assert( target_ptr >= frame_4kHz ); + silk_assert( target_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz ); + + basis_ptr = target_ptr - min_lag_4kHz; + + /* Check that we are within range of the array */ + silk_assert( basis_ptr >= frame_4kHz ); + silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz ); + + 
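+        /* A single celt_pitch_xcorr() call covers the whole stage-1 lag range; */
+        /* below, C[ 0 ][ d ] = 2 * xcorr / ( E_target + E_basis + bias ), with */
+        /* the basis energy then updated recursively as the lag grows.          */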
celt_pitch_xcorr( target_ptr, target_ptr-max_lag_4kHz, xcorr, sf_length_8kHz, max_lag_4kHz - min_lag_4kHz + 1, arch ); + + /* Calculate first vector products before loop */ + cross_corr = xcorr[ max_lag_4kHz - min_lag_4kHz ]; + normalizer = silk_energy_FLP( target_ptr, sf_length_8kHz ) + + silk_energy_FLP( basis_ptr, sf_length_8kHz ) + + sf_length_8kHz * 4000.0f; + + C[ 0 ][ min_lag_4kHz ] += (silk_float)( 2 * cross_corr / normalizer ); + + /* From now on normalizer is computed recursively */ + for( d = min_lag_4kHz + 1; d <= max_lag_4kHz; d++ ) { + basis_ptr--; + + /* Check that we are within range of the array */ + silk_assert( basis_ptr >= frame_4kHz ); + silk_assert( basis_ptr + sf_length_8kHz <= frame_4kHz + frame_length_4kHz ); + + cross_corr = xcorr[ max_lag_4kHz - d ]; + + /* Add contribution of new sample and remove contribution from oldest sample */ + normalizer += + basis_ptr[ 0 ] * (double)basis_ptr[ 0 ] - + basis_ptr[ sf_length_8kHz ] * (double)basis_ptr[ sf_length_8kHz ]; + C[ 0 ][ d ] += (silk_float)( 2 * cross_corr / normalizer ); + } + /* Update target pointer */ + target_ptr += sf_length_8kHz; + } + + /* Apply short-lag bias */ + for( i = max_lag_4kHz; i >= min_lag_4kHz; i-- ) { + C[ 0 ][ i ] -= C[ 0 ][ i ] * i / 4096.0f; + } + + /* Sort */ + length_d_srch = 4 + 2 * complexity; + silk_assert( 3 * length_d_srch <= PE_D_SRCH_LENGTH ); + silk_insertion_sort_decreasing_FLP( &C[ 0 ][ min_lag_4kHz ], d_srch, max_lag_4kHz - min_lag_4kHz + 1, length_d_srch ); + + /* Escape if correlation is very low already here */ + Cmax = C[ 0 ][ min_lag_4kHz ]; + if( Cmax < 0.2f ) { + silk_memset( pitch_out, 0, nb_subfr * sizeof( opus_int ) ); + *LTPCorr = 0.0f; + *lagIndex = 0; + *contourIndex = 0; + return 1; + } + + threshold = search_thres1 * Cmax; + for( i = 0; i < length_d_srch; i++ ) { + /* Convert to 8 kHz indices for the sorted correlation that exceeds the threshold */ + if( C[ 0 ][ min_lag_4kHz + i ] > threshold ) { + d_srch[ i ] = silk_LSHIFT( d_srch[ i ] + min_lag_4kHz, 1 ); + } else { + length_d_srch = i; + break; + } + } + silk_assert( length_d_srch > 0 ); + + for( i = min_lag_8kHz - 5; i < max_lag_8kHz + 5; i++ ) { + d_comp[ i ] = 0; + } + for( i = 0; i < length_d_srch; i++ ) { + d_comp[ d_srch[ i ] ] = 1; + } + + /* Convolution */ + for( i = max_lag_8kHz + 3; i >= min_lag_8kHz; i-- ) { + d_comp[ i ] += d_comp[ i - 1 ] + d_comp[ i - 2 ]; + } + + length_d_srch = 0; + for( i = min_lag_8kHz; i < max_lag_8kHz + 1; i++ ) { + if( d_comp[ i + 1 ] > 0 ) { + d_srch[ length_d_srch ] = i; + length_d_srch++; + } + } + + /* Convolution */ + for( i = max_lag_8kHz + 3; i >= min_lag_8kHz; i-- ) { + d_comp[ i ] += d_comp[ i - 1 ] + d_comp[ i - 2 ] + d_comp[ i - 3 ]; + } + + length_d_comp = 0; + for( i = min_lag_8kHz; i < max_lag_8kHz + 4; i++ ) { + if( d_comp[ i ] > 0 ) { + d_comp[ length_d_comp ] = (opus_int16)( i - 2 ); + length_d_comp++; + } + } + + /********************************************************************************** + ** SECOND STAGE, operating at 8 kHz, on lag sections with high correlation + *************************************************************************************/ + /********************************************************************************* + * Find energy of each subframe projected onto its history, for a range of delays + *********************************************************************************/ + silk_memset( C, 0, PE_MAX_NB_SUBFR*((PE_MAX_LAG >> 1) + 5) * sizeof(silk_float)); + + if( Fs_kHz == 8 ) { + target_ptr = &frame[ PE_LTP_MEM_LENGTH_MS 
* 8 ]; + } else { + target_ptr = &frame_8kHz[ PE_LTP_MEM_LENGTH_MS * 8 ]; + } + for( k = 0; k < nb_subfr; k++ ) { + energy_tmp = silk_energy_FLP( target_ptr, sf_length_8kHz ) + 1.0; + for( j = 0; j < length_d_comp; j++ ) { + d = d_comp[ j ]; + basis_ptr = target_ptr - d; + cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz ); + if( cross_corr > 0.0f ) { + energy = silk_energy_FLP( basis_ptr, sf_length_8kHz ); + C[ k ][ d ] = (silk_float)( 2 * cross_corr / ( energy + energy_tmp ) ); + } else { + C[ k ][ d ] = 0.0f; + } + } + target_ptr += sf_length_8kHz; + } + + /* search over lag range and lags codebook */ + /* scale factor for lag codebook, as a function of center lag */ + + CCmax = 0.0f; /* This value doesn't matter */ + CCmax_b = -1000.0f; + + CBimax = 0; /* To avoid returning undefined lag values */ + lag = -1; /* To check if lag with strong enough correlation has been found */ + + if( prevLag > 0 ) { + if( Fs_kHz == 12 ) { + prevLag = silk_LSHIFT( prevLag, 1 ) / 3; + } else if( Fs_kHz == 16 ) { + prevLag = silk_RSHIFT( prevLag, 1 ); + } + prevLag_log2 = silk_log2( (silk_float)prevLag ); + } else { + prevLag_log2 = 0; + } + + /* Set up stage 2 codebook based on number of subframes */ + if( nb_subfr == PE_MAX_NB_SUBFR ) { + cbk_size = PE_NB_CBKS_STAGE2_EXT; + Lag_CB_ptr = &silk_CB_lags_stage2[ 0 ][ 0 ]; + if( Fs_kHz == 8 && complexity > SILK_PE_MIN_COMPLEX ) { + /* If input is 8 khz use a larger codebook here because it is last stage */ + nb_cbk_search = PE_NB_CBKS_STAGE2_EXT; + } else { + nb_cbk_search = PE_NB_CBKS_STAGE2; + } + } else { + cbk_size = PE_NB_CBKS_STAGE2_10MS; + Lag_CB_ptr = &silk_CB_lags_stage2_10_ms[ 0 ][ 0 ]; + nb_cbk_search = PE_NB_CBKS_STAGE2_10MS; + } + + for( k = 0; k < length_d_srch; k++ ) { + d = d_srch[ k ]; + for( j = 0; j < nb_cbk_search; j++ ) { + CC[j] = 0.0f; + for( i = 0; i < nb_subfr; i++ ) { + /* Try all codebooks */ + CC[ j ] += C[ i ][ d + matrix_ptr( Lag_CB_ptr, i, j, cbk_size )]; + } + } + /* Find best codebook */ + CCmax_new = -1000.0f; + CBimax_new = 0; + for( i = 0; i < nb_cbk_search; i++ ) { + if( CC[ i ] > CCmax_new ) { + CCmax_new = CC[ i ]; + CBimax_new = i; + } + } + + /* Bias towards shorter lags */ + lag_log2 = silk_log2( (silk_float)d ); + CCmax_new_b = CCmax_new - PE_SHORTLAG_BIAS * nb_subfr * lag_log2; + + /* Bias towards previous lag */ + if( prevLag > 0 ) { + delta_lag_log2_sqr = lag_log2 - prevLag_log2; + delta_lag_log2_sqr *= delta_lag_log2_sqr; + CCmax_new_b -= PE_PREVLAG_BIAS * nb_subfr * (*LTPCorr) * delta_lag_log2_sqr / ( delta_lag_log2_sqr + 0.5f ); + } + + if( CCmax_new_b > CCmax_b && /* Find maximum biased correlation */ + CCmax_new > nb_subfr * search_thres2 /* Correlation needs to be high enough to be voiced */ + ) { + CCmax_b = CCmax_new_b; + CCmax = CCmax_new; + lag = d; + CBimax = CBimax_new; + } + } + + if( lag == -1 ) { + /* No suitable candidate found */ + silk_memset( pitch_out, 0, PE_MAX_NB_SUBFR * sizeof(opus_int) ); + *LTPCorr = 0.0f; + *lagIndex = 0; + *contourIndex = 0; + return 1; + } + + /* Output normalized correlation */ + *LTPCorr = (silk_float)( CCmax / nb_subfr ); + silk_assert( *LTPCorr >= 0.0f ); + + if( Fs_kHz > 8 ) { + /* Search in original signal */ + + /* Compensate for decimation */ + silk_assert( lag == silk_SAT16( lag ) ); + if( Fs_kHz == 12 ) { + lag = silk_RSHIFT_ROUND( silk_SMULBB( lag, 3 ), 1 ); + } else { /* Fs_kHz == 16 */ + lag = silk_LSHIFT( lag, 1 ); + } + + lag = silk_LIMIT_int( lag, min_lag, max_lag ); + start_lag = silk_max_int( lag - 2, min_lag ); + end_lag = 
silk_min_int( lag + 2, max_lag );
+        lag_new = lag; /* to avoid undefined lag */
+        CBimax = 0; /* to avoid undefined lag */
+
+        CCmax = -1000.0f;
+
+        /* Calculate the correlations and energies needed in stage 3 */
+        silk_P_Ana_calc_corr_st3( cross_corr_st3, frame, start_lag, sf_length, nb_subfr, complexity, arch );
+        silk_P_Ana_calc_energy_st3( energies_st3, frame, start_lag, sf_length, nb_subfr, complexity );
+
+        lag_counter = 0;
+        silk_assert( lag == silk_SAT16( lag ) );
+        contour_bias = PE_FLATCONTOUR_BIAS / lag;
+
+        /* Set up cbk parameters according to complexity setting and frame length */
+        if( nb_subfr == PE_MAX_NB_SUBFR ) {
+            nb_cbk_search = (opus_int)silk_nb_cbk_searchs_stage3[ complexity ];
+            cbk_size = PE_NB_CBKS_STAGE3_MAX;
+            Lag_CB_ptr = &silk_CB_lags_stage3[ 0 ][ 0 ];
+        } else {
+            nb_cbk_search = PE_NB_CBKS_STAGE3_10MS;
+            cbk_size = PE_NB_CBKS_STAGE3_10MS;
+            Lag_CB_ptr = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ];
+        }
+
+        target_ptr = &frame[ PE_LTP_MEM_LENGTH_MS * Fs_kHz ];
+        energy_tmp = silk_energy_FLP( target_ptr, nb_subfr * sf_length ) + 1.0;
+        for( d = start_lag; d <= end_lag; d++ ) {
+            for( j = 0; j < nb_cbk_search; j++ ) {
+                cross_corr = 0.0;
+                energy = energy_tmp;
+                for( k = 0; k < nb_subfr; k++ ) {
+                    cross_corr += cross_corr_st3[ k ][ j ][ lag_counter ];
+                    energy += energies_st3[ k ][ j ][ lag_counter ];
+                }
+                if( cross_corr > 0.0 ) {
+                    CCmax_new = (silk_float)( 2 * cross_corr / energy );
+                    /* Reduce depending on flatness of contour */
+                    CCmax_new *= 1.0f - contour_bias * j;
+                } else {
+                    CCmax_new = 0.0f;
+                }
+
+                if( CCmax_new > CCmax && ( d + (opus_int)silk_CB_lags_stage3[ 0 ][ j ] ) <= max_lag ) {
+                    CCmax = CCmax_new;
+                    lag_new = d;
+                    CBimax = j;
+                }
+            }
+            lag_counter++;
+        }
+
+        for( k = 0; k < nb_subfr; k++ ) {
+            pitch_out[ k ] = lag_new + matrix_ptr( Lag_CB_ptr, k, CBimax, cbk_size );
+            pitch_out[ k ] = silk_LIMIT( pitch_out[ k ], min_lag, PE_MAX_LAG_MS * Fs_kHz );
+        }
+        *lagIndex = (opus_int16)( lag_new - min_lag );
+        *contourIndex = (opus_int8)CBimax;
+    } else { /* Fs_kHz == 8 */
+        /* Save Lags */
+        for( k = 0; k < nb_subfr; k++ ) {
+            pitch_out[ k ] = lag + matrix_ptr( Lag_CB_ptr, k, CBimax, cbk_size );
+            pitch_out[ k ] = silk_LIMIT( pitch_out[ k ], min_lag_8kHz, PE_MAX_LAG_MS * 8 );
+        }
+        *lagIndex = (opus_int16)( lag - min_lag_8kHz );
+        *contourIndex = (opus_int8)CBimax;
+    }
+    silk_assert( *lagIndex >= 0 );
+    /* return as voiced */
+    return 0;
+}
+
+/***********************************************************************
+ * Calculates the correlations used in stage 3 search. In order to cover
+ * the whole lag codebook for all the searched offset lags (lag +- 2),
+ * the following correlations are needed in each sub frame:
+ *
+ * sf1: lag range [-8,...,7] total 16 correlations
+ * sf2: lag range [-4,...,4] total 9 correlations
+ * sf3: lag range [-3,...,4] total 8 correlations
+ * sf4: lag range [-6,...,8] total 15 correlations
+ *
+ * In total 48 correlations. The direct implementation would compute, in
+ * the worst case, 4*12*5 = 240 correlations, but more typically around 120.
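+ * (The per-subframe counts above indeed sum to 16 + 9 + 8 + 15 = 48.)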
+ ***********************************************************************/ +static void silk_P_Ana_calc_corr_st3( + silk_float cross_corr_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ], /* O 3 DIM correlation array */ + const silk_float frame[], /* I vector to correlate */ + opus_int start_lag, /* I start lag */ + opus_int sf_length, /* I sub frame length */ + opus_int nb_subfr, /* I number of subframes */ + opus_int complexity, /* I Complexity setting */ + int arch /* I Run-time architecture */ +) +{ + const silk_float *target_ptr; + opus_int i, j, k, lag_counter, lag_low, lag_high; + opus_int nb_cbk_search, delta, idx, cbk_size; + silk_float scratch_mem[ SCRATCH_SIZE ]; + opus_val32 xcorr[ SCRATCH_SIZE ]; + const opus_int8 *Lag_range_ptr, *Lag_CB_ptr; + + silk_assert( complexity >= SILK_PE_MIN_COMPLEX ); + silk_assert( complexity <= SILK_PE_MAX_COMPLEX ); + + if( nb_subfr == PE_MAX_NB_SUBFR ) { + Lag_range_ptr = &silk_Lag_range_stage3[ complexity ][ 0 ][ 0 ]; + Lag_CB_ptr = &silk_CB_lags_stage3[ 0 ][ 0 ]; + nb_cbk_search = silk_nb_cbk_searchs_stage3[ complexity ]; + cbk_size = PE_NB_CBKS_STAGE3_MAX; + } else { + silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1); + Lag_range_ptr = &silk_Lag_range_stage3_10_ms[ 0 ][ 0 ]; + Lag_CB_ptr = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ]; + nb_cbk_search = PE_NB_CBKS_STAGE3_10MS; + cbk_size = PE_NB_CBKS_STAGE3_10MS; + } + + target_ptr = &frame[ silk_LSHIFT( sf_length, 2 ) ]; /* Pointer to middle of frame */ + for( k = 0; k < nb_subfr; k++ ) { + lag_counter = 0; + + /* Calculate the correlations for each subframe */ + lag_low = matrix_ptr( Lag_range_ptr, k, 0, 2 ); + lag_high = matrix_ptr( Lag_range_ptr, k, 1, 2 ); + silk_assert(lag_high-lag_low+1 <= SCRATCH_SIZE); + celt_pitch_xcorr( target_ptr, target_ptr - start_lag - lag_high, xcorr, sf_length, lag_high - lag_low + 1, arch ); + for( j = lag_low; j <= lag_high; j++ ) { + silk_assert( lag_counter < SCRATCH_SIZE ); + scratch_mem[ lag_counter ] = xcorr[ lag_high - j ]; + lag_counter++; + } + + delta = matrix_ptr( Lag_range_ptr, k, 0, 2 ); + for( i = 0; i < nb_cbk_search; i++ ) { + /* Fill out the 3 dim array that stores the correlations for */ + /* each code_book vector for each start lag */ + idx = matrix_ptr( Lag_CB_ptr, k, i, cbk_size ) - delta; + for( j = 0; j < PE_NB_STAGE3_LAGS; j++ ) { + silk_assert( idx + j < SCRATCH_SIZE ); + silk_assert( idx + j < lag_counter ); + cross_corr_st3[ k ][ i ][ j ] = scratch_mem[ idx + j ]; + } + } + target_ptr += sf_length; + } +} + +/********************************************************************/ +/* Calculate the energies for first two subframes. The energies are */ +/* calculated recursively. 
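+   (the k-loop below in fact covers all nb_subfr subframes); each lag
+   reuses the previous lag's energy, subtracting the sample that leaves
+   the correlation window and adding the one that enters it.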
*/ +/********************************************************************/ +static void silk_P_Ana_calc_energy_st3( + silk_float energies_st3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ][ PE_NB_STAGE3_LAGS ], /* O 3 DIM correlation array */ + const silk_float frame[], /* I vector to correlate */ + opus_int start_lag, /* I start lag */ + opus_int sf_length, /* I sub frame length */ + opus_int nb_subfr, /* I number of subframes */ + opus_int complexity /* I Complexity setting */ +) +{ + const silk_float *target_ptr, *basis_ptr; + double energy; + opus_int k, i, j, lag_counter; + opus_int nb_cbk_search, delta, idx, cbk_size, lag_diff; + silk_float scratch_mem[ SCRATCH_SIZE ]; + const opus_int8 *Lag_range_ptr, *Lag_CB_ptr; + + silk_assert( complexity >= SILK_PE_MIN_COMPLEX ); + silk_assert( complexity <= SILK_PE_MAX_COMPLEX ); + + if( nb_subfr == PE_MAX_NB_SUBFR ) { + Lag_range_ptr = &silk_Lag_range_stage3[ complexity ][ 0 ][ 0 ]; + Lag_CB_ptr = &silk_CB_lags_stage3[ 0 ][ 0 ]; + nb_cbk_search = silk_nb_cbk_searchs_stage3[ complexity ]; + cbk_size = PE_NB_CBKS_STAGE3_MAX; + } else { + silk_assert( nb_subfr == PE_MAX_NB_SUBFR >> 1); + Lag_range_ptr = &silk_Lag_range_stage3_10_ms[ 0 ][ 0 ]; + Lag_CB_ptr = &silk_CB_lags_stage3_10_ms[ 0 ][ 0 ]; + nb_cbk_search = PE_NB_CBKS_STAGE3_10MS; + cbk_size = PE_NB_CBKS_STAGE3_10MS; + } + + target_ptr = &frame[ silk_LSHIFT( sf_length, 2 ) ]; + for( k = 0; k < nb_subfr; k++ ) { + lag_counter = 0; + + /* Calculate the energy for first lag */ + basis_ptr = target_ptr - ( start_lag + matrix_ptr( Lag_range_ptr, k, 0, 2 ) ); + energy = silk_energy_FLP( basis_ptr, sf_length ) + 1e-3; + silk_assert( energy >= 0.0 ); + scratch_mem[lag_counter] = (silk_float)energy; + lag_counter++; + + lag_diff = ( matrix_ptr( Lag_range_ptr, k, 1, 2 ) - matrix_ptr( Lag_range_ptr, k, 0, 2 ) + 1 ); + for( i = 1; i < lag_diff; i++ ) { + /* remove part outside new window */ + energy -= basis_ptr[sf_length - i] * (double)basis_ptr[sf_length - i]; + silk_assert( energy >= 0.0 ); + + /* add part that comes into window */ + energy += basis_ptr[ -i ] * (double)basis_ptr[ -i ]; + silk_assert( energy >= 0.0 ); + silk_assert( lag_counter < SCRATCH_SIZE ); + scratch_mem[lag_counter] = (silk_float)energy; + lag_counter++; + } + + delta = matrix_ptr( Lag_range_ptr, k, 0, 2 ); + for( i = 0; i < nb_cbk_search; i++ ) { + /* Fill out the 3 dim array that stores the correlations for */ + /* each code_book vector for each start lag */ + idx = matrix_ptr( Lag_CB_ptr, k, i, cbk_size ) - delta; + for( j = 0; j < PE_NB_STAGE3_LAGS; j++ ) { + silk_assert( idx + j < SCRATCH_SIZE ); + silk_assert( idx + j < lag_counter ); + energies_st3[ k ][ i ][ j ] = scratch_mem[ idx + j ]; + silk_assert( energies_st3[ k ][ i ][ j ] >= 0.0f ); + } + } + target_ptr += sf_length; + } +} diff --git a/thirdparty/opus/silk/float/prefilter_FLP.c b/thirdparty/opus/silk/float/prefilter_FLP.c new file mode 100644 index 000000000000..8bc32fb41040 --- /dev/null +++ b/thirdparty/opus/silk/float/prefilter_FLP.c @@ -0,0 +1,206 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" +#include "tuning_parameters.h" + +/* +* Prefilter for finding Quantizer input signal +*/ +static OPUS_INLINE void silk_prefilt_FLP( + silk_prefilter_state_FLP *P, /* I/O state */ + silk_float st_res[], /* I */ + silk_float xw[], /* O */ + silk_float *HarmShapeFIR, /* I */ + silk_float Tilt, /* I */ + silk_float LF_MA_shp, /* I */ + silk_float LF_AR_shp, /* I */ + opus_int lag, /* I */ + opus_int length /* I */ +); + +static void silk_warped_LPC_analysis_filter_FLP( + silk_float state[], /* I/O State [order + 1] */ + silk_float res[], /* O Residual signal [length] */ + const silk_float coef[], /* I Coefficients [order] */ + const silk_float input[], /* I Input signal [length] */ + const silk_float lambda, /* I Warping factor */ + const opus_int length, /* I Length of input signal */ + const opus_int order /* I Filter order (even) */ +) +{ + opus_int n, i; + silk_float acc, tmp1, tmp2; + + /* Order must be even */ + silk_assert( ( order & 1 ) == 0 ); + + for( n = 0; n < length; n++ ) { + /* Output of lowpass section */ + tmp2 = state[ 0 ] + lambda * state[ 1 ]; + state[ 0 ] = input[ n ]; + /* Output of allpass section */ + tmp1 = state[ 1 ] + lambda * ( state[ 2 ] - tmp2 ); + state[ 1 ] = tmp2; + acc = coef[ 0 ] * tmp2; + /* Loop over allpass sections */ + for( i = 2; i < order; i += 2 ) { + /* Output of allpass section */ + tmp2 = state[ i ] + lambda * ( state[ i + 1 ] - tmp1 ); + state[ i ] = tmp1; + acc += coef[ i - 1 ] * tmp1; + /* Output of allpass section */ + tmp1 = state[ i + 1 ] + lambda * ( state[ i + 2 ] - tmp2 ); + state[ i + 1 ] = tmp2; + acc += coef[ i ] * tmp2; + } + state[ order ] = tmp1; + acc += coef[ order - 1 ] * tmp1; + res[ n ] = input[ n ] - acc; + } +} + +/* +* silk_prefilter. 
Main prefilter function +*/ +void silk_prefilter_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + const silk_encoder_control_FLP *psEncCtrl, /* I Encoder control FLP */ + silk_float xw[], /* O Weighted signal */ + const silk_float x[] /* I Speech signal */ +) +{ + silk_prefilter_state_FLP *P = &psEnc->sPrefilt; + opus_int j, k, lag; + silk_float HarmShapeGain, Tilt, LF_MA_shp, LF_AR_shp; + silk_float B[ 2 ]; + const silk_float *AR1_shp; + const silk_float *px; + silk_float *pxw; + silk_float HarmShapeFIR[ 3 ]; + silk_float st_res[ MAX_SUB_FRAME_LENGTH + MAX_LPC_ORDER ]; + + /* Set up pointers */ + px = x; + pxw = xw; + lag = P->lagPrev; + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + /* Update Variables that change per sub frame */ + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + lag = psEncCtrl->pitchL[ k ]; + } + + /* Noise shape parameters */ + HarmShapeGain = psEncCtrl->HarmShapeGain[ k ] * ( 1.0f - psEncCtrl->HarmBoost[ k ] ); + HarmShapeFIR[ 0 ] = 0.25f * HarmShapeGain; + HarmShapeFIR[ 1 ] = 32767.0f / 65536.0f * HarmShapeGain; + HarmShapeFIR[ 2 ] = 0.25f * HarmShapeGain; + Tilt = psEncCtrl->Tilt[ k ]; + LF_MA_shp = psEncCtrl->LF_MA_shp[ k ]; + LF_AR_shp = psEncCtrl->LF_AR_shp[ k ]; + AR1_shp = &psEncCtrl->AR1[ k * MAX_SHAPE_LPC_ORDER ]; + + /* Short term FIR filtering */ + silk_warped_LPC_analysis_filter_FLP( P->sAR_shp, st_res, AR1_shp, px, + (silk_float)psEnc->sCmn.warping_Q16 / 65536.0f, psEnc->sCmn.subfr_length, psEnc->sCmn.shapingLPCOrder ); + + /* Reduce (mainly) low frequencies during harmonic emphasis */ + B[ 0 ] = psEncCtrl->GainsPre[ k ]; + B[ 1 ] = -psEncCtrl->GainsPre[ k ] * + ( psEncCtrl->HarmBoost[ k ] * HarmShapeGain + INPUT_TILT + psEncCtrl->coding_quality * HIGH_RATE_INPUT_TILT ); + pxw[ 0 ] = B[ 0 ] * st_res[ 0 ] + B[ 1 ] * P->sHarmHP; + for( j = 1; j < psEnc->sCmn.subfr_length; j++ ) { + pxw[ j ] = B[ 0 ] * st_res[ j ] + B[ 1 ] * st_res[ j - 1 ]; + } + P->sHarmHP = st_res[ psEnc->sCmn.subfr_length - 1 ]; + + silk_prefilt_FLP( P, pxw, pxw, HarmShapeFIR, Tilt, LF_MA_shp, LF_AR_shp, lag, psEnc->sCmn.subfr_length ); + + px += psEnc->sCmn.subfr_length; + pxw += psEnc->sCmn.subfr_length; + } + P->lagPrev = psEncCtrl->pitchL[ psEnc->sCmn.nb_subfr - 1 ]; +} + +/* +* Prefilter for finding Quantizer input signal +*/ +static OPUS_INLINE void silk_prefilt_FLP( + silk_prefilter_state_FLP *P, /* I/O state */ + silk_float st_res[], /* I */ + silk_float xw[], /* O */ + silk_float *HarmShapeFIR, /* I */ + silk_float Tilt, /* I */ + silk_float LF_MA_shp, /* I */ + silk_float LF_AR_shp, /* I */ + opus_int lag, /* I */ + opus_int length /* I */ +) +{ + opus_int i; + opus_int idx, LTP_shp_buf_idx; + silk_float n_Tilt, n_LF, n_LTP; + silk_float sLF_AR_shp, sLF_MA_shp; + silk_float *LTP_shp_buf; + + /* To speed up use temp variables instead of using the struct */ + LTP_shp_buf = P->sLTP_shp; + LTP_shp_buf_idx = P->sLTP_shp_buf_idx; + sLF_AR_shp = P->sLF_AR_shp; + sLF_MA_shp = P->sLF_MA_shp; + + for( i = 0; i < length; i++ ) { + if( lag > 0 ) { + silk_assert( HARM_SHAPE_FIR_TAPS == 3 ); + idx = lag + LTP_shp_buf_idx; + n_LTP = LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 - 1) & LTP_MASK ] * HarmShapeFIR[ 0 ]; + n_LTP += LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 ) & LTP_MASK ] * HarmShapeFIR[ 1 ]; + n_LTP += LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 + 1) & LTP_MASK ] * HarmShapeFIR[ 2 ]; + } else { + n_LTP = 0; + } + + n_Tilt = sLF_AR_shp * Tilt; + n_LF = sLF_AR_shp * LF_AR_shp + sLF_MA_shp * LF_MA_shp; + + sLF_AR_shp = st_res[ i ] - n_Tilt; + 
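+        /* Both states carry over to the next sample: sLF_AR_shp feeds the  */
+        /* tilt and LF terms computed above, while sLF_MA_shp is written to  */
+        /* the LTP shaping buffer below and forms the output.                */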
sLF_MA_shp = sLF_AR_shp - n_LF; + + LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK; + LTP_shp_buf[ LTP_shp_buf_idx ] = sLF_MA_shp; + + xw[ i ] = sLF_MA_shp - n_LTP; + } + /* Copy temp variable back to state */ + P->sLF_AR_shp = sLF_AR_shp; + P->sLF_MA_shp = sLF_MA_shp; + P->sLTP_shp_buf_idx = LTP_shp_buf_idx; +} diff --git a/thirdparty/opus/silk/float/process_gains_FLP.c b/thirdparty/opus/silk/float/process_gains_FLP.c new file mode 100644 index 000000000000..c0da0dae44b0 --- /dev/null +++ b/thirdparty/opus/silk/float/process_gains_FLP.c @@ -0,0 +1,103 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" +#include "tuning_parameters.h" + +/* Processing of gains */ +void silk_process_gains_FLP( + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */ + opus_int condCoding /* I The type of conditional coding to use */ +) +{ + silk_shape_state_FLP *psShapeSt = &psEnc->sShape; + opus_int k; + opus_int32 pGains_Q16[ MAX_NB_SUBFR ]; + silk_float s, InvMaxSqrVal, gain, quant_offset; + + /* Gain reduction when LTP coding gain is high */ + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + s = 1.0f - 0.5f * silk_sigmoid( 0.25f * ( psEncCtrl->LTPredCodGain - 12.0f ) ); + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->Gains[ k ] *= s; + } + } + + /* Limit the quantized signal */ + InvMaxSqrVal = ( silk_float )( pow( 2.0f, 0.33f * ( 21.0f - psEnc->sCmn.SNR_dB_Q7 * ( 1 / 128.0f ) ) ) / psEnc->sCmn.subfr_length ); + + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + /* Soft limit on ratio residual energy and squared gains */ + gain = psEncCtrl->Gains[ k ]; + gain = ( silk_float )sqrt( gain * gain + psEncCtrl->ResNrg[ k ] * InvMaxSqrVal ); + psEncCtrl->Gains[ k ] = silk_min_float( gain, 32767.0f ); + } + + /* Prepare gains for noise shaping quantization */ + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + pGains_Q16[ k ] = (opus_int32)( psEncCtrl->Gains[ k ] * 65536.0f ); + } + + /* Save unquantized gains and gain Index */ + silk_memcpy( psEncCtrl->GainsUnq_Q16, pGains_Q16, psEnc->sCmn.nb_subfr * sizeof( opus_int32 ) ); + psEncCtrl->lastGainIndexPrev = psShapeSt->LastGainIndex; + + /* Quantize gains */ + silk_gains_quant( psEnc->sCmn.indices.GainsIndices, pGains_Q16, + &psShapeSt->LastGainIndex, condCoding == CODE_CONDITIONALLY, psEnc->sCmn.nb_subfr ); + + /* Overwrite unquantized gains with quantized gains and convert back to Q0 from Q16 */ + for( k = 0; k < psEnc->sCmn.nb_subfr; k++ ) { + psEncCtrl->Gains[ k ] = pGains_Q16[ k ] / 65536.0f; + } + + /* Set quantizer offset for voiced signals. Larger offset when LTP coding gain is low or tilt is high (ie low-pass) */ + if( psEnc->sCmn.indices.signalType == TYPE_VOICED ) { + if( psEncCtrl->LTPredCodGain + psEnc->sCmn.input_tilt_Q15 * ( 1.0f / 32768.0f ) > 1.0f ) { + psEnc->sCmn.indices.quantOffsetType = 0; + } else { + psEnc->sCmn.indices.quantOffsetType = 1; + } + } + + /* Quantizer boundary adjustment */ + quant_offset = silk_Quantization_Offsets_Q10[ psEnc->sCmn.indices.signalType >> 1 ][ psEnc->sCmn.indices.quantOffsetType ] / 1024.0f; + psEncCtrl->Lambda = LAMBDA_OFFSET + + LAMBDA_DELAYED_DECISIONS * psEnc->sCmn.nStatesDelayedDecision + + LAMBDA_SPEECH_ACT * psEnc->sCmn.speech_activity_Q8 * ( 1.0f / 256.0f ) + + LAMBDA_INPUT_QUALITY * psEncCtrl->input_quality + + LAMBDA_CODING_QUALITY * psEncCtrl->coding_quality + + LAMBDA_QUANT_OFFSET * quant_offset; + + silk_assert( psEncCtrl->Lambda > 0.0f ); + silk_assert( psEncCtrl->Lambda < 2.0f ); +} diff --git a/thirdparty/opus/silk/float/regularize_correlations_FLP.c b/thirdparty/opus/silk/float/regularize_correlations_FLP.c new file mode 100644 index 000000000000..df4612604ca0 --- /dev/null +++ b/thirdparty/opus/silk/float/regularize_correlations_FLP.c @@ -0,0 +1,48 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" + +/* Add noise to matrix diagonal */ +void silk_regularize_correlations_FLP( + silk_float *XX, /* I/O Correlation matrices */ + silk_float *xx, /* I/O Correlation values */ + const silk_float noise, /* I Noise energy to add */ + const opus_int D /* I Dimension of XX */ +) +{ + opus_int i; + + for( i = 0; i < D; i++ ) { + matrix_ptr( &XX[ 0 ], i, i, D ) += noise; + } + xx[ 0 ] += noise; +} diff --git a/thirdparty/opus/silk/float/residual_energy_FLP.c b/thirdparty/opus/silk/float/residual_energy_FLP.c new file mode 100644 index 000000000000..b2e03a86a490 --- /dev/null +++ b/thirdparty/opus/silk/float/residual_energy_FLP.c @@ -0,0 +1,117 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" + +#define MAX_ITERATIONS_RESIDUAL_NRG 10 +#define REGULARIZATION_FACTOR 1e-8f + +/* Residual energy: nrg = wxx - 2 * wXx * c + c' * wXX * c */ +silk_float silk_residual_energy_covar_FLP( /* O Weighted residual energy */ + const silk_float *c, /* I Filter coefficients */ + silk_float *wXX, /* I/O Weighted correlation matrix, reg. out */ + const silk_float *wXx, /* I Weighted correlation vector */ + const silk_float wxx, /* I Weighted correlation value */ + const opus_int D /* I Dimension */ +) +{ + opus_int i, j, k; + silk_float tmp, nrg = 0.0f, regularization; + + /* Safety checks */ + silk_assert( D >= 0 ); + + regularization = REGULARIZATION_FACTOR * ( wXX[ 0 ] + wXX[ D * D - 1 ] ); + for( k = 0; k < MAX_ITERATIONS_RESIDUAL_NRG; k++ ) { + nrg = wxx; + + tmp = 0.0f; + for( i = 0; i < D; i++ ) { + tmp += wXx[ i ] * c[ i ]; + } + nrg -= 2.0f * tmp; + + /* compute c' * wXX * c, assuming wXX is symmetric */ + for( i = 0; i < D; i++ ) { + tmp = 0.0f; + for( j = i + 1; j < D; j++ ) { + tmp += matrix_c_ptr( wXX, i, j, D ) * c[ j ]; + } + nrg += c[ i ] * ( 2.0f * tmp + matrix_c_ptr( wXX, i, i, D ) * c[ i ] ); + } + if( nrg > 0 ) { + break; + } else { + /* Add white noise */ + for( i = 0; i < D; i++ ) { + matrix_c_ptr( wXX, i, i, D ) += regularization; + } + /* Increase noise for next run */ + regularization *= 2.0f; + } + } + if( k == MAX_ITERATIONS_RESIDUAL_NRG ) { + silk_assert( nrg == 0 ); + nrg = 1.0f; + } + + return nrg; +} + +/* Calculates residual energies of input subframes where all subframes have LPC_order */ +/* of preceding samples */ +void silk_residual_energy_FLP( + silk_float nrgs[ MAX_NB_SUBFR ], /* O Residual energy per subframe */ + const silk_float x[], /* I Input signal */ + silk_float a[ 2 ][ MAX_LPC_ORDER ], /* I AR coefs for each frame half */ + const silk_float gains[], /* I Quantization gains */ + const opus_int subfr_length, /* I Subframe length */ + const opus_int nb_subfr, /* I number of subframes */ + const opus_int LPC_order /* I LPC order */ +) +{ + opus_int shift; + silk_float *LPC_res_ptr, LPC_res[ ( MAX_FRAME_LENGTH + MAX_NB_SUBFR * MAX_LPC_ORDER ) / 2 ]; + + LPC_res_ptr = LPC_res + LPC_order; + shift = LPC_order + subfr_length; + + /* Filter input to create the LPC residual for each frame half, and measure subframe energies */ + silk_LPC_analysis_filter_FLP( LPC_res, a[ 0 ], x + 0 * shift, 2 * shift, LPC_order ); + nrgs[ 0 ] = ( silk_float )( gains[ 0 ] * gains[ 0 ] * silk_energy_FLP( LPC_res_ptr + 0 * shift, subfr_length ) ); + nrgs[ 1 ] = ( silk_float )( gains[ 1 ] * gains[ 1 ] * silk_energy_FLP( LPC_res_ptr + 1 * shift, subfr_length ) ); + + if( nb_subfr == MAX_NB_SUBFR ) { + silk_LPC_analysis_filter_FLP( LPC_res, a[ 1 ], x + 2 * shift, 2 * shift, LPC_order ); + nrgs[ 2 ] = ( silk_float )( gains[ 2 ] * gains[ 2 ] * silk_energy_FLP( LPC_res_ptr + 0 * shift, subfr_length ) ); + nrgs[ 3 ] = ( 
silk_float )( gains[ 3 ] * gains[ 3 ] * silk_energy_FLP( LPC_res_ptr + 1 * shift, subfr_length ) ); + } +} diff --git a/thirdparty/opus/silk/float/scale_copy_vector_FLP.c b/thirdparty/opus/silk/float/scale_copy_vector_FLP.c new file mode 100644 index 000000000000..20db32b3b19a --- /dev/null +++ b/thirdparty/opus/silk/float/scale_copy_vector_FLP.c @@ -0,0 +1,57 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" + +/* copy and multiply a vector by a constant */ +void silk_scale_copy_vector_FLP( + silk_float *data_out, + const silk_float *data_in, + silk_float gain, + opus_int dataSize +) +{ + opus_int i, dataSize4; + + /* 4x unrolled loop */ + dataSize4 = dataSize & 0xFFFC; + for( i = 0; i < dataSize4; i += 4 ) { + data_out[ i + 0 ] = gain * data_in[ i + 0 ]; + data_out[ i + 1 ] = gain * data_in[ i + 1 ]; + data_out[ i + 2 ] = gain * data_in[ i + 2 ]; + data_out[ i + 3 ] = gain * data_in[ i + 3 ]; + } + + /* any remaining elements */ + for( ; i < dataSize; i++ ) { + data_out[ i ] = gain * data_in[ i ]; + } +} diff --git a/thirdparty/opus/silk/float/scale_vector_FLP.c b/thirdparty/opus/silk/float/scale_vector_FLP.c new file mode 100644 index 000000000000..108fdcbed54d --- /dev/null +++ b/thirdparty/opus/silk/float/scale_vector_FLP.c @@ -0,0 +1,56 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
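silk_residual_energy_covar_FLP above evaluates the quadratic form c' * wXX * c while reading only the upper triangle of the symmetric matrix: each off-diagonal product is counted twice, the diagonal once. A standalone sketch of that inner computation with a hand-checkable 2x2 case (the full form gives 2 - 1 - 1 + 3 = 3):

#include <stdio.h>

/* c' * W * c for a symmetric row-major D x D matrix, upper triangle only. */
static float quad_form_sym(const float *W, const float *c, int D)
{
    float nrg = 0.0f;
    for (int i = 0; i < D; i++) {
        float tmp = 0.0f;
        for (int j = i + 1; j < D; j++) {
            tmp += W[i * D + j] * c[j];       /* strictly upper triangle */
        }
        nrg += c[i] * (2.0f * tmp + W[i * D + i] * c[i]);
    }
    return nrg;
}

int main(void)
{
    const float W[4] = { 2.0f, 1.0f, 1.0f, 3.0f };
    const float c[2] = { 1.0f, -1.0f };
    printf("c'Wc = %.2f\n", quad_form_sym(W, c, 2));   /* prints 3.00 */
    return 0;
}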
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" + +/* multiply a vector by a constant */ +void silk_scale_vector_FLP( + silk_float *data1, + silk_float gain, + opus_int dataSize +) +{ + opus_int i, dataSize4; + + /* 4x unrolled loop */ + dataSize4 = dataSize & 0xFFFC; + for( i = 0; i < dataSize4; i += 4 ) { + data1[ i + 0 ] *= gain; + data1[ i + 1 ] *= gain; + data1[ i + 2 ] *= gain; + data1[ i + 3 ] *= gain; + } + + /* any remaining elements */ + for( ; i < dataSize; i++ ) { + data1[ i ] *= gain; + } +} diff --git a/thirdparty/opus/silk/float/schur_FLP.c b/thirdparty/opus/silk/float/schur_FLP.c new file mode 100644 index 000000000000..ee436f8351c8 --- /dev/null +++ b/thirdparty/opus/silk/float/schur_FLP.c @@ -0,0 +1,70 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
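Both scaling routines use the same unrolling idiom: `dataSize & 0xFFFC` clears the two low bits, rounding the length down to a multiple of 4 for the unrolled loop, and a scalar tail handles the remaining 0-3 elements. A self-contained copy of the in-place variant showing the split:

#include <stdio.h>

static void scale_vector(float *data, float gain, int n)
{
    int i;
    const int n4 = n & 0xFFFC;        /* n rounded down to a multiple of 4 */
    for (i = 0; i < n4; i += 4) {     /* 4x unrolled main loop */
        data[i + 0] *= gain;
        data[i + 1] *= gain;
        data[i + 2] *= gain;
        data[i + 3] *= gain;
    }
    for (; i < n; i++) {              /* 0-3 leftover elements */
        data[i] *= gain;
    }
}

int main(void)
{
    float v[7] = { 1, 2, 3, 4, 5, 6, 7 };
    scale_vector(v, 0.5f, 7);         /* 4 unrolled + 3 tail elements */
    for (int i = 0; i < 7; i++) {
        printf("%g ", v[i]);
    }
    printf("\n");
    return 0;
}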
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FLP.h" + +silk_float silk_schur_FLP( /* O returns residual energy */ + silk_float refl_coef[], /* O reflection coefficients (length order) */ + const silk_float auto_corr[], /* I autocorrelation sequence (length order+1) */ + opus_int order /* I order */ +) +{ + opus_int k, n; + silk_float C[ SILK_MAX_ORDER_LPC + 1 ][ 2 ]; + silk_float Ctmp1, Ctmp2, rc_tmp; + + silk_assert( order==6||order==8||order==10||order==12||order==14||order==16 ); + + /* Copy correlations */ + for( k = 0; k < order+1; k++ ) { + C[ k ][ 0 ] = C[ k ][ 1 ] = auto_corr[ k ]; + } + + for( k = 0; k < order; k++ ) { + /* Get reflection coefficient */ + rc_tmp = -C[ k + 1 ][ 0 ] / silk_max_float( C[ 0 ][ 1 ], 1e-9f ); + + /* Save the output */ + refl_coef[ k ] = rc_tmp; + + /* Update correlations */ + for( n = 0; n < order - k; n++ ) { + Ctmp1 = C[ n + k + 1 ][ 0 ]; + Ctmp2 = C[ n ][ 1 ]; + C[ n + k + 1 ][ 0 ] = Ctmp1 + Ctmp2 * rc_tmp; + C[ n ][ 1 ] = Ctmp2 + Ctmp1 * rc_tmp; + } + } + + /* Return residual energy */ + return C[ 0 ][ 1 ]; +} + diff --git a/thirdparty/opus/silk/float/solve_LS_FLP.c b/thirdparty/opus/silk/float/solve_LS_FLP.c new file mode 100644 index 000000000000..7c90d665a0f0 --- /dev/null +++ b/thirdparty/opus/silk/float/solve_LS_FLP.c @@ -0,0 +1,207 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
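silk_schur_FLP above runs the Schur recursion on an autocorrelation sequence, producing reflection coefficients and leaving the prediction residual energy in C[0][1]. The toy program below applies the same recursion, minus the silk_* wrappers and order restrictions, to the autocorrelation r[k] = a^k of a first-order AR process; the expected outputs are rc = { -a, 0 } and residual energy 1 - a^2:

#include <stdio.h>

#define ORDER 2

int main(void)
{
    const float a = 0.9f;
    float C[ORDER + 1][2], rc[ORDER];

    for (int k = 0; k <= ORDER; k++) {
        float r = 1.0f;
        for (int j = 0; j < k; j++) {
            r *= a;                       /* r[k] = a^k */
        }
        C[k][0] = C[k][1] = r;
    }
    for (int k = 0; k < ORDER; k++) {
        rc[k] = -C[k + 1][0] / (C[0][1] > 1e-9f ? C[0][1] : 1e-9f);
        for (int n = 0; n < ORDER - k; n++) {
            float c1 = C[n + k + 1][0], c2 = C[n][1];
            C[n + k + 1][0] = c1 + c2 * rc[k];
            C[n][1]         = c2 + c1 * rc[k];
        }
    }
    printf("rc = { %.3f, %.3f }, residual = %.3f (expect %.3f)\n",
           rc[0], rc[1], C[0][1], 1.0f - a * a);
    return 0;
}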
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main_FLP.h"
+#include "tuning_parameters.h"
+
+/**********************************************************************
+ * LDL Factorisation. Finds the lower triangular matrix L and the diagonal
+ * matrix D (only the diagonal elements returned in a vector) such that
+ * the symmetric matrix A is given by A = L*D*L'.
+ **********************************************************************/
+static OPUS_INLINE void silk_LDL_FLP(
+ silk_float *A, /* I/O Pointer to Symmetric Square Matrix */
+ opus_int M, /* I Size of Matrix */
+ silk_float *L, /* I/O Pointer to Square Lower triangular Matrix */
+ silk_float *Dinv /* I/O Pointer to vector holding the inverse diagonal elements of D */
+);
+
+/**********************************************************************
+ * Function to solve linear equation Ax = b, when A is an MxM lower
+ * triangular matrix, with ones on the diagonal.
+ **********************************************************************/
+static OPUS_INLINE void silk_SolveWithLowerTriangularWdiagOnes_FLP(
+ const silk_float *L, /* I Pointer to Lower Triangular Matrix */
+ opus_int M, /* I Dim of Matrix equation */
+ const silk_float *b, /* I b Vector */
+ silk_float *x /* O x Vector */
+);
+
+/**********************************************************************
+ * Function to solve linear equation (A^T)x = b, when A is an MxM lower
+ * triangular matrix, with ones on the diagonal. (i.e. A^T is then upper triangular)
+ **********************************************************************/
+static OPUS_INLINE void silk_SolveWithUpperTriangularFromLowerWdiagOnes_FLP(
+ const silk_float *L, /* I Pointer to Lower Triangular Matrix */
+ opus_int M, /* I Dim of Matrix equation */
+ const silk_float *b, /* I b Vector */
+ silk_float *x /* O x Vector */
+);
+
+/**********************************************************************
+ * Function to solve linear equation Ax = b, when A is an MxM
+ * symmetric square matrix - using LDL factorisation
+ **********************************************************************/
+void silk_solve_LDL_FLP(
+ silk_float *A, /* I/O Symmetric square matrix, out: reg. */
+ const opus_int M, /* I Size of matrix */
+ const silk_float *b, /* I Pointer to b vector */
+ silk_float *x /* O Pointer to x solution vector */
+)
+{
+ opus_int i;
+ silk_float L[ MAX_MATRIX_SIZE ][ MAX_MATRIX_SIZE ];
+ silk_float T[ MAX_MATRIX_SIZE ];
+ silk_float Dinv[ MAX_MATRIX_SIZE ]; /* inverse diagonal elements of D */
+
+ silk_assert( M <= MAX_MATRIX_SIZE );
+
+ /***************************************************
+ Factorize A by LDL such that A = L*D*(L^T),
+ where L is lower triangular with ones on diagonal
+ ****************************************************/
+ silk_LDL_FLP( A, M, &L[ 0 ][ 0 ], Dinv );
+
+ /****************************************************
+ * substitute D*(L^T) = T.
i.e.:
+ L*D*(L^T)*x = b => L*T = b <=> T = inv(L)*b
+ ******************************************************/
+ silk_SolveWithLowerTriangularWdiagOnes_FLP( &L[ 0 ][ 0 ], M, b, T );
+
+ /****************************************************
+ D*(L^T)*x = T <=> (L^T)*x = inv(D)*T; because D is
+ diagonal, just multiply with 1/d_i
+ ****************************************************/
+ for( i = 0; i < M; i++ ) {
+ T[ i ] = T[ i ] * Dinv[ i ];
+ }
+ /****************************************************
+ x = inv(L') * inv(D) * T
+ *****************************************************/
+ silk_SolveWithUpperTriangularFromLowerWdiagOnes_FLP( &L[ 0 ][ 0 ], M, T, x );
+}
+
+static OPUS_INLINE void silk_SolveWithUpperTriangularFromLowerWdiagOnes_FLP(
+ const silk_float *L, /* I Pointer to Lower Triangular Matrix */
+ opus_int M, /* I Dim of Matrix equation */
+ const silk_float *b, /* I b Vector */
+ silk_float *x /* O x Vector */
+)
+{
+ opus_int i, j;
+ silk_float temp;
+ const silk_float *ptr1;
+
+ for( i = M - 1; i >= 0; i-- ) {
+ ptr1 = matrix_adr( L, 0, i, M );
+ temp = 0;
+ for( j = M - 1; j > i ; j-- ) {
+ temp += ptr1[ j * M ] * x[ j ];
+ }
+ temp = b[ i ] - temp;
+ x[ i ] = temp;
+ }
+}
+
+static OPUS_INLINE void silk_SolveWithLowerTriangularWdiagOnes_FLP(
+ const silk_float *L, /* I Pointer to Lower Triangular Matrix */
+ opus_int M, /* I Dim of Matrix equation */
+ const silk_float *b, /* I b Vector */
+ silk_float *x /* O x Vector */
+)
+{
+ opus_int i, j;
+ silk_float temp;
+ const silk_float *ptr1;
+
+ for( i = 0; i < M; i++ ) {
+ ptr1 = matrix_adr( L, i, 0, M );
+ temp = 0;
+ for( j = 0; j < i; j++ ) {
+ temp += ptr1[ j ] * x[ j ];
+ }
+ temp = b[ i ] - temp;
+ x[ i ] = temp;
+ }
+}
+
+static OPUS_INLINE void silk_LDL_FLP(
+ silk_float *A, /* I/O Pointer to Symmetric Square Matrix */
+ opus_int M, /* I Size of Matrix */
+ silk_float *L, /* I/O Pointer to Square Lower triangular Matrix */
+ silk_float *Dinv /* I/O Pointer to vector holding the inverse diagonal elements of D */
+)
+{
+ opus_int i, j, k, loop_count, err = 1;
+ silk_float *ptr1, *ptr2;
+ double temp, diag_min_value;
+ silk_float v[ MAX_MATRIX_SIZE ], D[ MAX_MATRIX_SIZE ]; /* temp arrays */
+
+ silk_assert( M <= MAX_MATRIX_SIZE );
+
+ diag_min_value = FIND_LTP_COND_FAC * 0.5f * ( A[ 0 ] + A[ M * M - 1 ] );
+ for( loop_count = 0; loop_count < M && err == 1; loop_count++ ) {
+ err = 0;
+ for( j = 0; j < M; j++ ) {
+ ptr1 = matrix_adr( L, j, 0, M );
+ temp = matrix_ptr( A, j, j, M ); /* element in row j, column j */
+ for( i = 0; i < j; i++ ) {
+ v[ i ] = ptr1[ i ] * D[ i ];
+ temp -= ptr1[ i ] * v[ i ];
+ }
+ if( temp < diag_min_value ) {
+ /* Badly conditioned matrix: add white noise and run again */
+ temp = ( loop_count + 1 ) * diag_min_value - temp;
+ for( i = 0; i < M; i++ ) {
+ matrix_ptr( A, i, i, M ) += ( silk_float )temp;
+ }
+ err = 1;
+ break;
+ }
+ D[ j ] = ( silk_float )temp;
+ Dinv[ j ] = ( silk_float )( 1.0f / temp );
+ matrix_ptr( L, j, j, M ) = 1.0f;
+
+ ptr1 = matrix_adr( A, j, 0, M );
+ ptr2 = matrix_adr( L, j + 1, 0, M);
+ for( i = j + 1; i < M; i++ ) {
+ temp = 0.0;
+ for( k = 0; k < j; k++ ) {
+ temp += ptr2[ k ] * v[ k ];
+ }
+ matrix_ptr( L, i, j, M ) = ( silk_float )( ( ptr1[ i ] - temp ) * Dinv[ j ] );
+ ptr2 += M; /* go to next column */
+ }
+ }
+ }
+ silk_assert( err == 0 );
+}
+
diff --git a/thirdparty/opus/silk/float/sort_FLP.c b/thirdparty/opus/silk/float/sort_FLP.c
new file mode 100644
index 000000000000..f08d7592c56e
--- /dev/null
+++ b/thirdparty/opus/silk/float/sort_FLP.c
@@ -0,0 +1,83 @@
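The solve above is the classic three-stage LDL pipeline: factor A = L*D*L', forward-substitute L*T = b, scale by the inverse diagonal, then back-substitute L'*x = T. Worked out for a hardcoded 2x2 SPD system (values purely illustrative):

#include <stdio.h>

int main(void)
{
    const float A[2][2] = { { 4.0f, 2.0f }, { 2.0f, 3.0f } };
    const float b[2] = { 2.0f, 5.0f };

    /* Factor A = L*D*L': L has a unit diagonal, D is diagonal */
    const float L10 = A[1][0] / A[0][0];
    const float d[2] = { A[0][0], A[1][1] - L10 * L10 * A[0][0] };

    /* Forward substitution: L * T = b */
    float T[2] = { b[0], b[1] - L10 * b[0] };

    /* Scale by the inverse diagonal: T := D^-1 * T */
    T[0] /= d[0];
    T[1] /= d[1];

    /* Back substitution: L' * x = T */
    float x[2];
    x[1] = T[1];
    x[0] = T[0] - L10 * x[1];

    /* A*x reproduces b = { 2, 5 } */
    printf("x = { %.2f, %.2f }\n", x[0], x[1]);
    return 0;
}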
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+/* Insertion sort (fast for already almost sorted arrays): */
+/* Best case: O(n) for an already sorted array */
+/* Worst case: O(n^2) for an inversely sorted array */
+
+#include "typedef.h"
+#include "SigProc_FLP.h"
+
+void silk_insertion_sort_decreasing_FLP(
+ silk_float *a, /* I/O Unsorted / Sorted vector */
+ opus_int *idx, /* O Index vector for the sorted elements */
+ const opus_int L, /* I Vector length */
+ const opus_int K /* I Number of correctly sorted positions */
+)
+{
+ silk_float value;
+ opus_int i, j;
+
+ /* Safety checks */
+ silk_assert( K > 0 );
+ silk_assert( L > 0 );
+ silk_assert( L >= K );
+
+ /* Write start indices in index vector */
+ for( i = 0; i < K; i++ ) {
+ idx[ i ] = i;
+ }
+
+ /* Sort vector elements by value, decreasing order */
+ for( i = 1; i < K; i++ ) {
+ value = a[ i ];
+ for( j = i - 1; ( j >= 0 ) && ( value > a[ j ] ); j-- ) {
+ a[ j + 1 ] = a[ j ]; /* Shift value */
+ idx[ j + 1 ] = idx[ j ]; /* Shift index */
+ }
+ a[ j + 1 ] = value; /* Write value */
+ idx[ j + 1 ] = i; /* Write index */
+ }
+
+ /* If fewer than L values are requested, check the remaining values, */
+ /* but only spend CPU to ensure that the K first values are correct */
+ for( i = K; i < L; i++ ) {
+ value = a[ i ];
+ if( value > a[ K - 1 ] ) {
+ for( j = K - 2; ( j >= 0 ) && ( value > a[ j ] ); j-- ) {
+ a[ j + 1 ] = a[ j ]; /* Shift value */
+ idx[ j + 1 ] = idx[ j ]; /* Shift index */
+ }
+ a[ j + 1 ] = value; /* Write value */
+ idx[ j + 1 ] = i; /* Write index */
+ }
+ }
+}
diff --git a/thirdparty/opus/silk/float/structs_FLP.h b/thirdparty/opus/silk/float/structs_FLP.h
new file mode 100644
index 000000000000..14d647ced27e
--- /dev/null
+++ b/thirdparty/opus/silk/float/structs_FLP.h
@@ -0,0 +1,132 @@
+/***********************************************************************
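Note that the sort above is partial: only the first K output positions are guaranteed correct, and elements beyond K are merged in only if they beat the current K-th value, so the cost stays near O(L*K) rather than O(L^2). A plain-C copy with a small top-2 demo:

#include <stdio.h>

static void sort_decreasing_topK(float *a, int *idx, int L, int K)
{
    for (int i = 0; i < K; i++) {
        idx[i] = i;
    }
    for (int i = 1; i < K; i++) {                 /* sort the first K fully */
        float v = a[i];
        int j;
        for (j = i - 1; j >= 0 && v > a[j]; j--) {
            a[j + 1] = a[j];
            idx[j + 1] = idx[j];
        }
        a[j + 1] = v;
        idx[j + 1] = i;
    }
    for (int i = K; i < L; i++) {                 /* merge only if in top K */
        float v = a[i];
        if (v > a[K - 1]) {
            int j;
            for (j = K - 2; j >= 0 && v > a[j]; j--) {
                a[j + 1] = a[j];
                idx[j + 1] = idx[j];
            }
            a[j + 1] = v;
            idx[j + 1] = i;
        }
    }
}

int main(void)
{
    float a[5] = { 0.3f, 0.9f, 0.1f, 0.7f, 0.5f };
    int idx[5];
    sort_decreasing_topK(a, idx, 5, 2);
    printf("top-2: %.1f (from %d), %.1f (from %d)\n", a[0], idx[0], a[1], idx[1]);
    return 0;
}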
+Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_STRUCTS_FLP_H +#define SILK_STRUCTS_FLP_H + +#include "typedef.h" +#include "main.h" +#include "structs.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/********************************/ +/* Noise shaping analysis state */ +/********************************/ +typedef struct { + opus_int8 LastGainIndex; + silk_float HarmBoost_smth; + silk_float HarmShapeGain_smth; + silk_float Tilt_smth; +} silk_shape_state_FLP; + +/********************************/ +/* Prefilter state */ +/********************************/ +typedef struct { + silk_float sLTP_shp[ LTP_BUF_LENGTH ]; + silk_float sAR_shp[ MAX_SHAPE_LPC_ORDER + 1 ]; + opus_int sLTP_shp_buf_idx; + silk_float sLF_AR_shp; + silk_float sLF_MA_shp; + silk_float sHarmHP; + opus_int32 rand_seed; + opus_int lagPrev; +} silk_prefilter_state_FLP; + +/********************************/ +/* Encoder state FLP */ +/********************************/ +typedef struct { + silk_encoder_state sCmn; /* Common struct, shared with fixed-point code */ + silk_shape_state_FLP sShape; /* Noise shaping state */ + silk_prefilter_state_FLP sPrefilt; /* Prefilter State */ + + /* Buffer for find pitch and noise shape analysis */ + silk_float x_buf[ 2 * MAX_FRAME_LENGTH + LA_SHAPE_MAX ];/* Buffer for find pitch and noise shape analysis */ + silk_float LTPCorr; /* Normalized correlation from pitch lag estimator */ +} silk_encoder_state_FLP; + +/************************/ +/* Encoder control FLP */ +/************************/ +typedef struct { + /* Prediction and coding parameters */ + silk_float Gains[ MAX_NB_SUBFR ]; + silk_float PredCoef[ 2 ][ MAX_LPC_ORDER ]; /* holds interpolated and final coefficients */ + silk_float LTPCoef[LTP_ORDER * MAX_NB_SUBFR]; + silk_float LTP_scale; + opus_int pitchL[ MAX_NB_SUBFR ]; + + /* Noise shaping parameters */ + silk_float AR1[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ]; + silk_float AR2[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ]; + silk_float LF_MA_shp[ 
MAX_NB_SUBFR ]; + silk_float LF_AR_shp[ MAX_NB_SUBFR ]; + silk_float GainsPre[ MAX_NB_SUBFR ]; + silk_float HarmBoost[ MAX_NB_SUBFR ]; + silk_float Tilt[ MAX_NB_SUBFR ]; + silk_float HarmShapeGain[ MAX_NB_SUBFR ]; + silk_float Lambda; + silk_float input_quality; + silk_float coding_quality; + + /* Measures */ + silk_float sparseness; + silk_float predGain; + silk_float LTPredCodGain; + silk_float ResNrg[ MAX_NB_SUBFR ]; /* Residual energy per subframe */ + + /* Parameters for CBR mode */ + opus_int32 GainsUnq_Q16[ MAX_NB_SUBFR ]; + opus_int8 lastGainIndexPrev; +} silk_encoder_control_FLP; + +/************************/ +/* Encoder Super Struct */ +/************************/ +typedef struct { + silk_encoder_state_FLP state_Fxx[ ENCODER_NUM_CHANNELS ]; + stereo_enc_state sStereo; + opus_int32 nBitsUsedLBRR; + opus_int32 nBitsExceeded; + opus_int nChannelsAPI; + opus_int nChannelsInternal; + opus_int nPrevChannelsInternal; + opus_int timeSinceSwitchAllowed_ms; + opus_int allowBandwidthSwitch; + opus_int prev_decode_only_middle; +} silk_encoder; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/silk/float/warped_autocorrelation_FLP.c b/thirdparty/opus/silk/float/warped_autocorrelation_FLP.c new file mode 100644 index 000000000000..542414f48ec5 --- /dev/null +++ b/thirdparty/opus/silk/float/warped_autocorrelation_FLP.c @@ -0,0 +1,73 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" + +/* Autocorrelations for a warped frequency axis */ +void silk_warped_autocorrelation_FLP( + silk_float *corr, /* O Result [order + 1] */ + const silk_float *input, /* I Input data to correlate */ + const silk_float warping, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +) +{ + opus_int n, i; + double tmp1, tmp2; + double state[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + double C[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + + /* Order must be even */ + silk_assert( ( order & 1 ) == 0 ); + + /* Loop over samples */ + for( n = 0; n < length; n++ ) { + tmp1 = input[ n ]; + /* Loop over allpass sections */ + for( i = 0; i < order; i += 2 ) { + /* Output of allpass section */ + tmp2 = state[ i ] + warping * ( state[ i + 1 ] - tmp1 ); + state[ i ] = tmp1; + C[ i ] += state[ 0 ] * tmp1; + /* Output of allpass section */ + tmp1 = state[ i + 1 ] + warping * ( state[ i + 2 ] - tmp2 ); + state[ i + 1 ] = tmp2; + C[ i + 1 ] += state[ 0 ] * tmp2; + } + state[ order ] = tmp1; + C[ order ] += state[ 0 ] * tmp1; + } + + /* Copy correlations in silk_float output format */ + for( i = 0; i < order + 1; i++ ) { + corr[ i ] = ( silk_float )C[ i ]; + } +} diff --git a/thirdparty/opus/silk/float/wrappers_FLP.c b/thirdparty/opus/silk/float/wrappers_FLP.c new file mode 100644 index 000000000000..6666b8efaade --- /dev/null +++ b/thirdparty/opus/silk/float/wrappers_FLP.c @@ -0,0 +1,202 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main_FLP.h" + +/* Wrappers. 
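The warped autocorrelation above replaces each unit delay of an ordinary autocorrelation with a first-order allpass section, redistributing resolution across the frequency axis; with warping = 0 every section degenerates to a pure one-sample delay and the ordinary autocorrelation comes back. A standalone check of that limiting case (fixed small order, illustrative input):

#include <stdio.h>
#include <string.h>

#define ORDER 4

static void warped_autocorr(double *C, const float *x, float warping, int len)
{
    double state[ORDER + 1] = { 0 }, tmp1, tmp2;
    memset(C, 0, (ORDER + 1) * sizeof(double));
    for (int n = 0; n < len; n++) {
        tmp1 = x[n];
        for (int i = 0; i < ORDER; i += 2) {       /* two allpass sections per pass */
            tmp2 = state[i] + warping * (state[i + 1] - tmp1);
            state[i] = tmp1;
            C[i] += state[0] * tmp1;
            tmp1 = state[i + 1] + warping * (state[i + 2] - tmp2);
            state[i + 1] = tmp2;
            C[i + 1] += state[0] * tmp2;
        }
        state[ORDER] = tmp1;
        C[ORDER] += state[0] * tmp1;
    }
}

int main(void)
{
    const float x[6] = { 1, 2, 3, 4, 5, 6 };
    double C[ORDER + 1];
    warped_autocorr(C, x, 0.0f, 6);
    for (int k = 0; k <= ORDER; k++) {
        /* With warping = 0, C[k] equals the sum over n of x[n]*x[n-k] */
        printf("C[%d] = %g\n", k, C[k]);
    }
    return 0;
}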
Calls flp / fix code */
+
+/* Convert AR filter coefficients to NLSF parameters */
+void silk_A2NLSF_FLP(
+ opus_int16 *NLSF_Q15, /* O NLSF vector [ LPC_order ] */
+ const silk_float *pAR, /* I LPC coefficients [ LPC_order ] */
+ const opus_int LPC_order /* I LPC order */
+)
+{
+ opus_int i;
+ opus_int32 a_fix_Q16[ MAX_LPC_ORDER ];
+
+ for( i = 0; i < LPC_order; i++ ) {
+ a_fix_Q16[ i ] = silk_float2int( pAR[ i ] * 65536.0f );
+ }
+
+ silk_A2NLSF( NLSF_Q15, a_fix_Q16, LPC_order );
+}
+
+/* Convert LSF parameters to AR prediction filter coefficients */
+void silk_NLSF2A_FLP(
+ silk_float *pAR, /* O LPC coefficients [ LPC_order ] */
+ const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */
+ const opus_int LPC_order /* I LPC order */
+)
+{
+ opus_int i;
+ opus_int16 a_fix_Q12[ MAX_LPC_ORDER ];
+
+ silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order );
+
+ for( i = 0; i < LPC_order; i++ ) {
+ pAR[ i ] = ( silk_float )a_fix_Q12[ i ] * ( 1.0f / 4096.0f );
+ }
+}
+
+/******************************************/
+/* Floating-point NLSF processing wrapper */
+/******************************************/
+void silk_process_NLSFs_FLP(
+ silk_encoder_state *psEncC, /* I/O Encoder state */
+ silk_float PredCoef[ 2 ][ MAX_LPC_ORDER ], /* O Prediction coefficients */
+ opus_int16 NLSF_Q15[ MAX_LPC_ORDER ], /* I/O Normalized LSFs (quant out) (0 - (2^15-1)) */
+ const opus_int16 prev_NLSF_Q15[ MAX_LPC_ORDER ] /* I Previous Normalized LSFs (0 - (2^15-1)) */
+)
+{
+ opus_int i, j;
+ opus_int16 PredCoef_Q12[ 2 ][ MAX_LPC_ORDER ];
+
+ silk_process_NLSFs( psEncC, PredCoef_Q12, NLSF_Q15, prev_NLSF_Q15);
+
+ for( j = 0; j < 2; j++ ) {
+ for( i = 0; i < psEncC->predictLPCOrder; i++ ) {
+ PredCoef[ j ][ i ] = ( silk_float )PredCoef_Q12[ j ][ i ] * ( 1.0f / 4096.0f );
+ }
+ }
+}
+
+/****************************************/
+/* Floating-point Silk NSQ wrapper */
+/****************************************/
+void silk_NSQ_wrapper_FLP(
+ silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */
+ silk_encoder_control_FLP *psEncCtrl, /* I/O Encoder control FLP */
+ SideInfoIndices *psIndices, /* I/O Quantization indices */
+ silk_nsq_state *psNSQ, /* I/O Noise Shaping Quantization state */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const silk_float x[] /* I Prefiltered input signal */
+)
+{
+ opus_int i, j;
+ opus_int32 x_Q3[ MAX_FRAME_LENGTH ];
+ opus_int32 Gains_Q16[ MAX_NB_SUBFR ];
+ silk_DWORD_ALIGN opus_int16 PredCoef_Q12[ 2 ][ MAX_LPC_ORDER ];
+ opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ];
+ opus_int LTP_scale_Q14;
+
+ /* Noise shaping parameters */
+ opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ];
+ opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ]; /* Packs two int16 coefficients per int32 value */
+ opus_int Lambda_Q10;
+ opus_int Tilt_Q14[ MAX_NB_SUBFR ];
+ opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ];
+
+ /* Convert control struct to fix control struct */
+ /* Noise shape parameters */
+ for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) {
+ for( j = 0; j < psEnc->sCmn.shapingLPCOrder; j++ ) {
+ AR2_Q13[ i * MAX_SHAPE_LPC_ORDER + j ] = silk_float2int( psEncCtrl->AR2[ i * MAX_SHAPE_LPC_ORDER + j ] * 8192.0f );
+ }
+ }
+
+ for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) {
+ LF_shp_Q14[ i ] = silk_LSHIFT32( silk_float2int( psEncCtrl->LF_AR_shp[ i ] * 16384.0f ), 16 ) |
+ (opus_uint16)silk_float2int( psEncCtrl->LF_MA_shp[ i ] * 16384.0f );
+ Tilt_Q14[ i ] = (opus_int)silk_float2int( psEncCtrl->Tilt[ i ] * 16384.0f );
+ HarmShapeGain_Q14[ i ] = (opus_int)silk_float2int( psEncCtrl->HarmShapeGain[ i ] * 16384.0f
);
+ }
+ Lambda_Q10 = ( opus_int )silk_float2int( psEncCtrl->Lambda * 1024.0f );
+
+ /* prediction and coding parameters */
+ for( i = 0; i < psEnc->sCmn.nb_subfr * LTP_ORDER; i++ ) {
+ LTPCoef_Q14[ i ] = (opus_int16)silk_float2int( psEncCtrl->LTPCoef[ i ] * 16384.0f );
+ }
+
+ for( j = 0; j < 2; j++ ) {
+ for( i = 0; i < psEnc->sCmn.predictLPCOrder; i++ ) {
+ PredCoef_Q12[ j ][ i ] = (opus_int16)silk_float2int( psEncCtrl->PredCoef[ j ][ i ] * 4096.0f );
+ }
+ }
+
+ for( i = 0; i < psEnc->sCmn.nb_subfr; i++ ) {
+ Gains_Q16[ i ] = silk_float2int( psEncCtrl->Gains[ i ] * 65536.0f );
+ silk_assert( Gains_Q16[ i ] > 0 );
+ }
+
+ if( psIndices->signalType == TYPE_VOICED ) {
+ LTP_scale_Q14 = silk_LTPScales_table_Q14[ psIndices->LTP_scaleIndex ];
+ } else {
+ LTP_scale_Q14 = 0;
+ }
+
+ /* Convert input to fix */
+ for( i = 0; i < psEnc->sCmn.frame_length; i++ ) {
+ x_Q3[ i ] = silk_float2int( 8.0f * x[ i ] );
+ }
+
+ /* Call NSQ */
+ if( psEnc->sCmn.nStatesDelayedDecision > 1 || psEnc->sCmn.warping_Q16 > 0 ) {
+ silk_NSQ_del_dec( &psEnc->sCmn, psNSQ, psIndices, x_Q3, pulses, PredCoef_Q12[ 0 ], LTPCoef_Q14,
+ AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14, psEnc->sCmn.arch );
+ } else {
+ silk_NSQ( &psEnc->sCmn, psNSQ, psIndices, x_Q3, pulses, PredCoef_Q12[ 0 ], LTPCoef_Q14,
+ AR2_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, psEncCtrl->pitchL, Lambda_Q10, LTP_scale_Q14, psEnc->sCmn.arch );
+ }
+}
+
+/***********************************************/
+/* Floating-point Silk LTP quantization wrapper */
+/***********************************************/
+void silk_quant_LTP_gains_FLP(
+ silk_float B[ MAX_NB_SUBFR * LTP_ORDER ], /* I/O (Un-)quantized LTP gains */
+ opus_int8 cbk_index[ MAX_NB_SUBFR ], /* O Codebook index */
+ opus_int8 *periodicity_index, /* O Periodicity index */
+ opus_int32 *sum_log_gain_Q7, /* I/O Cumulative max prediction gain */
+ const silk_float W[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ], /* I Error weights */
+ const opus_int mu_Q10, /* I Mu value (R/D tradeoff) */
+ const opus_int lowComplexity, /* I Flag for low complexity */
+ const opus_int nb_subfr, /* I number of subframes */
+ int arch /* I Run-time architecture */
+)
+{
+ opus_int i;
+ opus_int16 B_Q14[ MAX_NB_SUBFR * LTP_ORDER ];
+ opus_int32 W_Q18[ MAX_NB_SUBFR*LTP_ORDER*LTP_ORDER ];
+
+ for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) {
+ B_Q14[ i ] = (opus_int16)silk_float2int( B[ i ] * 16384.0f );
+ }
+ for( i = 0; i < nb_subfr * LTP_ORDER * LTP_ORDER; i++ ) {
+ W_Q18[ i ] = (opus_int32)silk_float2int( W[ i ] * 262144.0f );
+ }
+
+ silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, W_Q18, mu_Q10, lowComplexity, nb_subfr, arch );
+
+ for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) {
+ B[ i ] = (silk_float)B_Q14[ i ] * ( 1.0f / 16384.0f );
+ }
+}
diff --git a/thirdparty/opus/silk/gain_quant.c b/thirdparty/opus/silk/gain_quant.c
new file mode 100644
index 000000000000..64ccd0611bff
--- /dev/null
+++ b/thirdparty/opus/silk/gain_quant.c
@@ -0,0 +1,141 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
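One packing detail from the NSQ wrapper above: each LF_shp_Q14 entry carries two Q14 coefficients in a single opus_int32, the LF AR coefficient in the high 16 bits and the LF MA coefficient (reinterpreted as an unsigned 16-bit value) in the low 16. A round-trip sketch in plain C with made-up coefficient values; unpacking relies on the usual arithmetic right shift:

#include <stdio.h>

int main(void)
{
    const float LF_AR = 0.25f, LF_MA = -0.5f;       /* illustrative values */
    const short ar_q14 = (short)(LF_AR * 16384.0f); /* Q14 = value * 2^14 */
    const short ma_q14 = (short)(LF_MA * 16384.0f);

    /* Pack: AR in the high half, MA as an unsigned 16-bit in the low half */
    const int packed = ((int)ar_q14 << 16) | (unsigned short)ma_q14;

    /* Unpack: casts back to short recover the signed halves */
    const short ar_out = (short)(packed >> 16);
    const short ma_out = (short)(packed & 0xFFFF);
    printf("AR %.2f -> %d -> %.2f, MA %.2f -> %d -> %.2f\n",
           LF_AR, ar_out, ar_out / 16384.0f, LF_MA, ma_out, ma_out / 16384.0f);
    return 0;
}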
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +#define OFFSET ( ( MIN_QGAIN_DB * 128 ) / 6 + 16 * 128 ) +#define SCALE_Q16 ( ( 65536 * ( N_LEVELS_QGAIN - 1 ) ) / ( ( ( MAX_QGAIN_DB - MIN_QGAIN_DB ) * 128 ) / 6 ) ) +#define INV_SCALE_Q16 ( ( 65536 * ( ( ( MAX_QGAIN_DB - MIN_QGAIN_DB ) * 128 ) / 6 ) ) / ( N_LEVELS_QGAIN - 1 ) ) + +/* Gain scalar quantization with hysteresis, uniform on log scale */ +void silk_gains_quant( + opus_int8 ind[ MAX_NB_SUBFR ], /* O gain indices */ + opus_int32 gain_Q16[ MAX_NB_SUBFR ], /* I/O gains (quantized out) */ + opus_int8 *prev_ind, /* I/O last index in previous frame */ + const opus_int conditional, /* I first gain is delta coded if 1 */ + const opus_int nb_subfr /* I number of subframes */ +) +{ + opus_int k, double_step_size_threshold; + + for( k = 0; k < nb_subfr; k++ ) { + /* Convert to log scale, scale, floor() */ + ind[ k ] = silk_SMULWB( SCALE_Q16, silk_lin2log( gain_Q16[ k ] ) - OFFSET ); + + /* Round towards previous quantized gain (hysteresis) */ + if( ind[ k ] < *prev_ind ) { + ind[ k ]++; + } + ind[ k ] = silk_LIMIT_int( ind[ k ], 0, N_LEVELS_QGAIN - 1 ); + + /* Compute delta indices and limit */ + if( k == 0 && conditional == 0 ) { + /* Full index */ + ind[ k ] = silk_LIMIT_int( ind[ k ], *prev_ind + MIN_DELTA_GAIN_QUANT, N_LEVELS_QGAIN - 1 ); + *prev_ind = ind[ k ]; + } else { + /* Delta index */ + ind[ k ] = ind[ k ] - *prev_ind; + + /* Double the quantization step size for large gain increases, so that the max gain level can be reached */ + double_step_size_threshold = 2 * MAX_DELTA_GAIN_QUANT - N_LEVELS_QGAIN + *prev_ind; + if( ind[ k ] > double_step_size_threshold ) { + ind[ k ] = double_step_size_threshold + silk_RSHIFT( ind[ k ] - double_step_size_threshold + 1, 1 ); + } + + ind[ k ] = silk_LIMIT_int( ind[ k ], MIN_DELTA_GAIN_QUANT, MAX_DELTA_GAIN_QUANT ); + + /* Accumulate deltas */ + if( ind[ k ] > double_step_size_threshold ) { + *prev_ind += silk_LSHIFT( ind[ k ], 1 ) - double_step_size_threshold; + } else { + *prev_ind += ind[ k ]; + } + + /* Shift to make non-negative */ + ind[ k ] -= MIN_DELTA_GAIN_QUANT; + } + + /* Scale and convert to linear scale */ + gain_Q16[ k ] = silk_log2lin( silk_min_32( silk_SMULWB( INV_SCALE_Q16, *prev_ind ) + OFFSET, 3967 ) ); /* 
3967 = 31 in Q7 */ + } +} + +/* Gains scalar dequantization, uniform on log scale */ +void silk_gains_dequant( + opus_int32 gain_Q16[ MAX_NB_SUBFR ], /* O quantized gains */ + const opus_int8 ind[ MAX_NB_SUBFR ], /* I gain indices */ + opus_int8 *prev_ind, /* I/O last index in previous frame */ + const opus_int conditional, /* I first gain is delta coded if 1 */ + const opus_int nb_subfr /* I number of subframes */ +) +{ + opus_int k, ind_tmp, double_step_size_threshold; + + for( k = 0; k < nb_subfr; k++ ) { + if( k == 0 && conditional == 0 ) { + /* Gain index is not allowed to go down more than 16 steps (~21.8 dB) */ + *prev_ind = silk_max_int( ind[ k ], *prev_ind - 16 ); + } else { + /* Delta index */ + ind_tmp = ind[ k ] + MIN_DELTA_GAIN_QUANT; + + /* Accumulate deltas */ + double_step_size_threshold = 2 * MAX_DELTA_GAIN_QUANT - N_LEVELS_QGAIN + *prev_ind; + if( ind_tmp > double_step_size_threshold ) { + *prev_ind += silk_LSHIFT( ind_tmp, 1 ) - double_step_size_threshold; + } else { + *prev_ind += ind_tmp; + } + } + *prev_ind = silk_LIMIT_int( *prev_ind, 0, N_LEVELS_QGAIN - 1 ); + + /* Scale and convert to linear scale */ + gain_Q16[ k ] = silk_log2lin( silk_min_32( silk_SMULWB( INV_SCALE_Q16, *prev_ind ) + OFFSET, 3967 ) ); /* 3967 = 31 in Q7 */ + } +} + +/* Compute unique identifier of gain indices vector */ +opus_int32 silk_gains_ID( /* O returns unique identifier of gains */ + const opus_int8 ind[ MAX_NB_SUBFR ], /* I gain indices */ + const opus_int nb_subfr /* I number of subframes */ +) +{ + opus_int k; + opus_int32 gainsID; + + gainsID = 0; + for( k = 0; k < nb_subfr; k++ ) { + gainsID = silk_ADD_LSHIFT32( ind[ k ], gainsID, 8 ); + } + + return gainsID; +} diff --git a/thirdparty/opus/silk/init_decoder.c b/thirdparty/opus/silk/init_decoder.c new file mode 100644 index 000000000000..f887c67886fb --- /dev/null +++ b/thirdparty/opus/silk/init_decoder.c @@ -0,0 +1,56 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
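silk_gains_ID above folds the per-subframe indices into one word via gainsID = ind[k] + (gainsID << 8); with at most four subframes and indices that fit in a byte, the result is a unique fingerprint, so comparing two gain vectors becomes a single integer compare. Illustrative values:

#include <stdio.h>

int main(void)
{
    const signed char ind[4] = { 3, 17, 2, 40 };   /* hypothetical gain indices */
    int gainsID = 0;
    for (int k = 0; k < 4; k++) {
        gainsID = ind[k] + (gainsID << 8);         /* one byte per subframe */
    }
    printf("gainsID = 0x%08x\n", gainsID);         /* prints 0x03110228 */
    return 0;
}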
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+
+/************************/
+/* Init Decoder State */
+/************************/
+opus_int silk_init_decoder(
+ silk_decoder_state *psDec /* I/O Decoder state pointer */
+)
+{
+ /* Clear the entire decoder state, except anything copied */
+ silk_memset( psDec, 0, sizeof( silk_decoder_state ) );
+
+ /* Used to deactivate LSF interpolation */
+ psDec->first_frame_after_reset = 1;
+ psDec->prev_gain_Q16 = 65536;
+
+ /* Reset CNG state */
+ silk_CNG_Reset( psDec );
+
+ /* Reset PLC state */
+ silk_PLC_Reset( psDec );
+
+ return(0);
+}
+
diff --git a/thirdparty/opus/silk/init_encoder.c b/thirdparty/opus/silk/init_encoder.c
new file mode 100644
index 000000000000..65995c33fa47
--- /dev/null
+++ b/thirdparty/opus/silk/init_encoder.c
@@ -0,0 +1,64 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#ifdef FIXED_POINT +#include "main_FIX.h" +#else +#include "main_FLP.h" +#endif +#include "tuning_parameters.h" +#include "cpu_support.h" + +/*********************************/ +/* Initialize Silk Encoder state */ +/*********************************/ +opus_int silk_init_encoder( + silk_encoder_state_Fxx *psEnc, /* I/O Pointer to Silk FIX encoder state */ + int arch /* I Run-time architecture */ +) +{ + opus_int ret = 0; + + /* Clear the entire encoder state */ + silk_memset( psEnc, 0, sizeof( silk_encoder_state_Fxx ) ); + + psEnc->sCmn.arch = arch; + + psEnc->sCmn.variable_HP_smth1_Q15 = silk_LSHIFT( silk_lin2log( SILK_FIX_CONST( VARIABLE_HP_MIN_CUTOFF_HZ, 16 ) ) - ( 16 << 7 ), 8 ); + psEnc->sCmn.variable_HP_smth2_Q15 = psEnc->sCmn.variable_HP_smth1_Q15; + + /* Used to deactivate LSF interpolation, pitch prediction */ + psEnc->sCmn.first_frame_after_reset = 1; + + /* Initialize Silk VAD */ + ret += silk_VAD_Init( &psEnc->sCmn.sVAD ); + + return ret; +} diff --git a/thirdparty/opus/silk/inner_prod_aligned.c b/thirdparty/opus/silk/inner_prod_aligned.c new file mode 100644 index 000000000000..257ae9e04eb3 --- /dev/null +++ b/thirdparty/opus/silk/inner_prod_aligned.c @@ -0,0 +1,47 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +opus_int32 silk_inner_prod_aligned_scale( + const opus_int16 *const inVec1, /* I input vector 1 */ + const opus_int16 *const inVec2, /* I input vector 2 */ + const opus_int scale, /* I number of bits to shift */ + const opus_int len /* I vector lengths */ +) +{ + opus_int i; + opus_int32 sum = 0; + for( i = 0; i < len; i++ ) { + sum = silk_ADD_RSHIFT32( sum, silk_SMULBB( inVec1[ i ], inVec2[ i ] ), scale ); + } + return sum; +} diff --git a/thirdparty/opus/silk/interpolate.c b/thirdparty/opus/silk/interpolate.c new file mode 100644 index 000000000000..1bd8ca4d5327 --- /dev/null +++ b/thirdparty/opus/silk/interpolate.c @@ -0,0 +1,51 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Interpolate two vectors */ +void silk_interpolate( + opus_int16 xi[ MAX_LPC_ORDER ], /* O interpolated vector */ + const opus_int16 x0[ MAX_LPC_ORDER ], /* I first vector */ + const opus_int16 x1[ MAX_LPC_ORDER ], /* I second vector */ + const opus_int ifact_Q2, /* I interp. factor, weight on 2nd vector */ + const opus_int d /* I number of parameters */ +) +{ + opus_int i; + + silk_assert( ifact_Q2 >= 0 ); + silk_assert( ifact_Q2 <= 4 ); + + for( i = 0; i < d; i++ ) { + xi[ i ] = (opus_int16)silk_ADD_RSHIFT( x0[ i ], silk_SMULBB( x1[ i ] - x0[ i ], ifact_Q2 ), 2 ); + } +} diff --git a/thirdparty/opus/silk/lin2log.c b/thirdparty/opus/silk/lin2log.c new file mode 100644 index 000000000000..d4fe515321f0 --- /dev/null +++ b/thirdparty/opus/silk/lin2log.c @@ -0,0 +1,46 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
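silk_inner_prod_aligned_scale above right-shifts every 16x16-bit product by `scale` before accumulating, trading a little precision for headroom so the running 32-bit sum cannot overflow. A plain-C equivalent, driven with an input whose unscaled sum (4 * 9e8 = 3.6e9) would not fit in 32 bits:

#include <stdio.h>

static int inner_prod_scale(const short *v1, const short *v2, int scale, int len)
{
    int sum = 0;
    for (int i = 0; i < len; i++) {
        sum += ((int)v1[i] * (int)v2[i]) >> scale;   /* shift each product first */
    }
    return sum;
}

int main(void)
{
    const short a[4] = { 30000, 30000, 30000, 30000 };
    printf("scaled sum = %d\n", inner_prod_scale(a, a, 2, 4));  /* 900000000 */
    return 0;
}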
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +/* Approximation of 128 * log2() (very close inverse of silk_log2lin()) */ +/* Convert input to a log scale */ +opus_int32 silk_lin2log( + const opus_int32 inLin /* I input in linear scale */ +) +{ + opus_int32 lz, frac_Q7; + + silk_CLZ_FRAC( inLin, &lz, &frac_Q7 ); + + /* Piece-wise parabolic approximation */ + return silk_LSHIFT( 31 - lz, 7 ) + silk_SMLAWB( frac_Q7, silk_MUL( frac_Q7, 128 - frac_Q7 ), 179 ); +} + diff --git a/thirdparty/opus/silk/log2lin.c b/thirdparty/opus/silk/log2lin.c new file mode 100644 index 000000000000..b7c48e47407b --- /dev/null +++ b/thirdparty/opus/silk/log2lin.c @@ -0,0 +1,58 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
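silk_lin2log above approximates 128 * log2(x): the position of the leading one bit gives the integer part, the next 7 bits a linear Q7 fraction, and the silk_SMLAWB term a small parabolic correction (the constant 179 comes from the code above; silk_CLZ_FRAC is assumed to return the leading-zero count plus those 7 fraction bits). A plain-C rendering compared against libm:

#include <math.h>
#include <stdio.h>

/* Approximate 128 * log2(x) for x > 0, mirroring silk_lin2log. */
static int lin2log(int x)
{
    unsigned u = (unsigned)x;
    int lz = 0;
    while (!(u & 0x80000000u)) {          /* normalize: leading one to bit 31 */
        u <<= 1;
        lz++;
    }
    const int frac = (int)((u >> 24) & 0x7F);   /* 7 bits below the leading one */
    /* ((31 - lz) << 7) is the integer log in Q7; frac plus a parabola is the rest */
    return ((31 - lz) << 7) + frac + (int)(((long long)frac * (128 - frac) * 179) >> 16);
}

int main(void)
{
    for (int x = 3; x <= 3000000; x *= 100) {
        printf("x=%8d  approx=%5d  exact=%8.2f\n", x, lin2log(x), 128.0 * log2((double)x));
    }
    return 0;
}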
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Approximation of 2^() (very close inverse of silk_lin2log()) */ +/* Convert input to a linear scale */ +opus_int32 silk_log2lin( + const opus_int32 inLog_Q7 /* I input on log scale */ +) +{ + opus_int32 out, frac_Q7; + + if( inLog_Q7 < 0 ) { + return 0; + } else if ( inLog_Q7 >= 3967 ) { + return silk_int32_MAX; + } + + out = silk_LSHIFT( 1, silk_RSHIFT( inLog_Q7, 7 ) ); + frac_Q7 = inLog_Q7 & 0x7F; + if( inLog_Q7 < 2048 ) { + /* Piece-wise parabolic approximation */ + out = silk_ADD_RSHIFT32( out, silk_MUL( out, silk_SMLAWB( frac_Q7, silk_SMULBB( frac_Q7, 128 - frac_Q7 ), -174 ) ), 7 ); + } else { + /* Piece-wise parabolic approximation */ + out = silk_MLA( out, silk_RSHIFT( out, 7 ), silk_SMLAWB( frac_Q7, silk_SMULBB( frac_Q7, 128 - frac_Q7 ), -174 ) ); + } + return out; +} diff --git a/thirdparty/opus/silk/macros.h b/thirdparty/opus/silk/macros.h new file mode 100644 index 000000000000..d3ca34752004 --- /dev/null +++ b/thirdparty/opus/silk/macros.h @@ -0,0 +1,159 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
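Taken together, the two log-domain helpers above (lin2log.c and log2lin.c) form an integer log2/exp2 pair in Q7, 128 units per octave: whole multiples of 128 map to exact powers of two and the piece-wise parabolic term fills in the fraction. A small standalone spot check (a sketch; it assumes opus_int32 is a plain 32-bit int when linking against the two files, and the round-trip tolerance is an assumption, not a documented bound):

    #include <stdio.h>

    extern int silk_lin2log(const int inLin);      /* from lin2log.c above */
    extern int silk_log2lin(const int inLog_Q7);   /* from log2lin.c above */

    int main(void)
    {
        int q;
        printf("%d\n", silk_log2lin(9 * 128));      /* exact: 2^9 = 512 */
        printf("%d\n", silk_log2lin(9 * 128 + 64)); /* ~2^9.5, prints 724 */
        for (q = 128; q < 3800; q += 17) {
            int back = silk_lin2log(silk_log2lin(q));
            if (back < q - 3 || back > q + 3)       /* both directions are approximations */
                printf("round-trip drift: %d -> %d\n", q, back);
        }
        return 0;
    }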
+***********************************************************************/
+
+#ifndef SILK_MACROS_H
+#define SILK_MACROS_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "opus_types.h"
+#include "opus_defines.h"
+#include "arch.h"
+
+#if OPUS_GNUC_PREREQ(3, 0)
+#define opus_likely(x) (__builtin_expect(!!(x), 1))
+#define opus_unlikely(x) (__builtin_expect(!!(x), 0))
+#else
+#define opus_likely(x) (!!(x))
+#define opus_unlikely(x) (!!(x))
+#endif
+
+/* This is an OPUS_INLINE header file for general platform. */
+
+/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output has to be 32bit int */
+#if OPUS_FAST_INT64
+#define silk_SMULWB(a32, b32) ((opus_int32)(((a32) * (opus_int64)((opus_int16)(b32))) >> 16))
+#else
+#define silk_SMULWB(a32, b32) ((((a32) >> 16) * (opus_int32)((opus_int16)(b32))) + ((((a32) & 0x0000FFFF) * (opus_int32)((opus_int16)(b32))) >> 16))
+#endif
+
+/* a32 + (b32 * (opus_int32)((opus_int16)(c32))) >> 16 output has to be 32bit int */
+#if OPUS_FAST_INT64
+#define silk_SMLAWB(a32, b32, c32) ((opus_int32)((a32) + (((b32) * (opus_int64)((opus_int16)(c32))) >> 16)))
+#else
+#define silk_SMLAWB(a32, b32, c32) ((a32) + ((((b32) >> 16) * (opus_int32)((opus_int16)(c32))) + ((((b32) & 0x0000FFFF) * (opus_int32)((opus_int16)(c32))) >> 16)))
+#endif
+
+/* (a32 * (b32 >> 16)) >> 16 */
+#if OPUS_FAST_INT64
+#define silk_SMULWT(a32, b32) ((opus_int32)(((a32) * (opus_int64)((b32) >> 16)) >> 16))
+#else
+#define silk_SMULWT(a32, b32) (((a32) >> 16) * ((b32) >> 16) + ((((a32) & 0x0000FFFF) * ((b32) >> 16)) >> 16))
+#endif
+
+/* a32 + (b32 * (c32 >> 16)) >> 16 */
+#if OPUS_FAST_INT64
+#define silk_SMLAWT(a32, b32, c32) ((opus_int32)((a32) + (((b32) * ((opus_int64)(c32) >> 16)) >> 16)))
+#else
+#define silk_SMLAWT(a32, b32, c32) ((a32) + (((b32) >> 16) * ((c32) >> 16)) + ((((b32) & 0x0000FFFF) * ((c32) >> 16)) >> 16))
+#endif
+
+/* (opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)) output has to be 32bit int */
+#define silk_SMULBB(a32, b32) ((opus_int32)((opus_int16)(a32)) * (opus_int32)((opus_int16)(b32)))
+
+/* a32 + (opus_int32)((opus_int16)(b32)) * (opus_int32)((opus_int16)(c32)) output has to be 32bit int */
+#define silk_SMLABB(a32, b32, c32) ((a32) + ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32)))
+
+/* (opus_int32)((opus_int16)(a32)) * (b32 >> 16) */
+#define silk_SMULBT(a32, b32) ((opus_int32)((opus_int16)(a32)) * ((b32) >> 16))
+
+/* a32 + (opus_int32)((opus_int16)(b32)) * (c32 >> 16) */
+#define silk_SMLABT(a32, b32, c32) ((a32) + ((opus_int32)((opus_int16)(b32))) * ((c32) >> 16))
+
+/* a64 + (b32 * c32) */
+#define silk_SMLAL(a64, b32, c32) (silk_ADD64((a64), ((opus_int64)(b32) * (opus_int64)(c32))))
+
+/* (a32 * b32) >> 16 */
+#if OPUS_FAST_INT64
+#define silk_SMULWW(a32, b32) ((opus_int32)(((opus_int64)(a32) * (b32)) >> 16))
+#else
+#define silk_SMULWW(a32, b32) silk_MLA(silk_SMULWB((a32), (b32)), (a32), silk_RSHIFT_ROUND((b32), 16))
+#endif
+
+/* a32 + ((b32 * c32) >> 16) */
+#if OPUS_FAST_INT64
+#define silk_SMLAWW(a32, b32, c32) ((opus_int32)((a32) + (((opus_int64)(b32) * (c32)) >> 16)))
+#else
+#define silk_SMLAWW(a32, b32, c32) silk_MLA(silk_SMLAWB((a32), (b32), (c32)), (b32), silk_RSHIFT_ROUND((c32), 16))
+#endif
+
+/* add/subtract with output saturated */
+#define silk_ADD_SAT32(a, b) ((((opus_uint32)(a) + (opus_uint32)(b)) & 0x80000000) == 0 ? \
+ ((((a) & (b)) & 0x80000000) != 0 ? silk_int32_MIN : (a)+(b)) : \
+ ((((a) | (b)) & 0x80000000) == 0 ? 
silk_int32_MAX : (a)+(b)) ) + +#define silk_SUB_SAT32(a, b) ((((opus_uint32)(a)-(opus_uint32)(b)) & 0x80000000) == 0 ? \ + (( (a) & ((b)^0x80000000) & 0x80000000) ? silk_int32_MIN : (a)-(b)) : \ + ((((a)^0x80000000) & (b) & 0x80000000) ? silk_int32_MAX : (a)-(b)) ) + +#if defined(MIPSr1_ASM) +#include "mips/macros_mipsr1.h" +#endif + +#include "ecintrin.h" +#ifndef OVERRIDE_silk_CLZ16 +static OPUS_INLINE opus_int32 silk_CLZ16(opus_int16 in16) +{ + return 32 - EC_ILOG(in16<<16|0x8000); +} +#endif + +#ifndef OVERRIDE_silk_CLZ32 +static OPUS_INLINE opus_int32 silk_CLZ32(opus_int32 in32) +{ + return in32 ? 32 - EC_ILOG(in32) : 32; +} +#endif + +/* Row based */ +#define matrix_ptr(Matrix_base_adr, row, column, N) \ + (*((Matrix_base_adr) + ((row)*(N)+(column)))) +#define matrix_adr(Matrix_base_adr, row, column, N) \ + ((Matrix_base_adr) + ((row)*(N)+(column))) + +/* Column based */ +#ifndef matrix_c_ptr +# define matrix_c_ptr(Matrix_base_adr, row, column, M) \ + (*((Matrix_base_adr) + ((row)+(M)*(column)))) +#endif + +#ifdef OPUS_ARM_INLINE_ASM +#include "arm/macros_armv4.h" +#endif + +#ifdef OPUS_ARM_INLINE_EDSP +#include "arm/macros_armv5e.h" +#endif + +#ifdef OPUS_ARM_PRESUME_AARCH64_NEON_INTR +#include "arm/macros_arm64.h" +#endif + +#endif /* SILK_MACROS_H */ + diff --git a/thirdparty/opus/silk/main.h b/thirdparty/opus/silk/main.h new file mode 100644 index 000000000000..2f90d68f7da0 --- /dev/null +++ b/thirdparty/opus/silk/main.h @@ -0,0 +1,471 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
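The branchy sign-bit tests in silk_ADD_SAT32()/silk_SUB_SAT32() above detect overflow without ever computing an out-of-range value; they are equivalent to forming the exact 64-bit result and clamping it to the int32 range. A minimal reference (types from <stdint.h>, offered as a sketch to fuzz the macros against, not as a replacement):

    #include <stdint.h>

    static int32_t add_sat32_ref(int32_t a, int32_t b)
    {
        int64_t s = (int64_t)a + b;          /* exact, cannot overflow in 64 bits */
        if (s > INT32_MAX) return INT32_MAX; /* positive saturation */
        if (s < INT32_MIN) return INT32_MIN; /* negative saturation */
        return (int32_t)s;
    }

    static int32_t sub_sat32_ref(int32_t a, int32_t b)
    {
        int64_t d = (int64_t)a - b;
        if (d > INT32_MAX) return INT32_MAX;
        if (d < INT32_MIN) return INT32_MIN;
        return (int32_t)d;
    }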
+***********************************************************************/
+
+#ifndef SILK_MAIN_H
+#define SILK_MAIN_H
+
+#include "SigProc_FIX.h"
+#include "define.h"
+#include "structs.h"
+#include "tables.h"
+#include "PLC.h"
+#include "control.h"
+#include "debug.h"
+#include "entenc.h"
+#include "entdec.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/main_sse.h"
+#endif
+
+/* Convert Left/Right stereo signal to adaptive Mid/Side representation */
+void silk_stereo_LR_to_MS(
+ stereo_enc_state *state, /* I/O State */
+ opus_int16 x1[], /* I/O Left input signal, becomes mid signal */
+ opus_int16 x2[], /* I/O Right input signal, becomes side signal */
+ opus_int8 ix[ 2 ][ 3 ], /* O Quantization indices */
+ opus_int8 *mid_only_flag, /* O Flag: only mid signal coded */
+ opus_int32 mid_side_rates_bps[], /* O Bitrates for mid and side signals */
+ opus_int32 total_rate_bps, /* I Total bitrate */
+ opus_int prev_speech_act_Q8, /* I Speech activity level in previous frame */
+ opus_int toMono, /* I Last frame before a stereo->mono transition */
+ opus_int fs_kHz, /* I Sample rate (kHz) */
+ opus_int frame_length /* I Number of samples */
+);
+
+/* Convert adaptive Mid/Side representation to Left/Right stereo signal */
+void silk_stereo_MS_to_LR(
+ stereo_dec_state *state, /* I/O State */
+ opus_int16 x1[], /* I/O Left input signal, becomes mid signal */
+ opus_int16 x2[], /* I/O Right input signal, becomes side signal */
+ const opus_int32 pred_Q13[], /* I Predictors */
+ opus_int fs_kHz, /* I Sample rate (kHz) */
+ opus_int frame_length /* I Number of samples */
+);
+
+/* Find least-squares prediction gain for one signal based on another and quantize it */
+opus_int32 silk_stereo_find_predictor( /* O Returns predictor in Q13 */
+ opus_int32 *ratio_Q14, /* O Ratio of residual and mid energies */
+ const opus_int16 x[], /* I Basis signal */
+ const opus_int16 y[], /* I Target signal */
+ opus_int32 mid_res_amp_Q0[], /* I/O Smoothed mid, residual norms */
+ opus_int length, /* I Number of samples */
+ opus_int smooth_coef_Q16 /* I Smoothing coefficient */
+);
+
+/* Quantize mid/side predictors */
+void silk_stereo_quant_pred(
+ opus_int32 pred_Q13[], /* I/O Predictors (out: quantized) */
+ opus_int8 ix[ 2 ][ 3 ] /* O Quantization indices */
+);
+
+/* Entropy code the mid/side quantization indices */
+void silk_stereo_encode_pred(
+ ec_enc *psRangeEnc, /* I/O Compressor data structure */
+ opus_int8 ix[ 2 ][ 3 ] /* I Quantization indices */
+);
+
+/* Entropy code the mid-only flag */
+void silk_stereo_encode_mid_only(
+ ec_enc *psRangeEnc, /* I/O Compressor data structure */
+ opus_int8 mid_only_flag
+);
+
+/* Decode mid/side predictors */
+void silk_stereo_decode_pred(
+ ec_dec *psRangeDec, /* I/O Compressor data structure */
+ opus_int32 pred_Q13[] /* O Predictors */
+);
+
+/* Decode mid-only flag */
+void silk_stereo_decode_mid_only(
+ ec_dec *psRangeDec, /* I/O Compressor data structure */
+ opus_int *decode_only_mid /* O Flag that only mid channel has been coded */
+);
+
+/* Encodes signs of excitation */
+void silk_encode_signs(
+ ec_enc *psRangeEnc, /* I/O Compressor data structure */
+ const opus_int8 pulses[], /* I pulse signal */
+ opus_int length, /* I length of input */
+ const opus_int signalType, /* I Signal type */
+ const opus_int quantOffsetType, /* I Quantization offset type */
+ const opus_int sum_pulses[ MAX_NB_SHELL_BLOCKS ] /* I Sum of absolute pulses per block */
+);
+
+/* Decodes signs of excitation */
+void silk_decode_signs(
+ ec_dec *psRangeDec, /* I/O 
Compressor data structure */
+ opus_int16 pulses[], /* I/O pulse signal */
+ opus_int length, /* I length of input */
+ const opus_int signalType, /* I Signal type */
+ const opus_int quantOffsetType, /* I Quantization offset type */
+ const opus_int sum_pulses[ MAX_NB_SHELL_BLOCKS ] /* I Sum of absolute pulses per block */
+);
+
+/* Check encoder control struct */
+opus_int check_control_input(
+ silk_EncControlStruct *encControl /* I Control structure */
+);
+
+/* Control internal sampling rate */
+opus_int silk_control_audio_bandwidth(
+ silk_encoder_state *psEncC, /* I/O Pointer to Silk encoder state */
+ silk_EncControlStruct *encControl /* I Control structure */
+);
+
+/* Control SNR of residual quantizer */
+opus_int silk_control_SNR(
+ silk_encoder_state *psEncC, /* I/O Pointer to Silk encoder state */
+ opus_int32 TargetRate_bps /* I Target max bitrate (bps) */
+);
+
+/***************/
+/* Shell coder */
+/***************/
+
+/* Encode quantization indices of excitation */
+void silk_encode_pulses(
+ ec_enc *psRangeEnc, /* I/O compressor data structure */
+ const opus_int signalType, /* I Signal type */
+ const opus_int quantOffsetType, /* I quantOffsetType */
+ opus_int8 pulses[], /* I quantization indices */
+ const opus_int frame_length /* I Frame length */
+);
+
+/* Shell encoder, operates on one shell code frame of 16 pulses */
+void silk_shell_encoder(
+ ec_enc *psRangeEnc, /* I/O compressor data structure */
+ const opus_int *pulses0 /* I data: nonnegative pulse amplitudes */
+);
+
+/* Shell decoder, operates on one shell code frame of 16 pulses */
+void silk_shell_decoder(
+ opus_int16 *pulses0, /* O data: nonnegative pulse amplitudes */
+ ec_dec *psRangeDec, /* I/O Compressor data structure */
+ const opus_int pulses4 /* I number of pulses per pulse-subframe */
+);
+
+/* Gain scalar quantization with hysteresis, uniform on log scale */
+void silk_gains_quant(
+ opus_int8 ind[ MAX_NB_SUBFR ], /* O gain indices */
+ opus_int32 gain_Q16[ MAX_NB_SUBFR ], /* I/O gains (quantized out) */
+ opus_int8 *prev_ind, /* I/O last index in previous frame */
+ const opus_int conditional, /* I first gain is delta coded if 1 */
+ const opus_int nb_subfr /* I number of subframes */
+);
+
+/* Gains scalar dequantization, uniform on log scale */
+void silk_gains_dequant(
+ opus_int32 gain_Q16[ MAX_NB_SUBFR ], /* O quantized gains */
+ const opus_int8 ind[ MAX_NB_SUBFR ], /* I gain indices */
+ opus_int8 *prev_ind, /* I/O last index in previous frame */
+ const opus_int conditional, /* I first gain is delta coded if 1 */
+ const opus_int nb_subfr /* I number of subframes */
+);
+
+/* Compute unique identifier of gain indices vector */
+opus_int32 silk_gains_ID( /* O returns unique identifier of gains */
+ const opus_int8 ind[ MAX_NB_SUBFR ], /* I gain indices */
+ const opus_int nb_subfr /* I number of subframes */
+);
+
+/* Interpolate two vectors */
+void silk_interpolate(
+ opus_int16 xi[ MAX_LPC_ORDER ], /* O interpolated vector */
+ const opus_int16 x0[ MAX_LPC_ORDER ], /* I first vector */
+ const opus_int16 x1[ MAX_LPC_ORDER ], /* I second vector */
+ const opus_int ifact_Q2, /* I interp. 
factor, weight on 2nd vector */ + const opus_int d /* I number of parameters */ +); + +/* LTP tap quantizer */ +void silk_quant_LTP_gains( + opus_int16 B_Q14[ MAX_NB_SUBFR * LTP_ORDER ], /* I/O (un)quantized LTP gains */ + opus_int8 cbk_index[ MAX_NB_SUBFR ], /* O Codebook Index */ + opus_int8 *periodicity_index, /* O Periodicity Index */ + opus_int32 *sum_gain_dB_Q7, /* I/O Cumulative max prediction gain */ + const opus_int32 W_Q18[ MAX_NB_SUBFR*LTP_ORDER*LTP_ORDER ], /* I Error Weights in Q18 */ + opus_int mu_Q9, /* I Mu value (R/D tradeoff) */ + opus_int lowComplexity, /* I Flag for low complexity */ + const opus_int nb_subfr, /* I number of subframes */ + int arch /* I Run-time architecture */ +); + +/* Entropy constrained matrix-weighted VQ, for a single input data vector */ +void silk_VQ_WMat_EC_c( + opus_int8 *ind, /* O index of best codebook vector */ + opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ + opus_int *gain_Q7, /* O sum of absolute LTP coefficients */ + const opus_int16 *in_Q14, /* I input vector to be quantized */ + const opus_int32 *W_Q18, /* I weighting matrix */ + const opus_int8 *cb_Q7, /* I codebook */ + const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */ + const opus_uint8 *cl_Q5, /* I code length for each codebook vector */ + const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */ + const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ + opus_int L /* I number of vectors in codebook */ +); + +#if !defined(OVERRIDE_silk_VQ_WMat_EC) +#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ + mu_Q9, max_gain_Q7, L, arch) \ + ((void)(arch),silk_VQ_WMat_EC_c(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ + mu_Q9, max_gain_Q7, L)) +#endif + +/************************************/ +/* Noise shaping quantization (NSQ) */ +/************************************/ + +void silk_NSQ_c( + const silk_encoder_state *psEncC, /* I/O Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int32 x_Q3[], /* I Prefiltered input signal */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +); + +#if !defined(OVERRIDE_silk_NSQ) +#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_c(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +#endif + +/* Noise shaping using delayed decision */ +void silk_NSQ_del_dec_c( + const silk_encoder_state *psEncC, /* 
I/O Encoder State */ + silk_nsq_state *NSQ, /* I/O NSQ state */ + SideInfoIndices *psIndices, /* I/O Quantization Indices */ + const opus_int32 x_Q3[], /* I Prefiltered input signal */ + opus_int8 pulses[], /* O Quantized pulse signal */ + const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */ + const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ + const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ + const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ + const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ + const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ + const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ + const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ + const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ + const opus_int LTP_scale_Q14 /* I LTP state scaling */ +); + +#if !defined(OVERRIDE_silk_NSQ_del_dec) +#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +#endif + +/************/ +/* Silk VAD */ +/************/ +/* Initialize the Silk VAD */ +opus_int silk_VAD_Init( /* O Return value, 0 if success */ + silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ +); + +/* Get speech activity level in Q8 */ +opus_int silk_VAD_GetSA_Q8_c( /* O Return value, 0 if success */ + silk_encoder_state *psEncC, /* I/O Encoder state */ + const opus_int16 pIn[] /* I PCM input */ +); + +#if !defined(OVERRIDE_silk_VAD_GetSA_Q8) +#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_c(psEnC, pIn)) +#endif + +/* Low-pass filter with variable cutoff frequency based on */ +/* piece-wise linear interpolation between elliptic filters */ +/* Start by setting transition_frame_no = 1; */ +void silk_LP_variable_cutoff( + silk_LP_state *psLP, /* I/O LP filter state */ + opus_int16 *frame, /* I/O Low-pass filtered output signal */ + const opus_int frame_length /* I Frame length */ +); + +/******************/ +/* NLSF Quantizer */ +/******************/ +/* Limit, stabilize, convert and quantize NLSFs */ +void silk_process_NLSFs( + silk_encoder_state *psEncC, /* I/O Encoder state */ + opus_int16 PredCoef_Q12[ 2 ][ MAX_LPC_ORDER ], /* O Prediction coefficients */ + opus_int16 pNLSF_Q15[ MAX_LPC_ORDER ], /* I/O Normalized LSFs (quant out) (0 - (2^15-1)) */ + const opus_int16 prev_NLSFq_Q15[ MAX_LPC_ORDER ] /* I Previous Normalized LSFs (0 - (2^15-1)) */ +); + +opus_int32 silk_NLSF_encode( /* O Returns RD value in Q25 */ + opus_int8 *NLSFIndices, /* I Codebook path vector [ LPC_ORDER + 1 ] */ + opus_int16 *pNLSF_Q15, /* I/O Quantized NLSF vector [ LPC_ORDER ] */ + const silk_NLSF_CB_struct *psNLSF_CB, /* I Codebook object */ + const opus_int16 *pW_QW, /* I NLSF weight vector [ LPC_ORDER ] */ + const opus_int NLSF_mu_Q20, /* I Rate weight for the RD optimization */ + const opus_int nSurvivors, /* I Max survivors after first stage */ + const opus_int signalType /* I Signal type: 0/1/2 */ +); + +/* Compute quantization errors for an LPC_order element input vector for a VQ codebook */ +void silk_NLSF_VQ( + opus_int32 
err_Q26[], /* O Quantization errors [K] */ + const opus_int16 in_Q15[], /* I Input vectors to be quantized [LPC_order] */ + const opus_uint8 pCB_Q8[], /* I Codebook vectors [K*LPC_order] */ + const opus_int K, /* I Number of codebook vectors */ + const opus_int LPC_order /* I Number of LPCs */ +); + +/* Delayed-decision quantizer for NLSF residuals */ +opus_int32 silk_NLSF_del_dec_quant( /* O Returns RD value in Q25 */ + opus_int8 indices[], /* O Quantization indices [ order ] */ + const opus_int16 x_Q10[], /* I Input [ order ] */ + const opus_int16 w_Q5[], /* I Weights [ order ] */ + const opus_uint8 pred_coef_Q8[], /* I Backward predictor coefs [ order ] */ + const opus_int16 ec_ix[], /* I Indices to entropy coding tables [ order ] */ + const opus_uint8 ec_rates_Q5[], /* I Rates [] */ + const opus_int quant_step_size_Q16, /* I Quantization step size */ + const opus_int16 inv_quant_step_size_Q6, /* I Inverse quantization step size */ + const opus_int32 mu_Q20, /* I R/D tradeoff */ + const opus_int16 order /* I Number of input values */ +); + +/* Unpack predictor values and indices for entropy coding tables */ +void silk_NLSF_unpack( + opus_int16 ec_ix[], /* O Indices to entropy tables [ LPC_ORDER ] */ + opus_uint8 pred_Q8[], /* O LSF predictor [ LPC_ORDER ] */ + const silk_NLSF_CB_struct *psNLSF_CB, /* I Codebook object */ + const opus_int CB1_index /* I Index of vector in first LSF codebook */ +); + +/***********************/ +/* NLSF vector decoder */ +/***********************/ +void silk_NLSF_decode( + opus_int16 *pNLSF_Q15, /* O Quantized NLSF vector [ LPC_ORDER ] */ + opus_int8 *NLSFIndices, /* I Codebook path vector [ LPC_ORDER + 1 ] */ + const silk_NLSF_CB_struct *psNLSF_CB /* I Codebook object */ +); + +/****************************************************/ +/* Decoder Functions */ +/****************************************************/ +opus_int silk_init_decoder( + silk_decoder_state *psDec /* I/O Decoder state pointer */ +); + +/* Set decoder sampling rate */ +opus_int silk_decoder_set_fs( + silk_decoder_state *psDec, /* I/O Decoder state pointer */ + opus_int fs_kHz, /* I Sampling frequency (kHz) */ + opus_int32 fs_API_Hz /* I API Sampling frequency (Hz) */ +); + +/****************/ +/* Decode frame */ +/****************/ +opus_int silk_decode_frame( + silk_decoder_state *psDec, /* I/O Pointer to Silk decoder state */ + ec_dec *psRangeDec, /* I/O Compressor data structure */ + opus_int16 pOut[], /* O Pointer to output speech frame */ + opus_int32 *pN, /* O Pointer to size of output frame */ + opus_int lostFlag, /* I 0: no loss, 1 loss, 2 decode fec */ + opus_int condCoding, /* I The type of conditional coding to use */ + int arch /* I Run-time architecture */ +); + +/* Decode indices from bitstream */ +void silk_decode_indices( + silk_decoder_state *psDec, /* I/O State */ + ec_dec *psRangeDec, /* I/O Compressor data structure */ + opus_int FrameIndex, /* I Frame number */ + opus_int decode_LBRR, /* I Flag indicating LBRR data is being decoded */ + opus_int condCoding /* I The type of conditional coding to use */ +); + +/* Decode parameters from payload */ +void silk_decode_parameters( + silk_decoder_state *psDec, /* I/O State */ + silk_decoder_control *psDecCtrl, /* I/O Decoder control */ + opus_int condCoding /* I The type of conditional coding to use */ +); + +/* Core decoder. 
Performs inverse NSQ operation LTP + LPC */ +void silk_decode_core( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl, /* I Decoder control */ + opus_int16 xq[], /* O Decoded speech */ + const opus_int16 pulses[ MAX_FRAME_LENGTH ], /* I Pulse signal */ + int arch /* I Run-time architecture */ +); + +/* Decode quantization indices of excitation (Shell coding) */ +void silk_decode_pulses( + ec_dec *psRangeDec, /* I/O Compressor data structure */ + opus_int16 pulses[], /* O Excitation signal */ + const opus_int signalType, /* I Sigtype */ + const opus_int quantOffsetType, /* I quantOffsetType */ + const opus_int frame_length /* I Frame length */ +); + +/******************/ +/* CNG */ +/******************/ + +/* Reset CNG */ +void silk_CNG_Reset( + silk_decoder_state *psDec /* I/O Decoder state */ +); + +/* Updates CNG estimate, and applies the CNG when packet was lost */ +void silk_CNG( + silk_decoder_state *psDec, /* I/O Decoder state */ + silk_decoder_control *psDecCtrl, /* I/O Decoder control */ + opus_int16 frame[], /* I/O Signal */ + opus_int length /* I Length of residual */ +); + +/* Encoding of various parameters */ +void silk_encode_indices( + silk_encoder_state *psEncC, /* I/O Encoder state */ + ec_enc *psRangeEnc, /* I/O Compressor data structure */ + opus_int FrameIndex, /* I Frame number */ + opus_int encode_LBRR, /* I Flag indicating LBRR data is being encoded */ + opus_int condCoding /* I The type of conditional coding to use */ +); + +#endif diff --git a/thirdparty/opus/silk/mips/NSQ_del_dec_mipsr1.h b/thirdparty/opus/silk/mips/NSQ_del_dec_mipsr1.h new file mode 100644 index 000000000000..ad1cfe2a9b08 --- /dev/null +++ b/thirdparty/opus/silk/mips/NSQ_del_dec_mipsr1.h @@ -0,0 +1,409 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
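The silk_stereo_LR_to_MS()/silk_stereo_MS_to_LR() declarations in main.h above wrap an adaptive, predictive form of the classic mid/side transform. The non-adaptive core, as a minimal sketch (SILK additionally predicts side from mid, quantizes the predictors, and may drop the side channel entirely; the helper names here are hypothetical):

    #include <stdint.h>

    /* In-place L/R -> M/S: mid = (L + R)/2 into x1, side = (L - R)/2 into x2. */
    static void lr_to_ms_basic(int16_t x1[], int16_t x2[], int n)
    {
        int i;
        for (i = 0; i < n; i++) {
            int32_t m = ((int32_t)x1[i] + x2[i]) >> 1;
            int32_t s = ((int32_t)x1[i] - x2[i]) >> 1;
            x1[i] = (int16_t)m;
            x2[i] = (int16_t)s;
        }
    }

    /* Inverse: L = mid + side, R = mid - side (exact up to the 1-bit loss above). */
    static void ms_to_lr_basic(int16_t x1[], int16_t x2[], int n)
    {
        int i;
        for (i = 0; i < n; i++) {
            int32_t l = (int32_t)x1[i] + x2[i];
            int32_t r = (int32_t)x1[i] - x2[i];
            x1[i] = (int16_t)l;
            x2[i] = (int16_t)r;
        }
    }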
+***********************************************************************/
+
+#ifndef __NSQ_DEL_DEC_MIPSR1_H__
+#define __NSQ_DEL_DEC_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+#include "stack_alloc.h"
+
+#define OVERRIDE_silk_noise_shape_quantizer_del_dec
+static inline void silk_noise_shape_quantizer_del_dec(
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
+ opus_int signalType, /* I Signal type */
+ const opus_int32 x_Q10[], /* I */
+ opus_int8 pulses[], /* O */
+ opus_int16 xq[], /* O */
+ opus_int32 sLTP_Q15[], /* I/O LTP filter state */
+ opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
+ const opus_int16 a_Q12[], /* I Short term prediction coefs */
+ const opus_int16 b_Q14[], /* I Long term prediction coefs */
+ const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
+ opus_int lag, /* I Pitch lag */
+ opus_int32 HarmShapeFIRPacked_Q14, /* I */
+ opus_int Tilt_Q14, /* I Spectral tilt */
+ opus_int32 LF_shp_Q14, /* I */
+ opus_int32 Gain_Q16, /* I */
+ opus_int Lambda_Q10, /* I */
+ opus_int offset_Q10, /* I */
+ opus_int length, /* I Input length */
+ opus_int subfr, /* I Subframe number */
+ opus_int shapingLPCOrder, /* I Shaping LPC filter order */
+ opus_int predictLPCOrder, /* I Prediction filter order */
+ opus_int warping_Q16, /* I */
+ opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
+ opus_int *smpl_buf_idx, /* I Index to newest samples in buffers */
+ opus_int decisionDelay, /* I */
+ int arch /* I */
+)
+{
+ opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
+ opus_int32 Winner_rand_state;
+ opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
+ opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
+ opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+ opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
+ opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+ NSQ_sample_struct psSampleState[ MAX_DEL_DEC_STATES ][ 2 ];
+ NSQ_del_dec_struct *psDD;
+ NSQ_sample_struct *psSS;
+ opus_int16 b_Q14_0, b_Q14_1, b_Q14_2, b_Q14_3, b_Q14_4;
+ opus_int16 a_Q12_0, a_Q12_1, a_Q12_2, a_Q12_3, a_Q12_4, a_Q12_5, a_Q12_6;
+ opus_int16 a_Q12_7, a_Q12_8, a_Q12_9, a_Q12_10, a_Q12_11, a_Q12_12, a_Q12_13;
+ opus_int16 a_Q12_14, a_Q12_15;
+
+ opus_int32 cur, prev, next;
+
+ /*Unused.*/
+ (void)arch;
+
+ //Initialize b_Q14 variables
+ b_Q14_0 = b_Q14[ 0 ];
+ b_Q14_1 = b_Q14[ 1 ];
+ b_Q14_2 = b_Q14[ 2 ];
+ b_Q14_3 = b_Q14[ 3 ];
+ b_Q14_4 = b_Q14[ 4 ];
+
+ //Initialize a_Q12 variables
+ a_Q12_0 = a_Q12[0];
+ a_Q12_1 = a_Q12[1];
+ a_Q12_2 = a_Q12[2];
+ a_Q12_3 = a_Q12[3];
+ a_Q12_4 = a_Q12[4];
+ a_Q12_5 = a_Q12[5];
+ a_Q12_6 = a_Q12[6];
+ a_Q12_7 = a_Q12[7];
+ a_Q12_8 = a_Q12[8];
+ a_Q12_9 = a_Q12[9];
+ a_Q12_10 = a_Q12[10];
+ a_Q12_11 = a_Q12[11];
+ a_Q12_12 = a_Q12[12];
+ a_Q12_13 = a_Q12[13];
+ a_Q12_14 = a_Q12[14];
+ a_Q12_15 = a_Q12[15];
+
+ long long temp64;
+
+ silk_assert( nStatesDelayedDecision > 0 );
+
+ shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
+ pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
+ Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
+
+ for( i = 0; i < length; i++ ) {
+ /* Perform common calculations used in all states */
+
+ /* Long-term prediction */
+ if( signalType == TYPE_VOICED ) {
+ /* Unrolled loop */
+ /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
+ temp64 = __builtin_mips_mult(pred_lag_ptr[ 
0 ], b_Q14_0 ); + temp64 = __builtin_mips_madd( temp64, pred_lag_ptr[ -1 ], b_Q14_1 ); + temp64 = __builtin_mips_madd( temp64, pred_lag_ptr[ -2 ], b_Q14_2 ); + temp64 = __builtin_mips_madd( temp64, pred_lag_ptr[ -3 ], b_Q14_3 ); + temp64 = __builtin_mips_madd( temp64, pred_lag_ptr[ -4 ], b_Q14_4 ); + temp64 += 32768; + LTP_pred_Q14 = __builtin_mips_extr_w(temp64, 16); + LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */ + pred_lag_ptr++; + } else { + LTP_pred_Q14 = 0; + } + + /* Long-term shaping */ + if( lag > 0 ) { + /* Symmetric, packed FIR coefficients */ + n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); + n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */ + shp_lag_ptr++; + } else { + n_LTP_Q14 = 0; + } + + for( k = 0; k < nStatesDelayedDecision; k++ ) { + /* Delayed decision state */ + psDD = &psDelDec[ k ]; + + /* Sample state */ + psSS = psSampleState[ k ]; + + /* Generate dither */ + psDD->Seed = silk_RAND( psDD->Seed ); + + /* Pointer used in short term prediction and shaping */ + psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ]; + /* Short-term prediction */ + silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 ); + temp64 = __builtin_mips_mult(psLPC_Q14[ 0 ], a_Q12_0 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -1 ], a_Q12_1 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -2 ], a_Q12_2 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -3 ], a_Q12_3 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -4 ], a_Q12_4 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -5 ], a_Q12_5 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -6 ], a_Q12_6 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -7 ], a_Q12_7 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -8 ], a_Q12_8 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -9 ], a_Q12_9 ); + if( predictLPCOrder == 16 ) { + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -10 ], a_Q12_10 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -11 ], a_Q12_11 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -12 ], a_Q12_12 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -13 ], a_Q12_13 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -14 ], a_Q12_14 ); + temp64 = __builtin_mips_madd( temp64, psLPC_Q14[ -15 ], a_Q12_15 ); + } + temp64 += 32768; + LPC_pred_Q14 = __builtin_mips_extr_w(temp64, 16); + + LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */ + + /* Noise shape feedback */ + silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ + /* Output of lowpass section */ + tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 ); + /* Output of allpass section */ + tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 ); + psDD->sAR2_Q14[ 0 ] = tmp2; + + temp64 = __builtin_mips_mult(tmp2, AR_shp_Q13[ 0 ] ); + + prev = psDD->sAR2_Q14[ 1 ]; + + /* Loop over allpass sections */ + for( j = 2; j < shapingLPCOrder; j += 2 ) { + cur = psDD->sAR2_Q14[ j ]; + next = psDD->sAR2_Q14[ j+1 ]; + /* Output of allpass section */ + tmp2 = silk_SMLAWB( prev, cur - tmp1, warping_Q16 ); + psDD->sAR2_Q14[ j - 1 ] = tmp1; + temp64 = __builtin_mips_madd( temp64, tmp1, AR_shp_Q13[ j - 1 ] ); + temp64 = __builtin_mips_madd( temp64, tmp2, AR_shp_Q13[ j ] ); + /* Output of allpass section */ + tmp1 = silk_SMLAWB( cur, next - tmp2, warping_Q16 ); + 
psDD->sAR2_Q14[ j + 0 ] = tmp2; + prev = next; + } + psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1; + temp64 = __builtin_mips_madd( temp64, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] ); + temp64 += 32768; + n_AR_Q14 = __builtin_mips_extr_w(temp64, 16); + n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 ); /* Q11 -> Q12 */ + n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 ); /* Q12 */ + n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 ); /* Q12 -> Q14 */ + + n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 ); /* Q12 */ + n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 ); /* Q12 */ + n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 ); /* Q12 -> Q14 */ + + /* Input minus prediction plus noise feedback */ + /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ + tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ + tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */ + tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */ + tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */ + + r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */ + + /* Flip sign depending on dither */ + if ( psDD->Seed < 0 ) { + r_Q10 = -r_Q10; + } + r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 ); + + /* Find two quantization level candidates and measure their rate-distortion */ + q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); + q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); + if( q1_Q0 > 0 ) { + q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); + q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); + q2_Q10 = silk_ADD32( q1_Q10, 1024 ); + rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); + rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); + } else if( q1_Q0 == 0 ) { + q1_Q10 = offset_Q10; + q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); + rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); + rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); + } else if( q1_Q0 == -1 ) { + q2_Q10 = offset_Q10; + q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); + rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); + rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); + } else { /* q1_Q0 < -1 */ + q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); + q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); + q2_Q10 = silk_ADD32( q1_Q10, 1024 ); + rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); + rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 ); + } + rr_Q10 = silk_SUB32( r_Q10, q1_Q10 ); + rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 ); + rr_Q10 = silk_SUB32( r_Q10, q2_Q10 ); + rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 ); + + if( rd1_Q10 < rd2_Q10 ) { + psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); + psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); + psSS[ 0 ].Q_Q10 = q1_Q10; + psSS[ 1 ].Q_Q10 = q2_Q10; + } else { + psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); + psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); + psSS[ 0 ].Q_Q10 = q2_Q10; + psSS[ 1 ].Q_Q10 = q1_Q10; + } + + /* Update states for best quantization */ + + /* Quantized excitation */ + exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 ); + if ( psDD->Seed < 0 ) { + exc_Q14 = -exc_Q14; + } + + /* Add predictions */ + LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); + xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); + + /* Update states */ + sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); + psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14; + psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14; + psSS[ 0 ].xq_Q14 = xq_Q14; + + /* Update states for second 
best quantization */ + + /* Quantized excitation */ + exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 ); + if ( psDD->Seed < 0 ) { + exc_Q14 = -exc_Q14; + } + + + /* Add predictions */ + LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); + xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); + + /* Update states */ + sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); + psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); + psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14; + psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14; + psSS[ 1 ].xq_Q14 = xq_Q14; + } + + *smpl_buf_idx = ( *smpl_buf_idx - 1 ) & DECISION_DELAY_MASK; /* Index to newest samples */ + last_smple_idx = ( *smpl_buf_idx + decisionDelay ) & DECISION_DELAY_MASK; /* Index to decisionDelay old samples */ + + /* Find winner */ + RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; + Winner_ind = 0; + for( k = 1; k < nStatesDelayedDecision; k++ ) { + if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) { + RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10; + Winner_ind = k; + } + } + + /* Increase RD values of expired states */ + Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ]; + for( k = 0; k < nStatesDelayedDecision; k++ ) { + if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) { + psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 ); + psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 ); + silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 ); + } + } + + /* Find worst in first set and best in second set */ + RDmax_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; + RDmin_Q10 = psSampleState[ 0 ][ 1 ].RD_Q10; + RDmax_ind = 0; + RDmin_ind = 0; + for( k = 1; k < nStatesDelayedDecision; k++ ) { + /* find worst in first set */ + if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) { + RDmax_Q10 = psSampleState[ k ][ 0 ].RD_Q10; + RDmax_ind = k; + } + /* find best in second set */ + if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) { + RDmin_Q10 = psSampleState[ k ][ 1 ].RD_Q10; + RDmin_ind = k; + } + } + + /* Replace a state if best from second set outperforms worst in first set */ + if( RDmin_Q10 < RDmax_Q10 ) { + silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i, + ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) ); + silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) ); + } + + /* Write samples from winner to output and long-term filter states */ + psDD = &psDelDec[ Winner_ind ]; + if( subfr > 0 || i >= decisionDelay ) { + pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); + xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( + silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) ); + NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ]; + sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDD->Pred_Q15[ last_smple_idx ]; + } + NSQ->sLTP_shp_buf_idx++; + NSQ->sLTP_buf_idx++; + + /* Update states */ + for( k = 0; k < nStatesDelayedDecision; k++ ) { + psDD = &psDelDec[ k ]; + psSS = &psSampleState[ k ][ 0 ]; + psDD->LF_AR_Q14 = psSS->LF_AR_Q14; + psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14; + psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14; + psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10; + psDD->Pred_Q15[ *smpl_buf_idx ] = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 ); + psDD->Shape_Q14[ *smpl_buf_idx ] = 
psSS->sLTP_shp_Q14; + psDD->Seed = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) ); + psDD->RandState[ *smpl_buf_idx ] = psDD->Seed; + psDD->RD_Q10 = psSS->RD_Q10; + } + delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10; + } + /* Update LPC states */ + for( k = 0; k < nStatesDelayedDecision; k++ ) { + psDD = &psDelDec[ k ]; + silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); + } +} + +#endif /* __NSQ_DEL_DEC_MIPSR1_H__ */ diff --git a/thirdparty/opus/silk/mips/macros_mipsr1.h b/thirdparty/opus/silk/mips/macros_mipsr1.h new file mode 100644 index 000000000000..12ed981a6ee4 --- /dev/null +++ b/thirdparty/opus/silk/mips/macros_mipsr1.h @@ -0,0 +1,92 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
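The heart of the delayed-decision loop above is a two-candidate step: for each state, take the residual r, try the two nearest quantization levels, and charge each candidate a rate proxy (|q| * Lambda) plus the squared error. A stripped-down sketch of that step in plain C (Q10 fixed point as in the code, but without SILK's QUANT_LEVEL_ADJUST correction, dither sign flip, or range limiting; helper name hypothetical, arithmetic right shift assumed for negative values):

    #include <stdlib.h>

    /* Pick the cheaper of the two integer levels bracketing r_Q10. */
    static void two_level_rd_step(int r_Q10, int offset_Q10, int lambda_Q10,
                                  int *q_Q10_out, int *rd_Q10_out)
    {
        int q1_Q10 = (((r_Q10 - offset_Q10) >> 10) << 10) + offset_Q10; /* floor level */
        int q2_Q10 = q1_Q10 + 1024;                 /* next level up: 1.0 in Q10 */
        int e1 = r_Q10 - q1_Q10, e2 = r_Q10 - q2_Q10;
        int rd1 = abs(q1_Q10) * lambda_Q10 + e1 * e1;  /* rate proxy + distortion */
        int rd2 = abs(q2_Q10) * lambda_Q10 + e2 * e2;

        if (rd1 < rd2) { *q_Q10_out = q1_Q10; *rd_Q10_out = rd1; }
        else           { *q_Q10_out = q2_Q10; *rd_Q10_out = rd2; }
    }

The surviving candidates then feed the winner search and state-replacement logic seen above, which is what makes the decision "delayed" rather than greedy.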
+***********************************************************************/ + + +#ifndef __SILK_MACROS_MIPSR1_H__ +#define __SILK_MACROS_MIPSR1_H__ + +#define mips_clz(x) __builtin_clz(x) + +#undef silk_SMULWB +static inline int silk_SMULWB(int a, int b) +{ + long long ac; + int c; + + ac = __builtin_mips_mult(a, (opus_int32)(opus_int16)b); + c = __builtin_mips_extr_w(ac, 16); + + return c; +} + +#undef silk_SMLAWB +#define silk_SMLAWB(a32, b32, c32) ((a32) + silk_SMULWB(b32, c32)) + +#undef silk_SMULWW +static inline int silk_SMULWW(int a, int b) +{ + long long ac; + int c; + + ac = __builtin_mips_mult(a, b); + c = __builtin_mips_extr_w(ac, 16); + + return c; +} + +#undef silk_SMLAWW +static inline int silk_SMLAWW(int a, int b, int c) +{ + long long ac; + int res; + + ac = __builtin_mips_mult(b, c); + res = __builtin_mips_extr_w(ac, 16); + res += a; + + return res; +} + +#define OVERRIDE_silk_CLZ16 +static inline opus_int32 silk_CLZ16(opus_int16 in16) +{ + int re32; + opus_int32 in32 = (opus_int32 )in16; + re32 = mips_clz(in32); + re32-=16; + return re32; +} + +#define OVERRIDE_silk_CLZ32 +static inline opus_int32 silk_CLZ32(opus_int32 in32) +{ + int re32; + re32 = mips_clz(in32); + return re32; +} + +#endif /* __SILK_MACROS_MIPSR1_H__ */ diff --git a/thirdparty/opus/silk/mips/sigproc_fix_mipsr1.h b/thirdparty/opus/silk/mips/sigproc_fix_mipsr1.h new file mode 100644 index 000000000000..3b0a6953656d --- /dev/null +++ b/thirdparty/opus/silk/mips/sigproc_fix_mipsr1.h @@ -0,0 +1,65 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
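All of the MIPS overrides above follow one pattern: a 32x32-bit multiply into the 64-bit hi/lo accumulator (__builtin_mips_mult, optionally accumulated with madd), then EXTR_W to read the accumulator back shifted right by 16. In portable C that pattern reduces to a 64-bit intermediate and a shift (a sketch assuming <stdint.h> types):

    #include <stdint.h>

    /* silk_SMULWB above: multiply a by the low 16 bits of b, keep bits 16..47. */
    static int32_t smulwb_portable(int32_t a, int32_t b)
    {
        return (int32_t)(((int64_t)a * (int16_t)b) >> 16);
    }

    /* silk_SMULWW above: full 32x32 multiply, same 16-bit extract. */
    static int32_t smulww_portable(int32_t a, int32_t b)
    {
        return (int32_t)(((int64_t)a * b) >> 16);
    }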
+***********************************************************************/ + +#ifndef SILK_SIGPROC_FIX_MIPSR1_H +#define SILK_SIGPROC_FIX_MIPSR1_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#undef silk_SAT16 +static inline short int silk_SAT16(int a) +{ + int c; + c = __builtin_mips_shll_s_w(a, 16); + c = c>>16; + + return c; +} + +#undef silk_LSHIFT_SAT32 +static inline int silk_LSHIFT_SAT32(int a, int shift) +{ + int r; + + r = __builtin_mips_shll_s_w(a, shift); + + return r; +} + +#undef silk_RSHIFT_ROUND +static inline int silk_RSHIFT_ROUND(int a, int shift) +{ + int r; + + r = __builtin_mips_shra_r_w(a, shift); + return r; +} + +#endif /* SILK_SIGPROC_FIX_MIPSR1_H */ diff --git a/thirdparty/opus/silk/pitch_est_defines.h b/thirdparty/opus/silk/pitch_est_defines.h new file mode 100644 index 000000000000..e1e4b5d76866 --- /dev/null +++ b/thirdparty/opus/silk/pitch_est_defines.h @@ -0,0 +1,88 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
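The two DSP builtins used above have simple portable meanings: shll_s_w is a saturating left shift (so shifting up 16 bits and back down clamps the value to the int16 range), and shra_r_w is an arithmetic right shift that rounds by adding half an LSB first. Portable equivalents, as a sketch (assuming shift >= 1 for the rounding case):

    #include <stdint.h>

    static int16_t sat16_portable(int32_t a)        /* silk_SAT16 above */
    {
        if (a > INT16_MAX) return INT16_MAX;
        if (a < INT16_MIN) return INT16_MIN;
        return (int16_t)a;
    }

    static int32_t rshift_round_portable(int32_t a, int shift) /* silk_RSHIFT_ROUND */
    {
        return (a + (1 << (shift - 1))) >> shift;   /* add half an LSB, then shift */
    }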
+***********************************************************************/ + +#ifndef SILK_PE_DEFINES_H +#define SILK_PE_DEFINES_H + +#include "SigProc_FIX.h" + +/********************************************************/ +/* Definitions for pitch estimator */ +/********************************************************/ + +#define PE_MAX_FS_KHZ 16 /* Maximum sampling frequency used */ + +#define PE_MAX_NB_SUBFR 4 +#define PE_SUBFR_LENGTH_MS 5 /* 5 ms */ + +#define PE_LTP_MEM_LENGTH_MS ( 4 * PE_SUBFR_LENGTH_MS ) + +#define PE_MAX_FRAME_LENGTH_MS ( PE_LTP_MEM_LENGTH_MS + PE_MAX_NB_SUBFR * PE_SUBFR_LENGTH_MS ) +#define PE_MAX_FRAME_LENGTH ( PE_MAX_FRAME_LENGTH_MS * PE_MAX_FS_KHZ ) +#define PE_MAX_FRAME_LENGTH_ST_1 ( PE_MAX_FRAME_LENGTH >> 2 ) +#define PE_MAX_FRAME_LENGTH_ST_2 ( PE_MAX_FRAME_LENGTH >> 1 ) + +#define PE_MAX_LAG_MS 18 /* 18 ms -> 56 Hz */ +#define PE_MIN_LAG_MS 2 /* 2 ms -> 500 Hz */ +#define PE_MAX_LAG ( PE_MAX_LAG_MS * PE_MAX_FS_KHZ ) +#define PE_MIN_LAG ( PE_MIN_LAG_MS * PE_MAX_FS_KHZ ) + +#define PE_D_SRCH_LENGTH 24 + +#define PE_NB_STAGE3_LAGS 5 + +#define PE_NB_CBKS_STAGE2 3 +#define PE_NB_CBKS_STAGE2_EXT 11 + +#define PE_NB_CBKS_STAGE3_MAX 34 +#define PE_NB_CBKS_STAGE3_MID 24 +#define PE_NB_CBKS_STAGE3_MIN 16 + +#define PE_NB_CBKS_STAGE3_10MS 12 +#define PE_NB_CBKS_STAGE2_10MS 3 + +#define PE_SHORTLAG_BIAS 0.2f /* for logarithmic weighting */ +#define PE_PREVLAG_BIAS 0.2f /* for logarithmic weighting */ +#define PE_FLATCONTOUR_BIAS 0.05f + +#define SILK_PE_MIN_COMPLEX 0 +#define SILK_PE_MID_COMPLEX 1 +#define SILK_PE_MAX_COMPLEX 2 + +/* Tables for 20 ms frames */ +extern const opus_int8 silk_CB_lags_stage2[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE2_EXT ]; +extern const opus_int8 silk_CB_lags_stage3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ]; +extern const opus_int8 silk_Lag_range_stage3[ SILK_PE_MAX_COMPLEX + 1 ] [ PE_MAX_NB_SUBFR ][ 2 ]; +extern const opus_int8 silk_nb_cbk_searchs_stage3[ SILK_PE_MAX_COMPLEX + 1 ]; + +/* Tables for 10 ms frames */ +extern const opus_int8 silk_CB_lags_stage2_10_ms[ PE_MAX_NB_SUBFR >> 1][ 3 ]; +extern const opus_int8 silk_CB_lags_stage3_10_ms[ PE_MAX_NB_SUBFR >> 1 ][ 12 ]; +extern const opus_int8 silk_Lag_range_stage3_10_ms[ PE_MAX_NB_SUBFR >> 1 ][ 2 ]; + +#endif + diff --git a/thirdparty/opus/silk/pitch_est_tables.c b/thirdparty/opus/silk/pitch_est_tables.c new file mode 100644 index 000000000000..81a8bacaca2b --- /dev/null +++ b/thirdparty/opus/silk/pitch_est_tables.c @@ -0,0 +1,99 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "typedef.h" +#include "pitch_est_defines.h" + +const opus_int8 silk_CB_lags_stage2_10_ms[ PE_MAX_NB_SUBFR >> 1][ PE_NB_CBKS_STAGE2_10MS ] = +{ + {0, 1, 0}, + {0, 0, 1} +}; + +const opus_int8 silk_CB_lags_stage3_10_ms[ PE_MAX_NB_SUBFR >> 1 ][ PE_NB_CBKS_STAGE3_10MS ] = +{ + { 0, 0, 1,-1, 1,-1, 2,-2, 2,-2, 3,-3}, + { 0, 1, 0, 1,-1, 2,-1, 2,-2, 3,-2, 3} +}; + +const opus_int8 silk_Lag_range_stage3_10_ms[ PE_MAX_NB_SUBFR >> 1 ][ 2 ] = +{ + {-3, 7}, + {-2, 7} +}; + +const opus_int8 silk_CB_lags_stage2[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE2_EXT ] = +{ + {0, 2,-1,-1,-1, 0, 0, 1, 1, 0, 1}, + {0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0}, + {0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0}, + {0,-1, 2, 1, 0, 1, 1, 0, 0,-1,-1} +}; + +const opus_int8 silk_CB_lags_stage3[ PE_MAX_NB_SUBFR ][ PE_NB_CBKS_STAGE3_MAX ] = +{ + {0, 0, 1,-1, 0, 1,-1, 0,-1, 1,-2, 2,-2,-2, 2,-3, 2, 3,-3,-4, 3,-4, 4, 4,-5, 5,-6,-5, 6,-7, 6, 5, 8,-9}, + {0, 0, 1, 0, 0, 0, 0, 0, 0, 0,-1, 1, 0, 0, 1,-1, 0, 1,-1,-1, 1,-1, 2, 1,-1, 2,-2,-2, 2,-2, 2, 2, 3,-3}, + {0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,-1, 1, 0, 0, 2, 1,-1, 2,-1,-1, 2,-1, 2, 2,-1, 3,-2,-2,-2, 3}, + {0, 1, 0, 0, 1, 0, 1,-1, 2,-1, 2,-1, 2, 3,-2, 3,-2,-2, 4, 4,-3, 5,-3,-4, 6,-4, 6, 5,-5, 8,-6,-5,-7, 9} +}; + +const opus_int8 silk_Lag_range_stage3[ SILK_PE_MAX_COMPLEX + 1 ] [ PE_MAX_NB_SUBFR ][ 2 ] = +{ + /* Lags to search for low number of stage3 cbks */ + { + {-5,8}, + {-1,6}, + {-1,6}, + {-4,10} + }, + /* Lags to search for middle number of stage3 cbks */ + { + {-6,10}, + {-2,6}, + {-1,6}, + {-5,10} + }, + /* Lags to search for max number of stage3 cbks */ + { + {-9,12}, + {-3,7}, + {-2,7}, + {-7,13} + } +}; + +const opus_int8 silk_nb_cbk_searchs_stage3[ SILK_PE_MAX_COMPLEX + 1 ] = +{ + PE_NB_CBKS_STAGE3_MIN, + PE_NB_CBKS_STAGE3_MID, + PE_NB_CBKS_STAGE3_MAX +}; diff --git a/thirdparty/opus/silk/process_NLSFs.c b/thirdparty/opus/silk/process_NLSFs.c new file mode 100644 index 000000000000..0ab71f016340 --- /dev/null +++ b/thirdparty/opus/silk/process_NLSFs.c @@ -0,0 +1,107 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. 
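Plugging PE_MAX_FS_KHZ = 16 into the definitions from pitch_est_defines.h above gives the concrete buffer sizes the tables in pitch_est_tables.c index into. A worked check (plain C re-deriving the values, not the macros themselves):

    #include <stdio.h>

    int main(void)
    {
        const int fs_khz = 16, subfr_ms = 5, nb_subfr = 4;
        int ltp_mem_ms = 4 * subfr_ms;                   /* PE_LTP_MEM_LENGTH_MS = 20 */
        int frame_ms = ltp_mem_ms + nb_subfr * subfr_ms; /* PE_MAX_FRAME_LENGTH_MS = 40 */
        printf("frame: %d samples\n", frame_ms * fs_khz);          /* PE_MAX_FRAME_LENGTH = 640 */
        printf("lags: %d..%d samples\n", 2 * fs_khz, 18 * fs_khz); /* 32..288, i.e. 500 Hz down to 56 Hz */
        return 0;
    }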
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Limit, stabilize, convert and quantize NLSFs */ +void silk_process_NLSFs( + silk_encoder_state *psEncC, /* I/O Encoder state */ + opus_int16 PredCoef_Q12[ 2 ][ MAX_LPC_ORDER ], /* O Prediction coefficients */ + opus_int16 pNLSF_Q15[ MAX_LPC_ORDER ], /* I/O Normalized LSFs (quant out) (0 - (2^15-1)) */ + const opus_int16 prev_NLSFq_Q15[ MAX_LPC_ORDER ] /* I Previous Normalized LSFs (0 - (2^15-1)) */ +) +{ + opus_int i, doInterpolate; + opus_int NLSF_mu_Q20; + opus_int16 i_sqr_Q15; + opus_int16 pNLSF0_temp_Q15[ MAX_LPC_ORDER ]; + opus_int16 pNLSFW_QW[ MAX_LPC_ORDER ]; + opus_int16 pNLSFW0_temp_QW[ MAX_LPC_ORDER ]; + + silk_assert( psEncC->speech_activity_Q8 >= 0 ); + silk_assert( psEncC->speech_activity_Q8 <= SILK_FIX_CONST( 1.0, 8 ) ); + silk_assert( psEncC->useInterpolatedNLSFs == 1 || psEncC->indices.NLSFInterpCoef_Q2 == ( 1 << 2 ) ); + + /***********************/ + /* Calculate mu values */ + /***********************/ + /* NLSF_mu = 0.003 - 0.0015 * psEnc->speech_activity; */ + NLSF_mu_Q20 = silk_SMLAWB( SILK_FIX_CONST( 0.003, 20 ), SILK_FIX_CONST( -0.001, 28 ), psEncC->speech_activity_Q8 ); + if( psEncC->nb_subfr == 2 ) { + /* Multiply by 1.5 for 10 ms packets */ + NLSF_mu_Q20 = silk_ADD_RSHIFT( NLSF_mu_Q20, NLSF_mu_Q20, 1 ); + } + + silk_assert( NLSF_mu_Q20 > 0 ); + silk_assert( NLSF_mu_Q20 <= SILK_FIX_CONST( 0.005, 20 ) ); + + /* Calculate NLSF weights */ + silk_NLSF_VQ_weights_laroia( pNLSFW_QW, pNLSF_Q15, psEncC->predictLPCOrder ); + + /* Update NLSF weights for interpolated NLSFs */ + doInterpolate = ( psEncC->useInterpolatedNLSFs == 1 ) && ( psEncC->indices.NLSFInterpCoef_Q2 < 4 ); + if( doInterpolate ) { + /* Calculate the interpolated NLSF vector for the first half */ + silk_interpolate( pNLSF0_temp_Q15, prev_NLSFq_Q15, pNLSF_Q15, + psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder ); + + /* Calculate first half NLSF weights for the interpolated NLSFs */ + silk_NLSF_VQ_weights_laroia( pNLSFW0_temp_QW, pNLSF0_temp_Q15, psEncC->predictLPCOrder ); + + /* Update NLSF weights with contribution from first half */ + i_sqr_Q15 = silk_LSHIFT( silk_SMULBB( psEncC->indices.NLSFInterpCoef_Q2, psEncC->indices.NLSFInterpCoef_Q2 ), 11 ); + for( i = 0; i < psEncC->predictLPCOrder; i++ ) { + pNLSFW_QW[ i ] = silk_ADD16( silk_RSHIFT( pNLSFW_QW[ i ], 1 ), silk_RSHIFT( + silk_SMULBB( pNLSFW0_temp_QW[ i ], i_sqr_Q15 ), 16) ); + silk_assert( pNLSFW_QW[ i ] >= 1 ); + } + } + + silk_NLSF_encode( psEncC->indices.NLSFIndices, pNLSF_Q15, psEncC->psNLSF_CB, pNLSFW_QW, + NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors, psEncC->indices.signalType ); + + /* Convert quantized NLSFs back to LPC coefficients 
*/ + silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder ); + + if( doInterpolate ) { + /* Calculate the interpolated, quantized LSF vector for the first half */ + silk_interpolate( pNLSF0_temp_Q15, prev_NLSFq_Q15, pNLSF_Q15, + psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder ); + + /* Convert back to LPC coefficients */ + silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder ); + + } else { + /* Copy LPC coefficients for first half from second half */ + silk_assert( psEncC->predictLPCOrder <= MAX_LPC_ORDER ); + silk_memcpy( PredCoef_Q12[ 0 ], PredCoef_Q12[ 1 ], psEncC->predictLPCOrder * sizeof( opus_int16 ) ); + } +} diff --git a/thirdparty/opus/silk/quant_LTP_gains.c b/thirdparty/opus/silk/quant_LTP_gains.c new file mode 100644 index 000000000000..513a8c4468ff --- /dev/null +++ b/thirdparty/opus/silk/quant_LTP_gains.c @@ -0,0 +1,129 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
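For readers tracing the Q-format arithmetic in silk_process_NLSFs() above, here is a small standalone check of the interpolation-weight blend, using plain int arithmetic and illustrative weight values (not part of the library):

    #include <stdio.h>

    /* Mirrors: i_sqr_Q15 = silk_LSHIFT( silk_SMULBB( coef_Q2, coef_Q2 ), 11 )
       and the per-coefficient update of pNLSFW_QW[] above. */
    int main( void )
    {
        int coef_Q2   = 2;                              /* interpolation factor 0.5 in Q2   */
        int i_sqr_Q15 = ( coef_Q2 * coef_Q2 ) << 11;    /* 8192, i.e. 0.5^2 = 0.25 in Q15   */
        int w_QW      = 4096;                           /* example second-half weight       */
        int w0_QW     = 4096;                           /* example first-half weight        */
        int blended   = ( w_QW >> 1 ) + ( ( w0_QW * i_sqr_Q15 ) >> 16 );

        /* 2048 + 512 = 2560: the second half keeps weight 0.5, while the
           interpolated first half contributes with factor 0.25 / 2 = 0.125 */
        printf( "%d\n", blended );
        return 0;
    }
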
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" +#include "tuning_parameters.h" + +void silk_quant_LTP_gains( + opus_int16 B_Q14[ MAX_NB_SUBFR * LTP_ORDER ], /* I/O (un)quantized LTP gains */ + opus_int8 cbk_index[ MAX_NB_SUBFR ], /* O Codebook Index */ + opus_int8 *periodicity_index, /* O Periodicity Index */ + opus_int32 *sum_log_gain_Q7, /* I/O Cumulative max prediction gain */ + const opus_int32 W_Q18[ MAX_NB_SUBFR*LTP_ORDER*LTP_ORDER ], /* I Error Weights in Q18 */ + opus_int mu_Q9, /* I Mu value (R/D tradeoff) */ + opus_int lowComplexity, /* I Flag for low complexity */ + const opus_int nb_subfr, /* I number of subframes */ + int arch /* I Run-time architecture */ +) +{ + opus_int j, k, cbk_size; + opus_int8 temp_idx[ MAX_NB_SUBFR ]; + const opus_uint8 *cl_ptr_Q5; + const opus_int8 *cbk_ptr_Q7; + const opus_uint8 *cbk_gain_ptr_Q7; + const opus_int16 *b_Q14_ptr; + const opus_int32 *W_Q18_ptr; + opus_int32 rate_dist_Q14_subfr, rate_dist_Q14, min_rate_dist_Q14; + opus_int32 sum_log_gain_tmp_Q7, best_sum_log_gain_Q7, max_gain_Q7, gain_Q7; + + /***************************************************/ + /* iterate over different codebooks with different */ + /* rates/distortions, and choose best */ + /***************************************************/ + min_rate_dist_Q14 = silk_int32_MAX; + best_sum_log_gain_Q7 = 0; + for( k = 0; k < 3; k++ ) { + /* Safety margin for pitch gain control, to take into account factors + such as state rescaling/rewhitening. */ + opus_int32 gain_safety = SILK_FIX_CONST( 0.4, 7 ); + + cl_ptr_Q5 = silk_LTP_gain_BITS_Q5_ptrs[ k ]; + cbk_ptr_Q7 = silk_LTP_vq_ptrs_Q7[ k ]; + cbk_gain_ptr_Q7 = silk_LTP_vq_gain_ptrs_Q7[ k ]; + cbk_size = silk_LTP_vq_sizes[ k ]; + + /* Set up pointer to first subframe */ + W_Q18_ptr = W_Q18; + b_Q14_ptr = B_Q14; + + rate_dist_Q14 = 0; + sum_log_gain_tmp_Q7 = *sum_log_gain_Q7; + for( j = 0; j < nb_subfr; j++ ) { + max_gain_Q7 = silk_log2lin( ( SILK_FIX_CONST( MAX_SUM_LOG_GAIN_DB / 6.0, 7 ) - sum_log_gain_tmp_Q7 ) + + SILK_FIX_CONST( 7, 7 ) ) - gain_safety; + + silk_VQ_WMat_EC( + &temp_idx[ j ], /* O index of best codebook vector */ + &rate_dist_Q14_subfr, /* O best weighted quantization error + mu * rate */ + &gain_Q7, /* O sum of absolute LTP coefficients */ + b_Q14_ptr, /* I input vector to be quantized */ + W_Q18_ptr, /* I weighting matrix */ + cbk_ptr_Q7, /* I codebook */ + cbk_gain_ptr_Q7, /* I codebook effective gains */ + cl_ptr_Q5, /* I code length for each codebook vector */ + mu_Q9, /* I tradeoff between weighted error and rate */ + max_gain_Q7, /* I maximum sum of absolute LTP coefficients */ + cbk_size, /* I number of vectors in codebook */ + arch /* I Run-time architecture */ + ); + + rate_dist_Q14 = silk_ADD_POS_SAT32( rate_dist_Q14, rate_dist_Q14_subfr ); + sum_log_gain_tmp_Q7 = silk_max(0, sum_log_gain_tmp_Q7 + + silk_lin2log( gain_safety + gain_Q7 ) - SILK_FIX_CONST( 7, 7 )); + + b_Q14_ptr += LTP_ORDER; + W_Q18_ptr += LTP_ORDER * LTP_ORDER; + } + + /* Avoid never finding a codebook */ + rate_dist_Q14 = silk_min( silk_int32_MAX - 1, rate_dist_Q14 ); + + if( rate_dist_Q14 < min_rate_dist_Q14 ) { + min_rate_dist_Q14 = rate_dist_Q14; + *periodicity_index = (opus_int8)k; + silk_memcpy( cbk_index, temp_idx, nb_subfr * sizeof( opus_int8 ) ); + best_sum_log_gain_Q7 = sum_log_gain_tmp_Q7; + } + + /* Break early in low-complexity mode if rate distortion is below threshold */ + if( lowComplexity && ( rate_dist_Q14 < 
silk_LTP_gain_middle_avg_RD_Q14 ) ) { + break; + } + } + + cbk_ptr_Q7 = silk_LTP_vq_ptrs_Q7[ *periodicity_index ]; + for( j = 0; j < nb_subfr; j++ ) { + for( k = 0; k < LTP_ORDER; k++ ) { + B_Q14[ j * LTP_ORDER + k ] = silk_LSHIFT( cbk_ptr_Q7[ cbk_index[ j ] * LTP_ORDER + k ], 7 ); + } + } + *sum_log_gain_Q7 = best_sum_log_gain_Q7; +} diff --git a/thirdparty/opus/silk/resampler.c b/thirdparty/opus/silk/resampler.c new file mode 100644 index 000000000000..374fbb37225a --- /dev/null +++ b/thirdparty/opus/silk/resampler.c @@ -0,0 +1,215 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
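The search in silk_quant_LTP_gains() above is a standard argmin-with-early-exit pattern. Stripped to its skeleton for readability (evaluate_codebook() is a hypothetical stand-in for the silk_VQ_WMat_EC() call accumulated over nb_subfr subframes; the early break pays off because the table order suggests the cheaper codebooks are tried first):

    /* Sketch only: real per-codebook costs come from silk_VQ_WMat_EC(). */
    opus_int32 best_cost = silk_int32_MAX;
    opus_int   best_k    = 0;
    opus_int   k;

    for( k = 0; k < 3; k++ ) {
        opus_int32 cost = evaluate_codebook( k );       /* hypothetical helper */
        if( cost < best_cost ) {
            best_cost = cost;
            best_k    = k;
        }
        if( lowComplexity && cost < silk_LTP_gain_middle_avg_RD_Q14 ) {
            break;                      /* good enough; skip larger codebooks */
        }
    }
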
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+/*
+ * Matrix of resampling methods used:
+ *                                 Fs_out (kHz)
+ *                        8      12     16     24     48
+ *
+ *               8        C      UF     U      UF     UF
+ *              12        AF     C      UF     U      UF
+ * Fs_in (kHz)  16        D      AF     C      UF     UF
+ *              24        AF     D      AF     C      U
+ *              48        AF     AF     AF     D      C
+ *
+ * C   -> Copy (no resampling)
+ * D   -> Allpass-based 2x downsampling
+ * U   -> Allpass-based 2x upsampling
+ * UF  -> Allpass-based 2x upsampling followed by FIR interpolation
+ * AF  -> AR2 filter followed by FIR interpolation
+ */
+
+#include "resampler_private.h"
+
+/* Tables with delay compensation values to equalize total delay for different modes */
+static const opus_int8 delay_matrix_enc[ 5 ][ 3 ] = {
+/* in  \ out  8  12  16 */
+/*  8 */   {  6,  0,  3 },
+/* 12 */   {  0,  7,  3 },
+/* 16 */   {  0,  1, 10 },
+/* 24 */   {  0,  2,  6 },
+/* 48 */   { 18, 10, 12 }
+};
+
+static const opus_int8 delay_matrix_dec[ 3 ][ 5 ] = {
+/* in  \ out  8  12  16  24  48 */
+/*  8 */   {  4,  0,  2,  0,  0 },
+/* 12 */   {  0,  9,  4,  7,  4 },
+/* 16 */   {  0,  3, 12,  7,  7 }
+};
+
+/* Simple way to map [8000, 12000, 16000, 24000, 48000] to [0, 1, 2, 3, 4] */
+#define rateID(R) ( ( ( ((R)>>12) - ((R)>16000) ) >> ((R)>24000) ) - 1 )
+
+#define USE_silk_resampler_copy                     (0)
+#define USE_silk_resampler_private_up2_HQ_wrapper   (1)
+#define USE_silk_resampler_private_IIR_FIR          (2)
+#define USE_silk_resampler_private_down_FIR         (3)
+
+/* Initialize/reset the resampler state for a given pair of input/output sampling rates */
+opus_int silk_resampler_init(
+    silk_resampler_state_struct *S,                 /* I/O  Resampler state                 */
+    opus_int32                  Fs_Hz_in,           /* I    Input sampling rate (Hz)        */
+    opus_int32                  Fs_Hz_out,          /* I    Output sampling rate (Hz)       */
+    opus_int                    forEnc              /* I    If 1: encoder; if 0: decoder    */
+)
+{
+    opus_int up2x;
+
+    /* Clear state */
+    silk_memset( S, 0, sizeof( silk_resampler_state_struct ) );
+
+    /* Input checking */
+    if( forEnc ) {
+        if( ( Fs_Hz_in  != 8000 && Fs_Hz_in  != 12000 && Fs_Hz_in  != 16000 && Fs_Hz_in  != 24000 && Fs_Hz_in  != 48000 ) ||
+            ( Fs_Hz_out != 8000 && Fs_Hz_out != 12000 && Fs_Hz_out != 16000 ) ) {
+            silk_assert( 0 );
+            return -1;
+        }
+        S->inputDelay = delay_matrix_enc[ rateID( Fs_Hz_in ) ][ rateID( Fs_Hz_out ) ];
+    } else {
+        if( ( Fs_Hz_in  != 8000 && Fs_Hz_in  != 12000 && Fs_Hz_in  != 16000 ) ||
+            ( Fs_Hz_out != 8000 && Fs_Hz_out != 12000 && Fs_Hz_out != 16000 && Fs_Hz_out != 24000 && Fs_Hz_out != 48000 ) ) {
+            silk_assert( 0 );
+            return -1;
+        }
+        S->inputDelay = delay_matrix_dec[ rateID( Fs_Hz_in ) ][ rateID( Fs_Hz_out ) ];
+    }
+
+    S->Fs_in_kHz  = silk_DIV32_16( Fs_Hz_in,  1000 );
+    S->Fs_out_kHz = silk_DIV32_16( Fs_Hz_out, 1000 );
+
+    /* Number of samples processed per batch */
+    S->batchSize = S->Fs_in_kHz * RESAMPLER_MAX_BATCH_SIZE_MS;
+
+    /* Find resampler with the right sampling ratio */
+    up2x = 0;
+    if( Fs_Hz_out > Fs_Hz_in ) {
+        /* Upsample */
+        if( Fs_Hz_out == silk_MUL( Fs_Hz_in, 2 ) ) {                            /* Fs_out : Fs_in = 2 : 1 */
+            /* Special case: directly use 2x upsampler */
+            S->resampler_function = USE_silk_resampler_private_up2_HQ_wrapper;
+        } else {
+            /* Default resampler */
+            S->resampler_function = USE_silk_resampler_private_IIR_FIR;
+            up2x = 1;
+        }
+    } else if ( Fs_Hz_out < Fs_Hz_in ) {
+        /* Downsample */
+        S->resampler_function = USE_silk_resampler_private_down_FIR;
+        if( silk_MUL( Fs_Hz_out, 4 ) == silk_MUL( Fs_Hz_in, 3 ) ) {             /* Fs_out : Fs_in = 3 : 4 */
+            S->FIR_Fracs = 3;
+            S->FIR_Order = RESAMPLER_DOWN_ORDER_FIR0;
+            S->Coefs = silk_Resampler_3_4_COEFS;
+        } else if( silk_MUL( Fs_Hz_out, 3 ) == silk_MUL( Fs_Hz_in, 2 ) ) {      /*
Fs_out : Fs_in = 2 : 3 */ + S->FIR_Fracs = 2; + S->FIR_Order = RESAMPLER_DOWN_ORDER_FIR0; + S->Coefs = silk_Resampler_2_3_COEFS; + } else if( silk_MUL( Fs_Hz_out, 2 ) == Fs_Hz_in ) { /* Fs_out : Fs_in = 1 : 2 */ + S->FIR_Fracs = 1; + S->FIR_Order = RESAMPLER_DOWN_ORDER_FIR1; + S->Coefs = silk_Resampler_1_2_COEFS; + } else if( silk_MUL( Fs_Hz_out, 3 ) == Fs_Hz_in ) { /* Fs_out : Fs_in = 1 : 3 */ + S->FIR_Fracs = 1; + S->FIR_Order = RESAMPLER_DOWN_ORDER_FIR2; + S->Coefs = silk_Resampler_1_3_COEFS; + } else if( silk_MUL( Fs_Hz_out, 4 ) == Fs_Hz_in ) { /* Fs_out : Fs_in = 1 : 4 */ + S->FIR_Fracs = 1; + S->FIR_Order = RESAMPLER_DOWN_ORDER_FIR2; + S->Coefs = silk_Resampler_1_4_COEFS; + } else if( silk_MUL( Fs_Hz_out, 6 ) == Fs_Hz_in ) { /* Fs_out : Fs_in = 1 : 6 */ + S->FIR_Fracs = 1; + S->FIR_Order = RESAMPLER_DOWN_ORDER_FIR2; + S->Coefs = silk_Resampler_1_6_COEFS; + } else { + /* None available */ + silk_assert( 0 ); + return -1; + } + } else { + /* Input and output sampling rates are equal: copy */ + S->resampler_function = USE_silk_resampler_copy; + } + + /* Ratio of input/output samples */ + S->invRatio_Q16 = silk_LSHIFT32( silk_DIV32( silk_LSHIFT32( Fs_Hz_in, 14 + up2x ), Fs_Hz_out ), 2 ); + /* Make sure the ratio is rounded up */ + while( silk_SMULWW( S->invRatio_Q16, Fs_Hz_out ) < silk_LSHIFT32( Fs_Hz_in, up2x ) ) { + S->invRatio_Q16++; + } + + return 0; +} + +/* Resampler: convert from one sampling rate to another */ +/* Input and output sampling rate are at most 48000 Hz */ +opus_int silk_resampler( + silk_resampler_state_struct *S, /* I/O Resampler state */ + opus_int16 out[], /* O Output signal */ + const opus_int16 in[], /* I Input signal */ + opus_int32 inLen /* I Number of input samples */ +) +{ + opus_int nSamples; + + /* Need at least 1 ms of input data */ + silk_assert( inLen >= S->Fs_in_kHz ); + /* Delay can't exceed the 1 ms of buffering */ + silk_assert( S->inputDelay <= S->Fs_in_kHz ); + + nSamples = S->Fs_in_kHz - S->inputDelay; + + /* Copy to delay buffer */ + silk_memcpy( &S->delayBuf[ S->inputDelay ], in, nSamples * sizeof( opus_int16 ) ); + + switch( S->resampler_function ) { + case USE_silk_resampler_private_up2_HQ_wrapper: + silk_resampler_private_up2_HQ_wrapper( S, out, S->delayBuf, S->Fs_in_kHz ); + silk_resampler_private_up2_HQ_wrapper( S, &out[ S->Fs_out_kHz ], &in[ nSamples ], inLen - S->Fs_in_kHz ); + break; + case USE_silk_resampler_private_IIR_FIR: + silk_resampler_private_IIR_FIR( S, out, S->delayBuf, S->Fs_in_kHz ); + silk_resampler_private_IIR_FIR( S, &out[ S->Fs_out_kHz ], &in[ nSamples ], inLen - S->Fs_in_kHz ); + break; + case USE_silk_resampler_private_down_FIR: + silk_resampler_private_down_FIR( S, out, S->delayBuf, S->Fs_in_kHz ); + silk_resampler_private_down_FIR( S, &out[ S->Fs_out_kHz ], &in[ nSamples ], inLen - S->Fs_in_kHz ); + break; + default: + silk_memcpy( out, S->delayBuf, S->Fs_in_kHz * sizeof( opus_int16 ) ); + silk_memcpy( &out[ S->Fs_out_kHz ], &in[ nSamples ], ( inLen - S->Fs_in_kHz ) * sizeof( opus_int16 ) ); + } + + /* Copy to delay buffer */ + silk_memcpy( S->delayBuf, &in[ inLen - S->inputDelay ], S->inputDelay * sizeof( opus_int16 ) ); + + return 0; +} diff --git a/thirdparty/opus/silk/resampler_down2.c b/thirdparty/opus/silk/resampler_down2.c new file mode 100644 index 000000000000..cec36346408b --- /dev/null +++ b/thirdparty/opus/silk/resampler_down2.c @@ -0,0 +1,74 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
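Two reading aids for resampler.c above. First, rateID() expanded by hand for each legal rate; second, a minimal sketch of the public init/process pair, with example buffer sizes and error handling omitted:

    /* rateID(R) = ( ( ( ((R)>>12) - ((R)>16000) ) >> ((R)>24000) ) - 1
     *   R =  8000: (( 1 - 0) >> 0) - 1 = 0
     *   R = 12000: (( 2 - 0) >> 0) - 1 = 1
     *   R = 16000: (( 3 - 0) >> 0) - 1 = 2
     *   R = 24000: (( 5 - 1) >> 0) - 1 = 3
     *   R = 48000: ((11 - 1) >> 1) - 1 = 4
     */

    silk_resampler_state_struct state;
    opus_int16 in48[ 480 ]  = { 0 };    /* 10 ms at 48 kHz (example size) */
    opus_int16 out16[ 160 ];            /* 10 ms at 16 kHz                */

    /* Encoder-side 48 kHz -> 16 kHz; per the matrix above this selects
       the AR2 + FIR interpolation path (AF) */
    if( silk_resampler_init( &state, 48000, 16000, 1 ) == 0 ) {
        silk_resampler( &state, out16, in48, 480 );
    }
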
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "resampler_rom.h" + +/* Downsample by a factor 2 */ +void silk_resampler_down2( + opus_int32 *S, /* I/O State vector [ 2 ] */ + opus_int16 *out, /* O Output signal [ floor(len/2) ] */ + const opus_int16 *in, /* I Input signal [ len ] */ + opus_int32 inLen /* I Number of input samples */ +) +{ + opus_int32 k, len2 = silk_RSHIFT32( inLen, 1 ); + opus_int32 in32, out32, Y, X; + + silk_assert( silk_resampler_down2_0 > 0 ); + silk_assert( silk_resampler_down2_1 < 0 ); + + /* Internal variables and state are in Q10 format */ + for( k = 0; k < len2; k++ ) { + /* Convert to Q10 */ + in32 = silk_LSHIFT( (opus_int32)in[ 2 * k ], 10 ); + + /* All-pass section for even input sample */ + Y = silk_SUB32( in32, S[ 0 ] ); + X = silk_SMLAWB( Y, Y, silk_resampler_down2_1 ); + out32 = silk_ADD32( S[ 0 ], X ); + S[ 0 ] = silk_ADD32( in32, X ); + + /* Convert to Q10 */ + in32 = silk_LSHIFT( (opus_int32)in[ 2 * k + 1 ], 10 ); + + /* All-pass section for odd input sample, and add to output of previous section */ + Y = silk_SUB32( in32, S[ 1 ] ); + X = silk_SMULWB( Y, silk_resampler_down2_0 ); + out32 = silk_ADD32( out32, S[ 1 ] ); + out32 = silk_ADD32( out32, X ); + S[ 1 ] = silk_ADD32( in32, X ); + + /* Add, convert back to int16 and store to output */ + out[ k ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( out32, 11 ) ); + } +} + diff --git a/thirdparty/opus/silk/resampler_down2_3.c b/thirdparty/opus/silk/resampler_down2_3.c new file mode 100644 index 000000000000..4342614dcccc --- /dev/null +++ b/thirdparty/opus/silk/resampler_down2_3.c @@ -0,0 +1,103 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "resampler_private.h" +#include "stack_alloc.h" + +#define ORDER_FIR 4 + +/* Downsample by a factor 2/3, low quality */ +void silk_resampler_down2_3( + opus_int32 *S, /* I/O State vector [ 6 ] */ + opus_int16 *out, /* O Output signal [ floor(2*inLen/3) ] */ + const opus_int16 *in, /* I Input signal [ inLen ] */ + opus_int32 inLen /* I Number of input samples */ +) +{ + opus_int32 nSamplesIn, counter, res_Q6; + VARDECL( opus_int32, buf ); + opus_int32 *buf_ptr; + SAVE_STACK; + + ALLOC( buf, RESAMPLER_MAX_BATCH_SIZE_IN + ORDER_FIR, opus_int32 ); + + /* Copy buffered samples to start of buffer */ + silk_memcpy( buf, S, ORDER_FIR * sizeof( opus_int32 ) ); + + /* Iterate over blocks of frameSizeIn input samples */ + while( 1 ) { + nSamplesIn = silk_min( inLen, RESAMPLER_MAX_BATCH_SIZE_IN ); + + /* Second-order AR filter (output in Q8) */ + silk_resampler_private_AR2( &S[ ORDER_FIR ], &buf[ ORDER_FIR ], in, + silk_Resampler_2_3_COEFS_LQ, nSamplesIn ); + + /* Interpolate filtered signal */ + buf_ptr = buf; + counter = nSamplesIn; + while( counter > 2 ) { + /* Inner product */ + res_Q6 = silk_SMULWB( buf_ptr[ 0 ], silk_Resampler_2_3_COEFS_LQ[ 2 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 1 ], silk_Resampler_2_3_COEFS_LQ[ 3 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 2 ], silk_Resampler_2_3_COEFS_LQ[ 5 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 3 ], silk_Resampler_2_3_COEFS_LQ[ 4 ] ); + + /* Scale down, saturate and store in output array */ + *out++ = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( res_Q6, 6 ) ); + + res_Q6 = silk_SMULWB( buf_ptr[ 1 ], silk_Resampler_2_3_COEFS_LQ[ 4 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 2 ], silk_Resampler_2_3_COEFS_LQ[ 5 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 3 ], silk_Resampler_2_3_COEFS_LQ[ 3 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 4 ], silk_Resampler_2_3_COEFS_LQ[ 2 ] ); + + /* Scale down, saturate and store in output array */ + *out++ = 
(opus_int16)silk_SAT16( silk_RSHIFT_ROUND( res_Q6, 6 ) ); + + buf_ptr += 3; + counter -= 3; + } + + in += nSamplesIn; + inLen -= nSamplesIn; + + if( inLen > 0 ) { + /* More iterations to do; copy last part of filtered signal to beginning of buffer */ + silk_memcpy( buf, &buf[ nSamplesIn ], ORDER_FIR * sizeof( opus_int32 ) ); + } else { + break; + } + } + + /* Copy last part of filtered signal to the state for the next call */ + silk_memcpy( S, &buf[ nSamplesIn ], ORDER_FIR * sizeof( opus_int32 ) ); + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/resampler_private.h b/thirdparty/opus/silk/resampler_private.h new file mode 100644 index 000000000000..422a7d9d9537 --- /dev/null +++ b/thirdparty/opus/silk/resampler_private.h @@ -0,0 +1,88 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
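Unlike the stateful silk_resampler() API, silk_resampler_down2() defined above is self-contained: two words of zeroed state and a single call. A minimal sketch, with example buffer sizes:

    opus_int32 S[ 2 ] = { 0, 0 };       /* allpass state, zeroed before first use */
    opus_int16 in48k[ 960 ]  = { 0 };   /* 20 ms at 48 kHz (example size)         */
    opus_int16 out24k[ 480 ];           /* 20 ms at 24 kHz                        */

    /* Consumes 960 input samples, produces floor(960/2) = 480 output samples */
    silk_resampler_down2( S, out24k, in48k, 960 );
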
+***********************************************************************/ + +#ifndef SILK_RESAMPLER_PRIVATE_H +#define SILK_RESAMPLER_PRIVATE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "SigProc_FIX.h" +#include "resampler_structs.h" +#include "resampler_rom.h" + +/* Number of input samples to process in the inner loop */ +#define RESAMPLER_MAX_BATCH_SIZE_MS 10 +#define RESAMPLER_MAX_FS_KHZ 48 +#define RESAMPLER_MAX_BATCH_SIZE_IN ( RESAMPLER_MAX_BATCH_SIZE_MS * RESAMPLER_MAX_FS_KHZ ) + +/* Description: Hybrid IIR/FIR polyphase implementation of resampling */ +void silk_resampler_private_IIR_FIR( + void *SS, /* I/O Resampler state */ + opus_int16 out[], /* O Output signal */ + const opus_int16 in[], /* I Input signal */ + opus_int32 inLen /* I Number of input samples */ +); + +/* Description: Hybrid IIR/FIR polyphase implementation of resampling */ +void silk_resampler_private_down_FIR( + void *SS, /* I/O Resampler state */ + opus_int16 out[], /* O Output signal */ + const opus_int16 in[], /* I Input signal */ + opus_int32 inLen /* I Number of input samples */ +); + +/* Upsample by a factor 2, high quality */ +void silk_resampler_private_up2_HQ_wrapper( + void *SS, /* I/O Resampler state (unused) */ + opus_int16 *out, /* O Output signal [ 2 * len ] */ + const opus_int16 *in, /* I Input signal [ len ] */ + opus_int32 len /* I Number of input samples */ +); + +/* Upsample by a factor 2, high quality */ +void silk_resampler_private_up2_HQ( + opus_int32 *S, /* I/O Resampler state [ 6 ] */ + opus_int16 *out, /* O Output signal [ 2 * len ] */ + const opus_int16 *in, /* I Input signal [ len ] */ + opus_int32 len /* I Number of input samples */ +); + +/* Second order AR filter */ +void silk_resampler_private_AR2( + opus_int32 S[], /* I/O State vector [ 2 ] */ + opus_int32 out_Q8[], /* O Output signal */ + const opus_int16 in[], /* I Input signal */ + const opus_int16 A_Q14[], /* I AR coefficients, Q14 */ + opus_int32 len /* I Signal length */ +); + +#ifdef __cplusplus +} +#endif +#endif /* SILK_RESAMPLER_PRIVATE_H */ diff --git a/thirdparty/opus/silk/resampler_private_AR2.c b/thirdparty/opus/silk/resampler_private_AR2.c new file mode 100644 index 000000000000..5fff23714f9e --- /dev/null +++ b/thirdparty/opus/silk/resampler_private_AR2.c @@ -0,0 +1,55 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "resampler_private.h" + +/* Second order AR filter with single delay elements */ +void silk_resampler_private_AR2( + opus_int32 S[], /* I/O State vector [ 2 ] */ + opus_int32 out_Q8[], /* O Output signal */ + const opus_int16 in[], /* I Input signal */ + const opus_int16 A_Q14[], /* I AR coefficients, Q14 */ + opus_int32 len /* I Signal length */ +) +{ + opus_int32 k; + opus_int32 out32; + + for( k = 0; k < len; k++ ) { + out32 = silk_ADD_LSHIFT32( S[ 0 ], (opus_int32)in[ k ], 8 ); + out_Q8[ k ] = out32; + out32 = silk_LSHIFT( out32, 2 ); + S[ 0 ] = silk_SMLAWB( S[ 1 ], out32, A_Q14[ 0 ] ); + S[ 1 ] = silk_SMULWB( out32, A_Q14[ 1 ] ); + } +} + diff --git a/thirdparty/opus/silk/resampler_private_IIR_FIR.c b/thirdparty/opus/silk/resampler_private_IIR_FIR.c new file mode 100644 index 000000000000..6b2b3a2e1832 --- /dev/null +++ b/thirdparty/opus/silk/resampler_private_IIR_FIR.c @@ -0,0 +1,107 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
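A floating-point restatement of silk_resampler_private_AR2() above can help when reading the fixed-point shifts. This is a sketch, not a replacement: A[i] corresponds to A_Q14[i] / 2^14, and the Q8 output scaling of the original is ignored.

    /* y[k] = x[k] + A[0]*y[k-1] + A[1]*y[k-2], with the two history terms
       folded into S[0] and S[1] exactly as in the Q-domain version. */
    static void ar2_float( float S[ 2 ], float out[], const float in[],
                           const float A[ 2 ], int len )
    {
        int k;
        for( k = 0; k < len; k++ ) {
            float y = in[ k ] + S[ 0 ];
            out[ k ] = y;
            S[ 0 ] = S[ 1 ] + A[ 0 ] * y;
            S[ 1 ] = A[ 1 ] * y;
        }
    }
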
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "resampler_private.h" +#include "stack_alloc.h" + +static OPUS_INLINE opus_int16 *silk_resampler_private_IIR_FIR_INTERPOL( + opus_int16 *out, + opus_int16 *buf, + opus_int32 max_index_Q16, + opus_int32 index_increment_Q16 +) +{ + opus_int32 index_Q16, res_Q15; + opus_int16 *buf_ptr; + opus_int32 table_index; + + /* Interpolate upsampled signal and store in output array */ + for( index_Q16 = 0; index_Q16 < max_index_Q16; index_Q16 += index_increment_Q16 ) { + table_index = silk_SMULWB( index_Q16 & 0xFFFF, 12 ); + buf_ptr = &buf[ index_Q16 >> 16 ]; + + res_Q15 = silk_SMULBB( buf_ptr[ 0 ], silk_resampler_frac_FIR_12[ table_index ][ 0 ] ); + res_Q15 = silk_SMLABB( res_Q15, buf_ptr[ 1 ], silk_resampler_frac_FIR_12[ table_index ][ 1 ] ); + res_Q15 = silk_SMLABB( res_Q15, buf_ptr[ 2 ], silk_resampler_frac_FIR_12[ table_index ][ 2 ] ); + res_Q15 = silk_SMLABB( res_Q15, buf_ptr[ 3 ], silk_resampler_frac_FIR_12[ table_index ][ 3 ] ); + res_Q15 = silk_SMLABB( res_Q15, buf_ptr[ 4 ], silk_resampler_frac_FIR_12[ 11 - table_index ][ 3 ] ); + res_Q15 = silk_SMLABB( res_Q15, buf_ptr[ 5 ], silk_resampler_frac_FIR_12[ 11 - table_index ][ 2 ] ); + res_Q15 = silk_SMLABB( res_Q15, buf_ptr[ 6 ], silk_resampler_frac_FIR_12[ 11 - table_index ][ 1 ] ); + res_Q15 = silk_SMLABB( res_Q15, buf_ptr[ 7 ], silk_resampler_frac_FIR_12[ 11 - table_index ][ 0 ] ); + *out++ = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( res_Q15, 15 ) ); + } + return out; +} +/* Upsample using a combination of allpass-based 2x upsampling and FIR interpolation */ +void silk_resampler_private_IIR_FIR( + void *SS, /* I/O Resampler state */ + opus_int16 out[], /* O Output signal */ + const opus_int16 in[], /* I Input signal */ + opus_int32 inLen /* I Number of input samples */ +) +{ + silk_resampler_state_struct *S = (silk_resampler_state_struct *)SS; + opus_int32 nSamplesIn; + opus_int32 max_index_Q16, index_increment_Q16; + VARDECL( opus_int16, buf ); + SAVE_STACK; + + ALLOC( buf, 2 * S->batchSize + RESAMPLER_ORDER_FIR_12, opus_int16 ); + + /* Copy buffered samples to start of buffer */ + silk_memcpy( buf, S->sFIR.i16, RESAMPLER_ORDER_FIR_12 * sizeof( opus_int16 ) ); + + /* Iterate over blocks of frameSizeIn input samples */ + index_increment_Q16 = S->invRatio_Q16; + while( 1 ) { + nSamplesIn = silk_min( inLen, S->batchSize ); + + /* Upsample 2x */ + silk_resampler_private_up2_HQ( S->sIIR, &buf[ RESAMPLER_ORDER_FIR_12 ], in, nSamplesIn ); + + max_index_Q16 = silk_LSHIFT32( nSamplesIn, 16 + 1 ); /* + 1 because 2x upsampling */ + out = silk_resampler_private_IIR_FIR_INTERPOL( out, buf, max_index_Q16, index_increment_Q16 ); + in += nSamplesIn; + inLen -= nSamplesIn; + + if( inLen > 0 ) { + /* More iterations to do; copy last part of filtered signal to beginning of buffer */ + silk_memcpy( buf, &buf[ nSamplesIn << 1 ], RESAMPLER_ORDER_FIR_12 * sizeof( opus_int16 ) ); + } else { + break; + } + } + + /* Copy last part of filtered signal to the state for the next call */ + silk_memcpy( S->sFIR.i16, &buf[ nSamplesIn << 1 ], RESAMPLER_ORDER_FIR_12 * sizeof( opus_int16 ) ); + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/resampler_private_down_FIR.c b/thirdparty/opus/silk/resampler_private_down_FIR.c new file mode 100644 index 000000000000..783e42b35617 --- /dev/null +++ b/thirdparty/opus/silk/resampler_private_down_FIR.c @@ -0,0 +1,194 @@ 
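A worked example of the Q16 stepping in silk_resampler_private_IIR_FIR() above, with values hand-computed from silk_resampler_init():

    /* 8 kHz -> 12 kHz: up2x = 1, so the inner buffer runs at 16 kHz.
     *   invRatio_Q16 = ((8000 << 15) / 12000) << 2 = 87380,
     *   bumped twice by the round-up loop to 87382 (~ 4/3 * 65536).
     * Each output advances index_Q16 by ~1.333 samples of the 2x buffer:
     *   index_Q16 >> 16                  -> integer tap position in buf[]
     *   ((index_Q16 & 0xFFFF) * 12) >> 16 -> one of the 12 FIR phases
     * so 12 output samples are produced per 8 input samples, as required.
     */
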
+/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "resampler_private.h" +#include "stack_alloc.h" + +static OPUS_INLINE opus_int16 *silk_resampler_private_down_FIR_INTERPOL( + opus_int16 *out, + opus_int32 *buf, + const opus_int16 *FIR_Coefs, + opus_int FIR_Order, + opus_int FIR_Fracs, + opus_int32 max_index_Q16, + opus_int32 index_increment_Q16 +) +{ + opus_int32 index_Q16, res_Q6; + opus_int32 *buf_ptr; + opus_int32 interpol_ind; + const opus_int16 *interpol_ptr; + + switch( FIR_Order ) { + case RESAMPLER_DOWN_ORDER_FIR0: + for( index_Q16 = 0; index_Q16 < max_index_Q16; index_Q16 += index_increment_Q16 ) { + /* Integer part gives pointer to buffered input */ + buf_ptr = buf + silk_RSHIFT( index_Q16, 16 ); + + /* Fractional part gives interpolation coefficients */ + interpol_ind = silk_SMULWB( index_Q16 & 0xFFFF, FIR_Fracs ); + + /* Inner product */ + interpol_ptr = &FIR_Coefs[ RESAMPLER_DOWN_ORDER_FIR0 / 2 * interpol_ind ]; + res_Q6 = silk_SMULWB( buf_ptr[ 0 ], interpol_ptr[ 0 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 1 ], interpol_ptr[ 1 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 2 ], interpol_ptr[ 2 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 3 ], interpol_ptr[ 3 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 4 ], interpol_ptr[ 4 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 5 ], interpol_ptr[ 5 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 6 ], interpol_ptr[ 6 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 7 ], interpol_ptr[ 7 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 8 ], interpol_ptr[ 8 ] ); + interpol_ptr = &FIR_Coefs[ RESAMPLER_DOWN_ORDER_FIR0 / 2 * ( FIR_Fracs - 1 - interpol_ind ) ]; + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 17 ], interpol_ptr[ 0 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 16 ], interpol_ptr[ 1 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 15 ], interpol_ptr[ 2 ] ); + 
res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 14 ], interpol_ptr[ 3 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 13 ], interpol_ptr[ 4 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 12 ], interpol_ptr[ 5 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 11 ], interpol_ptr[ 6 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 10 ], interpol_ptr[ 7 ] ); + res_Q6 = silk_SMLAWB( res_Q6, buf_ptr[ 9 ], interpol_ptr[ 8 ] ); + + /* Scale down, saturate and store in output array */ + *out++ = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( res_Q6, 6 ) ); + } + break; + case RESAMPLER_DOWN_ORDER_FIR1: + for( index_Q16 = 0; index_Q16 < max_index_Q16; index_Q16 += index_increment_Q16 ) { + /* Integer part gives pointer to buffered input */ + buf_ptr = buf + silk_RSHIFT( index_Q16, 16 ); + + /* Inner product */ + res_Q6 = silk_SMULWB( silk_ADD32( buf_ptr[ 0 ], buf_ptr[ 23 ] ), FIR_Coefs[ 0 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 1 ], buf_ptr[ 22 ] ), FIR_Coefs[ 1 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 2 ], buf_ptr[ 21 ] ), FIR_Coefs[ 2 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 3 ], buf_ptr[ 20 ] ), FIR_Coefs[ 3 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 4 ], buf_ptr[ 19 ] ), FIR_Coefs[ 4 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 5 ], buf_ptr[ 18 ] ), FIR_Coefs[ 5 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 6 ], buf_ptr[ 17 ] ), FIR_Coefs[ 6 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 7 ], buf_ptr[ 16 ] ), FIR_Coefs[ 7 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 8 ], buf_ptr[ 15 ] ), FIR_Coefs[ 8 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 9 ], buf_ptr[ 14 ] ), FIR_Coefs[ 9 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 10 ], buf_ptr[ 13 ] ), FIR_Coefs[ 10 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 11 ], buf_ptr[ 12 ] ), FIR_Coefs[ 11 ] ); + + /* Scale down, saturate and store in output array */ + *out++ = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( res_Q6, 6 ) ); + } + break; + case RESAMPLER_DOWN_ORDER_FIR2: + for( index_Q16 = 0; index_Q16 < max_index_Q16; index_Q16 += index_increment_Q16 ) { + /* Integer part gives pointer to buffered input */ + buf_ptr = buf + silk_RSHIFT( index_Q16, 16 ); + + /* Inner product */ + res_Q6 = silk_SMULWB( silk_ADD32( buf_ptr[ 0 ], buf_ptr[ 35 ] ), FIR_Coefs[ 0 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 1 ], buf_ptr[ 34 ] ), FIR_Coefs[ 1 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 2 ], buf_ptr[ 33 ] ), FIR_Coefs[ 2 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 3 ], buf_ptr[ 32 ] ), FIR_Coefs[ 3 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 4 ], buf_ptr[ 31 ] ), FIR_Coefs[ 4 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 5 ], buf_ptr[ 30 ] ), FIR_Coefs[ 5 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 6 ], buf_ptr[ 29 ] ), FIR_Coefs[ 6 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 7 ], buf_ptr[ 28 ] ), FIR_Coefs[ 7 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 8 ], buf_ptr[ 27 ] ), FIR_Coefs[ 8 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 9 ], buf_ptr[ 26 ] ), FIR_Coefs[ 9 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 10 ], buf_ptr[ 25 ] ), FIR_Coefs[ 10 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 11 ], buf_ptr[ 24 ] ), FIR_Coefs[ 11 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 12 ], buf_ptr[ 23 ] ), FIR_Coefs[ 12 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 13 ], 
buf_ptr[ 22 ] ), FIR_Coefs[ 13 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 14 ], buf_ptr[ 21 ] ), FIR_Coefs[ 14 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 15 ], buf_ptr[ 20 ] ), FIR_Coefs[ 15 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 16 ], buf_ptr[ 19 ] ), FIR_Coefs[ 16 ] ); + res_Q6 = silk_SMLAWB( res_Q6, silk_ADD32( buf_ptr[ 17 ], buf_ptr[ 18 ] ), FIR_Coefs[ 17 ] ); + + /* Scale down, saturate and store in output array */ + *out++ = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( res_Q6, 6 ) ); + } + break; + default: + silk_assert( 0 ); + } + return out; +} + +/* Resample with a 2nd order AR filter followed by FIR interpolation */ +void silk_resampler_private_down_FIR( + void *SS, /* I/O Resampler state */ + opus_int16 out[], /* O Output signal */ + const opus_int16 in[], /* I Input signal */ + opus_int32 inLen /* I Number of input samples */ +) +{ + silk_resampler_state_struct *S = (silk_resampler_state_struct *)SS; + opus_int32 nSamplesIn; + opus_int32 max_index_Q16, index_increment_Q16; + VARDECL( opus_int32, buf ); + const opus_int16 *FIR_Coefs; + SAVE_STACK; + + ALLOC( buf, S->batchSize + S->FIR_Order, opus_int32 ); + + /* Copy buffered samples to start of buffer */ + silk_memcpy( buf, S->sFIR.i32, S->FIR_Order * sizeof( opus_int32 ) ); + + FIR_Coefs = &S->Coefs[ 2 ]; + + /* Iterate over blocks of frameSizeIn input samples */ + index_increment_Q16 = S->invRatio_Q16; + while( 1 ) { + nSamplesIn = silk_min( inLen, S->batchSize ); + + /* Second-order AR filter (output in Q8) */ + silk_resampler_private_AR2( S->sIIR, &buf[ S->FIR_Order ], in, S->Coefs, nSamplesIn ); + + max_index_Q16 = silk_LSHIFT32( nSamplesIn, 16 ); + + /* Interpolate filtered signal */ + out = silk_resampler_private_down_FIR_INTERPOL( out, buf, FIR_Coefs, S->FIR_Order, + S->FIR_Fracs, max_index_Q16, index_increment_Q16 ); + + in += nSamplesIn; + inLen -= nSamplesIn; + + if( inLen > 1 ) { + /* More iterations to do; copy last part of filtered signal to beginning of buffer */ + silk_memcpy( buf, &buf[ nSamplesIn ], S->FIR_Order * sizeof( opus_int32 ) ); + } else { + break; + } + } + + /* Copy last part of filtered signal to the state for the next call */ + silk_memcpy( S->sFIR.i32, &buf[ nSamplesIn ], S->FIR_Order * sizeof( opus_int32 ) ); + RESTORE_STACK; +} diff --git a/thirdparty/opus/silk/resampler_private_up2_HQ.c b/thirdparty/opus/silk/resampler_private_up2_HQ.c new file mode 100644 index 000000000000..c7ec8de3651d --- /dev/null +++ b/thirdparty/opus/silk/resampler_private_up2_HQ.c @@ -0,0 +1,113 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" +#include "resampler_private.h" + +/* Upsample by a factor 2, high quality */ +/* Uses 2nd order allpass filters for the 2x upsampling, followed by a */ +/* notch filter just above Nyquist. */ +void silk_resampler_private_up2_HQ( + opus_int32 *S, /* I/O Resampler state [ 6 ] */ + opus_int16 *out, /* O Output signal [ 2 * len ] */ + const opus_int16 *in, /* I Input signal [ len ] */ + opus_int32 len /* I Number of input samples */ +) +{ + opus_int32 k; + opus_int32 in32, out32_1, out32_2, Y, X; + + silk_assert( silk_resampler_up2_hq_0[ 0 ] > 0 ); + silk_assert( silk_resampler_up2_hq_0[ 1 ] > 0 ); + silk_assert( silk_resampler_up2_hq_0[ 2 ] < 0 ); + silk_assert( silk_resampler_up2_hq_1[ 0 ] > 0 ); + silk_assert( silk_resampler_up2_hq_1[ 1 ] > 0 ); + silk_assert( silk_resampler_up2_hq_1[ 2 ] < 0 ); + + /* Internal variables and state are in Q10 format */ + for( k = 0; k < len; k++ ) { + /* Convert to Q10 */ + in32 = silk_LSHIFT( (opus_int32)in[ k ], 10 ); + + /* First all-pass section for even output sample */ + Y = silk_SUB32( in32, S[ 0 ] ); + X = silk_SMULWB( Y, silk_resampler_up2_hq_0[ 0 ] ); + out32_1 = silk_ADD32( S[ 0 ], X ); + S[ 0 ] = silk_ADD32( in32, X ); + + /* Second all-pass section for even output sample */ + Y = silk_SUB32( out32_1, S[ 1 ] ); + X = silk_SMULWB( Y, silk_resampler_up2_hq_0[ 1 ] ); + out32_2 = silk_ADD32( S[ 1 ], X ); + S[ 1 ] = silk_ADD32( out32_1, X ); + + /* Third all-pass section for even output sample */ + Y = silk_SUB32( out32_2, S[ 2 ] ); + X = silk_SMLAWB( Y, Y, silk_resampler_up2_hq_0[ 2 ] ); + out32_1 = silk_ADD32( S[ 2 ], X ); + S[ 2 ] = silk_ADD32( out32_2, X ); + + /* Apply gain in Q15, convert back to int16 and store to output */ + out[ 2 * k ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( out32_1, 10 ) ); + + /* First all-pass section for odd output sample */ + Y = silk_SUB32( in32, S[ 3 ] ); + X = silk_SMULWB( Y, silk_resampler_up2_hq_1[ 0 ] ); + out32_1 = silk_ADD32( S[ 3 ], X ); + S[ 3 ] = silk_ADD32( in32, X ); + + /* Second all-pass section for odd output sample */ + Y = silk_SUB32( out32_1, S[ 4 ] ); + X = silk_SMULWB( Y, silk_resampler_up2_hq_1[ 1 ] ); + out32_2 = silk_ADD32( S[ 4 ], X ); + S[ 4 ] = silk_ADD32( out32_1, X ); + + /* Third all-pass section for odd output sample */ + Y = silk_SUB32( out32_2, S[ 5 ] ); + X = silk_SMLAWB( Y, Y, silk_resampler_up2_hq_1[ 2 ] ); + out32_1 = silk_ADD32( S[ 5 ], X ); + S[ 5 ] = silk_ADD32( out32_2, X ); + + /* Apply gain in Q15, convert back to int16 and store to output */ + out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( out32_1, 10 ) ); + } +} + +void 
silk_resampler_private_up2_HQ_wrapper( + void *SS, /* I/O Resampler state (unused) */ + opus_int16 *out, /* O Output signal [ 2 * len ] */ + const opus_int16 *in, /* I Input signal [ len ] */ + opus_int32 len /* I Number of input samples */ +) +{ + silk_resampler_state_struct *S = (silk_resampler_state_struct *)SS; + silk_resampler_private_up2_HQ( S->sIIR, out, in, len ); +} diff --git a/thirdparty/opus/silk/resampler_rom.c b/thirdparty/opus/silk/resampler_rom.c new file mode 100644 index 000000000000..5e6b04476aae --- /dev/null +++ b/thirdparty/opus/silk/resampler_rom.c @@ -0,0 +1,96 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
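A structural note on silk_resampler_private_up2_HQ() above, stated informally from the code: the even and odd output samples come from two parallel chains of three first-order allpass sections (coefficients silk_resampler_up2_hq_0 and _1), the classic polyphase halfband IIR form.

    /* Each section implements a first-order allpass step:
     *     Y  = x - s;                 difference against state
     *     X  = c * Y;                 Q16 coefficient
     *     y  = s + X;   s' = x + X;
     * The third coefficient of each chain is stored as (value - 65536), so
     * SMLAWB( Y, Y, c ) computes Y * (value / 65536) while the multiplier
     * stays within 16 bits; hence the silk_assert( ...[ 2 ] < 0 ) checks.
     */
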
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+/* Filter coefficients for IIR/FIR polyphase resampling     *
+ * Total size: 179 Words (358 Bytes)                        */
+
+#include "resampler_private.h"
+
+/* Matlab code for the notch filter coefficients: */
+/* B = [1, 0.147, 1]; A = [1, 0.107, 0.89]; G = 0.93; freqz(G * B, A, 2^14, 16e3); axis([0, 8000, -10, 1]) */
+/* fprintf('\t%6d, %6d, %6d, %6d\n', round(B(2)*2^16), round(-A(2)*2^16), round((1-A(3))*2^16), round(G*2^15)) */
+/* const opus_int16 silk_resampler_up2_hq_notch[ 4 ] = { 9634, -7012, 7209, 30474 }; */
+
+/* Tables with IIR and FIR coefficients for fractional downsamplers (123 Words) */
+silk_DWORD_ALIGN const opus_int16 silk_Resampler_3_4_COEFS[ 2 + 3 * RESAMPLER_DOWN_ORDER_FIR0 / 2 ] = {
+    -20694, -13867,
+       -49,     64,     17,   -157,    353,   -496,    163,  11047,  22205,
+       -39,      6,     91,   -170,    186,     23,   -896,   6336,  19928,
+       -19,    -36,    102,    -89,    -24,    328,   -951,   2568,  15909,
+};
+
+silk_DWORD_ALIGN const opus_int16 silk_Resampler_2_3_COEFS[ 2 + 2 * RESAMPLER_DOWN_ORDER_FIR0 / 2 ] = {
+    -14457, -14019,
+        64,    128,   -122,     36,    310,   -768,    584,   9267,  17733,
+        12,    128,     18,   -142,    288,   -117,   -865,   4123,  14459,
+};
+
+silk_DWORD_ALIGN const opus_int16 silk_Resampler_1_2_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR1 / 2 ] = {
+       616, -14323,
+       -10,     39,     58,    -46,    -84,    120,    184,   -315,   -541,   1284,   5380,   9024,
+};
+
+silk_DWORD_ALIGN const opus_int16 silk_Resampler_1_3_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR2 / 2 ] = {
+     16102, -15162,
+       -13,      0,     20,     26,      5,    -31,    -43,     -4,     65,     90,      7,   -157,   -248,    -44,    593,   1583,   2612,   3271,
+};
+
+silk_DWORD_ALIGN const opus_int16 silk_Resampler_1_4_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR2 / 2 ] = {
+     22500, -15099,
+         3,    -14,    -20,    -15,      2,     25,     37,     25,    -16,    -71,   -107,    -79,     50,    292,    623,    982,   1288,   1464,
+};
+
+silk_DWORD_ALIGN const opus_int16 silk_Resampler_1_6_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR2 / 2 ] = {
+     27540, -15257,
+        17,     12,      8,      1,    -10,    -22,    -30,    -32,    -22,      3,     44,    100,    168,    243,    317,    381,    429,    455,
+};
+
+silk_DWORD_ALIGN const opus_int16 silk_Resampler_2_3_COEFS_LQ[ 2 + 2 * 2 ] = {
+     -2797,  -6507,
+      4697,  10739,
+      1567,   8276,
+};
+
+/* Table with interpolation fractions of 1/24, 3/24, 5/24, ..., 23/24 (46 Words) */
+silk_DWORD_ALIGN const opus_int16 silk_resampler_frac_FIR_12[ 12 ][ RESAMPLER_ORDER_FIR_12 / 2 ] = {
+    {  189,   -600,    617,  30567 },
+    {  117,   -159,  -1070,  29704 },
+    {   52,    221,  -2392,  28276 },
+    {   -4,    529,  -3350,  26341 },
+    {  -48,    758,  -3956,  23973 },
+    {  -80,    905,  -4235,  21254 },
+    {  -99,    972,  -4222,  18278 },
+    { -107,    967,  -3957,  15143 },
+    { -103,    896,  -3487,  11950 },
+    {  -91,    773,  -2865,   8798 },
+    {  -71,    611,  -2143,   5784 },
+    {  -46,    425,  -1375,   2996 },
+};
diff --git a/thirdparty/opus/silk/resampler_rom.h b/thirdparty/opus/silk/resampler_rom.h
new file mode 100644
index 000000000000..490b3388dced
--- /dev/null
+++ b/thirdparty/opus/silk/resampler_rom.h
@@ -0,0 +1,68 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_FIX_RESAMPLER_ROM_H +#define SILK_FIX_RESAMPLER_ROM_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "typedef.h" +#include "resampler_structs.h" + +#define RESAMPLER_DOWN_ORDER_FIR0 18 +#define RESAMPLER_DOWN_ORDER_FIR1 24 +#define RESAMPLER_DOWN_ORDER_FIR2 36 +#define RESAMPLER_ORDER_FIR_12 8 + +/* Tables for 2x downsampler */ +static const opus_int16 silk_resampler_down2_0 = 9872; +static const opus_int16 silk_resampler_down2_1 = 39809 - 65536; + +/* Tables for 2x upsampler, high quality */ +static const opus_int16 silk_resampler_up2_hq_0[ 3 ] = { 1746, 14986, 39083 - 65536 }; +static const opus_int16 silk_resampler_up2_hq_1[ 3 ] = { 6854, 25769, 55542 - 65536 }; + +/* Tables with IIR and FIR coefficients for fractional downsamplers */ +extern const opus_int16 silk_Resampler_3_4_COEFS[ 2 + 3 * RESAMPLER_DOWN_ORDER_FIR0 / 2 ]; +extern const opus_int16 silk_Resampler_2_3_COEFS[ 2 + 2 * RESAMPLER_DOWN_ORDER_FIR0 / 2 ]; +extern const opus_int16 silk_Resampler_1_2_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR1 / 2 ]; +extern const opus_int16 silk_Resampler_1_3_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR2 / 2 ]; +extern const opus_int16 silk_Resampler_1_4_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR2 / 2 ]; +extern const opus_int16 silk_Resampler_1_6_COEFS[ 2 + RESAMPLER_DOWN_ORDER_FIR2 / 2 ]; +extern const opus_int16 silk_Resampler_2_3_COEFS_LQ[ 2 + 2 * 2 ]; + +/* Table with interplation fractions of 1/24, 3/24, ..., 23/24 */ +extern const opus_int16 silk_resampler_frac_FIR_12[ 12 ][ RESAMPLER_ORDER_FIR_12 / 2 ]; + +#ifdef __cplusplus +} +#endif + +#endif /* SILK_FIX_RESAMPLER_ROM_H */ diff --git a/thirdparty/opus/silk/resampler_structs.h b/thirdparty/opus/silk/resampler_structs.h new file mode 100644 index 000000000000..9e9457d11cac --- /dev/null +++ b/thirdparty/opus/silk/resampler_structs.h @@ -0,0 +1,60 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. 
diff --git a/thirdparty/opus/silk/resampler_structs.h b/thirdparty/opus/silk/resampler_structs.h
new file mode 100644
index 000000000000..9e9457d11cac
--- /dev/null
+++ b/thirdparty/opus/silk/resampler_structs.h
@@ -0,0 +1,60 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_RESAMPLER_STRUCTS_H
+#define SILK_RESAMPLER_STRUCTS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SILK_RESAMPLER_MAX_FIR_ORDER 36
+#define SILK_RESAMPLER_MAX_IIR_ORDER 6
+
+typedef struct _silk_resampler_state_struct{
+    opus_int32 sIIR[ SILK_RESAMPLER_MAX_IIR_ORDER ]; /* this must be the first element of this struct */
+    union{
+        opus_int32 i32[ SILK_RESAMPLER_MAX_FIR_ORDER ];
+        opus_int16 i16[ SILK_RESAMPLER_MAX_FIR_ORDER ];
+    } sFIR;
+    opus_int16 delayBuf[ 48 ];
+    opus_int resampler_function;
+    opus_int batchSize;
+    opus_int32 invRatio_Q16;
+    opus_int FIR_Order;
+    opus_int FIR_Fracs;
+    opus_int Fs_in_kHz;
+    opus_int Fs_out_kHz;
+    opus_int inputDelay;
+    const opus_int16 *Coefs;
+} silk_resampler_state_struct;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* SILK_RESAMPLER_STRUCTS_H */
+
diff --git a/thirdparty/opus/silk/shell_coder.c b/thirdparty/opus/silk/shell_coder.c
new file mode 100644
index 000000000000..4af341474bc7
--- /dev/null
+++ b/thirdparty/opus/silk/shell_coder.c
@@ -0,0 +1,151 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+
+/* shell coder; pulse-subframe length is hardcoded */
+
+static OPUS_INLINE void combine_pulses(
+    opus_int *out, /* O combined pulses vector [len] */
+    const opus_int *in, /* I input vector [2 * len] */
+    const opus_int len /* I number of OUTPUT samples */
+)
+{
+    opus_int k;
+    for( k = 0; k < len; k++ ) {
+        out[ k ] = in[ 2 * k ] + in[ 2 * k + 1 ];
+    }
+}
+
+static OPUS_INLINE void encode_split(
+    ec_enc *psRangeEnc, /* I/O compressor data structure */
+    const opus_int p_child1, /* I pulse amplitude of first child subframe */
+    const opus_int p, /* I pulse amplitude of current subframe */
+    const opus_uint8 *shell_table /* I table of shell cdfs */
+)
+{
+    if( p > 0 ) {
+        ec_enc_icdf( psRangeEnc, p_child1, &shell_table[ silk_shell_code_table_offsets[ p ] ], 8 );
+    }
+}
+
+static OPUS_INLINE void decode_split(
+    opus_int16 *p_child1, /* O pulse amplitude of first child subframe */
+    opus_int16 *p_child2, /* O pulse amplitude of second child subframe */
+    ec_dec *psRangeDec, /* I/O Compressor data structure */
+    const opus_int p, /* I pulse amplitude of current subframe */
+    const opus_uint8 *shell_table /* I table of shell cdfs */
+)
+{
+    if( p > 0 ) {
+        p_child1[ 0 ] = ec_dec_icdf( psRangeDec, &shell_table[ silk_shell_code_table_offsets[ p ] ], 8 );
+        p_child2[ 0 ] = p - p_child1[ 0 ];
+    } else {
+        p_child1[ 0 ] = 0;
+        p_child2[ 0 ] = 0;
+    }
+}
+
+/* Shell encoder, operates on one shell code frame of 16 pulses */
+void silk_shell_encoder(
+    ec_enc *psRangeEnc, /* I/O compressor data structure */
+    const opus_int *pulses0 /* I data: nonnegative pulse amplitudes */
+)
+{
+    opus_int pulses1[ 8 ], pulses2[ 4 ], pulses3[ 2 ], pulses4[ 1 ];
+
+    /* this function operates on one shell code frame of 16 pulses */
+    silk_assert( SHELL_CODEC_FRAME_LENGTH == 16 );
+
+    /* tree representation per pulse-subframe */
+    combine_pulses( pulses1, pulses0, 8 );
+    combine_pulses( pulses2, pulses1, 4 );
+    combine_pulses( pulses3, pulses2, 2 );
+    combine_pulses( pulses4, pulses3, 1 );
+
+    encode_split( psRangeEnc, pulses3[ 0 ], pulses4[ 0 ], silk_shell_code_table3 );
+
+    encode_split( psRangeEnc, pulses2[ 0 ], pulses3[ 0 ], silk_shell_code_table2 );
+
+    encode_split( psRangeEnc, pulses1[ 0 ], pulses2[ 0 ], silk_shell_code_table1 );
+    encode_split( psRangeEnc, pulses0[ 0 ], pulses1[ 0 ], silk_shell_code_table0 );
+    encode_split( psRangeEnc, pulses0[ 2 ], pulses1[ 1 ], silk_shell_code_table0 );
+
+    encode_split( psRangeEnc, pulses1[ 2 ], pulses2[ 1 ], silk_shell_code_table1 );
+    encode_split( psRangeEnc, pulses0[ 4 ], pulses1[ 2 ], silk_shell_code_table0 );
+    encode_split( psRangeEnc, pulses0[ 6 ], pulses1[ 3 ], silk_shell_code_table0 );
+
+    encode_split( psRangeEnc, pulses2[ 2 ], pulses3[ 1 ], silk_shell_code_table2 );
+
+    encode_split( psRangeEnc, pulses1[ 4 ], pulses2[ 2 ], silk_shell_code_table1 );
+    encode_split( psRangeEnc, pulses0[ 8 ], pulses1[ 4 ], silk_shell_code_table0 );
+    encode_split( psRangeEnc, pulses0[ 10 ], pulses1[ 5 ], silk_shell_code_table0 );
+
+    encode_split( psRangeEnc, pulses1[ 6 ], pulses2[ 3 ], silk_shell_code_table1 );
+    encode_split( psRangeEnc, pulses0[ 12 ], pulses1[ 6 ], silk_shell_code_table0 );
+    encode_split( psRangeEnc, pulses0[ 14 ], pulses1[ 7 ], silk_shell_code_table0 );
+}
+
+
+/* Shell decoder, operates on one shell code frame of 16 pulses */
+void silk_shell_decoder(
+    opus_int16 *pulses0, /* O data: nonnegative pulse amplitudes */
+    ec_dec *psRangeDec, /* I/O Compressor data structure */
+    const opus_int pulses4 /* I number of pulses per pulse-subframe */
+)
+{
+    opus_int16 pulses3[ 2 ], pulses2[ 4 ], pulses1[ 8 ];
+
+    /* this function operates on one shell code frame of 16 pulses */
+    silk_assert( SHELL_CODEC_FRAME_LENGTH == 16 );
+
+    decode_split( &pulses3[ 0 ], &pulses3[ 1 ], psRangeDec, pulses4, silk_shell_code_table3 );
+
+    decode_split( &pulses2[ 0 ], &pulses2[ 1 ], psRangeDec, pulses3[ 0 ], silk_shell_code_table2 );
+
+    decode_split( &pulses1[ 0 ], &pulses1[ 1 ], psRangeDec, pulses2[ 0 ], silk_shell_code_table1 );
+    decode_split( &pulses0[ 0 ], &pulses0[ 1 ], psRangeDec, pulses1[ 0 ], silk_shell_code_table0 );
+    decode_split( &pulses0[ 2 ], &pulses0[ 3 ], psRangeDec, pulses1[ 1 ], silk_shell_code_table0 );
+
+    decode_split( &pulses1[ 2 ], &pulses1[ 3 ], psRangeDec, pulses2[ 1 ], silk_shell_code_table1 );
+    decode_split( &pulses0[ 4 ], &pulses0[ 5 ], psRangeDec, pulses1[ 2 ], silk_shell_code_table0 );
+    decode_split( &pulses0[ 6 ], &pulses0[ 7 ], psRangeDec, pulses1[ 3 ], silk_shell_code_table0 );
+
+    decode_split( &pulses2[ 2 ], &pulses2[ 3 ], psRangeDec, pulses3[ 1 ], silk_shell_code_table2 );
+
+    decode_split( &pulses1[ 4 ], &pulses1[ 5 ], psRangeDec, pulses2[ 2 ], silk_shell_code_table1 );
+    decode_split( &pulses0[ 8 ], &pulses0[ 9 ], psRangeDec, pulses1[ 4 ], silk_shell_code_table0 );
+    decode_split( &pulses0[ 10 ], &pulses0[ 11 ], psRangeDec, pulses1[ 5 ], silk_shell_code_table0 );
+
+    decode_split( &pulses1[ 6 ], &pulses1[ 7 ], psRangeDec, pulses2[ 3 ], silk_shell_code_table1 );
+    decode_split( &pulses0[ 12 ], &pulses0[ 13 ], psRangeDec, pulses1[ 6 ], silk_shell_code_table0 );
+    decode_split( &pulses0[ 14 ], &pulses0[ 15 ], psRangeDec, pulses1[ 7 ], silk_shell_code_table0 );
+}
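The shell coder above distributes a known total of 16 pulses by recursively halving the frame and range-coding only how each parent total splits between its two children; the second child is always implied by subtraction. A minimal sketch of the combining tree, using only standard C (illustrative only, not part of the vendored sources):

#include <stdio.h>

/* One level of the 16 -> 8 -> 4 -> 2 -> 1 combining tree used by
   silk_shell_encoder: sum adjacent pairs of pulse counts. */
static void combine( int *out, const int *in, int len )
{
    int k;
    for( k = 0; k < len; k++ ) {
        out[ k ] = in[ 2 * k ] + in[ 2 * k + 1 ];
    }
}

int main( void )
{
    /* 16 nonnegative pulse amplitudes, as silk_shell_encoder expects. */
    int p0[ 16 ] = { 0, 2, 1, 0, 0, 0, 3, 0, 1, 0, 0, 0, 0, 1, 0, 0 };
    int p1[ 8 ], p2[ 4 ], p3[ 2 ], p4[ 1 ];

    combine( p1, p0, 8 );
    combine( p2, p1, 4 );
    combine( p3, p2, 2 );
    combine( p4, p3, 1 );

    /* The per-frame total p4[ 0 ] is coded separately; at each level the
       encoder then codes only the first child of every split, since
       p - p_child1 never needs to be transmitted. */
    printf( "total: %d, halves: %d + %d\n", p4[ 0 ], p3[ 0 ], p3[ 1 ] );
    return 0;
}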
diff --git a/thirdparty/opus/silk/sigm_Q15.c b/thirdparty/opus/silk/sigm_Q15.c
new file mode 100644
index 000000000000..3c507d255b15
--- /dev/null
+++ b/thirdparty/opus/silk/sigm_Q15.c
@@ -0,0 +1,76 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+/* Approximate sigmoid function */
+
+#include "SigProc_FIX.h"
+
+/* fprintf(1, '%d, ', round(1024 * ([1 ./ (1 + exp(-(1:5))), 1] - 1 ./ (1 + exp(-(0:5)))))); */
+static const opus_int32 sigm_LUT_slope_Q10[ 6 ] = {
+    237, 153, 73, 30, 12, 7
+};
+/* fprintf(1, '%d, ', round(32767 * 1 ./ (1 + exp(-(0:5))))); */
+static const opus_int32 sigm_LUT_pos_Q15[ 6 ] = {
+    16384, 23955, 28861, 31213, 32178, 32548
+};
+/* fprintf(1, '%d, ', round(32767 * 1 ./ (1 + exp((0:5))))); */
+static const opus_int32 sigm_LUT_neg_Q15[ 6 ] = {
+    16384, 8812, 3906, 1554, 589, 219
+};
+
+opus_int silk_sigm_Q15(
+    opus_int in_Q5 /* I */
+)
+{
+    opus_int ind;
+
+    if( in_Q5 < 0 ) {
+        /* Negative input */
+        in_Q5 = -in_Q5;
+        if( in_Q5 >= 6 * 32 ) {
+            return 0; /* Clip */
+        } else {
+            /* Linear interpolation of look-up table */
+            ind = silk_RSHIFT( in_Q5, 5 );
+            return( sigm_LUT_neg_Q15[ ind ] - silk_SMULBB( sigm_LUT_slope_Q10[ ind ], in_Q5 & 0x1F ) );
+        }
+    } else {
+        /* Positive input */
+        if( in_Q5 >= 6 * 32 ) {
+            return 32767; /* clip */
+        } else {
+            /* Linear interpolation of look-up table */
+            ind = silk_RSHIFT( in_Q5, 5 );
+            return( sigm_LUT_pos_Q15[ ind ] + silk_SMULBB( sigm_LUT_slope_Q10[ ind ], in_Q5 & 0x1F ) );
+        }
+    }
+}
+
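silk_sigm_Q15 evaluates the logistic function in fixed point: the input is in Q5 (32 steps per unit), the output in Q15, and each unit-wide segment is linearly interpolated from the three tables above. A float reference for comparison (illustrative sketch only, not part of the vendored sources):

#include <math.h>
#include <stdio.h>

/* Float reference for the Q15 sigmoid: the value silk_sigm_Q15( in_Q5 )
   approximates is round( 32767 / ( 1 + exp( -in_Q5 / 32.0 ) ) ). */
int main( void )
{
    int in_Q5 = 48; /* represents x = 48 / 32 = 1.5 */
    double x = in_Q5 / 32.0;
    double ref = 32767.0 / ( 1.0 + exp( -x ) );
    printf( "x = %.2f -> reference Q15 value %.0f\n", x, ref );
    return 0;
}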
diff --git a/thirdparty/opus/silk/sort.c b/thirdparty/opus/silk/sort.c
new file mode 100644
index 000000000000..7187c9efb113
--- /dev/null
+++ b/thirdparty/opus/silk/sort.c
@@ -0,0 +1,154 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+/* Insertion sort (fast for already almost sorted arrays): */
+/* Best case: O(n) for an already sorted array */
+/* Worst case: O(n^2) for an inversely sorted array */
+/* */
+/* Shell sort: https://en.wikipedia.org/wiki/Shell_sort */
+
+#include "SigProc_FIX.h"
+
+void silk_insertion_sort_increasing(
+    opus_int32 *a, /* I/O Unsorted / Sorted vector */
+    opus_int *idx, /* O Index vector for the sorted elements */
+    const opus_int L, /* I Vector length */
+    const opus_int K /* I Number of correctly sorted positions */
+)
+{
+    opus_int32 value;
+    opus_int i, j;
+
+    /* Safety checks */
+    silk_assert( K > 0 );
+    silk_assert( L > 0 );
+    silk_assert( L >= K );
+
+    /* Write start indices in index vector */
+    for( i = 0; i < K; i++ ) {
+        idx[ i ] = i;
+    }
+
+    /* Sort vector elements by value, increasing order */
+    for( i = 1; i < K; i++ ) {
+        value = a[ i ];
+        for( j = i - 1; ( j >= 0 ) && ( value < a[ j ] ); j-- ) {
+            a[ j + 1 ] = a[ j ]; /* Shift value */
+            idx[ j + 1 ] = idx[ j ]; /* Shift index */
+        }
+        a[ j + 1 ] = value; /* Write value */
+        idx[ j + 1 ] = i; /* Write index */
+    }
+
+    /* If less than L values are asked for, check the remaining values, */
+    /* but only spend CPU to ensure that the K first values are correct */
+    for( i = K; i < L; i++ ) {
+        value = a[ i ];
+        if( value < a[ K - 1 ] ) {
+            for( j = K - 2; ( j >= 0 ) && ( value < a[ j ] ); j-- ) {
+                a[ j + 1 ] = a[ j ]; /* Shift value */
+                idx[ j + 1 ] = idx[ j ]; /* Shift index */
+            }
+            a[ j + 1 ] = value; /* Write value */
+            idx[ j + 1 ] = i; /* Write index */
+        }
+    }
+}
+
+#ifdef FIXED_POINT
+/* This function is only used by the fixed-point build */
+void silk_insertion_sort_decreasing_int16(
+    opus_int16 *a, /* I/O Unsorted / Sorted vector */
+    opus_int *idx, /* O Index vector for the sorted elements */
+    const opus_int L, /* I Vector length */
+    const opus_int K /* I Number of correctly sorted positions */
+)
+{
+    opus_int i, j;
+    opus_int value;
+
+    /* Safety checks */
+    silk_assert( K > 0 );
+    silk_assert( L > 0 );
+    silk_assert( L >= K );
+
+    /* Write start indices in index vector */
+    for( i = 0; i < K; i++ ) {
+        idx[ i ] = i;
+    }
+
+    /* Sort vector elements by value, decreasing order */
+    for( i = 1; i < K; i++ ) {
+        value = a[ i ];
+        for( j = i - 1; ( j >= 0 ) && ( value > a[ j ] ); j-- ) {
+            a[ j + 1 ] = a[ j ]; /* Shift value */
+            idx[ j + 1 ] = idx[ j ]; /* Shift index */
+        }
+        a[ j + 1 ] = value; /* Write value */
+        idx[ j + 1 ] = i; /* Write index */
+    }
+
+    /* If less than L values are asked for, check the remaining values, */
+    /* but only spend CPU to ensure that the K first values are correct */
+    for( i = K; i < L; i++ ) {
+        value = a[ i ];
+        if( value > a[ K - 1 ] ) {
+            for( j = K - 2; ( j >= 0 ) && ( value > a[ j ] ); j-- ) {
+                a[ j + 1 ] = a[ j ]; /* Shift value */
+                idx[ j + 1 ] = idx[ j ]; /* Shift index */
+            }
+            a[ j + 1 ] = value; /* Write value */
+            idx[ j + 1 ] = i; /* Write index */
+        }
+    }
+}
+#endif
+
+void silk_insertion_sort_increasing_all_values_int16(
+    opus_int16 *a, /* I/O Unsorted / Sorted vector */
+    const opus_int L /* I Vector length */
+)
+{
+    opus_int value;
+    opus_int i, j;
+
+    /* Safety checks */
+    silk_assert( L > 0 );
+
+    /* Sort vector elements by value, increasing order */
+    for( i = 1; i < L; i++ ) {
+        value = a[ i ];
+        for( j = i - 1; ( j >= 0 ) && ( value < a[ j ] ); j-- ) {
+            a[ j + 1 ] = a[ j ]; /* Shift value */
+        }
+        a[ j + 1 ] = value; /* Write value */
+    }
+}
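Note the contract of the K parameter in the two indexed variants: only the first K output positions are guaranteed sorted, which is all that callers such as the NLSF codebook and pitch searches need. A self-contained sketch with the same contract, using plain int (illustrative only, not part of the vendored sources):

#include <stdio.h>

/* Same contract as silk_insertion_sort_increasing: afterwards a[ 0..K-1 ]
   hold the K smallest values in increasing order and idx[ 0..K-1 ] their
   original positions; a[ K..L-1 ] are left unsorted. */
static void partial_sort( int *a, int *idx, int L, int K )
{
    int i, j, v;
    for( i = 0; i < K; i++ ) idx[ i ] = i;
    for( i = 1; i < L; i++ ) {
        v = a[ i ];
        if( i >= K && v >= a[ K - 1 ] ) continue; /* cannot enter the top K */
        for( j = ( i < K ? i : K - 1 ) - 1; j >= 0 && v < a[ j ]; j-- ) {
            a[ j + 1 ] = a[ j ];
            idx[ j + 1 ] = idx[ j ];
        }
        a[ j + 1 ] = v;
        idx[ j + 1 ] = i;
    }
}

int main( void )
{
    int k;
    int a[ 6 ] = { 40, 10, 50, 20, 60, 30 }, idx[ 3 ];
    partial_sort( a, idx, 6, 3 );
    /* Prints: 10 (from 1), 20 (from 3), 30 (from 5) */
    for( k = 0; k < 3; k++ ) printf( "%d (from %d)\n", a[ k ], idx[ k ] );
    return 0;
}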
diff --git a/thirdparty/opus/silk/stereo_LR_to_MS.c b/thirdparty/opus/silk/stereo_LR_to_MS.c
new file mode 100644
index 000000000000..dda0298de27a
--- /dev/null
+++ b/thirdparty/opus/silk/stereo_LR_to_MS.c
@@ -0,0 +1,229 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+#include "stack_alloc.h"
+
+/* Convert Left/Right stereo signal to adaptive Mid/Side representation */
+void silk_stereo_LR_to_MS(
+    stereo_enc_state *state, /* I/O State */
+    opus_int16 x1[], /* I/O Left input signal, becomes mid signal */
+    opus_int16 x2[], /* I/O Right input signal, becomes side signal */
+    opus_int8 ix[ 2 ][ 3 ], /* O Quantization indices */
+    opus_int8 *mid_only_flag, /* O Flag: only mid signal coded */
+    opus_int32 mid_side_rates_bps[], /* O Bitrates for mid and side signals */
+    opus_int32 total_rate_bps, /* I Total bitrate */
+    opus_int prev_speech_act_Q8, /* I Speech activity level in previous frame */
+    opus_int toMono, /* I Last frame before a stereo->mono transition */
+    opus_int fs_kHz, /* I Sample rate (kHz) */
+    opus_int frame_length /* I Number of samples */
+)
+{
+    opus_int n, is10msFrame, denom_Q16, delta0_Q13, delta1_Q13;
+    opus_int32 sum, diff, smooth_coef_Q16, pred_Q13[ 2 ], pred0_Q13, pred1_Q13;
+    opus_int32 LP_ratio_Q14, HP_ratio_Q14, frac_Q16, frac_3_Q16, min_mid_rate_bps, width_Q14, w_Q24, deltaw_Q24;
+    VARDECL( opus_int16, side );
+    VARDECL( opus_int16, LP_mid );
+    VARDECL( opus_int16, HP_mid );
+    VARDECL( opus_int16, LP_side );
+    VARDECL( opus_int16, HP_side );
+    opus_int16 *mid = &x1[ -2 ];
+    SAVE_STACK;
+
+    ALLOC( side, frame_length + 2, opus_int16 );
+    /* Convert to basic mid/side signals */
+    for( n = 0; n < frame_length + 2; n++ ) {
+        sum = x1[ n - 2 ] + (opus_int32)x2[ n - 2 ];
+        diff = x1[ n - 2 ] - (opus_int32)x2[ n - 2 ];
+        mid[ n ] = (opus_int16)silk_RSHIFT_ROUND( sum, 1 );
+        side[ n ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( diff, 1 ) );
+    }
+
+    /* Buffering */
+    silk_memcpy( mid, state->sMid, 2 * sizeof( opus_int16 ) );
+    silk_memcpy( side, state->sSide, 2 * sizeof( opus_int16 ) );
+    silk_memcpy( state->sMid, &mid[ frame_length ], 2 * sizeof( opus_int16 ) );
+    silk_memcpy( state->sSide, &side[ frame_length ], 2 * sizeof( opus_int16 ) );
+
+    /* LP and HP filter mid signal */
+    ALLOC( LP_mid, frame_length, opus_int16 );
+    ALLOC( HP_mid, frame_length, opus_int16 );
+    for( n = 0; n < frame_length; n++ ) {
+        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 );
+        LP_mid[ n ] = sum;
+        HP_mid[ n ] = mid[ n + 1 ] - sum;
+    }
+
+    /* LP and HP filter side signal */
+    ALLOC( LP_side, frame_length, opus_int16 );
+    ALLOC( HP_side, frame_length, opus_int16 );
+    for( n = 0; n < frame_length; n++ ) {
+        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 );
+        LP_side[ n ] = sum;
+        HP_side[ n ] = side[ n + 1 ] - sum;
+    }
+
+    /* Find energies and predictors */
+    is10msFrame = frame_length == 10 * fs_kHz;
+    smooth_coef_Q16 = is10msFrame ?
+        SILK_FIX_CONST( STEREO_RATIO_SMOOTH_COEF / 2, 16 ) :
+        SILK_FIX_CONST( STEREO_RATIO_SMOOTH_COEF, 16 );
+    smooth_coef_Q16 = silk_SMULWB( silk_SMULBB( prev_speech_act_Q8, prev_speech_act_Q8 ), smooth_coef_Q16 );
+
+    pred_Q13[ 0 ] = silk_stereo_find_predictor( &LP_ratio_Q14, LP_mid, LP_side, &state->mid_side_amp_Q0[ 0 ], frame_length, smooth_coef_Q16 );
+    pred_Q13[ 1 ] = silk_stereo_find_predictor( &HP_ratio_Q14, HP_mid, HP_side, &state->mid_side_amp_Q0[ 2 ], frame_length, smooth_coef_Q16 );
+    /* Ratio of the norms of residual and mid signals */
+    frac_Q16 = silk_SMLABB( HP_ratio_Q14, LP_ratio_Q14, 3 );
+    frac_Q16 = silk_min( frac_Q16, SILK_FIX_CONST( 1, 16 ) );
+
+    /* Determine bitrate distribution between mid and side, and possibly reduce stereo width */
+    total_rate_bps -= is10msFrame ? 1200 : 600; /* Subtract approximate bitrate for coding stereo parameters */
+    if( total_rate_bps < 1 ) {
+        total_rate_bps = 1;
+    }
+    min_mid_rate_bps = silk_SMLABB( 2000, fs_kHz, 900 );
+    silk_assert( min_mid_rate_bps < 32767 );
+    /* Default bitrate distribution: 8 parts for Mid and (5+3*frac) parts for Side, so: mid_rate = ( 8 / ( 13 + 3 * frac ) ) * total_rate */
+    frac_3_Q16 = silk_MUL( 3, frac_Q16 );
+    mid_side_rates_bps[ 0 ] = silk_DIV32_varQ( total_rate_bps, SILK_FIX_CONST( 8 + 5, 16 ) + frac_3_Q16, 16+3 );
+    /* If Mid bitrate below minimum, reduce stereo width */
+    if( mid_side_rates_bps[ 0 ] < min_mid_rate_bps ) {
+        mid_side_rates_bps[ 0 ] = min_mid_rate_bps;
+        mid_side_rates_bps[ 1 ] = total_rate_bps - mid_side_rates_bps[ 0 ];
+        /* width = 4 * ( 2 * side_rate - min_rate ) / ( ( 1 + 3 * frac ) * min_rate ) */
+        width_Q14 = silk_DIV32_varQ( silk_LSHIFT( mid_side_rates_bps[ 1 ], 1 ) - min_mid_rate_bps,
+            silk_SMULWB( SILK_FIX_CONST( 1, 16 ) + frac_3_Q16, min_mid_rate_bps ), 14+2 );
+        width_Q14 = silk_LIMIT( width_Q14, 0, SILK_FIX_CONST( 1, 14 ) );
+    } else {
+        mid_side_rates_bps[ 1 ] = total_rate_bps - mid_side_rates_bps[ 0 ];
+        width_Q14 = SILK_FIX_CONST( 1, 14 );
+    }
+
+    /* Smoother */
+    state->smth_width_Q14 = (opus_int16)silk_SMLAWB( state->smth_width_Q14, width_Q14 - state->smth_width_Q14, smooth_coef_Q16 );
+
+    /* At very low bitrates or for inputs that are nearly amplitude panned, switch to panned-mono coding */
+    *mid_only_flag = 0;
+    if( toMono ) {
+        /* Last frame before stereo->mono transition; collapse stereo width */
+        width_Q14 = 0;
+        pred_Q13[ 0 ] = 0;
+        pred_Q13[ 1 ] = 0;
+        silk_stereo_quant_pred( pred_Q13, ix );
+    } else if( state->width_prev_Q14 == 0 &&
+        ( 8 * total_rate_bps < 13 * min_mid_rate_bps || silk_SMULWB( frac_Q16, state->smth_width_Q14 ) < SILK_FIX_CONST( 0.05, 14 ) ) )
+    {
+        /* Code as panned-mono; previous frame already had zero width */
+        /* Scale down and quantize predictors */
+        pred_Q13[ 0 ] = silk_RSHIFT( silk_SMULBB( state->smth_width_Q14, pred_Q13[ 0 ] ), 14 );
+        pred_Q13[ 1 ] = silk_RSHIFT( silk_SMULBB( state->smth_width_Q14, pred_Q13[ 1 ] ), 14 );
+        silk_stereo_quant_pred( pred_Q13, ix );
+        /* Collapse stereo width */
+        width_Q14 = 0;
+        pred_Q13[ 0 ] = 0;
+        pred_Q13[ 1 ] = 0;
+        mid_side_rates_bps[ 0 ] = total_rate_bps;
+        mid_side_rates_bps[ 1 ] = 0;
+        *mid_only_flag = 1;
+    } else if( state->width_prev_Q14 != 0 &&
+        ( 8 * total_rate_bps < 11 * min_mid_rate_bps || silk_SMULWB( frac_Q16, state->smth_width_Q14 ) < SILK_FIX_CONST( 0.02, 14 ) ) )
+    {
+        /* Transition to zero-width stereo */
+        /* Scale down and quantize predictors */
+        pred_Q13[ 0 ] = silk_RSHIFT( silk_SMULBB( state->smth_width_Q14, pred_Q13[ 0 ] ), 14 );
+        pred_Q13[ 1 ] = silk_RSHIFT( silk_SMULBB( state->smth_width_Q14, pred_Q13[ 1 ] ), 14 );
+        silk_stereo_quant_pred( pred_Q13, ix );
+        /* Collapse stereo width */
+        width_Q14 = 0;
+        pred_Q13[ 0 ] = 0;
+        pred_Q13[ 1 ] = 0;
+    } else if( state->smth_width_Q14 > SILK_FIX_CONST( 0.95, 14 ) ) {
+        /* Full-width stereo coding */
+        silk_stereo_quant_pred( pred_Q13, ix );
+        width_Q14 = SILK_FIX_CONST( 1, 14 );
+    } else {
+        /* Reduced-width stereo coding; scale down and quantize predictors */
+        pred_Q13[ 0 ] = silk_RSHIFT( silk_SMULBB( state->smth_width_Q14, pred_Q13[ 0 ] ), 14 );
+        pred_Q13[ 1 ] = silk_RSHIFT( silk_SMULBB( state->smth_width_Q14, pred_Q13[ 1 ] ), 14 );
+        silk_stereo_quant_pred( pred_Q13, ix );
+        width_Q14 = state->smth_width_Q14;
+    }
+
+    /* Make sure to keep on encoding until the tapered output has been transmitted */
+    if( *mid_only_flag == 1 ) {
+        state->silent_side_len += frame_length - STEREO_INTERP_LEN_MS * fs_kHz;
+        if( state->silent_side_len < LA_SHAPE_MS * fs_kHz ) {
+            *mid_only_flag = 0;
+        } else {
+            /* Limit to avoid wrapping around */
+            state->silent_side_len = 10000;
+        }
+    } else {
+        state->silent_side_len = 0;
+    }
+
+    if( *mid_only_flag == 0 && mid_side_rates_bps[ 1 ] < 1 ) {
+        mid_side_rates_bps[ 1 ] = 1;
+        mid_side_rates_bps[ 0 ] = silk_max_int( 1, total_rate_bps - mid_side_rates_bps[ 1 ]);
+    }
+
+    /* Interpolate predictors and subtract prediction from side channel */
+    pred0_Q13 = -state->pred_prev_Q13[ 0 ];
+    pred1_Q13 = -state->pred_prev_Q13[ 1 ];
+    w_Q24 = silk_LSHIFT( state->width_prev_Q14, 10 );
+    denom_Q16 = silk_DIV32_16( (opus_int32)1 << 16, STEREO_INTERP_LEN_MS * fs_kHz );
+    delta0_Q13 = -silk_RSHIFT_ROUND( silk_SMULBB( pred_Q13[ 0 ] - state->pred_prev_Q13[ 0 ], denom_Q16 ), 16 );
+    delta1_Q13 = -silk_RSHIFT_ROUND( silk_SMULBB( pred_Q13[ 1 ] - state->pred_prev_Q13[ 1 ], denom_Q16 ), 16 );
+    deltaw_Q24 = silk_LSHIFT( silk_SMULWB( width_Q14 - state->width_prev_Q14, denom_Q16 ), 10 );
+    for( n = 0; n < STEREO_INTERP_LEN_MS * fs_kHz; n++ ) {
+        pred0_Q13 += delta0_Q13;
+        pred1_Q13 += delta1_Q13;
+        w_Q24 += deltaw_Q24;
+        sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */
+        sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 ); /* Q8 */
+        sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */
+        x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
+    }
+
+    pred0_Q13 = -pred_Q13[ 0 ];
+    pred1_Q13 = -pred_Q13[ 1 ];
+    w_Q24 = silk_LSHIFT( width_Q14, 10 );
+    for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
+        sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */
+        sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 ); /* Q8 */
+        sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */
+        x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
+    }
+    state->pred_prev_Q13[ 0 ] = (opus_int16)pred_Q13[ 0 ];
+    state->pred_prev_Q13[ 1 ] = (opus_int16)pred_Q13[ 1 ];
+    state->width_prev_Q14 = (opus_int16)width_Q14;
+    RESTORE_STACK;
+}
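Before any prediction or width tapering, the core transform above is just a sum/difference pair: mid = (L + R) / 2 and side = (L − R) / 2, with rounding and saturation in the fixed-point code. A float sketch of the round trip that stereo_MS_to_LR.c (next file) undoes (illustrative only, not part of the vendored sources):

#include <stdio.h>

int main( void )
{
    /* One left/right sample pair. */
    double L = 0.50, R = -0.25;

    /* Forward transform, as in silk_stereo_LR_to_MS before prediction: */
    double mid = 0.5 * ( L + R );
    double side = 0.5 * ( L - R );

    /* Inverse transform, as in silk_stereo_MS_to_LR: */
    double L2 = mid + side;
    double R2 = mid - side;

    printf( "mid=%.3f side=%.3f -> L=%.3f R=%.3f\n", mid, side, L2, R2 );
    return 0;
}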
diff --git a/thirdparty/opus/silk/stereo_MS_to_LR.c b/thirdparty/opus/silk/stereo_MS_to_LR.c
new file mode 100644
index 000000000000..62521a4f352f
--- /dev/null
+++ b/thirdparty/opus/silk/stereo_MS_to_LR.c
@@ -0,0 +1,85 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+
+/* Convert adaptive Mid/Side representation to Left/Right stereo signal */
+void silk_stereo_MS_to_LR(
+    stereo_dec_state *state, /* I/O State */
+    opus_int16 x1[], /* I/O Left input signal, becomes mid signal */
+    opus_int16 x2[], /* I/O Right input signal, becomes side signal */
+    const opus_int32 pred_Q13[], /* I Predictors */
+    opus_int fs_kHz, /* I Sample rate (kHz) */
+    opus_int frame_length /* I Number of samples */
+)
+{
+    opus_int n, denom_Q16, delta0_Q13, delta1_Q13;
+    opus_int32 sum, diff, pred0_Q13, pred1_Q13;
+
+    /* Buffering */
+    silk_memcpy( x1, state->sMid, 2 * sizeof( opus_int16 ) );
+    silk_memcpy( x2, state->sSide, 2 * sizeof( opus_int16 ) );
+    silk_memcpy( state->sMid, &x1[ frame_length ], 2 * sizeof( opus_int16 ) );
+    silk_memcpy( state->sSide, &x2[ frame_length ], 2 * sizeof( opus_int16 ) );
+
+    /* Interpolate predictors and add prediction to side channel */
+    pred0_Q13 = state->pred_prev_Q13[ 0 ];
+    pred1_Q13 = state->pred_prev_Q13[ 1 ];
+    denom_Q16 = silk_DIV32_16( (opus_int32)1 << 16, STEREO_INTERP_LEN_MS * fs_kHz );
+    delta0_Q13 = silk_RSHIFT_ROUND( silk_SMULBB( pred_Q13[ 0 ] - state->pred_prev_Q13[ 0 ], denom_Q16 ), 16 );
+    delta1_Q13 = silk_RSHIFT_ROUND( silk_SMULBB( pred_Q13[ 1 ] - state->pred_prev_Q13[ 1 ], denom_Q16 ), 16 );
+    for( n = 0; n < STEREO_INTERP_LEN_MS * fs_kHz; n++ ) {
+        pred0_Q13 += delta0_Q13;
+        pred1_Q13 += delta1_Q13;
+        sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */
+        sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 ); /* Q8 */
+        sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */
+        x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
+    }
+    pred0_Q13 = pred_Q13[ 0 ];
+    pred1_Q13 = pred_Q13[ 1 ];
+    for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
+        sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */
+        sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 ); /* Q8 */
+        sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */
+        x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
+    }
+    state->pred_prev_Q13[ 0 ] = pred_Q13[ 0 ];
+    state->pred_prev_Q13[ 1 ] = pred_Q13[ 1 ];
+
+    /* Convert to left/right signals */
+    for( n = 0; n < frame_length; n++ ) {
+        sum = x1[ n + 1 ] + (opus_int32)x2[ n + 1 ];
+        diff = x1[ n + 1 ] - (opus_int32)x2[ n + 1 ];
+        x1[ n + 1 ] = (opus_int16)silk_SAT16( sum );
+        x2[ n + 1 ] = (opus_int16)silk_SAT16( diff );
+    }
+}
diff --git a/thirdparty/opus/silk/stereo_decode_pred.c b/thirdparty/opus/silk/stereo_decode_pred.c
new file mode 100644
index 000000000000..56ba3925e871
--- /dev/null
+++ b/thirdparty/opus/silk/stereo_decode_pred.c
@@ -0,0 +1,73 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+
+/* Decode mid/side predictors */
+void silk_stereo_decode_pred(
+    ec_dec *psRangeDec, /* I/O Compressor data structure */
+    opus_int32 pred_Q13[] /* O Predictors */
+)
+{
+    opus_int n, ix[ 2 ][ 3 ];
+    opus_int32 low_Q13, step_Q13;
+
+    /* Entropy decoding */
+    n = ec_dec_icdf( psRangeDec, silk_stereo_pred_joint_iCDF, 8 );
+    ix[ 0 ][ 2 ] = silk_DIV32_16( n, 5 );
+    ix[ 1 ][ 2 ] = n - 5 * ix[ 0 ][ 2 ];
+    for( n = 0; n < 2; n++ ) {
+        ix[ n ][ 0 ] = ec_dec_icdf( psRangeDec, silk_uniform3_iCDF, 8 );
+        ix[ n ][ 1 ] = ec_dec_icdf( psRangeDec, silk_uniform5_iCDF, 8 );
+    }
+
+    /* Dequantize */
+    for( n = 0; n < 2; n++ ) {
+        ix[ n ][ 0 ] += 3 * ix[ n ][ 2 ];
+        low_Q13 = silk_stereo_pred_quant_Q13[ ix[ n ][ 0 ] ];
+        step_Q13 = silk_SMULWB( silk_stereo_pred_quant_Q13[ ix[ n ][ 0 ] + 1 ] - low_Q13,
+            SILK_FIX_CONST( 0.5 / STEREO_QUANT_SUB_STEPS, 16 ) );
+        pred_Q13[ n ] = silk_SMLABB( low_Q13, step_Q13, 2 * ix[ n ][ 1 ] + 1 );
+    }
+
+    /* Subtract second from first predictor (helps when actually applying these) */
+    pred_Q13[ 0 ] -= pred_Q13[ 1 ];
+}
+
+/* Decode mid-only flag */
+void silk_stereo_decode_mid_only(
+    ec_dec *psRangeDec, /* I/O Compressor data structure */
+    opus_int *decode_only_mid /* O Flag that only mid channel has been coded */
+)
+{
+    /* Decode flag that only mid channel is coded */
+    *decode_only_mid = ec_dec_icdf( psRangeDec, silk_stereo_only_code_mid_iCDF, 8 );
+}
diff --git a/thirdparty/opus/silk/stereo_encode_pred.c b/thirdparty/opus/silk/stereo_encode_pred.c
new file mode 100644
index 000000000000..e6dd19506640
--- /dev/null
+++ b/thirdparty/opus/silk/stereo_encode_pred.c
@@ -0,0 +1,62 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+
+/* Entropy code the mid/side quantization indices */
+void silk_stereo_encode_pred(
+    ec_enc *psRangeEnc, /* I/O Compressor data structure */
+    opus_int8 ix[ 2 ][ 3 ] /* I Quantization indices */
+)
+{
+    opus_int n;
+
+    /* Entropy coding */
+    n = 5 * ix[ 0 ][ 2 ] + ix[ 1 ][ 2 ];
+    silk_assert( n < 25 );
+    ec_enc_icdf( psRangeEnc, n, silk_stereo_pred_joint_iCDF, 8 );
+    for( n = 0; n < 2; n++ ) {
+        silk_assert( ix[ n ][ 0 ] < 3 );
+        silk_assert( ix[ n ][ 1 ] < STEREO_QUANT_SUB_STEPS );
+        ec_enc_icdf( psRangeEnc, ix[ n ][ 0 ], silk_uniform3_iCDF, 8 );
+        ec_enc_icdf( psRangeEnc, ix[ n ][ 1 ], silk_uniform5_iCDF, 8 );
+    }
+}
+
+/* Entropy code the mid-only flag */
+void silk_stereo_encode_mid_only(
+    ec_enc *psRangeEnc, /* I/O Compressor data structure */
+    opus_int8 mid_only_flag
+)
+{
+    /* Encode flag that only mid channel is coded */
+    ec_enc_icdf( psRangeEnc, mid_only_flag, silk_stereo_only_code_mid_iCDF, 8 );
+}
diff --git a/thirdparty/opus/silk/stereo_find_predictor.c b/thirdparty/opus/silk/stereo_find_predictor.c
new file mode 100644
index 000000000000..e30e90bddc28
--- /dev/null
+++ b/thirdparty/opus/silk/stereo_find_predictor.c
@@ -0,0 +1,79 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+
+/* Find least-squares prediction gain for one signal based on another and quantize it */
+opus_int32 silk_stereo_find_predictor( /* O Returns predictor in Q13 */
+    opus_int32 *ratio_Q14, /* O Ratio of residual and mid energies */
+    const opus_int16 x[], /* I Basis signal */
+    const opus_int16 y[], /* I Target signal */
+    opus_int32 mid_res_amp_Q0[], /* I/O Smoothed mid, residual norms */
+    opus_int length, /* I Number of samples */
+    opus_int smooth_coef_Q16 /* I Smoothing coefficient */
+)
+{
+    opus_int scale, scale1, scale2;
+    opus_int32 nrgx, nrgy, corr, pred_Q13, pred2_Q10;
+
+    /* Find predictor */
+    silk_sum_sqr_shift( &nrgx, &scale1, x, length );
+    silk_sum_sqr_shift( &nrgy, &scale2, y, length );
+    scale = silk_max_int( scale1, scale2 );
+    scale = scale + ( scale & 1 ); /* make even */
+    nrgy = silk_RSHIFT32( nrgy, scale - scale2 );
+    nrgx = silk_RSHIFT32( nrgx, scale - scale1 );
+    nrgx = silk_max_int( nrgx, 1 );
+    corr = silk_inner_prod_aligned_scale( x, y, scale, length );
+    pred_Q13 = silk_DIV32_varQ( corr, nrgx, 13 );
+    pred_Q13 = silk_LIMIT( pred_Q13, -(1 << 14), 1 << 14 );
+    pred2_Q10 = silk_SMULWB( pred_Q13, pred_Q13 );
+
+    /* Faster update for signals with large prediction parameters */
+    smooth_coef_Q16 = (opus_int)silk_max_int( smooth_coef_Q16, silk_abs( pred2_Q10 ) );
+
+    /* Smoothed mid and residual norms */
+    silk_assert( smooth_coef_Q16 < 32768 );
+    scale = silk_RSHIFT( scale, 1 );
+    mid_res_amp_Q0[ 0 ] = silk_SMLAWB( mid_res_amp_Q0[ 0 ], silk_LSHIFT( silk_SQRT_APPROX( nrgx ), scale ) - mid_res_amp_Q0[ 0 ],
+        smooth_coef_Q16 );
+    /* Residual energy = nrgy - 2 * pred * corr + pred^2 * nrgx */
+    nrgy = silk_SUB_LSHIFT32( nrgy, silk_SMULWB( corr, pred_Q13 ), 3 + 1 );
+    nrgy = silk_ADD_LSHIFT32( nrgy, silk_SMULWB( nrgx, pred2_Q10 ), 6 );
+    mid_res_amp_Q0[ 1 ] = silk_SMLAWB( mid_res_amp_Q0[ 1 ], silk_LSHIFT( silk_SQRT_APPROX( nrgy ), scale ) - mid_res_amp_Q0[ 1 ],
+        smooth_coef_Q16 );
+
+    /* Ratio of smoothed residual and mid norms */
+    *ratio_Q14 = silk_DIV32_varQ( mid_res_amp_Q0[ 1 ], silk_max( mid_res_amp_Q0[ 0 ], 1 ), 14 );
+    *ratio_Q14 = silk_LIMIT( *ratio_Q14, 0, 32767 );
+
+    return pred_Q13;
+}
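The predictor above is the least-squares solution pred = corr(x, y) / energy(x), i.e. the gain that best maps the mid signal onto the side signal; the residual energy then follows as nrgy − 2·pred·corr + pred²·nrgx without ever forming the residual. A float sketch of the same computation (illustrative only, not part of the vendored sources):

#include <stdio.h>

int main( void )
{
    /* Toy basis (mid) and target (side) signals. */
    double x[ 4 ] = { 1.0, 2.0, -1.0, 0.5 };
    double y[ 4 ] = { 0.5, 1.1, -0.4, 0.2 };
    double nrgx = 0.0, nrgy = 0.0, corr = 0.0;
    int n;

    for( n = 0; n < 4; n++ ) {
        nrgx += x[ n ] * x[ n ];
        nrgy += y[ n ] * y[ n ];
        corr += x[ n ] * y[ n ];
    }

    /* Least-squares gain, the float analogue of the Q13 value returned
       by silk_stereo_find_predictor. */
    double pred = corr / nrgx;
    /* Energy of y - pred * x, expanded so the residual is never formed. */
    double res = nrgy - 2.0 * pred * corr + pred * pred * nrgx;

    printf( "pred=%.4f residual energy=%.4f\n", pred, res );
    return 0;
}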
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +/* Quantize mid/side predictors */ +void silk_stereo_quant_pred( + opus_int32 pred_Q13[], /* I/O Predictors (out: quantized) */ + opus_int8 ix[ 2 ][ 3 ] /* O Quantization indices */ +) +{ + opus_int i, j, n; + opus_int32 low_Q13, step_Q13, lvl_Q13, err_min_Q13, err_Q13, quant_pred_Q13 = 0; + + /* Quantize */ + for( n = 0; n < 2; n++ ) { + /* Brute-force search over quantization levels */ + err_min_Q13 = silk_int32_MAX; + for( i = 0; i < STEREO_QUANT_TAB_SIZE - 1; i++ ) { + low_Q13 = silk_stereo_pred_quant_Q13[ i ]; + step_Q13 = silk_SMULWB( silk_stereo_pred_quant_Q13[ i + 1 ] - low_Q13, + SILK_FIX_CONST( 0.5 / STEREO_QUANT_SUB_STEPS, 16 ) ); + for( j = 0; j < STEREO_QUANT_SUB_STEPS; j++ ) { + lvl_Q13 = silk_SMLABB( low_Q13, step_Q13, 2 * j + 1 ); + err_Q13 = silk_abs( pred_Q13[ n ] - lvl_Q13 ); + if( err_Q13 < err_min_Q13 ) { + err_min_Q13 = err_Q13; + quant_pred_Q13 = lvl_Q13; + ix[ n ][ 0 ] = i; + ix[ n ][ 1 ] = j; + } else { + /* Error increasing, so we're past the optimum */ + goto done; + } + } + } + done: + ix[ n ][ 2 ] = silk_DIV32_16( ix[ n ][ 0 ], 3 ); + ix[ n ][ 0 ] -= ix[ n ][ 2 ] * 3; + pred_Q13[ n ] = quant_pred_Q13; + } + + /* Subtract second from first predictor (helps when actually applying these) */ + pred_Q13[ 0 ] -= pred_Q13[ 1 ]; +} diff --git a/thirdparty/opus/silk/structs.h b/thirdparty/opus/silk/structs.h new file mode 100644 index 000000000000..827829dc6f32 --- /dev/null +++ b/thirdparty/opus/silk/structs.h @@ -0,0 +1,327 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_STRUCTS_H +#define SILK_STRUCTS_H + +#include "typedef.h" +#include "SigProc_FIX.h" +#include "define.h" +#include "entenc.h" +#include "entdec.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/************************************/ +/* Noise shaping quantization state */ +/************************************/ +typedef struct { + opus_int16 xq[ 2 * MAX_FRAME_LENGTH ]; /* Buffer for quantized output signal */ + opus_int32 sLTP_shp_Q14[ 2 * MAX_FRAME_LENGTH ]; + opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ]; + opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ]; + opus_int32 sLF_AR_shp_Q14; + opus_int lagPrev; + opus_int sLTP_buf_idx; + opus_int sLTP_shp_buf_idx; + opus_int32 rand_seed; + opus_int32 prev_gain_Q16; + opus_int rewhite_flag; +} silk_nsq_state; + +/********************************/ +/* VAD state */ +/********************************/ +typedef struct { + opus_int32 AnaState[ 2 ]; /* Analysis filterbank state: 0-8 kHz */ + opus_int32 AnaState1[ 2 ]; /* Analysis filterbank state: 0-4 kHz */ + opus_int32 AnaState2[ 2 ]; /* Analysis filterbank state: 0-2 kHz */ + opus_int32 XnrgSubfr[ VAD_N_BANDS ]; /* Subframe energies */ + opus_int32 NrgRatioSmth_Q8[ VAD_N_BANDS ]; /* Smoothed energy level in each band */ + opus_int16 HPstate; /* State of differentiator in the lowest band */ + opus_int32 NL[ VAD_N_BANDS ]; /* Noise energy level in each band */ + opus_int32 inv_NL[ VAD_N_BANDS ]; /* Inverse noise energy level in each band */ + opus_int32 NoiseLevelBias[ VAD_N_BANDS ]; /* Noise level estimator bias/offset */ + opus_int32 counter; /* Frame counter used in the initial phase */ +} silk_VAD_state; + +/* Variable cut-off low-pass filter state */ +typedef struct { + opus_int32 In_LP_State[ 2 ]; /* Low pass filter state */ + opus_int32 transition_frame_no; /* Counter which is mapped to a cut-off frequency */ + opus_int mode; /* Operating mode, <0: switch down, >0: switch up; 0: do nothing */ +} silk_LP_state; + +/* Structure containing NLSF codebook */ +typedef struct { + const opus_int16 nVectors; + const opus_int16 order; + const opus_int16 quantStepSize_Q16; + const opus_int16 invQuantStepSize_Q6; + const opus_uint8 *CB1_NLSF_Q8; + const opus_uint8 *CB1_iCDF; + const opus_uint8 *pred_Q8; + const opus_uint8 *ec_sel; + const opus_uint8 *ec_iCDF; + const opus_uint8 *ec_Rates_Q5; + const opus_int16 *deltaMin_Q15; +} silk_NLSF_CB_struct; + +typedef struct { + opus_int16 pred_prev_Q13[ 2 ]; + opus_int16 sMid[ 2 ]; + opus_int16 sSide[ 2 ]; + opus_int32 mid_side_amp_Q0[ 4 ]; + opus_int16 smth_width_Q14; + opus_int16 width_prev_Q14; + opus_int16 silent_side_len; + opus_int8 predIx[ MAX_FRAMES_PER_PACKET ][ 2 ][ 3 ]; + opus_int8 mid_only_flags[ MAX_FRAMES_PER_PACKET ]; +} stereo_enc_state; + +typedef struct { + opus_int16 pred_prev_Q13[ 2 ]; + opus_int16 sMid[ 2 ]; + opus_int16 sSide[ 2 ]; +} stereo_dec_state; + +typedef struct { + opus_int8 GainsIndices[ MAX_NB_SUBFR ]; + opus_int8 
LTPIndex[ MAX_NB_SUBFR ]; + opus_int8 NLSFIndices[ MAX_LPC_ORDER + 1 ]; + opus_int16 lagIndex; + opus_int8 contourIndex; + opus_int8 signalType; + opus_int8 quantOffsetType; + opus_int8 NLSFInterpCoef_Q2; + opus_int8 PERIndex; + opus_int8 LTP_scaleIndex; + opus_int8 Seed; +} SideInfoIndices; + +/********************************/ +/* Encoder state */ +/********************************/ +typedef struct { + opus_int32 In_HP_State[ 2 ]; /* High pass filter state */ + opus_int32 variable_HP_smth1_Q15; /* State of first smoother */ + opus_int32 variable_HP_smth2_Q15; /* State of second smoother */ + silk_LP_state sLP; /* Low pass filter state */ + silk_VAD_state sVAD; /* Voice activity detector state */ + silk_nsq_state sNSQ; /* Noise Shape Quantizer State */ + opus_int16 prev_NLSFq_Q15[ MAX_LPC_ORDER ]; /* Previously quantized NLSF vector */ + opus_int speech_activity_Q8; /* Speech activity */ + opus_int allow_bandwidth_switch; /* Flag indicating that switching of internal bandwidth is allowed */ + opus_int8 LBRRprevLastGainIndex; + opus_int8 prevSignalType; + opus_int prevLag; + opus_int pitch_LPC_win_length; + opus_int max_pitch_lag; /* Highest possible pitch lag (samples) */ + opus_int32 API_fs_Hz; /* API sampling frequency (Hz) */ + opus_int32 prev_API_fs_Hz; /* Previous API sampling frequency (Hz) */ + opus_int maxInternal_fs_Hz; /* Maximum internal sampling frequency (Hz) */ + opus_int minInternal_fs_Hz; /* Minimum internal sampling frequency (Hz) */ + opus_int desiredInternal_fs_Hz; /* Soft request for internal sampling frequency (Hz) */ + opus_int fs_kHz; /* Internal sampling frequency (kHz) */ + opus_int nb_subfr; /* Number of 5 ms subframes in a frame */ + opus_int frame_length; /* Frame length (samples) */ + opus_int subfr_length; /* Subframe length (samples) */ + opus_int ltp_mem_length; /* Length of LTP memory */ + opus_int la_pitch; /* Look-ahead for pitch analysis (samples) */ + opus_int la_shape; /* Look-ahead for noise shape analysis (samples) */ + opus_int shapeWinLength; /* Window length for noise shape analysis (samples) */ + opus_int32 TargetRate_bps; /* Target bitrate (bps) */ + opus_int PacketSize_ms; /* Number of milliseconds to put in each packet */ + opus_int PacketLoss_perc; /* Packet loss rate measured by farend */ + opus_int32 frameCounter; + opus_int Complexity; /* Complexity setting */ + opus_int nStatesDelayedDecision; /* Number of states in delayed decision quantization */ + opus_int useInterpolatedNLSFs; /* Flag for using NLSF interpolation */ + opus_int shapingLPCOrder; /* Filter order for noise shaping filters */ + opus_int predictLPCOrder; /* Filter order for prediction filters */ + opus_int pitchEstimationComplexity; /* Complexity level for pitch estimator */ + opus_int pitchEstimationLPCOrder; /* Whitening filter order for pitch estimator */ + opus_int32 pitchEstimationThreshold_Q16; /* Threshold for pitch estimator */ + opus_int LTPQuantLowComplexity; /* Flag for low complexity LTP quantization */ + opus_int mu_LTP_Q9; /* Rate-distortion tradeoff in LTP quantization */ + opus_int32 sum_log_gain_Q7; /* Cumulative max prediction gain */ + opus_int NLSF_MSVQ_Survivors; /* Number of survivors in NLSF MSVQ */ + opus_int first_frame_after_reset; /* Flag for deactivating NLSF interpolation, pitch prediction */ + opus_int controlled_since_last_payload; /* Flag for ensuring codec_control only runs once per packet */ + opus_int warping_Q16; /* Warping parameter for warped noise shaping */ + opus_int useCBR; /* Flag to enable constant bitrate */ + opus_int 
prefillFlag; /* Flag to indicate that only buffers are prefilled, no coding */
+ const opus_uint8 *pitch_lag_low_bits_iCDF; /* Pointer to iCDF table for low bits of pitch lag index */
+ const opus_uint8 *pitch_contour_iCDF; /* Pointer to iCDF table for pitch contour index */
+ const silk_NLSF_CB_struct *psNLSF_CB; /* Pointer to NLSF codebook */
+ opus_int input_quality_bands_Q15[ VAD_N_BANDS ];
+ opus_int input_tilt_Q15;
+ opus_int SNR_dB_Q7; /* Quality setting */
+
+ opus_int8 VAD_flags[ MAX_FRAMES_PER_PACKET ];
+ opus_int8 LBRR_flag;
+ opus_int LBRR_flags[ MAX_FRAMES_PER_PACKET ];
+
+ SideInfoIndices indices;
+ opus_int8 pulses[ MAX_FRAME_LENGTH ];
+
+ int arch;
+
+ /* Input/output buffering */
+ opus_int16 inputBuf[ MAX_FRAME_LENGTH + 2 ]; /* Buffer containing input signal */
+ opus_int inputBufIx;
+ opus_int nFramesPerPacket;
+ opus_int nFramesEncoded; /* Number of frames analyzed in current packet */
+
+ opus_int nChannelsAPI;
+ opus_int nChannelsInternal;
+ opus_int channelNb;
+
+ /* Parameters For LTP scaling Control */
+ opus_int frames_since_onset;
+
+ /* Specifically for entropy coding */
+ opus_int ec_prevSignalType;
+ opus_int16 ec_prevLagIndex;
+
+ silk_resampler_state_struct resampler_state;
+
+ /* DTX */
+ opus_int useDTX; /* Flag to enable DTX */
+ opus_int inDTX; /* Flag to signal DTX period */
+ opus_int noSpeechCounter; /* Counts consecutive nonactive frames, used by DTX */
+
+ /* Inband Low Bitrate Redundancy (LBRR) data */
+ opus_int useInBandFEC; /* Saves the API setting for query */
+ opus_int LBRR_enabled; /* Depends on useInBandFEC, bitrate and packet loss rate */
+ opus_int LBRR_GainIncreases; /* Gains increment for coding LBRR frames */
+ SideInfoIndices indices_LBRR[ MAX_FRAMES_PER_PACKET ];
+ opus_int8 pulses_LBRR[ MAX_FRAMES_PER_PACKET ][ MAX_FRAME_LENGTH ];
+} silk_encoder_state;
+
+
+/* Struct for Packet Loss Concealment */
+typedef struct {
+ opus_int32 pitchL_Q8; /* Pitch lag to use for voiced concealment */
+ opus_int16 LTPCoef_Q14[ LTP_ORDER ]; /* LTP coefficients to use for voiced concealment */
+ opus_int16 prevLPC_Q12[ MAX_LPC_ORDER ];
+ opus_int last_frame_lost; /* Was previous frame lost */
+ opus_int32 rand_seed; /* Seed for unvoiced signal generation */
+ opus_int16 randScale_Q14; /* Scaling of unvoiced random signal */
+ opus_int32 conc_energy;
+ opus_int conc_energy_shift;
+ opus_int16 prevLTP_scale_Q14;
+ opus_int32 prevGain_Q16[ 2 ];
+ opus_int fs_kHz;
+ opus_int nb_subfr;
+ opus_int subfr_length;
+} silk_PLC_struct;
+
+/* Struct for CNG */
+typedef struct {
+ opus_int32 CNG_exc_buf_Q14[ MAX_FRAME_LENGTH ];
+ opus_int16 CNG_smth_NLSF_Q15[ MAX_LPC_ORDER ];
+ opus_int32 CNG_synth_state[ MAX_LPC_ORDER ];
+ opus_int32 CNG_smth_Gain_Q16;
+ opus_int32 rand_seed;
+ opus_int fs_kHz;
+} silk_CNG_struct;
+
+/********************************/
+/* Decoder state */
+/********************************/
+typedef struct {
+ opus_int32 prev_gain_Q16;
+ opus_int32 exc_Q14[ MAX_FRAME_LENGTH ];
+ opus_int32 sLPC_Q14_buf[ MAX_LPC_ORDER ];
+ opus_int16 outBuf[ MAX_FRAME_LENGTH + 2 * MAX_SUB_FRAME_LENGTH ]; /* Buffer for output signal */
+ opus_int lagPrev; /* Previous lag */
+ opus_int8 LastGainIndex; /* Previous gain index */
+ opus_int fs_kHz; /* Sampling frequency in kHz */
+ opus_int32 fs_API_hz; /* API sample frequency (Hz) */
+ opus_int nb_subfr; /* Number of 5 ms subframes in a frame */
+ opus_int frame_length; /* Frame length (samples) */
+ opus_int subfr_length; /* Subframe length (samples) */
+ opus_int ltp_mem_length; /* Length of LTP memory */
+ opus_int 
LPC_order; /* LPC order */ + opus_int16 prevNLSF_Q15[ MAX_LPC_ORDER ]; /* Used to interpolate LSFs */ + opus_int first_frame_after_reset; /* Flag for deactivating NLSF interpolation */ + const opus_uint8 *pitch_lag_low_bits_iCDF; /* Pointer to iCDF table for low bits of pitch lag index */ + const opus_uint8 *pitch_contour_iCDF; /* Pointer to iCDF table for pitch contour index */ + + /* For buffering payload in case of more frames per packet */ + opus_int nFramesDecoded; + opus_int nFramesPerPacket; + + /* Specifically for entropy coding */ + opus_int ec_prevSignalType; + opus_int16 ec_prevLagIndex; + + opus_int VAD_flags[ MAX_FRAMES_PER_PACKET ]; + opus_int LBRR_flag; + opus_int LBRR_flags[ MAX_FRAMES_PER_PACKET ]; + + silk_resampler_state_struct resampler_state; + + const silk_NLSF_CB_struct *psNLSF_CB; /* Pointer to NLSF codebook */ + + /* Quantization indices */ + SideInfoIndices indices; + + /* CNG state */ + silk_CNG_struct sCNG; + + /* Stuff used for PLC */ + opus_int lossCnt; + opus_int prevSignalType; + + silk_PLC_struct sPLC; + +} silk_decoder_state; + +/************************/ +/* Decoder control */ +/************************/ +typedef struct { + /* Prediction and coding parameters */ + opus_int pitchL[ MAX_NB_SUBFR ]; + opus_int32 Gains_Q16[ MAX_NB_SUBFR ]; + /* Holds interpolated and final coefficients, 4-byte aligned */ + silk_DWORD_ALIGN opus_int16 PredCoef_Q12[ 2 ][ MAX_LPC_ORDER ]; + opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ]; + opus_int LTP_scale_Q14; +} silk_decoder_control; + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/silk/sum_sqr_shift.c b/thirdparty/opus/silk/sum_sqr_shift.c new file mode 100644 index 000000000000..129df191d8de --- /dev/null +++ b/thirdparty/opus/silk/sum_sqr_shift.c @@ -0,0 +1,86 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
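+***********************************************************************/
+
+/* Illustrative usage sketch, not from the Opus sources: silk_sum_sqr_shift()
+   below returns the vector energy pre-scaled to fit an opus_int32 with
+   headroom, together with the right shift it applied, so the unscaled
+   energy is approximately ((opus_int64)energy << shift). Kept disabled. */
+#if 0
+static opus_int64 example_energy( const opus_int16 *x, opus_int len )
+{
+    opus_int32 energy;
+    opus_int   shift;
+    silk_sum_sqr_shift( &energy, &shift, x, len );
+    return (opus_int64)energy << shift; /* Undo the scaling */
+}
+#endif
+
+/***********************************************************************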
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "SigProc_FIX.h" + +/* Compute number of bits to right shift the sum of squares of a vector */ +/* of int16s to make it fit in an int32 */ +void silk_sum_sqr_shift( + opus_int32 *energy, /* O Energy of x, after shifting to the right */ + opus_int *shift, /* O Number of bits right shift applied to energy */ + const opus_int16 *x, /* I Input vector */ + opus_int len /* I Length of input vector */ +) +{ + opus_int i, shft; + opus_int32 nrg_tmp, nrg; + + nrg = 0; + shft = 0; + len--; + for( i = 0; i < len; i += 2 ) { + nrg = silk_SMLABB_ovflw( nrg, x[ i ], x[ i ] ); + nrg = silk_SMLABB_ovflw( nrg, x[ i + 1 ], x[ i + 1 ] ); + if( nrg < 0 ) { + /* Scale down */ + nrg = (opus_int32)silk_RSHIFT_uint( (opus_uint32)nrg, 2 ); + shft = 2; + i+=2; + break; + } + } + for( ; i < len; i += 2 ) { + nrg_tmp = silk_SMULBB( x[ i ], x[ i ] ); + nrg_tmp = silk_SMLABB_ovflw( nrg_tmp, x[ i + 1 ], x[ i + 1 ] ); + nrg = (opus_int32)silk_ADD_RSHIFT_uint( nrg, (opus_uint32)nrg_tmp, shft ); + if( nrg < 0 ) { + /* Scale down */ + nrg = (opus_int32)silk_RSHIFT_uint( (opus_uint32)nrg, 2 ); + shft += 2; + } + } + if( i == len ) { + /* One sample left to process */ + nrg_tmp = silk_SMULBB( x[ i ], x[ i ] ); + nrg = (opus_int32)silk_ADD_RSHIFT_uint( nrg, nrg_tmp, shft ); + } + + /* Make sure to have at least one extra leading zero (two leading zeros in total) */ + if( nrg & 0xC0000000 ) { + nrg = silk_RSHIFT_uint( (opus_uint32)nrg, 2 ); + shft += 2; + } + + /* Output arguments */ + *shift = shft; + *energy = nrg; +} + diff --git a/thirdparty/opus/silk/table_LSF_cos.c b/thirdparty/opus/silk/table_LSF_cos.c new file mode 100644 index 000000000000..ec9dc6392791 --- /dev/null +++ b/thirdparty/opus/silk/table_LSF_cos.c @@ -0,0 +1,70 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
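+***********************************************************************/
+
+/* Illustrative note, not from the Opus sources: the 129 entries below appear
+   to follow 2 * round( 4096 * cos( PI * i / 128 ) ) for i = 0..128, which is
+   why every value is even (see the "(even)" remark). A disabled sketch that
+   would regenerate them under that assumption: */
+#if 0
+#include <math.h>
+static void regen_lsf_cos_tab( opus_int16 tab[ LSF_COS_TAB_SZ_FIX + 1 ] )
+{
+    int i;
+    for( i = 0; i <= LSF_COS_TAB_SZ_FIX; i++ ) {
+        /* Round to the nearest even integer at the table's scale */
+        tab[ i ] = (opus_int16)( 2 * (opus_int32)floor( 4096.0 * cos( 3.14159265358979 * i / 128.0 ) + 0.5 ) );
+    }
+}
+#endif
+
+/***********************************************************************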
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "tables.h" + +/* Cosine approximation table for LSF conversion */ +/* Q12 values (even) */ +const opus_int16 silk_LSFCosTab_FIX_Q12[ LSF_COS_TAB_SZ_FIX + 1 ] = { + 8192, 8190, 8182, 8170, + 8152, 8130, 8104, 8072, + 8034, 7994, 7946, 7896, + 7840, 7778, 7714, 7644, + 7568, 7490, 7406, 7318, + 7226, 7128, 7026, 6922, + 6812, 6698, 6580, 6458, + 6332, 6204, 6070, 5934, + 5792, 5648, 5502, 5352, + 5198, 5040, 4880, 4718, + 4552, 4382, 4212, 4038, + 3862, 3684, 3502, 3320, + 3136, 2948, 2760, 2570, + 2378, 2186, 1990, 1794, + 1598, 1400, 1202, 1002, + 802, 602, 402, 202, + 0, -202, -402, -602, + -802, -1002, -1202, -1400, + -1598, -1794, -1990, -2186, + -2378, -2570, -2760, -2948, + -3136, -3320, -3502, -3684, + -3862, -4038, -4212, -4382, + -4552, -4718, -4880, -5040, + -5198, -5352, -5502, -5648, + -5792, -5934, -6070, -6204, + -6332, -6458, -6580, -6698, + -6812, -6922, -7026, -7128, + -7226, -7318, -7406, -7490, + -7568, -7644, -7714, -7778, + -7840, -7896, -7946, -7994, + -8034, -8072, -8104, -8130, + -8152, -8170, -8182, -8190, + -8192 +}; diff --git a/thirdparty/opus/silk/tables.h b/thirdparty/opus/silk/tables.h new file mode 100644 index 000000000000..7fea6fda39da --- /dev/null +++ b/thirdparty/opus/silk/tables.h @@ -0,0 +1,122 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
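+***********************************************************************/
+
+/* Illustrative note, not from the Opus sources: the iCDF ("inverse" CDF)
+   tables declared below store, in Q8, 256 minus the cumulative probability
+   of symbols 0..k, with an implicit 256 before entry 0 and a terminating 0.
+   The probability of symbol k is thus iCDF[ k-1 ] - iCDF[ k ]. Worked
+   example: silk_uniform4_iCDF = { 192, 128, 64, 0 } gives
+   256-192 = 192-128 = 128-64 = 64-0 = 64, i.e. four symbols with
+   probability 64/256 = 1/4 each. */
+
+/***********************************************************************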
+***********************************************************************/ + +#ifndef SILK_TABLES_H +#define SILK_TABLES_H + +#include "define.h" +#include "structs.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* Entropy coding tables (with size in bytes indicated) */ +extern const opus_uint8 silk_gain_iCDF[ 3 ][ N_LEVELS_QGAIN / 8 ]; /* 24 */ +extern const opus_uint8 silk_delta_gain_iCDF[ MAX_DELTA_GAIN_QUANT - MIN_DELTA_GAIN_QUANT + 1 ]; /* 41 */ + +extern const opus_uint8 silk_pitch_lag_iCDF[ 2 * ( PITCH_EST_MAX_LAG_MS - PITCH_EST_MIN_LAG_MS ) ];/* 32 */ +extern const opus_uint8 silk_pitch_delta_iCDF[ 21 ]; /* 21 */ +extern const opus_uint8 silk_pitch_contour_iCDF[ 34 ]; /* 34 */ +extern const opus_uint8 silk_pitch_contour_NB_iCDF[ 11 ]; /* 11 */ +extern const opus_uint8 silk_pitch_contour_10_ms_iCDF[ 12 ]; /* 12 */ +extern const opus_uint8 silk_pitch_contour_10_ms_NB_iCDF[ 3 ]; /* 3 */ + +extern const opus_uint8 silk_pulses_per_block_iCDF[ N_RATE_LEVELS ][ SILK_MAX_PULSES + 2 ]; /* 180 */ +extern const opus_uint8 silk_pulses_per_block_BITS_Q5[ N_RATE_LEVELS - 1 ][ SILK_MAX_PULSES + 2 ]; /* 162 */ + +extern const opus_uint8 silk_rate_levels_iCDF[ 2 ][ N_RATE_LEVELS - 1 ]; /* 18 */ +extern const opus_uint8 silk_rate_levels_BITS_Q5[ 2 ][ N_RATE_LEVELS - 1 ]; /* 18 */ + +extern const opus_uint8 silk_max_pulses_table[ 4 ]; /* 4 */ + +extern const opus_uint8 silk_shell_code_table0[ 152 ]; /* 152 */ +extern const opus_uint8 silk_shell_code_table1[ 152 ]; /* 152 */ +extern const opus_uint8 silk_shell_code_table2[ 152 ]; /* 152 */ +extern const opus_uint8 silk_shell_code_table3[ 152 ]; /* 152 */ +extern const opus_uint8 silk_shell_code_table_offsets[ SILK_MAX_PULSES + 1 ]; /* 17 */ + +extern const opus_uint8 silk_lsb_iCDF[ 2 ]; /* 2 */ + +extern const opus_uint8 silk_sign_iCDF[ 42 ]; /* 42 */ + +extern const opus_uint8 silk_uniform3_iCDF[ 3 ]; /* 3 */ +extern const opus_uint8 silk_uniform4_iCDF[ 4 ]; /* 4 */ +extern const opus_uint8 silk_uniform5_iCDF[ 5 ]; /* 5 */ +extern const opus_uint8 silk_uniform6_iCDF[ 6 ]; /* 6 */ +extern const opus_uint8 silk_uniform8_iCDF[ 8 ]; /* 8 */ + +extern const opus_uint8 silk_NLSF_EXT_iCDF[ 7 ]; /* 7 */ + +extern const opus_uint8 silk_LTP_per_index_iCDF[ 3 ]; /* 3 */ +extern const opus_uint8 * const silk_LTP_gain_iCDF_ptrs[ NB_LTP_CBKS ]; /* 3 */ +extern const opus_uint8 * const silk_LTP_gain_BITS_Q5_ptrs[ NB_LTP_CBKS ]; /* 3 */ +extern const opus_int16 silk_LTP_gain_middle_avg_RD_Q14; +extern const opus_int8 * const silk_LTP_vq_ptrs_Q7[ NB_LTP_CBKS ]; /* 168 */ +extern const opus_uint8 * const silk_LTP_vq_gain_ptrs_Q7[NB_LTP_CBKS]; + +extern const opus_int8 silk_LTP_vq_sizes[ NB_LTP_CBKS ]; /* 3 */ + +extern const opus_uint8 silk_LTPscale_iCDF[ 3 ]; /* 4 */ +extern const opus_int16 silk_LTPScales_table_Q14[ 3 ]; /* 6 */ + +extern const opus_uint8 silk_type_offset_VAD_iCDF[ 4 ]; /* 4 */ +extern const opus_uint8 silk_type_offset_no_VAD_iCDF[ 2 ]; /* 2 */ + +extern const opus_int16 silk_stereo_pred_quant_Q13[ STEREO_QUANT_TAB_SIZE ]; /* 32 */ +extern const opus_uint8 silk_stereo_pred_joint_iCDF[ 25 ]; /* 25 */ +extern const opus_uint8 silk_stereo_only_code_mid_iCDF[ 2 ]; /* 2 */ + +extern const opus_uint8 * const silk_LBRR_flags_iCDF_ptr[ 2 ]; /* 10 */ + +extern const opus_uint8 silk_NLSF_interpolation_factor_iCDF[ 5 ]; /* 5 */ + +extern const silk_NLSF_CB_struct silk_NLSF_CB_WB; /* 1040 */ +extern const silk_NLSF_CB_struct silk_NLSF_CB_NB_MB; /* 728 */ + +/* Piece-wise linear mapping from bitrate in kbps to coding quality in dB SNR */ +extern const 
opus_int32 silk_TargetRate_table_NB[ TARGET_RATE_TAB_SZ ]; /* 32 */ +extern const opus_int32 silk_TargetRate_table_MB[ TARGET_RATE_TAB_SZ ]; /* 32 */ +extern const opus_int32 silk_TargetRate_table_WB[ TARGET_RATE_TAB_SZ ]; /* 32 */ +extern const opus_int16 silk_SNR_table_Q1[ TARGET_RATE_TAB_SZ ]; /* 32 */ + +/* Quantization offsets */ +extern const opus_int16 silk_Quantization_Offsets_Q10[ 2 ][ 2 ]; /* 8 */ + +/* Interpolation points for filter coefficients used in the bandwidth transition smoother */ +extern const opus_int32 silk_Transition_LP_B_Q28[ TRANSITION_INT_NUM ][ TRANSITION_NB ]; /* 60 */ +extern const opus_int32 silk_Transition_LP_A_Q28[ TRANSITION_INT_NUM ][ TRANSITION_NA ]; /* 60 */ + +/* Rom table with cosine values */ +extern const opus_int16 silk_LSFCosTab_FIX_Q12[ LSF_COS_TAB_SZ_FIX + 1 ]; /* 258 */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/thirdparty/opus/silk/tables_LTP.c b/thirdparty/opus/silk/tables_LTP.c new file mode 100644 index 000000000000..0e6a0254d5db --- /dev/null +++ b/thirdparty/opus/silk/tables_LTP.c @@ -0,0 +1,296 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
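+***********************************************************************/
+
+/* Illustrative note, not from the Opus sources: the three codebooks below
+   hold 8, 16 and 32 five-tap LTP filters in Q7, so a tap value of 128 is a
+   gain of 1.0. Worked check of the "max(abs(freqz(taps)))" gain tables
+   further down: the first vector of silk_LTP_gain_vq_0 is { 4, 6, 24, 7, 5 };
+   all taps are non-negative, so the response peaks at DC with
+   |H(0)| = 4+6+24+7+5 = 46 in Q7, matching silk_LTP_gain_vq_0_gain[ 0 ] = 46. */
+
+/***********************************************************************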
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "tables.h" + +const opus_uint8 silk_LTP_per_index_iCDF[3] = { + 179, 99, 0 +}; + +static const opus_uint8 silk_LTP_gain_iCDF_0[8] = { + 71, 56, 43, 30, 21, 12, 6, 0 +}; + +static const opus_uint8 silk_LTP_gain_iCDF_1[16] = { + 199, 165, 144, 124, 109, 96, 84, 71, + 61, 51, 42, 32, 23, 15, 8, 0 +}; + +static const opus_uint8 silk_LTP_gain_iCDF_2[32] = { + 241, 225, 211, 199, 187, 175, 164, 153, + 142, 132, 123, 114, 105, 96, 88, 80, + 72, 64, 57, 50, 44, 38, 33, 29, + 24, 20, 16, 12, 9, 5, 2, 0 +}; + +const opus_int16 silk_LTP_gain_middle_avg_RD_Q14 = 12304; + +static const opus_uint8 silk_LTP_gain_BITS_Q5_0[8] = { + 15, 131, 138, 138, 155, 155, 173, 173 +}; + +static const opus_uint8 silk_LTP_gain_BITS_Q5_1[16] = { + 69, 93, 115, 118, 131, 138, 141, 138, + 150, 150, 155, 150, 155, 160, 166, 160 +}; + +static const opus_uint8 silk_LTP_gain_BITS_Q5_2[32] = { + 131, 128, 134, 141, 141, 141, 145, 145, + 145, 150, 155, 155, 155, 155, 160, 160, + 160, 160, 166, 166, 173, 173, 182, 192, + 182, 192, 192, 192, 205, 192, 205, 224 +}; + +const opus_uint8 * const silk_LTP_gain_iCDF_ptrs[NB_LTP_CBKS] = { + silk_LTP_gain_iCDF_0, + silk_LTP_gain_iCDF_1, + silk_LTP_gain_iCDF_2 +}; + +const opus_uint8 * const silk_LTP_gain_BITS_Q5_ptrs[NB_LTP_CBKS] = { + silk_LTP_gain_BITS_Q5_0, + silk_LTP_gain_BITS_Q5_1, + silk_LTP_gain_BITS_Q5_2 +}; + +static const opus_int8 silk_LTP_gain_vq_0[8][5] = +{ +{ + 4, 6, 24, 7, 5 +}, +{ + 0, 0, 2, 0, 0 +}, +{ + 12, 28, 41, 13, -4 +}, +{ + -9, 15, 42, 25, 14 +}, +{ + 1, -2, 62, 41, -9 +}, +{ + -10, 37, 65, -4, 3 +}, +{ + -6, 4, 66, 7, -8 +}, +{ + 16, 14, 38, -3, 33 +} +}; + +static const opus_int8 silk_LTP_gain_vq_1[16][5] = +{ +{ + 13, 22, 39, 23, 12 +}, +{ + -1, 36, 64, 27, -6 +}, +{ + -7, 10, 55, 43, 17 +}, +{ + 1, 1, 8, 1, 1 +}, +{ + 6, -11, 74, 53, -9 +}, +{ + -12, 55, 76, -12, 8 +}, +{ + -3, 3, 93, 27, -4 +}, +{ + 26, 39, 59, 3, -8 +}, +{ + 2, 0, 77, 11, 9 +}, +{ + -8, 22, 44, -6, 7 +}, +{ + 40, 9, 26, 3, 9 +}, +{ + -7, 20, 101, -7, 4 +}, +{ + 3, -8, 42, 26, 0 +}, +{ + -15, 33, 68, 2, 23 +}, +{ + -2, 55, 46, -2, 15 +}, +{ + 3, -1, 21, 16, 41 +} +}; + +static const opus_int8 silk_LTP_gain_vq_2[32][5] = +{ +{ + -6, 27, 61, 39, 5 +}, +{ + -11, 42, 88, 4, 1 +}, +{ + -2, 60, 65, 6, -4 +}, +{ + -1, -5, 73, 56, 1 +}, +{ + -9, 19, 94, 29, -9 +}, +{ + 0, 12, 99, 6, 4 +}, +{ + 8, -19, 102, 46, -13 +}, +{ + 3, 2, 13, 3, 2 +}, +{ + 9, -21, 84, 72, -18 +}, +{ + -11, 46, 104, -22, 8 +}, +{ + 18, 38, 48, 23, 0 +}, +{ + -16, 70, 83, -21, 11 +}, +{ + 5, -11, 117, 22, -8 +}, +{ + -6, 23, 117, -12, 3 +}, +{ + 3, -8, 95, 28, 4 +}, +{ + -10, 15, 77, 60, -15 +}, +{ + -1, 4, 124, 2, -4 +}, +{ + 3, 38, 84, 24, -25 +}, +{ + 2, 13, 42, 13, 31 +}, +{ + 21, -4, 56, 46, -1 +}, +{ + -1, 35, 79, -13, 19 +}, +{ + -7, 65, 88, -9, -14 +}, +{ + 20, 4, 81, 49, -29 +}, +{ + 20, 0, 75, 3, -17 +}, +{ + 5, -9, 44, 92, -8 +}, +{ + 1, -3, 22, 69, 31 +}, +{ + -6, 95, 41, -12, 5 +}, +{ + 39, 67, 16, -4, 1 +}, +{ + 0, -6, 120, 55, -36 +}, +{ + -13, 44, 122, 4, -24 +}, +{ + 81, 5, 11, 3, 7 +}, +{ + 2, 0, 9, 10, 88 +} +}; + +const opus_int8 * const silk_LTP_vq_ptrs_Q7[NB_LTP_CBKS] = { + (opus_int8 *)&silk_LTP_gain_vq_0[0][0], + (opus_int8 *)&silk_LTP_gain_vq_1[0][0], + (opus_int8 *)&silk_LTP_gain_vq_2[0][0] +}; + +/* Maximum frequency-dependent response of the pitch taps above, + computed as max(abs(freqz(taps))) */ +static const opus_uint8 silk_LTP_gain_vq_0_gain[8] = { + 46, 2, 
90, 87, 93, 91, 82, 98 +}; + +static const opus_uint8 silk_LTP_gain_vq_1_gain[16] = { + 109, 120, 118, 12, 113, 115, 117, 119, + 99, 59, 87, 111, 63, 111, 112, 80 +}; + +static const opus_uint8 silk_LTP_gain_vq_2_gain[32] = { + 126, 124, 125, 124, 129, 121, 126, 23, + 132, 127, 127, 127, 126, 127, 122, 133, + 130, 134, 101, 118, 119, 145, 126, 86, + 124, 120, 123, 119, 170, 173, 107, 109 +}; + +const opus_uint8 * const silk_LTP_vq_gain_ptrs_Q7[NB_LTP_CBKS] = { + &silk_LTP_gain_vq_0_gain[0], + &silk_LTP_gain_vq_1_gain[0], + &silk_LTP_gain_vq_2_gain[0] +}; + +const opus_int8 silk_LTP_vq_sizes[NB_LTP_CBKS] = { + 8, 16, 32 +}; diff --git a/thirdparty/opus/silk/tables_NLSF_CB_NB_MB.c b/thirdparty/opus/silk/tables_NLSF_CB_NB_MB.c new file mode 100644 index 000000000000..74d7f91f16bf --- /dev/null +++ b/thirdparty/opus/silk/tables_NLSF_CB_NB_MB.c @@ -0,0 +1,159 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
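+***********************************************************************/
+
+/* Illustrative note, not from the Opus sources: this file holds the
+   narrowband/mediumband NLSF codebook, a two-stage VQ. Stage 1 below has
+   32 vectors of order 10, hence 320 Q8 entries; stage 2 codes a residual
+   per coefficient using the CB2 tables. The quantization step in the
+   struct at the bottom is Q16: 11797 / 65536 ~= 0.18, as its comment says. */
+
+/***********************************************************************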
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "tables.h" + +static const opus_uint8 silk_NLSF_CB1_NB_MB_Q8[ 320 ] = { + 12, 35, 60, 83, 108, 132, 157, 180, + 206, 228, 15, 32, 55, 77, 101, 125, + 151, 175, 201, 225, 19, 42, 66, 89, + 114, 137, 162, 184, 209, 230, 12, 25, + 50, 72, 97, 120, 147, 172, 200, 223, + 26, 44, 69, 90, 114, 135, 159, 180, + 205, 225, 13, 22, 53, 80, 106, 130, + 156, 180, 205, 228, 15, 25, 44, 64, + 90, 115, 142, 168, 196, 222, 19, 24, + 62, 82, 100, 120, 145, 168, 190, 214, + 22, 31, 50, 79, 103, 120, 151, 170, + 203, 227, 21, 29, 45, 65, 106, 124, + 150, 171, 196, 224, 30, 49, 75, 97, + 121, 142, 165, 186, 209, 229, 19, 25, + 52, 70, 93, 116, 143, 166, 192, 219, + 26, 34, 62, 75, 97, 118, 145, 167, + 194, 217, 25, 33, 56, 70, 91, 113, + 143, 165, 196, 223, 21, 34, 51, 72, + 97, 117, 145, 171, 196, 222, 20, 29, + 50, 67, 90, 117, 144, 168, 197, 221, + 22, 31, 48, 66, 95, 117, 146, 168, + 196, 222, 24, 33, 51, 77, 116, 134, + 158, 180, 200, 224, 21, 28, 70, 87, + 106, 124, 149, 170, 194, 217, 26, 33, + 53, 64, 83, 117, 152, 173, 204, 225, + 27, 34, 65, 95, 108, 129, 155, 174, + 210, 225, 20, 26, 72, 99, 113, 131, + 154, 176, 200, 219, 34, 43, 61, 78, + 93, 114, 155, 177, 205, 229, 23, 29, + 54, 97, 124, 138, 163, 179, 209, 229, + 30, 38, 56, 89, 118, 129, 158, 178, + 200, 231, 21, 29, 49, 63, 85, 111, + 142, 163, 193, 222, 27, 48, 77, 103, + 133, 158, 179, 196, 215, 232, 29, 47, + 74, 99, 124, 151, 176, 198, 220, 237, + 33, 42, 61, 76, 93, 121, 155, 174, + 207, 225, 29, 53, 87, 112, 136, 154, + 170, 188, 208, 227, 24, 30, 52, 84, + 131, 150, 166, 186, 203, 229, 37, 48, + 64, 84, 104, 118, 156, 177, 201, 230 +}; + +static const opus_uint8 silk_NLSF_CB1_iCDF_NB_MB[ 64 ] = { + 212, 178, 148, 129, 108, 96, 85, 82, + 79, 77, 61, 59, 57, 56, 51, 49, + 48, 45, 42, 41, 40, 38, 36, 34, + 31, 30, 21, 12, 10, 3, 1, 0, + 255, 245, 244, 236, 233, 225, 217, 203, + 190, 176, 175, 161, 149, 136, 125, 114, + 102, 91, 81, 71, 60, 52, 43, 35, + 28, 20, 19, 18, 12, 11, 5, 0 +}; + +static const opus_uint8 silk_NLSF_CB2_SELECT_NB_MB[ 160 ] = { + 16, 0, 0, 0, 0, 99, 66, 36, + 36, 34, 36, 34, 34, 34, 34, 83, + 69, 36, 52, 34, 116, 102, 70, 68, + 68, 176, 102, 68, 68, 34, 65, 85, + 68, 84, 36, 116, 141, 152, 139, 170, + 132, 187, 184, 216, 137, 132, 249, 168, + 185, 139, 104, 102, 100, 68, 68, 178, + 218, 185, 185, 170, 244, 216, 187, 187, + 170, 244, 187, 187, 219, 138, 103, 155, + 184, 185, 137, 116, 183, 155, 152, 136, + 132, 217, 184, 184, 170, 164, 217, 171, + 155, 139, 244, 169, 184, 185, 170, 164, + 216, 223, 218, 138, 214, 143, 188, 218, + 168, 244, 141, 136, 155, 170, 168, 138, + 220, 219, 139, 164, 219, 202, 216, 137, + 168, 186, 246, 185, 139, 116, 185, 219, + 185, 138, 100, 100, 134, 100, 102, 34, + 68, 68, 100, 68, 168, 203, 221, 218, + 168, 167, 154, 136, 104, 70, 164, 246, + 171, 137, 139, 137, 155, 218, 219, 139 +}; + +static const opus_uint8 silk_NLSF_CB2_iCDF_NB_MB[ 72 ] = { + 255, 254, 253, 238, 14, 3, 2, 1, + 0, 255, 254, 252, 218, 35, 3, 2, + 1, 0, 255, 254, 250, 208, 59, 4, + 2, 1, 0, 255, 254, 246, 194, 71, + 10, 2, 1, 0, 255, 252, 236, 183, + 82, 8, 2, 1, 0, 255, 252, 235, + 180, 90, 17, 2, 1, 0, 255, 248, + 224, 171, 97, 30, 4, 1, 0, 255, + 254, 236, 173, 95, 37, 7, 1, 0 +}; + +static const opus_uint8 silk_NLSF_CB2_BITS_NB_MB_Q5[ 72 ] = { + 255, 255, 255, 131, 6, 145, 255, 255, + 255, 255, 255, 236, 93, 15, 96, 255, + 255, 255, 255, 255, 194, 83, 25, 71, + 221, 255, 255, 
255, 255, 162, 73, 34, + 66, 162, 255, 255, 255, 210, 126, 73, + 43, 57, 173, 255, 255, 255, 201, 125, + 71, 48, 58, 130, 255, 255, 255, 166, + 110, 73, 57, 62, 104, 210, 255, 255, + 251, 123, 65, 55, 68, 100, 171, 255 +}; + +static const opus_uint8 silk_NLSF_PRED_NB_MB_Q8[ 18 ] = { + 179, 138, 140, 148, 151, 149, 153, 151, + 163, 116, 67, 82, 59, 92, 72, 100, + 89, 92 +}; + +static const opus_int16 silk_NLSF_DELTA_MIN_NB_MB_Q15[ 11 ] = { + 250, 3, 6, 3, 3, 3, 4, 3, + 3, 3, 461 +}; + +const silk_NLSF_CB_struct silk_NLSF_CB_NB_MB = +{ + 32, + 10, + 11797, // Precomputed value of SILK_FIX_CONST(0.18, 16) + 364, // Precomputed value of SILK_FIX_CONST(1.0 / 0.18, 6) + silk_NLSF_CB1_NB_MB_Q8, + silk_NLSF_CB1_iCDF_NB_MB, + silk_NLSF_PRED_NB_MB_Q8, + silk_NLSF_CB2_SELECT_NB_MB, + silk_NLSF_CB2_iCDF_NB_MB, + silk_NLSF_CB2_BITS_NB_MB_Q5, + silk_NLSF_DELTA_MIN_NB_MB_Q15, +}; diff --git a/thirdparty/opus/silk/tables_NLSF_CB_WB.c b/thirdparty/opus/silk/tables_NLSF_CB_WB.c new file mode 100644 index 000000000000..87f2906e3374 --- /dev/null +++ b/thirdparty/opus/silk/tables_NLSF_CB_WB.c @@ -0,0 +1,198 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "tables.h" + +static const opus_uint8 silk_NLSF_CB1_WB_Q8[ 512 ] = { + 7, 23, 38, 54, 69, 85, 100, 116, + 131, 147, 162, 178, 193, 208, 223, 239, + 13, 25, 41, 55, 69, 83, 98, 112, + 127, 142, 157, 171, 187, 203, 220, 236, + 15, 21, 34, 51, 61, 78, 92, 106, + 126, 136, 152, 167, 185, 205, 225, 240, + 10, 21, 36, 50, 63, 79, 95, 110, + 126, 141, 157, 173, 189, 205, 221, 237, + 17, 20, 37, 51, 59, 78, 89, 107, + 123, 134, 150, 164, 184, 205, 224, 240, + 10, 15, 32, 51, 67, 81, 96, 112, + 129, 142, 158, 173, 189, 204, 220, 236, + 8, 21, 37, 51, 65, 79, 98, 113, + 126, 138, 155, 168, 179, 192, 209, 218, + 12, 15, 34, 55, 63, 78, 87, 108, + 118, 131, 148, 167, 185, 203, 219, 236, + 16, 19, 32, 36, 56, 79, 91, 108, + 118, 136, 154, 171, 186, 204, 220, 237, + 11, 28, 43, 58, 74, 89, 105, 120, + 135, 150, 165, 180, 196, 211, 226, 241, + 6, 16, 33, 46, 60, 75, 92, 107, + 123, 137, 156, 169, 185, 199, 214, 225, + 11, 19, 30, 44, 57, 74, 89, 105, + 121, 135, 152, 169, 186, 202, 218, 234, + 12, 19, 29, 46, 57, 71, 88, 100, + 120, 132, 148, 165, 182, 199, 216, 233, + 17, 23, 35, 46, 56, 77, 92, 106, + 123, 134, 152, 167, 185, 204, 222, 237, + 14, 17, 45, 53, 63, 75, 89, 107, + 115, 132, 151, 171, 188, 206, 221, 240, + 9, 16, 29, 40, 56, 71, 88, 103, + 119, 137, 154, 171, 189, 205, 222, 237, + 16, 19, 36, 48, 57, 76, 87, 105, + 118, 132, 150, 167, 185, 202, 218, 236, + 12, 17, 29, 54, 71, 81, 94, 104, + 126, 136, 149, 164, 182, 201, 221, 237, + 15, 28, 47, 62, 79, 97, 115, 129, + 142, 155, 168, 180, 194, 208, 223, 238, + 8, 14, 30, 45, 62, 78, 94, 111, + 127, 143, 159, 175, 192, 207, 223, 239, + 17, 30, 49, 62, 79, 92, 107, 119, + 132, 145, 160, 174, 190, 204, 220, 235, + 14, 19, 36, 45, 61, 76, 91, 108, + 121, 138, 154, 172, 189, 205, 222, 238, + 12, 18, 31, 45, 60, 76, 91, 107, + 123, 138, 154, 171, 187, 204, 221, 236, + 13, 17, 31, 43, 53, 70, 83, 103, + 114, 131, 149, 167, 185, 203, 220, 237, + 17, 22, 35, 42, 58, 78, 93, 110, + 125, 139, 155, 170, 188, 206, 224, 240, + 8, 15, 34, 50, 67, 83, 99, 115, + 131, 146, 162, 178, 193, 209, 224, 239, + 13, 16, 41, 66, 73, 86, 95, 111, + 128, 137, 150, 163, 183, 206, 225, 241, + 17, 25, 37, 52, 63, 75, 92, 102, + 119, 132, 144, 160, 175, 191, 212, 231, + 19, 31, 49, 65, 83, 100, 117, 133, + 147, 161, 174, 187, 200, 213, 227, 242, + 18, 31, 52, 68, 88, 103, 117, 126, + 138, 149, 163, 177, 192, 207, 223, 239, + 16, 29, 47, 61, 76, 90, 106, 119, + 133, 147, 161, 176, 193, 209, 224, 240, + 15, 21, 35, 50, 61, 73, 86, 97, + 110, 119, 129, 141, 175, 198, 218, 237 +}; + +static const opus_uint8 silk_NLSF_CB1_iCDF_WB[ 64 ] = { + 225, 204, 201, 184, 183, 175, 158, 154, + 153, 135, 119, 115, 113, 110, 109, 99, + 98, 95, 79, 68, 52, 50, 48, 45, + 43, 32, 31, 27, 18, 10, 3, 0, + 255, 251, 235, 230, 212, 201, 196, 182, + 167, 166, 163, 151, 138, 124, 110, 104, + 90, 78, 76, 70, 69, 57, 45, 34, + 24, 21, 11, 6, 5, 4, 3, 0 +}; + +static const opus_uint8 silk_NLSF_CB2_SELECT_WB[ 256 ] = { + 0, 0, 0, 0, 0, 0, 0, 1, + 100, 102, 102, 68, 68, 36, 34, 96, + 164, 107, 158, 185, 180, 185, 139, 102, + 64, 66, 36, 34, 34, 0, 1, 32, + 208, 139, 141, 191, 152, 185, 155, 104, + 96, 171, 104, 166, 102, 102, 102, 132, + 1, 0, 0, 0, 0, 16, 16, 0, + 80, 109, 78, 107, 185, 139, 103, 101, + 208, 212, 141, 139, 173, 153, 123, 103, + 36, 0, 0, 0, 0, 0, 0, 1, + 48, 0, 0, 0, 0, 0, 0, 32, + 68, 135, 123, 119, 119, 103, 69, 98, + 68, 103, 120, 118, 118, 102, 
71, 98, + 134, 136, 157, 184, 182, 153, 139, 134, + 208, 168, 248, 75, 189, 143, 121, 107, + 32, 49, 34, 34, 34, 0, 17, 2, + 210, 235, 139, 123, 185, 137, 105, 134, + 98, 135, 104, 182, 100, 183, 171, 134, + 100, 70, 68, 70, 66, 66, 34, 131, + 64, 166, 102, 68, 36, 2, 1, 0, + 134, 166, 102, 68, 34, 34, 66, 132, + 212, 246, 158, 139, 107, 107, 87, 102, + 100, 219, 125, 122, 137, 118, 103, 132, + 114, 135, 137, 105, 171, 106, 50, 34, + 164, 214, 141, 143, 185, 151, 121, 103, + 192, 34, 0, 0, 0, 0, 0, 1, + 208, 109, 74, 187, 134, 249, 159, 137, + 102, 110, 154, 118, 87, 101, 119, 101, + 0, 2, 0, 36, 36, 66, 68, 35, + 96, 164, 102, 100, 36, 0, 2, 33, + 167, 138, 174, 102, 100, 84, 2, 2, + 100, 107, 120, 119, 36, 197, 24, 0 +}; + +static const opus_uint8 silk_NLSF_CB2_iCDF_WB[ 72 ] = { + 255, 254, 253, 244, 12, 3, 2, 1, + 0, 255, 254, 252, 224, 38, 3, 2, + 1, 0, 255, 254, 251, 209, 57, 4, + 2, 1, 0, 255, 254, 244, 195, 69, + 4, 2, 1, 0, 255, 251, 232, 184, + 84, 7, 2, 1, 0, 255, 254, 240, + 186, 86, 14, 2, 1, 0, 255, 254, + 239, 178, 91, 30, 5, 1, 0, 255, + 248, 227, 177, 100, 19, 2, 1, 0 +}; + +static const opus_uint8 silk_NLSF_CB2_BITS_WB_Q5[ 72 ] = { + 255, 255, 255, 156, 4, 154, 255, 255, + 255, 255, 255, 227, 102, 15, 92, 255, + 255, 255, 255, 255, 213, 83, 24, 72, + 236, 255, 255, 255, 255, 150, 76, 33, + 63, 214, 255, 255, 255, 190, 121, 77, + 43, 55, 185, 255, 255, 255, 245, 137, + 71, 43, 59, 139, 255, 255, 255, 255, + 131, 66, 50, 66, 107, 194, 255, 255, + 166, 116, 76, 55, 53, 125, 255, 255 +}; + +static const opus_uint8 silk_NLSF_PRED_WB_Q8[ 30 ] = { + 175, 148, 160, 176, 178, 173, 174, 164, + 177, 174, 196, 182, 198, 192, 182, 68, + 62, 66, 60, 72, 117, 85, 90, 118, + 136, 151, 142, 160, 142, 155 +}; + +static const opus_int16 silk_NLSF_DELTA_MIN_WB_Q15[ 17 ] = { + 100, 3, 40, 3, 3, 3, 5, 14, + 14, 10, 11, 3, 8, 9, 7, 3, + 347 +}; + +const silk_NLSF_CB_struct silk_NLSF_CB_WB = +{ + 32, + 16, + 9830, // Precomputed value of SILK_FIX_CONST(0.15, 16) + 427, // Precomputed value of SILK_FIX_CONST(1.0 / 0.15, 6) + silk_NLSF_CB1_WB_Q8, + silk_NLSF_CB1_iCDF_WB, + silk_NLSF_PRED_WB_Q8, + silk_NLSF_CB2_SELECT_WB, + silk_NLSF_CB2_iCDF_WB, + silk_NLSF_CB2_BITS_WB_Q5, + silk_NLSF_DELTA_MIN_WB_Q15, +}; + diff --git a/thirdparty/opus/silk/tables_gain.c b/thirdparty/opus/silk/tables_gain.c new file mode 100644 index 000000000000..37e41d890c33 --- /dev/null +++ b/thirdparty/opus/silk/tables_gain.c @@ -0,0 +1,63 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "tables.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +const opus_uint8 silk_gain_iCDF[ 3 ][ N_LEVELS_QGAIN / 8 ] = +{ +{ + 224, 112, 44, 15, 3, 2, 1, 0 +}, +{ + 254, 237, 192, 132, 70, 23, 4, 0 +}, +{ + 255, 252, 226, 155, 61, 11, 2, 0 +} +}; + +const opus_uint8 silk_delta_gain_iCDF[ MAX_DELTA_GAIN_QUANT - MIN_DELTA_GAIN_QUANT + 1 ] = { + 250, 245, 234, 203, 71, 50, 42, 38, + 35, 33, 31, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1, + 0 +}; + +#ifdef __cplusplus +} +#endif diff --git a/thirdparty/opus/silk/tables_other.c b/thirdparty/opus/silk/tables_other.c new file mode 100644 index 000000000000..398686bf26b9 --- /dev/null +++ b/thirdparty/opus/silk/tables_other.c @@ -0,0 +1,138 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
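+***********************************************************************/
+
+/* Disabled, hypothetical sketch (not from the Opus sources) of how the
+   piece-wise linear rate-to-SNR mapping below can be read: locate the
+   bracketing entries of a silk_TargetRate_table_* array and interpolate
+   silk_SNR_table_Q1 (Q1 dB) linearly, here scaled to Q7: */
+#if 0
+static opus_int32 rate_to_SNR_dB_Q7( const opus_int32 *rate_tab, opus_int32 rate_bps )
+{
+    opus_int k = 1;
+    while( k < TARGET_RATE_TAB_SZ - 1 && rate_bps >= rate_tab[ k ] ) {
+        k++;
+    }
+    /* Linear interpolation within segment [ k-1, k ], Q1 dB scaled to Q7 */
+    return silk_SNR_table_Q1[ k - 1 ] * 64
+         + ( silk_SNR_table_Q1[ k ] - silk_SNR_table_Q1[ k - 1 ] ) * 64
+           * ( rate_bps - rate_tab[ k - 1 ] ) / ( rate_tab[ k ] - rate_tab[ k - 1 ] );
+}
+#endif
+
+/***********************************************************************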
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "structs.h" +#include "define.h" +#include "tables.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* Piece-wise linear mapping from bitrate in kbps to coding quality in dB SNR */ +const opus_int32 silk_TargetRate_table_NB[ TARGET_RATE_TAB_SZ ] = { + 0, 8000, 9400, 11500, 13500, 17500, 25000, MAX_TARGET_RATE_BPS +}; +const opus_int32 silk_TargetRate_table_MB[ TARGET_RATE_TAB_SZ ] = { + 0, 9000, 12000, 14500, 18500, 24500, 35500, MAX_TARGET_RATE_BPS +}; +const opus_int32 silk_TargetRate_table_WB[ TARGET_RATE_TAB_SZ ] = { + 0, 10500, 14000, 17000, 21500, 28500, 42000, MAX_TARGET_RATE_BPS +}; +const opus_int16 silk_SNR_table_Q1[ TARGET_RATE_TAB_SZ ] = { + 18, 29, 38, 40, 46, 52, 62, 84 +}; + +/* Tables for stereo predictor coding */ +const opus_int16 silk_stereo_pred_quant_Q13[ STEREO_QUANT_TAB_SIZE ] = { + -13732, -10050, -8266, -7526, -6500, -5000, -2950, -820, + 820, 2950, 5000, 6500, 7526, 8266, 10050, 13732 +}; +const opus_uint8 silk_stereo_pred_joint_iCDF[ 25 ] = { + 249, 247, 246, 245, 244, + 234, 210, 202, 201, 200, + 197, 174, 82, 59, 56, + 55, 54, 46, 22, 12, + 11, 10, 9, 7, 0 +}; +const opus_uint8 silk_stereo_only_code_mid_iCDF[ 2 ] = { 64, 0 }; + +/* Tables for LBRR flags */ +static const opus_uint8 silk_LBRR_flags_2_iCDF[ 3 ] = { 203, 150, 0 }; +static const opus_uint8 silk_LBRR_flags_3_iCDF[ 7 ] = { 215, 195, 166, 125, 110, 82, 0 }; +const opus_uint8 * const silk_LBRR_flags_iCDF_ptr[ 2 ] = { + silk_LBRR_flags_2_iCDF, + silk_LBRR_flags_3_iCDF +}; + +/* Table for LSB coding */ +const opus_uint8 silk_lsb_iCDF[ 2 ] = { 120, 0 }; + +/* Tables for LTPScale */ +const opus_uint8 silk_LTPscale_iCDF[ 3 ] = { 128, 64, 0 }; + +/* Tables for signal type and offset coding */ +const opus_uint8 silk_type_offset_VAD_iCDF[ 4 ] = { + 232, 158, 10, 0 +}; +const opus_uint8 silk_type_offset_no_VAD_iCDF[ 2 ] = { + 230, 0 +}; + +/* Tables for NLSF interpolation factor */ +const opus_uint8 silk_NLSF_interpolation_factor_iCDF[ 5 ] = { 243, 221, 192, 181, 0 }; + +/* Quantization offsets */ +const opus_int16 silk_Quantization_Offsets_Q10[ 2 ][ 2 ] = { + { OFFSET_UVL_Q10, OFFSET_UVH_Q10 }, { OFFSET_VL_Q10, OFFSET_VH_Q10 } +}; + +/* Table for LTPScale */ +const opus_int16 silk_LTPScales_table_Q14[ 3 ] = { 15565, 12288, 8192 }; + +/* Uniform entropy tables */ +const opus_uint8 silk_uniform3_iCDF[ 3 ] = { 171, 85, 0 }; +const opus_uint8 silk_uniform4_iCDF[ 4 ] = { 192, 128, 64, 0 }; +const opus_uint8 silk_uniform5_iCDF[ 5 ] = { 205, 154, 102, 51, 0 }; +const opus_uint8 silk_uniform6_iCDF[ 6 ] = { 213, 171, 128, 85, 43, 0 }; +const opus_uint8 silk_uniform8_iCDF[ 8 ] = { 224, 192, 160, 128, 96, 64, 32, 0 }; + +const opus_uint8 silk_NLSF_EXT_iCDF[ 7 ] = { 100, 40, 16, 7, 3, 1, 0 }; + +/* Elliptic/Cauer filters designed with 0.1 dB passband ripple, + 80 dB minimum stopband attenuation, and + [0.95 : 0.15 : 0.35] normalized cut off frequencies. 
*/ + +/* Interpolation points for filter coefficients used in the bandwidth transition smoother */ +const opus_int32 silk_Transition_LP_B_Q28[ TRANSITION_INT_NUM ][ TRANSITION_NB ] = +{ +{ 250767114, 501534038, 250767114 }, +{ 209867381, 419732057, 209867381 }, +{ 170987846, 341967853, 170987846 }, +{ 131531482, 263046905, 131531482 }, +{ 89306658, 178584282, 89306658 } +}; + +/* Interpolation points for filter coefficients used in the bandwidth transition smoother */ +const opus_int32 silk_Transition_LP_A_Q28[ TRANSITION_INT_NUM ][ TRANSITION_NA ] = +{ +{ 506393414, 239854379 }, +{ 411067935, 169683996 }, +{ 306733530, 116694253 }, +{ 185807084, 77959395 }, +{ 35497197, 57401098 } +}; + +#ifdef __cplusplus +} +#endif + diff --git a/thirdparty/opus/silk/tables_pitch_lag.c b/thirdparty/opus/silk/tables_pitch_lag.c new file mode 100644 index 000000000000..e80cc59a2729 --- /dev/null +++ b/thirdparty/opus/silk/tables_pitch_lag.c @@ -0,0 +1,69 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "tables.h" + +const opus_uint8 silk_pitch_lag_iCDF[ 2 * ( PITCH_EST_MAX_LAG_MS - PITCH_EST_MIN_LAG_MS ) ] = { + 253, 250, 244, 233, 212, 182, 150, 131, + 120, 110, 98, 85, 72, 60, 49, 40, + 32, 25, 19, 15, 13, 11, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 +}; + +const opus_uint8 silk_pitch_delta_iCDF[21] = { + 210, 208, 206, 203, 199, 193, 183, 168, + 142, 104, 74, 52, 37, 27, 20, 14, + 10, 6, 4, 2, 0 +}; + +const opus_uint8 silk_pitch_contour_iCDF[34] = { + 223, 201, 183, 167, 152, 138, 124, 111, + 98, 88, 79, 70, 62, 56, 50, 44, + 39, 35, 31, 27, 24, 21, 18, 16, + 14, 12, 10, 8, 6, 4, 3, 2, + 1, 0 +}; + +const opus_uint8 silk_pitch_contour_NB_iCDF[11] = { + 188, 176, 155, 138, 119, 97, 67, 43, + 26, 10, 0 +}; + +const opus_uint8 silk_pitch_contour_10_ms_iCDF[12] = { + 165, 119, 80, 61, 47, 35, 27, 20, + 14, 9, 4, 0 +}; + +const opus_uint8 silk_pitch_contour_10_ms_NB_iCDF[3] = { + 113, 63, 0 +}; + + diff --git a/thirdparty/opus/silk/tables_pulses_per_block.c b/thirdparty/opus/silk/tables_pulses_per_block.c new file mode 100644 index 000000000000..c7c01c8893fa --- /dev/null +++ b/thirdparty/opus/silk/tables_pulses_per_block.c @@ -0,0 +1,264 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
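+***********************************************************************/
+
+/* Illustrative note, not from the Opus sources: in the shell coder tables
+   below, the iCDF row for splitting p pulses between the two halves of a
+   block has p + 1 entries, so row p begins at
+   silk_shell_code_table_offsets[ p ]. The offsets { 0, 0, 2, 5, 9, 14, ... }
+   grow by p + 1 at each step, and the rows total
+   2 + 3 + ... + 17 = 152 entries, the length of each table. */
+
+/***********************************************************************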
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "tables.h" + +const opus_uint8 silk_max_pulses_table[ 4 ] = { + 8, 10, 12, 16 +}; + +const opus_uint8 silk_pulses_per_block_iCDF[ 10 ][ 18 ] = { +{ + 125, 51, 26, 18, 15, 12, 11, 10, + 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0 +}, +{ + 198, 105, 45, 22, 15, 12, 11, 10, + 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0 +}, +{ + 213, 162, 116, 83, 59, 43, 32, 24, + 18, 15, 12, 9, 7, 6, 5, 3, + 2, 0 +}, +{ + 239, 187, 116, 59, 28, 16, 11, 10, + 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0 +}, +{ + 250, 229, 188, 135, 86, 51, 30, 19, + 13, 10, 8, 6, 5, 4, 3, 2, + 1, 0 +}, +{ + 249, 235, 213, 185, 156, 128, 103, 83, + 66, 53, 42, 33, 26, 21, 17, 13, + 10, 0 +}, +{ + 254, 249, 235, 206, 164, 118, 77, 46, + 27, 16, 10, 7, 5, 4, 3, 2, + 1, 0 +}, +{ + 255, 253, 249, 239, 220, 191, 156, 119, + 85, 57, 37, 23, 15, 10, 6, 4, + 2, 0 +}, +{ + 255, 253, 251, 246, 237, 223, 203, 179, + 152, 124, 98, 75, 55, 40, 29, 21, + 15, 0 +}, +{ + 255, 254, 253, 247, 220, 162, 106, 67, + 42, 28, 18, 12, 9, 6, 4, 3, + 2, 0 +} +}; + +const opus_uint8 silk_pulses_per_block_BITS_Q5[ 9 ][ 18 ] = { +{ + 31, 57, 107, 160, 205, 205, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255 +}, +{ + 69, 47, 67, 111, 166, 205, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255 +}, +{ + 82, 74, 79, 95, 109, 128, 145, 160, + 173, 205, 205, 205, 224, 255, 255, 224, + 255, 224 +}, +{ + 125, 74, 59, 69, 97, 141, 182, 255, + 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255 +}, +{ + 173, 115, 85, 73, 76, 92, 115, 145, + 173, 205, 224, 224, 255, 255, 255, 255, + 255, 255 +}, +{ + 166, 134, 113, 102, 101, 102, 107, 118, + 125, 138, 145, 155, 166, 182, 192, 192, + 205, 150 +}, +{ + 224, 182, 134, 101, 83, 79, 85, 97, + 120, 145, 173, 205, 224, 255, 255, 255, + 255, 255 +}, +{ + 255, 224, 192, 150, 120, 101, 92, 89, + 93, 102, 118, 134, 160, 182, 192, 224, + 224, 224 +}, +{ + 255, 224, 224, 182, 155, 134, 118, 109, + 104, 102, 106, 111, 118, 131, 145, 160, + 173, 131 +} +}; + +const opus_uint8 silk_rate_levels_iCDF[ 2 ][ 9 ] = +{ +{ + 241, 190, 178, 132, 87, 74, 41, 14, + 0 +}, +{ + 223, 193, 157, 140, 106, 57, 39, 18, + 0 +} +}; + +const opus_uint8 silk_rate_levels_BITS_Q5[ 2 ][ 9 ] = +{ +{ + 131, 74, 141, 79, 80, 138, 95, 104, + 134 +}, +{ + 95, 99, 91, 125, 93, 76, 123, 115, + 123 +} +}; + +const opus_uint8 silk_shell_code_table0[ 152 ] = { + 128, 0, 214, 42, 0, 235, 128, 21, + 0, 244, 184, 72, 11, 0, 248, 214, + 128, 42, 7, 0, 248, 225, 170, 80, + 25, 5, 0, 251, 236, 198, 126, 54, + 18, 3, 0, 250, 238, 211, 159, 82, + 35, 15, 5, 0, 250, 231, 203, 168, + 128, 88, 53, 25, 6, 0, 252, 238, + 216, 185, 148, 108, 71, 40, 18, 4, + 0, 253, 243, 225, 199, 166, 128, 90, + 57, 31, 13, 3, 0, 254, 246, 233, + 212, 183, 147, 109, 73, 44, 23, 10, + 2, 0, 255, 250, 240, 223, 198, 166, + 128, 90, 58, 33, 16, 6, 1, 0, + 255, 251, 244, 231, 210, 181, 146, 110, + 75, 46, 25, 12, 5, 1, 0, 255, + 253, 248, 238, 221, 196, 164, 128, 92, + 60, 35, 18, 8, 3, 1, 0, 255, + 253, 249, 242, 229, 208, 180, 146, 110, + 76, 48, 27, 14, 7, 3, 1, 0 +}; + +const opus_uint8 silk_shell_code_table1[ 152 ] = { + 129, 0, 207, 50, 0, 236, 129, 20, + 0, 245, 185, 72, 10, 0, 249, 213, + 129, 42, 6, 0, 250, 226, 169, 87, + 27, 4, 0, 251, 233, 194, 130, 62, + 20, 4, 0, 250, 236, 207, 160, 99, + 47, 17, 3, 0, 255, 240, 217, 182, + 131, 81, 41, 11, 1, 0, 255, 254, + 233, 201, 159, 107, 61, 20, 2, 1, + 0, 255, 249, 233, 206, 170, 128, 86, + 50, 23, 7, 1, 0, 255, 250, 
238, + 217, 186, 148, 108, 70, 39, 18, 6, + 1, 0, 255, 252, 243, 226, 200, 166, + 128, 90, 56, 30, 13, 4, 1, 0, + 255, 252, 245, 231, 209, 180, 146, 110, + 76, 47, 25, 11, 4, 1, 0, 255, + 253, 248, 237, 219, 194, 163, 128, 93, + 62, 37, 19, 8, 3, 1, 0, 255, + 254, 250, 241, 226, 205, 177, 145, 111, + 79, 51, 30, 15, 6, 2, 1, 0 +}; + +const opus_uint8 silk_shell_code_table2[ 152 ] = { + 129, 0, 203, 54, 0, 234, 129, 23, + 0, 245, 184, 73, 10, 0, 250, 215, + 129, 41, 5, 0, 252, 232, 173, 86, + 24, 3, 0, 253, 240, 200, 129, 56, + 15, 2, 0, 253, 244, 217, 164, 94, + 38, 10, 1, 0, 253, 245, 226, 189, + 132, 71, 27, 7, 1, 0, 253, 246, + 231, 203, 159, 105, 56, 23, 6, 1, + 0, 255, 248, 235, 213, 179, 133, 85, + 47, 19, 5, 1, 0, 255, 254, 243, + 221, 194, 159, 117, 70, 37, 12, 2, + 1, 0, 255, 254, 248, 234, 208, 171, + 128, 85, 48, 22, 8, 2, 1, 0, + 255, 254, 250, 240, 220, 189, 149, 107, + 67, 36, 16, 6, 2, 1, 0, 255, + 254, 251, 243, 227, 201, 166, 128, 90, + 55, 29, 13, 5, 2, 1, 0, 255, + 254, 252, 246, 234, 213, 183, 147, 109, + 73, 43, 22, 10, 4, 2, 1, 0 +}; + +const opus_uint8 silk_shell_code_table3[ 152 ] = { + 130, 0, 200, 58, 0, 231, 130, 26, + 0, 244, 184, 76, 12, 0, 249, 214, + 130, 43, 6, 0, 252, 232, 173, 87, + 24, 3, 0, 253, 241, 203, 131, 56, + 14, 2, 0, 254, 246, 221, 167, 94, + 35, 8, 1, 0, 254, 249, 232, 193, + 130, 65, 23, 5, 1, 0, 255, 251, + 239, 211, 162, 99, 45, 15, 4, 1, + 0, 255, 251, 243, 223, 186, 131, 74, + 33, 11, 3, 1, 0, 255, 252, 245, + 230, 202, 158, 105, 57, 24, 8, 2, + 1, 0, 255, 253, 247, 235, 214, 179, + 132, 84, 44, 19, 7, 2, 1, 0, + 255, 254, 250, 240, 223, 196, 159, 112, + 69, 36, 15, 6, 2, 1, 0, 255, + 254, 253, 245, 231, 209, 176, 136, 93, + 55, 27, 11, 3, 2, 1, 0, 255, + 254, 253, 252, 239, 221, 194, 158, 117, + 76, 42, 18, 4, 3, 2, 1, 0 +}; + +const opus_uint8 silk_shell_code_table_offsets[ 17 ] = { + 0, 0, 2, 5, 9, 14, 20, 27, + 35, 44, 54, 65, 77, 90, 104, 119, + 135 +}; + +const opus_uint8 silk_sign_iCDF[ 42 ] = { + 254, 49, 67, 77, 82, 93, 99, + 198, 11, 18, 24, 31, 36, 45, + 255, 46, 66, 78, 87, 94, 104, + 208, 14, 21, 32, 42, 51, 66, + 255, 94, 104, 109, 112, 115, 118, + 248, 53, 69, 80, 88, 95, 102 +}; diff --git a/thirdparty/opus/silk/tuning_parameters.h b/thirdparty/opus/silk/tuning_parameters.h new file mode 100644 index 000000000000..5b8f404235f4 --- /dev/null +++ b/thirdparty/opus/silk/tuning_parameters.h @@ -0,0 +1,171 @@ +/*********************************************************************** +Copyright (c) 2006-2011, Skype Limited. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_TUNING_PARAMETERS_H
+#define SILK_TUNING_PARAMETERS_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Decay time for bitreservoir */
+#define BITRESERVOIR_DECAY_TIME_MS 500
+
+/*******************/
+/* Pitch estimator */
+/*******************/
+
+/* Level of noise floor for whitening filter LPC analysis in pitch analysis */
+#define FIND_PITCH_WHITE_NOISE_FRACTION 1e-3f
+
+/* Bandwidth expansion for whitening filter in pitch analysis */
+#define FIND_PITCH_BANDWIDTH_EXPANSION 0.99f
+
+/*********************/
+/* Linear prediction */
+/*********************/
+
+/* LPC analysis regularization */
+#define FIND_LPC_COND_FAC 1e-5f
+
+/* LTP analysis defines */
+#define FIND_LTP_COND_FAC 1e-5f
+#define LTP_DAMPING 0.05f
+#define LTP_SMOOTHING 0.1f
+
+/* LTP quantization settings */
+#define MU_LTP_QUANT_NB 0.03f
+#define MU_LTP_QUANT_MB 0.025f
+#define MU_LTP_QUANT_WB 0.02f
+
+/* Max cumulative LTP gain */
+#define MAX_SUM_LOG_GAIN_DB 250.0f
+
+/***********************/
+/* High pass filtering */
+/***********************/
+
+/* Smoothing parameters for low end of pitch frequency range estimation */
+#define VARIABLE_HP_SMTH_COEF1 0.1f
+#define VARIABLE_HP_SMTH_COEF2 0.015f
+#define VARIABLE_HP_MAX_DELTA_FREQ 0.4f
+
+/* Min and max cut-off frequency values (-3 dB points) */
+#define VARIABLE_HP_MIN_CUTOFF_HZ 60
+#define VARIABLE_HP_MAX_CUTOFF_HZ 100
+
+/***********/
+/* Various */
+/***********/
+
+/* VAD threshold */
+#define SPEECH_ACTIVITY_DTX_THRES 0.05f
+
+/* Speech Activity LBRR enable threshold */
+#define LBRR_SPEECH_ACTIVITY_THRES 0.3f
+
+/*************************/
+/* Perceptual parameters */
+/*************************/
+
+/* reduction in coding SNR during low speech activity */
+#define BG_SNR_DECR_dB 2.0f
+
+/* factor for reducing quantization noise during voiced speech */
+#define HARM_SNR_INCR_dB 2.0f
+
+/* factor for reducing quantization noise for unvoiced sparse signals */
+#define SPARSE_SNR_INCR_dB 2.0f
+
+/* threshold for sparseness measure above which to use lower quantization offset during unvoiced */
+#define SPARSENESS_THRESHOLD_QNT_OFFSET 0.75f
+
+/* warping control */
+#define WARPING_MULTIPLIER 0.015f
+
+/* fraction added to first autocorrelation value */
+#define SHAPE_WHITE_NOISE_FRACTION 5e-5f
+
+/* noise shaping filter chirp factor */
+#define BANDWIDTH_EXPANSION 0.95f
+
+/* difference between chirp factors for analysis and synthesis noise shaping filters at low bitrates */
+#define LOW_RATE_BANDWIDTH_EXPANSION_DELTA 0.01f
+
+/* extra harmonic boosting (signal shaping) at low bitrates */
+#define LOW_RATE_HARMONIC_BOOST 0.1f
+
+/* extra harmonic boosting (signal shaping) for noisy input signals */
+#define LOW_INPUT_QUALITY_HARMONIC_BOOST 0.1f
+
+/* harmonic noise shaping */
+#define HARMONIC_SHAPING 0.3f
+
+/* extra harmonic noise shaping for high bitrates or noisy input */
+#define HIGH_RATE_OR_LOW_QUALITY_HARMONIC_SHAPING 0.2f
+
+/* parameter for shaping noise towards higher frequencies */
+#define HP_NOISE_COEF 0.25f
+
+/* parameter for shaping noise even more towards higher frequencies during voiced speech */
+#define HARM_HP_NOISE_COEF 0.35f
+
+/* parameter for applying a high-pass tilt to the input signal */
+#define INPUT_TILT 0.05f
+
+/* parameter for extra high-pass tilt to the input signal at high rates */
+#define HIGH_RATE_INPUT_TILT 0.1f
+
+/* parameter for reducing noise at the very low frequencies */
+#define LOW_FREQ_SHAPING 4.0f
+
+/* less reduction of noise at the very low frequencies for signals with low SNR at low frequencies */
+#define LOW_QUALITY_LOW_FREQ_SHAPING_DECR 0.5f
+
+/* subframe smoothing coefficient for HarmBoost, HarmShapeGain, Tilt (lower -> more smoothing) */
+#define SUBFR_SMTH_COEF 0.4f
+
+/* parameters defining the R/D tradeoff in the residual quantizer */
+#define LAMBDA_OFFSET 1.2f
+#define LAMBDA_SPEECH_ACT -0.2f
+#define LAMBDA_DELAYED_DECISIONS -0.05f
+#define LAMBDA_INPUT_QUALITY -0.1f
+#define LAMBDA_CODING_QUALITY -0.2f
+#define LAMBDA_QUANT_OFFSET 0.8f
+
+/* Compensation in bitrate calculations for 10 ms modes */
+#define REDUCE_BITRATE_10_MS_BPS 2200
+
+/* Maximum time before allowing a bandwidth transition */
+#define MAX_BANDWIDTH_SWITCH_DELAY_MS 5000
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SILK_TUNING_PARAMETERS_H */
diff --git a/thirdparty/opus/silk/typedef.h b/thirdparty/opus/silk/typedef.h
new file mode 100644
index 000000000000..97b7e709be5b
--- /dev/null
+++ b/thirdparty/opus/silk/typedef.h
@@ -0,0 +1,78 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_TYPEDEF_H
+#define SILK_TYPEDEF_H
+
+#include "opus_types.h"
+#include "opus_defines.h"
+
+#ifndef FIXED_POINT
+# include <float.h>
+# define silk_float float
+# define silk_float_MAX FLT_MAX
+#endif
+
+#define silk_int64_MAX ((opus_int64)0x7FFFFFFFFFFFFFFFLL) /* 2^63 - 1 */
+#define silk_int64_MIN ((opus_int64)0x8000000000000000LL) /* -2^63 */
+#define silk_int32_MAX 0x7FFFFFFF /* 2^31 - 1 = 2147483647 */
+#define silk_int32_MIN ((opus_int32)0x80000000) /* -2^31 = -2147483648 */
+#define silk_int16_MAX 0x7FFF /* 2^15 - 1 = 32767 */
+#define silk_int16_MIN ((opus_int16)0x8000) /* -2^15 = -32768 */
+#define silk_int8_MAX 0x7F /* 2^7 - 1 = 127 */
+#define silk_int8_MIN ((opus_int8)0x80) /* -2^7 = -128 */
+#define silk_uint8_MAX 0xFF /* 2^8 - 1 = 255 */
+
+#define silk_TRUE 1
+#define silk_FALSE 0
+
+/* assertions */
+#if (defined _WIN32 && !defined _WINCE && !defined(__GNUC__) && !defined(NO_ASSERTS))
+# ifndef silk_assert
+# include <crtdbg.h> /* ASSERTE() */
+# define silk_assert(COND) _ASSERTE(COND)
+# endif
+#else
+# ifdef ENABLE_ASSERTIONS
+# include <stdio.h>
+# include <stdlib.h>
+#define silk_fatal(str) _silk_fatal(str, __FILE__, __LINE__);
+#ifdef __GNUC__
+__attribute__((noreturn))
+#endif
+static OPUS_INLINE void _silk_fatal(const char *str, const char *file, int line)
+{
+  fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str);
+  abort();
+}
+# define silk_assert(COND) {if (!(COND)) {silk_fatal("assertion failed: " #COND);}}
+# else
+# define silk_assert(COND)
+# endif
+#endif
+
+#endif /* SILK_TYPEDEF_H */
diff --git a/thirdparty/opus/stream.c b/thirdparty/opus/stream.c
new file mode 100644
index 000000000000..0238a6b31bd9
--- /dev/null
+++ b/thirdparty/opus/stream.c
@@ -0,0 +1,366 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE libopusfile SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE libopusfile SOURCE CODE IS (C) COPYRIGHT 1994-2012           *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function: stdio-based convenience library for opening/seeking/decoding
+  last mod: $Id: vorbisfile.c 17573 2010-10-27 14:53:59Z xiphmont $
+
+ ********************************************************************/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "internal.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#if defined(_WIN32)
+# include <io.h>
+#endif
+
+typedef struct OpusMemStream OpusMemStream;
+
+#define OP_MEM_SIZE_MAX (~(size_t)0>>1)
+#define OP_MEM_DIFF_MAX ((ptrdiff_t)OP_MEM_SIZE_MAX)
+
+/*The context information needed to read from a block of memory as if it were a
+  file.*/
+struct OpusMemStream{
+  /*The block of memory to read from.*/
+  const unsigned char *data;
+  /*The total size of the block.
+    This must be at most OP_MEM_SIZE_MAX to prevent signed overflow while
+    seeking.*/
+  ptrdiff_t size;
+  /*The current file position.
+    This is allowed to be set arbitrarily greater than size (i.e., past the end
+    of the block, though we will not read data past the end of the block), but
+    is not allowed to be negative (i.e., before the beginning of the block).*/
+  ptrdiff_t pos;
+};
+
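+/*The op_f*() wrappers below adapt a stdio FILE to the four OpusFileCallbacks
+  slots (read, seek, tell and close); the seek and tell paths need
+  platform-specific handling to keep 64-bit offsets on Windows.*/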
+static int op_fread(void *_stream,unsigned char *_ptr,int _buf_size){
+  FILE *stream;
+  size_t ret;
+  /*Check for empty read.*/
+  if(_buf_size<=0)return 0;
+  stream=(FILE *)_stream;
+  ret=fread(_ptr,1,_buf_size,stream);
+  OP_ASSERT(ret<=(size_t)_buf_size);
+  /*If ret==0 and !feof(stream), there was a read error.*/
+  return ret>0||feof(stream)?(int)ret:OP_EREAD;
+}
+
+static int op_fseek(void *_stream,opus_int64 _offset,int _whence){
+#if defined(_WIN32)
+  /*_fseeki64() is not exposed until MSVCRT80.
+    This is the default starting with MSVC 2005 (_MSC_VER>=1400), but we want
+    to allow linking against older MSVCRT versions for compatibility back to
+    XP without installing extra runtime libraries.
+    i686-pc-mingw32 does not have fseeko() and requires
+    __MSVCRT_VERSION__>=0x800 for _fseeki64(), which screws up linking with
+    other libraries (that don't use MSVCRT80 from MSVC 2005 by default).
+    i686-w64-mingw32 does have fseeko() and respects _FILE_OFFSET_BITS, but I
+    don't know how to detect that at compile time.
+    We could just use fseeko64() (which is available in both), but it's
+    implemented using fgetpos()/fsetpos() just like this code, except without
+    the overflow checking, so we prefer our version.*/
+  opus_int64 pos;
+  /*We don't use fpos_t directly because it might be a struct if __STDC__ is
+    non-zero or _INTEGRAL_MAX_BITS < 64.
+    I'm not certain when the latter is true, but someone could in theory set
+    the former.
+    Either way, it should be binary compatible with a normal 64-bit int (this
+    assumption is not portable, but I believe it is true for MSVCRT).*/
+  OP_ASSERT(sizeof(pos)==sizeof(fpos_t));
+  /*Translate the seek to an absolute one.*/
+  if(_whence==SEEK_CUR){
+    int ret;
+    ret=fgetpos((FILE *)_stream,(fpos_t *)&pos);
+    if(ret)return ret;
+  }
+  else if(_whence==SEEK_END)pos=_filelengthi64(_fileno((FILE *)_stream));
+  else if(_whence==SEEK_SET)pos=0;
+  else return -1;
+  /*Check for errors or overflow.*/
+  if(pos<0||_offset<-pos||_offset>OP_INT64_MAX-pos)return -1;
+  pos+=_offset;
+  return fsetpos((FILE *)_stream,(fpos_t *)&pos);
+#else
+  /*This function actually conforms to the SUSv2 and POSIX.1-2001, so we prefer
+    it except on Windows.*/
+  return fseeko((FILE *)_stream,(off_t)_offset,_whence);
+#endif
+}
+
+static opus_int64 op_ftell(void *_stream){
+#if defined(_WIN32)
+  /*_ftelli64() is not exposed until MSVCRT80, and ftello()/ftello64() have
+    the same problems as fseeko()/fseeko64() in MinGW.
+    See above for a more detailed explanation.*/
+  opus_int64 pos;
+  OP_ASSERT(sizeof(pos)==sizeof(fpos_t));
+  return fgetpos((FILE *)_stream,(fpos_t *)&pos)?-1:pos;
+#else
+  /*This function actually conforms to the SUSv2 and POSIX.1-2001, so we prefer
+    it except on Windows.*/
+  return ftello((FILE *)_stream);
+#endif
+}
+
+static const OpusFileCallbacks OP_FILE_CALLBACKS={
+  op_fread,
+  op_fseek,
+  op_ftell,
+  (op_close_func)fclose
+};
+
+#if defined(_WIN32)
+# include <wchar.h>
+# include <windows.h>
+
+/*Windows doesn't accept UTF-8 by default, and we don't have a wchar_t API,
+  so if we just pass the path to fopen(), then there'd be no way for a user
+  of our API to open a Unicode filename.
+  Instead, we translate from UTF-8 to UTF-16 and use Windows' wchar_t API.
+  This makes this API more consistent with platforms where the character set
+  used by fopen is the same as used on disk, which is generally UTF-8, and
+  with our metadata API, which always uses UTF-8.*/
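+/*The decoder below walks the UTF-8 byte sequences by hand: it accepts the 1-
+  to 4-byte forms, rejects overlong encodings, surrogate code points, the
+  0xFFFE/0xFFFF sentinels and values above U+10FFFF, and converts
+  supplementary-plane characters to UTF-16 surrogate pairs via w-=0x10000;
+  high=0xD800+(w>>10); low=0xDC00+(w&0x3FF).
+  For example, U+1F600 gives w=0xF600, so the pair D83D DE00 is emitted.*/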
+static wchar_t *op_utf8_to_utf16(const char *_src){
+  wchar_t *dst;
+  size_t len;
+  len=strlen(_src);
+  /*Worst-case output is 1 wide character per 1 input character.*/
+  dst=(wchar_t *)_ogg_malloc(sizeof(*dst)*(len+1));
+  if(dst!=NULL){
+    size_t si;
+    size_t di;
+    for(di=si=0;si<len;si++){
+      int c0;
+      c0=(unsigned char)_src[si];
+      if(!(c0&0x80)){
+        /*Start byte says this is a 1-byte sequence.*/
+        dst[di++]=(wchar_t)c0;
+        continue;
+      }
+      else{
+        int c1;
+        /*This is safe, because c0 was not 0 and _src is NUL-terminated.*/
+        c1=(unsigned char)_src[si+1];
+        if((c1&0xC0)==0x80){
+          /*Found at least one continuation byte.*/
+          if((c0&0xE0)==0xC0){
+            wchar_t w;
+            /*Start byte says this is a 2-byte sequence.*/
+            w=(c0&0x1F)<<6|c1&0x3F;
+            if(w>=0x80U){
+              /*This is a 2-byte sequence that is not overlong.*/
+              dst[di++]=w;
+              si++;
+              continue;
+            }
+          }
+          else{
+            int c2;
+            /*This is safe, because c1 was not 0 and _src is NUL-terminated.*/
+            c2=(unsigned char)_src[si+2];
+            if((c2&0xC0)==0x80){
+              /*Found at least two continuation bytes.*/
+              if((c0&0xF0)==0xE0){
+                wchar_t w;
+                /*Start byte says this is a 3-byte sequence.*/
+                w=(c0&0xF)<<12|(c1&0x3F)<<6|c2&0x3F;
+                if(w>=0x800U&&(w<0xD800||w>=0xE000)&&w<0xFFFE){
+                  /*This is a 3-byte sequence that is not overlong, not a
+                    UTF-16 surrogate pair value, and not a 'not a character'
+                    value.*/
+                  dst[di++]=w;
+                  si+=2;
+                  continue;
+                }
+              }
+              else{
+                int c3;
+                /*This is safe, because c2 was not 0 and _src is
+                  NUL-terminated.*/
+                c3=(unsigned char)_src[si+3];
+                if((c3&0xC0)==0x80){
+                  /*Found at least three continuation bytes.*/
+                  if((c0&0xF8)==0xF0){
+                    opus_uint32 w;
+                    /*Start byte says this is a 4-byte sequence.*/
+                    w=(c0&7)<<18|(c1&0x3F)<<12|(c2&0x3F)<<6|c3&0x3F;
+                    if(w>=0x10000U&&w<0x110000U){
+                      /*This is a 4-byte sequence that is not overlong and not
+                        greater than the largest valid Unicode code point.
+                        Convert it to a surrogate pair.*/
+                      w-=0x10000;
+                      dst[di++]=(wchar_t)(0xD800+(w>>10));
+                      dst[di++]=(wchar_t)(0xDC00+(w&0x3FF));
+                      si+=3;
+                      continue;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+      /*If we got here, we encountered an illegal UTF-8 sequence.*/
+      _ogg_free(dst);
+      return NULL;
+    }
+    OP_ASSERT(di<=len);
+    dst[di]='\0';
+  }
+  return dst;
+}
+
+#endif
+
+void *op_fopen(OpusFileCallbacks *_cb,const char *_path,const char *_mode){
+  FILE *fp;
+#if !defined(_WIN32)
+  fp=fopen(_path,_mode);
+#else
+  fp=NULL;
+  if(_path==NULL||_mode==NULL)errno=EINVAL;
+  else{
+    wchar_t *wpath;
+    wchar_t *wmode;
+    wpath=op_utf8_to_utf16(_path);
+    wmode=op_utf8_to_utf16(_mode);
+    if(wmode==NULL)errno=EINVAL;
+    else if(wpath==NULL)errno=ENOENT;
+    else fp=_wfopen(wpath,wmode);
+    _ogg_free(wmode);
+    _ogg_free(wpath);
+  }
+#endif
+  if(fp!=NULL)*_cb=*&OP_FILE_CALLBACKS;
+  return fp;
+}
+
+void *op_fdopen(OpusFileCallbacks *_cb,int _fd,const char *_mode){
+  FILE *fp;
+  fp=fdopen(_fd,_mode);
+  if(fp!=NULL)*_cb=*&OP_FILE_CALLBACKS;
+  return fp;
+}
+
+void *op_freopen(OpusFileCallbacks *_cb,const char *_path,const char *_mode,
+ void *_stream){
+  FILE *fp;
+#if !defined(_WIN32)
+  fp=freopen(_path,_mode,(FILE *)_stream);
+#else
+  fp=NULL;
+  if(_path==NULL||_mode==NULL)errno=EINVAL;
+  else{
+    wchar_t *wpath;
+    wchar_t *wmode;
+    wpath=op_utf8_to_utf16(_path);
+    wmode=op_utf8_to_utf16(_mode);
+    if(wmode==NULL)errno=EINVAL;
+    else if(wpath==NULL)errno=ENOENT;
+    else fp=_wfreopen(wpath,wmode,(FILE *)_stream);
+    _ogg_free(wmode);
+    _ogg_free(wpath);
+  }
+#endif
+  if(fp!=NULL)*_cb=*&OP_FILE_CALLBACKS;
+  return fp;
+}
+
+static int op_mem_read(void *_stream,unsigned char *_ptr,int _buf_size){
+  OpusMemStream *stream;
+  ptrdiff_t size;
+  ptrdiff_t pos;
+  stream=(OpusMemStream *)_stream;
+  /*Check for empty read.*/
+  if(_buf_size<=0)return 0;
+  size=stream->size;
+  pos=stream->pos;
+  /*Check for EOF.*/
+  if(pos>=size)return 0;
+  /*Check for a short read.*/
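+  /*At most size-pos bytes remain before the end of the block; clamping the
+    request keeps the memcpy below in bounds and returns a short count,
+    matching the op_read_func contract.*/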
+  _buf_size=(int)OP_MIN(size-pos,_buf_size);
+  memcpy(_ptr,stream->data+pos,_buf_size);
+  pos+=_buf_size;
+  stream->pos=pos;
+  return _buf_size;
+}
+
+static int op_mem_seek(void *_stream,opus_int64 _offset,int _whence){
+  OpusMemStream *stream;
+  ptrdiff_t pos;
+  stream=(OpusMemStream *)_stream;
+  pos=stream->pos;
+  OP_ASSERT(pos>=0);
+  switch(_whence){
+    case SEEK_SET:{
+      /*Check for overflow:*/
+      if(_offset<0||_offset>OP_MEM_DIFF_MAX)return -1;
+      pos=(ptrdiff_t)_offset;
+    }break;
+    case SEEK_CUR:{
+      /*Check for overflow:*/
+      if(_offset<-pos||_offset>OP_MEM_DIFF_MAX-pos)return -1;
+      pos=(ptrdiff_t)(pos+_offset);
+    }break;
+    case SEEK_END:{
+      ptrdiff_t size;
+      size=stream->size;
+      OP_ASSERT(size>=0);
+      /*Check for overflow:*/
+      if(_offset>size||_offset<size-OP_MEM_DIFF_MAX)return -1;
+      pos=(ptrdiff_t)(size-_offset);
+    }break;
+    default:return -1;
+  }
+  stream->pos=pos;
+  return 0;
+}
+
+static opus_int64 op_mem_tell(void *_stream){
+  OpusMemStream *stream;
+  stream=(OpusMemStream *)_stream;
+  return (ogg_int64_t)stream->pos;
+}
+
+static int op_mem_close(void *_stream){
+  _ogg_free(_stream);
+  return 0;
+}
+
+static const OpusFileCallbacks OP_MEM_CALLBACKS={
+  op_mem_read,
+  op_mem_seek,
+  op_mem_tell,
+  op_mem_close
+};
+
+void *op_mem_stream_create(OpusFileCallbacks *_cb,
+ const unsigned char *_data,size_t _size){
+  OpusMemStream *stream;
+  if(_size>OP_MEM_SIZE_MAX)return NULL;
+  stream=(OpusMemStream *)_ogg_malloc(sizeof(*stream));
+  if(stream!=NULL){
+    *_cb=*&OP_MEM_CALLBACKS;
+    stream->data=_data;
+    stream->size=_size;
+    stream->pos=0;
+  }
+  return stream;
+}
diff --git a/thirdparty/opus/tansig_table.h b/thirdparty/opus/tansig_table.h
new file mode 100644
index 000000000000..c76f844a72f4
--- /dev/null
+++ b/thirdparty/opus/tansig_table.h
@@ -0,0 +1,45 @@
+/* This file is auto-generated by gen_tables */
+
+static const float tansig_table[201] = {
+0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
+0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
+0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
+0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
+0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
+0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
+0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
+0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
+0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
+0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
+0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
+0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
+0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
+0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
+0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
+0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
+0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
+0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
+0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
+0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
+0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
+0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
+0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
+0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
+0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
+0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
+0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
+0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
+0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
+0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
+0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
+0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
+0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
+0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
+0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
+0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
+0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
+0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
+1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
+1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
+1.000000f,
+};
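The 201 entries above sample tanh(x) at x = 0, 0.04, ..., 8.0. A minimal sketch of how such a table is consumed (Opus's MLP code does it along these lines; the helper name and the exact constants below are illustrative, not the upstream function):

#include <math.h>

/* Illustrative fast tanh: nearest-entry lookup in tansig_table (0.04 step,
   201 entries covering 0..8) plus a local Taylor correction. */
static float tansig_approx_sketch(float x)
{
    float sign = 1.0f;
    float y, dy;
    int i;
    /* Comparisons are inverted so a NaN input also takes the saturated path. */
    if (!(x <  8.0f)) return  1.0f;
    if (!(x > -8.0f)) return -1.0f;
    if (x < 0.0f) { x = -x; sign = -1.0f; }
    i  = (int)floorf(0.5f + 25.0f*x);  /* nearest sample index: 25 = 1/0.04 */
    x -= 0.04f*i;                      /* residual from the sample point */
    y  = tansig_table[i];
    dy = 1.0f - y*y;                   /* tanh'(t) = 1 - tanh(t)^2 */
    y  = y + x*dy*(1.0f - y*x);        /* first-order term plus curvature fix */
    return sign*y;
}

The correction term follows from the Taylor expansion around the table point: tanh''(t) = -2*tanh(t)*(1 - tanh(t)^2), so expanding y + x*dy*(1 - y*x) reproduces both the first- and second-order terms.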