author    hkuang <hkuang@google.com>  2013-11-14 22:50:46 -0800
committer Android Git Automerger <android-git-automerger@android.com>  2013-11-14 22:50:46 -0800
commit    aa43d6ba5f5fb7eaeca1d0573ecb16ea4d388197 (patch)
tree      6ff3bfc5090953c49fa26fac842b924d88e0983d
parent    a1b7a7bb1ccf3f479bbca69a52a76eb05789dbaf (diff)
parent    9b35249446b07f40ac5fcc3205f2c048616efacc (diff)
download  android_external_libvpx-aa43d6ba5f5fb7eaeca1d0573ecb16ea4d388197.tar.gz
          android_external_libvpx-aa43d6ba5f5fb7eaeca1d0573ecb16ea4d388197.tar.bz2
          android_external_libvpx-aa43d6ba5f5fb7eaeca1d0573ecb16ea4d388197.zip
am 9b352494: Roll latest libvpx to fix scaling bug. Checkout is from master:Ib748eb287520c794631697204da6ebe19523ce95
* commit '9b35249446b07f40ac5fcc3205f2c048616efacc': Roll latest libvpx to fix scaling bug. Checkout is from master:Ib748eb287520c794631697204da6ebe19523ce95
-rw-r--r--  armv7a-neon/libvpx_srcs.txt | 1
-rw-r--r--  armv7a-neon/vp9_rtcd.h | 3
-rw-r--r--  armv7a-neon/vpx_version.h | 6
-rw-r--r--  armv7a/vpx_version.h | 6
-rw-r--r--  generic/vpx_version.h | 6
-rwxr-xr-x  libvpx/build/make/ads2gas_apple.pl | 2
-rw-r--r--  libvpx/examples.mk | 1
-rw-r--r--  libvpx/test/android/Android.mk | 42
-rw-r--r--  libvpx/test/android/README | 32
-rw-r--r--  libvpx/test/android/get_files.py | 118
-rw-r--r--  libvpx/test/datarate_test.cc | 4
-rw-r--r--  libvpx/test/fdct4x4_test.cc | 378
-rw-r--r--  libvpx/test/svc_test.cc | 101
-rw-r--r--  libvpx/test/test-data.sha1 | 4
-rw-r--r--  libvpx/test/test.mk | 4
-rw-r--r--  libvpx/test/test_vector_test.cc | 1
-rw-r--r--  libvpx/test/vp9_lossless_test.cc | 6
-rw-r--r--  libvpx/tools_common.h | 40
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm | 144
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm | 2
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c | 332
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c | 232
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c | 610
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c | 69
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c | 309
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h | 755
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h | 470
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h | 365
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c | 652
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c | 795
-rw-r--r--  libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c | 840
-rw-r--r--  libvpx/vp9/common/vp9_alloccommon.c | 51
-rw-r--r--  libvpx/vp9/common/vp9_alloccommon.h | 1
-rw-r--r--  libvpx/vp9/common/vp9_blockd.h | 95
-rw-r--r--  libvpx/vp9/common/vp9_entropy.c | 73
-rw-r--r--  libvpx/vp9/common/vp9_entropy.h | 21
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.c | 26
-rw-r--r--  libvpx/vp9/common/vp9_entropymv.c | 7
-rw-r--r--  libvpx/vp9/common/vp9_extend.c | 2
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.c | 363
-rw-r--r--  libvpx/vp9/common/vp9_pred_common.h | 48
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.c | 62
-rw-r--r--  libvpx/vp9/common/vp9_rtcd_defs.sh | 32
-rw-r--r--  libvpx/vp9/common/vp9_treecoder.c | 29
-rw-r--r--  libvpx/vp9/common/vp9_treecoder.h | 8
-rw-r--r--  libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 103
-rw-r--r--  libvpx/vp9/decoder/vp9_decodemv.c | 83
-rw-r--r--  libvpx/vp9/decoder/vp9_decodframe.c | 294
-rw-r--r--  libvpx/vp9/decoder/vp9_detokenize.c | 54
-rw-r--r--  libvpx/vp9/decoder/vp9_detokenize.h | 2
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd_if.c | 14
-rw-r--r--  libvpx/vp9/decoder/vp9_onyxd_int.h | 6
-rw-r--r--  libvpx/vp9/decoder/vp9_treereader.h | 2
-rw-r--r--  libvpx/vp9/encoder/vp9_bitstream.c | 161
-rw-r--r--  libvpx/vp9/encoder/vp9_block.h | 49
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.c | 129
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemb.c | 123
-rw-r--r--  libvpx/vp9/encoder/vp9_encodemv.c | 13
-rw-r--r--  libvpx/vp9/encoder/vp9_firstpass.c | 13
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_if.c | 106
-rw-r--r--  libvpx/vp9/encoder/vp9_onyx_int.h | 1
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.c | 78
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.c | 193
-rw-r--r--  libvpx/vp9/encoder/vp9_tokenize.c | 6
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_sse2.c | 5
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_subpel_variance.asm | 198
-rw-r--r--  libvpx/vp9/vp9_common.mk | 11
-rw-r--r--  libvpx/vpx_scale/generic/yv12config.c | 5
-rw-r--r--  libvpx/vpxdec.c | 15
-rw-r--r--  libvpx/vpxenc.c | 186
-rw-r--r--  libvpx/vpxstats.c | 135
-rw-r--r--  libvpx/vpxstats.h | 37
-rw-r--r--  libvpx/webmenc.c | 14
-rw-r--r--  mips-dspr2/libvpx_srcs.txt | 10
-rw-r--r--  mips-dspr2/vp9_rtcd.h | 45
-rw-r--r--  mips-dspr2/vpx_version.h | 6
-rw-r--r--  mips/vpx_version.h | 6
77 files changed, 7844 insertions, 1407 deletions
diff --git a/armv7a-neon/libvpx_srcs.txt b/armv7a-neon/libvpx_srcs.txt
index 16d30b8..fe24a3f 100644
--- a/armv7a-neon/libvpx_srcs.txt
+++ b/armv7a-neon/libvpx_srcs.txt
@@ -215,6 +215,7 @@ vp9/common/arm/neon/vp9_mb_lpf_neon.asm.s
vp9/common/arm/neon/vp9_save_reg_neon.asm.s
vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm.s
vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm.s
+vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm.s
vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm.s
vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm.s
vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm.s
diff --git a/armv7a-neon/vp9_rtcd.h b/armv7a-neon/vp9_rtcd.h
index d9e2b15..847f8c8 100644
--- a/armv7a-neon/vp9_rtcd.h
+++ b/armv7a-neon/vp9_rtcd.h
@@ -285,7 +285,8 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride
#define vp9_idct32x32_34_add vp9_idct32x32_34_add_c
void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_c
+void vp9_idct32x32_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct32x32_1_add vp9_idct32x32_1_add_neon
void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
void vp9_iht4x4_16_add_neon(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
diff --git a/armv7a-neon/vpx_version.h b/armv7a-neon/vpx_version.h
index d3359dc..9a354bb 100644
--- a/armv7a-neon/vpx_version.h
+++ b/armv7a-neon/vpx_version.h
@@ -1,7 +1,7 @@
#define VERSION_MAJOR 1
#define VERSION_MINOR 2
#define VERSION_PATCH 0
-#define VERSION_EXTRA "4989-gabdefea"
+#define VERSION_EXTRA "5082-g58f7543"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.2.0-4989-gabdefea"
-#define VERSION_STRING " v1.2.0-4989-gabdefea"
+#define VERSION_STRING_NOSP "v1.2.0-5082-g58f7543"
+#define VERSION_STRING " v1.2.0-5082-g58f7543"
diff --git a/armv7a/vpx_version.h b/armv7a/vpx_version.h
index d3359dc..9a354bb 100644
--- a/armv7a/vpx_version.h
+++ b/armv7a/vpx_version.h
@@ -1,7 +1,7 @@
#define VERSION_MAJOR 1
#define VERSION_MINOR 2
#define VERSION_PATCH 0
-#define VERSION_EXTRA "4989-gabdefea"
+#define VERSION_EXTRA "5082-g58f7543"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.2.0-4989-gabdefea"
-#define VERSION_STRING " v1.2.0-4989-gabdefea"
+#define VERSION_STRING_NOSP "v1.2.0-5082-g58f7543"
+#define VERSION_STRING " v1.2.0-5082-g58f7543"
diff --git a/generic/vpx_version.h b/generic/vpx_version.h
index d3359dc..9a354bb 100644
--- a/generic/vpx_version.h
+++ b/generic/vpx_version.h
@@ -1,7 +1,7 @@
#define VERSION_MAJOR 1
#define VERSION_MINOR 2
#define VERSION_PATCH 0
-#define VERSION_EXTRA "4989-gabdefea"
+#define VERSION_EXTRA "5082-g58f7543"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.2.0-4989-gabdefea"
-#define VERSION_STRING " v1.2.0-4989-gabdefea"
+#define VERSION_STRING_NOSP "v1.2.0-5082-g58f7543"
+#define VERSION_STRING " v1.2.0-5082-g58f7543"
diff --git a/libvpx/build/make/ads2gas_apple.pl b/libvpx/build/make/ads2gas_apple.pl
index 51e6fbc..fdafa98 100755
--- a/libvpx/build/make/ads2gas_apple.pl
+++ b/libvpx/build/make/ads2gas_apple.pl
@@ -188,7 +188,7 @@ while (<STDIN>)
$trimmed =~ s/,//g;
# string to array
- @incoming_array = split(/ /, $trimmed);
+ @incoming_array = split(/\s+/, $trimmed);
print ".macro @incoming_array[0]\n";
diff --git a/libvpx/examples.mk b/libvpx/examples.mk
index 2cee298..16f3c8f 100644
--- a/libvpx/examples.mk
+++ b/libvpx/examples.mk
@@ -41,6 +41,7 @@ vpxenc.SRCS += webmenc.c webmenc.h
vpxenc.SRCS += vpx_ports/mem_ops.h
vpxenc.SRCS += vpx_ports/mem_ops_aligned.h
vpxenc.SRCS += vpx_ports/vpx_timer.h
+vpxenc.SRCS += vpxstats.c vpxstats.h
vpxenc.SRCS += third_party/libmkv/EbmlIDs.h
vpxenc.SRCS += third_party/libmkv/EbmlWriter.c
vpxenc.SRCS += third_party/libmkv/EbmlWriter.h
diff --git a/libvpx/test/android/Android.mk b/libvpx/test/android/Android.mk
new file mode 100644
index 0000000..13af601
--- /dev/null
+++ b/libvpx/test/android/Android.mk
@@ -0,0 +1,42 @@
+# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+#
+# This make file builds vpx_test app for android.
+# The test app itself runs on the command line through adb shell
+# The paths are really messed up as the libvpx make file
+# expects to be made from a parent directory.
+CUR_WD := $(call my-dir)
+BINDINGS_DIR := $(CUR_WD)/../../..
+LOCAL_PATH := $(CUR_WD)/../../..
+
+#libvpx
+include $(CLEAR_VARS)
+include $(BINDINGS_DIR)/libvpx/build/make/Android.mk
+LOCAL_PATH := $(CUR_WD)/../..
+
+#libgtest
+include $(CLEAR_VARS)
+LOCAL_ARM_MODE := arm
+LOCAL_CPP_EXTENSION := .cc
+LOCAL_MODULE := gtest
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
+LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
+include $(BUILD_STATIC_LIBRARY)
+
+#libvpx_test
+include $(CLEAR_VARS)
+LOCAL_ARM_MODE := arm
+LOCAL_MODULE := libvpx_test
+LOCAL_STATIC_LIBRARIES := gtest
+LOCAL_SHARED_LIBRARIES := vpx
+include $(LOCAL_PATH)/test/test.mk
+LOCAL_C_INCLUDES := $(BINDINGS_DIR)
+FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
+LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
+include $(BUILD_EXECUTABLE)
diff --git a/libvpx/test/android/README b/libvpx/test/android/README
new file mode 100644
index 0000000..6840d91
--- /dev/null
+++ b/libvpx/test/android/README
@@ -0,0 +1,32 @@
+Android.mk will build vpx unittests on android.
+1) Configure libvpx from the parent directory:
+./libvpx/configure --target=armv7-android-gcc --enable-external-build \
+ --enable-postproc --disable-install-srcs --enable-multi-res-encoding \
+ --enable-temporal-denoising --disable-unit-tests --disable-install-docs \
+ --disable-examples --disable-runtime-cpu-detect --sdk=$NDK
+
+2) From the parent directory, invoke ndk-build:
+NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \
+ APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \
+ APP_STL=gnustl_static
+
+Note: Both adb and ndk-build are available prebuilt at:
+ https://chromium.googlesource.com/android_tools
+
+3) Run get_files.py to download the test files:
+python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \
+ -u http://downloads.webmproject.org/test_data/libvpx
+
+4) Transfer files to device using adb. Ensure you have proper permissions for
+the target
+
+adb push /path/to/test_files /data/local/tmp
+adb push /path/to/built_libs /data/local/tmp
+
+NOTE: Built_libs defaults to parent_dir/libs/armeabi-v7a
+
+5) Run tests:
+adb shell
+(on device)
+cd /data/local/tmp
+LD_LIBRARY_PATH=. ./vpx_test
diff --git a/libvpx/test/android/get_files.py b/libvpx/test/android/get_files.py
new file mode 100644
index 0000000..1c69740
--- /dev/null
+++ b/libvpx/test/android/get_files.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+#
+# This simple script pulls test files from the webm homepage
+# It is intelligent enough to only pull files if
+# 1) File / test_data folder does not exist
+# 2) SHA mismatch
+
+import pycurl
+import csv
+import hashlib
+import re
+import os.path
+import time
+import itertools
+import sys
+import getopt
+
+#globals
+url = ''
+file_list_path = ''
+local_resource_path = ''
+
+# Helper functions:
+# A simple function which returns the sha hash of a file in hex
+def get_file_sha(filename):
+ try:
+ sha_hash = hashlib.sha1()
+ with open(filename, 'rb') as file:
+ buf = file.read(HASH_CHUNK)
+ while len(buf) > 0:
+ sha_hash.update(buf)
+ buf = file.read(HASH_CHUNK)
+ return sha_hash.hexdigest()
+ except IOError:
+ print "Error reading " + filename
+
+# Downloads a file from a url, and then checks the sha against the passed
+# in sha
+def download_and_check_sha(url, filename, sha):
+ path = os.path.join(local_resource_path, filename)
+ fp = open(path, "wb")
+ curl = pycurl.Curl()
+ curl.setopt(pycurl.URL, url + "/" + filename)
+ curl.setopt(pycurl.WRITEDATA, fp)
+ curl.perform()
+ curl.close()
+ fp.close()
+ return get_file_sha(path) == sha
+
+#constants
+ftp_retries = 3
+
+SHA_COL = 0
+NAME_COL = 1
+EXPECTED_COL = 2
+HASH_CHUNK = 65536
+
+# Main script
+try:
+ opts, args = \
+ getopt.getopt(sys.argv[1:], \
+ "u:i:o:", ["url=", "input_csv=", "output_dir="])
+except:
+ print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
+ sys.exit(2)
+
+for opt, arg in opts:
+ if opt == '-u':
+ url = arg
+ elif opt in ("-i", "--input_csv"):
+ file_list_path = os.path.join(arg)
+ elif opt in ("-o", "--output_dir"):
+ local_resource_path = os.path.join(arg)
+
+if len(sys.argv) != 7:
+ print "Expects two paths and a url!"
+ exit(1)
+
+if not os.path.isdir(local_resource_path):
+ os.makedirs(local_resource_path)
+
+file_list_csv = open(file_list_path, "rb")
+
+# Our 'csv' file uses multiple spaces as a delimiter, python's
+# csv class only uses single character delimiters, so we convert them below
+file_list_reader = csv.reader((re.sub(' +', ' ', line) \
+ for line in file_list_csv), delimiter = ' ')
+
+file_shas = []
+file_names = []
+
+for row in file_list_reader:
+ if len(row) != EXPECTED_COL:
+ continue
+ file_shas.append(row[SHA_COL])
+ file_names.append(row[NAME_COL])
+
+file_list_csv.close()
+
+# Download files, only if they don't already exist and have correct shas
+for filename, sha in itertools.izip(file_names, file_shas):
+ path = os.path.join(local_resource_path, filename)
+ if os.path.isfile(path) \
+ and get_file_sha(path) == sha:
+ print path + ' exists, skipping'
+ continue
+ for retry in range(0, ftp_retries):
+ print "Downloading " + path
+ if not download_and_check_sha(url, filename, sha):
+ print "Sha does not match, retrying..."
+ else:
+ break
diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc
index 85f4bb6..5785a0a 100644
--- a/libvpx/test/datarate_test.cc
+++ b/libvpx/test/datarate_test.cc
@@ -248,9 +248,9 @@ TEST_P(DatarateTestVP9, BasicRateTargeting) {
cfg_.rc_target_bitrate = i;
ResetModel();
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.8)
+ ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.9)
<< " The datarate for the file exceeds the target by too much!";
- ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_ * 1.3)
+ ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_ * 1.1)
<< " The datarate for the file missed the target!";
}
}
diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc
index 796a2e9..9d8b0bd 100644
--- a/libvpx/test/fdct4x4_test.cc
+++ b/libvpx/test/fdct4x4_test.cc
@@ -13,178 +13,288 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
extern "C" {
+#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *output, int pitch);
}
-
-#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
using libvpx_test::ACMRandom;
namespace {
-void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
- int stride, int /*tx_type*/) {
+const int kNumCoeffs = 16;
+typedef void (*fdct_t)(const int16_t *in, int16_t *out, int stride);
+typedef void (*idct_t)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*fht_t) (const int16_t *in, int16_t *out, int stride,
+ int tx_type);
+typedef void (*iht_t) (const int16_t *in, uint8_t *out, int stride,
+ int tx_type);
+
+void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_fdct4x4_c(in, out, stride);
}
-void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
- int stride, int /*tx_type*/) {
- vp9_idct4x4_16_add_c(out, dst, stride);
-}
-void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
- int stride, int tx_type) {
+
+void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_short_fht4x4_c(in, out, stride, tx_type);
}
-void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
- int stride, int tx_type) {
- vp9_iht4x4_16_add_c(out, dst, stride, tx_type);
-}
-class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
+class Trans4x4TestBase {
public:
- virtual ~FwdTrans4x4Test() {}
- virtual void SetUp() {
- tx_type_ = GetParam();
- if (tx_type_ == 0) {
- fwd_txfm_ = fdct4x4;
- inv_txfm_ = idct4x4_add;
- } else {
- fwd_txfm_ = fht4x4;
- inv_txfm_ = iht4x4_add;
- }
- }
+ virtual ~Trans4x4TestBase() {}
protected:
- void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
- int stride, int tx_type) {
- (*fwd_txfm_)(in, out, dst, stride, tx_type);
- }
+ virtual void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) = 0;
+
+ virtual void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) = 0;
+
+ void RunAccuracyCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ uint32_t max_error = 0;
+ int64_t total_error = 0;
+ const int count_test_block = 10000;
+ for (int i = 0; i < count_test_block; ++i) {
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
+ }
+
+ REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+ test_temp_block, pitch_));
+ REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ const uint32_t diff = dst[j] - src[j];
+ const uint32_t error = diff * diff;
+ if (max_error < error)
+ max_error = error;
+ total_error += error;
+ }
+ }
- void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
- int stride, int tx_type) {
- (*inv_txfm_)(in, out, dst, stride, tx_type);
+ EXPECT_GE(1u, max_error)
+ << "Error: 4x4 FHT/IHT has an individual round trip error > 1";
+
+ EXPECT_GE(count_test_block , total_error)
+ << "Error: 4x4 FHT/IHT has average round trip error > 1 per block";
}
- int tx_type_;
- void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
- int stride, int tx_type);
- void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
- int stride, int tx_type);
-};
+ void RunCoeffCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j)
+ input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+ fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
+ REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
-TEST_P(FwdTrans4x4Test, SignBiasCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
- const int pitch = 4;
- int count_sign_block[16][2];
- const int count_test_block = 1000000;
-
- memset(count_sign_block, 0, sizeof(count_sign_block));
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 16; ++j)
- test_input_block[j] = rnd.Rand8() - rnd.Rand8();
-
- RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
-
- for (int j = 0; j < 16; ++j) {
- if (test_output_block[j] < 0)
- ++count_sign_block[j][0];
- else if (test_output_block[j] > 0)
- ++count_sign_block[j][1];
+ // The minimum quant value is 4.
+ for (int j = 0; j < kNumCoeffs; ++j)
+ EXPECT_EQ(output_block[j], output_ref_block[j]);
}
}
- for (int j = 0; j < 16; ++j) {
- const bool bias_acceptable = (abs(count_sign_block[j][0] -
- count_sign_block[j][1]) < 10000);
- EXPECT_TRUE(bias_acceptable)
- << "Error: 4x4 FDCT/FHT has a sign bias > 1%"
- << " for input range [-255, 255] at index " << j
- << " tx_type " << tx_type_;
+ void RunMemCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ input_block[j] = rnd.Rand8() - rnd.Rand8();
+ input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+ }
+ if (i == 0)
+ for (int j = 0; j < kNumCoeffs; ++j)
+ input_extreme_block[j] = 255;
+ if (i == 1)
+ for (int j = 0; j < kNumCoeffs; ++j)
+ input_extreme_block[j] = -255;
+
+ fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+ REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+ output_block, pitch_));
+
+ // The minimum quant value is 4.
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ EXPECT_EQ(output_block[j], output_ref_block[j]);
+ EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+ << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+ }
+ }
}
- memset(count_sign_block, 0, sizeof(count_sign_block));
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-15, 15].
- for (int j = 0; j < 16; ++j)
- test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+ void RunInvAccuracyCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
- RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ in[j] = src[j] - dst[j];
+ }
- for (int j = 0; j < 16; ++j) {
- if (test_output_block[j] < 0)
- ++count_sign_block[j][0];
- else if (test_output_block[j] > 0)
- ++count_sign_block[j][1];
+ fwd_txfm_ref(in, coeff, pitch_, tx_type_);
+
+ REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ const uint32_t diff = dst[j] - src[j];
+ const uint32_t error = diff * diff;
+ EXPECT_GE(1u, error)
+ << "Error: 16x16 IDCT has error " << error
+ << " at index " << j;
+ }
}
}
- for (int j = 0; j < 16; ++j) {
- const bool bias_acceptable = (abs(count_sign_block[j][0] -
- count_sign_block[j][1]) < 100000);
- EXPECT_TRUE(bias_acceptable)
- << "Error: 4x4 FDCT/FHT has a sign bias > 10%"
- << " for input range [-15, 15] at index " << j;
+ int pitch_;
+ int tx_type_;
+ fht_t fwd_txfm_ref;
+};
+
+class Trans4x4DCT
+ : public Trans4x4TestBase,
+ public PARAMS(fdct_t, idct_t, int) {
+ public:
+ virtual ~Trans4x4DCT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 4;
+ fwd_txfm_ref = fdct4x4_ref;
}
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+ fwd_txfm_(in, out, stride);
+ }
+ void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride);
+ }
+
+ fdct_t fwd_txfm_;
+ idct_t inv_txfm_;
+};
+
+TEST_P(Trans4x4DCT, AccuracyCheck) {
+ RunAccuracyCheck();
}
-TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
-
- int max_error = 0;
- int total_error = 0;
- const int count_test_block = 1000000;
- for (int i = 0; i < count_test_block; ++i) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16);
-
- for (int j = 0; j < 16; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- }
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 16; ++j)
- test_input_block[j] = src[j] - dst[j];
-
- const int pitch = 4;
- RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-
- for (int j = 0; j < 16; ++j) {
- if (test_temp_block[j] > 0) {
- test_temp_block[j] += 2;
- test_temp_block[j] /= 4;
- test_temp_block[j] *= 4;
- } else {
- test_temp_block[j] -= 2;
- test_temp_block[j] /= 4;
- test_temp_block[j] *= 4;
- }
- }
+TEST_P(Trans4x4DCT, CoeffCheck) {
+ RunCoeffCheck();
+}
- // inverse transform and reconstruct the pixel block
- RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+TEST_P(Trans4x4DCT, MemCheck) {
+ RunMemCheck();
+}
- for (int j = 0; j < 16; ++j) {
- const int diff = dst[j] - src[j];
- const int error = diff * diff;
- if (max_error < error)
- max_error = error;
- total_error += error;
- }
+TEST_P(Trans4x4DCT, InvAccuracyCheck) {
+ RunInvAccuracyCheck();
+}
+
+class Trans4x4HT
+ : public Trans4x4TestBase,
+ public PARAMS(fht_t, iht_t, int) {
+ public:
+ virtual ~Trans4x4HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 4;
+ fwd_txfm_ref = fht4x4_ref;
}
- EXPECT_GE(1, max_error)
- << "Error: FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
- EXPECT_GE(count_test_block, total_error)
- << "Error: FDCT/IDCT or FHT/IHT has average "
- << "roundtrip error > 1 per block";
+ protected:
+ void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ fht_t fwd_txfm_;
+ iht_t inv_txfm_;
+};
+
+TEST_P(Trans4x4HT, AccuracyCheck) {
+ RunAccuracyCheck();
+}
+
+TEST_P(Trans4x4HT, CoeffCheck) {
+ RunCoeffCheck();
+}
+
+TEST_P(Trans4x4HT, MemCheck) {
+ RunMemCheck();
+}
+
+TEST_P(Trans4x4HT, InvAccuracyCheck) {
+ RunInvAccuracyCheck();
}
-INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4));
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+ C, Trans4x4DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+ C, Trans4x4HT,
+ ::testing::Values(
+ make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
+ make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
+ make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
+ make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, Trans4x4DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct4x4_sse2,
+ &vp9_idct4x4_16_add_sse2, 0)));
+INSTANTIATE_TEST_CASE_P(
+ SSE2, Trans4x4HT,
+ ::testing::Values(
+ make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
+ make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
+ make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
+ make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
+#endif
+
} // namespace
diff --git a/libvpx/test/svc_test.cc b/libvpx/test/svc_test.cc
index 5941cae..98a5d94 100644
--- a/libvpx/test/svc_test.cc
+++ b/libvpx/test/svc_test.cc
@@ -31,12 +31,16 @@ class SvcTest : public ::testing::Test {
SvcTest()
: codec_iface_(0),
test_file_name_("hantro_collage_w352h288.yuv"),
- decoder_(0) {}
+ codec_initialized_(false),
+ decoder_(0) {
+ memset(&svc_, 0, sizeof(svc_));
+ memset(&codec_, 0, sizeof(codec_));
+ memset(&codec_enc_, 0, sizeof(codec_enc_));
+ }
virtual ~SvcTest() {}
virtual void SetUp() {
- memset(&svc_, 0, sizeof(svc_));
svc_.first_frame_full_size = 1;
svc_.encoding_mode = INTER_LAYER_PREDICTION_IP;
svc_.log_level = SVC_LOG_DEBUG;
@@ -61,6 +65,8 @@ class SvcTest : public ::testing::Test {
virtual void TearDown() {
vpx_svc_release(&svc_);
+ delete(decoder_);
+ if (codec_initialized_) vpx_codec_destroy(&codec_);
}
SvcContext svc_;
@@ -68,22 +74,16 @@ class SvcTest : public ::testing::Test {
struct vpx_codec_enc_cfg codec_enc_;
vpx_codec_iface_t *codec_iface_;
std::string test_file_name_;
-
+ bool codec_initialized_;
Decoder *decoder_;
};
TEST_F(SvcTest, SvcInit) {
- svc_.spatial_layers = 0; // use default layers
- vpx_codec_err_t res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
-
- res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
+ // test missing parameters
+ vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
@@ -94,58 +94,88 @@ TEST_F(SvcTest, SvcInit) {
res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+ svc_.spatial_layers = 0; // use default layers
+ res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+ EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
+}
+
+TEST_F(SvcTest, InitTwoLayers) {
svc_.spatial_layers = 2;
vpx_svc_set_scale_factors(&svc_, "4/16,16*16"); // invalid scale values
- res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
+ vpx_codec_err_t res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
vpx_svc_set_scale_factors(&svc_, "4/16,16/16"); // valid scale values
res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
}
-TEST_F(SvcTest, SetOptions) {
- vpx_codec_err_t res = vpx_svc_set_options(NULL, "layers=3");
+TEST_F(SvcTest, InvalidOptions) {
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
- vpx_svc_set_options(&svc_, NULL);
+ res = vpx_svc_set_options(&svc_, "not-an-option=1");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+}
- vpx_svc_set_options(&svc_, "layers=3");
+TEST_F(SvcTest, SetLayersOption) {
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=3");
+ EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
EXPECT_EQ(3, svc_.spatial_layers);
+}
- vpx_svc_set_options(&svc_, "not-an-option=1");
- res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
- vpx_svc_set_options(&svc_, "encoding-mode=alt-ip");
+TEST_F(SvcTest, SetEncodingMode) {
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "encoding-mode=alt-ip");
+ EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
EXPECT_EQ(ALT_INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
+}
- vpx_svc_set_options(&svc_, "layers=2 encoding-mode=ip");
+TEST_F(SvcTest, SetMultipleOptions) {
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=2 encoding-mode=ip");
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
EXPECT_EQ(2, svc_.spatial_layers);
EXPECT_EQ(INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
+}
- vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors");
+TEST_F(SvcTest, SetScaleFactorsOption) {
+ svc_.spatial_layers = 2;
+ vpx_codec_err_t res =
+ vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors");
+ EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
- vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
+ res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
+ EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+}
- vpx_svc_set_options(&svc_, "quantizers=not-quantizers");
+TEST_F(SvcTest, SetQuantizersOption) {
+ svc_.spatial_layers = 2;
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "quantizers=not-quantizers");
+ EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
vpx_svc_set_options(&svc_, "quantizers=40,45");
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
}
TEST_F(SvcTest, SetQuantizers) {
@@ -157,15 +187,16 @@ TEST_F(SvcTest, SetQuantizers) {
svc_.first_frame_full_size = 0;
svc_.spatial_layers = 2;
- res = vpx_svc_set_quantizers(&svc_, "40,30");
+ res = vpx_svc_set_quantizers(&svc_, "40");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
- res = vpx_svc_set_quantizers(&svc_, "40");
+ res = vpx_svc_set_quantizers(&svc_, "40,30");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
}
TEST_F(SvcTest, SetScaleFactors) {
@@ -177,15 +208,16 @@ TEST_F(SvcTest, SetScaleFactors) {
svc_.first_frame_full_size = 0;
svc_.spatial_layers = 2;
- res = vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
+ res = vpx_svc_set_scale_factors(&svc_, "4/16");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
- res = vpx_svc_set_scale_factors(&svc_, "4/16");
+ res = vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
EXPECT_EQ(VPX_CODEC_OK, res);
res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
}
// test that decoder can handle an SVC frame as the first frame in a sequence
@@ -200,6 +232,7 @@ TEST_F(SvcTest, DISABLED_FirstFrameHasLayers) {
vpx_codec_err_t res =
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
codec_enc_.g_timebase.den,
@@ -227,6 +260,7 @@ TEST_F(SvcTest, EncodeThreeFrames) {
vpx_codec_err_t res =
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
ASSERT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
codec_enc_.g_timebase.den,
@@ -280,6 +314,7 @@ TEST_F(SvcTest, GetLayerResolution) {
vpx_codec_err_t res =
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
// ensure that requested layer is a valid layer
uint32_t layer_width, layer_height;
diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1
index 44220d5..5229d09 100644
--- a/libvpx/test/test-data.sha1
+++ b/libvpx/test/test-data.sha1
@@ -538,3 +538,7 @@ cf8ea970c776797aae71dac8317ea926d9431cab vp90-2-08-tile_1x4_frame_parallel.webm
a481fbea465010b57af5a19ebf6d4a5cfe5b9278 vp90-2-08-tile_1x4_frame_parallel.webm.md5
0203ec456277a01aec401e7fb6c72c9a7e5e3f9d vp90-2-08-tile_1x4.webm
c9b237dfcc01c1b414fbcaa481d014a906ef7998 vp90-2-08-tile_1x4.webm.md5
+20c75157e91ab41f82f70ffa73d5d01df8469287 vp90-2-08-tile-4x4.webm
+ae7451810247fd13975cc257aa0301ff17102255 vp90-2-08-tile-4x4.webm.md5
+2ec6e15422ac7a61af072dc5f27fcaf1942ce116 vp90-2-08-tile-4x1.webm
+0094f5ee5e46345017c30e0aa4835b550212d853 vp90-2-08-tile-4x1.webm.md5
diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk
index f7a5d15..ac072d0 100644
--- a/libvpx/test/test.mk
+++ b/libvpx/test/test.mk
@@ -646,5 +646,9 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc
index c6ad1c5..08449a5 100644
--- a/libvpx/test/test_vector_test.cc
+++ b/libvpx/test/test_vector_test.cc
@@ -164,6 +164,7 @@ const char *kVP9TestVectors[] = {
"vp90-2-07-frame_parallel.webm",
"vp90-2-08-tile_1x2_frame_parallel.webm", "vp90-2-08-tile_1x2.webm",
"vp90-2-08-tile_1x4_frame_parallel.webm", "vp90-2-08-tile_1x4.webm",
+ "vp90-2-08-tile-4x4.webm", "vp90-2-08-tile-4x1.webm",
#if CONFIG_NON420
"vp91-2-04-yv444.webm"
#endif
diff --git a/libvpx/test/vp9_lossless_test.cc b/libvpx/test/vp9_lossless_test.cc
index 441cc44..30a3118 100644
--- a/libvpx/test/vp9_lossless_test.cc
+++ b/libvpx/test/vp9_lossless_test.cc
@@ -35,7 +35,7 @@ class LossLessTest : public ::libvpx_test::EncoderTest,
}
virtual void BeginPassHook(unsigned int /*pass*/) {
- psnr_ = 0.0;
+ psnr_ = kMaxPsnr;
nframes_ = 0;
}
@@ -65,9 +65,9 @@ TEST_P(LossLessTest, TestLossLessEncoding) {
init_flags_ = VPX_CODEC_USE_PSNR;
// intentionally changed the dimension for better testing coverage
- libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 356, 284,
+ libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
timebase.den, timebase.num, 0, 30);
-
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
const double psnr_lossless = GetMinPsnr();
EXPECT_GE(psnr_lossless, kMaxPsnr);
}
diff --git a/libvpx/tools_common.h b/libvpx/tools_common.h
index 7dfd5ad..068e7b5 100644
--- a/libvpx/tools_common.h
+++ b/libvpx/tools_common.h
@@ -12,6 +12,46 @@
#include <stdio.h>
+#include "./vpx_config.h"
+
+#if defined(_MSC_VER)
+/* MSVS doesn't define off_t, and uses _f{seek,tell}i64. */
+typedef __int64 off_t;
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#elif defined(_WIN32)
+/* MinGW defines off_t as long and uses f{seek,tell}o64/off64_t for large
+ * files. */
+#define fseeko fseeko64
+#define ftello ftello64
+#define off_t off64_t
+#endif /* _WIN32 */
+
+#if CONFIG_OS_SUPPORT
+#if defined(_MSC_VER)
+#include <io.h> /* NOLINT */
+#define snprintf _snprintf
+#define isatty _isatty
+#define fileno _fileno
+#else
+#include <unistd.h> /* NOLINT */
+#endif /* _MSC_VER */
+#endif /* CONFIG_OS_SUPPORT */
+
+/* Use 32-bit file operations in WebM file format when building ARM
+ * executables (.axf) with RVCT. */
+#if !CONFIG_OS_SUPPORT
+typedef long off_t; /* NOLINT */
+#define fseeko fseek
+#define ftello ftell
+#endif /* CONFIG_OS_SUPPORT */
+
+#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo)
+
+#ifndef PATH_MAX
+#define PATH_MAX 512
+#endif
+
#define VP8_FOURCC (0x30385056)
#define VP9_FOURCC (0x30395056)
#define VP8_FOURCC_MASK (0x00385056)
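
Note: a minimal usage sketch for the portability shims this hunk adds to tools_common.h. Once the header is included, off_t, fseeko and ftello resolve to 64-bit-capable equivalents on MSVC/MinGW (or fall back to fseek/ftell when CONFIG_OS_SUPPORT is 0), so file-offset code can be written once. The file_size() helper below is hypothetical and not part of the patch.

#include <stdio.h>
#include "tools_common.h"  /* supplies the off_t / fseeko / ftello shims above */

/* Hypothetical helper: report a file's size using the portable,
 * 64-bit-safe offset type provided by tools_common.h. */
static off_t file_size(FILE *f) {
  const off_t saved = ftello(f);   /* remember the current position */
  fseeko(f, 0, SEEK_END);          /* seek to the end with a wide offset */
  const off_t size = ftello(f);
  fseeko(f, saved, SEEK_SET);      /* restore the position */
  return size;
}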
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm
new file mode 100644
index 0000000..d290d07
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm
@@ -0,0 +1,144 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp9_idct32x32_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ;TODO(hkuang): put the following macros in a seperate
+ ;file so other idct function could also use them.
+ MACRO
+ LD_16x8 $src, $stride
+ vld1.8 {q8}, [$src], $stride
+ vld1.8 {q9}, [$src], $stride
+ vld1.8 {q10}, [$src], $stride
+ vld1.8 {q11}, [$src], $stride
+ vld1.8 {q12}, [$src], $stride
+ vld1.8 {q13}, [$src], $stride
+ vld1.8 {q14}, [$src], $stride
+ vld1.8 {q15}, [$src], $stride
+ MEND
+
+ MACRO
+ ADD_DIFF_16x8 $diff
+ vqadd.u8 q8, q8, $diff
+ vqadd.u8 q9, q9, $diff
+ vqadd.u8 q10, q10, $diff
+ vqadd.u8 q11, q11, $diff
+ vqadd.u8 q12, q12, $diff
+ vqadd.u8 q13, q13, $diff
+ vqadd.u8 q14, q14, $diff
+ vqadd.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ SUB_DIFF_16x8 $diff
+ vqsub.u8 q8, q8, $diff
+ vqsub.u8 q9, q9, $diff
+ vqsub.u8 q10, q10, $diff
+ vqsub.u8 q11, q11, $diff
+ vqsub.u8 q12, q12, $diff
+ vqsub.u8 q13, q13, $diff
+ vqsub.u8 q14, q14, $diff
+ vqsub.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ ST_16x8 $dst, $stride
+ vst1.8 {q8}, [$dst], $stride
+ vst1.8 {q9}, [$dst], $stride
+ vst1.8 {q10},[$dst], $stride
+ vst1.8 {q11},[$dst], $stride
+ vst1.8 {q12},[$dst], $stride
+ vst1.8 {q13},[$dst], $stride
+ vst1.8 {q14},[$dst], $stride
+ vst1.8 {q15},[$dst], $stride
+ MEND
+
+;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride
+
+|vp9_idct32x32_1_add_neon| PROC
+ push {lr}
+ pld [r1]
+ add r3, r1, #16 ; r3 dest + 16 for second loop
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 6)
+ add r0, r0, #32 ; + (1 <<((6) - 1))
+ asrs r0, r0, #6 ; >> 6
+ bge diff_positive_32_32
+
+diff_negative_32_32
+ neg r0, r0
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+diff_negative_32_32_loop
+ sub r0, #1
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r12, r2
+
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r12, r2
+ cmp r0, #2
+ moveq r1, r3
+ moveq r12, r3
+ cmp r0, #0
+ bne diff_negative_32_32_loop
+ pop {pc}
+
+diff_positive_32_32
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+diff_positive_32_32_loop
+ sub r0, #1
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r12, r2
+
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r12, r2
+ cmp r0, #2
+ moveq r1, r3
+ moveq r12, r3
+ cmp r0, #0
+ bne diff_positive_32_32_loop
+ pop {pc}
+
+ ENDP ; |vp9_idct32x32_1_add_neon|
+ END
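
Note: a plain C sketch of what the new NEON routine computes, assuming the constants named in the assembly comments (cospi_16_64 = 11585, DCT_CONST_BITS = 14). The DC coefficient is scaled twice by cospi_16_64 with rounding, rounded by 6 bits, and the result is added with clipping to every pixel of the 32x32 destination block. Helper names are illustrative, not the libvpx reference implementation.

#include <stdint.h>

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Illustrative scalar equivalent of vp9_idct32x32_1_add_neon. */
static void idct32x32_dc_only_add(const int16_t *input, uint8_t *dest,
                                  int dest_stride) {
  const int cospi_16_64 = 11585;              /* cos(pi/4) in Q14 */
  /* out = dct_const_round_shift(input[0] * cospi_16_64), applied twice */
  int out = (input[0] * cospi_16_64 + (1 << 13)) >> 14;
  out = (out * cospi_16_64 + (1 << 13)) >> 14;
  const int a1 = (out + 32) >> 6;             /* ROUND_POWER_OF_TWO(out, 6) */

  for (int r = 0; r < 32; ++r, dest += dest_stride)
    for (int c = 0; c < 32; ++c)
      dest[c] = clip_pixel(dest[c] + a1);     /* saturating add/sub in the asm */
}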
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
index f00d027..388a7d7 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
@@ -1145,7 +1145,7 @@ idct32_bands_end_1st_pass
; pass loop processing
add r5, r5, #1
- B idct32_pass_loop
+ b idct32_pass_loop
idct32_bands_end_2nd_pass
STORE_COMBINE_CENTER_RESULTS
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
new file mode 100644
index 0000000..b0dc496
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+
+ __asm__ __volatile__ (
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+ "lb %[tmp9], 8(%[left]) \n\t"
+ "lb %[tmp10], 9(%[left]) \n\t"
+ "lb %[tmp11], 10(%[left]) \n\t"
+ "lb %[tmp12], 11(%[left]) \n\t"
+ "lb %[tmp13], 12(%[left]) \n\t"
+ "lb %[tmp14], 13(%[left]) \n\t"
+ "lb %[tmp15], 14(%[left]) \n\t"
+ "lb %[tmp16], 15(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+ "replv.qb %[tmp9], %[tmp9] \n\t"
+ "replv.qb %[tmp10], %[tmp10] \n\t"
+ "replv.qb %[tmp11], %[tmp11] \n\t"
+ "replv.qb %[tmp12], %[tmp12] \n\t"
+ "replv.qb %[tmp13], %[tmp13] \n\t"
+ "replv.qb %[tmp14], %[tmp14] \n\t"
+ "replv.qb %[tmp15], %[tmp15] \n\t"
+ "replv.qb %[tmp16], %[tmp16] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "sw %[tmp1], 8(%[dst]) \n\t"
+ "sw %[tmp1], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "sw %[tmp2], 8(%[dst]) \n\t"
+ "sw %[tmp2], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "sw %[tmp3], 8(%[dst]) \n\t"
+ "sw %[tmp3], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "sw %[tmp4], 8(%[dst]) \n\t"
+ "sw %[tmp4], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "sw %[tmp5], 8(%[dst]) \n\t"
+ "sw %[tmp5], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "sw %[tmp6], 8(%[dst]) \n\t"
+ "sw %[tmp6], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "sw %[tmp7], 8(%[dst]) \n\t"
+ "sw %[tmp7], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+ "sw %[tmp8], 8(%[dst]) \n\t"
+ "sw %[tmp8], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp9], (%[dst]) \n\t"
+ "sw %[tmp9], 4(%[dst]) \n\t"
+ "sw %[tmp9], 8(%[dst]) \n\t"
+ "sw %[tmp9], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp10], (%[dst]) \n\t"
+ "sw %[tmp10], 4(%[dst]) \n\t"
+ "sw %[tmp10], 8(%[dst]) \n\t"
+ "sw %[tmp10], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp11], (%[dst]) \n\t"
+ "sw %[tmp11], 4(%[dst]) \n\t"
+ "sw %[tmp11], 8(%[dst]) \n\t"
+ "sw %[tmp11], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp12], (%[dst]) \n\t"
+ "sw %[tmp12], 4(%[dst]) \n\t"
+ "sw %[tmp12], 8(%[dst]) \n\t"
+ "sw %[tmp12], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp13], (%[dst]) \n\t"
+ "sw %[tmp13], 4(%[dst]) \n\t"
+ "sw %[tmp13], 8(%[dst]) \n\t"
+ "sw %[tmp13], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp14], (%[dst]) \n\t"
+ "sw %[tmp14], 4(%[dst]) \n\t"
+ "sw %[tmp14], 8(%[dst]) \n\t"
+ "sw %[tmp14], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp15], (%[dst]) \n\t"
+ "sw %[tmp15], 4(%[dst]) \n\t"
+ "sw %[tmp15], 8(%[dst]) \n\t"
+ "sw %[tmp15], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp16], (%[dst]) \n\t"
+ "sw %[tmp16], 4(%[dst]) \n\t"
+ "sw %[tmp16], 8(%[dst]) \n\t"
+ "sw %[tmp16], 12(%[dst]) \n\t"
+
+ : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+ [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+ [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7),
+ [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8),
+ [tmp9] "=&r" (tmp9), [tmp10] "=&r" (tmp10),
+ [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12),
+ [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14),
+ [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16)
+ : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
+ );
+}
+
+void vp9_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, left2;
+
+ __asm__ __volatile__ (
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "lw %[above1], 8(%[above]) \n\t"
+ "lw %[above2], 12(%[above]) \n\t"
+ "lw %[left1], 8(%[left]) \n\t"
+ "lw %[left2], 12(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addiu %[average], %[average], 16 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 5 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ : [left1] "=&r" (left1), [above1] "=&r" (above1),
+ [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1),
+ [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1),
+ [above2] "=&r" (above2), [left2] "=&r" (left2),
+ [average] "=&r" (average), [tmp] "=&r" (tmp),
+ [expected_dc] "=&r" (expected_dc)
+ : [above] "r" (above), [left] "r" (left),
+ [dst] "r" (dst), [stride] "r" (stride)
+ );
+}
+#endif // #if HAVE_DSPR2
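
Note: the DSPR2 routines above are vectorizations of the generic intra predictors. As a sketch (hypothetical helper name, not the libvpx reference), the 16x16 DC predictor sums the 16 above and 16 left neighbours, adds the rounding term 16, shifts right by 5, and fills the block with that value, which is the same sum / +16 / >>5 sequence visible in the inline assembly.

#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar form of the DC prediction done above in DSPR2 asm. */
static void dc_predictor_16x16_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  int sum = 16;                       /* rounding term: half of the 32 samples */
  for (int i = 0; i < 16; ++i)
    sum += above[i] + left[i];
  const uint8_t expected_dc = (uint8_t)(sum >> 5);

  for (int r = 0; r < 16; ++r, dst += stride)
    for (int c = 0; c < 16; ++c)
      dst[c] = expected_dc;
}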
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
new file mode 100644
index 0000000..a53c623
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4;
+
+ __asm__ __volatile__ (
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "sw %[tmp1], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+
+ : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+ [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4)
+ : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
+ );
+}
+
+void vp9_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
+
+ __asm__ __volatile__ (
+ "lw %[above_c], (%[above]) \n\t"
+ "lw %[left_c], (%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l], %[above_c] \n\t"
+ "preceu.ph.qbr %[above_r], %[above_c] \n\t"
+ "preceu.ph.qbl %[left_l], %[left_c] \n\t"
+ "preceu.ph.qbr %[left_r], %[left_c] \n\t"
+
+ "addu.ph %[average], %[above_r], %[above_l] \n\t"
+ "addu.ph %[average], %[average], %[left_l] \n\t"
+ "addu.ph %[average], %[average], %[left_r] \n\t"
+ "addiu %[average], %[average], 4 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 3 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+
+ : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l),
+ [above_r] "=&r" (above_r), [left_c] "=&r" (left_c),
+ [left_l] "=&r" (left_l), [left_r] "=&r" (left_r),
+ [average] "=&r" (average), [tmp] "=&r" (tmp),
+ [expected_dc] "=&r" (expected_dc)
+ : [above] "r" (above), [left] "r" (left),
+ [dst] "r" (dst), [stride] "r" (stride)
+ );
+}
+
+void vp9_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t abovel, abover;
+ int32_t left0, left1, left2, left3;
+ int32_t res0, res1;
+ int32_t resl;
+ int32_t resr;
+ int32_t top_left;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ __asm__ __volatile__ (
+ "ulw %[resl], (%[above]) \n\t"
+
+ "lbu %[left0], (%[left]) \n\t"
+ "lbu %[left1], 1(%[left]) \n\t"
+ "lbu %[left2], 2(%[left]) \n\t"
+ "lbu %[left3], 3(%[left]) \n\t"
+
+ "lbu %[top_left], -1(%[above]) \n\t"
+
+ "preceu.ph.qbl %[abovel], %[resl] \n\t"
+ "preceu.ph.qbr %[abover], %[resl] \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "replv.ph %[left1], %[left1] \n\t"
+ "replv.ph %[left2], %[left2] \n\t"
+ "replv.ph %[left3], %[left3] \n\t"
+
+ "replv.ph %[top_left], %[top_left] \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left0] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left0] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left1] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left1] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left2] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left2] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left3] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left3] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
+ [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2),
+ [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3),
+ [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left)
+ : [above] "r" (above), [left] "r" (left),
+ [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
+ );
+}
+#endif // #if HAVE_DSPR2
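For reference, a scalar sketch of the true-motion (TM) prediction that vp9_tm_predictor_4x4_dspr2 above computes two pixels at a time via the vp9_ff_cropTbl lookup: each output pixel is left[row] + above[col] - top_left, clamped to [0, 255]. This is an illustration only, not part of the patch; clip_u8() is an assumed stand-in for the crop-table clamp.

#include <stddef.h>
#include <stdint.h>

/* Assumed stand-in for the vp9_ff_cropTbl lookup: clamp to [0, 255]. */
static int clip_u8(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}

/* Scalar sketch of the TM predictor vectorized by the DSPr2 code above. */
static void tm_predictor_4x4_ref(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c)
      dst[c] = (uint8_t)clip_u8(left[r] + above[c] - top_left);
    dst += stride;
  }
}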
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
new file mode 100644
index 0000000..40d93ae
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
@@ -0,0 +1,610 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+
+ __asm__ __volatile__ (
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+
+ : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+ [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+ [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7),
+ [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8)
+ : [left] "r" (left), [dst] "r" (dst),
+ [stride] "r" (stride)
+ );
+}
+
+void vp9_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
+
+ __asm__ __volatile__ (
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "preceu.ph.qbl %[above_l2], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r2], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l2], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r2], %[left2] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l2] \n\t"
+ "addu.ph %[average], %[average], %[above_r2] \n\t"
+ "addu.ph %[average], %[average], %[left_l2] \n\t"
+ "addu.ph %[average], %[average], %[left_r2] \n\t"
+
+ "addiu %[average], %[average], 8 \n\t"
+
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 4 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1),
+ [above_r1] "=&r" (above_r1), [left1] "=&r" (left1),
+ [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1),
+ [above2] "=&r" (above2), [above_l2] "=&r" (above_l2),
+ [above_r2] "=&r" (above_r2), [left2] "=&r" (left2),
+ [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2),
+ [average] "=&r" (average), [tmp] "=&r" (tmp),
+ [expected_dc] "=&r" (expected_dc)
+ : [above] "r" (above), [left] "r" (left), [dst] "r" (dst),
+ [stride] "r" (stride)
+ );
+}
+
+void vp9_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t abovel, abover;
+ int32_t abovel_1, abover_1;
+ int32_t left0;
+ int32_t res0, res1, res2, res3;
+ int32_t reshw;
+ int32_t top_left;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ __asm__ __volatile__ (
+ "ulw %[reshw], (%[above]) \n\t"
+ "ulw %[top_left], 4(%[above]) \n\t"
+
+ "lbu %[left0], (%[left]) \n\t"
+
+ "preceu.ph.qbl %[abovel], %[reshw] \n\t"
+ "preceu.ph.qbr %[abover], %[reshw] \n\t"
+ "preceu.ph.qbl %[abovel_1], %[top_left] \n\t"
+ "preceu.ph.qbr %[abover_1], %[top_left] \n\t"
+
+ "lbu %[top_left], -1(%[above]) \n\t"
+ "replv.ph %[left0], %[left0] \n\t"
+
+ "replv.ph %[top_left], %[top_left] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 1(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 2(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 3(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 4(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 5(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 6(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 7(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
+ [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1),
+ [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3),
+ [res0] "=&r" (res0), [res1] "=&r" (res1),
+ [reshw] "=&r" (reshw), [top_left] "=&r" (top_left)
+ : [above] "r" (above), [left] "r" (left),
+ [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
+ );
+}
+#endif // #if HAVE_DSPR2
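The 8x8 DC predictor above accumulates the neighbours in packed-halfword lanes, folds the lanes, and then rounds with "addiu ..., 8" / "srl ..., 4". A scalar sketch of the same average, assuming each of the eight above and eight left neighbours is summed exactly once (the helper name is illustrative only):

#include <stdint.h>

/* Scalar sketch of the DC value computed by the packed-halfword code above. */
static int dc_average_8x8_ref(const uint8_t *above, const uint8_t *left) {
  int sum = 0;
  int i;

  for (i = 0; i < 8; ++i)
    sum += above[i] + left[i];
  return (sum + 8) >> 4;  /* round, then divide by the 16 neighbours */
}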
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
index d3aee73..bc67594 100644
--- a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -19,7 +19,8 @@
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
#if HAVE_DSPR2
-static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
@@ -42,7 +43,7 @@ static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
const int const_2_power_13 = 8192;
const int32_t *input_int;
- for (i = 32; i--; ) {
+ for (i = no_rows; i--; ) {
input_int = (const int32_t *)input;
if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
@@ -881,12 +882,74 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
);
// Rows
- idct32_1d_rows_dspr2(input, outptr);
+ idct32_1d_rows_dspr2(input, outptr, 32);
// Columns
vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
}
+void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ idct32_1d_rows_dspr2(input, outptr, 8);
+
+ outptr += 8;
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 8(%[outptr]) \n\t"
+ "sw $zero, 12(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 24(%[outptr]) \n\t"
+ "sw $zero, 28(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 40(%[outptr]) \n\t"
+ "sw $zero, 44(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ for (i = 0; i < 31; ++i) {
+ outptr += 32;
+
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 8(%[outptr]) \n\t"
+ "sw $zero, 12(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 24(%[outptr]) \n\t"
+ "sw $zero, 28(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 40(%[outptr]) \n\t"
+ "sw $zero, 44(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+ }
+
+ // Columns
+ vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride);
+}
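The new 34-coefficient path runs the row transform for only the first 8 rows; the sw $zero blocks then clear the untouched tail (elements 8..31) of every 32-element line of the intermediate buffer before the column pass. A plain-C sketch of that zero-fill, intended to be equivalent to the two asm blocks above (illustration only, not part of the patch):

#include <string.h>
#include <stdint.h>

/* Clear elements 8..31 of each of the 32 lines of the 32x32 intermediate. */
static void clear_idct32_tail(int16_t *out) {
  int i;

  for (i = 0; i < 32; ++i)
    memset(out + i * 32 + 8, 0, 24 * sizeof(int16_t));
}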
+
void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
int stride) {
int r, out;
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
new file mode 100644
index 0000000..36cfc83
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_loop_filter_horizontal_edge_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint8_t i;
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ /* prefetch data for store */
+ vp9_prefetch_store(s);
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ of 8 bit simd instructions. */
+ for (i = 0; i < 2; i++) {
+ sm1 = s - (pitch << 2);
+ s0 = sm1 + pitch;
+ s1 = s0 + pitch;
+ s2 = s - pitch;
+ s3 = s;
+ s4 = s + pitch;
+ s5 = s4 + pitch;
+ s6 = s5 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p1], (%[s1]) \n\t"
+ "lw %[p2], (%[s2]) \n\t"
+ "lw %[p3], (%[s3]) \n\t"
+ "lw %[p4], (%[s4]) \n\t"
+
+ : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
+ : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ mask will be zero and filtering is not needed */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ __asm__ __volatile__ (
+ "lw %[pm1], (%[sm1]) \n\t"
+ "lw %[p0], (%[s0]) \n\t"
+ "lw %[p5], (%[s5]) \n\t"
+ "lw %[p6], (%[s6]) \n\t"
+
+ : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
+ [p6] "=&r" (p6)
+ : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
+ );
+
+ vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
+ pm1, p0, p3, p4, p5, p6,
+ thresh_vec, &hev, &mask);
+
+ /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+ __asm__ __volatile__ (
+ "sw %[p1], (%[s1]) \n\t"
+ "sw %[p2], (%[s2]) \n\t"
+ "sw %[p3], (%[s3]) \n\t"
+ "sw %[p4], (%[s4]) \n\t"
+
+ :
+ : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
+ [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint8_t i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ /* prefetch data for store */
+ vp9_prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
+ p0, p3, p4, p5, p6, thresh_vec,
+ &hev, &mask);
+
+ /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+ [s4] "r" (s4)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+ [s2] "r" (s2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+ [s1] "r" (s1)
+ );
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
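vp9_loop_filter_vertical_edge_dspr2 reuses the same filter kernel as the horizontal case by first transposing the 4x4 pixel neighbourhood into registers with the precrq/precr/append sequences, then writing the filtered bytes back column by column with sb. A minimal sketch of the transpose being performed (illustration only):

#include <stdint.h>

/* Four horizontal 4-byte rows become four vertical 4-byte vectors. */
static void transpose_4x4_bytes(const uint8_t in[4][4], uint8_t out[4][4]) {
  int r, c;

  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      out[c][r] = in[r][c];
}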
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
new file mode 100644
index 0000000..98bfcfa
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
@@ -0,0 +1,755 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if HAVE_DSPR2
+/* inputs & outputs are quad-byte vectors */
+static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
+ uint32_t *ps1, uint32_t *ps0,
+ uint32_t *qs0, uint32_t *qs1) {
+ int32_t vp9_filter_l, vp9_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+
+ /* use halfword pairs instead of quad-bytes because of accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__ (
+ /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp9_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp9_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp9_filter &= hev; */
+ "and %[vp9_filter_l], %[vp9_filter_l], %[hev_l] \n\t"
+ "and %[vp9_filter_r], %[vp9_filter_r], %[hev_r] \n\t"
+
+ /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+
+ /* vp9_filter &= mask; */
+ "and %[vp9_filter_l], %[vp9_filter_l], %[mask_l] \n\t"
+ "and %[vp9_filter_r], %[vp9_filter_r], %[mask_r] \n\t"
+
+ : [vp9_filter_l] "=&r" (vp9_filter_l),
+ [vp9_filter_r] "=&r" (vp9_filter_r),
+ [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+ [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+ : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+ [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+ [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+ [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+ [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
+ [HWM] "r" (HWM)
+ );
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__ (
+ /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[vp9_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[vp9_filter_r], %[t2] \n\t"
+
+ /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[vp9_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[vp9_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+ [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+ : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+ [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+ );
+
+ __asm__ __volatile__ (
+ /* (vp9_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* vp9_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+ [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+ [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+ : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+ );
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__ (
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+ [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+ :
+ );
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *ps0 = vps0 ^ N128;
+ *ps1 = vps1 ^ N128;
+ *qs0 = vqs0 ^ N128;
+ *qs1 = vqs1 ^ N128;
+}
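vp9_filter_dspr2() applies the standard filter4 arithmetic to packed halfword pairs. A per-pixel scalar sketch of the same steps, matching the vp8_signed_char_clamp comments in the asm; clamp8() is an assumed helper, and mask/hev are 0 or -1 per pixel here, whereas the packed code keeps them as 0x00/0xFF byte lanes. The sketch stays in the biased domain (pixels XORed with 0x80), just like the SIMD code until its final un-bias.

/* Signed-char saturation, corresponding to vp8_signed_char_clamp. */
static int clamp8(int v) {
  return v < -128 ? -128 : (v > 127 ? 127 : v);
}

/* Per-pixel sketch of the filter applied above. */
static void filter4_ref(int mask, int hev,
                        int *ps1, int *ps0, int *qs0, int *qs1) {
  int filter = clamp8(*ps1 - *qs1) & hev;
  int filter1, filter2;

  filter = clamp8(filter + 3 * (*qs0 - *ps0)) & mask;
  filter1 = clamp8(filter + 4) >> 3;   /* applied to q0 */
  filter2 = clamp8(filter + 3) >> 3;   /* applied to p0 */
  *qs0 = clamp8(*qs0 - filter1);
  *ps0 = clamp8(*ps0 + filter2);
  filter = (filter1 + 1) >> 1;         /* the addqh.ph with 0x01000100 */
  filter &= ~hev;
  *ps1 = clamp8(*ps1 + filter);
  *qs1 = clamp8(*qs1 - filter);
}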
+
+static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev,
+ uint32_t ps1, uint32_t ps0,
+ uint32_t qs0, uint32_t qs1,
+ uint32_t *p1_f0, uint32_t *p0_f0,
+ uint32_t *q0_f0, uint32_t *q1_f0) {
+ int32_t vp9_filter_l, vp9_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (ps0) ^ N128;
+ vps1 = (ps1) ^ N128;
+ vqs0 = (qs0) ^ N128;
+ vqs1 = (qs1) ^ N128;
+
+ /* use halfword pairs instead of quad-bytes because of accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__ (
+ /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp9_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp9_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp9_filter &= hev; */
+ "and %[vp9_filter_l], %[vp9_filter_l], %[hev_l] \n\t"
+ "and %[vp9_filter_r], %[vp9_filter_r], %[hev_r] \n\t"
+
+ /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+
+ /* vp9_filter &= mask; */
+ "and %[vp9_filter_l], %[vp9_filter_l], %[mask_l] \n\t"
+ "and %[vp9_filter_r], %[vp9_filter_r], %[mask_r] \n\t"
+
+ : [vp9_filter_l] "=&r" (vp9_filter_l),
+ [vp9_filter_r] "=&r" (vp9_filter_r),
+ [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+ [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+ : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+ [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+ [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+ [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+ [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
+ );
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__ (
+ /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[vp9_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[vp9_filter_r], %[t2] \n\t"
+
+ /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[vp9_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[vp9_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+ [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+ : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+ [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+ );
+
+ __asm__ __volatile__ (
+ /* (vp9_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* vp9_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+ [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+ [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+ : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+ );
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__ (
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+ [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+ :
+ );
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *p0_f0 = vps0 ^ N128;
+ *p1_f0 = vps1 ^ N128;
+ *q0_f0 = vqs0 ^ N128;
+ *q1_f0 = vqs1 ^ N128;
+}
+
+static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
+ uint32_t *op1, uint32_t *op0,
+ uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3) {
+ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__ (
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r" (add_p210_q012),
+ [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
+ [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
+ [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
+ [res_oq2] "=&r" (res_oq2)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
+ [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
+ [u32Four] "r" (u32Four)
+ );
+
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+}
+
+static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2,
+ uint32_t p1, uint32_t p0,
+ uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3,
+ uint32_t *op2_f1,
+ uint32_t *op1_f1, uint32_t *op0_f1,
+ uint32_t *oq0_f1, uint32_t *oq1_f1,
+ uint32_t *oq2_f1) {
+ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__ (
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp),
+ [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
+ [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0),
+ [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
+ [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
+ [u32Four] "r" (u32Four)
+ );
+
+ *op2_f1 = res_op2;
+ *op1_f1 = res_op1;
+ *op0_f1 = res_op0;
+ *oq0_f1 = res_oq0;
+ *oq1_f1 = res_oq1;
+ *oq2_f1 = res_oq2;
+}
+
+static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
+ uint32_t *op5, uint32_t *op4,
+ uint32_t *op3, uint32_t *op2,
+ uint32_t *op1, uint32_t *op0,
+ uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3,
+ uint32_t *oq4, uint32_t *oq5,
+ uint32_t *oq6, uint32_t *oq7) {
+ const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+ uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
+ uint32_t tmp;
+ uint32_t add_p6toq6;
+ uint32_t u32Eight = 0x00080008;
+
+ __asm__ __volatile__ (
+ /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
+ which is used most of the time */
+ "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"
+
+ : [add_p6toq6] "=&r" (add_p6toq6)
+ : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
+ [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
+ [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
+ [u32Eight] "r" (u32Eight)
+ );
+
+ __asm__ __volatile__ (
+ /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
+ p3 + p2 + p1 + p0 + q0, 4) */
+ "shll.ph %[tmp], %[p7], 3 \n\t"
+ "subu.ph %[res_op6], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[p6] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q1] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q2] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q3] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q4] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q5] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q6] \n\t"
+ "shrl.ph %[res_op6], %[res_op6], 4 \n\t"
+
+ /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
+ p2 + p1 + p0 + q0 + q1, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op5], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p5] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q2] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q3] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q4] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q5] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q6] \n\t"
+ "shrl.ph %[res_op5], %[res_op5], 4 \n\t"
+
+ /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
+ p1 + p0 + q0 + q1 + q2, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op4], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[p4] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q3] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q4] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q5] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q6] \n\t"
+ "shrl.ph %[res_op4], %[res_op4], 4 \n\t"
+
+ /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
+ p1 + p0 + q0 + q1 + q2 + q3, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op3], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q4] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q5] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q6] \n\t"
+ "shrl.ph %[res_op3], %[res_op3], 4 \n\t"
+
+ /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
+ p0 + q0 + q1 + q2 + q3 + q4, 4) */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q5] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q6] \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 4 \n\t"
+
+ /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+ p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op1], %[tmp], %[p1] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q6] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 4 \n\t"
+
+ /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
+ "addu.ph %[res_op0], %[p7], %[p0] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 4 \n\t"
+
+ : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5),
+ [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3),
+ [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
+ [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp)
+ : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
+ [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [q2] "r" (q2), [q1] "r" (q1),
+ [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
+ [add_p6toq6] "r" (add_p6toq6)
+ );
+
+ *op6 = res_op6;
+ *op5 = res_op5;
+ *op4 = res_op4;
+ *op3 = res_op3;
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+
+ __asm__ __volatile__ (
+ /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+ q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
+ "addu.ph %[res_oq0], %[q7], %[q0] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t"
+
+ /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+ q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq1], %[tmp], %[q1] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t"
+
+ /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+ q3 + q4 + q5 + q6 + q7 * 3, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t"
+
+ /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
+ q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq3], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t"
+ "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t"
+
+ /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
+ q4 * 2 + q5 + q6 + q7 * 5, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq4], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t"
+ "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t"
+
+ /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
+ q5 * 2 + q6 + q7 * 6, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq5], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t"
+ "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t"
+
+ /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
+ q4 + q5 + q6 * 2 + q7 * 7, 4) */
+ "shll.ph %[tmp], %[q7], 3 \n\t"
+ "subu.ph %[res_oq6], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
+ "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"
+
+ : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5),
+ [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3),
+ [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1),
+ [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp)
+ : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
+ [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
+ [p1] "r" (p1), [p2] "r" (p2),
+ [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6),
+ [add_p6toq6] "r" (add_p6toq6)
+ );
+
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+ *oq3 = res_oq3;
+ *oq4 = res_oq4;
+ *oq5 = res_oq5;
+ *oq6 = res_oq6;
+}
+#endif // #if HAVE_DSPR2
+#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
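For reference, the op6 output of vp9_wide_mbfilter_dspr2 above reduces to the ROUND_POWER_OF_TWO expression given in its comment. Below is a scalar C sketch of that one output (an illustration, not part of the patch), using the same shared running sum the asm builds in add_p6toq6:

#include <stdint.h>

/* op6 = ROUND_POWER_OF_TWO(p7*7 + p6*2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4) */
static uint8_t wide_mbfilter_op6_ref(uint8_t p7, uint8_t p6, uint8_t p5,
                                     uint8_t p4, uint8_t p3, uint8_t p2,
                                     uint8_t p1, uint8_t p0, uint8_t q0,
                                     uint8_t q1, uint8_t q2, uint8_t q3,
                                     uint8_t q4, uint8_t q5, uint8_t q6) {
  /* running sum p6 + ... + q6 plus the rounding constant 8 (u32Eight per lane) */
  const int add_p6toq6 = p6 + p5 + p4 + p3 + p2 + p1 + p0 +
                         q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8;
  /* p7 * 7 is formed as (p7 << 3) - p7 in the asm; the q terms that are not
     part of the op6 tap set are then subtracted back out of the running sum */
  return (uint8_t)((p7 * 7 + p6 + add_p6toq6 -
                    q1 - q2 - q3 - q4 - q5 - q6) >> 4);
}

The other thirteen outputs follow the same pattern with different multiples of p7/q7 and a different set of terms subtracted from add_p6toq6.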
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
new file mode 100644
index 0000000..4cb2ebb
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
@@ -0,0 +1,470 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if HAVE_DSPR2
+#define STORE_F0() { \
+ __asm__ __volatile__ ( \
+ "sb %[q1_f0], 1(%[s4]) \n\t" \
+ "sb %[q0_f0], 0(%[s4]) \n\t" \
+ "sb %[p0_f0], -1(%[s4]) \n\t" \
+ "sb %[p1_f0], -2(%[s4]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
+ [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \
+ [s4] "r" (s4) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \
+ [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q1_f0], 1(%[s3]) \n\t" \
+ "sb %[q0_f0], 0(%[s3]) \n\t" \
+ "sb %[p0_f0], -1(%[s3]) \n\t" \
+ "sb %[p1_f0], -2(%[s3]) \n\t" \
+ \
+ : [p1_f0] "+r" (p1_f0) \
+ : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
+ [s3] "r" (s3), [p0_f0] "r" (p0_f0) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \
+ [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q1_f0], 1(%[s2]) \n\t" \
+ "sb %[q0_f0], 0(%[s2]) \n\t" \
+ "sb %[p0_f0], -1(%[s2]) \n\t" \
+ "sb %[p1_f0], -2(%[s2]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
+ [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \
+ [s2] "r" (s2) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \
+ [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q1_f0], 1(%[s1]) \n\t" \
+ "sb %[q0_f0], 0(%[s1]) \n\t" \
+ "sb %[p0_f0], -1(%[s1]) \n\t" \
+ "sb %[p1_f0], -2(%[s1]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
+ [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \
+ [s1] "r" (s1) \
+ ); \
+}
+
+#define STORE_F1() { \
+ __asm__ __volatile__ ( \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ \
+ : \
+ : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \
+ [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
+ [s4] "r" (s4) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ \
+ : [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r), \
+ [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ \
+ : \
+ : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \
+ [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
+ [s3] "r" (s3) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ \
+ : \
+ : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \
+ [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
+ [s2] "r" (s2) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ \
+ : [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l), \
+ [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ \
+ : \
+ : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \
+ [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
+ [s1] "r" (s1) \
+ ); \
+}
+
+#define STORE_F2() { \
+ __asm__ __volatile__ ( \
+ "sb %[q6_r], 6(%[s4]) \n\t" \
+ "sb %[q5_r], 5(%[s4]) \n\t" \
+ "sb %[q4_r], 4(%[s4]) \n\t" \
+ "sb %[q3_r], 3(%[s4]) \n\t" \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ "sb %[p3_r], -4(%[s4]) \n\t" \
+ "sb %[p4_r], -5(%[s4]) \n\t" \
+ "sb %[p5_r], -6(%[s4]) \n\t" \
+ "sb %[p6_r], -7(%[s4]) \n\t" \
+ \
+ : \
+ : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \
+ [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \
+ [q0_r] "r" (q0_r), \
+ [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
+ [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \
+ [p6_r] "r" (p6_r), \
+ [s4] "r" (s4) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q6_r], %[q6_r], 16 \n\t" \
+ "srl %[q5_r], %[q5_r], 16 \n\t" \
+ "srl %[q4_r], %[q4_r], 16 \n\t" \
+ "srl %[q3_r], %[q3_r], 16 \n\t" \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ "srl %[p3_r], %[p3_r], 16 \n\t" \
+ "srl %[p4_r], %[p4_r], 16 \n\t" \
+ "srl %[p5_r], %[p5_r], 16 \n\t" \
+ "srl %[p6_r], %[p6_r], 16 \n\t" \
+ \
+ : [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r), \
+ [q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), \
+ [q0_r] "+r" (q0_r), \
+ [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r), \
+ [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r), \
+ [p6_r] "+r" (p6_r) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q6_r], 6(%[s3]) \n\t" \
+ "sb %[q5_r], 5(%[s3]) \n\t" \
+ "sb %[q4_r], 4(%[s3]) \n\t" \
+ "sb %[q3_r], 3(%[s3]) \n\t" \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ "sb %[p3_r], -4(%[s3]) \n\t" \
+ "sb %[p4_r], -5(%[s3]) \n\t" \
+ "sb %[p5_r], -6(%[s3]) \n\t" \
+ "sb %[p6_r], -7(%[s3]) \n\t" \
+ \
+ : \
+ : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \
+ [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \
+ [q0_r] "r" (q0_r), \
+ [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
+ [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \
+ [p6_r] "r" (p6_r), \
+ [s3] "r" (s3) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q6_l], 6(%[s2]) \n\t" \
+ "sb %[q5_l], 5(%[s2]) \n\t" \
+ "sb %[q4_l], 4(%[s2]) \n\t" \
+ "sb %[q3_l], 3(%[s2]) \n\t" \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ "sb %[p3_l], -4(%[s2]) \n\t" \
+ "sb %[p4_l], -5(%[s2]) \n\t" \
+ "sb %[p5_l], -6(%[s2]) \n\t" \
+ "sb %[p6_l], -7(%[s2]) \n\t" \
+ \
+ : \
+ : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \
+ [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \
+ [q0_l] "r" (q0_l), \
+ [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
+ [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \
+ [p6_l] "r" (p6_l), \
+ [s2] "r" (s2) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q6_l], %[q6_l], 16 \n\t" \
+ "srl %[q5_l], %[q5_l], 16 \n\t" \
+ "srl %[q4_l], %[q4_l], 16 \n\t" \
+ "srl %[q3_l], %[q3_l], 16 \n\t" \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ "srl %[p3_l], %[p3_l], 16 \n\t" \
+ "srl %[p4_l], %[p4_l], 16 \n\t" \
+ "srl %[p5_l], %[p5_l], 16 \n\t" \
+ "srl %[p6_l], %[p6_l], 16 \n\t" \
+ \
+ : [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l), \
+ [q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), \
+ [q0_l] "+r" (q0_l), \
+ [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l), \
+ [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l), \
+ [p6_l] "+r" (p6_l) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q6_l], 6(%[s1]) \n\t" \
+ "sb %[q5_l], 5(%[s1]) \n\t" \
+ "sb %[q4_l], 4(%[s1]) \n\t" \
+ "sb %[q3_l], 3(%[s1]) \n\t" \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ "sb %[p3_l], -4(%[s1]) \n\t" \
+ "sb %[p4_l], -5(%[s1]) \n\t" \
+ "sb %[p5_l], -6(%[s1]) \n\t" \
+ "sb %[p6_l], -7(%[s1]) \n\t" \
+ \
+ : \
+ : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \
+ [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \
+ [q0_l] "r" (q0_l), \
+ [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
+ [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \
+ [p6_l] "r" (p6_l), \
+ [s1] "r" (s1) \
+ ); \
+}
+
+#define PACK_LEFT_0TO3() { \
+ __asm__ __volatile__ ( \
+ "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
+ "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
+ "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
+ "preceu.ph.qbl %[p0_l], %[p0] \n\t" \
+ "preceu.ph.qbl %[q0_l], %[q0] \n\t" \
+ "preceu.ph.qbl %[q1_l], %[q1] \n\t" \
+ "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
+ "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
+ \
+ : [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l), \
+ [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l), \
+ [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l), \
+ [q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l) \
+ : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \
+ ); \
+}
+
+#define PACK_LEFT_4TO7() { \
+ __asm__ __volatile__ ( \
+ "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
+ "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
+ "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
+ "preceu.ph.qbl %[p4_l], %[p4] \n\t" \
+ "preceu.ph.qbl %[q4_l], %[q4] \n\t" \
+ "preceu.ph.qbl %[q5_l], %[q5] \n\t" \
+ "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
+ "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
+ \
+ : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l), \
+ [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l), \
+ [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l), \
+ [q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l) \
+ : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \
+ [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \
+ ); \
+}
+
+#define PACK_RIGHT_0TO3() { \
+ __asm__ __volatile__ ( \
+ "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
+ "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
+ "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
+ "preceu.ph.qbr %[p0_r], %[p0] \n\t" \
+ "preceu.ph.qbr %[q0_r], %[q0] \n\t" \
+ "preceu.ph.qbr %[q1_r], %[q1] \n\t" \
+ "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
+ "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
+ \
+ : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r), \
+ [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r), \
+ [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r), \
+ [q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r) \
+ : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \
+ ); \
+}
+
+#define PACK_RIGHT_4TO7() { \
+ __asm__ __volatile__ ( \
+ "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
+ "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
+ "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
+ "preceu.ph.qbr %[p4_r], %[p4] \n\t" \
+ "preceu.ph.qbr %[q4_r], %[q4] \n\t" \
+ "preceu.ph.qbr %[q5_r], %[q5] \n\t" \
+ "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
+ "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
+ \
+ : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r), \
+ [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r), \
+ [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r), \
+ [q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r) \
+ : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \
+ [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \
+ ); \
+}
+
+#define COMBINE_LEFT_RIGHT_0TO2() { \
+ __asm__ __volatile__ ( \
+ "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
+ "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
+ "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
+ "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \
+ "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
+ "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
+ \
+ : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), \
+ [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2) \
+ : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r), \
+ [p1_l] "r" (p1_l), [p1_r] "r" (p1_r), \
+ [p0_l] "r" (p0_l), [p0_r] "r" (p0_r), \
+ [q0_l] "r" (q0_l), [q0_r] "r" (q0_r), \
+ [q1_l] "r" (q1_l), [q1_r] "r" (q1_r), \
+ [q2_l] "r" (q2_l), [q2_r] "r" (q2_r) \
+ ); \
+}
+
+#define COMBINE_LEFT_RIGHT_3TO6() { \
+ __asm__ __volatile__ ( \
+ "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
+ "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
+ "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
+ "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \
+ "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \
+ "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \
+ "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
+ "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
+ \
+ : [p6] "=&r" (p6),[p5] "=&r" (p5), \
+ [p4] "=&r" (p4),[p3] "=&r" (p3), \
+ [q3] "=&r" (q3),[q4] "=&r" (q4), \
+ [q5] "=&r" (q5),[q6] "=&r" (q6) \
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), \
+ [p4_l] "r" (p4_l), [p3_l] "r" (p3_l), \
+ [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), \
+ [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), \
+ [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), \
+ [q5_l] "r" (q5_l), [q6_l] "r" (q6_l), \
+ [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), \
+ [q5_r] "r" (q5_r), [q6_r] "r" (q6_r) \
+ ); \
+}
+
+#endif // #if HAVE_DSPR2
+#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
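The PACK_*/COMBINE_* macros above split each 32-bit word of four pixels into a "left" word (upper two bytes) and a "right" word (lower two bytes), each widened to 16-bit halfword lanes so the .ph arithmetic cannot overflow, then repack the filtered halfwords into bytes. A plain C sketch of that byte shuffling (an illustration only, assuming byte 0 is the least significant byte of the word):

#include <stdint.h>

/* preceu.ph.qbl: zero-extend the two upper bytes into 16-bit lanes */
static uint32_t pack_left(uint32_t x) {
  return ((x >> 8) & 0x00FF0000) | ((x >> 16) & 0x000000FF);
}

/* preceu.ph.qbr: zero-extend the two lower bytes into 16-bit lanes */
static uint32_t pack_right(uint32_t x) {
  return ((x << 8) & 0x00FF0000) | (x & 0x000000FF);
}

/* precr.qb.ph: keep the low byte of each 16-bit lane and repack four bytes,
   so combine(pack_left(x), pack_right(x)) == x */
static uint32_t combine(uint32_t left, uint32_t right) {
  return ((left & 0x00FF0000) << 8) | ((left & 0x000000FF) << 16) |
         ((right & 0x00FF0000) >> 8) | (right & 0x000000FF);
}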
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
new file mode 100644
index 0000000..b9e0aca
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if HAVE_DSPR2
+/* processing 4 pixels at the same time
+ * compute hev and mask in the same function */
+static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+ uint32_t p1, uint32_t p0,
+ uint32_t p3, uint32_t p2,
+ uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3,
+ uint32_t thresh, uint32_t *hev,
+ uint32_t *mask) {
+ uint32_t c, r, r3, r_k;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t hev1;
+
+ __asm__ __volatile__ (
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k),
+ [r] "=&r" (r), [r3] "=&r" (r3)
+ : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+ [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+ [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
+ );
+
+ __asm__ __volatile__ (
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit (the replicated blimit) */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+ [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+ [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+ );
+
+ *hev = hev1;
+ *mask = s2;
+}
+
+static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
+ uint32_t flimit,
+ uint32_t thresh,
+ uint32_t p1, uint32_t p0,
+ uint32_t p3, uint32_t p2,
+ uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3,
+ uint32_t *hev,
+ uint32_t *mask,
+ uint32_t *flat) {
+ uint32_t c, r, r3, r_k, r_flat;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t hev1;
+ uint32_t flat1;
+
+ __asm__ __volatile__ (
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ * flat |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ * flat |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+ /* look at stall here */
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3),
+ [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1)
+ : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+ [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+ [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
+ [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
+ );
+
+ __asm__ __volatile__ (
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit (the replicated blimit) */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+ [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+ [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+ );
+
+ *hev = hev1;
+ *mask = s2;
+ *flat = flat1;
+}
+
+static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
+ uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0,
+ uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t q4,
+ uint32_t *flat2) {
+ uint32_t c, r, r_k, r_flat;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t flat1, flat3;
+
+ __asm__ __volatile__ (
+ /* flat |= (abs(p4 - p0) > thresh) */
+ "subu_s.qb %[c], %[p4], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* flat |= (abs(q4 - q0) > thresh) */
+ "subu_s.qb %[c], %[q4], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+ "wrdsp %[r] \n\t"
+ "pick.qb %[flat3], $0, %[ones] \n\t"
+
+ /* flat |= (abs(p1 - p0) > thresh) */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* flat |= (abs(q1 - q0) > thresh) */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+ /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
+ "and %[flat1], %[flat3], %[flat1] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r),
+ [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3)
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2),
+ [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1),
+ [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4),
+ [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
+ );
+
+ *flat2 = flat1;
+}
+#endif // #if HAVE_DSPR2
+#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
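Per byte lane, the hev/mask logic above matches the scalar tests spelled out in its comments. A one-pixel C sketch (an illustration only; flimit in the functions above holds the replicated *blimit, and the asm performs the same tests with saturating byte arithmetic on four pixels at once):

#include <stdint.h>
#include <stdlib.h>

/* A lane of 0xFF in *mask means the edge passes all limits and is filtered;
   a lane of 0xFF in *hev marks high edge variance for the 4-tap filter. */
static void filter_hev_mask_ref(uint8_t limit, uint8_t blimit, uint8_t thresh,
                                uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                                uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                                uint8_t *hev, uint8_t *mask) {
  int over = 0;  /* set if any difference exceeds its limit */
  over |= abs(p3 - p2) > limit;
  over |= abs(p2 - p1) > limit;
  over |= abs(p1 - p0) > limit;
  over |= abs(q1 - q0) > limit;
  over |= abs(q2 - q1) > limit;
  over |= abs(q3 - q2) > limit;
  over |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  *mask = over ? 0 : 0xFF;
  *hev = (abs(p1 - p0) > thresh || abs(q1 - q0) > thresh) ? 0xFF : 0;
}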
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
new file mode 100644
index 0000000..adfd755
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_mbloop_filter_horizontal_edge_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint32_t mask;
+ uint32_t hev, flat;
+ uint8_t i;
+ uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ /* prefetch data for store */
+ vp9_prefetch_store(s);
+
+ for (i = 0; i < 2; i++) {
+ sp3 = s - (pitch << 2);
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+
+ : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+ [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0)
+ : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
+ );
+
+ vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+ p1, p0, p3, p2, q0, q1, q2, q3,
+ &hev, &mask, &flat);
+
+ if ((flat == 0) && (mask != 0)) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__ (
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__ (
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if ((flat != 0) && (mask != 0)) {
+ /* filtering */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+ [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
+ [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+ [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void vp9_mbloop_filter_vertical_edge_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint8_t i;
+ uint32_t mask, hev, flat;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ vp9_prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+
+ : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+ [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3)
+ : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+ p1, p0, p3, p2, q0, q1, q2, q3,
+ &hev, &mask, &flat);
+
+ if ((flat == 0) && (mask != 0)) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ STORE_F0()
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat != 0) && (mask != 0)) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [s4] "r" (s4)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s4] "r" (s4)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+ [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [s3] "r" (s3)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s3] "r" (s3)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
+ [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [s2] "r" (s2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s2] "r" (s2)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+ [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [s1] "r" (s1)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+ [q1_f0] "r" (q1_f0), [s1] "r" (s1)
+ );
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
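The store paths in both functions above share one dispatch rule: each byte lane of the packed words owns a 0xFF slice of mask and flat, and the lane is written back with the mbfilter result where mask & flat is set, with the filter1 result where only mask is set, and is left untouched otherwise. A C sketch of that selection for one row that both filters produce, such as p1 (hypothetical helper; the asm stores the chosen bytes directly with sb instead of rebuilding a word):

#include <stdint.h>

/* Pick, per byte lane, between the mbfilter output, the filter1 output and
   the original pixels of one packed row of four pixels. */
static uint32_t select_row(uint32_t mask, uint32_t flat,
                           uint32_t mb_out, uint32_t f1_out, uint32_t orig) {
  uint32_t out = 0;
  int lane;
  for (lane = 0; lane < 4; lane++) {
    const uint32_t lane_bits = 0xFFu << (8 * lane);
    if (mask & flat & lane_bits)
      out |= mb_out & lane_bits;   /* flat edge: mbfilter result */
    else if (mask & lane_bits)
      out |= f1_out & lane_bits;   /* non-flat but filtered: filter1 result */
    else
      out |= orig & lane_bits;     /* below thresholds: unchanged */
  }
  return out;
}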
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
new file mode 100644
index 0000000..0759755
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
@@ -0,0 +1,795 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_mb_lpf_horizontal_edge_w_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint32_t mask;
+ uint32_t hev, flat, flat2;
+ uint8_t i;
+ uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
+ uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ /* prefetch data for store */
+ vp9_prefetch_store(s);
+
+ for (i = 0; i < (2 * count); i++) {
+ sp7 = s - (pitch << 3);
+ sp6 = sp7 + pitch;
+ sp5 = sp6 + pitch;
+ sp4 = sp5 + pitch;
+ sp3 = sp4 + pitch;
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+ sq4 = sq3 + pitch;
+ sq5 = sq4 + pitch;
+ sq6 = sq5 + pitch;
+ sq7 = sq6 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p7], (%[sp7]) \n\t"
+ "lw %[p6], (%[sp6]) \n\t"
+ "lw %[p5], (%[sp5]) \n\t"
+ "lw %[p4], (%[sp4]) \n\t"
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+
+ : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+ [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4)
+ : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7)
+ );
+
+ __asm__ __volatile__ (
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+ "lw %[q4], (%[sq4]) \n\t"
+ "lw %[q5], (%[sq5]) \n\t"
+ "lw %[q6], (%[sq6]) \n\t"
+ "lw %[q7], (%[sq7]) \n\t"
+
+ : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0),
+ [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4)
+ : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0),
+ [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
+ );
+
+ vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+ p1, p0, p3, p2, q0, q1, q2, q3,
+ &hev, &mask, &flat);
+
+ vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__ (
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+ (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+ &p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l,
+ &q4_l, &q5_l, &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+ &p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r,
+ &q4_r, &q5_r, &q6_r, &q7_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+ COMBINE_LEFT_RIGHT_3TO6()
+
+ __asm__ __volatile__ (
+ "sw %[p6], (%[sp6]) \n\t"
+ "sw %[p5], (%[sp5]) \n\t"
+ "sw %[p4], (%[sp4]) \n\t"
+ "sw %[p3], (%[sp3]) \n\t"
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+
+ :
+ : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sw %[q6], (%[sq6]) \n\t"
+ "sw %[q5], (%[sq5]) \n\t"
+ "sw %[q4], (%[sq4]) \n\t"
+ "sw %[q3], (%[sq3]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+
+ :
+ : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3),
+ [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
+ [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3),
+ [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
+ );
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__ (
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1 */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+ [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+ [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 + f2 */
+ /* f0 function */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* f1 function */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+ q0_l, q1_l, q2_l, q3_l,
+ &p2_l_f1, &p1_l_f1, &p0_l_f1,
+ &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+ q0_r, q1_r, q2_r, q3_r,
+ &p2_r_f1, &p1_r_f1, &p0_r_f1,
+ &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ /* f2 function */
+ PACK_LEFT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+ &p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l,
+ &q4_l, &q5_l, &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+ &p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r,
+ &q4_r, &q5_r, &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p6_r], (%[sp6]) \n\t"
+ "sb %[p5_r], (%[sp5]) \n\t"
+ "sb %[p4_r], (%[sp4]) \n\t"
+ "sb %[p3_r], (%[sp3]) \n\t"
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+ [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+ [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4),
+ [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1),
+ [p0_r] "r" (p0_r), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+ "sb %[q3_r], (%[sq3]) \n\t"
+ "sb %[q4_r], (%[sq4]) \n\t"
+ "sb %[q5_r], (%[sq5]) \n\t"
+ "sb %[q6_r], (%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+ [q6_r] "r" (q6_r),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
+ [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
+ [sq6] "r" (sq6)
+ );
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r_f1], (%[sp2]) \n\t"
+ "sb %[p1_r_f1], (%[sp1]) \n\t"
+ "sb %[p0_r_f1], (%[sp0]) \n\t"
+ "sb %[q0_r_f1], (%[sq0]) \n\t"
+ "sb %[q1_r_f1], (%[sq1]) \n\t"
+ "sb %[q2_r_f1], (%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+ [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+ [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+ [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
+ [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
+ [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r),
+ [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
+ [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
+ [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p6_r], +1(%[sp6]) \n\t"
+ "sb %[p5_r], +1(%[sp5]) \n\t"
+ "sb %[p4_r], +1(%[sp4]) \n\t"
+ "sb %[p3_r], +1(%[sp3]) \n\t"
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+ [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+ [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5),
+ [sp4] "r" (sp4), [sp3] "r" (sp3),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+ "sb %[q3_r], +1(%[sq3]) \n\t"
+ "sb %[q4_r], +1(%[sq4]) \n\t"
+ "sb %[q5_r], +1(%[sq5]) \n\t"
+ "sb %[q6_r], +1(%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+ [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1),
+ [sq2] "r" (sq2), [sq3] "r" (sq3),
+ [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
+ );
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r_f1], +1(%[sp2]) \n\t"
+ "sb %[p1_r_f1], +1(%[sp1]) \n\t"
+ "sb %[p0_r_f1], +1(%[sp0]) \n\t"
+ "sb %[q0_r_f1], +1(%[sq0]) \n\t"
+ "sb %[q1_r_f1], +1(%[sq1]) \n\t"
+ "sb %[q2_r_f1], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+ [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+ [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+ [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p6_l], +2(%[sp6]) \n\t"
+ "sb %[p5_l], +2(%[sp5]) \n\t"
+ "sb %[p4_l], +2(%[sp4]) \n\t"
+ "sb %[p3_l], +2(%[sp3]) \n\t"
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+ [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+ [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
+ [sp4] "r" (sp4), [sp3] "r" (sp3),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+ "sb %[q3_l], +2(%[sq3]) \n\t"
+ "sb %[q4_l], +2(%[sq4]) \n\t"
+ "sb %[q5_l], +2(%[sq5]) \n\t"
+ "sb %[q6_l], +2(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+ [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1),
+ [sq2] "r" (sq2), [sq3] "r" (sq3),
+ [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
+ );
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l_f1], +2(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +2(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +2(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +2(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +2(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+ [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+ [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+ [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
+ [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
+ [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
+ [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
+ [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
+ [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p6_l], +3(%[sp6]) \n\t"
+ "sb %[p5_l], +3(%[sp5]) \n\t"
+ "sb %[p4_l], +3(%[sp4]) \n\t"
+ "sb %[p3_l], +3(%[sp3]) \n\t"
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+ [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+ [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
+ [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2),
+ [sp1] "r" (sp1), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+ "sb %[q3_l], +3(%[sq3]) \n\t"
+ "sb %[q4_l], +3(%[sq4]) \n\t"
+ "sb %[q5_l], +3(%[sq5]) \n\t"
+ "sb %[q6_l], +3(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l),
+ [q2_l] "r" (q2_l), [q3_l] "r" (q3_l),
+ [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
+ [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
+ [q6_l] "r" (q6_l), [sq6] "r" (sq6)
+ );
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l_f1], +3(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +3(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +3(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +3(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +3(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+ [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+ [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+ }
+
+ s = s + 4;
+ }
+}
+#endif // #if HAVE_DSPR2
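The store blocks in the function above all follow one pattern: each 32-bit word packs four horizontally adjacent pixels, and mask, flat and flat2 are per-byte selectors holding 0x00 or 0xFF per lane, so the code picks, lane by lane, the widest filter whose conditions hold and shifts the packed results right by 8 or 16 bits to expose the next lane. The fragment below is a minimal scalar sketch of that selection order only; it ignores the left/right halfword split and the different number of neighbouring pixels each filter actually writes, and none of its names are libvpx API.

#include <stdint.h>

/* Scalar model of the per-lane filter selection used above (illustrative
 * names, not libvpx API).  mask/flat/flat2 carry 0x00 or 0xFF per byte lane;
 * wide_out/mb_out/f0_out stand for the packed f2, f1 and f0 filter outputs
 * for the pixel being written. */
static void store_lane(uint8_t *dst, int lane,
                       uint32_t mask, uint32_t flat, uint32_t flat2,
                       uint32_t wide_out, uint32_t mb_out, uint32_t f0_out) {
  const uint32_t lane_bits = 0xFFu << (8 * lane);

  if (mask & flat & flat2 & lane_bits)
    dst[lane] = (uint8_t)(wide_out >> (8 * lane));  /* f2: wide filter */
  else if (mask & flat & lane_bits)
    dst[lane] = (uint8_t)(mb_out >> (8 * lane));    /* f1: mb filter */
  else if (mask & lane_bits)
    dst[lane] = (uint8_t)(f0_out >> (8 * lane));    /* f0: narrow filter */
  /* otherwise the lane is left unfiltered */
}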
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
new file mode 100644
index 0000000..9e9171c
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
@@ -0,0 +1,840 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_mb_lpf_vertical_edge_w_dspr2(uint8_t *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev, flat, flat2;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ vp9_prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[p4], -8(%[s1]) \n\t"
+ "lw %[p5], -8(%[s2]) \n\t"
+ "lw %[p6], -8(%[s3]) \n\t"
+ "lw %[p7], -8(%[s4]) \n\t"
+
+ : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1),
+ [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6),
+ [p5] "=&r" (p5), [p4] "=&r" (p4)
+ : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+
+ __asm__ __volatile__ (
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+ "lw %[q7], +4(%[s1]) \n\t"
+ "lw %[q6], +4(%[s2]) \n\t"
+ "lw %[q5], +4(%[s3]) \n\t"
+ "lw %[q4], +4(%[s4]) \n\t"
+
+ : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1),
+ [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6),
+ [q5] "=&r" (q5), [q4] "=&r" (q4)
+ : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p7, p6, p5, p4
+ original (when loaded from memory)
+ register -8 -7 -6 -5
+ p4 p4_0 p4_1 p4_2 p4_3
+ p5 p5_0 p5_1 p5_2 p5_3
+ p6 p6_0 p6_1 p6_2 p6_3
+ p7 p7_0 p7_1 p7_2 p7_3
+
+ after transpose
+ register
+ p4 p7_3 p6_3 p5_3 p4_3
+ p5 p7_2 p6_2 p5_2 p4_2
+ p6 p7_1 p6_1 p5_1 p4_1
+ p7 p7_0 p6_0 p5_0 p4_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
+ "precr.qb.ph %[prim4], %[p6], %[p7] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p4], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p6], %[p7], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p7], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose q4, q5, q6, q7
+ original (when loaded from memory)
+ register +5 +6 +7 +8
+ q7 q7_0 q7_1 q7_2 q7_3
+ q6 q6_0 q6_1 q6_2 q6_3
+ q5 q5_0 q5_1 q5_2 q5_3
+ q4 q4_0 q4_1 q4_2 q4_3
+
+ after transpose
+ register
+     q7      q4_3  q5_3  q6_3  q7_3
+     q6      q4_2  q5_2  q6_2  q7_2
+     q5      q4_1  q5_1  q6_1  q7_1
+     q4      q4_0  q5_0  q6_0  q7_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
+ "precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
+ "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
+ "precr.qb.ph %[prim4], %[q5], %[q4] \n\t"
+
+ "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q7], %[q6], %[sec3] \n\t"
+ "precrq.ph.w %[q5], %[q4], %[sec4] \n\t"
+ "append %[q6], %[sec3], 16 \n\t"
+ "append %[q4], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+ p1, p0, p3, p2, q0, q1, q2, q3,
+ &hev, &mask, &flat);
+
+ vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ STORE_F0()
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+ (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+ &p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l,
+ &q4_l, &q5_l, &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+ &p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r,
+ &q4_r, &q5_r, &q6_r, &q7_r);
+
+ STORE_F2()
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [s4] "r" (s4)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s4] "r" (s4)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+ [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [s3] "r" (s3)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s3] "r" (s3)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [s2] "r" (s2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s2] "r" (s2)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+ [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [s1] "r" (s1)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s1] "r" (s1)
+ );
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1+f2 */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ PACK_LEFT_0TO3()
+ vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+ q0_l, q1_l, q2_l, q3_l,
+ &p2_l_f1, &p1_l_f1, &p0_l_f1,
+ &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+ q0_r, q1_r, q2_r, q3_r,
+ &p2_r_f1, &p1_r_f1, &p0_r_f1,
+ &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ PACK_LEFT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+ &p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l,
+ &q4_l, &q5_l, &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+ &p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r,
+ &q4_r, &q5_r, &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p6_r], -7(%[s4]) \n\t"
+ "sb %[p5_r], -6(%[s4]) \n\t"
+ "sb %[p4_r], -5(%[s4]) \n\t"
+ "sb %[p3_r], -4(%[s4]) \n\t"
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+
+ :
+ : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r),
+ [p4_r] "r" (p4_r), [p3_r] "r" (p3_r),
+ [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+ [p0_r] "r" (p0_r), [s4] "r" (s4)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+ "sb %[q3_r], +3(%[s4]) \n\t"
+ "sb %[q4_r], +4(%[s4]) \n\t"
+ "sb %[q5_r], +5(%[s4]) \n\t"
+ "sb %[q6_r], +6(%[s4]) \n\t"
+
+ :
+ : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
+ [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
+ [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+ [q6_r] "r" (q6_r), [s4] "r" (s4)
+ );
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r_f1], -3(%[s4]) \n\t"
+ "sb %[p1_r_f1], -2(%[s4]) \n\t"
+ "sb %[p0_r_f1], -1(%[s4]) \n\t"
+ "sb %[q0_r_f1], (%[s4]) \n\t"
+ "sb %[q1_r_f1], +1(%[s4]) \n\t"
+ "sb %[q2_r_f1], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+ [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+ [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+ [s4] "r" (s4)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s4] "r" (s4)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r),
+ [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r),
+ [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
+ [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r),
+ [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
+ [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r),
+ [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
+ [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
+ [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p6_r], -7(%[s3]) \n\t"
+ "sb %[p5_r], -6(%[s3]) \n\t"
+ "sb %[p4_r], -5(%[s3]) \n\t"
+ "sb %[p3_r], -4(%[s3]) \n\t"
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+
+ :
+ : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+ [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+ [p0_r] "r" (p0_r), [s3] "r" (s3)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+ "sb %[q3_r], +3(%[s3]) \n\t"
+ "sb %[q4_r], +4(%[s3]) \n\t"
+ "sb %[q5_r], +5(%[s3]) \n\t"
+ "sb %[q6_r], +6(%[s3]) \n\t"
+
+ :
+ : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
+ [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
+ [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+ [q6_r] "r" (q6_r), [s3] "r" (s3)
+ );
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r_f1], -3(%[s3]) \n\t"
+ "sb %[p1_r_f1], -2(%[s3]) \n\t"
+ "sb %[p0_r_f1], -1(%[s3]) \n\t"
+ "sb %[q0_r_f1], (%[s3]) \n\t"
+ "sb %[q1_r_f1], +1(%[s3]) \n\t"
+ "sb %[q2_r_f1], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+ [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+ [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+ [s3] "r" (s3)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s3] "r" (s3)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p6_l], -7(%[s2]) \n\t"
+ "sb %[p5_l], -6(%[s2]) \n\t"
+ "sb %[p4_l], -5(%[s2]) \n\t"
+ "sb %[p3_l], -4(%[s2]) \n\t"
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+
+ :
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+ [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+ [p0_l] "r" (p0_l), [s2] "r" (s2)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+ "sb %[q3_l], +3(%[s2]) \n\t"
+ "sb %[q4_l], +4(%[s2]) \n\t"
+ "sb %[q5_l], +5(%[s2]) \n\t"
+ "sb %[q6_l], +6(%[s2]) \n\t"
+
+ :
+ : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+ [q6_l] "r" (q6_l), [s2] "r" (s2)
+ );
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l_f1], -3(%[s2]) \n\t"
+ "sb %[p1_l_f1], -2(%[s2]) \n\t"
+ "sb %[p0_l_f1], -1(%[s2]) \n\t"
+ "sb %[q0_l_f1], (%[s2]) \n\t"
+ "sb %[q1_l_f1], +1(%[s2]) \n\t"
+ "sb %[q2_l_f1], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+ [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+ [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+ [s2] "r" (s2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s2] "r" (s2)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
+ [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
+ [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
+ [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
+ [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
+ [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p6_l], -7(%[s1]) \n\t"
+ "sb %[p5_l], -6(%[s1]) \n\t"
+ "sb %[p4_l], -5(%[s1]) \n\t"
+ "sb %[p3_l], -4(%[s1]) \n\t"
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+
+ :
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+ [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+ [p0_l] "r" (p0_l),
+ [s1] "r" (s1)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], 1(%[s1]) \n\t"
+ "sb %[q2_l], 2(%[s1]) \n\t"
+ "sb %[q3_l], 3(%[s1]) \n\t"
+ "sb %[q4_l], 4(%[s1]) \n\t"
+ "sb %[q5_l], 5(%[s1]) \n\t"
+ "sb %[q6_l], 6(%[s1]) \n\t"
+
+ :
+ : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+ [q6_l] "r" (q6_l),
+ [s1] "r" (s1)
+ );
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l_f1], -3(%[s1]) \n\t"
+ "sb %[p1_l_f1], -2(%[s1]) \n\t"
+ "sb %[p0_l_f1], -1(%[s1]) \n\t"
+ "sb %[q0_l_f1], (%[s1]) \n\t"
+ "sb %[q1_l_f1], +1(%[s1]) \n\t"
+ "sb %[q2_l_f1], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+ [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+ [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+ [s1] "r" (s1)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s1] "r" (s1)
+ );
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
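Because the vertical-edge filter reads whole rows with lw but operates on columns, the precrq/precr/append sequences above perform in-register 4x4 byte transposes, as laid out in the comment tables. For reference, the same rearrangement written as portable C looks roughly like the sketch below; the byte-lane order assumes little-endian word packing and is illustrative rather than an exact model of the DSPr2 lane conventions.

#include <stdint.h>

/* Portable 4x4 byte transpose: in[row] packs four pixels of one row, and
 * out[col] packs the four pixels of one column of the same tile. */
static void transpose4x4_bytes(const uint32_t in[4], uint32_t out[4]) {
  int row, col;
  for (col = 0; col < 4; ++col) {
    uint32_t word = 0;
    for (row = 0; row < 4; ++row)
      word |= (uint32_t)((in[row] >> (8 * col)) & 0xFF) << (8 * row);
    out[col] = word;
  }
}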
diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c
index 0d65651..d298160 100644
--- a/libvpx/vp9/common/vp9_alloccommon.c
+++ b/libvpx/vp9/common/vp9_alloccommon.c
@@ -79,6 +79,57 @@ static void setup_mi(VP9_COMMON *cm) {
vp9_update_mode_info_border(cm, cm->prev_mip);
}
+int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+ const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+ const int ss_x = cm->subsampling_x;
+ const int ss_y = cm->subsampling_y;
+ int mi_size;
+
+ if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
+ VP9BORDERINPIXELS) < 0)
+ goto fail;
+
+ set_mb_mi(cm, aligned_width, aligned_height);
+
+ // Allocation
+ mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE);
+
+ vpx_free(cm->mip);
+ cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+ if (!cm->mip)
+ goto fail;
+
+ vpx_free(cm->prev_mip);
+ cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+ if (!cm->prev_mip)
+ goto fail;
+
+ vpx_free(cm->mi_grid_base);
+ cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
+ if (!cm->mi_grid_base)
+ goto fail;
+
+ vpx_free(cm->prev_mi_grid_base);
+ cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
+ if (!cm->prev_mi_grid_base)
+ goto fail;
+
+ setup_mi(cm);
+
+ // Create the segmentation map structure and set to 0.
+ vpx_free(cm->last_frame_seg_map);
+ cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
+ if (!cm->last_frame_seg_map)
+ goto fail;
+
+ return 0;
+
+ fail:
+ vp9_free_frame_buffers(cm);
+ return 1;
+}
+
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
int i;
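The vp9_resize_frame_buffers() function added above returns 0 on success and 1 on failure, and its failure path has already called vp9_free_frame_buffers(), so a caller cannot keep using the old allocations afterwards. A hypothetical caller might look like the sketch below; the width/height bookkeeping is illustrative and not part of this change.

#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_alloccommon.h"

/* Hypothetical caller: reallocate mode-info and segmentation storage when the
 * coded frame size changes.  Only vp9_resize_frame_buffers() is from this
 * patch; the surrounding logic is illustrative. */
static int ensure_frame_size(VP9_COMMON *cm, int new_width, int new_height) {
  if (cm->width != new_width || cm->height != new_height) {
    if (vp9_resize_frame_buffers(cm, new_width, new_height))
      return -1;  /* allocation failed; buffers were freed by the fail path */
    cm->width = new_width;
    cm->height = new_height;
  }
  return 0;
}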
diff --git a/libvpx/vp9/common/vp9_alloccommon.h b/libvpx/vp9/common/vp9_alloccommon.h
index 5d5fae9..cf8dca5 100644
--- a/libvpx/vp9/common/vp9_alloccommon.h
+++ b/libvpx/vp9/common/vp9_alloccommon.h
@@ -21,6 +21,7 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
void vp9_create_common(VP9_COMMON *cm);
void vp9_remove_common(VP9_COMMON *cm);
+int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height);
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height);
void vp9_free_frame_buffers(VP9_COMMON *cm);
diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h
index d0d4852..c5da375 100644
--- a/libvpx/vp9/common/vp9_blockd.h
+++ b/libvpx/vp9/common/vp9_blockd.h
@@ -82,9 +82,8 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
#define INTER_MODES (1 + NEWMV - NEARESTMV)
-static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
- return (mode - NEARESTMV);
-}
+#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
+
/* For keyframes, intra block modes are predicted by the (already decoded)
modes for the Y blocks to the left and above us; for interframes, there
@@ -171,9 +170,9 @@ struct buf_2d {
};
struct macroblockd_plane {
- DECLARE_ALIGNED(16, int16_t, qcoeff[64 * 64]);
- DECLARE_ALIGNED(16, int16_t, dqcoeff[64 * 64]);
- DECLARE_ALIGNED(16, uint16_t, eobs[256]);
+ int16_t *qcoeff;
+ int16_t *dqcoeff;
+ uint16_t *eobs;
PLANE_TYPE plane_type;
int subsampling_x;
int subsampling_y;
@@ -216,13 +215,6 @@ typedef struct macroblockd {
int corrupted;
- unsigned char sb_index; // index of 32x32 block inside the 64x64 block
- unsigned char mb_index; // index of 16x16 block inside the 32x32 block
- unsigned char b_index; // index of 8x8 block inside the 16x16 block
- unsigned char ab_index; // index of 4x4 block inside the 8x8 block
-
- int q_index;
-
/* Y,U,V,(A) */
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
@@ -457,57 +449,46 @@ static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
}
}
}
-static void set_contexts_on_border(const MACROBLOCKD *xd,
- struct macroblockd_plane *pd,
- BLOCK_SIZE plane_bsize,
- int tx_size_in_blocks, int has_eob,
- int aoff, int loff,
- ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
- int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
- int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
- int above_contexts = tx_size_in_blocks;
- int left_contexts = tx_size_in_blocks;
- int pt;
-
- // xd->mb_to_right_edge is in units of pixels * 8. This converts
- // it to 4x4 block sizes.
- if (xd->mb_to_right_edge < 0)
- mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-
- if (xd->mb_to_bottom_edge < 0)
- mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
- // this code attempts to avoid copying into contexts that are outside
- // our border. Any blocks that do are set to 0...
- if (above_contexts + aoff > mi_blocks_wide)
- above_contexts = mi_blocks_wide - aoff;
-
- if (left_contexts + loff > mi_blocks_high)
- left_contexts = mi_blocks_high - loff;
-
- for (pt = 0; pt < above_contexts; pt++)
- A[pt] = has_eob;
- for (pt = above_contexts; pt < tx_size_in_blocks; pt++)
- A[pt] = 0;
- for (pt = 0; pt < left_contexts; pt++)
- L[pt] = has_eob;
- for (pt = left_contexts; pt < tx_size_in_blocks; pt++)
- L[pt] = 0;
-}
static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
- ENTROPY_CONTEXT *const A = pd->above_context + aoff;
- ENTROPY_CONTEXT *const L = pd->left_context + loff;
+ ENTROPY_CONTEXT *const a = pd->above_context + aoff;
+ ENTROPY_CONTEXT *const l = pd->left_context + loff;
const int tx_size_in_blocks = 1 << tx_size;
- if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
- set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob,
- aoff, loff, A, L);
+ // above
+ if (has_eob && xd->mb_to_right_edge < 0) {
+ int i;
+ const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] +
+ (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ int above_contexts = tx_size_in_blocks;
+ if (above_contexts + aoff > blocks_wide)
+ above_contexts = blocks_wide - aoff;
+
+ for (i = 0; i < above_contexts; ++i)
+ a[i] = has_eob;
+ for (i = above_contexts; i < tx_size_in_blocks; ++i)
+ a[i] = 0;
+ } else {
+ vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ }
+
+ // left
+ if (has_eob && xd->mb_to_bottom_edge < 0) {
+ int i;
+ const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] +
+ (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ int left_contexts = tx_size_in_blocks;
+ if (left_contexts + loff > blocks_high)
+ left_contexts = blocks_high - loff;
+
+ for (i = 0; i < left_contexts; ++i)
+ l[i] = has_eob;
+ for (i = left_contexts; i < tx_size_in_blocks; ++i)
+ l[i] = 0;
} else {
- vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
- vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
}
}
diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c
index d3a867c..feceb66 100644
--- a/libvpx/vp9/common/vp9_entropy.c
+++ b/libvpx/vp9/common/vp9_entropy.c
@@ -37,15 +37,78 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
};
DECLARE_ALIGNED(16, const uint8_t,
- vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = {
+ vp9_coefband_trans_8x8plus[1024]) = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 5
+ 4, 4, 4, 4, 4, 5,
+ // beyond MAXBAND_INDEX+1 all values are filled as 5
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
};
DECLARE_ALIGNED(16, const uint8_t,
- vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = {
+ vp9_coefband_trans_4x4[16]) = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
- 5, 5, 5, 5, 5, 5
};
DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
@@ -332,7 +395,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
if (l >= 3 && k == 0)
continue;
vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct,
- coef_counts[i][j][k][l], 0);
+ coef_counts[i][j][k][l]);
branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
for (m = 0; m < UNCONSTRAINED_NODES; ++m)
dst_coef_probs[i][j][k][l][m] = merge_probs(
diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h
index c58e852..e133d65 100644
--- a/libvpx/vp9/common/vp9_entropy.h
+++ b/libvpx/vp9/common/vp9_entropy.h
@@ -51,7 +51,7 @@ extern const vp9_tree_index vp9_coefmodel_tree[];
extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
typedef struct {
- vp9_tree_index *tree;
+ const vp9_tree_index *tree;
const vp9_prob *prob;
int len;
int base_val;
@@ -120,17 +120,15 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
// This is the index in the scan order beyond which all coefficients for
// 8x8 transform and above are in the top band.
-// For 4x4 blocks the index is less but to keep things common the lookup
-// table for 4x4 is padded out to this index.
+// This macro is currently unused but may be used by certain implementations
#define MAXBAND_INDEX 21
-extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1];
-extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1];
+extern const uint8_t vp9_coefband_trans_8x8plus[1024];
+extern const uint8_t vp9_coefband_trans_4x4[16];
-
-static int get_coef_band(const uint8_t * band_translate, int coef_index) {
- return (coef_index > MAXBAND_INDEX)
- ? (COEF_BANDS-1) : band_translate[coef_index];
+static const uint8_t *get_band_translate(TX_SIZE tx_size) {
+ return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
+ : vp9_coefband_trans_8x8plus;
}
// 128 lists of probabilities are stored for the following ONE node probs:
@@ -181,11 +179,6 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
return combine_entropy_contexts(above_ec, left_ec);
}
-static const uint8_t *get_band_translate(TX_SIZE tx_size) {
- return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
- : vp9_coefband_trans_8x8plus;
-}
-
static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
PLANE_TYPE type, int block_idx,
const int16_t **scan, const int16_t **scan_nb) {
diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c
index a963d55..3b2510d 100644
--- a/libvpx/vp9/common/vp9_entropymode.c
+++ b/libvpx/vp9/common/vp9_entropymode.c
@@ -235,9 +235,9 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
- -ZEROMV, 2,
- -NEARESTMV, 4,
- -NEARMV, -NEWMV
+ -INTER_OFFSET(ZEROMV), 2,
+ -INTER_OFFSET(NEARESTMV), 4,
+ -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
};
struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
@@ -343,8 +343,7 @@ void vp9_entropy_mode_init() {
vp9_tokens_from_tree(vp9_switchable_interp_encodings,
vp9_switchable_interp_tree);
vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
- vp9_tokens_from_tree_offset(vp9_inter_mode_encodings,
- vp9_inter_mode_tree, NEARESTMV);
+ vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree);
}
#define COUNT_SAT 20
@@ -356,9 +355,9 @@ static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
static void adapt_probs(const vp9_tree_index *tree,
const vp9_prob *pre_probs, const unsigned int *counts,
- unsigned int offset, vp9_prob *probs) {
- tree_merge_probs(tree, pre_probs, counts, offset,
- COUNT_SAT, MAX_UPDATE_FACTOR, probs);
+ vp9_prob *probs) {
+ tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
+ probs);
}
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
@@ -383,25 +382,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
- counts->inter_mode[i], NEARESTMV, fc->inter_mode_probs[i]);
+ counts->inter_mode[i], fc->inter_mode_probs[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
- counts->y_mode[i], 0, fc->y_mode_prob[i]);
+ counts->y_mode[i], fc->y_mode_prob[i]);
for (i = 0; i < INTRA_MODES; ++i)
adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
- counts->uv_mode[i], 0, fc->uv_mode_prob[i]);
+ counts->uv_mode[i], fc->uv_mode_prob[i]);
for (i = 0; i < PARTITION_CONTEXTS; i++)
adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
- counts->partition[i], 0, fc->partition_prob[i]);
+ counts->partition[i], fc->partition_prob[i]);
if (cm->mcomp_filter_type == SWITCHABLE) {
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
- counts->switchable_interp[i], 0,
- fc->switchable_interp_prob[i]);
+ counts->switchable_interp[i], fc->switchable_interp_prob[i]);
}
if (cm->tx_mode == TX_MODE_SELECT) {
diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c
index b061cdb..290dcdd 100644
--- a/libvpx/vp9/common/vp9_entropymv.c
+++ b/libvpx/vp9/common/vp9_entropymv.c
@@ -196,8 +196,8 @@ static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
const unsigned int *counts, vp9_prob *probs) {
- tree_merge_probs(tree, pre_probs, counts, 0,
- MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, probs);
+ tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR,
+ probs);
}
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
@@ -207,8 +207,7 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
const nmv_context_counts *counts = &cm->counts.mv;
- adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
- fc->joints);
+ adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints);
for (i = 0; i < 2; ++i) {
nmv_component *comp = &fc->comps[i];
diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c
index 07c68c8..836bf0e 100644
--- a/libvpx/vp9/common/vp9_extend.c
+++ b/libvpx/vp9/common/vp9_extend.c
@@ -62,7 +62,7 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
const int et_y = 16;
const int el_y = 16;
// Motion estimation may use src block variance with the block size up
- // to 64x64, so the right and bottom need to be extended to 64 mulitple
+ // to 64x64, so the right and bottom need to be extended to 64 multiple
// or up to 16, whichever is greater.
const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
16);
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index 218e12e..ff504a1 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -32,6 +32,8 @@ typedef struct {
uint16_t left_uv[TX_SIZES];
uint16_t above_uv[TX_SIZES];
uint16_t int_4x4_uv;
+ uint8_t lfl_y[64];
+ uint8_t lfl_uv[16];
} LOOP_FILTER_MASK;
// 64 bit masks for left transform size. Each 1 represents a position where
@@ -322,20 +324,14 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
}
}
-static int build_lfi(const loop_filter_info_n *lfi_n,
- const MB_MODE_INFO *mbmi,
- const loop_filter_thresh **lfi) {
+static uint8_t build_lfi(const loop_filter_info_n *lfi_n,
+ const MB_MODE_INFO *mbmi) {
const int seg = mbmi->segment_id;
const int ref = mbmi->ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mbmi->mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
- if (filter_level > 0) {
- *lfi = &lfi_n->lfthr[filter_level];
- return 1;
- } else {
- return 0;
- }
+ return filter_level;
}
static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -343,12 +339,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
unsigned int mask_8x8,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
- const loop_filter_thresh **p_lfi) {
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl) {
unsigned int mask;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= 1) {
- const loop_filter_thresh *lfi = *p_lfi;
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
if (mask & 1) {
if (mask_16x16 & 1) {
@@ -373,7 +370,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1);
s += 8;
- p_lfi++;
+ lfl += 1;
mask_16x16 >>= 1;
mask_8x8 >>= 1;
mask_4x4 >>= 1;
@@ -386,49 +383,112 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
unsigned int mask_8x8,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
- int only_4x4_1,
- const loop_filter_thresh **p_lfi) {
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl) {
unsigned int mask;
int count;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= count) {
- const loop_filter_thresh *lfi = *p_lfi;
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
count = 1;
if (mask & 1) {
- if (!only_4x4_1) {
- if (mask_16x16 & 1) {
- if ((mask_16x16 & 3) == 3) {
- vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
- count = 2;
+ if (mask_16x16 & 1) {
+ if ((mask_16x16 & 3) == 3) {
+ vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+ count = 2;
+ } else {
+ vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+ }
+ assert(!(mask_8x8 & 1));
+ assert(!(mask_4x4 & 1));
+ assert(!(mask_4x4_int & 1));
+ } else if (mask_8x8 & 1) {
+ if ((mask_8x8 & 3) == 3) {
+ // Next block's thresholds
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+ // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
+ vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+ vp9_mbloop_filter_horizontal_edge(s + 8, pitch, lfin->mblim,
+ lfin->lim, lfin->hev_thr, 1);
+
+ if ((mask_4x4_int & 3) == 3) {
+ // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+ vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr, 1);
} else {
- vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+ else if (mask_4x4_int & 2)
+ vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr, 1);
}
- assert(!(mask_8x8 & 1));
- assert(!(mask_4x4 & 1));
- assert(!(mask_4x4_int & 1));
- } else if (mask_8x8 & 1) {
+ count = 2;
+ } else {
vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1);
- assert(!(mask_16x16 & 1));
- assert(!(mask_4x4 & 1));
- } else if (mask_4x4 & 1) {
- vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
- assert(!(mask_16x16 & 1));
- assert(!(mask_8x8 & 1));
+
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
}
- }
+ assert(!(mask_16x16 & 1));
+ assert(!(mask_4x4 & 1));
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & 3) == 3) {
+ // Next block's thresholds
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
- if (mask_4x4_int & 1)
+ // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
+ vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+ vp9_loop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, lfin->lim,
+ lfin->hev_thr, 1);
+
+ if ((mask_4x4_int & 3) == 3) {
+ // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+ vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr, 1);
+ } else {
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+ else if (mask_4x4_int & 2)
+ vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr, 1);
+ }
+ count = 2;
+ } else {
+ vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+ }
+ assert(!(mask_16x16 & 1));
+ assert(!(mask_8x8 & 1));
+ } else if (mask_4x4_int & 1) {
vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
lfi->lim, lfi->hev_thr, 1);
+ }
}
s += 8 * count;
- p_lfi += count;
+ lfl += count;
mask_16x16 >>= count;
mask_8x8 >>= count;
mask_4x4 >>= count;
@@ -461,10 +521,20 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
uint16_t *left_uv = &lfm->left_uv[tx_size_uv];
uint16_t *above_uv = &lfm->above_uv[tx_size_uv];
uint16_t *int_4x4_uv = &lfm->int_4x4_uv;
+ int i;
+ int w = num_8x8_blocks_wide_lookup[block_size];
+ int h = num_8x8_blocks_high_lookup[block_size];
// If filter level is 0 we don't loop filter.
- if (!filter_level)
+ if (!filter_level) {
return;
+ } else {
+ int index = shift_y;
+ for (i = 0; i < h; i++) {
+ vpx_memset(&lfm->lfl_y[index], filter_level, w);
+ index += 8;
+ }
+ }
// These set 1 in the current block size for the block size edges.
// For instance if the block size is 32x16, we'll set :
@@ -530,9 +600,19 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
uint64_t *left_y = &lfm->left_y[tx_size_y];
uint64_t *above_y = &lfm->above_y[tx_size_y];
uint64_t *int_4x4_y = &lfm->int_4x4_y;
+ int i;
+ int w = num_8x8_blocks_wide_lookup[block_size];
+ int h = num_8x8_blocks_high_lookup[block_size];
- if (!filter_level)
+ if (!filter_level) {
return;
+ } else {
+ int index = shift_y;
+ for (i = 0; i < h; i++) {
+ vpx_memset(&lfm->lfl_y[index], filter_level, w);
+ index += 8;
+ }
+ }
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
@@ -785,6 +865,7 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
}
}
}
+
#if CONFIG_NON420
static void filter_block_plane_non420(VP9_COMMON *cm,
struct macroblockd_plane *plane,
@@ -801,7 +882,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE];
int r, c;
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
@@ -830,7 +911,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]))
+ if (!(lfl[(r << 3) + (c >> ss_x)] =
+ build_lfi(&cm->lf_info, &mi[0].mbmi)))
continue;
// Build masks based on the transform size of each block
@@ -887,7 +969,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
mask_16x16_c & border_mask,
mask_8x8_c & border_mask,
mask_4x4_c & border_mask,
- mask_4x4_int[r], lfi[r]);
+ mask_4x4_int[r],
+ &cm->lf_info, &lfl[r << 3]);
dst->buf += 8 * dst->stride;
mi_8x8 += row_step_stride;
}
@@ -898,11 +981,26 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
+
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16[r];
+ mask_8x8_r = mask_8x8[r];
+ mask_4x4_r = mask_4x4[r];
+ }
+
filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16[r],
- mask_8x8[r],
- mask_4x4[r],
- mask_4x4_int_r, mi_row + r == 0, lfi[r]);
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int_r,
+ &cm->lf_info, &lfl[r << 3]);
dst->buf += 8 * dst->stride;
}
}
@@ -910,81 +1008,137 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
static void filter_block_plane(VP9_COMMON *const cm,
struct macroblockd_plane *const plane,
- MODE_INFO **mi_8x8,
- int mi_row, int mi_col,
+ int mi_row,
LOOP_FILTER_MASK *lfm) {
- const int ss_x = plane->subsampling_x;
- const int ss_y = plane->subsampling_y;
- const int row_step = 1 << ss_x;
- const int col_step = 1 << ss_y;
- const int row_step_stride = cm->mode_info_stride * row_step;
struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf;
- unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ unsigned int mask_4x4_int_row[MI_BLOCK_SIZE] = {0};
int r, c;
- int row_shift = 3 - ss_x;
- int row_mask = 0xff >> (ss_x << 2);
-#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask)
+ if (!plane->plane_type) {
+ uint64_t mask_16x16 = lfm->left_y[TX_16X16];
+ uint64_t mask_8x8 = lfm->left_y[TX_8X8];
+ uint64_t mask_4x4 = lfm->left_y[TX_4X4];
+ uint64_t mask_4x4_int = lfm->int_4x4_y;
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
- int r_sampled = r >> ss_x;
+ // Vertical pass
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+ mask_4x4_int_row[r] = mask_4x4_int & 0xff;
- // Determine the vertical edges that need filtering
- for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
- const MODE_INFO *mi = mi_8x8[c];
-
- build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]);
- }
- if (!plane->plane_type) {
- mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
// Disable filtering on the leftmost column
filter_selectively_vert(dst->buf, dst->stride,
- MASK_ROW(lfm->left_y[TX_16X16]),
- MASK_ROW(lfm->left_y[TX_8X8]),
- MASK_ROW(lfm->left_y[TX_4X4]),
- MASK_ROW(lfm->int_4x4_y),
- lfi[r]);
- } else {
- mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv);
+ mask_16x16 & 0xff,
+ mask_8x8 & 0xff,
+ mask_4x4 & 0xff,
+ mask_4x4_int_row[r],
+ &cm->lf_info, &lfm->lfl_y[r << 3]);
+
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 8;
+ mask_8x8 >>= 8;
+ mask_4x4 >>= 8;
+ mask_4x4_int >>= 8;
+ }
+
+ // Horizontal pass
+ dst->buf = dst0;
+ mask_16x16 = lfm->above_y[TX_16X16];
+ mask_8x8 = lfm->above_y[TX_8X8];
+ mask_4x4 = lfm->above_y[TX_4X4];
+
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
+
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xff;
+ mask_8x8_r = mask_8x8 & 0xff;
+ mask_4x4_r = mask_4x4 & 0xff;
+ }
+
+ filter_selectively_horiz(dst->buf, dst->stride,
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int_row[r],
+ &cm->lf_info, &lfm->lfl_y[r << 3]);
+
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 8;
+ mask_8x8 >>= 8;
+ mask_4x4 >>= 8;
+ }
+ } else {
+ uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
+ uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
+ uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
+ uint16_t mask_4x4_int = lfm->int_4x4_uv;
+
+ // Vertical pass
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ if (plane->plane_type == 1) {
+ for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++)
+ lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+ }
+
+ mask_4x4_int_row[r] = mask_4x4_int & 0xf;
// Disable filtering on the leftmost column
filter_selectively_vert(dst->buf, dst->stride,
- MASK_ROW(lfm->left_uv[TX_16X16]),
- MASK_ROW(lfm->left_uv[TX_8X8]),
- MASK_ROW(lfm->left_uv[TX_4X4]),
- MASK_ROW(lfm->int_4x4_uv),
- lfi[r]);
+ mask_16x16 & 0xf,
+ mask_8x8 & 0xf,
+ mask_4x4 & 0xf,
+ mask_4x4_int_row[r],
+ &cm->lf_info, &lfm->lfl_uv[r << 1]);
+
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 4;
+ mask_8x8 >>= 4;
+ mask_4x4 >>= 4;
+ mask_4x4_int >>= 4;
}
- dst->buf += 8 * dst->stride;
- mi_8x8 += row_step_stride;
- }
- // Now do horizontal pass
- dst->buf = dst0;
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
- const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
- const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
- int r_sampled = r >> ss_x;
+ // Horizontal pass
+ dst->buf = dst0;
+ mask_16x16 = lfm->above_uv[TX_16X16];
+ mask_8x8 = lfm->above_uv[TX_8X8];
+ mask_4x4 = lfm->above_uv[TX_4X4];
+
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
+ const unsigned int mask_4x4_int_r = skip_border_4x4_r ?
+ 0 : (mask_4x4_int_row[r]);
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
+
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xf;
+ mask_8x8_r = mask_8x8 & 0xf;
+ mask_4x4_r = mask_4x4 & 0xf;
+ }
- if (!plane->plane_type) {
filter_selectively_horiz(dst->buf, dst->stride,
- MASK_ROW(lfm->above_y[TX_16X16]),
- MASK_ROW(lfm->above_y[TX_8X8]),
- MASK_ROW(lfm->above_y[TX_4X4]),
- MASK_ROW(lfm->int_4x4_y),
- mi_row + r == 0, lfi[r]);
- } else {
- filter_selectively_horiz(dst->buf, dst->stride,
- MASK_ROW(lfm->above_uv[TX_16X16]),
- MASK_ROW(lfm->above_uv[TX_8X8]),
- MASK_ROW(lfm->above_uv[TX_4X4]),
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
mask_4x4_int_r,
- mi_row + r == 0, lfi[r]);
+ &cm->lf_info, &lfm->lfl_uv[r << 1]);
+
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 4;
+ mask_8x8 >>= 4;
+ mask_4x4 >>= 4;
}
- dst->buf += 8 * dst->stride;
}
-#undef MASK_ROW
}
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
@@ -1017,8 +1171,7 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
#if CONFIG_NON420
if (use_420)
#endif
- filter_block_plane(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row,
- mi_col, &lfm);
+ filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
#if CONFIG_NON420
else
filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index 19032bf..9190930 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -109,32 +109,40 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
-static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
+static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
const struct tx_probs *tx_probs) {
- if (bsize < BLOCK_16X16)
- return tx_probs->p8x8[context];
- else if (bsize < BLOCK_32X32)
- return tx_probs->p16x16[context];
- else
- return tx_probs->p32x32[context];
+ switch (max_tx_size) {
+ case TX_8X8:
+ return tx_probs->p8x8[ctx];
+ case TX_16X16:
+ return tx_probs->p16x16[ctx];
+ case TX_32X32:
+ return tx_probs->p32x32[ctx];
+ default:
+ assert(!"Invalid max_tx_size.");
+ return NULL;
+ }
}
-static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
- const struct tx_probs *tx_probs,
- const MODE_INFO *m) {
- const BLOCK_SIZE bsize = m->mbmi.sb_type;
- const int context = vp9_get_pred_context_tx_size(xd);
- return get_tx_probs(bsize, context, tx_probs);
+static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd,
+ const struct tx_probs *tx_probs) {
+ const int ctx = vp9_get_pred_context_tx_size(xd);
+ return get_tx_probs(max_tx_size, ctx, tx_probs);
}
-static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context,
+static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
struct tx_counts *tx_counts) {
- if (bsize < BLOCK_16X16)
- return tx_counts->p8x8[context];
- else if (bsize < BLOCK_32X32)
- return tx_counts->p16x16[context];
- else
- return tx_counts->p32x32[context];
+ switch (max_tx_size) {
+ case TX_8X8:
+ return tx_counts->p8x8[ctx];
+ case TX_16X16:
+ return tx_counts->p16x16[ctx];
+ case TX_32X32:
+ return tx_counts->p32x32[ctx];
+ default:
+ assert(!"Invalid max_tx_size.");
+ return NULL;
+ }
}
#endif // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index 1c96788..7cc66c8 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -117,16 +117,13 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
return clamped_mv;
}
-struct build_inter_predictors_args {
- MACROBLOCKD *xd;
- int x, y;
-};
-
-static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
- int pred_w, int pred_h,
- void *argv) {
- const struct build_inter_predictors_args* const arg = argv;
- MACROBLOCKD *const xd = arg->xd;
+
+// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
+// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
+// sizes smaller than 16x16 yet.
+static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+ BLOCK_SIZE bsize, int pred_w, int pred_h,
+ int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const int bwl = b_width_log2(bsize) - pd->subsampling_x;
const int bw = 4 << bwl;
@@ -172,7 +169,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
if (vp9_is_scaled(scale->sfc)) {
pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale);
- scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x);
+ scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x);
scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
xs = scale->sfc->x_step_q4;
ys = scale->sfc->y_step_q4;
@@ -190,40 +187,25 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
}
}
-// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
-// sizes smaller than 16x16 yet.
-typedef void (*foreach_predicted_block_visitor)(int plane, int block,
- BLOCK_SIZE bsize,
- int pred_w, int pred_h,
- void *arg);
-static INLINE void foreach_predicted_block_in_plane(
- const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
- foreach_predicted_block_visitor visit, void *arg) {
- const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
- const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-
- if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
- int i = 0, x, y;
- assert(bsize == BLOCK_8X8);
- for (y = 0; y < 1 << bhl; ++y)
- for (x = 0; x < 1 << bwl; ++x)
- visit(plane, i++, bsize, 0, 0, arg);
- } else {
- visit(plane, 0, bsize, bwl, bhl, arg);
- }
-}
-
static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col,
int plane_from, int plane_to) {
int plane;
for (plane = plane_from; plane <= plane_to; ++plane) {
- struct build_inter_predictors_args args = {
- xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
- };
- foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors,
- &args);
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+ const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+
+ if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < 1 << bhl; ++y)
+ for (x = 0; x < 1 << bwl; ++x)
+ build_inter_predictors(xd, plane, i++, bsize, 0, 0, mi_x, mi_y);
+ } else {
+ build_inter_predictors(xd, plane, 0, bsize, bwl, bhl, mi_x, mi_y);
+ }
}
}
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh
index debec61..2c0864e 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.sh
+++ b/libvpx/vp9/common/vp9_rtcd_defs.sh
@@ -41,7 +41,7 @@ prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const ui
specialize vp9_d63_predictor_4x4 $ssse3_x86inc
prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_4x4 $ssse3_x86inc
+specialize vp9_h_predictor_4x4 $ssse3_x86inc dspr2
prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_4x4
@@ -56,10 +56,10 @@ prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint
specialize vp9_v_predictor_4x4 $sse_x86inc
prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_4x4 $sse_x86inc
+specialize vp9_tm_predictor_4x4 $sse_x86inc dspr2
prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_4x4 $sse_x86inc
+specialize vp9_dc_predictor_4x4 $sse_x86inc dspr2
prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_dc_top_predictor_4x4
@@ -80,7 +80,7 @@ prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const ui
specialize vp9_d63_predictor_8x8 $ssse3_x86inc
prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_8x8 $ssse3_x86inc
+specialize vp9_h_predictor_8x8 $ssse3_x86inc dspr2
prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_8x8
@@ -95,10 +95,10 @@ prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint
specialize vp9_v_predictor_8x8 $sse_x86inc
prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_8x8 $sse2_x86inc
+specialize vp9_tm_predictor_8x8 $sse2_x86inc dspr2
prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_8x8 $sse_x86inc
+specialize vp9_dc_predictor_8x8 $sse_x86inc dspr2
prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_dc_top_predictor_8x8
@@ -119,7 +119,7 @@ prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_d63_predictor_16x16 $ssse3_x86inc
prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_16x16 $ssse3_x86inc
+specialize vp9_h_predictor_16x16 $ssse3_x86inc dspr2
prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_16x16
@@ -137,7 +137,7 @@ prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const u
specialize vp9_tm_predictor_16x16 $sse2_x86inc
prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_16x16 $sse2_x86inc
+specialize vp9_dc_predictor_16x16 $sse2_x86inc dspr2
prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_dc_top_predictor_16x16
@@ -191,22 +191,22 @@ specialize vp9_dc_128_predictor_32x32
# Loopfilter
#
prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
-specialize vp9_mb_lpf_vertical_edge_w sse2 neon
+specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2
prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mbloop_filter_vertical_edge sse2 neon
+specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2
prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_vertical_edge mmx neon
+specialize vp9_loop_filter_vertical_edge mmx neon dspr2
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon
+specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mbloop_filter_horizontal_edge sse2 neon
+specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2
prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_horizontal_edge mmx neon
+specialize vp9_loop_filter_horizontal_edge mmx neon dspr2
#
# post proc
@@ -296,10 +296,10 @@ prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int
specialize vp9_idct32x32_1024_add sse2 neon dspr2
prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_34_add sse2
+specialize vp9_idct32x32_34_add sse2 dspr2
prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1_add sse2 dspr2
+specialize vp9_idct32x32_1_add sse2 neon dspr2
prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_iht4x4_16_add sse2 neon dspr2
diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c
index 1805fb4..e2a5b9f 100644
--- a/libvpx/vp9/common/vp9_treecoder.c
+++ b/libvpx/vp9/common/vp9_treecoder.c
@@ -35,28 +35,20 @@ void vp9_tokens_from_tree(struct vp9_token *p, vp9_tree t) {
tree2tok(p, t, 0, 0, 0);
}
-void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t,
- int offset) {
- tree2tok(p - offset, t, 0, 0, 0);
-}
-
static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
unsigned int branch_ct[][2],
- const unsigned int num_events[],
- unsigned int tok0_offset) {
+ const unsigned int num_events[]) {
unsigned int left, right;
- if (tree[i] <= 0) {
- left = num_events[-tree[i] - tok0_offset];
- } else {
- left = convert_distribution(tree[i], tree, branch_ct, num_events,
- tok0_offset);
- }
+ if (tree[i] <= 0)
+ left = num_events[-tree[i]];
+ else
+ left = convert_distribution(tree[i], tree, branch_ct, num_events);
+
if (tree[i + 1] <= 0)
- right = num_events[-tree[i + 1] - tok0_offset];
+ right = num_events[-tree[i + 1]];
else
- right = convert_distribution(tree[i + 1], tree, branch_ct, num_events,
- tok0_offset);
+ right = convert_distribution(tree[i + 1], tree, branch_ct, num_events);
branch_ct[i >> 1][0] = left;
branch_ct[i >> 1][1] = right;
@@ -65,9 +57,8 @@ static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
void vp9_tree_probs_from_distribution(vp9_tree tree,
unsigned int branch_ct[/* n-1 */][2],
- const unsigned int num_events[/* n */],
- unsigned int tok0_offset) {
- convert_distribution(0, tree, branch_ct, num_events, tok0_offset);
+ const unsigned int num_events[/* n */]) {
+ convert_distribution(0, tree, branch_ct, num_events);
}
diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h
index 9c776d6..a79b156 100644
--- a/libvpx/vp9/common/vp9_treecoder.h
+++ b/libvpx/vp9/common/vp9_treecoder.h
@@ -42,7 +42,6 @@ struct vp9_token {
/* Construct encoding array from tree. */
void vp9_tokens_from_tree(struct vp9_token*, vp9_tree);
-void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset);
/* Convert array of token occurrence counts into a table of probabilities
for the associated binary encoding tree. Also writes count of branches
@@ -51,8 +50,7 @@ void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset);
void vp9_tree_probs_from_distribution(vp9_tree tree,
unsigned int branch_ct[ /* n - 1 */ ][2],
- const unsigned int num_events[ /* n */ ],
- unsigned int tok0_offset);
+ const unsigned int num_events[ /* n */ ]);
static INLINE vp9_prob clip_prob(int p) {
@@ -116,10 +114,10 @@ static unsigned int tree_merge_probs_impl(unsigned int i,
static void tree_merge_probs(const vp9_tree_index *tree,
const vp9_prob *pre_probs,
- const unsigned int *counts, int offset,
+ const unsigned int *counts,
unsigned int count_sat,
unsigned int max_update_factor, vp9_prob *probs) {
- tree_merge_probs_impl(0, tree, pre_probs, &counts[-offset],
+ tree_merge_probs_impl(0, tree, pre_probs, counts,
count_sat, max_update_factor, probs);
}
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index ccf5aac..2a33844 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,6 +15,16 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
+#define RECON_AND_STORE4X4(dest, in_x) \
+{ \
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ *(int *)dest = _mm_cvtsi128_si32(d0); \
+ dest += stride; \
+}
+
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
@@ -26,21 +36,19 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i input0, input1, input2, input3;
// Rows
- input0 = _mm_loadl_epi64((const __m128i *)input);
- input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
- input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
- input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
+ input0 = _mm_load_si128((const __m128i *)input);
+ input2 = _mm_load_si128((const __m128i *)(input + 8));
// Construct i3, i1, i3, i1, i2, i0, i2, i0
input0 = _mm_shufflelo_epi16(input0, 0xd8);
- input1 = _mm_shufflelo_epi16(input1, 0xd8);
+ input0 = _mm_shufflehi_epi16(input0, 0xd8);
input2 = _mm_shufflelo_epi16(input2, 0xd8);
- input3 = _mm_shufflelo_epi16(input3, 0xd8);
+ input2 = _mm_shufflehi_epi16(input2, 0xd8);
+ input1 = _mm_unpackhi_epi32(input0, input0);
input0 = _mm_unpacklo_epi32(input0, input0);
- input1 = _mm_unpacklo_epi32(input1, input1);
+ input3 = _mm_unpackhi_epi32(input2, input2);
input2 = _mm_unpacklo_epi32(input2, input2);
- input3 = _mm_unpacklo_epi32(input3, input3);
// Stage 1
input0 = _mm_madd_epi16(input0, cst);
@@ -59,16 +67,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
// Stage 2
- input0 = _mm_packs_epi32(input0, zero);
- input1 = _mm_packs_epi32(input1, zero);
- input2 = _mm_packs_epi32(input2, zero);
- input3 = _mm_packs_epi32(input3, zero);
+ input0 = _mm_packs_epi32(input0, input1);
+ input1 = _mm_packs_epi32(input2, input3);
// Transpose
- input1 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpacklo_epi16(input2, input3);
- input0 = _mm_unpacklo_epi32(input1, input3);
- input1 = _mm_unpackhi_epi32(input1, input3);
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
// Switch column2, column 3, and then, we got:
// input2: column1, column 0; input3: column2, column 3.
@@ -78,14 +84,9 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// Columns
// Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_shufflelo_epi16(input2, 0xd8);
- input1 = _mm_shufflehi_epi16(input2, 0xd8);
- input2 = _mm_shufflehi_epi16(input3, 0xd8);
- input3 = _mm_shufflelo_epi16(input3, 0xd8);
-
- input0 = _mm_unpacklo_epi32(input0, input0);
- input1 = _mm_unpackhi_epi32(input1, input1);
- input2 = _mm_unpackhi_epi32(input2, input2);
+ input0 = _mm_unpacklo_epi32(input2, input2);
+ input1 = _mm_unpackhi_epi32(input2, input2);
+ input2 = _mm_unpackhi_epi32(input3, input3);
input3 = _mm_unpacklo_epi32(input3, input3);
// Stage 1
@@ -105,16 +106,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
// Stage 2
- input0 = _mm_packs_epi32(input0, zero);
- input1 = _mm_packs_epi32(input1, zero);
- input2 = _mm_packs_epi32(input2, zero);
- input3 = _mm_packs_epi32(input3, zero);
+ input0 = _mm_packs_epi32(input0, input2);
+ input1 = _mm_packs_epi32(input1, input3);
// Transpose
- input1 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpacklo_epi16(input2, input3);
- input0 = _mm_unpacklo_epi32(input1, input3);
- input1 = _mm_unpackhi_epi32(input1, input3);
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
// Switch column2, column 3, and then, we got:
// input2: column1, column 0; input3: column2, column 3.
@@ -129,23 +128,31 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
input2 = _mm_srai_epi16(input2, 4);
input3 = _mm_srai_epi16(input3, 4);
-#define RECON_AND_STORE4X4(dest, in_x) \
- { \
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- *(int *)dest = _mm_cvtsi128_si32(d0); \
- dest += stride; \
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *) (dest + stride)));
+ d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
+ *(const int *) (dest + stride * 3)), d2);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, input2);
+ d2 = _mm_add_epi16(d2, input3);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store input0
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store input2
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ // store input3
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
}
-
- input0 = _mm_srli_si128(input2, 8);
- input1 = _mm_srli_si128(input3, 8);
-
- RECON_AND_STORE4X4(dest, input2);
- RECON_AND_STORE4X4(dest, input0);
- RECON_AND_STORE4X4(dest, input1);
- RECON_AND_STORE4X4(dest, input3);
}
void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c
index 9792d2c..b948429 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/libvpx/vp9/decoder/vp9_decodemv.c
@@ -48,12 +48,13 @@ static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
}
static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
- uint8_t context) {
- const MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree,
- cm->fc.inter_mode_probs[context]);
+ int ctx) {
+ const int mode = treed_read(r, vp9_inter_mode_tree,
+ cm->fc.inter_mode_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.inter_mode[context][inter_mode_offset(mode)];
- return mode;
+ ++cm->counts.inter_mode[ctx][mode];
+
+ return NEARESTMV + mode;
}
static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
@@ -61,31 +62,28 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
}
static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
- BLOCK_SIZE bsize, vp9_reader *r) {
- const uint8_t context = vp9_get_pred_context_tx_size(xd);
- const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs);
+ TX_SIZE max_tx_size, vp9_reader *r) {
+ const int ctx = vp9_get_pred_context_tx_size(xd);
+ const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs);
TX_SIZE tx_size = vp9_read(r, tx_probs[0]);
- if (tx_size != TX_4X4 && bsize >= BLOCK_16X16) {
+ if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
tx_size += vp9_read(r, tx_probs[1]);
- if (tx_size != TX_8X8 && bsize >= BLOCK_32X32)
+ if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
tx_size += vp9_read(r, tx_probs[2]);
}
if (!cm->frame_parallel_decoding_mode)
- ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];
+ ++get_tx_counts(max_tx_size, ctx, &cm->counts.tx)[tx_size];
return tx_size;
}
-static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- TX_MODE tx_mode, BLOCK_SIZE bsize, int allow_select,
- vp9_reader *r) {
- if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) {
- return read_selected_tx_size(cm, xd, bsize, r);
- } else {
- const TX_SIZE max_tx_size_block = max_txsize_lookup[bsize];
- const TX_SIZE max_tx_size_txmode = tx_mode_to_biggest_tx_size[tx_mode];
- return MIN(max_tx_size_block, max_tx_size_txmode);
- }
+static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_MODE tx_mode,
+ BLOCK_SIZE bsize, int allow_select, vp9_reader *r) {
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
+ return read_selected_tx_size(cm, xd, max_tx_size, r);
+ else
+ return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
}
static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
@@ -260,6 +258,16 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
mv->col = ref->col + diff.col;
}
+static COMPPREDMODE_TYPE read_reference_mode(VP9_COMMON *cm,
+ const MACROBLOCKD *xd,
+ vp9_reader *r) {
+ const int ctx = vp9_get_pred_context_comp_inter_inter(cm, xd);
+ const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.comp_inter[ctx][mode];
+ return mode; // SINGLE_PREDICTION_ONLY or COMP_PREDICTION_ONLY
+}
+
// Read the reference frame
static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
vp9_reader *r,
@@ -271,27 +279,20 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
ref_frame[1] = NONE;
} else {
- const int comp_ctx = vp9_get_pred_context_comp_inter_inter(cm, xd);
- int is_comp;
-
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
- is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]);
- if (!cm->frame_parallel_decoding_mode)
- ++counts->comp_inter[comp_ctx][is_comp];
- } else {
- is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
- }
+ const COMPPREDMODE_TYPE mode = (cm->comp_pred_mode == HYBRID_PREDICTION)
+ ? read_reference_mode(cm, xd, r)
+ : cm->comp_pred_mode;
// FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
- if (is_comp) {
- const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
- const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
- const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]);
+ if (mode == COMP_PREDICTION_ONLY) {
+ const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+ const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
+ const int bit = vp9_read(r, fc->comp_ref_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++counts->comp_ref[ref_ctx][b];
- ref_frame[fix_ref_idx] = cm->comp_fixed_ref;
- ref_frame[!fix_ref_idx] = cm->comp_var_ref[b];
- } else {
+ ++counts->comp_ref[ctx][bit];
+ ref_frame[idx] = cm->comp_fixed_ref;
+ ref_frame[!idx] = cm->comp_var_ref[bit];
+ } else if (mode == SINGLE_PREDICTION_ONLY) {
const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
if (!cm->frame_parallel_decoding_mode)
@@ -299,14 +300,16 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
if (bit0) {
const int ctx1 = vp9_get_pred_context_single_ref_p2(xd);
const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]);
- ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
if (!cm->frame_parallel_decoding_mode)
++counts->single_ref[ctx1][1][bit1];
+ ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
} else {
ref_frame[0] = LAST_FRAME;
}
ref_frame[1] = NONE;
+ } else {
+ assert(!"Invalid prediction mode.");
}
}
}
diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c
index 4746a3a..3c4781b 100644
--- a/libvpx/vp9/decoder/vp9_decodframe.c
+++ b/libvpx/vp9/decoder/vp9_decodframe.c
@@ -42,6 +42,9 @@ typedef struct TileWorkerData {
vp9_reader bit_reader;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
+ DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]);
+ DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
+ DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
} TileWorkerData;
static int read_be32(const uint8_t *p) {
@@ -153,47 +156,38 @@ static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
}
-static void update_mv(vp9_reader *r, vp9_prob *p) {
- if (vp9_read(r, NMV_UPDATE_PROB))
- *p = (vp9_read_literal(r, 7) << 1) | 1;
+static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) {
+ int i;
+ for (i = 0; i < n; ++i)
+ if (vp9_read(r, NMV_UPDATE_PROB))
+ p[i] = (vp9_read_literal(r, 7) << 1) | 1;
}
-static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) {
- int i, j, k;
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) {
+ int i, j;
- for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(r, &mvc->joints[j]);
+ update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
for (i = 0; i < 2; ++i) {
- nmv_component *const comp = &mvc->comps[i];
-
- update_mv(r, &comp->sign);
-
- for (j = 0; j < MV_CLASSES - 1; ++j)
- update_mv(r, &comp->classes[j]);
-
- for (j = 0; j < CLASS0_SIZE - 1; ++j)
- update_mv(r, &comp->class0[j]);
-
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- update_mv(r, &comp->bits[j]);
+ nmv_component *const comp_ctx = &ctx->comps[i];
+ update_mv_probs(&comp_ctx->sign, 1, r);
+ update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r);
+ update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r);
+ update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r);
}
for (i = 0; i < 2; ++i) {
- nmv_component *const comp = &mvc->comps[i];
-
+ nmv_component *const comp_ctx = &ctx->comps[i];
for (j = 0; j < CLASS0_SIZE; ++j)
- for (k = 0; k < 3; ++k)
- update_mv(r, &comp->class0_fp[j][k]);
-
- for (j = 0; j < 3; ++j)
- update_mv(r, &comp->fp[j]);
+ update_mv_probs(comp_ctx->class0_fp[j], 3, r);
+ update_mv_probs(comp_ctx->fp, 3, r);
}
if (allow_hp) {
for (i = 0; i < 2; ++i) {
- update_mv(r, &mvc->comps[i].class0_hp);
- update_mv(r, &mvc->comps[i].hp);
+ nmv_component *const comp_ctx = &ctx->comps[i];
+ update_mv_probs(&comp_ctx->class0_hp, 1, r);
+ update_mv_probs(&comp_ctx->hp, 1, r);
}
}
}
@@ -209,20 +203,22 @@ static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
// Allocate storage for each tile column.
// TODO(jzern): when max_threads <= 1 the same storage could be used for each
// tile.
-static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) {
+static void alloc_tile_storage(VP9D_COMP *pbi, int tile_rows, int tile_cols) {
VP9_COMMON *const cm = &pbi->common;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
- int i, tile_col;
+ int i, tile_row, tile_col;
CHECK_MEM_ERROR(cm, pbi->mi_streams,
- vpx_realloc(pbi->mi_streams, tile_cols *
+ vpx_realloc(pbi->mi_streams, tile_rows * tile_cols *
sizeof(*pbi->mi_streams)));
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- TileInfo tile;
-
- vp9_tile_init(&tile, cm, 0, tile_col);
- pbi->mi_streams[tile_col] =
- &cm->mi[cm->mi_rows * tile.mi_col_start];
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileInfo tile;
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
+ pbi->mi_streams[tile_row * tile_cols + tile_col] =
+ &cm->mi[tile.mi_row_start * cm->mode_info_stride
+ + tile.mi_col_start];
+ }
}
// 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
@@ -246,31 +242,30 @@ static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) {
}
static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+ TX_SIZE tx_size, int x, int y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t* const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- const int stride = pd->dst.stride;
const int eob = pd->eobs[block];
if (eob > 0) {
TX_TYPE tx_type;
- const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
- block);
- uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
- pd->dst.buf, stride);
+ const int plane_type = pd->plane_type;
+ const int stride = pd->dst.stride;
+ int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *const dst = &pd->dst.buf[4 * y * stride + 4 * x];
+
switch (tx_size) {
case TX_4X4:
- tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
+ tx_type = get_tx_type_4x4(plane_type, xd, block);
if (tx_type == DCT_DCT)
xd->itxm_add(dqcoeff, dst, stride, eob);
else
vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type);
break;
case TX_8X8:
- tx_type = get_tx_type_8x8(pd->plane_type, xd);
+ tx_type = get_tx_type_8x8(plane_type, xd);
vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_16X16:
- tx_type = get_tx_type_16x16(pd->plane_type, xd);
+ tx_type = get_tx_type_16x16(plane_type, xd);
vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_32X32:
@@ -298,7 +293,7 @@ struct intra_args {
VP9_COMMON *cm;
MACROBLOCKD *xd;
vp9_reader *r;
- unsigned char* token_cache;
+ uint8_t *token_cache;
};
static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -307,29 +302,28 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
struct intra_args *const args = arg;
VP9_COMMON *const cm = args->cm;
MACROBLOCKD *const xd = args->xd;
-
struct macroblockd_plane *const pd = &xd->plane[plane];
MODE_INFO *const mi = xd->mi_8x8[0];
- const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
- block);
- uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
- pd->dst.buf, pd->dst.stride);
const MB_PREDICTION_MODE mode = (plane == 0)
- ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[raster_block].as_mode
- : mi->mbmi.mode)
- : mi->mbmi.uv_mode;
+ ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[block].as_mode
+ : mi->mbmi.mode)
+ : mi->mbmi.uv_mode;
+ int x, y;
+ uint8_t *dst;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+ dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
extend_for_intra(xd, plane_bsize, plane, block, tx_size);
- vp9_predict_intra_block(xd, raster_block >> tx_size,
+ vp9_predict_intra_block(xd, block >> (tx_size << 1),
b_width_log2(plane_bsize), tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
if (!mi->mbmi.skip_coeff) {
- vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
+ vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size,
args->r, args->token_cache);
- inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+ inverse_transform_block(xd, plane, block, tx_size, x, y);
}
}
@@ -338,7 +332,7 @@ struct inter_args {
MACROBLOCKD *xd;
vp9_reader *r;
int *eobtotal;
- unsigned char* token_cache;
+ uint8_t *token_cache;
};
static void reconstruct_inter_block(int plane, int block,
@@ -347,11 +341,13 @@ static void reconstruct_inter_block(int plane, int block,
struct inter_args *args = arg;
VP9_COMMON *const cm = args->cm;
MACROBLOCKD *const xd = args->xd;
+ int x, y;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
*args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
- plane_bsize, tx_size,
+ plane_bsize, x, y, tx_size,
args->r, args->token_cache);
- inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+ inverse_transform_block(xd, plane, block, tx_size, x, y);
}
static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -360,16 +356,15 @@ static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const int bh = num_8x8_blocks_high_lookup[bsize];
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int offset = mi_row * cm->mode_info_stride + mi_col;
-
- xd->mode_info_stride = cm->mode_info_stride;
+ const int tile_offset = tile->mi_row_start * cm->mode_info_stride +
+ tile->mi_col_start;
xd->mi_8x8 = cm->mi_grid_visible + offset;
xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset;
// we are using the mode info context stream here
- xd->mi_8x8[0] = xd->mi_stream;
+ xd->mi_8x8[0] = xd->mi_stream + offset - tile_offset;
xd->mi_8x8[0]->mbmi.sb_type = bsize;
- ++xd->mi_stream;
// Special case: if prev_mi is NULL, the previous mode info context
// cannot be used.
@@ -403,7 +398,7 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE bsize,
- unsigned char *token_cache) {
+ uint8_t *token_cache) {
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi;
@@ -425,7 +420,9 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
}
if (!is_inter_block(mbmi)) {
- struct intra_args arg = { cm, xd, r, token_cache };
+ struct intra_args arg = {
+ cm, xd, r, token_cache
+ };
foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
&arg);
} else {
@@ -443,7 +440,9 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
// Reconstruction
if (!mbmi->skip_coeff) {
int eobtotal = 0;
- struct inter_args arg = { cm, xd, r, &eobtotal, token_cache };
+ struct inter_args arg = {
+ cm, xd, r, &eobtotal, token_cache
+ };
foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
if (!less8x8 && eobtotal == 0)
mbmi->skip_coeff = 1; // skip loopfilter
@@ -483,7 +482,7 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader* r, BLOCK_SIZE bsize,
- unsigned char *token_cache) {
+ uint8_t *token_cache) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
@@ -705,20 +704,19 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
VP9_COMMON *cm = &pbi->common;
if (cm->width != width || cm->height != height) {
- if (!pbi->initial_width || !pbi->initial_height) {
- if (vp9_alloc_frame_buffers(cm, width, height))
+ // Change in frame size.
+ if (cm->width == 0 || cm->height == 0) {
+ // Assign new frame buffer on first call.
+ cm->new_fb_idx = NUM_YV12_BUFFERS - 1;
+ cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1;
+ }
+
+ // TODO(agrange) Don't test width/height, check overall size.
+ if (width > cm->width || height > cm->height) {
+ // Rescale frame buffers only if they're not big enough already.
+ if (vp9_resize_frame_buffers(cm, width, height))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffers");
- pbi->initial_width = width;
- pbi->initial_height = height;
- } else {
- if (width > pbi->initial_width)
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Frame width too large");
-
- if (height > pbi->initial_height)
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Frame height too large");
}
cm->width = width;
@@ -768,9 +766,10 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
}
static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd,
- int tile_col) {
+ int tile_row, int tile_col) {
int i;
- xd->mi_stream = pbi->mi_streams[tile_col];
+ const int tile_cols = 1 << pbi->common.log2_tile_cols;
+ xd->mi_stream = pbi->mi_streams[tile_row * tile_cols + tile_col];
for (i = 0; i < MAX_MB_PLANE; ++i) {
xd->above_context[i] = pbi->above_context[i];
@@ -802,9 +801,10 @@ static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile,
vp9_zero(xd->left_context);
vp9_zero(xd->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
- mi_col += MI_BLOCK_SIZE)
+ mi_col += MI_BLOCK_SIZE) {
decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
pbi->token_cache);
+ }
if (pbi->do_loopfilter_inline) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -874,81 +874,85 @@ static size_t get_tile(const uint8_t *const data_end,
return size;
}
-static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
- vp9_reader residual_bc;
+typedef struct TileBuffer {
+ const uint8_t *data;
+ size_t size;
+} TileBuffer;
+static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
-
- const uint8_t *const data_end = pbi->source + pbi->source_sz;
- const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
+ TileBuffer tile_buffers[4][1 << 6];
int tile_row, tile_col;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
+ const uint8_t *end = NULL;
+ vp9_reader r;
+
+ assert(tile_rows <= 4);
+ assert(tile_cols <= (1 << 6));
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
vpx_memset(pbi->above_context[0], 0,
- sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
- 2 * aligned_mi_cols);
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * 2 * aligned_cols);
vpx_memset(pbi->above_seg_context, 0,
- sizeof(*pbi->above_seg_context) * aligned_mi_cols);
-
- if (pbi->oxcf.inv_tile_order) {
- const uint8_t *data_ptr2[4][1 << 6];
- vp9_reader bc_bak = {0};
-
- // pre-initialize the offsets, we're going to decode in inverse order
- data_ptr2[0][0] = data;
- for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- const int last_tile =
- tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
- const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
- data_ptr2[tile_row][tile_col] = data;
- data += size;
- }
+ sizeof(*pbi->above_seg_context) * aligned_cols);
+
+ // Load tile data into tile_buffers
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const int last_tile = tile_row == tile_rows - 1 &&
+ tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+ TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
+ buf->data = data;
+ buf->size = size;
+ data += size;
}
+ }
- for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) {
- TileInfo tile;
-
- vp9_tile_init(&tile, cm, tile_row, tile_col);
- setup_token_decoder(data_ptr2[tile_row][tile_col], data_end,
- data_end - data_ptr2[tile_row][tile_col],
- &cm->error, &residual_bc);
- setup_tile_context(pbi, xd, tile_col);
- decode_tile(pbi, &tile, &residual_bc);
- if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1)
- bc_bak = residual_bc;
- }
- }
- residual_bc = bc_bak;
- } else {
- for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- const int last_tile =
- tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
- const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
- TileInfo tile;
-
- vp9_tile_init(&tile, cm, tile_row, tile_col);
-
- setup_token_decoder(data, data_end, size, &cm->error, &residual_bc);
- setup_tile_context(pbi, xd, tile_col);
- decode_tile(pbi, &tile, &residual_bc);
- data += size;
- }
+ // Decode tiles using data from tile_buffers
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const int col = pbi->oxcf.inv_tile_order ? tile_cols - tile_col - 1
+ : tile_col;
+ const int last_tile = tile_row == tile_rows - 1 &&
+ col == tile_cols - 1;
+ const TileBuffer *const buf = &tile_buffers[tile_row][col];
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, tile_row, col);
+ setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &r);
+ setup_tile_context(pbi, xd, tile_row, col);
+ decode_tile(pbi, &tile, &r);
+
+ if (last_tile)
+ end = vp9_reader_find_end(&r);
}
}
- return vp9_reader_find_end(&residual_bc);
+ return end;
+}
+
+static void setup_tile_macroblockd(TileWorkerData *const tile_data) {
+ MACROBLOCKD *xd = &tile_data->xd;
+ struct macroblockd_plane *const pd = xd->plane;
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ pd[i].qcoeff = tile_data->qcoeff[i];
+ pd[i].dqcoeff = tile_data->dqcoeff[i];
+ pd[i].eobs = tile_data->eobs[i];
+ vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t));
+ }
}
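
Note: decode_tiles now separates parsing tile sizes from decoding them. A first pass records each tile's data pointer and size in the statically sized tile_buffers array (hence the asserts that tile_rows <= 4 and tile_cols <= 64), and a second pass decodes the tiles, handling inv_tile_order by simply walking the column index in reverse. A rough sketch of the two-pass shape under those assumptions; record_tile() and decode_one_tile() are hypothetical placeholders, not library calls.

typedef struct { const unsigned char *data; unsigned long size; } TileBuf;

/* Sketch, not the library code. */
static void decode_all_tiles(TileBuf buf[4][64], int tile_rows, int tile_cols,
                             int inverse_order) {
  int r, c;
  for (r = 0; r < tile_rows; ++r)
    for (c = 0; c < tile_cols; ++c)
      record_tile(&buf[r][c]);                 /* pass 1: note data/size only */

  for (r = 0; r < tile_rows; ++r)
    for (c = 0; c < tile_cols; ++c) {
      const int col = inverse_order ? tile_cols - c - 1 : c;
      decode_one_tile(&buf[r][col], r, col);   /* pass 2: decode in order */
    }
}
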
static int tile_worker_hook(void *arg1, void *arg2) {
- TileWorkerData *tile_data = (TileWorkerData*)arg1;
+ TileWorkerData *const tile_data = (TileWorkerData*)arg1;
const TileInfo *const tile = (TileInfo*)arg2;
int mi_row, mi_col;
@@ -1023,7 +1027,8 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
setup_token_decoder(data, data_end, size, &cm->error,
&tile_data->bit_reader);
- setup_tile_context(pbi, &tile_data->xd, tile_col);
+ setup_tile_context(pbi, &tile_data->xd, 0, tile_col);
+ setup_tile_macroblockd(tile_data);
worker->had_error = 0;
if (i == num_workers - 1 || tile_col == tile_cols - 1) {
@@ -1227,7 +1232,7 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
for (i = 0; i < PARTITION_TYPES - 1; ++i)
vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
- read_mv_probs(&r, nmvc, cm->allow_high_precision_mv);
+ read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
}
return vp9_reader_has_error(&r);
@@ -1323,9 +1328,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
}
}
- alloc_tile_storage(pbi, tile_cols);
+ alloc_tile_storage(pbi, tile_rows, tile_cols);
- xd->mi_8x8 = cm->mi_grid_visible;
xd->mode_info_stride = cm->mode_info_stride;
set_prev_mi(cm);
@@ -1335,7 +1339,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
cm->fc = cm->frame_contexts[cm->frame_context_idx];
vp9_zero(cm->counts);
for (i = 0; i < MAX_MB_PLANE; ++i)
- vp9_zero(xd->plane[i].dqcoeff);
+ vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t));
xd->corrupted = 0;
new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
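
Note: the per-plane dqcoeff buffers now live in the decoder and the plane structs merely point at them (see the VP9Decompressor and init_macroblockd hunks below), so clearing them must spell out the buffer size. Assuming vp9_zero is the usual sizeof-based memset macro, applying it to a pointer would clear only the pointer itself, not the 64*64 coefficients it points at. An illustrative fragment of the distinction:

#include <string.h>
#include <stdint.h>

int16_t dqcoeff_array[64 * 64];
int16_t *dqcoeff_ptr = dqcoeff_array;

memset(&dqcoeff_array, 0, sizeof(dqcoeff_array));     /* clears all 8192 bytes   */
memset(&dqcoeff_ptr, 0, sizeof(dqcoeff_ptr));         /* clears only the pointer */
memset(dqcoeff_ptr, 0, 64 * 64 * sizeof(int16_t));    /* what the new code does  */
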
diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c
index b8d670b..05a2b42 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/libvpx/vp9/decoder/vp9_detokenize.c
@@ -67,27 +67,27 @@ static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN
};
-#define INCREMENT_COUNT(token) \
- do { \
- if (!cm->frame_parallel_decoding_mode) { \
+#define INCREMENT_COUNT(token) \
+ do { \
+ if (!cm->frame_parallel_decoding_mode) \
++coef_counts[band][pt][token_to_counttoken[token]]; \
- } \
- } while (0);
+ } while (0)
+
#define WRITE_COEF_CONTINUE(val, token) \
{ \
- dqcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
+ dqcoeff_ptr[scan[c]] = (vp9_read_bit(r) ? -val : val) * \
dq[c > 0] / (1 + (tx_size == TX_32X32)); \
INCREMENT_COUNT(token); \
token_cache[scan[c]] = vp9_pt_energy_class[token]; \
- c++; \
+ ++c; \
continue; \
}
#define ADJUST_COEF(prob, bits_count) \
do { \
val += (vp9_read(r, prob) << bits_count); \
- } while (0);
+ } while (0)
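
Note: the macro cleanup above is the standard do { ... } while (0) idiom. The trailing semicolon is removed from the macro body so that the semicolon comes from the call site, which keeps the macro usable as a single statement inside an if/else. A quick illustration of the hazard the old form created; the macro and variable names here are made up.

int count = 0, misses = 0, ready = 1;

#define STEP()  do { ++count; } while (0)   /* caller supplies the ';' */

if (ready)
  STEP();        /* expands to one statement plus the caller's ';'        */
else
  ++misses;      /* with "while (0);" inside the macro, the extra empty
                    statement would orphan this else and break the build  */
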
static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_reader *r, int block_idx,
@@ -108,31 +108,31 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] =
counts->eob_branch[tx_size][type][ref];
const int16_t *scan, *nb;
- const uint8_t *const band_translate = get_band_translate(tx_size);
+ const uint8_t *cat6;
+ const uint8_t *band_translate = get_band_translate(tx_size);
get_scan(xd, tx_size, type, block_idx, &scan, &nb);
- while (1) {
+ while (c < seg_eob) {
int val;
- const uint8_t *cat6 = cat6_prob;
- if (c >= seg_eob)
- break;
if (c)
pt = get_coef_context(nb, token_cache, c);
- band = get_coef_band(band_translate, c);
+ band = *band_translate++;
prob = coef_probs[band][pt];
if (!cm->frame_parallel_decoding_mode)
++eob_branch_count[band][pt];
if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
break;
+ goto DECODE_ZERO;
SKIP_START:
if (c >= seg_eob)
break;
if (c)
pt = get_coef_context(nb, token_cache, c);
- band = get_coef_band(band_translate, c);
+ band = *band_translate++;
prob = coef_probs[band][pt];
+ DECODE_ZERO:
if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN];
@@ -200,10 +200,11 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY5);
}
val = 0;
- while (*cat6) {
+ cat6 = cat6_prob;
+ while (*cat6)
val = (val << 1) | vp9_read(r, *cat6++);
- }
val += CAT6_MIN_VAL;
+
WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
}
@@ -217,22 +218,17 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, vp9_reader *r,
+ int x, int y, TX_SIZE tx_size, vp9_reader *r,
uint8_t *token_cache) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
tx_size);
- int aoff, loff, eob, pt;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
- pt = get_entropy_context(tx_size, pd->above_context + aoff,
- pd->left_context + loff);
-
- eob = decode_coefs(cm, xd, r, block,
- pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block),
- tx_size, pd->dequant, pt, token_cache);
-
- set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
-
+ const int pt = get_entropy_context(tx_size, pd->above_context + x,
+ pd->left_context + y);
+ const int eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob,
+ BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
+ pd->dequant, pt, token_cache);
+ set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
pd->eobs[block] = eob;
return eob;
}
diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h
index 04939ea..e858a19 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.h
+++ b/libvpx/vp9/decoder/vp9_detokenize.h
@@ -17,7 +17,7 @@
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, vp9_reader *r,
+ int x, int y, TX_SIZE tx_size, vp9_reader *r,
uint8_t *token_cache);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c
index 5f970a3..cb45d37 100644
--- a/libvpx/vp9/decoder/vp9_onyxd_if.c
+++ b/libvpx/vp9/decoder/vp9_onyxd_if.c
@@ -107,6 +107,18 @@ void vp9_initialize_dec() {
}
}
+static void init_macroblockd(VP9D_COMP *const pbi) {
+ MACROBLOCKD *xd = &pbi->mb;
+ struct macroblockd_plane *const pd = xd->plane;
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ pd[i].qcoeff = pbi->qcoeff[i];
+ pd[i].dqcoeff = pbi->dqcoeff[i];
+ pd[i].eobs = pbi->eobs[i];
+ }
+}
+
VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP));
VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
@@ -141,6 +153,8 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
cm->error.setjmp = 0;
pbi->decoded_key_frame = 0;
+ init_macroblockd(pbi);
+
vp9_worker_init(&pbi->lf_worker);
return pbi;
diff --git a/libvpx/vp9/decoder/vp9_onyxd_int.h b/libvpx/vp9/decoder/vp9_onyxd_int.h
index 7c4c9db..d3d29e9 100644
--- a/libvpx/vp9/decoder/vp9_onyxd_int.h
+++ b/libvpx/vp9/decoder/vp9_onyxd_int.h
@@ -22,6 +22,10 @@ typedef struct VP9Decompressor {
DECLARE_ALIGNED(16, VP9_COMMON, common);
+ DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]);
+ DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
+ DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
+
VP9D_CONFIG oxcf;
const uint8_t *source;
@@ -50,7 +54,7 @@ typedef struct VP9Decompressor {
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
PARTITION_CONTEXT *above_seg_context;
- DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
+ DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
} VP9D_COMP;
#endif // VP9_DECODER_VP9_ONYXD_INT_H_
diff --git a/libvpx/vp9/decoder/vp9_treereader.h b/libvpx/vp9/decoder/vp9_treereader.h
index f612497..41680d2 100644
--- a/libvpx/vp9/decoder/vp9_treereader.h
+++ b/libvpx/vp9/decoder/vp9_treereader.h
@@ -15,8 +15,6 @@
#include "vp9/common/vp9_treecoder.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-#define vp9_read_and_apply_sign(r, value) (vp9_read_bit(r) ? -(value) : (value))
-
// Intent of tree data structure is to make decoding trivial.
static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
vp9_tree t,
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index 87bd36c..efbadba 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -169,10 +169,8 @@ static void update_mode(vp9_writer *w, int n, vp9_tree tree,
const unsigned int num_events[/* n */]) {
int i = 0;
- vp9_tree_probs_from_distribution(tree, bct, num_events, 0);
- n--;
-
- for (i = 0; i < n; ++i)
+ vp9_tree_probs_from_distribution(tree, bct, num_events);
+ for (i = 0; i < n - 1; ++i)
vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]);
}
@@ -191,12 +189,14 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi,
static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m,
TX_SIZE tx_size, BLOCK_SIZE bsize,
vp9_writer *w) {
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs, m);
+ const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
+ &cpi->common.fc.tx_probs);
vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
- if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) {
+ if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
- if (bsize >= BLOCK_32X32 && tx_size != TX_8X8)
+ if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
}
}
@@ -231,7 +231,7 @@ static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
int i, j;
for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct,
- cm->counts.switchable_interp[j], 0);
+ cm->counts.switchable_interp[j]);
for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i],
@@ -250,7 +250,7 @@ static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) {
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
unsigned int branch_ct[INTER_MODES - 1][2];
vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct,
- cm->counts.inter_mode[i], NEARESTMV);
+ cm->counts.inter_mode[i]);
for (j = 0; j < INTER_MODES - 1; ++j)
vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j],
@@ -258,15 +258,15 @@ static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) {
}
}
-static void pack_mb_tokens(vp9_writer* const bc,
+static void pack_mb_tokens(vp9_writer* const w,
TOKENEXTRA **tp,
const TOKENEXTRA *const stop) {
TOKENEXTRA *p = *tp;
while (p < stop && p->token != EOSB_TOKEN) {
const int t = p->token;
- const struct vp9_token *const a = vp9_coef_encodings + t;
- const vp9_extra_bit *const b = vp9_extra_bits + t;
+ const struct vp9_token *const a = &vp9_coef_encodings[t];
+ const vp9_extra_bit *const b = &vp9_extra_bits[t];
int i = 0;
const vp9_prob *pp;
int v = a->value;
@@ -289,7 +289,7 @@ static void pack_mb_tokens(vp9_writer* const bc,
do {
const int bb = (v >> --n) & 1;
- vp9_write(bc, bb, pp[i >> 1]);
+ vp9_write(w, bb, pp[i >> 1]);
i = vp9_coef_tree[i + bb];
} while (n);
@@ -304,12 +304,12 @@ static void pack_mb_tokens(vp9_writer* const bc,
do {
const int bb = (v >> --n) & 1;
- vp9_write(bc, bb, pb[i >> 1]);
+ vp9_write(w, bb, pb[i >> 1]);
i = b->tree[i + bb];
} while (n);
}
- vp9_write_bit(bc, e & 1);
+ vp9_write_bit(w, e & 1);
}
++p;
}
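
Note: pack_mb_tokens writes each token by walking a probability tree one arithmetic-coded bit at a time, as in the loops above. A generic sketch of that pattern; the types and the write_bit callback are illustrative, not the library's interfaces.

/* Sketch: emit the top 'len' bits of 'value', most significant first,
 * coding each bit under the probability of the current tree node and
 * stepping to the child selected by that bit. */
static void write_tree_bits(void (*write_bit)(int bit, unsigned char prob),
                            const unsigned char *probs, const int *tree,
                            int value, int len) {
  int i = 0;
  while (len) {
    const int bit = (value >> --len) & 1;
    write_bit(bit, probs[i >> 1]);
    i = tree[i + bit];
  }
}
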
@@ -321,7 +321,7 @@ static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode,
const vp9_prob *p) {
assert(is_inter_mode(mode));
write_token(w, vp9_inter_mode_tree, p,
- &vp9_inter_mode_encodings[inter_mode_offset(mode)]);
+ &vp9_inter_mode_encodings[INTER_OFFSET(mode)]);
}
@@ -448,7 +448,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
if (bsize >= BLOCK_8X8) {
write_sb_mv_ref(bc, mode, mv_ref_p);
++cm->counts.inter_mode[mi->mode_context[rf]]
- [inter_mode_offset(mode)];
+ [INTER_OFFSET(mode)];
}
}
@@ -471,7 +471,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
write_sb_mv_ref(bc, blockmode, mv_ref_p);
++cm->counts.inter_mode[mi->mode_context[rf]]
- [inter_mode_offset(blockmode)];
+ [INTER_OFFSET(blockmode)];
if (blockmode == NEWMV) {
#ifdef ENTROPY_STATS
@@ -545,37 +545,33 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO **mi_8x8, vp9_writer *bc,
- TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col, int index) {
+ vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+ int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MODE_INFO *m = mi_8x8[0];
-
- if (m->mbmi.sb_type < BLOCK_8X8)
- if (index > 0)
- return;
+ MODE_INFO *m;
- xd->mi_8x8 = mi_8x8;
+ xd->mi_8x8 = cm->mi_grid_visible + (mi_row * cm->mode_info_stride + mi_col);
+ m = xd->mi_8x8[0];
set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
cm->mi_rows, cm->mi_cols);
if (frame_is_intra_only(cm)) {
- write_mb_modes_kf(cpi, mi_8x8, bc);
+ write_mb_modes_kf(cpi, xd->mi_8x8, w);
#ifdef ENTROPY_STATS
active_section = 8;
#endif
} else {
- pack_inter_mode_mvs(cpi, m, bc);
+ pack_inter_mode_mvs(cpi, m, w);
#ifdef ENTROPY_STATS
active_section = 1;
#endif
}
assert(*tok < tok_end);
- pack_mb_tokens(bc, tok, tok_end);
+ pack_mb_tokens(w, tok, tok_end);
}
static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
@@ -602,59 +598,50 @@ static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
}
static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO **mi_8x8, vp9_writer *bc,
- TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col, BLOCK_SIZE bsize,
- int index) {
+ vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
- const int mis = cm->mode_info_stride;
- int bsl = b_width_log2(bsize);
- int bs = (1 << bsl) / 4; // mode_info step for subsize
- int n;
- PARTITION_TYPE partition = PARTITION_NONE;
+ const int bsl = b_width_log2(bsize);
+ const int bs = (1 << bsl) / 4;
+ PARTITION_TYPE partition;
BLOCK_SIZE subsize;
- MODE_INFO *m = mi_8x8[0];
+ MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mode_info_stride + mi_col];
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
partition = partition_lookup[bsl][m->mbmi.sb_type];
-
- if (bsize < BLOCK_8X8) {
- if (index > 0)
- return;
- } else {
- write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc);
- }
-
+ write_partition(cpi, bs, mi_row, mi_col, partition, bsize, w);
subsize = get_subsize(bsize, partition);
-
- switch (partition) {
- case PARTITION_NONE:
- write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
- break;
- case PARTITION_HORZ:
- write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
- if ((mi_row + bs) < cm->mi_rows)
- write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end,
- mi_row + bs, mi_col, 1);
- break;
- case PARTITION_VERT:
- write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
- if ((mi_col + bs) < cm->mi_cols)
- write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end,
- mi_row, mi_col + bs, 1);
- break;
- case PARTITION_SPLIT:
- for (n = 0; n < 4; n++) {
- const int j = n >> 1, i = n & 1;
- write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc,
- tok, tok_end,
- mi_row + j * bs, mi_col + i * bs, subsize, n);
- }
- break;
- default:
- assert(0);
+ if (subsize < BLOCK_8X8) {
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_row + bs < cm->mi_rows)
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+ break;
+ case PARTITION_VERT:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_col + bs < cm->mi_cols)
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+ break;
+ case PARTITION_SPLIT:
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
+ subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
+ subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
+ subsize);
+ break;
+ default:
+ assert(0);
+ }
}
// update partition context
@@ -665,25 +652,15 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
}
static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
- vp9_writer* const bc,
- TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
- VP9_COMMON *const cm = &cpi->common;
- const int mis = cm->mode_info_stride;
+ vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
int mi_row, mi_col;
- MODE_INFO **mi_8x8 = cm->mi_grid_visible;
- MODE_INFO **m_8x8;
-
- mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis;
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
- mi_row += 8, mi_8x8 += 8 * mis) {
- m_8x8 = mi_8x8;
- vp9_zero(cpi->left_seg_context);
+ mi_row += MI_BLOCK_SIZE) {
+ vp9_zero(cpi->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
- mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
- write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col,
- BLOCK_64X64, 0);
- }
+ mi_col += MI_BLOCK_SIZE)
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
}
}
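
Note: write_modes_sb now reads its MODE_INFO straight from cm->mi_grid_visible and recurses into the four quadrants for PARTITION_SPLIT, rather than threading mi_8x8 pointers and a sub-block index through every call; blocks below 8x8 are written once as a single unit. A minimal sketch of the quadtree walk; names are illustrative, and frame-boundary and partition-context handling are omitted.

typedef enum { P_NONE, P_HORZ, P_VERT, P_SPLIT } Part;

static void walk(int r, int c, int bs /* block size in 8x8 MI units */,
                 Part (*part_of)(int r, int c, int bs),
                 void (*leaf)(int r, int c)) {
  const int hbs = bs / 2;
  switch (part_of(r, c, bs)) {
    case P_NONE:  leaf(r, c); break;
    case P_HORZ:  leaf(r, c); leaf(r + hbs, c); break;
    case P_VERT:  leaf(r, c); leaf(r, c + hbs); break;
    case P_SPLIT:
      walk(r, c, hbs, part_of, leaf);
      walk(r, c + hbs, hbs, part_of, leaf);
      walk(r + hbs, c, hbs, part_of, leaf);
      walk(r + hbs, c + hbs, hbs, part_of, leaf);
      break;
  }
}
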
@@ -703,7 +680,7 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
continue;
vp9_tree_probs_from_distribution(vp9_coef_tree,
coef_branch_ct[i][j][k][l],
- coef_counts[i][j][k][l], 0);
+ coef_counts[i][j][k][l]);
coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
coef_branch_ct[i][j][k][l][0][0];
for (m = 0; m < UNCONSTRAINED_NODES; ++m)
@@ -1217,7 +1194,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
TileInfo tile;
- vp9_tile_init(&tile, cm, 0, tile_col);
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h
index 8033a4d..4445970 100644
--- a/libvpx/vp9/encoder/vp9_block.h
+++ b/libvpx/vp9/encoder/vp9_block.h
@@ -27,6 +27,18 @@ typedef struct {
typedef struct {
MODE_INFO mic;
uint8_t *zcoeff_blk;
+ int16_t *coeff[MAX_MB_PLANE][3];
+ int16_t *qcoeff[MAX_MB_PLANE][3];
+ int16_t *dqcoeff[MAX_MB_PLANE][3];
+ uint16_t *eobs[MAX_MB_PLANE][3];
+
+ // dual buffer pointers, 0: in use, 1: best in store
+ int16_t *coeff_pbuf[MAX_MB_PLANE][3];
+ int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+ int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+ uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
+
+ int is_coded;
int num_4x4_blk;
int skip;
int_mv best_ref_mv;
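
Note: each PICK_MODE_CONTEXT now carries three sets of per-plane coefficient buffers plus matching *_pbuf pointers, so the encoder can point the working macroblock planes at set 0 while searching, keep the chosen mode's results in set 1, and point the remaining planes at set 2, instead of copying coefficients around (this is how pick_sb_modes and update_state use them further down). A rough sketch of the pointer swap, with illustrative names; the struct fields are the ones added in this hunk.

/* Sketch only: IN_USE / BEST / SPARE correspond to pbuf indices 0, 1, 2. */
enum { IN_USE = 0, BEST = 1, SPARE = 2 };

static void point_plane_at(struct macroblock_plane *p,
                           struct macroblockd_plane *pd,
                           PICK_MODE_CONTEXT *ctx, int plane, int which) {
  p[plane].coeff    = ctx->coeff_pbuf[plane][which];
  pd[plane].qcoeff  = ctx->qcoeff_pbuf[plane][which];
  pd[plane].dqcoeff = ctx->dqcoeff_pbuf[plane][which];
  pd[plane].eobs    = ctx->eobs_pbuf[plane][which];
}
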
@@ -57,7 +69,7 @@ typedef struct {
struct macroblock_plane {
DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
- DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]);
+ int16_t *coeff;
struct buf_2d src;
  // Quantizer settings
@@ -81,6 +93,10 @@ struct macroblock {
MACROBLOCKD e_mbd;
int skip_block;
+ int select_txfm_size;
+ int skip_recode;
+ int skip_optimize;
+ int q_index;
search_site *ss;
int ss_count;
@@ -120,6 +136,11 @@ struct macroblock {
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+ unsigned char sb_index; // index of 32x32 block inside the 64x64 block
+ unsigned char mb_index; // index of 16x16 block inside the 32x32 block
+ unsigned char b_index; // index of 8x8 block inside the 16x16 block
+ unsigned char ab_index; // index of 4x4 block inside the 8x8 block
+
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
int mv_col_min;
@@ -179,35 +200,33 @@ struct macroblock {
// refactoring on organizing the temporary buffers, when recursive
// partition down to 4x4 block size is enabled.
static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
-
switch (bsize) {
case BLOCK_64X64:
return &x->sb64_context;
case BLOCK_64X32:
- return &x->sb64x32_context[xd->sb_index];
+ return &x->sb64x32_context[x->sb_index];
case BLOCK_32X64:
- return &x->sb32x64_context[xd->sb_index];
+ return &x->sb32x64_context[x->sb_index];
case BLOCK_32X32:
- return &x->sb32_context[xd->sb_index];
+ return &x->sb32_context[x->sb_index];
case BLOCK_32X16:
- return &x->sb32x16_context[xd->sb_index][xd->mb_index];
+ return &x->sb32x16_context[x->sb_index][x->mb_index];
case BLOCK_16X32:
- return &x->sb16x32_context[xd->sb_index][xd->mb_index];
+ return &x->sb16x32_context[x->sb_index][x->mb_index];
case BLOCK_16X16:
- return &x->mb_context[xd->sb_index][xd->mb_index];
+ return &x->mb_context[x->sb_index][x->mb_index];
case BLOCK_16X8:
- return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb16x8_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_8X16:
- return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb8x16_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_8X8:
- return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb8x8_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_8X4:
- return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb8x4_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_4X8:
- return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb4x8_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_4X4:
- return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->ab4x4_context[x->sb_index][x->mb_index][x->b_index];
default:
assert(0);
return NULL;
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index a45299b..3e75f3b 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -50,25 +50,25 @@
int enc_debug = 0;
#endif
-static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
+static INLINE uint8_t *get_sb_index(MACROBLOCK *x, BLOCK_SIZE subsize) {
switch (subsize) {
case BLOCK_64X64:
case BLOCK_64X32:
case BLOCK_32X64:
case BLOCK_32X32:
- return &xd->sb_index;
+ return &x->sb_index;
case BLOCK_32X16:
case BLOCK_16X32:
case BLOCK_16X16:
- return &xd->mb_index;
+ return &x->mb_index;
case BLOCK_16X8:
case BLOCK_8X16:
case BLOCK_8X8:
- return &xd->b_index;
+ return &x->b_index;
case BLOCK_8X4:
case BLOCK_4X8:
case BLOCK_4X4:
- return &xd->ab_index;
+ return &x->ab_index;
default:
assert(0);
return NULL;
@@ -367,6 +367,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
MODE_INFO *mi = &ctx->mic;
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
MODE_INFO *mi_addr = xd->mi_8x8[0];
@@ -375,6 +377,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int max_plane;
assert(mi->mbmi.mode < MB_MODE_COUNT);
assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
@@ -383,6 +386,21 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
*mi_addr = *mi;
+ max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
+ for (i = 0; i < max_plane; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][1];
+ pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+ pd[i].eobs = ctx->eobs_pbuf[i][1];
+ }
+
+ for (i = max_plane; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][2];
+ pd[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+ pd[i].eobs = ctx->eobs_pbuf[i][2];
+ }
+
// Restore the coding context of the MB to that that was in place
// when the mode was picked for it
for (y = 0; y < mi_height; y++)
@@ -578,6 +596,9 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ int i;
int orig_rdmult = x->rdmult;
double rdmult_ratio;
@@ -590,7 +611,7 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
- if (xd->ab_index != 0) {
+ if (x->ab_index != 0) {
*totalrate = 0;
*totaldist = 0;
return;
@@ -600,6 +621,15 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
set_offsets(cpi, tile, mi_row, mi_col, bsize);
xd->mi_8x8[0]->mbmi.sb_type = bsize;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][0];
+ pd[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+ pd[i].eobs = ctx->eobs_pbuf[i][0];
+ }
+ ctx->is_coded = 0;
+ x->skip_recode = 0;
+
// Set to zero to make sure we do not use the previous encoded frame stats
xd->mi_8x8[0]->mbmi.skip_coeff = 0;
@@ -687,16 +717,15 @@ static void update_stats(VP9_COMP *cpi) {
}
static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
switch (bsize) {
case BLOCK_64X64:
return &x->sb64_partitioning;
case BLOCK_32X32:
- return &x->sb_partitioning[xd->sb_index];
+ return &x->sb_partitioning[x->sb_index];
case BLOCK_16X16:
- return &x->mb_partitioning[xd->sb_index][xd->mb_index];
+ return &x->mb_partitioning[x->sb_index][x->mb_index];
case BLOCK_8X8:
- return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->b_partitioning[x->sb_index][x->mb_index][x->b_index];
default:
assert(0);
return NULL;
@@ -769,20 +798,19 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize, int sub_index) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
if (sub_index != -1)
- *get_sb_index(xd, bsize) = sub_index;
+ *get_sb_index(x, bsize) = sub_index;
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
- if (xd->ab_index > 0)
+ if (x->ab_index > 0)
return;
}
set_offsets(cpi, tile, mi_row, mi_col, bsize);
@@ -800,9 +828,8 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
BLOCK_SIZE c1 = BLOCK_8X8;
const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
int pl = 0;
@@ -848,7 +875,7 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
for (i = 0; i < 4; i++) {
const int x_idx = i & 1, y_idx = i >> 1;
- *get_sb_index(xd, subsize) = i;
+ *get_sb_index(x, subsize) = i;
encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
output_enabled, subsize);
}
@@ -975,9 +1002,8 @@ static void rd_use_partition(VP9_COMP *cpi,
TOKENEXTRA **tp, int mi_row, int mi_col,
BLOCK_SIZE bsize, int *rate, int64_t *dist,
int do_recon) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
const int mis = cm->mode_info_stride;
int bsl = b_width_log2(bsize);
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -1012,7 +1038,7 @@ static void rd_use_partition(VP9_COMP *cpi,
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
- if (xd->ab_index != 0) {
+ if (x->ab_index != 0) {
*rate = 0;
*dist = 0;
return;
@@ -1070,7 +1096,7 @@ static void rd_use_partition(VP9_COMP *cpi,
bsize, get_block_context(x, bsize), INT64_MAX);
break;
case PARTITION_HORZ:
- *get_sb_index(xd, subsize) = 0;
+ *get_sb_index(x, subsize) = 0;
pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
@@ -1079,7 +1105,7 @@ static void rd_use_partition(VP9_COMP *cpi,
int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *get_sb_index(xd, subsize) = 1;
+ *get_sb_index(x, subsize) = 1;
pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1093,7 +1119,7 @@ static void rd_use_partition(VP9_COMP *cpi,
}
break;
case PARTITION_VERT:
- *get_sb_index(xd, subsize) = 0;
+ *get_sb_index(x, subsize) = 0;
pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
@@ -1102,7 +1128,7 @@ static void rd_use_partition(VP9_COMP *cpi,
int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *get_sb_index(xd, subsize) = 1;
+ *get_sb_index(x, subsize) = 1;
pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1128,7 +1154,7 @@ static void rd_use_partition(VP9_COMP *cpi,
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
- *get_sb_index(xd, subsize) = i;
+ *get_sb_index(x, subsize) = i;
rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
@@ -1169,11 +1195,10 @@ static void rd_use_partition(VP9_COMP *cpi,
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
- if ((mi_row + y_idx >= cm->mi_rows)
- || (mi_col + x_idx >= cm->mi_cols))
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
- *get_sb_index(xd, split_subsize) = i;
+ *get_sb_index(x, split_subsize) = i;
*get_sb_partitioning(x, bsize) = split_subsize;
*get_sb_partitioning(x, split_subsize) = split_subsize;
@@ -1353,7 +1378,6 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
// Only use 8x8 result for non HD videos.
// int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0;
@@ -1366,9 +1390,9 @@ static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
PICK_MODE_CONTEXT *block_context = NULL;
if (bsize == BLOCK_16X16) {
- block_context = x->sb8x8_context[xd->sb_index][xd->mb_index];
+ block_context = x->sb8x8_context[x->sb_index][x->mb_index];
} else if (bsize == BLOCK_32X32) {
- block_context = x->mb_context[xd->sb_index];
+ block_context = x->mb_context[x->sb_index];
} else if (bsize == BLOCK_64X64) {
block_context = x->sb32_context;
}
@@ -1456,9 +1480,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row,
int mi_col, BLOCK_SIZE bsize, int *rate,
int64_t *dist, int do_recon, int64_t best_rd) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
@@ -1484,7 +1507,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
- if (xd->ab_index != 0) {
+ if (x->ab_index != 0) {
*rate = 0;
*dist = 0;
return;
@@ -1582,7 +1605,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
- *get_sb_index(xd, subsize) = i;
+ *get_sb_index(x, subsize) = i;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize,
@@ -1629,7 +1652,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
// PARTITION_HORZ
if (partition_horz_allowed && do_rect) {
subsize = get_subsize(bsize, PARTITION_HORZ);
- *get_sb_index(xd, subsize) = 0;
+ *get_sb_index(x, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
@@ -1640,7 +1663,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *get_sb_index(xd, subsize) = 1;
+ *get_sb_index(x, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
@@ -1674,7 +1697,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (partition_vert_allowed && do_rect) {
subsize = get_subsize(bsize, PARTITION_VERT);
- *get_sb_index(xd, subsize) = 0;
+ *get_sb_index(x, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
@@ -1684,7 +1707,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *get_sb_index(xd, subsize) = 1;
+ *get_sb_index(x, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
@@ -1765,7 +1788,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
}
static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, TOKENEXTRA **tp, int *totalrate) {
+ int mi_row, TOKENEXTRA **tp) {
VP9_COMMON * const cm = &cpi->common;
int mi_col;
@@ -1910,7 +1933,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
MACROBLOCK * const x = &cpi->mb;
VP9_COMMON * const cm = &cpi->common;
MACROBLOCKD * const xd = &x->e_mbd;
- int totalrate;
// fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
// cpi->common.current_video_frame, cpi->common.show_frame,
@@ -1926,8 +1948,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
}
#endif
- totalrate = 0;
-
vp9_zero(cm->counts.switchable_interp);
vp9_zero(cpi->tx_stepdown_count);
@@ -1989,7 +2009,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_tile_init(&tile, cm, tile_row, tile_col);
for (mi_row = tile.mi_row_start;
mi_row < tile.mi_row_end; mi_row += 8)
- encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate);
+ encode_sb_row(cpi, &tile, mi_row, &tp);
cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -2015,10 +2035,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
cpi->sf.skip_encode_frame = 0;
}
- // 256 rate units to the bit,
- // projected_frame_size in units of BYTES
- cpi->projected_frame_size = totalrate >> 8;
-
#if 0
// Keep record of the total distortion this time around for future use
cpi->last_frame_distortion = cpi->frame_distortion;
@@ -2395,13 +2411,17 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
MODE_INFO **mi_8x8 = xd->mi_8x8;
MODE_INFO *mi = mi_8x8[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize);
unsigned int segment_id = mbmi->segment_id;
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8;
+ x->skip_optimize = ctx->is_coded;
+ ctx->is_coded = 1;
x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
- xd->q_index < QIDX_SKIP_THRESH);
+ x->q_index < QIDX_SKIP_THRESH);
if (x->skip_encode)
return;
@@ -2487,7 +2507,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
(mbmi->skip_coeff ||
vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
const uint8_t context = vp9_get_pred_context_tx_size(xd);
- ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
+ ++get_tx_counts(max_txsize_lookup[bsize],
+ context, &cm->counts.tx)[mbmi->tx_size];
} else {
int x, y;
TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index 75ed8ea..a85ddee 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -136,14 +136,13 @@ static void optimize_b(MACROBLOCK *mb,
const int16_t *scan, *nb;
const int mul = 1 + (tx_size == TX_32X32);
uint8_t token_cache[1024];
- const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
const int16_t *dequant_ptr = pd->dequant;
const uint8_t *const band_translate = get_band_translate(tx_size);
assert((!type && !plane) || (type && plane));
dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
- get_scan(xd, tx_size, type, ib, &scan, &nb);
+ get_scan(xd, tx_size, type, block, &scan, &nb);
assert(eob <= default_eob);
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
@@ -179,7 +178,7 @@ static void optimize_b(MACROBLOCK *mb,
t0 = (vp9_dct_value_tokens_ptr + x)->token;
/* Consider both possible successor states. */
if (next < default_eob) {
- band = get_coef_band(band_translate, i + 1);
+ band = band_translate[i + 1];
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 +=
mb->token_costs[tx_size][type][ref][band][0][pt]
@@ -230,7 +229,7 @@ static void optimize_b(MACROBLOCK *mb,
t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
}
if (next < default_eob) {
- band = get_coef_band(band_translate, i + 1);
+ band = band_translate[i + 1];
if (t0 != DCT_EOB_TOKEN) {
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
@@ -264,7 +263,7 @@ static void optimize_b(MACROBLOCK *mb,
/* There's no choice to make for a zero coefficient, so we don't
* add a new trellis node, but we do need to update the costs.
*/
- band = get_coef_band(band_translate, i + 1);
+ band = band_translate[i + 1];
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
/* Update the cost of each path if we're past the EOB token. */
@@ -284,7 +283,7 @@ static void optimize_b(MACROBLOCK *mb,
}
/* Now pick the best path through the whole trellis. */
- band = get_coef_band(band_translate, i + 1);
+ band = band_translate[i + 1];
pt = combine_entropy_contexts(*a, *l);
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
@@ -420,28 +419,30 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx *const ctx = args->ctx;
struct macroblockd_plane *const pd = &xd->plane[plane];
- const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
- block);
-
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
- pd->dst.buf, pd->dst.stride);
+ int i, j;
+ uint8_t *dst;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
// TODO(jingning): per transformed block zero forcing only enabled for
// luma component. will integrate chroma components as well.
if (x->zcoeff_blk[tx_size][block] && plane == 0) {
- int i, j;
pd->eobs[block] = 0;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
ctx->ta[plane][i] = 0;
ctx->tl[plane][j] = 0;
return;
}
- vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
+ if (!x->skip_recode)
+ vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
- if (x->optimize)
+ if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
+ } else {
+ ctx->ta[plane][i] = pd->eobs[block] > 0;
+ ctx->tl[plane][j] = pd->eobs[block] > 0;
+ }
if (x->skip_encode || pd->eobs[block] == 0)
return;
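
Note: the encode path now consults two flags set in encode_superblock: skip_recode (transform size is not being searched and the block is at least 8x8) lets the block reuse the coefficients produced during mode selection instead of redoing subtract/transform/quantize, and skip_optimize (ctx->is_coded) suppresses a second trellis pass once the block has already been coded. A sketch of the gating under those assumptions; the helper names are hypothetical.

/* Sketch, not library code: how the flags gate the per-block work. */
static void encode_block_sketch(int skip_recode, int optimize,
                                int skip_optimize, int eob_nonzero) {
  if (!skip_recode)
    forward_transform_and_quantize();               /* hypothetical helper */

  if (optimize && (!skip_recode || !skip_optimize))
    trellis_optimize_coefficients();                /* hypothetical helper */
  else
    update_entropy_contexts_from_eob(eob_nonzero);  /* hypothetical helper */
}
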
@@ -505,9 +506,10 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
struct optimize_ctx ctx;
struct encode_b_args arg = {x, &ctx};
- vp9_subtract_sb(x, bsize);
+ if (!x->skip_recode)
+ vp9_subtract_sb(x, bsize);
- if (x->optimize) {
+ if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i)
optimize_init_b(i, bsize, &arg);
@@ -552,19 +554,22 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 32 * (block & twmask);
yoff = 32 * (block >> twl);
dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
- src = p->src.buf + yoff * p->src.stride + xoff;
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
- vp9_subtract_block(32, 32, src_diff, bw * 4,
- src, p->src.stride, dst, pd->dst.stride);
- if (x->use_lp32x32fdct)
- vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
- else
- vp9_fdct32x32(src_diff, coeff, bw * 4);
- vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+
+ if (!x->skip_recode) {
+ src = p->src.buf + yoff * p->src.stride + xoff;
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ vp9_subtract_block(32, 32, src_diff, bw * 4,
+ src, p->src.stride, dst, pd->dst.stride);
+ if (x->use_lp32x32fdct)
+ vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
+ else
+ vp9_fdct32x32(src_diff, coeff, bw * 4);
+ vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
+ }
if (!x->skip_encode && *eob)
vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
break;
@@ -577,16 +582,18 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 16 * (block & twmask);
yoff = 16 * (block >> twl);
dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
- src = p->src.buf + yoff * p->src.stride + xoff;
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
- vp9_subtract_block(16, 16, src_diff, bw * 4,
- src, p->src.stride, dst, pd->dst.stride);
- vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
- vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ if (!x->skip_recode) {
+ src = p->src.buf + yoff * p->src.stride + xoff;
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ vp9_subtract_block(16, 16, src_diff, bw * 4,
+ src, p->src.stride, dst, pd->dst.stride);
+ vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
+ vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
+ }
if (!x->skip_encode && *eob)
vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
@@ -599,16 +606,18 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 8 * (block & twmask);
yoff = 8 * (block >> twl);
dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
- src = p->src.buf + yoff * p->src.stride + xoff;
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
- vp9_subtract_block(8, 8, src_diff, bw * 4,
- src, p->src.stride, dst, pd->dst.stride);
- vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
- vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ if (!x->skip_recode) {
+ src = p->src.buf + yoff * p->src.stride + xoff;
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ vp9_subtract_block(8, 8, src_diff, bw * 4,
+ src, p->src.stride, dst, pd->dst.stride);
+ vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
+ vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
+ }
if (!x->skip_encode && *eob)
vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
@@ -624,19 +633,23 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 4 * (block & twmask);
yoff = 4 * (block >> twl);
dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
- src = p->src.buf + yoff * p->src.stride + xoff;
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
- vp9_subtract_block(4, 4, src_diff, bw * 4,
- src, p->src.stride, dst, pd->dst.stride);
- if (tx_type != DCT_DCT)
- vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
- else
- x->fwd_txm4x4(src_diff, coeff, bw * 4);
- vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+
+ if (!x->skip_recode) {
+ src = p->src.buf + yoff * p->src.stride + xoff;
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ vp9_subtract_block(4, 4, src_diff, bw * 4,
+ src, p->src.stride, dst, pd->dst.stride);
+ if (tx_type != DCT_DCT)
+ vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
+ else
+ x->fwd_txm4x4(src_diff, coeff, bw * 4);
+ vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
+ }
+
if (!x->skip_encode && *eob) {
if (tx_type == DCT_DCT)
// this is like vp9_short_idct4x4 but has a special case around eob<=1
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c
index e2c6c4c..030ca64 100644
--- a/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/libvpx/vp9/encoder/vp9_encodemv.c
@@ -155,9 +155,8 @@ static void counts_to_nmv_context(
unsigned int (*branch_ct_class0_hp)[2],
unsigned int (*branch_ct_hp)[2]) {
int i, j, k;
- vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
- branch_ct_joint,
- nmv_count->joints, 0);
+ vp9_tree_probs_from_distribution(vp9_mv_joint_tree, branch_ct_joint,
+ nmv_count->joints);
for (i = 0; i < 2; ++i) {
const uint32_t s0 = nmv_count->comps[i].sign[0];
const uint32_t s1 = nmv_count->comps[i].sign[1];
@@ -166,10 +165,10 @@ static void counts_to_nmv_context(
branch_ct_sign[i][1] = s1;
vp9_tree_probs_from_distribution(vp9_mv_class_tree,
branch_ct_classes[i],
- nmv_count->comps[i].classes, 0);
+ nmv_count->comps[i].classes);
vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
branch_ct_class0[i],
- nmv_count->comps[i].class0, 0);
+ nmv_count->comps[i].class0);
for (j = 0; j < MV_OFFSET_BITS; ++j) {
const uint32_t b0 = nmv_count->comps[i].bits[j][0];
const uint32_t b1 = nmv_count->comps[i].bits[j][1];
@@ -182,11 +181,11 @@ static void counts_to_nmv_context(
for (k = 0; k < CLASS0_SIZE; ++k) {
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
branch_ct_class0_fp[i][k],
- nmv_count->comps[i].class0_fp[k], 0);
+ nmv_count->comps[i].class0_fp[k]);
}
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
branch_ct_fp[i],
- nmv_count->comps[i].fp, 0);
+ nmv_count->comps[i].fp);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index 6a3555d..974c300 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -482,6 +482,10 @@ void vp9_first_pass(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
TileInfo tile;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ PICK_MODE_CONTEXT *ctx = &x->sb64_context;
+ int i;
int recon_yoffset, recon_uvoffset;
const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
@@ -525,6 +529,15 @@ void vp9_first_pass(VP9_COMP *cpi) {
vp9_frame_init_quantizer(cpi);
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][1];
+ pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+ pd[i].eobs = ctx->eobs_pbuf[i][1];
+ }
+ x->skip_recode = 0;
+
+
// Initialise the MV cost table to the defaults
// if( cm->current_video_frame == 0)
// if ( 0 )
diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c
index f922f90..dd4705d 100644
--- a/libvpx/vp9/encoder/vp9_onyx_if.c
+++ b/libvpx/vp9/encoder/vp9_onyx_if.c
@@ -834,6 +834,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->adaptive_rd_thresh = 2;
sf->recode_loop = 2;
+ sf->use_lp32x32fdct = 1;
sf->mode_skip_start = 11;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
@@ -1436,90 +1437,121 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
} while (++i <= MV_MAX);
}
+static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
+ PICK_MODE_CONTEXT *ctx) {
+ int num_pix = num_4x4_blk << 4;
+ int i, k;
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ for (k = 0; k < 3; ++k) {
+ CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+ vpx_memalign(16, num_pix * sizeof(int16_t)));
+ CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+ vpx_memalign(16, num_pix * sizeof(int16_t)));
+ CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+ vpx_memalign(16, num_pix * sizeof(int16_t)));
+ CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+ vpx_memalign(16, num_pix * sizeof(uint16_t)));
+ ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
+ ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
+ ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
+ ctx->eobs_pbuf[i][k] = ctx->eobs[i][k];
+ }
+ }
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
+ int i, k;
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ for (k = 0; k < 3; ++k) {
+ vpx_free(ctx->coeff[i][k]);
+ ctx->coeff[i][k] = 0;
+ vpx_free(ctx->qcoeff[i][k]);
+ ctx->qcoeff[i][k] = 0;
+ vpx_free(ctx->dqcoeff[i][k]);
+ ctx->dqcoeff[i][k] = 0;
+ vpx_free(ctx->eobs[i][k]);
+ ctx->eobs[i][k] = 0;
+ }
+ }
+}
+
static void init_pick_mode_context(VP9_COMP *cpi) {
int i;
- MACROBLOCK *x = &cpi->mb;
- MACROBLOCKD *xd = &x->e_mbd;
- VP9_COMMON *cm = &cpi->common;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
+
for (i = 0; i < BLOCK_SIZES; ++i) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
const int num_4x4_h = num_4x4_blocks_high_lookup[i];
const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
if (i < BLOCK_16X16) {
- for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
- for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
- for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+ for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+ for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) {
+ for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- ctx->num_4x4_blk = num_4x4_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
- vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ alloc_mode_context(cm, num_4x4_blk, ctx);
}
}
}
} else if (i < BLOCK_32X32) {
- for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
- for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
- ++xd->mb_index) {
+ for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+ for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
ctx->num_4x4_blk = num_4x4_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
- vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ alloc_mode_context(cm, num_4x4_blk, ctx);
}
}
} else if (i < BLOCK_64X64) {
- for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+ for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
ctx->num_4x4_blk = num_4x4_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
- vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ alloc_mode_context(cm, num_4x4_blk, ctx);
}
} else {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
ctx->num_4x4_blk = num_4x4_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
- vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ alloc_mode_context(cm, num_4x4_blk, ctx);
}
}
}
static void free_pick_mode_context(MACROBLOCK *x) {
int i;
- MACROBLOCKD *xd = &x->e_mbd;
for (i = 0; i < BLOCK_SIZES; ++i) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
const int num_4x4_h = num_4x4_blocks_high_lookup[i];
const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
if (i < BLOCK_16X16) {
- for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
- for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
- for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+ for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+ for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) {
+ for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- vpx_free(ctx->zcoeff_blk);
- ctx->zcoeff_blk = 0;
+ free_mode_context(ctx);
}
}
}
} else if (i < BLOCK_32X32) {
- for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
- for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
- ++xd->mb_index) {
+ for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+ for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- vpx_free(ctx->zcoeff_blk);
- ctx->zcoeff_blk = 0;
+ free_mode_context(ctx);
}
}
} else if (i < BLOCK_64X64) {
- for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+ for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- vpx_free(ctx->zcoeff_blk);
- ctx->zcoeff_blk = 0;
+ free_mode_context(ctx);
}
} else {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- vpx_free(ctx->zcoeff_blk);
- ctx->zcoeff_blk = 0;
+ free_mode_context(ctx);
}
}
}
@@ -3404,7 +3436,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// Post encode loop adjustment of Q prediction.
if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop) ? 2 : 0);
+ vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop ||
+ cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
+
cpi->last_q[cm->frame_type] = cm->base_qindex;
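The alloc_mode_context()/free_mode_context() helpers added above pull the per-plane, per-buffer-set scratch allocations (coefficients, quantized and dequantized coefficients, eobs) into one symmetric allocate/release pair. A small sketch of that pattern, assuming a zero-initialized context and using C11 aligned_alloc() in place of vpx_memalign()/CHECK_MEM_ERROR:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NUM_PLANES 3
#define NUM_BUF_SETS 3

typedef struct {
  int16_t *coeff[NUM_PLANES][NUM_BUF_SETS];
  int16_t *qcoeff[NUM_PLANES][NUM_BUF_SETS];
} mode_ctx;

/* Allocate one 16-byte-aligned scratch buffer per plane and per buffer set.
 * The size is rounded up to a multiple of the alignment as aligned_alloc()
 * requires; on failure the caller cleans up with ctx_free(). */
static int ctx_alloc(mode_ctx *ctx, size_t num_pix) {
  const size_t sz = (num_pix * sizeof(int16_t) + 15) & ~(size_t)15;
  for (int i = 0; i < NUM_PLANES; ++i) {
    for (int k = 0; k < NUM_BUF_SETS; ++k) {
      ctx->coeff[i][k] = aligned_alloc(16, sz);
      ctx->qcoeff[i][k] = aligned_alloc(16, sz);
      if (!ctx->coeff[i][k] || !ctx->qcoeff[i][k]) return -1;
    }
  }
  return 0;
}

/* Release everything ctx_alloc() handed out and null the pointers, mirroring
 * free_mode_context(). */
static void ctx_free(mode_ctx *ctx) {
  for (int i = 0; i < NUM_PLANES; ++i) {
    for (int k = 0; k < NUM_BUF_SETS; ++k) {
      free(ctx->coeff[i][k]);  ctx->coeff[i][k] = NULL;
      free(ctx->qcoeff[i][k]); ctx->qcoeff[i][k] = NULL;
    }
  }
}

int main(void) {
  static mode_ctx ctx;                 /* static: zero-initialized */
  if (ctx_alloc(&ctx, 64) == 0)
    printf("allocated\n");
  ctx_free(&ctx);
  return 0;
}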
diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h
index 9429c7f..9e80212 100644
--- a/libvpx/vp9/encoder/vp9_onyx_int.h
+++ b/libvpx/vp9/encoder/vp9_onyx_int.h
@@ -312,7 +312,6 @@ typedef struct VP9_COMP {
VP9_COMMON common;
VP9_CONFIG oxcf;
struct rdcost_block_args rdcost_stack;
-
struct lookahead_ctx *lookahead;
struct lookahead_entry *source;
#if CONFIG_MULTIPLE_ARF
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index fca7525..d24be96 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -22,7 +22,7 @@
extern int enc_debug;
#endif
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
@@ -30,58 +30,44 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- int i, rc, eob;
- int zbins[2], nzbins[2], zbin;
- int x, y, z, sz;
- int zero_flag = n_coeffs;
+ int i, non_zero_count = count, eob = -1;
+ const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
+ zbin_ptr[1] + zbin_oq_value };
+ const int nzbins[2] = { zbins[0] * -1,
+ zbins[1] * -1 };
- vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
-
- eob = -1;
-
- // Base ZBIN
- zbins[0] = zbin_ptr[0] + zbin_oq_value;
- zbins[1] = zbin_ptr[1] + zbin_oq_value;
- nzbins[0] = zbins[0] * -1;
- nzbins[1] = zbins[1] * -1;
+ vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
if (!skip_block) {
// Pre-scan pass
- for (i = n_coeffs - 1; i >= 0; i--) {
- rc = scan[i];
- z = coeff_ptr[rc];
+ for (i = count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
- if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
- zero_flag--;
- } else {
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+ non_zero_count--;
+ else
break;
- }
}
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
- for (i = 0; i < zero_flag; i++) {
- rc = scan[i];
- z = coeff_ptr[rc];
-
- zbin = (zbins[rc != 0]);
-
- sz = (z >> 31); // sign of z
- x = (z ^ sz) - sz;
-
- if (x >= zbin) {
- x += (round_ptr[rc != 0]);
- x = clamp(x, INT16_MIN, INT16_MAX);
- y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
- quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
-
- if (y) {
- eob = i; // last nonzero coeffs
- }
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= zbins[rc != 0]) {
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >> 16; // quantization
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+ if (tmp)
+ eob = i;
}
}
}
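The rewritten vp9_quantize_b_c() above first walks the scan order backwards to count how many coefficients can clear the zero bin at all (non_zero_count), then quantizes only that prefix. The sketch below reproduces the two-pass shape with a single flat zbin and step instead of the per-position zbin/round/quant/shift tables; it illustrates the idea, not the production quantizer.

#include <stdint.h>
#include <stdio.h>

/* Two-pass quantization sketch: the pre-scan trims trailing coefficients that
 * fall inside (-zbin, zbin) in scan order, then only the surviving prefix is
 * quantized. The c >> 31 sign trick mirrors the code above and assumes a
 * 32-bit int with arithmetic right shift. */
static int quantize_sketch(const int16_t *coeff, const int16_t *scan, int n,
                           int zbin, int step, int16_t *qcoeff) {
  int non_zero_count = n, eob = -1, i;
  for (i = 0; i < n; ++i) qcoeff[i] = 0;

  /* Pre-scan: walk backwards until a coefficient reaches the zero bin. */
  for (i = n - 1; i >= 0; --i) {
    const int c = coeff[scan[i]];
    if (c < zbin && c > -zbin)
      non_zero_count--;
    else
      break;
  }

  /* Quantization pass over the surviving prefix only. */
  for (i = 0; i < non_zero_count; ++i) {
    const int rc = scan[i];
    const int c = coeff[rc];
    const int sign = c >> 31;                 /* 0 or -1 */
    const int abs_c = (c ^ sign) - sign;
    if (abs_c >= zbin) {
      const int q = abs_c / step;
      qcoeff[rc] = (int16_t)((q ^ sign) - sign);
      if (q) eob = i;
    }
  }
  return eob + 1;                             /* number of coded coefficients */
}

int main(void) {
  const int16_t coeff[4] = { 40, -29, 3, 1 };
  const int16_t scan[4] = { 0, 1, 2, 3 };
  int16_t q[4];
  /* Pre-scan stops at -29, so only two coefficients are quantized: prints 2. */
  printf("coded coeffs = %d\n", quantize_sketch(coeff, scan, 4, 8, 10, q));
  return 0;
}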
@@ -315,17 +301,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
SEG_LVL_SKIP);
/* save this macroblock QIndex for vp9_update_zbin_extra() */
- x->e_mbd.q_index = qindex;
+ x->q_index = qindex;
/* R/D setup */
cpi->mb.errorperbit = rdmult >> 6;
cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
- vp9_initialize_me_consts(cpi, xd->q_index);
+ vp9_initialize_me_consts(cpi, x->q_index);
}
void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
- const int qindex = x->e_mbd.q_index;
+ const int qindex = x->q_index;
const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
(cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index 993919e..78cb06b 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -246,6 +246,10 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
vp9_set_speed_features(cpi);
+ cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+ cm->frame_type != KEY_FRAME) ?
+ 0 : 1;
+
set_block_thresholds(cpi);
fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
@@ -268,10 +272,10 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
MB_PREDICTION_MODE m;
for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
- cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] =
+ cpi->mb.inter_mode_cost[i][INTER_OFFSET(m)] =
cost_token(vp9_inter_mode_tree,
cm->fc.inter_mode_probs[i],
- &vp9_inter_mode_encodings[inter_mode_offset(m)]);
+ &vp9_inter_mode_encodings[INTER_OFFSET(m)]);
}
}
}
@@ -609,7 +613,7 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
// TODO(jingning): temporarily enabled only for luma component
rd = MIN(rd1, rd2);
- if (plane == 0)
+ if (!xd->lossless && plane == 0)
x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
args->this_rate += args->rate;
@@ -740,7 +744,7 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int n, m;
int s0, s1;
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
+ const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
for (n = TX_4X4; n <= max_tx_size; n++) {
r[n][1] = r[n][0];
@@ -845,7 +849,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
// double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
+ const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
// for (n = TX_4X4; n <= max_txfm_size; n++)
// r[n][0] = (r[n][0] * scale_r[n]);
@@ -1326,6 +1330,7 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
}
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
BLOCK_SIZE bsize) {
@@ -1361,6 +1366,27 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
*skippable = s;
+ if (!x->select_txfm_size) {
+ int i;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = x->e_mbd.plane;
+ for (i = 1; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][2];
+ pd[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+ pd[i].eobs = ctx->eobs_pbuf[i][2];
+
+ ctx->coeff_pbuf[i][2] = ctx->coeff_pbuf[i][0];
+ ctx->qcoeff_pbuf[i][2] = ctx->qcoeff_pbuf[i][0];
+ ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
+ ctx->eobs_pbuf[i][2] = ctx->eobs_pbuf[i][0];
+
+ ctx->coeff_pbuf[i][0] = p[i].coeff;
+ ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff;
+ ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
+ ctx->eobs_pbuf[i][0] = pd[i].eobs;
+ }
+ }
}
}
@@ -1386,8 +1412,9 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
return this_rd;
}
-static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
- int *rate_uv, int *rate_uv_tokenonly,
+static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+ BLOCK_SIZE bsize, int *rate_uv,
+ int *rate_uv_tokenonly,
int64_t *dist_uv, int *skip_uv,
MB_PREDICTION_MODE *mode_uv) {
MACROBLOCK *const x = &cpi->mb;
@@ -1400,7 +1427,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
// Else do a proper rd search for each possible transform size that may
// be considered in the main rd loop.
} else {
- rd_pick_intra_sbuv_mode(cpi, x,
+ rd_pick_intra_sbuv_mode(cpi, x, ctx,
rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
}
@@ -1416,7 +1443,7 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
// Don't account for mode here if segment skip is enabled.
if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
assert(is_inter_mode(mode));
- return x->inter_mode_cost[mode_context][inter_mode_offset(mode)];
+ return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
} else {
return 0;
}
@@ -1707,7 +1734,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
const struct buf_2d orig_src = x->plane[0].src;
struct buf_2d orig_pre[2];
- mode_idx = inter_mode_offset(this_mode);
+ mode_idx = INTER_OFFSET(this_mode);
bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
// if we're near/nearest and mv == 0,0, compare to zeromv
@@ -1901,6 +1928,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
x->mvcost, cpi);
+
bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
if (num_4x4_blocks_wide > 1)
bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
@@ -2002,7 +2030,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
return;
}
- mode_idx = inter_mode_offset(mode_selected);
+ mode_idx = INTER_OFFSET(mode_selected);
vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
@@ -2078,7 +2106,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
return INT64_MAX;
/* set it to the best */
for (i = 0; i < 4; i++) {
- mode_idx = inter_mode_offset(bsi->modes[i]);
+ mode_idx = INTER_OFFSET(bsi->modes[i]);
mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
if (has_second_ref(mbmi))
mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
@@ -2477,54 +2505,41 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
- int refs[2] = { mbmi->ref_frame[0],
- (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ const int refs[2] = { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
int_mv ref_mv[2];
const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
- int ite;
+ int ite, ref;
// Prediction buffer from second frame.
uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
// Do joint motion search in compound mode to get more accurate mv.
- struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
- struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
- struct buf_2d scaled_first_yv12;
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
int last_besterr[2] = {INT_MAX, INT_MAX};
- YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
- scaled_ref_frame[0] = get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
- scaled_ref_frame[1] = get_scaled_ref_frame(cpi, mbmi->ref_frame[1]);
-
- ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
- ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
-
- if (scaled_ref_frame[0]) {
- int i;
- // Swap out the reference frame for a version that's been scaled to
- // match the resolution of the current frame, allowing the existing
- // motion search code to be used without additional modifications.
- for (i = 0; i < MAX_MB_PLANE; i++)
- backup_yv12[i] = xd->plane[i].pre[0];
- setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL);
- }
+ YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+ get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+ };
- if (scaled_ref_frame[1]) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- backup_second_yv12[i] = xd->plane[i].pre[1];
+ for (ref = 0; ref < 2; ++ref) {
+ ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
+
+ if (scaled_ref_frame[ref]) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL);
+ }
- setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL);
+ xd->scale_factor[ref].sfc->set_scaled_offsets(&xd->scale_factor[ref],
+ mi_row, mi_col);
+ frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
}
- xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0],
- mi_row, mi_col);
- xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1],
- mi_row, mi_col);
- scaled_first_yv12 = xd->plane[0].pre[0];
-
- // Initialize mv using single prediction mode result.
- frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
- frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
-
// Allow joint search multiple times iteratively for each ref frame
// and break out the search loop if it couldn't find better mv.
for (ite = 0; ite < 4; ite++) {
@@ -2604,24 +2619,20 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- // restore the predictor
- if (scaled_ref_frame[0]) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- xd->plane[i].pre[0] = backup_yv12[i];
- }
+ *rate_mv = 0;
- if (scaled_ref_frame[1]) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- xd->plane[i].pre[1] = backup_second_yv12[i];
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // restore the predictor
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+
+ *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &mbmi->ref_mvs[refs[ref]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
- *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
- &mbmi->ref_mvs[refs[0]][0].as_mv,
- x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
- *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
- &mbmi->ref_mvs[refs[1]][0].as_mv,
- x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
vpx_free(second_pred);
}
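The joint_motion_search() hunks above replace two hand-unrolled copies of the scaled-reference setup and teardown with a single loop over both references. A minimal sketch of that save / swap-in-scaled / restore pattern, with a stub plane struct standing in for struct buf_2d and the search itself elided:

#include <stdio.h>

#define MAX_PLANES 3

struct plane { const char *buf; };  /* stands in for struct buf_2d */

/* Remember the current per-plane reference buffers, swap in scaled versions
 * where they exist, and restore the originals afterwards - the same shape the
 * loop over both references uses above. */
static void with_scaled_refs(struct plane pre[2][MAX_PLANES],
                             const char *scaled[2]) {
  struct plane backup[2][MAX_PLANES];
  int ref, i;

  for (ref = 0; ref < 2; ++ref) {
    if (scaled[ref]) {
      for (i = 0; i < MAX_PLANES; ++i) {
        backup[ref][i] = pre[ref][i];
        pre[ref][i].buf = scaled[ref];
      }
    }
  }

  /* ... run the motion search against pre[][] here ... */

  for (ref = 0; ref < 2; ++ref)
    if (scaled[ref])
      for (i = 0; i < MAX_PLANES; ++i)
        pre[ref][i] = backup[ref][i];
}

int main(void) {
  struct plane pre[2][MAX_PLANES] = {{{"ref0"}, {"ref0"}, {"ref0"}},
                                     {{"ref1"}, {"ref1"}, {"ref1"}}};
  const char *scaled[2] = { "scaled-ref0", NULL };
  with_scaled_refs(pre, scaled);
  printf("%s\n", pre[0][0].buf);  /* restored: prints "ref0" */
  return 0;
}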
@@ -3046,6 +3057,30 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
return this_rd; // if 0, this will be re-calculated by caller
}
+static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+ int max_plane) {
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = x->e_mbd.plane;
+ int i;
+
+ for (i = 0; i < max_plane; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][1];
+ pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+ pd[i].eobs = ctx->eobs_pbuf[i][1];
+
+ ctx->coeff_pbuf[i][1] = ctx->coeff_pbuf[i][0];
+ ctx->qcoeff_pbuf[i][1] = ctx->qcoeff_pbuf[i][0];
+ ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
+ ctx->eobs_pbuf[i][1] = ctx->eobs_pbuf[i][0];
+
+ ctx->coeff_pbuf[i][0] = p[i].coeff;
+ ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff;
+ ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
+ ctx->eobs_pbuf[i][0] = pd[i].eobs;
+ }
+}
+
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int *returnrate, int64_t *returndist,
BLOCK_SIZE bsize,
@@ -3065,7 +3100,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = INT_MAX;
return;
}
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
&dist_uv, &uv_skip, bsize);
} else {
y_skip = 0;
@@ -3074,7 +3109,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = INT_MAX;
return;
}
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
&dist_uv, &uv_skip, BLOCK_8X8);
}
@@ -3157,7 +3192,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
int best_skip2 = 0;
- x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+ x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
// Everywhere the flag is set the error is much higher than its neighbors.
ctx->frames_with_high_error = 0;
@@ -3196,8 +3231,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
case BLOCK_32X32:
for (i = 0; i < 4; i++) {
ref_frame_mask |=
- x->mb_context[xd->sb_index][i].frames_with_high_error;
- mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error;
+ x->mb_context[x->sb_index][i].frames_with_high_error;
+ mode_mask |= x->mb_context[x->sb_index][i].modes_with_high_error;
}
break;
default:
@@ -3440,7 +3475,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
if (rate_uv_intra[uv_tx] == INT_MAX) {
- choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
+ choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[uv_tx],
&rate_uv_tokenonly[uv_tx],
&dist_uv[uv_tx], &skip_uv[uv_tx],
&mode_uv[uv_tx]);
@@ -3574,6 +3609,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Did this mode help.. i.e. is it the new best mode
if (this_rd < best_rd || x->skip) {
+ int max_plane = MAX_MB_PLANE;
if (!mode_excluded) {
// Note index of best mode so far
best_mode_index = mode_index;
@@ -3581,6 +3617,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (ref_frame == INTRA_FRAME) {
/* required for left and above block mv */
mbmi->mv[0].as_int = 0;
+ max_plane = 1;
}
*returnrate = rate2;
@@ -3588,6 +3625,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_rd = this_rd;
best_mbmode = *mbmi;
best_skip2 = this_skip2;
+ if (!x->select_txfm_size)
+ swap_block_ptr(x, ctx, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(uint8_t) * ctx->num_4x4_blk);
@@ -3694,7 +3733,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Do Intra UV best rd mode selection if best mode choice above was intra.
if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
&skip_uv[uv_tx_size],
@@ -3850,7 +3889,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
b_mode_info best_bmodes[4];
int best_skip2 = 0;
- x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+ x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
for (i = 0; i < 4; i++) {
@@ -4063,7 +4102,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
distortion2 += distortion_y;
if (rate_uv_intra[TX_4X4] == INT_MAX) {
- choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
+ choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[TX_4X4],
&rate_uv_tokenonly[TX_4X4],
&dist_uv[TX_4X4], &skip_uv[TX_4X4],
&mode_uv[TX_4X4]);
@@ -4317,12 +4356,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
// Did this mode help.. i.e. is it the new best mode
if (this_rd < best_rd || x->skip) {
if (!mode_excluded) {
+ int max_plane = MAX_MB_PLANE;
// Note index of best mode so far
best_mode_index = mode_index;
if (ref_frame == INTRA_FRAME) {
/* required for left and above block mv */
mbmi->mv[0].as_int = 0;
+ max_plane = 1;
}
*returnrate = rate2;
@@ -4332,6 +4373,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
best_mbmode = *mbmi;
best_skip2 = this_skip2;
+ if (!x->select_txfm_size)
+ swap_block_ptr(x, ctx, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(uint8_t) * ctx->num_4x4_blk);
@@ -4438,7 +4481,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
// Do Intra UV best rd mode selection if best mode choice above was intra.
if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
&skip_uv[uv_tx_size],
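The swap_block_ptr() helper introduced above keeps the best candidate's coefficients by exchanging plane buffer pointers with the pick-mode context instead of copying arrays. A sketch of that pointer ping-pong, reduced to one plane and one buffer kind:

#include <stdint.h>
#include <stdio.h>

/* Two coefficient buffers: the encoder writes the current candidate into
 * "live" and preserves the best candidate in "best". When a candidate wins,
 * the pointers are swapped instead of copying the arrays. */
typedef struct {
  int16_t *live;   /* buffer the next candidate will be coded into */
  int16_t *best;   /* buffer holding the best candidate found so far */
} coeff_slots;

static void keep_as_best(coeff_slots *s) {
  int16_t *tmp = s->best;
  s->best = s->live;   /* what was just coded is now the best */
  s->live = tmp;       /* the old best becomes scratch for the next try */
}

int main(void) {
  int16_t a[16] = {0}, b[16] = {0};
  coeff_slots s = { a, b };
  a[0] = 7;            /* pretend candidate #1 was coded into the live buffer */
  keep_as_best(&s);    /* candidate #1 wins; no memcpy needed */
  printf("best[0] = %d\n", s.best[0]);  /* prints 7 */
  return 0;
}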
diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c
index 7d4676e..c7336d0 100644
--- a/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/libvpx/vp9/encoder/vp9_tokenize.c
@@ -57,7 +57,7 @@ static void fill_value_tokens() {
// initialize the cost for extra bits for all possible coefficient value.
{
int cost = 0;
- const vp9_extra_bit *p = vp9_extra_bits + t[i].token;
+ const vp9_extra_bit *p = &vp9_extra_bits[t[i].token];
if (p->base_val) {
const int extra = t[i].extra;
@@ -73,7 +73,7 @@ static void fill_value_tokens() {
} while (++i < DCT_MAX_VALUE);
vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
- vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+ vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
}
struct tokenize_b_args {
@@ -127,7 +127,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
get_scan(xd, tx_size, type, block, &scan, &nb);
c = 0;
do {
- const int band = get_coef_band(band_translate, c);
+ const int band = band_translate[c];
int token;
int v = 0;
rc = scan[c];
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index dc11501..fefca66 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -206,12 +206,12 @@ void fadst4_1d_sse2(__m128i *in) {
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
__m128i in7 = _mm_add_epi16(in[0], in[1]);
- in7 = _mm_sub_epi16(in7, in[3]);
u[0] = _mm_unpacklo_epi16(in[0], in[1]);
u[1] = _mm_unpacklo_epi16(in[2], in[3]);
u[2] = _mm_unpacklo_epi16(in7, kZero);
u[3] = _mm_unpacklo_epi16(in[2], kZero);
+ u[4] = _mm_unpacklo_epi16(in[3], kZero);
v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
@@ -219,9 +219,10 @@ void fadst4_1d_sse2(__m128i *in) {
v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = v[2];
+ u[1] = _mm_sub_epi32(v[2], v[6]);
u[2] = _mm_add_epi32(v[3], v[4]);
u[3] = _mm_sub_epi32(u[2], u[0]);
u[4] = _mm_slli_epi32(v[5], 2);
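The fadst4_1d_sse2 change above stops folding in[0] + in[1] - in[3] into a single 16-bit vector before the multiply; the in[3] term is now multiplied separately (v[6]) and subtracted in 32 bits, which avoids a potential int16 overflow in the intermediate sum. A scalar illustration of that reordering, assuming k is just some transform constant:

#include <stdint.h>
#include <stdio.h>

/* Overflow-prone: the intermediate sum a + b - c can exceed int16_t range
 * before it is widened for the multiply. */
static int32_t combine_then_multiply(int16_t a, int16_t b, int16_t c,
                                     int32_t k) {
  int16_t sum = (int16_t)(a + b - c);   /* may wrap for large inputs */
  return (int32_t)sum * k;
}

/* Safe: multiply each term in 32 bits first, then combine - the idea behind
 * computing v[6] separately and using a 32-bit subtraction above. */
static int32_t multiply_then_combine(int16_t a, int16_t b, int16_t c,
                                     int32_t k) {
  return (int32_t)a * k + (int32_t)b * k - (int32_t)c * k;
}

int main(void) {
  /* Values chosen so a + b - c no longer fits in 16 bits. */
  int16_t a = 30000, b = 10000, c = -5000;
  printf("wrapped: %d  correct: %d\n",
         combine_then_multiply(a, b, c, 3),
         multiply_then_combine(a, b, c, 3));
  return 0;
}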
diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
index 533456b..1a9e4e8 100644
--- a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -118,6 +118,14 @@ SECTION .text
RET
%endmacro
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
@@ -129,41 +137,85 @@ SECTION .text
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
-%ifdef PIC
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
-%define sec_str sec_strideq
-%else
-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
-%endif
-%define h heightd
-%define bilin_filter sseq
-%else
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
- 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse
-%if ARCH_X86_64
-%define h heightd
-%define sec_str sec_strideq
-%else
-%define h dword heightm
-%define sec_str sec_stridemp
-%endif
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
%else
-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
-%define h heightd
-%endif
-%define bilin_filter bilin_filter_m
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+      ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define h heightd
+
+      ; Store the bilin_filter and pw_8 locations on the stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
%endif
+
ASSERT %1 <= 16 ; m6 overflows if w > 16
pxor m6, m6 ; sum
pxor m7, m7 ; sse
@@ -329,11 +381,22 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
.x_zero_y_other_loop:
%if %1 == 16
movu m0, [srcq]
@@ -615,12 +678,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
+%else ;x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
@@ -752,12 +826,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+;y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
.x_other_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
@@ -873,12 +958,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
@@ -1057,6 +1153,21 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is no unused register, so the src_stride register is
+; reused; src_stride is reloaded from the stack later, when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
add y_offsetq, bilin_filter
@@ -1066,6 +1177,8 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
movu m0, [srcq]
@@ -1093,7 +1206,9 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m0, 4
psraw m2, 4
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
packuswb m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
@@ -1163,7 +1278,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
- add srcq, src_strideq
+ INC_SRC_BY_SRC_STRIDE
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
@@ -1184,12 +1299,17 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%if cpuflag(ssse3)
packuswb m0, m0
%endif
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
.x_other_y_other_loop:
movh m2, [srcq]
movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movh m4, [srcq]
+ movh m3, [srcq+1]
+
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
@@ -1253,7 +1373,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
- lea srcq, [srcq+src_strideq*2]
+ INC_SRC_BY_SRC_STRIDE
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk
index 0badb08..2dd2bf0 100644
--- a/libvpx/vp9/vp9_common.mk
+++ b/libvpx/vp9/vp9_common.mk
@@ -103,11 +103,21 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred4_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred16_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
@@ -123,6 +133,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(AS
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM)
diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c
index a89e29d..7c3f7ec 100644
--- a/libvpx/vpx_scale/generic/yv12config.c
+++ b/libvpx/vpx_scale/generic/yv12config.c
@@ -148,7 +148,10 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
#else
const int frame_size = yplane_size + 2 * uvplane_size;
#endif
- if (!ybf->buffer_alloc) {
+ if (frame_size > ybf->buffer_alloc_sz) {
+ // Allocation to hold larger frame, or first allocation.
+ if (ybf->buffer_alloc)
+ vpx_free(ybf->buffer_alloc);
ybf->buffer_alloc = vpx_memalign(32, frame_size);
ybf->buffer_alloc_sz = frame_size;
}
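vp9_realloc_frame_buffer() above now replaces the frame buffer whenever the requested size exceeds the existing allocation, not only when no buffer exists yet. A minimal sketch of that grow-only policy (plain malloc/free here, without the 32-byte alignment of the real allocator):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  uint8_t *buf;
  size_t size;   /* capacity currently allocated */
} frame_buf;

/* Grow-only reallocation: keep the existing buffer when it is already large
 * enough, replace it when the new frame needs more room. free(NULL) is a
 * no-op, so the first-allocation case falls out naturally. */
static int frame_buf_ensure(frame_buf *fb, size_t frame_size) {
  if (frame_size > fb->size) {
    free(fb->buf);
    fb->buf = malloc(frame_size);
    if (!fb->buf) {
      fb->size = 0;
      return -1;
    }
    fb->size = frame_size;
  }
  return 0;
}

int main(void) {
  frame_buf fb = { NULL, 0 };
  frame_buf_ensure(&fb, 1280 * 720 * 3 / 2);   /* first allocation */
  frame_buf_ensure(&fb, 640 * 360 * 3 / 2);    /* smaller frame: buffer kept */
  printf("capacity = %zu\n", fb.size);          /* still the 720p size */
  free(fb.buf);
  return 0;
}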
diff --git a/libvpx/vpxdec.c b/libvpx/vpxdec.c
index 8e575e1..110e4ac 100644
--- a/libvpx/vpxdec.c
+++ b/libvpx/vpxdec.c
@@ -33,21 +33,6 @@
#include "nestegg/include/nestegg/nestegg.h"
#include "third_party/libyuv/include/libyuv/scale.h"
-#if CONFIG_OS_SUPPORT
-#if defined(_MSC_VER)
-#include <io.h>
-#define snprintf _snprintf
-#define isatty _isatty
-#define fileno _fileno
-#else
-#include <unistd.h>
-#endif
-#endif
-
-#ifndef PATH_MAX
-#define PATH_MAX 256
-#endif
-
static const char *exec_name;
static const struct {
diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c
index df75b85..b7897db 100644
--- a/libvpx/vpxenc.c
+++ b/libvpx/vpxenc.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#if defined(_WIN32) || defined(__OS2__) || !CONFIG_OS_SUPPORT
#define USE_POSIX_MMAP 0
@@ -16,6 +16,7 @@
#define USE_POSIX_MMAP 1
#endif
+#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
@@ -43,36 +44,12 @@
#include "vpx/vp8dx.h"
#endif
+#include "./tools_common.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_ports/vpx_timer.h"
-#include "tools_common.h"
-#include "webmenc.h"
-#include "y4minput.h"
-
-
-/* Need special handling of these functions on Windows */
-#if defined(_MSC_VER)
-/* MSVS doesn't define off_t, and uses _f{seek,tell}i64 */
-typedef __int64 off_t;
-#define fseeko _fseeki64
-#define ftello _ftelli64
-#elif defined(_WIN32)
-/* MinGW defines off_t as long
- and uses f{seek,tell}o64/off64_t for large files */
-#define fseeko fseeko64
-#define ftello ftello64
-#define off_t off64_t
-#endif
-
-#define LITERALU64(hi,lo) ((((uint64_t)hi)<<32)|lo)
-
-/* We should use 32-bit file operations in WebM file format
- * when building ARM executable file (.axf) with RVCT */
-#if !CONFIG_OS_SUPPORT
-typedef long off_t;
-#define fseeko fseek
-#define ftello ftell
-#endif
+#include "./vpxstats.h"
+#include "./webmenc.h"
+#include "./y4minput.h"
/* Swallow warnings about unused results of fread/fwrite */
static size_t wrap_fread(void *ptr, size_t size, size_t nmemb,
@@ -141,126 +118,6 @@ static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal,
va_end(ap);
}
-/* This structure is used to abstract the different ways of handling
- * first pass statistics.
- */
-typedef struct {
- vpx_fixed_buf_t buf;
- int pass;
- FILE *file;
- char *buf_ptr;
- size_t buf_alloc_sz;
-} stats_io_t;
-
-int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
- int res;
-
- stats->pass = pass;
-
- if (pass == 0) {
- stats->file = fopen(fpf, "wb");
- stats->buf.sz = 0;
- stats->buf.buf = NULL,
- res = (stats->file != NULL);
- } else {
-#if 0
-#elif USE_POSIX_MMAP
- struct stat stat_buf;
- int fd;
-
- fd = open(fpf, O_RDONLY);
- stats->file = fdopen(fd, "rb");
- fstat(fd, &stat_buf);
- stats->buf.sz = stat_buf.st_size;
- stats->buf.buf = mmap(NULL, stats->buf.sz, PROT_READ, MAP_PRIVATE,
- fd, 0);
- res = (stats->buf.buf != NULL);
-#else
- size_t nbytes;
-
- stats->file = fopen(fpf, "rb");
-
- if (fseek(stats->file, 0, SEEK_END))
- fatal("First-pass stats file must be seekable!");
-
- stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
- rewind(stats->file);
-
- stats->buf.buf = malloc(stats->buf_alloc_sz);
-
- if (!stats->buf.buf)
- fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
- (unsigned long)stats->buf_alloc_sz);
-
- nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
- res = (nbytes == stats->buf.sz);
-#endif
- }
-
- return res;
-}
-
-int stats_open_mem(stats_io_t *stats, int pass) {
- int res;
- stats->pass = pass;
-
- if (!pass) {
- stats->buf.sz = 0;
- stats->buf_alloc_sz = 64 * 1024;
- stats->buf.buf = malloc(stats->buf_alloc_sz);
- }
-
- stats->buf_ptr = stats->buf.buf;
- res = (stats->buf.buf != NULL);
- return res;
-}
-
-
-void stats_close(stats_io_t *stats, int last_pass) {
- if (stats->file) {
- if (stats->pass == last_pass) {
-#if 0
-#elif USE_POSIX_MMAP
- munmap(stats->buf.buf, stats->buf.sz);
-#else
- free(stats->buf.buf);
-#endif
- }
-
- fclose(stats->file);
- stats->file = NULL;
- } else {
- if (stats->pass == last_pass)
- free(stats->buf.buf);
- }
-}
-
-void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
- if (stats->file) {
- (void) fwrite(pkt, 1, len, stats->file);
- } else {
- if (stats->buf.sz + len > stats->buf_alloc_sz) {
- size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
- char *new_ptr = realloc(stats->buf.buf, new_sz);
-
- if (new_ptr) {
- stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
- stats->buf.buf = new_ptr;
- stats->buf_alloc_sz = new_sz;
- } else
- fatal("Failed to realloc firstpass stats buffer.");
- }
-
- memcpy(stats->buf_ptr, pkt, len);
- stats->buf.sz += len;
- stats->buf_ptr += len;
- }
-}
-
-vpx_fixed_buf_t stats_get(stats_io_t *stats) {
- return stats->buf;
-}
-
enum video_file_type {
FILE_TYPE_RAW,
FILE_TYPE_IVF,
@@ -288,7 +145,6 @@ struct input_state {
int only_i420;
};
-
#define IVF_FRAME_HDR_SZ (4+8) /* 4 byte size + 8 byte timestamp */
static int read_frame(struct input_state *input, vpx_image_t *img) {
FILE *f = input->file;
@@ -503,22 +359,6 @@ static unsigned int murmur(const void *key, int len, unsigned int seed) {
return h;
}
-#include "math.h"
-#define MAX_PSNR 100
-static double vp8_mse2psnr(double Samples, double Peak, double Mse) {
- double psnr;
-
- if ((double)Mse > 0.0)
- psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
- else
- psnr = MAX_PSNR; /* Limit to prevent / 0 */
-
- if (psnr > MAX_PSNR)
- psnr = MAX_PSNR;
-
- return psnr;
-}
-
#include "args.h"
static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
@@ -1409,17 +1249,7 @@ void open_input_file(struct input_state *input) {
} else
fatal("Unsupported Y4M stream.");
} else if (input->detect.buf_read == 4 && file_is_ivf(input, &fourcc)) {
- input->file_type = FILE_TYPE_IVF;
- switch (fourcc) {
- case 0x32315659:
- input->use_i420 = 0;
- break;
- case 0x30323449:
- input->use_i420 = 1;
- break;
- default:
- fatal("Unsupported fourcc (%08x) in IVF", fourcc);
- }
+ fatal("IVF is not supported as input.");
} else {
input->file_type = FILE_TYPE_RAW;
}
@@ -1433,7 +1263,7 @@ static void close_input_file(struct input_state *input) {
}
static struct stream_state *new_stream(struct global_config *global,
- struct stream_state *prev) {
+ struct stream_state *prev) {
struct stream_state *stream;
stream = calloc(1, sizeof(*stream));
diff --git a/libvpx/vpxstats.c b/libvpx/vpxstats.c
new file mode 100644
index 0000000..70cea3e
--- /dev/null
+++ b/libvpx/vpxstats.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpxstats.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./tools_common.h"
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
+ int res;
+ stats->pass = pass;
+
+ if (pass == 0) {
+ stats->file = fopen(fpf, "wb");
+ stats->buf.sz = 0;
+ stats->buf.buf = NULL;
+ res = (stats->file != NULL);
+ } else {
+#if USE_POSIX_MMAP
+ struct stat stat_buf;
+ int fd;
+
+ fd = open(fpf, O_RDONLY);
+ stats->file = fdopen(fd, "rb");
+ fstat(fd, &stat_buf);
+ stats->buf.sz = stat_buf.st_size;
+ stats->buf.buf = mmap(NULL, stats->buf.sz, PROT_READ, MAP_PRIVATE, fd, 0);
+ res = (stats->buf.buf != NULL);
+#else
+ size_t nbytes;
+
+ stats->file = fopen(fpf, "rb");
+
+ if (fseek(stats->file, 0, SEEK_END))
+ fatal("First-pass stats file must be seekable!");
+
+ stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
+ rewind(stats->file);
+
+ stats->buf.buf = malloc(stats->buf_alloc_sz);
+
+ if (!stats->buf.buf)
+ fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+            (unsigned long)stats->buf_alloc_sz);
+
+ nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
+ res = (nbytes == stats->buf.sz);
+#endif /* USE_POSIX_MMAP */
+ }
+
+ return res;
+}
+
+int stats_open_mem(stats_io_t *stats, int pass) {
+ int res;
+ stats->pass = pass;
+
+ if (!pass) {
+ stats->buf.sz = 0;
+ stats->buf_alloc_sz = 64 * 1024;
+ stats->buf.buf = malloc(stats->buf_alloc_sz);
+ }
+
+ stats->buf_ptr = stats->buf.buf;
+ res = (stats->buf.buf != NULL);
+ return res;
+}
+
+void stats_close(stats_io_t *stats, int last_pass) {
+ if (stats->file) {
+ if (stats->pass == last_pass) {
+#if USE_POSIX_MMAP
+ munmap(stats->buf.buf, stats->buf.sz);
+#else
+ free(stats->buf.buf);
+#endif /* USE_POSIX_MMAP */
+ }
+
+ fclose(stats->file);
+ stats->file = NULL;
+ } else {
+ if (stats->pass == last_pass)
+ free(stats->buf.buf);
+ }
+}
+
+void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
+ if (stats->file) {
+ (void) fwrite(pkt, 1, len, stats->file);
+ } else {
+ if (stats->buf.sz + len > stats->buf_alloc_sz) {
+ size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
+ char *new_ptr = realloc(stats->buf.buf, new_sz);
+
+ if (new_ptr) {
+ stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
+ stats->buf.buf = new_ptr;
+ stats->buf_alloc_sz = new_sz;
+ } else {
+ fatal("Failed to realloc firstpass stats buffer.");
+ }
+ }
+
+ memcpy(stats->buf_ptr, pkt, len);
+ stats->buf.sz += len;
+ stats->buf_ptr += len;
+ }
+}
+
+vpx_fixed_buf_t stats_get(stats_io_t *stats) {
+ return stats->buf;
+}
+
+double vp8_mse2psnr(double samples, double peak, double mse) {
+ const int kMaxPSNR = 100;
+ double psnr = kMaxPSNR;
+
+ if (mse > 0.0)
+ psnr = 10.0 * log10(peak * peak * samples / mse);
+
+ if (psnr > kMaxPSNR)
+ psnr = kMaxPSNR;
+
+ return psnr;
+}
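vp8_mse2psnr() as added above computes the standard peak signal-to-noise ratio, clamped to 100 dB as the error approaches zero. Written out (note that the samples factor in the numerator means the mse argument is treated as the summed squared error over all compared samples rather than a per-sample mean):

\mathrm{PSNR} = \min\!\left(100,\; 10\log_{10}\frac{\mathrm{peak}^2 \cdot \mathrm{samples}}{\mathrm{mse}}\right)

with peak being the maximum sample value, 255 for 8-bit content.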
diff --git a/libvpx/vpxstats.h b/libvpx/vpxstats.h
new file mode 100644
index 0000000..18b3acd
--- /dev/null
+++ b/libvpx/vpxstats.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPXSTATS_H_
+#define VPXSTATS_H_
+
+#include <stdio.h>
+
+#include "vpx/vpx_encoder.h"
+
+/* This structure is used to abstract the different ways of handling
+ * first pass statistics
+ */
+typedef struct {
+ vpx_fixed_buf_t buf;
+ int pass;
+ FILE *file;
+ char *buf_ptr;
+ size_t buf_alloc_sz;
+} stats_io_t;
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass);
+int stats_open_mem(stats_io_t *stats, int pass);
+void stats_close(stats_io_t *stats, int last_pass);
+void stats_write(stats_io_t *stats, const void *pkt, size_t len);
+vpx_fixed_buf_t stats_get(stats_io_t *stats);
+
+double vp8_mse2psnr(double samples, double peak, double mse);
+
+#endif // VPXSTATS_H_
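The header above declares the first-pass statistics API that the tools now share via vpxstats.c. A hedged usage fragment for a two-pass run follows; the encode loops and error handling are elided, and the rc_twopass_stats_in assignment is shown only as a comment.

/* Hypothetical two-pass driver fragment built on the stats_io_t API declared
 * above; fpf names the first-pass stats file. */
#include "./vpxstats.h"

void two_pass_sketch(const char *fpf) {
  stats_io_t stats;

  /* Pass 0: collect first-pass packets into the stats file. */
  if (!stats_open_file(&stats, fpf, 0))
    return;  /* a real tool would report the failure */
  /* ... for each first-pass packet: stats_write(&stats, pkt_buf, pkt_sz); */
  stats_close(&stats, 1);

  /* Pass 1: reopen and hand the buffered stats to the encoder config. */
  if (!stats_open_file(&stats, fpf, 1))
    return;
  {
    vpx_fixed_buf_t twopass_stats = stats_get(&stats);
    (void)twopass_stats;  /* e.g. cfg.rc_twopass_stats_in = twopass_stats; */
  }
  stats_close(&stats, 1);
}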
diff --git a/libvpx/webmenc.c b/libvpx/webmenc.c
index a584e9d..17bbeec 100644
--- a/libvpx/webmenc.c
+++ b/libvpx/webmenc.c
@@ -15,20 +15,6 @@
#include "third_party/libmkv/EbmlWriter.h"
#include "third_party/libmkv/EbmlIDs.h"
-#if defined(_MSC_VER)
-/* MSVS uses _f{seek,tell}i64 */
-#define fseeko _fseeki64
-#define ftello _ftelli64
-#elif defined(_WIN32)
-/* MinGW defines off_t as long, and uses f{seek,tell}o64/off64_t for large
- * files */
-#define fseeko fseeko64
-#define ftello ftello64
-#define off_t off64_t
-#endif
-
-#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo)
-
void Ebml_Write(struct EbmlGlobal *glob,
const void *buffer_in,
unsigned long len) {
diff --git a/mips-dspr2/libvpx_srcs.txt b/mips-dspr2/libvpx_srcs.txt
index afe7d13..e723074 100644
--- a/mips-dspr2/libvpx_srcs.txt
+++ b/mips-dspr2/libvpx_srcs.txt
@@ -143,11 +143,21 @@ vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
+vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
+vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
+vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
+vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
+vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
+vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
+vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
vp9/common/vp9_alloccommon.c
vp9/common/vp9_alloccommon.h
vp9/common/vp9_blockd.h
diff --git a/mips-dspr2/vp9_rtcd.h b/mips-dspr2/vp9_rtcd.h
index 2198578..ded4c59 100644
--- a/mips-dspr2/vp9_rtcd.h
+++ b/mips-dspr2/vp9_rtcd.h
@@ -34,7 +34,8 @@ void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
+void vp9_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_dspr2
void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
@@ -49,10 +50,12 @@ void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
+void vp9_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_dspr2
void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
+void vp9_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_dspr2
void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
@@ -73,7 +76,8 @@ void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *ab
#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
+void vp9_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_dspr2
void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
@@ -88,10 +92,12 @@ void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
+void vp9_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_dspr2
void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
+void vp9_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_dspr2
void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
@@ -112,7 +118,8 @@ void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *
#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
+void vp9_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_dspr2
void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
@@ -130,7 +137,8 @@ void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
+void vp9_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_dspr2
void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
@@ -181,22 +189,28 @@ void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_
#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_c
+void vp9_mb_lpf_vertical_edge_w_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_dspr2
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_c
+void vp9_mbloop_filter_vertical_edge_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_dspr2
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
+void vp9_loop_filter_vertical_edge_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_dspr2
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
+void vp9_mb_lpf_horizontal_edge_w_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_dspr2
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_c
+void vp9_mbloop_filter_horizontal_edge_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_dspr2
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_c
+void vp9_loop_filter_horizontal_edge_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_dspr2
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -276,7 +290,8 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, int dest_
#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_dspr2
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_34_add vp9_idct32x32_34_add_c
+void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct32x32_34_add vp9_idct32x32_34_add_dspr2
void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride);
diff --git a/mips-dspr2/vpx_version.h b/mips-dspr2/vpx_version.h
index d3359dc..9a354bb 100644
--- a/mips-dspr2/vpx_version.h
+++ b/mips-dspr2/vpx_version.h
@@ -1,7 +1,7 @@
#define VERSION_MAJOR 1
#define VERSION_MINOR 2
#define VERSION_PATCH 0
-#define VERSION_EXTRA "4989-gabdefea"
+#define VERSION_EXTRA "5082-g58f7543"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.2.0-4989-gabdefea"
-#define VERSION_STRING " v1.2.0-4989-gabdefea"
+#define VERSION_STRING_NOSP "v1.2.0-5082-g58f7543"
+#define VERSION_STRING " v1.2.0-5082-g58f7543"
diff --git a/mips/vpx_version.h b/mips/vpx_version.h
index d3359dc..9a354bb 100644
--- a/mips/vpx_version.h
+++ b/mips/vpx_version.h
@@ -1,7 +1,7 @@
#define VERSION_MAJOR 1
#define VERSION_MINOR 2
#define VERSION_PATCH 0
-#define VERSION_EXTRA "4989-gabdefea"
+#define VERSION_EXTRA "5082-g58f7543"
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.2.0-4989-gabdefea"
-#define VERSION_STRING " v1.2.0-4989-gabdefea"
+#define VERSION_STRING_NOSP "v1.2.0-5082-g58f7543"
+#define VERSION_STRING " v1.2.0-5082-g58f7543"