Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
changeset 1:11d15c47beaf
add h264 decoder code
line diff
#! /bin/bash
#
# ffmpeg_smp/benchmark.sh
#
# Times the h264 decoder binary ("./ffmpeg") in four flavours -- serial,
# Pthreads, OmpSs 2D and OmpSs 3D -- for every input and every worker
# configuration, and appends the measured "real" times to one result file per
# flavour. The script must be started from the directory that contains the
# "build" (serial/Pthreads) and "build-ss" (OmpSs) directories.

# Per-configuration tables, all indexed by the configuration number.
workers=(1 4 8 12 16 20 24 28 32)   # NX_PES value; "-e" receives workers+1
cpus=(0 3 7 15 15 23 23 31 31)      # upper end of the numactl --physcpubind range
nodes=(0 0 0 1 1 2 2 3 3)           # upper end of the numactl --interleave range

# Arguments passed after "-t" in the Pthreads runs. The first $configs entries
# belong to input 0, the next $configs entries to input 1
# (index = conf + input_idx * configs).
confs=( "1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 8" "3 6 10" "4 7 12" "4 8 15" "5 8 17" #small
        "1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 7" "3 6 9" "4 7 12" "4 8 13" "5 10 15") #large



#confsmall=("1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 8" "3 6 10" "4 7 12" "4 8 15" "5 8 17")
# "7 10 21" "8 12 25" "10 15 29" "11 17 32")
#conflarge=("1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 7" "3 6 9" "4 7 12" "4 8 13" "5 10 15")
#"5 12 21" "6 15 25" "7 17 30" "8 19 36")


configs=9

average_ompss_2d=0
average_ompss_3d=0
average_pthread=0
average_serial=0

# Repetitions per configuration: configurations 1-4 use iterations_low,
# configurations 5..configs-1 use iterations_high.
iterations_low=4
iterations_high=8

nframes=10000 # max frames limit for debug purpose
inputs=("14" "10")
inputs_verbose=("Big Bug Bunny 1920x1080 10000 frames" "Park Joy 3840x2160 2500 frames")
# Extra decoder arguments: entry 0 is used for the 2D runs, entry 1 for the 3D runs.
osargs=("-z 8 8" "-z 12 12 --static-3d")

time_stamp=$(date +%Y.%m.%d_%H.%M.%S)
outputdir="/home/stefan.hauser/ffmpeg_smp/ppopp_results/rx600s5-1t/$time_stamp"
ompss_2d="$outputdir/ompss_2d.txt"
ompss_3d="$outputdir/ompss_3d.txt"
pthread="$outputdir/pthread.txt"
serial="$outputdir/serial.txt"

# Prints an error message ($*) on stderr and aborts the whole benchmark.
function die {
    echo "ERROR: $*" >&2
    exit 1
}

# Prints the "real" time that `time -p` wrote into file $1.
# When no usable number is found (for example because the decoder did not
# run), a warning is written to stderr and 0 is printed instead, so that the
# bc arithmetic of the callers stays valid. The average of the affected
# configuration is then unreliable, which the warning states.
function read_real_time {
    local number_re='^[0-9]*[.]?[0-9]+$'
    local value

    # Same extraction as before (strip everything up to the last "l" plus one
    # character), but with the sed program quoted so the shell cannot glob it.
    value=$(grep real "$1" | sed 's/^.*l.//g')
    if [[ ! $value =~ $number_re ]]; then
        echo "WARNING: no valid 'real' time in $PWD/$1; run counted as 0, average is unreliable" >&2
        value=0
    fi
    echo "$value"
}

# Executes the experiments for a single configuration.
#   $1  configuration number (index into workers/cpus/nodes/confs)
#   $2  number of iterations per decoder flavour
#   $3  input index (index into inputs/inputs_verbose)
# Appends every single runtime and the resulting average to the result files
# and leaves the averages in the global average_* variables. The serial file
# receives the value of the global average_serial, which is set by the caller.
function execute_single_conf {
    local conf=$1
    local iter=$2
    local iidx=$3
    local i runtime

    average_ompss_2d=0
    average_ompss_3d=0
    average_pthread=0

    echo "Workers: " "${workers[$conf]}" | tee -a "$ompss_2d" "$ompss_3d" "$pthread" "$serial"

    # A failed cd must stop the script: otherwise ./ffmpeg is started in the
    # wrong directory and the following "cd .." leaves the project tree.
    cd build-ss || die "cannot enter directory build-ss"
    for ((i = 1; i <= iter; i++)); do
        # OMPSS
        #export CSS_NUM_CPUS=$worker
        # ${osargs[0]} is intentionally unquoted: it expands to several arguments.
        NX_PES=${workers[$conf]} numactl "--interleave=0-${nodes[$conf]}" time -p ./ffmpeg -i "${inputs[$iidx]}" -n "$nframes" -e $((workers[conf] + 1)) ${osargs[0]} 2> output
        runtime=$(read_real_time output)
        average_ompss_2d=$(echo "$average_ompss_2d + $runtime" | bc)
        echo -n "$runtime" " " >> "$ompss_2d"
    done

    for ((i = 1; i <= iter; i++)); do
        # ${osargs[1]} is intentionally unquoted: it expands to several arguments.
        NX_PES=${workers[$conf]} numactl "--interleave=0-${nodes[$conf]}" time -p ./ffmpeg -i "${inputs[$iidx]}" -n "$nframes" -e $((workers[conf] + 1)) ${osargs[1]} 2> output
        runtime=$(read_real_time output)
        average_ompss_3d=$(echo "$average_ompss_3d + $runtime" | bc)
        echo -n "$runtime" " " >> "$ompss_3d"
    done
    cd .. || die "cannot leave directory build-ss"

    cd build || die "cannot enter directory build"
    for ((i = 1; i <= iter; i++)); do
        # Pthreads
        # The confs entry is intentionally unquoted: it expands to three arguments.
        numactl "--physcpubind=0-${cpus[$conf]}" time -p ./ffmpeg -i "${inputs[$iidx]}" -n "$nframes" -t ${confs[$((conf + iidx * configs))]} 2> output
        runtime=$(read_real_time output)
        average_pthread=$(echo "$average_pthread + $runtime" | bc)
        echo -n "$runtime" " " >> "$pthread"
    done
    cd .. || die "cannot leave directory build"

    # Terminate the line of single runtimes before the average is written.
    echo "" | tee -a "$pthread" "$ompss_2d" "$ompss_3d"
    average_ompss_2d=$(echo "scale=5;$average_ompss_2d/$iter" | bc)
    average_ompss_3d=$(echo "scale=5;$average_ompss_3d/$iter" | bc)
    average_pthread=$(echo "scale=5;$average_pthread/$iter" | bc)

    echo "time: " "$average_ompss_2d" >> "$ompss_2d"
    echo "time: " "$average_ompss_3d" >> "$ompss_3d"
    echo "time: " "$average_pthread" >> "$pthread"
    echo "time: " "$average_serial" >> "$serial"
}


# -p also creates missing parent directories; without the result directory
# every later write would fail, so stop right here if it cannot be created.
mkdir -p "$outputdir" || die "cannot create result directory $outputdir"

echo "Processing inputs ..."

# tee without -a: (re)creates the four result files with the headline.
echo "h264dec Benchmark" | tee "$ompss_2d" "$ompss_3d" "$pthread" "$serial"

for n in 0 1; do
    echo "Input: ${inputs_verbose[$n]}" | tee -a "$ompss_2d" "$ompss_3d" "$pthread" "$serial"
    echo "" | tee -a "$ompss_2d" "$ompss_3d" "$pthread" "$serial"

    # Serial
    cd build || die "cannot enter directory build"
    numactl --physcpubind=0 time -p ./ffmpeg -i "${inputs[$n]}" -n "$nframes" -s 2> output
    runtime=$(read_real_time output)
    average_serial=$runtime
    cd .. || die "cannot leave directory build"

    # Configuration 0 is measured with a single iteration.
    execute_single_conf 0 1 "$n"

    #Parallel
    for ((confidx = 1; confidx <= 4; confidx++)); do
        execute_single_conf "$confidx" "$iterations_low" "$n"
    done

    for ((confidx = 5; confidx < configs; confidx++)); do
        execute_single_conf "$confidx" "$iterations_high" "$n"
    done

    echo "-------------------" | tee -a "$ompss_2d" "$ompss_3d" "$pthread" "$serial"
done

echo "FINISHED"

# -f: do not complain when one of the temporary files was never created.
rm -f build/output build-ss/output
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/ffmpeg_smp/h264dec/COPYING.GPLv3 Mon Aug 27 12:09:56 2012 +0200 2.3 @@ -0,0 +1,674 @@ 2.4 + GNU GENERAL PUBLIC LICENSE 2.5 + Version 3, 29 June 2007 2.6 + 2.7 + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> 2.8 + Everyone is permitted to copy and distribute verbatim copies 2.9 + of this license document, but changing it is not allowed. 2.10 + 2.11 + Preamble 2.12 + 2.13 + The GNU General Public License is a free, copyleft license for 2.14 +software and other kinds of works. 2.15 + 2.16 + The licenses for most software and other practical works are designed 2.17 +to take away your freedom to share and change the works. By contrast, 2.18 +the GNU General Public License is intended to guarantee your freedom to 2.19 +share and change all versions of a program--to make sure it remains free 2.20 +software for all its users. We, the Free Software Foundation, use the 2.21 +GNU General Public License for most of our software; it applies also to 2.22 +any other work released this way by its authors. You can apply it to 2.23 +your programs, too. 2.24 + 2.25 + When we speak of free software, we are referring to freedom, not 2.26 +price. Our General Public Licenses are designed to make sure that you 2.27 +have the freedom to distribute copies of free software (and charge for 2.28 +them if you wish), that you receive source code or can get it if you 2.29 +want it, that you can change the software or use pieces of it in new 2.30 +free programs, and that you know you can do these things. 2.31 + 2.32 + To protect your rights, we need to prevent others from denying you 2.33 +these rights or asking you to surrender the rights. Therefore, you have 2.34 +certain responsibilities if you distribute copies of the software, or if 2.35 +you modify it: responsibilities to respect the freedom of others. 
2.36 + 2.37 + For example, if you distribute copies of such a program, whether 2.38 +gratis or for a fee, you must pass on to the recipients the same 2.39 +freedoms that you received. You must make sure that they, too, receive 2.40 +or can get the source code. And you must show them these terms so they 2.41 +know their rights. 2.42 + 2.43 + Developers that use the GNU GPL protect your rights with two steps: 2.44 +(1) assert copyright on the software, and (2) offer you this License 2.45 +giving you legal permission to copy, distribute and/or modify it. 2.46 + 2.47 + For the developers' and authors' protection, the GPL clearly explains 2.48 +that there is no warranty for this free software. For both users' and 2.49 +authors' sake, the GPL requires that modified versions be marked as 2.50 +changed, so that their problems will not be attributed erroneously to 2.51 +authors of previous versions. 2.52 + 2.53 + Some devices are designed to deny users access to install or run 2.54 +modified versions of the software inside them, although the manufacturer 2.55 +can do so. This is fundamentally incompatible with the aim of 2.56 +protecting users' freedom to change the software. The systematic 2.57 +pattern of such abuse occurs in the area of products for individuals to 2.58 +use, which is precisely where it is most unacceptable. Therefore, we 2.59 +have designed this version of the GPL to prohibit the practice for those 2.60 +products. If such problems arise substantially in other domains, we 2.61 +stand ready to extend this provision to those domains in future versions 2.62 +of the GPL, as needed to protect the freedom of users. 2.63 + 2.64 + Finally, every program is threatened constantly by software patents. 2.65 +States should not allow patents to restrict development and use of 2.66 +software on general-purpose computers, but in those that do, we wish to 2.67 +avoid the special danger that patents applied to a free program could 2.68 +make it effectively proprietary. 
To prevent this, the GPL assures that 2.69 +patents cannot be used to render the program non-free. 2.70 + 2.71 + The precise terms and conditions for copying, distribution and 2.72 +modification follow. 2.73 + 2.74 + TERMS AND CONDITIONS 2.75 + 2.76 + 0. Definitions. 2.77 + 2.78 + "This License" refers to version 3 of the GNU General Public License. 2.79 + 2.80 + "Copyright" also means copyright-like laws that apply to other kinds of 2.81 +works, such as semiconductor masks. 2.82 + 2.83 + "The Program" refers to any copyrightable work licensed under this 2.84 +License. Each licensee is addressed as "you". "Licensees" and 2.85 +"recipients" may be individuals or organizations. 2.86 + 2.87 + To "modify" a work means to copy from or adapt all or part of the work 2.88 +in a fashion requiring copyright permission, other than the making of an 2.89 +exact copy. The resulting work is called a "modified version" of the 2.90 +earlier work or a work "based on" the earlier work. 2.91 + 2.92 + A "covered work" means either the unmodified Program or a work based 2.93 +on the Program. 2.94 + 2.95 + To "propagate" a work means to do anything with it that, without 2.96 +permission, would make you directly or secondarily liable for 2.97 +infringement under applicable copyright law, except executing it on a 2.98 +computer or modifying a private copy. Propagation includes copying, 2.99 +distribution (with or without modification), making available to the 2.100 +public, and in some countries other activities as well. 2.101 + 2.102 + To "convey" a work means any kind of propagation that enables other 2.103 +parties to make or receive copies. Mere interaction with a user through 2.104 +a computer network, with no transfer of a copy, is not conveying. 
2.105 + 2.106 + An interactive user interface displays "Appropriate Legal Notices" 2.107 +to the extent that it includes a convenient and prominently visible 2.108 +feature that (1) displays an appropriate copyright notice, and (2) 2.109 +tells the user that there is no warranty for the work (except to the 2.110 +extent that warranties are provided), that licensees may convey the 2.111 +work under this License, and how to view a copy of this License. If 2.112 +the interface presents a list of user commands or options, such as a 2.113 +menu, a prominent item in the list meets this criterion. 2.114 + 2.115 + 1. Source Code. 2.116 + 2.117 + The "source code" for a work means the preferred form of the work 2.118 +for making modifications to it. "Object code" means any non-source 2.119 +form of a work. 2.120 + 2.121 + A "Standard Interface" means an interface that either is an official 2.122 +standard defined by a recognized standards body, or, in the case of 2.123 +interfaces specified for a particular programming language, one that 2.124 +is widely used among developers working in that language. 2.125 + 2.126 + The "System Libraries" of an executable work include anything, other 2.127 +than the work as a whole, that (a) is included in the normal form of 2.128 +packaging a Major Component, but which is not part of that Major 2.129 +Component, and (b) serves only to enable use of the work with that 2.130 +Major Component, or to implement a Standard Interface for which an 2.131 +implementation is available to the public in source code form. A 2.132 +"Major Component", in this context, means a major essential component 2.133 +(kernel, window system, and so on) of the specific operating system 2.134 +(if any) on which the executable work runs, or a compiler used to 2.135 +produce the work, or an object code interpreter used to run it. 
2.136 + 2.137 + The "Corresponding Source" for a work in object code form means all 2.138 +the source code needed to generate, install, and (for an executable 2.139 +work) run the object code and to modify the work, including scripts to 2.140 +control those activities. However, it does not include the work's 2.141 +System Libraries, or general-purpose tools or generally available free 2.142 +programs which are used unmodified in performing those activities but 2.143 +which are not part of the work. For example, Corresponding Source 2.144 +includes interface definition files associated with source files for 2.145 +the work, and the source code for shared libraries and dynamically 2.146 +linked subprograms that the work is specifically designed to require, 2.147 +such as by intimate data communication or control flow between those 2.148 +subprograms and other parts of the work. 2.149 + 2.150 + The Corresponding Source need not include anything that users 2.151 +can regenerate automatically from other parts of the Corresponding 2.152 +Source. 2.153 + 2.154 + The Corresponding Source for a work in source code form is that 2.155 +same work. 2.156 + 2.157 + 2. Basic Permissions. 2.158 + 2.159 + All rights granted under this License are granted for the term of 2.160 +copyright on the Program, and are irrevocable provided the stated 2.161 +conditions are met. This License explicitly affirms your unlimited 2.162 +permission to run the unmodified Program. The output from running a 2.163 +covered work is covered by this License only if the output, given its 2.164 +content, constitutes a covered work. This License acknowledges your 2.165 +rights of fair use or other equivalent, as provided by copyright law. 2.166 + 2.167 + You may make, run and propagate covered works that you do not 2.168 +convey, without conditions so long as your license otherwise remains 2.169 +in force. 
You may convey covered works to others for the sole purpose 2.170 +of having them make modifications exclusively for you, or provide you 2.171 +with facilities for running those works, provided that you comply with 2.172 +the terms of this License in conveying all material for which you do 2.173 +not control copyright. Those thus making or running the covered works 2.174 +for you must do so exclusively on your behalf, under your direction 2.175 +and control, on terms that prohibit them from making any copies of 2.176 +your copyrighted material outside their relationship with you. 2.177 + 2.178 + Conveying under any other circumstances is permitted solely under 2.179 +the conditions stated below. Sublicensing is not allowed; section 10 2.180 +makes it unnecessary. 2.181 + 2.182 + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 2.183 + 2.184 + No covered work shall be deemed part of an effective technological 2.185 +measure under any applicable law fulfilling obligations under article 2.186 +11 of the WIPO copyright treaty adopted on 20 December 1996, or 2.187 +similar laws prohibiting or restricting circumvention of such 2.188 +measures. 2.189 + 2.190 + When you convey a covered work, you waive any legal power to forbid 2.191 +circumvention of technological measures to the extent such circumvention 2.192 +is effected by exercising rights under this License with respect to 2.193 +the covered work, and you disclaim any intention to limit operation or 2.194 +modification of the work as a means of enforcing, against the work's 2.195 +users, your or third parties' legal rights to forbid circumvention of 2.196 +technological measures. 2.197 + 2.198 + 4. Conveying Verbatim Copies. 
2.199 + 2.200 + You may convey verbatim copies of the Program's source code as you 2.201 +receive it, in any medium, provided that you conspicuously and 2.202 +appropriately publish on each copy an appropriate copyright notice; 2.203 +keep intact all notices stating that this License and any 2.204 +non-permissive terms added in accord with section 7 apply to the code; 2.205 +keep intact all notices of the absence of any warranty; and give all 2.206 +recipients a copy of this License along with the Program. 2.207 + 2.208 + You may charge any price or no price for each copy that you convey, 2.209 +and you may offer support or warranty protection for a fee. 2.210 + 2.211 + 5. Conveying Modified Source Versions. 2.212 + 2.213 + You may convey a work based on the Program, or the modifications to 2.214 +produce it from the Program, in the form of source code under the 2.215 +terms of section 4, provided that you also meet all of these conditions: 2.216 + 2.217 + a) The work must carry prominent notices stating that you modified 2.218 + it, and giving a relevant date. 2.219 + 2.220 + b) The work must carry prominent notices stating that it is 2.221 + released under this License and any conditions added under section 2.222 + 7. This requirement modifies the requirement in section 4 to 2.223 + "keep intact all notices". 2.224 + 2.225 + c) You must license the entire work, as a whole, under this 2.226 + License to anyone who comes into possession of a copy. This 2.227 + License will therefore apply, along with any applicable section 7 2.228 + additional terms, to the whole of the work, and all its parts, 2.229 + regardless of how they are packaged. This License gives no 2.230 + permission to license the work in any other way, but it does not 2.231 + invalidate such permission if you have separately received it. 
2.232 + 2.233 + d) If the work has interactive user interfaces, each must display 2.234 + Appropriate Legal Notices; however, if the Program has interactive 2.235 + interfaces that do not display Appropriate Legal Notices, your 2.236 + work need not make them do so. 2.237 + 2.238 + A compilation of a covered work with other separate and independent 2.239 +works, which are not by their nature extensions of the covered work, 2.240 +and which are not combined with it such as to form a larger program, 2.241 +in or on a volume of a storage or distribution medium, is called an 2.242 +"aggregate" if the compilation and its resulting copyright are not 2.243 +used to limit the access or legal rights of the compilation's users 2.244 +beyond what the individual works permit. Inclusion of a covered work 2.245 +in an aggregate does not cause this License to apply to the other 2.246 +parts of the aggregate. 2.247 + 2.248 + 6. Conveying Non-Source Forms. 2.249 + 2.250 + You may convey a covered work in object code form under the terms 2.251 +of sections 4 and 5, provided that you also convey the 2.252 +machine-readable Corresponding Source under the terms of this License, 2.253 +in one of these ways: 2.254 + 2.255 + a) Convey the object code in, or embodied in, a physical product 2.256 + (including a physical distribution medium), accompanied by the 2.257 + Corresponding Source fixed on a durable physical medium 2.258 + customarily used for software interchange. 
2.259 + 2.260 + b) Convey the object code in, or embodied in, a physical product 2.261 + (including a physical distribution medium), accompanied by a 2.262 + written offer, valid for at least three years and valid for as 2.263 + long as you offer spare parts or customer support for that product 2.264 + model, to give anyone who possesses the object code either (1) a 2.265 + copy of the Corresponding Source for all the software in the 2.266 + product that is covered by this License, on a durable physical 2.267 + medium customarily used for software interchange, for a price no 2.268 + more than your reasonable cost of physically performing this 2.269 + conveying of source, or (2) access to copy the 2.270 + Corresponding Source from a network server at no charge. 2.271 + 2.272 + c) Convey individual copies of the object code with a copy of the 2.273 + written offer to provide the Corresponding Source. This 2.274 + alternative is allowed only occasionally and noncommercially, and 2.275 + only if you received the object code with such an offer, in accord 2.276 + with subsection 6b. 2.277 + 2.278 + d) Convey the object code by offering access from a designated 2.279 + place (gratis or for a charge), and offer equivalent access to the 2.280 + Corresponding Source in the same way through the same place at no 2.281 + further charge. You need not require recipients to copy the 2.282 + Corresponding Source along with the object code. If the place to 2.283 + copy the object code is a network server, the Corresponding Source 2.284 + may be on a different server (operated by you or a third party) 2.285 + that supports equivalent copying facilities, provided you maintain 2.286 + clear directions next to the object code saying where to find the 2.287 + Corresponding Source. Regardless of what server hosts the 2.288 + Corresponding Source, you remain obligated to ensure that it is 2.289 + available for as long as needed to satisfy these requirements. 
2.290 + 2.291 + e) Convey the object code using peer-to-peer transmission, provided 2.292 + you inform other peers where the object code and Corresponding 2.293 + Source of the work are being offered to the general public at no 2.294 + charge under subsection 6d. 2.295 + 2.296 + A separable portion of the object code, whose source code is excluded 2.297 +from the Corresponding Source as a System Library, need not be 2.298 +included in conveying the object code work. 2.299 + 2.300 + A "User Product" is either (1) a "consumer product", which means any 2.301 +tangible personal property which is normally used for personal, family, 2.302 +or household purposes, or (2) anything designed or sold for incorporation 2.303 +into a dwelling. In determining whether a product is a consumer product, 2.304 +doubtful cases shall be resolved in favor of coverage. For a particular 2.305 +product received by a particular user, "normally used" refers to a 2.306 +typical or common use of that class of product, regardless of the status 2.307 +of the particular user or of the way in which the particular user 2.308 +actually uses, or expects or is expected to use, the product. A product 2.309 +is a consumer product regardless of whether the product has substantial 2.310 +commercial, industrial or non-consumer uses, unless such uses represent 2.311 +the only significant mode of use of the product. 2.312 + 2.313 + "Installation Information" for a User Product means any methods, 2.314 +procedures, authorization keys, or other information required to install 2.315 +and execute modified versions of a covered work in that User Product from 2.316 +a modified version of its Corresponding Source. The information must 2.317 +suffice to ensure that the continued functioning of the modified object 2.318 +code is in no case prevented or interfered with solely because 2.319 +modification has been made. 
2.320 + 2.321 + If you convey an object code work under this section in, or with, or 2.322 +specifically for use in, a User Product, and the conveying occurs as 2.323 +part of a transaction in which the right of possession and use of the 2.324 +User Product is transferred to the recipient in perpetuity or for a 2.325 +fixed term (regardless of how the transaction is characterized), the 2.326 +Corresponding Source conveyed under this section must be accompanied 2.327 +by the Installation Information. But this requirement does not apply 2.328 +if neither you nor any third party retains the ability to install 2.329 +modified object code on the User Product (for example, the work has 2.330 +been installed in ROM). 2.331 + 2.332 + The requirement to provide Installation Information does not include a 2.333 +requirement to continue to provide support service, warranty, or updates 2.334 +for a work that has been modified or installed by the recipient, or for 2.335 +the User Product in which it has been modified or installed. Access to a 2.336 +network may be denied when the modification itself materially and 2.337 +adversely affects the operation of the network or violates the rules and 2.338 +protocols for communication across the network. 2.339 + 2.340 + Corresponding Source conveyed, and Installation Information provided, 2.341 +in accord with this section must be in a format that is publicly 2.342 +documented (and with an implementation available to the public in 2.343 +source code form), and must require no special password or key for 2.344 +unpacking, reading or copying. 2.345 + 2.346 + 7. Additional Terms. 2.347 + 2.348 + "Additional permissions" are terms that supplement the terms of this 2.349 +License by making exceptions from one or more of its conditions. 2.350 +Additional permissions that are applicable to the entire Program shall 2.351 +be treated as though they were included in this License, to the extent 2.352 +that they are valid under applicable law. 
If additional permissions 2.353 +apply only to part of the Program, that part may be used separately 2.354 +under those permissions, but the entire Program remains governed by 2.355 +this License without regard to the additional permissions. 2.356 + 2.357 + When you convey a copy of a covered work, you may at your option 2.358 +remove any additional permissions from that copy, or from any part of 2.359 +it. (Additional permissions may be written to require their own 2.360 +removal in certain cases when you modify the work.) You may place 2.361 +additional permissions on material, added by you to a covered work, 2.362 +for which you have or can give appropriate copyright permission. 2.363 + 2.364 + Notwithstanding any other provision of this License, for material you 2.365 +add to a covered work, you may (if authorized by the copyright holders of 2.366 +that material) supplement the terms of this License with terms: 2.367 + 2.368 + a) Disclaiming warranty or limiting liability differently from the 2.369 + terms of sections 15 and 16 of this License; or 2.370 + 2.371 + b) Requiring preservation of specified reasonable legal notices or 2.372 + author attributions in that material or in the Appropriate Legal 2.373 + Notices displayed by works containing it; or 2.374 + 2.375 + c) Prohibiting misrepresentation of the origin of that material, or 2.376 + requiring that modified versions of such material be marked in 2.377 + reasonable ways as different from the original version; or 2.378 + 2.379 + d) Limiting the use for publicity purposes of names of licensors or 2.380 + authors of the material; or 2.381 + 2.382 + e) Declining to grant rights under trademark law for use of some 2.383 + trade names, trademarks, or service marks; or 2.384 + 2.385 + f) Requiring indemnification of licensors and authors of that 2.386 + material by anyone who conveys the material (or modified versions of 2.387 + it) with contractual assumptions of liability to the recipient, for 2.388 + any 
liability that these contractual assumptions directly impose on 2.389 + those licensors and authors. 2.390 + 2.391 + All other non-permissive additional terms are considered "further 2.392 +restrictions" within the meaning of section 10. If the Program as you 2.393 +received it, or any part of it, contains a notice stating that it is 2.394 +governed by this License along with a term that is a further 2.395 +restriction, you may remove that term. If a license document contains 2.396 +a further restriction but permits relicensing or conveying under this 2.397 +License, you may add to a covered work material governed by the terms 2.398 +of that license document, provided that the further restriction does 2.399 +not survive such relicensing or conveying. 2.400 + 2.401 + If you add terms to a covered work in accord with this section, you 2.402 +must place, in the relevant source files, a statement of the 2.403 +additional terms that apply to those files, or a notice indicating 2.404 +where to find the applicable terms. 2.405 + 2.406 + Additional terms, permissive or non-permissive, may be stated in the 2.407 +form of a separately written license, or stated as exceptions; 2.408 +the above requirements apply either way. 2.409 + 2.410 + 8. Termination. 2.411 + 2.412 + You may not propagate or modify a covered work except as expressly 2.413 +provided under this License. Any attempt otherwise to propagate or 2.414 +modify it is void, and will automatically terminate your rights under 2.415 +this License (including any patent licenses granted under the third 2.416 +paragraph of section 11). 
2.417 + 2.418 + However, if you cease all violation of this License, then your 2.419 +license from a particular copyright holder is reinstated (a) 2.420 +provisionally, unless and until the copyright holder explicitly and 2.421 +finally terminates your license, and (b) permanently, if the copyright 2.422 +holder fails to notify you of the violation by some reasonable means 2.423 +prior to 60 days after the cessation. 2.424 + 2.425 + Moreover, your license from a particular copyright holder is 2.426 +reinstated permanently if the copyright holder notifies you of the 2.427 +violation by some reasonable means, this is the first time you have 2.428 +received notice of violation of this License (for any work) from that 2.429 +copyright holder, and you cure the violation prior to 30 days after 2.430 +your receipt of the notice. 2.431 + 2.432 + Termination of your rights under this section does not terminate the 2.433 +licenses of parties who have received copies or rights from you under 2.434 +this License. If your rights have been terminated and not permanently 2.435 +reinstated, you do not qualify to receive new licenses for the same 2.436 +material under section 10. 2.437 + 2.438 + 9. Acceptance Not Required for Having Copies. 2.439 + 2.440 + You are not required to accept this License in order to receive or 2.441 +run a copy of the Program. Ancillary propagation of a covered work 2.442 +occurring solely as a consequence of using peer-to-peer transmission 2.443 +to receive a copy likewise does not require acceptance. However, 2.444 +nothing other than this License grants you permission to propagate or 2.445 +modify any covered work. These actions infringe copyright if you do 2.446 +not accept this License. Therefore, by modifying or propagating a 2.447 +covered work, you indicate your acceptance of this License to do so. 2.448 + 2.449 + 10. Automatic Licensing of Downstream Recipients. 
2.450 + 2.451 + Each time you convey a covered work, the recipient automatically 2.452 +receives a license from the original licensors, to run, modify and 2.453 +propagate that work, subject to this License. You are not responsible 2.454 +for enforcing compliance by third parties with this License. 2.455 + 2.456 + An "entity transaction" is a transaction transferring control of an 2.457 +organization, or substantially all assets of one, or subdividing an 2.458 +organization, or merging organizations. If propagation of a covered 2.459 +work results from an entity transaction, each party to that 2.460 +transaction who receives a copy of the work also receives whatever 2.461 +licenses to the work the party's predecessor in interest had or could 2.462 +give under the previous paragraph, plus a right to possession of the 2.463 +Corresponding Source of the work from the predecessor in interest, if 2.464 +the predecessor has it or can get it with reasonable efforts. 2.465 + 2.466 + You may not impose any further restrictions on the exercise of the 2.467 +rights granted or affirmed under this License. For example, you may 2.468 +not impose a license fee, royalty, or other charge for exercise of 2.469 +rights granted under this License, and you may not initiate litigation 2.470 +(including a cross-claim or counterclaim in a lawsuit) alleging that 2.471 +any patent claim is infringed by making, using, selling, offering for 2.472 +sale, or importing the Program or any portion of it. 2.473 + 2.474 + 11. Patents. 2.475 + 2.476 + A "contributor" is a copyright holder who authorizes use under this 2.477 +License of the Program or a work on which the Program is based. The 2.478 +work thus licensed is called the contributor's "contributor version". 
2.479 + 2.480 + A contributor's "essential patent claims" are all patent claims 2.481 +owned or controlled by the contributor, whether already acquired or 2.482 +hereafter acquired, that would be infringed by some manner, permitted 2.483 +by this License, of making, using, or selling its contributor version, 2.484 +but do not include claims that would be infringed only as a 2.485 +consequence of further modification of the contributor version. For 2.486 +purposes of this definition, "control" includes the right to grant 2.487 +patent sublicenses in a manner consistent with the requirements of 2.488 +this License. 2.489 + 2.490 + Each contributor grants you a non-exclusive, worldwide, royalty-free 2.491 +patent license under the contributor's essential patent claims, to 2.492 +make, use, sell, offer for sale, import and otherwise run, modify and 2.493 +propagate the contents of its contributor version. 2.494 + 2.495 + In the following three paragraphs, a "patent license" is any express 2.496 +agreement or commitment, however denominated, not to enforce a patent 2.497 +(such as an express permission to practice a patent or covenant not to 2.498 +sue for patent infringement). To "grant" such a patent license to a 2.499 +party means to make such an agreement or commitment not to enforce a 2.500 +patent against the party. 2.501 + 2.502 + If you convey a covered work, knowingly relying on a patent license, 2.503 +and the Corresponding Source of the work is not available for anyone 2.504 +to copy, free of charge and under the terms of this License, through a 2.505 +publicly available network server or other readily accessible means, 2.506 +then you must either (1) cause the Corresponding Source to be so 2.507 +available, or (2) arrange to deprive yourself of the benefit of the 2.508 +patent license for this particular work, or (3) arrange, in a manner 2.509 +consistent with the requirements of this License, to extend the patent 2.510 +license to downstream recipients. 
"Knowingly relying" means you have 2.511 +actual knowledge that, but for the patent license, your conveying the 2.512 +covered work in a country, or your recipient's use of the covered work 2.513 +in a country, would infringe one or more identifiable patents in that 2.514 +country that you have reason to believe are valid. 2.515 + 2.516 + If, pursuant to or in connection with a single transaction or 2.517 +arrangement, you convey, or propagate by procuring conveyance of, a 2.518 +covered work, and grant a patent license to some of the parties 2.519 +receiving the covered work authorizing them to use, propagate, modify 2.520 +or convey a specific copy of the covered work, then the patent license 2.521 +you grant is automatically extended to all recipients of the covered 2.522 +work and works based on it. 2.523 + 2.524 + A patent license is "discriminatory" if it does not include within 2.525 +the scope of its coverage, prohibits the exercise of, or is 2.526 +conditioned on the non-exercise of one or more of the rights that are 2.527 +specifically granted under this License. You may not convey a covered 2.528 +work if you are a party to an arrangement with a third party that is 2.529 +in the business of distributing software, under which you make payment 2.530 +to the third party based on the extent of your activity of conveying 2.531 +the work, and under which the third party grants, to any of the 2.532 +parties who would receive the covered work from you, a discriminatory 2.533 +patent license (a) in connection with copies of the covered work 2.534 +conveyed by you (or copies made from those copies), or (b) primarily 2.535 +for and in connection with specific products or compilations that 2.536 +contain the covered work, unless you entered into that arrangement, 2.537 +or that patent license was granted, prior to 28 March 2007. 
2.538 + 2.539 + Nothing in this License shall be construed as excluding or limiting 2.540 +any implied license or other defenses to infringement that may 2.541 +otherwise be available to you under applicable patent law. 2.542 + 2.543 + 12. No Surrender of Others' Freedom. 2.544 + 2.545 + If conditions are imposed on you (whether by court order, agreement or 2.546 +otherwise) that contradict the conditions of this License, they do not 2.547 +excuse you from the conditions of this License. If you cannot convey a 2.548 +covered work so as to satisfy simultaneously your obligations under this 2.549 +License and any other pertinent obligations, then as a consequence you may 2.550 +not convey it at all. For example, if you agree to terms that obligate you 2.551 +to collect a royalty for further conveying from those to whom you convey 2.552 +the Program, the only way you could satisfy both those terms and this 2.553 +License would be to refrain entirely from conveying the Program. 2.554 + 2.555 + 13. Use with the GNU Affero General Public License. 2.556 + 2.557 + Notwithstanding any other provision of this License, you have 2.558 +permission to link or combine any covered work with a work licensed 2.559 +under version 3 of the GNU Affero General Public License into a single 2.560 +combined work, and to convey the resulting work. The terms of this 2.561 +License will continue to apply to the part which is the covered work, 2.562 +but the special requirements of the GNU Affero General Public License, 2.563 +section 13, concerning interaction through a network will apply to the 2.564 +combination as such. 2.565 + 2.566 + 14. Revised Versions of this License. 2.567 + 2.568 + The Free Software Foundation may publish revised and/or new versions of 2.569 +the GNU General Public License from time to time. Such new versions will 2.570 +be similar in spirit to the present version, but may differ in detail to 2.571 +address new problems or concerns. 
2.572 + 2.573 + Each version is given a distinguishing version number. If the 2.574 +Program specifies that a certain numbered version of the GNU General 2.575 +Public License "or any later version" applies to it, you have the 2.576 +option of following the terms and conditions either of that numbered 2.577 +version or of any later version published by the Free Software 2.578 +Foundation. If the Program does not specify a version number of the 2.579 +GNU General Public License, you may choose any version ever published 2.580 +by the Free Software Foundation. 2.581 + 2.582 + If the Program specifies that a proxy can decide which future 2.583 +versions of the GNU General Public License can be used, that proxy's 2.584 +public statement of acceptance of a version permanently authorizes you 2.585 +to choose that version for the Program. 2.586 + 2.587 + Later license versions may give you additional or different 2.588 +permissions. However, no additional obligations are imposed on any 2.589 +author or copyright holder as a result of your choosing to follow a 2.590 +later version. 2.591 + 2.592 + 15. Disclaimer of Warranty. 2.593 + 2.594 + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 2.595 +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 2.596 +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 2.597 +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 2.598 +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 2.599 +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 2.600 +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 2.601 +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 2.602 + 2.603 + 16. Limitation of Liability. 
2.604 + 2.605 + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 2.606 +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 2.607 +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 2.608 +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 2.609 +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 2.610 +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 2.611 +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 2.612 +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 2.613 +SUCH DAMAGES. 2.614 + 2.615 + 17. Interpretation of Sections 15 and 16. 2.616 + 2.617 + If the disclaimer of warranty and limitation of liability provided 2.618 +above cannot be given local legal effect according to their terms, 2.619 +reviewing courts shall apply local law that most closely approximates 2.620 +an absolute waiver of all civil liability in connection with the 2.621 +Program, unless a warranty or assumption of liability accompanies a 2.622 +copy of the Program in return for a fee. 2.623 + 2.624 + END OF TERMS AND CONDITIONS 2.625 + 2.626 + How to Apply These Terms to Your New Programs 2.627 + 2.628 + If you develop a new program, and you want it to be of the greatest 2.629 +possible use to the public, the best way to achieve this is to make it 2.630 +free software which everyone can redistribute and change under these terms. 2.631 + 2.632 + To do so, attach the following notices to the program. It is safest 2.633 +to attach them to the start of each source file to most effectively 2.634 +state the exclusion of warranty; and each file should have at least 2.635 +the "copyright" line and a pointer to where the full notice is found. 
2.636 + 2.637 + <one line to give the program's name and a brief idea of what it does.> 2.638 + Copyright (C) <year> <name of author> 2.639 + 2.640 + This program is free software: you can redistribute it and/or modify 2.641 + it under the terms of the GNU General Public License as published by 2.642 + the Free Software Foundation, either version 3 of the License, or 2.643 + (at your option) any later version. 2.644 + 2.645 + This program is distributed in the hope that it will be useful, 2.646 + but WITHOUT ANY WARRANTY; without even the implied warranty of 2.647 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 2.648 + GNU General Public License for more details. 2.649 + 2.650 + You should have received a copy of the GNU General Public License 2.651 + along with this program. If not, see <http://www.gnu.org/licenses/>. 2.652 + 2.653 +Also add information on how to contact you by electronic and paper mail. 2.654 + 2.655 + If the program does terminal interaction, make it output a short 2.656 +notice like this when it starts in an interactive mode: 2.657 + 2.658 + <program> Copyright (C) <year> <name of author> 2.659 + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 2.660 + This is free software, and you are welcome to redistribute it 2.661 + under certain conditions; type `show c' for details. 2.662 + 2.663 +The hypothetical commands `show w' and `show c' should show the appropriate 2.664 +parts of the General Public License. Of course, your program's commands 2.665 +might be different; for a GUI interface, you would use an "about box". 2.666 + 2.667 + You should also get your employer (if you work as a programmer) or school, 2.668 +if any, to sign a "copyright disclaimer" for the program, if necessary. 2.669 +For more information on this, and how to apply and follow the GNU GPL, see 2.670 +<http://www.gnu.org/licenses/>. 
2.671 + 2.672 + The GNU General Public License does not permit incorporating your program 2.673 +into proprietary programs. If your program is a subroutine library, you 2.674 +may consider it more useful to permit linking proprietary applications with 2.675 +the library. If this is what you want to do, use the GNU Lesser General 2.676 +Public License instead of this License. But first, please read 2.677 +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/ffmpeg_smp/h264dec/README.txt Mon Aug 27 12:09:56 2012 +0200 3.3 @@ -0,0 +1,79 @@ 3.4 +App: h264dec 3.5 + 3.6 +This application decodes H.264 raw videos. 3.7 + 3.8 +Build Sequential/Pthreads: 3.9 + 3.10 +autoreconf -i -f 3.11 +mkdir build 3.12 +cd build 3.13 +../configure --enable-ssse3 --enable-sdl2 3.14 +make 3.15 + 3.16 +Build OmpSs: 3.17 + 3.18 +autoreconf -i -f 3.19 +mkdir build-ss 3.20 +cd build-ss 3.21 +../configure CC=sscc --enable-ssse3 --enable-sdl2 3.22 +make 3.23 + 3.24 +ssse3 enables assembler optimizations up to ssse3 (optional) 3.25 +sdl2 enables a rudimentary viewing capability (optional) 3.26 + 3.27 +Usage Sequential/Pthreads: 3.28 +./h264dec -i $(INPUT_VIDEO) -s 3.29 +./h264dec -i $(INPUT_VIDEO) -t $(THREADS) 3.30 + 3.31 +Usage OmpSs: 3.32 +NX_PES=<numthreads> ./h264dec -i <inputfile> -e <num parallel entropy frames> -z <width> <height> --static-3d 3.33 + 3.34 +-e specifies the number of entropy decode pipeline buffers and should ideally be 3.35 +the same as the number of threads. 3.36 + 3.37 +-z sets the MB reconstruction grouped block size. A size between 6 by 6 and 10 by 10 3.38 +was found to strike a good balance between overhead and parallelism, but is machine and input 3.39 +dependent. 3.40 + 3.41 +--static-3d performs overlapping wavefront decoding. 3.42 + 3.43 +General usage: 3.44 +-d displays output 3.45 +-f fullscreen 3.46 +-o $(OUT_FILE) write raw YUV 3.47 +-v show framerate 3.48 + 3.49 + 3.50 +The INPUT_VIDEOs are in "inputs_encore", but the decoder should be able to decode any raw H.264 stream using 3.51 +one slice per frame, non-interlaced, and CABAC, YUV420. 3.52 + 3.53 + 3.54 +Integrated OmpSs player demo 3.55 +---------------------------- 3.56 +NOTE: for the player demo SDL2 must be installed. 3.57 + 3.58 +1. Go to the OmpSs build directory (/home/cchi/Projects/ffmpeg_smp/build-ss) 3.59 + 3.60 +2.
Launch the H.264 decoder with the desired options: 3.61 + 3.62 +NX_PES=<numthreads> ./h264dec -i <inputfile> -v (verbose) -e <num parallel entropy frames> -z <width> <height> -d (display) -f (fullscreen) 3.63 + 3.64 +note that <num parallel entropy frames> should be equal to or higher than <numthreads> for optimal performance 3.65 + 3.66 +Examples: 3.67 + 3.68 +NX_PES=7 ./h264dec -i ../../h264_movies/park_joy_2160px5.h264 -v -z 8 8 -df -e 9 3.69 +NX_PES=7 ./h264dec -i ../../h264_movies/big_buck_bunny_1080p24.h264 -v -d -z 6 6 -e 9 3.70 + 3.71 +Interacting with the program 3.72 +---------------------------- 3.73 +<CTRL+F> Fullscreen mode 3.74 +<ESCAPE> Window mode 3.75 +<SPACE> Pause/resume 3.76 +<M> Show/hide macroblock borders 3.77 +<arrows> Resize the macroblocks while macroblock borders are shown 3.78 +<ALT+F4> Close 3.79 + 3.80 +Force close in case of lockup 3.81 +----------------------------- 3.82 +On a terminal: killall -9 h264dec
#                                               -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
#
# configure.ac for the h264dec application (ffmpeg_smp/h264dec).
# Selects the compiler flavour (plain C vs. the OmpSs compiler sscc), probes
# optional libraries (SDL2, X11, SDL_ttf, OpenCL), detects the build CPU and
# exposes the ARCH_* / HAVE_* switches through config.h.

AC_PREREQ(2.61)
AC_INIT([h264_mt], [0.1], [cchi@cs.tu-berlin.de])
#AM_INIT_AUTOMAKE(AC_PACKAGE_NAME, AC_PACKAGE_VERSION)
AM_INIT_AUTOMAKE([-Wall -Werror foreign])

AC_CONFIG_SRCDIR([h264dec.c])
AC_PROG_RANLIB

# Checks for programs.
AC_GNU_SOURCE
AC_PROG_CC
# $CC is quoted because it may be empty or hold several words, and an unquoted
# test would then fail with a shell syntax error instead of comparing.
AM_CONDITIONAL([HAVE_OMPSS], [test "$CC" = "sscc"])
# Default first, conditional override second. This file relies throughout on a
# later AC_DEFINE of the same symbol taking precedence over an earlier one.
AC_DEFINE([OMPSS], [0], [Define to 1 on when using the OmpSs compiler sscc])
if test "$CC" = "sscc"; then
AC_DEFINE([OMPSS], [1], [Define to 1 on when using the OmpSs compiler sscc])
fi

#if [ test -n "${CFLAGS+x}" ] ; then
#	CFLAGS="-O3 -g"
#fi

# Checks for libraries.
AC_CHECK_LIB([pthread], [pthread_yield])
AC_CHECK_LIB([spe2], [spe_image_open])
AC_CHECK_LIB([sync], [mutex_init])
AC_CHECK_LIB([rt], [clock_gettime])

# Optional libraries. A requested but missing library aborts configure through
# AC_MSG_ERROR; the previous echo/exit pair had no command separator, so the
# exit was printed as text and configure carried on.
AC_ARG_ENABLE([sdl2], AS_HELP_STRING([--enable-sdl2], [Enable SDL2 playback]))
if test "$enable_sdl2" = "yes"; then
	AC_CHECK_LIB([SDL2], [SDL_CreateWindow], [], [AC_MSG_ERROR([libSDL2 required for playback.])])
fi

if test "$enable_sdl2" = "yes"; then
	AC_CHECK_LIB([X11], [XInitThreads], [], [AC_MSG_ERROR([libX11 currently required for SDL2 workaround.])])
fi

AC_ARG_ENABLE([sdl_ttf], AS_HELP_STRING([--enable-sdl_ttf], [Enable SDL_ttf for overlaying fonts]))
if test "$enable_sdl_ttf" = "yes"; then
	AC_CHECK_LIB([SDL_ttf], [TTF_Init], [], [AC_MSG_ERROR([libSDL_ttf required for font rendering.])])
fi



AC_ARG_ENABLE([opencl], AS_HELP_STRING([--enable-opencl], [Enable GPU decoder]))
if test "$enable_opencl" = "yes"; then
	AC_CHECK_LIB([OpenCL], [clGetPlatformIDs], [], [AC_MSG_ERROR([libOpenCL required for GPU functionality.])])
fi
AM_CONDITIONAL([HAVE_OPENCL], [test "$enable_opencl" = "yes"])


# Checks for header files.
AC_HEADER_STDC
AC_CHECK_HEADERS([stdint.h stdlib.h string.h unistd.h])

# Checks for typedefs, structures, and compiler characteristics.
AC_C_CONST
AC_TYPE_UINT32_T
AC_TYPE_UINT64_T
AC_TYPE_UINT8_T
AC_C_VOLATILE
AC_C_BIGENDIAN

# Checks for library functions.
AC_CHECK_FUNCS([malloc realloc memalign posix_memalign memmove memset])

AC_CANONICAL_HOST
AC_CANONICAL_BUILD

AC_MSG_CHECKING([for architecture])

# Every ARCH_* symbol defaults to 0; the case statement below raises the one
# that matches the build CPU.
AC_DEFINE([ARCH_ARM], [0], [Define to 1 on arm architectures.])
AC_DEFINE([ARCH_X86_32], [0], [Define to 1 on x86 architectures.])
AC_DEFINE([ARCH_X86_64], [0], [Define to 1 on x86_64 architectures.])
AC_DEFINE([ARCH_X86], [ARCH_X86_32 ||ARCH_X86_64], [True on x86])
AC_DEFINE([ARCH_PPC], [0], [Define to 1 on ppc architectures.])
AC_DEFINE([ARCH_PPC64], [0], [Define to 1 on ppc64 architectures.])
AC_DEFINE([ARCH_CELL], [0], [Define to 1 on cell architectures.])

# NOTE(review): detection keys on build_cpu, not host_cpu, so a cross build
# describes the build machine - confirm that this is intended.
if test "$enable_optimizations" != "no"; then
	case $build_cpu in
	arm )
		arch="arm"
		AC_MSG_RESULT([arm])
		AC_DEFINE([ARCH_ARM], [1], [Define to 1 on arm architectures.])
		;;
	i686 )
		arch="x86"
		AC_MSG_RESULT([x86])
		AC_DEFINE([ARCH_X86_32], [1], [Define to 1 on x86 architectures.])
		;;
	x86_64 )
		arch="x86_64"
		AC_MSG_RESULT([x86_64])
		AC_DEFINE([ARCH_X86_64], [1], [Define to 1 on x86 architectures.])
		;;
	powerpc64 )
		AC_DEFINE([HAVE_BIGENDIAN], [1], [Define to 1 on bigendian architectures.])
		if grep -E ^cpu /proc/cpuinfo | grep -q Cell ; then
			arch="cell"
			AC_MSG_RESULT([cell])
			AC_DEFINE([ARCH_CELL], [1], [Define to 1 on cell architectures.])
		else
			arch="powerpc64"
			AC_MSG_RESULT([ppc64])
			AC_DEFINE([ARCH_PPC64], [1], [Define to 1 on ppc64 architectures.])
		fi
		;;
	* )
		AC_MSG_RESULT([default (little endian).])
		;;
	esac
else
	# Close the pending AC_MSG_CHECKING line when detection is skipped.
	AC_MSG_RESULT([not checked (optimizations disabled)])
fi

# arch stays unset for unknown CPUs and with --disable-optimizations, so it
# has to be quoted here.
AM_CONDITIONAL([HAVE_CELL], [test "$arch" = "cell"])

# Additional options
AC_ARG_ENABLE([optimizations], AS_HELP_STRING([--disable-optimizations], [Disable all architecture specific optimizations. Compiler optimizations are not disabled.]))

# SIMD switches: all default to 0 and are raised by the --enable-* options.
AC_DEFINE([HAVE_SSE], [0], [Define to 1 to enable sse optimizations.])
AC_DEFINE([HAVE_MMX], [0], [Define to 1 to enable mmx optimizations.])
AC_DEFINE([HAVE_MMX2], [0], [Define to 1 to enable mmx2 optimizations.])
AC_DEFINE([HAVE_SSSE3], [0], [Define to 1 to enable ssse3 optimizations.])
AC_DEFINE([HAVE_ALTIVEC], [0], [Define to 1 to enable altivec optimizations.])
AC_DEFINE([HAVE_NEON], [0], [Define to 1 to enable neon optimizations.])

AC_ARG_ENABLE([ssse3], AS_HELP_STRING([--enable-ssse3], [Enable ssse3 optimizations]))
if test "$enable_ssse3" = "yes"; then
	AC_DEFINE([HAVE_SSSE3], [1], [Define to 1 to enable ssse3 optimizations.])
	AC_DEFINE([HAVE_SSE], [1], [Define to 1 to enable sse optimizations.])
	AC_DEFINE([HAVE_MMX], [1], [Define to 1 to enable mmx optimizations.])
	AC_DEFINE([HAVE_MMX2], [1], [Define to 1 to enable mmx2 optimizations.])
	ARCH_SUBDIR=x86
fi

AC_ARG_ENABLE([sse], AS_HELP_STRING([--enable-sse], [Enable sse optimizations]))
if test "$enable_sse" = "yes"; then
	AC_DEFINE([HAVE_SSE], [1], [Define to 1 to enable sse optimizations.])
	AC_DEFINE([HAVE_MMX], [1], [Define to 1 to enable mmx optimizations.])
	AC_DEFINE([HAVE_MMX2], [1], [Define to 1 to enable mmx2 optimizations.])
	ARCH_SUBDIR=x86
fi

AC_ARG_ENABLE([altivec], AS_HELP_STRING([--enable-altivec], [Enable altivec optimizations]))
if test "$enable_altivec" = "yes"; then
	AC_DEFINE([HAVE_ALTIVEC], [1], [Define to 1 to enable altivec optimizations.])
	ARCH_SUBDIR="$ARCH_SUBDIR ppc"
	# Probe altivec.h with -maltivec, then restore the caller's CFLAGS.
	TMPCLAGS=$CFLAGS
	CFLAGS="$CFLAGS -maltivec"
	AC_CHECK_HEADERS(altivec.h)
	CFLAGS=$TMPCLAGS
fi

AC_ARG_ENABLE([neon], AS_HELP_STRING([--enable-neon], [Enable neon optimizations]))
if test "$enable_neon" = "yes"; then
	AC_DEFINE([HAVE_NEON], [1], [Define to 1 to enable neon optimizations.])
	ARCH_SUBDIR=arm
fi

AM_CONDITIONAL([HAVE_ARCH_SUBDIR], [test "$ARCH_SUBDIR" != ""])
AC_SUBST([ARCH_SUBDIR])

# The HAVE_NEON default of 0 is already set with the other SIMD defaults above.
# It must not be repeated here: a second define after the --enable-neon block
# would put HAVE_NEON back to 0 and make that option ineffective.

AC_CONFIG_HEADER([config.h])

AC_CONFIG_FILES([Makefile libavutil/Makefile libavcodec/Makefile libavcodec/x86/Makefile libavcodec/ppc/Makefile libavcodec/cell/Makefile])

AC_OUTPUT
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/ffmpeg_smp/h264dec/h264dec.c Mon Aug 27 12:09:56 2012 +0200 5.3 @@ -0,0 +1,288 @@ 5.4 +/* 5.5 +* H264 decoder main 5.6 +*/ 5.7 + 5.8 +#include "config.h" 5.9 +#include "libavcodec/h264.h" 5.10 + 5.11 +#include <string.h> 5.12 +#include <stdlib.h> 5.13 +#include <errno.h> 5.14 +#include <signal.h> 5.15 +#include <unistd.h> 5.16 +#include <getopt.h> 5.17 +#include <fcntl.h> 5.18 + 5.19 +#include <sys/types.h> 5.20 +#include <sys/time.h> 5.21 +#include <sys/resource.h> 5.22 +#include <time.h> 5.23 + 5.24 +#include <assert.h> 5.25 + 5.26 + 5.27 +static const char program_name[] = "h264dec"; 5.28 +static const int program_birth_year = 2010; 5.29 + 5.30 +static const char *file_name; 5.31 +static int ifile, ofile; 5.32 +static int no_arch =0; 5.33 +static int parallel = 1; 5.34 +static int frame_width = 0; 5.35 +static int frame_height = 0; 5.36 + 5.37 +static void av_exit(int ret) 5.38 +{ 5.39 + //do some free calls 5.40 +#undef exit 5.41 + exit(ret); 5.42 +} 5.43 + 5.44 +static void opt_input_file(const char *filename) 5.45 +{ 5.46 + /* open the input file */ 5.47 + ifile = open(filename, O_RDONLY, 0666); 5.48 + if (ifile < 0){ 5.49 + fprintf(stderr, "Failed to open %s\n", filename); 5.50 + av_exit(-1); 5.51 + } 5.52 + 5.53 + //parse first frame to get resolution (other information available but not used) 5.54 + H264Slice slice; 5.55 + PictureInfo pi; 5.56 + GetBitContext gb = {0,}; 5.57 + ParserContext *pc; 5.58 + NalContext *nc; 5.59 + 5.60 + pc = get_parse_context(ifile); 5.61 + nc = get_nal_context(0, 0); 5.62 + 5.63 + memset(&slice, 0, sizeof(H264Slice)); 5.64 + slice.current_picture_info=π 5.65 + 5.66 + av_read_frame_internal(pc, &gb); 5.67 + decode_nal_units(nc, &slice, &gb); 5.68 + 5.69 + frame_width = nc->width; 5.70 + frame_height= nc->height; 5.71 + 5.72 + //clean up 5.73 + av_freep(&gb.raw); 5.74 + if (gb.rbsp) 5.75 + av_freep(&gb.rbsp); 5.76 + free_parse_context(pc); 5.77 + free_nal_context(nc); 
5.78 + 5.79 + //rewind file 5.80 + int offset; 5.81 + if ( (offset=lseek(ifile, 0, SEEK_SET)) ){ 5.82 + fprintf(stderr, "Rewind input file %s failed at offset %d\n", filename, offset); 5.83 + } 5.84 + 5.85 +} 5.86 + 5.87 +static void opt_output_file(const char *filename) 5.88 +{ 5.89 + if (filename){ 5.90 + if (!strcmp(filename, "-")) 5.91 + filename = "pipe:"; 5.92 + 5.93 + ofile = open(filename, O_CREAT | O_TRUNC | O_WRONLY, 0666); 5.94 + }else{ 5.95 + ofile =0; 5.96 + } 5.97 +} 5.98 + 5.99 +static void show_usage(void) 5.100 +{ 5.101 + printf("usage: ffmpeg [options] -i infile }...\n"); 5.102 + printf("\n"); 5.103 +} 5.104 + 5.105 +static struct option long_options[] = { 5.106 + {"static-sched", 0, 0, 0}, 5.107 + {"static-mbd", 0, 0, 0}, 5.108 + {"numamap", 0, 0, 0}, 5.109 + {"no-mbd", 0, 0, 0}, 5.110 + {"static-3d", 0, 0, 0}, 5.111 + {"slice-bufs", 1, 0, 0}, 5.112 + {"smt", 0, 0, 0}, 5.113 + {"noarch", 0, 0, 'a'}, 5.114 + {"display", 0, 0, 'd'}, 5.115 + {"fullscreen", 0, 0, 'f'}, 5.116 + {"numframes", 1, 0, 'n'}, 5.117 + {"use-ppe-ed", 1, 0, 'p'}, 5.118 + {"sequential", 0, 0, 's'}, 5.119 + {"threads", 1, 0, 't'}, 5.120 + {"verbose", 1, 0, 'v'}, 5.121 + {"wave-order", 1, 0, 'w'}, 5.122 + {"smb-size", 1, 0, 'z'}, 5.123 + {"pipe-bufs", 1, 0, 'e'}, 5.124 + {0, 0, 0, 0} 5.125 +}; 5.126 + 5.127 +static h264_options cli_opts; 5.128 +static void parse_cmd(int argc, char **argv) 5.129 +{ 5.130 + int c; 5.131 + int digit_optind = 0; 5.132 + int option_index = 0; 5.133 + char ofile_name[1024]; 5.134 + extern char *optarg; 5.135 + extern int optind, optopt; 5.136 + 5.137 + cli_opts.statsched =0; 5.138 + cli_opts.numamap =0; 5.139 + cli_opts.statmbd =0; 5.140 + cli_opts.no_mbd= 0; 5.141 + cli_opts.numframes = INT_MAX; 5.142 + cli_opts.display=0; 5.143 + cli_opts.fullscreen=0; 5.144 + cli_opts.verbose=0; 5.145 + cli_opts.ppe_ed=0; 5.146 + cli_opts.profile=0; 5.147 + cli_opts.threads = 1; 5.148 + cli_opts.smb_size[0] = cli_opts.smb_size[1] = 1; 5.149 + cli_opts.wave_order=0; 
5.150 + cli_opts.static_3d=0; 5.151 + cli_opts.pipe_bufs=8; 5.152 + cli_opts.slice_bufs=1; 5.153 + cli_opts.smt= 0; 5.154 + while ((c = getopt_long(argc, argv, "ade:fi:n:o:p:st:vwz:", long_options, &option_index)) != -1 ){ 5.155 + int this_option_optind = optind ? optind : 1; 5.156 + 5.157 + switch (c){ 5.158 + case 0: 5.159 + if (option_index==0){ 5.160 + cli_opts.statsched=1; 5.161 + }else if (option_index==1){ 5.162 + cli_opts.statmbd= 1; 5.163 + }else if (option_index==2){ 5.164 + cli_opts.numamap= 1; 5.165 + }else if (option_index==3){ 5.166 + cli_opts.no_mbd= 1; 5.167 + }else if (option_index==4){ 5.168 + cli_opts.static_3d= 1; 5.169 + }else if (option_index==5){ 5.170 + cli_opts.slice_bufs= (unsigned) atoi(optarg); 5.171 + }else if (option_index==6){ 5.172 + cli_opts.smt= 1; 5.173 + } 5.174 + break; 5.175 + case '0': 5.176 + case '1': 5.177 + case '2': 5.178 + if (digit_optind != 0 && digit_optind != this_option_optind) 5.179 + printf("digits occur in two different argv-elements.\n"); 5.180 + digit_optind = this_option_optind; 5.181 + printf("option %c\n", c); 5.182 + break; 5.183 + case 'a': 5.184 + no_arch=1; 5.185 + break; 5.186 + case 'd': 5.187 + cli_opts.display=1; 5.188 + break; 5.189 + case 'f': 5.190 + cli_opts.fullscreen=1; 5.191 + break; 5.192 + case 'i': 5.193 + file_name = (const char *)optarg; 5.194 + opt_input_file(file_name); 5.195 + break; 5.196 + case 'n': 5.197 + cli_opts.numframes = (unsigned) atoi(optarg); 5.198 + break; 5.199 + case 'o': 5.200 + strcpy(ofile_name, optarg); 5.201 + opt_output_file(ofile_name); 5.202 + break; 5.203 + case 'p': 5.204 + cli_opts.profile = (unsigned) atoi(optarg); 5.205 + break; 5.206 + case 's': 5.207 + cli_opts.threads = 0; 5.208 + parallel = 0; 5.209 + break; 5.210 + case 't': 5.211 + cli_opts.threads = atoi(optarg); 5.212 + if (cli_opts.threads<=0){ 5.213 + fprintf(stderr, "Option -%c requires thread numbers > 0\n", c); 5.214 + av_exit(-1); 5.215 + } 5.216 + break; 5.217 + case 'v': 5.218 + 
cli_opts.verbose = 1; 5.219 + break; 5.220 + case 'w': 5.221 + cli_opts.wave_order = 1; 5.222 + break; 5.223 + case 'z': // only useful in ompss 5.224 + if (argc < optind +1){ 5.225 + fprintf(stderr, "Option -%c (--smb-size) requires 2 arguments\n", c); 5.226 + av_exit(-1); 5.227 + } 5.228 + optind--; 5.229 + for (int i=0; i<2; i++){ 5.230 + cli_opts.smb_size[i] = atoi(argv[optind++]); 5.231 + if (!(cli_opts.smb_size > 0)){ 5.232 + fprintf(stderr, "Option -%c (--smb-size) requires dimensions > 0\n", c); 5.233 + av_exit(-1); 5.234 + } 5.235 + } 5.236 + break; 5.237 + case 'e': 5.238 + cli_opts.pipe_bufs = atoi(optarg); 5.239 + break; 5.240 + case ':': 5.241 + fprintf(stderr, "Option -%c requires an operand\n", optopt); 5.242 + av_exit(-1); 5.243 + break; 5.244 + case '?': 5.245 + fprintf(stderr, "Unrecognized option: -%c\n", optopt); 5.246 + av_exit(-1); 5.247 + break; 5.248 + } 5.249 + } 5.250 + 5.251 +} 5.252 + 5.253 +int main(int argc, char **argv) 5.254 +{ 5.255 + /* parse options */ 5.256 + parse_cmd(argc, argv); 5.257 + 5.258 + if(!ifile ) { 5.259 + show_usage(); 5.260 + av_exit(1); 5.261 + } 5.262 + 5.263 + H264Context *h = get_h264dec_context(file_name, ifile, ofile, frame_width, frame_height, &cli_opts); 5.264 +#if OMPSS 5.265 + if (h264_decode_ompss( h ) < 0) 5.266 + av_exit(-1); 5.267 +#else 5.268 + if (parallel){ 5.269 + if (ARCH_CELL && !no_arch){ 5.270 + if (h264_decode_cell( h ) < 0) 5.271 + av_exit(-1); 5.272 + }else{ 5.273 + if (h264_decode_pthread( h ) < 0) 5.274 + av_exit(1); 5.275 + } 5.276 + }else{ 5.277 + if (ARCH_CELL && !no_arch){ 5.278 + if (h264_decode_cell_seq( h ) < 0) 5.279 + av_exit(1); 5.280 + }else{ 5.281 + if (h264_decode_seq( h ) < 0) 5.282 + av_exit(1); 5.283 + } 5.284 + } 5.285 +#endif 5.286 + free_h264dec_context(h); 5.287 + close(ifile); 5.288 + close(ofile); 5.289 + 5.290 + return 0; 5.291 +}
/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * ARM NEON inline-assembly versions of the VMUL2/VMUL4/VMUL2S/VMUL4S helpers.
 * Each helper looks up 2 or 4 words in the table v using bit fields of idx,
 * multiplies them by *scale, stores the products at dst and returns dst
 * advanced past the stored elements.
 */

#ifndef AVCODEC_ARM_AAC_H
#define AVCODEC_ARM_AAC_H

#include "config.h"

#if HAVE_NEON && HAVE_INLINE_ASM

/* Self-referential define; presumably lets generic code detect this override
   with #ifndef - confirm against the including file. */
#define VMUL2 VMUL2
/*
 * Stores v[idx & 15] * *scale and v[(idx >> 4) & 15] * *scale at dst.
 *
 * dst   destination; the store carries a :64 alignment qualifier
 * v     table of 32-bit words, indexed by the extracted fields
 * idx   two 4-bit table indices in bits 0-3 and 4-7
 * scale pointer to the single multiplier, broadcast to both lanes
 * Returns dst after the post-incrementing store of two elements.
 */
static inline float *VMUL2(float *dst, const float *v, unsigned idx,
                           const float *scale)
{
    unsigned v0, v1;
    /* operands: %0 v0, %1 v1, %2 dst, %3 v, %4 idx, %5 scale */
    __asm__ volatile ("ubfx %0, %4, #0, #4 \n\t"
                      "ubfx %1, %4, #4, #4 \n\t"
                      "ldr %0, [%3, %0, lsl #2] \n\t"
                      "ldr %1, [%3, %1, lsl #2] \n\t"
                      "vld1.32 {d1[]}, [%5,:32] \n\t"
                      "vmov d0, %0, %1 \n\t"
                      "vmul.f32 d0, d0, d1 \n\t"
                      "vst1.32 {d0}, [%2,:64]! \n\t"
                      : "=&r"(v0), "=&r"(v1), "+r"(dst)
                      : "r"(v), "r"(idx), "r"(scale)
                      : "d0", "d1");
    return dst;
}

#define VMUL4 VMUL4
/*
 * Four-element variant of VMUL2: idx holds four 2-bit table indices in
 * bits 0-1, 2-3, 4-5 and 6-7. The four products are stored at dst (store
 * carries a :128 alignment qualifier).
 * Returns dst after the post-incrementing store of four elements.
 */
static inline float *VMUL4(float *dst, const float *v, unsigned idx,
                           const float *scale)
{
    unsigned v0, v1, v2, v3;
    /* operands: %0-%3 v0-v3, %4 dst, %5 v, %6 idx, %7 scale */
    __asm__ volatile ("ubfx %0, %6, #0, #2 \n\t"
                      "ubfx %1, %6, #2, #2 \n\t"
                      "ldr %0, [%5, %0, lsl #2] \n\t"
                      "ubfx %2, %6, #4, #2 \n\t"
                      "ldr %1, [%5, %1, lsl #2] \n\t"
                      "ubfx %3, %6, #6, #2 \n\t"
                      "ldr %2, [%5, %2, lsl #2] \n\t"
                      "vmov d0, %0, %1 \n\t"
                      "ldr %3, [%5, %3, lsl #2] \n\t"
                      "vld1.32 {d2[],d3[]},[%7,:32] \n\t"
                      "vmov d1, %2, %3 \n\t"
                      "vmul.f32 q0, q0, q1 \n\t"
                      "vst1.32 {q0}, [%4,:128]! \n\t"
                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst)
                      : "r"(v), "r"(idx), "r"(scale)
                      : "d0", "d1", "d2", "d3");
    return dst;
}

#define VMUL2S VMUL2S
/*
 * Like VMUL2, but XORs a sign mask into the two looked-up words before the
 * multiply: bit 1 of sign is moved to bit 31 of the first element's mask
 * (sign << 30 with bit 30 cleared) and bit 0 of sign to bit 31 of the second
 * element's mask (sign << 31).
 * Returns dst after the post-incrementing store of two elements.
 */
static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
                            unsigned sign, const float *scale)
{
    unsigned v0, v1, v2, v3;
    /* operands: %0-%3 v0-v3, %4 dst, %5 v, %6 idx, %7 scale, %8 sign;
       v2/v3 serve as scratch for the two sign masks */
    __asm__ volatile ("ubfx %0, %6, #0, #4 \n\t"
                      "ubfx %1, %6, #4, #4 \n\t"
                      "ldr %0, [%5, %0, lsl #2] \n\t"
                      "lsl %2, %8, #30 \n\t"
                      "ldr %1, [%5, %1, lsl #2] \n\t"
                      "lsl %3, %8, #31 \n\t"
                      "vmov d0, %0, %1 \n\t"
                      "bic %2, %2, #1<<30 \n\t"
                      "vld1.32 {d1[]}, [%7,:32] \n\t"
                      "vmov d2, %2, %3 \n\t"
                      "veor d0, d0, d2 \n\t"
                      "vmul.f32 d0, d0, d1 \n\t"
                      "vst1.32 {d0}, [%4,:64]! \n\t"
                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst)
                      : "r"(v), "r"(idx), "r"(scale), "r"(sign)
                      : "d0", "d1", "d2");
    return dst;
}

#define VMUL4S VMUL4S
/*
 * Like VMUL4, with a per-element sign mask taken from the top bit of sign.
 *
 * idx >> 12 is bit-reversed into nz so that idx bits 12, 13 and 14 are
 * shifted out into the carry flag one after another. Each element uses the
 * current bit 31 of sign as its XOR mask; after elements 0-2, sign is shifted
 * left by one only when the matching idx bit was set (lslcs). Element 3 uses
 * whatever bit 31 of sign holds at that point.
 * Presumably idx bits 12 and up flag the non-zero elements (the scratch
 * register is named nz) - confirm against the caller.
 * Returns dst after the post-incrementing store of four elements.
 */
static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
                            unsigned sign, const float *scale)
{
    unsigned v0, v1, v2, v3, nz;
    /* operands: %0-%3 v0-v3, %4 dst, %5 sign (read/write), %6 nz,
       %7 v, %8 idx, %9 scale */
    /* NOTE(review): lsls updates the condition flags but "cc" is not in the
       clobber list - confirm whether the toolchain requires it here. */
    __asm__ volatile ("vld1.32 {d2[],d3[]},[%9,:32] \n\t"
                      "ubfx %0, %8, #0, #2 \n\t"
                      "ubfx %1, %8, #2, #2 \n\t"
                      "ldr %0, [%7, %0, lsl #2] \n\t"
                      "ubfx %2, %8, #4, #2 \n\t"
                      "ldr %1, [%7, %1, lsl #2] \n\t"
                      "ubfx %3, %8, #6, #2 \n\t"
                      "ldr %2, [%7, %2, lsl #2] \n\t"
                      "vmov d0, %0, %1 \n\t"
                      "ldr %3, [%7, %3, lsl #2] \n\t"
                      "lsr %6, %8, #12 \n\t"
                      "rbit %6, %6 \n\t"
                      "vmov d1, %2, %3 \n\t"
                      "lsls %6, %6, #1 \n\t"
                      "and %0, %5, #1<<31 \n\t"
                      "lslcs %5, %5, #1 \n\t"
                      "lsls %6, %6, #1 \n\t"
                      "and %1, %5, #1<<31 \n\t"
                      "lslcs %5, %5, #1 \n\t"
                      "lsls %6, %6, #1 \n\t"
                      "and %2, %5, #1<<31 \n\t"
                      "lslcs %5, %5, #1 \n\t"
                      "vmov d4, %0, %1 \n\t"
                      "and %3, %5, #1<<31 \n\t"
                      "vmov d5, %2, %3 \n\t"
                      "veor q0, q0, q2 \n\t"
                      "vmul.f32 q0, q0, q1 \n\t"
                      "vst1.32 {q0}, [%4,:128]! \n\t"
                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
                        "+r"(sign), "=r"(nz)
                      : "r"(v), "r"(idx), "r"(scale)
                      : "d0", "d1", "d2", "d3", "d4", "d5");
    return dst;
}

#endif /* HAVE_NEON && HAVE_INLINE_ASM */

#endif /* AVCODEC_ARM_AAC_H */
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * libavcodec/arm/asm.S: preprocessor and assembler macros shared by the
 * ARM assembly sources (they pull this file in with #include "asm.S").
 */

#include "config.h"

/*
 * ELF prefixes directives that are only meaningful for ELF objects.
 * On ELF targets it expands to nothing; elsewhere it expands to the
 * assembler comment character so the rest of the line is ignored.
 */
#ifdef __ELF__
#   define ELF
#else
#   define ELF @
#endif

/*
 * Emit EABI build attribute 24 (Tag_ABI_align_needed in the ARM ABI
 * addenda): the code requires 8-byte stack alignment from its callers.
 */
        .macro  require8, val=1
ELF     .eabi_attribute 24, \val
        .endm

/*
 * Emit EABI build attribute 25 (Tag_ABI_align_preserved in the ARM ABI
 * addenda): the code preserves 8-byte stack alignment.
 */
        .macro  preserve8, val=1
ELF     .eabi_attribute 25, \val
        .endm

/*
 * function name, export=0
 *
 * Open a function: emits the label \name, marks it as %function on ELF
 * and starts a .func block.  With export=1 an additional global label
 * EXTERN_ASM\name is emitted (EXTERN_ASM is not defined in this file;
 * presumably it comes from config.h and carries the C symbol prefix).
 *
 * The macro also defines a one-shot "endfunc" macro that records the
 * symbol size, closes the .func block and then removes itself with
 * .purgem so the next "function" can define it again.
 */
        .macro  function name, export=0
    .macro endfunc
ELF     .size   \name, . - \name
        .endfunc
        .purgem endfunc
    .endm
.if \export
        .global EXTERN_ASM\name
EXTERN_ASM\name:
.endif
ELF     .type   \name, %function
        .func   \name
\name:
        .endm

/*
 * movrel rd, val: load the address/value \val into \rd.
 * Uses a movw/movt pair when ARMv6T2 is available and the build is not
 * position independent; otherwise falls back to a literal-pool load.
 */
        .macro  movrel rd, val
#if HAVE_ARMV6T2 && !CONFIG_PIC
        movw    \rd, #:lower16:\val
        movt    \rd, #:upper16:\val
#else
        ldr     \rd, =\val
#endif
        .endm

/*
 * VFP / NOVFP are line prefixes selecting code by floating-point calling
 * convention: exactly one of them expands to nothing, the other to the
 * comment character.  When HAVE_VFP_ARGS is set, EABI build attribute 28
 * (Tag_ABI_VFP_args in the ARM ABI addenda) is emitted as well.
 */
#if HAVE_VFP_ARGS
        .eabi_attribute 28, 1
#   define VFP
#   define NOVFP @
#else
#   define VFP   @
#   define NOVFP
#endif

/* X(s): paste EXTERN_ASM in front of s, i.e. the exported name of s. */
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#define X(s) JOIN(EXTERN_ASM, s)
/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* libavcodec/arm/dcadsp_init_arm.c: ARM-specific setup of DCADSPContext. */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavcodec/dcadsp.h"

/* NEON LFE FIR routine; implemented in assembly (dcadsp_neon.S). */
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
                         int decifactor, float scale, float bias);

/**
 * Install the ARM-optimised routines into a DCA DSP context.
 *
 * Only the lfe_fir pointer is touched, and only when the build has NEON
 * enabled.  HAVE_NEON is used as an ordinary expression rather than in
 * an #if; presumably it is a 0/1 constant from config.h so that the
 * reference to the NEON symbol is dropped when NEON is disabled.
 *
 * @param s context whose function pointers are updated
 */
void av_cold ff_dcadsp_init_arm(DCADSPContext *s)
{
    if (HAVE_NEON)
        s->lfe_fir = ff_dca_lfe_fir_neon;
}
/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* libavcodec/arm/dcadsp_neon.S: NEON implementation of the DCA LFE FIR. */

#include "asm.S"

/*
 * void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
 *                          int decifactor, float scale, float bias);
 *
 * Register use on entry: r0 = out, r1 = in, r2 = coefs, r3 = decifactor.
 * scale and bias end up in d0[0] and d0[1]: without VFP argument passing
 * they are loaded from the stack (offset 16 accounts for the four pushed
 * registers); with it they are presumably already in s0/s1, i.e. d0.
 *
 * Each outer iteration produces two outputs, out[k] and out[decifactor + k],
 * each computed as bias + scale * (dot product).  The coefficient table is
 * walked forwards from coefs (cf0) and backwards from its last four entries
 * (cf1; the table is addressed as 256 floats), while the input is read
 * backwards starting at in[-3..0].
 *
 * The :128 qualifiers require coefs to be 16-byte aligned.
 */
function ff_dca_lfe_fir_neon, export=1
        push            {r4-r6,lr}

        add             r4,  r0,  r3,  lsl #2   @ out2
        add             r5,  r2,  #256*4-16     @ cf1
        sub             r1,  r1,  #12           @ first load covers in[-3..0]
        cmp             r3,  #32
        moveq           r6,  #256/32            @ taps per output, decifactor == 32
        movne           r6,  #256/64            @ taps per output otherwise
NOVFP   vldr            d0,  [sp, #16]          @ scale, bias
        mov             lr,  #-16               @ post-decrement step for r5 and r1
1:
        vmov.f32        q2,  #0.0               @ v0
        vmov.f32        q3,  #0.0               @ v1
        mov             r12, r6                 @ inner counter, 4 taps per pass
2:
        vld1.32         {q8},     [r2,:128]!    @ cf0
        vld1.32         {q9},     [r5,:128], lr @ cf1
        vld1.32         {q1},     [r1], lr      @ in
        subs            r12, r12, #4
        vrev64.32       q10, q8                 @ with the d21/d20 swap below: cf0 reversed
        vmla.f32        q3,  q1,  q9
        vmla.f32        d4,  d2,  d21
        vmla.f32        d5,  d3,  d20
        bne             2b

        add             r1,  r1,  r6, lsl #2    @ rewind the input pointer
        subs            r3,  r3,  #1
        vadd.f32        d4,  d4,  d5            @ horizontal sums of v0 ...
        vadd.f32        d6,  d6,  d7            @ ... and v1
        vpadd.f32       d4,  d4,  d6            @ d4 = { sum(v0), sum(v1) }
        vdup.32         d5,  d0[1]              @ start from bias
        vmla.f32        d5,  d4,  d0[0]         @ bias + sum * scale
        vst1.32         {d5[0]},  [r0,:32]!
        vst1.32         {d5[1]},  [r4,:32]!
        bne             1b

        pop             {r4-r6,pc}
endfunc
@
@ ARMv4 optimized DSP utils
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of FFmpeg.
@
@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@

@ libavcodec/arm/dsputil_arm.S: pixel copy / average / add-clamp routines
@ written for plain ARM integer registers.
@
@ Common scheme of the ff_put_* functions: the source pointer r1 may be
@ unaligned.  JMP_ALIGN rounds it down to a word boundary and branches to
@ one of four code paths (local labels 1..4, one per byte offset 0..3).
@ Each path loads whole words and rebuilds the unaligned data with the
@ ALIGN_* shift/or macros.  Because whole words are loaded from the
@ rounded-down address, bytes around the requested pixels are read too.

#include "config.h"
#include "asm.S"

        preserve8

@ Without PLD support, define pld as an empty macro so that the prefetch
@ hints below assemble to nothing.
#if !HAVE_PLD
.macro pld reg
.endm
#endif

#if HAVE_ARMV5TE
@ ff_prefetch_arm: issue r2 prefetch hints, starting at address r0 and
@ advancing by r1 bytes each time.  The C prototype is not visible here.
function ff_prefetch_arm, export=1
        subs            r2,  r2,  #1
        pld             [r0]
        add             r0,  r0,  r1
        bne             ff_prefetch_arm
        bx              lr
endfunc
#endif

@ ALIGN_QWORD_D: from five consecutive source words Rn0..Rn4 build the
@ four words that start \shift bytes into Rn0; results go to Rd0..Rd3.
@ The sources are left intact.
.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
        mov             \Rd0, \Rn0, lsr #(\shift * 8)
        mov             \Rd1, \Rn1, lsr #(\shift * 8)
        mov             \Rd2, \Rn2, lsr #(\shift * 8)
        mov             \Rd3, \Rn3, lsr #(\shift * 8)
        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
.endm
@ ALIGN_DWORD: in-place variant for two words.  R0 and R1 are replaced by
@ the two words starting \shift bytes into R0; R2 is only read.
.macro  ALIGN_DWORD shift, R0, R1, R2
        mov             \R0, \R0, lsr #(\shift * 8)
        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
        mov             \R1, \R1, lsr #(\shift * 8)
        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)
.endm
@ ALIGN_DWORD_D: two-word variant with separate destination registers.
@ Each instruction reads only source registers or its own destination,
@ so a destination may share a register with a source only where that
@ source is not read afterwards.
.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
.endm

@ RND_AVG32: per-byte average of two word pairs, rounding up.
.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn = destroy
        eor             \Rd0, \Rn0, \Rm0
        eor             \Rd1, \Rn1, \Rm1
        orr             \Rn0, \Rn0, \Rm0
        orr             \Rn1, \Rn1, \Rm1
        and             \Rd0, \Rd0, \Rmask
        and             \Rd1, \Rd1, \Rmask
        sub             \Rd0, \Rn0, \Rd0, lsr #1
        sub             \Rd1, \Rn1, \Rd1, lsr #1
.endm

@ NO_RND_AVG32: per-byte average of two word pairs, rounding down.
.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
        @ Rmask = 0xFEFEFEFE
        @ Rn = destroy
        eor             \Rd0, \Rn0, \Rm0
        eor             \Rd1, \Rn1, \Rm1
        and             \Rn0, \Rn0, \Rm0
        and             \Rn1, \Rn1, \Rm1
        and             \Rd0, \Rd0, \Rmask
        and             \Rd1, \Rd1, \Rmask
        add             \Rd0, \Rn0, \Rd0, lsr #1
        add             \Rd1, \Rn1, \Rd1, lsr #1
.endm

@ JMP_ALIGN: word-align \reg and dispatch on its former low two bits:
@ offset 0 -> label 1, offset 1 -> 2, offset 2 -> 3, offset 3 -> 4.
@ \tmp is clobbered.
.macro  JMP_ALIGN tmp, reg
        ands            \tmp, \reg, #3
        bic             \reg, \reg, #3
        beq             1f
        subs            \tmp, \tmp, #1
        beq             2f
        subs            \tmp, \tmp, #1
        beq             3f
        b               4f
.endm

@ ----------------------------------------------------------------
@ Copy h rows of 16 bytes from pixels to block.
        .align 5
function ff_put_pixels16_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11, lr}
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r7}
        add             r1,  r1,  r2
        stm             r0,  {r4-r7}
        pld             [r1]
        subs            r3,  r3,  #1
        add             r0,  r0,  r2
        bne             1b
        pop             {r4-r11, pc}
        .align 5
2:
        ldm             r1,  {r4-r8}
        add             r1,  r1,  r2
        ALIGN_QWORD_D   1,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r9-r12}
        add             r0,  r0,  r2
        bne             2b
        pop             {r4-r11, pc}
        .align 5
3:
        ldm             r1,  {r4-r8}
        add             r1,  r1,  r2
        ALIGN_QWORD_D   2,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r9-r12}
        add             r0,  r0,  r2
        bne             3b
        pop             {r4-r11, pc}
        .align 5
4:
        ldm             r1,  {r4-r8}
        add             r1,  r1,  r2
        ALIGN_QWORD_D   3,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r9-r12}
        add             r0,  r0,  r2
        bne             4b
        pop             {r4-r11,pc}
endfunc

@ ----------------------------------------------------------------
@ Copy h rows of 8 bytes from pixels to block.
        .align 5
function ff_put_pixels8_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r5,lr}
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
        subs            r3,  r3,  #1
        pld             [r1]
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             1b
        pop             {r4-r5,pc}
        .align 5
2:
        ldm             r1,  {r4-r5, r12}
        add             r1,  r1,  r2
        ALIGN_DWORD     1,   r4,  r5,  r12
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             2b
        pop             {r4-r5,pc}
        .align 5
3:
        ldm             r1,  {r4-r5, r12}
        add             r1,  r1,  r2
        ALIGN_DWORD     2,   r4,  r5,  r12
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             3b
        pop             {r4-r5,pc}
        .align 5
4:
        ldm             r1,  {r4-r5, r12}
        add             r1,  r1,  r2
        ALIGN_DWORD     3,   r4,  r5,  r12
        pld             [r1]
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             4b
        pop             {r4-r5,pc}
endfunc

@ ----------------------------------------------------------------
@ Horizontal half-pel: each output byte is the rounded-up average of a
@ source byte and its right-hand neighbour, 8 bytes per row.
        .align 5
function ff_put_pixels8_x2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r10,lr}
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
        pld             [r1]
        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             1b
        pop             {r4-r10,pc}
        .align 5
2:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
        pld             [r1]
        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             2b
        pop             {r4-r10,pc}
        .align 5
3:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
        pld             [r1]
        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             3b
        pop             {r4-r10,pc}
        .align 5
4:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
        pld             [r1]
        @ at byte offset 3 the right-hand neighbours are exactly r5:r10
        RND_AVG32       r8,  r9,  r6,  r7,  r5,  r10, r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             4b
        pop             {r4-r10,pc}
endfunc

@ Same as ff_put_pixels8_x2_arm but the average rounds down.
        .align 5
function ff_put_no_rnd_pixels8_x2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r10,lr}
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
        pld             [r1]
        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             1b
        pop             {r4-r10,pc}
        .align 5
2:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
        pld             [r1]
        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             2b
        pop             {r4-r10,pc}
        .align 5
3:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
        pld             [r1]
        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bne             3b
        pop             {r4-r10,pc}
        .align 5
4:
        ldm             r1,  {r4-r5, r10}
        add             r1,  r1,  r2
        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
        pld             [r1]
        NO_RND_AVG32    r8,  r9,  r6,  r7,  r5,  r10, r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             4b
        pop             {r4-r10,pc}
endfunc


@ ----------------------------------------------------------------
@ Vertical half-pel: each output row is the rounded-up average of a source
@ row and the row below it.  The loop body emits two rows per pass and the
@ counter is h/2 (lsr #1), so an odd h loses its last row.
        .align 5
function ff_put_pixels8_y2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11,lr}
        mov             r3,  r3,  lsr #1
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
6:      ldm             r1,  {r6-r7}
        add             r1,  r1,  r2
        pld             [r1]
        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        pld             [r1]
        RND_AVG32       r8,  r9,  r6,  r7,  r4,  r5,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
2:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r4,  r5,  r6
6:      ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r7,  r8,  r9
        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r4,  r5,  r6
        subs            r3,  r3,  #1
        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
3:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r4,  r5,  r6
6:      ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r7,  r8,  r9
        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r4,  r5,  r6
        subs            r3,  r3,  #1
        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
4:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r4,  r5,  r6
6:      ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r7,  r8,  r9
        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r4,  r5,  r6
        subs            r3,  r3,  #1
        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
endfunc

@ Same as ff_put_pixels8_y2_arm but the average rounds down.
        .align 5
function ff_put_no_rnd_pixels8_y2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11,lr}
        mov             r3,  r3,  lsr #1
        ldr             r12, =0xfefefefe
        JMP_ALIGN       r5,  r1
1:
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
6:      ldm             r1,  {r6-r7}
        add             r1,  r1,  r2
        pld             [r1]
        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
        ldm             r1,  {r4-r5}
        add             r1,  r1,  r2
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        pld             [r1]
        NO_RND_AVG32    r8,  r9,  r6,  r7,  r4,  r5,  r12
        subs            r3,  r3,  #1
        stm             r0,  {r8-r9}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
2:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r4,  r5,  r6
6:      ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r7,  r8,  r9
        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     1,   r4,  r5,  r6
        subs            r3,  r3,  #1
        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
3:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r4,  r5,  r6
6:      ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r7,  r8,  r9
        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     2,   r4,  r5,  r6
        subs            r3,  r3,  #1
        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
        .align 5
4:
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r4,  r5,  r6
6:      ldm             r1,  {r7-r9}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r7,  r8,  r9
        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        ldm             r1,  {r4-r6}
        add             r1,  r1,  r2
        pld             [r1]
        ALIGN_DWORD     3,   r4,  r5,  r6
        subs            r3,  r3,  #1
        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
        stm             r0,  {r10-r11}
        add             r0,  r0,  r2
        bne             6b
        pop             {r4-r11,pc}
endfunc

        .ltorg

@ ----------------------------------------------------------------
@ RND_XY2_IT: process one source row for the 2-D half-pel filter.
@ Loads the row for byte offset \align, forms a = pixels and b = pixels
@ shifted by one byte, and leaves the per-byte partial sums of a + b in
@   r8:r9   = low two bits of each byte (plus a rounding term, see below)
@   r10:r11 = upper six bits of each byte, pre-shifted right by 2
@ The rounding term is added only when r3 is even on entry (tst r3, #1):
@ with \rnd = lsl it is 0x03030303 & 0x06060606 = 0x02020202, with
@ \rnd = lsr it is 0x03030303 & 0x01818181 = 0x01010101.
@ Also decrements the row counter r3 and sets the flags from it.
@ Clobbers r4-r12 and r14.
.macro  RND_XY2_IT align, rnd
        @ l1=  (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
.if \align == 0
        ldm             r1,  {r6-r8}
.elseif \align == 3
        ldm             r1,  {r5-r7}
.else
        ldm             r1,  {r8-r10}
.endif
        add             r1,  r1,  r2
        pld             [r1]
.if \align == 0
        ALIGN_DWORD_D   1,   r4,  r5,  r6,  r7,  r8
.elseif \align == 1
        ALIGN_DWORD_D   1,   r4,  r5,  r8,  r9,  r10
        ALIGN_DWORD_D   2,   r6,  r7,  r8,  r9,  r10
.elseif \align == 2
        ALIGN_DWORD_D   2,   r4,  r5,  r8,  r9,  r10
        ALIGN_DWORD_D   3,   r6,  r7,  r8,  r9,  r10
.elseif \align == 3
        ALIGN_DWORD_D   3,   r4,  r5,  r5,  r6,  r7
.endif
        ldr             r14, =0x03030303
        tst             r3,  #1
        and             r8,  r4,  r14
        and             r9,  r5,  r14
        and             r10, r6,  r14
        and             r11, r7,  r14
        andeq           r14, r14, r14, \rnd #1
        add             r8,  r8,  r10
        add             r9,  r9,  r11
        ldr             r12, =0xfcfcfcfc >> 2
        addeq           r8,  r8,  r14
        addeq           r9,  r9,  r14
        and             r4,  r12, r4,  lsr #2
        and             r5,  r12, r5,  lsr #2
        and             r6,  r12, r6,  lsr #2
        and             r7,  r12, r7,  lsr #2
        add             r10, r4,  r6
        add             r11, r5,  r7
        subs            r3,  r3,  #1
.endm

@ RND_XY2_EXPAND: complete loop of the 2-D half-pel filter for one byte
@ offset.  The partial sums of the previous row are kept on the stack
@ while the next row is processed, then both rows are combined, divided
@ by 4 and stored.  The loop condition (bge) uses the flags left by the
@ subs at the end of RND_XY2_IT; none of the instructions in between
@ update the flags.  Ends with the function epilogue.
.macro  RND_XY2_EXPAND align, rnd
        RND_XY2_IT      \align, \rnd
6:      push            {r8-r11}
        RND_XY2_IT      \align, \rnd
        pop             {r4-r7}
        add             r4,  r4,  r8
        add             r5,  r5,  r9
        ldr             r14, =0x0f0f0f0f
        add             r6,  r6,  r10
        add             r7,  r7,  r11
        and             r4,  r14, r4,  lsr #2
        and             r5,  r14, r5,  lsr #2
        add             r4,  r4,  r6
        add             r5,  r5,  r7
        stm             r0,  {r4-r5}
        add             r0,  r0,  r2
        bge             6b
        pop             {r4-r11,pc}
.endm

@ 2-D half-pel, rounding variant (\rnd = lsl).
        .align 5
function ff_put_pixels8_xy2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11,lr} @ R14 is also called LR
        JMP_ALIGN       r5,  r1
1:      RND_XY2_EXPAND  0, lsl
        .align 5
2:      RND_XY2_EXPAND  1, lsl
        .align 5
3:      RND_XY2_EXPAND  2, lsl
        .align 5
4:      RND_XY2_EXPAND  3, lsl
endfunc

@ 2-D half-pel, variant with the smaller rounding term (\rnd = lsr).
        .align 5
function ff_put_no_rnd_pixels8_xy2_arm, export=1
        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
        @ block = word aligned, pixels = unaligned
        pld             [r1]
        push            {r4-r11,lr}
        JMP_ALIGN       r5,  r1
1:      RND_XY2_EXPAND  0, lsr
        .align 5
2:      RND_XY2_EXPAND  1, lsr
        .align 5
3:      RND_XY2_EXPAND  2, lsr
        .align 5
4:      RND_XY2_EXPAND  3, lsr
endfunc

@ Add an 8x8 block of 16-bit coefficients to 8 rows of 8 destination
@ bytes, clamping each result to 0..255.  dest is read and written with
@ word accesses; block advances by 16 bytes per row, dest by stride.
@
@ Clamping: when bit 8 of a sum is set, the result is replaced by the top
@ byte of the inverted coefficient, which is 0x00 for a negative
@ coefficient and 0xFF for a non-negative one.
@ NOTE(review): only bit 8 is tested, so this presumably relies on every
@ sum staying within -256..511; confirm against the callers.
@
@ The [A]..[F] markers show where an instruction logically belongs; the
@ "moved from" comments mark where it was hoisted to.
        .align 5
@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
function ff_add_pixels_clamped_arm, export=1
        push            {r4-r10}
        mov             r10, #8
1:
        ldr             r4,  [r1]               /* load dest */
        /* block[0] and block[1]*/
        ldrsh           r5,  [r0]
        ldrsh           r7,  [r0, #2]
        and             r6,  r4,  #0xFF
        and             r8,  r4,  #0xFF00
        add             r6,  r5,  r6
        add             r8,  r7,  r8,  lsr #8
        mvn             r5,  r5
        mvn             r7,  r7
        tst             r6,  #0x100
        movne           r6,  r5,  lsr #24
        tst             r8,  #0x100
        movne           r8,  r7,  lsr #24
        mov             r9,  r6
        ldrsh           r5,  [r0, #4]           /* moved from [A] */
        orr             r9,  r9,  r8,  lsl #8
        /* block[2] and block[3] */
        /* [A] */
        ldrsh           r7,  [r0, #6]
        and             r6,  r4,  #0xFF0000
        and             r8,  r4,  #0xFF000000
        add             r6,  r5,  r6,  lsr #16
        add             r8,  r7,  r8,  lsr #24
        mvn             r5,  r5
        mvn             r7,  r7
        tst             r6,  #0x100
        movne           r6,  r5,  lsr #24
        tst             r8,  #0x100
        movne           r8,  r7,  lsr #24
        orr             r9,  r9,  r6,  lsl #16
        ldr             r4,  [r1, #4]           /* moved from [B] */
        orr             r9,  r9,  r8,  lsl #24
        /* store dest */
        ldrsh           r5,  [r0, #8]           /* moved from [C] */
        str             r9,  [r1]

        /* load dest */
        /* [B] */
        /* block[4] and block[5] */
        /* [C] */
        ldrsh           r7,  [r0, #10]
        and             r6,  r4,  #0xFF
        and             r8,  r4,  #0xFF00
        add             r6,  r5,  r6
        add             r8,  r7,  r8,  lsr #8
        mvn             r5,  r5
        mvn             r7,  r7
        tst             r6,  #0x100
        movne           r6,  r5,  lsr #24
        tst             r8,  #0x100
        movne           r8,  r7,  lsr #24
        mov             r9,  r6
        ldrsh           r5,  [r0, #12]          /* moved from [D] */
        orr             r9,  r9,  r8,  lsl #8
        /* block[6] and block[7] */
        /* [D] */
        ldrsh           r7,  [r0, #14]
        and             r6,  r4,  #0xFF0000
        and             r8,  r4,  #0xFF000000
        add             r6,  r5,  r6,  lsr #16
        add             r8,  r7,  r8,  lsr #24
        mvn             r5,  r5
        mvn             r7,  r7
        tst             r6,  #0x100
        movne           r6,  r5,  lsr #24
        tst             r8,  #0x100
        movne           r8,  r7,  lsr #24
        orr             r9,  r9,  r6,  lsl #16
        add             r0,  r0,  #16           /* moved from [E] */
        orr             r9,  r9,  r8,  lsl #24
        subs            r10, r10, #1            /* moved from [F] */
        /* store dest */
        str             r9,  [r1, #4]

        /* [E] */
        /* [F] */
        add             r1,  r1,  r2
        bne             1b

        pop             {r4-r10}
        bx              lr
endfunc
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * libavcodec/arm/dsputil_arm.h: declarations of the per-architecture
 * DSPContext initialisers for ARM.
 */

#ifndef AVCODEC_ARM_DSPUTIL_H
#define AVCODEC_ARM_DSPUTIL_H

#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"

/*
 * One initialiser per instruction-set extension.  Each takes the
 * DSPContext to fill in and the codec context; the definitions are not
 * part of this header, so which function pointers each one sets, and
 * under what conditions, must be checked in the corresponding source.
 */
void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);

#endif
12.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 12.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_armv6.S Mon Aug 27 12:09:56 2012 +0200 12.3 @@ -0,0 +1,623 @@ 12.4 +/* 12.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> 12.6 + * 12.7 + * This file is part of FFmpeg. 12.8 + * 12.9 + * FFmpeg is free software; you can redistribute it and/or 12.10 + * modify it under the terms of the GNU Lesser General Public 12.11 + * License as published by the Free Software Foundation; either 12.12 + * version 2.1 of the License, or (at your option) any later version. 12.13 + * 12.14 + * FFmpeg is distributed in the hope that it will be useful, 12.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 12.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12.17 + * Lesser General Public License for more details. 12.18 + * 12.19 + * You should have received a copy of the GNU Lesser General Public 12.20 + * License along with FFmpeg; if not, write to the Free Software 12.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 12.22 + */ 12.23 + 12.24 +#include "asm.S" 12.25 + 12.26 + preserve8 12.27 + 12.28 + .text 12.29 + 12.30 +.macro call_2x_pixels type, subp 12.31 +function ff_\type\()_pixels16\subp\()_armv6, export=1 12.32 + push {r0-r3, lr} 12.33 + bl ff_\type\()_pixels8\subp\()_armv6 12.34 + pop {r0-r3, lr} 12.35 + add r0, r0, #8 12.36 + add r1, r1, #8 12.37 + b ff_\type\()_pixels8\subp\()_armv6 12.38 +endfunc 12.39 +.endm 12.40 + 12.41 +call_2x_pixels avg 12.42 +call_2x_pixels put, _x2 12.43 +call_2x_pixels put, _y2 12.44 +call_2x_pixels put, _x2_no_rnd 12.45 +call_2x_pixels put, _y2_no_rnd 12.46 + 12.47 +function ff_put_pixels16_armv6, export=1 12.48 + push {r4-r11} 12.49 +1: 12.50 + ldr r5, [r1, #4] 12.51 + ldr r6, [r1, #8] 12.52 + ldr r7, [r1, #12] 12.53 + ldr r4, [r1], r2 12.54 + strd r6, r7, [r0, #8] 12.55 + ldr r9, [r1, #4] 12.56 + strd r4, r5, [r0], r2 12.57 + ldr r10, [r1, #8] 12.58 + ldr r11, 
[r1, #12] 12.59 + ldr r8, [r1], r2 12.60 + strd r10, r11, [r0, #8] 12.61 + subs r3, r3, #2 12.62 + strd r8, r9, [r0], r2 12.63 + bne 1b 12.64 + 12.65 + pop {r4-r11} 12.66 + bx lr 12.67 +endfunc 12.68 + 12.69 +function ff_put_pixels8_armv6, export=1 12.70 + push {r4-r7} 12.71 +1: 12.72 + ldr r5, [r1, #4] 12.73 + ldr r4, [r1], r2 12.74 + ldr r7, [r1, #4] 12.75 + strd r4, r5, [r0], r2 12.76 + ldr r6, [r1], r2 12.77 + subs r3, r3, #2 12.78 + strd r6, r7, [r0], r2 12.79 + bne 1b 12.80 + 12.81 + pop {r4-r7} 12.82 + bx lr 12.83 +endfunc 12.84 + 12.85 +function ff_put_pixels8_x2_armv6, export=1 12.86 + push {r4-r11, lr} 12.87 + mov r12, #1 12.88 + orr r12, r12, r12, lsl #8 12.89 + orr r12, r12, r12, lsl #16 12.90 +1: 12.91 + ldr r4, [r1] 12.92 + subs r3, r3, #2 12.93 + ldr r5, [r1, #4] 12.94 + ldr r7, [r1, #5] 12.95 + lsr r6, r4, #8 12.96 + ldr r8, [r1, r2]! 12.97 + orr r6, r6, r5, lsl #24 12.98 + ldr r9, [r1, #4] 12.99 + ldr r11, [r1, #5] 12.100 + lsr r10, r8, #8 12.101 + add r1, r1, r2 12.102 + orr r10, r10, r9, lsl #24 12.103 + eor r14, r4, r6 12.104 + uhadd8 r4, r4, r6 12.105 + eor r6, r5, r7 12.106 + uhadd8 r5, r5, r7 12.107 + and r14, r14, r12 12.108 + and r6, r6, r12 12.109 + uadd8 r4, r4, r14 12.110 + eor r14, r8, r10 12.111 + uadd8 r5, r5, r6 12.112 + eor r6, r9, r11 12.113 + uhadd8 r8, r8, r10 12.114 + and r14, r14, r12 12.115 + uhadd8 r9, r9, r11 12.116 + and r6, r6, r12 12.117 + uadd8 r8, r8, r14 12.118 + strd r4, r5, [r0], r2 12.119 + uadd8 r9, r9, r6 12.120 + strd r8, r9, [r0], r2 12.121 + bne 1b 12.122 + 12.123 + pop {r4-r11, pc} 12.124 +endfunc 12.125 + 12.126 +function ff_put_pixels8_y2_armv6, export=1 12.127 + push {r4-r11} 12.128 + mov r12, #1 12.129 + orr r12, r12, r12, lsl #8 12.130 + orr r12, r12, r12, lsl #16 12.131 + ldr r4, [r1] 12.132 + ldr r5, [r1, #4] 12.133 + ldr r6, [r1, r2]! 
12.134 + ldr r7, [r1, #4] 12.135 +1: 12.136 + subs r3, r3, #2 12.137 + uhadd8 r8, r4, r6 12.138 + eor r10, r4, r6 12.139 + uhadd8 r9, r5, r7 12.140 + eor r11, r5, r7 12.141 + and r10, r10, r12 12.142 + ldr r4, [r1, r2]! 12.143 + uadd8 r8, r8, r10 12.144 + and r11, r11, r12 12.145 + uadd8 r9, r9, r11 12.146 + ldr r5, [r1, #4] 12.147 + uhadd8 r10, r4, r6 12.148 + eor r6, r4, r6 12.149 + uhadd8 r11, r5, r7 12.150 + and r6, r6, r12 12.151 + eor r7, r5, r7 12.152 + uadd8 r10, r10, r6 12.153 + and r7, r7, r12 12.154 + ldr r6, [r1, r2]! 12.155 + uadd8 r11, r11, r7 12.156 + strd r8, r9, [r0], r2 12.157 + ldr r7, [r1, #4] 12.158 + strd r10, r11, [r0], r2 12.159 + bne 1b 12.160 + 12.161 + pop {r4-r11} 12.162 + bx lr 12.163 +endfunc 12.164 + 12.165 +function ff_put_pixels8_x2_no_rnd_armv6, export=1 12.166 + push {r4-r9, lr} 12.167 +1: 12.168 + subs r3, r3, #2 12.169 + ldr r4, [r1] 12.170 + ldr r5, [r1, #4] 12.171 + ldr r7, [r1, #5] 12.172 + ldr r8, [r1, r2]! 12.173 + ldr r9, [r1, #4] 12.174 + ldr r14, [r1, #5] 12.175 + add r1, r1, r2 12.176 + lsr r6, r4, #8 12.177 + orr r6, r6, r5, lsl #24 12.178 + lsr r12, r8, #8 12.179 + orr r12, r12, r9, lsl #24 12.180 + uhadd8 r4, r4, r6 12.181 + uhadd8 r5, r5, r7 12.182 + uhadd8 r8, r8, r12 12.183 + uhadd8 r9, r9, r14 12.184 + stm r0, {r4,r5} 12.185 + add r0, r0, r2 12.186 + stm r0, {r8,r9} 12.187 + add r0, r0, r2 12.188 + bne 1b 12.189 + 12.190 + pop {r4-r9, pc} 12.191 +endfunc 12.192 + 12.193 +function ff_put_pixels8_y2_no_rnd_armv6, export=1 12.194 + push {r4-r9, lr} 12.195 + ldr r4, [r1] 12.196 + ldr r5, [r1, #4] 12.197 + ldr r6, [r1, r2]! 12.198 + ldr r7, [r1, #4] 12.199 +1: 12.200 + subs r3, r3, #2 12.201 + uhadd8 r8, r4, r6 12.202 + ldr r4, [r1, r2]! 12.203 + uhadd8 r9, r5, r7 12.204 + ldr r5, [r1, #4] 12.205 + uhadd8 r12, r4, r6 12.206 + ldr r6, [r1, r2]! 
12.207 + uhadd8 r14, r5, r7 12.208 + ldr r7, [r1, #4] 12.209 + stm r0, {r8,r9} 12.210 + add r0, r0, r2 12.211 + stm r0, {r12,r14} 12.212 + add r0, r0, r2 12.213 + bne 1b 12.214 + 12.215 + pop {r4-r9, pc} 12.216 +endfunc 12.217 + 12.218 +function ff_avg_pixels8_armv6, export=1 12.219 + pld [r1, r2] 12.220 + push {r4-r10, lr} 12.221 + mov lr, #1 12.222 + orr lr, lr, lr, lsl #8 12.223 + orr lr, lr, lr, lsl #16 12.224 + ldrd r4, r5, [r0] 12.225 + ldr r10, [r1, #4] 12.226 + ldr r9, [r1], r2 12.227 + subs r3, r3, #2 12.228 +1: 12.229 + pld [r1, r2] 12.230 + eor r8, r4, r9 12.231 + uhadd8 r4, r4, r9 12.232 + eor r12, r5, r10 12.233 + ldrd r6, r7, [r0, r2] 12.234 + uhadd8 r5, r5, r10 12.235 + and r8, r8, lr 12.236 + ldr r10, [r1, #4] 12.237 + and r12, r12, lr 12.238 + uadd8 r4, r4, r8 12.239 + ldr r9, [r1], r2 12.240 + eor r8, r6, r9 12.241 + uadd8 r5, r5, r12 12.242 + pld [r1, r2, lsl #1] 12.243 + eor r12, r7, r10 12.244 + uhadd8 r6, r6, r9 12.245 + strd r4, r5, [r0], r2 12.246 + uhadd8 r7, r7, r10 12.247 + beq 2f 12.248 + and r8, r8, lr 12.249 + ldrd r4, r5, [r0, r2] 12.250 + uadd8 r6, r6, r8 12.251 + ldr r10, [r1, #4] 12.252 + and r12, r12, lr 12.253 + subs r3, r3, #2 12.254 + uadd8 r7, r7, r12 12.255 + ldr r9, [r1], r2 12.256 + strd r6, r7, [r0], r2 12.257 + b 1b 12.258 +2: 12.259 + and r8, r8, lr 12.260 + and r12, r12, lr 12.261 + uadd8 r6, r6, r8 12.262 + uadd8 r7, r7, r12 12.263 + strd r6, r7, [r0], r2 12.264 + 12.265 + pop {r4-r10, pc} 12.266 +endfunc 12.267 + 12.268 +function ff_add_pixels_clamped_armv6, export=1 12.269 + push {r4-r8,lr} 12.270 + mov r3, #8 12.271 +1: 12.272 + ldm r0!, {r4,r5,r12,lr} 12.273 + ldrd r6, r7, [r1] 12.274 + pkhbt r8, r4, r5, lsl #16 12.275 + pkhtb r5, r5, r4, asr #16 12.276 + pkhbt r4, r12, lr, lsl #16 12.277 + pkhtb lr, lr, r12, asr #16 12.278 + pld [r1, r2] 12.279 + uxtab16 r8, r8, r6 12.280 + uxtab16 r5, r5, r6, ror #8 12.281 + uxtab16 r4, r4, r7 12.282 + uxtab16 lr, lr, r7, ror #8 12.283 + usat16 r8, #8, r8 12.284 + usat16 r5, #8, 
r5 12.285 + usat16 r4, #8, r4 12.286 + usat16 lr, #8, lr 12.287 + orr r6, r8, r5, lsl #8 12.288 + orr r7, r4, lr, lsl #8 12.289 + subs r3, r3, #1 12.290 + strd r6, r7, [r1], r2 12.291 + bgt 1b 12.292 + pop {r4-r8,pc} 12.293 +endfunc 12.294 + 12.295 +function ff_get_pixels_armv6, export=1 12.296 + pld [r1, r2] 12.297 + push {r4-r8, lr} 12.298 + mov lr, #8 12.299 +1: 12.300 + ldrd r4, r5, [r1], r2 12.301 + subs lr, lr, #1 12.302 + uxtb16 r6, r4 12.303 + uxtb16 r4, r4, ror #8 12.304 + uxtb16 r12, r5 12.305 + uxtb16 r8, r5, ror #8 12.306 + pld [r1, r2] 12.307 + pkhbt r5, r6, r4, lsl #16 12.308 + pkhtb r6, r4, r6, asr #16 12.309 + pkhbt r7, r12, r8, lsl #16 12.310 + pkhtb r12, r8, r12, asr #16 12.311 + stm r0!, {r5,r6,r7,r12} 12.312 + bgt 1b 12.313 + 12.314 + pop {r4-r8, pc} 12.315 +endfunc 12.316 + 12.317 +function ff_diff_pixels_armv6, export=1 12.318 + pld [r1, r3] 12.319 + pld [r2, r3] 12.320 + push {r4-r9, lr} 12.321 + mov lr, #8 12.322 +1: 12.323 + ldrd r4, r5, [r1], r3 12.324 + ldrd r6, r7, [r2], r3 12.325 + uxtb16 r8, r4 12.326 + uxtb16 r4, r4, ror #8 12.327 + uxtb16 r9, r6 12.328 + uxtb16 r6, r6, ror #8 12.329 + pld [r1, r3] 12.330 + ssub16 r9, r8, r9 12.331 + ssub16 r6, r4, r6 12.332 + uxtb16 r8, r5 12.333 + uxtb16 r5, r5, ror #8 12.334 + pld [r2, r3] 12.335 + pkhbt r4, r9, r6, lsl #16 12.336 + pkhtb r6, r6, r9, asr #16 12.337 + uxtb16 r9, r7 12.338 + uxtb16 r7, r7, ror #8 12.339 + ssub16 r9, r8, r9 12.340 + ssub16 r5, r5, r7 12.341 + subs lr, lr, #1 12.342 + pkhbt r8, r9, r5, lsl #16 12.343 + pkhtb r9, r5, r9, asr #16 12.344 + stm r0!, {r4,r6,r8,r9} 12.345 + bgt 1b 12.346 + 12.347 + pop {r4-r9, pc} 12.348 +endfunc 12.349 + 12.350 +function ff_pix_abs16_armv6, export=1 12.351 + ldr r0, [sp] 12.352 + push {r4-r9, lr} 12.353 + mov r12, #0 12.354 + mov lr, #0 12.355 + ldm r1, {r4-r7} 12.356 + ldr r8, [r2] 12.357 +1: 12.358 + ldr r9, [r2, #4] 12.359 + pld [r1, r3] 12.360 + usada8 r12, r4, r8, r12 12.361 + ldr r8, [r2, #8] 12.362 + pld [r2, r3] 12.363 + usada8 lr, 
r5, r9, lr 12.364 + ldr r9, [r2, #12] 12.365 + usada8 r12, r6, r8, r12 12.366 + subs r0, r0, #1 12.367 + usada8 lr, r7, r9, lr 12.368 + beq 2f 12.369 + add r1, r1, r3 12.370 + ldm r1, {r4-r7} 12.371 + add r2, r2, r3 12.372 + ldr r8, [r2] 12.373 + b 1b 12.374 +2: 12.375 + add r0, r12, lr 12.376 + pop {r4-r9, pc} 12.377 +endfunc 12.378 + 12.379 +function ff_pix_abs16_x2_armv6, export=1 12.380 + ldr r12, [sp] 12.381 + push {r4-r11, lr} 12.382 + mov r0, #0 12.383 + mov lr, #1 12.384 + orr lr, lr, lr, lsl #8 12.385 + orr lr, lr, lr, lsl #16 12.386 +1: 12.387 + ldr r8, [r2] 12.388 + ldr r9, [r2, #4] 12.389 + lsr r10, r8, #8 12.390 + ldr r4, [r1] 12.391 + lsr r6, r9, #8 12.392 + orr r10, r10, r9, lsl #24 12.393 + ldr r5, [r2, #8] 12.394 + eor r11, r8, r10 12.395 + uhadd8 r7, r8, r10 12.396 + orr r6, r6, r5, lsl #24 12.397 + and r11, r11, lr 12.398 + uadd8 r7, r7, r11 12.399 + ldr r8, [r1, #4] 12.400 + usada8 r0, r4, r7, r0 12.401 + eor r7, r9, r6 12.402 + lsr r10, r5, #8 12.403 + and r7, r7, lr 12.404 + uhadd8 r4, r9, r6 12.405 + ldr r6, [r2, #12] 12.406 + uadd8 r4, r4, r7 12.407 + pld [r1, r3] 12.408 + orr r10, r10, r6, lsl #24 12.409 + usada8 r0, r8, r4, r0 12.410 + ldr r4, [r1, #8] 12.411 + eor r11, r5, r10 12.412 + ldrb r7, [r2, #16] 12.413 + and r11, r11, lr 12.414 + uhadd8 r8, r5, r10 12.415 + ldr r5, [r1, #12] 12.416 + uadd8 r8, r8, r11 12.417 + pld [r2, r3] 12.418 + lsr r10, r6, #8 12.419 + usada8 r0, r4, r8, r0 12.420 + orr r10, r10, r7, lsl #24 12.421 + subs r12, r12, #1 12.422 + eor r11, r6, r10 12.423 + add r1, r1, r3 12.424 + uhadd8 r9, r6, r10 12.425 + and r11, r11, lr 12.426 + uadd8 r9, r9, r11 12.427 + add r2, r2, r3 12.428 + usada8 r0, r5, r9, r0 12.429 + bgt 1b 12.430 + 12.431 + pop {r4-r11, pc} 12.432 +endfunc 12.433 + 12.434 +.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3 12.435 + ldr \n0, [r2] 12.436 + eor \n1, \p0, \n0 12.437 + uhadd8 \p0, \p0, \n0 12.438 + and \n1, \n1, lr 12.439 + ldr \n2, [r1] 12.440 + uadd8 \p0, \p0, \n1 12.441 + ldr \n1, [r2, #4] 
12.442 + usada8 r0, \p0, \n2, r0 12.443 + pld [r1, r3] 12.444 + eor \n3, \p1, \n1 12.445 + uhadd8 \p1, \p1, \n1 12.446 + and \n3, \n3, lr 12.447 + ldr \p0, [r1, #4] 12.448 + uadd8 \p1, \p1, \n3 12.449 + ldr \n2, [r2, #8] 12.450 + usada8 r0, \p1, \p0, r0 12.451 + pld [r2, r3] 12.452 + eor \p0, \p2, \n2 12.453 + uhadd8 \p2, \p2, \n2 12.454 + and \p0, \p0, lr 12.455 + ldr \p1, [r1, #8] 12.456 + uadd8 \p2, \p2, \p0 12.457 + ldr \n3, [r2, #12] 12.458 + usada8 r0, \p2, \p1, r0 12.459 + eor \p1, \p3, \n3 12.460 + uhadd8 \p3, \p3, \n3 12.461 + and \p1, \p1, lr 12.462 + ldr \p0, [r1, #12] 12.463 + uadd8 \p3, \p3, \p1 12.464 + add r1, r1, r3 12.465 + usada8 r0, \p3, \p0, r0 12.466 + add r2, r2, r3 12.467 +.endm 12.468 + 12.469 +function ff_pix_abs16_y2_armv6, export=1 12.470 + pld [r1] 12.471 + pld [r2] 12.472 + ldr r12, [sp] 12.473 + push {r4-r11, lr} 12.474 + mov r0, #0 12.475 + mov lr, #1 12.476 + orr lr, lr, lr, lsl #8 12.477 + orr lr, lr, lr, lsl #16 12.478 + ldr r4, [r2] 12.479 + ldr r5, [r2, #4] 12.480 + ldr r6, [r2, #8] 12.481 + ldr r7, [r2, #12] 12.482 + add r2, r2, r3 12.483 +1: 12.484 + usad_y2 r4, r5, r6, r7, r8, r9, r10, r11 12.485 + subs r12, r12, #2 12.486 + usad_y2 r8, r9, r10, r11, r4, r5, r6, r7 12.487 + bgt 1b 12.488 + 12.489 + pop {r4-r11, pc} 12.490 +endfunc 12.491 + 12.492 +function ff_pix_abs8_armv6, export=1 12.493 + pld [r2, r3] 12.494 + ldr r12, [sp] 12.495 + push {r4-r9, lr} 12.496 + mov r0, #0 12.497 + mov lr, #0 12.498 + ldrd r4, r5, [r1], r3 12.499 +1: 12.500 + subs r12, r12, #2 12.501 + ldr r7, [r2, #4] 12.502 + ldr r6, [r2], r3 12.503 + ldrd r8, r9, [r1], r3 12.504 + usada8 r0, r4, r6, r0 12.505 + pld [r2, r3] 12.506 + usada8 lr, r5, r7, lr 12.507 + ldr r7, [r2, #4] 12.508 + ldr r6, [r2], r3 12.509 + beq 2f 12.510 + ldrd r4, r5, [r1], r3 12.511 + usada8 r0, r8, r6, r0 12.512 + pld [r2, r3] 12.513 + usada8 lr, r9, r7, lr 12.514 + b 1b 12.515 +2: 12.516 + usada8 r0, r8, r6, r0 12.517 + usada8 lr, r9, r7, lr 12.518 + add r0, r0, lr 12.519 + pop 
{r4-r9, pc} 12.520 +endfunc 12.521 + 12.522 +function ff_sse16_armv6, export=1 12.523 + ldr r12, [sp] 12.524 + push {r4-r9, lr} 12.525 + mov r0, #0 12.526 +1: 12.527 + ldrd r4, r5, [r1] 12.528 + ldr r8, [r2] 12.529 + uxtb16 lr, r4 12.530 + uxtb16 r4, r4, ror #8 12.531 + uxtb16 r9, r8 12.532 + uxtb16 r8, r8, ror #8 12.533 + ldr r7, [r2, #4] 12.534 + usub16 lr, lr, r9 12.535 + usub16 r4, r4, r8 12.536 + smlad r0, lr, lr, r0 12.537 + uxtb16 r6, r5 12.538 + uxtb16 lr, r5, ror #8 12.539 + uxtb16 r8, r7 12.540 + uxtb16 r9, r7, ror #8 12.541 + smlad r0, r4, r4, r0 12.542 + ldrd r4, r5, [r1, #8] 12.543 + usub16 r6, r6, r8 12.544 + usub16 r8, lr, r9 12.545 + ldr r7, [r2, #8] 12.546 + smlad r0, r6, r6, r0 12.547 + uxtb16 lr, r4 12.548 + uxtb16 r4, r4, ror #8 12.549 + uxtb16 r9, r7 12.550 + uxtb16 r7, r7, ror #8 12.551 + smlad r0, r8, r8, r0 12.552 + ldr r8, [r2, #12] 12.553 + usub16 lr, lr, r9 12.554 + usub16 r4, r4, r7 12.555 + smlad r0, lr, lr, r0 12.556 + uxtb16 r6, r5 12.557 + uxtb16 r5, r5, ror #8 12.558 + uxtb16 r9, r8 12.559 + uxtb16 r8, r8, ror #8 12.560 + smlad r0, r4, r4, r0 12.561 + usub16 r6, r6, r9 12.562 + usub16 r5, r5, r8 12.563 + smlad r0, r6, r6, r0 12.564 + add r1, r1, r3 12.565 + add r2, r2, r3 12.566 + subs r12, r12, #1 12.567 + smlad r0, r5, r5, r0 12.568 + bgt 1b 12.569 + 12.570 + pop {r4-r9, pc} 12.571 +endfunc 12.572 + 12.573 +function ff_pix_norm1_armv6, export=1 12.574 + push {r4-r6, lr} 12.575 + mov r12, #16 12.576 + mov lr, #0 12.577 +1: 12.578 + ldm r0, {r2-r5} 12.579 + uxtb16 r6, r2 12.580 + uxtb16 r2, r2, ror #8 12.581 + smlad lr, r6, r6, lr 12.582 + uxtb16 r6, r3 12.583 + smlad lr, r2, r2, lr 12.584 + uxtb16 r3, r3, ror #8 12.585 + smlad lr, r6, r6, lr 12.586 + uxtb16 r6, r4 12.587 + smlad lr, r3, r3, lr 12.588 + uxtb16 r4, r4, ror #8 12.589 + smlad lr, r6, r6, lr 12.590 + uxtb16 r6, r5 12.591 + smlad lr, r4, r4, lr 12.592 + uxtb16 r5, r5, ror #8 12.593 + smlad lr, r6, r6, lr 12.594 + subs r12, r12, #1 12.595 + add r0, r0, r1 12.596 + smlad 
lr, r5, r5, lr 12.597 + bgt 1b 12.598 + 12.599 + mov r0, lr 12.600 + pop {r4-r6, pc} 12.601 +endfunc 12.602 + 12.603 +function ff_pix_sum_armv6, export=1 12.604 + push {r4-r7, lr} 12.605 + mov r12, #16 12.606 + mov r2, #0 12.607 + mov r3, #0 12.608 + mov lr, #0 12.609 + ldr r4, [r0] 12.610 +1: 12.611 + subs r12, r12, #1 12.612 + ldr r5, [r0, #4] 12.613 + usada8 r2, r4, lr, r2 12.614 + ldr r6, [r0, #8] 12.615 + usada8 r3, r5, lr, r3 12.616 + ldr r7, [r0, #12] 12.617 + usada8 r2, r6, lr, r2 12.618 + beq 2f 12.619 + ldr r4, [r0, r1]! 12.620 + usada8 r3, r7, lr, r3 12.621 + bgt 1b 12.622 +2: 12.623 + usada8 r3, r7, lr, r3 12.624 + add r0, r2, r3 12.625 + pop {r4-r7, pc} 12.626 +endfunc
13.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 13.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_arm.c Mon Aug 27 12:09:56 2012 +0200 13.3 @@ -0,0 +1,112 @@ 13.4 +/* 13.5 + * ARM optimized DSP utils 13.6 + * Copyright (c) 2001 Lionel Ulmer 13.7 + * 13.8 + * This file is part of FFmpeg. 13.9 + * 13.10 + * FFmpeg is free software; you can redistribute it and/or 13.11 + * modify it under the terms of the GNU Lesser General Public 13.12 + * License as published by the Free Software Foundation; either 13.13 + * version 2.1 of the License, or (at your option) any later version. 13.14 + * 13.15 + * FFmpeg is distributed in the hope that it will be useful, 13.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 13.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13.18 + * Lesser General Public License for more details. 13.19 + * 13.20 + * You should have received a copy of the GNU Lesser General Public 13.21 + * License along with FFmpeg; if not, write to the Free Software 13.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 13.23 + */ 13.24 + 13.25 +#include "libavcodec/dsputil.h" 13.26 +#include "dsputil_arm.h" 13.27 + 13.28 +void ff_j_rev_dct_arm(DCTELEM *data); 13.29 +void ff_simple_idct_arm(DCTELEM *data); 13.30 + 13.31 +/* XXX: local hack */ 13.32 +static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); 13.33 +static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size); 13.34 + 13.35 +void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); 13.36 +void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); 13.37 +void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); 13.38 +void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); 13.39 + 13.40 +void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, 
const uint8_t *pixels, int line_size, int h); 13.41 +void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); 13.42 +void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); 13.43 + 13.44 +void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h); 13.45 + 13.46 +CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8) 13.47 +CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8) 13.48 +CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8) 13.49 +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8) 13.50 +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8) 13.51 +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8) 13.52 + 13.53 +void ff_add_pixels_clamped_arm(const DCTELEM *block, uint8_t *dest, 13.54 + int line_size); 13.55 + 13.56 +/* XXX: those functions should be suppressed ASAP when all IDCTs are 13.57 + converted */ 13.58 +static void j_rev_dct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) 13.59 +{ 13.60 + ff_j_rev_dct_arm (block); 13.61 + ff_put_pixels_clamped(block, dest, line_size); 13.62 +} 13.63 +static void j_rev_dct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) 13.64 +{ 13.65 + ff_j_rev_dct_arm (block); 13.66 + ff_add_pixels_clamped(block, dest, line_size); 13.67 +} 13.68 +static void simple_idct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) 13.69 +{ 13.70 + ff_simple_idct_arm (block); 13.71 + ff_put_pixels_clamped(block, dest, line_size); 13.72 +} 13.73 +static void simple_idct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) 13.74 +{ 13.75 + ff_simple_idct_arm (block); 13.76 + ff_add_pixels_clamped(block, dest, line_size); 13.77 +} 13.78 + 13.79 +int mm_support(void) 13.80 +{ 13.81 + return HAVE_IWMMXT * FF_MM_IWMMXT; 13.82 +} 13.83 + 13.84 +void dsputil_init_arm(DSPContext* c) 13.85 +{ 13.86 + 
ff_put_pixels_clamped = c->put_pixels_clamped; 13.87 + ff_add_pixels_clamped = c->add_pixels_clamped; 13.88 + 13.89 + c->idct_put = simple_idct_arm_put; 13.90 + c->idct_add = simple_idct_arm_add; 13.91 + c->idct = ff_simple_idct_arm; 13.92 + c->idct_permutation_type = FF_NO_IDCT_PERM; 13.93 + 13.94 + c->add_pixels_clamped = ff_add_pixels_clamped_arm; 13.95 + 13.96 + c->put_pixels_tab[0][0] = ff_put_pixels16_arm; 13.97 + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm; 13.98 + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm; 13.99 + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm; 13.100 + c->put_pixels_tab[1][0] = ff_put_pixels8_arm; 13.101 + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm; 13.102 + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm; 13.103 + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm; 13.104 + 13.105 + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm; 13.106 + c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm; 13.107 + c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm; 13.108 + c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm; 13.109 + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm; 13.110 + c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm; 13.111 + c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm; 13.112 + c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm; 13.113 + 13.114 + if (HAVE_NEON) ff_dsputil_init_neon(c); 13.115 +}
14.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 14.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv5te.c Mon Aug 27 12:09:56 2012 +0200 14.3 @@ -0,0 +1,41 @@ 14.4 +/* 14.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> 14.6 + * 14.7 + * This file is part of FFmpeg. 14.8 + * 14.9 + * FFmpeg is free software; you can redistribute it and/or 14.10 + * modify it under the terms of the GNU Lesser General Public 14.11 + * License as published by the Free Software Foundation; either 14.12 + * version 2.1 of the License, or (at your option) any later version. 14.13 + * 14.14 + * FFmpeg is distributed in the hope that it will be useful, 14.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 14.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14.17 + * Lesser General Public License for more details. 14.18 + * 14.19 + * You should have received a copy of the GNU Lesser General Public 14.20 + * License along with FFmpeg; if not, write to the Free Software 14.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 14.22 + */ 14.23 + 14.24 +#include "libavcodec/dsputil.h" 14.25 +#include "dsputil_arm.h" 14.26 + 14.27 +void ff_simple_idct_armv5te(DCTELEM *data); 14.28 +void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data); 14.29 +void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data); 14.30 + 14.31 +void ff_prefetch_arm(void *mem, int stride, int h); 14.32 + 14.33 +void av_cold ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx) 14.34 +{ 14.35 + if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO || 14.36 + avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) { 14.37 + c->idct_put = ff_simple_idct_put_armv5te; 14.38 + c->idct_add = ff_simple_idct_add_armv5te; 14.39 + c->idct = ff_simple_idct_armv5te; 14.40 + c->idct_permutation_type = FF_NO_IDCT_PERM; 14.41 + } 14.42 + 14.43 + c->prefetch = ff_prefetch_arm; 14.44 +}
15.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 15.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv6.c Mon Aug 27 12:09:56 2012 +0200 15.3 @@ -0,0 +1,121 @@ 15.4 +/* 15.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> 15.6 + * 15.7 + * This file is part of FFmpeg. 15.8 + * 15.9 + * FFmpeg is free software; you can redistribute it and/or 15.10 + * modify it under the terms of the GNU Lesser General Public 15.11 + * License as published by the Free Software Foundation; either 15.12 + * version 2.1 of the License, or (at your option) any later version. 15.13 + * 15.14 + * FFmpeg is distributed in the hope that it will be useful, 15.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 15.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15.17 + * Lesser General Public License for more details. 15.18 + * 15.19 + * You should have received a copy of the GNU Lesser General Public 15.20 + * License along with FFmpeg; if not, write to the Free Software 15.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 15.22 + */ 15.23 + 15.24 +#include <stdint.h> 15.25 + 15.26 +#include "libavcodec/avcodec.h" 15.27 +#include "libavcodec/dsputil.h" 15.28 +#include "dsputil_arm.h" 15.29 + 15.30 +void ff_simple_idct_armv6(DCTELEM *data); 15.31 +void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); 15.32 +void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); 15.33 + 15.34 +void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, int, int); 15.35 +void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, int, int); 15.36 +void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, int, int); 15.37 + 15.38 +void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); 15.39 +void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); 15.40 + 15.41 +void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, int, int); 15.42 + 15.43 +void 
ff_put_pixels8_armv6(uint8_t *, const uint8_t *, int, int); 15.44 +void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, int, int); 15.45 +void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, int, int); 15.46 + 15.47 +void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); 15.48 +void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); 15.49 + 15.50 +void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, int, int); 15.51 + 15.52 +void ff_add_pixels_clamped_armv6(const DCTELEM *block, 15.53 + uint8_t *restrict pixels, 15.54 + int line_size); 15.55 + 15.56 +void ff_get_pixels_armv6(DCTELEM *block, const uint8_t *pixels, int stride); 15.57 +void ff_diff_pixels_armv6(DCTELEM *block, const uint8_t *s1, 15.58 + const uint8_t *s2, int stride); 15.59 + 15.60 +int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2, 15.61 + int line_size, int h); 15.62 +int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2, 15.63 + int line_size, int h); 15.64 +int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2, 15.65 + int line_size, int h); 15.66 + 15.67 +int ff_pix_abs8_armv6(void *s, uint8_t *blk1, uint8_t *blk2, 15.68 + int line_size, int h); 15.69 + 15.70 +int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2, 15.71 + int line_size, int h); 15.72 + 15.73 +int ff_pix_norm1_armv6(uint8_t *pix, int line_size); 15.74 +int ff_pix_sum_armv6(uint8_t *pix, int line_size); 15.75 + 15.76 +void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx) 15.77 +{ 15.78 + if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO || 15.79 + avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) { 15.80 + c->idct_put = ff_simple_idct_put_armv6; 15.81 + c->idct_add = ff_simple_idct_add_armv6; 15.82 + c->idct = ff_simple_idct_armv6; 15.83 + c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; 15.84 + } 15.85 + 15.86 + c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; 15.87 + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6; 15.88 + 
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6; 15.89 +/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */ 15.90 + c->put_pixels_tab[1][0] = ff_put_pixels8_armv6; 15.91 + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6; 15.92 + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6; 15.93 +/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */ 15.94 + 15.95 + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6; 15.96 + c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6; 15.97 + c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6; 15.98 +/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */ 15.99 + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6; 15.100 + c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6; 15.101 + c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6; 15.102 +/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */ 15.103 + 15.104 + c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6; 15.105 + c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6; 15.106 + 15.107 + c->add_pixels_clamped = ff_add_pixels_clamped_armv6; 15.108 + c->get_pixels = ff_get_pixels_armv6; 15.109 + c->diff_pixels = ff_diff_pixels_armv6; 15.110 + 15.111 + c->pix_abs[0][0] = ff_pix_abs16_armv6; 15.112 + c->pix_abs[0][1] = ff_pix_abs16_x2_armv6; 15.113 + c->pix_abs[0][2] = ff_pix_abs16_y2_armv6; 15.114 + 15.115 + c->pix_abs[1][0] = ff_pix_abs8_armv6; 15.116 + 15.117 + c->sad[0] = ff_pix_abs16_armv6; 15.118 + c->sad[1] = ff_pix_abs8_armv6; 15.119 + 15.120 + c->sse[0] = ff_sse16_armv6; 15.121 + 15.122 + c->pix_norm1 = ff_pix_norm1_armv6; 15.123 + c->pix_sum = ff_pix_sum_armv6; 15.124 +}
16.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 16.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_neon.c Mon Aug 27 12:09:56 2012 +0200 16.3 @@ -0,0 +1,308 @@ 16.4 +/* 16.5 + * ARM NEON optimised DSP functions 16.6 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> 16.7 + * 16.8 + * This file is part of FFmpeg. 16.9 + * 16.10 + * FFmpeg is free software; you can redistribute it and/or 16.11 + * modify it under the terms of the GNU Lesser General Public 16.12 + * License as published by the Free Software Foundation; either 16.13 + * version 2.1 of the License, or (at your option) any later version. 16.14 + * 16.15 + * FFmpeg is distributed in the hope that it will be useful, 16.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 16.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16.18 + * Lesser General Public License for more details. 16.19 + * 16.20 + * You should have received a copy of the GNU Lesser General Public 16.21 + * License along with FFmpeg; if not, write to the Free Software 16.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 16.23 + */ 16.24 + 16.25 +#include <stdint.h> 16.26 + 16.27 +#include "libavcodec/avcodec.h" 16.28 +#include "libavcodec/dsputil.h" 16.29 +#include "dsputil_arm.h" 16.30 + 16.31 +void ff_simple_idct_neon(DCTELEM *data); 16.32 +void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); 16.33 +void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); 16.34 + 16.35 +void ff_vp3_idct_neon(DCTELEM *data); 16.36 +void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data); 16.37 +void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data); 16.38 +void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data); 16.39 + 16.40 +void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); 16.41 +void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int); 16.42 +void 
ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int); 16.43 +void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int); 16.44 +void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int); 16.45 +void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int); 16.46 +void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int); 16.47 +void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int); 16.48 +void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); 16.49 +void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); 16.50 +void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); 16.51 +void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); 16.52 +void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); 16.53 +void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); 16.54 + 16.55 +void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); 16.56 +void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, int, int); 16.57 + 16.58 +void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); 16.59 +void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); 16.60 +void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int); 16.61 + 16.62 +void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); 16.63 +void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); 16.64 +void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); 16.65 +void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); 16.66 +void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); 16.67 +void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); 16.68 +void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int); 16.69 +void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); 16.70 +void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); 16.71 +void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t 
*, int); 16.72 +void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); 16.73 +void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); 16.74 +void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); 16.75 +void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); 16.76 +void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); 16.77 +void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); 16.78 + 16.79 +void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); 16.80 +void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); 16.81 +void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); 16.82 +void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); 16.83 +void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); 16.84 +void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); 16.85 +void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); 16.86 +void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); 16.87 +void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int); 16.88 +void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); 16.89 +void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); 16.90 +void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); 16.91 +void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); 16.92 +void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); 16.93 +void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); 16.94 +void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); 16.95 + 16.96 +void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int); 16.97 +void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int); 16.98 +void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int); 16.99 +void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int); 16.100 +void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int); 16.101 +void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int); 16.102 +void ff_avg_h264_qpel16_mc21_neon(uint8_t *, 
uint8_t *, int); 16.103 +void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int); 16.104 +void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int); 16.105 +void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int); 16.106 +void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int); 16.107 +void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int); 16.108 +void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int); 16.109 +void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int); 16.110 +void ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int); 16.111 +void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int); 16.112 + 16.113 +void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int); 16.114 +void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int); 16.115 +void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int); 16.116 +void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int); 16.117 +void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int); 16.118 +void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int); 16.119 +void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int); 16.120 +void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int); 16.121 +void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int); 16.122 +void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int); 16.123 +void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int); 16.124 +void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int); 16.125 +void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int); 16.126 +void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int); 16.127 +void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int); 16.128 +void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int); 16.129 + 16.130 +void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); 16.131 +void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); 16.132 +void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, 
int, int, int, int); 16.133 + 16.134 +void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); 16.135 +void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); 16.136 +void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); 16.137 + 16.138 +void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); 16.139 +void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); 16.140 + 16.141 +void ff_vector_fmul_neon(float *dst, const float *src, int len); 16.142 +void ff_vector_fmul_window_neon(float *dst, const float *src0, 16.143 + const float *src1, const float *win, 16.144 + float add_bias, int len); 16.145 +void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, 16.146 + int len); 16.147 +void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src, 16.148 + const float **vp, float mul, int len); 16.149 +void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src, 16.150 + const float **vp, float mul, int len); 16.151 +void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul, 16.152 + int len); 16.153 +void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, 16.154 + int len); 16.155 +void ff_butterflies_float_neon(float *v1, float *v2, int len); 16.156 +float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); 16.157 +void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, 16.158 + float mul, int len); 16.159 +void ff_vector_fmul_reverse_neon(float *dst, const float *src0, 16.160 + const float *src1, int len); 16.161 +void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, 16.162 + const float *src2, int len); 16.163 + 16.164 +void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, 16.165 + int len); 16.166 +void ff_float_to_int16_neon(int16_t *, const float *, long); 16.167 +void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); 16.168 + 16.169 +void 
ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); 16.170 + 16.171 +int32_t ff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len, 16.172 + int shift); 16.173 +int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, int16_t *v2, 16.174 + int16_t *v3, int len, int mul); 16.175 + 16.176 +void ff_dsputil_init_neon(DSPContext *c) 16.177 +{ 16.178 + 16.179 + { 16.180 + c->idct_put = ff_simple_idct_put_neon; 16.181 + c->idct_add = ff_simple_idct_add_neon; 16.182 + c->idct = ff_simple_idct_neon; 16.183 + c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; 16.184 + 16.185 + } 16.186 + 16.187 + c->put_pixels_tab[0][0] = ff_put_pixels16_neon; 16.188 + c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; 16.189 + c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; 16.190 + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; 16.191 + c->put_pixels_tab[1][0] = ff_put_pixels8_neon; 16.192 + c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; 16.193 + c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; 16.194 + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; 16.195 + 16.196 + c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; 16.197 + c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; 16.198 + c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; 16.199 + c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; 16.200 + c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; 16.201 + c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; 16.202 + c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; 16.203 + c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; 16.204 + 16.205 + c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; 16.206 + c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon; 16.207 + 16.208 + c->add_pixels_clamped = ff_add_pixels_clamped_neon; 16.209 + c->put_pixels_clamped = ff_put_pixels_clamped_neon; 16.210 + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; 16.211 
+ 16.212 + 16.213 + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; 16.214 + c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; 16.215 + c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon; 16.216 + 16.217 + c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; 16.218 + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; 16.219 + c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon; 16.220 + 16.221 + c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; 16.222 + c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; 16.223 + c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; 16.224 + c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; 16.225 + c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; 16.226 + c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; 16.227 + c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; 16.228 + c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; 16.229 + c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; 16.230 + c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; 16.231 + c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; 16.232 + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; 16.233 + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; 16.234 + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; 16.235 + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; 16.236 + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; 16.237 + 16.238 + c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; 16.239 + c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; 16.240 + c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; 16.241 + c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; 16.242 
+ c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; 16.243 + c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; 16.244 + c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; 16.245 + c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; 16.246 + c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; 16.247 + c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; 16.248 + c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; 16.249 + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; 16.250 + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; 16.251 + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; 16.252 + c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon; 16.253 + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; 16.254 + 16.255 + c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; 16.256 + c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon; 16.257 + c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon; 16.258 + c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon; 16.259 + c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon; 16.260 + c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon; 16.261 + c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon; 16.262 + c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon; 16.263 + c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon; 16.264 + c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon; 16.265 + c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon; 16.266 + c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon; 16.267 + c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon; 16.268 + c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon; 16.269 + 
c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon; 16.270 + c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon; 16.271 + 16.272 + c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; 16.273 + c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon; 16.274 + c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon; 16.275 + c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon; 16.276 + c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon; 16.277 + c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon; 16.278 + c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon; 16.279 + c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon; 16.280 + c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon; 16.281 + c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon; 16.282 + c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon; 16.283 + c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon; 16.284 + c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon; 16.285 + c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon; 16.286 + c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon; 16.287 + c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon; 16.288 + 16.289 + c->vector_fmul = ff_vector_fmul_neon; 16.290 + c->vector_fmul_window = ff_vector_fmul_window_neon; 16.291 + c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; 16.292 + c->butterflies_float = ff_butterflies_float_neon; 16.293 + c->scalarproduct_float = ff_scalarproduct_float_neon; 16.294 + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; 16.295 + c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; 16.296 + c->vector_fmul_add = ff_vector_fmul_add_neon; 16.297 + c->vector_clipf = ff_vector_clipf_neon; 16.298 + 16.299 + c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon; 16.300 + 
c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon; 16.301 + 16.302 + c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; 16.303 + c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; 16.304 + 16.305 + 16.306 + c->float_to_int16 = ff_float_to_int16_neon; 16.307 + c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; 16.308 + 16.309 + c->scalarproduct_int16 = ff_scalarproduct_int16_neon; 16.310 + c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon; 16.311 +}
17.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 17.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_vfp.c Mon Aug 27 12:09:56 2012 +0200 17.3 @@ -0,0 +1,36 @@ 17.4 +/* 17.5 + * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> 17.6 + * 17.7 + * This file is part of FFmpeg. 17.8 + * 17.9 + * FFmpeg is free software; you can redistribute it and/or 17.10 + * modify it under the terms of the GNU Lesser General Public 17.11 + * License as published by the Free Software Foundation; either 17.12 + * version 2.1 of the License, or (at your option) any later version. 17.13 + * 17.14 + * FFmpeg is distributed in the hope that it will be useful, 17.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 17.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17.17 + * Lesser General Public License for more details. 17.18 + * 17.19 + * You should have received a copy of the GNU Lesser General Public 17.20 + * License along with FFmpeg; if not, write to the Free Software 17.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17.22 + */ 17.23 + 17.24 +#include "libavcodec/dsputil.h" 17.25 +#include "dsputil_arm.h" 17.26 + 17.27 +void ff_vector_fmul_vfp(float *dst, const float *src, int len); 17.28 +void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, 17.29 + const float *src1, int len); 17.30 +void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); 17.31 + 17.32 +void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) 17.33 +{ 17.34 + c->vector_fmul = ff_vector_fmul_vfp; 17.35 + c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; 17.36 +#if HAVE_ARMV6 17.37 + c->float_to_int16 = ff_float_to_int16_vfp; 17.38 +#endif 17.39 +}
18.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 18.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt.c Mon Aug 27 12:09:56 2012 +0200 18.3 @@ -0,0 +1,205 @@ 18.4 +/* 18.5 + * iWMMXt optimized DSP utils 18.6 + * Copyright (c) 2004 AGAWA Koji 18.7 + * 18.8 + * This file is part of FFmpeg. 18.9 + * 18.10 + * FFmpeg is free software; you can redistribute it and/or 18.11 + * modify it under the terms of the GNU Lesser General Public 18.12 + * License as published by the Free Software Foundation; either 18.13 + * version 2.1 of the License, or (at your option) any later version. 18.14 + * 18.15 + * FFmpeg is distributed in the hope that it will be useful, 18.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 18.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18.18 + * Lesser General Public License for more details. 18.19 + * 18.20 + * You should have received a copy of the GNU Lesser General Public 18.21 + * License along with FFmpeg; if not, write to the Free Software 18.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 18.23 + */ 18.24 + 18.25 +#include "libavcodec/dsputil.h" 18.26 + 18.27 +#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt 18.28 +#define SET_RND(regd) __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); 18.29 +#define WAVG2B "wavg2b" 18.30 +#include "dsputil_iwmmxt_rnd_template.c" 18.31 +#undef DEF 18.32 +#undef SET_RND 18.33 +#undef WAVG2B 18.34 + 18.35 +#define DEF(x, y) x ## _ ## y ##_iwmmxt 18.36 +#define SET_RND(regd) __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); 18.37 +#define WAVG2B "wavg2br" 18.38 +#include "dsputil_iwmmxt_rnd_template.c" 18.39 +#undef DEF 18.40 +#undef SET_RND 18.41 +#undef WAVG2BR 18.42 + 18.43 +// need scheduling 18.44 +#define OP(AVG) \ 18.45 + __asm__ volatile ( \ 18.46 + /* alignment */ \ 18.47 + "and r12, %[pixels], #7 \n\t" \ 18.48 + "bic %[pixels], %[pixels], #7 \n\t" \ 18.49 + "tmcr wcgr1, r12 
\n\t" \ 18.50 + \ 18.51 + "wldrd wr0, [%[pixels]] \n\t" \ 18.52 + "wldrd wr1, [%[pixels], #8] \n\t" \ 18.53 + "add %[pixels], %[pixels], %[line_size] \n\t" \ 18.54 + "walignr1 wr4, wr0, wr1 \n\t" \ 18.55 + \ 18.56 + "1: \n\t" \ 18.57 + \ 18.58 + "wldrd wr2, [%[pixels]] \n\t" \ 18.59 + "wldrd wr3, [%[pixels], #8] \n\t" \ 18.60 + "add %[pixels], %[pixels], %[line_size] \n\t" \ 18.61 + "pld [%[pixels]] \n\t" \ 18.62 + "walignr1 wr5, wr2, wr3 \n\t" \ 18.63 + AVG " wr6, wr4, wr5 \n\t" \ 18.64 + "wstrd wr6, [%[block]] \n\t" \ 18.65 + "add %[block], %[block], %[line_size] \n\t" \ 18.66 + \ 18.67 + "wldrd wr0, [%[pixels]] \n\t" \ 18.68 + "wldrd wr1, [%[pixels], #8] \n\t" \ 18.69 + "add %[pixels], %[pixels], %[line_size] \n\t" \ 18.70 + "walignr1 wr4, wr0, wr1 \n\t" \ 18.71 + "pld [%[pixels]] \n\t" \ 18.72 + AVG " wr6, wr4, wr5 \n\t" \ 18.73 + "wstrd wr6, [%[block]] \n\t" \ 18.74 + "add %[block], %[block], %[line_size] \n\t" \ 18.75 + \ 18.76 + "subs %[h], %[h], #2 \n\t" \ 18.77 + "bne 1b \n\t" \ 18.78 + : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \ 18.79 + : [line_size]"r"(line_size) \ 18.80 + : "memory", "r12"); 18.81 +void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 18.82 +{ 18.83 + OP("wavg2br"); 18.84 +} 18.85 +void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 18.86 +{ 18.87 + OP("wavg2b"); 18.88 +} 18.89 +#undef OP 18.90 + 18.91 +void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size) 18.92 +{ 18.93 + uint8_t *pixels2 = pixels + line_size; 18.94 + 18.95 + __asm__ volatile ( 18.96 + "mov r12, #4 \n\t" 18.97 + "1: \n\t" 18.98 + "pld [%[pixels], %[line_size2]] \n\t" 18.99 + "pld [%[pixels2], %[line_size2]] \n\t" 18.100 + "wldrd wr4, [%[pixels]] \n\t" 18.101 + "wldrd wr5, [%[pixels2]] \n\t" 18.102 + "pld [%[block], #32] \n\t" 18.103 + "wunpckelub wr6, wr4 \n\t" 18.104 + "wldrd wr0, [%[block]] \n\t" 18.105 + "wunpckehub wr7, wr4 \n\t" 
18.106 + "wldrd wr1, [%[block], #8] \n\t" 18.107 + "wunpckelub wr8, wr5 \n\t" 18.108 + "wldrd wr2, [%[block], #16] \n\t" 18.109 + "wunpckehub wr9, wr5 \n\t" 18.110 + "wldrd wr3, [%[block], #24] \n\t" 18.111 + "add %[block], %[block], #32 \n\t" 18.112 + "waddhss wr10, wr0, wr6 \n\t" 18.113 + "waddhss wr11, wr1, wr7 \n\t" 18.114 + "waddhss wr12, wr2, wr8 \n\t" 18.115 + "waddhss wr13, wr3, wr9 \n\t" 18.116 + "wpackhus wr14, wr10, wr11 \n\t" 18.117 + "wpackhus wr15, wr12, wr13 \n\t" 18.118 + "wstrd wr14, [%[pixels]] \n\t" 18.119 + "add %[pixels], %[pixels], %[line_size2] \n\t" 18.120 + "subs r12, r12, #1 \n\t" 18.121 + "wstrd wr15, [%[pixels2]] \n\t" 18.122 + "add %[pixels2], %[pixels2], %[line_size2] \n\t" 18.123 + "bne 1b \n\t" 18.124 + : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2) 18.125 + : [line_size2]"r"(line_size << 1) 18.126 + : "cc", "memory", "r12"); 18.127 +} 18.128 + 18.129 +static void clear_blocks_iwmmxt(DCTELEM *blocks) 18.130 +{ 18.131 + __asm__ volatile( 18.132 + "wzero wr0 \n\t" 18.133 + "mov r1, #(128 * 6 / 32) \n\t" 18.134 + "1: \n\t" 18.135 + "wstrd wr0, [%0] \n\t" 18.136 + "wstrd wr0, [%0, #8] \n\t" 18.137 + "wstrd wr0, [%0, #16] \n\t" 18.138 + "wstrd wr0, [%0, #24] \n\t" 18.139 + "subs r1, r1, #1 \n\t" 18.140 + "add %0, %0, #32 \n\t" 18.141 + "bne 1b \n\t" 18.142 + : "+r"(blocks) 18.143 + : 18.144 + : "r1" 18.145 + ); 18.146 +} 18.147 + 18.148 +static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h) 18.149 +{ 18.150 + return; 18.151 +} 18.152 + 18.153 +/* A run time test is not simple. 
If this file is compiled in 18.154 + * then we should install the functions 18.155 + */ 18.156 +int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */ 18.157 + 18.158 +void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) 18.159 +{ 18.160 + if (avctx->dsp_mask) { 18.161 + if (avctx->dsp_mask & FF_MM_FORCE) 18.162 + mm_flags |= (avctx->dsp_mask & 0xffff); 18.163 + else 18.164 + mm_flags &= ~(avctx->dsp_mask & 0xffff); 18.165 + } 18.166 + 18.167 + if (!(mm_flags & FF_MM_IWMMXT)) return; 18.168 + 18.169 + c->add_pixels_clamped = add_pixels_clamped_iwmmxt; 18.170 + 18.171 + c->clear_blocks = clear_blocks_iwmmxt; 18.172 + 18.173 + c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; 18.174 + c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt; 18.175 + c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt; 18.176 + c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt; 18.177 + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; 18.178 + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt; 18.179 + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt; 18.180 + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt; 18.181 + 18.182 + c->put_pixels_tab[1][0] = put_pixels8_iwmmxt; 18.183 + c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt; 18.184 + c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt; 18.185 + c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt; 18.186 + c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt; 18.187 + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt; 18.188 + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt; 18.189 + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt; 18.190 + 18.191 + c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt; 18.192 + c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt; 18.193 + c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt; 18.194 + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt; 18.195 + c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt; 18.196 + 
c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt; 18.197 + c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt; 18.198 + c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt; 18.199 + 18.200 + c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt; 18.201 + c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt; 18.202 + c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt; 18.203 + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt; 18.204 + c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt; 18.205 + c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt; 18.206 + c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt; 18.207 + c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt; 18.208 +}
19.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 19.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt_rnd_template.c Mon Aug 27 12:09:56 2012 +0200 19.3 @@ -0,0 +1,1114 @@ 19.4 +/* 19.5 + * iWMMXt optimized DSP utils 19.6 + * copyright (c) 2004 AGAWA Koji 19.7 + * 19.8 + * This file is part of FFmpeg. 19.9 + * 19.10 + * FFmpeg is free software; you can redistribute it and/or 19.11 + * modify it under the terms of the GNU Lesser General Public 19.12 + * License as published by the Free Software Foundation; either 19.13 + * version 2.1 of the License, or (at your option) any later version. 19.14 + * 19.15 + * FFmpeg is distributed in the hope that it will be useful, 19.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 19.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19.18 + * Lesser General Public License for more details. 19.19 + * 19.20 + * You should have received a copy of the GNU Lesser General Public 19.21 + * License along with FFmpeg; if not, write to the Free Software 19.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19.23 + */ 19.24 + 19.25 +void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.26 +{ 19.27 + int stride = line_size; 19.28 + __asm__ volatile ( 19.29 + "and r12, %[pixels], #7 \n\t" 19.30 + "bic %[pixels], %[pixels], #7 \n\t" 19.31 + "tmcr wcgr1, r12 \n\t" 19.32 + "add r4, %[pixels], %[line_size] \n\t" 19.33 + "add r5, %[block], %[line_size] \n\t" 19.34 + "mov %[line_size], %[line_size], lsl #1 \n\t" 19.35 + "1: \n\t" 19.36 + "wldrd wr0, [%[pixels]] \n\t" 19.37 + "subs %[h], %[h], #2 \n\t" 19.38 + "wldrd wr1, [%[pixels], #8] \n\t" 19.39 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.40 + "wldrd wr3, [r4] \n\t" 19.41 + "pld [%[pixels]] \n\t" 19.42 + "pld [%[pixels], #32] \n\t" 19.43 + "wldrd wr4, [r4, #8] \n\t" 19.44 + "add r4, r4, %[line_size] \n\t" 19.45 + "walignr1 wr8, wr0, wr1 \n\t" 19.46 + "pld [r4] \n\t" 
19.47 + "pld [r4, #32] \n\t" 19.48 + "walignr1 wr10, wr3, wr4 \n\t" 19.49 + "wstrd wr8, [%[block]] \n\t" 19.50 + "add %[block], %[block], %[line_size] \n\t" 19.51 + "wstrd wr10, [r5] \n\t" 19.52 + "add r5, r5, %[line_size] \n\t" 19.53 + "bne 1b \n\t" 19.54 + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 19.55 + : 19.56 + : "memory", "r4", "r5", "r12"); 19.57 +} 19.58 + 19.59 +void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.60 +{ 19.61 + int stride = line_size; 19.62 + __asm__ volatile ( 19.63 + "and r12, %[pixels], #7 \n\t" 19.64 + "bic %[pixels], %[pixels], #7 \n\t" 19.65 + "tmcr wcgr1, r12 \n\t" 19.66 + "add r4, %[pixels], %[line_size] \n\t" 19.67 + "add r5, %[block], %[line_size] \n\t" 19.68 + "mov %[line_size], %[line_size], lsl #1 \n\t" 19.69 + "1: \n\t" 19.70 + "wldrd wr0, [%[pixels]] \n\t" 19.71 + "subs %[h], %[h], #2 \n\t" 19.72 + "wldrd wr1, [%[pixels], #8] \n\t" 19.73 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.74 + "wldrd wr3, [r4] \n\t" 19.75 + "pld [%[pixels]] \n\t" 19.76 + "pld [%[pixels], #32] \n\t" 19.77 + "wldrd wr4, [r4, #8] \n\t" 19.78 + "add r4, r4, %[line_size] \n\t" 19.79 + "walignr1 wr8, wr0, wr1 \n\t" 19.80 + "wldrd wr0, [%[block]] \n\t" 19.81 + "wldrd wr2, [r5] \n\t" 19.82 + "pld [r4] \n\t" 19.83 + "pld [r4, #32] \n\t" 19.84 + "walignr1 wr10, wr3, wr4 \n\t" 19.85 + WAVG2B" wr8, wr8, wr0 \n\t" 19.86 + WAVG2B" wr10, wr10, wr2 \n\t" 19.87 + "wstrd wr8, [%[block]] \n\t" 19.88 + "add %[block], %[block], %[line_size] \n\t" 19.89 + "wstrd wr10, [r5] \n\t" 19.90 + "pld [%[block]] \n\t" 19.91 + "pld [%[block], #32] \n\t" 19.92 + "add r5, r5, %[line_size] \n\t" 19.93 + "pld [r5] \n\t" 19.94 + "pld [r5, #32] \n\t" 19.95 + "bne 1b \n\t" 19.96 + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 19.97 + : 19.98 + : "memory", "r4", "r5", "r12"); 19.99 +} 19.100 + 19.101 +void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const 
int line_size, int h) 19.102 +{ 19.103 + int stride = line_size; 19.104 + __asm__ volatile ( 19.105 + "and r12, %[pixels], #7 \n\t" 19.106 + "bic %[pixels], %[pixels], #7 \n\t" 19.107 + "tmcr wcgr1, r12 \n\t" 19.108 + "add r4, %[pixels], %[line_size] \n\t" 19.109 + "add r5, %[block], %[line_size] \n\t" 19.110 + "mov %[line_size], %[line_size], lsl #1 \n\t" 19.111 + "1: \n\t" 19.112 + "wldrd wr0, [%[pixels]] \n\t" 19.113 + "wldrd wr1, [%[pixels], #8] \n\t" 19.114 + "subs %[h], %[h], #2 \n\t" 19.115 + "wldrd wr2, [%[pixels], #16] \n\t" 19.116 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.117 + "wldrd wr3, [r4] \n\t" 19.118 + "pld [%[pixels]] \n\t" 19.119 + "pld [%[pixels], #32] \n\t" 19.120 + "walignr1 wr8, wr0, wr1 \n\t" 19.121 + "wldrd wr4, [r4, #8] \n\t" 19.122 + "walignr1 wr9, wr1, wr2 \n\t" 19.123 + "wldrd wr5, [r4, #16] \n\t" 19.124 + "add r4, r4, %[line_size] \n\t" 19.125 + "pld [r4] \n\t" 19.126 + "pld [r4, #32] \n\t" 19.127 + "walignr1 wr10, wr3, wr4 \n\t" 19.128 + "wstrd wr8, [%[block]] \n\t" 19.129 + "walignr1 wr11, wr4, wr5 \n\t" 19.130 + "wstrd wr9, [%[block], #8] \n\t" 19.131 + "add %[block], %[block], %[line_size] \n\t" 19.132 + "wstrd wr10, [r5] \n\t" 19.133 + "wstrd wr11, [r5, #8] \n\t" 19.134 + "add r5, r5, %[line_size] \n\t" 19.135 + "bne 1b \n\t" 19.136 + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 19.137 + : 19.138 + : "memory", "r4", "r5", "r12"); 19.139 +} 19.140 + 19.141 +void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.142 +{ 19.143 + int stride = line_size; 19.144 + __asm__ volatile ( 19.145 + "pld [%[pixels]] \n\t" 19.146 + "pld [%[pixels], #32] \n\t" 19.147 + "pld [%[block]] \n\t" 19.148 + "pld [%[block], #32] \n\t" 19.149 + "and r12, %[pixels], #7 \n\t" 19.150 + "bic %[pixels], %[pixels], #7 \n\t" 19.151 + "tmcr wcgr1, r12 \n\t" 19.152 + "add r4, %[pixels], %[line_size]\n\t" 19.153 + "add r5, %[block], %[line_size] \n\t" 19.154 + "mov %[line_size], 
%[line_size], lsl #1 \n\t" 19.155 + "1: \n\t" 19.156 + "wldrd wr0, [%[pixels]] \n\t" 19.157 + "wldrd wr1, [%[pixels], #8] \n\t" 19.158 + "subs %[h], %[h], #2 \n\t" 19.159 + "wldrd wr2, [%[pixels], #16] \n\t" 19.160 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.161 + "wldrd wr3, [r4] \n\t" 19.162 + "pld [%[pixels]] \n\t" 19.163 + "pld [%[pixels], #32] \n\t" 19.164 + "walignr1 wr8, wr0, wr1 \n\t" 19.165 + "wldrd wr4, [r4, #8] \n\t" 19.166 + "walignr1 wr9, wr1, wr2 \n\t" 19.167 + "wldrd wr5, [r4, #16] \n\t" 19.168 + "add r4, r4, %[line_size] \n\t" 19.169 + "wldrd wr0, [%[block]] \n\t" 19.170 + "pld [r4] \n\t" 19.171 + "wldrd wr1, [%[block], #8] \n\t" 19.172 + "pld [r4, #32] \n\t" 19.173 + "wldrd wr2, [r5] \n\t" 19.174 + "walignr1 wr10, wr3, wr4 \n\t" 19.175 + "wldrd wr3, [r5, #8] \n\t" 19.176 + WAVG2B" wr8, wr8, wr0 \n\t" 19.177 + WAVG2B" wr9, wr9, wr1 \n\t" 19.178 + WAVG2B" wr10, wr10, wr2 \n\t" 19.179 + "wstrd wr8, [%[block]] \n\t" 19.180 + "walignr1 wr11, wr4, wr5 \n\t" 19.181 + WAVG2B" wr11, wr11, wr3 \n\t" 19.182 + "wstrd wr9, [%[block], #8] \n\t" 19.183 + "add %[block], %[block], %[line_size] \n\t" 19.184 + "wstrd wr10, [r5] \n\t" 19.185 + "pld [%[block]] \n\t" 19.186 + "pld [%[block], #32] \n\t" 19.187 + "wstrd wr11, [r5, #8] \n\t" 19.188 + "add r5, r5, %[line_size] \n\t" 19.189 + "pld [r5] \n\t" 19.190 + "pld [r5, #32] \n\t" 19.191 + "bne 1b \n\t" 19.192 + : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 19.193 + : 19.194 + : "memory", "r4", "r5", "r12"); 19.195 +} 19.196 + 19.197 +void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.198 +{ 19.199 + int stride = line_size; 19.200 + // [wr0 wr1 wr2 wr3] for previous line 19.201 + // [wr4 wr5 wr6 wr7] for current line 19.202 + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 19.203 + __asm__ volatile( 19.204 + "pld [%[pixels]] \n\t" 19.205 + "pld [%[pixels], #32] \n\t" 19.206 + "and r12, %[pixels], #7 \n\t" 19.207 + "bic 
%[pixels], %[pixels], #7 \n\t" 19.208 + "tmcr wcgr1, r12 \n\t" 19.209 + "add r12, r12, #1 \n\t" 19.210 + "add r4, %[pixels], %[line_size]\n\t" 19.211 + "tmcr wcgr2, r12 \n\t" 19.212 + "add r5, %[block], %[line_size] \n\t" 19.213 + "mov %[line_size], %[line_size], lsl #1 \n\t" 19.214 + 19.215 + "1: \n\t" 19.216 + "wldrd wr10, [%[pixels]] \n\t" 19.217 + "cmp r12, #8 \n\t" 19.218 + "wldrd wr11, [%[pixels], #8] \n\t" 19.219 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.220 + "wldrd wr13, [r4] \n\t" 19.221 + "pld [%[pixels]] \n\t" 19.222 + "wldrd wr14, [r4, #8] \n\t" 19.223 + "pld [%[pixels], #32] \n\t" 19.224 + "add r4, r4, %[line_size] \n\t" 19.225 + "walignr1 wr0, wr10, wr11 \n\t" 19.226 + "pld [r4] \n\t" 19.227 + "pld [r4, #32] \n\t" 19.228 + "walignr1 wr2, wr13, wr14 \n\t" 19.229 + "wmoveq wr4, wr11 \n\t" 19.230 + "wmoveq wr6, wr14 \n\t" 19.231 + "walignr2ne wr4, wr10, wr11 \n\t" 19.232 + "walignr2ne wr6, wr13, wr14 \n\t" 19.233 + WAVG2B" wr0, wr0, wr4 \n\t" 19.234 + WAVG2B" wr2, wr2, wr6 \n\t" 19.235 + "wstrd wr0, [%[block]] \n\t" 19.236 + "subs %[h], %[h], #2 \n\t" 19.237 + "wstrd wr2, [r5] \n\t" 19.238 + "add %[block], %[block], %[line_size] \n\t" 19.239 + "add r5, r5, %[line_size] \n\t" 19.240 + "bne 1b \n\t" 19.241 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 19.242 + : 19.243 + : "r4", "r5", "r12", "memory"); 19.244 +} 19.245 + 19.246 +void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.247 +{ 19.248 + int stride = line_size; 19.249 + // [wr0 wr1 wr2 wr3] for previous line 19.250 + // [wr4 wr5 wr6 wr7] for current line 19.251 + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 19.252 + __asm__ volatile( 19.253 + "pld [%[pixels]] \n\t" 19.254 + "pld [%[pixels], #32] \n\t" 19.255 + "and r12, %[pixels], #7 \n\t" 19.256 + "bic %[pixels], %[pixels], #7 \n\t" 19.257 + "tmcr wcgr1, r12 \n\t" 19.258 + "add r12, r12, #1 \n\t" 19.259 + "add r4, %[pixels], %[line_size]\n\t" 
19.260 + "tmcr wcgr2, r12 \n\t" 19.261 + "add r5, %[block], %[line_size] \n\t" 19.262 + "mov %[line_size], %[line_size], lsl #1 \n\t" 19.263 + 19.264 + "1: \n\t" 19.265 + "wldrd wr10, [%[pixels]] \n\t" 19.266 + "cmp r12, #8 \n\t" 19.267 + "wldrd wr11, [%[pixels], #8] \n\t" 19.268 + "wldrd wr12, [%[pixels], #16] \n\t" 19.269 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.270 + "wldrd wr13, [r4] \n\t" 19.271 + "pld [%[pixels]] \n\t" 19.272 + "wldrd wr14, [r4, #8] \n\t" 19.273 + "pld [%[pixels], #32] \n\t" 19.274 + "wldrd wr15, [r4, #16] \n\t" 19.275 + "add r4, r4, %[line_size] \n\t" 19.276 + "walignr1 wr0, wr10, wr11 \n\t" 19.277 + "pld [r4] \n\t" 19.278 + "pld [r4, #32] \n\t" 19.279 + "walignr1 wr1, wr11, wr12 \n\t" 19.280 + "walignr1 wr2, wr13, wr14 \n\t" 19.281 + "walignr1 wr3, wr14, wr15 \n\t" 19.282 + "wmoveq wr4, wr11 \n\t" 19.283 + "wmoveq wr5, wr12 \n\t" 19.284 + "wmoveq wr6, wr14 \n\t" 19.285 + "wmoveq wr7, wr15 \n\t" 19.286 + "walignr2ne wr4, wr10, wr11 \n\t" 19.287 + "walignr2ne wr5, wr11, wr12 \n\t" 19.288 + "walignr2ne wr6, wr13, wr14 \n\t" 19.289 + "walignr2ne wr7, wr14, wr15 \n\t" 19.290 + WAVG2B" wr0, wr0, wr4 \n\t" 19.291 + WAVG2B" wr1, wr1, wr5 \n\t" 19.292 + "wstrd wr0, [%[block]] \n\t" 19.293 + WAVG2B" wr2, wr2, wr6 \n\t" 19.294 + "wstrd wr1, [%[block], #8] \n\t" 19.295 + WAVG2B" wr3, wr3, wr7 \n\t" 19.296 + "add %[block], %[block], %[line_size] \n\t" 19.297 + "wstrd wr2, [r5] \n\t" 19.298 + "subs %[h], %[h], #2 \n\t" 19.299 + "wstrd wr3, [r5, #8] \n\t" 19.300 + "add r5, r5, %[line_size] \n\t" 19.301 + "bne 1b \n\t" 19.302 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 19.303 + : 19.304 + : "r4", "r5", "r12", "memory"); 19.305 +} 19.306 + 19.307 +void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.308 +{ 19.309 + int stride = line_size; 19.310 + // [wr0 wr1 wr2 wr3] for previous line 19.311 + // [wr4 wr5 wr6 wr7] for current line 19.312 + SET_RND(wr15); // =2 for 
rnd and =1 for no_rnd version 19.313 + __asm__ volatile( 19.314 + "pld [%[pixels]] \n\t" 19.315 + "pld [%[pixels], #32] \n\t" 19.316 + "pld [%[block]] \n\t" 19.317 + "pld [%[block], #32] \n\t" 19.318 + "and r12, %[pixels], #7 \n\t" 19.319 + "bic %[pixels], %[pixels], #7 \n\t" 19.320 + "tmcr wcgr1, r12 \n\t" 19.321 + "add r12, r12, #1 \n\t" 19.322 + "add r4, %[pixels], %[line_size]\n\t" 19.323 + "tmcr wcgr2, r12 \n\t" 19.324 + "add r5, %[block], %[line_size] \n\t" 19.325 + "mov %[line_size], %[line_size], lsl #1 \n\t" 19.326 + "pld [r5] \n\t" 19.327 + "pld [r5, #32] \n\t" 19.328 + 19.329 + "1: \n\t" 19.330 + "wldrd wr10, [%[pixels]] \n\t" 19.331 + "cmp r12, #8 \n\t" 19.332 + "wldrd wr11, [%[pixels], #8] \n\t" 19.333 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.334 + "wldrd wr13, [r4] \n\t" 19.335 + "pld [%[pixels]] \n\t" 19.336 + "wldrd wr14, [r4, #8] \n\t" 19.337 + "pld [%[pixels], #32] \n\t" 19.338 + "add r4, r4, %[line_size] \n\t" 19.339 + "walignr1 wr0, wr10, wr11 \n\t" 19.340 + "pld [r4] \n\t" 19.341 + "pld [r4, #32] \n\t" 19.342 + "walignr1 wr2, wr13, wr14 \n\t" 19.343 + "wmoveq wr4, wr11 \n\t" 19.344 + "wmoveq wr6, wr14 \n\t" 19.345 + "walignr2ne wr4, wr10, wr11 \n\t" 19.346 + "wldrd wr10, [%[block]] \n\t" 19.347 + "walignr2ne wr6, wr13, wr14 \n\t" 19.348 + "wldrd wr12, [r5] \n\t" 19.349 + WAVG2B" wr0, wr0, wr4 \n\t" 19.350 + WAVG2B" wr2, wr2, wr6 \n\t" 19.351 + WAVG2B" wr0, wr0, wr10 \n\t" 19.352 + WAVG2B" wr2, wr2, wr12 \n\t" 19.353 + "wstrd wr0, [%[block]] \n\t" 19.354 + "subs %[h], %[h], #2 \n\t" 19.355 + "wstrd wr2, [r5] \n\t" 19.356 + "add %[block], %[block], %[line_size] \n\t" 19.357 + "add r5, r5, %[line_size] \n\t" 19.358 + "pld [%[block]] \n\t" 19.359 + "pld [%[block], #32] \n\t" 19.360 + "pld [r5] \n\t" 19.361 + "pld [r5, #32] \n\t" 19.362 + "bne 1b \n\t" 19.363 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 19.364 + : 19.365 + : "r4", "r5", "r12", "memory"); 19.366 +} 19.367 + 19.368 +void DEF(avg, 
pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.369 +{ 19.370 + int stride = line_size; 19.371 + // [wr0 wr1 wr2 wr3] for previous line 19.372 + // [wr4 wr5 wr6 wr7] for current line 19.373 + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 19.374 + __asm__ volatile( 19.375 + "pld [%[pixels]] \n\t" 19.376 + "pld [%[pixels], #32] \n\t" 19.377 + "pld [%[block]] \n\t" 19.378 + "pld [%[block], #32] \n\t" 19.379 + "and r12, %[pixels], #7 \n\t" 19.380 + "bic %[pixels], %[pixels], #7 \n\t" 19.381 + "tmcr wcgr1, r12 \n\t" 19.382 + "add r12, r12, #1 \n\t" 19.383 + "add r4, %[pixels], %[line_size]\n\t" 19.384 + "tmcr wcgr2, r12 \n\t" 19.385 + "add r5, %[block], %[line_size] \n\t" 19.386 + "mov %[line_size], %[line_size], lsl #1 \n\t" 19.387 + "pld [r5] \n\t" 19.388 + "pld [r5, #32] \n\t" 19.389 + 19.390 + "1: \n\t" 19.391 + "wldrd wr10, [%[pixels]] \n\t" 19.392 + "cmp r12, #8 \n\t" 19.393 + "wldrd wr11, [%[pixels], #8] \n\t" 19.394 + "wldrd wr12, [%[pixels], #16] \n\t" 19.395 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.396 + "wldrd wr13, [r4] \n\t" 19.397 + "pld [%[pixels]] \n\t" 19.398 + "wldrd wr14, [r4, #8] \n\t" 19.399 + "pld [%[pixels], #32] \n\t" 19.400 + "wldrd wr15, [r4, #16] \n\t" 19.401 + "add r4, r4, %[line_size] \n\t" 19.402 + "walignr1 wr0, wr10, wr11 \n\t" 19.403 + "pld [r4] \n\t" 19.404 + "pld [r4, #32] \n\t" 19.405 + "walignr1 wr1, wr11, wr12 \n\t" 19.406 + "walignr1 wr2, wr13, wr14 \n\t" 19.407 + "walignr1 wr3, wr14, wr15 \n\t" 19.408 + "wmoveq wr4, wr11 \n\t" 19.409 + "wmoveq wr5, wr12 \n\t" 19.410 + "wmoveq wr6, wr14 \n\t" 19.411 + "wmoveq wr7, wr15 \n\t" 19.412 + "walignr2ne wr4, wr10, wr11 \n\t" 19.413 + "walignr2ne wr5, wr11, wr12 \n\t" 19.414 + "walignr2ne wr6, wr13, wr14 \n\t" 19.415 + "walignr2ne wr7, wr14, wr15 \n\t" 19.416 + "wldrd wr10, [%[block]] \n\t" 19.417 + WAVG2B" wr0, wr0, wr4 \n\t" 19.418 + "wldrd wr11, [%[block], #8] \n\t" 19.419 + WAVG2B" wr1, wr1, wr5 \n\t" 19.420 + "wldrd wr12, [r5] \n\t" 
19.421 + WAVG2B" wr2, wr2, wr6 \n\t" 19.422 + "wldrd wr13, [r5, #8] \n\t" 19.423 + WAVG2B" wr3, wr3, wr7 \n\t" 19.424 + WAVG2B" wr0, wr0, wr10 \n\t" 19.425 + WAVG2B" wr1, wr1, wr11 \n\t" 19.426 + WAVG2B" wr2, wr2, wr12 \n\t" 19.427 + WAVG2B" wr3, wr3, wr13 \n\t" 19.428 + "wstrd wr0, [%[block]] \n\t" 19.429 + "subs %[h], %[h], #2 \n\t" 19.430 + "wstrd wr1, [%[block], #8] \n\t" 19.431 + "add %[block], %[block], %[line_size] \n\t" 19.432 + "wstrd wr2, [r5] \n\t" 19.433 + "pld [%[block]] \n\t" 19.434 + "wstrd wr3, [r5, #8] \n\t" 19.435 + "add r5, r5, %[line_size] \n\t" 19.436 + "pld [%[block], #32] \n\t" 19.437 + "pld [r5] \n\t" 19.438 + "pld [r5, #32] \n\t" 19.439 + "bne 1b \n\t" 19.440 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 19.441 + : 19.442 + :"r4", "r5", "r12", "memory"); 19.443 +} 19.444 + 19.445 +void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.446 +{ 19.447 + int stride = line_size; 19.448 + // [wr0 wr1 wr2 wr3] for previous line 19.449 + // [wr4 wr5 wr6 wr7] for current line 19.450 + __asm__ volatile( 19.451 + "pld [%[pixels]] \n\t" 19.452 + "pld [%[pixels], #32] \n\t" 19.453 + "and r12, %[pixels], #7 \n\t" 19.454 + "tmcr wcgr1, r12 \n\t" 19.455 + "bic %[pixels], %[pixels], #7 \n\t" 19.456 + 19.457 + "wldrd wr10, [%[pixels]] \n\t" 19.458 + "wldrd wr11, [%[pixels], #8] \n\t" 19.459 + "pld [%[block]] \n\t" 19.460 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.461 + "walignr1 wr0, wr10, wr11 \n\t" 19.462 + "pld [%[pixels]] \n\t" 19.463 + "pld [%[pixels], #32] \n\t" 19.464 + 19.465 + "1: \n\t" 19.466 + "wldrd wr10, [%[pixels]] \n\t" 19.467 + "wldrd wr11, [%[pixels], #8] \n\t" 19.468 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.469 + "pld [%[pixels]] \n\t" 19.470 + "pld [%[pixels], #32] \n\t" 19.471 + "walignr1 wr4, wr10, wr11 \n\t" 19.472 + "wldrd wr10, [%[block]] \n\t" 19.473 + WAVG2B" wr8, wr0, wr4 \n\t" 19.474 + WAVG2B" wr8, wr8, wr10 \n\t" 19.475 + "wstrd wr8, 
[%[block]] \n\t" 19.476 + "add %[block], %[block], %[line_size] \n\t" 19.477 + 19.478 + "wldrd wr10, [%[pixels]] \n\t" 19.479 + "wldrd wr11, [%[pixels], #8] \n\t" 19.480 + "pld [%[block]] \n\t" 19.481 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.482 + "pld [%[pixels]] \n\t" 19.483 + "pld [%[pixels], #32] \n\t" 19.484 + "walignr1 wr0, wr10, wr11 \n\t" 19.485 + "wldrd wr10, [%[block]] \n\t" 19.486 + WAVG2B" wr8, wr0, wr4 \n\t" 19.487 + WAVG2B" wr8, wr8, wr10 \n\t" 19.488 + "wstrd wr8, [%[block]] \n\t" 19.489 + "add %[block], %[block], %[line_size] \n\t" 19.490 + 19.491 + "subs %[h], %[h], #2 \n\t" 19.492 + "pld [%[block]] \n\t" 19.493 + "bne 1b \n\t" 19.494 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 19.495 + : 19.496 + : "cc", "memory", "r12"); 19.497 +} 19.498 + 19.499 +void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.500 +{ 19.501 + int stride = line_size; 19.502 + // [wr0 wr1 wr2 wr3] for previous line 19.503 + // [wr4 wr5 wr6 wr7] for current line 19.504 + __asm__ volatile( 19.505 + "pld [%[pixels]] \n\t" 19.506 + "pld [%[pixels], #32] \n\t" 19.507 + "and r12, %[pixels], #7 \n\t" 19.508 + "tmcr wcgr1, r12 \n\t" 19.509 + "bic %[pixels], %[pixels], #7 \n\t" 19.510 + 19.511 + "wldrd wr10, [%[pixels]] \n\t" 19.512 + "wldrd wr11, [%[pixels], #8] \n\t" 19.513 + "wldrd wr12, [%[pixels], #16] \n\t" 19.514 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.515 + "pld [%[pixels]] \n\t" 19.516 + "pld [%[pixels], #32] \n\t" 19.517 + "walignr1 wr0, wr10, wr11 \n\t" 19.518 + "walignr1 wr1, wr11, wr12 \n\t" 19.519 + 19.520 + "1: \n\t" 19.521 + "wldrd wr10, [%[pixels]] \n\t" 19.522 + "wldrd wr11, [%[pixels], #8] \n\t" 19.523 + "wldrd wr12, [%[pixels], #16] \n\t" 19.524 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.525 + "pld [%[pixels]] \n\t" 19.526 + "pld [%[pixels], #32] \n\t" 19.527 + "walignr1 wr4, wr10, wr11 \n\t" 19.528 + "walignr1 wr5, wr11, wr12 \n\t" 19.529 + WAVG2B" wr8, 
wr0, wr4 \n\t" 19.530 + WAVG2B" wr9, wr1, wr5 \n\t" 19.531 + "wstrd wr8, [%[block]] \n\t" 19.532 + "wstrd wr9, [%[block], #8] \n\t" 19.533 + "add %[block], %[block], %[line_size] \n\t" 19.534 + 19.535 + "wldrd wr10, [%[pixels]] \n\t" 19.536 + "wldrd wr11, [%[pixels], #8] \n\t" 19.537 + "wldrd wr12, [%[pixels], #16] \n\t" 19.538 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.539 + "pld [%[pixels]] \n\t" 19.540 + "pld [%[pixels], #32] \n\t" 19.541 + "walignr1 wr0, wr10, wr11 \n\t" 19.542 + "walignr1 wr1, wr11, wr12 \n\t" 19.543 + WAVG2B" wr8, wr0, wr4 \n\t" 19.544 + WAVG2B" wr9, wr1, wr5 \n\t" 19.545 + "wstrd wr8, [%[block]] \n\t" 19.546 + "wstrd wr9, [%[block], #8] \n\t" 19.547 + "add %[block], %[block], %[line_size] \n\t" 19.548 + 19.549 + "subs %[h], %[h], #2 \n\t" 19.550 + "bne 1b \n\t" 19.551 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 19.552 + : 19.553 + : "r4", "r5", "r12", "memory"); 19.554 +} 19.555 + 19.556 +void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.557 +{ 19.558 + int stride = line_size; 19.559 + // [wr0 wr1 wr2 wr3] for previous line 19.560 + // [wr4 wr5 wr6 wr7] for current line 19.561 + __asm__ volatile( 19.562 + "pld [%[pixels]] \n\t" 19.563 + "pld [%[pixels], #32] \n\t" 19.564 + "and r12, %[pixels], #7 \n\t" 19.565 + "tmcr wcgr1, r12 \n\t" 19.566 + "bic %[pixels], %[pixels], #7 \n\t" 19.567 + 19.568 + "wldrd wr10, [%[pixels]] \n\t" 19.569 + "wldrd wr11, [%[pixels], #8] \n\t" 19.570 + "pld [%[block]] \n\t" 19.571 + "wldrd wr12, [%[pixels], #16] \n\t" 19.572 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.573 + "pld [%[pixels]] \n\t" 19.574 + "pld [%[pixels], #32] \n\t" 19.575 + "walignr1 wr0, wr10, wr11 \n\t" 19.576 + "walignr1 wr1, wr11, wr12 \n\t" 19.577 + 19.578 + "1: \n\t" 19.579 + "wldrd wr10, [%[pixels]] \n\t" 19.580 + "wldrd wr11, [%[pixels], #8] \n\t" 19.581 + "wldrd wr12, [%[pixels], #16] \n\t" 19.582 + "add %[pixels], %[pixels], %[line_size] 
\n\t" 19.583 + "pld [%[pixels]] \n\t" 19.584 + "pld [%[pixels], #32] \n\t" 19.585 + "walignr1 wr4, wr10, wr11 \n\t" 19.586 + "walignr1 wr5, wr11, wr12 \n\t" 19.587 + "wldrd wr10, [%[block]] \n\t" 19.588 + "wldrd wr11, [%[block], #8] \n\t" 19.589 + WAVG2B" wr8, wr0, wr4 \n\t" 19.590 + WAVG2B" wr9, wr1, wr5 \n\t" 19.591 + WAVG2B" wr8, wr8, wr10 \n\t" 19.592 + WAVG2B" wr9, wr9, wr11 \n\t" 19.593 + "wstrd wr8, [%[block]] \n\t" 19.594 + "wstrd wr9, [%[block], #8] \n\t" 19.595 + "add %[block], %[block], %[line_size] \n\t" 19.596 + 19.597 + "wldrd wr10, [%[pixels]] \n\t" 19.598 + "wldrd wr11, [%[pixels], #8] \n\t" 19.599 + "pld [%[block]] \n\t" 19.600 + "wldrd wr12, [%[pixels], #16] \n\t" 19.601 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.602 + "pld [%[pixels]] \n\t" 19.603 + "pld [%[pixels], #32] \n\t" 19.604 + "walignr1 wr0, wr10, wr11 \n\t" 19.605 + "walignr1 wr1, wr11, wr12 \n\t" 19.606 + "wldrd wr10, [%[block]] \n\t" 19.607 + "wldrd wr11, [%[block], #8] \n\t" 19.608 + WAVG2B" wr8, wr0, wr4 \n\t" 19.609 + WAVG2B" wr9, wr1, wr5 \n\t" 19.610 + WAVG2B" wr8, wr8, wr10 \n\t" 19.611 + WAVG2B" wr9, wr9, wr11 \n\t" 19.612 + "wstrd wr8, [%[block]] \n\t" 19.613 + "wstrd wr9, [%[block], #8] \n\t" 19.614 + "add %[block], %[block], %[line_size] \n\t" 19.615 + 19.616 + "subs %[h], %[h], #2 \n\t" 19.617 + "pld [%[block]] \n\t" 19.618 + "bne 1b \n\t" 19.619 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 19.620 + : 19.621 + : "r4", "r5", "r12", "memory"); 19.622 +} 19.623 + 19.624 +void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.625 +{ 19.626 + // [wr0 wr1 wr2 wr3] for previous line 19.627 + // [wr4 wr5 wr6 wr7] for current line 19.628 + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 19.629 + __asm__ volatile( 19.630 + "pld [%[pixels]] \n\t" 19.631 + "mov r12, #2 \n\t" 19.632 + "pld [%[pixels], #32] \n\t" 19.633 + "tmcr wcgr0, r12 \n\t" /* for shift value */ 19.634 + "and r12, %[pixels], 
#7 \n\t" 19.635 + "bic %[pixels], %[pixels], #7 \n\t" 19.636 + "tmcr wcgr1, r12 \n\t" 19.637 + 19.638 + // [wr0 wr1 wr2 wr3] <= * 19.639 + // [wr4 wr5 wr6 wr7] 19.640 + "wldrd wr12, [%[pixels]] \n\t" 19.641 + "add r12, r12, #1 \n\t" 19.642 + "wldrd wr13, [%[pixels], #8] \n\t" 19.643 + "tmcr wcgr2, r12 \n\t" 19.644 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.645 + "cmp r12, #8 \n\t" 19.646 + "pld [%[pixels]] \n\t" 19.647 + "pld [%[pixels], #32] \n\t" 19.648 + "walignr1 wr2, wr12, wr13 \n\t" 19.649 + "wmoveq wr10, wr13 \n\t" 19.650 + "walignr2ne wr10, wr12, wr13 \n\t" 19.651 + "wunpckelub wr0, wr2 \n\t" 19.652 + "wunpckehub wr1, wr2 \n\t" 19.653 + "wunpckelub wr8, wr10 \n\t" 19.654 + "wunpckehub wr9, wr10 \n\t" 19.655 + "waddhus wr0, wr0, wr8 \n\t" 19.656 + "waddhus wr1, wr1, wr9 \n\t" 19.657 + 19.658 + "1: \n\t" 19.659 + // [wr0 wr1 wr2 wr3] 19.660 + // [wr4 wr5 wr6 wr7] <= * 19.661 + "wldrd wr12, [%[pixels]] \n\t" 19.662 + "cmp r12, #8 \n\t" 19.663 + "wldrd wr13, [%[pixels], #8] \n\t" 19.664 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.665 + "walignr1 wr6, wr12, wr13 \n\t" 19.666 + "pld [%[pixels]] \n\t" 19.667 + "pld [%[pixels], #32] \n\t" 19.668 + "wmoveq wr10, wr13 \n\t" 19.669 + "walignr2ne wr10, wr12, wr13 \n\t" 19.670 + "wunpckelub wr4, wr6 \n\t" 19.671 + "wunpckehub wr5, wr6 \n\t" 19.672 + "wunpckelub wr8, wr10 \n\t" 19.673 + "wunpckehub wr9, wr10 \n\t" 19.674 + "waddhus wr4, wr4, wr8 \n\t" 19.675 + "waddhus wr5, wr5, wr9 \n\t" 19.676 + "waddhus wr8, wr0, wr4 \n\t" 19.677 + "waddhus wr9, wr1, wr5 \n\t" 19.678 + "waddhus wr8, wr8, wr15 \n\t" 19.679 + "waddhus wr9, wr9, wr15 \n\t" 19.680 + "wsrlhg wr8, wr8, wcgr0 \n\t" 19.681 + "wsrlhg wr9, wr9, wcgr0 \n\t" 19.682 + "wpackhus wr8, wr8, wr9 \n\t" 19.683 + "wstrd wr8, [%[block]] \n\t" 19.684 + "add %[block], %[block], %[line_size] \n\t" 19.685 + 19.686 + // [wr0 wr1 wr2 wr3] <= * 19.687 + // [wr4 wr5 wr6 wr7] 19.688 + "wldrd wr12, [%[pixels]] \n\t" 19.689 + "wldrd wr13, [%[pixels], #8] \n\t" 19.690 
+ "add %[pixels], %[pixels], %[line_size] \n\t" 19.691 + "walignr1 wr2, wr12, wr13 \n\t" 19.692 + "pld [%[pixels]] \n\t" 19.693 + "pld [%[pixels], #32] \n\t" 19.694 + "wmoveq wr10, wr13 \n\t" 19.695 + "walignr2ne wr10, wr12, wr13 \n\t" 19.696 + "wunpckelub wr0, wr2 \n\t" 19.697 + "wunpckehub wr1, wr2 \n\t" 19.698 + "wunpckelub wr8, wr10 \n\t" 19.699 + "wunpckehub wr9, wr10 \n\t" 19.700 + "waddhus wr0, wr0, wr8 \n\t" 19.701 + "waddhus wr1, wr1, wr9 \n\t" 19.702 + "waddhus wr8, wr0, wr4 \n\t" 19.703 + "waddhus wr9, wr1, wr5 \n\t" 19.704 + "waddhus wr8, wr8, wr15 \n\t" 19.705 + "waddhus wr9, wr9, wr15 \n\t" 19.706 + "wsrlhg wr8, wr8, wcgr0 \n\t" 19.707 + "wsrlhg wr9, wr9, wcgr0 \n\t" 19.708 + "wpackhus wr8, wr8, wr9 \n\t" 19.709 + "subs %[h], %[h], #2 \n\t" 19.710 + "wstrd wr8, [%[block]] \n\t" 19.711 + "add %[block], %[block], %[line_size] \n\t" 19.712 + "bne 1b \n\t" 19.713 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) 19.714 + : [line_size]"r"(line_size) 19.715 + : "r12", "memory"); 19.716 +} 19.717 + 19.718 +void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.719 +{ 19.720 + // [wr0 wr1 wr2 wr3] for previous line 19.721 + // [wr4 wr5 wr6 wr7] for current line 19.722 + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 19.723 + __asm__ volatile( 19.724 + "pld [%[pixels]] \n\t" 19.725 + "mov r12, #2 \n\t" 19.726 + "pld [%[pixels], #32] \n\t" 19.727 + "tmcr wcgr0, r12 \n\t" /* for shift value */ 19.728 + /* alignment */ 19.729 + "and r12, %[pixels], #7 \n\t" 19.730 + "bic %[pixels], %[pixels], #7 \n\t" 19.731 + "tmcr wcgr1, r12 \n\t" 19.732 + "add r12, r12, #1 \n\t" 19.733 + "tmcr wcgr2, r12 \n\t" 19.734 + 19.735 + // [wr0 wr1 wr2 wr3] <= * 19.736 + // [wr4 wr5 wr6 wr7] 19.737 + "wldrd wr12, [%[pixels]] \n\t" 19.738 + "cmp r12, #8 \n\t" 19.739 + "wldrd wr13, [%[pixels], #8] \n\t" 19.740 + "wldrd wr14, [%[pixels], #16] \n\t" 19.741 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.742 + "pld [%[pixels]] 
\n\t" 19.743 + "walignr1 wr2, wr12, wr13 \n\t" 19.744 + "pld [%[pixels], #32] \n\t" 19.745 + "walignr1 wr3, wr13, wr14 \n\t" 19.746 + "wmoveq wr10, wr13 \n\t" 19.747 + "wmoveq wr11, wr14 \n\t" 19.748 + "walignr2ne wr10, wr12, wr13 \n\t" 19.749 + "walignr2ne wr11, wr13, wr14 \n\t" 19.750 + "wunpckelub wr0, wr2 \n\t" 19.751 + "wunpckehub wr1, wr2 \n\t" 19.752 + "wunpckelub wr2, wr3 \n\t" 19.753 + "wunpckehub wr3, wr3 \n\t" 19.754 + "wunpckelub wr8, wr10 \n\t" 19.755 + "wunpckehub wr9, wr10 \n\t" 19.756 + "wunpckelub wr10, wr11 \n\t" 19.757 + "wunpckehub wr11, wr11 \n\t" 19.758 + "waddhus wr0, wr0, wr8 \n\t" 19.759 + "waddhus wr1, wr1, wr9 \n\t" 19.760 + "waddhus wr2, wr2, wr10 \n\t" 19.761 + "waddhus wr3, wr3, wr11 \n\t" 19.762 + 19.763 + "1: \n\t" 19.764 + // [wr0 wr1 wr2 wr3] 19.765 + // [wr4 wr5 wr6 wr7] <= * 19.766 + "wldrd wr12, [%[pixels]] \n\t" 19.767 + "cmp r12, #8 \n\t" 19.768 + "wldrd wr13, [%[pixels], #8] \n\t" 19.769 + "wldrd wr14, [%[pixels], #16] \n\t" 19.770 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.771 + "walignr1 wr6, wr12, wr13 \n\t" 19.772 + "pld [%[pixels]] \n\t" 19.773 + "pld [%[pixels], #32] \n\t" 19.774 + "walignr1 wr7, wr13, wr14 \n\t" 19.775 + "wmoveq wr10, wr13 \n\t" 19.776 + "wmoveq wr11, wr14 \n\t" 19.777 + "walignr2ne wr10, wr12, wr13 \n\t" 19.778 + "walignr2ne wr11, wr13, wr14 \n\t" 19.779 + "wunpckelub wr4, wr6 \n\t" 19.780 + "wunpckehub wr5, wr6 \n\t" 19.781 + "wunpckelub wr6, wr7 \n\t" 19.782 + "wunpckehub wr7, wr7 \n\t" 19.783 + "wunpckelub wr8, wr10 \n\t" 19.784 + "wunpckehub wr9, wr10 \n\t" 19.785 + "wunpckelub wr10, wr11 \n\t" 19.786 + "wunpckehub wr11, wr11 \n\t" 19.787 + "waddhus wr4, wr4, wr8 \n\t" 19.788 + "waddhus wr5, wr5, wr9 \n\t" 19.789 + "waddhus wr6, wr6, wr10 \n\t" 19.790 + "waddhus wr7, wr7, wr11 \n\t" 19.791 + "waddhus wr8, wr0, wr4 \n\t" 19.792 + "waddhus wr9, wr1, wr5 \n\t" 19.793 + "waddhus wr10, wr2, wr6 \n\t" 19.794 + "waddhus wr11, wr3, wr7 \n\t" 19.795 + "waddhus wr8, wr8, wr15 \n\t" 19.796 + "waddhus 
wr9, wr9, wr15 \n\t" 19.797 + "waddhus wr10, wr10, wr15 \n\t" 19.798 + "waddhus wr11, wr11, wr15 \n\t" 19.799 + "wsrlhg wr8, wr8, wcgr0 \n\t" 19.800 + "wsrlhg wr9, wr9, wcgr0 \n\t" 19.801 + "wsrlhg wr10, wr10, wcgr0 \n\t" 19.802 + "wsrlhg wr11, wr11, wcgr0 \n\t" 19.803 + "wpackhus wr8, wr8, wr9 \n\t" 19.804 + "wpackhus wr9, wr10, wr11 \n\t" 19.805 + "wstrd wr8, [%[block]] \n\t" 19.806 + "wstrd wr9, [%[block], #8] \n\t" 19.807 + "add %[block], %[block], %[line_size] \n\t" 19.808 + 19.809 + // [wr0 wr1 wr2 wr3] <= * 19.810 + // [wr4 wr5 wr6 wr7] 19.811 + "wldrd wr12, [%[pixels]] \n\t" 19.812 + "wldrd wr13, [%[pixels], #8] \n\t" 19.813 + "wldrd wr14, [%[pixels], #16] \n\t" 19.814 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.815 + "walignr1 wr2, wr12, wr13 \n\t" 19.816 + "pld [%[pixels]] \n\t" 19.817 + "pld [%[pixels], #32] \n\t" 19.818 + "walignr1 wr3, wr13, wr14 \n\t" 19.819 + "wmoveq wr10, wr13 \n\t" 19.820 + "wmoveq wr11, wr14 \n\t" 19.821 + "walignr2ne wr10, wr12, wr13 \n\t" 19.822 + "walignr2ne wr11, wr13, wr14 \n\t" 19.823 + "wunpckelub wr0, wr2 \n\t" 19.824 + "wunpckehub wr1, wr2 \n\t" 19.825 + "wunpckelub wr2, wr3 \n\t" 19.826 + "wunpckehub wr3, wr3 \n\t" 19.827 + "wunpckelub wr8, wr10 \n\t" 19.828 + "wunpckehub wr9, wr10 \n\t" 19.829 + "wunpckelub wr10, wr11 \n\t" 19.830 + "wunpckehub wr11, wr11 \n\t" 19.831 + "waddhus wr0, wr0, wr8 \n\t" 19.832 + "waddhus wr1, wr1, wr9 \n\t" 19.833 + "waddhus wr2, wr2, wr10 \n\t" 19.834 + "waddhus wr3, wr3, wr11 \n\t" 19.835 + "waddhus wr8, wr0, wr4 \n\t" 19.836 + "waddhus wr9, wr1, wr5 \n\t" 19.837 + "waddhus wr10, wr2, wr6 \n\t" 19.838 + "waddhus wr11, wr3, wr7 \n\t" 19.839 + "waddhus wr8, wr8, wr15 \n\t" 19.840 + "waddhus wr9, wr9, wr15 \n\t" 19.841 + "waddhus wr10, wr10, wr15 \n\t" 19.842 + "waddhus wr11, wr11, wr15 \n\t" 19.843 + "wsrlhg wr8, wr8, wcgr0 \n\t" 19.844 + "wsrlhg wr9, wr9, wcgr0 \n\t" 19.845 + "wsrlhg wr10, wr10, wcgr0 \n\t" 19.846 + "wsrlhg wr11, wr11, wcgr0 \n\t" 19.847 + "wpackhus wr8, wr8, wr9 
\n\t" 19.848 + "wpackhus wr9, wr10, wr11 \n\t" 19.849 + "wstrd wr8, [%[block]] \n\t" 19.850 + "wstrd wr9, [%[block], #8] \n\t" 19.851 + "add %[block], %[block], %[line_size] \n\t" 19.852 + 19.853 + "subs %[h], %[h], #2 \n\t" 19.854 + "bne 1b \n\t" 19.855 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) 19.856 + : [line_size]"r"(line_size) 19.857 + : "r12", "memory"); 19.858 +} 19.859 + 19.860 +void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.861 +{ 19.862 + // [wr0 wr1 wr2 wr3] for previous line 19.863 + // [wr4 wr5 wr6 wr7] for current line 19.864 + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 19.865 + __asm__ volatile( 19.866 + "pld [%[block]] \n\t" 19.867 + "pld [%[block], #32] \n\t" 19.868 + "pld [%[pixels]] \n\t" 19.869 + "mov r12, #2 \n\t" 19.870 + "pld [%[pixels], #32] \n\t" 19.871 + "tmcr wcgr0, r12 \n\t" /* for shift value */ 19.872 + "and r12, %[pixels], #7 \n\t" 19.873 + "bic %[pixels], %[pixels], #7 \n\t" 19.874 + "tmcr wcgr1, r12 \n\t" 19.875 + 19.876 + // [wr0 wr1 wr2 wr3] <= * 19.877 + // [wr4 wr5 wr6 wr7] 19.878 + "wldrd wr12, [%[pixels]] \n\t" 19.879 + "add r12, r12, #1 \n\t" 19.880 + "wldrd wr13, [%[pixels], #8] \n\t" 19.881 + "tmcr wcgr2, r12 \n\t" 19.882 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.883 + "cmp r12, #8 \n\t" 19.884 + "pld [%[pixels]] \n\t" 19.885 + "pld [%[pixels], #32] \n\t" 19.886 + "walignr1 wr2, wr12, wr13 \n\t" 19.887 + "wmoveq wr10, wr13 \n\t" 19.888 + "walignr2ne wr10, wr12, wr13 \n\t" 19.889 + "wunpckelub wr0, wr2 \n\t" 19.890 + "wunpckehub wr1, wr2 \n\t" 19.891 + "wunpckelub wr8, wr10 \n\t" 19.892 + "wunpckehub wr9, wr10 \n\t" 19.893 + "waddhus wr0, wr0, wr8 \n\t" 19.894 + "waddhus wr1, wr1, wr9 \n\t" 19.895 + 19.896 + "1: \n\t" 19.897 + // [wr0 wr1 wr2 wr3] 19.898 + // [wr4 wr5 wr6 wr7] <= * 19.899 + "wldrd wr12, [%[pixels]] \n\t" 19.900 + "cmp r12, #8 \n\t" 19.901 + "wldrd wr13, [%[pixels], #8] \n\t" 19.902 + "add %[pixels], %[pixels], %[line_size] 
\n\t" 19.903 + "walignr1 wr6, wr12, wr13 \n\t" 19.904 + "pld [%[pixels]] \n\t" 19.905 + "pld [%[pixels], #32] \n\t" 19.906 + "wmoveq wr10, wr13 \n\t" 19.907 + "walignr2ne wr10, wr12, wr13 \n\t" 19.908 + "wunpckelub wr4, wr6 \n\t" 19.909 + "wunpckehub wr5, wr6 \n\t" 19.910 + "wunpckelub wr8, wr10 \n\t" 19.911 + "wunpckehub wr9, wr10 \n\t" 19.912 + "waddhus wr4, wr4, wr8 \n\t" 19.913 + "waddhus wr5, wr5, wr9 \n\t" 19.914 + "waddhus wr8, wr0, wr4 \n\t" 19.915 + "waddhus wr9, wr1, wr5 \n\t" 19.916 + "waddhus wr8, wr8, wr15 \n\t" 19.917 + "waddhus wr9, wr9, wr15 \n\t" 19.918 + "wldrd wr12, [%[block]] \n\t" 19.919 + "wsrlhg wr8, wr8, wcgr0 \n\t" 19.920 + "wsrlhg wr9, wr9, wcgr0 \n\t" 19.921 + "wpackhus wr8, wr8, wr9 \n\t" 19.922 + WAVG2B" wr8, wr8, wr12 \n\t" 19.923 + "wstrd wr8, [%[block]] \n\t" 19.924 + "add %[block], %[block], %[line_size] \n\t" 19.925 + "wldrd wr12, [%[pixels]] \n\t" 19.926 + "pld [%[block]] \n\t" 19.927 + "pld [%[block], #32] \n\t" 19.928 + 19.929 + // [wr0 wr1 wr2 wr3] <= * 19.930 + // [wr4 wr5 wr6 wr7] 19.931 + "wldrd wr13, [%[pixels], #8] \n\t" 19.932 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.933 + "walignr1 wr2, wr12, wr13 \n\t" 19.934 + "pld [%[pixels]] \n\t" 19.935 + "pld [%[pixels], #32] \n\t" 19.936 + "wmoveq wr10, wr13 \n\t" 19.937 + "walignr2ne wr10, wr12, wr13 \n\t" 19.938 + "wunpckelub wr0, wr2 \n\t" 19.939 + "wunpckehub wr1, wr2 \n\t" 19.940 + "wunpckelub wr8, wr10 \n\t" 19.941 + "wunpckehub wr9, wr10 \n\t" 19.942 + "waddhus wr0, wr0, wr8 \n\t" 19.943 + "waddhus wr1, wr1, wr9 \n\t" 19.944 + "waddhus wr8, wr0, wr4 \n\t" 19.945 + "waddhus wr9, wr1, wr5 \n\t" 19.946 + "waddhus wr8, wr8, wr15 \n\t" 19.947 + "waddhus wr9, wr9, wr15 \n\t" 19.948 + "wldrd wr12, [%[block]] \n\t" 19.949 + "wsrlhg wr8, wr8, wcgr0 \n\t" 19.950 + "wsrlhg wr9, wr9, wcgr0 \n\t" 19.951 + "wpackhus wr8, wr8, wr9 \n\t" 19.952 + "subs %[h], %[h], #2 \n\t" 19.953 + WAVG2B" wr8, wr8, wr12 \n\t" 19.954 + "wstrd wr8, [%[block]] \n\t" 19.955 + "add %[block], 
%[block], %[line_size] \n\t" 19.956 + "pld [%[block]] \n\t" 19.957 + "pld [%[block], #32] \n\t" 19.958 + "bne 1b \n\t" 19.959 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) 19.960 + : [line_size]"r"(line_size) 19.961 + : "r12", "memory"); 19.962 +} 19.963 + 19.964 +void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 19.965 +{ 19.966 + // [wr0 wr1 wr2 wr3] for previous line 19.967 + // [wr4 wr5 wr6 wr7] for current line 19.968 + SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 19.969 + __asm__ volatile( 19.970 + "pld [%[block]] \n\t" 19.971 + "pld [%[block], #32] \n\t" 19.972 + "pld [%[pixels]] \n\t" 19.973 + "mov r12, #2 \n\t" 19.974 + "pld [%[pixels], #32] \n\t" 19.975 + "tmcr wcgr0, r12 \n\t" /* for shift value */ 19.976 + /* alignment */ 19.977 + "and r12, %[pixels], #7 \n\t" 19.978 + "bic %[pixels], %[pixels], #7 \n\t" 19.979 + "tmcr wcgr1, r12 \n\t" 19.980 + "add r12, r12, #1 \n\t" 19.981 + "tmcr wcgr2, r12 \n\t" 19.982 + 19.983 + // [wr0 wr1 wr2 wr3] <= * 19.984 + // [wr4 wr5 wr6 wr7] 19.985 + "wldrd wr12, [%[pixels]] \n\t" 19.986 + "cmp r12, #8 \n\t" 19.987 + "wldrd wr13, [%[pixels], #8] \n\t" 19.988 + "wldrd wr14, [%[pixels], #16] \n\t" 19.989 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.990 + "pld [%[pixels]] \n\t" 19.991 + "walignr1 wr2, wr12, wr13 \n\t" 19.992 + "pld [%[pixels], #32] \n\t" 19.993 + "walignr1 wr3, wr13, wr14 \n\t" 19.994 + "wmoveq wr10, wr13 \n\t" 19.995 + "wmoveq wr11, wr14 \n\t" 19.996 + "walignr2ne wr10, wr12, wr13 \n\t" 19.997 + "walignr2ne wr11, wr13, wr14 \n\t" 19.998 + "wunpckelub wr0, wr2 \n\t" 19.999 + "wunpckehub wr1, wr2 \n\t" 19.1000 + "wunpckelub wr2, wr3 \n\t" 19.1001 + "wunpckehub wr3, wr3 \n\t" 19.1002 + "wunpckelub wr8, wr10 \n\t" 19.1003 + "wunpckehub wr9, wr10 \n\t" 19.1004 + "wunpckelub wr10, wr11 \n\t" 19.1005 + "wunpckehub wr11, wr11 \n\t" 19.1006 + "waddhus wr0, wr0, wr8 \n\t" 19.1007 + "waddhus wr1, wr1, wr9 \n\t" 19.1008 + "waddhus wr2, wr2, wr10 
\n\t" 19.1009 + "waddhus wr3, wr3, wr11 \n\t" 19.1010 + 19.1011 + "1: \n\t" 19.1012 + // [wr0 wr1 wr2 wr3] 19.1013 + // [wr4 wr5 wr6 wr7] <= * 19.1014 + "wldrd wr12, [%[pixels]] \n\t" 19.1015 + "cmp r12, #8 \n\t" 19.1016 + "wldrd wr13, [%[pixels], #8] \n\t" 19.1017 + "wldrd wr14, [%[pixels], #16] \n\t" 19.1018 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.1019 + "walignr1 wr6, wr12, wr13 \n\t" 19.1020 + "pld [%[pixels]] \n\t" 19.1021 + "pld [%[pixels], #32] \n\t" 19.1022 + "walignr1 wr7, wr13, wr14 \n\t" 19.1023 + "wmoveq wr10, wr13 \n\t" 19.1024 + "wmoveq wr11, wr14 \n\t" 19.1025 + "walignr2ne wr10, wr12, wr13 \n\t" 19.1026 + "walignr2ne wr11, wr13, wr14 \n\t" 19.1027 + "wunpckelub wr4, wr6 \n\t" 19.1028 + "wunpckehub wr5, wr6 \n\t" 19.1029 + "wunpckelub wr6, wr7 \n\t" 19.1030 + "wunpckehub wr7, wr7 \n\t" 19.1031 + "wunpckelub wr8, wr10 \n\t" 19.1032 + "wunpckehub wr9, wr10 \n\t" 19.1033 + "wunpckelub wr10, wr11 \n\t" 19.1034 + "wunpckehub wr11, wr11 \n\t" 19.1035 + "waddhus wr4, wr4, wr8 \n\t" 19.1036 + "waddhus wr5, wr5, wr9 \n\t" 19.1037 + "waddhus wr6, wr6, wr10 \n\t" 19.1038 + "waddhus wr7, wr7, wr11 \n\t" 19.1039 + "waddhus wr8, wr0, wr4 \n\t" 19.1040 + "waddhus wr9, wr1, wr5 \n\t" 19.1041 + "waddhus wr10, wr2, wr6 \n\t" 19.1042 + "waddhus wr11, wr3, wr7 \n\t" 19.1043 + "waddhus wr8, wr8, wr15 \n\t" 19.1044 + "waddhus wr9, wr9, wr15 \n\t" 19.1045 + "waddhus wr10, wr10, wr15 \n\t" 19.1046 + "waddhus wr11, wr11, wr15 \n\t" 19.1047 + "wsrlhg wr8, wr8, wcgr0 \n\t" 19.1048 + "wsrlhg wr9, wr9, wcgr0 \n\t" 19.1049 + "wldrd wr12, [%[block]] \n\t" 19.1050 + "wldrd wr13, [%[block], #8] \n\t" 19.1051 + "wsrlhg wr10, wr10, wcgr0 \n\t" 19.1052 + "wsrlhg wr11, wr11, wcgr0 \n\t" 19.1053 + "wpackhus wr8, wr8, wr9 \n\t" 19.1054 + "wpackhus wr9, wr10, wr11 \n\t" 19.1055 + WAVG2B" wr8, wr8, wr12 \n\t" 19.1056 + WAVG2B" wr9, wr9, wr13 \n\t" 19.1057 + "wstrd wr8, [%[block]] \n\t" 19.1058 + "wstrd wr9, [%[block], #8] \n\t" 19.1059 + "add %[block], %[block], %[line_size] 
\n\t" 19.1060 + 19.1061 + // [wr0 wr1 wr2 wr3] <= * 19.1062 + // [wr4 wr5 wr6 wr7] 19.1063 + "wldrd wr12, [%[pixels]] \n\t" 19.1064 + "pld [%[block]] \n\t" 19.1065 + "wldrd wr13, [%[pixels], #8] \n\t" 19.1066 + "pld [%[block], #32] \n\t" 19.1067 + "wldrd wr14, [%[pixels], #16] \n\t" 19.1068 + "add %[pixels], %[pixels], %[line_size] \n\t" 19.1069 + "walignr1 wr2, wr12, wr13 \n\t" 19.1070 + "pld [%[pixels]] \n\t" 19.1071 + "pld [%[pixels], #32] \n\t" 19.1072 + "walignr1 wr3, wr13, wr14 \n\t" 19.1073 + "wmoveq wr10, wr13 \n\t" 19.1074 + "wmoveq wr11, wr14 \n\t" 19.1075 + "walignr2ne wr10, wr12, wr13 \n\t" 19.1076 + "walignr2ne wr11, wr13, wr14 \n\t" 19.1077 + "wunpckelub wr0, wr2 \n\t" 19.1078 + "wunpckehub wr1, wr2 \n\t" 19.1079 + "wunpckelub wr2, wr3 \n\t" 19.1080 + "wunpckehub wr3, wr3 \n\t" 19.1081 + "wunpckelub wr8, wr10 \n\t" 19.1082 + "wunpckehub wr9, wr10 \n\t" 19.1083 + "wunpckelub wr10, wr11 \n\t" 19.1084 + "wunpckehub wr11, wr11 \n\t" 19.1085 + "waddhus wr0, wr0, wr8 \n\t" 19.1086 + "waddhus wr1, wr1, wr9 \n\t" 19.1087 + "waddhus wr2, wr2, wr10 \n\t" 19.1088 + "waddhus wr3, wr3, wr11 \n\t" 19.1089 + "waddhus wr8, wr0, wr4 \n\t" 19.1090 + "waddhus wr9, wr1, wr5 \n\t" 19.1091 + "waddhus wr10, wr2, wr6 \n\t" 19.1092 + "waddhus wr11, wr3, wr7 \n\t" 19.1093 + "waddhus wr8, wr8, wr15 \n\t" 19.1094 + "waddhus wr9, wr9, wr15 \n\t" 19.1095 + "waddhus wr10, wr10, wr15 \n\t" 19.1096 + "waddhus wr11, wr11, wr15 \n\t" 19.1097 + "wsrlhg wr8, wr8, wcgr0 \n\t" 19.1098 + "wsrlhg wr9, wr9, wcgr0 \n\t" 19.1099 + "wldrd wr12, [%[block]] \n\t" 19.1100 + "wldrd wr13, [%[block], #8] \n\t" 19.1101 + "wsrlhg wr10, wr10, wcgr0 \n\t" 19.1102 + "wsrlhg wr11, wr11, wcgr0 \n\t" 19.1103 + "wpackhus wr8, wr8, wr9 \n\t" 19.1104 + "wpackhus wr9, wr10, wr11 \n\t" 19.1105 + WAVG2B" wr8, wr8, wr12 \n\t" 19.1106 + WAVG2B" wr9, wr9, wr13 \n\t" 19.1107 + "wstrd wr8, [%[block]] \n\t" 19.1108 + "wstrd wr9, [%[block], #8] \n\t" 19.1109 + "add %[block], %[block], %[line_size] \n\t" 19.1110 + "subs 
%[h], %[h], #2 \n\t" 19.1111 + "pld [%[block]] \n\t" 19.1112 + "pld [%[block], #32] \n\t" 19.1113 + "bne 1b \n\t" 19.1114 + : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) 19.1115 + : [line_size]"r"(line_size) 19.1116 + : "r12", "memory"); 19.1117 +}
20.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 20.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_neon.S Mon Aug 27 12:09:56 2012 +0200 20.3 @@ -0,0 +1,1146 @@ 20.4 +/* 20.5 + * ARM NEON optimised DSP functions 20.6 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> 20.7 + * 20.8 + * This file is part of FFmpeg. 20.9 + * 20.10 + * FFmpeg is free software; you can redistribute it and/or 20.11 + * modify it under the terms of the GNU Lesser General Public 20.12 + * License as published by the Free Software Foundation; either 20.13 + * version 2.1 of the License, or (at your option) any later version. 20.14 + * 20.15 + * FFmpeg is distributed in the hope that it will be useful, 20.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 20.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20.18 + * Lesser General Public License for more details. 20.19 + * 20.20 + * You should have received a copy of the GNU Lesser General Public 20.21 + * License along with FFmpeg; if not, write to the Free Software 20.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20.23 + */ 20.24 + 20.25 +#include "config.h" 20.26 +#include "asm.S" 20.27 + 20.28 + preserve8 20.29 + .text 20.30 + 20.31 + .macro pixels16 avg=0 20.32 +.if \avg 20.33 + mov ip, r0 20.34 +.endif 20.35 +1: vld1.64 {d0, d1}, [r1], r2 20.36 + vld1.64 {d2, d3}, [r1], r2 20.37 + vld1.64 {d4, d5}, [r1], r2 20.38 + pld [r1, r2, lsl #2] 20.39 + vld1.64 {d6, d7}, [r1], r2 20.40 + pld [r1] 20.41 + pld [r1, r2] 20.42 + pld [r1, r2, lsl #1] 20.43 +.if \avg 20.44 + vld1.64 {d16,d17}, [ip,:128], r2 20.45 + vrhadd.u8 q0, q0, q8 20.46 + vld1.64 {d18,d19}, [ip,:128], r2 20.47 + vrhadd.u8 q1, q1, q9 20.48 + vld1.64 {d20,d21}, [ip,:128], r2 20.49 + vrhadd.u8 q2, q2, q10 20.50 + vld1.64 {d22,d23}, [ip,:128], r2 20.51 + vrhadd.u8 q3, q3, q11 20.52 +.endif 20.53 + subs r3, r3, #4 20.54 + vst1.64 {d0, d1}, [r0,:128], r2 20.55 + vst1.64 {d2, d3}, [r0,:128], r2 20.56 
+ vst1.64 {d4, d5}, [r0,:128], r2 20.57 + vst1.64 {d6, d7}, [r0,:128], r2 20.58 + bne 1b 20.59 + bx lr 20.60 + .endm 20.61 + 20.62 + .macro pixels16_x2 vhadd=vrhadd.u8 20.63 +1: vld1.64 {d0-d2}, [r1], r2 20.64 + vld1.64 {d4-d6}, [r1], r2 20.65 + pld [r1] 20.66 + pld [r1, r2] 20.67 + subs r3, r3, #2 20.68 + vext.8 q1, q0, q1, #1 20.69 + \vhadd q0, q0, q1 20.70 + vext.8 q3, q2, q3, #1 20.71 + \vhadd q2, q2, q3 20.72 + vst1.64 {d0, d1}, [r0,:128], r2 20.73 + vst1.64 {d4, d5}, [r0,:128], r2 20.74 + bne 1b 20.75 + bx lr 20.76 + .endm 20.77 + 20.78 + .macro pixels16_y2 vhadd=vrhadd.u8 20.79 + vld1.64 {d0, d1}, [r1], r2 20.80 + vld1.64 {d2, d3}, [r1], r2 20.81 +1: subs r3, r3, #2 20.82 + \vhadd q2, q0, q1 20.83 + vld1.64 {d0, d1}, [r1], r2 20.84 + \vhadd q3, q0, q1 20.85 + vld1.64 {d2, d3}, [r1], r2 20.86 + pld [r1] 20.87 + pld [r1, r2] 20.88 + vst1.64 {d4, d5}, [r0,:128], r2 20.89 + vst1.64 {d6, d7}, [r0,:128], r2 20.90 + bne 1b 20.91 + bx lr 20.92 + .endm 20.93 + 20.94 + .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 20.95 + vld1.64 {d0-d2}, [r1], r2 20.96 + vld1.64 {d4-d6}, [r1], r2 20.97 +.if \no_rnd 20.98 + vmov.i16 q13, #1 20.99 +.endif 20.100 + pld [r1] 20.101 + pld [r1, r2] 20.102 + vext.8 q1, q0, q1, #1 20.103 + vext.8 q3, q2, q3, #1 20.104 + vaddl.u8 q8, d0, d2 20.105 + vaddl.u8 q10, d1, d3 20.106 + vaddl.u8 q9, d4, d6 20.107 + vaddl.u8 q11, d5, d7 20.108 +1: subs r3, r3, #2 20.109 + vld1.64 {d0-d2}, [r1], r2 20.110 + vadd.u16 q12, q8, q9 20.111 + pld [r1] 20.112 +.if \no_rnd 20.113 + vadd.u16 q12, q12, q13 20.114 +.endif 20.115 + vext.8 q15, q0, q1, #1 20.116 + vadd.u16 q1 , q10, q11 20.117 + \vshrn d28, q12, #2 20.118 +.if \no_rnd 20.119 + vadd.u16 q1, q1, q13 20.120 +.endif 20.121 + \vshrn d29, q1, #2 20.122 + vaddl.u8 q8, d0, d30 20.123 + vld1.64 {d2-d4}, [r1], r2 20.124 + vaddl.u8 q10, d1, d31 20.125 + vst1.64 {d28,d29}, [r0,:128], r2 20.126 + vadd.u16 q12, q8, q9 20.127 + pld [r1, r2] 20.128 +.if \no_rnd 20.129 + vadd.u16 q12, q12, q13 20.130 +.endif 20.131 
+ vext.8 q2, q1, q2, #1 20.132 + vadd.u16 q0, q10, q11 20.133 + \vshrn d30, q12, #2 20.134 +.if \no_rnd 20.135 + vadd.u16 q0, q0, q13 20.136 +.endif 20.137 + \vshrn d31, q0, #2 20.138 + vaddl.u8 q9, d2, d4 20.139 + vaddl.u8 q11, d3, d5 20.140 + vst1.64 {d30,d31}, [r0,:128], r2 20.141 + bgt 1b 20.142 + bx lr 20.143 + .endm 20.144 + 20.145 + .macro pixels8 avg=0 20.146 +1: vld1.64 {d0}, [r1], r2 20.147 + vld1.64 {d1}, [r1], r2 20.148 + vld1.64 {d2}, [r1], r2 20.149 + pld [r1, r2, lsl #2] 20.150 + vld1.64 {d3}, [r1], r2 20.151 + pld [r1] 20.152 + pld [r1, r2] 20.153 + pld [r1, r2, lsl #1] 20.154 +.if \avg 20.155 + vld1.64 {d4}, [r0,:64], r2 20.156 + vrhadd.u8 d0, d0, d4 20.157 + vld1.64 {d5}, [r0,:64], r2 20.158 + vrhadd.u8 d1, d1, d5 20.159 + vld1.64 {d6}, [r0,:64], r2 20.160 + vrhadd.u8 d2, d2, d6 20.161 + vld1.64 {d7}, [r0,:64], r2 20.162 + vrhadd.u8 d3, d3, d7 20.163 + sub r0, r0, r2, lsl #2 20.164 +.endif 20.165 + subs r3, r3, #4 20.166 + vst1.64 {d0}, [r0,:64], r2 20.167 + vst1.64 {d1}, [r0,:64], r2 20.168 + vst1.64 {d2}, [r0,:64], r2 20.169 + vst1.64 {d3}, [r0,:64], r2 20.170 + bne 1b 20.171 + bx lr 20.172 + .endm 20.173 + 20.174 + .macro pixels8_x2 vhadd=vrhadd.u8 20.175 +1: vld1.64 {d0, d1}, [r1], r2 20.176 + vext.8 d1, d0, d1, #1 20.177 + vld1.64 {d2, d3}, [r1], r2 20.178 + vext.8 d3, d2, d3, #1 20.179 + pld [r1] 20.180 + pld [r1, r2] 20.181 + subs r3, r3, #2 20.182 + vswp d1, d2 20.183 + \vhadd q0, q0, q1 20.184 + vst1.64 {d0}, [r0,:64], r2 20.185 + vst1.64 {d1}, [r0,:64], r2 20.186 + bne 1b 20.187 + bx lr 20.188 + .endm 20.189 + 20.190 + .macro pixels8_y2 vhadd=vrhadd.u8 20.191 + vld1.64 {d0}, [r1], r2 20.192 + vld1.64 {d1}, [r1], r2 20.193 +1: subs r3, r3, #2 20.194 + \vhadd d4, d0, d1 20.195 + vld1.64 {d0}, [r1], r2 20.196 + \vhadd d5, d0, d1 20.197 + vld1.64 {d1}, [r1], r2 20.198 + pld [r1] 20.199 + pld [r1, r2] 20.200 + vst1.64 {d4}, [r0,:64], r2 20.201 + vst1.64 {d5}, [r0,:64], r2 20.202 + bne 1b 20.203 + bx lr 20.204 + .endm 20.205 + 20.206 + .macro 
pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 20.207 + vld1.64 {d0, d1}, [r1], r2 20.208 + vld1.64 {d2, d3}, [r1], r2 20.209 +.if \no_rnd 20.210 + vmov.i16 q11, #1 20.211 +.endif 20.212 + pld [r1] 20.213 + pld [r1, r2] 20.214 + vext.8 d4, d0, d1, #1 20.215 + vext.8 d6, d2, d3, #1 20.216 + vaddl.u8 q8, d0, d4 20.217 + vaddl.u8 q9, d2, d6 20.218 +1: subs r3, r3, #2 20.219 + vld1.64 {d0, d1}, [r1], r2 20.220 + pld [r1] 20.221 + vadd.u16 q10, q8, q9 20.222 + vext.8 d4, d0, d1, #1 20.223 +.if \no_rnd 20.224 + vadd.u16 q10, q10, q11 20.225 +.endif 20.226 + vaddl.u8 q8, d0, d4 20.227 + \vshrn d5, q10, #2 20.228 + vld1.64 {d2, d3}, [r1], r2 20.229 + vadd.u16 q10, q8, q9 20.230 + pld [r1, r2] 20.231 +.if \no_rnd 20.232 + vadd.u16 q10, q10, q11 20.233 +.endif 20.234 + vst1.64 {d5}, [r0,:64], r2 20.235 + \vshrn d7, q10, #2 20.236 + vext.8 d6, d2, d3, #1 20.237 + vaddl.u8 q9, d2, d6 20.238 + vst1.64 {d7}, [r0,:64], r2 20.239 + bgt 1b 20.240 + bx lr 20.241 + .endm 20.242 + 20.243 + .macro pixfunc pfx name suf rnd_op args:vararg 20.244 +function ff_\pfx\name\suf\()_neon, export=1 20.245 + \name \rnd_op \args 20.246 +endfunc 20.247 + .endm 20.248 + 20.249 + .macro pixfunc2 pfx name args:vararg 20.250 + pixfunc \pfx \name 20.251 + pixfunc \pfx \name \args 20.252 + .endm 20.253 + 20.254 +function ff_put_h264_qpel16_mc00_neon, export=1 20.255 + mov r3, #16 20.256 +endfunc 20.257 + 20.258 + pixfunc put_ pixels16 20.259 + pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 20.260 + pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 20.261 + pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 20.262 + 20.263 +function ff_avg_h264_qpel16_mc00_neon, export=1 20.264 + mov r3, #16 20.265 +endfunc 20.266 + 20.267 + pixfunc avg_ pixels16,, 1 20.268 + 20.269 +function ff_put_h264_qpel8_mc00_neon, export=1 20.270 + mov r3, #8 20.271 +endfunc 20.272 + 20.273 + pixfunc put_ pixels8 20.274 + pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 20.275 + pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 20.276 + pixfunc2 put_ 
pixels8_xy2, _no_rnd, vshrn.u16, 1 20.277 + 20.278 +function ff_avg_h264_qpel8_mc00_neon, export=1 20.279 + mov r3, #8 20.280 +endfunc 20.281 + 20.282 + pixfunc avg_ pixels8,, 1 20.283 + 20.284 +function ff_put_pixels_clamped_neon, export=1 20.285 + vld1.64 {d16-d19}, [r0,:128]! 20.286 + vqmovun.s16 d0, q8 20.287 + vld1.64 {d20-d23}, [r0,:128]! 20.288 + vqmovun.s16 d1, q9 20.289 + vld1.64 {d24-d27}, [r0,:128]! 20.290 + vqmovun.s16 d2, q10 20.291 + vld1.64 {d28-d31}, [r0,:128]! 20.292 + vqmovun.s16 d3, q11 20.293 + vst1.64 {d0}, [r1,:64], r2 20.294 + vqmovun.s16 d4, q12 20.295 + vst1.64 {d1}, [r1,:64], r2 20.296 + vqmovun.s16 d5, q13 20.297 + vst1.64 {d2}, [r1,:64], r2 20.298 + vqmovun.s16 d6, q14 20.299 + vst1.64 {d3}, [r1,:64], r2 20.300 + vqmovun.s16 d7, q15 20.301 + vst1.64 {d4}, [r1,:64], r2 20.302 + vst1.64 {d5}, [r1,:64], r2 20.303 + vst1.64 {d6}, [r1,:64], r2 20.304 + vst1.64 {d7}, [r1,:64], r2 20.305 + bx lr 20.306 +endfunc 20.307 + 20.308 +function ff_put_signed_pixels_clamped_neon, export=1 20.309 + vmov.u8 d31, #128 20.310 + vld1.64 {d16-d17}, [r0,:128]! 20.311 + vqmovn.s16 d0, q8 20.312 + vld1.64 {d18-d19}, [r0,:128]! 20.313 + vqmovn.s16 d1, q9 20.314 + vld1.64 {d16-d17}, [r0,:128]! 20.315 + vqmovn.s16 d2, q8 20.316 + vld1.64 {d18-d19}, [r0,:128]! 20.317 + vadd.u8 d0, d0, d31 20.318 + vld1.64 {d20-d21}, [r0,:128]! 20.319 + vadd.u8 d1, d1, d31 20.320 + vld1.64 {d22-d23}, [r0,:128]! 20.321 + vadd.u8 d2, d2, d31 20.322 + vst1.64 {d0}, [r1,:64], r2 20.323 + vqmovn.s16 d3, q9 20.324 + vst1.64 {d1}, [r1,:64], r2 20.325 + vqmovn.s16 d4, q10 20.326 + vst1.64 {d2}, [r1,:64], r2 20.327 + vqmovn.s16 d5, q11 20.328 + vld1.64 {d24-d25}, [r0,:128]! 20.329 + vadd.u8 d3, d3, d31 20.330 + vld1.64 {d26-d27}, [r0,:128]! 
20.331 + vadd.u8 d4, d4, d31 20.332 + vadd.u8 d5, d5, d31 20.333 + vst1.64 {d3}, [r1,:64], r2 20.334 + vqmovn.s16 d6, q12 20.335 + vst1.64 {d4}, [r1,:64], r2 20.336 + vqmovn.s16 d7, q13 20.337 + vst1.64 {d5}, [r1,:64], r2 20.338 + vadd.u8 d6, d6, d31 20.339 + vadd.u8 d7, d7, d31 20.340 + vst1.64 {d6}, [r1,:64], r2 20.341 + vst1.64 {d7}, [r1,:64], r2 20.342 + bx lr 20.343 +endfunc 20.344 + 20.345 +function ff_add_pixels_clamped_neon, export=1 20.346 + mov r3, r1 20.347 + vld1.64 {d16}, [r1,:64], r2 20.348 + vld1.64 {d0-d1}, [r0,:128]! 20.349 + vaddw.u8 q0, q0, d16 20.350 + vld1.64 {d17}, [r1,:64], r2 20.351 + vld1.64 {d2-d3}, [r0,:128]! 20.352 + vqmovun.s16 d0, q0 20.353 + vld1.64 {d18}, [r1,:64], r2 20.354 + vaddw.u8 q1, q1, d17 20.355 + vld1.64 {d4-d5}, [r0,:128]! 20.356 + vaddw.u8 q2, q2, d18 20.357 + vst1.64 {d0}, [r3,:64], r2 20.358 + vqmovun.s16 d2, q1 20.359 + vld1.64 {d19}, [r1,:64], r2 20.360 + vld1.64 {d6-d7}, [r0,:128]! 20.361 + vaddw.u8 q3, q3, d19 20.362 + vqmovun.s16 d4, q2 20.363 + vst1.64 {d2}, [r3,:64], r2 20.364 + vld1.64 {d16}, [r1,:64], r2 20.365 + vqmovun.s16 d6, q3 20.366 + vld1.64 {d0-d1}, [r0,:128]! 20.367 + vaddw.u8 q0, q0, d16 20.368 + vst1.64 {d4}, [r3,:64], r2 20.369 + vld1.64 {d17}, [r1,:64], r2 20.370 + vld1.64 {d2-d3}, [r0,:128]! 20.371 + vaddw.u8 q1, q1, d17 20.372 + vst1.64 {d6}, [r3,:64], r2 20.373 + vqmovun.s16 d0, q0 20.374 + vld1.64 {d18}, [r1,:64], r2 20.375 + vld1.64 {d4-d5}, [r0,:128]! 20.376 + vaddw.u8 q2, q2, d18 20.377 + vst1.64 {d0}, [r3,:64], r2 20.378 + vqmovun.s16 d2, q1 20.379 + vld1.64 {d19}, [r1,:64], r2 20.380 + vqmovun.s16 d4, q2 20.381 + vld1.64 {d6-d7}, [r0,:128]! 20.382 + vaddw.u8 q3, q3, d19 20.383 + vst1.64 {d2}, [r3,:64], r2 20.384 + vqmovun.s16 d6, q3 20.385 + vst1.64 {d4}, [r3,:64], r2 20.386 + vst1.64 {d6}, [r3,:64], r2 20.387 + bx lr 20.388 +endfunc 20.389 + 20.390 +function ff_float_to_int16_neon, export=1 20.391 + subs r2, r2, #8 20.392 + vld1.64 {d0-d1}, [r1,:128]! 
20.393 + vcvt.s32.f32 q8, q0, #16 20.394 + vld1.64 {d2-d3}, [r1,:128]! 20.395 + vcvt.s32.f32 q9, q1, #16 20.396 + beq 3f 20.397 + bics ip, r2, #15 20.398 + beq 2f 20.399 +1: subs ip, ip, #16 20.400 + vshrn.s32 d4, q8, #16 20.401 + vld1.64 {d0-d1}, [r1,:128]! 20.402 + vcvt.s32.f32 q0, q0, #16 20.403 + vshrn.s32 d5, q9, #16 20.404 + vld1.64 {d2-d3}, [r1,:128]! 20.405 + vcvt.s32.f32 q1, q1, #16 20.406 + vshrn.s32 d6, q0, #16 20.407 + vst1.64 {d4-d5}, [r0,:128]! 20.408 + vshrn.s32 d7, q1, #16 20.409 + vld1.64 {d16-d17},[r1,:128]! 20.410 + vcvt.s32.f32 q8, q8, #16 20.411 + vld1.64 {d18-d19},[r1,:128]! 20.412 + vcvt.s32.f32 q9, q9, #16 20.413 + vst1.64 {d6-d7}, [r0,:128]! 20.414 + bne 1b 20.415 + ands r2, r2, #15 20.416 + beq 3f 20.417 +2: vld1.64 {d0-d1}, [r1,:128]! 20.418 + vshrn.s32 d4, q8, #16 20.419 + vcvt.s32.f32 q0, q0, #16 20.420 + vld1.64 {d2-d3}, [r1,:128]! 20.421 + vshrn.s32 d5, q9, #16 20.422 + vcvt.s32.f32 q1, q1, #16 20.423 + vshrn.s32 d6, q0, #16 20.424 + vst1.64 {d4-d5}, [r0,:128]! 20.425 + vshrn.s32 d7, q1, #16 20.426 + vst1.64 {d6-d7}, [r0,:128]! 20.427 + bx lr 20.428 +3: vshrn.s32 d4, q8, #16 20.429 + vshrn.s32 d5, q9, #16 20.430 + vst1.64 {d4-d5}, [r0,:128]! 20.431 + bx lr 20.432 +endfunc 20.433 + 20.434 +function ff_float_to_int16_interleave_neon, export=1 20.435 + cmp r3, #2 20.436 + ldrlt r1, [r1] 20.437 + blt ff_float_to_int16_neon 20.438 + bne 4f 20.439 + 20.440 + ldr r3, [r1] 20.441 + ldr r1, [r1, #4] 20.442 + 20.443 + subs r2, r2, #8 20.444 + vld1.64 {d0-d1}, [r3,:128]! 20.445 + vcvt.s32.f32 q8, q0, #16 20.446 + vld1.64 {d2-d3}, [r3,:128]! 20.447 + vcvt.s32.f32 q9, q1, #16 20.448 + vld1.64 {d20-d21},[r1,:128]! 20.449 + vcvt.s32.f32 q10, q10, #16 20.450 + vld1.64 {d22-d23},[r1,:128]! 20.451 + vcvt.s32.f32 q11, q11, #16 20.452 + beq 3f 20.453 + bics ip, r2, #15 20.454 + beq 2f 20.455 +1: subs ip, ip, #16 20.456 + vld1.64 {d0-d1}, [r3,:128]! 20.457 + vcvt.s32.f32 q0, q0, #16 20.458 + vsri.32 q10, q8, #16 20.459 + vld1.64 {d2-d3}, [r3,:128]! 
20.460 + vcvt.s32.f32 q1, q1, #16 20.461 + vld1.64 {d24-d25},[r1,:128]! 20.462 + vcvt.s32.f32 q12, q12, #16 20.463 + vld1.64 {d26-d27},[r1,:128]! 20.464 + vsri.32 q11, q9, #16 20.465 + vst1.64 {d20-d21},[r0,:128]! 20.466 + vcvt.s32.f32 q13, q13, #16 20.467 + vst1.64 {d22-d23},[r0,:128]! 20.468 + vsri.32 q12, q0, #16 20.469 + vld1.64 {d16-d17},[r3,:128]! 20.470 + vsri.32 q13, q1, #16 20.471 + vst1.64 {d24-d25},[r0,:128]! 20.472 + vcvt.s32.f32 q8, q8, #16 20.473 + vld1.64 {d18-d19},[r3,:128]! 20.474 + vcvt.s32.f32 q9, q9, #16 20.475 + vld1.64 {d20-d21},[r1,:128]! 20.476 + vcvt.s32.f32 q10, q10, #16 20.477 + vld1.64 {d22-d23},[r1,:128]! 20.478 + vcvt.s32.f32 q11, q11, #16 20.479 + vst1.64 {d26-d27},[r0,:128]! 20.480 + bne 1b 20.481 + ands r2, r2, #15 20.482 + beq 3f 20.483 +2: vsri.32 q10, q8, #16 20.484 + vld1.64 {d0-d1}, [r3,:128]! 20.485 + vcvt.s32.f32 q0, q0, #16 20.486 + vld1.64 {d2-d3}, [r3,:128]! 20.487 + vcvt.s32.f32 q1, q1, #16 20.488 + vld1.64 {d24-d25},[r1,:128]! 20.489 + vcvt.s32.f32 q12, q12, #16 20.490 + vsri.32 q11, q9, #16 20.491 + vld1.64 {d26-d27},[r1,:128]! 20.492 + vcvt.s32.f32 q13, q13, #16 20.493 + vst1.64 {d20-d21},[r0,:128]! 20.494 + vsri.32 q12, q0, #16 20.495 + vst1.64 {d22-d23},[r0,:128]! 20.496 + vsri.32 q13, q1, #16 20.497 + vst1.64 {d24-d27},[r0,:128]! 20.498 + bx lr 20.499 +3: vsri.32 q10, q8, #16 20.500 + vsri.32 q11, q9, #16 20.501 + vst1.64 {d20-d23},[r0,:128]! 20.502 + bx lr 20.503 + 20.504 +4: push {r4-r8,lr} 20.505 + cmp r3, #4 20.506 + lsl ip, r3, #1 20.507 + blt 4f 20.508 + 20.509 + @ 4 channels 20.510 +5: ldmia r1!, {r4-r7} 20.511 + mov lr, r2 20.512 + mov r8, r0 20.513 + vld1.64 {d16-d17},[r4,:128]! 20.514 + vcvt.s32.f32 q8, q8, #16 20.515 + vld1.64 {d18-d19},[r5,:128]! 20.516 + vcvt.s32.f32 q9, q9, #16 20.517 + vld1.64 {d20-d21},[r6,:128]! 20.518 + vcvt.s32.f32 q10, q10, #16 20.519 + vld1.64 {d22-d23},[r7,:128]! 20.520 + vcvt.s32.f32 q11, q11, #16 20.521 +6: subs lr, lr, #8 20.522 + vld1.64 {d0-d1}, [r4,:128]! 
20.523 + vcvt.s32.f32 q0, q0, #16 20.524 + vsri.32 q9, q8, #16 20.525 + vld1.64 {d2-d3}, [r5,:128]! 20.526 + vcvt.s32.f32 q1, q1, #16 20.527 + vsri.32 q11, q10, #16 20.528 + vld1.64 {d4-d5}, [r6,:128]! 20.529 + vcvt.s32.f32 q2, q2, #16 20.530 + vzip.32 d18, d22 20.531 + vld1.64 {d6-d7}, [r7,:128]! 20.532 + vcvt.s32.f32 q3, q3, #16 20.533 + vzip.32 d19, d23 20.534 + vst1.64 {d18}, [r8], ip 20.535 + vsri.32 q1, q0, #16 20.536 + vst1.64 {d22}, [r8], ip 20.537 + vsri.32 q3, q2, #16 20.538 + vst1.64 {d19}, [r8], ip 20.539 + vzip.32 d2, d6 20.540 + vst1.64 {d23}, [r8], ip 20.541 + vzip.32 d3, d7 20.542 + beq 7f 20.543 + vld1.64 {d16-d17},[r4,:128]! 20.544 + vcvt.s32.f32 q8, q8, #16 20.545 + vst1.64 {d2}, [r8], ip 20.546 + vld1.64 {d18-d19},[r5,:128]! 20.547 + vcvt.s32.f32 q9, q9, #16 20.548 + vst1.64 {d6}, [r8], ip 20.549 + vld1.64 {d20-d21},[r6,:128]! 20.550 + vcvt.s32.f32 q10, q10, #16 20.551 + vst1.64 {d3}, [r8], ip 20.552 + vld1.64 {d22-d23},[r7,:128]! 20.553 + vcvt.s32.f32 q11, q11, #16 20.554 + vst1.64 {d7}, [r8], ip 20.555 + b 6b 20.556 +7: vst1.64 {d2}, [r8], ip 20.557 + vst1.64 {d6}, [r8], ip 20.558 + vst1.64 {d3}, [r8], ip 20.559 + vst1.64 {d7}, [r8], ip 20.560 + subs r3, r3, #4 20.561 + popeq {r4-r8,pc} 20.562 + cmp r3, #4 20.563 + add r0, r0, #8 20.564 + bge 5b 20.565 + 20.566 + @ 2 channels 20.567 +4: cmp r3, #2 20.568 + blt 4f 20.569 + ldmia r1!, {r4-r5} 20.570 + mov lr, r2 20.571 + mov r8, r0 20.572 + tst lr, #8 20.573 + vld1.64 {d16-d17},[r4,:128]! 20.574 + vcvt.s32.f32 q8, q8, #16 20.575 + vld1.64 {d18-d19},[r5,:128]! 20.576 + vcvt.s32.f32 q9, q9, #16 20.577 + vld1.64 {d20-d21},[r4,:128]! 20.578 + vcvt.s32.f32 q10, q10, #16 20.579 + vld1.64 {d22-d23},[r5,:128]! 20.580 + vcvt.s32.f32 q11, q11, #16 20.581 + beq 6f 20.582 + subs lr, lr, #8 20.583 + beq 7f 20.584 + vsri.32 d18, d16, #16 20.585 + vsri.32 d19, d17, #16 20.586 + vld1.64 {d16-d17},[r4,:128]! 
20.587 + vcvt.s32.f32 q8, q8, #16 20.588 + vst1.32 {d18[0]}, [r8], ip 20.589 + vsri.32 d22, d20, #16 20.590 + vst1.32 {d18[1]}, [r8], ip 20.591 + vsri.32 d23, d21, #16 20.592 + vst1.32 {d19[0]}, [r8], ip 20.593 + vst1.32 {d19[1]}, [r8], ip 20.594 + vld1.64 {d18-d19},[r5,:128]! 20.595 + vcvt.s32.f32 q9, q9, #16 20.596 + vst1.32 {d22[0]}, [r8], ip 20.597 + vst1.32 {d22[1]}, [r8], ip 20.598 + vld1.64 {d20-d21},[r4,:128]! 20.599 + vcvt.s32.f32 q10, q10, #16 20.600 + vst1.32 {d23[0]}, [r8], ip 20.601 + vst1.32 {d23[1]}, [r8], ip 20.602 + vld1.64 {d22-d23},[r5,:128]! 20.603 + vcvt.s32.f32 q11, q11, #16 20.604 +6: subs lr, lr, #16 20.605 + vld1.64 {d0-d1}, [r4,:128]! 20.606 + vcvt.s32.f32 q0, q0, #16 20.607 + vsri.32 d18, d16, #16 20.608 + vld1.64 {d2-d3}, [r5,:128]! 20.609 + vcvt.s32.f32 q1, q1, #16 20.610 + vsri.32 d19, d17, #16 20.611 + vld1.64 {d4-d5}, [r4,:128]! 20.612 + vcvt.s32.f32 q2, q2, #16 20.613 + vld1.64 {d6-d7}, [r5,:128]! 20.614 + vcvt.s32.f32 q3, q3, #16 20.615 + vst1.32 {d18[0]}, [r8], ip 20.616 + vsri.32 d22, d20, #16 20.617 + vst1.32 {d18[1]}, [r8], ip 20.618 + vsri.32 d23, d21, #16 20.619 + vst1.32 {d19[0]}, [r8], ip 20.620 + vsri.32 d2, d0, #16 20.621 + vst1.32 {d19[1]}, [r8], ip 20.622 + vsri.32 d3, d1, #16 20.623 + vst1.32 {d22[0]}, [r8], ip 20.624 + vsri.32 d6, d4, #16 20.625 + vst1.32 {d22[1]}, [r8], ip 20.626 + vsri.32 d7, d5, #16 20.627 + vst1.32 {d23[0]}, [r8], ip 20.628 + vst1.32 {d23[1]}, [r8], ip 20.629 + beq 6f 20.630 + vld1.64 {d16-d17},[r4,:128]! 20.631 + vcvt.s32.f32 q8, q8, #16 20.632 + vst1.32 {d2[0]}, [r8], ip 20.633 + vst1.32 {d2[1]}, [r8], ip 20.634 + vld1.64 {d18-d19},[r5,:128]! 20.635 + vcvt.s32.f32 q9, q9, #16 20.636 + vst1.32 {d3[0]}, [r8], ip 20.637 + vst1.32 {d3[1]}, [r8], ip 20.638 + vld1.64 {d20-d21},[r4,:128]! 20.639 + vcvt.s32.f32 q10, q10, #16 20.640 + vst1.32 {d6[0]}, [r8], ip 20.641 + vst1.32 {d6[1]}, [r8], ip 20.642 + vld1.64 {d22-d23},[r5,:128]! 
20.643 + vcvt.s32.f32 q11, q11, #16 20.644 + vst1.32 {d7[0]}, [r8], ip 20.645 + vst1.32 {d7[1]}, [r8], ip 20.646 + bgt 6b 20.647 +6: vst1.32 {d2[0]}, [r8], ip 20.648 + vst1.32 {d2[1]}, [r8], ip 20.649 + vst1.32 {d3[0]}, [r8], ip 20.650 + vst1.32 {d3[1]}, [r8], ip 20.651 + vst1.32 {d6[0]}, [r8], ip 20.652 + vst1.32 {d6[1]}, [r8], ip 20.653 + vst1.32 {d7[0]}, [r8], ip 20.654 + vst1.32 {d7[1]}, [r8], ip 20.655 + b 8f 20.656 +7: vsri.32 d18, d16, #16 20.657 + vsri.32 d19, d17, #16 20.658 + vst1.32 {d18[0]}, [r8], ip 20.659 + vsri.32 d22, d20, #16 20.660 + vst1.32 {d18[1]}, [r8], ip 20.661 + vsri.32 d23, d21, #16 20.662 + vst1.32 {d19[0]}, [r8], ip 20.663 + vst1.32 {d19[1]}, [r8], ip 20.664 + vst1.32 {d22[0]}, [r8], ip 20.665 + vst1.32 {d22[1]}, [r8], ip 20.666 + vst1.32 {d23[0]}, [r8], ip 20.667 + vst1.32 {d23[1]}, [r8], ip 20.668 +8: subs r3, r3, #2 20.669 + add r0, r0, #4 20.670 + popeq {r4-r8,pc} 20.671 + 20.672 + @ 1 channel 20.673 +4: ldr r4, [r1],#4 20.674 + tst r2, #8 20.675 + mov lr, r2 20.676 + mov r5, r0 20.677 + vld1.64 {d0-d1}, [r4,:128]! 20.678 + vcvt.s32.f32 q0, q0, #16 20.679 + vld1.64 {d2-d3}, [r4,:128]! 20.680 + vcvt.s32.f32 q1, q1, #16 20.681 + bne 8f 20.682 +6: subs lr, lr, #16 20.683 + vld1.64 {d4-d5}, [r4,:128]! 20.684 + vcvt.s32.f32 q2, q2, #16 20.685 + vld1.64 {d6-d7}, [r4,:128]! 20.686 + vcvt.s32.f32 q3, q3, #16 20.687 + vst1.16 {d0[1]}, [r5,:16], ip 20.688 + vst1.16 {d0[3]}, [r5,:16], ip 20.689 + vst1.16 {d1[1]}, [r5,:16], ip 20.690 + vst1.16 {d1[3]}, [r5,:16], ip 20.691 + vst1.16 {d2[1]}, [r5,:16], ip 20.692 + vst1.16 {d2[3]}, [r5,:16], ip 20.693 + vst1.16 {d3[1]}, [r5,:16], ip 20.694 + vst1.16 {d3[3]}, [r5,:16], ip 20.695 + beq 7f 20.696 + vld1.64 {d0-d1}, [r4,:128]! 20.697 + vcvt.s32.f32 q0, q0, #16 20.698 + vld1.64 {d2-d3}, [r4,:128]! 
20.699 + vcvt.s32.f32 q1, q1, #16 20.700 +7: vst1.16 {d4[1]}, [r5,:16], ip 20.701 + vst1.16 {d4[3]}, [r5,:16], ip 20.702 + vst1.16 {d5[1]}, [r5,:16], ip 20.703 + vst1.16 {d5[3]}, [r5,:16], ip 20.704 + vst1.16 {d6[1]}, [r5,:16], ip 20.705 + vst1.16 {d6[3]}, [r5,:16], ip 20.706 + vst1.16 {d7[1]}, [r5,:16], ip 20.707 + vst1.16 {d7[3]}, [r5,:16], ip 20.708 + bgt 6b 20.709 + pop {r4-r8,pc} 20.710 +8: subs lr, lr, #8 20.711 + vst1.16 {d0[1]}, [r5,:16], ip 20.712 + vst1.16 {d0[3]}, [r5,:16], ip 20.713 + vst1.16 {d1[1]}, [r5,:16], ip 20.714 + vst1.16 {d1[3]}, [r5,:16], ip 20.715 + vst1.16 {d2[1]}, [r5,:16], ip 20.716 + vst1.16 {d2[3]}, [r5,:16], ip 20.717 + vst1.16 {d3[1]}, [r5,:16], ip 20.718 + vst1.16 {d3[3]}, [r5,:16], ip 20.719 + popeq {r4-r8,pc} 20.720 + vld1.64 {d0-d1}, [r4,:128]! 20.721 + vcvt.s32.f32 q0, q0, #16 20.722 + vld1.64 {d2-d3}, [r4,:128]! 20.723 + vcvt.s32.f32 q1, q1, #16 20.724 + b 6b 20.725 +endfunc 20.726 + 20.727 +function ff_vector_fmul_neon, export=1 20.728 + mov r3, r0 20.729 + subs r2, r2, #8 20.730 + vld1.64 {d0-d3}, [r0,:128]! 20.731 + vld1.64 {d4-d7}, [r1,:128]! 20.732 + vmul.f32 q8, q0, q2 20.733 + vmul.f32 q9, q1, q3 20.734 + beq 3f 20.735 + bics ip, r2, #15 20.736 + beq 2f 20.737 +1: subs ip, ip, #16 20.738 + vld1.64 {d0-d1}, [r0,:128]! 20.739 + vld1.64 {d4-d5}, [r1,:128]! 20.740 + vmul.f32 q10, q0, q2 20.741 + vld1.64 {d2-d3}, [r0,:128]! 20.742 + vld1.64 {d6-d7}, [r1,:128]! 20.743 + vmul.f32 q11, q1, q3 20.744 + vst1.64 {d16-d19},[r3,:128]! 20.745 + vld1.64 {d0-d1}, [r0,:128]! 20.746 + vld1.64 {d4-d5}, [r1,:128]! 20.747 + vmul.f32 q8, q0, q2 20.748 + vld1.64 {d2-d3}, [r0,:128]! 20.749 + vld1.64 {d6-d7}, [r1,:128]! 20.750 + vmul.f32 q9, q1, q3 20.751 + vst1.64 {d20-d23},[r3,:128]! 20.752 + bne 1b 20.753 + ands r2, r2, #15 20.754 + beq 3f 20.755 +2: vld1.64 {d0-d1}, [r0,:128]! 20.756 + vld1.64 {d4-d5}, [r1,:128]! 20.757 + vst1.64 {d16-d17},[r3,:128]! 20.758 + vmul.f32 q8, q0, q2 20.759 + vld1.64 {d2-d3}, [r0,:128]! 
20.760 + vld1.64 {d6-d7}, [r1,:128]! 20.761 + vst1.64 {d18-d19},[r3,:128]! 20.762 + vmul.f32 q9, q1, q3 20.763 +3: vst1.64 {d16-d19},[r3,:128]! 20.764 + bx lr 20.765 +endfunc 20.766 + 20.767 +function ff_vector_fmul_window_neon, export=1 20.768 +VFP vdup.32 q8, d0[0] 20.769 +NOVFP vld1.32 {d16[],d17[]}, [sp,:32] 20.770 + push {r4,r5,lr} 20.771 +VFP ldr lr, [sp, #12] 20.772 +NOVFP ldr lr, [sp, #16] 20.773 + sub r2, r2, #8 20.774 + sub r5, lr, #2 20.775 + add r2, r2, r5, lsl #2 20.776 + add r4, r3, r5, lsl #3 20.777 + add ip, r0, r5, lsl #3 20.778 + mov r5, #-16 20.779 + vld1.64 {d0,d1}, [r1,:128]! 20.780 + vld1.64 {d2,d3}, [r2,:128], r5 20.781 + vld1.64 {d4,d5}, [r3,:128]! 20.782 + vld1.64 {d6,d7}, [r4,:128], r5 20.783 +1: subs lr, lr, #4 20.784 + vmov q11, q8 20.785 + vmla.f32 d22, d0, d4 20.786 + vmov q10, q8 20.787 + vmla.f32 d23, d1, d5 20.788 + vrev64.32 q3, q3 20.789 + vmla.f32 d20, d0, d7 20.790 + vrev64.32 q1, q1 20.791 + vmla.f32 d21, d1, d6 20.792 + beq 2f 20.793 + vmla.f32 d22, d3, d7 20.794 + vld1.64 {d0,d1}, [r1,:128]! 20.795 + vmla.f32 d23, d2, d6 20.796 + vld1.64 {d18,d19},[r2,:128], r5 20.797 + vmls.f32 d20, d3, d4 20.798 + vld1.64 {d24,d25},[r3,:128]! 20.799 + vmls.f32 d21, d2, d5 20.800 + vld1.64 {d6,d7}, [r4,:128], r5 20.801 + vmov q1, q9 20.802 + vrev64.32 q11, q11 20.803 + vmov q2, q12 20.804 + vswp d22, d23 20.805 + vst1.64 {d20,d21},[r0,:128]! 20.806 + vst1.64 {d22,d23},[ip,:128], r5 20.807 + b 1b 20.808 +2: vmla.f32 d22, d3, d7 20.809 + vmla.f32 d23, d2, d6 20.810 + vmls.f32 d20, d3, d4 20.811 + vmls.f32 d21, d2, d5 20.812 + vrev64.32 q11, q11 20.813 + vswp d22, d23 20.814 + vst1.64 {d20,d21},[r0,:128]! 20.815 + vst1.64 {d22,d23},[ip,:128], r5 20.816 + pop {r4,r5,pc} 20.817 +endfunc 20.818 + 20.819 +#if CONFIG_VORBIS_DECODER 20.820 +function ff_vorbis_inverse_coupling_neon, export=1 20.821 + vmov.i32 q10, #1<<31 20.822 + subs r2, r2, #4 20.823 + mov r3, r0 20.824 + mov r12, r1 20.825 + beq 3f 20.826 + 20.827 + vld1.32 {d24-d25},[r1,:128]! 
20.828 + vld1.32 {d22-d23},[r0,:128]! 20.829 + vcle.s32 q8, q12, #0 20.830 + vand q9, q11, q10 20.831 + veor q12, q12, q9 20.832 + vand q2, q12, q8 20.833 + vbic q3, q12, q8 20.834 + vadd.f32 q12, q11, q2 20.835 + vsub.f32 q11, q11, q3 20.836 +1: vld1.32 {d2-d3}, [r1,:128]! 20.837 + vld1.32 {d0-d1}, [r0,:128]! 20.838 + vcle.s32 q8, q1, #0 20.839 + vand q9, q0, q10 20.840 + veor q1, q1, q9 20.841 + vst1.32 {d24-d25},[r3, :128]! 20.842 + vst1.32 {d22-d23},[r12,:128]! 20.843 + vand q2, q1, q8 20.844 + vbic q3, q1, q8 20.845 + vadd.f32 q1, q0, q2 20.846 + vsub.f32 q0, q0, q3 20.847 + subs r2, r2, #8 20.848 + ble 2f 20.849 + vld1.32 {d24-d25},[r1,:128]! 20.850 + vld1.32 {d22-d23},[r0,:128]! 20.851 + vcle.s32 q8, q12, #0 20.852 + vand q9, q11, q10 20.853 + veor q12, q12, q9 20.854 + vst1.32 {d2-d3}, [r3, :128]! 20.855 + vst1.32 {d0-d1}, [r12,:128]! 20.856 + vand q2, q12, q8 20.857 + vbic q3, q12, q8 20.858 + vadd.f32 q12, q11, q2 20.859 + vsub.f32 q11, q11, q3 20.860 + b 1b 20.861 + 20.862 +2: vst1.32 {d2-d3}, [r3, :128]! 20.863 + vst1.32 {d0-d1}, [r12,:128]! 20.864 + bxlt lr 20.865 + 20.866 +3: vld1.32 {d2-d3}, [r1,:128] 20.867 + vld1.32 {d0-d1}, [r0,:128] 20.868 + vcle.s32 q8, q1, #0 20.869 + vand q9, q0, q10 20.870 + veor q1, q1, q9 20.871 + vand q2, q1, q8 20.872 + vbic q3, q1, q8 20.873 + vadd.f32 q1, q0, q2 20.874 + vsub.f32 q0, q0, q3 20.875 + vst1.32 {d2-d3}, [r0,:128]! 20.876 + vst1.32 {d0-d1}, [r1,:128]! 20.877 + bx lr 20.878 +endfunc 20.879 +#endif 20.880 + 20.881 +function ff_vector_fmul_scalar_neon, export=1 20.882 +VFP len .req r2 20.883 +NOVFP len .req r3 20.884 +VFP vdup.32 q8, d0[0] 20.885 +NOVFP vdup.32 q8, r2 20.886 + bics r12, len, #15 20.887 + beq 3f 20.888 + vld1.32 {q0},[r1,:128]! 20.889 + vld1.32 {q1},[r1,:128]! 20.890 +1: vmul.f32 q0, q0, q8 20.891 + vld1.32 {q2},[r1,:128]! 20.892 + vmul.f32 q1, q1, q8 20.893 + vld1.32 {q3},[r1,:128]! 20.894 + vmul.f32 q2, q2, q8 20.895 + vst1.32 {q0},[r0,:128]! 
20.896 + vmul.f32 q3, q3, q8 20.897 + vst1.32 {q1},[r0,:128]! 20.898 + subs r12, r12, #16 20.899 + beq 2f 20.900 + vld1.32 {q0},[r1,:128]! 20.901 + vst1.32 {q2},[r0,:128]! 20.902 + vld1.32 {q1},[r1,:128]! 20.903 + vst1.32 {q3},[r0,:128]! 20.904 + b 1b 20.905 +2: vst1.32 {q2},[r0,:128]! 20.906 + vst1.32 {q3},[r0,:128]! 20.907 + ands len, len, #15 20.908 + bxeq lr 20.909 +3: vld1.32 {q0},[r1,:128]! 20.910 + vmul.f32 q0, q0, q8 20.911 + vst1.32 {q0},[r0,:128]! 20.912 + subs len, len, #4 20.913 + bgt 3b 20.914 + bx lr 20.915 + .unreq len 20.916 +endfunc 20.917 + 20.918 +function ff_vector_fmul_sv_scalar_2_neon, export=1 20.919 +VFP vdup.32 d16, d0[0] 20.920 +NOVFP vdup.32 d16, r3 20.921 +NOVFP ldr r3, [sp] 20.922 + vld1.32 {d0},[r1,:64]! 20.923 + vld1.32 {d1},[r1,:64]! 20.924 +1: subs r3, r3, #4 20.925 + vmul.f32 d4, d0, d16 20.926 + vmul.f32 d5, d1, d16 20.927 + ldr r12, [r2], #4 20.928 + vld1.32 {d2},[r12,:64] 20.929 + ldr r12, [r2], #4 20.930 + vld1.32 {d3},[r12,:64] 20.931 + vmul.f32 d4, d4, d2 20.932 + vmul.f32 d5, d5, d3 20.933 + beq 2f 20.934 + vld1.32 {d0},[r1,:64]! 20.935 + vld1.32 {d1},[r1,:64]! 20.936 + vst1.32 {d4},[r0,:64]! 20.937 + vst1.32 {d5},[r0,:64]! 20.938 + b 1b 20.939 +2: vst1.32 {d4},[r0,:64]! 20.940 + vst1.32 {d5},[r0,:64]! 20.941 + bx lr 20.942 +endfunc 20.943 + 20.944 +function ff_vector_fmul_sv_scalar_4_neon, export=1 20.945 +VFP vdup.32 q10, d0[0] 20.946 +NOVFP vdup.32 q10, r3 20.947 +NOVFP ldr r3, [sp] 20.948 + push {lr} 20.949 + bics lr, r3, #7 20.950 + beq 3f 20.951 + vld1.32 {q0},[r1,:128]! 20.952 + vld1.32 {q2},[r1,:128]! 20.953 +1: ldr r12, [r2], #4 20.954 + vld1.32 {q1},[r12,:128] 20.955 + ldr r12, [r2], #4 20.956 + vld1.32 {q3},[r12,:128] 20.957 + vmul.f32 q8, q0, q10 20.958 + vmul.f32 q8, q8, q1 20.959 + vmul.f32 q9, q2, q10 20.960 + vmul.f32 q9, q9, q3 20.961 + subs lr, lr, #8 20.962 + beq 2f 20.963 + vld1.32 {q0},[r1,:128]! 20.964 + vld1.32 {q2},[r1,:128]! 20.965 + vst1.32 {q8},[r0,:128]! 20.966 + vst1.32 {q9},[r0,:128]! 
20.967 + b 1b
/*
 * The lines up to the next endfunc are the end of
 * ff_vector_fmul_sv_scalar_4_neon: label 2 flushes the two result vectors
 * still held in q8/q9, then label 3 handles the remaining (len & 7)
 * elements four floats at a time.
 */
20.968 +2: vst1.32 {q8},[r0,:128]!
20.969 + vst1.32 {q9},[r0,:128]!
20.970 + ands r3, r3, #7
20.971 + popeq {pc}
20.972 +3: vld1.32 {q0},[r1,:128]!
20.973 + ldr r12, [r2], #4
20.974 + vld1.32 {q1},[r12,:128]
20.975 + vmul.f32 q0, q0, q10
20.976 + vmul.f32 q0, q0, q1
20.977 + vst1.32 {q0},[r0,:128]!
20.978 + subs r3, r3, #4
20.979 + bgt 3b
20.980 + pop {pc}
20.981 +endfunc
20.982 +
/*
 * ff_sv_fmul_scalar_2_neon
 *
 * Multiplies 2-float vectors, fetched through an array of pointers, by a
 * scalar and writes the products contiguously.
 *   r0  destination, written 4 floats at a time with a :128 alignment hint
 *   r1  array of pointers; each pointer is read with a :64 alignment hint
 *       and supplies 2 floats
 *   scalar / len: d0[0] and r2 on the VFP path, r2 and r3 on the NOVFP path
 * NOTE(review): VFP/NOVFP are macros from asm.S (not visible here);
 * presumably they select the hard-float / soft-float calling convention.
 * The loop leaves only when the counter reaches exactly zero (beq), so len
 * has to be a positive multiple of 4.
 */
20.983 +function ff_sv_fmul_scalar_2_neon, export=1
20.984 +VFP len .req r2
20.985 +NOVFP len .req r3
20.986 +VFP vdup.32 q8, d0[0]
20.987 +NOVFP vdup.32 q8, r2
20.988 + ldr r12, [r1], #4
20.989 + vld1.32 {d0},[r12,:64]
20.990 + ldr r12, [r1], #4
20.991 + vld1.32 {d1},[r12,:64]
20.992 +1: vmul.f32 q1, q0, q8
20.993 + subs len, len, #4
20.994 + beq 2f
/* Fetch the next two 2-float vectors before storing the current product. */
20.995 + ldr r12, [r1], #4
20.996 + vld1.32 {d0},[r12,:64]
20.997 + ldr r12, [r1], #4
20.998 + vld1.32 {d1},[r12,:64]
20.999 + vst1.32 {q1},[r0,:128]!
20.1000 + b 1b
20.1001 +2: vst1.32 {q1},[r0,:128]!
20.1002 + bx lr
20.1003 + .unreq len
20.1004 +endfunc
20.1005 +
/*
 * ff_sv_fmul_scalar_4_neon
 *
 * Same operation as ff_sv_fmul_scalar_2_neon, but every pointer in the r1
 * array supplies 4 floats (read with a :128 alignment hint). One pointer is
 * consumed and 4 floats are stored to r0 per iteration; the loop runs while
 * the counter stays above zero.
 */
20.1006 +function ff_sv_fmul_scalar_4_neon, export=1
20.1007 +VFP len .req r2
20.1008 +NOVFP len .req r3
20.1009 +VFP vdup.32 q8, d0[0]
20.1010 +NOVFP vdup.32 q8, r2
20.1011 +1: ldr r12, [r1], #4
20.1012 + vld1.32 {q0},[r12,:128]
20.1013 + vmul.f32 q0, q0, q8
20.1014 + vst1.32 {q0},[r0,:128]!
20.1015 + subs len, len, #4
20.1016 + bgt 1b
20.1017 + bx lr
20.1018 + .unreq len
20.1019 +endfunc
20.1020 +
/*
 * ff_butterflies_float_neon
 *
 * In-place butterfly on two float arrays, 4 elements per iteration:
 *   [r1] <- [r0] - [r1]
 *   [r0] <- [r0] + [r1]   (using the values loaded before the stores)
 * r2 is the element count. Both pointers carry :128 alignment hints.
 */
20.1021 +function ff_butterflies_float_neon, export=1
20.1022 +1: vld1.32 {q0},[r0,:128]
20.1023 + vld1.32 {q1},[r1,:128]
20.1024 + vsub.f32 q2, q0, q1
20.1025 + vadd.f32 q1, q0, q1
20.1026 + vst1.32 {q2},[r1,:128]!
20.1027 + vst1.32 {q1},[r0,:128]!
20.1028 + subs r2, r2, #4
20.1029 + bgt 1b
20.1030 + bx lr
20.1031 +endfunc
20.1032 +
/*
 * ff_scalarproduct_float_neon
 *
 * Dot product of the float arrays at r0 and r1 (r2 = element count),
 * accumulated 4 lanes at a time in q2 and reduced at the end. The result
 * is left in the low lane of d0; the NOVFP path additionally copies it
 * to r0.
 */
20.1033 +function ff_scalarproduct_float_neon, export=1
20.1034 + vmov.f32 q2, #0.0
20.1035 +1: vld1.32 {q0},[r0,:128]!
20.1036 + vld1.32 {q1},[r1,:128]!
20.1037 + vmla.f32 q2, q0, q1
20.1038 + subs r2, r2, #4
20.1039 + bgt 1b
/* Horizontal reduction: fold the four partial sums of q2 into d0[0]. */
20.1040 + vadd.f32 d0, d4, d5
20.1041 + vpadd.f32 d0, d0, d0
20.1042 +NOVFP vmov.32 r0, d0[0]
20.1043 + bx lr
20.1044 +endfunc
20.1045 +
/*
 * ff_int32_to_float_fmul_scalar_neon
 *
 * Converts signed 32-bit integers at r1 to float, multiplies them by a
 * scalar and stores the floats at r0.
 *   scalar / len: d0[0] and r2 on the VFP path, r2 and r3 on the NOVFP path
 * Eight elements are handled per iteration, and the next eight are loaded
 * and converted before the current results are stored. The loop leaves
 * only on an exact zero count (beq), so len has to be a positive multiple
 * of 8.
 */
20.1046 +function ff_int32_to_float_fmul_scalar_neon, export=1
20.1047 +VFP vdup.32 q0, d0[0]
20.1048 +VFP len .req r2
20.1049 +NOVFP vdup.32 q0, r2
20.1050 +NOVFP len .req r3
20.1051 +
20.1052 + vld1.32 {q1},[r1,:128]!
20.1053 + vcvt.f32.s32 q3, q1
20.1054 + vld1.32 {q2},[r1,:128]!
20.1055 + vcvt.f32.s32 q8, q2
20.1056 +1: subs len, len, #8
20.1057 + pld [r1, #16]
20.1058 + vmul.f32 q9, q3, q0
20.1059 + vmul.f32 q10, q8, q0
20.1060 + beq 2f
20.1061 + vld1.32 {q1},[r1,:128]!
20.1062 + vcvt.f32.s32 q3, q1
20.1063 + vld1.32 {q2},[r1,:128]!
20.1064 + vcvt.f32.s32 q8, q2
20.1065 + vst1.32 {q9}, [r0,:128]!
20.1066 + vst1.32 {q10},[r0,:128]!
20.1067 + b 1b
20.1068 +2: vst1.32 {q9}, [r0,:128]!
20.1069 + vst1.32 {q10},[r0,:128]!
20.1070 + bx lr
20.1071 + .unreq len
20.1072 +endfunc
20.1073 +
/*
 * ff_vector_fmul_reverse_neon
 *
 * Multiplies the array at r1, read forwards, by the array at r2, read
 * backwards, and stores the products forwards at r0; r3 is the element
 * count. r2 is first moved to the last 8-float group (r2 + 4*r3 - 32) and
 * then stepped by -32 bytes. Eight floats per iteration; the loop leaves
 * only on an exact zero count (beq), so the count has to be a positive
 * multiple of 8.
 */
20.1074 +function ff_vector_fmul_reverse_neon, export=1
20.1075 + add r2, r2, r3, lsl #2
20.1076 + sub r2, r2, #32
20.1077 + mov r12, #-32
20.1078 + vld1.32 {q0-q1}, [r1,:128]!
20.1079 + vld1.32 {q2-q3}, [r2,:128], r12
/*
 * vrev64.32 swaps the two floats inside each d register; pairing d0..d3
 * with d7..d4 then completes the reversal of the 8-float group from r2.
 */
20.1080 +1: pld [r1, #32]
20.1081 + vrev64.32 q3, q3
20.1082 + vmul.f32 d16, d0, d7
20.1083 + vmul.f32 d17, d1, d6
20.1084 + pld [r2, #-32]
20.1085 + vrev64.32 q2, q2
20.1086 + vmul.f32 d18, d2, d5
20.1087 + vmul.f32 d19, d3, d4
20.1088 + subs r3, r3, #8
20.1089 + beq 2f
20.1090 + vld1.32 {q0-q1}, [r1,:128]!
20.1091 + vld1.32 {q2-q3}, [r2,:128], r12
20.1092 + vst1.32 {q8-q9}, [r0,:128]!
20.1093 + b 1b
20.1094 +2: vst1.32 {q8-q9}, [r0,:128]!
20.1095 + bx lr
20.1096 +endfunc
20.1097 +
/*
 * ff_vector_fmul_add_neon
 *
 * Element-wise multiply-add: [r0] <- [r1] * [r2] + [r3].
 * The element count is the fifth argument, read from [sp]. Eight floats
 * per iteration; the products for the next group are computed before the
 * current sums are stored. The loop leaves only on an exact zero count
 * (beq), so the count has to be a positive multiple of 8.
 */
20.1098 +function ff_vector_fmul_add_neon, export=1
20.1099 + ldr r12, [sp]
20.1100 + vld1.32 {q0-q1}, [r1,:128]!
20.1101 + vld1.32 {q8-q9}, [r2,:128]!
20.1102 + vld1.32 {q2-q3}, [r3,:128]!
20.1103 + vmul.f32 q10, q0, q8
20.1104 + vmul.f32 q11, q1, q9
20.1105 +1: vadd.f32 q12, q2, q10
20.1106 + vadd.f32 q13, q3, q11
20.1107 + pld [r1, #16]
20.1108 + pld [r2, #16]
20.1109 + pld [r3, #16]
20.1110 + subs r12, r12, #8
20.1111 + beq 2f
20.1112 + vld1.32 {q0}, [r1,:128]!
20.1113 + vld1.32 {q8}, [r2,:128]!
20.1114 + vmul.f32 q10, q0, q8
20.1115 + vld1.32 {q1}, [r1,:128]!
20.1116 + vld1.32 {q9}, [r2,:128]!
20.1117 + vmul.f32 q11, q1, q9
20.1118 + vld1.32 {q2-q3}, [r3,:128]!
20.1119 + vst1.32 {q12-q13},[r0,:128]!
20.1120 + b 1b
20.1121 +2: vst1.32 {q12-q13},[r0,:128]!
20.1122 + bx lr
20.1123 +endfunc
20.1124 +
/*
 * ff_vector_clipf_neon
 *
 * Clamps the floats at r1 into a [lower, upper] range and stores them at
 * r0: each value is first limited from above with vmin against q1, then
 * from below with vmax against q0.
 *   VFP path:   lower = d0[0], upper = d0[1], count in r2
 *   NOVFP path: lower = r2, upper = r3, count loaded from [sp]
 * Eight floats per iteration; the loop leaves only on an exact zero count
 * (beq), so the count has to be a positive multiple of 8.
 */
20.1125 +function ff_vector_clipf_neon, export=1
20.1126 +VFP vdup.32 q1, d0[1]
20.1127 +VFP vdup.32 q0, d0[0]
20.1128 +NOVFP vdup.32 q0, r2
20.1129 +NOVFP vdup.32 q1, r3
20.1130 +NOVFP ldr r2, [sp]
20.1131 + vld1.f32 {q2},[r1,:128]!
20.1132 + vmin.f32 q10, q2, q1
20.1133 + vld1.f32 {q3},[r1,:128]!
20.1134 + vmin.f32 q11, q3, q1
20.1135 +1: vmax.f32 q8, q10, q0
20.1136 + vmax.f32 q9, q11, q0
20.1137 + subs r2, r2, #8
20.1138 + beq 2f
20.1139 + vld1.f32 {q2},[r1,:128]!
20.1140 + vmin.f32 q10, q2, q1
20.1141 + vld1.f32 {q3},[r1,:128]!
20.1142 + vmin.f32 q11, q3, q1
20.1143 + vst1.f32 {q8},[r0,:128]!
20.1144 + vst1.f32 {q9},[r0,:128]!
20.1145 + b 1b
20.1146 +2: vst1.f32 {q8},[r0,:128]!
20.1147 + vst1.f32 {q9},[r0,:128]!
20.1148 + bx lr
20.1149 +endfunc
/* File: ffmpeg_smp/h264dec/libavcodec/arm/dsputil_vfp.S */
/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        .syntax unified
/*
 * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
 * throughput for almost all instructions (except for double precision
 * arithmetic), but rather high latency. Latency is 4 cycles for loads and
 * 8 cycles for arithmetic operations. Scheduling code to avoid pipeline stalls
 * is very important for performance. One more interesting feature is that VFP
 * has independent load/store and arithmetic pipelines, so it is possible to
 * make them work simultaneously and get more than 1 operation per cycle. The
 * load/store pipeline can process 2 single precision floating point values per
 * cycle and supports bulk loads and stores for large sets of registers.
 * Arithmetic operations can be done on vectors, which makes it possible to
 * keep the arithmetic pipeline busy while the processor issues and executes
 * other instructions. Detailed optimization manuals can be found at
 * http://www.arm.com
 */

/**
 * ARM VFP optimized implementation of 'vector_fmul_c' function.
 * Assume that len is a positive number and is a multiple of 8.
 *
 * The loop is software pipelined over two register sets (s0-s15 and
 * s16-s31): each pass decrements the counter by 16 and the ge/gt
 * conditional instructions decide whether a further group of 8 floats
 * is loaded, multiplied and stored.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}                @ d8-d15 (s16-s31) are used below and restored on exit
        mov             r3, r0                  @ r3 = read pointer into dst, r0 stays the write pointer
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16)    /* set vector size to 4 */
        fmxr            fpscr, r12

        @ prologue: load the first 8 elements of dst and src
        vldmia          r3!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmia          r3!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8, s0, s8
1:
        subs            r2, r2, #16             @ sets the flags tested by every ge/gt suffix below
        vmul.f32        s12, s4, s12
        vldmiage        r3!, {s16-s19}
        vldmiage        r1!, {s24-s27}
        vldmiage        r3!, {s20-s23}
        vldmiage        r1!, {s28-s31}
        vmulge.f32      s24, s16, s24
        vstmia          r0!, {s8-s11}
        vstmia          r0!, {s12-s15}
        vmulge.f32      s28, s20, s28
        vldmiagt        r3!, {s0-s3}
        vldmiagt        r1!, {s8-s11}
        vldmiagt        r3!, {s4-s7}
        vldmiagt        r1!, {s12-s15}
        vmulge.f32      s8, s0, s8
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16)    /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assume that len is a positive number and is a multiple of 8.
 *
 * src1 is walked backwards from its end (vldmdb) while src0 and dst are
 * walked forwards; within each group of four the src1 registers are
 * paired with src0 in reverse order (s3*s8, s2*s9, ...).
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}                @ d8-d15 (s16-s31) are used below and restored on exit
        add             r2, r2, r3, lsl #2      @ r2 = src1 + len * 4 bytes (one past the last float)
        vldmdb          r2!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8, s3, s8
        vmul.f32        s9, s2, s9
        vmul.f32        s10, s1, s10
        vmul.f32        s11, s0, s11
1:
        subs            r3, r3, #16             @ sets the flags tested by every ge/gt suffix below
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7, s12
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6, s13
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5, s14
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4, s15
        vmulge.f32      s24, s19, s24
        vldmdbgt        r2!, {s0-s3}
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}
        vmulge.f32      s26, s17, s26
        vldmiagt        r1!, {s8-s11}
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        vldmdbgt        r2!, {s4-s7}
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8, s3, s8
        vldmiagt        r1!, {s12-s15}
        vmulge.f32      s9, s2, s9
        vmulge.f32      s10, s1, s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0, s11
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
endfunc

#if HAVE_ARMV6
/**
 * ARM VFP optimized float to int16 conversion.
 * Assume that len is a positive number and is a multiple of 8, that the
 * destination buffer is at least 4-byte aligned (8-byte alignment is better
 * for performance), and that the byte order is little endian.
 *
 * Each pass converts 8 floats to 32-bit integers, saturates them to 16 bits
 * in core registers, packs them pairwise and stores four words to dst.
 */
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
function ff_float_to_int16_vfp, export=1
        push            {r4-r8,lr}
        vpush           {d8-d11}                @ d8-d11 (s16-s23) hold the raw input floats
        vldmia          r1!, {s16-s23}
        vcvt.s32.f32    s0, s16
        vcvt.s32.f32    s1, s17
        vcvt.s32.f32    s2, s18
        vcvt.s32.f32    s3, s19
        vcvt.s32.f32    s4, s20
        vcvt.s32.f32    s5, s21
        vcvt.s32.f32    s6, s22
        vcvt.s32.f32    s7, s23
1:
        subs            r2, r2, #8              @ sets the flags tested by every gt suffix below
        vmov            r3, r4, s0, s1          @ move the 8 converted integers to core registers
        vmov            r5, r6, s2, s3
        vmov            r7, r8, s4, s5
        vmov            ip, lr, s6, s7
        vldmiagt        r1!, {s16-s23}          @ fetch the next 8 floats if any remain
        ssat            r4, #16, r4             @ saturate to the signed 16-bit range
        ssat            r3, #16, r3
        ssat            r6, #16, r6
        ssat            r5, #16, r5
        pkhbt           r3, r3, r4, lsl #16     @ pack two 16-bit values into one word
        pkhbt           r4, r5, r6, lsl #16
        vcvtgt.s32.f32  s0, s16
        vcvtgt.s32.f32  s1, s17
        vcvtgt.s32.f32  s2, s18
        vcvtgt.s32.f32  s3, s19
        vcvtgt.s32.f32  s4, s20
        vcvtgt.s32.f32  s5, s21
        vcvtgt.s32.f32  s6, s22
        vcvtgt.s32.f32  s7, s23
        ssat            r8, #16, r8
        ssat            r7, #16, r7
        ssat            lr, #16, lr
        ssat            ip, #16, ip
        pkhbt           r5, r7, r8, lsl #16
        pkhbt           r6, ip, lr, lsl #16
        stmia           r0!, {r3-r6}            @ store 4 words = 8 int16 samples
        bgt             1b

        vpop            {d8-d11}
        pop             {r4-r8,pc}
endfunc
#endif
22.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 22.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/fft_init_arm.c Mon Aug 27 12:09:56 2012 +0200 22.3 @@ -0,0 +1,65 @@ 22.4 +/* 22.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> 22.6 + * 22.7 + * This file is part of FFmpeg. 22.8 + * 22.9 + * FFmpeg is free software; you can redistribute it and/or 22.10 + * modify it under the terms of the GNU Lesser General Public 22.11 + * License as published by the Free Software Foundation; either 22.12 + * version 2.1 of the License, or (at your option) any later version. 22.13 + * 22.14 + * FFmpeg is distributed in the hope that it will be useful, 22.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 22.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22.17 + * Lesser General Public License for more details. 22.18 + * 22.19 + * You should have received a copy of the GNU Lesser General Public 22.20 + * License along with FFmpeg; if not, write to the Free Software 22.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22.22 + */ 22.23 + 22.24 +#include "libavcodec/fft.h" 22.25 +#include "libavcodec/synth_filter.h" 22.26 + 22.27 +void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); 22.28 +void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); 22.29 + 22.30 +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); 22.31 +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); 22.32 +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); 22.33 + 22.34 +void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); 22.35 + 22.36 +void ff_synth_filter_float_neon(FFTContext *imdct, 22.37 + float *synth_buf_ptr, int *synth_buf_offset, 22.38 + float synth_buf2[32], const float window[512], 22.39 + float out[32], const float in[32], 22.40 + float scale, float bias); 22.41 + 22.42 +av_cold void ff_fft_init_arm(FFTContext *s) 22.43 +{ 
22.44 + if (HAVE_NEON) { 22.45 + s->fft_permute = ff_fft_permute_neon; 22.46 + s->fft_calc = ff_fft_calc_neon; 22.47 + s->imdct_calc = ff_imdct_calc_neon; 22.48 + s->imdct_half = ff_imdct_half_neon; 22.49 + s->mdct_calc = ff_mdct_calc_neon; 22.50 + s->permutation = FF_MDCT_PERM_INTERLEAVE; 22.51 + } 22.52 +} 22.53 + 22.54 +#if CONFIG_RDFT 22.55 +av_cold void ff_rdft_init_arm(RDFTContext *s) 22.56 +{ 22.57 + if (HAVE_NEON) 22.58 + s->rdft_calc = ff_rdft_calc_neon; 22.59 +} 22.60 +#endif 22.61 + 22.62 +#if CONFIG_DCA_DECODER 22.63 +av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) 22.64 +{ 22.65 + if (HAVE_NEON) 22.66 + s->synth_filter_float = ff_synth_filter_float_neon; 22.67 +} 22.68 +#endif
/* File: ffmpeg_smp/h264dec/libavcodec/arm/fft_neon.S */
/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

#define M_SQRT1_2 0.70710678118654752440

        .text

@ fft4_neon: transforms, in place, the four complex floats (32 bytes)
@ addressed by r0. Uses only d0-d7 and q8.
function fft4_neon
        vld1.32         {d0-d3}, [r0,:128]

        vext.32         q8, q1, q1, #1          @ i2,r3 d3=i3,r2
        vsub.f32        d6, d0, d1              @ r0-r1,i0-i1
        vsub.f32        d7, d16, d17            @ r3-r2,i2-i3
        vadd.f32        d4, d0, d1              @ r0+r1,i0+i1
        vadd.f32        d5, d2, d3              @ i2+i3,r2+r3
        vadd.f32        d1, d6, d7
        vsub.f32        d3, d6, d7
        vadd.f32        d0, d4, d5
        vsub.f32        d2, d4, d5

        vst1.32         {d0-d3}, [r0,:128]

        bx              lr
endfunc

@ fft8_neon: transforms, in place, the eight complex floats (64 bytes)
@ addressed by r0. r1 is used as a second pointer to the upper 32 bytes;
@ r2/r3 hold +sqrt(1/2) and -sqrt(1/2) as raw single-precision bit patterns.
function fft8_neon
        mov             r1, r0
        vld1.32         {d0-d3}, [r1,:128]!
        vld1.32         {d16-d19}, [r1,:128]

        movw            r2, #0x04f3             @ sqrt(1/2)
        movt            r2, #0x3f35
        eor             r3, r2, #1<<31          @ same value with the sign bit flipped
        vdup.32         d31, r2

        vext.32         q11, q1, q1, #1         @ i2,r3,i3,r2
        vadd.f32        d4, d16, d17            @ r4+r5,i4+i5
        vmov            d28, r3, r2
        vadd.f32        d5, d18, d19            @ r6+r7,i6+i7
        vsub.f32        d17, d16, d17           @ r4-r5,i4-i5
        vsub.f32        d19, d18, d19           @ r6-r7,i6-i7
        vrev64.32       d29, d28
        vadd.f32        d20, d0, d1             @ r0+r1,i0+i1
        vadd.f32        d21, d2, d3             @ r2+r3,i2+i3
        vmul.f32        d26, d17, d28           @ -a2r*w,a2i*w
        vext.32         q3, q2, q2, #1
        vmul.f32        d27, d19, d29           @ a3r*w,-a3i*w
        vsub.f32        d23, d22, d23           @ i2-i3,r3-r2
        vsub.f32        d22, d0, d1             @ r0-r1,i0-i1
        vmul.f32        d24, d17, d31           @ a2r*w,a2i*w
        vmul.f32        d25, d19, d31           @ a3r*w,a3i*w
        vadd.f32        d0, d20, d21
        vsub.f32        d2, d20, d21
        vadd.f32        d1, d22, d23
        vrev64.32       q13, q13
        vsub.f32        d3, d22, d23
        vsub.f32        d6, d6, d7
        vadd.f32        d24, d24, d26           @ a2r+a2i,a2i-a2r t1,t2
        vadd.f32        d25, d25, d27           @ a3r-a3i,a3i+a3r t5,t6
        vadd.f32        d7, d4, d5
        vsub.f32        d18, d2, d6
        vext.32         q13, q12, q12, #1
        vadd.f32        d2, d2, d6
        vsub.f32        d16, d0, d7
        vadd.f32        d5, d25, d24
        vsub.f32        d4, d26, d27
        vadd.f32        d0, d0, d7
        vsub.f32        d17, d1, d5
        vsub.f32        d19, d3, d4
        vadd.f32        d3, d3, d4
        vadd.f32        d1, d1, d5

        vst1.32         {d16-d19}, [r1,:128]    @ upper half (r1 = r0 + 32)
        vst1.32         {d0-d3}, [r0,:128]      @ lower half

        bx              lr
endfunc

@ fft16_neon: transforms, in place, the sixteen complex floats (128 bytes)
@ addressed by r0. Reads the local constant tables mppm/pmmp and the
@ external table ff_cos_16. r0 is advanced while loading and rewound
@ before the results are stored.
function fft16_neon
        movrel          r1, mppm
        vld1.32         {d16-d19}, [r0,:128]!   @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
        pld             [r0, #32]
        vld1.32         {d2-d3}, [r1,:128]
        vext.32         q13, q9, q9, #1
        vld1.32         {d22-d25}, [r0,:128]!   @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
        vadd.f32        d4, d16, d17
        vsub.f32        d5, d16, d17
        vadd.f32        d18, d18, d19
        vsub.f32        d19, d26, d27

        vadd.f32        d20, d22, d23
        vsub.f32        d22, d22, d23
        vsub.f32        d23, d24, d25
        vadd.f32        q8, q2, q9              @ {r0,i0,r1,i1}
        vadd.f32        d21, d24, d25
        vmul.f32        d24, d22, d2
        vsub.f32        q9, q2, q9              @ {r2,i2,r3,i3}
        vmul.f32        d25, d23, d3
        vuzp.32         d16, d17                @ {r0,r1,i0,i1}
        vmul.f32        q1, q11, d2[1]
        vuzp.32         d18, d19                @ {r2,r3,i2,i3}
        vrev64.32       q12, q12
        vadd.f32        q11, q12, q1            @ {t1a,t2a,t5,t6}
        vld1.32         {d24-d27}, [r0,:128]!   @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
        vzip.32         q10, q11
        vld1.32         {d28-d31}, [r0,:128]    @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
        vadd.f32        d0, d22, d20
        vadd.f32        d1, d21, d23
        vsub.f32        d2, d21, d23
        vsub.f32        d3, d22, d20
        sub             r0, r0, #96             @ undo the three 32-byte post-increments: back to z[0]
        vext.32         q13, q13, q13, #1
        vsub.f32        q10, q8, q0             @ {r4,r5,i4,i5}
        vadd.f32        q8, q8, q0              @ {r0,r1,i0,i1}
        vext.32         q15, q15, q15, #1
        vsub.f32        q11, q9, q1             @ {r6,r7,i6,i7}
        vswp            d25, d26                @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
        vadd.f32        q9, q9, q1              @ {r2,r3,i2,i3}
        vswp            d29, d30                @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
        vadd.f32        q0, q12, q13            @ {t1,t2,t5,t6}
        vadd.f32        q1, q14, q15            @ {t1a,t2a,t5a,t6a}
        movrel          r2, X(ff_cos_16)
        vsub.f32        q13, q12, q13           @ {t3,t4,t7,t8}
        vrev64.32       d1, d1
        vsub.f32        q15, q14, q15           @ {t3a,t4a,t7a,t8a}
        vrev64.32       d3, d3
        movrel          r3, pmmp
        vswp            d1, d26                 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
        vswp            d3, d30                 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
        vadd.f32        q12, q0, q13            @ {r8,i8,r9,i9}
        vadd.f32        q14, q1, q15            @ {r12,i12,r13,i13}
        vld1.32         {d4-d5}, [r2,:64]
        vsub.f32        q13, q0, q13            @ {r10,i10,r11,i11}
        vsub.f32        q15, q1, q15            @ {r14,i14,r15,i15}
        vswp            d25, d28                @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
        vld1.32         {d6-d7}, [r3,:128]
        vrev64.32       q1, q14
        vmul.f32        q14, q14, d4[1]
        vmul.f32        q1, q1, q3
        vmla.f32        q14, q1, d5[1]          @ {t1a,t2a,t5a,t6a}
        vswp            d27, d30                @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
        vzip.32         q12, q14
        vadd.f32        d0, d28, d24
        vadd.f32        d1, d25, d29
        vsub.f32        d2, d25, d29
        vsub.f32        d3, d28, d24
        vsub.f32        q12, q8, q0             @ {r8,r9,i8,i9}
        vadd.f32        q8, q8, q0              @ {r0,r1,i0,i1}
        vsub.f32        q14, q10, q1            @ {r12,r13,i12,i13}
        mov             r1, #32                 @ store stride in bytes
        vadd.f32        q10, q10, q1            @ {r4,r5,i4,i5}
        vrev64.32       q0, q13
        vmul.f32        q13, q13, d5[0]
        vrev64.32       q1, q15
        vmul.f32        q15, q15, d5[1]
        vst2.32         {d16-d17},[r0,:128], r1
        vmul.f32        q0, q0, q3
        vst2.32         {d20-d21},[r0,:128], r1
        vmul.f32        q1, q1, q3
        vmla.f32        q13, q0, d5[0]          @ {t1,t2,t5,t6}
        vmla.f32        q15, q1, d4[1]          @ {t1a,t2a,t5a,t6a}
        vst2.32         {d24-d25},[r0,:128], r1
        vst2.32         {d28-d29},[r0,:128]
        vzip.32         q13, q15
        sub             r0, r0, #80             @ 96 bytes forward, 80 back: r0 = start + 16
        vadd.f32        d0, d30, d26
        vadd.f32        d1, d27, d31
        vsub.f32        d2, d27, d31
        vsub.f32        d3, d30, d26
        vsub.f32        q13, q9, q0             @ {r10,r11,i10,i11}
        vadd.f32        q9, q9, q0              @ {r2,r3,i2,i3}
        vsub.f32        q15, q11, q1            @ {r14,r15,i14,i15}
        vadd.f32        q11, q11, q1            @ {r6,r7,i6,i7}
        vst2.32         {d18-d19},[r0,:128], r1
        vst2.32         {d22-d23},[r0,:128], r1
        vst2.32         {d26-d27},[r0,:128], r1
        vst2.32         {d30-d31},[r0,:128]
        bx              lr
endfunc

@ fft_pass_neon: combining pass shared by all sizes generated by def_fft.
@   r0 = z (data, updated in place)
@   r1 = twiddle table; wre is read forwards from r1 and wim backwards
@        starting near r1 + 8*n
@   r2 = n (loop count; the first iteration is peeled off before label 1)
@ Two complex values are processed from each of the four quarters
@ z[0], z[o1], z[o2], z[o3] per iteration.
function fft_pass_neon
        push            {r4-r6,lr}
        mov             r6, r2                  @ n
        lsl             r5, r2, #3              @ 2 * n * sizeof FFTSample
        lsl             r4, r2, #4              @ 2 * n * sizeof FFTComplex
        lsl             r2, r2, #5              @ 4 * n * sizeof FFTComplex
        add             r3, r2, r4
        add             r4, r4, r0              @ &z[o1]
        add             r2, r2, r0              @ &z[o2]
        add             r3, r3, r0              @ &z[o3]
        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
        movrel          r12, pmmp
        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
        add             r5, r5, r1              @ wim
        vld1.32         {d6-d7}, [r12,:128]     @ pmmp
        vswp            d21, d22
        vld1.32         {d4}, [r1,:64]!         @ {wre[0],wre[1]}
        sub             r5, r5, #4              @ wim--
        vrev64.32       q1, q11
        vmul.f32        q11, q11, d4[1]
        vmul.f32        q1, q1, q3
        vld1.32         {d5[0]}, [r5,:32]       @ d5[0] = wim[-1]
        vmla.f32        q11, q1, d5[0]          @ {t1a,t2a,t5a,t6a}
        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
        sub             r6, r6, #1              @ n--
        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
        vzip.32         q10, q11
        vadd.f32        d0, d22, d20
        vadd.f32        d1, d21, d23
        vsub.f32        d2, d21, d23
        vsub.f32        d3, d22, d20
        vsub.f32        q10, q8, q0
        vadd.f32        q8, q8, q0
        vsub.f32        q11, q9, q1
        vadd.f32        q9, q9, q1
        vst2.32         {d20-d21},[r2,:128]!    @ {z[o2],z[o2+1]}
        vst2.32         {d16-d17},[r0,:128]!    @ {z[0],z[1]}
        vst2.32         {d22-d23},[r3,:128]!    @ {z[o3],z[o3+1]}
        vst2.32         {d18-d19},[r4,:128]!    @ {z[o1],z[o1+1]}
        sub             r5, r5, #8              @ wim -= 2
1:
        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
        vswp            d21, d22
        vld1.32         {d4}, [r1]!             @ {wre[0],wre[1]}
        vrev64.32       q0, q10
        vmul.f32        q10, q10, d4[0]
        vrev64.32       q1, q11
        vmul.f32        q11, q11, d4[1]
        vld1.32         {d5}, [r5]              @ {wim[-1],wim[0]}
        vmul.f32        q0, q0, q3
        sub             r5, r5, #8              @ wim -= 2
        vmul.f32        q1, q1, q3
        vmla.f32        q10, q0, d5[1]          @ {t1,t2,t5,t6}
        vmla.f32        q11, q1, d5[0]          @ {t1a,t2a,t5a,t6a}
        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
        subs            r6, r6, #1              @ n--
        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
        vzip.32         q10, q11
        vadd.f32        d0, d22, d20
        vadd.f32        d1, d21, d23
        vsub.f32        d2, d21, d23
        vsub.f32        d3, d22, d20
        vsub.f32        q10, q8, q0
        vadd.f32        q8, q8, q0
        vsub.f32        q11, q9, q1
        vadd.f32        q9, q9, q1
        vst2.32         {d20-d21}, [r2,:128]!   @ {z[o2],z[o2+1]}
        vst2.32         {d16-d17}, [r0,:128]!   @ {z[0],z[1]}
        vst2.32         {d22-d23}, [r3,:128]!   @ {z[o3],z[o3+1]}
        vst2.32         {d18-d19}, [r4,:128]!   @ {z[o1],z[o1+1]}
        bne             1b

        pop             {r4-r6,pc}
endfunc

@ def_fft n, n2, n4: emits fft<n>_neon, built from the smaller transforms.
@ It runs fft<n2>_neon on the start of the buffer, then fft<n4>_neon at
@ byte offsets n4*2*8 and n4*3*8, and finally tail-calls fft_pass_neon
@ with r0 = buffer, r1 = ff_cos_<n> and r2 = n4/2.
.macro  def_fft n, n2, n4
        .align 6
function fft\n\()_neon
        push            {r4, lr}
        mov             r4, r0                  @ keep the buffer pointer across the calls
        bl              fft\n2\()_neon
        add             r0, r4, #\n4*2*8
        bl              fft\n4\()_neon
        add             r0, r4, #\n4*3*8
        bl              fft\n4\()_neon
        mov             r0, r4
        pop             {r4, lr}                @ restore lr so fft_pass_neon returns to our caller
        movrel          r1, X(ff_cos_\n)
        mov             r2, #\n4/2
        b               fft_pass_neon
endfunc
.endm

        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

@ void ff_fft_calc_neon(FFTContext *s, FFTComplex *z)
@ Loads the first word of the context (the same word ff_fft_permute_neon
@ labels "nbits"), subtracts 2 and uses the result as an index into
@ fft_tab_neon (entry 0 is fft4_neon). The selected routine is tail-called
@ with r0 = z.
function ff_fft_calc_neon, export=1
        ldr             r2, [r0]
        sub             r2, r2, #2
        movrel          r3, fft_tab_neon
        ldr             r3, [r3, r2, lsl #2]
        mov             r0, r1
        bx              r3
endfunc

@ void ff_fft_permute_neon(FFTContext *s, FFTComplex *z)
@ First loop: scatters z into tmp_buf. Each 32-bit word read from revtab
@ supplies two 16-bit destination indices, one per complex value.
@ Second loop: copies tmp_buf back over z.
function ff_fft_permute_neon, export=1
        push            {r4,lr}
        mov             r12, #1
        ldr             r2, [r0]                @ nbits
        ldr             r3, [r0, #20]           @ tmp_buf
        ldr             r0, [r0, #8]            @ revtab
        lsl             r12, r12, r2            @ r12 = 1 << nbits (element count)
        mov             r2, r12
1:
        vld1.32         {d0-d1}, [r1,:128]!
        ldr             r4, [r0], #4
        uxth            lr, r4                  @ low halfword: index for the first value
        uxth            r4, r4, ror #16         @ high halfword: index for the second value
        add             lr, r3, lr, lsl #3      @ tmp_buf + index * 8 bytes
        add             r4, r3, r4, lsl #3
        vst1.32         {d0}, [lr,:64]
        vst1.32         {d1}, [r4,:64]
        subs            r12, r12, #2
        bgt             1b

        sub             r1, r1, r2, lsl #3      @ rewind z to its start
1:
        vld1.32         {d0-d3}, [r3,:128]!
        vst1.32         {d0-d3}, [r1,:128]!
        subs            r2, r2, #4
        bgt             1b

        pop             {r4,pc}
endfunc

        .section .rodata
        .align 4
@ Dispatch table for ff_fft_calc_neon, indexed by nbits - 2.
fft_tab_neon:
        .word fft4_neon
        .word fft8_neon
        .word fft16_neon
        .word fft32_neon
        .word fft64_neon
        .word fft128_neon
        .word fft256_neon
        .word fft512_neon
        .word fft1024_neon
        .word fft2048_neon
        .word fft4096_neon
        .word fft8192_neon
        .word fft16384_neon
        .word fft32768_neon
        .word fft65536_neon
        .size fft_tab_neon, . - fft_tab_neon

        .align 4
@ Sign / scale vectors loaded by fft16_neon and fft_pass_neon.
pmmp:   .float  +1.0, -1.0, -1.0, +1.0
mppm:   .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
24.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 24.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_init_arm.c Mon Aug 27 12:09:56 2012 +0200 24.3 @@ -0,0 +1,126 @@ 24.4 +/* 24.5 + * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> 24.6 + * 24.7 + * This file is part of FFmpeg. 24.8 + * 24.9 + * FFmpeg is free software; you can redistribute it and/or 24.10 + * modify it under the terms of the GNU Lesser General Public 24.11 + * License as published by the Free Software Foundation; either 24.12 + * version 2.1 of the License, or (at your option) any later version. 24.13 + * 24.14 + * FFmpeg is distributed in the hope that it will be useful, 24.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 24.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 24.17 + * Lesser General Public License for more details. 24.18 + * 24.19 + * You should have received a copy of the GNU Lesser General Public 24.20 + * License along with FFmpeg; if not, write to the Free Software 24.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 24.22 + */ 24.23 + 24.24 +#include <stdint.h> 24.25 + 24.26 +#include "libavcodec/dsputil.h" 24.27 +#include "libavcodec/h264dsp.h" 24.28 + 24.29 +void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, 24.30 + int beta, int8_t *tc0); 24.31 +void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, 24.32 + int beta, int8_t *tc0); 24.33 +void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, 24.34 + int beta, int8_t *tc0); 24.35 +void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, 24.36 + int beta, int8_t *tc0); 24.37 + 24.38 +void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, 24.39 + int weight, int offset); 24.40 +void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, 24.41 + int weight, int offset); 24.42 +void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int 
stride, int log2_den, 24.43 + int weight, int offset); 24.44 +void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den, 24.45 + int weight, int offset); 24.46 +void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, 24.47 + int weight, int offset); 24.48 +void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, 24.49 + int weight, int offset); 24.50 +void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, 24.51 + int weight, int offset); 24.52 +void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, 24.53 + int weight, int offset); 24.54 + 24.55 +void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, 24.56 + int log2_den, int weightd, int weights, 24.57 + int offset); 24.58 +void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, 24.59 + int log2_den, int weightd, int weights, 24.60 + int offset); 24.61 +void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, 24.62 + int log2_den, int weightd, int weights, 24.63 + int offset); 24.64 +void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, 24.65 + int log2_den, int weightd, int weights, 24.66 + int offset); 24.67 +void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, 24.68 + int log2_den, int weightd, int weights, 24.69 + int offset); 24.70 +void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, 24.71 + int log2_den, int weightd, int weights, 24.72 + int offset); 24.73 +void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, 24.74 + int log2_den, int weightd, int weights, 24.75 + int offset); 24.76 +void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, 24.77 + int log2_den, int weightd, int weights, 24.78 + int offset); 24.79 + 24.80 +void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); 24.81 +void 
ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); 24.82 +void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset, 24.83 + DCTELEM *block, int stride, 24.84 + const uint8_t nnzc[6*8]); 24.85 +void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, 24.86 + DCTELEM *block, int stride, 24.87 + const uint8_t nnzc[6*8]); 24.88 +void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset, 24.89 + DCTELEM *block, int stride, 24.90 + const uint8_t nnzc[6*8]); 24.91 + 24.92 +#if HAVE_NEON 24.93 +static void ff_h264dsp_init_neon(H264DSPContext *c) 24.94 +{ 24.95 + c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; 24.96 + c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; 24.97 + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; 24.98 + c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; 24.99 + 24.100 + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; 24.101 + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; 24.102 + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; 24.103 + c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; 24.104 + c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; 24.105 + c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; 24.106 + c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; 24.107 + c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; 24.108 + 24.109 + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; 24.110 + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; 24.111 + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; 24.112 + c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon; 24.113 + c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; 24.114 + c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; 24.115 + c->biweight_h264_pixels_tab[6] = 
ff_biweight_h264_pixels_4x4_neon; 24.116 + c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; 24.117 + 24.118 + c->h264_idct_add = ff_h264_idct_add_neon; 24.119 + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; 24.120 + c->h264_idct_add16 = ff_h264_idct_add16_neon; 24.121 + c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; 24.122 + c->h264_idct_add8 = ff_h264_idct_add8_neon; 24.123 +} 24.124 +#endif 24.125 + 24.126 +void ff_h264dsp_init_arm(H264DSPContext *c) 24.127 +{ 24.128 + if (HAVE_NEON) ff_h264dsp_init_neon(c); 24.129 +}
25.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 25.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_neon.S Mon Aug 27 12:09:56 2012 +0200 25.3 @@ -0,0 +1,1883 @@ 25.4 +/* 25.5 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> 25.6 + * 25.7 + * This file is part of FFmpeg. 25.8 + * 25.9 + * FFmpeg is free software; you can redistribute it and/or 25.10 + * modify it under the terms of the GNU Lesser General Public 25.11 + * License as published by the Free Software Foundation; either 25.12 + * version 2.1 of the License, or (at your option) any later version. 25.13 + * 25.14 + * FFmpeg is distributed in the hope that it will be useful, 25.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 25.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 25.17 + * Lesser General Public License for more details. 25.18 + * 25.19 + * You should have received a copy of the GNU Lesser General Public 25.20 + * License along with FFmpeg; if not, write to the Free Software 25.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 25.22 + */ 25.23 + 25.24 +#include "asm.S" 25.25 + 25.26 + .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7 25.27 + vtrn.32 \r0, \r4 25.28 + vtrn.32 \r1, \r5 25.29 + vtrn.32 \r2, \r6 25.30 + vtrn.32 \r3, \r7 25.31 + vtrn.16 \r0, \r2 25.32 + vtrn.16 \r1, \r3 25.33 + vtrn.16 \r4, \r6 25.34 + vtrn.16 \r5, \r7 25.35 + vtrn.8 \r0, \r1 25.36 + vtrn.8 \r2, \r3 25.37 + vtrn.8 \r4, \r5 25.38 + vtrn.8 \r6, \r7 25.39 + .endm 25.40 + 25.41 + .macro transpose_4x4 r0 r1 r2 r3 25.42 + vtrn.16 \r0, \r2 25.43 + vtrn.16 \r1, \r3 25.44 + vtrn.8 \r0, \r1 25.45 + vtrn.8 \r2, \r3 25.46 + .endm 25.47 + 25.48 + .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7 25.49 + vswp \r0, \r4 25.50 + vswp \r1, \r5 25.51 + vswp \r2, \r6 25.52 + vswp \r3, \r7 25.53 + .endm 25.54 + 25.55 + .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7 25.56 + vtrn.32 \r0, \r2 25.57 + vtrn.32 \r1, \r3 25.58 + vtrn.32 \r4, \r6 25.59 + vtrn.32 \r5, \r7 25.60 + 
vtrn.16 \r0, \r1 25.61 + vtrn.16 \r2, \r3 25.62 + vtrn.16 \r4, \r5 25.63 + vtrn.16 \r6, \r7 25.64 + .endm 25.65 + 25.66 +/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ 25.67 + .macro h264_chroma_mc8 type 25.68 +function ff_\type\()_h264_chroma_mc8_neon, export=1 25.69 + push {r4-r7, lr} 25.70 + ldrd r4, [sp, #20] 25.71 +.ifc \type,avg 25.72 + mov lr, r0 25.73 +.endif 25.74 + pld [r1] 25.75 + pld [r1, r2] 25.76 + 25.77 + muls r7, r4, r5 25.78 + rsb r6, r7, r5, lsl #3 25.79 + rsb ip, r7, r4, lsl #3 25.80 + sub r4, r7, r4, lsl #3 25.81 + sub r4, r4, r5, lsl #3 25.82 + add r4, r4, #64 25.83 + 25.84 + beq 2f 25.85 + 25.86 + add r5, r1, r2 25.87 + 25.88 + vdup.8 d0, r4 25.89 + lsl r4, r2, #1 25.90 + vdup.8 d1, ip 25.91 + vld1.64 {d4, d5}, [r1], r4 25.92 + vdup.8 d2, r6 25.93 + vld1.64 {d6, d7}, [r5], r4 25.94 + vdup.8 d3, r7 25.95 + 25.96 + vext.8 d5, d4, d5, #1 25.97 + vext.8 d7, d6, d7, #1 25.98 + 25.99 +1: pld [r5] 25.100 + vmull.u8 q8, d4, d0 25.101 + vmlal.u8 q8, d5, d1 25.102 + vld1.64 {d4, d5}, [r1], r4 25.103 + vmlal.u8 q8, d6, d2 25.104 + vext.8 d5, d4, d5, #1 25.105 + vmlal.u8 q8, d7, d3 25.106 + vmull.u8 q9, d6, d0 25.107 + subs r3, r3, #2 25.108 + vmlal.u8 q9, d7, d1 25.109 + vmlal.u8 q9, d4, d2 25.110 + vmlal.u8 q9, d5, d3 25.111 + vrshrn.u16 d16, q8, #6 25.112 + vld1.64 {d6, d7}, [r5], r4 25.113 + pld [r1] 25.114 + vrshrn.u16 d17, q9, #6 25.115 +.ifc \type,avg 25.116 + vld1.64 {d20}, [lr,:64], r2 25.117 + vld1.64 {d21}, [lr,:64], r2 25.118 + vrhadd.u8 q8, q8, q10 25.119 +.endif 25.120 + vext.8 d7, d6, d7, #1 25.121 + vst1.64 {d16}, [r0,:64], r2 25.122 + vst1.64 {d17}, [r0,:64], r2 25.123 + bgt 1b 25.124 + 25.125 + pop {r4-r7, pc} 25.126 + 25.127 +2: tst r6, r6 25.128 + add ip, ip, r6 25.129 + vdup.8 d0, r4 25.130 + vdup.8 d1, ip 25.131 + 25.132 + beq 4f 25.133 + 25.134 + add r5, r1, r2 25.135 + lsl r4, r2, #1 25.136 + vld1.64 {d4}, [r1], r4 25.137 + vld1.64 {d6}, [r5], r4 25.138 + 25.139 +3: pld [r5] 25.140 + vmull.u8 q8, d4, 
d0 25.141 + vmlal.u8 q8, d6, d1 25.142 + vld1.64 {d4}, [r1], r4 25.143 + vmull.u8 q9, d6, d0 25.144 + vmlal.u8 q9, d4, d1 25.145 + vld1.64 {d6}, [r5], r4 25.146 + vrshrn.u16 d16, q8, #6 25.147 + vrshrn.u16 d17, q9, #6 25.148 +.ifc \type,avg 25.149 + vld1.64 {d20}, [lr,:64], r2 25.150 + vld1.64 {d21}, [lr,:64], r2 25.151 + vrhadd.u8 q8, q8, q10 25.152 +.endif 25.153 + subs r3, r3, #2 25.154 + pld [r1] 25.155 + vst1.64 {d16}, [r0,:64], r2 25.156 + vst1.64 {d17}, [r0,:64], r2 25.157 + bgt 3b 25.158 + 25.159 + pop {r4-r7, pc} 25.160 + 25.161 +4: vld1.64 {d4, d5}, [r1], r2 25.162 + vld1.64 {d6, d7}, [r1], r2 25.163 + vext.8 d5, d4, d5, #1 25.164 + vext.8 d7, d6, d7, #1 25.165 + 25.166 +5: pld [r1] 25.167 + subs r3, r3, #2 25.168 + vmull.u8 q8, d4, d0 25.169 + vmlal.u8 q8, d5, d1 25.170 + vld1.64 {d4, d5}, [r1], r2 25.171 + vmull.u8 q9, d6, d0 25.172 + vmlal.u8 q9, d7, d1 25.173 + pld [r1] 25.174 + vext.8 d5, d4, d5, #1 25.175 + vrshrn.u16 d16, q8, #6 25.176 + vrshrn.u16 d17, q9, #6 25.177 +.ifc \type,avg 25.178 + vld1.64 {d20}, [lr,:64], r2 25.179 + vld1.64 {d21}, [lr,:64], r2 25.180 + vrhadd.u8 q8, q8, q10 25.181 +.endif 25.182 + vld1.64 {d6, d7}, [r1], r2 25.183 + vext.8 d7, d6, d7, #1 25.184 + vst1.64 {d16}, [r0,:64], r2 25.185 + vst1.64 {d17}, [r0,:64], r2 25.186 + bgt 5b 25.187 + 25.188 + pop {r4-r7, pc} 25.189 +endfunc 25.190 + .endm 25.191 + 25.192 +/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ 25.193 + .macro h264_chroma_mc4 type 25.194 +function ff_\type\()_h264_chroma_mc4_neon, export=1 25.195 + push {r4-r7, lr} 25.196 + ldrd r4, [sp, #20] 25.197 +.ifc \type,avg 25.198 + mov lr, r0 25.199 +.endif 25.200 + pld [r1] 25.201 + pld [r1, r2] 25.202 + 25.203 + muls r7, r4, r5 25.204 + rsb r6, r7, r5, lsl #3 25.205 + rsb ip, r7, r4, lsl #3 25.206 + sub r4, r7, r4, lsl #3 25.207 + sub r4, r4, r5, lsl #3 25.208 + add r4, r4, #64 25.209 + 25.210 + beq 2f 25.211 + 25.212 + add r5, r1, r2 25.213 + 25.214 + vdup.8 d0, r4 25.215 + lsl r4, r2, 
#1 25.216 + vdup.8 d1, ip 25.217 + vld1.64 {d4}, [r1], r4 25.218 + vdup.8 d2, r6 25.219 + vld1.64 {d6}, [r5], r4 25.220 + vdup.8 d3, r7 25.221 + 25.222 + vext.8 d5, d4, d5, #1 25.223 + vext.8 d7, d6, d7, #1 25.224 + vtrn.32 d4, d5 25.225 + vtrn.32 d6, d7 25.226 + 25.227 + vtrn.32 d0, d1 25.228 + vtrn.32 d2, d3 25.229 + 25.230 +1: pld [r5] 25.231 + vmull.u8 q8, d4, d0 25.232 + vmlal.u8 q8, d6, d2 25.233 + vld1.64 {d4}, [r1], r4 25.234 + vext.8 d5, d4, d5, #1 25.235 + vtrn.32 d4, d5 25.236 + vmull.u8 q9, d6, d0 25.237 + vmlal.u8 q9, d4, d2 25.238 + vld1.64 {d6}, [r5], r4 25.239 + vadd.i16 d16, d16, d17 25.240 + vadd.i16 d17, d18, d19 25.241 + vrshrn.u16 d16, q8, #6 25.242 + subs r3, r3, #2 25.243 + pld [r1] 25.244 +.ifc \type,avg 25.245 + vld1.32 {d20[0]}, [lr,:32], r2 25.246 + vld1.32 {d20[1]}, [lr,:32], r2 25.247 + vrhadd.u8 d16, d16, d20 25.248 +.endif 25.249 + vext.8 d7, d6, d7, #1 25.250 + vtrn.32 d6, d7 25.251 + vst1.32 {d16[0]}, [r0,:32], r2 25.252 + vst1.32 {d16[1]}, [r0,:32], r2 25.253 + bgt 1b 25.254 + 25.255 + pop {r4-r7, pc} 25.256 + 25.257 +2: tst r6, r6 25.258 + add ip, ip, r6 25.259 + vdup.8 d0, r4 25.260 + vdup.8 d1, ip 25.261 + vtrn.32 d0, d1 25.262 + 25.263 + beq 4f 25.264 + 25.265 + vext.32 d1, d0, d1, #1 25.266 + add r5, r1, r2 25.267 + lsl r4, r2, #1 25.268 + vld1.32 {d4[0]}, [r1], r4 25.269 + vld1.32 {d4[1]}, [r5], r4 25.270 + 25.271 +3: pld [r5] 25.272 + vmull.u8 q8, d4, d0 25.273 + vld1.32 {d4[0]}, [r1], r4 25.274 + vmull.u8 q9, d4, d1 25.275 + vld1.32 {d4[1]}, [r5], r4 25.276 + vadd.i16 d16, d16, d17 25.277 + vadd.i16 d17, d18, d19 25.278 + vrshrn.u16 d16, q8, #6 25.279 +.ifc \type,avg 25.280 + vld1.32 {d20[0]}, [lr,:32], r2 25.281 + vld1.32 {d20[1]}, [lr,:32], r2 25.282 + vrhadd.u8 d16, d16, d20 25.283 +.endif 25.284 + subs r3, r3, #2 25.285 + pld [r1] 25.286 + vst1.32 {d16[0]}, [r0,:32], r2 25.287 + vst1.32 {d16[1]}, [r0,:32], r2 25.288 + bgt 3b 25.289 + 25.290 + pop {r4-r7, pc} 25.291 + 25.292 +4: vld1.64 {d4}, [r1], r2 25.293 + vld1.64 
{d6}, [r1], r2 25.294 + vext.8 d5, d4, d5, #1 25.295 + vext.8 d7, d6, d7, #1 25.296 + vtrn.32 d4, d5 25.297 + vtrn.32 d6, d7 25.298 + 25.299 +5: vmull.u8 q8, d4, d0 25.300 + vmull.u8 q9, d6, d0 25.301 + subs r3, r3, #2 25.302 + vld1.64 {d4}, [r1], r2 25.303 + vext.8 d5, d4, d5, #1 25.304 + vtrn.32 d4, d5 25.305 + vadd.i16 d16, d16, d17 25.306 + vadd.i16 d17, d18, d19 25.307 + pld [r1] 25.308 + vrshrn.u16 d16, q8, #6 25.309 +.ifc \type,avg 25.310 + vld1.32 {d20[0]}, [lr,:32], r2 25.311 + vld1.32 {d20[1]}, [lr,:32], r2 25.312 + vrhadd.u8 d16, d16, d20 25.313 +.endif 25.314 + vld1.64 {d6}, [r1], r2 25.315 + vext.8 d7, d6, d7, #1 25.316 + vtrn.32 d6, d7 25.317 + pld [r1] 25.318 + vst1.32 {d16[0]}, [r0,:32], r2 25.319 + vst1.32 {d16[1]}, [r0,:32], r2 25.320 + bgt 5b 25.321 + 25.322 + pop {r4-r7, pc} 25.323 +endfunc 25.324 + .endm 25.325 + 25.326 + .macro h264_chroma_mc2 type 25.327 +function ff_\type\()_h264_chroma_mc2_neon, export=1 25.328 + push {r4-r6, lr} 25.329 + ldr r4, [sp, #16] 25.330 + ldr lr, [sp, #20] 25.331 + pld [r1] 25.332 + pld [r1, r2] 25.333 + orrs r5, r4, lr 25.334 + beq 2f 25.335 + 25.336 + mul r5, r4, lr 25.337 + rsb r6, r5, lr, lsl #3 25.338 + rsb r12, r5, r4, lsl #3 25.339 + sub r4, r5, r4, lsl #3 25.340 + sub r4, r4, lr, lsl #3 25.341 + add r4, r4, #64 25.342 + vdup.8 d0, r4 25.343 + vdup.8 d2, r12 25.344 + vdup.8 d1, r6 25.345 + vdup.8 d3, r5 25.346 + vtrn.16 q0, q1 25.347 +1: 25.348 + vld1.32 {d4[0]}, [r1], r2 25.349 + vld1.32 {d4[1]}, [r1], r2 25.350 + vrev64.32 d5, d4 25.351 + vld1.32 {d5[1]}, [r1] 25.352 + vext.8 q3, q2, q2, #1 25.353 + vtrn.16 q2, q3 25.354 + vmull.u8 q8, d4, d0 25.355 + vmlal.u8 q8, d5, d1 25.356 +.ifc \type,avg 25.357 + vld1.16 {d18[0]}, [r0,:16], r2 25.358 + vld1.16 {d18[1]}, [r0,:16] 25.359 + sub r0, r0, r2 25.360 +.endif 25.361 + vtrn.32 d16, d17 25.362 + vadd.i16 d16, d16, d17 25.363 + vrshrn.u16 d16, q8, #6 25.364 +.ifc \type,avg 25.365 + vrhadd.u8 d16, d16, d18 25.366 +.endif 25.367 + vst1.16 {d16[0]}, [r0,:16], r2 
25.368 + vst1.16 {d16[1]}, [r0,:16], r2 25.369 + subs r3, r3, #2 25.370 + bgt 1b 25.371 + pop {r4-r6, pc} 25.372 +2: 25.373 +.ifc \type,put 25.374 + ldrh r5, [r1], r2 25.375 + strh r5, [r0], r2 25.376 + ldrh r6, [r1], r2 25.377 + strh r6, [r0], r2 25.378 +.else 25.379 + vld1.16 {d16[0]}, [r1], r2 25.380 + vld1.16 {d16[1]}, [r1], r2 25.381 + vld1.16 {d18[0]}, [r0,:16], r2 25.382 + vld1.16 {d18[1]}, [r0,:16] 25.383 + sub r0, r0, r2 25.384 + vrhadd.u8 d16, d16, d18 25.385 + vst1.16 {d16[0]}, [r0,:16], r2 25.386 + vst1.16 {d16[1]}, [r0,:16], r2 25.387 +.endif 25.388 + subs r3, r3, #2 25.389 + bgt 2b 25.390 + pop {r4-r6, pc} 25.391 +endfunc 25.392 +.endm 25.393 + 25.394 + .text 25.395 + .align 25.396 + 25.397 + h264_chroma_mc8 put 25.398 + h264_chroma_mc8 avg 25.399 + h264_chroma_mc4 put 25.400 + h264_chroma_mc4 avg 25.401 + h264_chroma_mc2 put 25.402 + h264_chroma_mc2 avg 25.403 + 25.404 + /* H.264 loop filter */ 25.405 + 25.406 + .macro h264_loop_filter_start 25.407 + ldr ip, [sp] 25.408 + tst r2, r2 25.409 + ldr ip, [ip] 25.410 + tstne r3, r3 25.411 + vmov.32 d24[0], ip 25.412 + and ip, ip, ip, lsl #16 25.413 + bxeq lr 25.414 + ands ip, ip, ip, lsl #8 25.415 + bxlt lr 25.416 + .endm 25.417 + 25.418 + .macro align_push_regs 25.419 + and ip, sp, #15 25.420 + add ip, ip, #32 25.421 + sub sp, sp, ip 25.422 + vst1.64 {d12-d15}, [sp,:128] 25.423 + sub sp, sp, #32 25.424 + vst1.64 {d8-d11}, [sp,:128] 25.425 + .endm 25.426 + 25.427 + .macro align_pop_regs 25.428 + vld1.64 {d8-d11}, [sp,:128]! 
25.429 + vld1.64 {d12-d15}, [sp,:128], ip 25.430 + .endm 25.431 + 25.432 + .macro h264_loop_filter_luma 25.433 + vdup.8 q11, r2 @ alpha 25.434 + vmovl.u8 q12, d24 25.435 + vabd.u8 q6, q8, q0 @ abs(p0 - q0) 25.436 + vmovl.u16 q12, d24 25.437 + vabd.u8 q14, q9, q8 @ abs(p1 - p0) 25.438 + vsli.16 q12, q12, #8 25.439 + vabd.u8 q15, q1, q0 @ abs(q1 - q0) 25.440 + vsli.32 q12, q12, #16 25.441 + vclt.u8 q6, q6, q11 @ < alpha 25.442 + vdup.8 q11, r3 @ beta 25.443 + vclt.s8 q7, q12, #0 25.444 + vclt.u8 q14, q14, q11 @ < beta 25.445 + vclt.u8 q15, q15, q11 @ < beta 25.446 + vbic q6, q6, q7 25.447 + vabd.u8 q4, q10, q8 @ abs(p2 - p0) 25.448 + vand q6, q6, q14 25.449 + vabd.u8 q5, q2, q0 @ abs(q2 - q0) 25.450 + vclt.u8 q4, q4, q11 @ < beta 25.451 + vand q6, q6, q15 25.452 + vclt.u8 q5, q5, q11 @ < beta 25.453 + vand q4, q4, q6 25.454 + vand q5, q5, q6 25.455 + vand q12, q12, q6 25.456 + vrhadd.u8 q14, q8, q0 25.457 + vsub.i8 q6, q12, q4 25.458 + vqadd.u8 q7, q9, q12 25.459 + vhadd.u8 q10, q10, q14 25.460 + vsub.i8 q6, q6, q5 25.461 + vhadd.u8 q14, q2, q14 25.462 + vmin.u8 q7, q7, q10 25.463 + vqsub.u8 q11, q9, q12 25.464 + vqadd.u8 q2, q1, q12 25.465 + vmax.u8 q7, q7, q11 25.466 + vqsub.u8 q11, q1, q12 25.467 + vmin.u8 q14, q2, q14 25.468 + vmovl.u8 q2, d0 25.469 + vmax.u8 q14, q14, q11 25.470 + vmovl.u8 q10, d1 25.471 + vsubw.u8 q2, q2, d16 25.472 + vsubw.u8 q10, q10, d17 25.473 + vshl.i16 q2, q2, #2 25.474 + vshl.i16 q10, q10, #2 25.475 + vaddw.u8 q2, q2, d18 25.476 + vaddw.u8 q10, q10, d19 25.477 + vsubw.u8 q2, q2, d2 25.478 + vsubw.u8 q10, q10, d3 25.479 + vrshrn.i16 d4, q2, #3 25.480 + vrshrn.i16 d5, q10, #3 25.481 + vbsl q4, q7, q9 25.482 + vbsl q5, q14, q1 25.483 + vneg.s8 q7, q6 25.484 + vmovl.u8 q14, d16 25.485 + vmin.s8 q2, q2, q6 25.486 + vmovl.u8 q6, d17 25.487 + vmax.s8 q2, q2, q7 25.488 + vmovl.u8 q11, d0 25.489 + vmovl.u8 q12, d1 25.490 + vaddw.s8 q14, q14, d4 25.491 + vaddw.s8 q6, q6, d5 25.492 + vsubw.s8 q11, q11, d4 25.493 + vsubw.s8 q12, q12, d5 25.494 + 
vqmovun.s16 d16, q14 25.495 + vqmovun.s16 d17, q6 25.496 + vqmovun.s16 d0, q11 25.497 + vqmovun.s16 d1, q12 25.498 + .endm 25.499 + 25.500 +function ff_h264_v_loop_filter_luma_neon, export=1 25.501 + h264_loop_filter_start 25.502 + 25.503 + vld1.64 {d0, d1}, [r0,:128], r1 25.504 + vld1.64 {d2, d3}, [r0,:128], r1 25.505 + vld1.64 {d4, d5}, [r0,:128], r1 25.506 + sub r0, r0, r1, lsl #2 25.507 + sub r0, r0, r1, lsl #1 25.508 + vld1.64 {d20,d21}, [r0,:128], r1 25.509 + vld1.64 {d18,d19}, [r0,:128], r1 25.510 + vld1.64 {d16,d17}, [r0,:128], r1 25.511 + 25.512 + align_push_regs 25.513 + 25.514 + h264_loop_filter_luma 25.515 + 25.516 + sub r0, r0, r1, lsl #1 25.517 + vst1.64 {d8, d9}, [r0,:128], r1 25.518 + vst1.64 {d16,d17}, [r0,:128], r1 25.519 + vst1.64 {d0, d1}, [r0,:128], r1 25.520 + vst1.64 {d10,d11}, [r0,:128] 25.521 + 25.522 + align_pop_regs 25.523 + bx lr 25.524 +endfunc 25.525 + 25.526 +function ff_h264_h_loop_filter_luma_neon, export=1 25.527 + h264_loop_filter_start 25.528 + 25.529 + sub r0, r0, #4 25.530 + vld1.64 {d6}, [r0], r1 25.531 + vld1.64 {d20}, [r0], r1 25.532 + vld1.64 {d18}, [r0], r1 25.533 + vld1.64 {d16}, [r0], r1 25.534 + vld1.64 {d0}, [r0], r1 25.535 + vld1.64 {d2}, [r0], r1 25.536 + vld1.64 {d4}, [r0], r1 25.537 + vld1.64 {d26}, [r0], r1 25.538 + vld1.64 {d7}, [r0], r1 25.539 + vld1.64 {d21}, [r0], r1 25.540 + vld1.64 {d19}, [r0], r1 25.541 + vld1.64 {d17}, [r0], r1 25.542 + vld1.64 {d1}, [r0], r1 25.543 + vld1.64 {d3}, [r0], r1 25.544 + vld1.64 {d5}, [r0], r1 25.545 + vld1.64 {d27}, [r0], r1 25.546 + 25.547 + transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 25.548 + 25.549 + align_push_regs 25.550 + 25.551 + h264_loop_filter_luma 25.552 + 25.553 + transpose_4x4 q4, q8, q0, q5 25.554 + 25.555 + sub r0, r0, r1, lsl #4 25.556 + add r0, r0, #2 25.557 + vst1.32 {d8[0]}, [r0], r1 25.558 + vst1.32 {d16[0]}, [r0], r1 25.559 + vst1.32 {d0[0]}, [r0], r1 25.560 + vst1.32 {d10[0]}, [r0], r1 25.561 + vst1.32 {d8[1]}, [r0], r1 25.562 + vst1.32 {d16[1]}, [r0], 
r1 25.563 + vst1.32 {d0[1]}, [r0], r1 25.564 + vst1.32 {d10[1]}, [r0], r1 25.565 + vst1.32 {d9[0]}, [r0], r1 25.566 + vst1.32 {d17[0]}, [r0], r1 25.567 + vst1.32 {d1[0]}, [r0], r1 25.568 + vst1.32 {d11[0]}, [r0], r1 25.569 + vst1.32 {d9[1]}, [r0], r1 25.570 + vst1.32 {d17[1]}, [r0], r1 25.571 + vst1.32 {d1[1]}, [r0], r1 25.572 + vst1.32 {d11[1]}, [r0], r1 25.573 + 25.574 + align_pop_regs 25.575 + bx lr 25.576 +endfunc 25.577 + 25.578 + .macro h264_loop_filter_chroma 25.579 + vdup.8 d22, r2 @ alpha 25.580 + vmovl.u8 q12, d24 25.581 + vabd.u8 d26, d16, d0 @ abs(p0 - q0) 25.582 + vmovl.u8 q2, d0 25.583 + vabd.u8 d28, d18, d16 @ abs(p1 - p0) 25.584 + vsubw.u8 q2, q2, d16 25.585 + vsli.16 d24, d24, #8 25.586 + vshl.i16 q2, q2, #2 25.587 + vabd.u8 d30, d2, d0 @ abs(q1 - q0) 25.588 + vaddw.u8 q2, q2, d18 25.589 + vclt.u8 d26, d26, d22 @ < alpha 25.590 + vsubw.u8 q2, q2, d2 25.591 + vdup.8 d22, r3 @ beta 25.592 + vclt.s8 d25, d24, #0 25.593 + vrshrn.i16 d4, q2, #3 25.594 + vclt.u8 d28, d28, d22 @ < beta 25.595 + vbic d26, d26, d25 25.596 + vclt.u8 d30, d30, d22 @ < beta 25.597 + vand d26, d26, d28 25.598 + vneg.s8 d25, d24 25.599 + vand d26, d26, d30 25.600 + vmin.s8 d4, d4, d24 25.601 + vmovl.u8 q14, d16 25.602 + vand d4, d4, d26 25.603 + vmax.s8 d4, d4, d25 25.604 + vmovl.u8 q11, d0 25.605 + vaddw.s8 q14, q14, d4 25.606 + vsubw.s8 q11, q11, d4 25.607 + vqmovun.s16 d16, q14 25.608 + vqmovun.s16 d0, q11 25.609 + .endm 25.610 + 25.611 +function ff_h264_v_loop_filter_chroma_neon, export=1 25.612 + h264_loop_filter_start 25.613 + 25.614 + sub r0, r0, r1, lsl #1 25.615 + vld1.64 {d18}, [r0,:64], r1 25.616 + vld1.64 {d16}, [r0,:64], r1 25.617 + vld1.64 {d0}, [r0,:64], r1 25.618 + vld1.64 {d2}, [r0,:64] 25.619 + 25.620 + h264_loop_filter_chroma 25.621 + 25.622 + sub r0, r0, r1, lsl #1 25.623 + vst1.64 {d16}, [r0,:64], r1 25.624 + vst1.64 {d0}, [r0,:64], r1 25.625 + 25.626 + bx lr 25.627 +endfunc 25.628 + 25.629 +function ff_h264_h_loop_filter_chroma_neon, export=1 25.630 + 
h264_loop_filter_start 25.631 + 25.632 + sub r0, r0, #2 25.633 + vld1.32 {d18[0]}, [r0], r1 25.634 + vld1.32 {d16[0]}, [r0], r1 25.635 + vld1.32 {d0[0]}, [r0], r1 25.636 + vld1.32 {d2[0]}, [r0], r1 25.637 + vld1.32 {d18[1]}, [r0], r1 25.638 + vld1.32 {d16[1]}, [r0], r1 25.639 + vld1.32 {d0[1]}, [r0], r1 25.640 + vld1.32 {d2[1]}, [r0], r1 25.641 + 25.642 + vtrn.16 d18, d0 25.643 + vtrn.16 d16, d2 25.644 + vtrn.8 d18, d16 25.645 + vtrn.8 d0, d2 25.646 + 25.647 + h264_loop_filter_chroma 25.648 + 25.649 + vtrn.16 d18, d0 25.650 + vtrn.16 d16, d2 25.651 + vtrn.8 d18, d16 25.652 + vtrn.8 d0, d2 25.653 + 25.654 + sub r0, r0, r1, lsl #3 25.655 + vst1.32 {d18[0]}, [r0], r1 25.656 + vst1.32 {d16[0]}, [r0], r1 25.657 + vst1.32 {d0[0]}, [r0], r1 25.658 + vst1.32 {d2[0]}, [r0], r1 25.659 + vst1.32 {d18[1]}, [r0], r1 25.660 + vst1.32 {d16[1]}, [r0], r1 25.661 + vst1.32 {d0[1]}, [r0], r1 25.662 + vst1.32 {d2[1]}, [r0], r1 25.663 + 25.664 + bx lr 25.665 +endfunc 25.666 + 25.667 + /* H.264 qpel MC */ 25.668 + 25.669 + .macro lowpass_const r 25.670 + movw \r, #5 25.671 + movt \r, #20 25.672 + vmov.32 d6[0], \r 25.673 + .endm 25.674 + 25.675 + .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 25.676 +.if \narrow 25.677 + t0 .req q0 25.678 + t1 .req q8 25.679 +.else 25.680 + t0 .req \d0 25.681 + t1 .req \d1 25.682 +.endif 25.683 + vext.8 d2, \r0, \r1, #2 25.684 + vext.8 d3, \r0, \r1, #3 25.685 + vaddl.u8 q1, d2, d3 25.686 + vext.8 d4, \r0, \r1, #1 25.687 + vext.8 d5, \r0, \r1, #4 25.688 + vaddl.u8 q2, d4, d5 25.689 + vext.8 d30, \r0, \r1, #5 25.690 + vaddl.u8 t0, \r0, d30 25.691 + vext.8 d18, \r2, \r3, #2 25.692 + vmla.i16 t0, q1, d6[1] 25.693 + vext.8 d19, \r2, \r3, #3 25.694 + vaddl.u8 q9, d18, d19 25.695 + vext.8 d20, \r2, \r3, #1 25.696 + vmls.i16 t0, q2, d6[0] 25.697 + vext.8 d21, \r2, \r3, #4 25.698 + vaddl.u8 q10, d20, d21 25.699 + vext.8 d31, \r2, \r3, #5 25.700 + vaddl.u8 t1, \r2, d31 25.701 + vmla.i16 t1, q9, d6[1] 25.702 + vmls.i16 t1, q10, d6[0] 25.703 +.if \narrow 25.704 
+ vqrshrun.s16 \d0, t0, #5 25.705 + vqrshrun.s16 \d1, t1, #5 25.706 +.endif 25.707 + .unreq t0 25.708 + .unreq t1 25.709 + .endm 25.710 + 25.711 + .macro lowpass_8_1 r0, r1, d0, narrow=1 25.712 +.if \narrow 25.713 + t0 .req q0 25.714 +.else 25.715 + t0 .req \d0 25.716 +.endif 25.717 + vext.8 d2, \r0, \r1, #2 25.718 + vext.8 d3, \r0, \r1, #3 25.719 + vaddl.u8 q1, d2, d3 25.720 + vext.8 d4, \r0, \r1, #1 25.721 + vext.8 d5, \r0, \r1, #4 25.722 + vaddl.u8 q2, d4, d5 25.723 + vext.8 d30, \r0, \r1, #5 25.724 + vaddl.u8 t0, \r0, d30 25.725 + vmla.i16 t0, q1, d6[1] 25.726 + vmls.i16 t0, q2, d6[0] 25.727 +.if \narrow 25.728 + vqrshrun.s16 \d0, t0, #5 25.729 +.endif 25.730 + .unreq t0 25.731 + .endm 25.732 + 25.733 + .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d 25.734 + vext.16 q1, \r0, \r1, #2 25.735 + vext.16 q0, \r0, \r1, #3 25.736 + vaddl.s16 q9, d2, d0 25.737 + vext.16 q2, \r0, \r1, #1 25.738 + vaddl.s16 q1, d3, d1 25.739 + vext.16 q3, \r0, \r1, #4 25.740 + vaddl.s16 q10, d4, d6 25.741 + vext.16 \r1, \r0, \r1, #5 25.742 + vaddl.s16 q2, d5, d7 25.743 + vaddl.s16 q0, \h0, \h1 25.744 + vaddl.s16 q8, \l0, \l1 25.745 + 25.746 + vshl.i32 q3, q9, #4 25.747 + vshl.i32 q9, q9, #2 25.748 + vshl.i32 q15, q10, #2 25.749 + vadd.i32 q9, q9, q3 25.750 + vadd.i32 q10, q10, q15 25.751 + 25.752 + vshl.i32 q3, q1, #4 25.753 + vshl.i32 q1, q1, #2 25.754 + vshl.i32 q15, q2, #2 25.755 + vadd.i32 q1, q1, q3 25.756 + vadd.i32 q2, q2, q15 25.757 + 25.758 + vadd.i32 q9, q9, q8 25.759 + vsub.i32 q9, q9, q10 25.760 + 25.761 + vadd.i32 q1, q1, q0 25.762 + vsub.i32 q1, q1, q2 25.763 + 25.764 + vrshrn.s32 d18, q9, #10 25.765 + vrshrn.s32 d19, q1, #10 25.766 + 25.767 + vqmovun.s16 \d, q9 25.768 + .endm 25.769 + 25.770 +function put_h264_qpel16_h_lowpass_neon_packed 25.771 + mov r4, lr 25.772 + mov ip, #16 25.773 + mov r3, #8 25.774 + bl put_h264_qpel8_h_lowpass_neon 25.775 + sub r1, r1, r2, lsl #4 25.776 + add r1, r1, #8 25.777 + mov ip, #16 25.778 + mov lr, r4 25.779 + b 
put_h264_qpel8_h_lowpass_neon 25.780 +endfunc 25.781 + 25.782 + .macro h264_qpel_h_lowpass type 25.783 +function \type\()_h264_qpel16_h_lowpass_neon 25.784 + push {lr} 25.785 + mov ip, #16 25.786 + bl \type\()_h264_qpel8_h_lowpass_neon 25.787 + sub r0, r0, r3, lsl #4 25.788 + sub r1, r1, r2, lsl #4 25.789 + add r0, r0, #8 25.790 + add r1, r1, #8 25.791 + mov ip, #16 25.792 + pop {lr} 25.793 +endfunc 25.794 + 25.795 +function \type\()_h264_qpel8_h_lowpass_neon 25.796 +1: vld1.64 {d0, d1}, [r1], r2 25.797 + vld1.64 {d16,d17}, [r1], r2 25.798 + subs ip, ip, #2 25.799 + lowpass_8 d0, d1, d16, d17, d0, d16 25.800 +.ifc \type,avg 25.801 + vld1.8 {d2}, [r0,:64], r3 25.802 + vrhadd.u8 d0, d0, d2 25.803 + vld1.8 {d3}, [r0,:64] 25.804 + vrhadd.u8 d16, d16, d3 25.805 + sub r0, r0, r3 25.806 +.endif 25.807 + vst1.64 {d0}, [r0,:64], r3 25.808 + vst1.64 {d16}, [r0,:64], r3 25.809 + bne 1b 25.810 + bx lr 25.811 +endfunc 25.812 + .endm 25.813 + 25.814 + h264_qpel_h_lowpass put 25.815 + h264_qpel_h_lowpass avg 25.816 + 25.817 + .macro h264_qpel_h_lowpass_l2 type 25.818 +function \type\()_h264_qpel16_h_lowpass_l2_neon 25.819 + push {lr} 25.820 + mov ip, #16 25.821 + bl \type\()_h264_qpel8_h_lowpass_l2_neon 25.822 + sub r0, r0, r2, lsl #4 25.823 + sub r1, r1, r2, lsl #4 25.824 + sub r3, r3, r2, lsl #4 25.825 + add r0, r0, #8 25.826 + add r1, r1, #8 25.827 + add r3, r3, #8 25.828 + mov ip, #16 25.829 + pop {lr} 25.830 +endfunc 25.831 + 25.832 +function \type\()_h264_qpel8_h_lowpass_l2_neon 25.833 +1: vld1.64 {d0, d1}, [r1], r2 25.834 + vld1.64 {d16,d17}, [r1], r2 25.835 + vld1.64 {d28}, [r3], r2 25.836 + vld1.64 {d29}, [r3], r2 25.837 + subs ip, ip, #2 25.838 + lowpass_8 d0, d1, d16, d17, d0, d1 25.839 + vrhadd.u8 q0, q0, q14 25.840 +.ifc \type,avg 25.841 + vld1.8 {d2}, [r0,:64], r2 25.842 + vrhadd.u8 d0, d0, d2 25.843 + vld1.8 {d3}, [r0,:64] 25.844 + vrhadd.u8 d1, d1, d3 25.845 + sub r0, r0, r2 25.846 +.endif 25.847 + vst1.64 {d0}, [r0,:64], r2 25.848 + vst1.64 {d1}, [r0,:64], r2 
25.849 + bne 1b 25.850 + bx lr 25.851 +endfunc 25.852 + .endm 25.853 + 25.854 + h264_qpel_h_lowpass_l2 put 25.855 + h264_qpel_h_lowpass_l2 avg 25.856 + 25.857 +function put_h264_qpel16_v_lowpass_neon_packed 25.858 + mov r4, lr 25.859 + mov r2, #8 25.860 + bl put_h264_qpel8_v_lowpass_neon 25.861 + sub r1, r1, r3, lsl #2 25.862 + bl put_h264_qpel8_v_lowpass_neon 25.863 + sub r1, r1, r3, lsl #4 25.864 + sub r1, r1, r3, lsl #2 25.865 + add r1, r1, #8 25.866 + bl put_h264_qpel8_v_lowpass_neon 25.867 + sub r1, r1, r3, lsl #2 25.868 + mov lr, r4 25.869 + b put_h264_qpel8_v_lowpass_neon 25.870 +endfunc 25.871 + 25.872 + .macro h264_qpel_v_lowpass type 25.873 +function \type\()_h264_qpel16_v_lowpass_neon 25.874 + mov r4, lr 25.875 + bl \type\()_h264_qpel8_v_lowpass_neon 25.876 + sub r1, r1, r3, lsl #2 25.877 + bl \type\()_h264_qpel8_v_lowpass_neon 25.878 + sub r0, r0, r2, lsl #4 25.879 + add r0, r0, #8 25.880 + sub r1, r1, r3, lsl #4 25.881 + sub r1, r1, r3, lsl #2 25.882 + add r1, r1, #8 25.883 + bl \type\()_h264_qpel8_v_lowpass_neon 25.884 + sub r1, r1, r3, lsl #2 25.885 + mov lr, r4 25.886 +endfunc 25.887 + 25.888 +function \type\()_h264_qpel8_v_lowpass_neon 25.889 + vld1.64 {d8}, [r1], r3 25.890 + vld1.64 {d10}, [r1], r3 25.891 + vld1.64 {d12}, [r1], r3 25.892 + vld1.64 {d14}, [r1], r3 25.893 + vld1.64 {d22}, [r1], r3 25.894 + vld1.64 {d24}, [r1], r3 25.895 + vld1.64 {d26}, [r1], r3 25.896 + vld1.64 {d28}, [r1], r3 25.897 + vld1.64 {d9}, [r1], r3 25.898 + vld1.64 {d11}, [r1], r3 25.899 + vld1.64 {d13}, [r1], r3 25.900 + vld1.64 {d15}, [r1], r3 25.901 + vld1.64 {d23}, [r1] 25.902 + 25.903 + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 25.904 + lowpass_8 d8, d9, d10, d11, d8, d10 25.905 + lowpass_8 d12, d13, d14, d15, d12, d14 25.906 + lowpass_8 d22, d23, d24, d25, d22, d24 25.907 + lowpass_8 d26, d27, d28, d29, d26, d28 25.908 + transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 25.909 + 25.910 +.ifc \type,avg 25.911 + vld1.8 {d9}, [r0,:64], r2 25.912 + vrhadd.u8 
d8, d8, d9 25.913 + vld1.8 {d11}, [r0,:64], r2 25.914 + vrhadd.u8 d10, d10, d11 25.915 + vld1.8 {d13}, [r0,:64], r2 25.916 + vrhadd.u8 d12, d12, d13 25.917 + vld1.8 {d15}, [r0,:64], r2 25.918 + vrhadd.u8 d14, d14, d15 25.919 + vld1.8 {d23}, [r0,:64], r2 25.920 + vrhadd.u8 d22, d22, d23 25.921 + vld1.8 {d25}, [r0,:64], r2 25.922 + vrhadd.u8 d24, d24, d25 25.923 + vld1.8 {d27}, [r0,:64], r2 25.924 + vrhadd.u8 d26, d26, d27 25.925 + vld1.8 {d29}, [r0,:64], r2 25.926 + vrhadd.u8 d28, d28, d29 25.927 + sub r0, r0, r2, lsl #3 25.928 +.endif 25.929 + 25.930 + vst1.64 {d8}, [r0,:64], r2 25.931 + vst1.64 {d10}, [r0,:64], r2 25.932 + vst1.64 {d12}, [r0,:64], r2 25.933 + vst1.64 {d14}, [r0,:64], r2 25.934 + vst1.64 {d22}, [r0,:64], r2 25.935 + vst1.64 {d24}, [r0,:64], r2 25.936 + vst1.64 {d26}, [r0,:64], r2 25.937 + vst1.64 {d28}, [r0,:64], r2 25.938 + 25.939 + bx lr 25.940 +endfunc 25.941 + .endm 25.942 + 25.943 + h264_qpel_v_lowpass put 25.944 + h264_qpel_v_lowpass avg 25.945 + 25.946 + .macro h264_qpel_v_lowpass_l2 type 25.947 +function \type\()_h264_qpel16_v_lowpass_l2_neon 25.948 + mov r4, lr 25.949 + bl \type\()_h264_qpel8_v_lowpass_l2_neon 25.950 + sub r1, r1, r3, lsl #2 25.951 + bl \type\()_h264_qpel8_v_lowpass_l2_neon 25.952 + sub r0, r0, r3, lsl #4 25.953 + sub ip, ip, r2, lsl #4 25.954 + add r0, r0, #8 25.955 + add ip, ip, #8 25.956 + sub r1, r1, r3, lsl #4 25.957 + sub r1, r1, r3, lsl #2 25.958 + add r1, r1, #8 25.959 + bl \type\()_h264_qpel8_v_lowpass_l2_neon 25.960 + sub r1, r1, r3, lsl #2 25.961 + mov lr, r4 25.962 +endfunc 25.963 + 25.964 +function \type\()_h264_qpel8_v_lowpass_l2_neon 25.965 + vld1.64 {d8}, [r1], r3 25.966 + vld1.64 {d10}, [r1], r3 25.967 + vld1.64 {d12}, [r1], r3 25.968 + vld1.64 {d14}, [r1], r3 25.969 + vld1.64 {d22}, [r1], r3 25.970 + vld1.64 {d24}, [r1], r3 25.971 + vld1.64 {d26}, [r1], r3 25.972 + vld1.64 {d28}, [r1], r3 25.973 + vld1.64 {d9}, [r1], r3 25.974 + vld1.64 {d11}, [r1], r3 25.975 + vld1.64 {d13}, [r1], r3 25.976 + vld1.64 
{d15}, [r1], r3 25.977 + vld1.64 {d23}, [r1] 25.978 + 25.979 + transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 25.980 + lowpass_8 d8, d9, d10, d11, d8, d9 25.981 + lowpass_8 d12, d13, d14, d15, d12, d13 25.982 + lowpass_8 d22, d23, d24, d25, d22, d23 25.983 + lowpass_8 d26, d27, d28, d29, d26, d27 25.984 + transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 25.985 + 25.986 + vld1.64 {d0}, [ip], r2 25.987 + vld1.64 {d1}, [ip], r2 25.988 + vld1.64 {d2}, [ip], r2 25.989 + vld1.64 {d3}, [ip], r2 25.990 + vld1.64 {d4}, [ip], r2 25.991 + vrhadd.u8 q0, q0, q4 25.992 + vld1.64 {d5}, [ip], r2 25.993 + vrhadd.u8 q1, q1, q6 25.994 + vld1.64 {d10}, [ip], r2 25.995 + vrhadd.u8 q2, q2, q11 25.996 + vld1.64 {d11}, [ip], r2 25.997 + vrhadd.u8 q5, q5, q13 25.998 + 25.999 +.ifc \type,avg 25.1000 + vld1.8 {d16}, [r0,:64], r3 25.1001 + vrhadd.u8 d0, d0, d16 25.1002 + vld1.8 {d17}, [r0,:64], r3 25.1003 + vrhadd.u8 d1, d1, d17 25.1004 + vld1.8 {d16}, [r0,:64], r3 25.1005 + vrhadd.u8 d2, d2, d16 25.1006 + vld1.8 {d17}, [r0,:64], r3 25.1007 + vrhadd.u8 d3, d3, d17 25.1008 + vld1.8 {d16}, [r0,:64], r3 25.1009 + vrhadd.u8 d4, d4, d16 25.1010 + vld1.8 {d17}, [r0,:64], r3 25.1011 + vrhadd.u8 d5, d5, d17 25.1012 + vld1.8 {d16}, [r0,:64], r3 25.1013 + vrhadd.u8 d10, d10, d16 25.1014 + vld1.8 {d17}, [r0,:64], r3 25.1015 + vrhadd.u8 d11, d11, d17 25.1016 + sub r0, r0, r3, lsl #3 25.1017 +.endif 25.1018 + 25.1019 + vst1.64 {d0}, [r0,:64], r3 25.1020 + vst1.64 {d1}, [r0,:64], r3 25.1021 + vst1.64 {d2}, [r0,:64], r3 25.1022 + vst1.64 {d3}, [r0,:64], r3 25.1023 + vst1.64 {d4}, [r0,:64], r3 25.1024 + vst1.64 {d5}, [r0,:64], r3 25.1025 + vst1.64 {d10}, [r0,:64], r3 25.1026 + vst1.64 {d11}, [r0,:64], r3 25.1027 + 25.1028 + bx lr 25.1029 +endfunc 25.1030 + .endm 25.1031 + 25.1032 + h264_qpel_v_lowpass_l2 put 25.1033 + h264_qpel_v_lowpass_l2 avg 25.1034 + 25.1035 +function put_h264_qpel8_hv_lowpass_neon_top 25.1036 + lowpass_const ip 25.1037 + mov ip, #12 25.1038 +1: vld1.64 {d0, d1}, [r1], r3 25.1039 + 
vld1.64 {d16,d17}, [r1], r3 25.1040 + subs ip, ip, #2 25.1041 + lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 25.1042 + vst1.64 {d22-d25}, [r4,:128]! 25.1043 + bne 1b 25.1044 + 25.1045 + vld1.64 {d0, d1}, [r1] 25.1046 + lowpass_8_1 d0, d1, q12, narrow=0 25.1047 + 25.1048 + mov ip, #-16 25.1049 + add r4, r4, ip 25.1050 + vld1.64 {d30,d31}, [r4,:128], ip 25.1051 + vld1.64 {d20,d21}, [r4,:128], ip 25.1052 + vld1.64 {d18,d19}, [r4,:128], ip 25.1053 + vld1.64 {d16,d17}, [r4,:128], ip 25.1054 + vld1.64 {d14,d15}, [r4,:128], ip 25.1055 + vld1.64 {d12,d13}, [r4,:128], ip 25.1056 + vld1.64 {d10,d11}, [r4,:128], ip 25.1057 + vld1.64 {d8, d9}, [r4,:128], ip 25.1058 + vld1.64 {d6, d7}, [r4,:128], ip 25.1059 + vld1.64 {d4, d5}, [r4,:128], ip 25.1060 + vld1.64 {d2, d3}, [r4,:128], ip 25.1061 + vld1.64 {d0, d1}, [r4,:128] 25.1062 + 25.1063 + swap4 d1, d3, d5, d7, d8, d10, d12, d14 25.1064 + transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 25.1065 + 25.1066 + swap4 d17, d19, d21, d31, d24, d26, d28, d22 25.1067 + transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 25.1068 + 25.1069 + vst1.64 {d30,d31}, [r4,:128]! 25.1070 + vst1.64 {d6, d7}, [r4,:128]! 25.1071 + vst1.64 {d20,d21}, [r4,:128]! 25.1072 + vst1.64 {d4, d5}, [r4,:128]! 25.1073 + vst1.64 {d18,d19}, [r4,:128]! 25.1074 + vst1.64 {d2, d3}, [r4,:128]! 25.1075 + vst1.64 {d16,d17}, [r4,:128]! 
25.1076 + vst1.64 {d0, d1}, [r4,:128] 25.1077 + 25.1078 + lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 25.1079 + lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 25.1080 + lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 25.1081 + lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 25.1082 + 25.1083 + vld1.64 {d16,d17}, [r4,:128], ip 25.1084 + vld1.64 {d30,d31}, [r4,:128], ip 25.1085 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 25.1086 + vld1.64 {d16,d17}, [r4,:128], ip 25.1087 + vld1.64 {d30,d31}, [r4,:128], ip 25.1088 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 25.1089 + vld1.64 {d16,d17}, [r4,:128], ip 25.1090 + vld1.64 {d30,d31}, [r4,:128], ip 25.1091 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 25.1092 + vld1.64 {d16,d17}, [r4,:128], ip 25.1093 + vld1.64 {d30,d31}, [r4,:128] 25.1094 + lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 25.1095 + 25.1096 + transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 25.1097 + 25.1098 + bx lr 25.1099 +endfunc 25.1100 + 25.1101 + .macro h264_qpel8_hv_lowpass type 25.1102 +function \type\()_h264_qpel8_hv_lowpass_neon 25.1103 + mov r10, lr 25.1104 + bl put_h264_qpel8_hv_lowpass_neon_top 25.1105 +.ifc \type,avg 25.1106 + vld1.8 {d0}, [r0,:64], r2 25.1107 + vrhadd.u8 d12, d12, d0 25.1108 + vld1.8 {d1}, [r0,:64], r2 25.1109 + vrhadd.u8 d13, d13, d1 25.1110 + vld1.8 {d2}, [r0,:64], r2 25.1111 + vrhadd.u8 d14, d14, d2 25.1112 + vld1.8 {d3}, [r0,:64], r2 25.1113 + vrhadd.u8 d15, d15, d3 25.1114 + vld1.8 {d4}, [r0,:64], r2 25.1115 + vrhadd.u8 d8, d8, d4 25.1116 + vld1.8 {d5}, [r0,:64], r2 25.1117 + vrhadd.u8 d9, d9, d5 25.1118 + vld1.8 {d6}, [r0,:64], r2 25.1119 + vrhadd.u8 d10, d10, d6 25.1120 + vld1.8 {d7}, [r0,:64], r2 25.1121 + vrhadd.u8 d11, d11, d7 25.1122 + sub r0, r0, r2, lsl #3 25.1123 +.endif 25.1124 + vst1.64 {d12}, [r0,:64], r2 25.1125 + vst1.64 {d13}, [r0,:64], r2 25.1126 + vst1.64 {d14}, [r0,:64], r2 25.1127 + vst1.64 {d15}, [r0,:64], r2 25.1128 + vst1.64 {d8}, [r0,:64], r2 25.1129 + vst1.64 {d9}, [r0,:64], r2 25.1130 + vst1.64 
{d10}, [r0,:64], r2 25.1131 + vst1.64 {d11}, [r0,:64], r2 25.1132 + 25.1133 + mov lr, r10 25.1134 + bx lr 25.1135 +endfunc 25.1136 + .endm 25.1137 + 25.1138 + h264_qpel8_hv_lowpass put 25.1139 + h264_qpel8_hv_lowpass avg 25.1140 + 25.1141 + .macro h264_qpel8_hv_lowpass_l2 type 25.1142 +function \type\()_h264_qpel8_hv_lowpass_l2_neon 25.1143 + mov r10, lr 25.1144 + bl put_h264_qpel8_hv_lowpass_neon_top 25.1145 + 25.1146 + vld1.64 {d0, d1}, [r2,:128]! 25.1147 + vld1.64 {d2, d3}, [r2,:128]! 25.1148 + vrhadd.u8 q0, q0, q6 25.1149 + vld1.64 {d4, d5}, [r2,:128]! 25.1150 + vrhadd.u8 q1, q1, q7 25.1151 + vld1.64 {d6, d7}, [r2,:128]! 25.1152 + vrhadd.u8 q2, q2, q4 25.1153 + vrhadd.u8 q3, q3, q5 25.1154 +.ifc \type,avg 25.1155 + vld1.8 {d16}, [r0,:64], r3 25.1156 + vrhadd.u8 d0, d0, d16 25.1157 + vld1.8 {d17}, [r0,:64], r3 25.1158 + vrhadd.u8 d1, d1, d17 25.1159 + vld1.8 {d18}, [r0,:64], r3 25.1160 + vrhadd.u8 d2, d2, d18 25.1161 + vld1.8 {d19}, [r0,:64], r3 25.1162 + vrhadd.u8 d3, d3, d19 25.1163 + vld1.8 {d20}, [r0,:64], r3 25.1164 + vrhadd.u8 d4, d4, d20 25.1165 + vld1.8 {d21}, [r0,:64], r3 25.1166 + vrhadd.u8 d5, d5, d21 25.1167 + vld1.8 {d22}, [r0,:64], r3 25.1168 + vrhadd.u8 d6, d6, d22 25.1169 + vld1.8 {d23}, [r0,:64], r3 25.1170 + vrhadd.u8 d7, d7, d23 25.1171 + sub r0, r0, r3, lsl #3 25.1172 +.endif 25.1173 + vst1.64 {d0}, [r0,:64], r3 25.1174 + vst1.64 {d1}, [r0,:64], r3 25.1175 + vst1.64 {d2}, [r0,:64], r3 25.1176 + vst1.64 {d3}, [r0,:64], r3 25.1177 + vst1.64 {d4}, [r0,:64], r3 25.1178 + vst1.64 {d5}, [r0,:64], r3 25.1179 + vst1.64 {d6}, [r0,:64], r3 25.1180 + vst1.64 {d7}, [r0,:64], r3 25.1181 + 25.1182 + mov lr, r10 25.1183 + bx lr 25.1184 +endfunc 25.1185 + .endm 25.1186 + 25.1187 + h264_qpel8_hv_lowpass_l2 put 25.1188 + h264_qpel8_hv_lowpass_l2 avg 25.1189 + 25.1190 + .macro h264_qpel16_hv type 25.1191 +function \type\()_h264_qpel16_hv_lowpass_neon 25.1192 + mov r9, lr 25.1193 + bl \type\()_h264_qpel8_hv_lowpass_neon 25.1194 + sub r1, r1, r3, lsl #2 25.1195 + 
bl \type\()_h264_qpel8_hv_lowpass_neon 25.1196 + sub r1, r1, r3, lsl #4 25.1197 + sub r1, r1, r3, lsl #2 25.1198 + add r1, r1, #8 25.1199 + sub r0, r0, r2, lsl #4 25.1200 + add r0, r0, #8 25.1201 + bl \type\()_h264_qpel8_hv_lowpass_neon 25.1202 + sub r1, r1, r3, lsl #2 25.1203 + mov lr, r9 25.1204 + b \type\()_h264_qpel8_hv_lowpass_neon 25.1205 +endfunc 25.1206 + 25.1207 +function \type\()_h264_qpel16_hv_lowpass_l2_neon 25.1208 + mov r9, lr 25.1209 + sub r2, r4, #256 25.1210 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon 25.1211 + sub r1, r1, r3, lsl #2 25.1212 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon 25.1213 + sub r1, r1, r3, lsl #4 25.1214 + sub r1, r1, r3, lsl #2 25.1215 + add r1, r1, #8 25.1216 + sub r0, r0, r3, lsl #4 25.1217 + add r0, r0, #8 25.1218 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon 25.1219 + sub r1, r1, r3, lsl #2 25.1220 + mov lr, r9 25.1221 + b \type\()_h264_qpel8_hv_lowpass_l2_neon 25.1222 +endfunc 25.1223 + .endm 25.1224 + 25.1225 + h264_qpel16_hv put 25.1226 + h264_qpel16_hv avg 25.1227 + 25.1228 + .macro h264_qpel8 type 25.1229 +function ff_\type\()_h264_qpel8_mc10_neon, export=1 25.1230 + lowpass_const r3 25.1231 + mov r3, r1 25.1232 + sub r1, r1, #2 25.1233 + mov ip, #8 25.1234 + b \type\()_h264_qpel8_h_lowpass_l2_neon 25.1235 +endfunc 25.1236 + 25.1237 +function ff_\type\()_h264_qpel8_mc20_neon, export=1 25.1238 + lowpass_const r3 25.1239 + sub r1, r1, #2 25.1240 + mov r3, r2 25.1241 + mov ip, #8 25.1242 + b \type\()_h264_qpel8_h_lowpass_neon 25.1243 +endfunc 25.1244 + 25.1245 +function ff_\type\()_h264_qpel8_mc30_neon, export=1 25.1246 + lowpass_const r3 25.1247 + add r3, r1, #1 25.1248 + sub r1, r1, #2 25.1249 + mov ip, #8 25.1250 + b \type\()_h264_qpel8_h_lowpass_l2_neon 25.1251 +endfunc 25.1252 + 25.1253 +function ff_\type\()_h264_qpel8_mc01_neon, export=1 25.1254 + push {lr} 25.1255 + mov ip, r1 25.1256 +\type\()_h264_qpel8_mc01: 25.1257 + lowpass_const r3 25.1258 + mov r3, r2 25.1259 + sub r1, r1, r2, lsl #1 25.1260 + vpush {d8-d15} 
25.1261 + bl \type\()_h264_qpel8_v_lowpass_l2_neon 25.1262 + vpop {d8-d15} 25.1263 + pop {pc} 25.1264 +endfunc 25.1265 + 25.1266 +function ff_\type\()_h264_qpel8_mc11_neon, export=1 25.1267 + push {r0, r1, r11, lr} 25.1268 +\type\()_h264_qpel8_mc11: 25.1269 + lowpass_const r3 25.1270 + mov r11, sp 25.1271 + bic sp, sp, #15 25.1272 + sub sp, sp, #64 25.1273 + mov r0, sp 25.1274 + sub r1, r1, #2 25.1275 + mov r3, #8 25.1276 + mov ip, #8 25.1277 + vpush {d8-d15} 25.1278 + bl put_h264_qpel8_h_lowpass_neon 25.1279 + ldrd r0, [r11] 25.1280 + mov r3, r2 25.1281 + add ip, sp, #64 25.1282 + sub r1, r1, r2, lsl #1 25.1283 + mov r2, #8 25.1284 + bl \type\()_h264_qpel8_v_lowpass_l2_neon 25.1285 + vpop {d8-d15} 25.1286 + add sp, r11, #8 25.1287 + pop {r11, pc} 25.1288 +endfunc 25.1289 + 25.1290 +function ff_\type\()_h264_qpel8_mc21_neon, export=1 25.1291 + push {r0, r1, r4, r10, r11, lr} 25.1292 +\type\()_h264_qpel8_mc21: 25.1293 + lowpass_const r3 25.1294 + mov r11, sp 25.1295 + bic sp, sp, #15 25.1296 + sub sp, sp, #(8*8+16*12) 25.1297 + sub r1, r1, #2 25.1298 + mov r3, #8 25.1299 + mov r0, sp 25.1300 + mov ip, #8 25.1301 + vpush {d8-d15} 25.1302 + bl put_h264_qpel8_h_lowpass_neon 25.1303 + mov r4, r0 25.1304 + ldrd r0, [r11] 25.1305 + sub r1, r1, r2, lsl #1 25.1306 + sub r1, r1, #2 25.1307 + mov r3, r2 25.1308 + sub r2, r4, #64 25.1309 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon 25.1310 + vpop {d8-d15} 25.1311 + add sp, r11, #8 25.1312 + pop {r4, r10, r11, pc} 25.1313 +endfunc 25.1314 + 25.1315 +function ff_\type\()_h264_qpel8_mc31_neon, export=1 25.1316 + add r1, r1, #1 25.1317 + push {r0, r1, r11, lr} 25.1318 + sub r1, r1, #1 25.1319 + b \type\()_h264_qpel8_mc11 25.1320 +endfunc 25.1321 + 25.1322 +function ff_\type\()_h264_qpel8_mc02_neon, export=1 25.1323 + push {lr} 25.1324 + lowpass_const r3 25.1325 + sub r1, r1, r2, lsl #1 25.1326 + mov r3, r2 25.1327 + vpush {d8-d15} 25.1328 + bl \type\()_h264_qpel8_v_lowpass_neon 25.1329 + vpop {d8-d15} 25.1330 + pop {pc} 25.1331 
+endfunc 25.1332 + 25.1333 +function ff_\type\()_h264_qpel8_mc12_neon, export=1 25.1334 + push {r0, r1, r4, r10, r11, lr} 25.1335 +\type\()_h264_qpel8_mc12: 25.1336 + lowpass_const r3 25.1337 + mov r11, sp 25.1338 + bic sp, sp, #15 25.1339 + sub sp, sp, #(8*8+16*12) 25.1340 + sub r1, r1, r2, lsl #1 25.1341 + mov r3, r2 25.1342 + mov r2, #8 25.1343 + mov r0, sp 25.1344 + vpush {d8-d15} 25.1345 + bl put_h264_qpel8_v_lowpass_neon 25.1346 + mov r4, r0 25.1347 + ldrd r0, [r11] 25.1348 + sub r1, r1, r3, lsl #1 25.1349 + sub r1, r1, #2 25.1350 + sub r2, r4, #64 25.1351 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon 25.1352 + vpop {d8-d15} 25.1353 + add sp, r11, #8 25.1354 + pop {r4, r10, r11, pc} 25.1355 +endfunc 25.1356 + 25.1357 +function ff_\type\()_h264_qpel8_mc22_neon, export=1 25.1358 + push {r4, r10, r11, lr} 25.1359 + mov r11, sp 25.1360 + bic sp, sp, #15 25.1361 + sub r1, r1, r2, lsl #1 25.1362 + sub r1, r1, #2 25.1363 + mov r3, r2 25.1364 + sub sp, sp, #(16*12) 25.1365 + mov r4, sp 25.1366 + vpush {d8-d15} 25.1367 + bl \type\()_h264_qpel8_hv_lowpass_neon 25.1368 + vpop {d8-d15} 25.1369 + mov sp, r11 25.1370 + pop {r4, r10, r11, pc} 25.1371 +endfunc 25.1372 + 25.1373 +function ff_\type\()_h264_qpel8_mc32_neon, export=1 25.1374 + push {r0, r1, r4, r10, r11, lr} 25.1375 + add r1, r1, #1 25.1376 + b \type\()_h264_qpel8_mc12 25.1377 +endfunc 25.1378 + 25.1379 +function ff_\type\()_h264_qpel8_mc03_neon, export=1 25.1380 + push {lr} 25.1381 + add ip, r1, r2 25.1382 + b \type\()_h264_qpel8_mc01 25.1383 +endfunc 25.1384 + 25.1385 +function ff_\type\()_h264_qpel8_mc13_neon, export=1 25.1386 + push {r0, r1, r11, lr} 25.1387 + add r1, r1, r2 25.1388 + b \type\()_h264_qpel8_mc11 25.1389 +endfunc 25.1390 + 25.1391 +function ff_\type\()_h264_qpel8_mc23_neon, export=1 25.1392 + push {r0, r1, r4, r10, r11, lr} 25.1393 + add r1, r1, r2 25.1394 + b \type\()_h264_qpel8_mc21 25.1395 +endfunc 25.1396 + 25.1397 +function ff_\type\()_h264_qpel8_mc33_neon, export=1 25.1398 + add r1, r1, #1 
25.1399 + push {r0, r1, r11, lr} 25.1400 + add r1, r1, r2 25.1401 + sub r1, r1, #1 25.1402 + b \type\()_h264_qpel8_mc11 25.1403 +endfunc 25.1404 + .endm 25.1405 + 25.1406 + h264_qpel8 put 25.1407 + h264_qpel8 avg 25.1408 + 25.1409 + .macro h264_qpel16 type 25.1410 +function ff_\type\()_h264_qpel16_mc10_neon, export=1 25.1411 + lowpass_const r3 25.1412 + mov r3, r1 25.1413 + sub r1, r1, #2 25.1414 + b \type\()_h264_qpel16_h_lowpass_l2_neon 25.1415 +endfunc 25.1416 + 25.1417 +function ff_\type\()_h264_qpel16_mc20_neon, export=1 25.1418 + lowpass_const r3 25.1419 + sub r1, r1, #2 25.1420 + mov r3, r2 25.1421 + b \type\()_h264_qpel16_h_lowpass_neon 25.1422 +endfunc 25.1423 + 25.1424 +function ff_\type\()_h264_qpel16_mc30_neon, export=1 25.1425 + lowpass_const r3 25.1426 + add r3, r1, #1 25.1427 + sub r1, r1, #2 25.1428 + b \type\()_h264_qpel16_h_lowpass_l2_neon 25.1429 +endfunc 25.1430 + 25.1431 +function ff_\type\()_h264_qpel16_mc01_neon, export=1 25.1432 + push {r4, lr} 25.1433 + mov ip, r1 25.1434 +\type\()_h264_qpel16_mc01: 25.1435 + lowpass_const r3 25.1436 + mov r3, r2 25.1437 + sub r1, r1, r2, lsl #1 25.1438 + vpush {d8-d15} 25.1439 + bl \type\()_h264_qpel16_v_lowpass_l2_neon 25.1440 + vpop {d8-d15} 25.1441 + pop {r4, pc} 25.1442 +endfunc 25.1443 + 25.1444 +function ff_\type\()_h264_qpel16_mc11_neon, export=1 25.1445 + push {r0, r1, r4, r11, lr} 25.1446 +\type\()_h264_qpel16_mc11: 25.1447 + lowpass_const r3 25.1448 + mov r11, sp 25.1449 + bic sp, sp, #15 25.1450 + sub sp, sp, #256 25.1451 + mov r0, sp 25.1452 + sub r1, r1, #2 25.1453 + mov r3, #16 25.1454 + vpush {d8-d15} 25.1455 + bl put_h264_qpel16_h_lowpass_neon 25.1456 + ldrd r0, [r11] 25.1457 + mov r3, r2 25.1458 + add ip, sp, #64 25.1459 + sub r1, r1, r2, lsl #1 25.1460 + mov r2, #16 25.1461 + bl \type\()_h264_qpel16_v_lowpass_l2_neon 25.1462 + vpop {d8-d15} 25.1463 + add sp, r11, #8 25.1464 + pop {r4, r11, pc} 25.1465 +endfunc 25.1466 + 25.1467 +function ff_\type\()_h264_qpel16_mc21_neon, export=1 25.1468 
+ push {r0, r1, r4-r5, r9-r11, lr} 25.1469 +\type\()_h264_qpel16_mc21: 25.1470 + lowpass_const r3 25.1471 + mov r11, sp 25.1472 + bic sp, sp, #15 25.1473 + sub sp, sp, #(16*16+16*12) 25.1474 + sub r1, r1, #2 25.1475 + mov r0, sp 25.1476 + vpush {d8-d15} 25.1477 + bl put_h264_qpel16_h_lowpass_neon_packed 25.1478 + mov r4, r0 25.1479 + ldrd r0, [r11] 25.1480 + sub r1, r1, r2, lsl #1 25.1481 + sub r1, r1, #2 25.1482 + mov r3, r2 25.1483 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon 25.1484 + vpop {d8-d15} 25.1485 + add sp, r11, #8 25.1486 + pop {r4-r5, r9-r11, pc} 25.1487 +endfunc 25.1488 + 25.1489 +function ff_\type\()_h264_qpel16_mc31_neon, export=1 25.1490 + add r1, r1, #1 25.1491 + push {r0, r1, r4, r11, lr} 25.1492 + sub r1, r1, #1 25.1493 + b \type\()_h264_qpel16_mc11 25.1494 +endfunc 25.1495 + 25.1496 +function ff_\type\()_h264_qpel16_mc02_neon, export=1 25.1497 + push {r4, lr} 25.1498 + lowpass_const r3 25.1499 + sub r1, r1, r2, lsl #1 25.1500 + mov r3, r2 25.1501 + vpush {d8-d15} 25.1502 + bl \type\()_h264_qpel16_v_lowpass_neon 25.1503 + vpop {d8-d15} 25.1504 + pop {r4, pc} 25.1505 +endfunc 25.1506 + 25.1507 +function ff_\type\()_h264_qpel16_mc12_neon, export=1 25.1508 + push {r0, r1, r4-r5, r9-r11, lr} 25.1509 +\type\()_h264_qpel16_mc12: 25.1510 + lowpass_const r3 25.1511 + mov r11, sp 25.1512 + bic sp, sp, #15 25.1513 + sub sp, sp, #(16*16+16*12) 25.1514 + sub r1, r1, r2, lsl #1 25.1515 + mov r0, sp 25.1516 + mov r3, r2 25.1517 + vpush {d8-d15} 25.1518 + bl put_h264_qpel16_v_lowpass_neon_packed 25.1519 + mov r4, r0 25.1520 + ldrd r0, [r11] 25.1521 + sub r1, r1, r3, lsl #1 25.1522 + sub r1, r1, #2 25.1523 + mov r2, r3 25.1524 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon 25.1525 + vpop {d8-d15} 25.1526 + add sp, r11, #8 25.1527 + pop {r4-r5, r9-r11, pc} 25.1528 +endfunc 25.1529 + 25.1530 +function ff_\type\()_h264_qpel16_mc22_neon, export=1 25.1531 + push {r4, r9-r11, lr} 25.1532 + lowpass_const r3 25.1533 + mov r11, sp 25.1534 + bic sp, sp, #15 25.1535 + sub 
r1, r1, r2, lsl #1 25.1536 + sub r1, r1, #2 25.1537 + mov r3, r2 25.1538 + sub sp, sp, #(16*12) 25.1539 + mov r4, sp 25.1540 + vpush {d8-d15} 25.1541 + bl \type\()_h264_qpel16_hv_lowpass_neon 25.1542 + vpop {d8-d15} 25.1543 + mov sp, r11 25.1544 + pop {r4, r9-r11, pc} 25.1545 +endfunc 25.1546 + 25.1547 +function ff_\type\()_h264_qpel16_mc32_neon, export=1 25.1548 + push {r0, r1, r4-r5, r9-r11, lr} 25.1549 + add r1, r1, #1 25.1550 + b \type\()_h264_qpel16_mc12 25.1551 +endfunc 25.1552 + 25.1553 +function ff_\type\()_h264_qpel16_mc03_neon, export=1 25.1554 + push {r4, lr} 25.1555 + add ip, r1, r2 25.1556 + b \type\()_h264_qpel16_mc01 25.1557 +endfunc 25.1558 + 25.1559 +function ff_\type\()_h264_qpel16_mc13_neon, export=1 25.1560 + push {r0, r1, r4, r11, lr} 25.1561 + add r1, r1, r2 25.1562 + b \type\()_h264_qpel16_mc11 25.1563 +endfunc 25.1564 + 25.1565 +function ff_\type\()_h264_qpel16_mc23_neon, export=1 25.1566 + push {r0, r1, r4-r5, r9-r11, lr} 25.1567 + add r1, r1, r2 25.1568 + b \type\()_h264_qpel16_mc21 25.1569 +endfunc 25.1570 + 25.1571 +function ff_\type\()_h264_qpel16_mc33_neon, export=1 25.1572 + add r1, r1, #1 25.1573 + push {r0, r1, r4, r11, lr} 25.1574 + add r1, r1, r2 25.1575 + sub r1, r1, #1 25.1576 + b \type\()_h264_qpel16_mc11 25.1577 +endfunc 25.1578 + .endm 25.1579 + 25.1580 + h264_qpel16 put 25.1581 + h264_qpel16 avg 25.1582 + 25.1583 +@ Biweighted prediction 25.1584 + 25.1585 + .macro biweight_16 macs, macd 25.1586 + vdup.8 d0, r4 25.1587 + vdup.8 d1, r5 25.1588 + vmov q2, q8 25.1589 + vmov q3, q8 25.1590 +1: subs ip, ip, #2 25.1591 + vld1.8 {d20-d21},[r0,:128], r2 25.1592 + \macd q2, d0, d20 25.1593 + pld [r0] 25.1594 + \macd q3, d0, d21 25.1595 + vld1.8 {d22-d23},[r1,:128], r2 25.1596 + \macs q2, d1, d22 25.1597 + pld [r1] 25.1598 + \macs q3, d1, d23 25.1599 + vmov q12, q8 25.1600 + vld1.8 {d28-d29},[r0,:128], r2 25.1601 + vmov q13, q8 25.1602 + \macd q12, d0, d28 25.1603 + pld [r0] 25.1604 + \macd q13, d0, d29 25.1605 + vld1.8 
{d30-d31},[r1,:128], r2 25.1606 + \macs q12, d1, d30 25.1607 + pld [r1] 25.1608 + \macs q13, d1, d31 25.1609 + vshl.s16 q2, q2, q9 25.1610 + vshl.s16 q3, q3, q9 25.1611 + vqmovun.s16 d4, q2 25.1612 + vqmovun.s16 d5, q3 25.1613 + vshl.s16 q12, q12, q9 25.1614 + vshl.s16 q13, q13, q9 25.1615 + vqmovun.s16 d24, q12 25.1616 + vqmovun.s16 d25, q13 25.1617 + vmov q3, q8 25.1618 + vst1.8 {d4- d5}, [r6,:128], r2 25.1619 + vmov q2, q8 25.1620 + vst1.8 {d24-d25},[r6,:128], r2 25.1621 + bne 1b 25.1622 + pop {r4-r6, pc} 25.1623 + .endm 25.1624 + 25.1625 + .macro biweight_8 macs, macd 25.1626 + vdup.8 d0, r4 25.1627 + vdup.8 d1, r5 25.1628 + vmov q1, q8 25.1629 + vmov q10, q8 25.1630 +1: subs ip, ip, #2 25.1631 + vld1.8 {d4},[r0,:64], r2 25.1632 + \macd q1, d0, d4 25.1633 + pld [r0] 25.1634 + vld1.8 {d5},[r1,:64], r2 25.1635 + \macs q1, d1, d5 25.1636 + pld [r1] 25.1637 + vld1.8 {d6},[r0,:64], r2 25.1638 + \macd q10, d0, d6 25.1639 + pld [r0] 25.1640 + vld1.8 {d7},[r1,:64], r2 25.1641 + \macs q10, d1, d7 25.1642 + pld [r1] 25.1643 + vshl.s16 q1, q1, q9 25.1644 + vqmovun.s16 d2, q1 25.1645 + vshl.s16 q10, q10, q9 25.1646 + vqmovun.s16 d4, q10 25.1647 + vmov q10, q8 25.1648 + vst1.8 {d2},[r6,:64], r2 25.1649 + vmov q1, q8 25.1650 + vst1.8 {d4},[r6,:64], r2 25.1651 + bne 1b 25.1652 + pop {r4-r6, pc} 25.1653 + .endm 25.1654 + 25.1655 + .macro biweight_4 macs, macd 25.1656 + vdup.8 d0, r4 25.1657 + vdup.8 d1, r5 25.1658 + vmov q1, q8 25.1659 + vmov q10, q8 25.1660 +1: subs ip, ip, #4 25.1661 + vld1.32 {d4[0]},[r0,:32], r2 25.1662 + vld1.32 {d4[1]},[r0,:32], r2 25.1663 + \macd q1, d0, d4 25.1664 + pld [r0] 25.1665 + vld1.32 {d5[0]},[r1,:32], r2 25.1666 + vld1.32 {d5[1]},[r1,:32], r2 25.1667 + \macs q1, d1, d5 25.1668 + pld [r1] 25.1669 + blt 2f 25.1670 + vld1.32 {d6[0]},[r0,:32], r2 25.1671 + vld1.32 {d6[1]},[r0,:32], r2 25.1672 + \macd q10, d0, d6 25.1673 + pld [r0] 25.1674 + vld1.32 {d7[0]},[r1,:32], r2 25.1675 + vld1.32 {d7[1]},[r1,:32], r2 25.1676 + \macs q10, d1, d7 25.1677 + 
pld [r1] 25.1678 + vshl.s16 q1, q1, q9 25.1679 + vqmovun.s16 d2, q1 25.1680 + vshl.s16 q10, q10, q9 25.1681 + vqmovun.s16 d4, q10 25.1682 + vmov q10, q8 25.1683 + vst1.32 {d2[0]},[r6,:32], r2 25.1684 + vst1.32 {d2[1]},[r6,:32], r2 25.1685 + vmov q1, q8 25.1686 + vst1.32 {d4[0]},[r6,:32], r2 25.1687 + vst1.32 {d4[1]},[r6,:32], r2 25.1688 + bne 1b 25.1689 + pop {r4-r6, pc} 25.1690 +2: vshl.s16 q1, q1, q9 25.1691 + vqmovun.s16 d2, q1 25.1692 + vst1.32 {d2[0]},[r6,:32], r2 25.1693 + vst1.32 {d2[1]},[r6,:32], r2 25.1694 + pop {r4-r6, pc} 25.1695 + .endm 25.1696 + 25.1697 + .macro biweight_func w 25.1698 +function biweight_h264_pixels_\w\()_neon 25.1699 + push {r4-r6, lr} 25.1700 + add r4, sp, #16 25.1701 + ldm r4, {r4-r6} 25.1702 + lsr lr, r4, #31 25.1703 + add r6, r6, #1 25.1704 + eors lr, lr, r5, lsr #30 25.1705 + orr r6, r6, #1 25.1706 + vdup.16 q9, r3 25.1707 + lsl r6, r6, r3 25.1708 + vmvn q9, q9 25.1709 + vdup.16 q8, r6 25.1710 + mov r6, r0 25.1711 + beq 10f 25.1712 + subs lr, lr, #1 25.1713 + beq 20f 25.1714 + subs lr, lr, #1 25.1715 + beq 30f 25.1716 + b 40f 25.1717 +10: biweight_\w vmlal.u8, vmlal.u8 25.1718 +20: rsb r4, r4, #0 25.1719 + biweight_\w vmlal.u8, vmlsl.u8 25.1720 +30: rsb r4, r4, #0 25.1721 + rsb r5, r5, #0 25.1722 + biweight_\w vmlsl.u8, vmlsl.u8 25.1723 +40: rsb r5, r5, #0 25.1724 + biweight_\w vmlsl.u8, vmlal.u8 25.1725 +endfunc 25.1726 + .endm 25.1727 + 25.1728 + .macro biweight_entry w, h, b=1 25.1729 +function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 25.1730 + mov ip, #\h 25.1731 +.if \b 25.1732 + b biweight_h264_pixels_\w\()_neon 25.1733 +.endif 25.1734 +endfunc 25.1735 + .endm 25.1736 + 25.1737 + biweight_entry 16, 8 25.1738 + biweight_entry 16, 16, b=0 25.1739 + biweight_func 16 25.1740 + 25.1741 + biweight_entry 8, 16 25.1742 + biweight_entry 8, 4 25.1743 + biweight_entry 8, 8, b=0 25.1744 + biweight_func 8 25.1745 + 25.1746 + biweight_entry 4, 8 25.1747 + biweight_entry 4, 2 25.1748 + biweight_entry 4, 4, b=0 25.1749 + 
@ Instantiate the biweight worker for 4-pixel-wide blocks (biweight_func is
@ a macro defined earlier in this file).
        biweight_func   4

@ Weighted prediction
@
@ Register usage shared by the weight_* loop bodies below, as set up by
@ weight_func and weight_entry:
@   r0  read pointer, post-incremented by r1 after every row load
@   r1  row stride in bytes
@   r3  weight magnitude (weight_func negates r3 before the subtracting
@       variants, so the unsigned multiply always sees a non-negative value)
@   r4  write pointer (copy of the original r0)
@   ip  number of rows still to process
@   q8  16-bit term combined with every product, q9 per-lane shift count
@ Every loop body ends in "pop {r4, pc}", i.e. it returns to the caller of
@ the exported entry point, so code placed after an expansion is only
@ reached through a label.

        @ 16 pixels wide, two rows per iteration.  \add is the instruction
        @ used to combine q8 with the widened product.
        .macro  weight_16 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2            @ flags stay live until "bne 1b"
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9            @ q9 is <= 0: rounding right shift
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2                 @ saturate to unsigned 8 bit
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm

        @ 8 pixels wide, two rows per iteration.
        .macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm

        @ 4 pixels wide.  Two rows are packed into one d register, so a full
        @ iteration handles four rows.  When fewer than four rows remain
        @ (ip - 4 < 0, which happens for the 4x2 entry point) the code
        @ branches to 2: and finishes the two rows already loaded.
        .macro  weight_4 add
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f                      @ only two rows left
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b                      @ still the flags from "subs" above
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm

        @ Common worker for one block width \w.  It is entered with ip already
        @ holding the row count (see weight_entry).
        @   r0 = pixel pointer, r1 = stride, r2 = shift amount, r3 = weight,
        @   first stack argument = value shifted left by r2 and broadcast to q8
        @   NOTE(review): r2 is presumably log2_denom and the stack argument
        @   the offset; confirm against the C prototype, which is not part of
        @   this file.
        @ Four loop variants are emitted back to back and selected at run time:
        @   r2 >  1, r3 >= 0 : vhadd, per-lane shift 1 - r2
        @   r2 >  1, r3 <  0 : vhsub, per-lane shift 1 - r2, r3 negated
        @   r2 <= 1, r3 >= 0 : vadd,  per-lane shift -r2
        @   r2 <= 1, r3 <  0 : vsub,  per-lane shift -r2,    r3 negated
        @ The halving add/sub already divides by two, hence one bit less of
        @ shift in the first two variants.
        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]           @ first stack argument (8 bytes were just pushed)
        cmp             r2,  #1                 @ consumed by "ble 20f"; nothing in between sets flags
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4
        mov             r4,  r0                 @ r4 becomes the write pointer
        ble             20f
        rsb             lr,  r2,  #1            @ lr = 1 - r2
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0            @ use the magnitude of a negative weight
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0            @ lr = -r2
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f                     @ nearest following 10: label
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
endfunc
        .endm

        @ Exported entry point for a \w x \h block: load the row count into ip
        @ and branch to the shared worker.  With b=0 no branch is emitted, so
        @ that entry must be placed directly in front of the matching
        @ weight_func expansion and runs straight into it (assuming endfunc,
        @ which comes from asm.S, emits no instructions; TODO confirm).
        .macro  weight_entry w,  h,  b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
endfunc
        .endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .text

@ 4x4 inverse transform of the 16 coefficients at r1, added to the 4x4
@ pixel block at r0.
@   r0 = destination pixels (4 bytes per row, 32-bit aligned)
@   r1 = 16 signed 16-bit coefficients (128-bit aligned)
@   r2 = destination stride in bytes
@ Uses only r0, r2 and NEON registers d0-d7, d16-d19.
function ff_h264_idct_add_neon, export=1
        vld1.64         {d0-d3},  [r1,:128]

        @ first 1-D pass
        vswp            d1,  d2
        vadd.i16        d4,  d0,  d1
        vshr.s16        q8,  q1,  #1
        vsub.i16        d5,  d0,  d1
        vadd.i16        d6,  d2,  d17
        vsub.i16        d7,  d16, d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        @ rearrange the 4x4 block between the two passes
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2

        @ second 1-D pass, interleaved with loading the four destination
        @ rows; the lane order used for the loads (d18[0], d19[1], d18[1],
        @ d19[0]) is mirrored by the stores at the end
        vadd.i16        d4,  d0,  d3
        vld1.32         {d18[0]}, [r0,:32], r2
        vswp            d1,  d3
        vshr.s16        q8,  q1,  #1
        vld1.32         {d19[1]}, [r0,:32], r2
        vsub.i16        d5,  d0,  d1
        vld1.32         {d18[1]}, [r0,:32], r2
        vadd.i16        d6,  d16, d3
        vld1.32         {d19[0]}, [r0,:32], r2
        vsub.i16        d7,  d2,  d17
        sub             r0,  r0,  r2, lsl #2    @ rewind to the first row
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vrshr.s16       q0,  q0,  #6            @ rounding shift: (x + 32) >> 6
        vrshr.s16       q1,  q1,  #6

        vaddw.u8        q0,  q0,  d18           @ add the existing pixels
        vaddw.u8        q1,  q1,  d19

        vqmovun.s16     d0,  q0                 @ saturate to 0..255
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc

@ DC-only variant: the single 16-bit coefficient at r1 is rounded
@ ((x + 32) >> 6) and added, with saturation, to every pixel of the 4x4
@ block at r0 (stride r2).
@ Uses only r0, r2 and NEON registers d0-d5.
function ff_h264_idct_dc_add_neon, export=1
        vld1.16         {d2[],d3[]}, [r1,:16]   @ broadcast the coefficient to all lanes of q1
        vrshr.s16       q1,  q1,  #6
        vld1.32         {d0[0]},  [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vld1.32         {d1[0]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q1,  q1,  d1
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q1
        sub             r0,  r0,  r2, lsl #2    @ rewind to the first row
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc

@ Run the 4x4 transform over 16 consecutive coefficient blocks.
@   r0 = destination base pointer
@   r1 = table of 16 32-bit offsets added to the base, one per block
@   r2 = coefficients, 32 bytes (16 x int16) per block
@   r3 = destination stride
@   first stack argument = byte table indexed through scan8[]
@   NOTE(review): presumably the non-zero coefficient counts; confirm
@   against the C caller.
@ Per block, with c = table[scan8[i]]:
@   c == 0                           -> block is skipped
@   c == 1 and first coefficient != 0 -> ff_h264_idct_dc_add_neon
@   otherwise                        -> ff_h264_idct_add_neon
function ff_h264_idct_add16_neon, export=1
        push            {r4-r8,lr}
        mov             r4,  r0
        mov             r5,  r1
        mov             r1,  r2
        mov             r2,  r3
        ldr             r6,  [sp, #24]          @ first stack argument (24 bytes were just pushed)
        movrel          r7,  scan8
        mov             ip,  #16
1:      ldrb            r8,  [r7], #1
        ldr             r0,  [r5], #4
        ldrb            r8,  [r6, r8]
        subs            r8,  r8,  #1
        blt             2f                      @ c == 0: nothing to add
        ldrsh           lr,  [r1]               @ first coefficient of the block
        add             r0,  r0,  r4
        movne           lr,  #0                 @ flags still from "subs": c != 1 forces the full transform
        cmp             lr,  #0
        adrne           lr,  ff_h264_idct_dc_add_neon
        adreq           lr,  ff_h264_idct_add_neon
        blx             lr
2:      subs            ip,  ip,  #1
        add             r1,  r1,  #32           @ next coefficient block (does not touch the flags)
        bne             1b
        pop             {r4-r8,pc}
endfunc

@ Same arguments as ff_h264_idct_add16_neon, different selection rule.
@ Per block, with c = table[scan8[i]]:
@   c != 0                            -> ff_h264_idct_add_neon
@   c == 0 and first coefficient != 0 -> ff_h264_idct_dc_add_neon
@   otherwise                         -> block is skipped
function ff_h264_idct_add16intra_neon, export=1
        push            {r4-r8,lr}
        mov             r4,  r0
        mov             r5,  r1
        mov             r1,  r2
        mov             r2,  r3
        ldr             r6,  [sp, #24]
        movrel          r7,  scan8
        mov             ip,  #16
1:      ldrb            r8,  [r7], #1
        ldr             r0,  [r5], #4
        ldrb            r8,  [r6, r8]
        add             r0,  r0,  r4
        cmp             r8,  #0
        ldrsh           r8,  [r1]               @ r8 now holds the first coefficient
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0                 @ only when c == 0: is there a DC value at all?
        blxne           lr
        subs            ip,  ip,  #1
        add             r1,  r1,  #32
        bne             1b
        pop             {r4-r8,pc}
endfunc

@ Transform the eight blocks that follow the first sixteen.
@   r0 = pointer to two destination base pointers (loaded into r4 and r9)
@   r1 = offset table; entries 16..23 are used
@   r2 = coefficients; blocks 16..23 are used
@   r3 = destination stride
@   first stack argument = byte table indexed through scan8[16..23]
@ Block selection follows ff_h264_idct_add16intra_neon.
@
@ scan8[16..23] consists of two groups of four entries, so the first four
@ blocks must go to the first destination pointer and the last four to the
@ second one.  The loop counter therefore runs from 7 down to 0 and bit 2
@ of it selects the destination: 7..4 -> r4, 3..0 -> r9.  (Counting 8..1
@ with the same bit test would send only the first block to r4, the next
@ four to r9 and the last three back to r4.)
function ff_h264_idct_add8_neon, export=1
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}            @ r4 = first pointer, r9 = second pointer
        add             r5,  r1,  #16*4         @ skip 16 offsets
        add             r1,  r2,  #16*32        @ skip 16 coefficient blocks
        mov             r2,  r3
        ldr             r6,  [sp, #32]          @ first stack argument (32 bytes were just pushed)
        movrel          r7,  scan8+16
        mov             ip,  #7
1:      ldrb            r8,  [r7], #1
        ldr             r0,  [r5], #4
        ldrb            r8,  [r6, r8]
        tst             ip,  #4
        addne           r0,  r0,  r4            @ ip = 7..4: first four blocks
        addeq           r0,  r0,  r9            @ ip = 3..0: last four blocks
        cmp             r8,  #0
        ldrsh           r8,  [r1]
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0
        blxne           lr
        subs            ip,  ip,  #1
        add             r1,  r1,  #32
        bge             1b                      @ runs for ip = 7 .. 0, eight blocks in total
        pop             {r4-r10,pc}
endfunc

        .section .rodata
@ Index (column + row*8) into the byte table passed on the stack, one entry
@ per 4x4 block: sixteen entries used by the add16 functions followed by
@ eight entries used by ff_h264_idct_add8_neon.
scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8
        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8
        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8
        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8
        .byte           1+1*8, 2+1*8
        .byte           1+2*8, 2+2*8
        .byte           1+4*8, 2+4*8
        .byte           1+5*8, 2+5*8
27.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 27.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/h264pred_init_arm.c Mon Aug 27 12:09:56 2012 +0200 27.3 @@ -0,0 +1,75 @@ 27.4 +/* 27.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> 27.6 + * 27.7 + * This file is part of FFmpeg. 27.8 + * 27.9 + * FFmpeg is free software; you can redistribute it and/or 27.10 + * modify it under the terms of the GNU Lesser General Public 27.11 + * License as published by the Free Software Foundation; either 27.12 + * version 2.1 of the License, or (at your option) any later version. 27.13 + * 27.14 + * FFmpeg is distributed in the hope that it will be useful, 27.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 27.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 27.17 + * Lesser General Public License for more details. 27.18 + * 27.19 + * You should have received a copy of the GNU Lesser General Public 27.20 + * License along with FFmpeg; if not, write to the Free Software 27.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 27.22 + */ 27.23 + 27.24 +#include <stdint.h> 27.25 + 27.26 +#include "libavcodec/h264pred.h" 27.27 + 27.28 +void ff_pred16x16_vert_neon(uint8_t *src, int stride); 27.29 +void ff_pred16x16_hor_neon(uint8_t *src, int stride); 27.30 +void ff_pred16x16_plane_neon(uint8_t *src, int stride); 27.31 +void ff_pred16x16_dc_neon(uint8_t *src, int stride); 27.32 +void ff_pred16x16_128_dc_neon(uint8_t *src, int stride); 27.33 +void ff_pred16x16_left_dc_neon(uint8_t *src, int stride); 27.34 +void ff_pred16x16_top_dc_neon(uint8_t *src, int stride); 27.35 + 27.36 +void ff_pred8x8_vert_neon(uint8_t *src, int stride); 27.37 +void ff_pred8x8_hor_neon(uint8_t *src, int stride); 27.38 +void ff_pred8x8_plane_neon(uint8_t *src, int stride); 27.39 +void ff_pred8x8_dc_neon(uint8_t *src, int stride); 27.40 +void ff_pred8x8_128_dc_neon(uint8_t *src, int stride); 27.41 +void ff_pred8x8_left_dc_neon(uint8_t 
*src, int stride); 27.42 +void ff_pred8x8_top_dc_neon(uint8_t *src, int stride); 27.43 +void ff_pred8x8_l0t_dc_neon(uint8_t *src, int stride); 27.44 +void ff_pred8x8_0lt_dc_neon(uint8_t *src, int stride); 27.45 +void ff_pred8x8_l00_dc_neon(uint8_t *src, int stride); 27.46 +void ff_pred8x8_0l0_dc_neon(uint8_t *src, int stride); 27.47 + 27.48 +#if HAVE_NEON 27.49 +static void ff_h264_pred_init_neon(H264PredContext *h) 27.50 +{ 27.51 + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon; 27.52 + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon; 27.53 + h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_neon; 27.54 + h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon; 27.55 + 27.56 + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon; 27.57 + h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon; 27.58 + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon; 27.59 + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon; 27.60 + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon; 27.61 + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon; 27.62 + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon; 27.63 + 27.64 + 27.65 + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon; 27.66 + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon; 27.67 + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon; 27.68 + h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon; 27.69 + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon; 27.70 + h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon; 27.71 + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; 27.72 +} 27.73 +#endif 27.74 + 27.75 +void ff_h264_pred_init_arm(H264PredContext *h) 27.76 +{ 27.77 + if (HAVE_NEON) ff_h264_pred_init_neon(h); 27.78 +}
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

@ All exported functions in this file take
@   r0 = pointer to the top-left pixel of the block to fill
@   r1 = row stride in bytes
@ and read their neighbours from the row at r0 - r1 and/or the column at
@ r0 - 1.  Only r0-r3 and NEON registers d0-d7 / d16-d17 are written.

        @ Load a column of bytes into the lanes of \rd: one byte per lane
        @ from [\rs], advancing \rs by \rt after each load.
        @   n == 8            : lanes 0-7
        @   n != 8, hi == 0   : lanes 0-3 only
        @   n != 8, hi == 1   : lanes 4-7 only
        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
        vld1.8          {\rd[0]}, [\rs], \rt
        vld1.8          {\rd[1]}, [\rs], \rt
        vld1.8          {\rd[2]}, [\rs], \rt
        vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        vld1.8          {\rd[4]}, [\rs], \rt
        vld1.8          {\rd[5]}, [\rs], \rt
        vld1.8          {\rd[6]}, [\rs], \rt
        vld1.8          {\rd[7]}, [\rs], \rt
.endif
        .endm

        @ Sum the 16 unsigned bytes held in \rl and \rh.  The callers in this
        @ file pass the low and high half of \dq as \dl and \dh; afterwards
        @ every 16-bit lane of \dl holds the total.
        .macro add16x8  dq,  dl,  dh,  rl,  rh
        vaddl.u8        \dq, \rl, \rh
        vadd.u16        \dl, \dl, \dh
        vpadd.u16       \dl, \dl, \dl
        vpadd.u16       \dl, \dl, \dl
        .endm

@ Fill the 16x16 block with the constant 128.
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred16x16_dc_end
endfunc

@ Fill the 16x16 block with the rounded average of the 16 pixels above it.
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {q0},     [r2,:128]
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4            @ (sum + 8) >> 4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc

@ Fill the 16x16 block with the rounded average of the 16 pixels to its left.
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1
        ldcol.8         d1,  r2,  r1
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4            @ (sum + 8) >> 4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc

@ Fill the 16x16 block with the rounded average of the 16 pixels above and
@ the 16 pixels to the left.  The label .L_pred16x16_dc_end is the common
@ tail of all 16x16 DC variants: it stores q0 to 16 rows and returns.
function ff_pred16x16_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {q0},     [r2,:128]
        sub             r2,  r0,  #1
        ldcol.8         d2,  r2,  r1
        ldcol.8         d3,  r2,  r1
        vaddl.u8        q0,  d0,  d1
        vaddl.u8        q1,  d2,  d3
        vadd.u16        q0,  q0,  q1
        vadd.u16        d0,  d0,  d1
        vpadd.u16       d0,  d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #5            @ (sum + 16) >> 5
        vdup.8          q0,  d0[0]
.L_pred16x16_dc_end:
        mov             r3,  #8                 @ 8 iterations x 2 rows
6:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc

@ Each of the 16 rows is filled with the pixel immediately to its left.
function ff_pred16x16_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #16
1:      vld1.8          {d0[],d1[]},[r2], r1    @ broadcast one byte to all 16 lanes
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ The row above the block is copied into all 16 rows.
function ff_pred16x16_vert_neon, export=1
        sub             r0,  r0,  r1
        vld1.8          {q0},     [r0,:128], r1 @ post-increment leaves r0 at the first row
        mov             r3,  #8
1:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ 16x16 plane prediction.  Reads the row above (starting one pixel to the
@ left of the block) and the left column (skipping one row in the middle),
@ weights the differences with p16weight, and then writes 16 rows using a
@ running 16-bit accumulator that is narrowed with a saturating shift by 5.
@ NOTE(review): the derivation of the gradient terms is not spelled out
@ here; check against the C reference before changing any instruction.
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1
        add             r2,  r3,  #8
        sub             r3,  r3,  #1
        vld1.8          {d0},     [r3]
        vld1.8          {d2},     [r2,:64], r1
        ldcol.8         d1,  r3,  r1
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1
        vrev64.8        q0,  q0
        vaddl.u8        q8,  d2,  d3
        vsubl.u8        q2,  d2,  d0
        vsubl.u8        q3,  d3,  d1
        movrel          r3,  p16weight
        vld1.8          {q0},     [r3,:128]
        vmul.s16        q2,  q2,  q0
        vmul.s16        q3,  q3,  q0
        vadd.i16        d4,  d4,  d5
        vadd.i16        d5,  d6,  d7
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d4,  d4,  d4
        vshl.i16        d5,  d4,  #2
        vaddl.s16       q2,  d4,  d5
        vrshrn.s32      d4,  q2,  #6
        mov             r3,  #0
        vtrn.16         d4,  d5
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #3
        vrev64.16       d16, d17
        vsub.i16        d3,  d3,  d2
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3
        vshl.i16        d3,  d4,  #4
        vext.16         q0,  q0,  q0,  #7
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0], r3
        vmul.i16        q0,  q0,  d4[0]
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3
        vadd.i16        q1,  q1,  q0
        vadd.i16        q3,  q3,  q2
        mov             r3,  #16
1:
        vqshrun.s16     d0,  q1,  #5            @ left 8 pixels of the row
        vadd.i16        q1,  q1,  q2
        vqshrun.s16     d1,  q1,  #5            @ right 8 pixels of the row
        vadd.i16        q1,  q1,  q3
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

        .section .rodata
        .align          4
@ Weights 1..8, used by both plane predictors.
p16weight:
        .short          1,2,3,4,5,6,7,8

        .text

@ Each of the 8 rows is filled with the pixel immediately to its left.
function ff_pred8x8_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #8
1:      vld1.8          {d0[]},   [r2], r1      @ broadcast one byte to all 8 lanes
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ The row above the block is copied into all 8 rows.
function ff_pred8x8_vert_neon, export=1
        sub             r0,  r0,  r1
        vld1.8          {d0},     [r0,:64], r1  @ post-increment leaves r0 at the first row
        mov             r3,  #4
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ 8x8 plane prediction; same structure as ff_pred16x16_plane_neon with
@ 4-pixel neighbour halves and one 8-pixel store per row.
@ NOTE(review): as for the 16x16 version, the gradient arithmetic should be
@ checked against the C reference before it is modified.
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1
        add             r2,  r3,  #4
        sub             r3,  r3,  #1
        vld1.32         {d0[0]},  [r3]
        vld1.32         {d2[0]},  [r2,:32], r1
        ldcol.8         d0,  r3,  r1,  4,  hi=1
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1,  4
        vaddl.u8        q8,  d2,  d3
        vrev32.8        d0,  d0
        vtrn.32         d2,  d3
        vsubl.u8        q2,  d2,  d0
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]
        vmul.s16        d4,  d4,  d0
        vmul.s16        d5,  d5,  d0
        vpadd.i16       d4,  d4,  d5
        vpaddl.s16      d4,  d4
        vshl.i32        d5,  d4,  #4
        vadd.s32        d4,  d4,  d5
        vrshrn.s32      d4,  q2,  #5
        mov             r3,  #0
        vtrn.16         d4,  d5
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #2
        vrev64.16       d16, d16
        vsub.i16        d3,  d3,  d2
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3
        vshl.i16        d3,  d4,  #3
        vext.16         q0,  q0,  q0,  #7
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0], r3
        vmul.i16        q0,  q0,  d4[0]
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3
        vadd.i16        q1,  q1,  q0
        vadd.i16        q3,  q3,  q2
        mov             r3,  #8
1:
        vqshrun.s16     d0,  q1,  #5
        vadd.i16        q1,  q1,  q3
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ The 8x8 DC variants below all finish at .L_pred8x8_dc_end, which stores
@ d0 to rows 0-3 and d1 to rows 4-7.  Within each of the two registers the
@ low 4 bytes are the left half of the row and the high 4 bytes the right
@ half, so the four 4x4 quadrants can carry different values.

@ Fill the 8x8 block with the constant 128.
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred8x8_dc_end
endfunc

@ Uses the 8 pixels above the block: the left 4 columns get the rounded
@ average of the left 4 top pixels, the right 4 columns that of the right
@ 4 top pixels; all 8 rows are identical.
function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2            @ (sum of 4 + 2) >> 2
        vdup.8          d1,  d0[1]
        vdup.8          d0,  d0[0]
        vtrn.32         d0,  d1                 @ d0 = d1 = {left value x4, right value x4}
        b               .L_pred8x8_dc_end
endfunc

@ Uses the 8 pixels left of the block: rows 0-3 get the rounded average of
@ the upper 4 left pixels, rows 4-7 that of the lower 4.
function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2            @ (sum of 4 + 2) >> 2
        vdup.8          d1,  d0[1]
        vdup.8          d0,  d0[0]
        b               .L_pred8x8_dc_end
endfunc

@ Uses both the 8 pixels above (T) and the 8 pixels to the left (L).
@ Resulting quadrant values:
@   top-left     = (T[0..3] + L[0..3] + 4) >> 3
@   top-right    = (T[4..7] + 2) >> 2
@   bottom-left  = (L[4..7] + 2) >> 2
@   bottom-right = (T[4..7] + L[4..7] + 4) >> 3
function ff_pred8x8_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1            @ d0 = {T[0..3], L[0..3], T[4..7], L[4..7]} sums
        vpadd.u16       d1,  d0,  d0            @ d1 = {T[0..3]+L[0..3], T[4..7]+L[4..7], ...}
        vrshrn.u16      d2,  q0,  #3
        vrshrn.u16      d3,  q0,  #2
        vdup.8          d0,  d2[4]
        vdup.8          d1,  d3[3]
        vdup.8          d4,  d3[2]
        vdup.8          d5,  d2[5]
        vtrn.32         q0,  q2                 @ interleave left/right quadrant values per row
.L_pred8x8_dc_end:
        mov             r3,  #4
        add             r2,  r0,  r1,  lsl #2   @ r2 = start of row 4
6:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r2,:64], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc

@ Uses the 8 pixels above and only the upper 4 pixels of the left column
@ (lanes 4-7 of d1 are not loaded; the values derived from them are not
@ selected by the vdup instructions below).
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1,  4
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d2,  q0,  #3
        vrshrn.u16      d3,  q0,  #2
        vdup.8          d0,  d2[4]
        vdup.8          d1,  d3[0]
        vdup.8          q2,  d3[2]
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
endfunc

@ Uses only the upper 4 pixels of the left column: rows 0-3 get their
@ rounded average, rows 4-7 are filled with 128.
function ff_pred8x8_l00_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1,  4
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2
        vmov.i8         d1,  #128
        vdup.8          d0,  d0[0]
        b               .L_pred8x8_dc_end
endfunc

@ Uses the 8 pixels above and only the lower 4 pixels of the left column
@ (rows 4-7, loaded into lanes 4-7 of d1).
function ff_pred8x8_0lt_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1            @ r2 = left neighbour of row 4
        ldcol.8         d1,  r2,  r1,  4,  hi=1
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d3,  q0,  #2
        vrshrn.u16      d2,  q0,  #3
        vdup.8          d0,  d3[0]
        vdup.8          d1,  d3[3]
        vdup.8          d4,  d3[2]
        vdup.8          d5,  d2[5]
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
endfunc

@ Uses only the lower 4 pixels of the left column: rows 0-3 are filled
@ with 128, rows 4-7 get the rounded average of those 4 pixels.
function ff_pred8x8_0l0_dc_neon, export=1
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1            @ r2 = left neighbour of row 4
        ldcol.8         d1,  r2,  r1,  4
        vpaddl.u8       d2,  d1
        vpadd.u16       d2,  d2,  d2
        vrshrn.u16      d1,  q1,  #2
        vmov.i8         d0,  #128
        vdup.8          d1,  d1[0]
        b               .L_pred8x8_dc_end
endfunc
29.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 29.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/int_neon.S Mon Aug 27 12:09:56 2012 +0200 29.3 @@ -0,0 +1,118 @@ 29.4 +/* 29.5 + * ARM NEON optimised integer operations 29.6 + * Copyright (c) 2009 Kostya Shishkov 29.7 + * 29.8 + * This file is part of FFmpeg. 29.9 + * 29.10 + * FFmpeg is free software; you can redistribute it and/or 29.11 + * modify it under the terms of the GNU Lesser General Public 29.12 + * License as published by the Free Software Foundation; either 29.13 + * version 2.1 of the License, or (at your option) any later version. 29.14 + * 29.15 + * FFmpeg is distributed in the hope that it will be useful, 29.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 29.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 29.18 + * Lesser General Public License for more details. 29.19 + * 29.20 + * You should have received a copy of the GNU Lesser General Public 29.21 + * License along with FFmpeg; if not, write to the Free Software 29.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 29.23 + */ 29.24 + 29.25 +#include "asm.S" 29.26 + 29.27 + preserve8 29.28 + .fpu neon 29.29 + .text 29.30 +@ r0, r1 = int16 vectors (r1 16-byte aligned), r2 = element count (non-zero multiple of 16), r3 = right shift applied to each 32-bit product (0 = none); returns the 32-bit sum in r0 29.31 +function ff_scalarproduct_int16_neon, export=1 29.32 + vmov.i16 q0, #0 29.33 + vmov.i16 q1, #0 29.34 + vmov.i16 q2, #0 29.35 + vmov.i16 q3, #0 29.36 + negs r3, r3 29.37 + beq 2f 29.38 + 29.39 + vdup.s32 q12, r3 @ q12 = -shift in every lane (vshl by a negative count shifts right); must stay live for the whole loop 29.40 +1: vld1.16 {d16-d17}, [r0]! 29.41 + vld1.16 {d20-d21}, [r1,:128]! 29.42 + vmull.s16 q13, d16, d20 29.43 + vld1.16 {d18-d19}, [r0]! 29.44 + vmull.s16 q14, d17, d21 29.45 + vld1.16 {d22-d23}, [r1,:128]! 
29.46 + vmull.s16 q15, d18, d22 29.47 + vmull.s16 q8, d19, d23 @ d16/d17 are dead here, so q8 can hold the fourth product 29.48 + vshl.s32 q13, q13, q12 29.49 + vshl.s32 q14, q14, q12 29.50 + vadd.s32 q0, q0, q13 29.51 + vshl.s32 q15, q15, q12 29.52 + vadd.s32 q1, q1, q14 29.53 + vshl.s32 q8, q8, q12 29.54 + vadd.s32 q2, q2, q15 29.55 + vadd.s32 q3, q3, q8 29.56 + subs r2, r2, #16 29.57 + bne 1b 29.58 + b 3f 29.59 + 29.60 +2: vld1.16 {d16-d17}, [r0]! 29.61 + vld1.16 {d20-d21}, [r1,:128]! 29.62 + vmlal.s16 q0, d16, d20 29.63 + vld1.16 {d18-d19}, [r0]! 29.64 + vmlal.s16 q1, d17, d21 29.65 + vld1.16 {d22-d23}, [r1,:128]! 29.66 + vmlal.s16 q2, d18, d22 29.67 + vmlal.s16 q3, d19, d23 29.68 + subs r2, r2, #16 29.69 + bne 2b 29.70 + 29.71 +3: vpadd.s32 d16, d0, d1 29.72 + vpadd.s32 d17, d2, d3 29.73 + vpadd.s32 d18, d4, d5 @ caller-saved scratch only: d8-d15 are callee-saved under AAPCS 29.74 + vpadd.s32 d19, d6, d7 29.75 + vpadd.s32 d0, d16, d17 29.76 + vpadd.s32 d1, d18, d19 29.77 + vpadd.s32 d2, d0, d1 29.78 + vpaddl.s32 d3, d2 29.79 + vmov.32 r0, d3[0] 29.80 + bx lr 29.81 +endfunc 29.82 +@ Returns sum(v0[i] * v1[i]) in r0 and rewrites v0[i] = v0[i] + mul * v2[i] in place; mul is read from the stack, order (r3) is consumed 16 elements per iteration 29.83 +@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul) 29.84 +function ff_scalarproduct_and_madd_int16_neon, export=1 29.85 + vld1.16 {d28[],d29[]}, [sp] 29.86 + vmov.i16 q0, #0 29.87 + vmov.i16 q1, #0 29.88 + vmov.i16 q2, #0 29.89 + vmov.i16 q3, #0 29.90 + mov r12, r0 29.91 + 29.92 +1: vld1.16 {d16-d17}, [r0,:128]! 29.93 + vld1.16 {d18-d19}, [r1]! 29.94 + vld1.16 {d20-d21}, [r2]! 29.95 + vld1.16 {d22-d23}, [r0,:128]! 29.96 + vld1.16 {d24-d25}, [r1]! 29.97 + vld1.16 {d26-d27}, [r2]! 
29.98 + vmul.s16 q10, q10, q14 29.99 + vmul.s16 q13, q13, q14 29.100 + vmlal.s16 q0, d16, d18 29.101 + vmlal.s16 q1, d17, d19 29.102 + vadd.s16 q10, q8, q10 29.103 + vadd.s16 q13, q11, q13 29.104 + vmlal.s16 q2, d22, d24 29.105 + vmlal.s16 q3, d23, d25 29.106 + vst1.16 {q10}, [r12,:128]! 29.107 + subs r3, r3, #16 29.108 + vst1.16 {q13}, [r12,:128]! 
29.109 + bne 1b 29.110 + 29.111 + vpadd.s32 d16, d0, d1 29.112 + vpadd.s32 d17, d2, d3 29.113 + vpadd.s32 d18, d4, d5 @ caller-saved scratch only: d8-d15 are callee-saved under AAPCS 29.114 + vpadd.s32 d19, d6, d7 29.115 + vpadd.s32 d0, d16, d17 29.116 + vpadd.s32 d1, d18, d19 29.117 + vpadd.s32 d2, d0, d1 29.118 + vpaddl.s32 d3, d2 29.119 + vmov.32 r0, d3[0] 29.120 + bx lr 29.121 +endfunc
30.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 30.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/jrevdct_arm.S Mon Aug 27 12:09:56 2012 +0200 30.3 @@ -0,0 +1,388 @@ 30.4 +/* 30.5 + C-like prototype : 30.6 + void j_rev_dct_arm(DCTBLOCK data) 30.7 + 30.8 + With DCTBLOCK being a pointer to an array of 64 'signed shorts' 30.9 + 30.10 + Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) 30.11 + 30.12 + Permission is hereby granted, free of charge, to any person obtaining a copy 30.13 + of this software and associated documentation files (the "Software"), to deal 30.14 + in the Software without restriction, including without limitation the rights 30.15 + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 30.16 + copies of the Software, and to permit persons to whom the Software is 30.17 + furnished to do so, subject to the following conditions: 30.18 + 30.19 + The above copyright notice and this permission notice shall be included in 30.20 + all copies or substantial portions of the Software. 30.21 + 30.22 + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30.23 + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30.24 + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 30.25 + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 30.26 + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30.27 + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
30.28 + 30.29 +*/ 30.30 + 30.31 +#include "asm.S" 30.32 + 30.33 +#define FIX_0_298631336 2446 30.34 +#define FIX_0_541196100 4433 30.35 +#define FIX_0_765366865 6270 30.36 +#define FIX_1_175875602 9633 30.37 +#define FIX_1_501321110 12299 30.38 +#define FIX_2_053119869 16819 30.39 +#define FIX_3_072711026 25172 30.40 +#define FIX_M_0_390180644 -3196 30.41 +#define FIX_M_0_899976223 -7373 30.42 +#define FIX_M_1_847759065 -15137 30.43 +#define FIX_M_1_961570560 -16069 30.44 +#define FIX_M_2_562915447 -20995 30.45 +#define FIX_0xFFFF 0xFFFF 30.46 + 30.47 +#define FIX_0_298631336_ID 0 30.48 +#define FIX_0_541196100_ID 4 30.49 +#define FIX_0_765366865_ID 8 30.50 +#define FIX_1_175875602_ID 12 30.51 +#define FIX_1_501321110_ID 16 30.52 +#define FIX_2_053119869_ID 20 30.53 +#define FIX_3_072711026_ID 24 30.54 +#define FIX_M_0_390180644_ID 28 30.55 +#define FIX_M_0_899976223_ID 32 30.56 +#define FIX_M_1_847759065_ID 36 30.57 +#define FIX_M_1_961570560_ID 40 30.58 +#define FIX_M_2_562915447_ID 44 30.59 +#define FIX_0xFFFF_ID 48 30.60 + .text 30.61 + .align 30.62 +@ Two in-place passes over the block at r0: 8 rows first (row_loop), then 8 columns (column_loop); r11 addresses const_array, indexed by the *_ID byte offsets 30.63 +function ff_j_rev_dct_arm, export=1 30.64 + stmdb sp!, { r4 - r12, lr } @ all callee saved regs 30.65 + 30.66 + sub sp, sp, #4 @ reserve some space on the stack 30.67 + str r0, [ sp ] @ save the DCT pointer to the stack 30.68 + 30.69 + mov lr, r0 @ lr = pointer to the current row 30.70 + mov r12, #8 @ r12 = row-counter 30.71 + adr r11, const_array @ r11 = base pointer to the constants array 30.72 +row_loop: 30.73 + ldrsh r0, [lr, # 0] @ r0 = 'd0' 30.74 + ldrsh r2, [lr, # 2] @ r2 = 'd2' 30.75 + 30.76 + @ Optimization for row that have all items except the first set to 0 30.77 + @ (this works as the DCTELEMS are always 4-byte aligned) 30.78 + ldr r5, [lr, # 0] 30.79 + ldr r6, [lr, # 4] 30.80 + ldr r3, [lr, # 8] 30.81 + ldr r4, [lr, #12] 30.82 + orr r3, r3, r4 30.83 + orr r3, r3, r6 30.84 + orrs r5, r3, r5 30.85 + beq end_of_row_loop @ nothing to be done as ALL of them are '0' 30.86 + orrs r3, r3, r2 30.87 + 
beq empty_row 30.88 + 30.89 + ldrsh r1, [lr, # 8] @ r1 = 'd1' 30.90 + ldrsh r4, [lr, # 4] @ r4 = 'd4' 30.91 + ldrsh r6, [lr, # 6] @ r6 = 'd6' 30.92 + 30.93 + ldr r3, [r11, #FIX_0_541196100_ID] 30.94 + add r7, r2, r6 30.95 + ldr r5, [r11, #FIX_M_1_847759065_ID] 30.96 + mul r7, r3, r7 @ r7 = z1 30.97 + ldr r3, [r11, #FIX_0_765366865_ID] 30.98 + mla r6, r5, r6, r7 @ r6 = tmp2 30.99 + add r5, r0, r4 @ r5 = tmp0 30.100 + mla r2, r3, r2, r7 @ r2 = tmp3 30.101 + sub r3, r0, r4 @ r3 = tmp1 30.102 + 30.103 + add r0, r2, r5, lsl #13 @ r0 = tmp10 30.104 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 30.105 + add r4, r6, r3, lsl #13 @ r4 = tmp11 30.106 + rsb r3, r6, r3, lsl #13 @ r3 = tmp12 30.107 + 30.108 + stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11 30.109 + 30.110 + ldrsh r3, [lr, #10] @ r3 = 'd3' 30.111 + ldrsh r5, [lr, #12] @ r5 = 'd5' 30.112 + ldrsh r7, [lr, #14] @ r7 = 'd7' 30.113 + 30.114 + add r0, r3, r5 @ r0 = 'z2' 30.115 + add r2, r1, r7 @ r2 = 'z1' 30.116 + add r4, r3, r7 @ r4 = 'z3' 30.117 + add r6, r1, r5 @ r6 = 'z4' 30.118 + ldr r9, [r11, #FIX_1_175875602_ID] 30.119 + add r8, r4, r6 @ r8 = z3 + z4 30.120 + ldr r10, [r11, #FIX_M_0_899976223_ID] 30.121 + mul r8, r9, r8 @ r8 = 'z5' 30.122 + ldr r9, [r11, #FIX_M_2_562915447_ID] 30.123 + mul r2, r10, r2 @ r2 = 'z1' 30.124 + ldr r10, [r11, #FIX_M_1_961570560_ID] 30.125 + mul r0, r9, r0 @ r0 = 'z2' 30.126 + ldr r9, [r11, #FIX_M_0_390180644_ID] 30.127 + mla r4, r10, r4, r8 @ r4 = 'z3' 30.128 + ldr r10, [r11, #FIX_0_298631336_ID] 30.129 + mla r6, r9, r6, r8 @ r6 = 'z4' 30.130 + ldr r9, [r11, #FIX_2_053119869_ID] 30.131 + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 30.132 + ldr r10, [r11, #FIX_3_072711026_ID] 30.133 + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 30.134 + ldr r9, [r11, #FIX_1_501321110_ID] 30.135 + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 30.136 + add r7, r7, r4 @ r7 = tmp0 30.137 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 30.138 + add r5, r5, r6 @ r5 = tmp1 30.139 + add r3, r3, r4 @ r3 = tmp2 30.140 + add 
r1, r1, r6 @ r1 = tmp3 30.141 + 30.142 + ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 30.143 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 30.144 + 30.145 + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) 30.146 + add r8, r0, r1 30.147 + add r8, r8, #(1<<10) 30.148 + mov r8, r8, asr #11 30.149 + strh r8, [lr, # 0] 30.150 + 30.151 + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) 30.152 + sub r8, r0, r1 30.153 + add r8, r8, #(1<<10) 30.154 + mov r8, r8, asr #11 30.155 + strh r8, [lr, #14] 30.156 + 30.157 + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) 30.158 + add r8, r6, r3 30.159 + add r8, r8, #(1<<10) 30.160 + mov r8, r8, asr #11 30.161 + strh r8, [lr, # 2] 30.162 + 30.163 + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) 30.164 + sub r8, r6, r3 30.165 + add r8, r8, #(1<<10) 30.166 + mov r8, r8, asr #11 30.167 + strh r8, [lr, #12] 30.168 + 30.169 + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) 30.170 + add r8, r4, r5 30.171 + add r8, r8, #(1<<10) 30.172 + mov r8, r8, asr #11 30.173 + strh r8, [lr, # 4] 30.174 + 30.175 + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) 30.176 + sub r8, r4, r5 30.177 + add r8, r8, #(1<<10) 30.178 + mov r8, r8, asr #11 30.179 + strh r8, [lr, #10] 30.180 + 30.181 + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) 30.182 + add r8, r2, r7 30.183 + add r8, r8, #(1<<10) 30.184 + mov r8, r8, asr #11 30.185 + strh r8, [lr, # 6] 30.186 + 30.187 + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) 30.188 + sub r8, r2, r7 30.189 + add r8, r8, #(1<<10) 30.190 + mov r8, r8, asr #11 30.191 + strh r8, [lr, # 8] 30.192 + 30.193 + @ End of row loop 30.194 + add lr, lr, #16 30.195 + subs r12, r12, #1 30.196 + bne row_loop 30.197 + beq start_column_loop 30.198 +@ Row where only d0 is non-zero: all eight outputs equal (d0 << 2) & 0xFFFF, written as four packed 32-bit words 30.199 +empty_row: 30.200 + ldr r1, [r11, #FIX_0xFFFF_ID] 30.201 + mov r0, r0, lsl #2 30.202 + and r0, r0, r1 30.203 + add r0, r0, r0, lsl #16 30.204 + str r0, [lr, # 0] 30.205 + str r0, [lr, # 4] 
30.206 + str r0, [lr, # 8] 30.207 + str r0, [lr, #12] 30.208 + 30.209 +end_of_row_loop: 30.210 + @ End of loop 30.211 + add lr, lr, #16 30.212 + subs r12, r12, #1 30.213 + bne row_loop 30.214 + 30.215 +start_column_loop: 30.216 + @ Start of column loop 30.217 + ldr lr, [ sp ] 30.218 + mov r12, #8 30.219 +column_loop: 30.220 + ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' 30.221 + ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' 30.222 + ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' 30.223 + ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' 30.224 + 30.225 + ldr r3, [r11, #FIX_0_541196100_ID] 30.226 + add r1, r2, r6 30.227 + ldr r5, [r11, #FIX_M_1_847759065_ID] 30.228 + mul r1, r3, r1 @ r1 = z1 30.229 + ldr r3, [r11, #FIX_0_765366865_ID] 30.230 + mla r6, r5, r6, r1 @ r6 = tmp2 30.231 + add r5, r0, r4 @ r5 = tmp0 30.232 + mla r2, r3, r2, r1 @ r2 = tmp3 30.233 + sub r3, r0, r4 @ r3 = tmp1 30.234 + 30.235 + add r0, r2, r5, lsl #13 @ r0 = tmp10 30.236 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 30.237 + add r4, r6, r3, lsl #13 @ r4 = tmp11 30.238 + rsb r6, r6, r3, lsl #13 @ r6 = tmp12 30.239 + 30.240 + ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' 30.241 + ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' 30.242 + ldrsh r5, [lr, #(10*8)] @ r5 = 'd5' 30.243 + ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' 30.244 + 30.245 + @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) 30.246 + orr r9, r1, r3 30.247 + orr r10, r5, r7 30.248 + orrs r10, r9, r10 30.249 + beq empty_odd_column 30.250 + 30.251 + stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp11, tmp12 30.252 + 30.253 + add r0, r3, r5 @ r0 = 'z2' 30.254 + add r2, r1, r7 @ r2 = 'z1' 30.255 + add r4, r3, r7 @ r4 = 'z3' 30.256 + add r6, r1, r5 @ r6 = 'z4' 30.257 + ldr r9, [r11, #FIX_1_175875602_ID] 30.258 + add r8, r4, r6 30.259 + ldr r10, [r11, #FIX_M_0_899976223_ID] 30.260 + mul r8, r9, r8 @ r8 = 'z5' 30.261 + ldr r9, [r11, #FIX_M_2_562915447_ID] 30.262 + mul r2, r10, r2 @ r2 = 'z1' 30.263 + ldr r10, [r11, #FIX_M_1_961570560_ID] 30.264 + mul r0, r9, 
r0 @ r0 = 'z2' 30.265 + ldr r9, [r11, #FIX_M_0_390180644_ID] 30.266 + mla r4, r10, r4, r8 @ r4 = 'z3' 30.267 + ldr r10, [r11, #FIX_0_298631336_ID] 30.268 + mla r6, r9, r6, r8 @ r6 = 'z4' 30.269 + ldr r9, [r11, #FIX_2_053119869_ID] 30.270 + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 30.271 + ldr r10, [r11, #FIX_3_072711026_ID] 30.272 + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 30.273 + ldr r9, [r11, #FIX_1_501321110_ID] 30.274 + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 30.275 + add r7, r7, r4 @ r7 = tmp0 30.276 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 30.277 + add r5, r5, r6 @ r5 = tmp1 30.278 + add r3, r3, r4 @ r3 = tmp2 30.279 + add r1, r1, r6 @ r1 = tmp3 30.280 + 30.281 + ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 30.282 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 30.283 + 30.284 + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) 30.285 + add r8, r0, r1 30.286 + add r8, r8, #(1<<17) 30.287 + mov r8, r8, asr #18 30.288 + strh r8, [lr, #( 0*8)] 30.289 + 30.290 + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) 30.291 + sub r8, r0, r1 30.292 + add r8, r8, #(1<<17) 30.293 + mov r8, r8, asr #18 30.294 + strh r8, [lr, #(14*8)] 30.295 + 30.296 + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) 30.297 + add r8, r4, r3 30.298 + add r8, r8, #(1<<17) 30.299 + mov r8, r8, asr #18 30.300 + strh r8, [lr, #( 2*8)] 30.301 + 30.302 + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) 30.303 + sub r8, r4, r3 30.304 + add r8, r8, #(1<<17) 30.305 + mov r8, r8, asr #18 30.306 + strh r8, [lr, #(12*8)] 30.307 + 30.308 + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) 30.309 + add r8, r6, r5 30.310 + add r8, r8, #(1<<17) 30.311 + mov r8, r8, asr #18 30.312 + strh r8, [lr, #( 4*8)] 30.313 + 30.314 + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) 30.315 + sub r8, r6, r5 30.316 + add r8, r8, #(1<<17) 30.317 + mov r8, r8, asr #18 30.318 + strh r8, [lr, #(10*8)] 30.319 + 30.320 + @ Compute DESCALE(tmp13 + tmp0, 
CONST_BITS+PASS1_BITS+3) 30.321 + add r8, r2, r7 30.322 + add r8, r8, #(1<<17) 30.323 + mov r8, r8, asr #18 30.324 + strh r8, [lr, #( 6*8)] 30.325 + 30.326 + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) 30.327 + sub r8, r2, r7 30.328 + add r8, r8, #(1<<17) 30.329 + mov r8, r8, asr #18 30.330 + strh r8, [lr, #( 8*8)] 30.331 + 30.332 + @ End of column loop 30.333 + add lr, lr, #2 30.334 + subs r12, r12, #1 30.335 + bne column_loop 30.336 + beq the_end 30.337 + 30.338 +empty_odd_column: 30.339 + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) 30.340 + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) 30.341 + add r0, r0, #(1<<17) 30.342 + mov r0, r0, asr #18 30.343 + strh r0, [lr, #( 0*8)] 30.344 + strh r0, [lr, #(14*8)] 30.345 + 30.346 + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) 30.347 + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) 30.348 + add r4, r4, #(1<<17) 30.349 + mov r4, r4, asr #18 30.350 + strh r4, [lr, #( 2*8)] 30.351 + strh r4, [lr, #(12*8)] 30.352 + 30.353 + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) 30.354 + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) 30.355 + add r6, r6, #(1<<17) 30.356 + mov r6, r6, asr #18 30.357 + strh r6, [lr, #( 4*8)] 30.358 + strh r6, [lr, #(10*8)] 30.359 + 30.360 + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) 30.361 + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) 30.362 + add r2, r2, #(1<<17) 30.363 + mov r2, r2, asr #18 30.364 + strh r2, [lr, #( 6*8)] 30.365 + strh r2, [lr, #( 8*8)] 30.366 + 30.367 + @ End of column loop 30.368 + add lr, lr, #2 30.369 + subs r12, r12, #1 30.370 + bne column_loop 30.371 + 30.372 +the_end: 30.373 + @ The end.... 
30.374 + add sp, sp, #4 30.375 + ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return 30.376 + 30.377 +const_array: 30.378 + .align 30.379 + .word FIX_0_298631336 30.380 + .word FIX_0_541196100 30.381 + .word FIX_0_765366865 30.382 + .word FIX_1_175875602 30.383 + .word FIX_1_501321110 30.384 + .word FIX_2_053119869 30.385 + .word FIX_3_072711026 30.386 + .word FIX_M_0_390180644 30.387 + .word FIX_M_0_899976223 30.388 + .word FIX_M_1_847759065 30.389 + .word FIX_M_1_961570560 30.390 + .word FIX_M_2_562915447 30.391 + .word FIX_0xFFFF
31.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 31.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mathops.h Mon Aug 27 12:09:56 2012 +0200 31.3 @@ -0,0 +1,116 @@ 31.4 +/* 31.5 + * simple math operations 31.6 + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al 31.7 + * 31.8 + * This file is part of FFmpeg. 31.9 + * 31.10 + * FFmpeg is free software; you can redistribute it and/or 31.11 + * modify it under the terms of the GNU Lesser General Public 31.12 + * License as published by the Free Software Foundation; either 31.13 + * version 2.1 of the License, or (at your option) any later version. 31.14 + * 31.15 + * FFmpeg is distributed in the hope that it will be useful, 31.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 31.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 31.18 + * Lesser General Public License for more details. 31.19 + * 31.20 + * You should have received a copy of the GNU Lesser General Public 31.21 + * License along with FFmpeg; if not, write to the Free Software 31.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 31.23 + */ 31.24 + 31.25 +#ifndef AVCODEC_ARM_MATHOPS_H 31.26 +#define AVCODEC_ARM_MATHOPS_H 31.27 + 31.28 +#include <stdint.h> 31.29 +#include "config.h" 31.30 +#include "libavutil/common.h" 31.31 + 31.32 +#if HAVE_INLINE_ASM 31.33 +/* MULL(a, b, shift): low 32 bits of the signed 64-bit product a * b shifted right by shift */ 31.34 +# define MULL MULL 31.35 +static inline av_const int MULL(int a, int b, unsigned shift) 31.36 +{ 31.37 + int lo, hi; 31.38 + __asm__("smull %0, %1, %2, %3 \n\t" 31.39 + "mov %0, %0, lsr %4 \n\t" 31.40 + "add %1, %0, %1, lsl %5 \n\t" 31.41 + : "=&r"(lo), "=&r"(hi) 31.42 + : "r"(b), "r"(a), "ir"(shift), "ir"(32-shift)); 31.43 + return hi; 31.44 +} 31.45 +/* MULH(a, b): upper 32 bits of the signed 64-bit product a * b */ 31.46 +#define MULH MULH 31.47 +#if HAVE_ARMV6 31.48 +static inline av_const int MULH(int a, int b) 31.49 +{ 31.50 + int r; 31.51 + __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); 31.52 + return r; 31.53 +} 31.54 +#else 31.55 +static inline av_const 
int MULH(int a, int b) 31.56 +{ 31.57 + int lo, hi; 31.58 + __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a)); 31.59 + return hi; 31.60 +} 31.61 +#endif 31.62 +/* MUL64(a, b): full signed 64-bit product a * b */ 31.63 +static inline av_const int64_t MUL64(int a, int b) 31.64 +{ 31.65 + int64_t x; 31.66 + __asm__ ("smull %Q0, %R0, %1, %2" /* %Q0 / %R0 name the low / high register of x on either endianness */ 31.67 + : "=r"(x) : "r"(a), "r"(b)); 31.68 + return x; 31.69 +} 31.70 +#define MUL64 MUL64 31.71 +/* MAC64(d, a, b): d + a * b accumulated in 64 bits; the macro form below also assigns the result back to d */ 31.72 +static inline av_const int64_t MAC64(int64_t d, int a, int b) 31.73 +{ 31.74 + /* accumulate straight into d; no word-order-dependent union needed */ 31.75 + __asm__ ("smlal %Q0, %R0, %1, %2" 31.76 + : "+r"(d) : "r"(a), "r"(b)); 31.77 + return d; 31.78 +} 31.79 +#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) 31.80 +#define MLS64(d, a, b) MAC64(d, -(a), b) 31.81 + 31.82 +#if HAVE_ARMV5TE 31.83 + 31.84 +/* signed 16x16 -> 32 multiply add accumulate */ 31.85 +# define MAC16(rt, ra, rb) \ 31.86 + __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb)); 31.87 + 31.88 +/* signed 16x16 -> 32 multiply */ 31.89 +# define MUL16 MUL16 31.90 +static inline av_const int MUL16(int ra, int rb) 31.91 +{ 31.92 + int rt; 31.93 + __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb)); 31.94 + return rt; 31.95 +} 31.96 + 31.97 +#endif 31.98 +/* mid_pred(a, b, c): median of the three values; the asm uses cmp, so the condition flags are declared clobbered */ 31.99 +#define mid_pred mid_pred 31.100 +static inline av_const int mid_pred(int a, int b, int c) 31.101 +{ 31.102 + int m; 31.103 + __asm__ ( 31.104 + "mov %0, %2 \n\t" 31.105 + "cmp %1, %2 \n\t" 31.106 + "movgt %0, %1 \n\t" 31.107 + "movgt %1, %2 \n\t" 31.108 + "cmp %1, %3 \n\t" 31.109 + "movle %1, %3 \n\t" 31.110 + "cmp %0, %1 \n\t" 31.111 + "movgt %0, %1 \n\t" 31.112 + : "=&r"(m), "+r"(a) 31.113 + : "r"(b), "r"(c) : "cc"); 31.114 + return m; 31.115 +} 31.116 + 31.117 +#endif /* HAVE_INLINE_ASM */ 31.118 + 31.119 +#endif /* AVCODEC_ARM_MATHOPS_H */
32.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 32.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mdct_neon.S Mon Aug 27 12:09:56 2012 +0200 32.3 @@ -0,0 +1,303 @@ 32.4 +/* 32.5 + * ARM NEON optimised MDCT 32.6 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> 32.7 + * 32.8 + * This file is part of FFmpeg. 32.9 + * 32.10 + * FFmpeg is free software; you can redistribute it and/or 32.11 + * modify it under the terms of the GNU Lesser General Public 32.12 + * License as published by the Free Software Foundation; either 32.13 + * version 2.1 of the License, or (at your option) any later version. 32.14 + * 32.15 + * FFmpeg is distributed in the hope that it will be useful, 32.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 32.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 32.18 + * Lesser General Public License for more details. 32.19 + * 32.20 + * You should have received a copy of the GNU Lesser General Public 32.21 + * License along with FFmpeg; if not, write to the Free Software 32.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 32.23 + */ 32.24 + 32.25 +#include "asm.S" 32.26 + 32.27 + preserve8 32.28 + 32.29 + .text 32.30 + 32.31 +#define ff_fft_calc_neon X(ff_fft_calc_neon) 32.32 +@ r0 = context (mdct_bits at +28, tcos at +32, revtab at +8), r1 = output, r2 = input: multiply input pairs by tcos, scatter them to r1 through revtab, call ff_fft_calc_neon, then rotate by tcos again in place 32.33 +function ff_imdct_half_neon, export=1 32.34 + push {r4-r8,lr} 32.35 + 32.36 + mov r12, #1 32.37 + ldr lr, [r0, #28] @ mdct_bits 32.38 + ldr r4, [r0, #32] @ tcos 32.39 + ldr r3, [r0, #8] @ revtab 32.40 + lsl r12, r12, lr @ n = 1 << nbits 32.41 + lsr lr, r12, #2 @ n4 = n >> 2 32.42 + add r7, r2, r12, lsl #1 32.43 + mov r12, #-16 32.44 + sub r7, r7, #16 32.45 + 32.46 + vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0 32.47 + vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x 32.48 + vrev64.32 d17, d17 32.49 + vld2.32 {d2,d3}, [r4,:128]! 
@ d2=c0,c1 d3=s0,s2 32.50 + vmul.f32 d6, d17, d2 32.51 + vmul.f32 d7, d0, d2 32.52 +1: 32.53 + subs lr, lr, #2 32.54 + ldr r6, [r3], #4 32.55 + vmul.f32 d4, d0, d3 32.56 + vmul.f32 d5, d17, d3 32.57 + vsub.f32 d4, d6, d4 32.58 + vadd.f32 d5, d5, d7 32.59 + uxth r8, r6, ror #16 32.60 + uxth r6, r6 32.61 + add r8, r1, r8, lsl #3 32.62 + add r6, r1, r6, lsl #3 32.63 + beq 1f 32.64 + vld2.32 {d16-d17},[r7,:128],r12 32.65 + vld2.32 {d0-d1}, [r2,:128]! 32.66 + vrev64.32 d17, d17 32.67 + vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 32.68 + vmul.f32 d6, d17, d2 32.69 + vmul.f32 d7, d0, d2 32.70 + vst2.32 {d4[0],d5[0]}, [r6,:64] 32.71 + vst2.32 {d4[1],d5[1]}, [r8,:64] 32.72 + b 1b 32.73 +1: 32.74 + vst2.32 {d4[0],d5[0]}, [r6,:64] 32.75 + vst2.32 {d4[1],d5[1]}, [r8,:64] 32.76 + 32.77 + mov r4, r0 32.78 + mov r6, r1 32.79 + bl ff_fft_calc_neon 32.80 + 32.81 + mov r12, #1 32.82 + ldr lr, [r4, #28] @ mdct_bits 32.83 + ldr r4, [r4, #32] @ tcos 32.84 + lsl r12, r12, lr @ n = 1 << nbits 32.85 + lsr lr, r12, #3 @ n8 = n >> 3 32.86 + 32.87 + add r4, r4, lr, lsl #3 32.88 + add r6, r6, lr, lsl #3 32.89 + sub r1, r4, #16 32.90 + sub r3, r6, #16 32.91 + 32.92 + mov r7, #-16 32.93 + mov r8, r6 32.94 + mov r0, r3 32.95 + 32.96 + vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0 32.97 + vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3 32.98 + vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 32.99 +1: 32.100 + subs lr, lr, #2 32.101 + vmul.f32 d7, d0, d18 32.102 + vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3 32.103 + vmul.f32 d4, d1, d18 32.104 + vmul.f32 d5, d21, d19 32.105 + vmul.f32 d6, d20, d19 32.106 + vmul.f32 d22, d1, d16 32.107 + vmul.f32 d23, d21, d17 32.108 + vmul.f32 d24, d0, d16 32.109 + vmul.f32 d25, d20, d17 32.110 + vadd.f32 d7, d7, d22 32.111 + vadd.f32 d6, d6, d23 32.112 + vsub.f32 d4, d4, d24 32.113 + vsub.f32 d5, d5, d25 32.114 + beq 1f 32.115 + vld2.32 {d0-d1}, [r3,:128], r7 32.116 + vld2.32 {d20-d21},[r6,:128]! 
32.117 + vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 32.118 + vrev64.32 q3, q3 32.119 + vst2.32 {d4,d6}, [r0,:128], r7 32.120 + vst2.32 {d5,d7}, [r8,:128]! 32.121 + b 1b 32.122 +1: 32.123 + vrev64.32 q3, q3 32.124 + vst2.32 {d4,d6}, [r0,:128] 32.125 + vst2.32 {d5,d7}, [r8,:128] 32.126 + 32.127 + pop {r4-r8,pc} 32.128 +endfunc 32.129 +@ Same arguments as ff_imdct_half_neon: runs it with the output advanced by n bytes, then fills the first quarter with the sign-flipped mirror of the second and the last quarter with the mirror of the third 32.130 +function ff_imdct_calc_neon, export=1 32.131 + push {r4-r6,lr} 32.132 + 32.133 + ldr r3, [r0, #28] 32.134 + mov r4, #1 32.135 + mov r5, r1 32.136 + lsl r4, r4, r3 32.137 + add r1, r1, r4 32.138 + 32.139 + bl ff_imdct_half_neon 32.140 + 32.141 + add r0, r5, r4, lsl #2 32.142 + add r1, r5, r4, lsl #1 32.143 + sub r0, r0, #8 32.144 + sub r2, r1, #16 32.145 + mov r3, #-16 32.146 + mov r6, #-8 32.147 + vmov.i32 d30, #1<<31 32.148 +1: 32.149 + vld1.32 {d0-d1}, [r2,:128], r3 32.150 + pld [r0, #-16] 32.151 + vrev64.32 q0, q0 32.152 + vld1.32 {d2-d3}, [r1,:128]! 32.153 + veor d4, d1, d30 32.154 + pld [r2, #-16] 32.155 + vrev64.32 q1, q1 32.156 + veor d5, d0, d30 32.157 + vst1.32 {d2}, [r0,:64], r6 32.158 + vst1.32 {d3}, [r0,:64], r6 32.159 + vst1.32 {d4-d5}, [r5,:128]! 32.160 + subs r4, r4, #16 32.161 + bgt 1b 32.162 + 32.163 + pop {r4-r6,pc} 32.164 +endfunc 32.165 +@ r0 = context, r1 = output, r2 = input: combine and rotate the input quarters by tcos, scatter the results to r1 through revtab, call ff_fft_calc_neon, then rotate by tcos again in place 32.166 +function ff_mdct_calc_neon, export=1 32.167 + push {r4-r10,lr} 32.168 + 32.169 + mov r12, #1 32.170 + ldr lr, [r0, #28] @ mdct_bits 32.171 + ldr r4, [r0, #32] @ tcos 32.172 + ldr r3, [r0, #8] @ revtab 32.173 + lsl lr, r12, lr @ n = 1 << nbits 32.174 + add r7, r2, lr @ in4u 32.175 + sub r9, r7, #16 @ in4d 32.176 + add r2, r7, lr, lsl #1 @ in3u 32.177 + add r8, r9, lr, lsl #1 @ in3d 32.178 + add r5, r4, lr, lsl #1 32.179 + sub r5, r5, #16 32.180 + sub r3, r3, #4 32.181 + mov r12, #-16 32.182 + 32.183 + vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 32.184 + vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 32.185 + vld2.32 {d0, d2}, [r7,:128]! 
@ in4u0,in4u1 in2d1,in2d0 32.186 + vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 32.187 + vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 32.188 + vsub.f32 d0, d18, d0 @ in4d-in4u I 32.189 + vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 32.190 + vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 32.191 + vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 32.192 + vadd.f32 d1, d1, d19 @ in3u+in3d -R 32.193 + vsub.f32 d16, d16, d2 @ in0u-in2d R 32.194 + vadd.f32 d17, d17, d3 @ in2u+in1d -I 32.195 +1: 32.196 + vmul.f32 d7, d0, d21 @ I*s 32.197 + ldr r10, [r3, lr, lsr #1] 32.198 + vmul.f32 d6, d1, d20 @ -R*c 32.199 + ldr r6, [r3, #4]! 32.200 + vmul.f32 d4, d1, d21 @ -R*s 32.201 + vmul.f32 d5, d0, d20 @ I*c 32.202 + vmul.f32 d24, d16, d30 @ R*c 32.203 + vmul.f32 d25, d17, d31 @ -I*s 32.204 + vmul.f32 d22, d16, d31 @ R*s 32.205 + vmul.f32 d23, d17, d30 @ I*c 32.206 + subs lr, lr, #16 32.207 + vsub.f32 d6, d6, d7 @ -R*c-I*s 32.208 + vadd.f32 d7, d4, d5 @ -R*s+I*c 32.209 + vsub.f32 d24, d25, d24 @ I*s-R*c 32.210 + vadd.f32 d25, d22, d23 @ R*s-I*c 32.211 + beq 1f 32.212 + mov r12, #-16 32.213 + vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 32.214 + vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 32.215 + vneg.f32 d7, d7 @ R*s-I*c 32.216 + vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 32.217 + vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 32.218 + vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 32.219 + vsub.f32 d0, d18, d0 @ in4d-in4u I 32.220 + vld2.32 {d20,d21},[r4,:128]! 
@ c0,c1 s0,s1 32.221 + vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 32.222 + vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 32.223 + vadd.f32 d1, d1, d19 @ in3u+in3d -R 32.224 + vsub.f32 d16, d16, d2 @ in0u-in2d R 32.225 + vadd.f32 d17, d17, d3 @ in2u+in1d -I 32.226 + uxth r12, r6, ror #16 32.227 + uxth r6, r6 32.228 + add r12, r1, r12, lsl #3 32.229 + add r6, r1, r6, lsl #3 32.230 + vst2.32 {d6[0],d7[0]}, [r6,:64] 32.231 + vst2.32 {d6[1],d7[1]}, [r12,:64] 32.232 + uxth r6, r10, ror #16 32.233 + uxth r10, r10 32.234 + add r6 , r1, r6, lsl #3 32.235 + add r10, r1, r10, lsl #3 32.236 + vst2.32 {d24[0],d25[0]},[r10,:64] 32.237 + vst2.32 {d24[1],d25[1]},[r6,:64] 32.238 + b 1b 32.239 +1: 32.240 + vneg.f32 d7, d7 @ R*s-I*c 32.241 + uxth r12, r6, ror #16 32.242 + uxth r6, r6 32.243 + add r12, r1, r12, lsl #3 32.244 + add r6, r1, r6, lsl #3 32.245 + vst2.32 {d6[0],d7[0]}, [r6,:64] 32.246 + vst2.32 {d6[1],d7[1]}, [r12,:64] 32.247 + uxth r6, r10, ror #16 32.248 + uxth r10, r10 32.249 + add r6 , r1, r6, lsl #3 32.250 + add r10, r1, r10, lsl #3 32.251 + vst2.32 {d24[0],d25[0]},[r10,:64] 32.252 + vst2.32 {d24[1],d25[1]},[r6,:64] 32.253 + 32.254 + mov r4, r0 32.255 + mov r6, r1 32.256 + bl ff_fft_calc_neon 32.257 + 32.258 + mov r12, #1 32.259 + ldr lr, [r4, #28] @ mdct_bits 32.260 + ldr r4, [r4, #32] @ tcos 32.261 + lsl r12, r12, lr @ n = 1 << nbits 32.262 + lsr lr, r12, #3 @ n8 = n >> 3 32.263 + 32.264 + add r4, r4, lr, lsl #3 32.265 + add r6, r6, lr, lsl #3 32.266 + sub r1, r4, #16 32.267 + sub r3, r6, #16 32.268 + 32.269 + mov r7, #-16 32.270 + mov r8, r6 32.271 + mov r0, r3 32.272 + 32.273 + vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0 32.274 + vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3 32.275 + vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 32.276 +1: 32.277 + subs lr, lr, #2 32.278 + vmul.f32 d7, d0, d18 @ r1*s1,r0*s0 32.279 + vld2.32 {d17,d19},[r4,:128]! 
@ c2,c3 s2,s3 32.280 + vmul.f32 d4, d1, d18 @ i1*s1,i0*s0 32.281 + vmul.f32 d5, d21, d19 @ i2*s2,i3*s3 32.282 + vmul.f32 d6, d20, d19 @ r2*s2,r3*s3 32.283 + vmul.f32 d24, d0, d16 @ r1*c1,r0*c0 32.284 + vmul.f32 d25, d20, d17 @ r2*c2,r3*c3 32.285 + vmul.f32 d22, d21, d17 @ i2*c2,i3*c3 32.286 + vmul.f32 d23, d1, d16 @ i1*c1,i0*c0 32.287 + vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0 32.288 + vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3 32.289 + vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3 32.290 + vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0 32.291 + vneg.f32 q2, q2 32.292 + beq 1f 32.293 + vld2.32 {d0-d1}, [r3,:128], r7 32.294 + vld2.32 {d20-d21},[r6,:128]! 32.295 + vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 32.296 + vrev64.32 q3, q3 32.297 + vst2.32 {d4,d6}, [r0,:128], r7 32.298 + vst2.32 {d5,d7}, [r8,:128]! 32.299 + b 1b 32.300 +1: 32.301 + vrev64.32 q3, q3 32.302 + vst2.32 {d4,d6}, [r0,:128] 32.303 + vst2.32 {d5,d7}, [r8,:128] 32.304 + 32.305 + pop {r4-r10,pc} 32.306 +endfunc
/* File: ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.c */
/*
 * Copyright (c) 2002 Michael Niedermayer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "mpegvideo_arm.h"

/**
 * ARM entry point for MpegEncContext initialisation.
 *
 * Forwards the context to the ARMv5TE and iwMMXt initialisers, each of which
 * is compiled in only when its HAVE_* configuration macro is non-zero.
 * With neither macro enabled the function does nothing.
 *
 * @param s context handed unchanged to the per-extension initialisers
 */
void MPV_common_init_arm(MpegEncContext *s)
{
    /* IWMMXT support is a superset of armv5te, so
     * allow optimized functions for armv5te unless
     * a better iwmmxt function exists
     */
#if HAVE_ARMV5TE
    MPV_common_init_armv5te(s);
#endif
#if HAVE_IWMMXT
    /* Called after the armv5te initialiser, so whatever it installs wins. */
    MPV_common_init_iwmmxt(s);
#endif
}
/* File: ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.h */
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_ARM_MPEGVIDEO_H
#define AVCODEC_ARM_MPEGVIDEO_H

#include "libavcodec/mpegvideo.h"

/**
 * Install the iwMMXt H.263 intra dequantizer into s.
 * The implementation returns without touching s when the iwMMXt bit is not
 * set in mm_flags.
 */
void MPV_common_init_iwmmxt(MpegEncContext *s);

/**
 * Install the ARMv5TE H.263 intra and inter dequantizers into s.
 */
void MPV_common_init_armv5te(MpegEncContext *s);

#endif
35.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 35.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te.c Mon Aug 27 12:09:56 2012 +0200 35.3 @@ -0,0 +1,101 @@ 35.4 +/* 35.5 + * Optimization of some functions from mpegvideo.c for armv5te 35.6 + * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net> 35.7 + * 35.8 + * This file is part of FFmpeg. 35.9 + * 35.10 + * FFmpeg is free software; you can redistribute it and/or 35.11 + * modify it under the terms of the GNU Lesser General Public 35.12 + * License as published by the Free Software Foundation; either 35.13 + * version 2.1 of the License, or (at your option) any later version. 35.14 + * 35.15 + * FFmpeg is distributed in the hope that it will be useful, 35.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 35.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 35.18 + * Lesser General Public License for more details. 35.19 + * 35.20 + * You should have received a copy of the GNU Lesser General Public 35.21 + * License along with FFmpeg; if not, write to the Free Software 35.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 35.23 + */ 35.24 + 35.25 +#include "libavcodec/avcodec.h" 35.26 +#include "libavcodec/dsputil.h" 35.27 +#include "libavcodec/mpegvideo.h" 35.28 +#include "mpegvideo_arm.h" 35.29 + 35.30 +void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count); 35.31 + 35.32 +#ifdef ENABLE_ARM_TESTS 35.33 +/** 35.34 + * h263 dequantizer supplementary function, it is performance critical and needs to 35.35 + * have optimized implementations for each architecture. 
Is also used as a reference 35.36 + * implementation in regression tests 35.37 + */ 35.38 +static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count) 35.39 +{ 35.40 + int i, level; 35.41 + for (i = 0; i < count; i++) { 35.42 + level = block[i]; 35.43 + if (level) { 35.44 + if (level < 0) { 35.45 + level = level * qmul - qadd; 35.46 + } else { 35.47 + level = level * qmul + qadd; 35.48 + } 35.49 + block[i] = level; 35.50 + } 35.51 + } 35.52 +} 35.53 +#endif 35.54 + 35.55 +static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s, 35.56 + DCTELEM *block, int n, int qscale) 35.57 +{ 35.58 + int level, qmul, qadd; 35.59 + int nCoeffs; 35.60 + 35.61 + assert(s->block_last_index[n]>=0); 35.62 + 35.63 + qmul = qscale << 1; 35.64 + 35.65 + if (!s->h263_aic) { 35.66 + if (n < 4) 35.67 + level = block[0] * s->y_dc_scale; 35.68 + else 35.69 + level = block[0] * s->c_dc_scale; 35.70 + qadd = (qscale - 1) | 1; 35.71 + }else{ 35.72 + qadd = 0; 35.73 + level = block[0]; 35.74 + } 35.75 + if(s->ac_pred) 35.76 + nCoeffs=63; 35.77 + else 35.78 + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 35.79 + 35.80 + ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); 35.81 + block[0] = level; 35.82 +} 35.83 + 35.84 +static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s, 35.85 + DCTELEM *block, int n, int qscale) 35.86 +{ 35.87 + int qmul, qadd; 35.88 + int nCoeffs; 35.89 + 35.90 + assert(s->block_last_index[n]>=0); 35.91 + 35.92 + qadd = (qscale - 1) | 1; 35.93 + qmul = qscale << 1; 35.94 + 35.95 + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 35.96 + 35.97 + ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); 35.98 +} 35.99 + 35.100 +void MPV_common_init_armv5te(MpegEncContext *s) 35.101 +{ 35.102 + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te; 35.103 + s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te; 35.104 +}
/* File: ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te_s.S */
/*
 * Optimization of some functions from mpegvideo.c for armv5te
 * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * Special optimized version of dct_unquantize_h263_helper_c, it
 * requires the block to be at least 8 bytes aligned, and may process
 * more elements than requested.  But it is guaranteed to never
 * process more than 64 elements provided that count argument is <= 64,
 * so it is safe. This function is optimized for a common distribution
 * of values for nCoeffs (they are mostly multiple of 8 plus one or
 * two extra elements). So this function processes data as 8 elements
 * per loop iteration and contains optional 2 elements processing in
 * the end.
 *
 * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
 *
 * Register use as visible in the code below:
 *   r0  pointer to the 16-bit coefficients, advanced by the stores
 *   r1  multiplier (its low halfword is used by smlabb/smlatb)
 *   r2  value added for positive / subtracted for negative coefficients
 *   r3  remaining element count
 * (presumably block, qmul, qadd, count of the C prototype, in that order)
 *
 * Per-coefficient pattern used throughout:
 *   rsbs   t, ip, x      t = x - 0, only to set the flags on the sign of x
 *   addgt  t, r2, #0     x > 0: t = +r2
 *   rsblt  t, r2, #0     x < 0: t = -r2
 *   smla?bne ...         x != 0: result = x * r1 + t; a zero x is left as is
 */
function ff_dct_unquantize_h263_armv5te, export=1
        push            {r4-r9,lr}
        mov             ip, #0                  /* constant zero for the rsbs sign tests */
        subs            r3, r3, #2
        ble             2f                      /* count <= 2: only the two-element tail runs */
        ldrd            r4, [r0, #0]            /* r4,r5 = coefficients 0..3 */
1:
        ldrd            r6, [r0, #8]            /* r6,r7 = coefficients 4..7 */

        /* high halfword of r4 -> r9 */
        rsbs            r9, ip, r4, asr #16
        addgt           r9, r2, #0
        rsblt           r9, r2, #0
        smlatbne        r9, r4, r1, r9

        /* high halfword of r5 -> lr */
        rsbs            lr, ip, r5, asr #16
        addgt           lr, r2, #0
        rsblt           lr, r2, #0
        smlatbne        lr, r5, r1, lr

        /* low halfword of r4 -> r4 (shifted up so the flags see its sign) */
        rsbs            r8, ip, r4, asl #16
        addgt           r8, r2, #0
        rsblt           r8, r2, #0
        smlabbne        r4, r4, r1, r8

        /* low halfword of r5 -> r5 */
        rsbs            r8, ip, r5, asl #16
        addgt           r8, r2, #0
        rsblt           r8, r2, #0
        smlabbne        r5, r5, r1, r8

        /* write back coefficients 0..3 in memory order (low, high, low, high) */
        strh            r4, [r0], #2
        strh            r9, [r0], #2
        strh            r5, [r0], #2
        strh            lr, [r0], #2

        /* same four steps for r6/r7 (coefficients 4..7) */
        rsbs            r9, ip, r6, asr #16
        addgt           r9, r2, #0
        rsblt           r9, r2, #0
        smlatbne        r9, r6, r1, r9

        rsbs            lr, ip, r7, asr #16
        addgt           lr, r2, #0
        rsblt           lr, r2, #0
        smlatbne        lr, r7, r1, lr

        rsbs            r8, ip, r6, asl #16
        addgt           r8, r2, #0
        rsblt           r8, r2, #0
        smlabbne        r6, r6, r1, r8

        rsbs            r8, ip, r7, asl #16
        addgt           r8, r2, #0
        rsblt           r8, r2, #0
        smlabbne        r7, r7, r1, r8

        strh            r6, [r0], #2
        strh            r9, [r0], #2
        strh            r7, [r0], #2
        strh            lr, [r0], #2

        subs            r3, r3, #8
        ldrgtd          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
        bgt             1b

        adds            r3, r3, #2              /* undo the initial -2 bias */
        pople           {r4-r9,pc}              /* nothing left: return */
2:
        /* tail: two more coefficients, one per register */
        ldrsh           r9, [r0, #0]
        ldrsh           lr, [r0, #2]
        mov             r8, r2
        cmp             r9, #0
        rsblt           r8, r2, #0
        smlabbne        r9, r9, r1, r8
        mov             r8, r2
        cmp             lr, #0
        rsblt           r8, r2, #0
        smlabbne        lr, lr, r1, r8
        strh            r9, [r0], #2
        strh            lr, [r0], #2
        pop             {r4-r9,pc}
endfunc
37.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 37.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_iwmmxt.c Mon Aug 27 12:09:56 2012 +0200 37.3 @@ -0,0 +1,120 @@ 37.4 +/* 37.5 + * copyright (c) 2004 AGAWA Koji 37.6 + * 37.7 + * This file is part of FFmpeg. 37.8 + * 37.9 + * FFmpeg is free software; you can redistribute it and/or 37.10 + * modify it under the terms of the GNU Lesser General Public 37.11 + * License as published by the Free Software Foundation; either 37.12 + * version 2.1 of the License, or (at your option) any later version. 37.13 + * 37.14 + * FFmpeg is distributed in the hope that it will be useful, 37.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 37.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 37.17 + * Lesser General Public License for more details. 37.18 + * 37.19 + * You should have received a copy of the GNU Lesser General Public 37.20 + * License along with FFmpeg; if not, write to the Free Software 37.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 37.22 + */ 37.23 + 37.24 +#include "libavcodec/avcodec.h" 37.25 +#include "libavcodec/dsputil.h" 37.26 +#include "libavcodec/mpegvideo.h" 37.27 +#include "mpegvideo_arm.h" 37.28 + 37.29 +static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, 37.30 + DCTELEM *block, int n, int qscale) 37.31 +{ 37.32 + int level, qmul, qadd; 37.33 + int nCoeffs; 37.34 + DCTELEM *block_orig = block; 37.35 + 37.36 + assert(s->block_last_index[n]>=0); 37.37 + 37.38 + qmul = qscale << 1; 37.39 + 37.40 + if (!s->h263_aic) { 37.41 + if (n < 4) 37.42 + level = block[0] * s->y_dc_scale; 37.43 + else 37.44 + level = block[0] * s->c_dc_scale; 37.45 + qadd = (qscale - 1) | 1; 37.46 + }else{ 37.47 + qadd = 0; 37.48 + level = block[0]; 37.49 + } 37.50 + if(s->ac_pred) 37.51 + nCoeffs=63; 37.52 + else 37.53 + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 37.54 + 37.55 + __asm__ volatile ( 37.56 +/* "movd %1, 
%%mm6 \n\t" //qmul */ 37.57 +/* "packssdw %%mm6, %%mm6 \n\t" */ 37.58 +/* "packssdw %%mm6, %%mm6 \n\t" */ 37.59 + "tbcsth wr6, %[qmul] \n\t" 37.60 +/* "movd %2, %%mm5 \n\t" //qadd */ 37.61 +/* "packssdw %%mm5, %%mm5 \n\t" */ 37.62 +/* "packssdw %%mm5, %%mm5 \n\t" */ 37.63 + "tbcsth wr5, %[qadd] \n\t" 37.64 + "wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */ 37.65 + "wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */ 37.66 + "wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */ 37.67 + "1: \n\t" 37.68 + "wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */ 37.69 + "wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */ 37.70 + "wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */ 37.71 + "wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */ 37.72 +/* "movq (%0, %3), %%mm2 \n\t" */ 37.73 +/* "movq 8(%0, %3), %%mm3 \n\t" */ 37.74 + "wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */ 37.75 + "wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */ 37.76 + "wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */ 37.77 + "wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */ 37.78 + "waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */ 37.79 + "waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */ 37.80 + "wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */ 37.81 + "wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */ 37.82 + "wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */ 37.83 + "wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 
-1 : 0 */ 37.84 + "wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */ 37.85 + "wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */ 37.86 + "wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */ 37.87 + "wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */ 37.88 + "add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */ 37.89 + "subs %[i], %[i], #1 \n\t" 37.90 + "bne 1b \n\t" /* "jng 1b \n\t" */ 37.91 + :[block]"+r"(block) 37.92 + :[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd) 37.93 + :"memory"); 37.94 + 37.95 + block_orig[0] = level; 37.96 +} 37.97 + 37.98 +#if 0 37.99 +static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s, 37.100 + DCTELEM *block, int n, int qscale) 37.101 +{ 37.102 + int nCoeffs; 37.103 + 37.104 + assert(s->block_last_index[n]>=0); 37.105 + 37.106 + if(s->ac_pred) 37.107 + nCoeffs=63; 37.108 + else 37.109 + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 37.110 + 37.111 + ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale); 37.112 +} 37.113 +#endif 37.114 + 37.115 +void MPV_common_init_iwmmxt(MpegEncContext *s) 37.116 +{ 37.117 + if (!(mm_flags & FF_MM_IWMMXT)) return; 37.118 + 37.119 + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt; 37.120 +#if 0 37.121 + s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt; 37.122 +#endif 37.123 +}
/* File: ffmpeg_smp/h264dec/libavcodec/arm/rdft_neon.S */
/*
 * ARM NEON optimised RDFT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8

@ ff_rdft_calc_neon
@
@ On entry r0 is a context pointer (kept in r4) and r1 a data pointer (kept
@ in r5).  Context words read below, by byte offset:
@   #0 nbits, #4 inverse, #8 sign_convention, #12 tcos, #16 tsin;
@   r4 + #20 is what gets passed as first argument to the FFT routines.
@
@ Flow:
@   - bit 0 of "inverse" clear: run ff_fft_permute_neon + ff_fft_calc_neon
@     on the data first, do the combine pass, return.
@   - bit 0 set: do the combine pass first, scale the first data pair by
@     0.5, then run ff_fft_permute_neon and tail-call ff_fft_calc_neon.
@ The combine pass reads from both ends of the buffer towards the middle
@ (r0 ascending, r1 descending) and writes back through r7 / lr.
function ff_rdft_calc_neon, export=1
        push            {r4-r8,lr}

        ldr             r6,  [r0, #4]           @ inverse
        mov             r4,  r0
        mov             r5,  r1

        lsls            r6,  r6,  #31           @ r6 = bit 0 of inverse moved to bit 31
        bne             1f                      @ set: skip the up-front FFT
        add             r0,  r4,  #20
        bl              X(ff_fft_permute_neon)
        add             r0,  r4,  #20
        mov             r1,  r5
        bl              X(ff_fft_calc_neon)
1:
        ldr             r12, [r4, #0]           @ nbits
        mov             r2,  #1
        lsl             r12, r2,  r12           @ r12 = 1 << nbits
        add             r0,  r5,  #8            @ read pointer, ascending
        add             r1,  r5,  r12, lsl #2
        lsr             r12, r12, #2            @ loop counter base = (1 << nbits) >> 2
        ldr             r2,  [r4, #12]          @ tcos
        sub             r12, r12, #2
        ldr             r3,  [r4, #16]          @ tsin
        mov             r7,  r0                 @ write pointer, ascending
        sub             r1,  r1,  #8            @ read pointer, descending
        mov             lr,  r1                 @ write pointer, descending
        mov             r8,  #-8                @ post-index step for the descending pointers
        vld1.32         {d0},  [r0,:64]!        @ d1[0,1]
        vld1.32         {d1},  [r1,:64], r8     @ d2[0,1]
        vld1.32         {d4},  [r2,:64]!        @ tcos[i]
        vld1.32         {d5},  [r3,:64]!        @ tsin[i]
        vmov.f32        d18, #0.5               @ k1
        vdup.32         d19, r6
        pld             [r0, #32]
        veor            d19, d18, d19           @ k2 (0.5 with the sign bit taken from r6)
        vmov.i32        d16, #0
        vmov.i32        d17, #1<<31
        pld             [r1, #-32]
        vtrn.32         d16, d17
        pld             [r2, #32]
        vrev64.32       d16, d16                @ d16=1,0 d17=0,1 (sign-bit masks for veor)
        pld             [r3, #32]
2:
        @ two input pairs per iteration; loads for the next pair are
        @ interleaved with arithmetic on the current one
        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
        vld1.32         {d24}, [r0,:64]!        @  d1[0,1]
        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
        vld1.32         {d25}, [r1,:64], r8     @  d2[0,1]
        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
        veor            q3,  q12, q8            @ -d1[0],d1[1], d2[0],-d2[1]
        pld             [r0, #32]
        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
        pld             [r1, #-32]
        vadd.f32        d0,  d24, d7            @  d1[0]+d2[0], d1[1]-d2[1]
        vadd.f32        d1,  d6,  d25           @ -d1[0]+d2[0], d1[1]+d2[1]
        vmul.f32        q11, q0,  q9            @  ev.re, ev.im, od.im, od.re
        veor            d7,  d21, d16           @ -od.im, od.re
        vrev64.32       d3,  d21                @  od.re, od.im
        veor            d6,  d20, d17           @  ev.re,-ev.im
        veor            d2,  d3,  d16           @ -od.re, od.im
        vmla.f32        d20, d3,  d4[1]
        vmla.f32        d20, d7,  d5[1]
        vmla.f32        d6,  d2,  d4[1]
        vmla.f32        d6,  d21, d5[1]
        vld1.32         {d4},  [r2,:64]!        @  tcos[i]
        veor            d7,  d23, d16           @ -od.im, od.re
        vld1.32         {d5},  [r3,:64]!        @  tsin[i]
        veor            d24, d22, d17           @  ev.re,-ev.im
        vrev64.32       d3,  d23                @  od.re, od.im
        pld             [r2, #32]
        veor            d2,  d3,  d16           @ -od.re, od.im
        pld             [r3, #32]
        vmla.f32        d22, d3,  d4[0]
        vmla.f32        d22, d7,  d5[0]
        vmla.f32        d24, d2,  d4[0]
        vmla.f32        d24, d23, d5[0]
        vld1.32         {d0},  [r0,:64]!        @  d1[0,1]
        vld1.32         {d1},  [r1,:64], r8     @  d2[0,1]
        vst1.32         {d20}, [r7,:64]!
        vst1.32         {d6},  [lr,:64], r8
        vst1.32         {d22}, [r7,:64]!
        vst1.32         {d24}, [lr,:64], r8
        subs            r12, r12, #2
        bgt             2b

        @ last pair from the loop, plus the special-cased elements
        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
        ldr             r2,  [r4, #8]           @ sign_convention
        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
        add             r0,  r0,  #4
        bfc             r2,  #0,  #31           @ keep only bit 31 of sign_convention
        vld1.32         {d0[0]},  [r0,:32]
        veor            d7,  d21, d16           @ -od.im, od.re
        vrev64.32       d3,  d21                @  od.re, od.im
        veor            d6,  d20, d17           @  ev.re,-ev.im
        vld1.32         {d22}, [r5,:64]         @ first pair of the buffer
        vdup.32         d1,  r2
        vmov            d23, d22
        veor            d2,  d3,  d16           @ -od.re, od.im
        vtrn.32         d22, d23
        veor            d0,  d0,  d1            @ flip the sign of the word at r0 if bit 31 is set
        veor            d23, d23, d17
        vmla.f32        d20, d3,  d4[1]
        vmla.f32        d20, d7,  d5[1]
        vmla.f32        d6,  d2,  d4[1]
        vmla.f32        d6,  d21, d5[1]
        vadd.f32        d22, d22, d23
        vst1.32         {d20}, [r7,:64]
        vst1.32         {d6},  [lr,:64]
        vst1.32         {d0[0]},  [r0,:32]
        vst1.32         {d22}, [r5,:64]

        cmp             r6,  #0
        popeq           {r4-r8,pc}              @ inverse bit clear: done

        vmul.f32        d22, d22, d18           @ scale the first pair by 0.5
        vst1.32         {d22}, [r5,:64]
        add             r0,  r4,  #20
        mov             r1,  r5
        bl              X(ff_fft_permute_neon)
        add             r0,  r4,  #20
        mov             r1,  r5
        pop             {r4-r8,lr}
        b               X(ff_fft_calc_neon)     @ tail call
endfunc
39.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 39.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_arm.S Mon Aug 27 12:09:56 2012 +0200 39.3 @@ -0,0 +1,486 @@ 39.4 +/* 39.5 + * simple_idct_arm.S 39.6 + * Copyright (C) 2002 Frederic 'dilb' Boulay 39.7 + * 39.8 + * Author: Frederic Boulay <dilb@handhelds.org> 39.9 + * 39.10 + * The function defined in this file is derived from the simple_idct function 39.11 + * from the libavcodec library part of the FFmpeg project. 39.12 + * 39.13 + * This file is part of FFmpeg. 39.14 + * 39.15 + * FFmpeg is free software; you can redistribute it and/or 39.16 + * modify it under the terms of the GNU Lesser General Public 39.17 + * License as published by the Free Software Foundation; either 39.18 + * version 2.1 of the License, or (at your option) any later version. 39.19 + * 39.20 + * FFmpeg is distributed in the hope that it will be useful, 39.21 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 39.22 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 39.23 + * Lesser General Public License for more details. 
39.24 + * 39.25 + * You should have received a copy of the GNU Lesser General Public 39.26 + * License along with FFmpeg; if not, write to the Free Software 39.27 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 39.28 + */ 39.29 + 39.30 +#include "asm.S" 39.31 + 39.32 +/* useful constants for the algorithm, they are save in __constant_ptr__ at */ 39.33 +/* the end of the source code.*/ 39.34 +#define W1 22725 39.35 +#define W2 21407 39.36 +#define W3 19266 39.37 +#define W4 16383 39.38 +#define W5 12873 39.39 +#define W6 8867 39.40 +#define W7 4520 39.41 +#define MASK_MSHW 0xFFFF0000 39.42 + 39.43 +/* offsets of the constants in the vector */ 39.44 +#define offW1 0 39.45 +#define offW2 4 39.46 +#define offW3 8 39.47 +#define offW4 12 39.48 +#define offW5 16 39.49 +#define offW6 20 39.50 +#define offW7 24 39.51 +#define offMASK_MSHW 28 39.52 + 39.53 +#define ROW_SHIFT 11 39.54 +#define ROW_SHIFT2MSHW (16-11) 39.55 +#define COL_SHIFT 20 39.56 +#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ 39.57 +#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ 39.58 + 39.59 + 39.60 + .text 39.61 + 39.62 +function ff_simple_idct_arm, export=1 39.63 + @@ void simple_idct_arm(int16_t *block) 39.64 + @@ save stack for reg needed (take all of them), 39.65 + @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block 39.66 + @@ so it must not be overwritten, if it is not saved!! 39.67 + @@ R12 is another scratch register, so it should not be saved too 39.68 + @@ save all registers 39.69 + stmfd sp!, {r4-r11, r14} @ R14 is also called LR 39.70 + @@ at this point, R0=block, other registers are free. 39.71 + add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block. 
39.72 + adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it 39.73 + @@ add 2 temporary variables in the stack: R0 and R14 39.74 + sub sp, sp, #8 @ allow 2 local variables 39.75 + str r0, [sp, #0] @ save block in sp[0] 39.76 + @@ stack status 39.77 + @@ sp+4 free 39.78 + @@ sp+0 R0 (block) 39.79 + 39.80 + 39.81 + @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free 39.82 + 39.83 + 39.84 +__row_loop: 39.85 + @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :) 39.86 + ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer) 39.87 + ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1] 39.88 + ldr r3, [r14, #8] @ R3=ROWr32[2] 39.89 + ldr r4, [r14, #12] @ R4=ROWr32[3] 39.90 + @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop), 39.91 + @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row) 39.92 + @@ else follow the complete algorithm. 
39.93 + @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], 39.94 + @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free 39.95 + orr r5, r4, r3 @ R5=R4 | R3 39.96 + orr r5, r5, r2 @ R5=R4 | R3 | R2 39.97 + orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null) 39.98 + beq __end_row_loop 39.99 + mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later) 39.100 + ldrsh r6, [r14, #0] @ R6=ROWr16[0] 39.101 + orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7 39.102 + beq __almost_empty_row 39.103 + 39.104 +__b_evaluation: 39.105 + @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3], 39.106 + @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free, 39.107 + @@ R12=__const_ptr_, R14=&block[n] 39.108 + @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3 39.109 + 39.110 + @@ MUL16(b0, W1, row[1]); 39.111 + @@ MUL16(b1, W3, row[1]); 39.112 + @@ MUL16(b2, W5, row[1]); 39.113 + @@ MUL16(b3, W7, row[1]); 39.114 + @@ MAC16(b0, W3, row[3]); 39.115 + @@ MAC16(b1, -W7, row[3]); 39.116 + @@ MAC16(b2, -W1, row[3]); 39.117 + @@ MAC16(b3, -W5, row[3]); 39.118 + ldr r8, [r12, #offW1] @ R8=W1 39.119 + mov r2, r2, asr #16 @ R2=ROWr16[3] 39.120 + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 39.121 + ldr r9, [r12, #offW3] @ R9=W3 39.122 + ldr r10, [r12, #offW5] @ R10=W5 39.123 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 39.124 + ldr r11, [r12, #offW7] @ R11=W7 39.125 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 39.126 + mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 39.127 + teq r2, #0 @ if null avoid muls 39.128 + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have 
the possibility to save 1 cycle) 39.129 + rsbne r2, r2, #0 @ R2=-ROWr16[3] 39.130 + mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 39.131 + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 39.132 + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 39.133 + 39.134 + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], 39.135 + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, 39.136 + @@ R12=__const_ptr_, R14=&block[n] 39.137 + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; 39.138 + @@ if (temp != 0) {} 39.139 + orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3] 39.140 + beq __end_b_evaluation 39.141 + 39.142 + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], 39.143 + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, 39.144 + @@ R12=__const_ptr_, R14=&block[n] 39.145 + @@ MAC16(b0, W5, row[5]); 39.146 + @@ MAC16(b2, W7, row[5]); 39.147 + @@ MAC16(b3, W3, row[5]); 39.148 + @@ MAC16(b1, -W1, row[5]); 39.149 + @@ MAC16(b0, W7, row[7]); 39.150 + @@ MAC16(b2, W3, row[7]); 39.151 + @@ MAC16(b3, -W1, row[7]); 39.152 + @@ MAC16(b1, -W5, row[7]); 39.153 + mov r3, r3, asr #16 @ R3=ROWr16[5] 39.154 + teq r3, #0 @ if null avoid muls 39.155 + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 39.156 + mov r4, r4, asr #16 @ R4=ROWr16[7] 39.157 + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 39.158 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 39.159 + rsbne r3, r3, #0 @ R3=-ROWr16[5] 39.160 + mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 39.161 + @@ R3 is free now 39.162 + teq r4, #0 @ if null avoid muls 39.163 + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 39.164 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 39.165 + rsbne r4, r4, #0 @ R4=-ROWr16[7] 39.166 + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 39.167 + mlane r1, r10, 
r4, r1 @ R1-=W5*ROWr16[7]=b1 39.168 + @@ R4 is free now 39.169 +__end_b_evaluation: 39.170 + @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free), 39.171 + @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), 39.172 + @@ R12=__const_ptr_, R14=&block[n] 39.173 + 39.174 +__a_evaluation: 39.175 + @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); 39.176 + @@ a1 = a0 + W6 * row[2]; 39.177 + @@ a2 = a0 - W6 * row[2]; 39.178 + @@ a3 = a0 - W2 * row[2]; 39.179 + @@ a0 = a0 + W2 * row[2]; 39.180 + ldr r9, [r12, #offW4] @ R9=W4 39.181 + mul r6, r9, r6 @ R6=W4*ROWr16[0] 39.182 + ldr r10, [r12, #offW6] @ R10=W6 39.183 + ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet) 39.184 + add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0) 39.185 + 39.186 + mul r11, r10, r4 @ R11=W6*ROWr16[2] 39.187 + ldr r8, [r12, #offW2] @ R8=W2 39.188 + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) 39.189 + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; 39.190 + @@ if (temp != 0) {} 39.191 + teq r2, #0 39.192 + beq __end_bef_a_evaluation 39.193 + 39.194 + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) 39.195 + mul r11, r8, r4 @ R11=W2*ROWr16[2] 39.196 + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) 39.197 + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) 39.198 + 39.199 + 39.200 + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, 39.201 + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), 39.202 + @@ R12=__const_ptr_, R14=&block[n] 39.203 + 39.204 + 39.205 + @@ a0 += W4*row[4] 39.206 + @@ a1 -= W4*row[4] 39.207 + @@ a2 -= W4*row[4] 39.208 + @@ a3 += W4*row[4] 39.209 + ldrsh r11, [r14, #8] @ R11=ROWr16[4] 39.210 + teq r11, #0 @ if null avoid muls 39.211 + mulne r11, r9, r11 @ R11=W4*ROWr16[4] 39.212 + @@ R9 is free now 39.213 + ldrsh r9, [r14, #12] @ R9=ROWr16[6] 39.214 + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) 39.215 + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) 39.216 + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) 39.217 + addne 
r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) 39.218 + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead 39.219 + teq r9, #0 @ if null avoid muls 39.220 + mulne r11, r10, r9 @ R11=W6*ROWr16[6] 39.221 + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) 39.222 + mulne r10, r8, r9 @ R10=W2*ROWr16[6] 39.223 + @@ a0 += W6*row[6]; 39.224 + @@ a3 -= W6*row[6]; 39.225 + @@ a1 -= W2*row[6]; 39.226 + @@ a2 += W2*row[6]; 39.227 + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) 39.228 + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) 39.229 + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) 39.230 + 39.231 +__end_a_evaluation: 39.232 + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, 39.233 + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), 39.234 + @@ R12=__const_ptr_, R14=&block[n] 39.235 + @@ row[0] = (a0 + b0) >> ROW_SHIFT; 39.236 + @@ row[1] = (a1 + b1) >> ROW_SHIFT; 39.237 + @@ row[2] = (a2 + b2) >> ROW_SHIFT; 39.238 + @@ row[3] = (a3 + b3) >> ROW_SHIFT; 39.239 + @@ row[4] = (a3 - b3) >> ROW_SHIFT; 39.240 + @@ row[5] = (a2 - b2) >> ROW_SHIFT; 39.241 + @@ row[6] = (a1 - b1) >> ROW_SHIFT; 39.242 + @@ row[7] = (a0 - b0) >> ROW_SHIFT; 39.243 + add r8, r6, r0 @ R8=a0+b0 39.244 + add r9, r2, r1 @ R9=a1+b1 39.245 + @@ put 2 16 bits half-words in a 32bits word 39.246 + @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!) 
39.247 + ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000 39.248 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5) 39.249 + mvn r11, r10 @ R11= NOT R10= 0x0000FFFF 39.250 + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11) 39.251 + orr r8, r8, r9 39.252 + str r8, [r14, #0] 39.253 + 39.254 + add r8, r3, r5 @ R8=a2+b2 39.255 + add r9, r4, r7 @ R9=a3+b3 39.256 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5) 39.257 + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11) 39.258 + orr r8, r8, r9 39.259 + str r8, [r14, #4] 39.260 + 39.261 + sub r8, r4, r7 @ R8=a3-b3 39.262 + sub r9, r3, r5 @ R9=a2-b2 39.263 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5) 39.264 + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11) 39.265 + orr r8, r8, r9 39.266 + str r8, [r14, #8] 39.267 + 39.268 + sub r8, r2, r1 @ R8=a1-b1 39.269 + sub r9, r6, r0 @ R9=a0-b0 39.270 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5) 39.271 + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11) 39.272 + orr r8, r8, r9 39.273 + str r8, [r14, #12] 39.274 + 39.275 + bal __end_row_loop 39.276 + 39.277 +__almost_empty_row: 39.278 + @@ the row was empty, except ROWr16[0], now, management of this special case 39.279 + @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], 39.280 + @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], 39.281 + @@ R8=0xFFFF (temp), R9-R11 free 39.282 + mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run). 39.283 + sub r8, r8, #1 @ R8 is now ready. 
39.284 + and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF 39.285 + orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16) 39.286 + str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5 39.287 + str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5 39.288 + str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5 39.289 + str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5 39.290 + 39.291 +__end_row_loop: 39.292 + @@ at this point, R0-R11 (free) 39.293 + @@ R12=__const_ptr_, R14=&block[n] 39.294 + ldr r0, [sp, #0] @ R0=block 39.295 + teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished. 39.296 + sub r14, r14, #16 39.297 + bne __row_loop 39.298 + 39.299 + 39.300 + 39.301 + @@ at this point, R0=block, R1-R11 (free) 39.302 + @@ R12=__const_ptr_, R14=&block[n] 39.303 + add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. 39.304 +__col_loop: 39.305 + 39.306 +__b_evaluation2: 39.307 + @@ at this point, R0=block (temp), R1-R11 (free) 39.308 + @@ R12=__const_ptr_, R14=&block[n] 39.309 + @@ proceed with b0-b3 first, followed by a0-a3 39.310 + @@ MUL16(b0, W1, col[8x1]); 39.311 + @@ MUL16(b1, W3, col[8x1]); 39.312 + @@ MUL16(b2, W5, col[8x1]); 39.313 + @@ MUL16(b3, W7, col[8x1]); 39.314 + @@ MAC16(b0, W3, col[8x3]); 39.315 + @@ MAC16(b1, -W7, col[8x3]); 39.316 + @@ MAC16(b2, -W1, col[8x3]); 39.317 + @@ MAC16(b3, -W5, col[8x3]); 39.318 + ldr r8, [r12, #offW1] @ R8=W1 39.319 + ldrsh r7, [r14, #16] 39.320 + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 39.321 + ldr r9, [r12, #offW3] @ R9=W3 39.322 + ldr r10, [r12, #offW5] @ R10=W5 39.323 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 39.324 + ldr r11, [r12, #offW7] @ R11=W7 39.325 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 39.326 + ldrsh r2, [r14, #48] 39.327 + mul r7, 
r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) 39.328 + teq r2, #0 @ if 0, then avoid muls 39.329 + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 39.330 + rsbne r2, r2, #0 @ R2=-ROWr16[3] 39.331 + mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 39.332 + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 39.333 + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) 39.334 + 39.335 + @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), 39.336 + @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, 39.337 + @@ R12=__const_ptr_, R14=&block[n] 39.338 + @@ MAC16(b0, W5, col[5x8]); 39.339 + @@ MAC16(b2, W7, col[5x8]); 39.340 + @@ MAC16(b3, W3, col[5x8]); 39.341 + @@ MAC16(b1, -W1, col[5x8]); 39.342 + @@ MAC16(b0, W7, col[7x8]); 39.343 + @@ MAC16(b2, W3, col[7x8]); 39.344 + @@ MAC16(b3, -W1, col[7x8]); 39.345 + @@ MAC16(b1, -W5, col[7x8]); 39.346 + ldrsh r3, [r14, #80] @ R3=COLr16[5x8] 39.347 + teq r3, #0 @ if 0 then avoid muls 39.348 + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 39.349 + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 39.350 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 39.351 + rsbne r3, r3, #0 @ R3=-ROWr16[5x8] 39.352 + ldrsh r4, [r14, #112] @ R4=COLr16[7x8] 39.353 + mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1 39.354 + @@ R3 is free now 39.355 + teq r4, #0 @ if 0 then avoid muls 39.356 + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 39.357 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 39.358 + rsbne r4, r4, #0 @ R4=-ROWr16[7x8] 39.359 + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 39.360 + mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 39.361 + @@ R4 is free now 39.362 +__end_b_evaluation2: 39.363 + @@ 
at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), 39.364 + @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), 39.365 + @@ R12=__const_ptr_, R14=&block[n] 39.366 + 39.367 +__a_evaluation2: 39.368 + @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); 39.369 + @@ a1 = a0 + W6 * row[2]; 39.370 + @@ a2 = a0 - W6 * row[2]; 39.371 + @@ a3 = a0 - W2 * row[2]; 39.372 + @@ a0 = a0 + W2 * row[2]; 39.373 + ldrsh r6, [r14, #0] 39.374 + ldr r9, [r12, #offW4] @ R9=W4 39.375 + mul r6, r9, r6 @ R6=W4*ROWr16[0] 39.376 + ldr r10, [r12, #offW6] @ R10=W6 39.377 + ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) 39.378 + add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) 39.379 + mul r11, r10, r4 @ R11=W6*ROWr16[2] 39.380 + ldr r8, [r12, #offW2] @ R8=W2 39.381 + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) 39.382 + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) 39.383 + mul r11, r8, r4 @ R11=W2*ROWr16[2] 39.384 + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) 39.385 + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) 39.386 + 39.387 + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, 39.388 + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), 39.389 + @@ R12=__const_ptr_, R14=&block[n] 39.390 + @@ a0 += W4*row[4] 39.391 + @@ a1 -= W4*row[4] 39.392 + @@ a2 -= W4*row[4] 39.393 + @@ a3 += W4*row[4] 39.394 + ldrsh r11, [r14, #64] @ R11=ROWr16[4] 39.395 + teq r11, #0 @ if null avoid muls 39.396 + mulne r11, r9, r11 @ R11=W4*ROWr16[4] 39.397 + @@ R9 is free now 39.398 + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) 39.399 + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) 39.400 + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) 39.401 + ldrsh r9, [r14, #96] @ R9=ROWr16[6] 39.402 + addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) 39.403 + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead 39.404 + teq r9, #0 @ if null avoid muls 39.405 + mulne r11, r10, r9 @ R11=W6*ROWr16[6] 39.406 + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) 39.407 + mulne r10, r8, r9 @ 
R10=W2*ROWr16[6] 39.408 + @@ a0 += W6*row[6]; 39.409 + @@ a3 -= W6*row[6]; 39.410 + @@ a1 -= W2*row[6]; 39.411 + @@ a2 += W2*row[6]; 39.412 + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) 39.413 + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) 39.414 + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) 39.415 +__end_a_evaluation2: 39.416 + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, 39.417 + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), 39.418 + @@ R12=__const_ptr_, R14=&block[n] 39.419 + @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); 39.420 + @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); 39.421 + @@ col[16] = ((a2 + b2) >> COL_SHIFT); 39.422 + @@ col[24] = ((a3 + b3) >> COL_SHIFT); 39.423 + @@ col[32] = ((a3 - b3) >> COL_SHIFT); 39.424 + @@ col[40] = ((a2 - b2) >> COL_SHIFT); 39.425 + @@ col[48] = ((a1 - b1) >> COL_SHIFT); 39.426 + @@ col[56] = ((a0 - b0) >> COL_SHIFT); 39.427 + @@@@@ no optimization here @@@@@ 39.428 + add r8, r6, r0 @ R8=a0+b0 39.429 + add r9, r2, r1 @ R9=a1+b1 39.430 + mov r8, r8, asr #COL_SHIFT 39.431 + mov r9, r9, asr #COL_SHIFT 39.432 + strh r8, [r14, #0] 39.433 + strh r9, [r14, #16] 39.434 + add r8, r3, r5 @ R8=a2+b2 39.435 + add r9, r4, r7 @ R9=a3+b3 39.436 + mov r8, r8, asr #COL_SHIFT 39.437 + mov r9, r9, asr #COL_SHIFT 39.438 + strh r8, [r14, #32] 39.439 + strh r9, [r14, #48] 39.440 + sub r8, r4, r7 @ R8=a3-b3 39.441 + sub r9, r3, r5 @ R9=a2-b2 39.442 + mov r8, r8, asr #COL_SHIFT 39.443 + mov r9, r9, asr #COL_SHIFT 39.444 + strh r8, [r14, #64] 39.445 + strh r9, [r14, #80] 39.446 + sub r8, r2, r1 @ R8=a1-b1 39.447 + sub r9, r6, r0 @ R9=a0-b0 39.448 + mov r8, r8, asr #COL_SHIFT 39.449 + mov r9, r9, asr #COL_SHIFT 39.450 + strh r8, [r14, #96] 39.451 + strh r9, [r14, #112] 39.452 + 39.453 +__end_col_loop: 39.454 + @@ at this point, R0-R11 (free) 39.455 + @@ R12=__const_ptr_, R14=&block[n] 39.456 + ldr r0, [sp, #0] @ R0=block 39.457 + teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. 
39.458 + sub r14, r14, #2 39.459 + bne __col_loop 39.460 + 39.461 + 39.462 + 39.463 + 39.464 +__end_simple_idct_arm: 39.465 + @@ restore registers to previous status! 39.466 + add sp, sp, #8 @@ the local variables! 39.467 + ldmfd sp!, {r4-r11, r15} @@ update PC with LR content. 39.468 + 39.469 + 39.470 + 39.471 +@@ kind of sub-function, here not to overload the common case. 39.472 +__end_bef_a_evaluation: 39.473 + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) 39.474 + mul r11, r8, r4 @ R11=W2*ROWr16[2] 39.475 + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) 39.476 + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) 39.477 + bal __end_a_evaluation 39.478 + 39.479 + 39.480 +__constant_ptr__: @@ see #defines at the beginning of the source code for values. 39.481 + .align 39.482 + .word W1 39.483 + .word W2 39.484 + .word W3 39.485 + .word W4 39.486 + .word W5 39.487 + .word W6 39.488 + .word W7 39.489 + .word MASK_MSHW
40.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 40.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv5te.S Mon Aug 27 12:09:56 2012 +0200 40.3 @@ -0,0 +1,703 @@ 40.4 +/* 40.5 + * Simple IDCT 40.6 + * 40.7 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> 40.8 + * Copyright (c) 2006 Mans Rullgard <mans@mansr.com> 40.9 + * 40.10 + * This file is part of FFmpeg. 40.11 + * 40.12 + * FFmpeg is free software; you can redistribute it and/or 40.13 + * modify it under the terms of the GNU Lesser General Public 40.14 + * License as published by the Free Software Foundation; either 40.15 + * version 2.1 of the License, or (at your option) any later version. 40.16 + * 40.17 + * FFmpeg is distributed in the hope that it will be useful, 40.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 40.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 40.20 + * Lesser General Public License for more details. 40.21 + * 40.22 + * You should have received a copy of the GNU Lesser General Public 40.23 + * License along with FFmpeg; if not, write to the Free Software 40.24 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 40.25 + */ 40.26 + 40.27 +#include "asm.S" 40.28 + /* W1..W7: fixed-point IDCT coefficients, scaled by 1<<14 per the formula beside each one. W4 is 16383 = (1<<14)-1; the code below rebuilds it with mov #16384 / sub #1, or multiplies by it with rsb x, x, x, lsl #14. */ 40.29 +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 40.30 +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 40.31 +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 40.32 +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 40.33 +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 40.34 +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 40.35 +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ /* ROW_SHIFT and COL_SHIFT are the right shifts applied to the row-pass and column-pass results; most instructions below hard-code them as #11 and #20. */ 40.36 +#define ROW_SHIFT 11 40.37 +#define COL_SHIFT 20 40.38 + /* Two 16-bit coefficients packed into one word: the first-named one in the low halfword, the second in the high halfword, so a single register feeds the halfword multiplies (smulbb, smultb, smlabt, ...). */ 40.39 +#define W13 (W1 | (W3 << 16)) 40.40 +#define W26 (W2 | (W6 << 16)) 40.41 +#define W57 (W5 | (W7 << 16)) 40.42 + /* Word constants holding the packed pairs; the code reads them with ldr <reg>, w13 / w26 / w57. */ 40.43 + .text 40.44 + .align 40.45 +w13: .long W13 40.46 +w26: .long W26 40.47 +w57: .long 
W57 40.48 + /* idct_row_armv5te: in-place row pass over the eight 16-bit coefficients at a1. A row whose entries 1..7 are all zero takes the row_dc_only shortcut. a2-a4, ip, lr, v1-v7 and fp are used as scratch and not preserved (only lr is stacked for the return); a1 is left unchanged. */ 40.49 +function idct_row_armv5te 40.50 + str lr, [sp, #-4]! 40.51 + 40.52 + ldrd v1, [a1, #8] /* v1 = row[5:4], v2 = row[7:6] */ 40.53 + ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ 40.54 + orrs v1, v1, v2 40.55 + cmpeq v1, a4 40.56 + cmpeq v1, a3, lsr #16 40.57 + beq row_dc_only /* taken only when row[1] through row[7] are all zero */ 40.58 + 40.59 + mov v1, #(1<<(ROW_SHIFT-1)) 40.60 + mov ip, #16384 40.61 + sub ip, ip, #1 /* ip = W4 */ 40.62 + smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ 40.63 + ldr ip, w26 /* ip = W2 | (W6 << 16) */ 40.64 + smultb a2, ip, a4 /* a2 = W6*row[2] */ 40.65 + smulbb lr, ip, a4 /* lr = W2*row[2] */ 40.66 + add v2, v1, a2 40.67 + sub v3, v1, a2 40.68 + sub v4, v1, lr 40.69 + add v1, v1, lr 40.70 + 40.71 + ldr ip, w13 /* ip = W1 | (W3 << 16) */ 40.72 + ldr lr, w57 /* lr = W5 | (W7 << 16) */ 40.73 + smulbt v5, ip, a3 40.74 + smultt v6, lr, a4 40.75 + smlatt v5, ip, a4, v5 40.76 + smultt a2, ip, a3 40.77 + smulbt v7, lr, a3 40.78 + sub v6, v6, a2 40.79 + smulbt a2, ip, a4 40.80 + smultt fp, lr, a3 40.81 + sub v7, v7, a2 40.82 + smulbt a2, lr, a4 40.83 + ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ 40.84 + sub fp, fp, a2 /* odd part so far: v5 = W1*row[1]+W3*row[3], v6 = W7*row[3]-W3*row[1], v7 = W5*row[1]-W1*row[3], fp = W7*row[1]-W5*row[3] */ 40.85 + 40.86 + orrs a2, a3, a4 40.87 + beq 1f /* row[4] through row[7] all zero: nothing more to accumulate */ 40.88 + 40.89 + smlabt v5, lr, a3, v5 40.90 + smlabt v6, ip, a3, v6 40.91 + smlatt v5, lr, a4, v5 40.92 + smlabt v6, lr, a4, v6 40.93 + smlatt v7, lr, a3, v7 40.94 + smlatt fp, ip, a3, fp 40.95 + smulbt a2, ip, a4 40.96 + smlatt v7, ip, a4, v7 40.97 + sub fp, fp, a2 40.98 + 40.99 + ldr ip, w26 /* ip = W2 | (W6 << 16) */ 40.100 + mov a2, #16384 40.101 + sub a2, a2, #1 /* a2 = W4 */ 40.102 + smulbb a2, a2, a3 /* a2 = W4*row[4] */ 40.103 + smultb lr, ip, a4 /* lr = W6*row[6] */ 40.104 + add v1, v1, a2 /* v1 += W4*row[4] */ 40.105 + add v1, v1, lr /* v1 += W6*row[6] */ 40.106 + add v4, v4, a2 /* v4 += W4*row[4] */ 40.107 + sub v4, v4, lr /* v4 -= W6*row[6] */ 40.108 + smulbb lr, ip, a4 /* lr = W2*row[6] */ 40.109 + sub v2, v2, a2 /* v2 -= W4*row[4] */ 40.110 + sub v2, v2, lr /* v2 -= W2*row[6] */ 40.111 + sub v3, v3, a2 /* v3 -= W4*row[4] */ 40.112 + add v3, v3, lr /* v3 += W2*row[6] 
*/ 40.113 + /* Output stage: each sum or difference is shifted right by 11 (the ROW_SHIFT value) and two results are packed per word; bic #0x1f0000 clears the bits the logical shift leaves above the low halfword. v6 carries the opposite sign to the other odd terms, hence sub for row[1] and add for row[6]. */ 40.114 +1: add a2, v1, v5 40.115 + mov a3, a2, lsr #11 40.116 + bic a3, a3, #0x1f0000 40.117 + sub a2, v2, v6 40.118 + mov a2, a2, lsr #11 40.119 + add a3, a3, a2, lsl #16 40.120 + add a2, v3, v7 40.121 + mov a4, a2, lsr #11 40.122 + bic a4, a4, #0x1f0000 40.123 + add a2, v4, fp 40.124 + mov a2, a2, lsr #11 40.125 + add a4, a4, a2, lsl #16 40.126 + strd a3, [a1] 40.127 + 40.128 + sub a2, v4, fp 40.129 + mov a3, a2, lsr #11 40.130 + bic a3, a3, #0x1f0000 40.131 + sub a2, v3, v7 40.132 + mov a2, a2, lsr #11 40.133 + add a3, a3, a2, lsl #16 40.134 + add a2, v2, v6 40.135 + mov a4, a2, lsr #11 40.136 + bic a4, a4, #0x1f0000 40.137 + sub a2, v1, v5 40.138 + mov a2, a2, lsr #11 40.139 + add a4, a4, a2, lsl #16 40.140 + strd a3, [a1, #8] 40.141 + 40.142 + ldr pc, [sp], #4 40.143 + /* row_dc_only: only row[0] is non-zero, so all eight outputs become row[0] << 3. The value is duplicated into both halfwords; bic #0xe000 clears the top three bits of the low halfword so the shift cannot carry them into the high halfword. */ 40.144 +row_dc_only: 40.145 + orr a3, a3, a3, lsl #16 40.146 + bic a3, a3, #0xe000 40.147 + mov a3, a3, lsl #3 40.148 + mov a4, a3 40.149 + strd a3, [a1] 40.150 + strd a3, [a1, #8] 40.151 + 40.152 + ldr pc, [sp], #4 40.153 +endfunc 40.154 + /* idct_col: shared body of the column pass for the two adjacent 16-bit columns held in the word at a1 (rows are 16 bytes apart). The even part (rows 0, 2, 4, 6) is accumulated first and pushed as eight words, as (column 0, column 1) pairs in the order used for output rows 0/7, 1/6, 2/5, 3/4. The odd part (rows 1, 3, 5, 7) is then left in v1/v2, v3/v4, v5/v6 and v7/fp in the same order; the v3/v4 pair is sign-flipped (see the rsb instructions), which is why the users subtract it for row 1 and add it for row 6. The invoking function must pop the 32 bytes pushed here. */ 40.155 + .macro idct_col 40.156 + ldr a4, [a1] /* a4 = col[1:0] */ 40.157 + mov ip, #16384 40.158 + sub ip, ip, #1 /* ip = W4 */ 40.159 +#if 0 40.160 + mov v1, #(1<<(COL_SHIFT-1)) 40.161 + smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */ 40.162 + smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */ 40.163 + ldr a4, [a1, #(16*4)] 40.164 +#else 40.165 + mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */ 40.166 + add v2, v1, a4, asr #16 40.167 + rsb v2, v2, v2, lsl #14 /* v2 = v2*((1<<14)-1), i.e. W4*(col[1] + rounding term) */ 40.168 + mov a4, a4, lsl #16 40.169 + add v1, v1, a4, asr #16 /* lsl/asr pair sign-extends the low halfword (col[0]) */ 40.170 + ldr a4, [a1, #(16*4)] 40.171 + rsb v1, v1, v1, lsl #14 40.172 +#endif 40.173 + 40.174 + smulbb lr, ip, a4 40.175 + smulbt a3, ip, a4 40.176 + sub v3, v1, lr 40.177 + sub v5, v1, lr 40.178 + add v7, v1, lr 40.179 + add v1, v1, lr 40.180 + sub v4, v2, a3 40.181 + sub v6, v2, a3 40.182 + add fp, v2, a3 40.183 + ldr ip, w26 40.184 + ldr a4, [a1, #(16*2)] 40.185 + add v2, v2, a3 
40.186 + 40.187 + smulbb lr, ip, a4 40.188 + smultb a3, ip, a4 40.189 + add v1, v1, lr 40.190 + sub v7, v7, lr 40.191 + add v3, v3, a3 40.192 + sub v5, v5, a3 40.193 + smulbt lr, ip, a4 40.194 + smultt a3, ip, a4 40.195 + add v2, v2, lr 40.196 + sub fp, fp, lr 40.197 + add v4, v4, a3 40.198 + ldr a4, [a1, #(16*6)] 40.199 + sub v6, v6, a3 40.200 + 40.201 + smultb lr, ip, a4 40.202 + smulbb a3, ip, a4 40.203 + add v1, v1, lr 40.204 + sub v7, v7, lr 40.205 + sub v3, v3, a3 40.206 + add v5, v5, a3 40.207 + smultt lr, ip, a4 40.208 + smulbt a3, ip, a4 40.209 + add v2, v2, lr 40.210 + sub fp, fp, lr 40.211 + sub v4, v4, a3 40.212 + add v6, v6, a3 40.213 + 40.214 + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} /* park the even-part sums on the stack; the same registers are reused for the odd part */ 40.215 + 40.216 + ldr ip, w13 40.217 + ldr a4, [a1, #(16*1)] 40.218 + ldr lr, w57 40.219 + smulbb v1, ip, a4 40.220 + smultb v3, ip, a4 40.221 + smulbb v5, lr, a4 40.222 + smultb v7, lr, a4 40.223 + smulbt v2, ip, a4 40.224 + smultt v4, ip, a4 40.225 + smulbt v6, lr, a4 40.226 + smultt fp, lr, a4 40.227 + rsb v4, v4, #0 40.228 + ldr a4, [a1, #(16*3)] 40.229 + rsb v3, v3, #0 40.230 + 40.231 + smlatb v1, ip, a4, v1 40.232 + smlatb v3, lr, a4, v3 40.233 + smulbb a3, ip, a4 40.234 + smulbb a2, lr, a4 40.235 + sub v5, v5, a3 40.236 + sub v7, v7, a2 40.237 + smlatt v2, ip, a4, v2 40.238 + smlatt v4, lr, a4, v4 40.239 + smulbt a3, ip, a4 40.240 + smulbt a2, lr, a4 40.241 + sub v6, v6, a3 40.242 + ldr a4, [a1, #(16*5)] 40.243 + sub fp, fp, a2 40.244 + 40.245 + smlabb v1, lr, a4, v1 40.246 + smlabb v3, ip, a4, v3 40.247 + smlatb v5, lr, a4, v5 40.248 + smlatb v7, ip, a4, v7 40.249 + smlabt v2, lr, a4, v2 40.250 + smlabt v4, ip, a4, v4 40.251 + smlatt v6, lr, a4, v6 40.252 + ldr a3, [a1, #(16*7)] 40.253 + smlatt fp, ip, a4, fp 40.254 + 40.255 + smlatb v1, lr, a3, v1 40.256 + smlabb v3, lr, a3, v3 40.257 + smlatb v5, ip, a3, v5 40.258 + smulbb a4, ip, a3 40.259 + smlatt v2, lr, a3, v2 40.260 + sub v7, v7, a4 40.261 + smlabt v4, lr, a3, v4 40.262 + smulbt a4, ip, a3 40.263 + 
smlatt v6, ip, a3, v6 40.264 + sub fp, fp, a4 40.265 + .endm 40.266 + /* idct_col_armv5te: column pass for the two 16-bit columns at a1, written back in place. Each result is shifted right by 20 (the COL_SHIFT value) and the two columns are packed into one word per row; rows are stored in the order 0, 7, 1, 6, 2, 5, 3, 4. Scratch registers are not preserved. */ 40.267 +function idct_col_armv5te 40.268 + str lr, [sp, #-4]! 40.269 + 40.270 + idct_col 40.271 + 40.272 + ldmfd sp!, {a3, a4} /* pop one even-part pair: a3 = column 0, a4 = column 1 */ 40.273 + adds a2, a3, v1 40.274 + mov a2, a2, lsr #20 40.275 + orrmi a2, a2, #0xf000 /* lsr left a 12-bit field; if the sum was negative (N from adds), fill bits 12-15 so the low halfword is sign-extended */ 40.276 + add ip, a4, v2 40.277 + mov ip, ip, asr #20 40.278 + orr a2, a2, ip, lsl #16 40.279 + str a2, [a1] 40.280 + subs a3, a3, v1 40.281 + mov a2, a3, lsr #20 40.282 + orrmi a2, a2, #0xf000 40.283 + sub a4, a4, v2 40.284 + mov a4, a4, asr #20 40.285 + orr a2, a2, a4, lsl #16 40.286 + ldmfd sp!, {a3, a4} 40.287 + str a2, [a1, #(16*7)] 40.288 + 40.289 + subs a2, a3, v3 40.290 + mov a2, a2, lsr #20 40.291 + orrmi a2, a2, #0xf000 40.292 + sub ip, a4, v4 40.293 + mov ip, ip, asr #20 40.294 + orr a2, a2, ip, lsl #16 40.295 + str a2, [a1, #(16*1)] 40.296 + adds a3, a3, v3 40.297 + mov a2, a3, lsr #20 40.298 + orrmi a2, a2, #0xf000 40.299 + add a4, a4, v4 40.300 + mov a4, a4, asr #20 40.301 + orr a2, a2, a4, lsl #16 40.302 + ldmfd sp!, {a3, a4} 40.303 + str a2, [a1, #(16*6)] 40.304 + 40.305 + adds a2, a3, v5 40.306 + mov a2, a2, lsr #20 40.307 + orrmi a2, a2, #0xf000 40.308 + add ip, a4, v6 40.309 + mov ip, ip, asr #20 40.310 + orr a2, a2, ip, lsl #16 40.311 + str a2, [a1, #(16*2)] 40.312 + subs a3, a3, v5 40.313 + mov a2, a3, lsr #20 40.314 + orrmi a2, a2, #0xf000 40.315 + sub a4, a4, v6 40.316 + mov a4, a4, asr #20 40.317 + orr a2, a2, a4, lsl #16 40.318 + ldmfd sp!, {a3, a4} 40.319 + str a2, [a1, #(16*5)] 40.320 + 40.321 + adds a2, a3, v7 40.322 + mov a2, a2, lsr #20 40.323 + orrmi a2, a2, #0xf000 40.324 + add ip, a4, fp 40.325 + mov ip, ip, asr #20 40.326 + orr a2, a2, ip, lsl #16 40.327 + str a2, [a1, #(16*3)] 40.328 + subs a3, a3, v7 40.329 + mov a2, a3, lsr #20 40.330 + orrmi a2, a2, #0xf000 40.331 + sub a4, a4, fp 40.332 + mov a4, a4, asr #20 40.333 + orr a2, a2, a4, lsl #16 40.334 + str a2, [a1, #(16*4)] 40.335 + 40.336 + ldr pc, [sp], #4 40.337 +endfunc 40.338 + /* idct_col_put_armv5te: column pass for two columns whose results are shifted right by 20, clamped to 0..255 and stored as two bytes per row with strh. The output pointer and the row stride are read from the two words the exported wrapper pushed (its a1 and a2; presumably dest and line size, to be confirmed against the C prototype). The saved pointer is advanced by 2 for the next call. Rows are written in the order 0, 7, 1, 6, 2, 5, 3, 4. */ 40.339 +function 
idct_col_put_armv5te 40.340 + str lr, [sp, #-4]! 40.341 + 40.342 + idct_col 40.343 + 40.344 + ldmfd sp!, {a3, a4} 40.345 + ldr lr, [sp, #32] /* lr = a2 word pushed by the exported wrapper (row stride) */ 40.346 + add a2, a3, v1 40.347 + movs a2, a2, asr #20 40.348 + movmi a2, #0 40.349 + cmp a2, #255 40.350 + movgt a2, #255 40.351 + add ip, a4, v2 40.352 + movs ip, ip, asr #20 40.353 + movmi ip, #0 40.354 + cmp ip, #255 40.355 + movgt ip, #255 40.356 + orr a2, a2, ip, lsl #8 40.357 + sub a3, a3, v1 40.358 + movs a3, a3, asr #20 40.359 + movmi a3, #0 40.360 + cmp a3, #255 40.361 + movgt a3, #255 40.362 + sub a4, a4, v2 40.363 + movs a4, a4, asr #20 40.364 + movmi a4, #0 40.365 + cmp a4, #255 40.366 + ldr v1, [sp, #28] /* v1 = a1 word pushed by the exported wrapper (output pointer); v1 is free once its sum has been consumed above */ 40.367 + movgt a4, #255 40.368 + strh a2, [v1] 40.369 + add a2, v1, #2 40.370 + str a2, [sp, #28] /* next call starts two bytes further along */ 40.371 + orr a2, a3, a4, lsl #8 40.372 + rsb v2, lr, lr, lsl #3 /* v2 = 7 * stride */ 40.373 + ldmfd sp!, {a3, a4} 40.374 + strh a2, [v2, v1]! /* row 7; v2 now walks upwards from the last row while v1 walks downwards from the first */ 40.375 + 40.376 + sub a2, a3, v3 40.377 + movs a2, a2, asr #20 40.378 + movmi a2, #0 40.379 + cmp a2, #255 40.380 + movgt a2, #255 40.381 + sub ip, a4, v4 40.382 + movs ip, ip, asr #20 40.383 + movmi ip, #0 40.384 + cmp ip, #255 40.385 + movgt ip, #255 40.386 + orr a2, a2, ip, lsl #8 40.387 + strh a2, [v1, lr]! 40.388 + add a3, a3, v3 40.389 + movs a2, a3, asr #20 40.390 + movmi a2, #0 40.391 + cmp a2, #255 40.392 + movgt a2, #255 40.393 + add a4, a4, v4 40.394 + movs a4, a4, asr #20 40.395 + movmi a4, #0 40.396 + cmp a4, #255 40.397 + movgt a4, #255 40.398 + orr a2, a2, a4, lsl #8 40.399 + ldmfd sp!, {a3, a4} 40.400 + strh a2, [v2, -lr]! 40.401 + 40.402 + add a2, a3, v5 40.403 + movs a2, a2, asr #20 40.404 + movmi a2, #0 40.405 + cmp a2, #255 40.406 + movgt a2, #255 40.407 + add ip, a4, v6 40.408 + movs ip, ip, asr #20 40.409 + movmi ip, #0 40.410 + cmp ip, #255 40.411 + movgt ip, #255 40.412 + orr a2, a2, ip, lsl #8 40.413 + strh a2, [v1, lr]! 
40.414 + sub a3, a3, v5 40.415 + movs a2, a3, asr #20 40.416 + movmi a2, #0 40.417 + cmp a2, #255 40.418 + movgt a2, #255 40.419 + sub a4, a4, v6 40.420 + movs a4, a4, asr #20 40.421 + movmi a4, #0 40.422 + cmp a4, #255 40.423 + movgt a4, #255 40.424 + orr a2, a2, a4, lsl #8 40.425 + ldmfd sp!, {a3, a4} 40.426 + strh a2, [v2, -lr]! 40.427 + 40.428 + add a2, a3, v7 40.429 + movs a2, a2, asr #20 40.430 + movmi a2, #0 40.431 + cmp a2, #255 40.432 + movgt a2, #255 40.433 + add ip, a4, fp 40.434 + movs ip, ip, asr #20 40.435 + movmi ip, #0 40.436 + cmp ip, #255 40.437 + movgt ip, #255 40.438 + orr a2, a2, ip, lsl #8 40.439 + strh a2, [v1, lr] 40.440 + sub a3, a3, v7 40.441 + movs a2, a3, asr #20 40.442 + movmi a2, #0 40.443 + cmp a2, #255 40.444 + movgt a2, #255 40.445 + sub a4, a4, fp 40.446 + movs a4, a4, asr #20 40.447 + movmi a4, #0 40.448 + cmp a4, #255 40.449 + movgt a4, #255 40.450 + orr a2, a2, a4, lsl #8 40.451 + strh a2, [v2, -lr] 40.452 + 40.453 + ldr pc, [sp], #4 40.454 +endfunc 40.455 + /* idct_col_add_armv5te: same walk as the put variant, but each shifted result is added to the byte already present at the output location (read with ldrh, low byte for column 0 and high byte for column 1) before clamping to 0..255. */ 40.456 +function idct_col_add_armv5te 40.457 + str lr, [sp, #-4]! 40.458 + 40.459 + idct_col 40.460 + 40.461 + ldr lr, [sp, #36] /* lr = a1 word pushed by the exported wrapper (output pointer); it sits above the 32 bytes pushed by idct_col and the saved lr */ 40.462 + 40.463 + ldmfd sp!, {a3, a4} 40.464 + ldrh ip, [lr] 40.465 + add a2, a3, v1 40.466 + mov a2, a2, asr #20 40.467 + sub a3, a3, v1 40.468 + and v1, ip, #255 40.469 + adds a2, a2, v1 40.470 + movmi a2, #0 40.471 + cmp a2, #255 40.472 + movgt a2, #255 40.473 + add v1, a4, v2 40.474 + mov v1, v1, asr #20 40.475 + adds v1, v1, ip, lsr #8 40.476 + movmi v1, #0 40.477 + cmp v1, #255 40.478 + movgt v1, #255 40.479 + orr a2, a2, v1, lsl #8 40.480 + ldr v1, [sp, #32] /* v1 = a2 word pushed by the exported wrapper (row stride) */ 40.481 + sub a4, a4, v2 40.482 + rsb v2, v1, v1, lsl #3 /* v2 = 7 * stride */ 40.483 + ldrh ip, [v2, lr]! 
40.484 + strh a2, [lr] 40.485 + mov a3, a3, asr #20 40.486 + and a2, ip, #255 40.487 + adds a3, a3, a2 40.488 + movmi a3, #0 40.489 + cmp a3, #255 40.490 + movgt a3, #255 40.491 + mov a4, a4, asr #20 40.492 + adds a4, a4, ip, lsr #8 40.493 + movmi a4, #0 40.494 + cmp a4, #255 40.495 + movgt a4, #255 40.496 + add a2, lr, #2 40.497 + str a2, [sp, #28] /* advance the saved output pointer by two bytes for the next call */ 40.498 + orr a2, a3, a4, lsl #8 40.499 + strh a2, [v2] 40.500 + 40.501 + ldmfd sp!, {a3, a4} 40.502 + ldrh ip, [lr, v1]! 40.503 + sub a2, a3, v3 40.504 + mov a2, a2, asr #20 40.505 + add a3, a3, v3 40.506 + and v3, ip, #255 40.507 + adds a2, a2, v3 40.508 + movmi a2, #0 40.509 + cmp a2, #255 40.510 + movgt a2, #255 40.511 + sub v3, a4, v4 40.512 + mov v3, v3, asr #20 40.513 + adds v3, v3, ip, lsr #8 40.514 + movmi v3, #0 40.515 + cmp v3, #255 40.516 + movgt v3, #255 40.517 + orr a2, a2, v3, lsl #8 40.518 + add a4, a4, v4 40.519 + ldrh ip, [v2, -v1]! 40.520 + strh a2, [lr] 40.521 + mov a3, a3, asr #20 40.522 + and a2, ip, #255 40.523 + adds a3, a3, a2 40.524 + movmi a3, #0 40.525 + cmp a3, #255 40.526 + movgt a3, #255 40.527 + mov a4, a4, asr #20 40.528 + adds a4, a4, ip, lsr #8 40.529 + movmi a4, #0 40.530 + cmp a4, #255 40.531 + movgt a4, #255 40.532 + orr a2, a3, a4, lsl #8 40.533 + strh a2, [v2] 40.534 + 40.535 + ldmfd sp!, {a3, a4} 40.536 + ldrh ip, [lr, v1]! 40.537 + add a2, a3, v5 40.538 + mov a2, a2, asr #20 40.539 + sub a3, a3, v5 40.540 + and v3, ip, #255 40.541 + adds a2, a2, v3 40.542 + movmi a2, #0 40.543 + cmp a2, #255 40.544 + movgt a2, #255 40.545 + add v3, a4, v6 40.546 + mov v3, v3, asr #20 40.547 + adds v3, v3, ip, lsr #8 40.548 + movmi v3, #0 40.549 + cmp v3, #255 40.550 + movgt v3, #255 40.551 + orr a2, a2, v3, lsl #8 40.552 + sub a4, a4, v6 40.553 + ldrh ip, [v2, -v1]! 
40.554 + strh a2, [lr] 40.555 + mov a3, a3, asr #20 40.556 + and a2, ip, #255 40.557 + adds a3, a3, a2 40.558 + movmi a3, #0 40.559 + cmp a3, #255 40.560 + movgt a3, #255 40.561 + mov a4, a4, asr #20 40.562 + adds a4, a4, ip, lsr #8 40.563 + movmi a4, #0 40.564 + cmp a4, #255 40.565 + movgt a4, #255 40.566 + orr a2, a3, a4, lsl #8 40.567 + strh a2, [v2] 40.568 + 40.569 + ldmfd sp!, {a3, a4} 40.570 + ldrh ip, [lr, v1]! 40.571 + add a2, a3, v7 40.572 + mov a2, a2, asr #20 40.573 + sub a3, a3, v7 40.574 + and v3, ip, #255 40.575 + adds a2, a2, v3 40.576 + movmi a2, #0 40.577 + cmp a2, #255 40.578 + movgt a2, #255 40.579 + add v3, a4, fp 40.580 + mov v3, v3, asr #20 40.581 + adds v3, v3, ip, lsr #8 40.582 + movmi v3, #0 40.583 + cmp v3, #255 40.584 + movgt v3, #255 40.585 + orr a2, a2, v3, lsl #8 40.586 + sub a4, a4, fp 40.587 + ldrh ip, [v2, -v1]! 40.588 + strh a2, [lr] 40.589 + mov a3, a3, asr #20 40.590 + and a2, ip, #255 40.591 + adds a3, a3, a2 40.592 + movmi a3, #0 40.593 + cmp a3, #255 40.594 + movgt a3, #255 40.595 + mov a4, a4, asr #20 40.596 + adds a4, a4, ip, lsr #8 40.597 + movmi a4, #0 40.598 + cmp a4, #255 40.599 + movgt a4, #255 40.600 + orr a2, a3, a4, lsl #8 40.601 + strh a2, [v2] 40.602 + 40.603 + ldr pc, [sp], #4 40.604 +endfunc 40.605 + /* ff_simple_idct_armv5te: in-place transform of the 16-bit block at a1: eight row passes 16 bytes apart, then four column passes 4 bytes apart (two 16-bit columns per call). v1-v7 and fp are saved here because the helpers use them as scratch. */ 40.606 +function ff_simple_idct_armv5te, export=1 40.607 + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} 40.608 + 40.609 + bl idct_row_armv5te 40.610 + add a1, a1, #16 40.611 + bl idct_row_armv5te 40.612 + add a1, a1, #16 40.613 + bl idct_row_armv5te 40.614 + add a1, a1, #16 40.615 + bl idct_row_armv5te 40.616 + add a1, a1, #16 40.617 + bl idct_row_armv5te 40.618 + add a1, a1, #16 40.619 + bl idct_row_armv5te 40.620 + add a1, a1, #16 40.621 + bl idct_row_armv5te 40.622 + add a1, a1, #16 40.623 + bl idct_row_armv5te 40.624 + 40.625 + sub a1, a1, #(16*7) /* rewind a1 to the first row for the column passes */ 40.626 + 40.627 + bl idct_col_armv5te 40.628 + add a1, a1, #4 40.629 + bl idct_col_armv5te 40.630 + add a1, a1, #4 40.631 + bl idct_col_armv5te 40.632 + add a1, a1, 
#4 40.633 + bl idct_col_armv5te 40.634 + 40.635 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} 40.636 +endfunc 40.637 + /* ff_simple_idct_add_armv5te: a3 points at the coefficient block. a1 and a2 are pushed first so idct_col_add_armv5te can read and update them on the stack (presumably dest and line size; confirm against the C prototype). The row passes run in place on the block, then the column helper adds the clamped results to the output. */ 40.638 +function ff_simple_idct_add_armv5te, export=1 40.639 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} 40.640 + 40.641 + mov a1, a3 40.642 + 40.643 + bl idct_row_armv5te 40.644 + add a1, a1, #16 40.645 + bl idct_row_armv5te 40.646 + add a1, a1, #16 40.647 + bl idct_row_armv5te 40.648 + add a1, a1, #16 40.649 + bl idct_row_armv5te 40.650 + add a1, a1, #16 40.651 + bl idct_row_armv5te 40.652 + add a1, a1, #16 40.653 + bl idct_row_armv5te 40.654 + add a1, a1, #16 40.655 + bl idct_row_armv5te 40.656 + add a1, a1, #16 40.657 + bl idct_row_armv5te 40.658 + 40.659 + sub a1, a1, #(16*7) 40.660 + 40.661 + bl idct_col_add_armv5te 40.662 + add a1, a1, #4 40.663 + bl idct_col_add_armv5te 40.664 + add a1, a1, #4 40.665 + bl idct_col_add_armv5te 40.666 + add a1, a1, #4 40.667 + bl idct_col_add_armv5te 40.668 + 40.669 + add sp, sp, #8 /* discard the pushed a1/a2 words */ 40.670 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} 40.671 +endfunc 40.672 + /* ff_simple_idct_put_armv5te: same structure as the add variant, but the column helper idct_col_put_armv5te overwrites the output with the clamped results instead of adding to it. */ 40.673 +function ff_simple_idct_put_armv5te, export=1 40.674 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} 40.675 + 40.676 + mov a1, a3 40.677 + 40.678 + bl idct_row_armv5te 40.679 + add a1, a1, #16 40.680 + bl idct_row_armv5te 40.681 + add a1, a1, #16 40.682 + bl idct_row_armv5te 40.683 + add a1, a1, #16 40.684 + bl idct_row_armv5te 40.685 + add a1, a1, #16 40.686 + bl idct_row_armv5te 40.687 + add a1, a1, #16 40.688 + bl idct_row_armv5te 40.689 + add a1, a1, #16 40.690 + bl idct_row_armv5te 40.691 + add a1, a1, #16 40.692 + bl idct_row_armv5te 40.693 + 40.694 + sub a1, a1, #(16*7) 40.695 + 40.696 + bl idct_col_put_armv5te 40.697 + add a1, a1, #4 40.698 + bl idct_col_put_armv5te 40.699 + add a1, a1, #4 40.700 + bl idct_col_put_armv5te 40.701 + add a1, a1, #4 40.702 + bl idct_col_put_armv5te 40.703 + 40.704 + add sp, sp, #8 /* discard the pushed a1/a2 words */ 40.705 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} 40.706 +endfunc
41.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 41.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv6.S Mon Aug 27 12:09:56 2012 +0200 41.3 @@ -0,0 +1,433 @@ 41.4 +/* 41.5 + * Simple IDCT 41.6 + * 41.7 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> 41.8 + * Copyright (c) 2007 Mans Rullgard <mans@mansr.com> 41.9 + * 41.10 + * This file is part of FFmpeg. 41.11 + * 41.12 + * FFmpeg is free software; you can redistribute it and/or 41.13 + * modify it under the terms of the GNU Lesser General Public 41.14 + * License as published by the Free Software Foundation; either 41.15 + * version 2.1 of the License, or (at your option) any later version. 41.16 + * 41.17 + * FFmpeg is distributed in the hope that it will be useful, 41.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 41.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 41.20 + * Lesser General Public License for more details. 41.21 + * 41.22 + * You should have received a copy of the GNU Lesser General Public 41.23 + * License along with FFmpeg; if not, write to the Free Software 41.24 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 41.25 + */ 41.26 + 41.27 +#include "asm.S" 41.28 + 41.29 +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 41.30 +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 41.31 +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 41.32 +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 41.33 +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 41.34 +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 41.35 +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ 41.36 +#define ROW_SHIFT 11 41.37 +#define COL_SHIFT 20 41.38 + 41.39 +#define W13 (W1 | (W3 << 16)) 41.40 +#define W26 (W2 | (W6 << 16)) 41.41 +#define W42 (W4 | (W2 << 16)) 41.42 +#define W42n (-W4&0xffff | (-W2 << 16)) 41.43 +#define W46 (W4 | (W6 << 16)) 41.44 +#define W57 
(W5 | (W7 << 16)) 41.45 + 41.46 + .text 41.47 + .align 41.48 +w13: .long W13 41.49 +w26: .long W26 41.50 +w42: .long W42 41.51 +w42n: .long W42n 41.52 +w46: .long W46 41.53 +w57: .long W57 41.54 + 41.55 +/* 41.56 + Compute partial IDCT of single row. 41.57 + shift = right-shift amount applied later by the caller; only the rounding term 1 << (shift-1) is added here 41.58 + r0 = source address 41.59 + r2 = row[2,0] <= 2 cycles 41.60 + r3 = row[3,1] 41.61 + ip = w42 <= 2 cycles 41.62 + 41.63 + Output in registers r4--r11 (r1, r2, r3, ip and lr are overwritten) 41.64 +*/ 41.65 + .macro idct_row shift 41.66 + ldr lr, w46 /* lr = W4 | (W6 << 16) */ 41.67 + mov r1, #(1<<(\shift-1)) /* r1 = rounding term */ 41.68 + smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + rounding */ 41.69 + smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + rounding */ 41.70 + ldr ip, w13 /* ip = W1 | (W3 << 16) */ 41.71 + ldr r10,w57 /* r10 = W5 | (W7 << 16) */ 41.72 + smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + rounding */ 41.73 + smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + rounding */ 41.74 + 41.75 + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ 41.76 + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ 41.77 + ldr lr, [r0, #12] /* lr = row[7,5] */ 41.78 + pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ 41.79 + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ 41.80 + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ 41.81 + smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ 41.82 + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ 41.83 + 41.84 + ldr r3, w42n /* r3 = -W4 | (-W2 << 16) */ 41.85 + smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ 41.86 + ldr r2, [r0, #4] /* r2 = row[6,4] */ 41.87 + smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ 41.88 + ldr ip, w46 /* ip = W4 | (W6 << 16) */ 41.89 + smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ 41.90 + 41.91 + smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ 41.92 + smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ 41.93 + smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ 41.94 + smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ 41.95 + .endm 41.96 + 41.97 +/* 41.98 + Compute partial IDCT of half row. 
41.99 + shift = left-shift amount 41.100 + r2 = row[2,0] 41.101 + r3 = row[3,1] 41.102 + ip = w42 41.103 + 41.104 + Output in registers r4--r11 41.105 +*/ 41.106 + .macro idct_row4 shift 41.107 + ldr lr, w46 /* lr = W4 | (W6 << 16) */ 41.108 + ldr r10,w57 /* r10 = W5 | (W7 << 16) */ 41.109 + mov r1, #(1<<(\shift-1)) /* r1 = rounding term */ 41.110 + smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + rounding */ 41.111 + smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + rounding */ 41.112 + ldr ip, w13 /* ip = W1 | (W3 << 16) */ 41.113 + smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + rounding */ 41.114 + smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + rounding */ 41.115 + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ 41.116 + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ 41.117 + pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ 41.118 + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ 41.119 + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ 41.120 + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ 41.121 + .endm 41.122 + 41.123 +/* 41.124 + Compute final part of IDCT single row without shift. 41.125 + Input in registers r4--r11 (r9 holds -B1, hence the swapped sub/add for the A1 pair) 41.126 + Output in registers ip, r4--r6, lr, r8--r10 41.127 +*/ 41.128 + .macro idct_finish 41.129 + add ip, r4, r8 /* ip = A0 + B0 */ 41.130 + sub lr, r4, r8 /* lr = A0 - B0 */ 41.131 + sub r4, r5, r9 /* r4 = A1 + B1 */ 41.132 + add r8, r5, r9 /* r8 = A1 - B1 */ 41.133 + add r5, r6, r10 /* r5 = A2 + B2 */ 41.134 + sub r9, r6, r10 /* r9 = A2 - B2 */ 41.135 + add r6, r7, r11 /* r6 = A3 + B3 */ 41.136 + sub r10,r7, r11 /* r10 = A3 - B3 */ 41.137 + .endm 41.138 + 41.139 +/* 41.140 + Compute final part of IDCT single row. 
41.141 + shift = right-shift amount 41.142 + Input/output in registers r4--r11 41.143 +*/ 41.144 + .macro idct_finish_shift shift 41.145 + add r3, r4, r8 /* r3 = A0 + B0 */ 41.146 + sub r2, r4, r8 /* r2 = A0 - B0 */ 41.147 + mov r4, r3, asr #\shift 41.148 + mov r8, r2, asr #\shift 41.149 + 41.150 + sub r3, r5, r9 /* r3 = A1 + B1 */ 41.151 + add r2, r5, r9 /* r2 = A1 - B1 */ 41.152 + mov r5, r3, asr #\shift 41.153 + mov r9, r2, asr #\shift 41.154 + 41.155 + add r3, r6, r10 /* r3 = A2 + B2 */ 41.156 + sub r2, r6, r10 /* r2 = A2 - B2 */ 41.157 + mov r6, r3, asr #\shift 41.158 + mov r10,r2, asr #\shift 41.159 + 41.160 + add r3, r7, r11 /* r3 = A3 + B3 */ 41.161 + sub r2, r7, r11 /* r2 = A3 - B3 */ 41.162 + mov r7, r3, asr #\shift 41.163 + mov r11,r2, asr #\shift 41.164 + .endm 41.165 + 41.166 +/* 41.167 + Compute final part of IDCT single row, saturating results at 8 bits. 41.168 + shift = right-shift amount 41.169 + Input/output in registers r4--r11 41.170 +*/ 41.171 + .macro idct_finish_shift_sat shift 41.172 + add r3, r4, r8 /* r3 = A0 + B0 */ 41.173 + sub ip, r4, r8 /* ip = A0 - B0 */ 41.174 + usat r4, #8, r3, asr #\shift 41.175 + usat r8, #8, ip, asr #\shift 41.176 + 41.177 + sub r3, r5, r9 /* r3 = A1 + B1 */ 41.178 + add ip, r5, r9 /* ip = A1 - B1 */ 41.179 + usat r5, #8, r3, asr #\shift 41.180 + usat r9, #8, ip, asr #\shift 41.181 + 41.182 + add r3, r6, r10 /* r3 = A2 + B2 */ 41.183 + sub ip, r6, r10 /* ip = A2 - B2 */ 41.184 + usat r6, #8, r3, asr #\shift 41.185 + usat r10,#8, ip, asr #\shift 41.186 + 41.187 + add r3, r7, r11 /* r3 = A3 + B3 */ 41.188 + sub ip, r7, r11 /* ip = A3 - B3 */ 41.189 + usat r7, #8, r3, asr #\shift 41.190 + usat r11,#8, ip, asr #\shift 41.191 + .endm 41.192 + 41.193 +/* 41.194 + Compute IDCT of single row, storing as column. 
41.195 + r0 = source 41.196 + r1 = dest 41.197 +*/ 41.198 +function idct_row_armv6 41.199 + push {lr} 41.200 + 41.201 + ldr lr, [r0, #12] /* lr = row[7,5] */ 41.202 + ldr ip, [r0, #4] /* ip = row[6,4] */ 41.203 + ldr r3, [r0, #8] /* r3 = row[3,1] */ 41.204 + ldr r2, [r0] /* r2 = row[2,0] */ 41.205 + orrs lr, lr, ip /* lr = row[7,5] | row[6,4]; Z set if all four are zero */ 41.206 + cmpeq lr, r3 /* ... and row[3,1] is zero */ 41.207 + cmpeq lr, r2, lsr #16 /* ... and row[2] is zero */ 41.208 + beq 1f /* only row[0] can be non-zero: take the DC-only path */ 41.209 + push {r1} /* idct_row and idct_row4 overwrite r1 */ 41.210 + ldr ip, w42 /* ip = W4 | (W2 << 16) */ 41.211 + cmp lr, #0 /* are row[4..7] all zero? */ 41.212 + beq 2f /* yes: the half-row macro is enough */ 41.213 + 41.214 + idct_row ROW_SHIFT 41.215 + b 3f 41.216 + 41.217 +2: idct_row4 ROW_SHIFT 41.218 + 41.219 +3: pop {r1} 41.220 + idct_finish_shift ROW_SHIFT 41.221 + 41.222 + strh r4, [r1] /* A0 + B0 */ 41.223 + strh r5, [r1, #(16*2)] /* A1 + B1 */ 41.224 + strh r6, [r1, #(16*4)] /* A2 + B2 */ 41.225 + strh r7, [r1, #(16*6)] /* A3 + B3 */ 41.226 + strh r11,[r1, #(16*1)] /* A3 - B3 */ 41.227 + strh r10,[r1, #(16*3)] /* A2 - B2 */ 41.228 + strh r9, [r1, #(16*5)] /* A1 - B1 */ 41.229 + strh r8, [r1, #(16*7)] /* A0 - B0 */ 41.230 + 41.231 + pop {pc} 41.232 + 41.233 +1: mov r2, r2, lsl #3 /* DC-only path: row[2] is zero here, so r2 = row[0] << 3 */ 41.234 + strh r2, [r1] /* the same value is stored to all eight outputs */ 41.235 + strh r2, [r1, #(16*2)] 41.236 + strh r2, [r1, #(16*4)] 41.237 + strh r2, [r1, #(16*6)] 41.238 + strh r2, [r1, #(16*1)] 41.239 + strh r2, [r1, #(16*3)] 41.240 + strh r2, [r1, #(16*5)] 41.241 + strh r2, [r1, #(16*7)] 41.242 + pop {pc} 41.243 +endfunc 41.244 + 41.245 +/* 41.246 + Compute IDCT of single column, read as row. 
41.247 + r0 = source 41.248 + r1 = dest 41.249 +*/ 41.250 +function idct_col_armv6 41.251 + push {r1, lr} 41.252 + 41.253 + ldr r2, [r0] /* r2 = row[2,0] */ 41.254 + ldr ip, w42 /* ip = W4 | (W2 << 16) */ 41.255 + ldr r3, [r0, #8] /* r3 = row[3,1] */ 41.256 + idct_row COL_SHIFT 41.257 + pop {r1} 41.258 + idct_finish_shift COL_SHIFT 41.259 + 41.260 + strh r4, [r1] 41.261 + strh r5, [r1, #(16*1)] 41.262 + strh r6, [r1, #(16*2)] 41.263 + strh r7, [r1, #(16*3)] 41.264 + strh r11,[r1, #(16*4)] 41.265 + strh r10,[r1, #(16*5)] 41.266 + strh r9, [r1, #(16*6)] 41.267 + strh r8, [r1, #(16*7)] 41.268 + 41.269 + pop {pc} 41.270 +endfunc 41.271 + 41.272 +/* 41.273 + Compute IDCT of single column, read as row, store saturated 8-bit. 41.274 + r0 = source 41.275 + r1 = dest 41.276 + r2 = line size 41.277 +*/ 41.278 +function idct_col_put_armv6 41.279 + push {r1, r2, lr} 41.280 + 41.281 + ldr r2, [r0] /* r2 = row[2,0] */ 41.282 + ldr ip, w42 /* ip = W4 | (W2 << 16) */ 41.283 + ldr r3, [r0, #8] /* r3 = row[3,1] */ 41.284 + idct_row COL_SHIFT 41.285 + pop {r1, r2} 41.286 + idct_finish_shift_sat COL_SHIFT 41.287 + 41.288 + strb r4, [r1], r2 41.289 + strb r5, [r1], r2 41.290 + strb r6, [r1], r2 41.291 + strb r7, [r1], r2 41.292 + strb r11,[r1], r2 41.293 + strb r10,[r1], r2 41.294 + strb r9, [r1], r2 41.295 + strb r8, [r1], r2 41.296 + 41.297 + sub r1, r1, r2, lsl #3 41.298 + 41.299 + pop {pc} 41.300 +endfunc 41.301 + 41.302 +/* 41.303 + Compute IDCT of single column, read as row, add/store saturated 8-bit. 
41.304 + r0 = source 41.305 + r1 = dest 41.306 + r2 = line size 41.307 +*/ 41.308 +function idct_col_add_armv6 41.309 + push {r1, r2, lr} 41.310 + 41.311 + ldr r2, [r0] /* r2 = row[2,0] */ 41.312 + ldr ip, w42 /* ip = W4 | (W2 << 16) */ 41.313 + ldr r3, [r0, #8] /* r3 = row[3,1] */ 41.314 + idct_row COL_SHIFT 41.315 + pop {r1, r2} 41.316 + idct_finish 41.317 + 41.318 + ldrb r3, [r1] 41.319 + ldrb r7, [r1, r2] 41.320 + ldrb r11,[r1, r2, lsl #2] 41.321 + add ip, r3, ip, asr #COL_SHIFT 41.322 + usat ip, #8, ip 41.323 + add r4, r7, r4, asr #COL_SHIFT 41.324 + strb ip, [r1], r2 41.325 + ldrb ip, [r1, r2] 41.326 + usat r4, #8, r4 41.327 + ldrb r11,[r1, r2, lsl #2] 41.328 + add r5, ip, r5, asr #COL_SHIFT 41.329 + usat r5, #8, r5 41.330 + strb r4, [r1], r2 41.331 + ldrb r3, [r1, r2] 41.332 + ldrb ip, [r1, r2, lsl #2] 41.333 + strb r5, [r1], r2 41.334 + ldrb r7, [r1, r2] 41.335 + ldrb r4, [r1, r2, lsl #2] 41.336 + add r6, r3, r6, asr #COL_SHIFT 41.337 + usat r6, #8, r6 41.338 + add r10,r7, r10,asr #COL_SHIFT 41.339 + usat r10,#8, r10 41.340 + add r9, r11,r9, asr #COL_SHIFT 41.341 + usat r9, #8, r9 41.342 + add r8, ip, r8, asr #COL_SHIFT 41.343 + usat r8, #8, r8 41.344 + add lr, r4, lr, asr #COL_SHIFT 41.345 + usat lr, #8, lr 41.346 + strb r6, [r1], r2 41.347 + strb r10,[r1], r2 41.348 + strb r9, [r1], r2 41.349 + strb r8, [r1], r2 41.350 + strb lr, [r1], r2 41.351 + 41.352 + sub r1, r1, r2, lsl #3 41.353 + 41.354 + pop {pc} 41.355 +endfunc 41.356 + 41.357 +/* 41.358 + Compute 8 IDCT row transforms. 
41.359 + func = IDCT row->col function 41.360 + width = width of columns in bytes 41.361 +*/ 41.362 + .macro idct_rows func width 41.363 + bl \func /* source offset 0 */ 41.364 + add r0, r0, #(16*2) 41.365 + add r1, r1, #\width 41.366 + bl \func /* source offset 16*2 */ 41.367 + add r0, r0, #(16*2) 41.368 + add r1, r1, #\width 41.369 + bl \func /* source offset 16*4 */ 41.370 + add r0, r0, #(16*2) 41.371 + add r1, r1, #\width 41.372 + bl \func /* source offset 16*6 */ 41.373 + sub r0, r0, #(16*5) /* rewind from offset 16*6 to 16*1 for the odd rows */ 41.374 + add r1, r1, #\width 41.375 + bl \func /* source offset 16*1 */ 41.376 + add r0, r0, #(16*2) 41.377 + add r1, r1, #\width 41.378 + bl \func /* source offset 16*3 */ 41.379 + add r0, r0, #(16*2) 41.380 + add r1, r1, #\width 41.381 + bl \func /* source offset 16*5 */ 41.382 + add r0, r0, #(16*2) 41.383 + add r1, r1, #\width 41.384 + bl \func /* source offset 16*7 */ 41.385 + 41.386 + sub r0, r0, #(16*7) /* restore r0; r1 is left advanced by 7*width */ 41.387 + .endm 41.388 + 41.389 +/* void ff_simple_idct_armv6(DCTELEM *data); */ 41.390 +function ff_simple_idct_armv6, export=1 41.391 + push {r4-r11, lr} 41.392 + sub sp, sp, #128 /* 128-byte scratch buffer on the stack */ 41.393 + 41.394 + mov r1, sp /* row pass: data -> scratch buffer */ 41.395 + idct_rows idct_row_armv6, 2 41.396 + mov r1, r0 /* column pass: scratch buffer -> data */ 41.397 + mov r0, sp 41.398 + idct_rows idct_col_armv6, 2 41.399 + 41.400 + add sp, sp, #128 41.401 + pop {r4-r11, pc} 41.402 +endfunc 41.403 + 41.404 +/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ 41.405 +function ff_simple_idct_add_armv6, export=1 41.406 + push {r0, r1, r4-r11, lr} 41.407 + sub sp, sp, #128 /* 128-byte scratch buffer on the stack */ 41.408 + 41.409 + mov r0, r2 /* r0 = data */ 41.410 + mov r1, sp /* row pass: data -> scratch buffer */ 41.411 + idct_rows idct_row_armv6, 2 41.412 + mov r0, sp 41.413 + ldr r1, [sp, #128] /* r1 = dest (r0 as pushed on entry) */ 41.414 + ldr r2, [sp, #(128+4)] /* r2 = line_size (r1 as pushed on entry) */ 41.415 + idct_rows idct_col_add_armv6, 1 41.416 + 41.417 + add sp, sp, #(128+8) /* drop the scratch buffer and the saved r0, r1 */ 41.418 + pop {r4-r11, pc} 41.419 +endfunc 41.420 + 41.421 +/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */ 41.422 +function ff_simple_idct_put_armv6, export=1 41.423 + push {r0, r1, r4-r11, lr} 41.424 + sub sp, sp, #128 41.425 + 41.426 + mov r0, r2 41.427 + mov r1, sp 41.428 + idct_rows idct_row_armv6, 2 41.429 + mov r0, sp 41.430 + ldr r1, [sp, #128] 41.431 + ldr r2, [sp, #(128+4)] 41.432 + 
idct_rows idct_col_put_armv6, 1 41.433 + 41.434 + add sp, sp, #(128+8) 41.435 + pop {r4-r11, pc} 41.436 +endfunc
42.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 42.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_neon.S Mon Aug 27 12:09:56 2012 +0200 42.3 @@ -0,0 +1,373 @@ 42.4 +/* 42.5 + * ARM NEON IDCT 42.6 + * 42.7 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> 42.8 + * 42.9 + * Based on Simple IDCT 42.10 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> 42.11 + * 42.12 + * This file is part of FFmpeg. 42.13 + * 42.14 + * FFmpeg is free software; you can redistribute it and/or 42.15 + * modify it under the terms of the GNU Lesser General Public 42.16 + * License as published by the Free Software Foundation; either 42.17 + * version 2.1 of the License, or (at your option) any later version. 42.18 + * 42.19 + * FFmpeg is distributed in the hope that it will be useful, 42.20 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 42.21 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 42.22 + * Lesser General Public License for more details. 42.23 + * 42.24 + * You should have received a copy of the GNU Lesser General Public 42.25 + * License along with FFmpeg; if not, write to the Free Software 42.26 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 42.27 + */ 42.28 + 42.29 +#include "asm.S" 42.30 + 42.31 +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 42.32 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 42.33 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 42.34 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 42.35 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 42.36 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 42.37 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 42.38 +#define W4c ((1<<(COL_SHIFT-1))/W4) 42.39 +#define ROW_SHIFT 11 42.40 +#define COL_SHIFT 20 42.41 + 42.42 +#define w1 d0[0] 42.43 +#define w2 d0[1] 42.44 +#define w3 d0[2] 42.45 +#define w4 d0[3] 42.46 +#define w5 d1[0] 42.47 +#define w6 d1[1] 42.48 
+#define w7 d1[2] 42.49 +#define w4c d1[3] 42.50 + 42.51 + .macro idct_col4_top 42.52 + vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */ 42.53 + vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */ 42.54 + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ 42.55 + vadd.i32 q11, q15, q7 /* q11 = q15 + W2 * col[2] */ 42.56 + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ 42.57 + vadd.i32 q12, q15, q8 /* q12 = q15 + W6 * col[2] */ 42.58 + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ 42.59 + vsub.i32 q13, q15, q8 /* q13 = q15 - W6 * col[2] */ 42.60 + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ 42.61 + vsub.i32 q14, q15, q7 /* q14 = q15 - W2 * col[2] */ 42.62 + 42.63 + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ 42.64 + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ 42.65 + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ 42.66 + vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ 42.67 + .endm 42.68 + 42.69 + .text 42.70 + .align 6 42.71 + 42.72 +function idct_row4_pld_neon 42.73 + pld [r0] /* prefetch r0 + k*r1 for k = 0..7 */ 42.74 + add r3, r0, r1, lsl #2 42.75 + pld [r0, r1] 42.76 + pld [r0, r1, lsl #1] 42.77 + pld [r3, -r1] 42.78 + pld [r3] 42.79 + pld [r3, r1] 42.80 + add r3, r3, r1, lsl #1 42.81 + pld [r3] 42.82 + pld [r3, r1] /* NOTE(review): no explicit return here; presumably execution falls through into idct_row4_neon below - confirm against endfunc in asm.S */ 42.83 +endfunc 42.84 + 42.85 +function idct_row4_neon 42.86 + vmov.i32 q15, #(1<<(ROW_SHIFT-1)) 42.87 + vld1.64 {d2-d5}, [r2,:128]! 42.88 + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ 42.89 + vld1.64 {d6,d7}, [r2,:128]! 42.90 + vorr d10, d3, d5 42.91 + vld1.64 {d8,d9}, [r2,:128]! 
42.92 + add r2, r2, #-64 42.93 + 42.94 + vorr d11, d7, d9 42.95 + vorr d10, d10, d11 42.96 + vmov r3, r4, d10 42.97 + 42.98 + idct_col4_top 42.99 + 42.100 + orrs r3, r3, r4 42.101 + beq 1f 42.102 + 42.103 + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ 42.104 + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ 42.105 + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ 42.106 + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ 42.107 + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ 42.108 + vadd.i32 q11, q11, q7 42.109 + vsub.i32 q12, q12, q7 42.110 + vsub.i32 q13, q13, q7 42.111 + vadd.i32 q14, q14, q7 42.112 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ 42.113 + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ 42.114 + vmlal.s16 q9, d9, w7 42.115 + vmlsl.s16 q10, d9, w5 42.116 + vmlal.s16 q5, d9, w3 42.117 + vmlsl.s16 q6, d9, w1 42.118 + vadd.i32 q11, q11, q7 42.119 + vsub.i32 q12, q12, q8 42.120 + vadd.i32 q13, q13, q8 42.121 + vsub.i32 q14, q14, q7 42.122 + 42.123 +1: vadd.i32 q3, q11, q9 42.124 + vadd.i32 q4, q12, q10 42.125 + vshrn.i32 d2, q3, #ROW_SHIFT 42.126 + vshrn.i32 d4, q4, #ROW_SHIFT 42.127 + vadd.i32 q7, q13, q5 42.128 + vadd.i32 q8, q14, q6 42.129 + vtrn.16 d2, d4 42.130 + vshrn.i32 d6, q7, #ROW_SHIFT 42.131 + vshrn.i32 d8, q8, #ROW_SHIFT 42.132 + vsub.i32 q14, q14, q6 42.133 + vsub.i32 q11, q11, q9 42.134 + vtrn.16 d6, d8 42.135 + vsub.i32 q13, q13, q5 42.136 + vshrn.i32 d3, q14, #ROW_SHIFT 42.137 + vtrn.32 d2, d6 42.138 + vsub.i32 q12, q12, q10 42.139 + vtrn.32 d4, d8 42.140 + vshrn.i32 d5, q13, #ROW_SHIFT 42.141 + vshrn.i32 d7, q12, #ROW_SHIFT 42.142 + vshrn.i32 d9, q11, #ROW_SHIFT 42.143 + 42.144 + vtrn.16 d3, d5 42.145 + vtrn.16 d7, d9 42.146 + vtrn.32 d3, d7 42.147 + vtrn.32 d5, d9 42.148 + 42.149 + vst1.64 {d2-d5}, [r2,:128]! 42.150 + vst1.64 {d6-d9}, [r2,:128]! 
42.151 + 42.152 + bx lr 42.153 +endfunc 42.154 + 42.155 +function idct_col4_neon 42.156 + mov ip, #16 /* ip = post-increment between successive loads */ 42.157 + vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ 42.158 + vdup.16 d30, w4c 42.159 + vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */ 42.160 + vadd.i16 d30, d30, d2 42.161 + vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */ 42.162 + vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ 42.163 + vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */ 42.164 + 42.165 + ldrd r4, [r2] /* r4:r5 = col[4] bits, used only for the zero test */ 42.166 + ldrd r6, [r2, #16] /* r6:r7 = col[5] bits, used only for the zero test */ 42.167 + orrs r4, r4, r5 /* Z set if col[4] is all zero */ 42.168 + 42.169 + idct_col4_top 42.170 + addeq r2, r2, #16 /* col[4] all zero: step over it and skip its arithmetic */ 42.171 + beq 1f 42.172 + 42.173 + vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */ 42.174 + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ 42.175 + vadd.i32 q11, q11, q7 42.176 + vsub.i32 q12, q12, q7 42.177 + vsub.i32 q13, q13, q7 42.178 + vadd.i32 q14, q14, q7 42.179 + 42.180 +1: orrs r6, r6, r7 /* Z set if col[5] is all zero */ 42.181 + ldrd r4, [r2, #16] /* r4:r5 = col[6] bits for the next zero test */ 42.182 + addeq r2, r2, #16 /* col[5] all zero: step over it */ 42.183 + beq 2f 42.184 + 42.185 + vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */ 42.186 + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ 42.187 + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ 42.188 + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ 42.189 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ 42.190 + 42.191 +2: orrs r4, r4, r5 /* Z set if col[6] is all zero */ 42.192 + ldrd r4, [r2, #16] /* r4:r5 = col[7] bits for the next zero test */ 42.193 + addeq r2, r2, #16 /* col[6] all zero: step over it */ 42.194 + beq 3f 42.195 + 42.196 + vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */ 42.197 + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ 42.198 + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ 42.199 + vadd.i32 q11, q11, q7 42.200 + vsub.i32 q14, q14, q7 42.201 + vsub.i32 q12, q12, q8 42.202 + vadd.i32 q13, q13, q8 42.203 + 42.204 +3: orrs r4, r4, r5 /* Z set if col[7] is all zero */ 42.205 + addeq r2, r2, #16 /* col[7] all zero: step over it */ 42.206 + beq 4f 42.207 + 42.208 + vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ 42.209 + vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ 42.210 + vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ 42.211 + vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ 42.212 + vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ 42.213 + 42.214 +4: vaddhn.i32 d2, q11, q9 42.215 + vaddhn.i32 d3, q12, q10 42.216 + vaddhn.i32 
d4, q13, q5 42.217 + vaddhn.i32 d5, q14, q6 42.218 + vsubhn.i32 d9, q11, q9 42.219 + vsubhn.i32 d8, q12, q10 42.220 + vsubhn.i32 d7, q13, q5 42.221 + vsubhn.i32 d6, q14, q6 42.222 + 42.223 + bx lr 42.224 +endfunc 42.225 + 42.226 + .align 6 42.227 + 42.228 +function idct_col4_st8_neon 42.229 + vqshrun.s16 d2, q1, #COL_SHIFT-16 42.230 + vqshrun.s16 d3, q2, #COL_SHIFT-16 42.231 + vqshrun.s16 d4, q3, #COL_SHIFT-16 42.232 + vqshrun.s16 d5, q4, #COL_SHIFT-16 42.233 + vst1.32 {d2[0]}, [r0,:32], r1 42.234 + vst1.32 {d2[1]}, [r0,:32], r1 42.235 + vst1.32 {d3[0]}, [r0,:32], r1 42.236 + vst1.32 {d3[1]}, [r0,:32], r1 42.237 + vst1.32 {d4[0]}, [r0,:32], r1 42.238 + vst1.32 {d4[1]}, [r0,:32], r1 42.239 + vst1.32 {d5[0]}, [r0,:32], r1 42.240 + vst1.32 {d5[1]}, [r0,:32], r1 42.241 + 42.242 + bx lr 42.243 +endfunc 42.244 + 42.245 + .section .rodata 42.246 + .align 4 42.247 +idct_coeff_neon: 42.248 + .short W1, W2, W3, W4, W5, W6, W7, W4c 42.249 + .previous 42.250 + 42.251 + .macro idct_start data 42.252 + push {r4-r7, lr} 42.253 + pld [\data] 42.254 + pld [\data, #64] 42.255 + vpush {d8-d15} 42.256 + movrel r3, idct_coeff_neon 42.257 + vld1.64 {d0,d1}, [r3,:128] 42.258 + .endm 42.259 + 42.260 + .macro idct_end 42.261 + vpop {d8-d15} 42.262 + pop {r4-r7, pc} 42.263 + .endm 42.264 + 42.265 +/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */ 42.266 +function ff_simple_idct_put_neon, export=1 42.267 + idct_start r2 42.268 + 42.269 + bl idct_row4_pld_neon 42.270 + bl idct_row4_neon 42.271 + add r2, r2, #-128 42.272 + bl idct_col4_neon 42.273 + bl idct_col4_st8_neon 42.274 + sub r0, r0, r1, lsl #3 42.275 + add r0, r0, #4 42.276 + add r2, r2, #-120 42.277 + bl idct_col4_neon 42.278 + bl idct_col4_st8_neon 42.279 + 42.280 + idct_end 42.281 +endfunc 42.282 + 42.283 + .align 6 42.284 + 42.285 +function idct_col4_add8_neon 42.286 + mov ip, r0 42.287 + 42.288 + vld1.32 {d10[0]}, [r0,:32], r1 42.289 + vshr.s16 q1, q1, #COL_SHIFT-16 42.290 + vld1.32 {d10[1]}, 
[r0,:32], r1 42.291 + vshr.s16 q2, q2, #COL_SHIFT-16 42.292 + vld1.32 {d11[0]}, [r0,:32], r1 42.293 + vshr.s16 q3, q3, #COL_SHIFT-16 42.294 + vld1.32 {d11[1]}, [r0,:32], r1 42.295 + vshr.s16 q4, q4, #COL_SHIFT-16 42.296 + vld1.32 {d12[0]}, [r0,:32], r1 42.297 + vaddw.u8 q1, q1, d10 42.298 + vld1.32 {d12[1]}, [r0,:32], r1 42.299 + vaddw.u8 q2, q2, d11 42.300 + vld1.32 {d13[0]}, [r0,:32], r1 42.301 + vqmovun.s16 d2, q1 42.302 + vld1.32 {d13[1]}, [r0,:32], r1 42.303 + vaddw.u8 q3, q3, d12 42.304 + vst1.32 {d2[0]}, [ip,:32], r1 42.305 + vqmovun.s16 d3, q2 42.306 + vst1.32 {d2[1]}, [ip,:32], r1 42.307 + vaddw.u8 q4, q4, d13 42.308 + vst1.32 {d3[0]}, [ip,:32], r1 42.309 + vqmovun.s16 d4, q3 42.310 + vst1.32 {d3[1]}, [ip,:32], r1 42.311 + vqmovun.s16 d5, q4 42.312 + vst1.32 {d4[0]}, [ip,:32], r1 42.313 + vst1.32 {d4[1]}, [ip,:32], r1 42.314 + vst1.32 {d5[0]}, [ip,:32], r1 42.315 + vst1.32 {d5[1]}, [ip,:32], r1 42.316 + 42.317 + bx lr 42.318 +endfunc 42.319 + 42.320 +/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */ 42.321 +function ff_simple_idct_add_neon, export=1 42.322 + idct_start r2 42.323 + 42.324 + bl idct_row4_pld_neon 42.325 + bl idct_row4_neon 42.326 + add r2, r2, #-128 42.327 + bl idct_col4_neon 42.328 + bl idct_col4_add8_neon 42.329 + sub r0, r0, r1, lsl #3 42.330 + add r0, r0, #4 42.331 + add r2, r2, #-120 42.332 + bl idct_col4_neon 42.333 + bl idct_col4_add8_neon 42.334 + 42.335 + idct_end 42.336 +endfunc 42.337 + 42.338 + .align 6 42.339 + 42.340 +function idct_col4_st16_neon 42.341 + mov ip, #16 42.342 + 42.343 + vshr.s16 q1, q1, #COL_SHIFT-16 42.344 + vshr.s16 q2, q2, #COL_SHIFT-16 42.345 + vst1.64 {d2}, [r2,:64], ip 42.346 + vshr.s16 q3, q3, #COL_SHIFT-16 42.347 + vst1.64 {d3}, [r2,:64], ip 42.348 + vshr.s16 q4, q4, #COL_SHIFT-16 42.349 + vst1.64 {d4}, [r2,:64], ip 42.350 + vst1.64 {d5}, [r2,:64], ip 42.351 + vst1.64 {d6}, [r2,:64], ip 42.352 + vst1.64 {d7}, [r2,:64], ip 42.353 + vst1.64 {d8}, [r2,:64], ip 42.354 + vst1.64 
{d9}, [r2,:64], ip 42.355 + 42.356 + bx lr 42.357 +endfunc 42.358 + 42.359 +/* void ff_simple_idct_neon(DCTELEM *data); */ 42.360 +function ff_simple_idct_neon, export=1 42.361 + idct_start r0 42.362 + 42.363 + mov r2, r0 42.364 + bl idct_row4_neon 42.365 + bl idct_row4_neon 42.366 + add r2, r2, #-128 42.367 + bl idct_col4_neon 42.368 + add r2, r2, #-128 42.369 + bl idct_col4_st16_neon 42.370 + add r2, r2, #-120 42.371 + bl idct_col4_neon 42.372 + add r2, r2, #-128 42.373 + bl idct_col4_st16_neon 42.374 + 42.375 + idct_end 42.376 +endfunc
43.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 43.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/synth_filter_neon.S Mon Aug 27 12:09:56 2012 +0200 43.3 @@ -0,0 +1,117 @@ 43.4 +/* 43.5 + * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> 43.6 + * 43.7 + * This file is part of FFmpeg. 43.8 + * 43.9 + * FFmpeg is free software; you can redistribute it and/or 43.10 + * modify it under the terms of the GNU Lesser General Public 43.11 + * License as published by the Free Software Foundation; either 43.12 + * version 2.1 of the License, or (at your option) any later version. 43.13 + * 43.14 + * FFmpeg is distributed in the hope that it will be useful, 43.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 43.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 43.17 + * Lesser General Public License for more details. 43.18 + * 43.19 + * You should have received a copy of the GNU Lesser General Public 43.20 + * License along with FFmpeg; if not, write to the Free Software 43.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 43.22 + */ 43.23 + 43.24 +#include "asm.S" 43.25 + 43.26 + preserve8 43.27 + 43.28 +function ff_synth_filter_float_neon, export=1 43.29 + push {r3-r11,lr} 43.30 + 43.31 + ldr r4, [r2] @ synth_buf_offset 43.32 + add r1, r1, r4, lsl #2 @ synth_buf 43.33 + sub r12, r4, #32 43.34 + bfc r12, #9, #23 43.35 + bic r4, r4, #63 43.36 + str r12, [r2] 43.37 + 43.38 + ldr r2, [sp, #12*4] @ in 43.39 + mov r9, r1 @ synth_buf 43.40 + 43.41 +VFP vpush {d0} 43.42 + bl ff_imdct_half_neon 43.43 +VFP vpop {d0} 43.44 + pop {r3} 43.45 + 43.46 + ldr r5, [sp, #9*4] @ window 43.47 + ldr r2, [sp, #10*4] @ out 43.48 +NOVFP vldr d0, [sp, #12*4] @ scale, bias 43.49 + add r8, r9, #12*4 43.50 + 43.51 + mov lr, #64*4 43.52 + mov r1, #4 43.53 +1: 43.54 + add r10, r9, #16*4 @ synth_buf 43.55 + add r11, r8, #16*4 43.56 + add r0, r5, #16*4 @ window 43.57 + add r6, r5, #32*4 43.58 + add r7, r5, #48*4 43.59 + 43.60 + 
vld1.32 {q10}, [r3,:128] @ a 43.61 + add r3, r3, #16*4 43.62 + vld1.32 {q1}, [r3,:128] @ b 43.63 + vmov.f32 q2, #0.0 @ c 43.64 + vmov.f32 q3, #0.0 @ d 43.65 + 43.66 + mov r12, #512 43.67 +2: 43.68 + vld1.32 {q9}, [r8, :128], lr 43.69 + vrev64.32 q9, q9 43.70 + vld1.32 {q8}, [r5, :128], lr 43.71 + vmls.f32 d20, d16, d19 43.72 + vld1.32 {q11}, [r0, :128], lr 43.73 + vmls.f32 d21, d17, d18 43.74 + vld1.32 {q12}, [r9, :128], lr 43.75 + vmla.f32 d2, d22, d24 43.76 + vld1.32 {q8}, [r6, :128], lr 43.77 + vmla.f32 d3, d23, d25 43.78 + vld1.32 {q9}, [r10,:128], lr 43.79 + vmla.f32 d4, d16, d18 43.80 + vld1.32 {q12}, [r11,:128], lr 43.81 + vmla.f32 d5, d17, d19 43.82 + vrev64.32 q12, q12 43.83 + vld1.32 {q11}, [r7, :128], lr 43.84 + vmla.f32 d6, d22, d25 43.85 + vmla.f32 d7, d23, d24 43.86 + subs r12, r12, #64 43.87 + beq 3f 43.88 + cmp r12, r4 43.89 + bne 2b 43.90 + sub r8, r8, #512*4 43.91 + sub r9, r9, #512*4 43.92 + sub r10, r10, #512*4 43.93 + sub r11, r11, #512*4 43.94 + b 2b 43.95 +3: 43.96 + vdup.32 q8, d0[1] 43.97 + vdup.32 q9, d0[1] 43.98 + vmla.f32 q8, q10, d0[0] 43.99 + vmla.f32 q9, q1, d0[0] 43.100 + vst1.32 {q3}, [r3,:128] 43.101 + sub r3, r3, #16*4 43.102 + vst1.32 {q2}, [r3,:128] 43.103 + vst1.32 {q8}, [r2,:128] 43.104 + add r2, r2, #16*4 43.105 + vst1.32 {q9}, [r2,:128] 43.106 + 43.107 + subs r1, r1, #1 43.108 + popeq {r4-r11,pc} 43.109 + 43.110 + cmp r4, #0 43.111 + subeq r8, r8, #512*4 43.112 + subeq r9, r9, #512*4 43.113 + sub r5, r5, #512*4 43.114 + sub r2, r2, #12*4 @ out 43.115 + add r3, r3, #4*4 @ synth_buf2 43.116 + add r5, r5, #4*4 @ window 43.117 + add r9, r9, #4*4 @ synth_buf 43.118 + sub r8, r8, #4*4 @ synth_buf 43.119 + b 1b 43.120 +endfunc
44.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 44.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/vp3dsp_neon.S Mon Aug 27 12:09:56 2012 +0200 44.3 @@ -0,0 +1,420 @@ 44.4 +/* 44.5 + * Copyright (c) 2009 David Conrad 44.6 + * 44.7 + * This file is part of FFmpeg. 44.8 + * 44.9 + * FFmpeg is free software; you can redistribute it and/or 44.10 + * modify it under the terms of the GNU Lesser General Public 44.11 + * License as published by the Free Software Foundation; either 44.12 + * version 2.1 of the License, or (at your option) any later version. 44.13 + * 44.14 + * FFmpeg is distributed in the hope that it will be useful, 44.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 44.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 44.17 + * Lesser General Public License for more details. 44.18 + * 44.19 + * You should have received a copy of the GNU Lesser General Public 44.20 + * License along with FFmpeg; if not, write to the Free Software 44.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 44.22 + */ 44.23 + 44.24 +#include "asm.S" 44.25 + 44.26 +.section .rodata 44.27 +.align 4 44.28 + 44.29 +vp3_idct_constants: 44.30 +.short 64277, 60547, 54491, 46341, 36410, 25080, 12785 44.31 + 44.32 +#define xC1S7 d0[0] 44.33 +#define xC2S6 d0[1] 44.34 +#define xC3S5 d0[2] 44.35 +#define xC4S4 d0[3] 44.36 +#define xC5S3 d1[0] 44.37 +#define xC6S2 d1[1] 44.38 +#define xC7S1 d1[2] 44.39 + 44.40 +.text 44.41 + 44.42 +.macro vp3_loop_filter 44.43 + vsubl.u8 q3, d18, d17 44.44 + vsubl.u8 q2, d16, d19 44.45 + vadd.i16 q1, q3, q3 44.46 + vadd.i16 q2, q2, q3 44.47 + vadd.i16 q0, q1, q2 44.48 + vrshr.s16 q0, q0, #3 44.49 + vmovl.u8 q9, d18 44.50 + vdup.u16 q15, r2 44.51 + 44.52 + vabs.s16 q1, q0 44.53 + vshr.s16 q0, q0, #15 44.54 + vqsub.u16 q2, q15, q1 44.55 + vqsub.u16 q3, q2, q1 44.56 + vsub.i16 q1, q2, q3 44.57 + veor q1, q1, q0 44.58 + vsub.i16 q0, q1, q0 44.59 + 44.60 + vaddw.u8 q2, q0, d17 44.61 + vsub.i16 
q3, q9, q0 44.62 + vqmovun.s16 d0, q2 44.63 + vqmovun.s16 d1, q3 44.64 +.endm 44.65 + 44.66 +function ff_vp3_v_loop_filter_neon, export=1 44.67 + sub ip, r0, r1 44.68 + sub r0, r0, r1, lsl #1 44.69 + vld1.64 {d16}, [r0,:64], r1 44.70 + vld1.64 {d17}, [r0,:64], r1 44.71 + vld1.64 {d18}, [r0,:64], r1 44.72 + vld1.64 {d19}, [r0,:64], r1 44.73 + ldrb r2, [r2, #129*4] 44.74 + 44.75 + vp3_loop_filter 44.76 + 44.77 + vst1.64 {d0}, [ip,:64], r1 44.78 + vst1.64 {d1}, [ip,:64], r1 44.79 + bx lr 44.80 +endfunc 44.81 + 44.82 +function ff_vp3_h_loop_filter_neon, export=1 44.83 + sub ip, r0, #1 44.84 + sub r0, r0, #2 44.85 + vld1.32 {d16[]}, [r0], r1 44.86 + vld1.32 {d17[]}, [r0], r1 44.87 + vld1.32 {d18[]}, [r0], r1 44.88 + vld1.32 {d19[]}, [r0], r1 44.89 + vld1.32 {d16[1]}, [r0], r1 44.90 + vld1.32 {d17[1]}, [r0], r1 44.91 + vld1.32 {d18[1]}, [r0], r1 44.92 + vld1.32 {d19[1]}, [r0], r1 44.93 + ldrb r2, [r2, #129*4] 44.94 + 44.95 + vtrn.8 d16, d17 44.96 + vtrn.8 d18, d19 44.97 + vtrn.16 d16, d18 44.98 + vtrn.16 d17, d19 44.99 + 44.100 + vp3_loop_filter 44.101 + 44.102 + vtrn.8 d0, d1 44.103 + 44.104 + vst1.16 {d0[0]}, [ip], r1 44.105 + vst1.16 {d1[0]}, [ip], r1 44.106 + vst1.16 {d0[1]}, [ip], r1 44.107 + vst1.16 {d1[1]}, [ip], r1 44.108 + vst1.16 {d0[2]}, [ip], r1 44.109 + vst1.16 {d1[2]}, [ip], r1 44.110 + vst1.16 {d0[3]}, [ip], r1 44.111 + vst1.16 {d1[3]}, [ip], r1 44.112 + bx lr 44.113 +endfunc 44.114 + 44.115 + 44.116 +function vp3_idct_start_neon 44.117 + vpush {d8-d15} 44.118 + movrel r3, vp3_idct_constants 44.119 + vld1.64 {d0-d1}, [r3,:128] 44.120 + vld1.64 {d16-d19}, [r2,:128]! 44.121 + vld1.64 {d20-d23}, [r2,:128]! 44.122 + vld1.64 {d24-d27}, [r2,:128]! 44.123 + vadd.s16 q1, q8, q12 44.124 + vsub.s16 q8, q8, q12 44.125 + vld1.64 {d28-d31}, [r2,:128]! 
44.126 +endfunc 44.127 + 44.128 +function vp3_idct_core_neon 44.129 + vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16 44.130 + vmull.s16 q3, d19, xC1S7 44.131 + vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16 44.132 + vmull.s16 q5, d3, xC4S4 44.133 + vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16 44.134 + vmull.s16 q7, d17, xC4S4 44.135 + vshrn.s32 d4, q2, #16 44.136 + vshrn.s32 d5, q3, #16 44.137 + vshrn.s32 d6, q4, #16 44.138 + vshrn.s32 d7, q5, #16 44.139 + vshrn.s32 d8, q6, #16 44.140 + vshrn.s32 d9, q7, #16 44.141 + vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4 44.142 + vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4 44.143 + vadd.s16 q1, q2, q9 // ip[1] * C1 44.144 + 44.145 + vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16 44.146 + vmull.s16 q3, d31, xC1S7 44.147 + vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16 44.148 + vmull.s16 q5, d31, xC7S1 44.149 + vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16 44.150 + vmull.s16 q7, d19, xC7S1 44.151 + vshrn.s32 d4, q2, #16 44.152 + vshrn.s32 d5, q3, #16 44.153 + vshrn.s32 d6, q4, #16 // ip[7] * C7 44.154 + vshrn.s32 d7, q5, #16 44.155 + vshrn.s32 d8, q6, #16 // ip[1] * C7 44.156 + vshrn.s32 d9, q7, #16 44.157 + vadd.s16 q2, q2, q15 // ip[7] * C1 44.158 + vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7 44.159 + vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1 44.160 + 44.161 + vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16 44.162 + vmull.s16 q3, d23, xC5S3 44.163 + vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16 44.164 + vmull.s16 q5, d23, xC3S5 44.165 + vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16 44.166 + vmull.s16 q7, d27, xC5S3 44.167 + vshrn.s32 d4, q2, #16 44.168 + vshrn.s32 d5, q3, #16 44.169 + vshrn.s32 d6, q4, #16 44.170 + vshrn.s32 d7, q5, #16 44.171 + vshrn.s32 d8, q6, #16 44.172 + vshrn.s32 d9, q7, #16 44.173 + vadd.s16 q3, q3, q11 // ip[3] * C3 44.174 + vadd.s16 q4, q4, q13 // ip[5] * C5 44.175 + vadd.s16 q1, q2, q11 // ip[3] * C5 44.176 + vadd.s16 q11, q3, q4 // C = ip[3] * C3 + 
ip[5] * C5 44.177 + 44.178 + vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16 44.179 + vmull.s16 q3, d27, xC3S5 44.180 + vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16 44.181 + vmull.s16 q5, d21, xC2S6 44.182 + vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16 44.183 + vmull.s16 q7, d29, xC6S2 44.184 + vshrn.s32 d4, q2, #16 44.185 + vshrn.s32 d5, q3, #16 44.186 + vshrn.s32 d6, q4, #16 44.187 + vshrn.s32 d7, q5, #16 44.188 + vshrn.s32 d8, q6, #16 // ip[6] * C6 44.189 + vshrn.s32 d9, q7, #16 44.190 + vadd.s16 q2, q2, q13 // ip[5] * C3 44.191 + vadd.s16 q3, q3, q10 // ip[2] * C2 44.192 + vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5 44.193 + vsub.s16 q1, q9, q11 // (A - C) 44.194 + vadd.s16 q11, q9, q11 // Cd = A + C 44.195 + vsub.s16 q9, q15, q13 // (B - D) 44.196 + vadd.s16 q13, q15, q13 // Dd = B + D 44.197 + vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6 44.198 + 44.199 + vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16 44.200 + vmull.s16 q3, d3, xC4S4 44.201 + vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16 44.202 + vmull.s16 q5, d29, xC2S6 44.203 + vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16 44.204 + vmull.s16 q7, d21, xC6S2 44.205 + vshrn.s32 d4, q2, #16 44.206 + vshrn.s32 d5, q3, #16 44.207 + vshrn.s32 d6, q4, #16 44.208 + vshrn.s32 d7, q5, #16 44.209 + vshrn.s32 d8, q6, #16 // ip[2] * C6 44.210 + vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16 44.211 + vmull.s16 q6, d19, xC4S4 44.212 + vshrn.s32 d9, q7, #16 44.213 + vadd.s16 q3, q3, q14 // ip[6] * C2 44.214 + vadd.s16 q10, q1, q2 // Ad = (A - C) * C4 44.215 + vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2 44.216 + bx lr 44.217 +endfunc 44.218 + 44.219 +.macro VP3_IDCT_END type 44.220 +function vp3_idct_end_\type\()_neon 44.221 +.ifc \type, col 44.222 + vdup.16 q0, r3 44.223 + vadd.s16 q12, q12, q0 44.224 + vadd.s16 q8, q8, q0 44.225 +.endif 44.226 + 44.227 + vshrn.s32 d2, q5, #16 44.228 + vshrn.s32 d3, q6, #16 44.229 + vadd.s16 q2, q12, q15 // Gd = E + G 44.230 + vadd.s16 q9, q1, q9 // (B - D) * 
C4 44.231 + vsub.s16 q12, q12, q15 // Ed = E - G 44.232 + vsub.s16 q3, q8, q10 // Fd = F - Ad 44.233 + vadd.s16 q10, q8, q10 // Add = F + Ad 44.234 + vadd.s16 q4, q9, q14 // Hd = Bd + H 44.235 + vsub.s16 q14, q9, q14 // Bdd = Bd - H 44.236 + vadd.s16 q8, q2, q11 // [0] = Gd + Cd 44.237 + vsub.s16 q15, q2, q11 // [7] = Gd - Cd 44.238 + vadd.s16 q9, q10, q4 // [1] = Add + Hd 44.239 + vsub.s16 q10, q10, q4 // [2] = Add - Hd 44.240 + vadd.s16 q11, q12, q13 // [3] = Ed + Dd 44.241 + vsub.s16 q12, q12, q13 // [4] = Ed - Dd 44.242 +.ifc \type, row 44.243 + vtrn.16 q8, q9 44.244 +.endif 44.245 + vadd.s16 q13, q3, q14 // [5] = Fd + Bdd 44.246 + vsub.s16 q14, q3, q14 // [6] = Fd - Bdd 44.247 + 44.248 +.ifc \type, row 44.249 + // 8x8 transpose 44.250 + vtrn.16 q10, q11 44.251 + vtrn.16 q12, q13 44.252 + vtrn.16 q14, q15 44.253 + vtrn.32 q8, q10 44.254 + vtrn.32 q9, q11 44.255 + vtrn.32 q12, q14 44.256 + vtrn.32 q13, q15 44.257 + vswp d17, d24 44.258 + vswp d19, d26 44.259 + vadd.s16 q1, q8, q12 44.260 + vswp d21, d28 44.261 + vsub.s16 q8, q8, q12 44.262 + vswp d23, d30 44.263 +.endif 44.264 + bx lr 44.265 +endfunc 44.266 +.endm 44.267 + 44.268 +VP3_IDCT_END row 44.269 +VP3_IDCT_END col 44.270 + 44.271 +function ff_vp3_idct_neon, export=1 44.272 + mov ip, lr 44.273 + mov r2, r0 44.274 + bl vp3_idct_start_neon 44.275 + bl vp3_idct_end_row_neon 44.276 + mov r3, #8 44.277 + bl vp3_idct_core_neon 44.278 + bl vp3_idct_end_col_neon 44.279 + mov lr, ip 44.280 + vpop {d8-d15} 44.281 + 44.282 + vshr.s16 q8, q8, #4 44.283 + vshr.s16 q9, q9, #4 44.284 + vshr.s16 q10, q10, #4 44.285 + vshr.s16 q11, q11, #4 44.286 + vshr.s16 q12, q12, #4 44.287 + vst1.64 {d16-d19}, [r0,:128]! 44.288 + vshr.s16 q13, q13, #4 44.289 + vshr.s16 q14, q14, #4 44.290 + vst1.64 {d20-d23}, [r0,:128]! 44.291 + vshr.s16 q15, q15, #4 44.292 + vst1.64 {d24-d27}, [r0,:128]! 44.293 + vst1.64 {d28-d31}, [r0,:128]! 
44.294 + bx lr 44.295 +endfunc 44.296 + 44.297 +function ff_vp3_idct_put_neon, export=1 44.298 + mov ip, lr 44.299 + bl vp3_idct_start_neon 44.300 + bl vp3_idct_end_row_neon 44.301 + mov r3, #8 44.302 + add r3, r3, #2048 // convert signed pixel to unsigned 44.303 + bl vp3_idct_core_neon 44.304 + bl vp3_idct_end_col_neon 44.305 + mov lr, ip 44.306 + vpop {d8-d15} 44.307 + 44.308 + vqshrun.s16 d0, q8, #4 44.309 + vqshrun.s16 d1, q9, #4 44.310 + vqshrun.s16 d2, q10, #4 44.311 + vqshrun.s16 d3, q11, #4 44.312 + vst1.64 {d0}, [r0,:64], r1 44.313 + vqshrun.s16 d4, q12, #4 44.314 + vst1.64 {d1}, [r0,:64], r1 44.315 + vqshrun.s16 d5, q13, #4 44.316 + vst1.64 {d2}, [r0,:64], r1 44.317 + vqshrun.s16 d6, q14, #4 44.318 + vst1.64 {d3}, [r0,:64], r1 44.319 + vqshrun.s16 d7, q15, #4 44.320 + vst1.64 {d4}, [r0,:64], r1 44.321 + vst1.64 {d5}, [r0,:64], r1 44.322 + vst1.64 {d6}, [r0,:64], r1 44.323 + vst1.64 {d7}, [r0,:64], r1 44.324 + bx lr 44.325 +endfunc 44.326 + 44.327 +function ff_vp3_idct_add_neon, export=1 44.328 + mov ip, lr 44.329 + bl vp3_idct_start_neon 44.330 + bl vp3_idct_end_row_neon 44.331 + mov r3, #8 44.332 + bl vp3_idct_core_neon 44.333 + bl vp3_idct_end_col_neon 44.334 + mov lr, ip 44.335 + vpop {d8-d15} 44.336 + mov r2, r0 44.337 + 44.338 + vld1.64 {d0}, [r0,:64], r1 44.339 + vshr.s16 q8, q8, #4 44.340 + vld1.64 {d1}, [r0,:64], r1 44.341 + vshr.s16 q9, q9, #4 44.342 + vld1.64 {d2}, [r0,:64], r1 44.343 + vaddw.u8 q8, q8, d0 44.344 + vld1.64 {d3}, [r0,:64], r1 44.345 + vaddw.u8 q9, q9, d1 44.346 + vld1.64 {d4}, [r0,:64], r1 44.347 + vshr.s16 q10, q10, #4 44.348 + vld1.64 {d5}, [r0,:64], r1 44.349 + vshr.s16 q11, q11, #4 44.350 + vld1.64 {d6}, [r0,:64], r1 44.351 + vqmovun.s16 d0, q8 44.352 + vld1.64 {d7}, [r0,:64], r1 44.353 + vqmovun.s16 d1, q9 44.354 + vaddw.u8 q10, q10, d2 44.355 + vaddw.u8 q11, q11, d3 44.356 + vshr.s16 q12, q12, #4 44.357 + vshr.s16 q13, q13, #4 44.358 + vqmovun.s16 d2, q10 44.359 + vqmovun.s16 d3, q11 44.360 + vaddw.u8 q12, q12, d4 44.361 + 
vaddw.u8 q13, q13, d5 44.362 + vshr.s16 q14, q14, #4 44.363 + vshr.s16 q15, q15, #4 44.364 + vst1.64 {d0}, [r2,:64], r1 44.365 + vqmovun.s16 d4, q12 44.366 + vst1.64 {d1}, [r2,:64], r1 44.367 + vqmovun.s16 d5, q13 44.368 + vst1.64 {d2}, [r2,:64], r1 44.369 + vaddw.u8 q14, q14, d6 44.370 + vst1.64 {d3}, [r2,:64], r1 44.371 + vaddw.u8 q15, q15, d7 44.372 + vst1.64 {d4}, [r2,:64], r1 44.373 + vqmovun.s16 d6, q14 44.374 + vst1.64 {d5}, [r2,:64], r1 44.375 + vqmovun.s16 d7, q15 44.376 + vst1.64 {d6}, [r2,:64], r1 44.377 + vst1.64 {d7}, [r2,:64], r1 44.378 + bx lr 44.379 +endfunc 44.380 + 44.381 +function ff_vp3_idct_dc_add_neon, export=1 44.382 + ldrsh r2, [r2] 44.383 + movw r3, #46341 44.384 + mul r2, r3, r2 44.385 + smulwt r2, r3, r2 44.386 + mov r3, r0 44.387 + vdup.16 q15, r2 44.388 + vrshr.s16 q15, q15, #4 44.389 + 44.390 + vld1.8 {d0}, [r0,:64], r1 44.391 + vld1.8 {d1}, [r0,:64], r1 44.392 + vld1.8 {d2}, [r0,:64], r1 44.393 + vaddw.u8 q8, q15, d0 44.394 + vld1.8 {d3}, [r0,:64], r1 44.395 + vaddw.u8 q9, q15, d1 44.396 + vld1.8 {d4}, [r0,:64], r1 44.397 + vaddw.u8 q10, q15, d2 44.398 + vld1.8 {d5}, [r0,:64], r1 44.399 + vaddw.u8 q11, q15, d3 44.400 + vld1.8 {d6}, [r0,:64], r1 44.401 + vaddw.u8 q12, q15, d4 44.402 + vld1.8 {d7}, [r0,:64], r1 44.403 + vaddw.u8 q13, q15, d5 44.404 + vqmovun.s16 d0, q8 44.405 + vaddw.u8 q14, q15, d6 44.406 + vqmovun.s16 d1, q9 44.407 + vaddw.u8 q15, q15, d7 44.408 + vqmovun.s16 d2, q10 44.409 + vst1.8 {d0}, [r3,:64], r1 44.410 + vqmovun.s16 d3, q11 44.411 + vst1.8 {d1}, [r3,:64], r1 44.412 + vqmovun.s16 d4, q12 44.413 + vst1.8 {d2}, [r3,:64], r1 44.414 + vqmovun.s16 d5, q13 44.415 + vst1.8 {d3}, [r3,:64], r1 44.416 + vqmovun.s16 d6, q14 44.417 + vst1.8 {d4}, [r3,:64], r1 44.418 + vqmovun.s16 d7, q15 44.419 + vst1.8 {d5}, [r3,:64], r1 44.420 + vst1.8 {d6}, [r3,:64], r1 44.421 + vst1.8 {d7}, [r3,:64], r1 44.422 + bx lr 44.423 +endfunc
45.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 45.2 +++ b/ffmpeg_smp/h264dec/libavcodec/avcodec.h Mon Aug 27 12:09:56 2012 +0200 45.3 @@ -0,0 +1,407 @@ 45.4 +#ifndef AVCODEC_AVCODEC_H 45.5 +#define AVCODEC_AVCODEC_H 45.6 + 45.7 +#include <errno.h> 45.8 +#include <stdint.h> 45.9 +#include "config.h" 45.10 + 45.11 +#include "libavutil/mem.h" 45.12 + 45.13 +#define MAX_SPS_COUNT 32 45.14 +#define MAX_PPS_COUNT 256 45.15 + 45.16 + 45.17 +#ifndef CABAC 45.18 +#define CABAC h->pps.cabac 45.19 +#endif 45.20 + 45.21 +#define EXTENDED_SAR 255 45.22 + 45.23 +#define MB_TYPE_REF0 MB_TYPE_ACPRED //dirty but it fits in 16 bit 45.24 +#define MB_TYPE_8x8DCT 0x01000000 45.25 +#define IS_REF0(a) ((a) & MB_TYPE_REF0) 45.26 +#define IS_8x8DCT(a) ((a) & MB_TYPE_8x8DCT) 45.27 + 45.28 +#define LIST_NOT_USED -1 45.29 +#define PART_NOT_AVAILABLE -2 45.30 + 45.31 +/* dct code */ 45.32 +typedef short DCTELEM; 45.33 + 45.34 +/** 45.35 +* Required number of additionally allocated bytes at the end of the input bitstream for decoding. 45.36 +* This is mainly needed because some optimized bitstream readers read 45.37 +* 32 or 64 bit at once and could read over the end.<br> 45.38 +* Note: If the first 23 bits of the additional bytes are not 0, then damaged 45.39 +* MPEG bitstreams could cause overread and segfault. 
45.40 +*/ 45.41 +#define FF_INPUT_BUFFER_PADDING_SIZE 8 45.42 + 45.43 +enum AVColorPrimaries{ 45.44 + AVCOL_PRI_BT709 =1, ///< also ITU-R BT1361 / IEC 61966-2-4 / SMPTE RP177 Annex B 45.45 + AVCOL_PRI_UNSPECIFIED=2, 45.46 + AVCOL_PRI_BT470M =4, 45.47 + AVCOL_PRI_BT470BG =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM 45.48 + AVCOL_PRI_SMPTE170M =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC 45.49 + AVCOL_PRI_SMPTE240M =7, ///< functionally identical to above 45.50 + AVCOL_PRI_FILM =8, 45.51 + AVCOL_PRI_NB , ///< Not part of ABI 45.52 +}; 45.53 + 45.54 +enum AVColorTransferCharacteristic{ 45.55 + AVCOL_TRC_BT709 =1, ///< also ITU-R BT1361 45.56 + AVCOL_TRC_UNSPECIFIED=2, 45.57 + AVCOL_TRC_GAMMA22 =4, ///< also ITU-R BT470M / ITU-R BT1700 625 PAL & SECAM 45.58 + AVCOL_TRC_GAMMA28 =5, ///< also ITU-R BT470BG 45.59 + AVCOL_TRC_NB , ///< Not part of ABI 45.60 +}; 45.61 + 45.62 +enum AVColorSpace{ 45.63 + AVCOL_SPC_RGB =0, 45.64 + AVCOL_SPC_BT709 =1, ///< also ITU-R BT1361 / IEC 61966-2-4 xvYCC709 / SMPTE RP177 Annex B 45.65 + AVCOL_SPC_UNSPECIFIED=2, 45.66 + AVCOL_SPC_FCC =4, 45.67 + AVCOL_SPC_BT470BG =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM / IEC 61966-2-4 xvYCC601 45.68 + AVCOL_SPC_SMPTE170M =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC / functionally identical to above 45.69 + AVCOL_SPC_SMPTE240M =7, 45.70 + AVCOL_SPC_NB , ///< Not part of ABI 45.71 +}; 45.72 + 45.73 +enum AVColorRange{ 45.74 + AVCOL_RANGE_UNSPECIFIED=0, 45.75 + AVCOL_RANGE_MPEG =1, ///< the normal 219*2^(n-8) "MPEG" YUV ranges 45.76 + AVCOL_RANGE_JPEG =2, ///< the normal 2^n-1 "JPEG" YUV ranges 45.77 + AVCOL_RANGE_NB , ///< Not part of ABI 45.78 +}; 45.79 + 45.80 +#define MAX_MMCO_COUNT 66 45.81 +/** 45.82 +* Memory management control operation opcode. 
45.83 +*/ 45.84 +typedef enum MMCOOpcode{ 45.85 + MMCO_END=0, 45.86 + MMCO_SHORT2UNUSED, 45.87 + MMCO_LONG2UNUSED, 45.88 + MMCO_SHORT2LONG, 45.89 + MMCO_SET_MAX_LONG, 45.90 + MMCO_RESET, 45.91 + MMCO_LONG, 45.92 +} MMCOOpcode; 45.93 + 45.94 +/* NAL unit types */ 45.95 +enum { 45.96 + NAL_SLICE=1, 45.97 + NAL_DPA, 45.98 + NAL_DPB, 45.99 + NAL_DPC, 45.100 + NAL_IDR_SLICE, 45.101 + NAL_SEI, 45.102 + NAL_SPS, 45.103 + NAL_PPS, 45.104 + NAL_AUD, 45.105 + NAL_END_SEQUENCE, 45.106 + NAL_END_STREAM, 45.107 + NAL_FILLER_DATA, 45.108 + NAL_SPS_EXT, 45.109 + NAL_AUXILIARY_SLICE=19 45.110 +}; 45.111 + 45.112 +/** 45.113 +* SEI message types 45.114 +*/ 45.115 +typedef enum { 45.116 + SEI_BUFFERING_PERIOD = 0, ///< buffering period (H.264, D.1.1) 45.117 + SEI_TYPE_PIC_TIMING = 1, ///< picture timing 45.118 + SEI_TYPE_USER_DATA_UNREGISTERED = 5, ///< unregistered user data 45.119 + SEI_TYPE_RECOVERY_POINT = 6 ///< recovery point (frame # to decoder sync) 45.120 +} SEI_Type; 45.121 + 45.122 +/** 45.123 +* pic_struct in picture timing SEI message 45.124 +*/ 45.125 +typedef enum { 45.126 + SEI_PIC_STRUCT_FRAME = 0, ///< 0: %frame 45.127 + SEI_PIC_STRUCT_TOP_FIELD = 1, ///< 1: top field 45.128 + SEI_PIC_STRUCT_BOTTOM_FIELD = 2, ///< 2: bottom field 45.129 + SEI_PIC_STRUCT_TOP_BOTTOM = 3, ///< 3: top field, bottom field, in that order 45.130 + SEI_PIC_STRUCT_BOTTOM_TOP = 4, ///< 4: bottom field, top field, in that order 45.131 + SEI_PIC_STRUCT_TOP_BOTTOM_TOP = 5, ///< 5: top field, bottom field, top field repeated, in that order 45.132 + SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM = 6, ///< 6: bottom field, top field, bottom field repeated, in that order 45.133 + SEI_PIC_STRUCT_FRAME_DOUBLING = 7, ///< 7: %frame doubling 45.134 + SEI_PIC_STRUCT_FRAME_TRIPLING = 8 ///< 8: %frame tripling 45.135 +} SEI_PicStructType; 45.136 + 45.137 +#define FF_MAX_B_FRAMES 16 45.138 + 45.139 + 45.140 +//The following defines may change, don't expect compatibility if you use them. 
45.141 +#define MB_TYPE_INTRA4x4 0x0001 45.142 +#define MB_TYPE_INTRA16x16 0x0002 //FIXME H.264-specific 45.143 +#define MB_TYPE_INTRA_PCM 0x0004 //FIXME H.264-specific 45.144 +#define MB_TYPE_16x16 0x0008 45.145 +#define MB_TYPE_16x8 0x0010 45.146 +#define MB_TYPE_8x16 0x0020 45.147 +#define MB_TYPE_8x8 0x0040 45.148 +#define MB_TYPE_INTERLACED 0x0080 45.149 +#define MB_TYPE_DIRECT2 0x0100 //FIXME 45.150 +#define MB_TYPE_ACPRED 0x0200 45.151 +#define MB_TYPE_GMC 0x0400 45.152 +#define MB_TYPE_SKIP 0x0800 45.153 +#define MB_TYPE_P0L0 0x1000 45.154 +#define MB_TYPE_P1L0 0x2000 45.155 +#define MB_TYPE_P0L1 0x4000 45.156 +#define MB_TYPE_P1L1 0x8000 45.157 +#define MB_TYPE_L0 (MB_TYPE_P0L0 | MB_TYPE_P1L0) 45.158 +#define MB_TYPE_L1 (MB_TYPE_P0L1 | MB_TYPE_P1L1) 45.159 +#define MB_TYPE_L0L1 (MB_TYPE_L0 | MB_TYPE_L1) 45.160 +#define MB_TYPE_QUANT 0x00010000 45.161 +#define MB_TYPE_CBP 0x00020000 45.162 +//Note bits 24-31 are reserved for codec specific use (h264 ref0, mpeg1 0mv, ...) 45.163 + 45.164 +#define FF_BUFFER_TYPE_INTERNAL 1 45.165 +#define FF_BUFFER_TYPE_USER 2 ///< direct rendering buffers (image is (de)allocated by user) 45.166 +#define FF_BUFFER_TYPE_SHARED 4 ///< Buffer from somewhere else; don't deallocate image (data/base), all other tables are not shared. 45.167 +#define FF_BUFFER_TYPE_COPY 8 ///< Just a (modified) copy of some other buffer, don't deallocate anything. 
45.168 + 45.169 + 45.170 +#define FF_I_TYPE 1 ///< Intra 45.171 +#define FF_P_TYPE 2 ///< Predicted 45.172 +#define FF_B_TYPE 3 ///< Bi-dir predicted 45.173 +#define FF_S_TYPE 4 ///< S(GMC)-VOP MPEG4 45.174 +#define FF_SI_TYPE 5 ///< Switching Intra 45.175 +#define FF_SP_TYPE 6 ///< Switching Predicted 45.176 +#define FF_BI_TYPE 7 45.177 + 45.178 +#define MB_TYPE_INTRA MB_TYPE_INTRA4x4 //default mb_type if there is just one type 45.179 +#define IS_INTRA4x4(a) ((a)&MB_TYPE_INTRA4x4) 45.180 +#define IS_INTRA16x16(a) ((a)&MB_TYPE_INTRA16x16) 45.181 +#define IS_PCM(a) ((a)&MB_TYPE_INTRA_PCM) 45.182 +#define IS_INTRA(a) ((a)&7) 45.183 +#define IS_INTER(a) ((a)&(MB_TYPE_16x16|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8)) 45.184 +#define IS_SKIP(a) ((a)&MB_TYPE_SKIP) 45.185 +#define IS_INTRA_PCM(a) ((a)&MB_TYPE_INTRA_PCM) 45.186 +#define IS_INTERLACED(a) ((a)&MB_TYPE_INTERLACED) 45.187 +#define IS_DIRECT(a) ((a)&MB_TYPE_DIRECT2) 45.188 +#define IS_GMC(a) ((a)&MB_TYPE_GMC) 45.189 +#define IS_16X16(a) ((a)&MB_TYPE_16x16) 45.190 +#define IS_16X8(a) ((a)&MB_TYPE_16x8) 45.191 +#define IS_8X16(a) ((a)&MB_TYPE_8x16) 45.192 +#define IS_8X8(a) ((a)&MB_TYPE_8x8) 45.193 +#define IS_SUB_8X8(a) ((a)&MB_TYPE_16x16) //note reused 45.194 +#define IS_SUB_8X4(a) ((a)&MB_TYPE_16x8) //note reused 45.195 +#define IS_SUB_4X8(a) ((a)&MB_TYPE_8x16) //note reused 45.196 +#define IS_SUB_4X4(a) ((a)&MB_TYPE_8x8) //note reused 45.197 +#define IS_ACPRED(a) ((a)&MB_TYPE_ACPRED) 45.198 +#define IS_QUANT(a) ((a)&MB_TYPE_QUANT) 45.199 +#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list)))) 45.200 +#define USES_LIST(a, list) ((a) & ((MB_TYPE_P0L0|MB_TYPE_P1L0)<<(2*(list)))) ///< does this mb use listX, note does not work if subMBs 45.201 +#define HAS_CBP(a) ((a)&MB_TYPE_CBP) 45.202 + 45.203 + 45.204 +#define FF_MM_FORCE 0x80000000 /* Force usage of selected flags (OR) */ 45.205 + /* lower 16 bits - CPU features */ 45.206 +#define FF_MM_MMX 0x0001 ///< standard MMX 45.207 +#define FF_MM_3DNOW 
0x0004 ///< AMD 3DNOW 45.208 +#define FF_MM_MMX2 0x0002 ///< SSE integer functions or AMD MMX ext 45.209 +#define FF_MM_SSE 0x0008 ///< SSE functions 45.210 +#define FF_MM_SSE2 0x0010 ///< PIV SSE2 functions 45.211 +#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt 45.212 +#define FF_MM_SSE3 0x0040 ///< Prescott SSE3 functions 45.213 +#define FF_MM_SSSE3 0x0080 ///< Conroe SSSE3 functions 45.214 +#define FF_MM_SSE4 0x0100 ///< Penryn SSE4.1 functions 45.215 +#define FF_MM_SSE42 0x0200 ///< Nehalem SSE4.2 functions 45.216 +#define FF_MM_IWMMXT 0x0100 ///< XScale IWMMXT 45.217 +#define FF_MM_ALTIVEC 0x0001 ///< standard AltiVec 45.218 + 45.219 + 45.220 +/** 45.221 +* Sequence parameter set 45.222 +*/ 45.223 +typedef struct SPS{ 45.224 + 45.225 + int profile_idc; 45.226 + int level_idc; 45.227 + int chroma_format_idc; 45.228 + int transform_bypass; ///< qpprime_y_zero_transform_bypass_flag 45.229 + int log2_max_frame_num; ///< log2_max_frame_num_minus4 + 4 45.230 + int poc_type; ///< pic_order_cnt_type 45.231 + int log2_max_poc_lsb; ///< log2_max_pic_order_cnt_lsb_minus4 45.232 + int delta_pic_order_always_zero_flag; 45.233 + int offset_for_non_ref_pic; 45.234 + int offset_for_top_to_bottom_field; 45.235 + int poc_cycle_length; ///< num_ref_frames_in_pic_order_cnt_cycle 45.236 + int ref_frame_count; ///< num_ref_frames 45.237 + int gaps_in_frame_num_allowed_flag; 45.238 + int mb_width; ///< pic_width_in_mbs_minus1 + 1 45.239 + int mb_height; ///< pic_height_in_map_units_minus1 + 1 45.240 + int frame_mbs_only_flag; 45.241 + int mb_aff; ///<mb_adaptive_frame_field_flag 45.242 + int direct_8x8_inference_flag; 45.243 + int crop; ///< frame_cropping_flag 45.244 + unsigned int crop_left; ///< frame_cropping_rect_left_offset 45.245 + unsigned int crop_right; ///< frame_cropping_rect_right_offset 45.246 + unsigned int crop_top; ///< frame_cropping_rect_top_offset 45.247 + unsigned int crop_bottom; ///< frame_cropping_rect_bottom_offset 45.248 + int vui_parameters_present_flag; 
45.249 + int num,den; 45.250 + 45.251 + int video_signal_type_present_flag; 45.252 + int full_range; 45.253 + int colour_description_present_flag; 45.254 + enum AVColorPrimaries color_primaries; 45.255 + enum AVColorTransferCharacteristic color_trc; 45.256 + enum AVColorSpace colorspace; 45.257 + int timing_info_present_flag; 45.258 + uint32_t num_units_in_tick; 45.259 + uint32_t time_scale; 45.260 + int fixed_frame_rate_flag; 45.261 + short offset_for_ref_frame[256]; //FIXME dyn alloc? 45.262 + int bitstream_restriction_flag; 45.263 + int num_reorder_frames; 45.264 + int scaling_matrix_present; 45.265 + uint8_t scaling_matrix4[6][16]; 45.266 + uint8_t scaling_matrix8[2][64]; 45.267 + int nal_hrd_parameters_present_flag; 45.268 + int vcl_hrd_parameters_present_flag; 45.269 + int pic_struct_present_flag; 45.270 + int time_offset_length; 45.271 + int cpb_cnt; ///< See H.264 E.1.2 45.272 + int initial_cpb_removal_delay_length; ///< initial_cpb_removal_delay_length_minus1 +1 45.273 + int cpb_removal_delay_length; ///< cpb_removal_delay_length_minus1 + 1 45.274 + int dpb_output_delay_length; ///< dpb_output_delay_length_minus1 + 1 45.275 + int bit_depth_luma; ///< bit_depth_luma_minus8 + 8 45.276 + int bit_depth_chroma; ///< bit_depth_chroma_minus8 + 8 45.277 + int residual_color_transform_flag; ///< residual_colour_transform_flag 45.278 +}SPS; 45.279 + 45.280 +/** 45.281 +* Picture parameter set 45.282 +*/ 45.283 +typedef struct PPS{ 45.284 + unsigned int sps_id; 45.285 + int cabac; ///< entropy_coding_mode_flag 45.286 + int pic_order_present; ///< pic_order_present_flag 45.287 + int slice_group_count; ///< num_slice_groups_minus1 + 1 45.288 + int mb_slice_group_map_type; 45.289 + unsigned int ref_count[2]; ///< num_ref_idx_l0/1_active_minus1 + 1 45.290 + int weighted_pred; ///< weighted_pred_flag 45.291 + int weighted_bipred_idc; 45.292 + int init_qp; ///< pic_init_qp_minus26 + 26 45.293 + int init_qs; ///< pic_init_qs_minus26 + 26 45.294 + int 
chroma_qp_index_offset[2]; 45.295 + int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag 45.296 + int constrained_intra_pred; ///< constrained_intra_pred_flag 45.297 + int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag 45.298 + int transform_8x8_mode; ///< transform_8x8_mode_flag 45.299 + uint8_t scaling_matrix4[6][16]; 45.300 + uint8_t scaling_matrix8[2][64]; 45.301 + uint8_t chroma_qp_table[2][64]; ///< pre-scaled (with chroma_qp_index_offset) version of qp_table 45.302 + int chroma_qp_diff; 45.303 +}PPS; 45.304 + 45.305 +typedef struct TopBorder{ 45.306 + uint8_t unfiltered_y[16]; 45.307 + uint8_t unfiltered_cb[8]; 45.308 + uint8_t unfiltered_cr[8]; 45.309 + 45.310 + uint8_t top_borders_y[16*4]; 45.311 + uint8_t top_borders_cb[8*2]; 45.312 + uint8_t top_borders_cr[8*2]; 45.313 +}TopBorder; 45.314 + 45.315 +typedef struct LeftBorder{ 45.316 + uint8_t unfiltered_y[17]; 45.317 + uint8_t unfiltered_cb[9]; 45.318 + uint8_t unfiltered_cr[9]; 45.319 +}LeftBorder; 45.320 + 45.321 +typedef struct H264Mb { 45.322 + //variables copied in after cabac decoding 45.323 + int16_t mb_x, mb_y; 45.324 + int32_t mb_type; 45.325 + 45.326 + uint16_t cbp; // coded block pattern, idct, deblock 45.327 + int8_t qscale_mb_xy; // qp, deblock 45.328 + int8_t qscale_left_mb_xy; //not required 45.329 + int8_t qscale_top_mb_xy; 45.330 + 45.331 + DECLARE_ALIGNED(8, uint16_t, sub_mb_type[4]); 45.332 + DECLARE_ALIGNED(8, uint8_t, non_zero_count[24]); //idct deblock 45.333 + DECLARE_ALIGNED(16, int16_t, mb[16*24]); //coeffs, idct 45.334 + 45.335 + union{ 45.336 + struct { 45.337 + DECLARE_ALIGNED(8, int8_t, ref_index[2][4]); //mc, deblock 45.338 + DECLARE_ALIGNED(16, int16_t, mvd[2][16][2]); //mc, deblock 45.339 + }; 45.340 + struct { 45.341 + DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode[16]); //intra, deblock 45.342 + int8_t chroma_pred_mode; //intra 45.343 + int8_t intra16x16_pred_mode; //intra, deblock 45.344 + }; 45.345 + }; 45.346 + 45.347 
+#if OMPSS 45.348 + DECLARE_ALIGNED(8, uint8_t, top_border[16+ 2*8]); 45.349 + DECLARE_ALIGNED(8, uint8_t, top_border_next[8]); 45.350 + DECLARE_ALIGNED(8, uint8_t, left_border[17+2*9]); 45.351 + int8_t intra4x4_pred_mode_left[4]; 45.352 +#endif 45.353 + 45.354 +} H264Mb; 45.355 + 45.356 +typedef struct RawFrame { 45.357 + uint8_t *data; 45.358 + int size; 45.359 + unsigned int data_size; 45.360 + int64_t pos; ///< byte position in stream, -1 if unknown 45.361 + int state; 45.362 +} RawFrame; 45.363 + 45.364 +typedef struct PictureInfo{ 45.365 + int ref_poc[2][16]; ///< h264 POCs of the frames used as reference 45.366 + int ref_count[2]; ///< number of entries in ref_poc 45.367 + int poc; ///< h264 frame POC 45.368 + int frame_num; ///< h264 frame_num (raw frame_num from slice header) 45.369 + int pic_id; 45.370 + int long_ref; 45.371 + int cpn; ///coded picture number 45.372 + int slice_type_nos; 45.373 +// int key_frame; 45.374 +// int mmco_reset; ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET. 45.375 + 45.376 + int reference; //Set to 4 for delayed, non-reference frames. 1-3 for reference. FIXME 45.377 + 45.378 +}PictureInfo; 45.379 + 45.380 +typedef struct DecodedPicture{ 45.381 + int16_t (*motion_val[2])[2]; 45.382 + int16_t (*motion_val_base[2])[2]; 45.383 + 45.384 + /** 45.385 + * motion reference frame index 45.386 + * the order in which these are stored can depend on the codec. 45.387 + * - encoding: Set by user. 45.388 + * - decoding: Set by libavcodec. 
45.389 + */ 45.390 + int8_t *ref_index[2]; 45.391 + uint32_t *mb_type; //mb_type_base + mb_width + 2 45.392 + uint32_t *mb_type_base; 45.393 + 45.394 + int8_t *intra4x4_pred_mode; 45.395 + int8_t *non_zero_count; 45.396 + 45.397 + uint8_t *data[3]; //point to first pixel in the frame 45.398 + int linesize[3]; 45.399 + uint8_t *base[3]; //base of picture planes 45.400 + 45.401 + int cpn; /// coded picture number 45.402 + int poc; ///< h264 frame POC 45.403 + int reference; // 0 -> free, 1 -> needs to be displayed, 2 -> needed for reference, 3 -> 1 && 2 45.404 + int key_frame; 45.405 + int mmco_reset; ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET. 45.406 + 45.407 +} DecodedPicture; 45.408 + 45.409 + 45.410 +#endif /* AVCODEC_AVCODEC_H */
46.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 46.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cabac.c Mon Aug 27 12:09:56 2012 +0200 46.3 @@ -0,0 +1,242 @@ 46.4 +/* 46.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 46.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 46.7 + * 46.8 + * This file is part of FFmpeg. 46.9 + * 46.10 + * FFmpeg is free software; you can redistribute it and/or 46.11 + * modify it under the terms of the GNU Lesser General Public 46.12 + * License as published by the Free Software Foundation; either 46.13 + * version 2.1 of the License, or (at your option) any later version. 46.14 + * 46.15 + * FFmpeg is distributed in the hope that it will be useful, 46.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 46.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 46.18 + * Lesser General Public License for more details. 46.19 + * 46.20 + * You should have received a copy of the GNU Lesser General Public 46.21 + * License along with FFmpeg; if not, write to the Free Software 46.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 46.23 + */ 46.24 + 46.25 +/** 46.26 + * @file 46.27 + * Context Adaptive Binary Arithmetic Coder. 
46.28 + */ 46.29 + 46.30 +#include <string.h> 46.31 + 46.32 +#include "libavutil/common.h" 46.33 +//#include "get_bits.h" 46.34 +#include "cabac.h" 46.35 + 46.36 +static const uint8_t lps_range[64][4]= { 46.37 +{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205}, 46.38 +{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166}, 46.39 +{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135}, 46.40 +{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110}, 46.41 +{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89}, 46.42 +{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72}, 46.43 +{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59}, 46.44 +{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48}, 46.45 +{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39}, 46.46 +{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31}, 46.47 +{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25}, 46.48 +{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21}, 46.49 +{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17}, 46.50 +{ 10, 12, 14, 16}, { 9, 11, 13, 15}, { 9, 11, 12, 14}, { 8, 10, 12, 14}, 46.51 +{ 8, 9, 11, 13}, { 7, 9, 11, 12}, { 7, 9, 10, 12}, { 7, 8, 10, 11}, 46.52 +{ 6, 8, 9, 11}, { 6, 7, 9, 10}, { 6, 7, 8, 9}, { 2, 2, 2, 2}, 46.53 +}; 46.54 + 46.55 +uint8_t ff_h264_mlps_state[4*64]; 46.56 +uint8_t ff_h264_lps_range[4*2*64]; 46.57 +uint8_t ff_h264_lps_state[2*64]; 46.58 +uint8_t ff_h264_mps_state[2*64]; 46.59 + 46.60 +static const uint8_t mps_state[64]= { 46.61 + 1, 2, 3, 4, 5, 6, 7, 8, 46.62 + 9,10,11,12,13,14,15,16, 46.63 + 17,18,19,20,21,22,23,24, 46.64 + 25,26,27,28,29,30,31,32, 46.65 + 33,34,35,36,37,38,39,40, 46.66 + 41,42,43,44,45,46,47,48, 46.67 + 49,50,51,52,53,54,55,56, 46.68 + 
57,58,59,60,61,62,62,63, 46.69 +}; 46.70 + 46.71 +static const uint8_t lps_state[64]= { 46.72 + 0, 0, 1, 2, 2, 4, 4, 5, 46.73 + 6, 7, 8, 9, 9,11,11,12, 46.74 + 13,13,15,15,16,16,18,18, 46.75 + 19,19,21,21,22,22,23,24, 46.76 + 24,25,26,26,27,27,28,29, 46.77 + 29,30,30,30,31,32,32,33, 46.78 + 33,33,34,34,35,35,35,36, 46.79 + 36,36,37,37,37,38,38,63, 46.80 +}; 46.81 + 46.82 +const uint8_t ff_h264_norm_shift[512]= { 46.83 + 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5, 46.84 + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, 46.85 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 46.86 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 46.87 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 46.88 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 46.89 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 46.90 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 46.91 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 46.92 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 46.93 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 46.94 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 46.95 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46.96 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46.97 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46.98 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46.99 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46.100 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46.101 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46.102 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46.103 +}; 46.104 + 46.105 +/** 46.106 + * 46.107 + * @param buf_size size of buf in bytes 46.108 + */ 46.109 +void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){ 46.110 + c->bytestream_start= 46.111 + c->bytestream= buf; 46.112 + c->bytestream_end= buf + buf_size; 46.113 + 46.114 +#if CABAC_BITS == 16 46.115 + c->low = (*c->bytestream++)<<18; 
46.116 + c->low+= (*c->bytestream++)<<10; 46.117 +#else 46.118 + c->low = (*c->bytestream++)<<10; 46.119 +#endif 46.120 + c->low+= ((*c->bytestream++)<<2) + 2; 46.121 + c->range= 0x1FE; 46.122 +} 46.123 + 46.124 +void ff_init_cabac_states(){ 46.125 + int i, j; 46.126 + 46.127 + for(i=0; i<64; i++){ 46.128 + for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save 46.129 + ff_h264_lps_range[j*2*64+2*i+0]= 46.130 + ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j]; 46.131 + } 46.132 + 46.133 + ff_h264_mlps_state[128+2*i+0]= 46.134 + ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0; 46.135 + ff_h264_mlps_state[128+2*i+1]= 46.136 + ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1; 46.137 + 46.138 + if( i ){ 46.139 +#ifdef BRANCHLESS_CABAC_DECODER 46.140 + ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0; 46.141 + ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1; 46.142 + }else{ 46.143 + ff_h264_mlps_state[128-2*i-1]= 1; 46.144 + ff_h264_mlps_state[128-2*i-2]= 0; 46.145 +#else 46.146 + ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0; 46.147 + ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1; 46.148 + }else{ 46.149 + ff_h264_lps_state[2*i+0]= 1; 46.150 + ff_h264_lps_state[2*i+1]= 0; 46.151 +#endif 46.152 + } 46.153 + } 46.154 +} 46.155 + 46.156 +#ifdef TEST 46.157 +#define SIZE 10240 46.158 +#define START_TIMER 46.159 +#define STOP_TIMER(...) 46.160 +#define av_log(...) 
46.161 +// #include "libavutil/lfg.h" 46.162 +#include "avcodec.h" 46.163 +#include "cabac.h" 46.164 + 46.165 +int main(void){ 46.166 + CABACContext c; 46.167 + uint8_t b[9*SIZE]; 46.168 + uint8_t r[9*SIZE]; 46.169 + int i; 46.170 + uint8_t state[10]= {0}; 46.171 +// AVLFG prng; 46.172 + 46.173 +// // av_lfg_init(&prng, 1); 46.174 +// ff_init_cabac_encoder(&c, b, SIZE); 46.175 +// ff_init_cabac_states(); 46.176 +// 46.177 +// for(i=0; i<SIZE; i++){ 46.178 +// r[i] = i%7; //av_lfg_get(&prng) % 7; 46.179 +// } 46.180 +// 46.181 +// for(i=0; i<SIZE; i++){ 46.182 +// START_TIMER 46.183 +// put_cabac_bypass(&c, r[i]&1); 46.184 +// STOP_TIMER("put_cabac_bypass") 46.185 +// } 46.186 +// 46.187 +// for(i=0; i<SIZE; i++){ 46.188 +// START_TIMER 46.189 +// put_cabac(&c, state, r[i]&1); 46.190 +// STOP_TIMER("put_cabac") 46.191 +// } 46.192 +// 46.193 +// for(i=0; i<SIZE; i++){ 46.194 +// START_TIMER 46.195 +// put_cabac_u(&c, state, r[i], 6, 3, i&1); 46.196 +// STOP_TIMER("put_cabac_u") 46.197 +// } 46.198 +// 46.199 +// for(i=0; i<SIZE; i++){ 46.200 +// START_TIMER 46.201 +// put_cabac_ueg(&c, state, r[i], 3, 0, 1, 2); 46.202 +// STOP_TIMER("put_cabac_ueg") 46.203 +// } 46.204 +// 46.205 +// put_cabac_terminate(&c, 1); 46.206 + 46.207 + ff_init_cabac_decoder(&c, b, SIZE); 46.208 + 46.209 + memset(state, 0, sizeof(state)); 46.210 + 46.211 + for(i=0; i<SIZE; i++){ 46.212 +START_TIMER 46.213 + if( (r[i]&1) != get_cabac_bypass(&c) ) 46.214 + av_log(NULL, AV_LOG_ERROR, "CABAC bypass failure at %d\n", i); 46.215 +STOP_TIMER("get_cabac_bypass") 46.216 + } 46.217 + 46.218 + for(i=0; i<SIZE; i++){ 46.219 +START_TIMER 46.220 + if( (r[i]&1) != get_cabac(&c, state) ) 46.221 + av_log(NULL, AV_LOG_ERROR, "CABAC failure at %d\n", i); 46.222 +STOP_TIMER("get_cabac") 46.223 + } 46.224 +#if 0 46.225 + for(i=0; i<SIZE; i++){ 46.226 +START_TIMER 46.227 + if( r[i] != get_cabac_u(&c, state, (i&1) ? 
6 : 7, 3, i&1) ) 46.228 + av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i); 46.229 +STOP_TIMER("get_cabac_u") 46.230 + } 46.231 + 46.232 + for(i=0; i<SIZE; i++){ 46.233 +START_TIMER 46.234 + if( r[i] != get_cabac_ueg(&c, state, 3, 0, 1, 2)) 46.235 + av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i); 46.236 +STOP_TIMER("get_cabac_ueg") 46.237 + } 46.238 +#endif 46.239 + if(!get_cabac_terminate(&c)) 46.240 + av_log(NULL, AV_LOG_ERROR, "where's the Terminator?\n"); 46.241 + 46.242 + return 0; 46.243 +} 46.244 + 46.245 +#endif /* TEST */
47.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 47.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cabac.h Mon Aug 27 12:09:56 2012 +0200 47.3 @@ -0,0 +1,206 @@ 47.4 +/* 47.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 47.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 47.7 + * 47.8 + * This file is part of FFmpeg. 47.9 + * 47.10 + * FFmpeg is free software; you can redistribute it and/or 47.11 + * modify it under the terms of the GNU Lesser General Public 47.12 + * License as published by the Free Software Foundation; either 47.13 + * version 2.1 of the License, or (at your option) any later version. 47.14 + * 47.15 + * FFmpeg is distributed in the hope that it will be useful, 47.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 47.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 47.18 + * Lesser General Public License for more details. 47.19 + * 47.20 + * You should have received a copy of the GNU Lesser General Public 47.21 + * License along with FFmpeg; if not, write to the Free Software 47.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 47.23 + */ 47.24 + 47.25 +/** 47.26 + * @file 47.27 + * Context Adaptive Binary Arithmetic Coder. 
47.28 + */ 47.29 + 47.30 +#ifndef AVCODEC_CABAC_H 47.31 +#define AVCODEC_CABAC_H 47.32 + 47.33 +//#undef NDEBUG 47.34 +#include <assert.h> 47.35 +#include "libavutil/x86_cpu.h" 47.36 +#include "libavutil/attributes.h" 47.37 + 47.38 +#define CABAC_BITS 16 47.39 +#define CABAC_MASK ((1<<CABAC_BITS)-1) 47.40 +#define BRANCHLESS_CABAC_DECODER 1 47.41 + 47.42 +typedef struct CABACContext{ 47.43 + int low; 47.44 + int range; 47.45 + int outstanding_count; 47.46 +#ifdef STRICT_LIMITS 47.47 + int symCount; 47.48 +#endif 47.49 + const uint8_t *bytestream_start; 47.50 + const uint8_t *bytestream; 47.51 + const uint8_t *bytestream_end; 47.52 + uint8_t cabac_state[460]; 47.53 +}CABACContext; 47.54 + 47.55 +extern uint8_t ff_h264_mlps_state[4*64]; 47.56 +extern uint8_t ff_h264_lps_range[4*2*64]; ///< rangeTabLPS 47.57 +extern uint8_t ff_h264_mps_state[2*64]; ///< transIdxMPS 47.58 +extern uint8_t ff_h264_lps_state[2*64]; ///< transIdxLPS 47.59 +extern const uint8_t ff_h264_norm_shift[512]; 47.60 + 47.61 +void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); 47.62 +void ff_init_cabac_states(void); 47.63 + 47.64 +static void refill(CABACContext *c){ 47.65 +#if CABAC_BITS == 16 47.66 + c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); 47.67 +#else 47.68 + c->low+= c->bytestream[0]<<1; 47.69 +#endif 47.70 + c->low -= CABAC_MASK; 47.71 + c->bytestream+= CABAC_BITS/8; 47.72 +} 47.73 + 47.74 +static void refill2(CABACContext *c){ 47.75 + int i, x; 47.76 + 47.77 + x= c->low ^ (c->low-1); 47.78 + i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; 47.79 + 47.80 + x= -CABAC_MASK; 47.81 + 47.82 +#if CABAC_BITS == 16 47.83 + x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); 47.84 +#else 47.85 + x+= c->bytestream[0]<<1; 47.86 +#endif 47.87 + 47.88 + c->low += x<<i; 47.89 + c->bytestream+= CABAC_BITS/8; 47.90 +} 47.91 + 47.92 +static inline void renorm_cabac_decoder(CABACContext *c){ 47.93 + while(c->range < 0x100){ 47.94 + c->range+= c->range; 47.95 + c->low+= 
c->low; 47.96 + if(!(c->low & CABAC_MASK)) 47.97 + refill(c); 47.98 + } 47.99 +} 47.100 + 47.101 +static inline void renorm_cabac_decoder_once(CABACContext *c){ 47.102 + 47.103 + int shift= (uint32_t)(c->range - 0x100)>>31; 47.104 + c->range<<= shift; 47.105 + c->low <<= shift; 47.106 + 47.107 + if(!(c->low & CABAC_MASK)) 47.108 + refill(c); 47.109 +} 47.110 + 47.111 +static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){ 47.112 + 47.113 + int s = *state; 47.114 + int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; 47.115 + int bit, lps_mask av_unused; 47.116 + 47.117 + c->range -= RangeLPS; 47.118 +#ifndef BRANCHLESS_CABAC_DECODER 47.119 + if(c->low < (c->range<<(CABAC_BITS+1))){ 47.120 + bit= s&1; 47.121 + *state= ff_h264_mps_state[s]; 47.122 + renorm_cabac_decoder_once(c); 47.123 + }else{ 47.124 + bit= ff_h264_norm_shift[RangeLPS]; 47.125 + c->low -= (c->range<<(CABAC_BITS+1)); 47.126 + *state= ff_h264_lps_state[s]; 47.127 + c->range = RangeLPS<<bit; 47.128 + c->low <<= bit; 47.129 + bit= (s&1)^1; 47.130 + 47.131 + if(!(c->low & CABAC_MASK)){ 47.132 + refill2(c); 47.133 + } 47.134 + } 47.135 +#else /* BRANCHLESS_CABAC_DECODER */ 47.136 + lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31; 47.137 + 47.138 + c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; 47.139 + c->range += (RangeLPS - c->range) & lps_mask; 47.140 + 47.141 + s^=lps_mask; 47.142 + *state= (ff_h264_mlps_state+128)[s]; 47.143 + bit= s&1; 47.144 + 47.145 + lps_mask= ff_h264_norm_shift[c->range]; 47.146 + c->range<<= lps_mask; 47.147 + c->low <<= lps_mask; 47.148 + if(!(c->low & CABAC_MASK)) 47.149 + refill2(c); 47.150 +#endif /* BRANCHLESS_CABAC_DECODER */ 47.151 + 47.152 + return bit; 47.153 +} 47.154 + 47.155 +static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){ 47.156 + return get_cabac_inline(c, state); 47.157 +} 47.158 + 47.159 +static int av_unused get_cabac(CABACContext *c, uint8_t * const state){ 47.160 + return 
get_cabac_inline(c, state); 47.161 +} 47.162 + 47.163 +static int av_unused get_cabac_bypass(CABACContext *c){ 47.164 + 47.165 + int range; 47.166 + c->low += c->low; 47.167 + 47.168 + if(!(c->low & CABAC_MASK)) 47.169 + refill(c); 47.170 + 47.171 + range= c->range<<(CABAC_BITS+1); 47.172 + if(c->low < range){ 47.173 + return 0; 47.174 + }else{ 47.175 + c->low -= range; 47.176 + return 1; 47.177 + } 47.178 +} 47.179 + 47.180 +static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ 47.181 + int range, mask; 47.182 + c->low += c->low; 47.183 + 47.184 + if(!(c->low & CABAC_MASK)) 47.185 + refill(c); 47.186 + 47.187 + range= c->range<<(CABAC_BITS+1); 47.188 + c->low -= range; 47.189 + mask= c->low >> 31; 47.190 + range &= mask; 47.191 + c->low += range; 47.192 + return (val^mask)-mask; 47.193 +} 47.194 + 47.195 +/** 47.196 + * 47.197 + * @return the number of bytes read or 0 if no end 47.198 + */ 47.199 +static int av_unused get_cabac_terminate(CABACContext *c){ 47.200 + c->range -= 2; 47.201 + if(c->low < c->range<<(CABAC_BITS+1)){ 47.202 + renorm_cabac_decoder_once(c); 47.203 + return 0; 47.204 + }else{ 47.205 + return c->bytestream - c->bytestream_start; 47.206 + } 47.207 +} 47.208 + 47.209 +#endif /* AVCODEC_CABAC_H */
48.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 48.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.c Mon Aug 27 12:09:56 2012 +0200 48.3 @@ -0,0 +1,140 @@ 48.4 +/* 48.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 48.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 48.7 + * 48.8 + * This file is part of FFmpeg. 48.9 + * 48.10 + * FFmpeg is free software; you can redistribute it and/or 48.11 + * modify it under the terms of the GNU Lesser General Public 48.12 + * License as published by the Free Software Foundation; either 48.13 + * version 2.1 of the License, or (at your option) any later version. 48.14 + * 48.15 + * FFmpeg is distributed in the hope that it will be useful, 48.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 48.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 48.18 + * Lesser General Public License for more details. 48.19 + * 48.20 + * You should have received a copy of the GNU Lesser General Public 48.21 + * License along with FFmpeg; if not, write to the Free Software 48.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 48.23 + */ 48.24 + 48.25 +/** 48.26 + * @file 48.27 + * Context Adaptive Binary Arithmetic Coder. 48.28 + */ 48.29 + 48.30 +#include <string.h> 48.31 + 48.32 +#include "libavutil/common.h" 48.33 +//#include "get_bits.h" 48.34 +#include "cabac_spu.h" 48.35 +#define av_log(...) 
48.36 + 48.37 +int bytecount =0; 48.38 +static const uint8_t lps_range[64][4]= { 48.39 +{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205}, 48.40 +{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166}, 48.41 +{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135}, 48.42 +{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110}, 48.43 +{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89}, 48.44 +{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72}, 48.45 +{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59}, 48.46 +{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48}, 48.47 +{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39}, 48.48 +{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31}, 48.49 +{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25}, 48.50 +{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21}, 48.51 +{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17}, 48.52 +{ 10, 12, 14, 16}, { 9, 11, 13, 15}, { 9, 11, 12, 14}, { 8, 10, 12, 14}, 48.53 +{ 8, 9, 11, 13}, { 7, 9, 11, 12}, { 7, 9, 10, 12}, { 7, 8, 10, 11}, 48.54 +{ 6, 8, 9, 11}, { 6, 7, 9, 10}, { 6, 7, 8, 9}, { 2, 2, 2, 2}, 48.55 +}; 48.56 + 48.57 +uint8_t ff_h264_mlps_state[4*64]; 48.58 +uint8_t ff_h264_lps_range[4*2*64]; 48.59 +uint8_t ff_h264_lps_state[2*64]; 48.60 +uint8_t ff_h264_mps_state[2*64]; 48.61 + 48.62 +static const uint8_t mps_state[64]= { 48.63 + 1, 2, 3, 4, 5, 6, 7, 8, 48.64 + 9,10,11,12,13,14,15,16, 48.65 + 17,18,19,20,21,22,23,24, 48.66 + 25,26,27,28,29,30,31,32, 48.67 + 33,34,35,36,37,38,39,40, 48.68 + 41,42,43,44,45,46,47,48, 48.69 + 49,50,51,52,53,54,55,56, 48.70 + 57,58,59,60,61,62,62,63, 48.71 +}; 48.72 + 48.73 +static const uint8_t lps_state[64]= { 48.74 + 0, 0, 1, 2, 2, 4, 4, 5, 48.75 + 6, 7, 8, 9, 
9,11,11,12, 48.76 + 13,13,15,15,16,16,18,18, 48.77 + 19,19,21,21,22,22,23,24, 48.78 + 24,25,26,26,27,27,28,29, 48.79 + 29,30,30,30,31,32,32,33, 48.80 + 33,33,34,34,35,35,35,36, 48.81 + 36,36,37,37,37,38,38,63, 48.82 +}; 48.83 + 48.84 +const uint8_t ff_h264_norm_shift[512]= { 48.85 + 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5, 48.86 + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, 48.87 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 48.88 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 48.89 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 48.90 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 48.91 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 48.92 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 48.93 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 48.94 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 48.95 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 48.96 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 48.97 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48.98 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48.99 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48.100 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48.101 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48.102 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48.103 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48.104 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48.105 +}; 48.106 + 48.107 +/** 48.108 + * 48.109 + * @param buf_size size of buf in bits 48.110 + */ 48.111 + 48.112 +void ff_init_cabac_states(){ 48.113 + int i, j; 48.114 + 48.115 + for(i=0; i<64; i++){ 48.116 + for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save 48.117 + ff_h264_lps_range[j*2*64+2*i+0]= 48.118 + ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j]; 48.119 + } 48.120 + 48.121 + ff_h264_mlps_state[128+2*i+0]= 48.122 + ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0; 48.123 + 
ff_h264_mlps_state[128+2*i+1]= 48.124 + ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1; 48.125 + 48.126 + if( i ){ 48.127 +#ifdef BRANCHLESS_CABAC_DECODER 48.128 + ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0; 48.129 + ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1; 48.130 + }else{ 48.131 + ff_h264_mlps_state[128-2*i-1]= 1; 48.132 + ff_h264_mlps_state[128-2*i-2]= 0; 48.133 +#else 48.134 + ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0; 48.135 + ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1; 48.136 + }else{ 48.137 + ff_h264_lps_state[2*i+0]= 1; 48.138 + ff_h264_lps_state[2*i+1]= 0; 48.139 +#endif 48.140 + } 48.141 + } 48.142 +} 48.143 +
49.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 49.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.h Mon Aug 27 12:09:56 2012 +0200 49.3 @@ -0,0 +1,233 @@ 49.4 +/* 49.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 49.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 49.7 + * 49.8 + * This file is part of FFmpeg. 49.9 + * 49.10 + * FFmpeg is free software; you can redistribute it and/or 49.11 + * modify it under the terms of the GNU Lesser General Public 49.12 + * License as published by the Free Software Foundation; either 49.13 + * version 2.1 of the License, or (at your option) any later version. 49.14 + * 49.15 + * FFmpeg is distributed in the hope that it will be useful, 49.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 49.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 49.18 + * Lesser General Public License for more details. 49.19 + * 49.20 + * You should have received a copy of the GNU Lesser General Public 49.21 + * License along with FFmpeg; if not, write to the Free Software 49.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 49.23 + */ 49.24 + 49.25 +/** 49.26 + * @file 49.27 + * Context Adaptive Binary Arithmetic Coder. 
49.28 + */ 49.29 + 49.30 +#ifndef AVCODEC_CABAC_H 49.31 +#define AVCODEC_CABAC_H 49.32 + 49.33 +//#undef NDEBUG 49.34 +#include <assert.h> 49.35 +#include "h264_dma.h" 49.36 +#include "libavutil/x86_cpu.h" 49.37 +#include "libavutil/attributes.h" 49.38 + 49.39 +#define CABAC_BITS 16 49.40 +#define CABAC_MASK ((1<<CABAC_BITS)-1) 49.41 +#define BRANCHLESS_CABAC_DECODER 1 49.42 + 49.43 +typedef struct CABACContext{ 49.44 + int low; 49.45 + int range; 49.46 + int outstanding_count; 49.47 +#ifdef STRICT_LIMITS 49.48 + int symCount; 49.49 +#endif 49.50 + const uint8_t *bytestream_ea_start; 49.51 + const uint8_t *bytestream_ea; 49.52 + const uint8_t *bytestream_ea_end; 49.53 + int slot; 49.54 + int bufsize; 49.55 + 49.56 + uint8_t *bytestream_start; 49.57 + uint8_t *bytestream; 49.58 + uint8_t *bytestream_end; 49.59 + uint8_t cabac_state[460]; 49.60 +}CABACContext; 49.61 + 49.62 +extern uint8_t ff_h264_mlps_state[4*64]; 49.63 +extern uint8_t ff_h264_lps_range[4*2*64]; ///< rangeTabLPS 49.64 +extern uint8_t ff_h264_mps_state[2*64]; ///< transIdxMPS 49.65 +extern uint8_t ff_h264_lps_state[2*64]; ///< transIdxLPS 49.66 +extern const uint8_t ff_h264_norm_shift[512]; 49.67 + 49.68 +void ff_init_cabac_states(void); 49.69 + 49.70 +extern DECLARE_ALIGNED(128,uint8_t, bytestream_ls[4096]); 49.71 +extern int bytecount; 49.72 +static inline void dma_cabac(CABACContext *c){ 49.73 + bytecount++; 49.74 + if (c->bytestream == c->bytestream_end){ 49.75 + if (c->bufsize>0){ 49.76 + int size = (c->bufsize > sizeof(bytestream_ls)) ? sizeof(bytestream_ls) : c->bufsize; 49.77 + int align = size &0xF; 49.78 + int dma_size = size + (align? 
16-align : 0); 49.79 + 49.80 + spu_dma_get(bytestream_ls, (unsigned) c->bytestream_ea, dma_size, ED_raw); 49.81 + wait_dma_id(ED_raw); 49.82 + c->bytestream = bytestream_ls; 49.83 + c->bytestream_end = &bytestream_ls[size]; 49.84 + c->bytestream_ea += dma_size; 49.85 + c->bufsize -= size; 49.86 + } 49.87 + bytecount =0; 49.88 + }else if((unsigned)c->bytestream > (unsigned)c->bytestream_end +2){ 49.89 + //fprintf(stderr, "Read beyond end of frame %d\n", c->bufsize); 49.90 + bytecount =0; 49.91 + } 49.92 +} 49.93 + 49.94 +static void refill(CABACContext *c){ 49.95 + dma_cabac(c); 49.96 + 49.97 + c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); 49.98 + 49.99 + c->low -= CABAC_MASK; 49.100 + c->bytestream+= CABAC_BITS/8; 49.101 +} 49.102 + 49.103 +static void refill2(CABACContext *c){ 49.104 + int i, x; 49.105 + 49.106 + dma_cabac(c); 49.107 + 49.108 + x= c->low ^ (c->low-1); 49.109 + i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; 49.110 + 49.111 + x= -CABAC_MASK; 49.112 + 49.113 + x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); 49.114 + 49.115 + c->low += x<<i; 49.116 + c->bytestream+= CABAC_BITS/8; 49.117 +} 49.118 + 49.119 +static inline void renorm_cabac_decoder(CABACContext *c){ 49.120 + while(c->range < 0x100){ 49.121 + c->range+= c->range; 49.122 + c->low+= c->low; 49.123 + if(!(c->low & CABAC_MASK)) 49.124 + refill(c); 49.125 + } 49.126 +} 49.127 + 49.128 +static inline void renorm_cabac_decoder_once(CABACContext *c){ 49.129 + 49.130 + int shift= (uint32_t)(c->range - 0x100)>>31; 49.131 + c->range<<= shift; 49.132 + c->low <<= shift; 49.133 + 49.134 + if(!(c->low & CABAC_MASK)) 49.135 + refill(c); 49.136 +} 49.137 + 49.138 +static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){ 49.139 + 49.140 + int s = *state; 49.141 + int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; 49.142 + int bit, lps_mask av_unused; 49.143 + 49.144 + c->range -= RangeLPS; 49.145 +#ifndef BRANCHLESS_CABAC_DECODER 49.146 + if(c->low < 
(c->range<<(CABAC_BITS+1))){ 49.147 + bit= s&1; 49.148 + *state= ff_h264_mps_state[s]; 49.149 + renorm_cabac_decoder_once(c); 49.150 + }else{ 49.151 + bit= ff_h264_norm_shift[RangeLPS]; 49.152 + c->low -= (c->range<<(CABAC_BITS+1)); 49.153 + *state= ff_h264_lps_state[s]; 49.154 + c->range = RangeLPS<<bit; 49.155 + c->low <<= bit; 49.156 + bit= (s&1)^1; 49.157 + 49.158 + if(!(c->low & CABAC_MASK)){ 49.159 + refill2(c); 49.160 + } 49.161 + } 49.162 +#else /* BRANCHLESS_CABAC_DECODER */ 49.163 + lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31; 49.164 + 49.165 + c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; 49.166 + c->range += (RangeLPS - c->range) & lps_mask; 49.167 + 49.168 + s^=lps_mask; 49.169 + *state= (ff_h264_mlps_state+128)[s]; 49.170 + bit= s&1; 49.171 + 49.172 + lps_mask= ff_h264_norm_shift[c->range]; 49.173 + c->range<<= lps_mask; 49.174 + c->low <<= lps_mask; 49.175 + if(!(c->low & CABAC_MASK)) 49.176 + refill2(c); 49.177 +#endif /* BRANCHLESS_CABAC_DECODER */ 49.178 + 49.179 + return bit; 49.180 +} 49.181 + 49.182 +static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){ 49.183 + return get_cabac_inline(c, state); 49.184 +} 49.185 + 49.186 +static int av_unused get_cabac(CABACContext *c, uint8_t * const state){ 49.187 + return get_cabac_inline(c, state); 49.188 +} 49.189 + 49.190 +static int av_unused get_cabac_bypass(CABACContext *c){ 49.191 + 49.192 + int range; 49.193 + c->low += c->low; 49.194 + 49.195 + if(!(c->low & CABAC_MASK)) 49.196 + refill(c); 49.197 + 49.198 + range= c->range<<(CABAC_BITS+1); 49.199 + if(c->low < range){ 49.200 + return 0; 49.201 + }else{ 49.202 + c->low -= range; 49.203 + return 1; 49.204 + } 49.205 +} 49.206 + 49.207 +static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ 49.208 + int range, mask; 49.209 + c->low += c->low; 49.210 + 49.211 + if(!(c->low & CABAC_MASK)) 49.212 + refill(c); 49.213 + 49.214 + range= c->range<<(CABAC_BITS+1); 49.215 + c->low -= 
range; 49.216 + mask= c->low >> 31; 49.217 + range &= mask; 49.218 + c->low += range; 49.219 + return (val^mask)-mask; 49.220 +} 49.221 + 49.222 +/** 49.223 + * 49.224 + * @return the number of bytes read or 0 if no end 49.225 + */ 49.226 +static int av_unused get_cabac_terminate(CABACContext *c){ 49.227 + c->range -= 2; 49.228 + if(c->low < c->range<<(CABAC_BITS+1)){ 49.229 + renorm_cabac_decoder_once(c); 49.230 + return 0; 49.231 + }else{ 49.232 + return c->bytestream - c->bytestream_start; 49.233 + } 49.234 +} 49.235 + 49.236 +#endif /* AVCODEC_CABAC_H */
50.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 50.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.c Mon Aug 27 12:09:56 2012 +0200 50.3 @@ -0,0 +1,1147 @@ 50.4 +/* 50.5 + * Copyright (c) 2009 TUDelft 50.6 + * 50.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 50.8 + */ 50.9 + 50.10 +/** 50.11 + * @file libavcodec/cell/spu/h264_main_spu.c 50.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding 50.13 + * @author C C Chi <c.c.chi@student.tudelft.nl> 50.14 + * 50.15 + * SIMD SPU kernels 50.16 + * H.264/AVC motion compensation 50.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu> 50.18 + * @author Albert Paradis <apar7632@hotmail.com> 50.19 + */ 50.20 + 50.21 + 50.22 +#include "dsputil_spu.h" 50.23 +#include "h264_idct_spu.h" 50.24 +#include "h264_deblock_spu.h" 50.25 +#include "types_spu.h" 50.26 +#include "libavutil/intreadwrite.h" 50.27 + 50.28 +#include <stdio.h> 50.29 +#include <spu_intrinsics.h> 50.30 +#include <spu_mfcio.h> 50.31 +#include <assert.h> 50.32 + 50.33 +//Luma interpolation 50.34 +#define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s 50.35 +#define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s) 50.36 + 50.37 +#define OP_U8_SPU PUT_OP_U8_SPU 50.38 +#define PREFIX_h264_qpel16_h_lowpass_spu put_h264_qpel16_h_lowpass_spu 50.39 +#define PREFIX_h264_qpel16_v_lowpass_spu put_h264_qpel16_v_lowpass_spu 50.40 +#define PREFIX_h264_qpel16_hv_lowpass_spu put_h264_qpel16_hv_lowpass_spu 50.41 +#define PREFIX_h264_qpel8_h_lowpass_spu put_h264_qpel8_h_lowpass_spu 50.42 +#define PREFIX_h264_qpel8_v_lowpass_spu put_h264_qpel8_v_lowpass_spu 50.43 +#define PREFIX_h264_qpel8_hv_lowpass_spu put_h264_qpel8_hv_lowpass_spu 50.44 +#define PREFIX_h264_qpel4_h_lowpass_spu put_h264_qpel4_h_lowpass_spu 50.45 +#define PREFIX_h264_qpel4_v_lowpass_spu put_h264_qpel4_v_lowpass_spu 50.46 +#define PREFIX_h264_qpel4_hv_lowpass_spu put_h264_qpel4_hv_lowpass_spu 50.47 +#include "h264_luma_template_spu.c" 50.48 +#undef OP_U8_SPU 50.49 +#undef PREFIX_h264_qpel16_h_lowpass_spu 
50.50 +#undef PREFIX_h264_qpel16_v_lowpass_spu 50.51 +#undef PREFIX_h264_qpel16_hv_lowpass_spu 50.52 +#undef PREFIX_h264_qpel8_h_lowpass_spu 50.53 +#undef PREFIX_h264_qpel8_v_lowpass_spu 50.54 +#undef PREFIX_h264_qpel8_hv_lowpass_spu 50.55 +#undef PREFIX_h264_qpel4_h_lowpass_spu 50.56 +#undef PREFIX_h264_qpel4_v_lowpass_spu 50.57 +#undef PREFIX_h264_qpel4_hv_lowpass_spu 50.58 + 50.59 +#define OP_U8_SPU AVG_OP_U8_SPU 50.60 +#define PREFIX_h264_qpel16_h_lowpass_spu avg_h264_qpel16_h_lowpass_spu 50.61 +#define PREFIX_h264_qpel16_v_lowpass_spu avg_h264_qpel16_v_lowpass_spu 50.62 +#define PREFIX_h264_qpel16_hv_lowpass_spu avg_h264_qpel16_hv_lowpass_spu 50.63 +#define PREFIX_h264_qpel8_h_lowpass_spu avg_h264_qpel8_h_lowpass_spu 50.64 +#define PREFIX_h264_qpel8_v_lowpass_spu avg_h264_qpel8_v_lowpass_spu 50.65 +#define PREFIX_h264_qpel8_hv_lowpass_spu avg_h264_qpel8_hv_lowpass_spu 50.66 +#define PREFIX_h264_qpel4_h_lowpass_spu avg_h264_qpel4_h_lowpass_spu 50.67 +#define PREFIX_h264_qpel4_v_lowpass_spu avg_h264_qpel4_v_lowpass_spu 50.68 +#define PREFIX_h264_qpel4_hv_lowpass_spu avg_h264_qpel4_hv_lowpass_spu 50.69 +#include "h264_luma_template_spu.c" 50.70 +#undef OP_U8_SPU 50.71 +#undef PREFIX_h264_qpel16_h_lowpass_spu 50.72 +#undef PREFIX_h264_qpel16_v_lowpass_spu 50.73 +#undef PREFIX_h264_qpel16_hv_lowpass_spu 50.74 +#undef PREFIX_h264_qpel8_h_lowpass_spu 50.75 +#undef PREFIX_h264_qpel8_v_lowpass_spu 50.76 +#undef PREFIX_h264_qpel8_hv_lowpass_spu 50.77 +#undef PREFIX_h264_qpel4_h_lowpass_spu 50.78 +#undef PREFIX_h264_qpel4_v_lowpass_spu 50.79 +#undef PREFIX_h264_qpel4_hv_lowpass_spu 50.80 + 50.81 +#define H264_MC(OPNAME, SIZE, CODETYPE) \ 50.82 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.83 + OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\ 50.84 +}\ 50.85 +\ 50.86 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int 
dst_stride, int h){ \ 50.87 + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ 50.88 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\ 50.89 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\ 50.90 +}\ 50.91 +\ 50.92 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.93 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\ 50.94 +}\ 50.95 +\ 50.96 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.97 + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ 50.98 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\ 50.99 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\ 50.100 +}\ 50.101 +\ 50.102 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.103 + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ 50.104 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\ 50.105 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\ 50.106 +}\ 50.107 +\ 50.108 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.109 + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\ 50.110 +}\ 50.111 +\ 50.112 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.113 + DECLARE_ALIGNED_16(uint8_t, half[16*16]);\ 50.114 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\ 50.115 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\ 50.116 +}\ 50.117 +\ 50.118 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.119 + 
DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 50.120 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 50.121 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ 50.122 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ 50.123 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ 50.124 +}\ 50.125 +\ 50.126 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.127 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 50.128 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 50.129 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ 50.130 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ 50.131 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ 50.132 +}\ 50.133 +\ 50.134 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.135 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 50.136 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 50.137 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ 50.138 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ 50.139 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ 50.140 +}\ 50.141 +\ 50.142 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.143 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 50.144 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 50.145 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ 50.146 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ 50.147 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\ 50.148 +}\ 50.149 +\ 50.150 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, 
int dst_stride, int h){\ 50.151 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 50.152 + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, h);\ 50.153 +}\ 50.154 +\ 50.155 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.156 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 50.157 + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ 50.158 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 50.159 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\ 50.160 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ 50.161 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\ 50.162 +}\ 50.163 +\ 50.164 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.165 + DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\ 50.166 + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ 50.167 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 50.168 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\ 50.169 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ 50.170 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\ 50.171 +}\ 50.172 +\ 50.173 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.174 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 50.175 + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ 50.176 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 50.177 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\ 50.178 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ 50.179 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\ 50.180 +}\ 50.181 +\ 50.182 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## 
CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\ 50.183 + DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\ 50.184 + DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\ 50.185 + DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\ 50.186 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\ 50.187 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\ 50.188 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\ 50.189 +}\ 50.190 + 50.191 + 50.192 +/**************************/ 50.193 +/* put pixels functions */ 50.194 +/*************************/ 50.195 + 50.196 +static void put_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1, 50.197 + const uint8_t * src2, int dst_stride, 50.198 + int src_stride1, int h) 50.199 +{ 50.200 + int i; 50.201 + 50.202 + const int perm_src1 = (unsigned int) src1 & 15; 50.203 + 50.204 + for (i=0; i<h; i++){ 50.205 + //unaligned load of src1 50.206 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 50.207 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 50.208 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 50.209 + 50.210 + //aligned load of src2 50.211 + const vuint8_t srcb = *(vuint8_t *)(src2); 50.212 + 50.213 + //average and rounding 50.214 + const vuint8_t avgc = spu_avg(srca,srcb); 50.215 + 50.216 + // 16x16 dest luma blocks are always aligned 50.217 + *(vuint8_t *)dst=avgc; 50.218 + 50.219 + src1 +=src_stride1; 50.220 + src2 +=16; 50.221 + dst +=dst_stride; 50.222 + } 50.223 +} 50.224 + 50.225 +static void avg_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1, 50.226 + const uint8_t * src2, int dst_stride, 50.227 + int src_stride1, int h) 50.228 +{ 50.229 + int i; 50.230 + 50.231 + const int perm_src1 = (unsigned int) src1 & 15; 50.232 + 50.233 + for (i=0; i<h; i++){ 50.234 + //unaligned load of src1 50.235 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 50.236 + const vuint8_t srctmpa2 = 
*(vuint8_t *)(src1+16); 50.237 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 50.238 + 50.239 + //aligned load of src2 50.240 + const vuint8_t srcb = *(vuint8_t *)(src2); 50.241 + 50.242 + //average and rounding 50.243 + const vuint8_t avgc = spu_avg(spu_avg(srca,srcb), *(vuint8_t *)dst); 50.244 + 50.245 + // 16x16 dest luma blocks are always aligned 50.246 + *(vuint8_t *)dst=avgc; 50.247 + 50.248 + src1 +=src_stride1; 50.249 + src2 +=16; 50.250 + dst +=dst_stride; 50.251 + } 50.252 +} 50.253 + 50.254 +// next one assumes that ((line_size % 16) == 0) 50.255 +void put_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 50.256 +{ 50.257 + register vector unsigned char pixelsv1, pixelsv2; 50.258 + register vector unsigned char pixelsv1B, pixelsv2B; 50.259 + register vector unsigned char pixelsv1C, pixelsv2C; 50.260 + register vector unsigned char pixelsv1D, pixelsv2D; 50.261 + 50.262 + const int perm = (unsigned int) src & 15; 50.263 + int i; 50.264 + register int line_size = src_stride; 50.265 + register int line_size_2 = line_size << 1; 50.266 + register int line_size_3 = line_size + line_size_2; 50.267 + register int line_size_4 = line_size << 2; 50.268 + 50.269 + register int dst_stride_2 = dst_stride << 1; 50.270 + register int dst_stride_3 = dst_stride_2 + dst_stride; 50.271 + register int dst_stride_4 = dst_stride << 2; 50.272 + 50.273 + for(i=0; i<h; i+=4) { 50.274 + pixelsv1 = *(vuint8_t *)(src); 50.275 + pixelsv2 = *(vuint8_t *)(src+16); 50.276 + pixelsv1B = *(vuint8_t *)(src + line_size); 50.277 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 50.278 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 50.279 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 50.280 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 50.281 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 50.282 + 50.283 + *(vuint8_t *) dst = spu_or(spu_slqwbyte(pixelsv1, perm), 
spu_rlmaskqwbyte(pixelsv2, perm-16)); 50.284 + *(vuint8_t *)(dst + dst_stride) = spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)); 50.285 + *(vuint8_t *)(dst + dst_stride_2) = spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)); 50.286 + *(vuint8_t *)(dst + dst_stride_3) = spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)); 50.287 + 50.288 + src+= line_size_4; 50.289 + dst+= dst_stride_4; 50.290 + } 50.291 +} 50.292 + 50.293 +// next one assumes that ((line_size % 16) == 0) 50.294 +void avg_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 50.295 +{ 50.296 + register vector unsigned char pixelsv1, pixelsv2; 50.297 + register vector unsigned char pixelsv1B, pixelsv2B; 50.298 + register vector unsigned char pixelsv1C, pixelsv2C; 50.299 + register vector unsigned char pixelsv1D, pixelsv2D; 50.300 + 50.301 + const int perm = (unsigned int) src & 15; 50.302 + int i; 50.303 + register int line_size = src_stride; 50.304 + register int line_size_2 = line_size << 1; 50.305 + register int line_size_3 = line_size + line_size_2; 50.306 + register int line_size_4 = line_size << 2; 50.307 + 50.308 + register int dst_stride_2 = dst_stride << 1; 50.309 + register int dst_stride_3 = dst_stride_2 + dst_stride; 50.310 + register int dst_stride_4 = dst_stride << 2; 50.311 + 50.312 + 50.313 + for(i=0; i<h; i+=4) { 50.314 + pixelsv1 = *(vuint8_t *)(src); 50.315 + pixelsv2 = *(vuint8_t *)(src+16); 50.316 + pixelsv1B = *(vuint8_t *)(src + line_size); 50.317 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 50.318 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 50.319 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 50.320 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 50.321 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 50.322 + 50.323 + *(vuint8_t *)dst = spu_avg(spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16)), *(vuint8_t *)dst); 50.324 + 
*(vuint8_t *)(dst + dst_stride) = spu_avg(spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), *(vuint8_t *)(dst+dst_stride)); 50.325 + *(vuint8_t *)(dst + dst_stride_2) = spu_avg(spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), *(vuint8_t *)(dst+dst_stride_2)); 50.326 + *(vuint8_t *)(dst + dst_stride_3) = spu_avg(spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), *(vuint8_t *)(dst+dst_stride_3)); 50.327 + 50.328 + src+= line_size_4; 50.329 + dst+= dst_stride_4; 50.330 + } 50.331 +} 50.332 + 50.333 +void put_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, 50.334 + int dst_stride, int src_stride1, int h) 50.335 +{ 50.336 + int i; 50.337 + 50.338 + const int perm_src1 = (unsigned int) src1 & 15; 50.339 + const int shift_dst = (unsigned int) dst & 15; 50.340 + 50.341 + // 8x dest luma blocks are aligned or desaligned by 8 50.342 + vuint8_t dstmask; 50.343 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.344 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 50.345 + 50.346 + if(shift_dst==0){ 50.347 + dstmask = dst8mask1; 50.348 + } 50.349 + else{ 50.350 + dstmask = dst8mask2; 50.351 + } 50.352 + 50.353 + for (i=0; i<h; i++){ 50.354 + //unaligned load of src1 50.355 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 50.356 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 50.357 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 50.358 + 50.359 + //aligned load of src2 50.360 + const vuint8_t srcb = *(vuint8_t *)(src2); 50.361 + 50.362 + //average and rounding 50.363 + const vuint8_t avgc = spu_avg(srca,srcb); 50.364 + 50.365 + const vuint8_t dst1 = *(vuint8_t *)dst; 50.366 + 50.367 + const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask); 50.368 + 50.369 + *(vuint8_t *)dst=davgc; 
50.370 + 50.371 + src1 +=src_stride1; 50.372 + src2 +=16; 50.373 + dst +=dst_stride; 50.374 + } 50.375 +} 50.376 + 50.377 +void avg_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, 50.378 + int dst_stride, int src_stride1, int h) 50.379 +{ 50.380 + int i; 50.381 + 50.382 + const int perm_src1 = (unsigned int) src1 & 15; 50.383 + const int shift_dst = (unsigned int) dst & 15; 50.384 + 50.385 + // 8x dest luma blocks are aligned or desaligned by 8 50.386 + vuint8_t dstmask; 50.387 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.388 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 50.389 + 50.390 + if(shift_dst==0){ 50.391 + dstmask = dst8mask1; 50.392 + } 50.393 + else{ 50.394 + dstmask = dst8mask2; 50.395 + } 50.396 + 50.397 + for (i=0; i<h; i++){ 50.398 + //unaligned load of src1 50.399 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 50.400 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 50.401 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 50.402 + 50.403 + //aligned load of src2 50.404 + const vuint8_t srcb = *(vuint8_t *)(src2); 50.405 + 50.406 + //average and rounding 50.407 + const vuint8_t avgc = spu_avg(srca,srcb); 50.408 + 50.409 + const vuint8_t dst1 = *(vuint8_t *)dst; 50.410 + 50.411 + const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask); 50.412 + 50.413 + const vuint8_t davgc = spu_avg(dst1,davgc1); 50.414 + 50.415 + *(vuint8_t *)dst=davgc; 50.416 + 50.417 + src1 +=src_stride1; 50.418 + src2 +=16; 50.419 + dst +=dst_stride; 50.420 + } 50.421 +} 50.422 + 50.423 +// next one assumes that ((line_size % 16) == 0) 50.424 +void put_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 50.425 +{ 50.426 + register vector unsigned char pixelsv1A, pixelsv2A; 50.427 + register vector unsigned char pixelsv1B, 
pixelsv2B; 50.428 + register vector unsigned char pixelsv1C, pixelsv2C; 50.429 + register vector unsigned char pixelsv1D, pixelsv2D; 50.430 + 50.431 + const int perm = (unsigned int) src & 15; 50.432 + const int shift_dst = (unsigned int) dst & 15; 50.433 + 50.434 + // 8x dest luma blocks are aligned or desaligned by 8 50.435 + vuint8_t dstmask; 50.436 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.437 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 50.438 + 50.439 + if(shift_dst==0){ 50.440 + dstmask = dst8mask1; 50.441 + } 50.442 + else{ 50.443 + dstmask = dst8mask2; 50.444 + } 50.445 + 50.446 + int i; 50.447 + register int line_size = src_stride; 50.448 + register int line_size_2 = line_size << 1; 50.449 + register int line_size_3 = line_size + line_size_2; 50.450 + register int line_size_4 = line_size << 2; 50.451 + 50.452 + register int dst_stride_2 = dst_stride << 1; 50.453 + register int dst_stride_3 = dst_stride_2 + dst_stride; 50.454 + register int dst_stride_4 = dst_stride << 2; 50.455 + 50.456 + for(i=0; i<h; i+=4) { 50.457 + pixelsv1A = *(vuint8_t *)(src); 50.458 + pixelsv2A = *(vuint8_t *)(src+16); 50.459 + pixelsv1B = *(vuint8_t *)(src + line_size); 50.460 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 50.461 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 50.462 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 50.463 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 50.464 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 50.465 + 50.466 + const vuint8_t block1 = *(vuint8_t *)dst; 50.467 + const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask); 50.468 + const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride); 50.469 + const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask); 
50.470 + const vuint8_t block3 = *(vuint8_t *)(dst+2*dst_stride); 50.471 + const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask); 50.472 + const vuint8_t block4 = *(vuint8_t *)(dst+3*dst_stride); 50.473 + const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask); 50.474 + 50.475 + *(vuint8_t *) dst = put1; 50.476 + *(vuint8_t *)(dst + dst_stride) = put2; 50.477 + *(vuint8_t *)(dst + dst_stride_2) = put3; 50.478 + *(vuint8_t *)(dst + dst_stride_3) = put4; 50.479 + 50.480 + src += line_size_4; 50.481 + dst += dst_stride_4; 50.482 + } 50.483 +} 50.484 + 50.485 +// next one assumes that ((line_size % 16) == 0) 50.486 +void avg_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 50.487 +{ 50.488 + register vector unsigned char pixelsv1A, pixelsv2A; 50.489 + register vector unsigned char pixelsv1B, pixelsv2B; 50.490 + register vector unsigned char pixelsv1C, pixelsv2C; 50.491 + register vector unsigned char pixelsv1D, pixelsv2D; 50.492 + 50.493 + const int perm = (unsigned int) src & 15; 50.494 + const int shift_dst = (unsigned int) dst & 15; 50.495 + 50.496 + // 8x dest luma blocks are aligned or desaligned by 8 50.497 + vuint8_t dstmask; 50.498 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.499 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 50.500 + 50.501 + if(shift_dst==0){ 50.502 + dstmask = dst8mask1; 50.503 + } 50.504 + else{ 50.505 + dstmask = dst8mask2; 50.506 + } 50.507 + 50.508 + int i; 50.509 + register int line_size = src_stride; 50.510 + register int line_size_2 = line_size << 1; 50.511 + register int line_size_3 = line_size + line_size_2; 50.512 + register int line_size_4 = line_size << 2; 50.513 + 50.514 + register int dst_stride_2 = dst_stride << 1; 
50.515 + register int dst_stride_3 = dst_stride_2 + dst_stride; 50.516 + register int dst_stride_4 = dst_stride << 2; 50.517 + 50.518 + for(i=0; i<h; i+=4) { 50.519 + pixelsv1A = *(vuint8_t *)(src); 50.520 + pixelsv2A = *(vuint8_t *)(src+16); 50.521 + pixelsv1B = *(vuint8_t *)(src + line_size); 50.522 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 50.523 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 50.524 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 50.525 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 50.526 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 50.527 + 50.528 + const vuint8_t block1 = *(vuint8_t *) dst; 50.529 + const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask); 50.530 + const vuint8_t put1 = spu_avg(block1,put1a); 50.531 + 50.532 + const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride); 50.533 + const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask); 50.534 + const vuint8_t put2 = spu_avg(block2,put2a); 50.535 + 50.536 + const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2); 50.537 + const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask); 50.538 + const vuint8_t put3 = spu_avg(block3,put3a); 50.539 + 50.540 + const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3); 50.541 + const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask); 50.542 + const vuint8_t put4 = spu_avg(block4,put4a); 50.543 + 50.544 + *(vuint8_t *) dst = put1; 50.545 + *(vuint8_t *)(dst + dst_stride) = put2; 50.546 + *(vuint8_t *)(dst + dst_stride_2) = put3; 50.547 + *(vuint8_t *)(dst + dst_stride_3) = put4; 50.548 + 50.549 + src+= line_size_4; 50.550 + dst+= dst_stride_4; 50.551 + } 50.552 +} 50.553 + 50.554 +void put_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, 
const uint8_t * src2, 50.555 + int dst_stride, int src_stride1, int h) 50.556 +{ 50.557 + int i; 50.558 + 50.559 + const int perm_src1 = (unsigned int) src1 & 15; 50.560 + const int shift_dst = (unsigned int) dst & 15; 50.561 + 50.562 + // 4x dest luma blocks are desaligned by 0, 4, 8, or 12 50.563 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 50.564 + const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.565 + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.566 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 50.567 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 50.568 + 50.569 + switch(shift_dst){ 50.570 + case 0: dstmask = dstmask0; 50.571 + break; 50.572 + case 4: dstmask = dstmask4; 50.573 + break; 50.574 + case 8: dstmask = dstmask8; 50.575 + break; 50.576 + case 12: dstmask = dstmask12; 50.577 + break; 50.578 + } 50.579 + 50.580 + for (i=0; i<h; i++){ 50.581 + //unaligned load of src1 50.582 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 50.583 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 50.584 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 50.585 + 50.586 + //aligned load of src2 50.587 + const vuint8_t srcb = *(vuint8_t *)(src2); 50.588 + 50.589 + //average and rounding 50.590 + const vuint8_t avgc = spu_avg(srca,srcb); 50.591 + 50.592 + const vuint8_t dst1 = *(vuint8_t *)dst; 50.593 + 50.594 + const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask); 50.595 + 50.596 + *(vuint8_t *)dst=davgc; 50.597 + 50.598 + src1 +=src_stride1; 50.599 + src2 +=16; 50.600 + dst +=dst_stride; 50.601 + } 50.602 +} 50.603 + 50.604 +void avg_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2, 50.605 + int dst_stride, int 
src_stride1, int h) 50.606 +{ 50.607 + int i; 50.608 + 50.609 + const int perm_src1 = (unsigned int) src1 & 15; 50.610 + const int shift_dst = (unsigned int) dst & 15; 50.611 + 50.612 + // 4x dest luma blocks are desaligned by 0, 4, 8, or 12 50.613 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 50.614 + const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.615 + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.616 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 50.617 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 50.618 + 50.619 + switch(shift_dst){ 50.620 + case 0: dstmask = dstmask0; 50.621 + break; 50.622 + case 4: dstmask = dstmask4; 50.623 + break; 50.624 + case 8: dstmask = dstmask8; 50.625 + break; 50.626 + case 12: dstmask = dstmask12; 50.627 + break; 50.628 + } 50.629 + 50.630 + for (i=0; i<h; i++){ 50.631 + //unaligned load of src1 50.632 + const vuint8_t srctmpa1 = *(vuint8_t *)(src1); 50.633 + const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16); 50.634 + const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16)); 50.635 + 50.636 + //aligned load of src2 50.637 + const vuint8_t srcb = *(vuint8_t *)(src2); 50.638 + 50.639 + //average and rounding 50.640 + const vuint8_t avgc = spu_avg(srca,srcb); 50.641 + 50.642 + const vuint8_t dst1 = *(vuint8_t *)dst; 50.643 + 50.644 + const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask); 50.645 + 50.646 + const vuint8_t davgc = spu_avg(dst1,davgc1); 50.647 + 50.648 + *(vuint8_t *)dst=davgc; 50.649 + 50.650 + src1 +=src_stride1; 50.651 + src2 +=16; 50.652 + dst +=dst_stride; 50.653 + } 50.654 +} 50.655 + 50.656 +// next one assumes that ((line_size % 16) == 0) 50.657 +void put_pixels4_spu(uint8_t *dst, const 
uint8_t *src, int dst_stride, int src_stride, int h) 50.658 +{ 50.659 + register vector unsigned char pixelsv1A, pixelsv2A; 50.660 + register vector unsigned char pixelsv1B, pixelsv2B; 50.661 + register vector unsigned char pixelsv1C, pixelsv2C; 50.662 + register vector unsigned char pixelsv1D, pixelsv2D; 50.663 + 50.664 + const int perm = (unsigned int) src & 15; 50.665 + const int shift_dst = (unsigned int) dst & 15; 50.666 + 50.667 + // 4x dest luma blocks are desaligned by 0, 4, 8, or 12 50.668 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 50.669 + const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.670 + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.671 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 50.672 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 50.673 + 50.674 + switch(shift_dst){ 50.675 + case 0: dstmask = dstmask0; 50.676 + break; 50.677 + case 4: dstmask = dstmask4; 50.678 + break; 50.679 + case 8: dstmask = dstmask8; 50.680 + break; 50.681 + case 12: dstmask = dstmask12; 50.682 + break; 50.683 + } 50.684 + 50.685 + int i; 50.686 + register int line_size = src_stride; 50.687 + register int line_size_2 = line_size << 1; 50.688 + register int line_size_3 = line_size + line_size_2; 50.689 + register int line_size_4 = line_size << 2; 50.690 + 50.691 + register int dst_stride_2 = dst_stride << 1; 50.692 + register int dst_stride_3 = dst_stride_2 + dst_stride; 50.693 + register int dst_stride_4 = dst_stride << 2; 50.694 + 50.695 + for(i=0; i<h; i+=4) { 50.696 + pixelsv1A = *(vuint8_t *)(src); 50.697 + pixelsv2A = *(vuint8_t *)(src+16); 50.698 + pixelsv1B = *(vuint8_t *)(src + line_size); 50.699 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 50.700 + pixelsv1C = *(vuint8_t *)(src + 
line_size_2); 50.701 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 50.702 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 50.703 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 50.704 + 50.705 + const vuint8_t block1 = *(vuint8_t *)dst; 50.706 + const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask); 50.707 + const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride); 50.708 + const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask); 50.709 + const vuint8_t block3 = *(vuint8_t *)(dst+dst_stride_2); 50.710 + const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask); 50.711 + const vuint8_t block4 = *(vuint8_t *)(dst+dst_stride_3); 50.712 + const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask); 50.713 + 50.714 + *(vuint8_t *) dst = put1; 50.715 + *(vuint8_t *)(dst + dst_stride) = put2; 50.716 + *(vuint8_t *)(dst + dst_stride_2) = put3; 50.717 + *(vuint8_t *)(dst + dst_stride_3) = put4; 50.718 + 50.719 + src += line_size_4; 50.720 + dst += dst_stride_4; 50.721 + } 50.722 +} 50.723 + 50.724 +// next one assumes that ((line_size % 16) == 0) 50.725 +void avg_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h) 50.726 +{ 50.727 + register vector unsigned char pixelsv1A, pixelsv2A; 50.728 + register vector unsigned char pixelsv1B, pixelsv2B; 50.729 + register vector unsigned char pixelsv1C, pixelsv2C; 50.730 + register vector unsigned char pixelsv1D, pixelsv2D; 50.731 + 50.732 + const int perm = (unsigned int) src & 15; 50.733 + const int shift_dst = (unsigned int) dst & 15; 50.734 + 50.735 + // 4x dest luma blocks are desaligned by 0, 4, 8, or 12 50.736 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 50.737 + const vuint8_t 
dstmask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.738 + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 50.739 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 50.740 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 50.741 + 50.742 + switch(shift_dst){ 50.743 + case 0: dstmask = dstmask0; 50.744 + break; 50.745 + case 4: dstmask = dstmask4; 50.746 + break; 50.747 + case 8: dstmask = dstmask8; 50.748 + break; 50.749 + case 12: dstmask = dstmask12; 50.750 + break; 50.751 + } 50.752 + 50.753 + int i; 50.754 + register int line_size = src_stride; 50.755 + register int line_size_2 = line_size << 1; 50.756 + register int line_size_3 = line_size + line_size_2; 50.757 + register int line_size_4 = line_size << 2; 50.758 + 50.759 + register int dst_stride_2 = dst_stride << 1; 50.760 + register int dst_stride_3 = dst_stride_2 + dst_stride; 50.761 + register int dst_stride_4 = dst_stride << 2; 50.762 + 50.763 + for(i=0; i<h; i+=4) { 50.764 + pixelsv1A = *(vuint8_t *)(src); 50.765 + pixelsv2A = *(vuint8_t *)(src+16); 50.766 + pixelsv1B = *(vuint8_t *)(src + line_size); 50.767 + pixelsv2B = *(vuint8_t *)(src+16 + line_size); 50.768 + pixelsv1C = *(vuint8_t *)(src + line_size_2); 50.769 + pixelsv2C = *(vuint8_t *)(src+16 + line_size_2); 50.770 + pixelsv1D = *(vuint8_t *)(src + line_size_3); 50.771 + pixelsv2D = *(vuint8_t *)(src+16 + line_size_3); 50.772 + 50.773 + const vuint8_t block1 = *(vuint8_t *) dst; 50.774 + const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask); 50.775 + const vuint8_t put1 = spu_avg(block1,put1a); 50.776 + 50.777 + const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride); 50.778 + const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, 
perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask); 50.779 + const vuint8_t put2 = spu_avg(block2,put2a); 50.780 + 50.781 + const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2); 50.782 + const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask); 50.783 + const vuint8_t put3 = spu_avg(block3,put3a); 50.784 + 50.785 + const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3); 50.786 + const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask); 50.787 + const vuint8_t put4 = spu_avg(block4,put4a); 50.788 + 50.789 + *(vuint8_t *) dst = put1; 50.790 + *(vuint8_t *)(dst + dst_stride) = put2; 50.791 + *(vuint8_t *)(dst + dst_stride_2) = put3; 50.792 + *(vuint8_t *)(dst + dst_stride_3) = put4; 50.793 + 50.794 + src+= line_size_4; 50.795 + dst+= dst_stride_4; 50.796 + } 50.797 +} 50.798 + 50.799 +/* Here we create all the interpolation modes H.264 motion compensation stage for luma */ 50.800 + H264_MC(put_, 16, spu) 50.801 + H264_MC(put_, 8, spu) 50.802 + H264_MC(put_, 4, spu) 50.803 + 50.804 + H264_MC(avg_, 16, spu) 50.805 + H264_MC(avg_, 8, spu) 50.806 + H264_MC(avg_, 4, spu) 50.807 + 50.808 + 50.809 +//Chroma interpolation: 50.810 + 50.811 +#define OP_U8_SPU PUT_OP_U8_SPU 50.812 +#define PREFIX_h264_chroma_mc8_spu put_h264_chroma_mc8_spu 50.813 +#define PREFIX_h264_chroma_mc4_spu put_h264_chroma_mc4_spu 50.814 +#define PREFIX_h264_chroma_mc2_spu put_h264_chroma_mc2_spu 50.815 +#include "h264_chroma_template_spu.c" 50.816 +#undef OP_U8_SPU 50.817 +#undef PREFIX_h264_chroma_mc8_spu 50.818 +#undef PREFIX_h264_chroma_mc4_spu 50.819 +#undef PREFIX_h264_chroma_mc2_spu 50.820 + 50.821 +#define OP_U8_SPU AVG_OP_U8_SPU 50.822 +#define PREFIX_h264_chroma_mc8_spu avg_h264_chroma_mc8_spu 50.823 +#define PREFIX_h264_chroma_mc4_spu avg_h264_chroma_mc4_spu 50.824 +#define PREFIX_h264_chroma_mc2_spu avg_h264_chroma_mc2_spu 50.825 +#include 
"h264_chroma_template_spu.c" 50.826 +#undef OP_U8_SPU 50.827 +#undef PREFIX_h264_chroma_mc8_spu 50.828 +#undef PREFIX_h264_chroma_mc4_spu 50.829 +#undef PREFIX_h264_chroma_mc2_spu 50.830 + 50.831 +// Weight and Biweight functions 50.832 + 50.833 +#define op_scale1(x) dst[x] = av_clip_uint8( (dst[x]*weight + offset) >> log2_denom ) 50.834 +#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) 50.835 +#define H264_WEIGHT(W,H) \ 50.836 +static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ 50.837 + int y; \ 50.838 + offset <<= log2_denom; \ 50.839 + if(log2_denom) offset += 1<<(log2_denom-1); \ 50.840 + for(y=0; y<H; y++, dst += stride){ \ 50.841 + op_scale1(0); \ 50.842 + op_scale1(1); \ 50.843 + if(W==2) continue; \ 50.844 + op_scale1(2); \ 50.845 + op_scale1(3); \ 50.846 + if(W==4) continue; \ 50.847 + op_scale1(4); \ 50.848 + op_scale1(5); \ 50.849 + op_scale1(6); \ 50.850 + op_scale1(7); \ 50.851 + if(W==8) continue; \ 50.852 + op_scale1(8); \ 50.853 + op_scale1(9); \ 50.854 + op_scale1(10); \ 50.855 + op_scale1(11); \ 50.856 + op_scale1(12); \ 50.857 + op_scale1(13); \ 50.858 + op_scale1(14); \ 50.859 + op_scale1(15); \ 50.860 + } \ 50.861 +} \ 50.862 +static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, int weights, int offset){ \ 50.863 + int y; \ 50.864 + offset = ((offset + 1) | 1) << log2_denom; \ 50.865 + for(y=0; y<H; y++, dst += dst_stride, src += src_stride){ \ 50.866 + op_scale2(0); \ 50.867 + op_scale2(1); \ 50.868 + if(W==2) continue; \ 50.869 + op_scale2(2); \ 50.870 + op_scale2(3); \ 50.871 + if(W==4) continue; \ 50.872 + op_scale2(4); \ 50.873 + op_scale2(5); \ 50.874 + op_scale2(6); \ 50.875 + op_scale2(7); \ 50.876 + if(W==8) continue; \ 50.877 + op_scale2(8); \ 50.878 + op_scale2(9); \ 50.879 + op_scale2(10); \ 50.880 + op_scale2(11); \ 
50.881 + op_scale2(12); \ 50.882 + op_scale2(13); \ 50.883 + op_scale2(14); \ 50.884 + op_scale2(15); \ 50.885 + } \ 50.886 +} 50.887 + 50.888 +H264_WEIGHT(16,16) 50.889 +H264_WEIGHT(16,8) 50.890 +H264_WEIGHT(8,16) 50.891 +H264_WEIGHT(8,8) 50.892 +H264_WEIGHT(8,4) 50.893 +H264_WEIGHT(4,8) 50.894 +H264_WEIGHT(4,4) 50.895 +H264_WEIGHT(4,2) 50.896 +H264_WEIGHT(2,4) 50.897 +H264_WEIGHT(2,2) 50.898 + 50.899 +#undef op_scale1 50.900 +#undef op_scale2 50.901 +#undef H264_WEIGHT 50.902 + 50.903 +///////////////////////////////////////////////////////////////////////////////////////// 50.904 + 50.905 +static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) 50.906 +{ 50.907 + int i, d; 50.908 + for( i = 0; i < 4; i++ ) { 50.909 + if( tc0[i] < 0 ) { 50.910 + pix += 4*ystride; 50.911 + continue; 50.912 + } 50.913 + for( d = 0; d < 4; d++ ) { 50.914 + const int p0 = pix[-1*xstride]; 50.915 + const int p1 = pix[-2*xstride]; 50.916 + const int p2 = pix[-3*xstride]; 50.917 + const int q0 = pix[0]; 50.918 + const int q1 = pix[1*xstride]; 50.919 + const int q2 = pix[2*xstride]; 50.920 + 50.921 + if( FFABS( p0 - q0 ) < alpha && 50.922 + FFABS( p1 - p0 ) < beta && 50.923 + FFABS( q1 - q0 ) < beta ) { 50.924 + 50.925 + int tc = tc0[i]; 50.926 + int i_delta; 50.927 + 50.928 + if( FFABS( p2 - p0 ) < beta ) { 50.929 + pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); 50.930 + tc++; 50.931 + } 50.932 + if( FFABS( q2 - q0 ) < beta ) { 50.933 + pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); 50.934 + tc++; 50.935 + } 50.936 + 50.937 + i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); 50.938 + pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ 50.939 + pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ 50.940 + } 50.941 + pix += ystride; 50.942 + } 50.943 + } 50.944 +} 50.945 +static void 
h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 50.946 +{ 50.947 + h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); 50.948 +} 50.949 +static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 50.950 +{ 50.951 + h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); 50.952 +} 50.953 + 50.954 +static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) 50.955 +{ 50.956 + int d; 50.957 + for( d = 0; d < 16; d++ ) { 50.958 + const int p2 = pix[-3*xstride]; 50.959 + const int p1 = pix[-2*xstride]; 50.960 + const int p0 = pix[-1*xstride]; 50.961 + 50.962 + const int q0 = pix[ 0*xstride]; 50.963 + const int q1 = pix[ 1*xstride]; 50.964 + const int q2 = pix[ 2*xstride]; 50.965 + 50.966 + if( FFABS( p0 - q0 ) < alpha && 50.967 + FFABS( p1 - p0 ) < beta && 50.968 + FFABS( q1 - q0 ) < beta ) { 50.969 + 50.970 + if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ 50.971 + if( FFABS( p2 - p0 ) < beta) 50.972 + { 50.973 + const int p3 = pix[-4*xstride]; 50.974 + /* p0', p1', p2' */ 50.975 + pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; 50.976 + pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; 50.977 + pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; 50.978 + } else { 50.979 + /* p0' */ 50.980 + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; 50.981 + } 50.982 + if( FFABS( q2 - q0 ) < beta) 50.983 + { 50.984 + const int q3 = pix[3*xstride]; 50.985 + /* q0', q1', q2' */ 50.986 + pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; 50.987 + pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; 50.988 + pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; 50.989 + } else { 50.990 + /* q0' */ 50.991 + pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; 50.992 + } 50.993 + }else{ 50.994 + /* p0', q0' */ 50.995 + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; 50.996 + pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; 50.997 + } 
50.998 + } 50.999 + pix += ystride; 50.1000 + } 50.1001 +} 50.1002 +static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 50.1003 +{ 50.1004 + h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta); 50.1005 +} 50.1006 +static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 50.1007 +{ 50.1008 + h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta); 50.1009 +} 50.1010 + 50.1011 +static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) 50.1012 +{ 50.1013 + int i, d; 50.1014 + for( i = 0; i < 4; i++ ) { 50.1015 + const int tc = tc0[i]; 50.1016 + if( tc <= 0 ) { 50.1017 + pix += 2*ystride; 50.1018 + continue; 50.1019 + } 50.1020 + for( d = 0; d < 2; d++ ) { 50.1021 + const int p0 = pix[-1*xstride]; 50.1022 + const int p1 = pix[-2*xstride]; 50.1023 + const int q0 = pix[0]; 50.1024 + const int q1 = pix[1*xstride]; 50.1025 + 50.1026 + if( FFABS( p0 - q0 ) < alpha && 50.1027 + FFABS( p1 - p0 ) < beta && 50.1028 + FFABS( q1 - q0 ) < beta ) { 50.1029 + 50.1030 + int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); 50.1031 + 50.1032 + pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ 50.1033 + pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ 50.1034 + } 50.1035 + pix += ystride; 50.1036 + } 50.1037 + } 50.1038 +} 50.1039 +static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 50.1040 +{ 50.1041 + h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0); 50.1042 +} 50.1043 +static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 50.1044 +{ 50.1045 + h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0); 50.1046 +} 50.1047 + 50.1048 +static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) 50.1049 +{ 50.1050 + int d; 50.1051 + for( d = 0; d < 8; d++ ) { 50.1052 + 
const int p0 = pix[-1*xstride]; 50.1053 + const int p1 = pix[-2*xstride]; 50.1054 + const int q0 = pix[0]; 50.1055 + const int q1 = pix[1*xstride]; 50.1056 + 50.1057 + if( FFABS( p0 - q0 ) < alpha && 50.1058 + FFABS( p1 - p0 ) < beta && 50.1059 + FFABS( q1 - q0 ) < beta ) { 50.1060 + 50.1061 + pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ 50.1062 + pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ 50.1063 + } 50.1064 + pix += ystride; 50.1065 + } 50.1066 +} 50.1067 +static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 50.1068 +{ 50.1069 + h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); 50.1070 +} 50.1071 +static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 50.1072 +{ 50.1073 + h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); 50.1074 +} 50.1075 + 50.1076 + 50.1077 +void dsputil_h264_init_cell(DSPContext_spu* c) { 50.1078 + 50.1079 + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; 50.1080 + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; 50.1081 + c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; 50.1082 + c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; 50.1083 + c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; 50.1084 + c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; 50.1085 + c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; 50.1086 + c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; 50.1087 + 50.1088 + c->h264_idct_add[0] = h264_idct8_add_spu; 50.1089 + c->h264_idct_add[1] = h264_idct4_add_spu; 50.1090 + 50.1091 + 50.1092 + c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_spu; 50.1093 + c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_spu; 50.1094 + c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_spu; 50.1095 + c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_spu; 50.1096 + c->avg_h264_chroma_pixels_tab[1] = 
avg_h264_chroma_mc4_spu; 50.1097 + c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_spu; 50.1098 + 50.1099 + c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; 50.1100 + c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; 50.1101 + c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; 50.1102 + c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; 50.1103 + c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; 50.1104 + c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; 50.1105 + c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; 50.1106 + c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; 50.1107 + c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; 50.1108 + c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; 50.1109 + c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; 50.1110 + c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c; 50.1111 + c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; 50.1112 + c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; 50.1113 + c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; 50.1114 + c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; 50.1115 + c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; 50.1116 + c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; 50.1117 + c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; 50.1118 + c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; 50.1119 + 50.1120 + 50.1121 +#define dspfunc(PFX, IDX, NUM) \ 50.1122 + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_spu; \ 50.1123 + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_spu; \ 50.1124 + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_spu; \ 50.1125 + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_spu; \ 50.1126 + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_spu; \ 50.1127 + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_spu; \ 50.1128 + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## 
_mc21_spu; \ 50.1129 + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_spu; \ 50.1130 + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_spu; \ 50.1131 + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_spu; \ 50.1132 + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_spu; \ 50.1133 + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_spu; \ 50.1134 + c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_spu; \ 50.1135 + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_spu; \ 50.1136 + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_spu; \ 50.1137 + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_spu 50.1138 + 50.1139 + dspfunc(put_h264_qpel, 0, 16); 50.1140 + dspfunc(put_h264_qpel, 1, 8); 50.1141 + dspfunc(put_h264_qpel, 2, 4); 50.1142 + 50.1143 + dspfunc(avg_h264_qpel, 0, 16); 50.1144 + dspfunc(avg_h264_qpel, 1, 8); 50.1145 + dspfunc(avg_h264_qpel, 2, 4); 50.1146 + 50.1147 +#undef dspfunc 50.1148 + 50.1149 + 50.1150 +}
51.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 51.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.h Mon Aug 27 12:09:56 2012 +0200 51.3 @@ -0,0 +1,34 @@ 51.4 +#ifndef DSPUTIL_CELL_H 51.5 +#define DSPUTIL_CELL_H 51.6 + 51.7 +#include "types_spu.h" 51.8 + 51.9 +typedef struct DSPContext_spu { 51.10 + 51.11 + void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); 51.12 + void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0); 51.13 + /* v/h_loop_filter_luma_intra: align 16 */ 51.14 + void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); 51.15 + void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); 51.16 + void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0); 51.17 + void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0); 51.18 + void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); 51.19 + void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); 51.20 + 51.21 + qpel_mc_func put_h264_qpel_pixels_tab[3][16]; 51.22 + qpel_mc_func avg_h264_qpel_pixels_tab[3][16]; 51.23 + 51.24 + h264_chroma_mc_func put_h264_chroma_pixels_tab[3]; 51.25 + h264_chroma_mc_func avg_h264_chroma_pixels_tab[3]; 51.26 + 51.27 + h264_idct_func h264_idct_add[2]; 51.28 + 51.29 + h264_weight_func weight_h264_pixels_tab[10]; 51.30 + h264_biweight_func biweight_h264_pixels_tab[10]; 51.31 + 51.32 +} DSPContext_spu; 51.33 + 51.34 + 51.35 +void dsputil_h264_init_cell(DSPContext_spu* c); 51.36 + 51.37 +#endif
52.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 52.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.c Mon Aug 27 12:09:56 2012 +0200 52.3 @@ -0,0 +1,2633 @@ 52.4 +/* 52.5 + * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding 52.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 52.7 + * 52.8 + * This file is part of FFmpeg. 52.9 + * 52.10 + * FFmpeg is free software; you can redistribute it and/or 52.11 + * modify it under the terms of the GNU Lesser General Public 52.12 + * License as published by the Free Software Foundation; either 52.13 + * version 2.1 of the License, or (at your option) any later version. 52.14 + * 52.15 + * FFmpeg is distributed in the hope that it will be useful, 52.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 52.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 52.18 + * Lesser General Public License for more details. 52.19 + * 52.20 + * You should have received a copy of the GNU Lesser General Public 52.21 + * License along with FFmpeg; if not, write to the Free Software 52.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 52.23 + */ 52.24 + 52.25 +/** 52.26 + * @file 52.27 + * H.264 / AVC / MPEG4 part10 cabac decoding. 
52.28 + * @author Michael Niedermayer <michaelni@gmx.at> 52.29 + */ 52.30 +#define CELL_SPE 52.31 +#include <limits.h> 52.32 +#include <stdlib.h> 52.33 +#include "libavutil/intreadwrite.h" 52.34 +#include "libavutil/mem.h" 52.35 +#include "libavcodec/avcodec.h" 52.36 +#include "h264_deblock_spu.h" 52.37 +#include "h264_pred_spu.h" 52.38 +#include "h264_direct_spu.h" 52.39 +#include "h264_tables.h" 52.40 +#include "mathops_spu.h" 52.41 +//#include "libavcodec/h264_data.h" 52.42 +#include "cabac_spu.h" 52.43 +#include "rectangle_spu.h" 52.44 +#include "libavutil/log.h" 52.45 + 52.46 +//#undef NDEBUG 52.47 +#include <assert.h> 52.48 +#define INT_BIT (sizeof(int) * 8) 52.49 +/* Cabac pre state table */ 52.50 +typedef struct IMbInfo{ 52.51 + uint16_t type; 52.52 + uint8_t pred_mode; 52.53 + uint8_t cbp; 52.54 +} IMbInfo; 52.55 + 52.56 +extern int bytecount; 52.57 + 52.58 +static const IMbInfo i_mb_type_info[26]={ 52.59 +{MB_TYPE_INTRA4x4 , -1, -1}, 52.60 +{MB_TYPE_INTRA16x16, 2, 0}, 52.61 +{MB_TYPE_INTRA16x16, 1, 0}, 52.62 +{MB_TYPE_INTRA16x16, 0, 0}, 52.63 +{MB_TYPE_INTRA16x16, 3, 0}, 52.64 +{MB_TYPE_INTRA16x16, 2, 16}, 52.65 +{MB_TYPE_INTRA16x16, 1, 16}, 52.66 +{MB_TYPE_INTRA16x16, 0, 16}, 52.67 +{MB_TYPE_INTRA16x16, 3, 16}, 52.68 +{MB_TYPE_INTRA16x16, 2, 32}, 52.69 +{MB_TYPE_INTRA16x16, 1, 32}, 52.70 +{MB_TYPE_INTRA16x16, 0, 32}, 52.71 +{MB_TYPE_INTRA16x16, 3, 32}, 52.72 +{MB_TYPE_INTRA16x16, 2, 15+0}, 52.73 +{MB_TYPE_INTRA16x16, 1, 15+0}, 52.74 +{MB_TYPE_INTRA16x16, 0, 15+0}, 52.75 +{MB_TYPE_INTRA16x16, 3, 15+0}, 52.76 +{MB_TYPE_INTRA16x16, 2, 15+16}, 52.77 +{MB_TYPE_INTRA16x16, 1, 15+16}, 52.78 +{MB_TYPE_INTRA16x16, 0, 15+16}, 52.79 +{MB_TYPE_INTRA16x16, 3, 15+16}, 52.80 +{MB_TYPE_INTRA16x16, 2, 15+32}, 52.81 +{MB_TYPE_INTRA16x16, 1, 15+32}, 52.82 +{MB_TYPE_INTRA16x16, 0, 15+32}, 52.83 +{MB_TYPE_INTRA16x16, 3, 15+32}, 52.84 +{MB_TYPE_INTRA_PCM , -1, -1}, 52.85 +}; 52.86 + 52.87 +typedef struct PMbInfo{ 52.88 + uint16_t type; 52.89 + uint8_t partition_count; 52.90 
+} PMbInfo; 52.91 + 52.92 +static const PMbInfo p_mb_type_info[5]={ 52.93 +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, 52.94 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, 52.95 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, 52.96 +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 4}, 52.97 +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4}, 52.98 +}; 52.99 + 52.100 +static const PMbInfo p_sub_mb_type_info[4]={ 52.101 +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, 52.102 +{MB_TYPE_16x8 |MB_TYPE_P0L0 , 2}, 52.103 +{MB_TYPE_8x16 |MB_TYPE_P0L0 , 2}, 52.104 +{MB_TYPE_8x8 |MB_TYPE_P0L0 , 4}, 52.105 +}; 52.106 + 52.107 +static const PMbInfo b_mb_type_info[23]={ 52.108 +{MB_TYPE_DIRECT2|MB_TYPE_L0L1 , 1, }, 52.109 +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, 52.110 +{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, 52.111 +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, 52.112 +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, 52.113 +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, 52.114 +{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 52.115 +{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 52.116 +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, 52.117 +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, 52.118 +{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, 52.119 +{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, 52.120 +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 52.121 +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 52.122 +{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 52.123 +{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 52.124 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, 52.125 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, 52.126 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 52.127 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 52.128 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 52.129 +{MB_TYPE_8x16 
|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 52.130 +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, 52.131 +}; 52.132 + 52.133 +static const PMbInfo b_sub_mb_type_info[13]={ 52.134 +{MB_TYPE_DIRECT2 , 1, }, 52.135 +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, 52.136 +{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, 52.137 +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, 52.138 +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, 52.139 +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, 52.140 +{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 52.141 +{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 52.142 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 52.143 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 52.144 +{MB_TYPE_8x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 4, }, 52.145 +{MB_TYPE_8x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 4, }, 52.146 +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, 52.147 +}; 52.148 + 52.149 +static const int8_t cabac_context_init_I[460][2] = 52.150 +{ 52.151 + /* 0 - 10 */ 52.152 + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, 52.153 + { 2, 54 }, { 3, 74 }, { -28,127 }, { -23, 104 }, 52.154 + { -6, 53 }, { -1, 54 }, { 7, 51 }, 52.155 + 52.156 + /* 11 - 23 unsused for I */ 52.157 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.158 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.159 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.160 + { 0, 0 }, 52.161 + 52.162 + /* 24- 39 */ 52.163 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.164 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.165 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.166 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.167 + 52.168 + /* 40 - 53 */ 52.169 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.170 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.171 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.172 + { 0, 0 }, { 0, 0 }, 52.173 + 52.174 + /* 54 - 59 */ 52.175 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 52.176 + { 0, 0 }, 
{ 0, 0 }, 52.177 + 52.178 + /* 60 - 69 */ 52.179 + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, 52.180 + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, 52.181 + { 13, 41 }, { 3, 62 }, 52.182 + 52.183 + /* 70 -> 87 */ 52.184 + { 0, 11 }, { 1, 55 }, { 0, 69 }, { -17, 127 }, 52.185 + { -13, 102 },{ 0, 82 }, { -7, 74 }, { -21, 107 }, 52.186 + { -27, 127 },{ -31, 127 },{ -24, 127 }, { -18, 95 }, 52.187 + { -27, 127 },{ -21, 114 },{ -30, 127 }, { -17, 123 }, 52.188 + { -12, 115 },{ -16, 122 }, 52.189 + 52.190 + /* 88 -> 104 */ 52.191 + { -11, 115 },{ -12, 63 }, { -2, 68 }, { -15, 84 }, 52.192 + { -13, 104 },{ -3, 70 }, { -8, 93 }, { -10, 90 }, 52.193 + { -30, 127 },{ -1, 74 }, { -6, 97 }, { -7, 91 }, 52.194 + { -20, 127 },{ -4, 56 }, { -5, 82 }, { -7, 76 }, 52.195 + { -22, 125 }, 52.196 + 52.197 + /* 105 -> 135 */ 52.198 + { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 }, 52.199 + { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, 52.200 + { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, 52.201 + { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, 52.202 + { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 }, 52.203 + { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 }, 52.204 + { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, 52.205 + { 14, 62 }, { -13, 108 },{ -15, 100 }, 52.206 + 52.207 + /* 136 -> 165 */ 52.208 + { -13, 101 },{ -13, 91 }, { -12, 94 }, { -10, 88 }, 52.209 + { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, 52.210 + { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, 52.211 + { 18, 59 }, { -8, 102 }, { -15, 100 }, { 0, 95 }, 52.212 + { -4, 75 }, { 2, 72 }, { -11, 75 }, { -3, 71 }, 52.213 + { 15, 46 }, { -13, 69 }, { 0, 62 }, { 0, 65 }, 52.214 + { 21, 37 }, { -15, 72 }, { 9, 57 }, { 16, 54 }, 52.215 + { 0, 62 }, { 12, 72 }, 52.216 + 52.217 + /* 166 -> 196 */ 52.218 + { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, 52.219 + { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, 52.220 + { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, 52.221 + { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, 52.222 + { 11, 45 
}, { 15, 39 }, { 11, 42 }, { 13, 44 }, 52.223 + { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, 52.224 + { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, 52.225 + { 0, 89 }, { 26, -19 }, { 22, -17 }, 52.226 + 52.227 + /* 197 -> 226 */ 52.228 + { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, 52.229 + { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 }, 52.230 + { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, 52.231 + { 41, 17 }, { 30, -6 }, { 27, 3 }, { 26, 22 }, 52.232 + { 37, -16 }, { 35, -4 }, { 38, -8 }, { 38, -3 }, 52.233 + { 37, 3 }, { 38, 5 }, { 42, 0 }, { 35, 16 }, 52.234 + { 39, 22 }, { 14, 48 }, { 27, 37 }, { 21, 60 }, 52.235 + { 12, 68 }, { 2, 97 }, 52.236 + 52.237 + /* 227 -> 251 */ 52.238 + { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 }, 52.239 + { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, 52.240 + { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, 52.241 + { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, 52.242 + { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, 52.243 + { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, 52.244 + { -4, 65 }, 52.245 + 52.246 + /* 252 -> 275 */ 52.247 + { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, 52.248 + { -17, 110 },{ -11, 97 }, { -20, 84 }, { -11, 79 }, 52.249 + { -6, 73 }, { -4, 74 }, { -13, 86 }, { -13, 96 }, 52.250 + { -11, 97 }, { -19, 117 },{ -8, 78 }, { -5, 33 }, 52.251 + { -4, 48 }, { -2, 53 }, { -3, 62 }, { -13, 71 }, 52.252 + { -10, 79 }, { -12, 86 }, { -13, 90 }, { -14, 97 }, 52.253 + 52.254 + /* 276 a bit special (not used, bypass is used instead) */ 52.255 + { 0, 0 }, 52.256 + 52.257 + /* 277 -> 307 */ 52.258 + { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, 52.259 + { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, 52.260 + { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, 52.261 + { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, 52.262 + { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 }, 52.263 + { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, 52.264 + { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, 52.265 + { 9, 
64 }, { -12, 104 },{ -11, 97 }, 52.266 + 52.267 + /* 308 -> 337 */ 52.268 + { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, 52.269 + { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, 52.270 + { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, 52.271 + { -1, 83 }, { -7, 99 }, { -14, 95 }, { 2, 95 }, 52.272 + { 0, 76 }, { -5, 74 }, { 0, 70 }, { -11, 75 }, 52.273 + { 1, 68 }, { 0, 65 }, { -14, 73 }, { 3, 62 }, 52.274 + { 4, 62 }, { -1, 68 }, { -13, 75 }, { 11, 55 }, 52.275 + { 5, 64 }, { 12, 70 }, 52.276 + 52.277 + /* 338 -> 368 */ 52.278 + { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, 52.279 + { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, 52.280 + { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 }, 52.281 + { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, 52.282 + { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, 52.283 + { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, 52.284 + { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, 52.285 + { -12, 109 },{ 36, -35 }, { 36, -34 }, 52.286 + 52.287 + /* 369 -> 398 */ 52.288 + { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 }, 52.289 + { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, 52.290 + { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, 52.291 + { 13, 58 }, { 29, -3 }, { 26, 0 }, { 22, 30 }, 52.292 + { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 }, 52.293 + { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 }, 52.294 + { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 }, 52.295 + { 29, 39 }, { 19, 66 }, 52.296 + 52.297 + /* 399 -> 435 */ 52.298 + { 31, 21 }, { 31, 31 }, { 25, 50 }, 52.299 + { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11, 85 }, 52.300 + { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, 52.301 + { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, 52.302 + { -23, 68 }, { -24, 50 }, { -11, 74 }, { 23, -13 }, 52.303 + { 26, -13 }, { 40, -15 }, { 49, -14 }, { 44, 3 }, 52.304 + { 45, 6 }, { 44, 34 }, { 33, 54 }, { 19, 82 }, 52.305 + { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, 52.306 + { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, 52.307 + 
{ 0, 68 }, { -9, 92 }, 52.308 + 52.309 + /* 436 -> 459 */ 52.310 + { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, 52.311 + { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, 52.312 + { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, 52.313 + { -9, 64 }, { -5, 58 }, { 2, 59 }, { 21, -10 }, 52.314 + { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, 52.315 + { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 } 52.316 +}; 52.317 + 52.318 +static const int8_t cabac_context_init_PB[3][460][2] = 52.319 +{ 52.320 + /* i_cabac_init_idc == 0 */ 52.321 + { 52.322 + /* 0 - 10 */ 52.323 + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, 52.324 + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, 52.325 + { -6, 53 }, { -1, 54 }, { 7, 51 }, 52.326 + 52.327 + /* 11 - 23 */ 52.328 + { 23, 33 }, { 23, 2 }, { 21, 0 }, { 1, 9 }, 52.329 + { 0, 49 }, { -37, 118 }, { 5, 57 }, { -13, 78 }, 52.330 + { -11, 65 }, { 1, 62 }, { 12, 49 }, { -4, 73 }, 52.331 + { 17, 50 }, 52.332 + 52.333 + /* 24 - 39 */ 52.334 + { 18, 64 }, { 9, 43 }, { 29, 0 }, { 26, 67 }, 52.335 + { 16, 90 }, { 9, 104 }, { -46, 127 }, { -20, 104 }, 52.336 + { 1, 67 }, { -13, 78 }, { -11, 65 }, { 1, 62 }, 52.337 + { -6, 86 }, { -17, 95 }, { -6, 61 }, { 9, 45 }, 52.338 + 52.339 + /* 40 - 53 */ 52.340 + { -3, 69 }, { -6, 81 }, { -11, 96 }, { 6, 55 }, 52.341 + { 7, 67 }, { -5, 86 }, { 2, 88 }, { 0, 58 }, 52.342 + { -3, 76 }, { -10, 94 }, { 5, 54 }, { 4, 69 }, 52.343 + { -3, 81 }, { 0, 88 }, 52.344 + 52.345 + /* 54 - 59 */ 52.346 + { -7, 67 }, { -5, 74 }, { -4, 74 }, { -5, 80 }, 52.347 + { -7, 72 }, { 1, 58 }, 52.348 + 52.349 + /* 60 - 69 */ 52.350 + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, 52.351 + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, 52.352 + { 13, 41 }, { 3, 62 }, 52.353 + 52.354 + /* 70 - 87 */ 52.355 + { 0, 45 }, { -4, 78 }, { -3, 96 }, { -27, 126 }, 52.356 + { -28, 98 }, { -25, 101 }, { -23, 67 }, { -28, 82 }, 52.357 + { -20, 94 }, { -16, 83 }, { -22, 110 }, { -21, 91 }, 52.358 + { -18, 102 }, { -13, 93 }, { -29, 127 
}, { -7, 92 }, 52.359 + { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, 52.360 + { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, 52.361 + { -9, 92 }, { -8, 87 }, { -23, 126 }, { 5, 54 }, 52.362 + { 6, 60 }, { 6, 59 }, { 6, 69 }, { -1, 48 }, 52.363 + { 0, 68 }, { -4, 69 }, { -8, 88 }, 52.364 + 52.365 + /* 105 -> 165 */ 52.366 + { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, 52.367 + { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, 52.368 + { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, 52.369 + { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, 52.370 + { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, 52.371 + { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, 52.372 + { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 }, 52.373 + { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, 52.374 + { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, 52.375 + { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, 52.376 + { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, 52.377 + { 3, 64 }, { 1, 61 }, { 9, 63 }, { 7, 50 }, 52.378 + { 16, 39 }, { 5, 44 }, { 4, 52 }, { 11, 48 }, 52.379 + { -5, 60 }, { -1, 59 }, { 0, 59 }, { 22, 33 }, 52.380 + { 5, 44 }, { 14, 43 }, { -1, 78 }, { 0, 60 }, 52.381 + { 9, 69 }, 52.382 + 52.383 + /* 166 - 226 */ 52.384 + { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, 52.385 + { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, 52.386 + { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, 52.387 + { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 }, 52.388 + { 6, 57 }, { 7, 53 }, { 6, 52 }, { 6, 55 }, 52.389 + { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, 52.390 + { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, 52.391 + { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, 52.392 + { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, 52.393 + { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, 52.394 + { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 }, 52.395 + { 1, 67 }, { 5, 59 }, { 9, 67 }, { 16, 30 }, 52.396 + { 18, 32 }, { 18, 35 }, { 22, 29 }, { 24, 31 }, 52.397 + { 23, 38 }, { 18, 43 }, { 20, 41 }, { 11, 63 }, 52.398 + { 9, 59 }, { 9, 
64 }, { -1, 94 }, { -2, 89 }, 52.399 + { -9, 108 }, 52.400 + 52.401 + /* 227 - 275 */ 52.402 + { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, 52.403 + { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, 52.404 + { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, 52.405 + { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 }, 52.406 + { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, 52.407 + { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, 52.408 + { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, 52.409 + { -3, 74 }, { -10, 90 }, { 0, 70 }, { -4, 29 }, 52.410 + { 5, 31 }, { 7, 42 }, { 1, 59 }, { -2, 58 }, 52.411 + { -3, 72 }, { -3, 81 }, { -11, 97 }, { 0, 58 }, 52.412 + { 8, 5 }, { 10, 14 }, { 14, 18 }, { 13, 27 }, 52.413 + { 2, 40 }, { 0, 58 }, { -3, 70 }, { -6, 79 }, 52.414 + { -8, 85 }, 52.415 + 52.416 + /* 276 a bit special (not used, bypass is used instead) */ 52.417 + { 0, 0 }, 52.418 + 52.419 + /* 277 - 337 */ 52.420 + { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, 52.421 + { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, 52.422 + { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, 52.423 + { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, 52.424 + { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, 52.425 + { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, 52.426 + { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, 52.427 + { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, 52.428 + { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, 52.429 + { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, 52.430 + { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, 52.431 + { -2, 69 }, { -2, 59 }, { 6, 70 }, { 10, 44 }, 52.432 + { 9, 31 }, { 12, 43 }, { 3, 53 }, { 14, 34 }, 52.433 + { 10, 38 }, { -3, 52 }, { 13, 40 }, { 17, 32 }, 52.434 + { 7, 44 }, { 7, 38 }, { 13, 50 }, { 10, 57 }, 52.435 + { 26, 43 }, 52.436 + 52.437 + /* 338 - 398 */ 52.438 + { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, 52.439 + { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, 52.440 + { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 
}, 52.441 + { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, 52.442 + { 53, -45 }, { 48, -26 }, { 65, -43 }, { 43, -19 }, 52.443 + { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, 52.444 + { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, 52.445 + { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, 52.446 + { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, 52.447 + { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, 52.448 + { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, 52.449 + { 8, 60 }, { 6, 63 }, { 17, 65 }, { 21, 24 }, 52.450 + { 23, 20 }, { 26, 23 }, { 27, 32 }, { 28, 23 }, 52.451 + { 28, 24 }, { 23, 40 }, { 24, 32 }, { 28, 29 }, 52.452 + { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 }, 52.453 + { 11, 86 }, 52.454 + 52.455 + /* 399 - 435 */ 52.456 + { 12, 40 }, { 11, 51 }, { 14, 59 }, 52.457 + { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, 52.458 + { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, 52.459 + { -16, 70 }, { -15, 67 }, { -20, 62 }, { -19, 70 }, 52.460 + { -16, 66 }, { -22, 65 }, { -20, 63 }, { 9, -2 }, 52.461 + { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, 52.462 + { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, 52.463 + { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, 52.464 + { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, 52.465 + { -8, 66 }, { -8, 76 }, 52.466 + 52.467 + /* 436 - 459 */ 52.468 + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, 52.469 + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, 52.470 + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, 52.471 + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 21, -13 }, 52.472 + { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, 52.473 + { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, 52.474 + }, 52.475 + 52.476 + /* i_cabac_init_idc == 1 */ 52.477 + { 52.478 + /* 0 - 10 */ 52.479 + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, 52.480 + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, 52.481 + { -6, 53 }, { -1, 54 }, { 7, 51 }, 52.482 + 52.483 + /* 11 - 23 */ 52.484 + { 22, 25 }, { 34, 0 }, { 16, 0 }, { -2, 
9 }, 52.485 + { 4, 41 }, { -29, 118 }, { 2, 65 }, { -6, 71 }, 52.486 + { -13, 79 }, { 5, 52 }, { 9, 50 }, { -3, 70 }, 52.487 + { 10, 54 }, 52.488 + 52.489 + /* 24 - 39 */ 52.490 + { 26, 34 }, { 19, 22 }, { 40, 0 }, { 57, 2 }, 52.491 + { 41, 36 }, { 26, 69 }, { -45, 127 }, { -15, 101 }, 52.492 + { -4, 76 }, { -6, 71 }, { -13, 79 }, { 5, 52 }, 52.493 + { 6, 69 }, { -13, 90 }, { 0, 52 }, { 8, 43 }, 52.494 + 52.495 + /* 40 - 53 */ 52.496 + { -2, 69 },{ -5, 82 },{ -10, 96 },{ 2, 59 }, 52.497 + { 2, 75 },{ -3, 87 },{ -3, 100 },{ 1, 56 }, 52.498 + { -3, 74 },{ -6, 85 },{ 0, 59 },{ -3, 81 }, 52.499 + { -7, 86 },{ -5, 95 }, 52.500 + 52.501 + /* 54 - 59 */ 52.502 + { -1, 66 },{ -1, 77 },{ 1, 70 },{ -2, 86 }, 52.503 + { -5, 72 },{ 0, 61 }, 52.504 + 52.505 + /* 60 - 69 */ 52.506 + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, 52.507 + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, 52.508 + { 13, 41 }, { 3, 62 }, 52.509 + 52.510 + /* 70 - 104 */ 52.511 + { 13, 15 }, { 7, 51 }, { 2, 80 }, { -39, 127 }, 52.512 + { -18, 91 }, { -17, 96 }, { -26, 81 }, { -35, 98 }, 52.513 + { -24, 102 }, { -23, 97 }, { -27, 119 }, { -24, 99 }, 52.514 + { -21, 110 }, { -18, 102 }, { -36, 127 }, { 0, 80 }, 52.515 + { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, 52.516 + { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, 52.517 + { -12, 104 }, { -9, 91 }, { -31, 127 }, { 3, 55 }, 52.518 + { 7, 56 }, { 7, 55 }, { 8, 61 }, { -3, 53 }, 52.519 + { 0, 68 }, { -7, 74 }, { -9, 88 }, 52.520 + 52.521 + /* 105 -> 165 */ 52.522 + { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, 52.523 + { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, 52.524 + { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, 52.525 + { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, 52.526 + { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, 52.527 + { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, 52.528 + { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, 52.529 + { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, 52.530 + { -1, 60 }, { -3, 
61 }, { -8, 67 }, { -25, 84 }, 52.531 + { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, 52.532 + { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, 52.533 + { -4, 71 }, { 0, 58 }, { 7, 61 }, { 9, 41 }, 52.534 + { 18, 25 }, { 9, 32 }, { 5, 43 }, { 9, 47 }, 52.535 + { 0, 44 }, { 0, 51 }, { 2, 46 }, { 19, 38 }, 52.536 + { -4, 66 }, { 15, 38 }, { 12, 42 }, { 9, 34 }, 52.537 + { 0, 89 }, 52.538 + 52.539 + /* 166 - 226 */ 52.540 + { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, 52.541 + { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 }, 52.542 + { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, 52.543 + { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, 52.544 + { 29, 10 }, { 37, -5 }, { 51, -29 }, { 39, -9 }, 52.545 + { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, 52.546 + { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, 52.547 + { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 }, 52.548 + { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, 52.549 + { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, 52.550 + { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, 52.551 + { 0, 75 }, { 2, 72 }, { 8, 77 }, { 14, 35 }, 52.552 + { 18, 31 }, { 17, 35 }, { 21, 30 }, { 17, 45 }, 52.553 + { 20, 42 }, { 18, 45 }, { 27, 26 }, { 16, 54 }, 52.554 + { 7, 66 }, { 16, 56 }, { 11, 73 }, { 10, 67 }, 52.555 + { -10, 116 }, 52.556 + 52.557 + /* 227 - 275 */ 52.558 + { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 }, 52.559 + { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, 52.560 + { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, 52.561 + { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, 52.562 + { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, 52.563 + { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 }, 52.564 + { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, 52.565 + { -5, 74 }, { -9, 86 }, { 2, 66 }, { -9, 34 }, 52.566 + { 1, 32 }, { 11, 31 }, { 5, 52 }, { -2, 55 }, 52.567 + { -2, 67 }, { 0, 73 }, { -8, 89 }, { 3, 52 }, 52.568 + { 7, 4 }, { 10, 8 }, { 17, 8 }, { 16, 19 }, 52.569 + { 3, 37 }, { 
-1, 61 }, { -5, 73 }, { -1, 70 }, 52.570 + { -4, 78 }, 52.571 + 52.572 + /* 276 a bit special (not used, bypass is used instead) */ 52.573 + { 0, 0 }, 52.574 + 52.575 + /* 277 - 337 */ 52.576 + { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, 52.577 + { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, 52.578 + { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, 52.579 + { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, 52.580 + { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, 52.581 + { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, 52.582 + { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, 52.583 + { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, 52.584 + { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 }, 52.585 + { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, 52.586 + { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, 52.587 + { -1, 70 }, { -9, 72 }, { 14, 60 }, { 16, 37 }, 52.588 + { 0, 47 }, { 18, 35 }, { 11, 37 }, { 12, 41 }, 52.589 + { 10, 41 }, { 2, 48 }, { 12, 41 }, { 13, 41 }, 52.590 + { 0, 59 }, { 3, 50 }, { 19, 40 }, { 3, 66 }, 52.591 + { 18, 50 }, 52.592 + 52.593 + /* 338 - 398 */ 52.594 + { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, 52.595 + { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, 52.596 + { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, 52.597 + { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, 52.598 + { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, 52.599 + { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, 52.600 + { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, 52.601 + { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, 52.602 + { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, 52.603 + { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 }, 52.604 + { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, 52.605 + { 12, 48 }, { 11, 49 }, { 26, 45 }, { 22, 22 }, 52.606 + { 23, 22 }, { 27, 21 }, { 33, 20 }, { 26, 28 }, 52.607 + { 30, 24 }, { 27, 34 }, { 18, 42 }, { 25, 39 }, 52.608 + { 18, 50 }, { 12, 70 }, { 21, 54 }, { 14, 71 }, 52.609 
+ { 11, 83 }, 52.610 + 52.611 + /* 399 - 435 */ 52.612 + { 25, 32 }, { 21, 49 }, { 21, 54 }, 52.613 + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, 52.614 + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, 52.615 + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, 52.616 + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 17, -10 }, 52.617 + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, 52.618 + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, 52.619 + { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, 52.620 + { -2, 52 }, { -9, 57 }, { -6, 63 }, { -4, 65 }, 52.621 + { -4, 67 }, { -7, 82 }, 52.622 + 52.623 + /* 436 - 459 */ 52.624 + { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, 52.625 + { -12, 72 }, { -14, 68 }, { -3, 70 }, { -6, 76 }, 52.626 + { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, 52.627 + { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, 52.628 + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, 52.629 + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, 52.630 + }, 52.631 + 52.632 + /* i_cabac_init_idc == 2 */ 52.633 + { 52.634 + /* 0 - 10 */ 52.635 + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, 52.636 + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, 52.637 + { -6, 53 }, { -1, 54 }, { 7, 51 }, 52.638 + 52.639 + /* 11 - 23 */ 52.640 + { 29, 16 }, { 25, 0 }, { 14, 0 }, { -10, 51 }, 52.641 + { -3, 62 }, { -27, 99 }, { 26, 16 }, { -4, 85 }, 52.642 + { -24, 102 }, { 5, 57 }, { 6, 57 }, { -17, 73 }, 52.643 + { 14, 57 }, 52.644 + 52.645 + /* 24 - 39 */ 52.646 + { 20, 40 }, { 20, 10 }, { 29, 0 }, { 54, 0 }, 52.647 + { 37, 42 }, { 12, 97 }, { -32, 127 }, { -22, 117 }, 52.648 + { -2, 74 }, { -4, 85 }, { -24, 102 }, { 5, 57 }, 52.649 + { -6, 93 }, { -14, 88 }, { -6, 44 }, { 4, 55 }, 52.650 + 52.651 + /* 40 - 53 */ 52.652 + { -11, 89 },{ -15, 103 },{ -21, 116 },{ 19, 57 }, 52.653 + { 20, 58 },{ 4, 84 },{ 6, 96 },{ 1, 63 }, 52.654 + { -5, 85 },{ -13, 106 },{ 5, 63 },{ 6, 75 }, 52.655 + { -3, 90 },{ -1, 101 }, 52.656 + 52.657 + /* 54 - 59 */ 52.658 + { 3, 55 },{ -4, 79 },{ -2, 75 
},{ -12, 97 }, 52.659 + { -7, 50 },{ 1, 60 }, 52.660 + 52.661 + /* 60 - 69 */ 52.662 + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, 52.663 + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, 52.664 + { 13, 41 }, { 3, 62 }, 52.665 + 52.666 + /* 70 - 104 */ 52.667 + { 7, 34 }, { -9, 88 }, { -20, 127 }, { -36, 127 }, 52.668 + { -17, 91 }, { -14, 95 }, { -25, 84 }, { -25, 86 }, 52.669 + { -12, 89 }, { -17, 91 }, { -31, 127 }, { -14, 76 }, 52.670 + { -18, 103 }, { -13, 90 }, { -37, 127 }, { 11, 80 }, 52.671 + { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, 52.672 + { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, 52.673 + { -11, 104 }, { -11, 91 }, { -30, 127 }, { 0, 65 }, 52.674 + { -2, 79 }, { 0, 72 }, { -4, 92 }, { -6, 56 }, 52.675 + { 3, 68 }, { -8, 71 }, { -13, 98 }, 52.676 + 52.677 + /* 105 -> 165 */ 52.678 + { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, 52.679 + { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, 52.680 + { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, 52.681 + { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, 52.682 + { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 }, 52.683 + { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, 52.684 + { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, 52.685 + { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, 52.686 + { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, 52.687 + { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 }, 52.688 + { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, 52.689 + { 3, 65 }, { -7, 69 }, { 8, 77 }, { -10, 66 }, 52.690 + { 3, 62 }, { -3, 68 }, { -20, 81 }, { 0, 30 }, 52.691 + { 1, 7 }, { -3, 23 }, { -21, 74 }, { 16, 66 }, 52.692 + { -23, 124 }, { 17, 37 }, { 44, -18 }, { 50, -34 }, 52.693 + { -22, 127 }, 52.694 + 52.695 + /* 166 - 226 */ 52.696 + { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, 52.697 + { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, 52.698 + { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, 52.699 + { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 }, 52.700 + { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, 52.701 + { 
4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, 52.702 + { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, 52.703 + { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, 52.704 + { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, 52.705 + { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, 52.706 + { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, 52.707 + { 20, 34 }, { 19, 31 }, { 27, 44 }, { 19, 16 }, 52.708 + { 15, 36 }, { 15, 36 }, { 21, 28 }, { 25, 21 }, 52.709 + { 30, 20 }, { 31, 12 }, { 27, 16 }, { 24, 42 }, 52.710 + { 0, 93 }, { 14, 56 }, { 15, 57 }, { 26, 38 }, 52.711 + { -24, 127 }, 52.712 + 52.713 + /* 227 - 275 */ 52.714 + { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, 52.715 + { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 94 }, 52.716 + { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, 52.717 + { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, 52.718 + { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, 52.719 + { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, 52.720 + { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, 52.721 + { -12, 92 }, { -18, 108 }, { -4, 79 }, { -22, 69 }, 52.722 + { -16, 75 }, { -2, 58 }, { 1, 58 }, { -13, 78 }, 52.723 + { -9, 83 }, { -4, 81 }, { -13, 99 }, { -13, 81 }, 52.724 + { -6, 38 }, { -13, 62 }, { -6, 58 }, { -2, 59 }, 52.725 + { -16, 73 }, { -10, 76 }, { -13, 86 }, { -9, 83 }, 52.726 + { -10, 87 }, 52.727 + 52.728 + /* 276 a bit special (not used, bypass is used instead) */ 52.729 + { 0, 0 }, 52.730 + 52.731 + /* 277 - 337 */ 52.732 + { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 }, 52.733 + { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, 52.734 + { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, 52.735 + { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, 52.736 + { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, 52.737 + { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, 52.738 + { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 }, 52.739 + { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, 52.740 + { 1, 68 }, { 0, 77 
}, { 2, 64 }, { 0, 68 }, 52.741 + { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 }, 52.742 + { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, 52.743 + { -2, 76 }, { -18, 86 }, { 12, 70 }, { 5, 64 }, 52.744 + { -12, 70 }, { 11, 55 }, { 5, 56 }, { 0, 69 }, 52.745 + { 2, 65 }, { -6, 74 }, { 5, 54 }, { 7, 54 }, 52.746 + { -6, 76 }, { -11, 82 }, { -2, 77 }, { -2, 77 }, 52.747 + { 25, 42 }, 52.748 + 52.749 + /* 338 - 398 */ 52.750 + { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, 52.751 + { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, 52.752 + { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, 52.753 + { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, 52.754 + { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 }, 52.755 + { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 }, 52.756 + { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, 52.757 + { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, 52.758 + { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, 52.759 + { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, 52.760 + { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 }, 52.761 + { 18, 31 }, { 19, 26 }, { 36, 24 }, { 24, 23 }, 52.762 + { 27, 16 }, { 24, 30 }, { 31, 29 }, { 22, 41 }, 52.763 + { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 }, 52.764 + { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 }, 52.765 + { 25, 61 }, 52.766 + 52.767 + /* 399 - 435 */ 52.768 + { 21, 33 }, { 19, 50 }, { 17, 61 }, 52.769 + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, 52.770 + { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, 52.771 + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, 52.772 + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, 52.773 + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, 52.774 + { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, 52.775 + { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, 52.776 + { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, 52.777 + { -6, 68 }, { -10, 79 }, 52.778 + 52.779 + /* 436 - 459 */ 52.780 + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, 52.781 + { -18, 75 
}, { -12, 71 }, { -11, 63 }, { -5, 70 }, 52.782 + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, 52.783 + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, 52.784 + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, 52.785 + { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, 52.786 + } 52.787 +}; 52.788 + 52.789 +static const uint8_t left_block_options[4][16]={ 52.790 + {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, 52.791 + {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, 52.792 + {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, 52.793 + {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} 52.794 +}; 52.795 + 52.796 +void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c) { 52.797 + int i; 52.798 + const int8_t (*tab)[2]; 52.799 + 52.800 + if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I; 52.801 + else tab = cabac_context_init_PB[s->cabac_init_idc]; 52.802 + 52.803 + /* calculate pre-state */ 52.804 + for( i= 0; i < 460; i++ ) { 52.805 + int pre = 2*(((tab[i][0] * s->qscale) >>4 ) + tab[i][1]) - 127; 52.806 + 52.807 + pre^= pre>>31; 52.808 + if(pre > 124) 52.809 + pre= 124 + (pre&1); 52.810 + 52.811 + c->cabac_state[i] = pre; 52.812 + } 52.813 +} 52.814 + 52.815 +static void fill_decode_neighbors(H264Cabac_spu *hc, EDSlice_spu *s){ 52.816 + H264Mb *m = s->m; 52.817 + const int mb_x = m->mb_x; 52.818 + const int mb_y = m->mb_y; 52.819 + 52.820 + m->top_type = hc->mb_type_top[mb_x]; 52.821 + m->left_type = hc->mb_type[mb_x-1] ; 52.822 + 52.823 +} 52.824 + 52.825 +static void fill_decode_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ 52.826 + H264Mb *m = s->m; 52.827 + int topleft_xy, top_xy, topright_xy, left_xy; 52.828 + int topleft_type, top_type, topright_type, left_type; 52.829 + const uint8_t * left_block= left_block_options[0]; 52.830 + const int mb_x = m->mb_x; 52.831 + const int mb_y = m->mb_y; 52.832 + const int b_stride = 
hc->b_stride; 52.833 + int i; 52.834 + 52.835 + topleft_type = hc->mb_type_top[mb_x-1] ; 52.836 + top_type = m->top_type ; 52.837 + topright_type= hc->mb_type_top[mb_x+1] ; 52.838 + left_type = m->left_type ; 52.839 + 52.840 + if (s->slice_type_nos == FF_B_TYPE){ 52.841 + get_list = get_list_buf; 52.842 + for(int i=0; i<2; i++){ 52.843 + get_dma_list(hc->list1_motion_val[i], s->list1.motion_val[i][4*mb_x + 4*mb_y*b_stride], 16, 4, b_stride*2*sizeof(int16_t), ED_get_mv, 0); 52.844 + } 52.845 + if (hc->blocking) wait_dma_id(ED_get_mv); 52.846 + } 52.847 + 52.848 + if(!IS_SKIP(mb_type)){ 52.849 + if(IS_INTRA(mb_type)){ 52.850 + int type_mask= s->pps.constrained_intra_pred ? IS_INTRA(-1) : -1; 52.851 + m->topleft_samples_available= 52.852 + m->top_samples_available= 52.853 + m->left_samples_available= 0xFFFF; 52.854 + m->topright_samples_available= 0xEEEA; 52.855 + 52.856 + if(!(top_type & type_mask)){ 52.857 + m->topleft_samples_available= 0xB3FF; 52.858 + m->top_samples_available= 0x33FF; 52.859 + m->topright_samples_available= 0x26EA; 52.860 + } 52.861 + if(!(left_type & type_mask)){ 52.862 + m->topleft_samples_available&= 0xDF5F; 52.863 + m->left_samples_available&= 0x5F5F; 52.864 + } 52.865 + 52.866 + if(!(topleft_type & type_mask)) 52.867 + m->topleft_samples_available&= 0x7FFF; 52.868 + 52.869 + if(!(topright_type & type_mask)) 52.870 + m->topright_samples_available&= 0xFBFF; 52.871 + 52.872 + if(IS_INTRA4x4(mb_type)){ 52.873 + if(IS_INTRA4x4(top_type)){ 52.874 + AV_COPY32(m->intra4x4_pred_mode_cache+4+8*0, &hc->intra4x4_pred_mode_top[8*mb_x]); 52.875 + }else{ 52.876 + m->intra4x4_pred_mode_cache[4+8*0]= 52.877 + m->intra4x4_pred_mode_cache[5+8*0]= 52.878 + m->intra4x4_pred_mode_cache[6+8*0]= 52.879 + m->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask); 52.880 + } 52.881 + for(i=0; i<2; i++){ 52.882 + if(IS_INTRA4x4(left_type)){ 52.883 + int8_t *mode= &hc->intra4x4_pred_mode[8*(mb_x-1)]; 52.884 + m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= 
mode[6-left_block[0+2*i]]; 52.885 + m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= mode[6-left_block[1+2*i]]; 52.886 + }else{ 52.887 + m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= 52.888 + m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= 2 - 3*!(left_type & type_mask); 52.889 + } 52.890 + } 52.891 + } 52.892 + } 52.893 + if(top_type){ 52.894 + AV_COPY32(&m->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]); 52.895 + m->non_zero_count_cache[1+8*0]= hc->non_zero_count_top[mb_x][1+1*8]; 52.896 + m->non_zero_count_cache[2+8*0]= hc->non_zero_count_top[mb_x][2+1*8]; 52.897 + m->non_zero_count_cache[1+8*3]= hc->non_zero_count_top[mb_x][1+2*8]; 52.898 + m->non_zero_count_cache[2+8*3]= hc->non_zero_count_top[mb_x][2+2*8]; 52.899 + }else { 52.900 + m->non_zero_count_cache[1+8*0]= 52.901 + m->non_zero_count_cache[2+8*0]= 52.902 + m->non_zero_count_cache[1+8*3]= 52.903 + m->non_zero_count_cache[2+8*3]= 52.904 + AV_WN32A(&m->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 0 : 0x40404040); 52.905 + } 52.906 + 52.907 + for (i=0; i<2; i++) { 52.908 + if(left_type){ 52.909 + m->non_zero_count_cache[3+8*1 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+0+2*i]]; 52.910 + m->non_zero_count_cache[3+8*2 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+1+2*i]]; 52.911 + m->non_zero_count_cache[0+8*1 + 8*i]= hc->non_zero_count[mb_x-1][left_block[8+4+2*i]]; 52.912 + m->non_zero_count_cache[0+8*4 + 8*i]= hc->non_zero_count[mb_x-1][left_block[8+5+2*i]]; 52.913 + }else{ 52.914 + m->non_zero_count_cache[3+8*1 + 2*8*i]= 52.915 + m->non_zero_count_cache[3+8*2 + 2*8*i]= 52.916 + m->non_zero_count_cache[0+8*1 + 8*i]= 52.917 + m->non_zero_count_cache[0+8*4 + 8*i]= !IS_INTRA(mb_type) ? 0 : 64; 52.918 + } 52.919 + } 52.920 + 52.921 + 52.922 + // top_cbp 52.923 + if(top_type) { 52.924 + hc->top_cbp = hc->cbp_top[mb_x]; 52.925 + } else { 52.926 + hc->top_cbp = IS_INTRA(mb_type) ? 
0x1CF : 0x00F; 52.927 + } 52.928 + // left_cbp 52.929 + if (left_type) { 52.930 + hc->left_cbp = (hc->cbp[mb_x-1] & 0x1f0) 52.931 + | ((hc->cbp[mb_x-1]>>(left_block[0]&(~1)))&2) 52.932 + | (((hc->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2); 52.933 + } else { 52.934 + hc->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; 52.935 + } 52.936 + } 52.937 + 52.938 + if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){ 52.939 + int list; 52.940 + 52.941 + m->ref_cache[0][scan8[5 ]+1] = m->ref_cache[0][scan8[7 ]+1] = m->ref_cache[0][scan8[13]+1] = 52.942 + m->ref_cache[1][scan8[5 ]+1] = m->ref_cache[1][scan8[7 ]+1] = m->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE; 52.943 + 52.944 + for(list=0; list<s->list_count; list++){ 52.945 + if(!USES_LIST(mb_type, list)){ 52.946 + continue; 52.947 + } 52.948 + assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred)); 52.949 + 52.950 + if(USES_LIST(top_type, list)){ 52.951 + const int b_xy= 4*mb_x + 3*hc->b_stride; 52.952 + AV_COPY128(m->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]); 52.953 + m->ref_cache[list][scan8[0] + 0 - 1*8]= 52.954 + m->ref_cache[list][scan8[0] + 1 - 1*8]= hc->ref_index_top[list][4*mb_x + 2]; 52.955 + m->ref_cache[list][scan8[0] + 2 - 1*8]= 52.956 + m->ref_cache[list][scan8[0] + 3 - 1*8]= hc->ref_index_top[list][4*mb_x + 3]; 52.957 + }else{ 52.958 + AV_ZERO128(m->mv_cache[list][scan8[0] + 0 - 1*8]); 52.959 + AV_WN32A(&m->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); 52.960 + } 52.961 + 52.962 + if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){ 52.963 + for(i=0; i<2; i++){ 52.964 + int cache_idx = scan8[0] - 1 + i*2*8; 52.965 + if(USES_LIST(left_type, list)){ 52.966 + const int b_xy= 4*(mb_x-1) + 3; 52.967 + const int b8_x= 4*(mb_x-1) + 1; 52.968 + AV_COPY32(m->mv_cache[list][cache_idx ], hc->motion_val[list][b_xy + hc->b_stride*left_block[0+i*2]]); 52.969 + AV_COPY32(m->mv_cache[list][cache_idx+8], hc->motion_val[list][b_xy + hc->b_stride*left_block[1+i*2]]); 52.970 + m->ref_cache[list][cache_idx ]= hc->ref_index[list][b8_x + (left_block[0+i*2]&~1)]; 52.971 + m->ref_cache[list][cache_idx+8]= hc->ref_index[list][b8_x + (left_block[1+i*2]&~1)]; 52.972 + }else{ 52.973 + AV_ZERO32(m->mv_cache [list][cache_idx ]); 52.974 + AV_ZERO32(m->mv_cache [list][cache_idx+8]); 52.975 + m->ref_cache[list][cache_idx ]= 52.976 + m->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE); 52.977 + } 52.978 + } 52.979 + }else{ 52.980 + if(USES_LIST(left_type, list)){ 52.981 + const int b_x = 4*(mb_x-1) + 3; 52.982 + const int b8_x= 4*(mb_x-1) + 1; 52.983 + AV_COPY32(m->mv_cache[list][scan8[0] - 1], hc->motion_val[list][b_x + hc->b_stride*left_block[0]]); 52.984 + m->ref_cache[list][scan8[0] - 1]= hc->ref_index[list][b8_x + (left_block[0]&~1)]; 52.985 + }else{ 52.986 + AV_ZERO32(m->mv_cache [list][scan8[0] - 1]); 52.987 + m->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; 52.988 + } 52.989 + } 52.990 + 52.991 + if(USES_LIST(topright_type, list)){ 52.992 + const int b_xy= 4*(mb_x+1) + 3*hc->b_stride; 52.993 + AV_COPY32(m->mv_cache[list][scan8[0] + 4 - 1*8], hc->motion_val_top[list][b_xy]); 52.994 + m->ref_cache[list][scan8[0] + 4 - 1*8]= hc->ref_index_top[list][4*(mb_x+1) + 2]; 52.995 + }else{ 52.996 + AV_ZERO32(m->mv_cache [list][scan8[0] + 4 - 1*8]); 52.997 + m->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE; 52.998 + } 52.999 + if(m->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ 52.1000 + int topleft_partition= -1; 52.1001 + if(USES_LIST(topleft_type, list)){ 52.1002 + const int b_xy = 4*(mb_x-1) + 3 + hc->b_stride + (topleft_partition & 2*hc->b_stride); 52.1003 + const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2); 52.1004 + AV_COPY32(m->mv_cache[list][scan8[0] - 1 - 1*8], hc->motion_val_top[list][b_xy]); 52.1005 + m->ref_cache[list][scan8[0] - 1 - 1*8]= hc->ref_index_top[list][b8_x]; 52.1006 + }else{ 52.1007 + AV_ZERO32(m->mv_cache[list][scan8[0] - 1 - 1*8]); 52.1008 + m->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; 52.1009 + } 52.1010 + } 52.1011 + 52.1012 + if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) 52.1013 + continue; 52.1014 + 52.1015 + if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) { 52.1016 + m->ref_cache[list][scan8[4 ]] = 52.1017 + m->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE; 52.1018 + AV_ZERO32(m->mv_cache [list][scan8[4 ]]); 52.1019 + AV_ZERO32(m->mv_cache [list][scan8[12]]); 52.1020 + 52.1021 + 52.1022 + /* XXX beurk, Load mvd */ 52.1023 + if(USES_LIST(top_type, list)){ 52.1024 +// const int b_xy= hc->mb2br_top_xy; 52.1025 + AV_COPY64(hc->mvd_cache[list][scan8[0] + 0 - 1*8], hc->mvd_top[list][8*mb_x + 0]); 52.1026 + }else{ 52.1027 + AV_ZERO64(hc->mvd_cache[list][scan8[0] + 0 - 1*8]); 52.1028 + } 52.1029 + if(USES_LIST(left_type, list)){ 52.1030 +// const int b_xy= hc->mb2br_left_xy + 6; 52.1031 + AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 0*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[0]]); 52.1032 + AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 1*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[1]]); 52.1033 + }else{ 52.1034 + AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 0*8]); 52.1035 + AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 1*8]); 52.1036 + } 52.1037 + if(USES_LIST(left_type, list)){ 52.1038 +// const int b_xy= hc->mb2br_left_xy + 6; 52.1039 + 
AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 2*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[2]]); 52.1040 + AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 3*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[3]]); 52.1041 + }else{ 52.1042 + AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 2*8]); 52.1043 + AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 3*8]); 52.1044 + } 52.1045 + AV_ZERO16(hc->mvd_cache [list][scan8[4 ]]); 52.1046 + AV_ZERO16(hc->mvd_cache [list][scan8[12]]); 52.1047 + if(s->slice_type_nos == FF_B_TYPE){ 52.1048 + fill_rectangle(&hc->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); 52.1049 + 52.1050 + if(IS_DIRECT(top_type)){ 52.1051 + AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1)); 52.1052 + }else if(IS_8X8(top_type)){ 52.1053 + int b8_x = 4*mb_x; 52.1054 + hc->direct_cache[scan8[0] + 0 - 1*8]= hc->direct_top[b8_x + 2]; 52.1055 + hc->direct_cache[scan8[0] + 2 - 1*8]= hc->direct_top[b8_x + 3]; 52.1056 + }else{ 52.1057 + AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1)); 52.1058 + } 52.1059 + 52.1060 + if(IS_DIRECT(left_type)) 52.1061 + hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1; 52.1062 + else if(IS_8X8(left_type)) 52.1063 + hc->direct_cache[scan8[0] - 1 + 0*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)]; 52.1064 + else 52.1065 + hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1; 52.1066 + 52.1067 + if(IS_DIRECT(left_type)) 52.1068 + hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1; 52.1069 + else if(IS_8X8(left_type)) 52.1070 + hc->direct_cache[scan8[0] - 1 + 2*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)]; 52.1071 + else 52.1072 + hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1; 52.1073 + } 52.1074 + } 52.1075 + } 52.1076 + } 52.1077 + hc->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type); 52.1078 + 52.1079 + if (s->slice_type_nos == FF_B_TYPE){ 52.1080 + wait_dma_id(ED_get_mv); 52.1081 + } 52.1082 +} 52.1083 + 52.1084 +static 
int check_mv(H264Cabac_spu *hc, EDSlice_spu *s, long b_idx, long bn_idx, int mvy_limit){ 52.1085 + int v; 52.1086 + 52.1087 + v= hc->ref_cache[0][b_idx] != hc->ref_cache[0][bn_idx]; 52.1088 + if(!v && hc->ref_cache[0][b_idx]!=-1) 52.1089 + // absolute value >= 7 | ... 52.1090 + v= ((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) | 52.1091 + ((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit); 52.1092 + 52.1093 + if(s->list_count==2){ 52.1094 + if(!v) 52.1095 + v = (hc->ref_cache[1][b_idx] != hc->ref_cache[1][bn_idx]) | 52.1096 + ((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) | 52.1097 + ((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit); 52.1098 + 52.1099 + if(v){ 52.1100 + if((hc->ref_cache[0][b_idx] != hc->ref_cache[1][bn_idx]) | 52.1101 + (hc->ref_cache[1][b_idx] != hc->ref_cache[0][bn_idx])) 52.1102 + return 1; 52.1103 + return 52.1104 + ((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) | 52.1105 + ((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit) | 52.1106 + ((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) | 52.1107 + ((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit); 52.1108 + } 52.1109 + } 52.1110 + 52.1111 + return v; 52.1112 +} 52.1113 + 52.1114 +static void calc_bS_values(H264Cabac_spu *hc, EDSlice_spu *s, int mvy_limit, int dir) { 52.1115 + H264Mb *m = s->m; 52.1116 + int mb_type = m->mb_type; 52.1117 + int edge; 52.1118 + const int mbm_type = dir == 0 ? m->left_type : m->top_type; 52.1119 + 52.1120 + // how often to recheck mv-based bS when iterating between edges 52.1121 + static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1}, 52.1122 + {0,3,1,1,3,3,3,3}}; 52.1123 + const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7]; 52.1124 + const int edges = mask_edge== 3 && !(m->cbp&15) ? 
1 : 4; 52.1125 + // how often to recheck mv-based bS when iterating along each edge 52.1126 + const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); 52.1127 + 52.1128 + m->edges[dir]= edges; 52.1129 + 52.1130 + if(mbm_type){ 52.1131 + int16_t* bS=m->bS[dir][0]; 52.1132 + if( IS_INTRA(mb_type|mbm_type)) { 52.1133 + AV_WN64A(bS, 0x0004000400040004ULL); 52.1134 + } else { 52.1135 + int i; 52.1136 + int mv_done; 52.1137 + if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { 52.1138 + int b_idx= 8 + 4; 52.1139 + int bn_idx= b_idx - (dir ? 8:1); 52.1140 + 52.1141 + bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, 8 + 4, bn_idx, mvy_limit); 52.1142 + mv_done = 1; 52.1143 + } 52.1144 + else 52.1145 + mv_done = 0; 52.1146 + 52.1147 + for( i = 0; i < 4; i++ ) { 52.1148 + int x = dir == 0 ? 0 : i; 52.1149 + int y = dir == 0 ? i : 0; 52.1150 + int b_idx= 8 + 4 + x + 8*y; 52.1151 + int bn_idx= b_idx - (dir ? 8:1); 52.1152 + 52.1153 + if( hc->non_zero_count_cache[b_idx] | 52.1154 + hc->non_zero_count_cache[bn_idx] ) { 52.1155 + bS[i] = 2; 52.1156 + } 52.1157 + else if(!mv_done) 52.1158 + { 52.1159 + bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit); 52.1160 + } 52.1161 + } 52.1162 + } 52.1163 + } 52.1164 + 52.1165 + /* Calculate bS */ 52.1166 + for( edge = 1; edge < edges; edge++ ) { 52.1167 + int16_t* bS=m->bS[dir][edge]; 52.1168 + 52.1169 + if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) 52.1170 + continue; 52.1171 + 52.1172 + if( IS_INTRA(mb_type)) { 52.1173 + AV_WN64A(bS, 0x0003000300030003ULL); 52.1174 + } else { 52.1175 + int i; 52.1176 + int mv_done; 52.1177 + 52.1178 + if( edge & mask_edge ) { 52.1179 + AV_ZERO64(bS); 52.1180 + mv_done = 1; 52.1181 + } 52.1182 + else if( mask_par0 ) { 52.1183 + int b_idx= 8 + 4 + edge * (dir ? 8:1); 52.1184 + int bn_idx= b_idx - (dir ? 
8:1); 52.1185 + 52.1186 + bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, b_idx, bn_idx, mvy_limit); 52.1187 + mv_done = 1; 52.1188 + } 52.1189 + else 52.1190 + mv_done = 0; 52.1191 + 52.1192 + for( i = 0; i < 4; i++ ) { 52.1193 + int x = dir == 0 ? edge : i; 52.1194 + int y = dir == 0 ? i : edge; 52.1195 + int b_idx= 8 + 4 + x + 8*y; 52.1196 + int bn_idx= b_idx - (dir ? 8:1); 52.1197 + 52.1198 + if( hc->non_zero_count_cache[b_idx] | 52.1199 + hc->non_zero_count_cache[bn_idx] ) { 52.1200 + bS[i] = 2; 52.1201 + } 52.1202 + else if(!mv_done) 52.1203 + { 52.1204 + bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit); 52.1205 + } 52.1206 + } 52.1207 + 52.1208 + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) 52.1209 + continue; 52.1210 + } 52.1211 + 52.1212 + } 52.1213 +} 52.1214 + 52.1215 +/** 52.1216 +* 52.1217 +* @return zero if the loop filter can be skipped 52.1218 +*/ 52.1219 +static int fill_filter_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ 52.1220 + H264Mb *m = s->m; 52.1221 + const int mb_x = m->mb_x; 52.1222 + const int mb_y = m->mb_y; 52.1223 + int top_type, left_type; 52.1224 + int qp, top_qp, left_qp; 52.1225 + int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice 52.1226 + 52.1227 + m->dequant4_coeff_y = hc->dequant4_coeff[0][s->qscale][0]; 52.1228 + m->dequant4_coeff_cb = hc->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][s->chroma_qp[0]][0]; 52.1229 + m->dequant4_coeff_cr = hc->dequant4_coeff[IS_INTRA(mb_type) ? 
2:5][s->chroma_qp[1]][0]; 52.1230 + 52.1231 + m->qscale_mb_xy = qp = hc->qscale[mb_x]; 52.1232 + m->qscale_left_mb_xy = left_qp = hc->qscale[mb_x-1]; 52.1233 + m->qscale_top_mb_xy = top_qp = hc->qscale_top[mb_x]; 52.1234 + 52.1235 + //for sufficiently low qp, filtering wouldn't do anything 52.1236 + //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp 52.1237 + if(qp <= qp_thresh 52.1238 + && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh) 52.1239 + && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){ 52.1240 + m->deblock_mb = 0; 52.1241 + return 0; 52.1242 + } 52.1243 + 52.1244 + 52.1245 + m->deblock_mb = 1; 52.1246 + 52.1247 + top_type = hc->mb_type_top[mb_x] ; 52.1248 + left_type = hc->mb_type[mb_x -1]; 52.1249 + 52.1250 + m->top_type = top_type ; 52.1251 + m->left_type = left_type; 52.1252 + 52.1253 + if(IS_INTRA(mb_type)){ 52.1254 + calc_bS_values(hc, s, 4, 0); 52.1255 + calc_bS_values(hc, s, 4, 1); 52.1256 + return 1; 52.1257 + } 52.1258 + 52.1259 + AV_COPY64(&hc->non_zero_count_cache[0+8*1], &hc->non_zero_count[mb_x][ 0]); 52.1260 + AV_COPY64(&hc->non_zero_count_cache[0+8*2], &hc->non_zero_count[mb_x][ 8]); 52.1261 + AV_COPY32(&hc->non_zero_count_cache[0+8*5], &hc->non_zero_count[mb_x][16]); 52.1262 + AV_COPY32(&hc->non_zero_count_cache[4+8*3], &hc->non_zero_count[mb_x][20]); 52.1263 + AV_COPY64(&hc->non_zero_count_cache[0+8*4], &hc->non_zero_count[mb_x][24]); 52.1264 + 52.1265 + m->cbp= hc->cbp[mb_x]; 52.1266 + 52.1267 + { 52.1268 + int list; 52.1269 + for(list=0; list<s->list_count; list++){ 52.1270 + int8_t *ref; 52.1271 + int y, b_stride; 52.1272 + int16_t (*mv_dst)[2]; 52.1273 + int16_t (*mv_src)[2]; 52.1274 + 52.1275 + if(!USES_LIST(mb_type, list)){ 52.1276 + fill_rectangle( hc->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); 52.1277 + AV_WN32A(&hc->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u); 52.1278 + AV_WN32A(&hc->ref_cache[list][scan8[ 2]], 
((LIST_NOT_USED)&0xFF)*0x01010101u); 52.1279 + AV_WN32A(&hc->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u); 52.1280 + AV_WN32A(&hc->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u); 52.1281 + continue; 52.1282 + } 52.1283 + 52.1284 + ref = &hc->ref_index[list][4*mb_x]; 52.1285 + { 52.1286 + int (*ref2frm)[64] =(void *) (s->ref2frm[0] + 2); 52.1287 + AV_WN32A(&hc->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); 52.1288 + AV_WN32A(&hc->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); 52.1289 + ref += 2; 52.1290 + AV_WN32A(&hc->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); 52.1291 + AV_WN32A(&hc->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); 52.1292 + } 52.1293 + b_stride = hc->b_stride; 52.1294 + mv_dst = &hc->mv_cache[list][scan8[0]]; 52.1295 + mv_src = &hc->motion_val[list][4*mb_x]; 52.1296 + for(y=0; y<4; y++){ 52.1297 + AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride); 52.1298 + } 52.1299 + 52.1300 + } 52.1301 + } 52.1302 + 52.1303 + /* 52.1304 + 0 . T T. T T T T 52.1305 + 1 L . .L . . . . 52.1306 + 2 L . .L . . . . 52.1307 + 3 . T TL . . . . 52.1308 + 4 L . .L . . . . 52.1309 + 5 L . .. . . . . 
52.1310 + */ 52.1311 + //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) 52.1312 + if(top_type){ 52.1313 + AV_COPY32(&hc->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]); 52.1314 + } 52.1315 + 52.1316 + if(left_type){ 52.1317 + hc->non_zero_count_cache[3+8*1]= hc->non_zero_count[mb_x-1][7+0*8]; 52.1318 + hc->non_zero_count_cache[3+8*2]= hc->non_zero_count[mb_x-1][7+1*8]; 52.1319 + hc->non_zero_count_cache[3+8*3]= hc->non_zero_count[mb_x-1][7+2*8]; 52.1320 + hc->non_zero_count_cache[3+8*4]= hc->non_zero_count[mb_x-1][7+3*8]; 52.1321 + } 52.1322 + 52.1323 + if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ 52.1324 + int list; 52.1325 + for(list=0; list<s->list_count; list++){ 52.1326 + if(USES_LIST(top_type, list)){ 52.1327 + const int b_xy= 4*mb_x + 3*hc->b_stride; 52.1328 + const int b8_x= 4*mb_x + 2; 52.1329 + int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); 52.1330 + AV_COPY128(hc->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]); 52.1331 + hc->ref_cache[list][scan8[0] + 0 - 1*8]= 52.1332 + hc->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 0]]; 52.1333 + hc->ref_cache[list][scan8[0] + 2 - 1*8]= 52.1334 + hc->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 1]]; 52.1335 + }else{ 52.1336 + AV_ZERO128(hc->mv_cache[list][scan8[0] + 0 - 1*8]); 52.1337 + AV_WN32A(&hc->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u); 52.1338 + } 52.1339 + 52.1340 + if(USES_LIST(left_type, list)){ 52.1341 + const int b_x = 4*(mb_x-1) + 3; 52.1342 + const int b8_x= 4*(mb_x-1) + 1; 52.1343 + int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); 52.1344 + AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 0 ], hc->motion_val[list][b_x + hc->b_stride*0]); 52.1345 + AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 8 ], hc->motion_val[list][b_x + hc->b_stride*1]); 52.1346 + AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +16 ], 
hc->motion_val[list][b_x + hc->b_stride*2]); 52.1347 + AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +24 ], hc->motion_val[list][b_x + hc->b_stride*3]); 52.1348 + hc->ref_cache[list][scan8[0] - 1 + 0 ]= 52.1349 + hc->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*0]]; 52.1350 + hc->ref_cache[list][scan8[0] - 1 +16 ]= 52.1351 + hc->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*1]]; 52.1352 + }else{ 52.1353 + AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 0 ]); 52.1354 + AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 8 ]); 52.1355 + AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +16 ]); 52.1356 + AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +24 ]); 52.1357 + hc->ref_cache[list][scan8[0] - 1 + 0 ]= 52.1358 + hc->ref_cache[list][scan8[0] - 1 + 8 ]= 52.1359 + hc->ref_cache[list][scan8[0] - 1 + 16 ]= 52.1360 + hc->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED; 52.1361 + } 52.1362 + } 52.1363 + } 52.1364 + calc_bS_values(hc, s, 4, 0); 52.1365 + calc_bS_values(hc, s, 4, 1); 52.1366 + return 1; 52.1367 +} 52.1368 + 52.1369 + 52.1370 +/** 52.1371 +* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks. 
52.1372 +*/ 52.1373 +static int check_intra4x4_pred_mode(EDSlice_spu *s){ 52.1374 + H264Mb *m = s->m; 52.1375 + static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0}; 52.1376 + static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED}; 52.1377 + int i; 52.1378 + 52.1379 + if(!(m->top_samples_available&0x8000)){ 52.1380 + for(i=0; i<4; i++){ 52.1381 + int status= top[ m->intra4x4_pred_mode_cache[scan8[0] + i] ]; 52.1382 + if(status<0){ 52.1383 + fprintf(stderr, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y); 52.1384 + return -1; 52.1385 + } else if(status){ 52.1386 + m->intra4x4_pred_mode_cache[scan8[0] + i]= status; 52.1387 + } 52.1388 + } 52.1389 + } 52.1390 + 52.1391 + if((m->left_samples_available&0x8888)!=0x8888){ 52.1392 + static const int mask[4]={0x8000,0x2000,0x80,0x20}; 52.1393 + for(i=0; i<4; i++){ 52.1394 + if(!(m->left_samples_available&mask[i])){ 52.1395 + int status= left[ m->intra4x4_pred_mode_cache[scan8[0] + 8*i] ]; 52.1396 + if(status<0){ 52.1397 + fprintf(stderr, "left block unavailable for requested intra4x4 mode %d at %d %d, %x\n", status, m->mb_x, m->mb_y, m->left_samples_available); 52.1398 + return -1; 52.1399 + } else if(status){ 52.1400 + m->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status; 52.1401 + } 52.1402 + } 52.1403 + } 52.1404 + } 52.1405 + return 0; 52.1406 +} 52.1407 + 52.1408 +/** 52.1409 +* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks. 
52.1410 +*/ 52.1411 +static int check_intra_pred_mode(EDSlice_spu *s, int mode){ 52.1412 + H264Mb *m = s->m; 52.1413 + static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1}; 52.1414 + static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8}; 52.1415 + 52.1416 + if(mode > 6) { 52.1417 + fprintf(stderr, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y); 52.1418 + return -1; 52.1419 + } 52.1420 + 52.1421 + if(!(m->top_samples_available&0x8000)){ 52.1422 + mode= top[ mode ]; 52.1423 + if(mode<0){ 52.1424 + fprintf(stderr, "top block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y); 52.1425 + return -1; 52.1426 + } 52.1427 + } 52.1428 + 52.1429 + if((m->left_samples_available&0x8080) != 0x8080){ 52.1430 + mode= left[ mode ]; 52.1431 + if(m->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred 52.1432 + mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(m->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8); 52.1433 + } 52.1434 + if(mode<0){ 52.1435 + fprintf(stderr, "left block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y); 52.1436 + return -1; 52.1437 + } 52.1438 + } 52.1439 + return mode; 52.1440 +} 52.1441 + 52.1442 +/** 52.1443 + * gets the predicted intra4x4 prediction mode. 
52.1444 + */ 52.1445 +static inline int pred_intra_mode(EDSlice_spu *s, int n){ 52.1446 + H264Mb *m = s->m; 52.1447 + const int index8= scan8[n]; 52.1448 + const int left= m->intra4x4_pred_mode_cache[index8 - 1]; 52.1449 + const int top = m->intra4x4_pred_mode_cache[index8 - 8]; 52.1450 + const int min= FFMIN(left, top); 52.1451 + 52.1452 + if(min<0) return DC_PRED; 52.1453 + else return min; 52.1454 +} 52.1455 + 52.1456 +static void write_back_intra_pred_mode(H264Cabac_spu *hc, EDSlice_spu *s){ 52.1457 + H264Mb *m = s->m; 52.1458 + const int mb_x = m->mb_x; 52.1459 + int8_t *mode= &hc->intra4x4_pred_mode[8*mb_x]; 52.1460 + 52.1461 + AV_COPY32(mode, m->intra4x4_pred_mode_cache + 4 + 8*4); 52.1462 + mode[4]= m->intra4x4_pred_mode_cache[7+8*3]; 52.1463 + mode[5]= m->intra4x4_pred_mode_cache[7+8*2]; 52.1464 + mode[6]= m->intra4x4_pred_mode_cache[7+8*1]; 52.1465 +} 52.1466 + 52.1467 +static inline void write_back_non_zero_count(H264Cabac_spu *hc, EDSlice_spu *s){ 52.1468 + H264Mb *m = s->m; 52.1469 + const int mb_x= m->mb_x; 52.1470 + 52.1471 + AV_COPY64(&hc->non_zero_count[mb_x][ 0], &m->non_zero_count_cache[0+8*1]); 52.1472 + AV_COPY64(&hc->non_zero_count[mb_x][ 8], &m->non_zero_count_cache[0+8*2]); 52.1473 + AV_COPY32(&hc->non_zero_count[mb_x][16], &m->non_zero_count_cache[0+8*5]); 52.1474 + AV_COPY32(&hc->non_zero_count[mb_x][20], &m->non_zero_count_cache[4+8*3]); 52.1475 + AV_COPY64(&hc->non_zero_count[mb_x][24], &m->non_zero_count_cache[0+8*4]); 52.1476 +} 52.1477 + 52.1478 +static inline void write_back_motion(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ 52.1479 + H264Mb *m = s->m; 52.1480 + const int mb_x = m->mb_x; 52.1481 + int b_stride = hc->b_stride; 52.1482 + const int b_x = 4*m->mb_x; //try mb2b(8)_xy 52.1483 + const int b8_x= 4*m->mb_x; 52.1484 + int list; 52.1485 + 52.1486 + if(!USES_LIST(mb_type, 0)) 52.1487 + fill_rectangle(&hc->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1); 52.1488 + 52.1489 + for(list=0; list<s->list_count; list++){ 
52.1490 + int y; 52.1491 + int16_t (*mv_dst)[2]; 52.1492 + int16_t (*mv_src)[2]; 52.1493 + 52.1494 + if(!USES_LIST(mb_type, list)) 52.1495 + continue; 52.1496 + 52.1497 + mv_dst = &hc->motion_val[list][b_x]; 52.1498 + mv_src = &m->mv_cache[list][scan8[0]]; 52.1499 + for(y=0; y<4; y++){ 52.1500 + AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); 52.1501 + } 52.1502 + { 52.1503 + uint8_t (*mvd_dst)[2] = (void *) hc->mvd[list][8*mb_x]; 52.1504 + uint8_t (*mvd_src)[2] = &hc->mvd_cache[list][scan8[0]]; 52.1505 + if(IS_SKIP(mb_type)) 52.1506 + AV_ZERO128(mvd_dst); 52.1507 + else{ 52.1508 + AV_COPY64(mvd_dst, mvd_src + 8*3); 52.1509 + AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0); 52.1510 + AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1); 52.1511 + AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2); 52.1512 + } 52.1513 + } 52.1514 + 52.1515 + { 52.1516 + int8_t *ref_index = &hc->ref_index[list][b8_x]; 52.1517 + ref_index[0+0*2]= m->ref_cache[list][scan8[0]]; 52.1518 + ref_index[1+0*2]= m->ref_cache[list][scan8[4]]; 52.1519 + ref_index[0+1*2]= m->ref_cache[list][scan8[8]]; 52.1520 + ref_index[1+1*2]= m->ref_cache[list][scan8[12]]; 52.1521 + } 52.1522 + } 52.1523 + 52.1524 + if(s->slice_type_nos == FF_B_TYPE){ 52.1525 + if(IS_8X8(mb_type)){ 52.1526 + uint8_t *direct = &hc->direct[4*mb_x]; 52.1527 + direct[1] = m->sub_mb_type[1]>>1; 52.1528 + direct[2] = m->sub_mb_type[2]>>1; 52.1529 + direct[3] = m->sub_mb_type[3]>>1; 52.1530 + } 52.1531 + } 52.1532 +} 52.1533 + 52.1534 +static inline int get_dct8x8_allowed(EDSlice_spu *s){ 52.1535 + H264Mb *m = s->m; 52.1536 + if(s->direct_8x8_inference_flag) 52.1537 + return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL)); 52.1538 + else 52.1539 + return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL)); 52.1540 +} 52.1541 + 52.1542 +static inline int fetch_diagonal_mv(EDSlice_spu *s, const int16_t **C, int i, int list, int part_width){ 
52.1543 + H264Mb *m = s->m; 52.1544 + const int topright_ref= m->ref_cache[list][ i - 8 + part_width ]; 52.1545 + 52.1546 + if(topright_ref != PART_NOT_AVAILABLE){ 52.1547 + *C= m->mv_cache[list][ i - 8 + part_width ]; 52.1548 + return topright_ref; 52.1549 + }else{ 52.1550 + *C= m->mv_cache[list][ i - 8 - 1 ]; 52.1551 + return m->ref_cache[list][ i - 8 - 1 ]; 52.1552 + } 52.1553 +} 52.1554 + 52.1555 +/** 52.1556 + * gets the predicted MV. 52.1557 + * @param n the block index 52.1558 + * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4) 52.1559 + * @param mx the x component of the predicted motion vector 52.1560 + * @param my the y component of the predicted motion vector 52.1561 + */ 52.1562 +static inline void pred_motion(EDSlice_spu *s, int n, int part_width, int list, int ref, int * const mx, int * const my){ 52.1563 + H264Mb *m = s->m; 52.1564 + const int index8= scan8[n]; 52.1565 + const int top_ref= m->ref_cache[list][ index8 - 8 ]; 52.1566 + const int left_ref= m->ref_cache[list][ index8 - 1 ]; 52.1567 + const int16_t * const A= m->mv_cache[list][ index8 - 1 ]; 52.1568 + const int16_t * const B= m->mv_cache[list][ index8 - 8 ]; 52.1569 + const int16_t * C; 52.1570 + int diagonal_ref, match_count; 52.1571 + 52.1572 + assert(part_width==1 || part_width==2 || part_width==4); 52.1573 + 52.1574 +/* mv_cache 52.1575 + B . . A T T T T 52.1576 + U . . L . . , . 52.1577 + U . . L . . . . 52.1578 + U . . L . . , . 52.1579 + . . . L . . . . 
52.1580 +*/ 52.1581 + 52.1582 + diagonal_ref= fetch_diagonal_mv(s, &C, index8, list, part_width); 52.1583 + match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref); 52.1584 + 52.1585 + if(match_count > 1){ //most common 52.1586 + *mx= mid_pred(A[0], B[0], C[0]); 52.1587 + *my= mid_pred(A[1], B[1], C[1]); 52.1588 + }else if(match_count==1){ 52.1589 + if(left_ref==ref){ 52.1590 + *mx= A[0]; 52.1591 + *my= A[1]; 52.1592 + }else if(top_ref==ref){ 52.1593 + *mx= B[0]; 52.1594 + *my= B[1]; 52.1595 + }else{ 52.1596 + *mx= C[0]; 52.1597 + *my= C[1]; 52.1598 + } 52.1599 + }else{ 52.1600 + if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){ 52.1601 + *mx= A[0]; 52.1602 + *my= A[1]; 52.1603 + }else{ 52.1604 + *mx= mid_pred(A[0], B[0], C[0]); 52.1605 + *my= mid_pred(A[1], B[1], C[1]); 52.1606 + } 52.1607 + } 52.1608 + 52.1609 +} 52.1610 + 52.1611 +/** 52.1612 + * gets the directionally predicted 16x8 MV. 52.1613 + * @param n the block index 52.1614 + * @param mx the x component of the predicted motion vector 52.1615 + * @param my the y component of the predicted motion vector 52.1616 + */ 52.1617 +static inline void pred_16x8_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){ 52.1618 + H264Mb *m = s->m; 52.1619 + if(n==0){ 52.1620 + const int top_ref= m->ref_cache[list][ scan8[0] - 8 ]; 52.1621 + const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ]; 52.1622 + 52.1623 + if(top_ref == ref){ 52.1624 + *mx= B[0]; 52.1625 + *my= B[1]; 52.1626 + return; 52.1627 + } 52.1628 + }else{ 52.1629 + const int left_ref= m->ref_cache[list][ scan8[8] - 1 ]; 52.1630 + const int16_t * const A= m->mv_cache[list][ scan8[8] - 1 ]; 52.1631 + 52.1632 + if(left_ref == ref){ 52.1633 + *mx= A[0]; 52.1634 + *my= A[1]; 52.1635 + return; 52.1636 + } 52.1637 + } 52.1638 + 52.1639 + //RARE 52.1640 + pred_motion(s, n, 4, list, ref, mx, my); 52.1641 +} 52.1642 + 52.1643 +/** 52.1644 + * gets the 
directionally predicted 8x16 MV. 52.1645 + * @param n the block index 52.1646 + * @param mx the x component of the predicted motion vector 52.1647 + * @param my the y component of the predicted motion vector 52.1648 + */ 52.1649 +static inline void pred_8x16_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){ 52.1650 + H264Mb *m = s->m; 52.1651 + if(n==0){ 52.1652 + const int left_ref= m->ref_cache[list][ scan8[0] - 1 ]; 52.1653 + const int16_t * const A= m->mv_cache[list][ scan8[0] - 1 ]; 52.1654 + 52.1655 + if(left_ref == ref){ 52.1656 + *mx= A[0]; 52.1657 + *my= A[1]; 52.1658 + return; 52.1659 + } 52.1660 + }else{ 52.1661 + const int16_t * C; 52.1662 + int diagonal_ref; 52.1663 + 52.1664 + diagonal_ref= fetch_diagonal_mv(s, &C, scan8[4], list, 2); 52.1665 + if(diagonal_ref == ref){ 52.1666 + *mx= C[0]; 52.1667 + *my= C[1]; 52.1668 + return; 52.1669 + } 52.1670 + } 52.1671 + 52.1672 + //RARE 52.1673 + pred_motion(s, n, 2, list, ref, mx, my); 52.1674 +} 52.1675 + 52.1676 +static inline void pred_pskip_motion(EDSlice_spu *s, int * const mx, int * const my){ 52.1677 + H264Mb *m = s->m; 52.1678 + const int top_ref = m->ref_cache[0][ scan8[0] - 8 ]; 52.1679 + const int left_ref= m->ref_cache[0][ scan8[0] - 1 ]; 52.1680 + 52.1681 + if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE 52.1682 + || !( top_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 8 ])) 52.1683 + || !(left_ref | AV_RN32A(m->mv_cache[0][ scan8[0] - 1 ]))){ 52.1684 + 52.1685 + *mx = *my = 0; 52.1686 + return; 52.1687 + } 52.1688 + 52.1689 + pred_motion(s, 0, 4, 0, 0, mx, my); 52.1690 + 52.1691 + return; 52.1692 +} 52.1693 + 52.1694 +/** 52.1695 + * decodes a P_SKIP or B_SKIP macroblock 52.1696 + */ 52.1697 +static void decode_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s){ 52.1698 + H264Mb *m = s->m; 52.1699 + const int mb_x = m->mb_x; 52.1700 + int mb_type=0; 52.1701 + 52.1702 + memset(hc->non_zero_count[mb_x], 0, 32); 52.1703 + memset(m->non_zero_count_cache + 8, 0, 
8*5); //FIXME ugly, remove pfui 52.1704 + 52.1705 + if( s->slice_type_nos == FF_B_TYPE ) 52.1706 + { 52.1707 + // just for fill_caches. pred_direct_motion will set the real mb_type 52.1708 + mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; 52.1709 + fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ... 52.1710 + 52.1711 + ff_h264_pred_direct_motion(hc, s, &mb_type); 52.1712 + mb_type|= MB_TYPE_SKIP; 52.1713 + } 52.1714 + else 52.1715 + { 52.1716 + int mx, my; 52.1717 + mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; 52.1718 + 52.1719 + fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ... 52.1720 + pred_pskip_motion(s, &mx, &my); 52.1721 + fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); 52.1722 + fill_rectangle( m->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); 52.1723 + } 52.1724 + 52.1725 + write_back_motion(hc, s, mb_type); 52.1726 + hc->mb_type[mb_x]= mb_type; 52.1727 + m->mb_type = mb_type; 52.1728 + hc->qscale[mb_x]= s->qscale; 52.1729 + fill_filter_caches(hc, s, mb_type); 52.1730 +} 52.1731 + 52.1732 +static int decode_cabac_intra_mb_type(EDSlice_spu *s, CABACContext *c, int ctx_base, int intra_slice) { 52.1733 + H264Mb *m =s->m; 52.1734 + uint8_t *state= &c->cabac_state[ctx_base]; 52.1735 + int mb_type; 52.1736 + 52.1737 + if(intra_slice){ 52.1738 + int ctx=0; 52.1739 + if( m->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) 52.1740 + ctx++; 52.1741 + if( m->top_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) 52.1742 + ctx++; 52.1743 + if( get_cabac_noinline( c, &state[ctx] ) == 0 ) 52.1744 + return 0; /* I4x4 */ 52.1745 + state += 2; 52.1746 + }else{ 52.1747 + if( get_cabac_noinline( c, state ) == 0 ) 52.1748 + return 0; /* I4x4 */ 52.1749 + } 52.1750 + 52.1751 + if( get_cabac_terminate( c ) ) 52.1752 + return 25; /* PCM */ 52.1753 + 52.1754 + mb_type = 1; /* I16x16 */ 52.1755 + mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */ 52.1756 + 
if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */ 52.1757 + mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] ); 52.1758 + mb_type += 2 * get_cabac_noinline(c, &state[3+intra_slice] ); 52.1759 + mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] ); 52.1760 + return mb_type; 52.1761 +} 52.1762 + 52.1763 +static int decode_cabac_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s, H264Mb *m, CABACContext *c) { 52.1764 + int ctx = 0; 52.1765 + const int mb_x = m->mb_x; 52.1766 + 52.1767 + if( m->mb_x>0 && !IS_SKIP( hc->mb_type[mb_x-1] )) 52.1768 + ctx++; 52.1769 + if( m->mb_y>0 && !IS_SKIP( hc->mb_type_top[mb_x] )) 52.1770 + ctx++; 52.1771 + 52.1772 + if( s->slice_type_nos == FF_B_TYPE ) 52.1773 + ctx += 13; 52.1774 + return get_cabac_noinline(c, &c->cabac_state[11+ctx] ); 52.1775 +} 52.1776 + 52.1777 +static int decode_cabac_mb_intra4x4_pred_mode( CABACContext *c, int pred_mode ) { 52.1778 + int mode = 0; 52.1779 + 52.1780 + if( get_cabac(c, &c->cabac_state[68] ) ) 52.1781 + return pred_mode; 52.1782 + 52.1783 + mode += 1 * get_cabac(c, &c->cabac_state[69] ); 52.1784 + mode += 2 * get_cabac(c, &c->cabac_state[69] ); 52.1785 + mode += 4 * get_cabac(c, &c->cabac_state[69] ); 52.1786 + 52.1787 + return mode + ( mode >= pred_mode ); 52.1788 +} 52.1789 + 52.1790 +static int decode_cabac_mb_chroma_pre_mode(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) { 52.1791 + H264Mb *m = s->m; 52.1792 + const int mb_x = m->mb_x; 52.1793 + 52.1794 + int ctx = 0; 52.1795 + 52.1796 + /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */ 52.1797 + if( m->left_type && hc->chroma_pred_mode[mb_x-1] != 0 ) 52.1798 + ctx++; 52.1799 + 52.1800 + if( m->top_type && hc->chroma_pred_mode_top[mb_x] != 0 ) 52.1801 + ctx++; 52.1802 + 52.1803 + if( get_cabac_noinline(c, &c->cabac_state[64+ctx] ) == 0 ) 52.1804 + return 0; 52.1805 + 52.1806 + if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) 52.1807 + return 1; 52.1808 + if( 
get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) 52.1809 + return 2; 52.1810 + else 52.1811 + return 3; 52.1812 +} 52.1813 + 52.1814 +static int decode_cabac_mb_cbp_luma(H264Cabac_spu *hc, CABACContext *c) { 52.1815 + int cbp_b, cbp_a, ctx, cbp = 0; 52.1816 + 52.1817 + cbp_a = hc->left_cbp; 52.1818 + cbp_b = hc->top_cbp; 52.1819 + 52.1820 + ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04); 52.1821 + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]); 52.1822 + ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08); 52.1823 + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1; 52.1824 + ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01); 52.1825 + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 2; 52.1826 + ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02); 52.1827 + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3; 52.1828 + return cbp; 52.1829 +} 52.1830 +static int decode_cabac_mb_cbp_chroma(H264Cabac_spu *hc, CABACContext *c) { 52.1831 + int ctx; 52.1832 + int cbp_a, cbp_b; 52.1833 + 52.1834 + cbp_a = (hc->left_cbp>>4)&0x03; 52.1835 + cbp_b = (hc-> top_cbp>>4)&0x03; 52.1836 + 52.1837 + ctx = 0; 52.1838 + if( cbp_a > 0 ) ctx++; 52.1839 + if( cbp_b > 0 ) ctx += 2; 52.1840 + if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 ) 52.1841 + return 0; 52.1842 + 52.1843 + ctx = 4; 52.1844 + if( cbp_a == 2 ) ctx++; 52.1845 + if( cbp_b == 2 ) ctx += 2; 52.1846 + return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] ); 52.1847 +} 52.1848 + 52.1849 +static int decode_cabac_p_mb_sub_type( CABACContext *c) { 52.1850 + if( get_cabac(c, &c->cabac_state[21] ) ) 52.1851 + return 0; /* 8x8 */ 52.1852 + if( !get_cabac(c, &c->cabac_state[22] ) ) 52.1853 + return 1; /* 8x4 */ 52.1854 + if( get_cabac(c, &c->cabac_state[23] ) ) 52.1855 + return 2; /* 4x8 */ 52.1856 + return 3; /* 4x4 */ 52.1857 +} 52.1858 +static int decode_cabac_b_mb_sub_type(CABACContext *c) { 52.1859 + int type; 52.1860 + if( !get_cabac(c, &c->cabac_state[36] ) ) 52.1861 + return 0; /* B_Direct_8x8 
*/ 52.1862 + if( !get_cabac(c, &c->cabac_state[37] ) ) 52.1863 + return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */ 52.1864 + type = 3; 52.1865 + if( get_cabac(c, &c->cabac_state[38] ) ) { 52.1866 + if( get_cabac(c, &c->cabac_state[39] ) ) 52.1867 + return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */ 52.1868 + type += 4; 52.1869 + } 52.1870 + type += 2*get_cabac(c, &c->cabac_state[39] ); 52.1871 + type += get_cabac(c, &c->cabac_state[39] ); 52.1872 + return type; 52.1873 +} 52.1874 + 52.1875 +static int decode_cabac_mb_ref(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, int list, int n ) { 52.1876 + H264Mb *m = s->m; 52.1877 + int refa = m->ref_cache[list][scan8[n] - 1]; 52.1878 + int refb = m->ref_cache[list][scan8[n] - 8]; 52.1879 + int ref = 0; 52.1880 + int ctx = 0; 52.1881 + 52.1882 + if( s->slice_type_nos == FF_B_TYPE) { 52.1883 + if( refa > 0 && !(hc->direct_cache[scan8[n] - 1]&(MB_TYPE_DIRECT2>>1)) ) 52.1884 + ctx++; 52.1885 + if( refb > 0 && !(hc->direct_cache[scan8[n] - 8]&(MB_TYPE_DIRECT2>>1)) ) 52.1886 + ctx += 2; 52.1887 + } else { 52.1888 + if( refa > 0 ) 52.1889 + ctx++; 52.1890 + if( refb > 0 ) 52.1891 + ctx += 2; 52.1892 + } 52.1893 + 52.1894 + while( get_cabac(c, &c->cabac_state[54+ctx] ) ) { 52.1895 + ref++; 52.1896 + ctx = (ctx>>2)+4; 52.1897 + if(ref >= 32 /*h->ref_list[list]*/){ 52.1898 + fprintf(stderr, "refcount %d\n", ref); 52.1899 + return -1; 52.1900 + } 52.1901 + } 52.1902 + return ref; 52.1903 +} 52.1904 + 52.1905 +static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) { 52.1906 + int mvd; 52.1907 + 52.1908 + if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){ 52.1909 +// if(!get_cabac(&h->cabac, &c->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){ 52.1910 + *mvda= 0; 52.1911 + return 0; 52.1912 + } 52.1913 + 52.1914 + mvd= 1; 52.1915 + ctxbase+= 3; 52.1916 + while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) { 
52.1917 + if( mvd < 4 ) 52.1918 + ctxbase++; 52.1919 + mvd++; 52.1920 + } 52.1921 + 52.1922 + if( mvd >= 9 ) { 52.1923 + int k = 3; 52.1924 + while( get_cabac_bypass(c ) ) { 52.1925 + mvd += 1 << k; 52.1926 + k++; 52.1927 + if(k>24){ 52.1928 + fprintf(stderr, "overflow in decode_cabac_mb_mvd\n"); 52.1929 + return INT_MIN; 52.1930 + } 52.1931 + } 52.1932 + while( k-- ) { 52.1933 + mvd += get_cabac_bypass(c )<<k; 52.1934 + } 52.1935 + *mvda=mvd < 70 ? mvd : 70; 52.1936 + }else 52.1937 + *mvda=mvd; 52.1938 + return get_cabac_bypass_sign(c, -mvd ); 52.1939 +} 52.1940 + 52.1941 +#define DECODE_CABAC_MB_MVD( hc, c, list, n )\ 52.1942 +{\ 52.1943 + int amvd0 = hc->mvd_cache[list][scan8[n] - 1][0] +\ 52.1944 + hc->mvd_cache[list][scan8[n] - 8][0];\ 52.1945 + int amvd1 = hc->mvd_cache[list][scan8[n] - 1][1] +\ 52.1946 + hc->mvd_cache[list][scan8[n] - 8][1];\ 52.1947 +\ 52.1948 + mx += decode_cabac_mb_mvd( c, 40, amvd0, &mpx );\ 52.1949 + my += decode_cabac_mb_mvd( c, 47, amvd1, &mpy );\ 52.1950 +} 52.1951 + 52.1952 +static av_always_inline int get_cabac_cbf_ctx(H264Cabac_spu *hc, EDSlice_spu *s, int cat, int idx, int is_dc ) { 52.1953 + H264Mb *m = s->m; 52.1954 + int nza, nzb; 52.1955 + int ctx = 0; 52.1956 + 52.1957 + if( is_dc ) { 52.1958 + if( cat == 0 ) { 52.1959 + nza = hc->left_cbp&0x100; 52.1960 + nzb = hc-> top_cbp&0x100; 52.1961 + } else { 52.1962 + nza = (hc->left_cbp>>(6+idx))&0x01; 52.1963 + nzb = (hc-> top_cbp>>(6+idx))&0x01; 52.1964 + } 52.1965 + } else { 52.1966 + assert(cat == 1 || cat == 2 || cat == 4); 52.1967 + nza = m->non_zero_count_cache[scan8[idx] - 1]; 52.1968 + nzb = m->non_zero_count_cache[scan8[idx] - 8]; 52.1969 + } 52.1970 + 52.1971 + if( nza > 0 ) 52.1972 + ctx++; 52.1973 + 52.1974 + if( nzb > 0 ) 52.1975 + ctx += 2; 52.1976 + 52.1977 + return ctx + 4 * cat; 52.1978 +} 52.1979 + 52.1980 + uint8_t last_coeff_flag_offset_8x8[63] = { 52.1981 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 52.1982 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 52.1983 + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 52.1984 + 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 52.1985 +}; 52.1986 + 52.1987 +static const int significant_coeff_flag_offset[2][6] = { 52.1988 + { 105+0, 105+15, 105+29, 105+44, 105+47, 402 }, 52.1989 + { 277+0, 277+15, 277+29, 277+44, 277+47, 436 } 52.1990 +}; 52.1991 +static const int last_coeff_flag_offset[2][6] = { 52.1992 + { 166+0, 166+15, 166+29, 166+44, 166+47, 417 }, 52.1993 + { 338+0, 338+15, 338+29, 338+44, 338+47, 451 } 52.1994 +}; 52.1995 +static const int coeff_abs_level_m1_offset[6] = { 52.1996 + 227+0, 227+10, 227+20, 227+30, 227+39, 426 52.1997 +}; 52.1998 +static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { 52.1999 + { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, 52.2000 + 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, 52.2001 + 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11, 52.2002 + 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 }, 52.2003 + { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5, 52.2004 + 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11, 52.2005 + 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, 52.2006 + 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 } 52.2007 +}; 52.2008 +/* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). 52.2009 +* 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). 
52.2010 +* map node ctx => cabac ctx for level=1 */ 52.2011 +static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; 52.2012 +/* map node ctx => cabac ctx for level>1 */ 52.2013 +static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; 52.2014 +static const uint8_t coeff_abs_level_transition[2][8] = { 52.2015 + /* update node ctx after decoding a level=1 */ 52.2016 + { 1, 2, 3, 3, 4, 5, 6, 7 }, 52.2017 + /* update node ctx after decoding a level>1 */ 52.2018 + { 4, 4, 4, 4, 5, 6, 7, 7 } 52.2019 +}; 52.2020 + 52.2021 +static av_always_inline void decode_cabac_residual_internal(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) { 52.2022 + H264Mb *m = s->m; 52.2023 + const int mb_x = m->mb_x; 52.2024 + int index[64]; 52.2025 + 52.2026 + int av_unused last; 52.2027 + int coeff_count = 0; 52.2028 + int node_ctx = 0; 52.2029 + 52.2030 + uint8_t *significant_coeff_ctx_base; 52.2031 + uint8_t *last_coeff_ctx_base; 52.2032 + uint8_t *abs_level_m1_ctx_base; 52.2033 + 52.2034 + /* read coded block flag */ 52.2035 + if( is_dc || cat != 5 ) { 52.2036 + if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( hc, s, cat, n, is_dc ) ] ) == 0 ) { 52.2037 + if( !is_dc ) 52.2038 + m->non_zero_count_cache[scan8[n]] = 0; 52.2039 + return; 52.2040 + } 52.2041 + } 52.2042 + 52.2043 + significant_coeff_ctx_base = c->cabac_state 52.2044 + + significant_coeff_flag_offset[0][cat]; 52.2045 + last_coeff_ctx_base = c->cabac_state 52.2046 + + last_coeff_flag_offset[0][cat]; 52.2047 + abs_level_m1_ctx_base = c->cabac_state 52.2048 + + coeff_abs_level_m1_offset[cat]; 52.2049 + 52.2050 + if( !is_dc && cat == 5 ) { 52.2051 +#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \ 52.2052 + for(last= 0; last < coefs; last++) { \ 52.2053 + uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \ 52.2054 + if( get_cabac( c, sig_ctx )) { \ 52.2055 + 
uint8_t *last_ctx = last_coeff_ctx_base + last_off; \ 52.2056 + index[coeff_count++] = last; \ 52.2057 + if( get_cabac( c, last_ctx ) ) { \ 52.2058 + last= max_coeff; \ 52.2059 + break; \ 52.2060 + } \ 52.2061 + } \ 52.2062 + }\ 52.2063 + if( last == max_coeff -1 ) {\ 52.2064 + index[coeff_count++] = last;\ 52.2065 + }\ 52.2066 + 52.2067 + const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0]; 52.2068 + DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); 52.2069 + } else { 52.2070 + DECODE_SIGNIFICANCE( max_coeff - 1, last, last ); 52.2071 + } 52.2072 + assert(coeff_count > 0); 52.2073 + 52.2074 + if( is_dc ) { 52.2075 + if( cat == 0 ) 52.2076 + hc->cbp[mb_x] |= 0x100; 52.2077 + else 52.2078 + hc->cbp[mb_x] |= 0x40 << n; 52.2079 + } else { 52.2080 + if( cat == 5 ) 52.2081 + fill_rectangle(&m->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); 52.2082 + else { 52.2083 + assert( cat == 1 || cat == 2 || cat == 4 ); 52.2084 + m->non_zero_count_cache[scan8[n]] = coeff_count; 52.2085 + } 52.2086 + } 52.2087 + 52.2088 + do { 52.2089 + uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base; 52.2090 + int j= scantable[index[--coeff_count]]; 52.2091 + 52.2092 + if( get_cabac( c, ctx ) == 0 ) { 52.2093 + node_ctx = coeff_abs_level_transition[0][node_ctx]; 52.2094 + if( is_dc ) { 52.2095 + block[j] = get_cabac_bypass_sign( c, -1); 52.2096 + }else{ 52.2097 + block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6; 52.2098 + } 52.2099 + } else { 52.2100 + int coeff_abs = 2; 52.2101 + ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; 52.2102 + node_ctx = coeff_abs_level_transition[1][node_ctx]; 52.2103 + 52.2104 + while( coeff_abs < 15 && get_cabac( c, ctx ) ) { 52.2105 + coeff_abs++; 52.2106 + } 52.2107 + 52.2108 + if( coeff_abs >= 15 ) { 52.2109 + int j = 0; 52.2110 + while( get_cabac_bypass( c ) ) { 52.2111 + j++; 52.2112 + } 52.2113 + 52.2114 + coeff_abs=1; 52.2115 + while( j-- ) { 52.2116 + coeff_abs += 
coeff_abs + get_cabac_bypass( c ); 52.2117 + } 52.2118 + coeff_abs+= 14; 52.2119 + } 52.2120 + 52.2121 + if( is_dc ) { 52.2122 + block[j] = get_cabac_bypass_sign( c, -coeff_abs ); 52.2123 + }else{ 52.2124 + block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6; 52.2125 + } 52.2126 + } 52.2127 + } while( coeff_count ); 52.2128 + 52.2129 +} 52.2130 + 52.2131 +static void decode_cabac_residual_dc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { 52.2132 + decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, NULL, max_coeff, 1); 52.2133 +} 52.2134 + 52.2135 +static void decode_cabac_residual_nondc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { 52.2136 + decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, qmul, max_coeff, 0); 52.2137 +} 52.2138 + 52.2139 +/** 52.2140 + * decodes a macroblock 52.2141 + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed 52.2142 + */ 52.2143 +int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) { 52.2144 + H264Mb *m = s->m; 52.2145 + int mb_x = m->mb_x; 52.2146 + int mb_type, partition_count, cbp = 0; 52.2147 + int dct8x8_allowed= s->pps.transform_8x8_mode; 52.2148 + 52.2149 + fill_decode_neighbors(hc, s); 52.2150 + memset(m->mb, 0 , sizeof(m->mb)); 52.2151 + 52.2152 + if( s->slice_type_nos != FF_I_TYPE ) { 52.2153 + int skip; 52.2154 + /* a skipped mb needs the aff flag from the following mb */ 52.2155 + skip = decode_cabac_mb_skip( hc, s, m, c); 52.2156 + 52.2157 + /* read skip flags */ 52.2158 + if( skip ) { 52.2159 + decode_mb_skip(hc, s); 52.2160 + hc->cbp[mb_x] = m->cbp = 0; 52.2161 + hc->chroma_pred_mode[mb_x] = 0; 52.2162 + s->last_qscale_diff = 0; 52.2163 + return 0; 52.2164 + } 52.2165 + } 52.2166 + 52.2167 + if( s->slice_type_nos == FF_B_TYPE ) { 
52.2168 + int ctx = 0; 52.2169 + 52.2170 + if( !IS_DIRECT( m->left_type-1 ) ) 52.2171 + ctx++; 52.2172 + if( !IS_DIRECT( m->top_type-1 ) ) 52.2173 + ctx++; 52.2174 + 52.2175 + if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){ 52.2176 + mb_type= 0; /* B_Direct_16x16 */ 52.2177 + }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) { 52.2178 + mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */ 52.2179 + }else{ 52.2180 + int bits; 52.2181 + bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3; 52.2182 + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2; 52.2183 + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1; 52.2184 + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ); 52.2185 + if( bits < 8 ){ 52.2186 + mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */ 52.2187 + }else if( bits == 13 ){ 52.2188 + mb_type= decode_cabac_intra_mb_type(s, c, 32, 0); 52.2189 + goto decode_intra_mb; 52.2190 + }else if( bits == 14 ){ 52.2191 + mb_type= 11; /* B_L1_L0_8x16 */ 52.2192 + }else if( bits == 15 ){ 52.2193 + mb_type= 22; /* B_8x8 */ 52.2194 + }else{ 52.2195 + bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] ); 52.2196 + mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */ 52.2197 + } 52.2198 + } 52.2199 + partition_count= b_mb_type_info[mb_type].partition_count; 52.2200 + mb_type= b_mb_type_info[mb_type].type; 52.2201 + } else if( s->slice_type_nos == FF_P_TYPE ) { 52.2202 + if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) { 52.2203 + /* P-type */ 52.2204 + if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) { 52.2205 + /* P_L0_D16x16, P_8x8 */ 52.2206 + mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] ); 52.2207 + } else { 52.2208 + /* P_L0_D8x16, P_L0_D16x8 */ 52.2209 + mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] ); 52.2210 + } 52.2211 + partition_count= p_mb_type_info[mb_type].partition_count; 52.2212 + mb_type= p_mb_type_info[mb_type].type; 52.2213 + } else { 
52.2214 + mb_type= decode_cabac_intra_mb_type(s, c, 17, 0); 52.2215 + goto decode_intra_mb; 52.2216 + } 52.2217 + } else { 52.2218 + mb_type= decode_cabac_intra_mb_type(s ,c, 3, 1); 52.2219 + if(s->slice_type == FF_SI_TYPE && mb_type) 52.2220 + mb_type--; 52.2221 + assert(s->slice_type_nos == FF_I_TYPE); 52.2222 +decode_intra_mb: 52.2223 + partition_count = 0; 52.2224 + cbp= i_mb_type_info[mb_type].cbp; 52.2225 + m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode; 52.2226 + mb_type= i_mb_type_info[mb_type].type; 52.2227 + } 52.2228 + 52.2229 + if(IS_INTRA_PCM(mb_type)) { 52.2230 + uint8_t *ptr; 52.2231 + // We assume these blocks are very rare so we do not optimize it. 52.2232 + // FIXME The two following lines get the bitstream position in the cabac 52.2233 + // decode, I think it should be done by a function in cabac.h (or cabac.c). 52.2234 + ptr=c->bytestream; 52.2235 + if(c->low&0x1) ptr--; 52.2236 + if(CABAC_BITS==16){ 52.2237 + if(c->low&0x1FF) ptr--; 52.2238 + } 52.2239 + if ((unsigned) (ptr + 384) >= (unsigned) c->bytestream_end){ 52.2240 + fprintf(stderr, "Intra PCM mb crossed bytestream buffer\n Known issue."); 52.2241 + } 52.2242 + 52.2243 + // The pixels are stored in the same order as levels in h->mb array. 
52.2244 + memcpy(m->mb, ptr, 256); ptr+=256; 52.2245 + memcpy(m->mb+128, ptr, 128); ptr+=128; 52.2246 + 52.2247 + c->bytestream = ptr; 52.2248 + #if CABAC_BITS == 16 52.2249 + c->low = (*c->bytestream++)<<18; 52.2250 + c->low+= (*c->bytestream++)<<10; 52.2251 + #else 52.2252 + c->low = (*c->bytestream++)<<10; 52.2253 + #endif 52.2254 + c->low+= ((*c->bytestream++)<<2) + 2; 52.2255 + c->range= 0x1FE; 52.2256 + 52.2257 + // All blocks are present 52.2258 + hc->cbp[mb_x] = 0x1ef; 52.2259 + hc->chroma_pred_mode[mb_x] = 0; 52.2260 + // In deblocking, the quantizer is 0 52.2261 + hc->qscale[mb_x]= 0; 52.2262 + // All coeffs are present 52.2263 + memset(hc->non_zero_count[mb_x], 16, 32); 52.2264 + hc->mb_type[mb_x]= m->mb_type = mb_type; 52.2265 + s->last_qscale_diff = 0; 52.2266 + fill_filter_caches(hc, s, mb_type); 52.2267 + return 0; 52.2268 + } 52.2269 + fill_decode_caches(hc, s, mb_type); 52.2270 + 52.2271 + if( IS_INTRA( mb_type ) ) { 52.2272 + int i, pred_mode; 52.2273 + if( IS_INTRA4x4( mb_type ) ) { 52.2274 + if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] ) ) { 52.2275 + mb_type |= MB_TYPE_8x8DCT; 52.2276 + for( i = 0; i < 16; i+=4 ) { 52.2277 + int pred = pred_intra_mode( s, i ); 52.2278 + int mode = decode_cabac_mb_intra4x4_pred_mode(c, pred ); 52.2279 + fill_rectangle( &m->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 ); 52.2280 + } 52.2281 + } else { 52.2282 + for( i = 0; i < 16; i++ ) { 52.2283 + int pred = pred_intra_mode( s, i ); 52.2284 + m->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode(c, pred ); 52.2285 + } 52.2286 + } 52.2287 + write_back_intra_pred_mode(hc, s); 52.2288 + if( check_intra4x4_pred_mode(s) < 0 ) return -1; 52.2289 + } else { 52.2290 + m->intra16x16_pred_mode= check_intra_pred_mode(s, m->intra16x16_pred_mode ); 52.2291 + if( m->intra16x16_pred_mode < 0 ) return -1; 52.2292 + } 52.2293 + 52.2294 + hc->chroma_pred_mode[mb_x] = 52.2295 + pred_mode = 
decode_cabac_mb_chroma_pre_mode( hc, s, c ); 52.2296 + 52.2297 + pred_mode= check_intra_pred_mode( s, pred_mode ); 52.2298 + if( pred_mode < 0 ) return -1; 52.2299 + m->chroma_pred_mode= pred_mode; 52.2300 + 52.2301 + } else if( partition_count == 4 ) { 52.2302 + int i, j, sub_partition_count[4], list, ref[2][4]; 52.2303 + 52.2304 + if( s->slice_type_nos == FF_B_TYPE ) { 52.2305 + for( i = 0; i < 4; i++ ) { 52.2306 + m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c ); 52.2307 + sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; 52.2308 + m->sub_mb_type[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].type; 52.2309 + } 52.2310 + if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] | 52.2311 + m->sub_mb_type[2] | m->sub_mb_type[3]) ) { 52.2312 + ff_h264_pred_direct_motion(hc, s, &mb_type); 52.2313 + m->ref_cache[0][scan8[4]] = 52.2314 + m->ref_cache[1][scan8[4]] = 52.2315 + m->ref_cache[0][scan8[12]] = 52.2316 + m->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; 52.2317 + for( i = 0; i < 4; i++ ) 52.2318 + fill_rectangle( &hc->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 ); 52.2319 + } 52.2320 + } else { 52.2321 + for( i = 0; i < 4; i++ ) { 52.2322 + m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c ); 52.2323 + sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; 52.2324 + m->sub_mb_type[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].type; 52.2325 + } 52.2326 + } 52.2327 + 52.2328 + for( list = 0; list < s->list_count; list++ ) { 52.2329 + for( i = 0; i < 4; i++ ) { 52.2330 + if(IS_DIRECT(m->sub_mb_type[i])) continue; 52.2331 + if(IS_DIR(m->sub_mb_type[i], 0, list)){ 52.2332 + if( s->ref_count[list] > 1 ){ 52.2333 + ref[list][i] = decode_cabac_mb_ref(hc, s, c, list, 4*i ); 52.2334 + if(ref[list][i] >= s->ref_count[list]){ 52.2335 + fprintf(stderr, "Reference %d >= %d\n", ref[list][i], s->ref_count[list]); 52.2336 + return -1; 52.2337 + } 52.2338 + }else 52.2339 + ref[list][i] = 0; 52.2340 + } else 
{ 52.2341 + ref[list][i] = -1; 52.2342 + } 52.2343 + m->ref_cache[list][ scan8[4*i]+1 ]= 52.2344 + m->ref_cache[list][ scan8[4*i]+8 ]=m->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i]; 52.2345 + } 52.2346 + } 52.2347 + 52.2348 + if(dct8x8_allowed) 52.2349 + dct8x8_allowed = get_dct8x8_allowed(s); 52.2350 + 52.2351 + for(list=0; list<s->list_count; list++){ 52.2352 + for(i=0; i<4; i++){ 52.2353 + m->ref_cache[list][ scan8[4*i] ]=m->ref_cache[list][ scan8[4*i]+1 ]; 52.2354 + if(IS_DIRECT(m->sub_mb_type[i])){ 52.2355 + fill_rectangle(hc->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2); 52.2356 + continue; 52.2357 + } 52.2358 + 52.2359 + if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){ 52.2360 + const int sub_mb_type= m->sub_mb_type[i]; 52.2361 + const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1; 52.2362 + for(j=0; j<sub_partition_count[i]; j++){ 52.2363 + int mpx, mpy; 52.2364 + int mx, my; 52.2365 + const int index= 4*i + block_width*j; 52.2366 + int16_t (* mv_cache)[2]= &m->mv_cache[list][ scan8[index]]; 52.2367 + uint8_t (* mvd_cache)[2]= &hc->mvd_cache[list][ scan8[index]]; 52.2368 + pred_motion(s, index, block_width, list, m->ref_cache[list][ scan8[index] ], &mx, &my); 52.2369 + DECODE_CABAC_MB_MVD( hc, c, list, index) 52.2370 + 52.2371 + if(IS_SUB_8X8(sub_mb_type)){ 52.2372 + mv_cache[ 1 ][0]= 52.2373 + mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx; 52.2374 + mv_cache[ 1 ][1]= 52.2375 + mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my; 52.2376 + 52.2377 + mvd_cache[ 1 ][0]= 52.2378 + mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx; 52.2379 + mvd_cache[ 1 ][1]= 52.2380 + mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy; 52.2381 + }else if(IS_SUB_8X4(sub_mb_type)){ 52.2382 + mv_cache[ 1 ][0]= mx; 52.2383 + mv_cache[ 1 ][1]= my; 52.2384 + 52.2385 + mvd_cache[ 1 ][0]= mpx; 52.2386 + mvd_cache[ 1 ][1]= mpy; 52.2387 + }else if(IS_SUB_4X8(sub_mb_type)){ 52.2388 + mv_cache[ 8 ][0]= mx; 52.2389 + mv_cache[ 8 ][1]= my; 52.2390 + 52.2391 + mvd_cache[ 8 ][0]= 
mpx; 52.2392 + mvd_cache[ 8 ][1]= mpy; 52.2393 + } 52.2394 + mv_cache[ 0 ][0]= mx; 52.2395 + mv_cache[ 0 ][1]= my; 52.2396 + 52.2397 + mvd_cache[ 0 ][0]= mpx; 52.2398 + mvd_cache[ 0 ][1]= mpy; 52.2399 + } 52.2400 + }else{ 52.2401 + fill_rectangle(m->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); 52.2402 + fill_rectangle(hc->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2); 52.2403 + } 52.2404 + } 52.2405 + } 52.2406 + } else if( IS_DIRECT(mb_type) ) { 52.2407 + ff_h264_pred_direct_motion(hc, s, &mb_type); 52.2408 + fill_rectangle(hc->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2); 52.2409 + fill_rectangle(hc->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2); 52.2410 + dct8x8_allowed &= s->direct_8x8_inference_flag; 52.2411 + } else { 52.2412 + int list, i; 52.2413 + if(IS_16X16(mb_type)){ 52.2414 + for(list=0; list<s->list_count; list++){ 52.2415 + if(IS_DIR(mb_type, 0, list)){ 52.2416 + int ref; 52.2417 + if(s->ref_count[list] > 1){ 52.2418 + ref= decode_cabac_mb_ref(hc, s, c, list, 0); 52.2419 + if(ref >= s->ref_count[list]){ 52.2420 + fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]); 52.2421 + return -1; 52.2422 + } 52.2423 + }else 52.2424 + ref=0; 52.2425 + fill_rectangle(&m->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); 52.2426 + } 52.2427 + } 52.2428 + for(list=0; list<s->list_count; list++){ 52.2429 + if(IS_DIR(mb_type, 0, list)){ 52.2430 + int mx,my,mpx,mpy; 52.2431 + pred_motion(s, 0, 4, list, m->ref_cache[list][ scan8[0] ], &mx, &my); 52.2432 + DECODE_CABAC_MB_MVD( hc, c, list, 0) 52.2433 + 52.2434 + fill_rectangle(hc->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2); 52.2435 + fill_rectangle(m->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); 52.2436 + } 52.2437 + 52.2438 + } 52.2439 + } 52.2440 + else if(IS_16X8(mb_type)){ 52.2441 + for(list=0; list<s->list_count; list++){ 52.2442 + for(i=0; i<2; i++){ 52.2443 + if(IS_DIR(mb_type, i, list)){ 52.2444 + int ref; 52.2445 + if(s->ref_count[list] > 1){ 52.2446 + ref= 
decode_cabac_mb_ref(hc, s, c, list, 8*i ); 52.2447 + if(ref >= s->ref_count[list]){ 52.2448 + fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]); 52.2449 + return -1; 52.2450 + } 52.2451 + }else 52.2452 + ref=0; 52.2453 + fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); 52.2454 + }else 52.2455 + fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1); 52.2456 + } 52.2457 + } 52.2458 + for(list=0; list<s->list_count; list++){ 52.2459 + for(i=0; i<2; i++){ 52.2460 + if(IS_DIR(mb_type, i, list)){ 52.2461 + int mx,my,mpx,mpy; 52.2462 + pred_16x8_motion(s, 8*i, list, m->ref_cache[list][scan8[0] + 16*i], &mx, &my); 52.2463 + DECODE_CABAC_MB_MVD( hc, c, list, 8*i) 52.2464 + 52.2465 + fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2); 52.2466 + fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); 52.2467 + }else{ 52.2468 + fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2); 52.2469 + fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); 52.2470 + } 52.2471 + } 52.2472 + } 52.2473 + }else{ 52.2474 + assert(IS_8X16(mb_type)); 52.2475 + for(list=0; list<s->list_count; list++){ 52.2476 + for(i=0; i<2; i++){ 52.2477 + if(IS_DIR(mb_type, i, list)){ //FIXME optimize 52.2478 + int ref; 52.2479 + if(s->ref_count[list] > 1){ 52.2480 + ref= decode_cabac_mb_ref(hc, s, c, list, 4*i ); 52.2481 + if(ref >= s->ref_count[list]){ 52.2482 + fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]); 52.2483 + return -1; 52.2484 + } 52.2485 + }else 52.2486 + ref=0; 52.2487 + fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); 52.2488 + }else 52.2489 + fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1); 52.2490 + } 52.2491 + } 52.2492 + for(list=0; list<s->list_count; list++){ 52.2493 + for(i=0; i<2; i++){ 52.2494 + if(IS_DIR(mb_type, i, list)){ 52.2495 + int 
mx,my,mpx,mpy; 52.2496 + pred_8x16_motion( s, i*4, list, m->ref_cache[list][ scan8[0] + 2*i ], &mx, &my); 52.2497 + DECODE_CABAC_MB_MVD( hc, c, list, 4*i) 52.2498 + 52.2499 + fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2); 52.2500 + fill_rectangle(m->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); 52.2501 + }else{ 52.2502 + fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2); 52.2503 + fill_rectangle(m-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); 52.2504 + } 52.2505 + } 52.2506 + } 52.2507 + } 52.2508 + } 52.2509 + 52.2510 + if( IS_INTER( mb_type ) ) { 52.2511 + hc->chroma_pred_mode[mb_x] = 0; 52.2512 + write_back_motion( hc, s, mb_type ); 52.2513 + } 52.2514 + 52.2515 + if( !IS_INTRA16x16( mb_type ) ) { 52.2516 + cbp = decode_cabac_mb_cbp_luma( hc, c); 52.2517 + cbp |= decode_cabac_mb_cbp_chroma( hc, c ) << 4; 52.2518 + } 52.2519 + 52.2520 + hc->cbp[mb_x] = m->cbp = cbp; 52.2521 + if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) { 52.2522 + mb_type |= MB_TYPE_8x8DCT * get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] ); 52.2523 + } 52.2524 + 52.2525 + if( cbp || IS_INTRA16x16( mb_type ) ) { 52.2526 + const uint8_t *scan, *scan8x8, *dc_scan; 52.2527 + const uint32_t *qmul; 52.2528 + 52.2529 + if (s->transform_bypass && s->qscale){ 52.2530 + scan8x8= ff_zigzag_direct; 52.2531 + scan= zigzag_scan; 52.2532 + }else{ 52.2533 + scan8x8= hc->zigzag_scan8x8; 52.2534 + scan= hc->zigzag_scan; 52.2535 + } 52.2536 + dc_scan= luma_dc_zigzag_scan; 52.2537 + 52.2538 + // decode_cabac_mb_dqp 52.2539 + if(get_cabac_noinline(c, &c->cabac_state[60 + (s->last_qscale_diff != 0)])){ 52.2540 + int val = 1; 52.2541 + int ctx= 2; 52.2542 + 52.2543 + while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) { 52.2544 + ctx= 3; 52.2545 + val++; 52.2546 + if(val > 102){ //prevent infinite loop 52.2547 + fprintf(stderr, "cabac decode of qscale diff failed at %d %d (%d)\n", m->mb_x, 
m->mb_y, val); 52.2548 + return -1; 52.2549 + } 52.2550 + } 52.2551 + 52.2552 + if( val&0x01 ) 52.2553 + val= (val + 1)>>1 ; 52.2554 + else 52.2555 + val= -((val + 1)>>1); 52.2556 + s->last_qscale_diff = val; 52.2557 + s->qscale += val; 52.2558 + if(((unsigned)s->qscale) > 51){ 52.2559 + if(s->qscale<0) s->qscale+= 52; 52.2560 + else s->qscale-= 52; 52.2561 + } 52.2562 + s->chroma_qp[0] = s->pps.chroma_qp_table[0][s->qscale]; 52.2563 + s->chroma_qp[1] = s->pps.chroma_qp_table[1][s->qscale]; 52.2564 + }else 52.2565 + s->last_qscale_diff=0; 52.2566 + 52.2567 + if( IS_INTRA16x16( mb_type ) ) { 52.2568 + int i; 52.2569 + decode_cabac_residual_dc( hc, s, c, m->mb, 0, 0, dc_scan, 16); 52.2570 + 52.2571 + if( cbp&15 ) { 52.2572 + qmul = hc->dequant4_coeff[0][s->qscale]; 52.2573 + for( i = 0; i < 16; i++ ) { 52.2574 + decode_cabac_residual_nondc( hc, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15); 52.2575 + } 52.2576 + } else { 52.2577 + fill_rectangle(&m->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1); 52.2578 + } 52.2579 + } else { 52.2580 + int i8x8, i4x4; 52.2581 + for( i8x8 = 0; i8x8 < 4; i8x8++ ) { 52.2582 + if( cbp & (1<<i8x8) ) { 52.2583 + if( IS_8x8DCT(mb_type) ) { 52.2584 + decode_cabac_residual_nondc(hc, s, c, m->mb + 64*i8x8, 5, 4*i8x8, 52.2585 + scan8x8, hc->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64); 52.2586 + } else { 52.2587 + qmul = hc->dequant4_coeff[IS_INTRA( mb_type ) ? 
0:3][s->qscale]; 52.2588 + for( i4x4 = 0; i4x4 < 4; i4x4++ ) { 52.2589 + const int index = 4*i8x8 + i4x4; 52.2590 +//START_TIMER 52.2591 + decode_cabac_residual_nondc(hc, s, c, m->mb + 16*index, 2, index, scan, qmul, 16); 52.2592 +//STOP_TIMER("decode_residual") 52.2593 + } 52.2594 + } 52.2595 + } else { 52.2596 + uint8_t * const nnz= &m->non_zero_count_cache[ scan8[4*i8x8] ]; 52.2597 + nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0; 52.2598 + } 52.2599 + } 52.2600 + } 52.2601 + 52.2602 + if( cbp&0x30 ){ 52.2603 + int i; 52.2604 + for( i = 0; i < 2; i++ ) { 52.2605 + decode_cabac_residual_dc(hc, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4); 52.2606 + } 52.2607 + } 52.2608 + 52.2609 + if( cbp&0x20 ) { 52.2610 + int i, j; 52.2611 + for( i = 0; i < 2; i++ ) { 52.2612 + qmul = hc->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 0:3)][s->chroma_qp[i]]; 52.2613 + for( j = 0; j < 4; j++ ) { 52.2614 + const int index = 16 + 4 * i + j; 52.2615 + decode_cabac_residual_nondc( hc, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15); 52.2616 + } 52.2617 + } 52.2618 + } else { 52.2619 + uint8_t * const nnz= &m->non_zero_count_cache[0]; 52.2620 + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = 52.2621 + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; 52.2622 + } 52.2623 + } else { 52.2624 + uint8_t * const nnz= &m->non_zero_count_cache[0]; 52.2625 + fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1); 52.2626 + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = 52.2627 + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; 52.2628 + s->last_qscale_diff = 0; 52.2629 + } 52.2630 + hc->mb_type[mb_x]= m->mb_type = mb_type; 52.2631 + hc->qscale[mb_x]= s->qscale; 52.2632 + write_back_non_zero_count(hc, s); 52.2633 + fill_filter_caches(hc, s, mb_type); 52.2634 + 52.2635 + return 0; 52.2636 +}
53.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 53.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.h Mon Aug 27 12:09:56 2012 +0200 53.3 @@ -0,0 +1,17 @@ 53.4 +#ifndef H264_CABAC_H 53.5 +#define H264_CABAC_H 53.6 + 53.7 +#define CELL_SPE 53.8 +#include "libavcodec/avcodec.h" 53.9 +#include "h264_types_spu.h" 53.10 +#include "cabac_spu.h" 53.11 + 53.12 + 53.13 +/** 53.14 + * decodes a CABAC coded macroblock 53.15 + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed 53.16 + */ 53.17 +int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c); 53.18 +void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c); 53.19 + 53.20 +#endif
54.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 54.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_chroma_template_spu.c Mon Aug 27 12:09:56 2012 +0200 54.3 @@ -0,0 +1,355 @@ 54.4 +static void PREFIX_h264_chroma_mc8_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) { 54.5 + 54.6 + register int i; 54.7 + 54.8 + const int16_t i32ss= 32; 54.9 + const int16_t imax = 255; 54.10 + const int16_t iABCD1 = ((8 - x) * (8 - y)); 54.11 + const int16_t iABCD2 = ((x) * (8 - y)); 54.12 + const int16_t iABCD3 = ((8 - x) * (y)); 54.13 + const int16_t iABCD4 = ((x) * (y)); 54.14 + 54.15 + const vsint16_t vA = spu_splats(iABCD1); 54.16 + const vsint16_t vB = spu_splats(iABCD2); 54.17 + const vsint16_t vC = spu_splats(iABCD3); 54.18 + const vsint16_t vD = spu_splats(iABCD4); 54.19 + const vsint32_t vzero = spu_splats(0); 54.20 + const vsint16_t v32ss = spu_splats(i32ss); 54.21 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 54.22 + vuint16_t sat; 54.23 + 54.24 + const int shift_src =(unsigned int) src & 15; 54.25 + const int shift_dst =(unsigned int) dst & 15; 54.26 + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; 54.27 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 54.28 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 54.29 + const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 54.30 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 54.31 + vuint8_t dstmask; 54.32 + 54.33 + if(shift_dst==0) 54.34 + dstmask=dstmask0; 54.35 + else 54.36 + dstmask=dstmask8; 54.37 + 54.38 + vuint8_t vsrc0uc1; 54.39 + vuint8_t vsrc0uc2; 54.40 + vuint8_t vsrc0uc; 54.41 + vuint8_t vsrc1uc; 54.42 + vsrc0uc1 = *(vuint8_t *)(src); 54.43 + vsrc0uc2 = *(vuint8_t *)(src+16); 54.44 + vsrc0uc = 
spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16)); 54.45 + vsrc1uc = spu_slqwbyte(vsrc0uc, 1); 54.46 + 54.47 + vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh); 54.48 + vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh); 54.49 + 54.50 + for (i = 0 ; i < h ; i++) { 54.51 + 54.52 + vuint8_t vsrc2uc1; 54.53 + vuint8_t vsrc2uc2; 54.54 + vuint8_t vsrc2uc; 54.55 + vuint8_t vsrc3uc; 54.56 + vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C); 54.57 + vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16); 54.58 + vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16)); 54.59 + vsrc3uc = spu_slqwbyte(vsrc2uc, 1); 54.60 + 54.61 + vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh); 54.62 + vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh); 54.63 + 54.64 + vsint16_t psum; 54.65 + 54.66 + vsint32_t psum1 = spu_mule(vsrc0ssH, vA); 54.67 + vsint32_t psum2 = spu_mulo(vsrc0ssH, vA); 54.68 + psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.69 + 54.70 + psum1 = spu_mule(vsrc1ssH, vB); 54.71 + psum2 = spu_mulo(vsrc1ssH, vB); 54.72 + vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.73 + psum = spu_add(psum3, psum); 54.74 + 54.75 + psum1 = spu_mule(vsrc2ssH, vC); 54.76 + psum2 = spu_mulo(vsrc2ssH, vC); 54.77 + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.78 + psum = spu_add(psum3, psum); 54.79 + 54.80 + psum1 = spu_mule(vsrc3ssH, vD); 54.81 + psum2 = spu_mulo(vsrc3ssH, vD); 54.82 + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.83 + psum = spu_add(psum3, psum); 54.84 + 54.85 + psum = spu_add(v32ss, psum); 54.86 + psum = spu_rlmask(psum, -6); 54.87 + 54.88 + //Saturation from 0 to 255 54.89 + sat = spu_cmpgt(psum,(vsint16_t)vzero); 54.90 + psum = spu_and(psum,(vsint16_t)sat); 54.91 + sat = spu_cmpgt(psum,vmax); 54.92 + psum = 
spu_sel(psum,vmax,sat); 54.93 + 54.94 + const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu); 54.95 + 54.96 + const vuint8_t dst1 = *(vuint8_t *)dst; 54.97 + 54.98 + const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask); 54.99 + vuint8_t fsum; 54.100 + OP_U8_SPU(fsum, dsum, dst1); 54.101 + 54.102 + *(vuint8_t *)dst=fsum; 54.103 + 54.104 + vsrc0ssH = vsrc2ssH; 54.105 + vsrc1ssH = vsrc3ssH; 54.106 + 54.107 + dst += dst_stride; 54.108 + //src += src_stride; 54.109 + src += STRIDE_C; 54.110 + } 54.111 +} 54.112 + 54.113 +static void PREFIX_h264_chroma_mc4_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) { 54.114 + 54.115 + register int i; 54.116 + 54.117 + const int16_t i32ss= 32; 54.118 + const int16_t imax = 255; 54.119 + const int16_t iABCD1 = ((8 - x) * (8 - y)); 54.120 + const int16_t iABCD2 = ((x) * (8 - y)); 54.121 + const int16_t iABCD3 = ((8 - x) * (y)); 54.122 + const int16_t iABCD4 = ((x) * (y)); 54.123 + 54.124 + const vsint16_t vA = spu_splats(iABCD1); 54.125 + const vsint16_t vB = spu_splats(iABCD2); 54.126 + const vsint16_t vC = spu_splats(iABCD3); 54.127 + const vsint16_t vD = spu_splats(iABCD4); 54.128 + const vsint32_t vzero = spu_splats(0); 54.129 + const vsint16_t v32ss = spu_splats(i32ss); 54.130 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 54.131 + vuint16_t sat; 54.132 + 54.133 + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; 54.134 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 54.135 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 54.136 + 54.137 + const int shift_src = (unsigned int) src & 15; 54.138 + const int shift_dst = (unsigned int) dst & 15; 54.139 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 54.140 + const vuint8_t dstmask0= 
{0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 54.141 + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 54.142 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 54.143 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 54.144 + 54.145 + switch(shift_dst){ 54.146 + case 0: dstmask = dstmask0; 54.147 + break; 54.148 + case 4: dstmask = dstmask4; 54.149 + break; 54.150 + case 8: dstmask = dstmask8; 54.151 + break; 54.152 + case 12: dstmask = dstmask12; 54.153 + break; 54.154 + } 54.155 + 54.156 + vuint8_t vsrc0uc1; 54.157 + vuint8_t vsrc0uc2; 54.158 + vuint8_t vsrc0uc; 54.159 + vuint8_t vsrc1uc; 54.160 + vsrc0uc1 = *(vuint8_t *)(src); 54.161 + vsrc0uc2 = *(vuint8_t *)(src+16); 54.162 + vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16)); 54.163 + vsrc1uc = spu_slqwbyte(vsrc0uc, 1); 54.164 + 54.165 + vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh); 54.166 + vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh); 54.167 + 54.168 + for (i = 0 ; i < h ; i++) { 54.169 + 54.170 + vuint8_t vsrc2uc1; 54.171 + vuint8_t vsrc2uc2; 54.172 + vuint8_t vsrc2uc; 54.173 + vuint8_t vsrc3uc; 54.174 + vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C); 54.175 + vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16); 54.176 + vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16)); 54.177 + vsrc3uc = spu_slqwbyte(vsrc2uc, 1); 54.178 + 54.179 + vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh); 54.180 + vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh); 54.181 + 54.182 + vsint16_t psum; 54.183 + 54.184 + vsint32_t psum1 = spu_mule(vsrc0ssH, vA); 54.185 + vsint32_t psum2 = spu_mulo(vsrc0ssH, vA); 54.186 + psum = 
(vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.187 + 54.188 + psum1 = spu_mule(vsrc1ssH, vB); 54.189 + psum2 = spu_mulo(vsrc1ssH, vB); 54.190 + vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.191 + psum = spu_add(psum3, psum); 54.192 + 54.193 + psum1 = spu_mule(vsrc2ssH, vC); 54.194 + psum2 = spu_mulo(vsrc2ssH, vC); 54.195 + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.196 + psum = spu_add(psum3, psum); 54.197 + 54.198 + psum1 = spu_mule(vsrc3ssH, vD); 54.199 + psum2 = spu_mulo(vsrc3ssH, vD); 54.200 + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.201 + psum = spu_add(psum3, psum); 54.202 + 54.203 + psum = spu_add(v32ss, psum); 54.204 + psum = spu_rlmask(psum, -6); 54.205 + 54.206 + //Saturation from 0 to 255 54.207 + sat = spu_cmpgt(psum,(vsint16_t)vzero); 54.208 + psum = spu_and(psum,(vsint16_t)sat); 54.209 + sat = spu_cmpgt(psum,vmax); 54.210 + psum = spu_sel(psum,vmax,sat); 54.211 + 54.212 + const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu); 54.213 + 54.214 + const vuint8_t dst1 = *(vuint8_t *)dst; 54.215 + 54.216 + const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask); 54.217 + vuint8_t fsum; 54.218 + OP_U8_SPU(fsum, dsum, dst1); 54.219 + 54.220 + *(vuint8_t *)dst=fsum; 54.221 + 54.222 + vsrc0ssH = vsrc2ssH; 54.223 + vsrc1ssH = vsrc3ssH; 54.224 + 54.225 + dst += dst_stride; 54.226 + src += STRIDE_C; 54.227 + } 54.228 +} 54.229 + 54.230 +static void PREFIX_h264_chroma_mc2_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) { 54.231 + 54.232 + register int i; 54.233 + 54.234 + const int16_t i32ss= 32; 54.235 + const int16_t imax = 255; 54.236 + const int16_t iABCD1 = ((8 - x) * (8 - y)); 54.237 + const int16_t iABCD2 = ((x) * (8 - y)); 54.238 + const int16_t iABCD3 = ((8 - x) * (y)); 54.239 + const int16_t iABCD4 = ((x) * (y)); 54.240 + 54.241 + const vsint16_t vA = spu_splats(iABCD1); 54.242 + 
const vsint16_t vB = spu_splats(iABCD2); 54.243 + const vsint16_t vC = spu_splats(iABCD3); 54.244 + const vsint16_t vD = spu_splats(iABCD4); 54.245 + const vsint32_t vzero = spu_splats(0); 54.246 + const vsint16_t v32ss = spu_splats(i32ss); 54.247 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 54.248 + vuint16_t sat; 54.249 + 54.250 + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; 54.251 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 54.252 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 54.253 + 54.254 + const int shift_src = (unsigned int) src & 15; 54.255 + const int shift_dst = (unsigned int) dst & 15; 54.256 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 54.257 + const vuint8_t dstmask0= {0x10,0x11,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 54.258 + const vuint8_t dstmask2= {0x00,0x01,0x10,0x11,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 54.259 + const vuint8_t dstmask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 54.260 + const vuint8_t dstmask6= {0x00,0x01,0x02,0x03,0x04,0x05,0x10,0x11,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 54.261 + const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 54.262 + const vuint8_t dstmask10= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x10,0x11,0x0C,0x0D,0x0E,0x0F}; 54.263 + const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x0E,0x0F}; 54.264 + const vuint8_t dstmask14= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x10,0x11}; 54.265 + 54.266 + switch(shift_dst){ 54.267 + case 0: dstmask = dstmask0; 54.268 + break; 54.269 + case 2: dstmask = dstmask2; 54.270 + break; 54.271 + case 4: dstmask = dstmask4; 
54.272 + break; 54.273 + case 6: dstmask = dstmask6; 54.274 + break; 54.275 + case 8: dstmask = dstmask8; 54.276 + break; 54.277 + case 10: dstmask = dstmask10; 54.278 + break; 54.279 + case 12: dstmask = dstmask12; 54.280 + break; 54.281 + case 14: dstmask = dstmask14; 54.282 + break; 54.283 + } 54.284 + 54.285 + vuint8_t vsrc0uc1; 54.286 + vuint8_t vsrc0uc2; 54.287 + vuint8_t vsrc0uc; 54.288 + vuint8_t vsrc1uc; 54.289 + vsrc0uc1 = *(vuint8_t *)(src); 54.290 + vsrc0uc2 = *(vuint8_t *)(src+16); 54.291 + vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16)); 54.292 + vsrc1uc = spu_slqwbyte(vsrc0uc, 1); 54.293 + 54.294 + vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh); 54.295 + vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh); 54.296 + 54.297 + for (i = 0 ; i < h ; i++) { 54.298 + 54.299 + vuint8_t vsrc2uc1; 54.300 + vuint8_t vsrc2uc2; 54.301 + vuint8_t vsrc2uc; 54.302 + vuint8_t vsrc3uc; 54.303 + vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C); 54.304 + vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16); 54.305 + vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16)); 54.306 + vsrc3uc = spu_slqwbyte(vsrc2uc, 1); 54.307 + 54.308 + vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh); 54.309 + vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh); 54.310 + 54.311 + vsint16_t psum; 54.312 + 54.313 + vsint32_t psum1 = spu_mule(vsrc0ssH, vA); 54.314 + vsint32_t psum2 = spu_mulo(vsrc0ssH, vA); 54.315 + psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.316 + 54.317 + psum1 = spu_mule(vsrc1ssH, vB); 54.318 + psum2 = spu_mulo(vsrc1ssH, vB); 54.319 + vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.320 + psum = spu_add(psum3, psum); 54.321 + 54.322 + psum1 = spu_mule(vsrc2ssH, vC); 54.323 + psum2 = spu_mulo(vsrc2ssH, vC); 54.324 + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, 
(vsint16_t)psum2, mez); 54.325 + psum = spu_add(psum3, psum); 54.326 + 54.327 + psum1 = spu_mule(vsrc3ssH, vD); 54.328 + psum2 = spu_mulo(vsrc3ssH, vD); 54.329 + psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez); 54.330 + psum = spu_add(psum3, psum); 54.331 + 54.332 + psum = spu_add(v32ss, psum); 54.333 + psum = spu_rlmask(psum, -6); 54.334 + 54.335 + //Saturation from 0 to 255 54.336 + sat = spu_cmpgt(psum,(vsint16_t)vzero); 54.337 + psum = spu_and(psum,(vsint16_t)sat); 54.338 + sat = spu_cmpgt(psum,vmax); 54.339 + psum = spu_sel(psum,vmax,sat); 54.340 + 54.341 + const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu); 54.342 + 54.343 + const vuint8_t dst1 = *(vuint8_t *)dst; 54.344 + 54.345 + const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask); 54.346 + vuint8_t fsum; 54.347 + OP_U8_SPU(fsum, dsum, dst1); 54.348 + 54.349 + *(vuint8_t *)dst=fsum; 54.350 + 54.351 + vsrc0ssH = vsrc2ssH; 54.352 + vsrc1ssH = vsrc3ssH; 54.353 + 54.354 + dst += dst_stride; 54.355 + src += STRIDE_C; 54.356 + } 54.357 +} 54.358 +
55.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 55.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.c Mon Aug 27 12:09:56 2012 +0200 55.3 @@ -0,0 +1,266 @@ 55.4 +/* 55.5 + * Copyright (c) 2009 TUDelft 55.6 + * 55.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 55.8 + */ 55.9 + 55.10 +/** 55.11 + * @file libavcodec/cell/spu/h264_main_spu.c 55.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding 55.13 + * @author C C Chi <c.c.chi@student.tudelft.nl> 55.14 + * 55.15 + * SIMD kernels 55.16 + * H.264/AVC motion compensation 55.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu> 55.18 + * @author Albert Paradis <apar7632@hotmail.com> 55.19 + */ 55.20 + 55.21 +#include "h264_deblock_spu.h" 55.22 +#include "h264_decode_mb_spu.h" 55.23 + 55.24 +extern int print_debug; 55.25 + 55.26 +static void filter_mb_edgev( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { 55.27 + H264slice *s= h->s; 55.28 + const int index_a = qp + s->slice_alpha_c0_offset; 55.29 + const int alpha = alpha_table[index_a]; 55.30 + const int beta = beta_table[qp + s->slice_beta_offset]; 55.31 + if (alpha ==0 || beta == 0) return; 55.32 + 55.33 + if( bS[0] < 4 ) { 55.34 + int8_t tc[4]; 55.35 + tc[0] = tc0_table[index_a][bS[0]]; 55.36 + tc[1] = tc0_table[index_a][bS[1]]; 55.37 + tc[2] = tc0_table[index_a][bS[2]]; 55.38 + tc[3] = tc0_table[index_a][bS[3]]; 55.39 + 55.40 + h->dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc); 55.41 + } else { 55.42 + h->dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta); 55.43 + } 55.44 +} 55.45 + 55.46 +static void filter_mb_edgecv( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { 55.47 + H264slice *s= h->s; 55.48 + const int index_a = qp + s->slice_alpha_c0_offset; 55.49 + const int alpha = alpha_table[index_a]; 55.50 + const int beta = beta_table[qp + s->slice_beta_offset]; 55.51 + if (alpha ==0 || beta == 0) return; 55.52 + 55.53 + if( bS[0] < 4 ) { 55.54 + int8_t tc[4]; 55.55 + 55.56 + 
tc[0] = tc0_table[index_a][bS[0]]+1; 55.57 + tc[1] = tc0_table[index_a][bS[1]]+1; 55.58 + tc[2] = tc0_table[index_a][bS[2]]+1; 55.59 + tc[3] = tc0_table[index_a][bS[3]]+1; 55.60 + 55.61 + h->dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc); 55.62 + } else { 55.63 + h->dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta); 55.64 + } 55.65 +} 55.66 + 55.67 +static void filter_mb_edgeh( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { 55.68 + H264slice *s= h->s; 55.69 + const int index_a = qp + s->slice_alpha_c0_offset; 55.70 + const int alpha = alpha_table[index_a]; 55.71 + const int beta = beta_table[qp + s->slice_beta_offset]; 55.72 + if (alpha ==0 || beta == 0) return; 55.73 + 55.74 + if( bS[0] < 4 ) { 55.75 + int8_t tc[4]; 55.76 + 55.77 + tc[0] = tc0_table[index_a][bS[0]]; 55.78 + tc[1] = tc0_table[index_a][bS[1]]; 55.79 + tc[2] = tc0_table[index_a][bS[2]]; 55.80 + tc[3] = tc0_table[index_a][bS[3]]; 55.81 + 55.82 + h->dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc); 55.83 + } else { 55.84 + h->dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta); 55.85 + } 55.86 +} 55.87 + 55.88 +static void filter_mb_edgech( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { 55.89 + H264slice *s= h->s; 55.90 + const int index_a = qp + s->slice_alpha_c0_offset; 55.91 + const int alpha = alpha_table[index_a]; 55.92 + const int beta = beta_table[qp + s->slice_beta_offset]; 55.93 + if (alpha ==0 || beta == 0) return; 55.94 + 55.95 + if( bS[0] < 4 ) { 55.96 + int8_t tc[4]; 55.97 + 55.98 + tc[0] = tc0_table[index_a][bS[0]]+1; 55.99 + tc[1] = tc0_table[index_a][bS[1]]+1; 55.100 + tc[2] = tc0_table[index_a][bS[2]]+1; 55.101 + tc[3] = tc0_table[index_a][bS[3]]+1; 55.102 + 55.103 + h->dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc); 55.104 + } else { 55.105 + h->dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta); 55.106 + } 55.107 +} 55.108 + 55.109 +static void 
filter_mb_dir(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int dir) { 55.110 + H264Mb *mb = h->mb; 55.111 + H264slice *s = h->s; 55.112 + const int qp_xy= mb->qscale_mb_xy; 55.113 + const int qp_dir = dir == 0 ? mb->qscale_left_mb_xy : mb->qscale_top_mb_xy; 55.114 + const int mbm_type = dir == 0 ? mb->left_type : mb->top_type; 55.115 + const int mb_type = mb->mb_type; 55.116 + int edge; 55.117 + const int edges = mb->edges[dir]; 55.118 + //int (*ref2frm)[64] = s->ref2frm; 55.119 + 55.120 +// int start;//= h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0; 55.121 +// 55.122 +// const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) 55.123 +// == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4; 55.124 +// // how often to recheck mv-based bS when iterating between edges 55.125 +// const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 : 55.126 +// (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0; 55.127 +// // how often to recheck mv-based bS when iterating along each edge 55.128 +// const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); 55.129 + 55.130 +// if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0)) 55.131 +// start =1; 55.132 +// else 55.133 +// start =0; 55.134 +// 55.135 +// /* Calculate bS */ 55.136 +// for( edge = start; edge < edges; edge++ ) { 55.137 +// const int mbn_type = edge > 0 ? mb_type : mbm_type; 55.138 +// const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm; 55.139 +// int (*ref2frmn)[64] = ref2frm;//edge > 0 ? 
ref2frm : ref2frmm; 55.140 +// int16_t bS[4]; 55.141 +// int qp; 55.142 +// 55.143 +// if( (edge&1) && IS_8x8DCT(mb_type) ) 55.144 +// continue; 55.145 +// 55.146 +// if( IS_INTRA(mb_type) || 55.147 +// IS_INTRA(mbn_type) ) { 55.148 +// int value; 55.149 +// 55.150 +// if (edge == 0) { 55.151 +// value = 4; 55.152 +// } else { 55.153 +// value = 3; 55.154 +// } 55.155 +// bS[0] = bS[1] = bS[2] = bS[3] = value; 55.156 +// } else { 55.157 +// int i, l; 55.158 +// int mv_done; 55.159 +// 55.160 +// if( edge & mask_edge ) { 55.161 +// 55.162 +// bS[0] = bS[1] = bS[2] = bS[3] = 0; 55.163 +// mv_done = 1; 55.164 +// } 55.165 +// else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { 55.166 +// int b_idx= 8 + 4 + edge * (dir ? 8:1); 55.167 +// int bn_idx= b_idx - (dir ? 8:1); 55.168 +// int v = 0; 55.169 +// 55.170 +// for( l = 0; !v && l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) { 55.171 +// v |= ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] || 55.172 +// FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || 55.173 +// FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit; 55.174 +// } 55.175 +// bS[0] = bS[1] = bS[2] = bS[3] = v; 55.176 +// 55.177 +// mv_done = 1; 55.178 +// } 55.179 +// else 55.180 +// mv_done = 0; 55.181 +// 55.182 +// for( i = 0; i < 4; i++ ) { 55.183 +// int x = dir == 0 ? edge : i; 55.184 +// int y = dir == 0 ? i : edge; 55.185 +// int b_idx= 8 + 4 + x + 8*y; 55.186 +// int bn_idx= b_idx - (dir ? 
8:1); 55.187 +// 55.188 +// if( mb->non_zero_count_cache[b_idx] | 55.189 +// mb->non_zero_count_cache[bn_idx] ) { 55.190 +// bS[i] = 2; 55.191 +// } 55.192 +// else if(!mv_done) 55.193 +// { 55.194 +// bS[i] = 0; 55.195 +// for( l = 0; l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) { 55.196 +// if( ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] || 55.197 +// FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || 55.198 +// FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) { 55.199 +// bS[i] = 1; 55.200 +// break; 55.201 +// } 55.202 +// } 55.203 +// } 55.204 +// } 55.205 +// 55.206 +// if(bS[0]+bS[1]+bS[2]+bS[3] == 0) 55.207 +// continue; 55.208 +// } 55.209 +// qp = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1; 55.210 + 55.211 + if(mbm_type){ 55.212 + int16_t* bS=mb->bS[dir][0]; 55.213 + /* Filter edge */ 55.214 + // Do not use s->qscale as luma quantizer because it has not the same 55.215 + // value in IPCM macroblocks. 
55.216 + if(bS[0]+bS[1]+bS[2]+bS[3]){ 55.217 + int qp = ( qp_xy + qp_dir + 1 ) >> 1; 55.218 + if( dir == 0 ) { 55.219 + filter_mb_edgev(h, &img_y[0], linesize, bS, qp); 55.220 + { 55.221 + int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; 55.222 + filter_mb_edgecv(h, &img_cb[0], uvlinesize, bS, qp); 55.223 + filter_mb_edgecv(h, &img_cr[0], uvlinesize, bS, qp); 55.224 + } 55.225 + } else { 55.226 + filter_mb_edgeh(h, &img_y[0], linesize, bS, qp); 55.227 + { 55.228 + int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; 55.229 + filter_mb_edgech(h, &img_cb[0], uvlinesize, bS, qp); 55.230 + filter_mb_edgech(h, &img_cr[0], uvlinesize, bS, qp); 55.231 + } 55.232 + } 55.233 + } 55.234 + } 55.235 + 55.236 + for( edge = 1; edge < edges; edge++ ) { 55.237 + int16_t* bS=mb->bS[dir][edge]; 55.238 + int qp = qp_xy; 55.239 + 55.240 + if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) 55.241 + continue; 55.242 + 55.243 + /* Filter edge */ 55.244 + // Do not use s->qscale as luma quantizer because it has not the same 55.245 + // value in IPCM macroblocks. 
55.246 + 55.247 + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) 55.248 + continue; 55.249 + 55.250 + if( dir == 0 ) { 55.251 + filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp ); 55.252 + if( (edge&1) == 0 ) { 55.253 + filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) ); 55.254 + filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) ); 55.255 + } 55.256 + } else { 55.257 + filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp ); 55.258 + if( (edge&1) == 0 ) { 55.259 + filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) ); 55.260 + filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) ); 55.261 + } 55.262 + } 55.263 + } 55.264 +} 55.265 + 55.266 +void filter_mb( H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) { 55.267 + filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 0); 55.268 + filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 1); 55.269 +}
56.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 56.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.h Mon Aug 27 12:09:56 2012 +0200 56.3 @@ -0,0 +1,80 @@ 56.4 +#ifndef H264_FILTER_SPU_H 56.5 +#define H264_FILTER_SPU_H 56.6 + 56.7 +#include "types_spu.h" 56.8 +#include "h264_decode_mb_spu.h" 56.9 + 56.10 +#define FFABS(a) ((a) >= 0 ? (a) : (-(a))) 56.11 + 56.12 +void filter_mb(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize); 56.13 + 56.14 +/* Deblocking filter (p153) */ 56.15 +static const uint8_t alpha_table[52*3] = { 56.16 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.17 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.18 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.19 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.20 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.21 + 0, 0, 0, 0, 0, 0, 4, 4, 5, 6, 56.22 + 7, 8, 9, 10, 12, 13, 15, 17, 20, 22, 56.23 + 25, 28, 32, 36, 40, 45, 50, 56, 63, 71, 56.24 + 80, 90,101,113,127,144,162,182,203,226, 56.25 + 255,255, 56.26 + 255,255,255,255,255,255,255,255,255,255,255,255,255, 56.27 + 255,255,255,255,255,255,255,255,255,255,255,255,255, 56.28 + 255,255,255,255,255,255,255,255,255,255,255,255,255, 56.29 + 255,255,255,255,255,255,255,255,255,255,255,255,255, 56.30 +}; 56.31 + 56.32 +static const uint8_t beta_table[52*3] = { 56.33 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.34 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.35 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.36 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.37 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56.38 + 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 56.39 + 3, 3, 3, 4, 4, 4, 6, 6, 7, 7, 56.40 + 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 56.41 + 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 56.42 + 18, 18, 56.43 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 56.44 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 56.45 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 56.46 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 56.47 
+}; 56.48 + 56.49 +static const uint8_t tc0_table[52*3][4] = { 56.50 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.51 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.52 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.53 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.54 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.55 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.56 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.57 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.58 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.59 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.60 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 56.61 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, 56.62 + {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 }, 56.63 + {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, 56.64 + {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 }, 56.65 + {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 }, 56.66 + {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 }, 56.67 + {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 }, 56.68 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 56.69 + {-1,13,17,25 }, 
{-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 56.70 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 56.71 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 56.72 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 56.73 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 56.74 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 56.75 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 56.76 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 56.77 +}; 56.78 + 56.79 +static inline int get_chroma_qp(H264slice *s, int t, int qscale){ 56.80 + return s->chroma_qp_table[t][qscale]; 56.81 +} 56.82 + 56.83 +#endif
57.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 57.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.c Mon Aug 27 12:09:56 2012 +0200 57.3 @@ -0,0 +1,725 @@ 57.4 +/* 57.5 + * Copyright (c) 2009 TUDelft 57.6 + * 57.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 57.8 + */ 57.9 + 57.10 +/** 57.11 + * @file libavcodec/cell/h264_decode_mb_spu.c 57.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding 57.13 + * @author C C Chi <c.c.chi@student.tudelft.nl> 57.14 + * 57.15 + * SIMD kernels 57.16 + * H.264/AVC motion compensation 57.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu> 57.18 + * @author Albert Paradis <apar7632@hotmail.com> 57.19 + */ 57.20 + 57.21 +#include <stdio.h> 57.22 +#include <string.h> 57.23 +#include <spu_intrinsics.h> 57.24 +//#include "dsputil_cell.h" 57.25 +#include "types_spu.h" 57.26 +#include "h264_tables.h" 57.27 +#include "h264_dma.h" 57.28 +#include "h264_mc_spu.h" 57.29 +#include "h264_intra_spu.h" 57.30 +#include "h264_decode_mb_spu.h" 57.31 +#include "h264_deblock_spu.h" 57.32 + 57.33 +//border buffers 57.34 +DECLARE_ALIGNED_16(TopBorder, top_ls[240]); 57.35 +LeftBorder left_ls; 57.36 + 57.37 +//mb line buffer - statically allocated for up to 1920 width video 57.38 +DECLARE_ALIGNED_16(uint8_t, dest_y_ls[2*16*20]); 57.39 +DECLARE_ALIGNED_16(uint8_t, dest_cb_ls[2*8*10]); 57.40 +DECLARE_ALIGNED_16(uint8_t, dest_cr_ls[2*8*10]); 57.41 + 57.42 +//dma transfer buffer 57.43 +DECLARE_ALIGNED_16(uint8_t, dma_y_ls [64*(32+20)]); //EDGE_WIDTH = 32 57.44 +DECLARE_ALIGNED_16(uint8_t, dma_cb_ls[32*(16+10)]); 57.45 +DECLARE_ALIGNED_16(uint8_t, dma_cr_ls[32*(16+10)]); 57.46 + 57.47 +DECLARE_ALIGNED_16(uint8_t, extra_edge_y [32*(32+20)]); //EDGE_WIDTH = 32 57.48 +DECLARE_ALIGNED_16(uint8_t, extra_edge_cr[16*(16+10)]); 57.49 +DECLARE_ALIGNED_16(uint8_t, extra_edge_cb[16*(16+10)]); 57.50 + 57.51 + 57.52 +// For intra mode 57.53 +/// for now do the extra copy before dma, but it's better to skip this and do the dma right away 57.54 
+static void backup_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ 57.55 + H264Mb* mb= h->mb; 57.56 + 57.57 + int i; 57.58 + uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y; 57.59 + uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb; 57.60 + uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr; 57.61 + 57.62 + uint8_t* left_border_y = left_ls.unfiltered_y; 57.63 + uint8_t* left_border_cb = left_ls.unfiltered_cb; 57.64 + uint8_t* left_border_cr = left_ls.unfiltered_cr; 57.65 + 57.66 + src_y -= linesize; 57.67 + src_cb -= uvlinesize; 57.68 + src_cr -= uvlinesize; 57.69 + 57.70 + // There are two lines saved, the line above the top macroblock of a pair, 57.71 + // and the line above the bottom macroblock 57.72 + left_border_y[0] = top_border_y[15]; 57.73 + for(i=1; i<17; i++){ 57.74 + left_border_y[i] = src_y[15+i* linesize]; 57.75 + } 57.76 + 57.77 + *(qword*)(top_border_y)= *(qword*)(src_y + 16*linesize); 57.78 + 57.79 + left_border_cb[0] = top_border_cb[7]; 57.80 + left_border_cr[0] = top_border_cr[7]; 57.81 + for(i=1; i<9; i++){ 57.82 + left_border_cb[i] = src_cb[7+i*uvlinesize]; 57.83 + left_border_cr[i] = src_cr[7+i*uvlinesize]; 57.84 + } 57.85 + *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); 57.86 + *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); 57.87 +} 57.88 + 57.89 +static void xchg_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ 57.90 + H264Mb* mb= h->mb; 57.91 + H264slice* s = h->s; 57.92 + 57.93 + int temp8, i; 57.94 + uint64_t temp64; 57.95 + int deblock_left; 57.96 + int deblock_top; 57.97 + 57.98 + uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y; 57.99 + uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb; 57.100 + uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr; 57.101 + uint8_t* top_border_y_next = top_ls[mb->mb_x +1].unfiltered_y; 57.102 + 57.103 + 
uint8_t* left_border_y = left_ls.unfiltered_y; 57.104 + uint8_t* left_border_cb = left_ls.unfiltered_cb; 57.105 + uint8_t* left_border_cr = left_ls.unfiltered_cr; 57.106 + 57.107 + deblock_left = (mb->mb_x > 0); 57.108 + deblock_top = (mb->mb_y > 0); 57.109 + 57.110 + src_y -= ( linesize + 1); 57.111 + src_cb -= (uvlinesize + 1); 57.112 + src_cr -= (uvlinesize + 1); 57.113 + 57.114 + #define XCHG(a,b,t,xchg)\ 57.115 + t= a;\ 57.116 + if(xchg)\ 57.117 + a= b;\ 57.118 + b= t; 57.119 + 57.120 + if(deblock_left){ 57.121 + for(i = !deblock_top; i<16; i++){ 57.122 + XCHG(left_border_y[i], src_y [i* linesize], temp8, xchg); 57.123 + } 57.124 + XCHG(left_border_y[i], src_y [i* linesize], temp8, 1); 57.125 + 57.126 + for(i = !deblock_top; i<8; i++){ 57.127 + XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg); 57.128 + XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg); 57.129 + } 57.130 + XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1); 57.131 + XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1); 57.132 + } 57.133 + 57.134 + if(deblock_top){ 57.135 + XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg); 57.136 + XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1); 57.137 + if(mb->mb_x+1 < s->mb_width){ 57.138 + XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1); 57.139 + } 57.140 + XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1); 57.141 + XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1); 57.142 + } 57.143 +} 57.144 + 57.145 +void copy_top_borders(int mb_x, uint8_t *dst_y, uint8_t *dst_cb, uint8_t *dst_cr, int stride_y, int stride_c){ 57.146 + qword *qsrc_y = (qword *) (top_ls[mb_x].top_borders_y); 57.147 + dst_y-= 4*stride_y; 57.148 + 57.149 + *((qword *) (dst_y + 0*stride_y)) = *qsrc_y++; 57.150 + *((qword *) (dst_y + 1*stride_y)) = *qsrc_y++; 57.151 + *((qword *) (dst_y + 2*stride_y)) = *qsrc_y++; 57.152 + *((qword *) (dst_y + 3*stride_y)) = 
*qsrc_y++; 57.153 + 57.154 + dst_cb-=2*stride_c; 57.155 + uint64_t *dsrc_cb = (uint64_t *) (top_ls[mb_x].top_borders_cb); 57.156 + *((uint64_t *) (dst_cb + 0*stride_c)) = *dsrc_cb++; 57.157 + *((uint64_t *) (dst_cb + 1*stride_c)) = *dsrc_cb++; 57.158 + 57.159 + dst_cr-=2*stride_c; 57.160 + uint64_t *dsrc_cr = (uint64_t *) (top_ls[mb_x].top_borders_cr); 57.161 + *((uint64_t *) (dst_cr + 0*stride_c)) = *dsrc_cr++; 57.162 + *((uint64_t *) (dst_cr + 1*stride_c)) = *dsrc_cr++; 57.163 +} 57.164 + 57.165 +static void send_top_borders(H264Context_spu *h, int mb_x, uint8_t* dest_y, uint8_t* dest_cb, uint8_t* dest_cr, int stride_y, int stride_c){ 57.166 + H264spe *spe= &h->spe; 57.167 + //fill borders (unfiltered borders already filled in backup_mb_border) 57.168 + dest_y+= 12*stride_y; 57.169 + qword *qtop_y = (qword *) top_ls[mb_x].top_borders_y; 57.170 + for(int i=0; i<4; i++){ 57.171 + qword *qdest_y = (qword *) dest_y; 57.172 + *qtop_y++ = *qdest_y; 57.173 + dest_y+=stride_y; 57.174 + } 57.175 + dest_cb+= 6*stride_c; 57.176 + dest_cr+= 6*stride_c; 57.177 + uint64_t *dtop_cb = (uint64_t *) top_ls[mb_x].top_borders_cb; 57.178 + uint64_t *dtop_cr = (uint64_t *) top_ls[mb_x].top_borders_cr; 57.179 + for(int i=0; i<2; i++){ 57.180 + uint64_t *ddest_cb = (uint64_t *) dest_cb; 57.181 + uint64_t *ddest_cr = (uint64_t *) dest_cr; 57.182 + 57.183 + *dtop_cb++ = *ddest_cb; 57.184 + *dtop_cr++ = *ddest_cr; 57.185 + 57.186 + dest_cb+=stride_c; 57.187 + dest_cr+=stride_c; 57.188 + } 57.189 + uint8_t* top_border_tgt = spe->tgt_spe + (unsigned) &top_ls[mb_x]; 57.190 + spu_dma_put(&top_ls[mb_x], (unsigned) top_border_tgt, sizeof(TopBorder), MBD_put); 57.191 +} 57.192 + 57.193 +static void extend_edges_left(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c){ 57.194 + for (int i=0; i<lines; i++){ 57.195 + memset(dma_y, dma_y[32], 32); 57.196 + dma_y+=64; 57.197 + } 57.198 + 57.199 + for (int i=0; i<lines_c; i++){ 57.200 + memset(dma_cb, dma_cb[16], 16); 57.201 + 
memset(dma_cr, dma_cr[16], 16); 57.202 + dma_cb+=32; dma_cr+=32; 57.203 + } 57.204 +} 57.205 + 57.206 +static void extend_edges_right(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c, int slots){ 57.207 + 57.208 + for (int i=0; i<lines; i++){ 57.209 + memset(dma_y, dma_y[-1], slots*16); 57.210 + dma_y+=64; 57.211 + } 57.212 + 57.213 + for (int i=0; i<lines_c; i++){ 57.214 + memset(dma_cb, dma_cb[-1], slots*8); 57.215 + memset(dma_cr, dma_cr[-1], slots*8); 57.216 + dma_cb+=32; dma_cr+=32; 57.217 + } 57.218 +} 57.219 + 57.220 +static void extend_edges_top(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr ){ 57.221 + qword *qborder_y = (qword *) dma_y; 57.222 + for (int i=1; i<=32; i++){ 57.223 + qword *qdma_y = (qword *) (dma_y - i*64); 57.224 + *qdma_y = *qborder_y; 57.225 + } 57.226 + 57.227 + uint64_t *dborder_cb = (uint64_t *) dma_cb; 57.228 + uint64_t *dborder_cr = (uint64_t *) dma_cr; 57.229 + for (int i=1; i<=16; i++){ 57.230 + uint64_t *ddma_cb = (uint64_t *) (dma_cb - i*32); 57.231 + uint64_t *ddma_cr = (uint64_t *) (dma_cr - i*32); 57.232 + *ddma_cb = *dborder_cb; 57.233 + *ddma_cr = *dborder_cr; 57.234 + } 57.235 +} 57.236 + 57.237 +static void extend_edges_bottom(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr){ 57.238 + qword *qborder_y = (qword *) dma_y; 57.239 + for (int i=1; i<=32; i++){ 57.240 + qword *qdma_y = (qword *) (dma_y + i*64); 57.241 + *qdma_y = *qborder_y; 57.242 + } 57.243 + 57.244 + uint64_t *dborder_cb = (uint64_t *) dma_cb; 57.245 + uint64_t *dborder_cr = (uint64_t *) dma_cr; 57.246 + for (int i=1; i<=16; i++){ 57.247 + uint64_t *ddma_cb = (uint64_t *) (dma_cb + i*32); 57.248 + uint64_t *ddma_cr = (uint64_t *) (dma_cr + i*32); 57.249 + *ddma_cb = *dborder_cb; 57.250 + *ddma_cr = *dborder_cr; 57.251 + } 57.252 +} 57.253 + 57.254 +static void extend_extra_edge_right(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr, uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr, int lines, int lines_c){ 57.255 + 57.256 + 
for (int i=0; i<lines; i++){ 57.257 + memset(extra_y, dma_y[-1], 32); 57.258 + dma_y+=64; extra_y+=32; 57.259 + } 57.260 + 57.261 + for (int i=0; i<lines_c; i++){ 57.262 + memset(extra_cb, dma_cb[-1], 16); 57.263 + memset(extra_cr, dma_cr[-1], 16); 57.264 + dma_cb+=32; dma_cr+=32; 57.265 + extra_cb+=16; extra_cr+=16; 57.266 + } 57.267 +} 57.268 + 57.269 +static void extend_extra_edge_top(uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr){ 57.270 + qword *qborder_y = (qword *) extra_y; 57.271 + qword *qborder_y2 = (qword *) (extra_y+16); 57.272 + 57.273 + for (int i=1; i<=32; i++){ 57.274 + qword *qextra_y = (qword *) (extra_y-i*32); 57.275 + *qextra_y = *qborder_y; 57.276 + *(qextra_y+1) = *qborder_y2; 57.277 + } 57.278 + 57.279 + qword *qborder_cb = (qword *) extra_cb; 57.280 + qword *qborder_cr = (qword *) extra_cr; 57.281 + for (int i=1; i<=16; i++){ 57.282 + qword *qextra_cb = (qword *) (extra_cb - i*16); 57.283 + qword *qextra_cr = (qword *) (extra_cr - i*16); 57.284 + *qextra_cb = *qborder_cb; 57.285 + *qextra_cr = *qborder_cr; 57.286 + } 57.287 +} 57.288 + 57.289 +static void extend_extra_edge_bottom(uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr){ 57.290 + qword *qborder_y = (qword *) extra_y; 57.291 + qword *qborder_y2 = (qword *) (extra_y+16); 57.292 + 57.293 + for (int i=1; i<=32; i++){ 57.294 + qword *qextra_y = (qword *) (extra_y+i*32); 57.295 + *qextra_y = *qborder_y; 57.296 + *(qextra_y+1) = *qborder_y2; 57.297 + } 57.298 + 57.299 + qword *qborder_cb = (qword *) extra_cb; 57.300 + qword *qborder_cr = (qword *) extra_cr; 57.301 + for (int i=1; i<=16; i++){ 57.302 + qword *qextra_cb = (qword *) (extra_cb + i*16); 57.303 + qword *qextra_cr = (qword *) (extra_cr + i*16); 57.304 + *qextra_cb = *qborder_cb; 57.305 + *qextra_cr = *qborder_cr; 57.306 + } 57.307 +} 57.308 + 57.309 +static void extend_edges(H264Context_spu *h, int mb_x, int mb_y){ 57.310 + H264slice *s = h->s; 57.311 + 57.312 + uint8_t *dma_y; 57.313 + uint8_t *dma_cb; 57.314 + 
uint8_t *dma_cr; 57.315 + 57.316 + uint8_t *extra_y = extra_edge_y; 57.317 + uint8_t *extra_cb = extra_edge_cb; 57.318 + uint8_t *extra_cr = extra_edge_cr; 57.319 + 57.320 + int pos = (mb_x+2) %4; 57.321 + if (mb_x == 0){ 57.322 + if (mb_y ==0){ 57.323 + extend_edges_left(&dma_y_ls[32*64], &dma_cb_ls[16*32], &dma_cr_ls[16*32], 12, 6); 57.324 + }else if (mb_y == s->mb_height -1){ 57.325 + extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 20, 10); 57.326 + }else { 57.327 + extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 16, 8); 57.328 + } 57.329 + }else if (mb_x == s->mb_width-1){ 57.330 + dma_y = &dma_y_ls [(pos+1)*16]; 57.331 + dma_cb = &dma_cb_ls[(pos+1)*8]; 57.332 + dma_cr = &dma_cr_ls[(pos+1)*8]; 57.333 + if (mb_y ==0){ 57.334 + dma_y += 32*64; 57.335 + dma_cb += 16*32; 57.336 + dma_cr += 16*32; 57.337 + extra_y = extra_edge_y + 32*32; 57.338 + extra_cb= extra_edge_cb + 16*16; 57.339 + extra_cr= extra_edge_cr + 16*16; 57.340 + 57.341 + if (pos==2){ 57.342 + extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 1); 57.343 + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6); 57.344 + }else if (pos==3){ 57.345 + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6); 57.346 + }else{ 57.347 + extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 2); 57.348 + } 57.349 + }else if (mb_y == s->mb_height -1){ 57.350 + if (pos==2){ 57.351 + extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 1); 57.352 + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10); 57.353 + }else if (pos==3){ 57.354 + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10); 57.355 + }else{ 57.356 + extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 2); 57.357 + } 57.358 + }else { 57.359 + if (pos==2){ 57.360 + extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1); 57.361 + extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8); 57.362 + }else if (pos==3){ 57.363 + 
extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8); 57.364 + }else{ 57.365 + extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1); 57.366 + } 57.367 + } 57.368 + } 57.369 + 57.370 + if (mb_y == 0){ 57.371 + dma_y = &dma_y_ls [32*64]; 57.372 + dma_cb = &dma_cb_ls[16*32]; 57.373 + dma_cr = &dma_cr_ls[16*32]; 57.374 + extra_y = extra_edge_y + 32*32; 57.375 + extra_cb= extra_edge_cb + 16*16; 57.376 + extra_cr= extra_edge_cr + 16*16; 57.377 + 57.378 + if (mb_x ==0){ 57.379 + extend_edges_top (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8); 57.380 + extend_edges_top (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8); 57.381 + extend_edges_top (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8); 57.382 + }else if (mb_x == s->mb_width -1){ 57.383 + if (pos==2){ 57.384 + extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); 57.385 + extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); 57.386 + extend_extra_edge_top(extra_y, extra_cb, extra_cr); 57.387 + }else if (pos == 3){ 57.388 + extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); 57.389 + extend_extra_edge_top(extra_y, extra_cb, extra_cr); 57.390 + }else{ 57.391 + extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); 57.392 + extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); 57.393 + extend_edges_top (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8); 57.394 + } 57.395 + }else { 57.396 + extend_edges_top (dma_y + pos*16, dma_cb + pos*8, dma_cr + pos*8); 57.397 + } 57.398 + }else if (mb_y == s->mb_height -1){ 57.399 + dma_y = &dma_y_ls [19*64]; 57.400 + dma_cb = &dma_cb_ls[9*32]; 57.401 + dma_cr = &dma_cr_ls[9*32]; 57.402 + extra_y = extra_edge_y + 19*32; 57.403 + extra_cb= extra_edge_cb + 9*16; 57.404 + extra_cr= extra_edge_cr + 9*16; 57.405 + 57.406 + if (mb_x ==0){ 57.407 + extend_edges_bottom (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8); 57.408 + extend_edges_bottom (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8); 57.409 
+ extend_edges_bottom (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8); 57.410 + }else if (mb_x == s->mb_width -1){ 57.411 + if (pos==2){ 57.412 + extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); 57.413 + extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); 57.414 + extend_extra_edge_bottom(extra_y, extra_cb, extra_cr); 57.415 + }else if (pos == 3){ 57.416 + extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); 57.417 + extend_extra_edge_bottom(extra_y, extra_cb, extra_cr); 57.418 + }else{ 57.419 + extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); 57.420 + extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8); 57.421 + extend_edges_bottom (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8); 57.422 + } 57.423 + }else { 57.424 + extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8); 57.425 + } 57.426 + } 57.427 +} 57.428 + 57.429 +static void send_pic_data(H264Context_spu *h, int mb_x, int mb_y, int pos, int stride_y, int stride_c){ 57.430 + H264slice *s = h->s; 57.431 + int lines, lines_c; 57.432 + int linesize = s->linesize; 57.433 + int uvlinesize = s->uvlinesize; 57.434 + 57.435 + uint8_t* dst_y = s->dst_y + (mb_x-pos)*16 + (mb_y*16)*linesize; 57.436 + uint8_t* dst_cb = s->dst_cb +(mb_x-pos)*8 + (mb_y*8)*uvlinesize; 57.437 + uint8_t* dst_cr = s->dst_cr +(mb_x-pos)*8 + (mb_y*8)*uvlinesize; 57.438 + 57.439 + if (mb_y == 0){ 57.440 + dst_y -= 32 *linesize; 57.441 + dst_cb-= 16 *uvlinesize; 57.442 + dst_cr-= 16 *uvlinesize; 57.443 + }else { 57.444 + dst_y -= 4 *linesize; 57.445 + dst_cb-= 2 *uvlinesize; 57.446 + dst_cr-= 2 *uvlinesize; 57.447 + } 57.448 + 57.449 + if (mb_y == 0){ 57.450 + lines = 12+32; lines_c=6+16; 57.451 + }else if (mb_y == s->mb_height-1){ 57.452 + lines = 20+32; lines_c=10+16; 57.453 + }else{ 57.454 + lines = 16; lines_c=8; 57.455 + } 57.456 + 57.457 + put_list = put_list_buf; 57.458 + put_dma_list(dma_y_ls, dst_y, stride_y, 
lines, linesize, MBD_pic); 57.459 + put_dma_list(dma_cb_ls, dst_cb, stride_c, lines_c, uvlinesize, MBD_pic); 57.460 + put_dma_list(dma_cr_ls, dst_cr, stride_c, lines_c, uvlinesize, MBD_pic); 57.461 + 57.462 + if (mb_x == s->mb_width-1 && pos>1){ 57.463 + put_dma_list(extra_edge_y, dst_y+64, 32, lines, linesize, MBD_pic); 57.464 + put_dma_list(extra_edge_cb, dst_cb+32, 16, lines_c, uvlinesize, MBD_pic); 57.465 + put_dma_list(extra_edge_cr, dst_cr+32, 16, lines_c, uvlinesize, MBD_pic); 57.466 + } 57.467 +} 57.468 + 57.469 +void copy_data_and_send(H264Context_spu *h, int mb_x, int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ 57.470 + H264slice *s = h->s; 57.471 + int lines, lines_c; 57.472 + int pos = (mb_x+2)%4; //4 slots in our 64 byte wide transfer buffer. Offset 2 for edge emulation 57.473 + uint8_t *dma_y = &dma_y_ls[pos*16]; 57.474 + uint8_t *dma_cb = &dma_cb_ls[pos*8]; 57.475 + uint8_t *dma_cr = &dma_cr_ls[pos*8]; 57.476 + 57.477 + if (mb_y == 0){ 57.478 + dma_y += 32*64; 57.479 + dma_cb+= 16*32; 57.480 + dma_cr+= 16*32; 57.481 + }else{ 57.482 + dest_y -= 4*stride_y; 57.483 + dest_cb-= 2*stride_c; 57.484 + dest_cr-= 2*stride_c; 57.485 + } 57.486 + 57.487 + if (mb_y == 0){ 57.488 + lines = 12; lines_c=6; 57.489 + }else if (mb_y == s->mb_height-1){ 57.490 + lines = 20; lines_c=10; 57.491 + }else{ 57.492 + lines = 16; lines_c=8; 57.493 + } 57.494 + 57.495 + for(int i=0; i<lines; i++){ 57.496 + qword *qdest_y = (qword *) dest_y; 57.497 + qword *qdma_y = (qword *) dma_y; 57.498 + *qdma_y = *qdest_y; 57.499 + dma_y +=64; 57.500 + dest_y+=stride_y; 57.501 + } 57.502 + 57.503 + for(int i=0; i<lines_c; i++){ 57.504 + uint64_t *ddest_cb = (uint64_t *) dest_cb; 57.505 + uint64_t *ddest_cr = (uint64_t *) dest_cr; 57.506 + uint64_t *ddma_cb = (uint64_t *) dma_cb; 57.507 + uint64_t *ddma_cr = (uint64_t *) dma_cr; 57.508 + *ddma_cb = *ddest_cb; 57.509 + *ddma_cr = *ddest_cr; 57.510 + dma_cb +=32; 57.511 + dma_cr +=32; 57.512 + 
dest_cb+=stride_c; 57.513 + dest_cr+=stride_c; 57.514 + } 57.515 + 57.516 + extend_edges(h, mb_x, mb_y); 57.517 + 57.518 + //send when dma buf is full 57.519 + if (pos==3){ 57.520 + send_pic_data(h, mb_x, mb_y, pos, 64, 32); 57.521 + } else if (mb_x == s->mb_width-1){ 57.522 + send_pic_data(h, mb_x, mb_y, pos, 64, 32); 57.523 + } 57.524 +} 57.525 + 57.526 +static void shift_left(int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ 57.527 + int lines, lines_c; 57.528 + if (mb_y > 0){ 57.529 + lines =20; 57.530 + lines_c=10; 57.531 + dest_y -= 4*stride_y; 57.532 + dest_cb -= 2*stride_c; 57.533 + dest_cr -= 2*stride_c; 57.534 + }else { 57.535 + lines =16; 57.536 + lines_c= 8; 57.537 + } 57.538 + 57.539 + for (int i=0; i<lines; i++){ 57.540 + qword *left_y = (qword *) (dest_y -16); 57.541 + qword *qdest_y = (qword *) dest_y; 57.542 + *left_y = *qdest_y; 57.543 + dest_y += stride_y; 57.544 + } 57.545 + 57.546 + for (int i=0; i<lines_c; i++){ 57.547 + uint64_t *left_cb = (uint64_t *) (dest_cb -8); 57.548 + uint64_t *left_cr = (uint64_t *) (dest_cr -8); 57.549 + uint64_t *ddest_cb = (uint64_t *) dest_cb; 57.550 + uint64_t *ddest_cr = (uint64_t *) dest_cr; 57.551 + *left_cb = *ddest_cb; 57.552 + *left_cr = *ddest_cr; 57.553 + dest_cb += stride_c; 57.554 + dest_cr += stride_c; 57.555 + } 57.556 +} 57.557 + 57.558 +void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c){ 57.559 + H264slice *s = h->s; 57.560 + H264Mb *mb = h->mb; 57.561 + const int mb_x= mb->mb_x; 57.562 + const int mb_y= mb->mb_y; 57.563 + const int mb_type= mb->mb_type; 57.564 + 57.565 + uint8_t *dest_y, *dest_cb, *dest_cr; //ls ptrs (abstracts the fact it is operating in a ls buffer) 57.566 + 57.567 + int i; 57.568 + 57.569 + void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); 57.570 + void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); 57.571 + 57.572 + dest_y = dest_y_ls + 16 + 4*stride_y; 57.573 + dest_cb = dest_cb_ls + 8 + 
2*stride_c; 57.574 + dest_cr = dest_cr_ls + 8 + 2*stride_c; 57.575 + 57.576 + if(IS_8x8DCT(mb_type)){ 57.577 + idct_dc_add = ff_idct8_dc_add; 57.578 + idct_add = h->dsp.h264_idct_add[0]; 57.579 + } 57.580 + else{ 57.581 + idct_dc_add = ff_idct_dc_add; 57.582 + idct_add = h->dsp.h264_idct_add[1]; 57.583 + } 57.584 + 57.585 + if (mb_y>0){ 57.586 + copy_top_borders(mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c); 57.587 + } 57.588 + 57.589 + if(IS_INTRA(mb_type)){ 57.590 + xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 1); 57.591 + 57.592 + h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cb, stride_c); 57.593 + h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cr, stride_c); 57.594 + 57.595 + if(IS_INTRA4x4(mb_type)){ 57.596 + if(IS_8x8DCT(mb_type)){ 57.597 + 57.598 + for(i=0; i<16; i+=4){ 57.599 + uint8_t * const ptr= dest_y + block_offset[i]; 57.600 + const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ]; 57.601 + const int nnz = mb->non_zero_count_cache[ scan8[i] ]; 57.602 + h->hpc.pred8x8l[ dir ](ptr, (mb->topleft_samples_available<<i)&0x8000, 57.603 + (mb->topright_samples_available<<i)&0x4000, stride_y); 57.604 + 57.605 + if(nnz){ 57.606 + if(nnz == 1 && mb->mb[i*16]) 57.607 + idct_dc_add(ptr, mb->mb + i*16, stride_y); 57.608 + else{ 57.609 + idct_add (ptr, mb->mb + i*16, stride_y); 57.610 + } 57.611 + } 57.612 + } 57.613 + }else{ 57.614 + for(i=0; i<16; i++){ 57.615 + uint8_t * const ptr= dest_y + block_offset[i]; 57.616 + const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ]; 57.617 + 57.618 + uint8_t *topright; 57.619 + int nnz, tr; 57.620 + if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ 57.621 + const int topright_avail= (mb->topright_samples_available<<i)&0x8000; 57.622 + if(!topright_avail){ 57.623 + tr= ptr[3 - stride_y]*0x01010101; 57.624 + topright= (uint8_t*) &tr; 57.625 + }else 57.626 + topright= ptr + 4 - stride_y; 57.627 + }else 57.628 + topright= NULL; 57.629 + 57.630 + h->hpc.pred4x4[ dir ](ptr, topright, stride_y); 57.631 
+ nnz = mb->non_zero_count_cache[ scan8[i] ]; 57.632 + if(nnz){ 57.633 + if(nnz == 1 && mb->mb[i*16]) 57.634 + idct_dc_add(ptr, mb->mb + i*16, stride_y); 57.635 + else 57.636 + idct_add (ptr, mb->mb + i*16, stride_y); 57.637 + } 57.638 + } 57.639 + } 57.640 + 57.641 + }else{ 57.642 + h->hpc.pred16x16[ mb->intra16x16_pred_mode ](dest_y , stride_y); 57.643 + h264_luma_dc_dequant_idct_c(mb->mb, mb->dequant4_coeff_y); 57.644 + } 57.645 + xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 0); 57.646 + 57.647 + }else { 57.648 + hl_motion(h, dest_y, dest_cb, dest_cr, stride_y, stride_c); 57.649 + } 57.650 + 57.651 + if(!IS_INTRA4x4(mb_type)){ 57.652 + if(IS_INTRA16x16(mb_type)){ 57.653 + for(i=0; i<16; i++){ 57.654 + if(mb->non_zero_count_cache[ scan8[i] ]) 57.655 + idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); 57.656 + else if(mb->mb[i*16]) 57.657 + idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); 57.658 + } 57.659 + }else if(mb->cbp&15){ 57.660 + const int incr = IS_8x8DCT(mb_type) ? 
4 : 1; 57.661 + for(i=0; i<16; i+=incr){ 57.662 + int nnz = mb->non_zero_count_cache[ scan8[i] ]; 57.663 + if(nnz){ 57.664 + if(nnz==1 && mb->mb[i*16]) 57.665 + idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); 57.666 + else 57.667 + idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y); 57.668 + } 57.669 + } 57.670 + } 57.671 + } 57.672 + 57.673 + if(mb->cbp&0x30){ 57.674 + uint8_t *dest[2] = {dest_cb, dest_cr}; 57.675 + chroma_dc_dequant_idct_c(mb->mb + 16*16, mb->dequant4_coeff_cb); 57.676 + chroma_dc_dequant_idct_c(mb->mb + 16*16+4*16, mb->dequant4_coeff_cr); 57.677 + 57.678 + idct_add = h->dsp.h264_idct_add[1]; 57.679 + idct_dc_add = ff_idct_dc_add; 57.680 + for(i=16; i<16+8; i++){ 57.681 + if(mb->non_zero_count_cache[ scan8[i] ]) 57.682 + idct_add (dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c); 57.683 + else if(mb->mb[i*16]) 57.684 + idct_dc_add(dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c); 57.685 + } 57.686 + } 57.687 + 57.688 + // save unfiltered borders 57.689 + backup_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c); 57.690 + if (mb->deblock_mb){ 57.691 + filter_mb( h, dest_y, dest_cb, dest_cr, stride_y, stride_c); 57.692 + } 57.693 + 57.694 + if (mb_y < s->mb_height-1){ 57.695 + if(mb_x>0){ 57.696 + send_top_borders(h, mb_x-1, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c); 57.697 + } 57.698 + if (mb_x == s->mb_width-1){ 57.699 + send_top_borders(h, mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c); 57.700 + } 57.701 + } 57.702 + update_tgt_spe_dep(h, 0); 57.703 + 57.704 + if (h->blocking){ 57.705 + if (mb_x>0){ 57.706 + copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c); 57.707 + wait_dma_id(MBD_pic); 57.708 + } 57.709 + if (mb_x == s->mb_width-1){ 57.710 + copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c); 57.711 + wait_dma_id(MBD_pic); 57.712 + } 57.713 + 57.714 + }else{ 57.715 + if (mb_x>0){ 57.716 + 
wait_dma_id(MBD_pic); 57.717 + copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c); 57.718 + } 57.719 + if (mb_x == s->mb_width-1){ 57.720 + wait_dma_id(MBD_pic); 57.721 + copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c); 57.722 + } 57.723 + } 57.724 + 57.725 + if (mb_x < s->mb_width) 57.726 + shift_left(mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c); 57.727 + 57.728 +}
58.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 58.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.h Mon Aug 27 12:09:56 2012 +0200 58.3 @@ -0,0 +1,97 @@ 58.4 +/* 58.5 + * Copyright (c) 2009 TUDelft 58.6 + * 58.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 58.8 + */ 58.9 + 58.10 +/** 58.11 + * @file libavcodec/cell/spu/h264_main_spu.c 58.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding 58.13 + * @author C C Chi <c.c.chi@student.tudelft.nl> 58.14 + * 58.15 + * SIMD kernels 58.16 + * H.264/AVC motion compensation 58.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu> 58.18 + * @author Albert Paradis <apar7632@hotmail.com> 58.19 + */ 58.20 + 58.21 +#ifndef H264_DECODE_MB_SPU_H 58.22 +#define H264_DECODE_MB_SPU_H 58.23 + 58.24 +#define CELL_SPE 58.25 +#include "libavcodec/avcodec.h" 58.26 +#include "types_spu.h" 58.27 +#include "h264_types_spu.h" 58.28 +#include "h264_mc_spu.h" 58.29 +#include "h264_dma.h" 58.30 +#include "dsputil_spu.h" 58.31 +#include "h264_intra_spu.h" 58.32 + 58.33 +/** 58.34 + * H264Context 58.35 + */ 58.36 +typedef struct H264Context_spu{ 58.37 + DECLARE_ALIGNED_16(H264spe, spe); // contains simple type parameters that doesn't change 58.38 + DECLARE_ALIGNED_16(H264Mb, mb_buf[3]); // contains simple type parameters that changes for macroblock 58.39 + DECLARE_ALIGNED_16(H264slice, slice_buf[2]); // contains simple type parameters that changes for slice 58.40 + 58.41 + DSPContext_spu dsp; // struct that contains pointers to mc interpolations functions 58.42 + H264PredContext_spu hpc; // struct that contains pointers to intra prediction functions 58.43 + 58.44 + H264slice *s; 58.45 + int sl_idx; 58.46 + int frames; 58.47 + //mc arg buffer 58.48 + H264mc mc_buf[2]; 58.49 + H264mc *mc; //mc ptr to current decoded mb 58.50 + int mc_idx; 58.51 + int n_mc; //next mb_id to mc 58.52 + int mb_proc; 58.53 + int mb_total; 58.54 + int curr_line; 58.55 + 58.56 + H264Mb* mb; //mb ptr to current decoded mb 58.57 + int mb_id; //next 
mb_id to dma 58.58 + int mb_dec; //mb_buf index - decoded mb 58.59 + int mb_mc; //mb_buf index - prebuffer motion data 58.60 + int mb_dma; //mb_buf index - target for dma mb data 58.61 + int next_mb_idx; 58.62 +/*// for deblocking filter 58.63 + int edges[2]; 58.64 + int start[2]; 58.65 + int bS[2][4][4]; // dir, edge, bS; 58.66 + int qp[2][4]; // dir, edge; 58.67 + int chroma_qp[2][2][4]; // cb/cr, dir, edge; 58.68 +*/ 58.69 + int blocking; 58.70 +}H264Context_spu; 58.71 + 58.72 +void print_output(H264Context_spu* h, const char* msg); 58.73 +void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c); 58.74 +void update_tgt_spe_dep(H264Context_spu *h, int end); 58.75 + 58.76 +// IDCT functions 58.77 +void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); 58.78 +void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); 58.79 + 58.80 +void ff_idct_dc_add(uint8_t *dst, DCTELEM *block, int stride); 58.81 +void ff_idct8_dc_add(uint8_t *dst, DCTELEM *block, int stride); 58.82 + 58.83 +void ff_cropTbl_init(); 58.84 +void add_pixels8_c(uint8_t *pixels, DCTELEM *block, int line_size); 58.85 +void add_pixels4_c(uint8_t *pixels, DCTELEM *block, int line_size); 58.86 +void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul); 58.87 +void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul); 58.88 +// Filter functions 58.89 +//void calculate_bS_qp(H264Context_spu *h); 58.90 + 58.91 +// Motion compensation function 58.92 +void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc); 58.93 +void calc_mc_params(H264Mb *mb, H264mc *mc); 58.94 +void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c); 58.95 + 58.96 + 58.97 +// Function to get traces 58.98 +void trace_event_SPU(int event, int id); 58.99 + 58.100 +#endif
59.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 59.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.c Mon Aug 27 12:09:56 2012 +0200 59.3 @@ -0,0 +1,332 @@ 59.4 +/* 59.5 + * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding 59.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 59.7 + * 59.8 + * This file is part of FFmpeg. 59.9 + * 59.10 + * FFmpeg is free software; you can redistribute it and/or 59.11 + * modify it under the terms of the GNU Lesser General Public 59.12 + * License as published by the Free Software Foundation; either 59.13 + * version 2.1 of the License, or (at your option) any later version. 59.14 + * 59.15 + * FFmpeg is distributed in the hope that it will be useful, 59.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 59.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 59.18 + * Lesser General Public License for more details. 59.19 + * 59.20 + * You should have received a copy of the GNU Lesser General Public 59.21 + * License along with FFmpeg; if not, write to the Free Software 59.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 59.23 + */ 59.24 + 59.25 +/** 59.26 + * @file 59.27 + * H.264 / AVC / MPEG4 part10 direct mb/block decoding. 
59.28 + * @author Michael Niedermayer <michaelni@gmx.at> 59.29 + */ 59.30 +#define CELL_SPE 59.31 +#include "libavcodec/avcodec.h" 59.32 +#include "dsputil_spu.h" 59.33 +#include "h264_tables.h" 59.34 +#include "h264_types_spu.h" 59.35 +#include "libavutil/common.h" 59.36 +#include "libavutil/intreadwrite.h" 59.37 +#include "mathops_spu.h" 59.38 +#include "rectangle_spu.h" 59.39 + 59.40 +//#undef NDEBUG 59.41 +#include <assert.h> 59.42 +static void pred_spatial_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ 59.43 + H264Mb *m = s->m; 59.44 + int b4_stride = hc->b_stride; 59.45 + const int mb_x = m->mb_x; 59.46 + int mb_type_col[2]; 59.47 + const int16_t (*l1mv0)[2], (*l1mv1)[2]; 59.48 + const int8_t *l1ref0, *l1ref1; 59.49 + const int is_b8x8 = IS_8X8(*mb_type); 59.50 + unsigned int sub_mb_type= MB_TYPE_L0L1; 59.51 + int i8, i4; 59.52 + int ref[2]; 59.53 + int mv[2]; 59.54 + int list; 59.55 + 59.56 + //assert(h->ref_list[1][0].reference&3); 59.57 + 59.58 +#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM) 59.59 + 59.60 + /* ref = min(neighbors) */ 59.61 + for(list=0; list<2; list++){ 59.62 + int left_ref = m->ref_cache[list][scan8[0] - 1]; 59.63 + int top_ref = m->ref_cache[list][scan8[0] - 8]; 59.64 + int refc = m->ref_cache[list][scan8[0] - 8 + 4]; 59.65 + const int16_t *C= m->mv_cache[list][ scan8[0] - 8 + 4]; 59.66 + if(refc == PART_NOT_AVAILABLE){ 59.67 + refc = m->ref_cache[list][scan8[0] - 8 - 1]; 59.68 + C = m-> mv_cache[list][scan8[0] - 8 - 1]; 59.69 + } 59.70 + ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); 59.71 + if(ref[list] >= 0){ 59.72 + //this is just pred_motion() but with the cases removed that cannot happen for direct blocks 59.73 + const int16_t * const A= m->mv_cache[list][ scan8[0] - 1 ]; 59.74 + const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ]; 59.75 + 59.76 + int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + 
(refc==ref[list]); 59.77 + if(match_count > 1){ //most common 59.78 + mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]), 59.79 + mid_pred(A[1], B[1], C[1]) ); 59.80 + }else { 59.81 + assert(match_count==1); 59.82 + if(left_ref==ref[list]){ 59.83 + mv[list]= AV_RN32A(A); 59.84 + }else if(top_ref==ref[list]){ 59.85 + mv[list]= AV_RN32A(B); 59.86 + }else{ 59.87 + mv[list]= AV_RN32A(C); 59.88 + } 59.89 + } 59.90 + }else{ 59.91 + int mask= ~(MB_TYPE_L0 << (2*list)); 59.92 + mv[list] = 0; 59.93 + ref[list] = -1; 59.94 + if(!is_b8x8) 59.95 + *mb_type &= mask; 59.96 + sub_mb_type &= mask; 59.97 + } 59.98 + } 59.99 + 59.100 + if(ref[0] < 0 && ref[1] < 0){ 59.101 + ref[0] = ref[1] = 0; 59.102 + if(!is_b8x8) 59.103 + *mb_type |= MB_TYPE_L0L1; 59.104 + sub_mb_type |= MB_TYPE_L0L1; 59.105 + } 59.106 + 59.107 + if(!(is_b8x8|mv[0]|mv[1])){ 59.108 + fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); 59.109 + fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); 59.110 + fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); 59.111 + fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); 59.112 + *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; 59.113 + return; 59.114 + } 59.115 + 59.116 + mb_type_col[0] = 59.117 + mb_type_col[1] = hc->list1_mb_type[mb_x]; 59.118 + 59.119 + sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ 59.120 + if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ 59.121 + *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */ 59.122 + }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ 59.123 + *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); 59.124 + }else{ 59.125 + if(!s->direct_8x8_inference_flag){ 59.126 + /* FIXME save sub mb types from previous frames (or derive from MVs) 59.127 + * so we know exactly what block size to use */ 59.128 + sub_mb_type += 
(MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */ 59.129 + } 59.130 + *mb_type |= MB_TYPE_8x8; 59.131 + } 59.132 + 59.133 +// l1mv0 = (void *) &hc->list1_motion_val[0][4*mb_x]; 59.134 +// l1mv1 = (void *) &hc->list1_motion_val[1][4*mb_x]; 59.135 + l1mv0 = (void *) hc->list1_motion_val[0]; 59.136 + l1mv1 = (void *) hc->list1_motion_val[1]; 59.137 + l1ref0 = &hc->list1_ref_index [0][4*mb_x]; 59.138 + l1ref1 = &hc->list1_ref_index [1][4*mb_x]; 59.139 +// if(!b8_stride){ 59.140 +// if(m->mb_y&1){ 59.141 +// l1ref0 += 2; 59.142 +// l1ref1 += 2; 59.143 +// l1mv0 += 2*b4_stride; 59.144 +// l1mv1 += 2*b4_stride; 59.145 +// } 59.146 +// } 59.147 + 59.148 + if(IS_16X16(*mb_type)){ 59.149 + int a,b; 59.150 + 59.151 + fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); 59.152 + fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); 59.153 + if(!IS_INTRA(mb_type_col[0]) && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1) 59.154 + || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1 59.155 + ))){ 59.156 + a=b=0; 59.157 + if(ref[0] > 0) 59.158 + a= mv[0]; 59.159 + if(ref[1] > 0) 59.160 + b= mv[1]; 59.161 + }else{ 59.162 + a= mv[0]; 59.163 + b= mv[1]; 59.164 + } 59.165 + fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, a, 4); 59.166 + fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, b, 4); 59.167 + }else{ 59.168 + int n=0; 59.169 + for(i8=0; i8<4; i8++){ 59.170 + const int x8 = i8&1; 59.171 + const int y8 = i8>>1; 59.172 + 59.173 + if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) 59.174 + continue; 59.175 + m->sub_mb_type[i8] = sub_mb_type; 59.176 + 59.177 + fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4); 59.178 + fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4); 59.179 + fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1); 59.180 + fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1); 59.181 + 59.182 + /* 
col_zero_flag */ 59.183 + if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 )) 59.184 + ){ 59.185 + const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1; 59.186 + if(IS_SUB_8X8(sub_mb_type)){ 59.187 +// const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; 59.188 + const int16_t *mv_col = l1mv[x8*3 + y8*3*4]; 59.189 + if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ 59.190 + if(ref[0] == 0) 59.191 + fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); 59.192 + if(ref[1] == 0) 59.193 + fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); 59.194 + n+=4; 59.195 + } 59.196 + }else{ 59.197 + int k=0; 59.198 + for(i4=0; i4<4; i4++){ 59.199 + //const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; 59.200 + const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4]; 59.201 + if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ 59.202 + if(ref[0] == 0) 59.203 + AV_ZERO32(m->mv_cache[0][scan8[i8*4+i4]]); 59.204 + if(ref[1] == 0) 59.205 + AV_ZERO32(m->mv_cache[1][scan8[i8*4+i4]]); 59.206 + k++; 59.207 + } 59.208 + } 59.209 + if(!(k&3)) 59.210 + m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8; 59.211 + n+=k; 59.212 + } 59.213 + } 59.214 + } 59.215 + if(!is_b8x8 && !(n&15)){ 59.216 + *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; 59.217 + } 59.218 + } 59.219 +} 59.220 + 59.221 +static void pred_temp_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ 59.222 + H264Mb *m = s->m; 59.223 + const int mb_x = m->mb_x; 59.224 + int b4_stride = hc->b_stride; 59.225 + int mb_type_col[2]; 59.226 + const int16_t (*l1mv0)[2], (*l1mv1)[2]; 59.227 + const int8_t *l1ref0, *l1ref1; 59.228 + const int is_b8x8 = IS_8X8(*mb_type); 59.229 + unsigned int sub_mb_type; 59.230 + int i8, i4; 59.231 + const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]}; 59.232 + const int *dist_scale_factor = 
s->dist_scale_factor; 59.233 + 59.234 + mb_type_col[0] = 59.235 + mb_type_col[1] = hc->list1_mb_type[mb_x]; 59.236 + 59.237 + sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ 59.238 + if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ 59.239 + *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */ 59.240 + }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ 59.241 + *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); 59.242 + }else{ 59.243 + if(!s->direct_8x8_inference_flag){ 59.244 + /* FIXME save sub mb types from previous frames (or derive from MVs) 59.245 + * so we know exactly what block size to use */ 59.246 + sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */ 59.247 + } 59.248 + *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; 59.249 + } 59.250 + 59.251 +// l1mv0 = (void *) &hc->list1_motion_val[0][4*mb_x]; 59.252 +// l1mv1 = (void *) &hc->list1_motion_val[1][4*mb_x]; 59.253 + l1mv0 = (void *) hc->list1_motion_val[0]; 59.254 + l1mv1 = (void *) hc->list1_motion_val[1]; 59.255 + l1ref0 = &hc->list1_ref_index [0][4*mb_x]; 59.256 + l1ref1 = &hc->list1_ref_index [1][4*mb_x]; 59.257 + 59.258 + /* one-to-one mv scaling */ 59.259 + if(IS_16X16(*mb_type)){ 59.260 + int ref, mv0, mv1; 59.261 + 59.262 + fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1); 59.263 + if(IS_INTRA(mb_type_col[0])){ 59.264 + ref=mv0=mv1=0; 59.265 + }else{ 59.266 + const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]] 59.267 + : map_col_to_list0[1][l1ref1[0]]; 59.268 + const int scale = dist_scale_factor[ref0]; 59.269 + const int16_t *mv_col = l1ref0[0] >= 0 ? 
l1mv0[0] : l1mv1[0]; 59.270 + int mv_l0[2]; 59.271 + mv_l0[0] = (scale * mv_col[0] + 128) >> 8; 59.272 + mv_l0[1] = (scale * mv_col[1] + 128) >> 8; 59.273 + ref= ref0; 59.274 + mv0= pack16to32(mv_l0[0],mv_l0[1]); 59.275 + mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]); 59.276 + } 59.277 + fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1); 59.278 + fill_rectangle(&m-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4); 59.279 + fill_rectangle(&m-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4); 59.280 + }else{ 59.281 + for(i8=0; i8<4; i8++){ 59.282 + const int x8 = i8&1; 59.283 + const int y8 = i8>>1; 59.284 + int ref0, scale; 59.285 + const int16_t (*l1mv)[2]= l1mv0; 59.286 + 59.287 + if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) 59.288 + continue; 59.289 + m->sub_mb_type[i8] = sub_mb_type; 59.290 + fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1); 59.291 + if(IS_INTRA(mb_type_col[0])){ 59.292 + fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1); 59.293 + fill_rectangle(&m-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); 59.294 + fill_rectangle(&m-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); 59.295 + continue; 59.296 + } 59.297 + 59.298 + ref0 = l1ref0[i8]; 59.299 + if(ref0 >= 0) 59.300 + ref0 = map_col_to_list0[0][ref0 ]; 59.301 + else{ 59.302 + ref0 = map_col_to_list0[1][l1ref1[i8]]; 59.303 + l1mv= l1mv1; 59.304 + } 59.305 + scale = dist_scale_factor[ref0]; 59.306 + 59.307 + fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1); 59.308 + if(IS_SUB_8X8(sub_mb_type)){ 59.309 +// const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; 59.310 + const int16_t *mv_col = l1mv[x8*3 + y8*3*4]; 59.311 + int mx = (scale * mv_col[0] + 128) >> 8; 59.312 + int my = (scale * mv_col[1] + 128) >> 8; 59.313 + fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4); 59.314 + fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4); 59.315 + }else 59.316 + for(i4=0; i4<4; i4++){ 59.317 +// const 
int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; 59.318 + const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4]; 59.319 + int16_t *mv_l0 = m->mv_cache[0][scan8[i8*4+i4]]; 59.320 + mv_l0[0] = (scale * mv_col[0] + 128) >> 8; 59.321 + mv_l0[1] = (scale * mv_col[1] + 128) >> 8; 59.322 + AV_WN32A(m->mv_cache[1][scan8[i8*4+i4]], 59.323 + pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1])); 59.324 + } 59.325 + } 59.326 + } 59.327 +} 59.328 + 59.329 +void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){ 59.330 + if(s->direct_spatial_mv_pred){ 59.331 + pred_spatial_direct_motion(hc, s, mb_type); 59.332 + }else{ 59.333 + pred_temp_direct_motion(hc, s, mb_type); 59.334 + } 59.335 +}
60.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 60.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.h Mon Aug 27 12:09:56 2012 +0200 60.3 @@ -0,0 +1,8 @@ 60.4 +#ifndef H264_DIRECT_H 60.5 +#define H264_DIRECT_H 60.6 + 60.7 +#include "h264_types_spu.h" 60.8 + 60.9 +void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type); 60.10 + 60.11 +#endif
61.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 61.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.c Mon Aug 27 12:09:56 2012 +0200 61.3 @@ -0,0 +1,74 @@ 61.4 +#include <spu_mfcio.h> 61.5 +#include "h264_dma.h" 61.6 + 61.7 +DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]); 61.8 +dma_list_elem_t* put_list; 61.9 + 61.10 +DECLARE_ALIGNED_16(dma_list_elem_t, get_list_buf[16*(4+5 + 2*3)]); 61.11 +dma_list_elem_t* get_list; 61.12 + 61.13 +inline void spu_dma_get(void *ls, unsigned ea, int size, int tag){ 61.14 + mfc_get(ls, ea, size, tag, 0, 0); 61.15 +} 61.16 + 61.17 +inline void spu_dma_put(void *ls, unsigned ea, int size, int tag){ 61.18 + mfc_put(ls, ea, size, tag, 0, 0); 61.19 +} 61.20 + 61.21 +inline void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag){ 61.22 + mfc_putb(ls, ea, size, tag, 0, 0); 61.23 +} 61.24 + 61.25 +// Function that wait to finish a DMA transfer with especific id 61.26 +inline void wait_dma_id(int id){ 61.27 + spu_writech(MFC_WrTagMask, 1<< id); 61.28 + (void)spu_mfcstat(MFC_TAG_UPDATE_ALL); 61.29 +} 61.30 + 61.31 +// Functions to get/put a block from/to main memory 61.32 +void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier) 61.33 +{ 61.34 + unsigned int i = 0; 61.35 + unsigned int listsize; 61.36 + unsigned int ea_low; 61.37 + 61.38 + dma_list_elem_t* list = get_list; 61.39 + get_list+=h; 61.40 + 61.41 + ea_low=(uint32_t) mfc_ea2l(ea); 61.42 + 61.43 + /* Create the list, size of each list id the "width" parameter defined by the user */ 61.44 + for ( i=0; i<h; i++ ){ 61.45 + list[i].size.all32 = w; 61.46 + list[i].ea_low = ea_low; 61.47 + ea_low += stride; 61.48 + } 61.49 + /* Specify the list size and initiate the list transfer */ 61.50 + listsize = h*sizeof(dma_list_elem_t); 61.51 + if (barrier) 61.52 + mfc_getlb(dst, (unsigned)ea, list, listsize, tag, 0, 0); 61.53 + else 61.54 + mfc_getl(dst, (unsigned)ea, list, listsize, tag, 0, 0); 61.55 
+} 61.56 + 61.57 + 61.58 +void put_dma_list(void *src, void* ea, unsigned int size, unsigned int h, unsigned int stride, unsigned int tag){ 61.59 + unsigned int i = 0; 61.60 + unsigned int listsize; 61.61 + unsigned int ea_low; 61.62 + 61.63 + dma_list_elem_t* list = put_list; 61.64 + put_list+=h; 61.65 + 61.66 + ea_low=(uint32_t) mfc_ea2l(ea); 61.67 + 61.68 + /* Create the list, size of each list id the "width" parameter defined by the user */ 61.69 + for ( i=0; i<h; i++ ) { 61.70 + list[i].size.all32 = size; 61.71 + list[i].ea_low = ea_low; 61.72 + ea_low += stride; 61.73 + } 61.74 + /* Specify the list size and initiate the list transfer */ 61.75 + listsize = h*sizeof(dma_list_elem_t); 61.76 + mfc_putl(src, (unsigned) ea, list, listsize, tag, 0, 0); 61.77 +}
62.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 62.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.h Mon Aug 27 12:09:56 2012 +0200 62.3 @@ -0,0 +1,59 @@ 62.4 +#ifndef H264_DMA_H 62.5 +#define H264_DMA_H 62.6 + 62.7 +#include "libavutil/mem.h" 62.8 + 62.9 +typedef struct dma_list_elem { 62.10 + union { 62.11 + unsigned int all32; 62.12 + struct { 62.13 + unsigned int stall : 1; 62.14 + unsigned int reserved : 15; 62.15 + unsigned int nbytes : 16; 62.16 + } bits; 62.17 + } size; 62.18 + uint64_t ea_low : 32; 62.19 +}dma_list_elem_t; 62.20 + 62.21 +extern DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]); 62.22 +extern dma_list_elem_t* put_list; 62.23 + 62.24 +extern DECLARE_ALIGNED_16(dma_list_elem_t, get_list_buf[16*(4+5 + 2*3)]); 62.25 +extern dma_list_elem_t* get_list; 62.26 + 62.27 +enum{ 62.28 + MBD_slice=1, 62.29 + MBD_buf1, 62.30 + MBD_buf2, 62.31 + MBD_buf3, 62.32 + MBD_put, 62.33 + MBD_pic, 62.34 + MBD_mc_buf1, 62.35 + MBD_mc_buf2 62.36 +}; 62.37 + 62.38 +enum{ 62.39 + ED_spe=1, 62.40 + ED_slice, 62.41 + ED_raw, 62.42 + ED_get, 62.43 + ED_get2, 62.44 + ED_get_mv, 62.45 + ED_put, 62.46 + ED_putmb0, 62.47 + ED_putmb1, 62.48 +}; 62.49 + 62.50 +// Functions to get/put a block from/to main memory 62.51 +void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier); 62.52 +void put_dma_list(void *src, void* ea, unsigned int size, unsigned int h, unsigned int stride, unsigned int tag); 62.53 + 62.54 +//Functions to do a dma transfer for 32-bit 62.55 +void spu_dma_get(void *ls, unsigned ea, int size, int tag); 62.56 +void spu_dma_put(void *ls, unsigned ea, int size, int tag); 62.57 +void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag); 62.58 + 62.59 +// Function that wait to finish a DMA transfer with especific id 62.60 +void wait_dma_id(int id); 62.61 + 62.62 +#endif
63.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 63.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_filter_spu_vec.c Mon Aug 27 12:09:56 2012 +0200 63.3 @@ -0,0 +1,650 @@ 63.4 +/* 63.5 + * Copyright (c) 2009 TUDelft 63.6 + * 63.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 63.8 + */ 63.9 + 63.10 +/** 63.11 + * @file libavcodec/cell/spu/h264_main_spu.c 63.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding 63.13 + * @author C C Chi <c.c.chi@student.tudelft.nl> 63.14 + * 63.15 + * SIMD kernels 63.16 + * H.264/AVC motion compensation 63.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu> 63.18 + * @author Albert Paradis <apar7632@hotmail.com> 63.19 + */ 63.20 + 63.21 + 63.22 +#include <stdio.h> 63.23 +#include <spu_mfcio.h> 63.24 +#include <spu_intrinsics.h> 63.25 + 63.26 +#include "h264_filter_spu.h" 63.27 +#include "h264_decode_mb_spu.h" 63.28 +// To use scan8 table 63.29 +#include "h264_mc_spu.h" 63.30 + 63.31 + 63.32 +int get_chroma_qp(H264Context_spu *h, int t, int qscale){ 63.33 + return h->slice.chroma_qp_table[t][qscale]; 63.34 +} 63.35 + 63.36 +static inline int clip(int a, int amin, int amax){ 63.37 + if (a < amin) 63.38 + return amin; 63.39 + else if (a > amax) 63.40 + return amax; 63.41 + else 63.42 + return a; 63.43 +} 63.44 + 63.45 +static inline vsint16_t clip_altivec(vsint16_t a, vsint16_t amin, vsint16_t amax){ 63.46 + vector unsigned short min_mask,max_mask; 63.47 + min_mask = spu_cmpgt(amin, a); 63.48 + max_mask = spu_cmpgt(a, amax); 63.49 + 63.50 + return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask); 63.51 +} 63.52 + 63.53 +static inline vsint16_t clip_uint8_altivec(vsint16_t a){ 63.54 + const vsint16_t amax = {255,255,255,255,255,255,255,255}; 63.55 + const vsint16_t amin = {0, 0, 0, 0, 0, 0, 0, 0}; 63.56 + vector unsigned short min_mask,max_mask; 63.57 + min_mask = spu_cmpgt(amin, a); 63.58 + max_mask = spu_cmpgt(a, amax); 63.59 + 63.60 + return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask); 63.61 +} 63.62 + 63.63 +static inline 
void h264_loop_filter_chroma(vsint16_t *pix, int alpha, int beta, int8_t *tc0){ 63.64 + 63.65 + short a = (short) tc0[0]; 63.66 + short b = (short) tc0[1]; 63.67 + short c = (short) tc0[2]; 63.68 + short d = (short) tc0[3]; 63.69 + const vsint16_t vec_tc0 = {a,a,b,b,c,c,d,d}; 63.70 + const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0}; 63.71 + vector unsigned short mask_B0; 63.72 + 63.73 + mask_B0 = spu_cmpgt(vec_v0, vec_tc0); 63.74 + 63.75 + const vsint16_t p0 = pix[-1]; 63.76 + const vsint16_t p1 = pix[-2]; 63.77 + const vsint16_t q0 = pix[0]; 63.78 + const vsint16_t q1 = pix[1]; 63.79 + 63.80 + const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha}; 63.81 + const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta}; 63.82 + const vsint16_t v_2 = {2,2,2,2,2,2,2,2}; 63.83 + const vuint16_t v_3 = {3,3,3,3,3,3,3,3}; 63.84 + const vsint16_t v_4 = {4,4,4,4,4,4,4,4}; 63.85 + 63.86 + vsint16_t rp0; 63.87 + vsint16_t rq0; 63.88 + vsint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0; 63.89 + vector unsigned short mask_B1, mask_tmp; 63.90 + vsint16_t i_delta; 63.91 + 63.92 + abs_p0mq0 = (vector signed short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); 63.93 + abs_p1mp0 = (vector signed short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); 63.94 + abs_q1mq0 = (vector signed short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); 63.95 + 63.96 + mask_B1 = spu_cmpgt(v_alpha, abs_p0mq0); 63.97 + mask_tmp = spu_cmpgt(v_beta, abs_p1mp0); 63.98 + mask_B1 = spu_and(mask_B1, mask_tmp); 63.99 + mask_tmp = spu_cmpgt( v_beta, abs_q1mq0); 63.100 + mask_B1 = spu_and(mask_B1, mask_tmp); 63.101 + 63.102 + 63.103 + i_delta = clip_altivec(spu_rlmaska(spu_add(spu_sl(spu_sub(q0,p0 ), 
(vuint16_t)v_2), spu_add(spu_sub(p1,q1),v_4)), (vsint16_t)-v_3), -vec_tc0, vec_tc0); 63.104 + 63.105 + rp0 = clip_uint8_altivec( spu_add(p0,i_delta)); 63.106 + rq0 = clip_uint8_altivec( spu_sub(q0,i_delta)); 63.107 + 63.108 + pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0); 63.109 + pix[0] = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0); 63.110 +} 63.111 + 63.112 +static void h264_v_loop_filter_luma_c(vsint16_t *pix, int alpha, int beta, int8_t *tc0, int inc_low2high){ 63.113 + 63.114 + short a = (short) tc0[0 + inc_low2high]; 63.115 + short b = (short) tc0[1 + inc_low2high]; 63.116 + const vsint16_t vec_tc0 = {a,a,a,a,b,b,b,b}; 63.117 + const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0}; 63.118 + vector unsigned short mask_B0; 63.119 + 63.120 + mask_B0 = spu_cmpgt(vec_v0, vec_tc0); 63.121 + const vsint16_t p0 = pix[-1]; 63.122 + const vsint16_t p1 = pix[-2]; 63.123 + const vsint16_t p2 = pix[-3]; 63.124 + const vsint16_t q0 = pix[0]; 63.125 + const vsint16_t q1 = pix[1]; 63.126 + const vsint16_t q2 = pix[2]; 63.127 + 63.128 + const vuint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha}; 63.129 + const vuint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta}; 63.130 + 63.131 + const vuint16_t v_1 = {1,1,1,1,1,1,1,1}; 63.132 + const vuint16_t v_2 = {2,2,2,2,2,2,2,2}; 63.133 + const vuint16_t v_3 = {3,3,3,3,3,3,3,3}; 63.134 + const vsint16_t v_4 = {4,4,4,4,4,4,4,4}; 63.135 + 63.136 + vsint16_t rp0, rp1; 63.137 + vsint16_t rq0, rq1; 63.138 + vsint16_t tc0_B2P, tc0_B2Q, rtc0; 63.139 + vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0; 63.140 + vector unsigned short mask_B1, mask_B2P, mask_B2Q, mask_tmp; 63.141 + vsint16_t i_delta, i_delta2; 63.142 + 63.143 + abs_p0mq0 = (vector unsigned 
short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); 63.144 + abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); 63.145 + abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); 63.146 + abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0); 63.147 + abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0); 63.148 + 63.149 + mask_B1 = spu_cmpgt(v_alpha, abs_p0mq0); 63.150 + mask_tmp = spu_cmpgt(v_beta, abs_p1mp0); 63.151 + mask_B1 = spu_and(mask_B1, mask_tmp); 63.152 + mask_tmp = spu_cmpgt( v_beta, abs_q1mq0); 63.153 + mask_B1 = spu_and(mask_B1, mask_tmp); 63.154 + 63.155 + mask_B2P = spu_cmpgt(v_beta, abs_p2mp0); 63.156 + mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0); 63.157 + 63.158 + rp1 = spu_add(p1, clip_altivec(spu_sub(spu_rlmaska(spu_add(p2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), p1), -vec_tc0, vec_tc0 )); 63.159 + rq1 = spu_add(q1, clip_altivec(spu_sub(spu_rlmaska(spu_add(q2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), q1), -vec_tc0, vec_tc0 )); 63.160 + 63.161 + tc0_B2P = spu_add(vec_tc0, (vsint16_t) v_1); 63.162 + tc0_B2P = spu_sel(vec_tc0, tc0_B2P, mask_B2P); 63.163 + 63.164 + tc0_B2Q = spu_add(tc0_B2P, (vsint16_t) v_1); 63.165 + rtc0 = spu_sel(tc0_B2P, tc0_B2Q, mask_B2Q); 63.166 + i_delta2 = spu_add(spu_sub(p1,q1),v_4); 63.167 + i_delta = spu_sl(spu_sub(q0,p0 ), v_2); 63.168 + i_delta = spu_add(i_delta,i_delta2 ); 63.169 + i_delta = spu_rlmaska(i_delta, (vsint16_t)-v_3); 63.170 + i_delta = clip_altivec(i_delta, -rtc0, rtc0); 63.171 + 63.172 + rp0 = clip_uint8_altivec( spu_add(p0,i_delta)); /* p0' */ 63.173 + rq0 = clip_uint8_altivec( spu_sub(q0,i_delta)); /* q0' */ 63.174 + 63.175 + pix[-2] = spu_sel(spu_sel(p1,spu_sel(p1,rp1,mask_B2P) ,mask_B1), 
p1,mask_B0); 63.176 + pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0); 63.177 + pix[0] = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0); 63.178 + pix[1] = spu_sel(spu_sel(q1,spu_sel(q1,rq1,mask_B2Q) ,mask_B1), q1,mask_B0); 63.179 +} 63.180 + 63.181 + 63.182 + 63.183 +static inline void h264_loop_filter_chroma_intra(vsint16_t *pix, int alpha, int beta){ 63.184 + 63.185 + const vuint16_t p0 = (vuint16_t) pix[-1]; 63.186 + const vuint16_t p1 = (vuint16_t) pix[-2]; 63.187 + const vuint16_t q0 = (vuint16_t) pix[0]; 63.188 + const vuint16_t q1 = (vuint16_t) pix[1]; 63.189 + 63.190 + const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha}; 63.191 + const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta}; 63.192 + const vuint16_t v_2 = {2,2,2,2,2,2,2,2}; 63.193 + 63.194 + vuint16_t rp0; 63.195 + vuint16_t rq0; 63.196 + vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0; 63.197 + vector unsigned short mask_B0, mask_tmp; 63.198 + 63.199 + abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); 63.200 + abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); 63.201 + abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); 63.202 + 63.203 + mask_B0 = spu_cmpgt(v_alpha, (vsint16_t)abs_p0mq0); 63.204 + mask_tmp = spu_cmpgt(v_beta, (vsint16_t)abs_p1mp0); 63.205 + mask_B0 = spu_and(mask_B0, mask_tmp); 63.206 + mask_tmp = spu_cmpgt( v_beta, (vsint16_t)abs_q1mq0); 63.207 + mask_B0 = spu_and(mask_B0, mask_tmp); 63.208 + 63.209 + rp0 = spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2; 63.210 + rp0 = spu_rlmaska(rp0, (vsint16_t)-v_2); 63.211 + rq0 = 
spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2; 63.212 + rq0 = spu_rlmaska(rq0, (vsint16_t)-v_2); 63.213 + 63.214 + pix[-1] = (vsint16_t) spu_sel(p0, rp0, mask_B0); 63.215 + pix[0] = (vsint16_t) spu_sel(q0, rq0, mask_B0); 63.216 +} 63.217 +int slice_alpha_c0_offset; 63.218 +int slice_beta_offset; 63.219 +static void filter_mb_edgecv(vsint16_t *pix, int bS[4], int qp ) { 63.220 + int i; 63.221 + const int index_a = qp + slice_alpha_c0_offset; 63.222 + const int alpha = (alpha_table+52)[index_a]; 63.223 + const int beta = (beta_table+52)[qp + slice_beta_offset]; 63.224 + 63.225 + if( bS[0] < 4 ) { 63.226 + int8_t tc[4]; 63.227 + for(i=0; i<4; i++) 63.228 + tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0; 63.229 + h264_loop_filter_chroma(pix, alpha, beta, tc); 63.230 + } else { 63.231 + h264_loop_filter_chroma_intra(pix, alpha, beta); 63.232 + } 63.233 +} 63.234 + 63.235 +static void filter_mb_edgeh(vsint16_t *pix, int bS[4], int qp, int inc_low2high ) { 63.236 + int i; 63.237 + const int index_a = qp + slice_alpha_c0_offset; 63.238 + const int alpha = (alpha_table+52)[index_a]; 63.239 + const int beta = (beta_table+52)[qp + slice_beta_offset]; 63.240 + 63.241 + if( bS[0] < 4 ) { 63.242 + int8_t tc[4]; 63.243 + for(i=0; i<4; i++) 63.244 + tc[i] = bS[i] ? 
tc0_table[index_a][bS[i] - 1] : -1; 63.245 + h264_v_loop_filter_luma_c(pix, alpha, beta, tc, inc_low2high); 63.246 + } else { 63.247 + 63.248 + const vuint16_t p0 = (vuint16_t) pix[-1]; 63.249 + const vuint16_t p1 = (vuint16_t) pix[-2]; 63.250 + const vuint16_t p2 = (vuint16_t) pix[-3]; 63.251 + const vuint16_t p3 = (vuint16_t) pix[-4]; 63.252 + const vuint16_t q0 = (vuint16_t) pix[0]; 63.253 + const vuint16_t q1 = (vuint16_t) pix[1]; 63.254 + const vuint16_t q2 = (vuint16_t) pix[2]; 63.255 + const vuint16_t q3 = (vuint16_t) pix[3]; 63.256 + 63.257 + const vuint16_t v_alpha = {(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha}; 63.258 + const vuint16_t v_beta = {(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta}; 63.259 + const vuint16_t v_2 = {2,2,2,2,2,2,2,2}; 63.260 + const vuint16_t v_3 = {3,3,3,3,3,3,3,3}; 63.261 + const vsint16_t v_4 = {4,4,4,4,4,4,4,4}; 63.262 + 63.263 + vuint16_t rp0_B1f, rp0_B2t, rp0_B2f, rp1_B2t, rp2_B2t; 63.264 + vuint16_t rq0_B1f, rq0_B2t, rq0_B2f, rq1_B2t, rq2_B2t; 63.265 + vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0; 63.266 + vuint16_t v_alpha_2 = spu_rlmaska(v_alpha, (vsint16_t)-v_2); 63.267 + vector unsigned short mask_B0, mask_B1, mask_B2P, mask_B2Q, mask_tmp; 63.268 + 63.269 + v_alpha_2 = spu_add(v_alpha_2, v_2); 63.270 + 63.271 + abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0); 63.272 + abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0); 63.273 + abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0); 63.274 + abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0); 63.275 + 
abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0); 63.276 + 63.277 + mask_B0 = spu_cmpgt(v_alpha, abs_p0mq0); 63.278 + mask_tmp = spu_cmpgt(v_beta, abs_p1mp0); 63.279 + mask_B0 = spu_and(mask_B0, mask_tmp); 63.280 + mask_tmp = spu_cmpgt( v_beta, abs_q1mq0); 63.281 + mask_B0 = spu_and(mask_B0, mask_tmp); 63.282 + 63.283 + mask_B1 = spu_cmpgt(v_alpha_2, abs_p0mq0); 63.284 + mask_B2P = spu_cmpgt(v_beta,abs_p2mp0); 63.285 + mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0); 63.286 + 63.287 + rp0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p2,p1),spu_add(p1,p0)),spu_add(spu_add(p0,q0),spu_add(q0,q1))),(vuint16_t)v_4),(vsint16_t) -v_3); 63.288 + //( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; 63.289 + rp1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p2,p1),spu_add(q0,p0)),v_2),(vsint16_t)-v_2);//( p2 + p1 + p0 + q0 + 2 ) >> 2; 63.290 + rp2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p3,p3),spu_add(p2,p2)),spu_add(spu_add(p2,p1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3); 63.291 + //( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; 63.292 + rq0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p1,p0),spu_add(p0,q0)),spu_add(spu_add(q0,q1),spu_add(q1,q2))),(vuint16_t)v_4),(vsint16_t)-v_3); 63.293 + 63.294 + //( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; 63.295 + rq1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p0,q0),spu_add(q1,q2)),v_2),(vsint16_t)-v_2);//( p0 + q0 + q1 + q2 + 2 ) >> 2; 63.296 + rq2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(q3,q3),spu_add(q2,q2)),spu_add(spu_add(q2,q1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3); 63.297 + //( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; 63.298 + rp0_B1f = 63.299 + rp0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2),(vsint16_t)-v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2; 63.300 + rq0_B1f = 63.301 + rq0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2),(vsint16_t)-v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2; 63.302 + 63.303 + pix[-1] = (vsint16_t) 
spu_sel(p0, spu_sel(rp0_B1f, spu_sel(rp0_B2f, rp0_B2t, mask_B2P), mask_B1), mask_B0); 63.304 + pix[-2] = (vsint16_t) spu_sel(p1, spu_sel(p1, spu_sel(p1, rp1_B2t, mask_B2P), mask_B1), mask_B0); 63.305 + pix[-3] = (vsint16_t) spu_sel(p2, spu_sel(p2, spu_sel(p2, rp2_B2t, mask_B2P), mask_B1), mask_B0); 63.306 + pix[0] = (vsint16_t) spu_sel(q0, spu_sel(rq0_B1f, spu_sel(rq0_B2f, rq0_B2t, mask_B2Q), mask_B1), mask_B0); 63.307 + pix[1] = (vsint16_t) spu_sel(q1, spu_sel(q1, spu_sel(q1, rq1_B2t,mask_B2Q), mask_B1), mask_B0); 63.308 + pix[2] = (vsint16_t) spu_sel(q2, spu_sel(q2, spu_sel(q2, rq2_B2t,mask_B2Q), mask_B1), mask_B0); 63.309 + } 63.310 +} 63.311 + 63.312 +// This function gets bS and qp for luma and chroma before the filter 63.313 +void calculate_bS_qp(H264Context_spu *h){ 63.314 + H264mb* mb = &h->mb; 63.315 + H264slice* slice = h->slice; 63.316 + int dir; 63.317 + const int mvy_limit = 4; 63.318 + /* FIXME: A given frame may occupy more than one position in 63.319 + * the reference list. So ref2frm should be populated with 63.320 + * frame numbers, not indices. */ 63.321 + 63.322 + int (*ref2frm)[64] = slice->ref2frm; 63.323 + int mb_x = mb->mb_x; 63.324 + int mb_y = mb->mb_y; 63.325 + int mb_type =mb->mb_type; 63.326 + /* dir : 0 -> vertical edge, 1 -> horizontal edge */ 63.327 + for( dir = 0; dir < 2; dir++ ){ 63.328 + int edge; 63.329 + const int mbm_type = dir == 0 ? mb->mb_type_xy_n1 : mb->mb_type_top; 63.330 + const int8_t qscale_mbm = dir == 0 ? mb->qscale_mbxy_n1 : mb->qscale_mbxy_top; 63.331 + 63.332 + // how often to recheck mv-based bS when iterating between edges 63.333 + const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :(mb_type & (MB_TYPE_8x16 >> dir)) ? 
1 : 0; 63.334 + // how often to recheck mv-based bS when iterating along each edge 63.335 + const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); 63.336 + 63.337 + h->edges[dir] = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4; 63.338 + 63.339 + if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0)) 63.340 + h->start[dir] =1; 63.341 + else 63.342 + h->start[dir] =0; 63.343 + 63.344 + /* Calculate bS */ 63.345 + for( edge = h->start[dir]; edge < h->edges[dir]; edge++ ) { 63.346 + /* mbn_xy: neighbor macroblock */ 63.347 + const int mbn_type = edge > 0 ? mb_type : mbm_type; 63.348 + const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm; 63.349 + int* bS = h->bS[dir][edge]; 63.350 + 63.351 + if( (edge&1) && IS_8x8DCT(mb_type) ){ 63.352 + bS[0] = bS[1] = bS[2] = bS[3] = 0; //extra code due to decoupling 63.353 + continue; 63.354 + } 63.355 + if( IS_INTRA(mb_type) || 63.356 + IS_INTRA(mbn_type) ) { 63.357 + int value; 63.358 + if (edge == 0) { 63.359 + value = 4; 63.360 + } else { 63.361 + value = 3; 63.362 + } 63.363 + bS[0] = bS[1] = bS[2] = bS[3] = value; 63.364 + } else { 63.365 + int i, l; 63.366 + int mv_done; 63.367 + 63.368 + if( edge & mask_edge ) { 63.369 + bS[0] = bS[1] = bS[2] = bS[3] = 0; 63.370 + mv_done = 1; 63.371 + } 63.372 + else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { 63.373 + int b_idx= 8 + 4 + edge * (dir ? 8:1); 63.374 + int bn_idx= b_idx - (dir ? 
8:1); 63.375 + int v = 0; 63.376 + 63.377 + for( l = 0; !v && l < 1 + (slice->slice_type_nos == FF_B_TYPE); l++ ) { 63.378 + v |= ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] || 63.379 + FFABS(mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || 63.380 + FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit; 63.381 + } 63.382 + bS[0] = bS[1] = bS[2] = bS[3] = v; 63.383 + 63.384 + mv_done = 1; 63.385 + } 63.386 + else 63.387 + mv_done = 0; 63.388 + 63.389 + for( i = 0; i < 4; i++ ) { 63.390 + int x = dir == 0 ? edge : i; 63.391 + int y = dir == 0 ? i : edge; 63.392 + int b_idx= 8 + 4 + x + 8*y; 63.393 + int bn_idx= b_idx - (dir ? 8:1); 63.394 + 63.395 + if( mb->non_zero_count_cache[b_idx] != 0 || 63.396 + mb->non_zero_count_cache[bn_idx] != 0 ) { 63.397 + bS[i] = 2; 63.398 + } 63.399 + else if(!mv_done) 63.400 + { 63.401 + bS[i] = 0; 63.402 + for( l = 0; l < 1 + (slice->slice_type == B_TYPE); l++ ) { 63.403 + if( ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] || 63.404 + FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 || 63.405 + FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) { 63.406 + bS[i] = 1; 63.407 + break; 63.408 + } 63.409 + } 63.410 + } 63.411 + } 63.412 + 63.413 + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) 63.414 + continue; 63.415 + } 63.416 + 63.417 + /* Filter edge */ 63.418 + // Do not use s->qscale as luma quantizer because it has not the same 63.419 + // value in IPCM macroblocks. 
63.420 + h->qp[dir][edge] = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1; 63.421 + h->chroma_qp[0][dir][edge] = ( mb->chroma_qp[0] + get_chroma_qp(h, 0, qscale_mbn_xy ) + 1 ) >> 1; 63.422 + 63.423 + h->chroma_qp[1][dir][edge] = ( mb->chroma_qp[1] + get_chroma_qp(h, 1, qscale_mbn_xy ) + 1 ) >> 1; 63.424 + } 63.425 + slice_alpha_c0_offset=slice->slice_alpha_c0_offset; 63.426 + slice_beta_offset= slice->slice_beta_offset; 63.427 + } 63.428 +} 63.429 + 63.430 + 63.431 +#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7,merge_h,merge_l) \ 63.432 + b0 = spu_shuffle( a0, a4, merge_h); \ 63.433 + b1 = spu_shuffle( a0, a4, merge_l ); \ 63.434 + b2 = spu_shuffle( a1, a5, merge_h ); \ 63.435 + b3 = spu_shuffle( a1, a5, merge_l ); \ 63.436 + b4 = spu_shuffle( a2, a6, merge_h ); \ 63.437 + b5 = spu_shuffle( a2, a6, merge_l ); \ 63.438 + b6 = spu_shuffle( a3, a7, merge_h ); \ 63.439 + b7 = spu_shuffle( a3, a7, merge_l ); \ 63.440 + a0 = spu_shuffle( b0, b4, merge_h ); \ 63.441 + a1 = spu_shuffle( b0, b4, merge_l ); \ 63.442 + a2 = spu_shuffle( b1, b5, merge_h ); \ 63.443 + a3 = spu_shuffle( b1, b5, merge_l ); \ 63.444 + a4 = spu_shuffle( b2, b6, merge_h ); \ 63.445 + a5 = spu_shuffle( b2, b6, merge_l); \ 63.446 + a6 = spu_shuffle( b3, b7, merge_h ); \ 63.447 + a7 = spu_shuffle( b3, b7, merge_l ); \ 63.448 + b0 = spu_shuffle( a0, a4, merge_h ); \ 63.449 + b1 = spu_shuffle( a0, a4, merge_l ); \ 63.450 + b2 = spu_shuffle( a1, a5, merge_h ); \ 63.451 + b3 = spu_shuffle( a1, a5, merge_l); \ 63.452 + b4 = spu_shuffle( a2, a6, merge_h ); \ 63.453 + b5 = spu_shuffle( a2, a6, merge_l ); \ 63.454 + b6 = spu_shuffle( a3, a7, merge_h ); \ 63.455 + b7 = spu_shuffle( a3, a7, merge_l ) 63.456 + 63.457 +void filter_mb_spu(vsint16_t *img_y, vsint16_t *img_cb, vsint16_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int edges[2], int bS[2][4][4], int qp[2][4], int chroma_qp[2][2][4], int start[2]){ 63.458 + 63.459 + int dir,x; 63.460 + vsint16_t 
o_vec_img_y[(16+8)*2]; 63.461 + vsint16_t t_vec_img_y[(16+8)*2]; 63.462 + vsint16_t *vec_img_y_o = o_vec_img_y; 63.463 + vsint16_t *vec_img_y_t = t_vec_img_y; 63.464 + 63.465 + vsint16_t o_vec_img_cb[8+8+4]; 63.466 + vsint16_t t_vec_img_cb[8+8]; 63.467 + vsint16_t *vec_img_cb_o = &o_vec_img_cb[2]; 63.468 + vsint16_t *vec_img_cb_t = t_vec_img_cb; 63.469 + 63.470 + vsint16_t o_vec_img_cr[8+8+4]; 63.471 + vsint16_t t_vec_img_cr[8+8]; 63.472 + vsint16_t *vec_img_cr_o = &o_vec_img_cr[2]; 63.473 + vsint16_t *vec_img_cr_t = t_vec_img_cr; 63.474 + 63.475 + vuint8_t *pvec_tmp; 63.476 + 63.477 + const vuint8_t patt_high = {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}; 63.478 + const vuint8_t patt_low = {16, 8, 17, 9, 18, 10, 19, 11, 20, 12, 21, 13, 22, 14, 23, 15}; 63.479 + const vuint8_t patt_unpack={ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; 63.480 + const vuint8_t patt_pack_hw={0, 1, 2, 3, 4, 5, 6, 7, 17, 19, 21, 23, 25, 27, 29, 31}; 63.481 + const vuint8_t patt_pack_chroma_aligned={0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F, 63.482 + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F}; 63.483 + const vuint8_t patt_pack_chroma_unaligned={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 63.484 + 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F}; 63.485 + const vuint8_t v_0 = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; 63.486 + const vuint8_t mergehu16 = {0x00,0x01,0x10,0x11,0x02,0x03,0x12,0x13,0x04,0x05,0x14,0x15,0x06,0x07,0x16,0x17}; 63.487 + const vuint8_t mergelu16 = {0x08,0x09,0x18,0x19,0x0A,0x0B,0x1A,0x1B,0x0C,0x0D,0x1C,0x1D,0x0E,0x0F,0x1E,0x1F}; 63.488 + vuint8_t store_chroma, store_chroma_n1, load_chroma, load_chroma_n1; 63.489 + int mb_xy_n1; 63.490 + const int unalign_chroma = (unsigned int) img_cb & 15; 63.491 + 63.492 + if(unalign_chroma==0){ 63.493 + load_chroma = patt_high; 63.494 + load_chroma_n1 = patt_low; // for load chroma mb_x-1 63.495 + store_chroma = patt_pack_chroma_aligned; 63.496 + store_chroma_n1 = patt_pack_chroma_unaligned; // 
for store chroma mb_x-1 63.497 + mb_xy_n1 = 1; // si no hay desalineamineto se necesita el bloque anterior para filtrar horizontalmente 63.498 + } 63.499 + else{ 63.500 + load_chroma = patt_low; 63.501 + load_chroma_n1 = patt_high; // for load mb_x-1 63.502 + store_chroma = patt_pack_chroma_unaligned; 63.503 + store_chroma_n1 = patt_pack_chroma_aligned; // for store chroma mb_x-1 63.504 + mb_xy_n1 = 0; // si hay desalineamineto 8 no se necesita el bloque anterior 63.505 + } 63.506 + 63.507 + /* dir : 0 -> vertical edge, 1 -> horizontal edge */ 63.508 + 63.509 + // LOAD MB_X -1 63.510 + 63.511 + for (x = 0; x < 16; x++){ //Unpack Memory to 8 positions vector 63.512 + vec_img_y_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize - 1], v_0 , patt_low); 63.513 + } 63.514 + 63.515 + for (x = 0; x < 8; x++){ //Unpack Memory to 8 positions vector 63.516 + vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cb[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1); 63.517 + vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cr[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1); 63.518 + } 63.519 + 63.520 + VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16); 63.521 + 63.522 + VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16); 63.523 + 63.524 + VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], 
vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16); 63.525 + 63.526 + VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16); 63.527 + 63.528 + vec_img_y_t = &vec_img_y_t[8]; 63.529 + vec_img_y_o = &vec_img_y_o[8]; 63.530 + vec_img_cb_t = &vec_img_cb_t[8]; 63.531 + vec_img_cb_o = &vec_img_cb_o[10]; 63.532 + vec_img_cr_t = &vec_img_cr_t[8]; 63.533 + vec_img_cr_o = &vec_img_cr_o[10]; 63.534 + 63.535 + //LOAD CURRENT MB 63.536 + for (x = 0; x < 16; x++){ //Unpack Memory to 8 positions vector 63.537 + pvec_tmp = (vuint8_t *) &img_y[x*linesize]; 63.538 + vec_img_y_o[x] = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_high); 63.539 + vec_img_y_o[x+24] = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_low); 63.540 + } 63.541 + 63.542 + for (x = 0; x < 8; x++){ //Unpack Memory to 8 positions vector 63.543 + vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma); 63.544 + vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma); 63.545 + } 63.546 + 63.547 + //TRANSPOSE MATRIX 63.548 + 63.549 + VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16); 63.550 + 63.551 + VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16); 63.552 + 63.553 + 
VEC_TRANSPOSE_8(vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31], vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15],mergehu16, mergelu16); 63.554 + 63.555 + VEC_TRANSPOSE_8(vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39], vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39],mergehu16, mergelu16); 63.556 + 63.557 + VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16); 63.558 + 63.559 + VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16); 63.560 + 63.561 + //PROCESS 63.562 + dir = 0; 63.563 + { 63.564 + int edge; 63.565 + for( edge = start[dir]; edge < edges[dir]; edge++ ) { 63.566 + if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0) 63.567 + { 63.568 + filter_mb_edgeh( &vec_img_y_t[4*edge ], bS[dir][edge], qp[dir][edge],0);//low 63.569 + filter_mb_edgeh( &vec_img_y_t[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high 63.570 + 63.571 + if( (edge&1) == 0 ) { 63.572 + filter_mb_edgecv( &vec_img_cb_t[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] ); 63.573 + filter_mb_edgecv( &vec_img_cr_t[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] ); 63.574 + } 63.575 + } 63.576 + } 63.577 + } 63.578 + 63.579 + //SAVE 
MB_X -1 RESULTS 63.580 + 63.581 + VEC_TRANSPOSE_8(vec_img_y_t[-8], vec_img_y_t[-7], vec_img_y_t[-6], vec_img_y_t[-5], vec_img_y_t[-4], vec_img_y_t[-3], vec_img_y_t[-2], vec_img_y_t[-1], vec_img_y_o[-8], vec_img_y_o[-7], vec_img_y_o[-6], vec_img_y_o[-5], vec_img_y_o[-4], vec_img_y_o[-3], vec_img_y_o[-2], vec_img_y_o[-1],mergehu16, mergelu16); 63.582 + 63.583 + VEC_TRANSPOSE_8(vec_img_y_t[16], vec_img_y_t[17], vec_img_y_t[18], vec_img_y_t[19], vec_img_y_t[20], vec_img_y_t[21], vec_img_y_t[22], vec_img_y_t[23], vec_img_y_o[16], vec_img_y_o[17], vec_img_y_o[18], vec_img_y_o[19], vec_img_y_o[20], vec_img_y_o[21], vec_img_y_o[22], vec_img_y_o[23],mergehu16, mergelu16); 63.584 + 63.585 + VEC_TRANSPOSE_8(vec_img_cb_t[ -8], vec_img_cb_t[-7], vec_img_cb_t[-6], vec_img_cb_t[-5], vec_img_cb_t[-4], vec_img_cb_t[-3], vec_img_cb_t[-2], vec_img_cb_t[-1], vec_img_cb_o[-10], vec_img_cb_o[-9], vec_img_cb_o[-8], vec_img_cb_o[-7], vec_img_cb_o[-6], vec_img_cb_o[-5], vec_img_cb_o[-4], vec_img_cb_o[-3],mergehu16, mergelu16); 63.586 + 63.587 + VEC_TRANSPOSE_8(vec_img_cr_t[ -8], vec_img_cr_t[-7], vec_img_cr_t[-6], vec_img_cr_t[-5], vec_img_cr_t[-4], vec_img_cr_t[-3], vec_img_cr_t[-2], vec_img_cr_t[-1], vec_img_cr_o[-10], vec_img_cr_o[-9], vec_img_cr_o[-8], vec_img_cr_o[-7], vec_img_cr_o[-6], vec_img_cr_o[-5], vec_img_cr_o[-4], vec_img_cr_o[-3],mergehu16, mergelu16); 63.588 + 63.589 + for (x = 0; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory 63.590 + img_y[x*linesize - 1] = spu_shuffle(img_y[x*linesize - 1], vec_img_y_o[-8+x], patt_pack_hw); 63.591 + } 63.592 + 63.593 + for (x = 0; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory 63.594 + img_y[(x+8)*linesize - 1] = spu_shuffle(img_y[(x+8)*linesize - 1], vec_img_y_o[16+x], patt_pack_hw); 63.595 + } 63.596 + 63.597 + for (x = 0; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory 63.598 + 
img_cb[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cb[x*uvlinesize - mb_xy_n1], vec_img_cb_o[-10+x], store_chroma_n1); 63.599 + img_cr[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cr[x*uvlinesize - mb_xy_n1], vec_img_cr_o[-10+x], store_chroma_n1); 63.600 + } 63.601 + 63.602 + //TRANSPOSE MATRIX 63.603 + 63.604 + VEC_TRANSPOSE_8(vec_img_y_t[ 0], vec_img_y_t[ 1], vec_img_y_t[ 2], vec_img_y_t[ 3], vec_img_y_t[ 4], vec_img_y_t[ 5], vec_img_y_t[ 6], vec_img_y_t[ 7], vec_img_y_o[ 0], vec_img_y_o[ 1], vec_img_y_o[ 2], vec_img_y_o[ 3], vec_img_y_o[ 4], vec_img_y_o[ 5], vec_img_y_o[ 6], vec_img_y_o[ 7],mergehu16, mergelu16); 63.605 + 63.606 + VEC_TRANSPOSE_8(vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15], vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31],mergehu16, mergelu16); 63.607 + 63.608 + VEC_TRANSPOSE_8(vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31], vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15],mergehu16, mergelu16); 63.609 + 63.610 + VEC_TRANSPOSE_8(vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39], vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39],mergehu16, mergelu16); 63.611 + 63.612 + VEC_TRANSPOSE_8(vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7], vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7],mergehu16, mergelu16); 63.613 + 63.614 + VEC_TRANSPOSE_8(vec_img_cr_t[0], 
vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7], vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7],mergehu16, mergelu16); 63.615 + 63.616 + 63.617 + //LOAD MB_Y - 1 63.618 + for (x = -4; x < 0; x++){ //Unpack Memory to 8 positions vector 63.619 + vec_img_y_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_high); 63.620 + vec_img_y_o[x+24] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_low); 63.621 + } 63.622 + 63.623 + for (x = -2; x < 0; x++){ //Unpack Memory to 8 positions vector 63.624 + vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma); 63.625 + vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma); 63.626 + } 63.627 + 63.628 + //PROCESS 63.629 + dir = 1; 63.630 + { 63.631 + int edge; 63.632 + for( edge = start[dir]; edge < edges[dir]; edge++ ) { 63.633 + if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0) 63.634 + { 63.635 + filter_mb_edgeh( &vec_img_y_o[4*edge ], bS[dir][edge], qp[dir][edge],0);//low 63.636 + filter_mb_edgeh( &vec_img_y_o[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high 63.637 + if( (edge&1) == 0 ) { 63.638 + filter_mb_edgecv( &vec_img_cb_o[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] ); 63.639 + filter_mb_edgecv( &vec_img_cr_o[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] ); 63.640 + } 63.641 + } 63.642 + } 63.643 + 63.644 + for (x = -3; x < 16; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory 63.645 + img_y[x*linesize] = spu_shuffle(vec_img_y_o[x], vec_img_y_o[x+24], patt_unpack); 63.646 + } 63.647 + 63.648 + for (x = -1; x < 8; x++){ //pack Memory to 8 positions vector ERROR - No check for writing out of the memory 63.649 + img_cb[x*uvlinesize] = spu_shuffle(img_cb[x*uvlinesize], 
vec_img_cb_o[x], store_chroma); 63.650 + img_cr[x*uvlinesize] = spu_shuffle(img_cr[x*uvlinesize], vec_img_cr_o[x], store_chroma); 63.651 + } 63.652 + } 63.653 +}
64.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 64.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.c Mon Aug 27 12:09:56 2012 +0200 64.3 @@ -0,0 +1,408 @@ 64.4 +/* 64.5 + * Copyright (c) 2009 TUDelft 64.6 + * 64.7 + * Cell Parallel SPU - Macroblock Decoding. 64.8 + */ 64.9 + 64.10 +/** 64.11 + * @file libavcodec/cell/spu/h264_main_spu.c 64.12 + * Cell Parallel SPU - Macroblock Decoding 64.13 + * @author C C Chi <c.c.chi@student.tudelft.nl> 64.14 + * 64.15 + * SIMD kernels 64.16 + * H.264/AVC motion compensation 64.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu> 64.18 + * @author Albert Paradis <apar7632@hotmail.com> 64.19 + */ 64.20 + 64.21 +#include <spu_intrinsics.h> 64.22 +#include "types_spu.h" 64.23 +#include "h264_tables.h" 64.24 +#include "h264_idct_spu.h" 64.25 +#include "h264_intra_spu.h" 64.26 + 64.27 +/*********************************************************************** 64.28 + * ff_h264_idct_add_spu 64.29 + *********************************************************************** 64.30 + * h264 idct 4x4 transform with SPU SIMD intrinsics 64.31 + * using the factorized algorithm 64.32 + * Mauricio Alvarez: alvarez@ac.upc.edu 64.33 + * - DCTELEM* block: transformed coefficients are stored consecutvely in memory, 64.34 + * - for the 4x4 transform the structure is like that: 64.35 + * || coef_00 | coef_01 || coef_02 | coef_03 ||..||coef_0F|| 64.36 + * - Usually the DCTELEM block is declared with an alignment modificator in such a way 64.37 + * that the array is 128 bit (16 byte, 8 short) aligned. 64.38 + * - The dst pointer can be unaligned with unaligment as a multiple of 4. 
64.39 + ***********************************************************************/ 64.40 + 64.41 +// idct_dc 64.42 +void ff_idct_dc_add(uint8_t *dst, short *block, int stride){ 64.43 + int i, j; 64.44 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 64.45 + int dc = (block[0] + 32) >> 6; 64.46 + for( j = 0; j < 4; j++ ){ 64.47 + for( i = 0; i < 4; i++ ) 64.48 + dst[i] = cm[ dst[i] + dc ]; 64.49 + dst += stride; 64.50 + } 64.51 +} 64.52 + 64.53 +void ff_idct8_dc_add(uint8_t *dst, short *block, int stride){ 64.54 + int i, j; 64.55 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 64.56 + int dc = (block[0] + 32) >> 6; 64.57 + for( j = 0; j < 8; j++ ){ 64.58 + for( i = 0; i < 8; i++ ) 64.59 + dst[i] = cm[ dst[i] + dc ]; 64.60 + dst += stride; 64.61 + } 64.62 +} 64.63 + 64.64 +// add without idct 64.65 + 64.66 +void add_pixels8_c(uint8_t *pixels, short *block, int line_size) 64.67 +{ 64.68 + int i; 64.69 + for(i=0;i<8;i++) { 64.70 + pixels[0] += block[0]; 64.71 + pixels[1] += block[1]; 64.72 + pixels[2] += block[2]; 64.73 + pixels[3] += block[3]; 64.74 + pixels[4] += block[4]; 64.75 + pixels[5] += block[5]; 64.76 + pixels[6] += block[6]; 64.77 + pixels[7] += block[7]; 64.78 + pixels += line_size; 64.79 + block += 8; 64.80 + } 64.81 +} 64.82 + 64.83 +void add_pixels4_c(uint8_t *pixels, short *block, int line_size) 64.84 +{ 64.85 + int i; 64.86 + for(i=0;i<4;i++) { 64.87 + pixels[0] += block[0]; 64.88 + pixels[1] += block[1]; 64.89 + pixels[2] += block[2]; 64.90 + pixels[3] += block[3]; 64.91 + pixels += line_size; 64.92 + block += 4; 64.93 + } 64.94 +} 64.95 + 64.96 +void h264_luma_dc_dequant_idct_c(short *block, int qmul){ 64.97 + #define stride 16 64.98 + int i; 64.99 + int temp[16]; //FIXME check if this is a good idea 64.100 + static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; 64.101 + static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; 64.102 + 64.103 + for(i=0; i<4; i++){ 64.104 + const int offset= y_offset[i]; 64.105 + const int z0= 
block[offset+stride*0] + block[offset+stride*4]; 64.106 + const int z1= block[offset+stride*0] - block[offset+stride*4]; 64.107 + const int z2= block[offset+stride*1] - block[offset+stride*5]; 64.108 + const int z3= block[offset+stride*1] + block[offset+stride*5]; 64.109 + 64.110 + temp[4*i+0]= z0+z3; 64.111 + temp[4*i+1]= z1+z2; 64.112 + temp[4*i+2]= z1-z2; 64.113 + temp[4*i+3]= z0-z3; 64.114 + } 64.115 + 64.116 + for(i=0; i<4; i++){ 64.117 + const int offset= x_offset[i]; 64.118 + const int z0= temp[4*0+i] + temp[4*2+i]; 64.119 + const int z1= temp[4*0+i] - temp[4*2+i]; 64.120 + const int z2= temp[4*1+i] - temp[4*3+i]; 64.121 + const int z3= temp[4*1+i] + temp[4*3+i]; 64.122 + 64.123 + block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual 64.124 + block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); 64.125 + block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); 64.126 + block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); 64.127 + } 64.128 +} 64.129 +#undef stride 64.130 + 64.131 +void chroma_dc_dequant_idct_c(short *block, int qmul){ 64.132 + const int stride= 16*2; 64.133 + const int xStride= 16; 64.134 + int a,b,c,d,e; 64.135 + 64.136 + a= block[stride*0 + xStride*0]; 64.137 + b= block[stride*0 + xStride*1]; 64.138 + c= block[stride*1 + xStride*0]; 64.139 + d= block[stride*1 + xStride*1]; 64.140 + 64.141 + e= a-b; 64.142 + a= a+b; 64.143 + b= c-d; 64.144 + c= c+d; 64.145 + 64.146 + block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7; 64.147 + block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7; 64.148 + block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7; 64.149 + block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7; 64.150 +} 64.151 + 64.152 +void h264_idct4_add_spu(uint8_t *dst, short *block, int stride) 64.153 +{ 64.154 + vsint16_t __vz0, __vz1, __vz2, __vz3; // used as temporal storage in for VEC_1D_DCT 64.155 + vsint16_t va0, va1, va2, va3; 64.156 + vsint16_t vtmp0, vtmp1, vtmp2, vtmp3; 64.157 + 
vuint16_t sat; 64.158 + vuint8_t va_u8; 64.159 + vsint16_t vdst_ss; 64.160 + vuint8_t dstperm; 64.161 + vuint8_t vdst, vdst_orig, vfdst; 64.162 + const int16_t imax = 255; 64.163 + const vsint32_t vzero = spu_splats(0); 64.164 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 64.165 + const int shift_dst = (unsigned int) dst & 15; 64.166 + const vuint8_t packu16 = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F); 64.167 + const vuint8_t mergehu8 = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17); 64.168 + //for optimized matrix transpose: 64.169 + const vuint8_t tr0 =AVV(0x00,0x01,0x08,0x09,0x10,0x11,0x18,0x19,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); 64.170 + const vuint8_t tr1 =AVV(0x02,0x03,0x0A,0x0B,0x12,0x13,0x1A,0x1B,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); 64.171 + const vuint8_t tr2 =AVV(0x04,0x05,0x0C,0x0D,0x14,0x15,0x1C,0x1D,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); 64.172 + const vuint8_t tr3 =AVV(0x06,0x07,0x0E,0x0F,0x16,0x17,0x1E,0x1F,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00); 64.173 + const vuint8_t conc =AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17); 64.174 + 64.175 + block[0] += 32; // add 32 as a DC-level for rounding 64.176 + 64.177 + //load matrix 64.178 + vtmp0 = *(vsint16_t *)(block); 64.179 + vtmp1 = spu_rlqwbyte(vtmp0,8); 64.180 + vtmp2 = *(vsint16_t *)(block+8); 64.181 + vtmp3 = spu_rlqwbyte(vtmp2,8); 64.182 + 64.183 + VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); 64.184 + 64.185 + //concatenate first two rows of matrix 64.186 + va0=spu_shuffle(va0,va1,conc); 64.187 + //concatenate last two rows of matrix 64.188 + va2=spu_shuffle(va2,va3,conc); 64.189 + 64.190 + //do transpose starting from two vectors, storing as four vectors of which the second part is unused 64.191 + vtmp0 = spu_shuffle( va0, va2, tr0); 64.192 + vtmp1 = spu_shuffle( va0, va2, tr1); 64.193 + vtmp2 = spu_shuffle( va0, va2, tr2); 64.194 + vtmp3 = 
spu_shuffle( va0, va2, tr3); 64.195 + 64.196 + VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); 64.197 + 64.198 + // division by 64 64.199 + va0 = spu_rlmaska(va0,-6); 64.200 + va1 = spu_rlmaska(va1,-6); 64.201 + va2 = spu_rlmaska(va2,-6); 64.202 + va3 = spu_rlmaska(va3,-6); 64.203 + 64.204 + switch (shift_dst){ 64.205 + case 0: { 64.206 + dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 64.207 + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); 64.208 + } break; 64.209 + case 4: { 64.210 + dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 64.211 + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); 64.212 + } break; 64.213 + case 8: { 64.214 + dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 64.215 + 0x10, 0x11, 0x12, 0x13, 0x0C, 0x0D, 0x0E, 0x0F); 64.216 + } break; 64.217 + case 12: { 64.218 + dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 64.219 + 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13); 64.220 + } break; 64.221 + default: { 64.222 + dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 64.223 + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); 64.224 + } break; 64.225 + } 64.226 + 64.227 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm); 64.228 + dst += stride; 64.229 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm); 64.230 + dst += stride; 64.231 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm); 64.232 + dst += stride; 64.233 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm); 64.234 +} 64.235 + 64.236 +void h264_idct8_add_spu(uint8_t *dst, short *block, int stride) 64.237 +{ 64.238 + vsint16_t va0, va1, va2, va3, va4, va5, va6, va7; 64.239 + vsint16_t vza0, vza1, vza2, vza3, vza4, vza5, vza6, vza7, vzal,vzah; 64.240 + vsint16_t vzb0, vzb1, vzb2, vzb3, vzb4, vzb5, vzb6, vzb7; 64.241 + vsint16_t vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, vtmp6, vtmp7; 64.242 + vuint16_t sat; 
64.243 + vuint8_t va_u8; 64.244 + const int block_stride=8; 64.245 + vsint16_t vdst_ss; 64.246 + const int16_t imax = 255; 64.247 + const vsint32_t vzero = spu_splats(0); 64.248 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 64.249 + vuint8_t vdst, vdst_orig, vfdst; 64.250 + vuint8_t dstperm; 64.251 + const int shift_dst = (unsigned int) dst & 15; 64.252 + const vuint8_t packu16 = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F); 64.253 + const vuint8_t mergehu8 = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17); 64.254 + const vuint8_t m1 = AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17); 64.255 + const vuint8_t m2 = AVV(0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F); 64.256 + const vuint8_t m3 = AVV(0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x18,0x19,0x1A,0x1B); 64.257 + const vuint8_t m4 = AVV(0x14,0x15,0x16,0x17,0x04,0x05,0x06,0x07,0x1C,0x1D,0x1E,0x1F,0x0C,0x0D,0x0E,0x0F); 64.258 + const vuint8_t m5 = AVV(0x00,0x01,0x10,0x11,0x04,0x05,0x14,0x15,0x08,0x09,0x18,0x19,0x0C,0x0D,0x1C,0x1D); 64.259 + const vuint8_t m6 = AVV(0x12,0x13,0x02,0x03,0x16,0x17,0x06,0x07,0x1A,0x1B,0x0A,0x0B,0x1E,0x1F,0x0E,0x0F); 64.260 + 64.261 + block[0] += 32; // add 32 as a DC-level for rounding 64.262 + 64.263 + vtmp0 = *(vsint16_t *)(block); 64.264 + vtmp1 = *(vsint16_t *)(block + block_stride); 64.265 + vtmp2 = *(vsint16_t *)(block + 2*block_stride); 64.266 + vtmp3 = *(vsint16_t *)(block + 3*block_stride); 64.267 + vtmp4 = *(vsint16_t *)(block + 4*block_stride); 64.268 + vtmp5 = *(vsint16_t *)(block + 5*block_stride); 64.269 + vtmp6 = *(vsint16_t *)(block + 6*block_stride); 64.270 + vtmp7 = *(vsint16_t *)(block + 7*block_stride); 64.271 + 64.272 + VEC_1D_DCT8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7); 64.273 + VEC_TRANSPOSE_8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7,va0,va1,va2,va3,va4,va5,va6,va7); 64.274 
+ VEC_1D_DCT8(va0, va1, va2, va3, va4, va5, va6, va7); 64.275 + 64.276 + va0 = spu_rlmaska(va0,-6); 64.277 + va1 = spu_rlmaska(va1,-6); 64.278 + va2 = spu_rlmaska(va2,-6); 64.279 + va3 = spu_rlmaska(va3,-6); 64.280 + va4 = spu_rlmaska(va4,-6); 64.281 + va5 = spu_rlmaska(va5,-6); 64.282 + va6 = spu_rlmaska(va6,-6); 64.283 + va7 = spu_rlmaska(va7,-6); 64.284 + 64.285 + if (shift_dst==8) 64.286 + dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 64.287 + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17); 64.288 + else dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 64.289 + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); 64.290 + 64.291 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm); 64.292 + dst += stride; 64.293 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm); 64.294 + dst += stride; 64.295 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm); 64.296 + dst += stride; 64.297 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm); 64.298 + dst += stride; 64.299 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va4,dstperm); 64.300 + dst += stride; 64.301 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va5,dstperm); 64.302 + dst += stride; 64.303 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va6,dstperm); 64.304 + dst += stride; 64.305 + VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va7,dstperm); 64.306 + 64.307 +} 64.308 + 64.309 +/* 64.310 + 64.311 +void h264_idct4_add_spu(uint8_t *dst, short *block, int stride){ 64.312 + int i; 64.313 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 64.314 + 64.315 + block[0] += 32; 64.316 + 64.317 + for(i=0; i<4; i++){ 64.318 + const int z0= block[0 + 4*i] + block[2 + 4*i]; 64.319 + const int z1= block[0 + 4*i] - block[2 + 4*i]; 64.320 + const int z2= (block[1 + 4*i]>>1) - block[3 + 4*i]; 64.321 + const int z3= block[1 + 4*i] + (block[3 + 4*i]>>1); 64.322 + 64.323 + block[0 + 4*i]= z0 + z3; 64.324 
+ block[1 + 4*i]= z1 + z2; 64.325 + block[2 + 4*i]= z1 - z2; 64.326 + block[3 + 4*i]= z0 - z3; 64.327 + } 64.328 + 64.329 + for(i=0; i<4; i++){ 64.330 + const int z0= block[i + 4*0] + block[i + 4*2]; 64.331 + const int z1= block[i + 4*0] - block[i + 4*2]; 64.332 + const int z2= (block[i + 4*1]>>1) - block[i + 4*3]; 64.333 + const int z3= block[i + 4*1] + (block[i + 4*3]>>1); 64.334 + 64.335 + dst[i + 0*stride]= cm[ dst[i + 0*stride] + ((z0 + z3) >> 6) ]; 64.336 + dst[i + 1*stride]= cm[ dst[i + 1*stride] + ((z1 + z2) >> 6) ]; 64.337 + dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ]; 64.338 + dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ]; 64.339 + } 64.340 +} 64.341 + 64.342 +void h264_idct8_add_spu(uint8_t *dst, short *block, int stride){ 64.343 + int i; 64.344 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 64.345 + 64.346 + block[0] += 32; 64.347 + 64.348 + for( i = 0; i < 8; i++ ) 64.349 + { 64.350 + const int a0 = block[0+i*8] + block[4+i*8]; 64.351 + const int a2 = block[0+i*8] - block[4+i*8]; 64.352 + const int a4 = (block[2+i*8]>>1) - block[6+i*8]; 64.353 + const int a6 = (block[6+i*8]>>1) + block[2+i*8]; 64.354 + 64.355 + const int b0 = a0 + a6; 64.356 + const int b2 = a2 + a4; 64.357 + const int b4 = a2 - a4; 64.358 + const int b6 = a0 - a6; 64.359 + 64.360 + const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1); 64.361 + const int a3 = block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1); 64.362 + const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1); 64.363 + const int a7 = block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1); 64.364 + 64.365 + const int b1 = (a7>>2) + a1; 64.366 + const int b3 = a3 + (a5>>2); 64.367 + const int b5 = (a3>>2) - a5; 64.368 + const int b7 = a7 - (a1>>2); 64.369 + 64.370 + block[0+i*8] = b0 + b7; 64.371 + block[7+i*8] = b0 - b7; 64.372 + block[1+i*8] = b2 + b5; 64.373 + block[6+i*8] = b2 - b5; 64.374 + block[2+i*8] = b4 + b3; 64.375 
+ block[5+i*8] = b4 - b3; 64.376 + block[3+i*8] = b6 + b1; 64.377 + block[4+i*8] = b6 - b1; 64.378 + } 64.379 + for( i = 0; i < 8; i++ ) 64.380 + { 64.381 + const int a0 = block[i+0*8] + block[i+4*8]; 64.382 + const int a2 = block[i+0*8] - block[i+4*8]; 64.383 + const int a4 = (block[i+2*8]>>1) - block[i+6*8]; 64.384 + const int a6 = (block[i+6*8]>>1) + block[i+2*8]; 64.385 + 64.386 + const int b0 = a0 + a6; 64.387 + const int b2 = a2 + a4; 64.388 + const int b4 = a2 - a4; 64.389 + const int b6 = a0 - a6; 64.390 + 64.391 + const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1); 64.392 + const int a3 = block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1); 64.393 + const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1); 64.394 + const int a7 = block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1); 64.395 + 64.396 + const int b1 = (a7>>2) + a1; 64.397 + const int b3 = a3 + (a5>>2); 64.398 + const int b5 = (a3>>2) - a5; 64.399 + const int b7 = a7 - (a1>>2); 64.400 + 64.401 + dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ]; 64.402 + dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ]; 64.403 + dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ]; 64.404 + dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ]; 64.405 + dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ]; 64.406 + dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ]; 64.407 + dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ]; 64.408 + dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ]; 64.409 + } 64.410 +}*/ 64.411 +
/* ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.h */
#ifndef H264_IDCT_SPU_H
#define H264_IDCT_SPU_H

/*
 * NOTE(review): this header uses uint8_t and the SPU vector intrinsics but
 * includes nothing itself; it relies on the including file to pull in the
 * type and intrinsic headers first.
 */

void h264_idct4_add_spu(uint8_t *dst, short *block, int stride);
void h264_idct8_add_spu(uint8_t *dst, short *block, int stride);

/***********************************************************************
 * VEC_1D_DCT
 ***********************************************************************
 * 1-dimensional 4x4 H264 integer DCT inverse transform.
 * Actually source and destination are 8x4. The low elements of the
 * source are discarded and the low elements of the destination mustn't
 * be used.
 * vb0-vb3 are the inputs, va0-va3 receive the outputs.
 * The __vz0-__vz3 registers need to be declared in the caller function.
 * Expands to several statements without a trailing ';'.
 ***********************************************************************/
#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)               \
    /* 1st stage */                                               \
    __vz0 = spu_add(vb0,vb2);    /* temp[0] = Y[0] + Y[2] */      \
    __vz1 = spu_sub(vb0,vb2);    /* temp[1] = Y[0] - Y[2] */      \
    __vz2 = spu_rlmaska(vb1,-1);                                  \
    __vz2 = spu_sub(__vz2,vb3);  /* temp[2] = Y[1].1/2 - Y[3] */  \
    __vz3 = spu_rlmaska(vb3,-1);                                  \
    __vz3 = spu_add(vb1,__vz3);  /* temp[3] = Y[1] + Y[3].1/2 */  \
                                                                  \
    /* 2nd stage: output */                                       \
    va0 = spu_add(__vz0,__vz3);  /* x[0] = temp[0] + temp[3] */   \
    va1 = spu_add(__vz1,__vz2);  /* x[1] = temp[1] + temp[2] */   \
    va2 = spu_sub(__vz1,__vz2);  /* x[2] = temp[1] - temp[2] */   \
    va3 = spu_sub(__vz0,__vz3)   /* x[3] = temp[0] - temp[3] */

/***********************************************************************
 * VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8
 ***********************************************************************
 * Loads a vuint8_t vector from the (possibly unaligned) memory position p,
 * converts the vector to vsint16_t,
 * adds the loaded and converted vector to the given vector va,
 * limits the sum to the range 0..vmax, converts it back to vuint8_t,
 * merges it with the originally loaded bytes according to align_dst and
 * stores the result back to p.
 *
 *   p          destination address; expanded twice, so it must not have
 *              side effects
 *   shift      number of bytes the loaded quadword is shifted left
 *   va         vsint16_t residuals; overwritten with the limited sum
 *   align_dst  shuffle pattern that merges the new bytes into the old ones
 *
 * The caller function must declare: vdst_orig, vdst, vfdst, va_u8
 * (vuint8_t), vdst_ss (vsint16_t), sat (vuint16_t), and provide vzero,
 * vmax, mergehu8 and packu16.
 **********************************************************************/

#define VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(p,shift,va,align_dst)          \
    vdst_orig = *(vuint8_t *) (p);                                            \
    vdst      = spu_or(spu_slqwbyte(vdst_orig, shift),(vuint8_t) vzero);      \
    vdst_ss   = (vsint16_t) spu_shuffle((vuint8_t)vzero,vdst,mergehu8);       \
    va        = spu_add(va,vdst_ss);                                          \
    sat       = spu_cmpgt(va,(vsint16_t)vzero);   /* zero out negatives */    \
    va        = spu_and(va,(vsint16_t)sat);                                   \
    sat       = spu_cmpgt(va,vmax);               /* cap values above vmax */ \
    va        = spu_sel(va,vmax,sat);                                         \
    va_u8     = (vuint8_t) spu_shuffle(va,(vsint16_t) vzero,packu16);         \
    vfdst     = spu_shuffle(vdst_orig, va_u8, align_dst);                     \
    *(vuint8_t *) (p) = vfdst

/***********************************************************************
 * VEC_TRANSPOSE_8
 ***********************************************************************
 * Transposes a 8x8 matrix of s16 vectors.
 * a0-a7 are the input rows and are overwritten with intermediate values;
 * b0-b7 receive the transposed rows.
 * The shuffle patterns m1-m6 must be declared in the caller function.
 * Expands to several statements without a trailing ';'.
 **********************************************************************/
#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
    b0 = spu_shuffle( a0, a4, m1 ); \
    b1 = spu_shuffle( a1, a5, m1 ); \
    b2 = spu_shuffle( a2, a6, m1 ); \
    b3 = spu_shuffle( a3, a7, m1 ); \
    b4 = spu_shuffle( a4, a0, m2 ); \
    b5 = spu_shuffle( a5, a1, m2 ); \
    b6 = spu_shuffle( a6, a2, m2 ); \
    b7 = spu_shuffle( a7, a3, m2 ); \
    a0 = spu_shuffle( b0, b2, m3 ); \
    a1 = spu_shuffle( b1, b3, m3 ); \
    a2 = spu_shuffle( b2, b0, m4 ); \
    a3 = spu_shuffle( b3, b1, m4 ); \
    a4 = spu_shuffle( b4, b6, m3 ); \
    a5 = spu_shuffle( b5, b7, m3 ); \
    a6 = spu_shuffle( b6, b4, m4 ); \
    a7 = spu_shuffle( b7, b5, m4 ); \
    b0 = spu_shuffle( a0, a1, m5 ); \
    b1 = spu_shuffle( a1, a0, m6 ); \
    b2 = spu_shuffle( a2, a3, m5 ); \
    b3 = spu_shuffle( a3, a2, m6 ); \
    b4 = spu_shuffle( a4, a5, m5 ); \
    b5 = spu_shuffle( a5, a4, m6 ); \
    b6 = spu_shuffle( a6, a7, m5 ); \
    b7 = spu_shuffle( a7, a6, m6 )

/***********************************************************************
 * VEC_1D_DCT8
 ***********************************************************************
 * 1-dimensional 8x8 H264 integer DCT inverse transform.
 * Works in place: vb0-vb7 are both the inputs and the outputs.
 * The scratch registers vza0-vza7, vzal, vzah and vzb0-vzb7 must be
 * declared in the caller function.
 * Unlike the other macros in this header the expansion already ends
 * with ';' (kept for compatibility with existing call sites).
 ***********************************************************************/
#define VEC_1D_DCT8(vb0,vb1,vb2,vb3,vb4,vb5,vb6,vb7)                                            \
    vza0 = spu_add(vb0,vb4);         /* a[0] = Y[0] + Y[4] */                                   \
    vza2 = spu_sub(vb0,vb4);         /* a[2] = Y[0] - Y[4] */                                   \
    vza4 = spu_rlmaska(vb2,-1);                                                                 \
    vza4 = spu_sub(vza4,vb6);        /* a[4] = (Y[2]>>1) - Y[6] */                              \
    vza6 = spu_rlmaska(vb6,-1 );                                                                \
    vza6 = spu_add(vb2,vza6);        /* a[6] = Y[2] + (Y[6]>>1) */                              \
                                                                                                \
    vzb0 = spu_add(vza0,vza6);       /* b[0] = a[0] + a[6] */                                   \
    vzb2 = spu_add(vza2,vza4);       /* b[2] = a[2] + a[4] */                                   \
    vzb4 = spu_sub(vza2,vza4);       /* b[4] = a[2] - a[4] */                                   \
    vzb6 = spu_sub(vza0,vza6);       /* b[6] = a[0] - a[6] */                                   \
                                                                                                \
    vza1 = spu_rlmaska(vb7,-1);                                                                 \
    vzal = spu_add(vza1,vb7);                                                                   \
    vzah = spu_sub(vb5,vb3);                                                                    \
    vza1 = spu_sub(vzah,vzal);       /* a1 = (-Y[3] + Y[5]) - (Y[7] + (Y[7]>>1)) */             \
                                                                                                \
    vza3 = spu_rlmaska(vb3,-1);                                                                 \
    vzal = spu_add(vza3,vb3);                                                                   \
    vzah = spu_add(vb1,vb7);                                                                    \
    vza3 = spu_sub(vzah,vzal);       /* a3 = (Y[1] + Y[7]) - (Y[3] + (Y[3]>>1)) */              \
                                                                                                \
    vza5 = spu_rlmaska(vb5,-1);                                                                 \
    vzal = spu_add(vza5,vb5);                                                                   \
    vzah = spu_sub(vb7,vb1);                                                                    \
    vza5 = spu_add(vzah,vzal);       /* a5 = (-Y[1] + Y[7]) + (Y[5] + (Y[5]>>1)) */             \
                                                                                                \
    vza7 = spu_rlmaska(vb1,-1);                                                                 \
    vzal = spu_add(vza7,vb1);                                                                   \
    vzah = spu_add(vb3,vb5);                                                                    \
    vza7 = spu_add(vzah,vzal);       /* a7 = (Y[3] + Y[5]) + (Y[1] + (Y[1]>>1)) */              \
                                                                                                \
    vzb1 = spu_rlmaska(vza7,-2);                                                                \
    vzb1 = spu_add(vzb1,vza1);       /* b1 = (a7>>2) + a1 */                                    \
    vzb3 = spu_rlmaska(vza5,-2);                                                                \
    vzb3 = spu_add(vzb3,vza3);       /* b3 = a3 + (a5>>2) */                                    \
    vzb5 = spu_rlmaska(vza3,-2);                                                                \
    vzb5 = spu_sub(vzb5,vza5);       /* b5 = (a3>>2) - a5 */                                    \
    vzb7 = spu_rlmaska(vza1,-2);                                                                \
    vzb7 = spu_sub(vza7,vzb7);       /* b7 = a7 - (a1>>2) */                                    \
                                                                                                \
    vb0 = spu_add(vzb0,vzb7);        /* src[i][0] = b0 + b7 */                                  \
    vb7 = spu_sub(vzb0,vzb7);        /* src[i][7] = b0 - b7 */                                  \
    vb1 = spu_add(vzb2,vzb5);        /* src[i][1] = b2 + b5 */                                  \
    vb6 = spu_sub(vzb2,vzb5);        /* src[i][6] = b2 - b5 */                                  \
    vb2 = spu_add(vzb4,vzb3);        /* src[i][2] = b4 + b3 */                                  \
    vb5 = spu_sub(vzb4,vzb3);        /* src[i][5] = b4 - b3 */                                  \
    vb3 = spu_add(vzb6,vzb1);        /* src[i][3] = b6 + b1 */                                  \
    vb4 = spu_sub(vzb6,vzb1);        /* src[i][4] = b6 - b1 */


#endif /*H264_IDCT_SPU_H*/
66.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 66.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.c Mon Aug 27 12:09:56 2012 +0200 66.3 @@ -0,0 +1,802 @@ 66.4 +#include "types_spu.h" 66.5 +#include "h264_tables.h" 66.6 +#include "h264_intra_spu.h" 66.7 +#include <assert.h> 66.8 + 66.9 +void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){ 66.10 + (void) topright; 66.11 + const uint32_t a= ((uint32_t*)(src-stride))[0]; 66.12 + ((uint32_t*)(src+0*stride))[0]= a; 66.13 + ((uint32_t*)(src+1*stride))[0]= a; 66.14 + ((uint32_t*)(src+2*stride))[0]= a; 66.15 + ((uint32_t*)(src+3*stride))[0]= a; 66.16 +} 66.17 + 66.18 +void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){ 66.19 + (void) topright; 66.20 + ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101; 66.21 + ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101; 66.22 + ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101; 66.23 + ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101; 66.24 +} 66.25 + 66.26 +void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){ 66.27 + (void) topright; 66.28 + const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] 66.29 + + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; 66.30 + ((uint32_t*)(src+0*stride))[0]= 66.31 + ((uint32_t*)(src+1*stride))[0]= 66.32 + ((uint32_t*)(src+2*stride))[0]= 66.33 + ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; 66.34 +} 66.35 + 66.36 +void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){ 66.37 + (void) topright; 66.38 + const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; 66.39 + 66.40 + ((uint32_t*)(src+0*stride))[0]= 66.41 + ((uint32_t*)(src+1*stride))[0]= 66.42 + ((uint32_t*)(src+2*stride))[0]= 66.43 + ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; 66.44 +} 66.45 + 66.46 +void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){ 66.47 + 
(void) topright; 66.48 + const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; 66.49 + 66.50 + ((uint32_t*)(src+0*stride))[0]= 66.51 + ((uint32_t*)(src+1*stride))[0]= 66.52 + ((uint32_t*)(src+2*stride))[0]= 66.53 + ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; 66.54 +} 66.55 + 66.56 +void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){ 66.57 + (void) topright; 66.58 + ((uint32_t*)(src+0*stride))[0]= 66.59 + ((uint32_t*)(src+1*stride))[0]= 66.60 + ((uint32_t*)(src+2*stride))[0]= 66.61 + ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U; 66.62 +} 66.63 + 66.64 + 66.65 +#define LOAD_TOP_RIGHT_EDGE\ 66.66 + const int t4= topright[0];\ 66.67 + const int t5= topright[1];\ 66.68 + const int t6= topright[2];\ 66.69 + const int t7= topright[3];\ 66.70 + 66.71 +#define LOAD_LEFT_EDGE\ 66.72 + const int l0= src[-1+0*stride];\ 66.73 + const int l1= src[-1+1*stride];\ 66.74 + const int l2= src[-1+2*stride];\ 66.75 + const int l3= src[-1+3*stride];\ 66.76 + 66.77 +#define LOAD_TOP_EDGE\ 66.78 + const int t0= src[ 0-1*stride];\ 66.79 + const int t1= src[ 1-1*stride];\ 66.80 + const int t2= src[ 2-1*stride];\ 66.81 + const int t3= src[ 3-1*stride];\ 66.82 + 66.83 +void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){ 66.84 + (void) topright; 66.85 + const int lt= src[-1-1*stride]; 66.86 + LOAD_TOP_EDGE 66.87 + LOAD_LEFT_EDGE 66.88 + 66.89 + src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; 66.90 + src[0+2*stride]= 66.91 + src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; 66.92 + src[0+1*stride]= 66.93 + src[1+2*stride]= 66.94 + src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; 66.95 + src[0+0*stride]= 66.96 + src[1+1*stride]= 66.97 + src[2+2*stride]= 66.98 + src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; 66.99 + src[1+0*stride]= 66.100 + src[2+1*stride]= 66.101 + src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2; 66.102 + src[2+0*stride]= 66.103 + src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; 66.104 + src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; 66.105 +} 66.106 + 
66.107 +void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){ 66.108 + LOAD_TOP_EDGE 66.109 + LOAD_TOP_RIGHT_EDGE 66.110 +// LOAD_LEFT_EDGE 66.111 + 66.112 + src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; 66.113 + src[1+0*stride]= 66.114 + src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2; 66.115 + src[2+0*stride]= 66.116 + src[1+1*stride]= 66.117 + src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2; 66.118 + src[3+0*stride]= 66.119 + src[2+1*stride]= 66.120 + src[1+2*stride]= 66.121 + src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2; 66.122 + src[3+1*stride]= 66.123 + src[2+2*stride]= 66.124 + src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2; 66.125 + src[3+2*stride]= 66.126 + src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2; 66.127 + src[3+3*stride]=(t6 + 3*t7 + 2)>>2; 66.128 +} 66.129 + 66.130 +void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){ 66.131 + (void) topright; 66.132 + const int lt= src[-1-1*stride]; 66.133 + LOAD_TOP_EDGE 66.134 + LOAD_LEFT_EDGE 66.135 + (void) l3; 66.136 + 66.137 + src[0+0*stride]= 66.138 + src[1+2*stride]=(lt + t0 + 1)>>1; 66.139 + src[1+0*stride]= 66.140 + src[2+2*stride]=(t0 + t1 + 1)>>1; 66.141 + src[2+0*stride]= 66.142 + src[3+2*stride]=(t1 + t2 + 1)>>1; 66.143 + src[3+0*stride]=(t2 + t3 + 1)>>1; 66.144 + src[0+1*stride]= 66.145 + src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; 66.146 + src[1+1*stride]= 66.147 + src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2; 66.148 + src[2+1*stride]= 66.149 + src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2; 66.150 + src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2; 66.151 + src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; 66.152 + src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; 66.153 +} 66.154 + 66.155 +void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){ 66.156 + LOAD_TOP_EDGE 66.157 + LOAD_TOP_RIGHT_EDGE 66.158 + (void) t7; 66.159 + 66.160 + src[0+0*stride]=(t0 + t1 + 1)>>1; 66.161 + src[1+0*stride]= 66.162 + src[0+2*stride]=(t1 + t2 + 1)>>1; 66.163 + src[2+0*stride]= 66.164 + src[1+2*stride]=(t2 + t3 + 1)>>1; 66.165 + 
src[3+0*stride]= 66.166 + src[2+2*stride]=(t3 + t4+ 1)>>1; 66.167 + src[3+2*stride]=(t4 + t5+ 1)>>1; 66.168 + src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; 66.169 + src[1+1*stride]= 66.170 + src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; 66.171 + src[2+1*stride]= 66.172 + src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; 66.173 + src[3+1*stride]= 66.174 + src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; 66.175 + src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; 66.176 +} 66.177 + 66.178 +void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){ 66.179 + (void) topright; 66.180 + LOAD_LEFT_EDGE 66.181 + 66.182 + src[0+0*stride]=(l0 + l1 + 1)>>1; 66.183 + src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2; 66.184 + src[2+0*stride]= 66.185 + src[0+1*stride]=(l1 + l2 + 1)>>1; 66.186 + src[3+0*stride]= 66.187 + src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2; 66.188 + src[2+1*stride]= 66.189 + src[0+2*stride]=(l2 + l3 + 1)>>1; 66.190 + src[3+1*stride]= 66.191 + src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; 66.192 + src[3+2*stride]= 66.193 + src[1+3*stride]= 66.194 + src[0+3*stride]= 66.195 + src[2+2*stride]= 66.196 + src[2+3*stride]= 66.197 + src[3+3*stride]=l3; 66.198 +} 66.199 + 66.200 +void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){ 66.201 + (void) topright; 66.202 + const int lt= src[-1-1*stride]; 66.203 + LOAD_TOP_EDGE 66.204 + LOAD_LEFT_EDGE 66.205 + (void) t3; 66.206 + 66.207 + src[0+0*stride]= 66.208 + src[2+1*stride]=(lt + l0 + 1)>>1; 66.209 + src[1+0*stride]= 66.210 + src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2; 66.211 + src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2; 66.212 + src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2; 66.213 + src[0+1*stride]= 66.214 + src[2+2*stride]=(l0 + l1 + 1)>>1; 66.215 + src[1+1*stride]= 66.216 + src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2; 66.217 + src[0+2*stride]= 66.218 + src[2+3*stride]=(l1 + l2+ 1)>>1; 66.219 + src[1+2*stride]= 66.220 + src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; 66.221 + src[0+3*stride]=(l2 + l3 + 1)>>1; 66.222 + src[1+3*stride]=(l1 + 2*l2 + l3 + 
2)>>2; 66.223 +} 66.224 + 66.225 +void ff_pred16x16_vertical_c(uint8_t *src, int stride){ 66.226 + int i; 66.227 + const vuint32_t v= *((vuint32_t*)(src-stride)); 66.228 + for(i=0; i<4; i++){ 66.229 + *((vuint32_t*) src ) =v; 66.230 + *((vuint32_t*)(src + stride)) =v; 66.231 + *((vuint32_t*)(src + 2*stride)) =v; 66.232 + *((vuint32_t*)(src + 3*stride)) =v; 66.233 + src+= 4*stride; 66.234 + } 66.235 + 66.236 + /*const uint32_t a= ((uint32_t*)(src-stride))[0]; 66.237 + const uint32_t b= ((uint32_t*)(src-stride))[1]; 66.238 + const uint32_t c= ((uint32_t*)(src-stride))[2]; 66.239 + const uint32_t d= ((uint32_t*)(src-stride))[3]; 66.240 + 66.241 + for(i=0; i<16; i++){ 66.242 + ((uint32_t*)(src+i*stride))[0]= a; 66.243 + ((uint32_t*)(src+i*stride))[1]= b; 66.244 + ((uint32_t*)(src+i*stride))[2]= c; 66.245 + ((uint32_t*)(src+i*stride))[3]= d; 66.246 + }*/ 66.247 +} 66.248 + 66.249 +void ff_pred16x16_horizontal_c(uint8_t *src, int stride){ 66.250 + int i; 66.251 + 66.252 + for(i=0; i<16; i++){ 66.253 + ((uint32_t*)(src+i*stride))[0]= 66.254 + ((uint32_t*)(src+i*stride))[1]= 66.255 + ((uint32_t*)(src+i*stride))[2]= 66.256 + ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101; 66.257 + } 66.258 +} 66.259 + 66.260 +void ff_pred16x16_dc_c(uint8_t *src, int stride){ 66.261 + int i; 66.262 + int dc=0; 66.263 + for(i=0;i<16; i++){ 66.264 + dc+= src[-1+i*stride]; 66.265 + } 66.266 + 66.267 + for(i=0;i<16; i++){ 66.268 + dc+= src[i-stride]; 66.269 + } 66.270 + dc= 0x01010101*((dc + 16)>>5); 66.271 + 66.272 + for(i=0; i<16; i++){ 66.273 + ((uint32_t*)(src+i*stride))[0]= 66.274 + ((uint32_t*)(src+i*stride))[1]= 66.275 + ((uint32_t*)(src+i*stride))[2]= 66.276 + ((uint32_t*)(src+i*stride))[3]= dc; 66.277 + } 66.278 +} 66.279 + 66.280 +void ff_pred16x16_left_dc_c(uint8_t *src, int stride){ 66.281 + int i; 66.282 + 66.283 + int dc=0; 66.284 + for(i=0;i<16; i++){ 66.285 + dc+= src[-1+i*stride]; 66.286 + } 66.287 + dc= 0x01010101*((dc + 8)>>4); 66.288 + 66.289 + for(i=0; i<16; 
i++){ 66.290 + ((uint32_t*)(src+i*stride))[0]= 66.291 + ((uint32_t*)(src+i*stride))[1]= 66.292 + ((uint32_t*)(src+i*stride))[2]= 66.293 + ((uint32_t*)(src+i*stride))[3]= dc; 66.294 + } 66.295 +} 66.296 + 66.297 +void ff_pred16x16_top_dc_c(uint8_t *src, int stride){ 66.298 + int i; 66.299 + int dc0=0; 66.300 + for(i=0;i<16; i++){ 66.301 + dc0+= src[i-stride]; 66.302 + } 66.303 + 66.304 + dc0= 0x01010101*((dc0 + 8)>>4); 66.305 + 66.306 + for(i=0; i<16; i++){ 66.307 + ((uint32_t*)(src+i*stride))[0]= 66.308 + ((uint32_t*)(src+i*stride))[1]= 66.309 + ((uint32_t*)(src+i*stride))[2]= 66.310 + ((uint32_t*)(src+i*stride))[3]= dc0; 66.311 + } 66.312 +} 66.313 + 66.314 +void ff_pred16x16_128_dc_c(uint8_t *src, int stride){ 66.315 + int i; 66.316 + 66.317 + /*const vuint32_t v= AVV(0x01010101U*128U, 0x01010101U*128U,0x01010101U*128U,0x01010101U*128U); 66.318 + for(i=0; i<4; i++){ 66.319 + *((vuint32_t*) src ) =v; 66.320 + *((vuint32_t*)(src + stride)) =v; 66.321 + *((vuint32_t*)(src + 2*stride)) =v; 66.322 + *((vuint32_t*)(src + 3*stride)) =v; 66.323 + src+= 4*stride; 66.324 + }*/ 66.325 + 66.326 + for(i=0; i<16; i++){ 66.327 + ((uint32_t*)(src+i*stride))[0]= 66.328 + ((uint32_t*)(src+i*stride))[1]= 66.329 + ((uint32_t*)(src+i*stride))[2]= 66.330 + ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U; 66.331 + } 66.332 +} 66.333 + 66.334 +void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){ 66.335 + int i, j, k; 66.336 + int a; 66.337 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 66.338 + const uint8_t * const src0 = src+7-stride; 66.339 + const uint8_t *src1 = src+8*stride-1; 66.340 + const uint8_t *src2 = src1-2*stride; // == src+6*stride-1; 66.341 + int H = src0[1] - src0[-1]; 66.342 + int V = src1[0] - src2[ 0]; 66.343 + for(k=2; k<=8; ++k) { 66.344 + src1 += stride; src2 -= stride; 66.345 + H += k*(src0[k] - src0[-k]); 66.346 + V += k*(src1[0] - src2[ 0]); 66.347 + } 66.348 + if(svq3){ 66.349 + H = ( 5*(H/4) ) / 16; 66.350 + V = ( 5*(V/4) ) / 16; 66.351 + 
66.352 + /* required for 100% accuracy */ 66.353 + i = H; H = V; V = i; 66.354 + }else{ 66.355 + H = ( 5*H+32 ) >> 6; 66.356 + V = ( 5*V+32 ) >> 6; 66.357 + } 66.358 + 66.359 + a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); 66.360 + for(j=16; j>0; --j) { 66.361 + int b = a; 66.362 + a += V; 66.363 + for(i=-16; i<0; i+=4) { 66.364 + src[16+i] = cm[ (b ) >> 5 ]; 66.365 + src[17+i] = cm[ (b+ H) >> 5 ]; 66.366 + src[18+i] = cm[ (b+2*H) >> 5 ]; 66.367 + src[19+i] = cm[ (b+3*H) >> 5 ]; 66.368 + b += 4*H; 66.369 + } 66.370 + src += stride; 66.371 + } 66.372 +} 66.373 + 66.374 +void ff_pred16x16_plane_c(uint8_t *src, int stride){ 66.375 + pred16x16_plane_compat_c(src, stride, 0); 66.376 +} 66.377 + 66.378 +void ff_pred8x8_vertical_c(uint8_t *src, int stride){ 66.379 + int i; 66.380 + const uint32_t a= ((uint32_t*)(src-stride))[0]; 66.381 + const uint32_t b= ((uint32_t*)(src-stride))[1]; 66.382 + 66.383 + for(i=0; i<8; i++){ 66.384 + ((uint32_t*)(src+i*stride))[0]= a; 66.385 + ((uint32_t*)(src+i*stride))[1]= b; 66.386 + } 66.387 +} 66.388 + 66.389 +void ff_pred8x8_horizontal_c(uint8_t *src, int stride){ 66.390 + int i; 66.391 + 66.392 + for(i=0; i<8; i++){ 66.393 + ((uint32_t*)(src+i*stride))[0]= 66.394 + ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101; 66.395 + } 66.396 +} 66.397 + 66.398 +void ff_pred8x8_128_dc_c(uint8_t *src, int stride){ 66.399 + int i; 66.400 + 66.401 + for(i=0; i<8; i++){ 66.402 + ((uint32_t*)(src+i*stride))[0]= 66.403 + ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U; 66.404 + } 66.405 +} 66.406 + 66.407 +void ff_pred8x8_left_dc_c(uint8_t *src, int stride){ 66.408 + int i; 66.409 + int dc0, dc2; 66.410 + 66.411 + dc0=dc2=0; 66.412 + for(i=0;i<4; i++){ 66.413 + dc0+= src[-1+i*stride]; 66.414 + dc2+= src[-1+(i+4)*stride]; 66.415 + } 66.416 + dc0= 0x01010101*((dc0 + 2)>>2); 66.417 + dc2= 0x01010101*((dc2 + 2)>>2); 66.418 + 66.419 + for(i=0; i<4; i++){ 66.420 + ((uint32_t*)(src+i*stride))[0]= 66.421 + ((uint32_t*)(src+i*stride))[1]= dc0; 66.422 
+ } 66.423 + for(i=4; i<8; i++){ 66.424 + ((uint32_t*)(src+i*stride))[0]= 66.425 + ((uint32_t*)(src+i*stride))[1]= dc2; 66.426 + } 66.427 +} 66.428 + 66.429 +void ff_pred8x8_top_dc_c(uint8_t *src, int stride){ 66.430 + int i; 66.431 + int dc0, dc1; 66.432 + 66.433 + dc0=dc1=0; 66.434 + for(i=0;i<4; i++){ 66.435 + dc0+= src[i-stride]; 66.436 + dc1+= src[4+i-stride]; 66.437 + } 66.438 + dc0= 0x01010101*((dc0 + 2)>>2); 66.439 + dc1= 0x01010101*((dc1 + 2)>>2); 66.440 + 66.441 + for(i=0; i<4; i++){ 66.442 + ((uint32_t*)(src+i*stride))[0]= dc0; 66.443 + ((uint32_t*)(src+i*stride))[1]= dc1; 66.444 + } 66.445 + for(i=4; i<8; i++){ 66.446 + ((uint32_t*)(src+i*stride))[0]= dc0; 66.447 + ((uint32_t*)(src+i*stride))[1]= dc1; 66.448 + } 66.449 +} 66.450 + 66.451 + 66.452 +void ff_pred8x8_dc_c(uint8_t *src, int stride){ 66.453 + int i; 66.454 + int dc0, dc1, dc2, dc3; 66.455 + 66.456 + dc0=dc1=dc2=0; 66.457 + for(i=0;i<4; i++){ 66.458 + dc0+= src[-1+i*stride] + src[i-stride]; 66.459 + dc1+= src[4+i-stride]; 66.460 + dc2+= src[-1+(i+4)*stride]; 66.461 + } 66.462 + dc3= 0x01010101*((dc1 + dc2 + 4)>>3); 66.463 + dc0= 0x01010101*((dc0 + 4)>>3); 66.464 + dc1= 0x01010101*((dc1 + 2)>>2); 66.465 + dc2= 0x01010101*((dc2 + 2)>>2); 66.466 + 66.467 + for(i=0; i<4; i++){ 66.468 + ((uint32_t*)(src+i*stride))[0]= dc0; 66.469 + ((uint32_t*)(src+i*stride))[1]= dc1; 66.470 + } 66.471 + for(i=4; i<8; i++){ 66.472 + ((uint32_t*)(src+i*stride))[0]= dc2; 66.473 + ((uint32_t*)(src+i*stride))[1]= dc3; 66.474 + } 66.475 +} 66.476 + 66.477 +void ff_pred8x8_plane_c(uint8_t *src, int stride){ 66.478 + int j, k; 66.479 + int a; 66.480 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 66.481 + const uint8_t * const src0 = src+3-stride; 66.482 + const uint8_t *src1 = src+4*stride-1; 66.483 + const uint8_t *src2 = src1-2*stride; // == src+2*stride-1; 66.484 + int H = src0[1] - src0[-1]; 66.485 + int V = src1[0] - src2[ 0]; 66.486 + for(k=2; k<=4; ++k) { 66.487 + src1 += stride; src2 -= stride; 66.488 + H += 
k*(src0[k] - src0[-k]); 66.489 + V += k*(src1[0] - src2[ 0]); 66.490 + } 66.491 + H = ( 17*H+16 ) >> 5; 66.492 + V = ( 17*V+16 ) >> 5; 66.493 + 66.494 + a = 16*(src1[0] + src2[8]+1) - 3*(V+H); 66.495 + for(j=8; j>0; --j) { 66.496 + int b = a; 66.497 + a += V; 66.498 + src[0] = cm[ (b ) >> 5 ]; 66.499 + src[1] = cm[ (b+ H) >> 5 ]; 66.500 + src[2] = cm[ (b+2*H) >> 5 ]; 66.501 + src[3] = cm[ (b+3*H) >> 5 ]; 66.502 + src[4] = cm[ (b+4*H) >> 5 ]; 66.503 + src[5] = cm[ (b+5*H) >> 5 ]; 66.504 + src[6] = cm[ (b+6*H) >> 5 ]; 66.505 + src[7] = cm[ (b+7*H) >> 5 ]; 66.506 + src += stride; 66.507 + } 66.508 +} 66.509 + 66.510 + 66.511 +#define SRC(x,y) src[(x)+(y)*stride] 66.512 +#define PL(y) \ 66.513 + const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; 66.514 +#define PREDICT_8x8_LOAD_LEFT \ 66.515 + const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \ 66.516 + + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ 66.517 + PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ 66.518 + const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 66.519 + 66.520 +#define PT(x) \ 66.521 + const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; 66.522 +#define PREDICT_8x8_LOAD_TOP \ 66.523 + const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \ 66.524 + + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ 66.525 + PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ 66.526 + const int t7 = ((has_topright ? 
SRC(8,-1) : SRC(7,-1)) \ 66.527 + + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 66.528 + 66.529 +#define PTR(x) \ 66.530 + t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; 66.531 +#define PREDICT_8x8_LOAD_TOPRIGHT \ 66.532 + int t8, t9, t10, t11, t12, t13, t14, t15; \ 66.533 + if(has_topright) { \ 66.534 + PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ 66.535 + t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ 66.536 + } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); 66.537 + 66.538 +#define PREDICT_8x8_LOAD_TOPLEFT \ 66.539 + const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 66.540 + 66.541 +#define PREDICT_8x8_DC(v) \ 66.542 + int y; \ 66.543 + for( y = 0; y < 8; y++ ) { \ 66.544 + ((uint32_t*)src)[0] = \ 66.545 + ((uint32_t*)src)[1] = v; \ 66.546 + src += stride; \ 66.547 + } 66.548 + 66.549 +static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.550 +{ 66.551 + (void) has_topright; 66.552 + (void) has_topleft; 66.553 + PREDICT_8x8_DC(0x80808080); 66.554 +} 66.555 +static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.556 +{ 66.557 + (void) has_topright; 66.558 + PREDICT_8x8_LOAD_LEFT; 66.559 + const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; 66.560 + PREDICT_8x8_DC(dc); 66.561 +} 66.562 +static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.563 +{ 66.564 + PREDICT_8x8_LOAD_TOP; 66.565 + const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; 66.566 + PREDICT_8x8_DC(dc); 66.567 +} 66.568 +static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.569 +{ 66.570 + PREDICT_8x8_LOAD_LEFT; 66.571 + PREDICT_8x8_LOAD_TOP; 66.572 + const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7 66.573 + +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; 66.574 + PREDICT_8x8_DC(dc); 66.575 +} 66.576 +static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride) 
66.577 +{ 66.578 + (void) has_topright; 66.579 + PREDICT_8x8_LOAD_LEFT; 66.580 +#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\ 66.581 + ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y 66.582 + ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); 66.583 +#undef ROW 66.584 +} 66.585 +static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.586 +{ 66.587 + int y; 66.588 + PREDICT_8x8_LOAD_TOP; 66.589 + src[0] = t0; 66.590 + src[1] = t1; 66.591 + src[2] = t2; 66.592 + src[3] = t3; 66.593 + src[4] = t4; 66.594 + src[5] = t5; 66.595 + src[6] = t6; 66.596 + src[7] = t7; 66.597 + for( y = 1; y < 8; y++ ) 66.598 + *(uint64_t*)(src+y*stride) = *(uint64_t*)src; 66.599 +} 66.600 +static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.601 +{ 66.602 + PREDICT_8x8_LOAD_TOP; 66.603 + PREDICT_8x8_LOAD_TOPRIGHT; 66.604 + SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; 66.605 + SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; 66.606 + SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; 66.607 + SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; 66.608 + SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; 66.609 + SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; 66.610 + SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; 66.611 + SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; 66.612 + SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; 66.613 + SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; 66.614 + SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; 66.615 + SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; 66.616 + SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; 66.617 + SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 
2; 66.618 + SRC(7,7)= (t14 + 3*t15 + 2) >> 2; 66.619 +} 66.620 +static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.621 +{ 66.622 + PREDICT_8x8_LOAD_TOP; 66.623 + PREDICT_8x8_LOAD_LEFT; 66.624 + PREDICT_8x8_LOAD_TOPLEFT; 66.625 + SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; 66.626 + SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; 66.627 + SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; 66.628 + SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; 66.629 + SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2; 66.630 + SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; 66.631 + SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; 66.632 + SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; 66.633 + SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; 66.634 + SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; 66.635 + SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; 66.636 + SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; 66.637 + SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; 66.638 + SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; 66.639 + SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; 66.640 + 66.641 +} 66.642 +static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.643 +{ 66.644 + PREDICT_8x8_LOAD_TOP; 66.645 + PREDICT_8x8_LOAD_LEFT; 66.646 + PREDICT_8x8_LOAD_TOPLEFT; 66.647 + (void) l7; 66.648 + SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; 66.649 + SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; 66.650 + SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; 66.651 + SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; 66.652 + SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; 66.653 + SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; 66.654 + 
SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; 66.655 + SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; 66.656 + SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; 66.657 + SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; 66.658 + SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; 66.659 + SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; 66.660 + SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; 66.661 + SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; 66.662 + SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; 66.663 + SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; 66.664 + SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; 66.665 + SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; 66.666 + SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2; 66.667 + SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; 66.668 + SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; 66.669 + SRC(7,0)= (t6 + t7 + 1) >> 1; 66.670 +} 66.671 +static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.672 +{ 66.673 + PREDICT_8x8_LOAD_TOP; 66.674 + PREDICT_8x8_LOAD_LEFT; 66.675 + PREDICT_8x8_LOAD_TOPLEFT; 66.676 + (void) t7; 66.677 + SRC(0,7)= (l6 + l7 + 1) >> 1; 66.678 + SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; 66.679 + SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; 66.680 + SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; 66.681 + SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; 66.682 + SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; 66.683 + SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; 66.684 + SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; 66.685 + SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; 66.686 + SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; 66.687 + SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; 66.688 + SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; 66.689 + 
SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; 66.690 + SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; 66.691 + SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; 66.692 + SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; 66.693 + SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; 66.694 + SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; 66.695 + SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; 66.696 + SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; 66.697 + SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; 66.698 + SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; 66.699 +} 66.700 +static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.701 +{ 66.702 + PREDICT_8x8_LOAD_TOP; 66.703 + PREDICT_8x8_LOAD_TOPRIGHT; 66.704 + SRC(0,0)= (t0 + t1 + 1) >> 1; 66.705 + SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; 66.706 + SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; 66.707 + SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; 66.708 + SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; 66.709 + SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; 66.710 + SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; 66.711 + SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; 66.712 + SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; 66.713 + SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; 66.714 + SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; 66.715 + SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; 66.716 + SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; 66.717 + SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; 66.718 + SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; 66.719 + SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; 66.720 + SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; 66.721 + SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; 66.722 + SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; 66.723 + SRC(6,7)=SRC(7,5)= (t9 
+ 2*t10 + t11 + 2) >> 2; 66.724 + SRC(7,6)= (t10 + t11 + 1) >> 1; 66.725 + SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; 66.726 +} 66.727 +static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride) 66.728 +{ 66.729 + (void) has_topright; 66.730 + PREDICT_8x8_LOAD_LEFT; 66.731 + SRC(0,0)= (l0 + l1 + 1) >> 1; 66.732 + SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; 66.733 + SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; 66.734 + SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; 66.735 + SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; 66.736 + SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; 66.737 + SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; 66.738 + SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; 66.739 + SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; 66.740 + SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; 66.741 + SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1; 66.742 + SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; 66.743 + SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; 66.744 + SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; 66.745 + SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= 66.746 + SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= 66.747 + SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= 66.748 + SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; 66.749 +} 66.750 +#undef PREDICT_8x8_LOAD_LEFT 66.751 +#undef PREDICT_8x8_LOAD_TOP 66.752 +#undef PREDICT_8x8_LOAD_TOPLEFT 66.753 +#undef PREDICT_8x8_LOAD_TOPRIGHT 66.754 +#undef PREDICT_8x8_DC 66.755 +#undef PTR 66.756 +#undef PT 66.757 +#undef PL 66.758 +#undef SRC 66.759 + 66.760 +void init_pred_ptrs(H264PredContext_spu *i){ 66.761 + 66.762 + i->pred4x4[VERT_PRED ]= pred4x4_vertical_c; 66.763 + i->pred4x4[HOR_PRED ]= pred4x4_horizontal_c; 66.764 + i->pred4x4[DC_PRED ]= pred4x4_dc_c; 66.765 + i->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c; 66.766 + i->pred4x4[DIAG_DOWN_RIGHT_PRED]= 
pred4x4_down_right_c; 66.767 + i->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c; 66.768 + i->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c; 66.769 + i->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_c; 66.770 + i->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_c; 66.771 + i->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c; 66.772 + i->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c; 66.773 + i->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c; 66.774 + 66.775 + i->pred8x8l[VERT_PRED ]= pred8x8l_vertical_c; 66.776 + i->pred8x8l[HOR_PRED ]= pred8x8l_horizontal_c; 66.777 + i->pred8x8l[DC_PRED ]= pred8x8l_dc_c; 66.778 + i->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c; 66.779 + i->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c; 66.780 + i->pred8x8l[VERT_RIGHT_PRED ]= pred8x8l_vertical_right_c; 66.781 + i->pred8x8l[HOR_DOWN_PRED ]= pred8x8l_horizontal_down_c; 66.782 + i->pred8x8l[VERT_LEFT_PRED ]= pred8x8l_vertical_left_c; 66.783 + i->pred8x8l[HOR_UP_PRED ]= pred8x8l_horizontal_up_c; 66.784 + i->pred8x8l[LEFT_DC_PRED ]= pred8x8l_left_dc_c; 66.785 + i->pred8x8l[TOP_DC_PRED ]= pred8x8l_top_dc_c; 66.786 + i->pred8x8l[DC_128_PRED ]= pred8x8l_128_dc_c; 66.787 + 66.788 + 66.789 + i->pred8x8[VERT_PRED8x8 ]= ff_pred8x8_vertical_c; 66.790 + i->pred8x8[HOR_PRED8x8 ]= ff_pred8x8_horizontal_c; 66.791 + i->pred8x8[PLANE_PRED8x8 ]= ff_pred8x8_plane_c; 66.792 + i->pred8x8[DC_PRED8x8 ]= ff_pred8x8_dc_c; 66.793 + i->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c; 66.794 + i->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c; 66.795 + i->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c; 66.796 + 66.797 + i->pred16x16[DC_PRED8x8 ]= ff_pred16x16_dc_c; 66.798 + i->pred16x16[VERT_PRED8x8 ]= ff_pred16x16_vertical_c; 66.799 + i->pred16x16[HOR_PRED8x8 ]= ff_pred16x16_horizontal_c; 66.800 + i->pred16x16[PLANE_PRED8x8 ]= ff_pred16x16_plane_c; 66.801 + i->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c; 66.802 + i->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c; 66.803 + 
i->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c; 66.804 + 66.805 +}
67.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 67.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.h Mon Aug 27 12:09:56 2012 +0200 67.3 @@ -0,0 +1,48 @@ 67.4 +#ifndef H264_INTRA_SPU_H 67.5 +#define H264_INTRA_SPU_H 67.6 + 67.7 +#define MAX_NEG_CROP 1024 67.8 + 67.9 +// For Intra mode 67.10 +#define MB_TYPE_INTRA4x4 0x0001 67.11 +#define IS_INTRA(a) ((a)&7) 67.12 +#define IS_INTRA4x4(a) ((a)&MB_TYPE_INTRA4x4) 67.13 + 67.14 +#define CODEC_FLAG_GRAY 0x2000 67.15 + 67.16 +#define VERT_PRED 0 67.17 +#define HOR_PRED 1 67.18 +#define DC_PRED 2 67.19 +#define DIAG_DOWN_LEFT_PRED 3 67.20 +#define DIAG_DOWN_RIGHT_PRED 4 67.21 +#define VERT_RIGHT_PRED 5 67.22 +#define HOR_DOWN_PRED 6 67.23 +#define VERT_LEFT_PRED 7 67.24 +#define HOR_UP_PRED 8 67.25 + 67.26 +#define LEFT_DC_PRED 9 67.27 +#define TOP_DC_PRED 10 67.28 +#define DC_128_PRED 11 67.29 + 67.30 + 67.31 +#define DC_PRED8x8 0 67.32 +#define HOR_PRED8x8 1 67.33 +#define VERT_PRED8x8 2 67.34 +#define PLANE_PRED8x8 3 67.35 + 67.36 +#define LEFT_DC_PRED8x8 4 67.37 +#define TOP_DC_PRED8x8 5 67.38 +#define DC_128_PRED8x8 6 67.39 + 67.40 +typedef struct H264PredContext_spu{ 67.41 + 67.42 + intra_pred4x4 pred4x4[9+3]; 67.43 + intra_pred16x16 pred16x16[4+3]; 67.44 + intra_pred8x8 pred8x8[4+3]; 67.45 + intra_pred8x8l pred8x8l[9+3]; 67.46 + 67.47 +}H264PredContext_spu; 67.48 + 67.49 +void init_pred_ptrs(H264PredContext_spu *i); 67.50 + 67.51 +#endif
68.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 68.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_luma_template_spu.c Mon Aug 27 12:09:56 2012 +0200 68.3 @@ -0,0 +1,1560 @@ 68.4 +static void PREFIX_h264_qpel16_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { 68.5 + 68.6 + register int i; 68.7 + 68.8 + const int16_t i20ss= 20; 68.9 + const int16_t i5ss= 5; 68.10 + const int16_t i16ss= 16; 68.11 + const int16_t imax = 255; 68.12 + 68.13 + const vsint32_t vzero = spu_splats(0); 68.14 + const vsint16_t v20ss = spu_splats(i20ss); 68.15 + const vsint16_t v5ss = spu_splats(i5ss); 68.16 + const vsint16_t v16ss = spu_splats(i16ss); 68.17 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 68.18 + vuint16_t sat; 68.19 + 68.20 + const int shift_src =(unsigned int) src & 15; 68.21 + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; 68.22 + const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; 68.23 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 68.24 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 68.25 + 68.26 + uint8_t *srcbis = src - (STRIDE_Y * 2); 68.27 + 68.28 + const vuint8_t srcM2a = *(vuint8_t *)(srcbis); 68.29 + const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16); 68.30 + const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); 68.31 + 68.32 + srcbis += STRIDE_Y; 68.33 + const vuint8_t srcM1a = *(vuint8_t *)(srcbis); 68.34 + const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16); 68.35 + const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16)); 68.36 + 68.37 + srcbis += STRIDE_Y; 68.38 + const vuint8_t srcP0a = *(vuint8_t *)(srcbis); 68.39 + const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16); 68.40 + const vuint8_t srcP0= 
spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16)); 68.41 + 68.42 + srcbis += STRIDE_Y; 68.43 + const vuint8_t srcP1a = *(vuint8_t *)(srcbis); 68.44 + const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16); 68.45 + const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16)); 68.46 + 68.47 + srcbis += STRIDE_Y; 68.48 + const vuint8_t srcP2a = *(vuint8_t *)(srcbis); 68.49 + const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16); 68.50 + const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16)); 68.51 + 68.52 + srcbis += STRIDE_Y; 68.53 + 68.54 + vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); 68.55 + vsint16_t srcM2ssB = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel); 68.56 + vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); 68.57 + vsint16_t srcM1ssB = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel); 68.58 + vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); 68.59 + vsint16_t srcP0ssB = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel); 68.60 + vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); 68.61 + vsint16_t srcP1ssB = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel); 68.62 + vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); 68.63 + vsint16_t srcP2ssB = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel); 68.64 + 68.65 + for (i = 0 ; i < h ; i++) { 68.66 + const vuint8_t srcP3a = *(vuint8_t *)(srcbis); 68.67 + const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16); 68.68 + const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16)); 68.69 + 68.70 + const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); 68.71 + const vsint16_t srcP3ssB = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel); 68.72 + srcbis += STRIDE_Y; 68.73 + 68.74 + const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA); 68.75 + const vsint16_t sum1B = spu_add(srcP0ssB, srcP1ssB); 68.76 + 
const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA); 68.77 + const vsint16_t sum2B = spu_add(srcM1ssB, srcP2ssB); 68.78 + const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA); 68.79 + const vsint16_t sum3B = spu_add(srcM2ssB, srcP3ssB); 68.80 + 68.81 + srcM2ssA = srcM1ssA; 68.82 + srcM2ssB = srcM1ssB; 68.83 + srcM1ssA = srcP0ssA; 68.84 + srcM1ssB = srcP0ssB; 68.85 + srcP0ssA = srcP1ssA; 68.86 + srcP0ssB = srcP1ssB; 68.87 + srcP1ssA = srcP2ssA; 68.88 + srcP1ssB = srcP2ssB; 68.89 + srcP2ssA = srcP3ssA; 68.90 + srcP2ssB = srcP3ssB; 68.91 + 68.92 + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); 68.93 + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); 68.94 + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); 68.95 + const vsint16_t pp1A = spu_add(pp1A3, v16ss); 68.96 + 68.97 + const vsint32_t pp1B1 = spu_mule(sum1B, v20ss); 68.98 + const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss); 68.99 + const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez); 68.100 + const vsint16_t pp1B = spu_add(pp1B3, v16ss); 68.101 + 68.102 + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); 68.103 + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); 68.104 + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); 68.105 + 68.106 + const vsint32_t pp2B1 = spu_mule(sum2B, v5ss); 68.107 + const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss); 68.108 + const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez); 68.109 + 68.110 + const vsint16_t pp3A = spu_add(sum3A, pp1A); 68.111 + const vsint16_t pp3B = spu_add(sum3B, pp1B); 68.112 + 68.113 + const vsint16_t psumA = spu_sub(pp3A, pp2A); 68.114 + const vsint16_t psumB = spu_sub(pp3B, pp2B); 68.115 + 68.116 + vsint16_t sumA = spu_rlmask(psumA, -5); 68.117 + vsint16_t sumB = spu_rlmask(psumB, -5); 68.118 + 68.119 + //Saturation to 0 and 255 68.120 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); 68.121 + sumA = spu_and(sumA,(vsint16_t)sat); 68.122 
+ sat = spu_cmpgt(sumA,vmax); 68.123 + sumA = spu_sel(sumA,vmax,sat); 68.124 + sat = spu_cmpgt(sumB,(vsint16_t)vzero); 68.125 + sumB = spu_and(sumB,(vsint16_t)sat); 68.126 + sat = spu_cmpgt(sumB,vmax); 68.127 + sumB = spu_sel(sumB,vmax,sat); 68.128 + 68.129 + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu); 68.130 + 68.131 + /* 16x16 dest luma blocks are alway aligned */ 68.132 + const vuint8_t vdst = *(vuint8_t *)dst; 68.133 + 68.134 + vuint8_t fsum; 68.135 + OP_U8_SPU(fsum, sum, vdst); 68.136 + 68.137 + *(vuint8_t *)dst=fsum; 68.138 + 68.139 + dst += dstStride; /* stride is multiple of 16 ,so dstperm and dstmask can remain out of the loop */ 68.140 + } 68.141 +} 68.142 + 68.143 +static void PREFIX_h264_qpel16_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { 68.144 + 68.145 + register int i; 68.146 + 68.147 + const int16_t i20ss = 20; 68.148 + const int16_t i5ss = 5; 68.149 + const int16_t i16ss = 16; 68.150 + const int16_t imax = 255; 68.151 + 68.152 + const vsint32_t vzero = spu_splats(0); 68.153 + const vsint16_t v20ss = spu_splats(i20ss); 68.154 + const vsint16_t v5ss = spu_splats(i5ss); 68.155 + const vsint16_t v16ss = spu_splats(i16ss); 68.156 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 68.157 + vuint16_t sat; 68.158 + 68.159 + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; 68.160 + const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; 68.161 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 68.162 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 68.163 + 68.164 + const int permM2 = (unsigned int) (src-2) & 15; 68.165 + const int permM1 = (unsigned int) (src-1) & 15; 68.166 + const int permP0 = (unsigned int) (src) & 15; 68.167 + const int permP1 = (unsigned int) (src+1) & 15; 68.168 + 
const int permP2 = (unsigned int) (src+2) & 15; 68.169 + const int permP3 = (unsigned int) (src+3) & 15; 68.170 + 68.171 + register int align = ((((unsigned long)src) - 2) % 16); 68.172 + 68.173 + for (i = 0 ; i < h ; i ++) { 68.174 + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 68.175 + vuint8_t srcR1 = *(vuint8_t *)(src-2); 68.176 + vuint8_t srcR2 = *(vuint8_t *)(src+14); 68.177 + 68.178 + switch (align) { 68.179 + default: { 68.180 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.181 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.182 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.183 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.184 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); 68.185 + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); 68.186 + } break; 68.187 + case 11: { 68.188 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.189 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.190 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.191 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.192 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); 68.193 + srcP3 = srcR2; 68.194 + } break; 68.195 + case 12: { 68.196 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.197 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.198 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.199 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.200 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.201 + srcP2 = srcR2; 68.202 + srcP3 = 
spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.203 + } break; 68.204 + case 13: { 68.205 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.206 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.207 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.208 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.209 + srcP1 = srcR2; 68.210 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.211 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.212 + } break; 68.213 + case 14: { 68.214 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.215 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.216 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.217 + srcP0 = srcR2; 68.218 + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.219 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.220 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.221 + } break; 68.222 + case 15: { 68.223 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.224 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.225 + srcM1 = srcR2; 68.226 + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); 68.227 + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.228 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.229 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.230 + } break; 68.231 + } 68.232 + 68.233 + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); 68.234 + const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel); 68.235 + const vsint16_t srcP1A = 
(vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); 68.236 + const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel); 68.237 + 68.238 + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); 68.239 + const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel); 68.240 + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); 68.241 + const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel); 68.242 + 68.243 + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); 68.244 + const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel); 68.245 + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); 68.246 + const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel); 68.247 + 68.248 + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); 68.249 + const vsint16_t sum1B = spu_add(srcP0B, srcP1B); 68.250 + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); 68.251 + const vsint16_t sum2B = spu_add(srcM1B, srcP2B); 68.252 + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); 68.253 + const vsint16_t sum3B = spu_add(srcM2B, srcP3B); 68.254 + 68.255 + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); 68.256 + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); 68.257 + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); 68.258 + const vsint16_t pp1A = spu_add(pp1A3, v16ss); 68.259 + 68.260 + const vsint32_t pp1B1 = spu_mule(sum1B, v20ss); 68.261 + const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss); 68.262 + const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez); 68.263 + const vsint16_t pp1B = spu_add(pp1B3, v16ss); 68.264 + 68.265 + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); 68.266 + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); 68.267 + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); 68.268 + 68.269 + const vsint32_t pp2B1 = spu_mule(sum2B, v5ss); 68.270 + const 
vsint32_t pp2B2 = spu_mulo(sum2B, v5ss); 68.271 + const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez); 68.272 + 68.273 + const vsint16_t pp3A = spu_add(sum3A, pp1A); 68.274 + const vsint16_t pp3B = spu_add(sum3B, pp1B); 68.275 + 68.276 + const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A); 68.277 + const vsint16_t psumB = spu_sub(pp3B, (vsint16_t)pp2B); 68.278 + 68.279 + vsint16_t sumA = spu_rlmask(psumA, -5); 68.280 + vsint16_t sumB = spu_rlmask(psumB, -5); 68.281 + 68.282 + //Saturation to 0 and 255 68.283 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); 68.284 + sumA = spu_and(sumA,(vsint16_t)sat); 68.285 + sat = spu_cmpgt(sumA,vmax); 68.286 + sumA = spu_sel(sumA,vmax,sat); 68.287 + sat = spu_cmpgt(sumB,(vsint16_t)vzero); 68.288 + sumB = spu_and(sumB,(vsint16_t)sat); 68.289 + sat = spu_cmpgt(sumB,vmax); 68.290 + sumB = spu_sel(sumB,vmax,sat); 68.291 + 68.292 + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu); 68.293 + 68.294 + /* 16x16 dest luma blocks are alway aligned */ 68.295 + const vuint8_t vdst = *(vuint8_t *)dst; 68.296 + 68.297 + vuint8_t fsum; 68.298 + OP_U8_SPU(fsum, sum, vdst); 68.299 + 68.300 + *(vuint8_t *)dst=fsum; 68.301 + 68.302 + src += STRIDE_Y; 68.303 + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ 68.304 + } 68.305 +} 68.306 + 68.307 +/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ 68.308 +static void PREFIX_h264_qpel16_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) { 68.309 + register int i; 68.310 + 68.311 + const int16_t i20ss = 20; 68.312 + const int16_t i5ss = 5; 68.313 + const int16_t imax = 255; 68.314 + 68.315 + const vsint32_t vzero = spu_splats(0); 68.316 + const vsint16_t v20ss = spu_splats(i20ss); 68.317 + const vsint16_t v5ss = spu_splats(i5ss); 68.318 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 68.319 + vuint16_t sat; 68.320 + 68.321 + const vuint8_t 
mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; 68.322 + const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F}; 68.323 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 68.324 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 68.325 + 68.326 + const int permM2 = (unsigned int) (src-2) & 15; 68.327 + const int permM1 = (unsigned int) (src-1) & 15; 68.328 + const int permP0 = (unsigned int) (src) & 15; 68.329 + const int permP1 = (unsigned int) (src+1) & 15; 68.330 + const int permP2 = (unsigned int) (src+2) & 15; 68.331 + const int permP3 = (unsigned int) (src+3) & 15; 68.332 + 68.333 + register int align = ((((unsigned long)src) - 2) % 16); 68.334 + 68.335 + src -= (2 * STRIDE_Y); 68.336 + 68.337 + for (i = 0 ; i < (h+5) ; i ++) { 68.338 + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 68.339 + vuint8_t srcR1 = *(vuint8_t *)(src-2); 68.340 + vuint8_t srcR2 = *(vuint8_t *)(src+14); 68.341 + 68.342 + switch (align) { 68.343 + default: { 68.344 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.345 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.346 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.347 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.348 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); 68.349 + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); 68.350 + } break; 68.351 + case 11: { 68.352 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.353 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.354 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, 
permP0-16)); 68.355 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.356 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); 68.357 + srcP3 = srcR2; 68.358 + } break; 68.359 + case 12: { 68.360 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.361 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.362 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.363 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.364 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.365 + srcP2 = srcR2; 68.366 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.367 + } break; 68.368 + case 13: { 68.369 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.370 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.371 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.372 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.373 + srcP1 = srcR2; 68.374 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.375 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.376 + } break; 68.377 + case 14: { 68.378 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.379 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.380 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.381 + srcP0 = srcR2; 68.382 + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.383 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.384 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.385 + } break; 68.386 + case 15: { 68.387 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.388 + 
srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.389 + srcM1 = srcR2; 68.390 + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); 68.391 + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.392 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.393 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.394 + } break; 68.395 + } 68.396 + 68.397 + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); 68.398 + const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel); 68.399 + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); 68.400 + const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel); 68.401 + 68.402 + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); 68.403 + const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel); 68.404 + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); 68.405 + const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel); 68.406 + 68.407 + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); 68.408 + const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel); 68.409 + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); 68.410 + const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel); 68.411 + 68.412 + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); 68.413 + const vsint16_t sum1B = spu_add(srcP0B, srcP1B); 68.414 + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); 68.415 + const vsint16_t sum2B = spu_add(srcM1B, srcP2B); 68.416 + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); 68.417 + const vsint16_t sum3B = spu_add(srcM2B, srcP3B); 68.418 + 68.419 + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); 68.420 + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); 68.421 + const vsint16_t pp1A3 = 
(vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); 68.422 + const vsint16_t pp1A = spu_add(pp1A3, sum3A); 68.423 + 68.424 + const vsint32_t pp1B1 = spu_mule(sum1B, v20ss); 68.425 + const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss); 68.426 + const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez); 68.427 + const vsint16_t pp1B = spu_add(pp1B3, sum3B); 68.428 + 68.429 + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); 68.430 + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); 68.431 + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); 68.432 + 68.433 + const vsint32_t pp2B1 = spu_mule(sum2B, v5ss); 68.434 + const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss); 68.435 + const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez); 68.436 + 68.437 + const vsint16_t psumA = spu_sub(pp1A, pp2A); 68.438 + const vsint16_t psumB = spu_sub(pp1B, pp2B); 68.439 + 68.440 + *(vsint16_t *)tmp = psumA; 68.441 + *(vsint16_t *)(tmp+8) = psumB; 68.442 + 68.443 + src += STRIDE_Y; 68.444 + tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ 68.445 + } 68.446 + 68.447 + const int32_t ni10si = -10; 68.448 + const int16_t i1ss = 1; 68.449 + const int32_t i512si = 512; 68.450 + const int32_t ni16si = -16; 68.451 + 68.452 + const vsint32_t nv10si = spu_splats(ni10si); 68.453 + const vsint16_t v1ss = spu_splats(i1ss); 68.454 + const vsint32_t v512si = spu_splats(i512si); 68.455 + const vsint32_t nv16si = spu_splats(ni16si); 68.456 + 68.457 + const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; 68.458 + const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; 68.459 + 68.460 + int16_t *tmpbis = tmp - (tmpStride * (h+5)); 68.461 + 68.462 + vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis); 68.463 + vsint16_t tmpM2ssB = *(vsint16_t *)(tmpbis+8); 68.464 + tmpbis += tmpStride; 68.465 + 
vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis); 68.466 + vsint16_t tmpM1ssB = *(vsint16_t *)(tmpbis+8); 68.467 + tmpbis += tmpStride; 68.468 + vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis); 68.469 + vsint16_t tmpP0ssB = *(vsint16_t *)(tmpbis+8); 68.470 + tmpbis += tmpStride; 68.471 + vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis); 68.472 + vsint16_t tmpP1ssB = *(vsint16_t *)(tmpbis+8); 68.473 + tmpbis += tmpStride; 68.474 + vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis); 68.475 + vsint16_t tmpP2ssB = *(vsint16_t *)(tmpbis+8); 68.476 + tmpbis += tmpStride; 68.477 + 68.478 + for (i = 0 ; i < h ; i++) { 68.479 + const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis); 68.480 + const vsint16_t tmpP3ssB = *(vsint16_t *)(tmpbis+8); 68.481 + tmpbis += tmpStride; 68.482 + 68.483 + const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA); 68.484 + const vsint16_t sum1B = spu_add(tmpP0ssB, tmpP1ssB); 68.485 + const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA); 68.486 + const vsint16_t sum2B = spu_add(tmpM1ssB, tmpP2ssB); 68.487 + const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA); 68.488 + const vsint16_t sum3B = spu_add(tmpM2ssB, tmpP3ssB); 68.489 + 68.490 + tmpM2ssA = tmpM1ssA; 68.491 + tmpM2ssB = tmpM1ssB; 68.492 + tmpM1ssA = tmpP0ssA; 68.493 + tmpM1ssB = tmpP0ssB; 68.494 + tmpP0ssA = tmpP1ssA; 68.495 + tmpP0ssB = tmpP1ssB; 68.496 + tmpP1ssA = tmpP2ssA; 68.497 + tmpP1ssB = tmpP2ssB; 68.498 + tmpP2ssA = tmpP3ssA; 68.499 + tmpP2ssB = tmpP3ssB; 68.500 + 68.501 + const vsint32_t pp1Ae = spu_mule(sum1A, v20ss); 68.502 + const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss); 68.503 + const vsint32_t pp1Be = spu_mule(sum1B, v20ss); 68.504 + const vsint32_t pp1Bo = spu_mulo(sum1B, v20ss); 68.505 + 68.506 + const vsint32_t pp2Ae = spu_mule(sum2A, v5ss); 68.507 + const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss); 68.508 + const vsint32_t pp2Be = spu_mule(sum2B, v5ss); 68.509 + const vsint32_t pp2Bo = spu_mulo(sum2B, v5ss); 68.510 + 68.511 + const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si); 68.512 + 
const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss); 68.513 + const vsint32_t pp3Be = spu_rlmask((vsint32_t)sum3B, nv16si); 68.514 + const vsint32_t pp3Bo = spu_mulo(sum3B, v1ss); 68.515 + 68.516 + const vsint32_t pp1cAe = spu_add(pp1Ae, v512si); 68.517 + const vsint32_t pp1cAo = spu_add(pp1Ao, v512si); 68.518 + const vsint32_t pp1cBe = spu_add(pp1Be, v512si); 68.519 + const vsint32_t pp1cBo = spu_add(pp1Bo, v512si); 68.520 + 68.521 + const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae); 68.522 + const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao); 68.523 + const vsint32_t pp32Be = spu_sub(pp3Be, pp2Be); 68.524 + const vsint32_t pp32Bo = spu_sub(pp3Bo, pp2Bo); 68.525 + 68.526 + const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae); 68.527 + const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao); 68.528 + const vsint32_t sumBe = spu_add(pp1cBe, pp32Be); 68.529 + const vsint32_t sumBo = spu_add(pp1cBo, pp32Bo); 68.530 + 68.531 + const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si); 68.532 + const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si); 68.533 + const vsint32_t ssumBe = spu_rlmask(sumBe, nv10si); 68.534 + const vsint32_t ssumBo = spu_rlmask(sumBo, nv10si); 68.535 + 68.536 + vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, ssumBe, packs); 68.537 + vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, ssumBo, packs); 68.538 + 68.539 + //Saturation to 0 and 255 68.540 + sat = spu_cmpgt(ssume,(vsint16_t)vzero); 68.541 + ssume = spu_and(ssume,(vsint16_t)sat); 68.542 + sat = spu_cmpgt(ssume,vmax); 68.543 + ssume = spu_sel(ssume,vmax,sat); 68.544 + sat = spu_cmpgt(ssumo,(vsint16_t)vzero); 68.545 + ssumo = spu_and(ssumo,(vsint16_t)sat); 68.546 + sat = spu_cmpgt(ssumo,vmax); 68.547 + ssumo = spu_sel(ssumo,vmax,sat); 68.548 + 68.549 + const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu); 68.550 + 68.551 + const vuint8_t sum = spu_shuffle(sumv, sumv, mperm); 68.552 + 68.553 + /* 16x16 dest luma blocks are alway aligned */ 68.554 + const vuint8_t vdst = *(vuint8_t *)dst; 68.555 + 68.556 + vuint8_t 
fsum; 68.557 + OP_U8_SPU(fsum, sum, vdst); 68.558 + 68.559 + *(vuint8_t *)dst=fsum; 68.560 + 68.561 + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ 68.562 + 68.563 + } 68.564 +} 68.565 + 68.566 +static void PREFIX_h264_qpel8_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { 68.567 + 68.568 + register int i; 68.569 + 68.570 + const int16_t i20ss= 20; 68.571 + const int16_t i5ss= 5; 68.572 + const int16_t i16ss= 16; 68.573 + const int16_t imax = 255; 68.574 + 68.575 + const vsint32_t vzero = spu_splats(0); 68.576 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 68.577 + vuint16_t sat; 68.578 + 68.579 + const vsint16_t v20ss = spu_splats(i20ss); 68.580 + const vsint16_t v5ss = spu_splats(i5ss); 68.581 + const vsint16_t v16ss = spu_splats(i16ss); 68.582 + const int shift_src = (unsigned int) src & 15; 68.583 + 68.584 + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; 68.585 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 68.586 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 68.587 + 68.588 + /* 8x8 dest luma blocks are aligned or desaligned by 8*/ 68.589 + const int shift_dst = (unsigned int) dst & 15; 68.590 + vuint8_t dstmask; 68.591 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 68.592 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 68.593 + 68.594 + if(shift_dst==0){ 68.595 + dstmask = dst8mask1; 68.596 + } 68.597 + else{ 68.598 + dstmask = dst8mask2; 68.599 + } 68.600 + 68.601 + uint8_t *srcbis = src - (STRIDE_Y * 2); 68.602 + 68.603 + const vuint8_t srcM2a = *(vuint8_t *)(srcbis); 68.604 + const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16); 68.605 + const vuint8_t srcM2= 
spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); 68.606 + 68.607 + srcbis += STRIDE_Y; 68.608 + const vuint8_t srcM1a = *(vuint8_t *)(srcbis); 68.609 + const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16); 68.610 + const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16)); 68.611 + 68.612 + srcbis += STRIDE_Y; 68.613 + const vuint8_t srcP0a = *(vuint8_t *)(srcbis); 68.614 + const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16); 68.615 + const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16)); 68.616 + 68.617 + srcbis += STRIDE_Y; 68.618 + const vuint8_t srcP1a = *(vuint8_t *)(srcbis); 68.619 + const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16); 68.620 + const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16)); 68.621 + 68.622 + srcbis += STRIDE_Y; 68.623 + const vuint8_t srcP2a = *(vuint8_t *)(srcbis); 68.624 + const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16); 68.625 + const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16)); 68.626 + 68.627 + srcbis += STRIDE_Y; 68.628 + 68.629 + vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); 68.630 + vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); 68.631 + vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); 68.632 + vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); 68.633 + vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); 68.634 + 68.635 + for (i = 0 ; i < h ; i++) { 68.636 + const vuint8_t srcP3a = *(vuint8_t *)(srcbis); 68.637 + const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16); 68.638 + const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16)); 68.639 + 68.640 + const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); 68.641 + srcbis += STRIDE_Y; 68.642 + 68.643 + const vsint16_t 
sum1A = spu_add(srcP0ssA, srcP1ssA); 68.644 + const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA); 68.645 + const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA); 68.646 + 68.647 + srcM2ssA = srcM1ssA; 68.648 + srcM1ssA = srcP0ssA; 68.649 + srcP0ssA = srcP1ssA; 68.650 + srcP1ssA = srcP2ssA; 68.651 + srcP2ssA = srcP3ssA; 68.652 + 68.653 + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); 68.654 + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); 68.655 + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); 68.656 + const vsint16_t pp1A = spu_add(pp1A3, v16ss); 68.657 + 68.658 + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); 68.659 + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); 68.660 + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); 68.661 + 68.662 + const vsint16_t pp3A = spu_add(sum3A, pp1A); 68.663 + const vsint16_t psumA = spu_sub(pp3A, pp2A); 68.664 + vsint16_t sumA = spu_rlmask(psumA, -5); 68.665 + 68.666 + //Saturation to 0 and 255 68.667 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); 68.668 + sumA = spu_and(sumA,(vsint16_t)sat); 68.669 + sat = spu_cmpgt(sumA,vmax); 68.670 + sumA = spu_sel(sumA,vmax,sat); 68.671 + 68.672 + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); 68.673 + 68.674 + const vuint8_t dst1 = *(vuint8_t *)dst; 68.675 + 68.676 + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); 68.677 + vuint8_t fsum; 68.678 + OP_U8_SPU(fsum, dsum, dst1); 68.679 + 68.680 + *(vuint8_t *)dst=fsum; 68.681 + 68.682 + dst += dstStride; 68.683 + } 68.684 +} 68.685 + 68.686 +static void PREFIX_h264_qpel8_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { 68.687 + 68.688 + register int i; 68.689 + 68.690 + const int16_t i20ss = 20; 68.691 + const int16_t i5ss = 5; 68.692 + const int16_t i16ss = 16; 68.693 + const int16_t imax = 255; 68.694 + 68.695 + const vsint32_t vzero = spu_splats(0); 68.696 + const vsint16_t v20ss = spu_splats(i20ss); 68.697 + 
const vsint16_t v5ss = spu_splats(i5ss); 68.698 + const vsint16_t v16ss = spu_splats(i16ss); 68.699 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 68.700 + vuint16_t sat; 68.701 + 68.702 + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; 68.703 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 68.704 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 68.705 + 68.706 + /* 8x8 dest luma blocks are aligned or desaligned by 8*/ 68.707 + const int shift_dst = (unsigned int) dst & 15; 68.708 + vuint8_t dstmask; 68.709 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 68.710 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 68.711 + 68.712 + if(shift_dst==0){ 68.713 + dstmask = dst8mask1; 68.714 + } 68.715 + else{ 68.716 + dstmask = dst8mask2; 68.717 + } 68.718 + 68.719 + const int permM2 = (unsigned int) (src-2) & 15; 68.720 + const int permM1 = (unsigned int) (src-1) & 15; 68.721 + const int permP0 = (unsigned int) (src) & 15; 68.722 + const int permP1 = (unsigned int) (src+1) & 15; 68.723 + const int permP2 = (unsigned int) (src+2) & 15; 68.724 + const int permP3 = (unsigned int) (src+3) & 15; 68.725 + 68.726 + register int align = ((((unsigned long)src) - 2) % 16); 68.727 + 68.728 + for (i = 0 ; i < h ; i ++) { 68.729 + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 68.730 + vuint8_t srcR1 = *(vuint8_t *)(src-2); 68.731 + vuint8_t srcR2 = *(vuint8_t *)(src+14); 68.732 + 68.733 + switch (align) { 68.734 + default: { 68.735 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.736 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.737 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, 
permP0-16)); 68.738 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.739 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); 68.740 + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); 68.741 + } break; 68.742 + case 11: { 68.743 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.744 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.745 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.746 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.747 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); 68.748 + srcP3 = srcR2; 68.749 + } break; 68.750 + case 12: { 68.751 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.752 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.753 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.754 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.755 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.756 + srcP2 = srcR2; 68.757 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.758 + } break; 68.759 + case 13: { 68.760 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.761 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.762 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.763 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.764 + srcP1 = srcR2; 68.765 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.766 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.767 + } break; 68.768 + case 14: { 68.769 + vuint8_t srcR3 = *(vuint8_t 
*)(src+30); 68.770 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.771 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.772 + srcP0 = srcR2; 68.773 + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.774 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.775 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.776 + } break; 68.777 + case 15: { 68.778 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.779 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.780 + srcM1 = srcR2; 68.781 + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); 68.782 + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.783 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.784 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.785 + } break; 68.786 + } 68.787 + 68.788 + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); 68.789 + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); 68.790 + 68.791 + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); 68.792 + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); 68.793 + 68.794 + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); 68.795 + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); 68.796 + 68.797 + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); 68.798 + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); 68.799 + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); 68.800 + 68.801 + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); 68.802 + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); 68.803 + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); 68.804 + 
const vsint16_t pp1A = spu_add(pp1A3, v16ss); 68.805 + 68.806 + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); 68.807 + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); 68.808 + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); 68.809 + 68.810 + const vsint16_t pp3A = spu_add(sum3A, pp1A); 68.811 + 68.812 + const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A); 68.813 + 68.814 + vsint16_t sumA = spu_rlmask(psumA, -5); 68.815 + 68.816 + //Saturation to 0 and 255 68.817 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); 68.818 + sumA = spu_and(sumA,(vsint16_t)sat); 68.819 + sat = spu_cmpgt(sumA,vmax); 68.820 + sumA = spu_sel(sumA,vmax,sat); 68.821 + 68.822 + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); 68.823 + 68.824 + const vuint8_t dst1 = *(vuint8_t *)dst; 68.825 + 68.826 + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); 68.827 + vuint8_t fsum; 68.828 + OP_U8_SPU(fsum, dsum, dst1); 68.829 + 68.830 + *(vuint8_t *)dst=fsum; 68.831 + 68.832 + src += STRIDE_Y; 68.833 + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ 68.834 + } 68.835 +} 68.836 + 68.837 +/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ 68.838 +static void PREFIX_h264_qpel8_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) { 68.839 + register int i; 68.840 + 68.841 + const int16_t i20ss = 20; 68.842 + const int16_t i5ss = 5; 68.843 + const int16_t imax = 255; 68.844 + 68.845 + const vsint32_t vzero = spu_splats(0); 68.846 + const vsint16_t v20ss = spu_splats(i20ss); 68.847 + const vsint16_t v5ss = spu_splats(i5ss); 68.848 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 68.849 + vuint16_t sat; 68.850 + 68.851 + const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07}; 68.852 + const vuint8_t packsu = 
{0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 68.853 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 68.854 + 68.855 + const int permM2 = (unsigned int) (src-2) & 15; 68.856 + const int permM1 = (unsigned int) (src-1) & 15; 68.857 + const int permP0 = (unsigned int) (src) & 15; 68.858 + const int permP1 = (unsigned int) (src+1) & 15; 68.859 + const int permP2 = (unsigned int) (src+2) & 15; 68.860 + const int permP3 = (unsigned int) (src+3) & 15; 68.861 + 68.862 + register int align = ((((unsigned long)src) - 2) % 16); 68.863 + 68.864 + src -= (2 * STRIDE_Y); 68.865 + 68.866 + for (i = 0 ; i < (h+5) ; i ++) { 68.867 + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 68.868 + vuint8_t srcR1 = *(vuint8_t *)(src-2); 68.869 + vuint8_t srcR2 = *(vuint8_t *)(src+14); 68.870 + 68.871 + switch (align) { 68.872 + default: { 68.873 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.874 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.875 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.876 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.877 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); 68.878 + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); 68.879 + } break; 68.880 + case 11: { 68.881 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.882 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.883 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.884 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.885 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); 68.886 + srcP3 = srcR2; 68.887 + } break; 68.888 
+ case 12: { 68.889 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.890 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.891 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.892 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.893 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.894 + srcP2 = srcR2; 68.895 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.896 + } break; 68.897 + case 13: { 68.898 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.899 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.900 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.901 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.902 + srcP1 = srcR2; 68.903 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.904 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.905 + } break; 68.906 + case 14: { 68.907 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.908 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.909 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.910 + srcP0 = srcR2; 68.911 + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.912 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.913 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.914 + } break; 68.915 + case 15: { 68.916 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.917 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.918 + srcM1 = srcR2; 68.919 + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); 68.920 + srcP1 = spu_or(spu_slqwbyte(srcR2, 
permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.921 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.922 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.923 + } break; 68.924 + } 68.925 + 68.926 + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh); 68.927 + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh); 68.928 + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh); 68.929 + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh); 68.930 + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh); 68.931 + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh); 68.932 + 68.933 + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); 68.934 + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); 68.935 + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); 68.936 + 68.937 + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); 68.938 + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); 68.939 + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); 68.940 + const vsint16_t pp1A = spu_add(pp1A3, sum3A); 68.941 + 68.942 + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); 68.943 + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); 68.944 + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); 68.945 + 68.946 + const vsint16_t psumA = spu_sub(pp1A, pp2A); 68.947 + 68.948 + *(vsint16_t *)tmp = psumA; 68.949 + 68.950 + src += STRIDE_Y; 68.951 + tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ 68.952 + } 68.953 + 68.954 + const int32_t ni10si = -10; 68.955 + const int16_t i1ss = 1; 68.956 + const int32_t i512si = 512; 68.957 + const int32_t ni16si = -16; 68.958 + 68.959 + const vsint32_t nv10si = spu_splats(ni10si); 68.960 + const vsint16_t v1ss = spu_splats(i1ss); 68.961 + const 
vsint32_t v512si = spu_splats(i512si); 68.962 + const vsint32_t nv16si = spu_splats(ni16si); 68.963 + 68.964 + const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; 68.965 + const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; 68.966 + 68.967 + const int shift_dst = (unsigned int) (dst) & 15; 68.968 + /* 8x8 dest luma blocks are aligned or desaligned by 8*/ 68.969 + vuint8_t dstmask; 68.970 + const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 68.971 + const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; 68.972 + 68.973 + if(shift_dst==0){ 68.974 + dstmask = dst8mask1; 68.975 + } 68.976 + else{ 68.977 + dstmask = dst8mask2; 68.978 + } 68.979 + 68.980 + int16_t *tmpbis = tmp - (tmpStride * (h+5)); 68.981 + 68.982 + vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis); 68.983 + tmpbis += tmpStride; 68.984 + vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis); 68.985 + tmpbis += tmpStride; 68.986 + vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis); 68.987 + tmpbis += tmpStride; 68.988 + vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis); 68.989 + tmpbis += tmpStride; 68.990 + vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis); 68.991 + tmpbis += tmpStride; 68.992 + 68.993 + for (i = 0 ; i < h ; i++) { 68.994 + const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis); 68.995 + tmpbis += tmpStride; 68.996 + 68.997 + const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA); 68.998 + const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA); 68.999 + const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA); 68.1000 + 68.1001 + tmpM2ssA = tmpM1ssA; 68.1002 + tmpM1ssA = tmpP0ssA; 68.1003 + tmpP0ssA = tmpP1ssA; 68.1004 + tmpP1ssA = tmpP2ssA; 68.1005 + tmpP2ssA = tmpP3ssA; 68.1006 + 68.1007 + const vsint32_t pp1Ae = spu_mule(sum1A, v20ss); 68.1008 + const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss); 68.1009 + const 
vsint32_t pp2Ae = spu_mule(sum2A, v5ss); 68.1010 + const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss); 68.1011 + 68.1012 + const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si); 68.1013 + const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss); 68.1014 + 68.1015 + const vsint32_t pp1cAe = spu_add(pp1Ae, v512si); 68.1016 + const vsint32_t pp1cAo = spu_add(pp1Ao, v512si); 68.1017 + 68.1018 + const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae); 68.1019 + const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao); 68.1020 + 68.1021 + const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae); 68.1022 + const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao); 68.1023 + 68.1024 + const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si); 68.1025 + const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si); 68.1026 + 68.1027 + vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs); 68.1028 + vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs); 68.1029 + 68.1030 + //Saturation to 0 and 255 68.1031 + sat = spu_cmpgt(ssume,(vsint16_t)vzero); 68.1032 + ssume = spu_and(ssume,(vsint16_t)sat); 68.1033 + sat = spu_cmpgt(ssume,vmax); 68.1034 + ssume = spu_sel(ssume,vmax,sat); 68.1035 + sat = spu_cmpgt(ssumo,(vsint16_t)vzero); 68.1036 + ssumo = spu_and(ssumo,(vsint16_t)sat); 68.1037 + sat = spu_cmpgt(ssumo,vmax); 68.1038 + ssumo = spu_sel(ssumo,vmax,sat); 68.1039 + 68.1040 + const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu); 68.1041 + 68.1042 + const vuint8_t sum = spu_shuffle(sumv, sumv, mperm); 68.1043 + 68.1044 + const vuint8_t dst1 = *(vuint8_t *)dst; 68.1045 + 68.1046 + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); 68.1047 + vuint8_t fsum; 68.1048 + OP_U8_SPU(fsum, dsum, dst1); 68.1049 + 68.1050 + *(vuint8_t *)dst=fsum; 68.1051 + 68.1052 + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ 68.1053 + 68.1054 + } 68.1055 +} 68.1056 + 68.1057 +static void PREFIX_h264_qpel4_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) { 68.1058 
+ 68.1059 + register int i; 68.1060 + 68.1061 + const int16_t i20ss= 20; 68.1062 + const int16_t i5ss= 5; 68.1063 + const int16_t i16ss= 16; 68.1064 + const int16_t imax = 255; 68.1065 + 68.1066 + const vsint32_t vzero = spu_splats(0); 68.1067 + const vsint16_t v20ss = spu_splats(i20ss); 68.1068 + const vsint16_t v5ss = spu_splats(i5ss); 68.1069 + const vsint16_t v16ss = spu_splats(i16ss); 68.1070 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 68.1071 + vuint16_t sat; 68.1072 + 68.1073 + const int shift_src = (unsigned int) src & 15; 68.1074 + 68.1075 + const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07}; 68.1076 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 68.1077 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 68.1078 + 68.1079 + /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/ 68.1080 + const int shift_dst = (unsigned int) dst & 15; 68.1081 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 68.1082 + const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 68.1083 + const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 68.1084 + const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 68.1085 + const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 68.1086 + 68.1087 + switch(shift_dst){ 68.1088 + case 0: dstmask = dst4mask0; 68.1089 + break; 68.1090 + case 4: dstmask = dst4mask4; 68.1091 + break; 68.1092 + case 8: dstmask = dst4mask8; 68.1093 + break; 68.1094 + case 12: dstmask = dst4mask12; 68.1095 + break; 68.1096 + } 68.1097 + 68.1098 + uint8_t *srcbis = src - (STRIDE_Y * 2); 68.1099 + 68.1100 + const vuint8_t srcM2a = *(vuint8_t 
*)(srcbis); 68.1101 + const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16); 68.1102 + const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16)); 68.1103 + 68.1104 + srcbis += STRIDE_Y; 68.1105 + const vuint8_t srcM1a = *(vuint8_t *)(srcbis); 68.1106 + const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16); 68.1107 + const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16)); 68.1108 + 68.1109 + srcbis += STRIDE_Y; 68.1110 + const vuint8_t srcP0a = *(vuint8_t *)(srcbis); 68.1111 + const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16); 68.1112 + const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16)); 68.1113 + 68.1114 + srcbis += STRIDE_Y; 68.1115 + const vuint8_t srcP1a = *(vuint8_t *)(srcbis); 68.1116 + const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16); 68.1117 + const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16)); 68.1118 + 68.1119 + srcbis += STRIDE_Y; 68.1120 + const vuint8_t srcP2a = *(vuint8_t *)(srcbis); 68.1121 + const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16); 68.1122 + const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16)); 68.1123 + 68.1124 + srcbis += STRIDE_Y; 68.1125 + 68.1126 + vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh); 68.1127 + vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh); 68.1128 + vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh); 68.1129 + vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh); 68.1130 + vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh); 68.1131 + 68.1132 + for (i = 0 ; i < h ; i++) { 68.1133 + const vuint8_t srcP3a = *(vuint8_t *)(srcbis); 68.1134 + const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16); 68.1135 + const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16)); 68.1136 + 68.1137 + 
const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh); 68.1138 + srcbis += STRIDE_Y; 68.1139 + 68.1140 + const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA); 68.1141 + const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA); 68.1142 + const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA); 68.1143 + 68.1144 + srcM2ssA = srcM1ssA; 68.1145 + srcM1ssA = srcP0ssA; 68.1146 + srcP0ssA = srcP1ssA; 68.1147 + srcP1ssA = srcP2ssA; 68.1148 + srcP2ssA = srcP3ssA; 68.1149 + 68.1150 + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); 68.1151 + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); 68.1152 + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); 68.1153 + const vsint16_t pp1A = spu_add(pp1A3, v16ss); 68.1154 + 68.1155 + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); 68.1156 + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); 68.1157 + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); 68.1158 + 68.1159 + const vsint16_t pp3A = spu_add(sum3A, pp1A); 68.1160 + const vsint16_t psumA = spu_sub(pp3A, pp2A); 68.1161 + vsint16_t sumA = spu_rlmask(psumA, -5); 68.1162 + 68.1163 + //Saturation to 0 and 255 68.1164 + sat = spu_cmpgt(sumA,(vsint16_t)vzero); 68.1165 + sumA = spu_and(sumA,(vsint16_t)sat); 68.1166 + sat = spu_cmpgt(sumA,vmax); 68.1167 + sumA = spu_sel(sumA,vmax,sat); 68.1168 + 68.1169 + const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu); 68.1170 + 68.1171 + const vuint8_t dst1 = *(vuint8_t *)dst; 68.1172 + 68.1173 + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); 68.1174 + vuint8_t fsum; 68.1175 + OP_U8_SPU(fsum, dsum, dst1); 68.1176 + 68.1177 + *(vuint8_t *)dst=fsum; 68.1178 + 68.1179 + dst += dstStride; 68.1180 + } 68.1181 +} 68.1182 +
/*
 * Extract 16 bytes that start n bytes into quadword `hi` and continue into
 * quadword `lo`.  For n == 0 the second term is shifted out completely and
 * the result is `hi` itself.  Local to the function below.
 */
#define QPEL4_H_SPAN(hi, lo, n) \
    spu_or(spu_slqwbyte((hi), (n)), spu_rlmaskqwbyte((lo), (n) - 16))

/*
 * 4-pixel-wide horizontal luma quarter-pel lowpass.
 *
 * For each of the h rows the six-tap sum
 *     (20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5
 * is evaluated in 16-bit lanes, clamped to 0..255, and the first four result
 * bytes are merged into the quadword at dst through OP_U8_SPU; the other
 * twelve bytes of that quadword come from the previous destination contents.
 *
 * dst        destination; its offset inside the quadword must be 0, 4, 8 or 12
 * src        first source pixel of the row; rows are STRIDE_Y bytes apart and
 *            the alignment measured on the first row is reused for every row
 * dstStride  destination pitch in bytes (multiple of 16: the merge mask is
 *            chosen once, outside the loop)
 * h          number of rows
 *
 * NOTE(review): OP_U8_SPU and STRIDE_Y are defined outside this block.
 */
static void PREFIX_h264_qpel4_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {

    register int i;

    const int16_t i20ss = 20;
    const int16_t i5ss = 5;
    const int16_t i16ss = 16;
    const int16_t imax = 255;

    const vsint32_t vzero = spu_splats(0);
    const vsint16_t v20ss = spu_splats(i20ss);
    const vsint16_t v5ss = spu_splats(i5ss);
    const vsint16_t v16ss = spu_splats(i16ss);
    const vsint16_t vmax = (vsint16_t)spu_splats(imax);
    vuint16_t sat;

    /* first 8 bytes -> halfwords with a zero high byte (0x80 yields 0x00) */
    const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
    /* low byte of every halfword of A, then of B */
    const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
    /* low halfwords of the even (A) and odd (B) 32-bit products, back in lane order */
    const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};

    /* 4x4 dest luma blocks are aligned or misaligned by 4, 8 or 12 */
    const int shift_dst = (unsigned int) dst & 15;
    vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    /* each mask drops the 4 new pixels (0x10..0x13) at one offset and keeps the rest of dst */
    const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
    const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
    const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
    const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};

    switch (shift_dst) {
    case 0:
        dstmask = dst4mask0;
        break;
    case 4:
        dstmask = dst4mask4;
        break;
    case 8:
        dstmask = dst4mask8;
        break;
    case 12:
        dstmask = dst4mask12;
        break;
    default:
        /* Any other offset keeps the all-zero mask, which replicates byte 0
         * of the old destination vector; callers are expected to pass a
         * 4-byte-aligned dst. */
        break;
    }

    /* byte offset of each of the six taps inside its quadword */
    const int permM2 = (unsigned int) (src-2) & 15;
    const int permM1 = (unsigned int) (src-1) & 15;
    const int permP0 = (unsigned int) (src) & 15;
    const int permP1 = (unsigned int) (src+1) & 15;
    const int permP2 = (unsigned int) (src+2) & 15;
    const int permP3 = (unsigned int) (src+3) & 15;

    register int align = ((((unsigned long)src) - 2) % 16);

    for (i = 0 ; i < h ; i ++) {
        vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vuint8_t srcR1 = *(vuint8_t *)(src-2);
        vuint8_t srcR2 = *(vuint8_t *)(src+14);

        /* For align >= 11 some taps start at or beyond the second quadword:
         * a tap with offset 0 is srcR2 itself, later taps straddle srcR2/srcR3. */
        switch (align) {
        default: {
            srcM2 = QPEL4_H_SPAN(srcR1, srcR2, permM2);
            srcM1 = QPEL4_H_SPAN(srcR1, srcR2, permM1);
            srcP0 = QPEL4_H_SPAN(srcR1, srcR2, permP0);
            srcP1 = QPEL4_H_SPAN(srcR1, srcR2, permP1);
            srcP2 = QPEL4_H_SPAN(srcR1, srcR2, permP2);
            srcP3 = QPEL4_H_SPAN(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = QPEL4_H_SPAN(srcR1, srcR2, permM2);
            srcM1 = QPEL4_H_SPAN(srcR1, srcR2, permM1);
            srcP0 = QPEL4_H_SPAN(srcR1, srcR2, permP0);
            srcP1 = QPEL4_H_SPAN(srcR1, srcR2, permP1);
            srcP2 = QPEL4_H_SPAN(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vuint8_t srcR3 = *(vuint8_t *)(src+30);
            srcM2 = QPEL4_H_SPAN(srcR1, srcR2, permM2);
            srcM1 = QPEL4_H_SPAN(srcR1, srcR2, permM1);
            srcP0 = QPEL4_H_SPAN(srcR1, srcR2, permP0);
            srcP1 = QPEL4_H_SPAN(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = QPEL4_H_SPAN(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vuint8_t srcR3 = *(vuint8_t *)(src+30);
            srcM2 = QPEL4_H_SPAN(srcR1, srcR2, permM2);
            srcM1 = QPEL4_H_SPAN(srcR1, srcR2, permM1);
            srcP0 = QPEL4_H_SPAN(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = QPEL4_H_SPAN(srcR2, srcR3, permP2);
            srcP3 = QPEL4_H_SPAN(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vuint8_t srcR3 = *(vuint8_t *)(src+30);
            srcM2 = QPEL4_H_SPAN(srcR1, srcR2, permM2);
            srcM1 = QPEL4_H_SPAN(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = QPEL4_H_SPAN(srcR2, srcR3, permP1);
            srcP2 = QPEL4_H_SPAN(srcR2, srcR3, permP2);
            srcP3 = QPEL4_H_SPAN(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vuint8_t srcR3 = *(vuint8_t *)(src+30);
            srcM2 = QPEL4_H_SPAN(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = QPEL4_H_SPAN(srcR2, srcR3, permP0);
            srcP1 = QPEL4_H_SPAN(srcR2, srcR3, permP1);
            srcP2 = QPEL4_H_SPAN(srcR2, srcR3, permP2);
            srcP3 = QPEL4_H_SPAN(srcR2, srcR3, permP3);
        } break;
        }

        /* widen the first 8 bytes of every tap to 16-bit lanes */
        const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
        const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);

        const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
        const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);

        const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
        const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);

        const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
        const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
        const vsint16_t sum3A = spu_add(srcM2A, srcP3A);

        /* 20*sum1 + 16 (rounding term) */
        const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
        const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
        const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
        const vsint16_t pp1A = spu_add(pp1A3, v16ss);

        /* 5*sum2 */
        const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
        const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
        const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);

        const vsint16_t pp3A = spu_add(sum3A, pp1A);

        const vsint16_t psumA = spu_sub(pp3A, pp2A);

        /* psumA is signed and goes negative whenever 5*sum2 dominates.  An
         * arithmetic shift keeps the sign so the clamp below maps those lanes
         * to 0; a logical shift (spu_rlmask) would turn them into large
         * positive values that clamp to 255. */
        vsint16_t sumA = spu_rlmaska(psumA, -5);

        /* Saturation to 0 and 255 */
        sat = spu_cmpgt(sumA,(vsint16_t)vzero);
        sumA = spu_and(sumA,(vsint16_t)sat);
        sat = spu_cmpgt(sumA,vmax);
        sumA = spu_sel(sumA,vmax,sat);

        const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);

        const vuint8_t dst1 = *(vuint8_t *)dst;

        const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
        vuint8_t fsum;
        OP_U8_SPU(fsum, dsum, dst1);

        *(vuint8_t *)dst=fsum;

        src += STRIDE_Y;
        dst += dstStride; /* stride is multiple of 16 so dstmask can remain out of the loop */
    }
}

#undef QPEL4_H_SPAN
68.1339 + 68.1340 +static void PREFIX_h264_qpel4_hv_lowpass_spu(uint8_t
* dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) { 68.1341 + register int i; 68.1342 + 68.1343 + const int16_t i20ss = 20; 68.1344 + const int16_t i5ss = 5; 68.1345 + const int16_t imax = 255; 68.1346 + 68.1347 + const vsint32_t vzero = spu_splats(0); 68.1348 + const vsint16_t v20ss = spu_splats(i20ss); 68.1349 + const vsint16_t v5ss = spu_splats(i5ss); 68.1350 + const vsint16_t vmax = (vsint16_t)spu_splats(imax); 68.1351 + vuint16_t sat; 68.1352 + 68.1353 + const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07}; 68.1354 + const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F}; 68.1355 + const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F}; 68.1356 + 68.1357 + const int permM2 = (unsigned int) (src-2) & 15; 68.1358 + const int permM1 = (unsigned int) (src-1) & 15; 68.1359 + const int permP0 = (unsigned int) (src) & 15; 68.1360 + const int permP1 = (unsigned int) (src+1) & 15; 68.1361 + const int permP2 = (unsigned int) (src+2) & 15; 68.1362 + const int permP3 = (unsigned int) (src+3) & 15; 68.1363 + 68.1364 + register int align = ((((unsigned long)src) - 2) % 16); 68.1365 + 68.1366 + src -= (2 * STRIDE_Y); 68.1367 + 68.1368 + for (i = 0 ; i < (h+5) ; i ++) { 68.1369 + vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 68.1370 + vuint8_t srcR1 = *(vuint8_t *)(src-2); 68.1371 + vuint8_t srcR2 = *(vuint8_t *)(src+14); 68.1372 + 68.1373 + switch (align) { 68.1374 + default: { 68.1375 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.1376 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.1377 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.1378 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.1379 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), 
spu_rlmaskqwbyte(srcR2, permP2-16)); 68.1380 + srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16)); 68.1381 + } break; 68.1382 + case 11: { 68.1383 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.1384 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.1385 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.1386 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.1387 + srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16)); 68.1388 + srcP3 = srcR2; 68.1389 + } break; 68.1390 + case 12: { 68.1391 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.1392 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.1393 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.1394 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.1395 + srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16)); 68.1396 + srcP2 = srcR2; 68.1397 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.1398 + } break; 68.1399 + case 13: { 68.1400 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.1401 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.1402 + srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.1403 + srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16)); 68.1404 + srcP1 = srcR2; 68.1405 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.1406 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.1407 + } break; 68.1408 + case 14: { 68.1409 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.1410 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.1411 + srcM1 = 
spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16)); 68.1412 + srcP0 = srcR2; 68.1413 + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.1414 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.1415 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.1416 + } break; 68.1417 + case 15: { 68.1418 + vuint8_t srcR3 = *(vuint8_t *)(src+30); 68.1419 + srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16)); 68.1420 + srcM1 = srcR2; 68.1421 + srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16)); 68.1422 + srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16)); 68.1423 + srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16)); 68.1424 + srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16)); 68.1425 + } break; 68.1426 + } 68.1427 + 68.1428 + const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh); 68.1429 + const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh); 68.1430 + const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh); 68.1431 + const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh); 68.1432 + const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh); 68.1433 + const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh); 68.1434 + 68.1435 + const vsint16_t sum1A = spu_add(srcP0A, srcP1A); 68.1436 + const vsint16_t sum2A = spu_add(srcM1A, srcP2A); 68.1437 + const vsint16_t sum3A = spu_add(srcM2A, srcP3A); 68.1438 + 68.1439 + const vsint32_t pp1A1 = spu_mule(sum1A, v20ss); 68.1440 + const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss); 68.1441 + const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez); 68.1442 + const vsint16_t pp1A = spu_add(pp1A3, sum3A); 
68.1443 + 68.1444 + const vsint32_t pp2A1 = spu_mule(sum2A, v5ss); 68.1445 + const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss); 68.1446 + const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez); 68.1447 + 68.1448 + const vsint16_t psumA = spu_sub(pp1A, pp2A); 68.1449 + 68.1450 + *(vsint16_t *)tmp = psumA; 68.1451 + 68.1452 + src += STRIDE_Y; 68.1453 + tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ 68.1454 + } 68.1455 + 68.1456 + const int32_t ni10si = -10; 68.1457 + const int16_t i1ss = 1; 68.1458 + const int32_t i512si = 512; 68.1459 + const int32_t ni16si = -16; 68.1460 + 68.1461 + const vsint32_t nv10si = spu_splats(ni10si); 68.1462 + const vsint16_t v1ss = spu_splats(i1ss); 68.1463 + const vsint32_t v512si = spu_splats(i512si); 68.1464 + const vsint32_t nv16si = spu_splats(ni16si); 68.1465 + 68.1466 + const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F}; 68.1467 + const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F}; 68.1468 + 68.1469 + const int shift_dst = (unsigned int) (dst) & 15; 68.1470 + /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/ 68.1471 + vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 68.1472 + const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 68.1473 + const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; 68.1474 + const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F}; 68.1475 + const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13}; 68.1476 + 68.1477 + switch(shift_dst){ 68.1478 + case 0: dstmask = dst4mask0; 68.1479 + break; 68.1480 + case 4: dstmask = dst4mask4; 68.1481 + break; 68.1482 + case 8: dstmask = dst4mask8; 68.1483 + break; 
68.1484 + case 12: dstmask = dst4mask12; 68.1485 + break; 68.1486 + } 68.1487 + 68.1488 + int16_t *tmpbis = tmp - (tmpStride * (h+5)); 68.1489 + 68.1490 + vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis); 68.1491 + tmpbis += tmpStride; 68.1492 + vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis); 68.1493 + tmpbis += tmpStride; 68.1494 + vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis); 68.1495 + tmpbis += tmpStride; 68.1496 + vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis); 68.1497 + tmpbis += tmpStride; 68.1498 + vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis); 68.1499 + tmpbis += tmpStride; 68.1500 + 68.1501 + for (i = 0 ; i < h ; i++) { 68.1502 + const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis); 68.1503 + tmpbis += tmpStride; 68.1504 + 68.1505 + const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA); 68.1506 + const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA); 68.1507 + const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA); 68.1508 + 68.1509 + tmpM2ssA = tmpM1ssA; 68.1510 + tmpM1ssA = tmpP0ssA; 68.1511 + tmpP0ssA = tmpP1ssA; 68.1512 + tmpP1ssA = tmpP2ssA; 68.1513 + tmpP2ssA = tmpP3ssA; 68.1514 + 68.1515 + const vsint32_t pp1Ae = spu_mule(sum1A, v20ss); 68.1516 + const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss); 68.1517 + const vsint32_t pp2Ae = spu_mule(sum2A, v5ss); 68.1518 + const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss); 68.1519 + 68.1520 + const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si); 68.1521 + const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss); 68.1522 + 68.1523 + const vsint32_t pp1cAe = spu_add(pp1Ae, v512si); 68.1524 + const vsint32_t pp1cAo = spu_add(pp1Ao, v512si); 68.1525 + 68.1526 + const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae); 68.1527 + const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao); 68.1528 + 68.1529 + const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae); 68.1530 + const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao); 68.1531 + 68.1532 + const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si); 68.1533 + const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si); 68.1534 + 68.1535 + 
vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs); 68.1536 + vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs); 68.1537 + 68.1538 + //Saturation to 0 and 255 68.1539 + sat = spu_cmpgt(ssume,(vsint16_t)vzero); 68.1540 + ssume = spu_and(ssume,(vsint16_t)sat); 68.1541 + sat = spu_cmpgt(ssume,vmax); 68.1542 + ssume = spu_sel(ssume,vmax,sat); 68.1543 + sat = spu_cmpgt(ssumo,(vsint16_t)vzero); 68.1544 + ssumo = spu_and(ssumo,(vsint16_t)sat); 68.1545 + sat = spu_cmpgt(ssumo,vmax); 68.1546 + ssumo = spu_sel(ssumo,vmax,sat); 68.1547 + 68.1548 + const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu); 68.1549 + 68.1550 + const vuint8_t sum = spu_shuffle(sumv, sumv, mperm); 68.1551 + 68.1552 + const vuint8_t dst1 = *(vuint8_t *)dst; 68.1553 + 68.1554 + const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask); 68.1555 + vuint8_t fsum; 68.1556 + OP_U8_SPU(fsum, dsum, dst1); 68.1557 + 68.1558 + *(vuint8_t *)dst=fsum; 68.1559 + 68.1560 + dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */ 68.1561 + 68.1562 + } 68.1563 +}
69.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 69.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.c Mon Aug 27 12:09:56 2012 +0200 69.3 @@ -0,0 +1,362 @@ 69.4 +/* 69.5 + * Copyright (c) 2009 TUDelft 69.6 + * 69.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 69.8 + */ 69.9 + 69.10 +/** 69.11 + * @file libavcodec/cell/spu/h264_main_spu.c 69.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding 69.13 + * @author C C Chi <c.c.chi@student.tudelft.nl> 69.14 + * 69.15 + * SIMD kernels 69.16 + * H.264/AVC motion compensation 69.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu> 69.18 + * @author Albert Paradis <apar7632@hotmail.com> 69.19 + */ 69.20 + 69.21 + 69.22 +#include <stdio.h> 69.23 +#include <spu_intrinsics.h> 69.24 +#include <spu_mfcio.h> 69.25 +#include <assert.h> 69.26 + 69.27 +#include "h264_mc_spu.h" 69.28 +#include "h264_dma.h" 69.29 +#include "h264_tables.h" 69.30 +#include "h264_decode_mb_spu.h" 69.31 + 69.32 + 69.33 +//biweight buffer 69.34 +DECLARE_ALIGNED_16(uint8_t, tmp_y_ls[48*16]); 69.35 +DECLARE_ALIGNED_16(uint8_t, tmp_cb_ls[32*8]); 69.36 +DECLARE_ALIGNED_16(uint8_t, tmp_cr_ls[32*8]); 69.37 + 69.38 +//ref buffer (double buffered) 69.39 +DECLARE_ALIGNED_16(uint8_t, mc_ref[2][16*(4+5)*48 + 2*16*(2+1)*32]); 69.40 +uint8_t* ref_ptr; 69.41 + 69.42 +/** Motion Compensation functions*/ 69.43 + 69.44 +static void fill_mc_part(H264mc *mc, int n, int chroma_height, int x_offset, int y_offset, int itp, int weight, int list0, int list1){ 69.45 + H264mc_part *mc_part = mc->mc_part + mc->npart; 69.46 + mc_part->n =n; 69.47 + mc_part->chroma_height =chroma_height; 69.48 + mc_part->x_offset = x_offset; 69.49 + mc_part->y_offset = y_offset; 69.50 + mc_part->itp = itp; 69.51 + mc_part->weight = weight; 69.52 + mc_part->list0 = list0; 69.53 + mc_part->list1 = list1; 69.54 + 69.55 + mc->npart++; 69.56 +} 69.57 + 69.58 +void calc_mc_params(H264Mb* mb, H264mc *mc){ 69.59 + int mb_type = mb->mb_type; 69.60 + mc->npart=0; 69.61 + 69.62 + 
assert(!IS_INTRA(mb_type)); 69.63 + if(IS_16X16(mb_type)){ 69.64 + fill_mc_part(mc, 0, 8, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); 69.65 + }else if(IS_16X8(mb_type)){ 69.66 + fill_mc_part(mc, 0, 4, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); 69.67 + fill_mc_part(mc, 8, 4, 0, 4, 0, 1, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); 69.68 + }else if(IS_8X16(mb_type)){ 69.69 + fill_mc_part(mc, 0, 8, 0, 0, 1, 2, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); 69.70 + fill_mc_part(mc, 4, 8, 4, 0, 1, 2, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); 69.71 + }else{ 69.72 + int i; 69.73 + assert(IS_8X8(mb_type)); 69.74 + 69.75 + for(i=0; i<4; i++){ 69.76 + const int sub_mb_type= mb->sub_mb_type[i]; 69.77 + const int n= 4*i; 69.78 + int x_offset= (i&1)<<2; 69.79 + int y_offset= (i&2)<<1; 69.80 + 69.81 + if(IS_SUB_8X8(sub_mb_type)){ 69.82 + fill_mc_part(mc, n, 4, x_offset, y_offset, 1, 3, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 69.83 + }else if(IS_SUB_8X4(sub_mb_type)){ 69.84 + fill_mc_part(mc, n, 2, x_offset, y_offset, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 69.85 + fill_mc_part(mc, n+2, 2, x_offset, y_offset+2, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 69.86 + }else if(IS_SUB_4X8(sub_mb_type)){ 69.87 + fill_mc_part(mc, n, 4, x_offset, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 69.88 + fill_mc_part(mc, n+1, 4, x_offset+2, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 69.89 + }else{ 69.90 + int j; 69.91 + assert(IS_SUB_4X4(sub_mb_type)); 69.92 + for(j=0; j<4; j++){ 69.93 + int sub_x_offset= x_offset + 2*(j&1); 69.94 + int sub_y_offset= y_offset + (j&2); 69.95 + fill_mc_part(mc, n+j, 2, sub_x_offset, sub_y_offset, 2, 6, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 69.96 + } 69.97 + } 69.98 + } 69.99 + } 69.100 +} 69.101 + 69.102 +/** 69.103 +* Returns a pointer to mc_buf 69.104 +*/ 69.105 +static void* alloc_mc_buf(int size){ 
69.106 + void* ptr = ref_ptr; 69.107 + ref_ptr += size; 69.108 + return ptr; 69.109 +} 69.110 + 69.111 +#define TAG_OFFSET_MC MBD_mc_buf1 69.112 +static uint8_t* get_mc_data(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){ 69.113 + assert(src_ea); 69.114 + int unalign; 69.115 + unsigned address_align; 69.116 + 69.117 + uint8_t* ea; 69.118 + uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride); 69.119 + 69.120 + ea = src_ea + pic_xoffset + pic_yoffset*linesize; 69.121 + address_align = ((unsigned) ea) & 0xFFFFFFF0; 69.122 + unalign = ((unsigned) ea) & 0xF; 69.123 + get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, idx + TAG_OFFSET_MC, 0); 69.124 + return (ref_ptr + unalign); 69.125 +} 69.126 + 69.127 +static uint8_t* get_mc_data_blocking(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){ 69.128 + assert(src_ea); 69.129 + int unalign; 69.130 + unsigned address_align; 69.131 + 69.132 + uint8_t* ea; 69.133 + uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride); 69.134 + 69.135 + ea = src_ea + pic_xoffset + pic_yoffset*linesize; 69.136 + address_align = ((unsigned) ea) & 0xFFFFFFF0; 69.137 + unalign = ((unsigned) ea) & 0xF; 69.138 + get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, MBD_mc_buf1, 0); 69.139 + wait_dma_id(MBD_mc_buf1); 69.140 + return (ref_ptr + unalign); 69.141 +} 69.142 + 69.143 +//#undef TAG_OFFSET_MC 69.144 + 69.145 +static void get_mc_components(H264Context_spu *h, H264Mb *mb, H264mc_part* mc_part, Picture_spu *pic, int n, int chroma_height, int list, int src_x_offset, int src_y_offset, int idx){ 69.146 + assert(pic); 69.147 + H264slice *s = h->s; 69.148 + ref_data *ref = &mc_part->ref[list]; 69.149 + const int mx= mb->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; 69.150 + const int my= mb->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; 69.151 + 69.152 + const int pic_width = 16*s->mb_width; 69.153 + const int pic_height = 
16*s->mb_height; 69.154 + 69.155 + int blk_h= chroma_height*2+5; 69.156 + //int blk_w= 8*2+5; 69.157 + 69.158 + int blk_h_c= chroma_height+1; 69.159 + //int blk_w_c= 9; 69.160 + 69.161 + int ymx= mx>>2; 69.162 + int ymy= my>>2; 69.163 + int cmy= my>>3; 69.164 + int cmx= mx>>3; 69.165 + 69.166 + //truncate the motion vectors references 69.167 + if(ymy>= pic_height+2){ 69.168 + ymy=pic_height+1; 69.169 + }else if(ymy <=-19){ 69.170 + ymy=-18; 69.171 + } 69.172 + if(ymx>= pic_width+2){ 69.173 + ymx= pic_width+1; 69.174 + }else if(ymx<=-19){ 69.175 + ymx=-19; 69.176 + } 69.177 + 69.178 + if(cmy >= pic_height>>1){ 69.179 + cmy = (pic_height>>1) -1; 69.180 + }else if(cmy<=-9){ 69.181 + cmy=-8; 69.182 + } 69.183 + if(cmx >= pic_width>>1){ 69.184 + cmx = (pic_width>>1) -1; 69.185 + }else if(cmx<=-9){ 69.186 + cmx=-8; 69.187 + } 69.188 + if (!h->blocking){ 69.189 + ref->data[0]=get_mc_data(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx); 69.190 + ref->data[1]=get_mc_data(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); 69.191 + ref->data[2]=get_mc_data(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); 69.192 + } else { 69.193 + ref->data[0]=get_mc_data_blocking(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx); 69.194 + ref->data[1]=get_mc_data_blocking(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); 69.195 + ref->data[2]=get_mc_data_blocking(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx); 69.196 + 69.197 + } 69.198 + 69.199 +} 69.200 + 69.201 +static void get_ref_data(H264Context_spu *h, H264Mb *mb, H264mc_part *mc_part, int idx){ 69.202 + H264slice *s = h->s; 69.203 + int x_offset = mc_part->x_offset; 69.204 + int y_offset = mc_part->y_offset; 69.205 + int list0 = mc_part->list0; 69.206 + int list1 = mc_part->list1; 69.207 + int n = mc_part->n; 69.208 + int chroma_height = mc_part->chroma_height; 69.209 + Picture_spu *refpic; 69.210 + 69.211 + x_offset += 8*mb->mb_x; 69.212 + y_offset 
+= 8*mb->mb_y; 69.213 + 69.214 + if(list0){ 69.215 + refpic= &s->ref_list[0][ mb->ref_cache[0][ scan8[n] ] ]; 69.216 + get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 0, x_offset, y_offset, idx); 69.217 + } 69.218 + if(list1){ 69.219 + refpic= &s->ref_list[1][ mb->ref_cache[1][ scan8[n] ] ]; 69.220 + get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 1, x_offset, y_offset, idx); 69.221 + } 69.222 +} 69.223 + 69.224 +void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc){ 69.225 + int idx = h->mc_idx; 69.226 + int i; 69.227 + 69.228 + get_list = get_list_buf; 69.229 + ref_ptr = mc_ref[idx]; 69.230 + for(i=0; i<mc->npart; i++){ 69.231 + get_ref_data(h, mb, &mc->mc_part[i], idx); 69.232 + } 69.233 +} 69.234 + 69.235 +static void mc_dir_part(H264Context_spu *h, H264mc_part* mc_part, int n, int chroma_height, int list, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, int stride_y, int stride_c){ 69.236 + 69.237 + H264Mb *mb = h->mb; 69.238 + ref_data* ref = &mc_part->ref[list]; 69.239 + const int mx= mb->mv_cache[list][ scan8[n] ][0]; //to determine the interpolation mode 69.240 + const int my= mb->mv_cache[list][ scan8[n] ][1]; 69.241 + const int luma_xy= (mx&3) + ((my&3)<<2); 69.242 + uint8_t *src_y, *src_cb, *src_cr; 69.243 + 69.244 + src_y = ref->data[0] +2+2*STRIDE_Y; 69.245 + src_cb = ref->data[1]; 69.246 + src_cr = ref->data[2]; 69.247 + 69.248 + qpix_op[luma_xy](dest_y, src_y, stride_y, chroma_height*2); 69.249 + chroma_op(dest_cb, src_cb, stride_c, chroma_height, mx&7, my&7); 69.250 + chroma_op(dest_cr, src_cr, stride_c, chroma_height, mx&7, my&7); 69.251 +} 69.252 + 69.253 + 69.254 +static void mc_part_biweighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg){ 69.255 + 69.256 + H264Mb *mb = h->mb; 69.257 + H264slice *s = h->s; 
69.258 + int n = mc_part->n; 69.259 + int chroma_height = mc_part->chroma_height; 69.260 + int itp = mc_part->itp; 69.261 + int refn0 = mb->ref_cache[0][ scan8[n] ]; 69.262 + int refn1 = mb->ref_cache[1][ scan8[n] ]; 69.263 + qpel_mc_func *qpix_put= h->dsp.put_h264_qpel_pixels_tab[itp]; 69.264 + h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp]; 69.265 + 69.266 + // don't optimize for luma-only case, since B-frames usually 69.267 + // use implicit weights => chroma too. 69.268 + mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c); 69.269 + 69.270 + mc_dir_part(h, mc_part, n, chroma_height, 1, tmp_y_ls, tmp_cb_ls, tmp_cr_ls, qpix_put, chroma_put, STRIDE_Y, STRIDE_C); 69.271 + 69.272 + if(s->use_weight == 2){ 69.273 + int weight0 = s->implicit_weight[refn0][refn1][mb->mb_y&1]; 69.274 + int weight1 = 64 - weight0; 69.275 + luma_weight_avg( dest_y, tmp_y_ls, stride_y, STRIDE_Y, 5, weight0, weight1, 0); 69.276 + chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0); 69.277 + chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0); 69.278 + }else{ 69.279 + luma_weight_avg(dest_y, tmp_y_ls, stride_y, STRIDE_Y, s->luma_log2_weight_denom, s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0], s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]); 69.280 + 69.281 + chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0], s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]); 69.282 + 69.283 + chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0], s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]); 69.284 + } 69.285 +} 69.286 + 69.287 +static void mc_part_weighted(H264Context_spu *h, H264mc_part *mc_part, 
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, int list1){ 69.288 + 69.289 + H264Mb *mb = h->mb; 69.290 + H264slice *s = h->s; 69.291 + 69.292 + int n = mc_part->n; 69.293 + int chroma_height = mc_part->chroma_height; 69.294 + int itp = mc_part->itp; 69.295 + qpel_mc_func *qpix_put= h->dsp.put_h264_qpel_pixels_tab[itp]; 69.296 + h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp]; 69.297 + 69.298 + int list = list1 ? 1 : 0; 69.299 + int refn = mb->ref_cache[list][ scan8[n] ]; 69.300 + 69.301 + mc_dir_part(h, mc_part, n, chroma_height, list, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c); 69.302 + 69.303 + luma_weight_op(dest_y, stride_y, s->luma_log2_weight_denom, s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]); 69.304 + if(s->use_weight_chroma){ 69.305 + chroma_weight_op(dest_cb, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]); 69.306 + 69.307 + chroma_weight_op(dest_cr, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]); 69.308 + } 69.309 +} 69.310 + 69.311 + 69.312 +static void mc_part_std(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, int list0, int list1){ 69.313 + int n = mc_part->n; 69.314 + int chroma_height = mc_part->chroma_height; 69.315 + int itp = mc_part->itp; 69.316 + 69.317 + qpel_mc_func *qpix_op= h->dsp.put_h264_qpel_pixels_tab[itp]; 69.318 + h264_chroma_mc_func chroma_op= h->dsp.put_h264_chroma_pixels_tab[itp]; 69.319 + 69.320 + if(list0){ 69.321 + mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c); 69.322 + 69.323 + qpix_op= h->dsp.avg_h264_qpel_pixels_tab[itp]; 69.324 + chroma_op= h->dsp.avg_h264_chroma_pixels_tab[itp]; 69.325 + } 69.326 
+ 69.327 + if(list1){ 69.328 + mc_dir_part(h, mc_part, n, chroma_height, 1, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c); 69.329 + } 69.330 +} 69.331 + 69.332 +static void mc_part(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ 69.333 + H264slice *s = h->s; 69.334 + 69.335 + int weight = mc_part->weight; 69.336 + 69.337 + int x_offset = mc_part->x_offset; 69.338 + int y_offset = mc_part->y_offset; 69.339 + int list0 = mc_part->list0; 69.340 + int list1 = mc_part->list1; 69.341 + 69.342 + dest_y += 2*x_offset + 2*y_offset*stride_y; 69.343 + dest_cb += x_offset + y_offset*stride_c; 69.344 + dest_cr += x_offset + y_offset*stride_c; 69.345 + 69.346 + if(list0 && list1 && s->use_weight !=0){ 69.347 + h264_biweight_func *weight_avg = &h->dsp.biweight_h264_pixels_tab[weight]; 69.348 + mc_part_biweighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_avg[0], weight_avg[3]); 69.349 + } 69.350 + else if ((list0 || list1) && s->use_weight ==1){ 69.351 + h264_weight_func *weight_op = &h->dsp.weight_h264_pixels_tab[weight]; 69.352 + mc_part_weighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_op[0], weight_op[3], list1); 69.353 + } 69.354 + else{ 69.355 + mc_part_std(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, list0, list1); 69.356 + } 69.357 +} 69.358 + 69.359 +void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){ 69.360 + int i; 69.361 + H264mc *mc =h->mc; 69.362 + for(i=0; i<mc->npart; i++){ 69.363 + mc_part(h, &mc->mc_part[i], dest_y, dest_cb, dest_cr, stride_y, stride_c); 69.364 + } 69.365 +}
/*
 * libavcodec/cell/h264_mc_spu.h
 * Macroblock-type tests and partition descriptors for SPU motion compensation.
 */
#ifndef H264_MC_SPU_H
#define H264_MC_SPU_H

#include <stdint.h>   /* uint8_t: keeps this header usable on its own */

//#include "types_spu.h"

// motion compensation constants:
#define MB_TYPE_16x16      0x0008
#define MB_TYPE_16x8       0x0010
#define MB_TYPE_8x16       0x0020
#define MB_TYPE_8x8        0x0040
#define MB_TYPE_P0L0       0x1000
#define IS_16X16(a)      ((a)&MB_TYPE_16x16)
#define IS_16X8(a)       ((a)&MB_TYPE_16x8)
#define IS_8X16(a)       ((a)&MB_TYPE_8x16)
#define IS_8X8(a)        ((a)&MB_TYPE_8x8)
#define IS_SUB_8X8(a)    ((a)&MB_TYPE_16x16) //note reused
#define IS_SUB_8X4(a)    ((a)&MB_TYPE_16x8)  //note reused
#define IS_SUB_4X8(a)    ((a)&MB_TYPE_8x16)  //note reused
#define IS_SUB_4X4(a)    ((a)&MB_TYPE_8x8)   //note reused
/* non-zero if partition `part` of type `a` predicts from reference list `list` */
#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list))))

/* guarded so that another header providing the same helpers can be included too */
#ifndef FFMAX
#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
#endif
#ifndef FFMIN
#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
#endif

//Motion compensation buffer strides
#define STRIDE_Y 48
#define STRIDE_C 32

/* pointers to the fetched reference block of each plane (index 0..2) */
typedef struct ref_data{
    uint8_t *data[3];
}ref_data;

/* one motion compensation partition of a macroblock */
typedef struct H264mc_part{
    int n;
    int chroma_height;
    int x_offset;
    int y_offset;
    int itp;
    int weight;
    int list0;
    int list1;
    int use_weight;
    ref_data ref[2];    /* one entry per reference list */

}H264mc_part;

/* all partitions of one macroblock; npart is the number of valid entries */
typedef struct H264mc{
    H264mc_part mc_part[16];
    int npart;
}H264mc;


#endif
71.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 71.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_pred_spu.h Mon Aug 27 12:09:56 2012 +0200 71.3 @@ -0,0 +1,90 @@ 71.4 +/* 71.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 71.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 71.7 + * 71.8 + * This file is part of FFmpeg. 71.9 + * 71.10 + * FFmpeg is free software; you can redistribute it and/or 71.11 + * modify it under the terms of the GNU Lesser General Public 71.12 + * License as published by the Free Software Foundation; either 71.13 + * version 2.1 of the License, or (at your option) any later version. 71.14 + * 71.15 + * FFmpeg is distributed in the hope that it will be useful, 71.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 71.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 71.18 + * Lesser General Public License for more details. 71.19 + * 71.20 + * You should have received a copy of the GNU Lesser General Public 71.21 + * License along with FFmpeg; if not, write to the Free Software 71.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 71.23 + */ 71.24 + 71.25 +/** 71.26 + * @file 71.27 + * H.264 / AVC / MPEG4 prediction functions. 
71.28 + * @author Michael Niedermayer <michaelni@gmx.at> 71.29 + */ 71.30 + 71.31 +#ifndef AVCODEC_H264PRED_H 71.32 +#define AVCODEC_H264PRED_H 71.33 + 71.34 +//#include "libavutil/common.h" 71.35 +//#include "dsputil.h" 71.36 + 71.37 +/** 71.38 + * Prediction types 71.39 + */ 71.40 +//@{ 71.41 +#define VERT_PRED 0 71.42 +#define HOR_PRED 1 71.43 +#define DC_PRED 2 71.44 +#define DIAG_DOWN_LEFT_PRED 3 71.45 +#define DIAG_DOWN_RIGHT_PRED 4 71.46 +#define VERT_RIGHT_PRED 5 71.47 +#define HOR_DOWN_PRED 6 71.48 +#define VERT_LEFT_PRED 7 71.49 +#define HOR_UP_PRED 8 71.50 + 71.51 +#define LEFT_DC_PRED 9 71.52 +#define TOP_DC_PRED 10 71.53 +#define DC_128_PRED 11 71.54 + 71.55 +#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN 12 71.56 +#define HOR_UP_PRED_RV40_NODOWN 13 71.57 +#define VERT_LEFT_PRED_RV40_NODOWN 14 71.58 + 71.59 +#define DC_PRED8x8 0 71.60 +#define HOR_PRED8x8 1 71.61 +#define VERT_PRED8x8 2 71.62 +#define PLANE_PRED8x8 3 71.63 + 71.64 +#define LEFT_DC_PRED8x8 4 71.65 +#define TOP_DC_PRED8x8 5 71.66 +#define DC_128_PRED8x8 6 71.67 + 71.68 +#define ALZHEIMER_DC_L0T_PRED8x8 7 71.69 +#define ALZHEIMER_DC_0LT_PRED8x8 8 71.70 +#define ALZHEIMER_DC_L00_PRED8x8 9 71.71 +#define ALZHEIMER_DC_0L0_PRED8x8 10 71.72 +//@} 71.73 + 71.74 +/** 71.75 + * Context for storing H.264 prediction functions 71.76 + */ 71.77 +typedef struct H264PredContext{ 71.78 + void (*pred4x4 [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp? 
71.79 + void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride); 71.80 + void (*pred8x8 [4+3+4])(uint8_t *src, int stride); 71.81 + void (*pred16x16[4+3])(uint8_t *src, int stride); 71.82 + 71.83 + void (*pred4x4_add [2])(uint8_t *pix/*align 4*/, const DCTELEM *block/*align 16*/, int stride); 71.84 + void (*pred8x8l_add [2])(uint8_t *pix/*align 8*/, const DCTELEM *block/*align 16*/, int stride); 71.85 + void (*pred8x8_add [3])(uint8_t *pix/*align 8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); 71.86 + void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); 71.87 +}H264PredContext; 71.88 + 71.89 +void ff_h264_pred_init(H264PredContext *h); 71.90 +void ff_h264_pred_init_arm(H264PredContext *h); 71.91 + 71.92 + 71.93 +#endif /* AVCODEC_H264PRED_H */
72.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 72.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.c Mon Aug 27 12:09:56 2012 +0200 72.3 @@ -0,0 +1,26 @@ 72.4 +#include <stdint.h> 72.5 +#include "h264_tables.h" 72.6 + 72.7 +uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP] = {0, }; 72.8 + 72.9 +int block_offset[16+4+4]; 72.10 + 72.11 +void ff_cropTbl_init(){ 72.12 + int i; 72.13 + for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; 72.14 + for(i=0;i<MAX_NEG_CROP;i++) { 72.15 + ff_cropTbl[i] = 0; 72.16 + ff_cropTbl[i + MAX_NEG_CROP + 256] = 255; 72.17 + } 72.18 +} 72.19 + 72.20 +void init_block_offset(int linesize, int uvlinesize){ 72.21 + int i; 72.22 + for(i=0; i<16; i++){ 72.23 + block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*linesize*((scan8[i] - scan8[0])>>3); 72.24 + } 72.25 + for(i=0; i<4; i++){ 72.26 + block_offset[16+i]= 72.27 + block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*uvlinesize*((scan8[i] - scan8[0])>>3); 72.28 + } 72.29 +} 72.30 \ No newline at end of file
/*
 * libavcodec/cell/h264_tables.h
 * Constant lookup tables shared by the H.264 SPU decoder.
 */
#ifndef H264_TABLES_H
#define H264_TABLES_H

#include <stdint.h>   /* uint8_t: keeps this header usable on its own */

#define MAX_NEG_CROP 1024

extern uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP];
extern int block_offset[16+4+4];

/* block index -> position in an 8-entry-wide cache grid (column + row*8) */
static const uint8_t scan8[16 + 2*4]={
 4+1*8, 5+1*8, 4+2*8, 5+2*8,
 6+1*8, 7+1*8, 6+2*8, 7+2*8,
 4+3*8, 5+3*8, 4+4*8, 5+4*8,
 6+3*8, 7+3*8, 6+4*8, 7+4*8,
 1+1*8, 2+1*8,
 1+2*8, 2+2*8,
 1+4*8, 2+4*8,
 1+5*8, 2+5*8,
};

static const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

static const uint8_t zigzag_scan[16]={
 0+0*4, 1+0*4, 0+1*4, 0+2*4,
 1+1*4, 2+0*4, 3+0*4, 2+1*4,
 1+2*4, 0+3*4, 1+3*4, 2+2*4,
 3+1*4, 3+2*4, 2+3*4, 3+3*4,
};

static const uint8_t luma_dc_zigzag_scan[16]={
 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
};

static const uint8_t chroma_dc_scan[4]={
 (0+0*2)*16, (1+0*2)*16,
 (0+1*2)*16, (1+1*2)*16,  //FIXME
};

/* rem6[q] == q % 6 for q in 0..51 */
static const uint8_t rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};

/* div6[q] == q / 6 for q in 0..51 */
static const uint8_t div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};

static const uint8_t dequant4_coeff_init[6][3]={
  {10,13,16},
  {11,14,18},
  {13,16,20},
  {14,18,23},
  {16,20,25},
  {18,23,29},
};

static const uint8_t dequant8_coeff_init_scan[16] = {
  0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
};
static const uint8_t dequant8_coeff_init[6][6]={
  {20,18,32,19,25,24},
  {22,19,35,21,28,26},
  {26,23,42,24,33,31},
  {28,25,45,26,35,33},
  {32,28,51,30,40,38},
  {36,32,58,34,46,43},
};


void init_block_offset(int linesize, int uvlinesize);
void ff_cropTbl_init(void);

#endif
74.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 74.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_types_spu.h Mon Aug 27 12:09:56 2012 +0200 74.3 @@ -0,0 +1,203 @@ 74.4 +#ifndef H264_CELL_TYPES_H 74.5 +#define H264_CELL_TYPES_H 74.6 + 74.7 +#include <libsync.h> 74.8 +#include <libavcodec/avcodec.h> 74.9 + 74.10 +typedef struct spe_pos{ 74.11 + volatile int count; //number of mb processed 74.12 + uint32_t pad[3]; 74.13 +}spe_pos; 74.14 + 74.15 +//only the picture pointers are needed from the picture struct; 74.16 +typedef struct Picture_spu { 74.17 + uint8_t* data[3]; 74.18 +} Picture_spu; 74.19 + 74.20 +///For Cell, might be idea to use this instead for everything 74.21 +// struct that contains the pararms that change on slice 74.22 +typedef struct H264slice{ 74.23 + int deblocking_filter; 74.24 + int linesize; 74.25 + int uvlinesize; 74.26 + int mb_width; 74.27 + int mb_height; 74.28 + 74.29 + int use_weight; 74.30 + int use_weight_chroma; 74.31 + int luma_log2_weight_denom; 74.32 + int chroma_log2_weight_denom; 74.33 + 74.34 + int16_t luma_weight[16][2][2]; 74.35 + int16_t chroma_weight[16][2][2][2]; 74.36 + int16_t implicit_weight[16][16][2]; 74.37 + 74.38 + // ref picture ptr 74.39 + Picture_spu ref_list[2][16]; 74.40 + int state; 74.41 + int emu_edge_width; 74.42 + int emu_edge_height; 74.43 + 74.44 + int slice_type; 74.45 + int slice_type_nos; 74.46 + int slice_alpha_c0_offset; 74.47 + int slice_beta_offset; 74.48 + 74.49 + uint8_t chroma_qp_table[2][64]; 74.50 + 74.51 + H264Mb *blocks; 74.52 + uint8_t *dst_y, *dst_cb, *dst_cr; 74.53 + 74.54 + //uint32_t pad[2]; // padding the structure for multiple of 16 bytes 74.55 +}H264slice; 74.56 + 74.57 +typedef struct H264spe{ 74.58 +#define EDIP 0 74.59 +#define EDB 1 74.60 +#define MBD 2 74.61 + int type; 74.62 + int idx; 74.63 + int spe_id; 74.64 + int spe_total; 74.65 + int mb_width; 74.66 + int mb_stride; 74.67 + int mb_height; 74.68 + int linesize; 74.69 + int uvlinesize; 74.70 + //H264slice* slice_params; 
74.71 + void* src_spe; 74.72 + void* tgt_spe; 74.73 + 74.74 + mutex_ea_t lock; 74.75 + cond_ea_t cond; 74.76 + atomic_ea_t cnt; 74.77 + 74.78 + mutex_ea_t rl_lock; 74.79 + cond_ea_t rl_cond; 74.80 + atomic_ea_t rl_cnt; 74.81 +}H264spe; 74.82 + 74.83 +typedef struct H264Cabac_spu{ 74.84 + int blocking; 74.85 + 74.86 + int top_cbp; 74.87 + int left_cbp; 74.88 + int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct 74.89 + 74.90 + uint32_t dequant4_buffer[6][52][16]; 74.91 + uint32_t dequant8_buffer[2][52][64]; 74.92 + uint32_t (*dequant4_coeff[6])[16]; 74.93 + uint32_t (*dequant8_coeff[2])[64]; 74.94 + 74.95 + uint8_t (*non_zero_count_top)[32]; 74.96 + uint8_t (*non_zero_count)[32]; 74.97 + 74.98 + uint8_t (*mvd_top[2])[2]; 74.99 + uint8_t (*mvd[2])[2]; 74.100 + 74.101 + uint8_t *direct_top; 74.102 + uint8_t *direct; 74.103 + 74.104 + uint8_t *chroma_pred_mode_top; 74.105 + uint8_t *chroma_pred_mode; 74.106 + 74.107 + int8_t *intra4x4_pred_mode_top; 74.108 + int8_t *intra4x4_pred_mode; 74.109 + 74.110 + uint16_t *cbp_top; 74.111 + uint16_t *cbp; 74.112 + 74.113 + int8_t *qscale_top; 74.114 + int8_t *qscale; 74.115 + 74.116 + int8_t *ref_index_top[2]; 74.117 + int8_t *ref_index[2]; 74.118 + 74.119 + int16_t (*motion_val_top[2])[2]; 74.120 + int16_t (*motion_val[2])[2]; 74.121 + uint32_t *mb_type_top; 74.122 + uint32_t *mb_type; 74.123 + 74.124 + int8_t *list1_ref_index[2]; 74.125 + uint32_t *list1_mb_type; 74.126 + DECLARE_ALIGNED_16(int16_t, list1_motion_val[2][4*4][2]); // fill for a macroblock when required 74.127 + 74.128 + int b_stride; 74.129 + int mb_stride; 74.130 + int mb_width; 74.131 + int mb_height; 74.132 + 74.133 + uint8_t zigzag_scan[16]; 74.134 + uint8_t zigzag_scan8x8[64]; 74.135 + 74.136 + uint8_t direct_cache[5*8]; 74.137 + // Used to calculate loopfilter bS. 
74.138 + DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; 74.139 + DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; 74.140 + DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; 74.141 + DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; 74.142 + 74.143 +} H264Cabac_spu; 74.144 + 74.145 +typedef struct EDSlice_spu{ 74.146 + PPS pps; ///< current pps 74.147 + 74.148 + H264Mb *mbs; 74.149 + 74.150 + int state; 74.151 + int qp_thresh; ///< QP threshold to skip loopfilter 74.152 + 74.153 + PictureInfo pic; 74.154 + PictureInfo list1; 74.155 +// Picture *ref_list[2][16]; ///Reordered version of default_ref_list according to picture reordering in slice header 74.156 + int ref_count[2]; ///< counts frames or fields, depending on current mb mode 74.157 + int slice_type; 74.158 + int slice_type_nos; 74.159 + int direct_8x8_inference_flag; 74.160 + 74.161 + uint8_t list_count; 74.162 + uint32_t coded_pic_num; 74.163 +///stuff only needed for nal/entropy decoding 74.164 + H264Mb *m; 74.165 + //GetBitContext gb; 74.166 + const uint8_t *bytestream_start; 74.167 + int byte_bufsize; 74.168 + int transform_bypass; 74.169 + int direct_spatial_mv_pred; 74.170 + int map_col_to_list0[2][16]; 74.171 + int dist_scale_factor[16]; 74.172 + 74.173 + int cabac_init_idc; 74.174 + int ref2frm[2][64]; ///< reference to frame number lists, the first 2 are for -2,-1 74.175 + int qscale; 74.176 + int chroma_qp[2]; //QPc 74.177 + int last_qscale_diff; 74.178 + 74.179 +// Picture* release_ref[MAX_MMCO_COUNT]; 74.180 +// int release_cnt; 74.181 + 74.182 + 74.183 +// int use_weight; 74.184 +// int use_weight_chroma; 74.185 +// int luma_log2_weight_denom; 74.186 +// int chroma_log2_weight_denom; 74.187 + 74.188 +// int8_t luma_weight[16][2][2]; 74.189 +// int8_t chroma_weight[16][2][2][2]; 74.190 +// int8_t implicit_weight[16][16][2]; 74.191 + 74.192 + 74.193 + 74.194 +// int slice_alpha_c0_offset; 74.195 +// int slice_beta_offset; 74.196 + 74.197 +// int nal_ref_idc; 74.198 +// int nal_unit_type; 
74.199 +// uint8_t *rbsp_buffer; 74.200 +// unsigned int rbsp_buffer_size; 74.201 + 74.202 + 74.203 + 74.204 +} EDSlice_spu; 74.205 + 74.206 +#endif
/* ffmpeg_smp/h264dec/libavcodec/cell/mathops_spu.h */
/*
 * simple math operations
 * Copyright (c) 2001, 2002 Fabrice Bellard
 * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef AVCODEC_MATHOPS_H
#define AVCODEC_MATHOPS_H

/*
 * SPU variant: only mid_pred() is provided.  The other generic helpers that
 * were present (commented out) in this file are intentionally not available
 * here: MULL, MULH, UMULH, MUL64, MAC64, MLS64, MAC16, MUL16, MLS16,
 * sign_extend, zero_extend, COPY3_IF_LT, NEG_SSR32, NEG_USR32.
 */

/*
 * This header does not include the libavutil headers that normally define
 * av_const, so provide a guarded fallback to keep it usable on its own.
 */
#ifndef av_const
#  if defined(__GNUC__)
#    define av_const __attribute__((const))
#  else
#    define av_const
#  endif
#endif

/* median of 3 */
#ifndef mid_pred
#define mid_pred mid_pred
/**
 * Return the median of three integers.
 * @param a first value
 * @param b second value
 * @param c third value
 * @return the value that is neither the strict minimum nor the strict
 *         maximum of (a, b, c)
 */
static inline av_const int mid_pred(int a, int b, int c)
{
    if (a > b) {
        /* b is below a: the median is b unless c is above b, in which case it is min(a, c) */
        if (c > b) {
            if (c > a) {
                b = a;
            } else {
                b = c;
            }
        }
    } else {
        /* a <= b: the median is b unless c is below b, in which case it is max(a, c) */
        if (b > c) {
            if (c > a) {
                b = c;
            } else {
                b = a;
            }
        }
    }
    return b;
}
#endif

#endif /* AVCODEC_MATHOPS_H */
/* ffmpeg_smp/h264dec/libavcodec/cell/rectangle_spu.h */
/*
 * rectangle filling function
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * useful rectangle filling function
 * @author Michael Niedermayer <michaelni@gmx.at>
 */

#ifndef AVCODEC_RECTANGLE_H
#define AVCODEC_RECTANGLE_H

#include <assert.h>
#include <stdint.h>

#define STRIDE_ALIGN 16

/*
 * Guarded fallback so this header can be used without the libavutil
 * attribute definitions having been included first.
 */
#ifndef av_always_inline
#  if defined(__GNUC__)
#    define av_always_inline __attribute__((always_inline)) inline
#  else
#    define av_always_inline inline
#  endif
#endif

/**
 * fill a rectangle.
 *
 * Writes `val` into a w x h block of elements of `size` bytes each, with
 * rows `stride` elements apart.  Only row widths of 2, 4, 8 or 16 bytes
 * (w * size) and heights of 1, 2 or 4 are handled; the 16-byte-wide case
 * supports heights 2 and 4 only.  Anything else trips an assert.
 *
 * @param vp     destination; must be aligned to min(w * size, STRIDE_ALIGN) bytes
 * @param w      width of the rectangle in elements (at most 4), should be a constant
 * @param h      height of the rectangle, should be a constant
 * @param stride distance between rows, in elements
 * @param val    element value; for size 1 or 2 it is replicated across the row
 * @param size   the size of val (1, 2 or 4), should be a constant
 */
static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size)
{
    uint8_t *p = (uint8_t *)vp;

    assert(size == 1 || size == 2 || size == 4);
    assert(w <= 4);

    /* From here on, w and stride are measured in bytes. */
    w      *= size;
    stride *= size;

    assert((((uintptr_t)vp) & ((w > STRIDE_ALIGN ? STRIDE_ALIGN : w) - 1)) == 0);
    assert((stride & (w - 1)) == 0);

    if (w == 2) {
        /*
         * Two bytes per row: either two 1-byte elements (replicate the byte)
         * or a single 2-byte element (store val unchanged).  The previous
         * expression replicated val in the 2-byte case as well, which
         * corrupted any value whose two bytes differ.
         */
        const uint16_t v = size == 1 ? val * 0x0101 : val;
        *(uint16_t *)(p + 0 * stride) = v;
        if (h == 1) return;
        *(uint16_t *)(p + 1 * stride) = v;
        if (h == 2) return;
        *(uint16_t *)(p + 2 * stride) = v;
        *(uint16_t *)(p + 3 * stride) = v;
    } else if (w == 4) {
        const uint32_t v = size == 4 ? val
                         : size == 2 ? val * 0x00010001
                                     : val * 0x01010101;
        *(uint32_t *)(p + 0 * stride) = v;
        if (h == 1) return;
        *(uint32_t *)(p + 1 * stride) = v;
        if (h == 2) return;
        *(uint32_t *)(p + 2 * stride) = v;
        *(uint32_t *)(p + 3 * stride) = v;
    } else if (w == 8) {
        /* size is 2 or 4 here, because w <= 4 elements was asserted above. */
        const uint64_t v = size == 2 ? val * 0x0001000100010001ULL
                                     : val * 0x0100000001ULL;
        *(uint64_t *)(p + 0 * stride) = v;
        if (h == 1) return;
        *(uint64_t *)(p + 1 * stride) = v;
        if (h == 2) return;
        *(uint64_t *)(p + 2 * stride) = v;
        *(uint64_t *)(p + 3 * stride) = v;
    } else if (w == 16) {
        /* Four 4-byte elements per row, written as two 64-bit stores. */
        const uint64_t v = val * 0x0100000001ULL;
        *(uint64_t *)(p + 0 + 0 * stride) = v;
        *(uint64_t *)(p + 8 + 0 * stride) = v;
        *(uint64_t *)(p + 0 + 1 * stride) = v;
        *(uint64_t *)(p + 8 + 1 * stride) = v;
        if (h == 2) return;
        *(uint64_t *)(p + 0 + 2 * stride) = v;
        *(uint64_t *)(p + 8 + 2 * stride) = v;
        *(uint64_t *)(p + 0 + 3 * stride) = v;
        *(uint64_t *)(p + 8 + 3 * stride) = v;
    } else {
        assert(0);
    }
    /* Falling through to here means all four rows were written. */
    assert(h == 4);
}

#endif /* AVCODEC_RECTANGLE_H */
77.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 77.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/spe_ed.c Mon Aug 27 12:09:56 2012 +0200 77.3 @@ -0,0 +1,508 @@ 77.4 +#define CELL_SPE 77.5 + 77.6 +#include <string.h> 77.7 +#include <stdio.h> 77.8 +#include <spu_intrinsics.h> 77.9 +#include <spu_mfcio.h> 77.10 +#include "libavcodec/avcodec.h" 77.11 +#include "h264_cabac_spu.h" 77.12 +#include "cabac_spu.h" 77.13 +#include "h264_types_spu.h" 77.14 +#include "h264_tables.h" 77.15 +#include "h264_dma.h" 77.16 +#include "h264_tables.h" 77.17 + 77.18 +#define MB_WIDTH 240 77.19 +#define MB_STRIDE (MB_WIDTH+16) 77.20 + 77.21 +H264Cabac_spu hcabac; 77.22 +CABACContext cabac; 77.23 +DECLARE_ALIGNED_16(EDSlice_spu, slice[2]); 77.24 +DECLARE_ALIGNED_16(H264Mb, mb[2]); 77.25 +DECLARE_ALIGNED_16(H264spe, spe); 77.26 + 77.27 +DECLARE_ALIGNED_16(uint8_t, non_zero_count_table[2][MB_STRIDE][32]); 77.28 +DECLARE_ALIGNED_16(uint8_t, mvd_table[2][2][8*MB_STRIDE][2]); 77.29 +DECLARE_ALIGNED_16(uint8_t, direct_table[2][4*MB_STRIDE]); 77.30 +DECLARE_ALIGNED_16(uint8_t, chroma_pred_mode_table[2][MB_STRIDE]); 77.31 +DECLARE_ALIGNED_16(uint8_t, intra4x4_pred_mode_table[2][8*MB_STRIDE]); 77.32 +DECLARE_ALIGNED_16(uint16_t,cbp_table[2][MB_STRIDE]); 77.33 +DECLARE_ALIGNED_16(uint8_t, qscale_table[2][MB_STRIDE]); 77.34 + 77.35 +DECLARE_ALIGNED_16(uint32_t, mb_type_table[2][MB_STRIDE]); 77.36 +DECLARE_ALIGNED_16(int8_t, ref_index_table[2][2][4*MB_STRIDE]); 77.37 +DECLARE_ALIGNED_16(int16_t, motion_val_table[2][2][4*4*MB_WIDTH][2]); 77.38 + 77.39 +DECLARE_ALIGNED(128, uint8_t, bytestream_ls[4096]); 77.40 +DECLARE_ALIGNED_16(uint32_t, list1_mb_type_table[2][MB_STRIDE]); 77.41 +DECLARE_ALIGNED_16(int8_t, list1_ref_index_table[2][2][4*MB_STRIDE]); 77.42 + 77.43 +DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending 77.44 +//mb position of neighbouring spes 77.45 +DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1 77.46 +static int total_lines; 77.47 + 77.48 +static inline 
int dep_resolved(H264spe *p){ 77.49 + int spe_id = p->spe_id; 77.50 + volatile int lines_proc = src_spe.count; 77.51 + if (spe_id==0) 77.52 + return (total_lines < lines_proc-1 +p->mb_height)? 1:0; 77.53 + else 77.54 + return (total_lines < lines_proc-1)? 1:0; 77.55 +} 77.56 + 77.57 +static void update_tgt_spe_dep(H264spe *p, int end){ 77.58 + // if (end ){ 77.59 + total_lines++; 77.60 + spe_pos* dma_spe = &dma_temp; 77.61 + spe_pos* tgt_spe = p->tgt_spe + (unsigned) &src_spe; //located in target spe local store 77.62 + dma_spe->count = end? total_lines+1: total_lines; 77.63 + spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), ED_put); 77.64 + // } 77.65 + 77.66 +} 77.67 + 77.68 +static int init_cabac(H264spe *p, H264Cabac_spu *hc){ 77.69 + hc->mb_height = p->mb_height; 77.70 + hc->mb_width = p->mb_width; 77.71 + hc->b_stride = 4*p->mb_width; 77.72 + hc->mb_stride = p->mb_stride; 77.73 + 77.74 + for(int i=0; i<16; i++){ 77.75 + #define T(x) (x>>2) | ((x<<2) & 0xF) 77.76 + hc->zigzag_scan[i] = T(zigzag_scan[i]); 77.77 + #undef T 77.78 + } 77.79 + for(int i=0; i<64; i++){ 77.80 + #define T(x) (x>>3) | ((x&7)<<3) 77.81 + hc->zigzag_scan8x8[i] = T(ff_zigzag_direct[i]); 77.82 + #undef T 77.83 + } 77.84 +} 77.85 + 77.86 +static void reset_cabac_buffers(){ 77.87 + memset(intra4x4_pred_mode_table, 0, sizeof(intra4x4_pred_mode_table)); 77.88 + memset(mvd_table, 0, sizeof(mvd_table)); 77.89 + memset(direct_table, 0, sizeof(direct_table)); 77.90 + memset(chroma_pred_mode_table, 0, sizeof(chroma_pred_mode_table)); 77.91 + memset(cbp_table, 0, sizeof(cbp_table)); 77.92 + memset(qscale_table, 0, sizeof(qscale_table)); 77.93 + memset(mb_type_table, 0, sizeof(mb_type_table)); 77.94 + memset(ref_index_table, 0, sizeof(ref_index_table)); 77.95 + memset(motion_val_table, 0, sizeof(motion_val_table)); 77.96 +} 77.97 + 77.98 +static void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int bufsize){ 77.99 + int align = (unsigned) buf & 0xF; 77.100 + int 
dma_size; 77.101 + 77.102 + c->bytestream_ea_start= 77.103 + c->bytestream_ea= buf; 77.104 + c->bytestream_ea_end= buf + bufsize; 77.105 + c->bufsize = bufsize; 77.106 + 77.107 + if (bufsize + align >= sizeof(bytestream_ls)){ 77.108 + dma_size = sizeof(bytestream_ls); 77.109 + c->bufsize = c->bufsize +align - sizeof(bytestream_ls); 77.110 + }else{ 77.111 + int align_end = (bufsize+align) &0xF; 77.112 + if (align_end) 77.113 + dma_size = bufsize+align + 16-align_end; 77.114 + else 77.115 + dma_size = bufsize+align; 77.116 + c->bufsize = 0; 77.117 + } 77.118 +// printf("%d\n", dma_size); 77.119 + c->bytestream_end = &bytestream_ls[dma_size]; 77.120 + c->bytestream_start= c->bytestream = &bytestream_ls[align]; 77.121 + spu_dma_get(bytestream_ls, (unsigned) buf - align, dma_size, ED_get ); 77.122 + c->bytestream_ea_start= 77.123 + c->bytestream_ea= buf + dma_size -align; 77.124 + 77.125 + wait_dma_id(ED_get); 77.126 + 77.127 + if (align %2){ 77.128 + c->low = (*c->bytestream++)<<18; 77.129 + c->low+= (*c->bytestream++)<<10; 77.130 + c->low+= ((*c->bytestream++)<<2) + 2; 77.131 + }else { 77.132 + c->low = (*c->bytestream++)<<18; 77.133 + c->low+= (*c->bytestream++)<<10; 77.134 + c->low+= (2<<8); 77.135 + } 77.136 + 77.137 + c->range= 0x1FE; 77.138 + bytecount=0; 77.139 +} 77.140 + 77.141 +static void init_dequant8_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){ 77.142 + int i,q,x; 77.143 + const int transpose = HAVE_ALTIVEC; 77.144 + hc->dequant8_coeff[0] = hc->dequant8_buffer[0]; 77.145 + hc->dequant8_coeff[1] = hc->dequant8_buffer[1]; 77.146 + 77.147 + for(i=0; i<2; i++){ 77.148 + if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){ 77.149 + hc->dequant8_coeff[1] = hc->dequant8_buffer[0]; 77.150 + break; 77.151 + } 77.152 + 77.153 + for(q=0; q<52; q++){ 77.154 + int shift = div6[q]; 77.155 + int idx = rem6[q]; 77.156 + for(x=0; x<64; x++) 77.157 + hc->dequant8_coeff[i][q][transpose ? 
(x>>3)|((x&7)<<3) : x] = 77.158 + ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * 77.159 + s->pps.scaling_matrix8[i][x]) << shift; 77.160 + } 77.161 + } 77.162 +} 77.163 + 77.164 +static void init_dequant4_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){ 77.165 + int i,j,q,x; 77.166 + const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; 77.167 + for(i=0; i<6; i++ ){ 77.168 + hc->dequant4_coeff[i] = hc->dequant4_buffer[i]; 77.169 + for(j=0; j<i; j++){ 77.170 + if(!memcmp(s->pps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){ 77.171 + hc->dequant4_coeff[i] = hc->dequant4_buffer[j]; 77.172 + break; 77.173 + } 77.174 + } 77.175 + if(j<i) 77.176 + continue; 77.177 + 77.178 + for(q=0; q<52; q++){ 77.179 + int shift = div6[q] + 2; 77.180 + int idx = rem6[q]; 77.181 + for(x=0; x<16; x++) 77.182 + hc->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] = 77.183 + ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] * 77.184 + s->pps.scaling_matrix4[i][x]) << shift; 77.185 + } 77.186 + } 77.187 +} 77.188 + 77.189 +static void init_dequant_tables(EDSlice_spu *s, H264Cabac_spu *hc){ 77.190 + int i,x; 77.191 + 77.192 + init_dequant4_coeff_table(s, hc); 77.193 + if(s->pps.transform_8x8_mode) 77.194 + init_dequant8_coeff_table(s, hc); 77.195 + if(s->transform_bypass){ 77.196 + for(i=0; i<6; i++) 77.197 + for(x=0; x<16; x++) 77.198 + hc->dequant4_coeff[i][0][x] = 1<<6; 77.199 + if(s->pps.transform_8x8_mode) 77.200 + for(i=0; i<2; i++) 77.201 + for(x=0; x<64; x++) 77.202 + hc->dequant8_coeff[i][0][x] = 1<<6; 77.203 + } 77.204 +} 77.205 + 77.206 +static void init_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s){ 77.207 + hc->non_zero_count_top = non_zero_count_table[0]; 77.208 + hc->non_zero_count = non_zero_count_table[1]; 77.209 + hc->mvd_top[0] = mvd_table[0][0]; 77.210 + hc->mvd[0] = mvd_table[0][1]; 77.211 + hc->mvd_top[1] = mvd_table[1][0]; 77.212 + hc->mvd[1] = mvd_table[1][1]; 77.213 + hc->direct_top = 
direct_table[0]; 77.214 + hc->direct = direct_table[1]; 77.215 + hc->chroma_pred_mode_top = chroma_pred_mode_table[0]; 77.216 + hc->chroma_pred_mode = chroma_pred_mode_table[1]; 77.217 + hc->intra4x4_pred_mode_top = intra4x4_pred_mode_table[0]; 77.218 + hc->intra4x4_pred_mode = intra4x4_pred_mode_table[1]; 77.219 + hc->cbp_top = cbp_table[0]; 77.220 + hc->cbp = cbp_table[1]; 77.221 + hc->qscale_top = qscale_table[0] +1; 77.222 + hc->qscale = qscale_table[1] +1; 77.223 + 77.224 + hc->mb_type_top = mb_type_table[0]+1; 77.225 + hc->mb_type = mb_type_table[1]+1; 77.226 + hc->ref_index_top[0] = ref_index_table[0][0]; 77.227 + hc->ref_index_top[1] = ref_index_table[1][0]; 77.228 + hc->ref_index[0] = ref_index_table[0][1]; 77.229 + hc->ref_index[1] = ref_index_table[1][1]; 77.230 + hc->motion_val_top[0] = motion_val_table[0][0]; 77.231 + hc->motion_val_top[1] = motion_val_table[1][0]; 77.232 + hc->motion_val[0] = motion_val_table[0][1]; 77.233 + hc->motion_val[1] = motion_val_table[1][1]; 77.234 + 77.235 + int mb_stride = hc->mb_stride; 77.236 + 77.237 + if (s->slice_type_nos == FF_B_TYPE){ 77.238 + while(!dep_resolved(&spe)); 77.239 + spu_dma_get(list1_mb_type_table[0], (unsigned) (s->list1.mb_type -1), mb_stride*sizeof(uint32_t), ED_get); 77.240 + spu_dma_get(list1_ref_index_table[0][0], (unsigned) s->list1.ref_index[0], mb_stride*4*sizeof(int8_t), ED_get); 77.241 + spu_dma_get(list1_ref_index_table[0][1], (unsigned) s->list1.ref_index[1], mb_stride*4*sizeof(int8_t), ED_get); 77.242 + wait_dma_id(ED_get); 77.243 + spu_dma_get(list1_mb_type_table[1], (unsigned) (s->list1.mb_type -1 + mb_stride), mb_stride*sizeof(uint32_t), ED_get); 77.244 + spu_dma_get(list1_ref_index_table[1][0], (unsigned) (s->list1.ref_index[0] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); 77.245 + spu_dma_get(list1_ref_index_table[1][1], (unsigned) (s->list1.ref_index[1] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); 77.246 + hc->list1_mb_type = list1_mb_type_table[0]+1; 77.247 + 
hc->list1_ref_index[0] = list1_ref_index_table[0][0]; 77.248 + hc->list1_ref_index[1] = list1_ref_index_table[0][1]; 77.249 + } 77.250 + 77.251 +} 77.252 + 77.253 +static void update_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s, int line){ 77.254 + int mb_stride = hc->mb_stride; 77.255 + int mb_width = hc->mb_width; 77.256 + int top = (line+1)%2; 77.257 + int cur = line%2; 77.258 + int bottom = (line+1)%2; //same as top, but to identify prebuffering of next line. 77.259 + 77.260 + hc->non_zero_count_top = non_zero_count_table[top]; 77.261 + hc->non_zero_count = non_zero_count_table[cur]; 77.262 + hc->mvd_top[0] = mvd_table[0][top]; 77.263 + hc->mvd[0] = mvd_table[0][cur]; 77.264 + hc->mvd_top[1] = mvd_table[1][top]; 77.265 + hc->mvd[1] = mvd_table[1][cur]; 77.266 + hc->direct_top = direct_table[top]; 77.267 + hc->direct = direct_table[cur]; 77.268 + hc->chroma_pred_mode_top = chroma_pred_mode_table[top]; 77.269 + hc->chroma_pred_mode = chroma_pred_mode_table[cur]; 77.270 + hc->intra4x4_pred_mode_top = intra4x4_pred_mode_table[top]; 77.271 + hc->intra4x4_pred_mode = intra4x4_pred_mode_table[cur]; 77.272 + hc->cbp_top = cbp_table[top]; 77.273 + hc->cbp = cbp_table[cur]; 77.274 + hc->qscale_top = qscale_table[top] +1; 77.275 + hc->qscale = qscale_table[cur] +1; 77.276 + 77.277 + hc->mb_type_top = mb_type_table[top]+1; 77.278 + hc->mb_type = mb_type_table[cur]+1; 77.279 + hc->ref_index_top[0] = ref_index_table[0][top]; 77.280 + hc->ref_index_top[1] = ref_index_table[1][top]; 77.281 + hc->ref_index[0] = ref_index_table[0][cur]; 77.282 + hc->ref_index[1] = ref_index_table[1][cur]; 77.283 + hc->motion_val_top[0] = motion_val_table[0][top]; 77.284 + hc->motion_val_top[1] = motion_val_table[1][top]; 77.285 + hc->motion_val[0] = motion_val_table[0][cur]; 77.286 + hc->motion_val[1] = motion_val_table[1][cur]; 77.287 + 77.288 + wait_dma_id(ED_put); 77.289 + 77.290 + spu_dma_put(mb_type_table[top], (unsigned) (s->pic.mb_type -1 + line*mb_stride), mb_stride*sizeof(uint32_t), 
ED_put); 77.291 + spu_dma_put(ref_index_table[0][top], (unsigned) (s->pic.ref_index[0] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put); 77.292 + spu_dma_put(ref_index_table[1][top], (unsigned) (s->pic.ref_index[1] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put); 77.293 + spu_dma_put(motion_val_table[0][top], (unsigned) (s->pic.motion_val[0]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put); 77.294 + spu_dma_put(motion_val_table[1][top], (unsigned) (s->pic.motion_val[1]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put); 77.295 + 77.296 + if (s->slice_type_nos == FF_B_TYPE){ 77.297 + update_tgt_spe_dep(&spe, 0); 77.298 + wait_dma_id(ED_get); 77.299 + 77.300 + if (line + 2 < hc->mb_height){ 77.301 + while(!dep_resolved(&spe)); 77.302 + spu_dma_get(list1_mb_type_table[cur], (unsigned) (s->list1.mb_type -1 + (line+2)*mb_stride), mb_stride*sizeof(uint32_t), ED_get); 77.303 + spu_dma_get(list1_ref_index_table[cur][0], (unsigned) (s->list1.ref_index[0] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); 77.304 + spu_dma_get(list1_ref_index_table[cur][1], (unsigned) (s->list1.ref_index[1] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get); 77.305 + } 77.306 + hc->list1_mb_type = list1_mb_type_table[bottom]+1; 77.307 + hc->list1_ref_index[0] = list1_ref_index_table[bottom][0]; 77.308 + hc->list1_ref_index[1] = list1_ref_index_table[bottom][1]; 77.309 + } 77.310 + 77.311 +} 77.312 + 77.313 +// void printmbdiff(EDSlice_spu *s, H264Cabac_spu *hc, H264Mb *mp, H264Mb *ms){ 77.314 +// 77.315 +// printf("mb_x %d, %d\n", mp->mb_x, ms->mb_x); 77.316 +// printf("mb_y %d, %d\n", mp->mb_y, ms->mb_y); 77.317 +// printf("mb_xy %d, %d\n", mp->mb_xy, ms->mb_xy); 77.318 +// printf("top_mb_xy %d, %d\n", mp->top_mb_xy, ms->top_mb_xy); 77.319 +// printf("left_mb_xy %d, %d\n", mp->left_mb_xy, ms->left_mb_xy); 77.320 +// printf("chroma_pred_mode %d, %d\n", mp->chroma_pred_mode, ms->chroma_pred_mode); 77.321 +// printf("intra16x16_pred_mode 
%d, %d\n", mp->intra16x16_pred_mode, ms->intra16x16_pred_mode); 77.322 +// printf("topleft_samples %d, %d\n", mp->topleft_samples_available, ms->topleft_samples_available); 77.323 +// printf("topright_samples %d, %d\n", mp->topright_samples_available, ms->topright_samples_available); 77.324 +// printf("top_samples %d, %d\n", mp->top_samples_available, ms->top_samples_available); 77.325 +// printf("left_samples %d, %d\n", mp->left_samples_available, ms->left_samples_available); 77.326 +// 77.327 +// if (memcmp(mp->intra4x4_pred_mode_cache, ms->intra4x4_pred_mode_cache, 40)){ 77.328 +// for (int i=0; i<5; i++){ 77.329 +// for (int j=0; j<8; j++){ 77.330 +// printf("%d, %d\t", mp->intra4x4_pred_mode_cache[i*8+j],ms->intra4x4_pred_mode_cache[i*8+j]); 77.331 +// } 77.332 +// printf("\n"); 77.333 +// } 77.334 +// } 77.335 +// 77.336 +// if (memcmp(mp->non_zero_count_cache, ms->non_zero_count_cache, 48)){ 77.337 +// for (int i=0; i<6; i++){ 77.338 +// for (int j=0; j<8; j++){ 77.339 +// printf("%u, %u\t", mp->non_zero_count_cache[i*8+j],ms->non_zero_count_cache[i*8+j]); 77.340 +// } 77.341 +// printf("\n"); 77.342 +// } 77.343 +// } 77.344 +// 77.345 +// if (memcmp(mp->sub_mb_type, ms->sub_mb_type, 8)){ 77.346 +// for (int i=0; i<4; i++){ 77.347 +// printf("%u, %u\t", mp->sub_mb_type[i], mp->sub_mb_type[i]); 77.348 +// printf("\n"); 77.349 +// } 77.350 +// } 77.351 +// 77.352 +// if (memcmp(mp->mv_cache, ms->mv_cache, 320)){ 77.353 +// for (int k=0; k<2; k++){ 77.354 +// for (int i=0; i<5; i++){ 77.355 +// for (int j=0; j<8; j++){ 77.356 +// printf("%d, %d, %d, %d\t", mp->mv_cache[k][i*8+j][0], mp->mv_cache[k][i*8+j][1], ms->mv_cache[k][i*8+j][0], ms->mv_cache[k][i*8+j][1]); 77.357 +// } 77.358 +// printf("\n"); 77.359 +// } 77.360 +// } 77.361 +// } 77.362 +// 77.363 +// if (memcmp(mp->ref_cache, ms->ref_cache, 80)){ 77.364 +// for (int k=0; k<2; k++){ 77.365 +// for (int i=0; i<5; i++){ 77.366 +// for (int j=0; j<8; j++){ 77.367 +// printf("%d, %d\t", 
mp->ref_cache[k][i*8+j], ms->ref_cache[k][i*8+j]); 77.368 +// } 77.369 +// printf("\n"); 77.370 +// } 77.371 +// } 77.372 +// } 77.373 +// 77.374 +// printf("cbp %d, %d\n", mp->cbp, ms->cbp); 77.375 +// for (int i=0; i<hc->mb_stride; i++){ 77.376 +// printf("%d, ", hc->cbp[i]); fflush(0); 77.377 +// } 77.378 +// printf("\n"); 77.379 +// 77.380 +// printf("mb_type %x, %x\n", mp->mb_type, ms->mb_type); 77.381 +// printf("mb_type IS_INTRA %d, IS_INTRA16x16 %d, IS_DIRECT %d\n", IS_INTRA(ms->mb_type), IS_INTRA16x16(ms->mb_type), IS_DIRECT(ms->mb_type) ); 77.382 +// printf("left_type %d, %d\n", mp->left_type, ms->left_type); 77.383 +// printf("top_type %d, %d\n", mp->top_type, ms->top_type); 77.384 +// printf("qscale_mb_xy %d, %d\n", mp->qscale_mb_xy, ms->qscale_mb_xy); 77.385 +// printf("qscale_left_mb_xy %d, %d\n", mp->qscale_left_mb_xy, ms->qscale_left_mb_xy); 77.386 +// printf("qscale_top_mb_xy %d, %d\n", mp->qscale_top_mb_xy, ms->qscale_top_mb_xy); 77.387 +// // for (int i=0; i<hc->mb_stride; i++){ 77.388 +// // printf("%d, ", qscale_table[0][i]); fflush(0); 77.389 +// // } 77.390 +// 77.391 +// if (memcmp(mp->mb, ms->mb, 768)){ 77.392 +// for (int i=0; i<16; i++){ 77.393 +// for (int j=0; j<16; j++){ 77.394 +// printf("%d, %d\t", mp->mb[j + i*16], ms->ref_cache[j + i*16]); 77.395 +// } 77.396 +// printf("\n"); 77.397 +// } 77.398 +// for (int i=0; i<8; i++){ 77.399 +// for (int j=0; j<8; j++){ 77.400 +// printf("%d, %d\t", mp->mb[256 + j + i*8], ms->ref_cache[j + i*8]); 77.401 +// } 77.402 +// printf("\n"); 77.403 +// } 77.404 +// for (int i=0; i<8; i++){ 77.405 +// for (int j=0; j<8; j++){ 77.406 +// printf("%d, %d\t", mp->mb[320+ j + i*8], ms->ref_cache[j + i*8]); 77.407 +// } 77.408 +// printf("\n"); 77.409 +// } 77.410 +// } 77.411 +// 77.412 +// if (memcmp(mp->bS, ms->bS, 32)){ 77.413 +// for (int k=0; k<2; k++){ 77.414 +// for (int i=0; i<4; i++){ 77.415 +// for (int j=0; j<4; j++){ 77.416 +// printf("%d, %d\t", mp->bS[k][i][j], mp->mv_cache[k][i][j]); 77.417 
+// } 77.418 +// printf("\n"); 77.419 +// } 77.420 +// } 77.421 +// } 77.422 +// if (memcmp(mp->edges, ms->edges, 4)){ 77.423 +// printf("edges %d, %d, %d, %d\n", mp->edges[0], ms->edges[0], mp->edges[1], ms->edges[1]); 77.424 +// printf("deblock %d, %d\n", mp->deblock_mb, ms->deblock_mb); 77.425 +// } 77.426 +// 77.427 +// printf("dequant4_coeff_y %d, %d\n", mp->dequant4_coeff_y, ms->dequant4_coeff_y); 77.428 +// printf("dequant4_coeff_cb %d, %d\n", mp->dequant4_coeff_cb, ms->dequant4_coeff_cb); 77.429 +// printf("dequant4_coeff_cr %d, %d\n", mp->dequant4_coeff_cr, ms->dequant4_coeff_cr); 77.430 +// } 77.431 +// DECLARE_ALIGNED_16(H264Mb, tmp); 77.432 + 77.433 + 77.434 +int main(unsigned long long id, unsigned long long argp){ 77.435 + EDSlice_spu *s; 77.436 + H264Cabac_spu *hc = &hcabac; 77.437 + CABACContext *c = &cabac; 77.438 + H264spe *p = &spe; 77.439 + 77.440 + spu_write_out_mbox((unsigned) slice); 77.441 + spu_dma_get(p, (unsigned) argp, sizeof(H264spe), ED_spe); //ID_slice is used out of convienience 77.442 + wait_dma_id(ED_spe); 77.443 + 77.444 + ff_init_cabac_states(); 77.445 + init_cabac(p, hc); 77.446 + hc->blocking=0; 77.447 + for(;;){ 77.448 + spu_read_in_mbox(); 77.449 + s = &slice[0]; 77.450 + reset_cabac_buffers(); 77.451 + init_entropy_buf(hc, s); 77.452 + 77.453 + if (hc->blocking) wait_dma_id(ED_get); 77.454 + //printf("framesize %d\n", s->byte_bufsize);fflush(0); 77.455 + init_dequant_tables(s, hc); 77.456 + ff_init_cabac_decoder( c, s->bytestream_start, s->byte_bufsize ); 77.457 + ff_h264_init_cabac_states(s, c); 77.458 + 77.459 + int mb_slot=0; 77.460 + for(int j=0; j<hc->mb_height; j++){ 77.461 + for(int i=0; i<hc->mb_width; i++){ 77.462 + int eos,ret; 77.463 + H264Mb *m = &mb[mb_slot]; 77.464 + m->mb_x=i; 77.465 + m->mb_y=j; 77.466 + s->m = m; 77.467 + 77.468 + ret = ff_h264_decode_mb_cabac(hc, s, c); 77.469 + 77.470 +// spu_dma_get(&tmp, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_get); 77.471 +// wait_dma_id(ED_get); 
77.472 +// if (memcmp(&tmp, m, sizeof(H264Mb))){ 77.473 +// printf("coded pic num %d\n", s->coded_pic_num); 77.474 +// printmbdiff(s, hc,&tmp, m); 77.475 +// return 0; 77.476 +// } 77.477 + //printf("qscale %d\n", m->qscale_mb_xy); 77.478 + if (!hc->blocking){ 77.479 + if (mb_slot){ 77.480 + spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb1); 77.481 + wait_dma_id(ED_putmb0); 77.482 + }else { 77.483 + spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0); 77.484 + wait_dma_id(ED_putmb1); 77.485 + } 77.486 + mb_slot++; mb_slot%=2; 77.487 + }else { 77.488 + spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0); 77.489 + wait_dma_id(ED_putmb0); 77.490 + } 77.491 + 77.492 + 77.493 + eos = get_cabac_terminate( c); 77.494 + 77.495 + if( ret < 0) { 77.496 + fprintf(stderr, "error at %d bytecount\n", bytecount); 77.497 + return -1; 77.498 + } 77.499 + } 77.500 + update_entropy_buf(hc, s, j); 77.501 + if (hc->blocking){ wait_dma_id(ED_get); wait_dma_id(ED_put);} 77.502 + } 77.503 + wait_dma_id(ED_put); 77.504 + spu_write_out_mbox(1); 77.505 + 77.506 + } 77.507 + 77.508 + return 0; 77.509 + 77.510 + 77.511 +}
78.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 78.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/spe_mbd.c Mon Aug 27 12:09:56 2012 +0200 78.3 @@ -0,0 +1,356 @@ 78.4 +/* 78.5 + * Copyright (c) 2009 TUDelft 78.6 + * 78.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 78.8 + */ 78.9 + 78.10 +/** 78.11 + * @file libavcodec/cell/spu/h264_main_spu.c 78.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding 78.13 + * @author C C Chi <c.c.chi@student.tudelft.nl> 78.14 + * 78.15 + * SIMD kernels 78.16 + * H.264/AVC motion compensation 78.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu> 78.18 + * @author Albert Paradis <apar7632@hotmail.com> 78.19 + */ 78.20 + 78.21 + 78.22 +/* Enable this lines to enable simulator statistic or generate traces */ 78.23 + 78.24 +//#define ENABLE_SIMULATOR 78.25 +//#define ENABLE_PARAVER_TRACING_CELL 78.26 + 78.27 +#ifdef ENABLE_SIMULATOR 78.28 + #include "/opt/ibm/systemsim-cell/include/callthru/spu/profile.h" 78.29 +#endif 78.30 + 78.31 +#ifdef ENABLE_TRACES 78.32 + #include "spu_trace.h" 78.33 +#endif 78.34 +#include <unistd.h> 78.35 +#include <stdio.h> 78.36 +#include <spu_intrinsics.h> 78.37 +#include <spu_mfcio.h> 78.38 +#include <libsync.h> 78.39 +#include <sys/time.h> 78.40 +#include <assert.h> 78.41 + 78.42 +//#include "dsputil_cell.h" 78.43 +#include "types_spu.h" 78.44 +#include "h264_intra_spu.h" 78.45 +#include "h264_decode_mb_spu.h" 78.46 +#include "h264_mc_spu.h" 78.47 +#include "h264_tables.h" 78.48 +#include "h264_dma.h" 78.49 + 78.50 + 78.51 +/** functions for supporting tracing with paraver for the SPU 78.52 + * 78.53 + */ 78.54 +inline void trace_init_SPU(){ 78.55 +#ifdef ENABLE_PARAVER_TRACING_CELL 78.56 + SPUtrace_init (); 78.57 +#endif 78.58 +} 78.59 + 78.60 +inline void trace_fini_SPU(){ 78.61 +#ifdef ENABLE_PARAVER_TRACING_CELL 78.62 + SPUtrace_fini (); 78.63 +#endif 78.64 +} 78.65 + 78.66 +inline void trace_event_SPU(int event, int id){ 78.67 +#ifdef ENABLE_PARAVER_TRACING_CELL 78.68 + SPUtrace_event (event, 
id); 78.69 +#else 78.70 + (void) event; 78.71 + (void) id; 78.72 +#endif 78.73 +} 78.74 + 78.75 +// for simulator statistic 78.76 +inline void clear_statistic(){ 78.77 +#ifdef ENABLE_SIMULATOR 78.78 + prof_clear(); 78.79 +#endif 78.80 +} 78.81 + 78.82 +inline void start_statistic(){ 78.83 +#ifdef ENABLE_SIMULATOR 78.84 + prof_start(); 78.85 +#endif 78.86 +} 78.87 + 78.88 +inline void stop_statistic(){ 78.89 +#ifdef ENABLE_SIMULATOR 78.90 + prof_stop(); 78.91 +#endif 78.92 +} 78.93 + 78.94 +H264Context_spu h_context; // struct that contain all the params to decode a macroblock 78.95 + 78.96 +DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending 78.97 +//mb position of neighbouring spes 78.98 +DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1 78.99 +//DECLARE_ALIGNED_16(spe_pos, tgt_spe); //written by SPE_ID +1 78.100 + 78.101 +/** 78.102 +* Initializes the buffering of the mb data and associated mc data. The init_mb_buffer needs to 78.103 +* be called before any get_next_mb and only once at the beginning of the slice. 
78.104 +* 78.105 +* Note: init_mc_buffer and get_next_mb expect the width of the picture to be more than 2 mb's 78.106 +*/ 78.107 +#define TAG_OFFSET_MB MBD_buf1 78.108 +#define TAG_OFFSET_MC MBD_mc_buf1 78.109 +static void init_mb_buffer(H264Context_spu* h){ 78.110 + H264slice *s = h->s; 78.111 + H264Mb *next_mb; 78.112 + int mb_height = s->mb_height; 78.113 + int mb_width = s->mb_width; 78.114 + 78.115 + h->mc_idx =0; 78.116 + 78.117 + h->mb_dec = 0; 78.118 + h->mb_mc = 0; 78.119 + h->mb_dma = 0; 78.120 + 78.121 + h->curr_line %= mb_height; 78.122 + h->next_mb_idx = h->curr_line * mb_width; 78.123 + h->mb_id = h->curr_line * mb_width; 78.124 + h->n_mc= h->curr_line * mb_width; 78.125 + 78.126 + next_mb = s->blocks + h->mb_id; 78.127 + spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); 78.128 + h->mb_dma++; 78.129 + h->mb_id++; 78.130 + 78.131 + next_mb = s->blocks + h->mb_id; 78.132 + spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); 78.133 + h->mb_dma++; 78.134 + h->mb_id++; 78.135 + wait_dma_id(0 + TAG_OFFSET_MB); 78.136 + 78.137 + H264Mb *mb = &h->mb_buf[0]; 78.138 + H264mc *mc = &h->mc_buf[0]; 78.139 + if(!IS_INTRA(mb->mb_type)){ 78.140 + calc_mc_params(mb, mc); 78.141 + fill_ref_buf(h, mb, mc); 78.142 + } 78.143 + h->n_mc++; 78.144 + h->mb_mc++; 78.145 +} 78.146 + 78.147 +static void *get_next_mb(H264Context_spu *h){ 78.148 + H264slice *s = h->s; 78.149 + H264spe *spe = &h->spe; 78.150 + H264Mb *mb_buf = h->mb_buf; 78.151 + H264mc *mc_buf = h->mc_buf; 78.152 + H264Mb *next_mb; 78.153 + H264Mb *next_dma_mb; 78.154 + 78.155 + if (h->curr_line >= s->mb_height) 78.156 + return NULL; 78.157 + 78.158 + if (h->mb_id < h->mb_total){ 78.159 + next_dma_mb = s->blocks + h->mb_id; 78.160 + spu_dma_get(&mb_buf[h->mb_dma], (unsigned) next_dma_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); 78.161 + h->mb_dma = (h->mb_dma+1)%3; 78.162 + h->mb_id++; 78.163 + if 
(h->mb_id%s->mb_width ==0){ 78.164 + h->mb_id+=(spe->spe_total-1)*s->mb_width; 78.165 + } 78.166 + } 78.167 + 78.168 + h->mc = &mc_buf[h->mc_idx]; 78.169 + wait_dma_id(h->mc_idx + TAG_OFFSET_MC); 78.170 + h->mc_idx = (h->mc_idx+1)%2; 78.171 + if (h->n_mc < h->mb_total){ 78.172 + wait_dma_id(h->mb_mc + TAG_OFFSET_MB); 78.173 + H264Mb *mb = &mb_buf[h->mb_mc]; 78.174 + H264mc *mc = &mc_buf[h->mc_idx]; 78.175 + if(!IS_INTRA(mb->mb_type)){ 78.176 + calc_mc_params(mb, mc); 78.177 + fill_ref_buf(h, mb, mc); 78.178 + } 78.179 + h->n_mc++; 78.180 + if (h->n_mc%s->mb_width ==0){ 78.181 + h->n_mc+=(spe->spe_total-1)*s->mb_width; 78.182 + } 78.183 + } 78.184 + h->next_mb_idx++; 78.185 + if (h->next_mb_idx % s->mb_width ==0){ 78.186 + h->next_mb_idx+=(spe->spe_total-1)*s->mb_width; 78.187 + h->curr_line+=spe->spe_total; 78.188 + } 78.189 + 78.190 + h->mb_mc = (h->mb_mc+1)%3; 78.191 + next_mb = &mb_buf[h->mb_dec]; 78.192 + h->mb_dec = (h->mb_dec+1)%3; 78.193 + return next_mb; 78.194 +} 78.195 + 78.196 +static void *get_next_mb_blocking(H264Context_spu *h){ 78.197 + H264slice *s = h->s; 78.198 + H264spe *spe = &h->spe; 78.199 + H264Mb *mb_buf = h->mb_buf; 78.200 + H264mc *mc_buf = h->mc_buf; 78.201 + H264Mb *next_mb; 78.202 + H264Mb *next_dma_mb; 78.203 + 78.204 + if (h->mb_id >= h->mb_total) 78.205 + return NULL; 78.206 + 78.207 + //printf("%d\n", h->mb_id); 78.208 + next_dma_mb = s->blocks + h->mb_id; 78.209 + spu_dma_get(&mb_buf[0], (unsigned) next_dma_mb, sizeof(H264Mb), MBD_buf1); 78.210 + //h->mb_dma = (h->mb_dma+1)%3; 78.211 + h->mb_id++; 78.212 + if (h->mb_id%s->mb_width ==0){ 78.213 + h->mb_id+=(spe->spe_total-1)*s->mb_width; 78.214 + } 78.215 + wait_dma_id(MBD_buf1); 78.216 + 78.217 + h->mc = &mc_buf[0]; 78.218 + //h->mc_idx = (h->mc_idx+1)%2; 78.219 + //if (h->n_mc < h->mb_total){ 78.220 + H264Mb *mb = &mb_buf[0]; 78.221 + H264mc *mc = &mc_buf[0]; 78.222 + if(!IS_INTRA(mb->mb_type)){ 78.223 + calc_mc_params(mb, mc); 78.224 + fill_ref_buf(h, mb, mc); 78.225 + } 78.226 + 
//h->n_mc++; 78.227 + /*if (h->n_mc%s->mb_width ==0){ 78.228 + h->n_mc+=(spe->spe_total-1)*s->mb_width; 78.229 + }*/ 78.230 +// wait_dma_id(MBD_mc_buf1); 78.231 + 78.232 +// h->next_mb_idx++; 78.233 +// if (h->next_mb_idx % s->mb_width ==0){ 78.234 +// h->next_mb_idx+=(spe->spe_total-1)*s->mb_width; 78.235 +// h->curr_line+=spe->spe_total; 78.236 +// } 78.237 + 78.238 +// h->mb_mc = (h->mb_mc+1)%3; 78.239 + next_mb = &mb_buf[0]; 78.240 +// h->mb_dec = (h->mb_dec+1)%3; 78.241 + return next_mb; 78.242 +} 78.243 + 78.244 + 78.245 +#undef TAG_OFFSET_MB 78.246 +#undef TAG_OFFSET_MC 78.247 +static inline int dep_resolved(H264Context_spu *h){ 78.248 + H264slice *s = h->s; 78.249 + int spe_id = h->spe.spe_id; 78.250 + volatile int mb_proc_dep = src_spe.count; 78.251 + if (spe_id==0) 78.252 + return (h->mb_proc < mb_proc_dep-1 +s->mb_width)? 1:0; 78.253 + else 78.254 + return (h->mb_proc < mb_proc_dep-1)? 1:0; 78.255 +} 78.256 + 78.257 +void update_tgt_spe_dep(H264Context_spu *h, int end){ 78.258 + H264Mb *mb = h->mb; 78.259 + H264slice *s = h->s; 78.260 + H264spe *spe = &h->spe; 78.261 + int mb_x = mb->mb_x; 78.262 + 78.263 + if (end || (mb_x%2==0 && mb_x!=0) || mb_x==s->mb_width-1){ 78.264 + spe_pos* dma_spe = &dma_temp; 78.265 + spe_pos* tgt_spe = (spe_pos*) ((unsigned) spe->tgt_spe + (unsigned) &src_spe); //located in target spe local store 78.266 + dma_spe->count = end? 
h->mb_proc+1: h->mb_proc; 78.267 + spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), MBD_put); 78.268 + } 78.269 + h->mb_proc++; 78.270 +} 78.271 + 78.272 + 78.273 +int main(unsigned long long id, unsigned long long argp) 78.274 +{ 78.275 + (void) id; 78.276 + H264Context_spu* h = &h_context; 78.277 + H264spe *spe_params = (H264spe *) (unsigned) argp; 78.278 + 78.279 + spu_dma_get(&h->spe, (unsigned) spe_params, sizeof(H264spe), MBD_slice); //ID_slice is used out of convienience 78.280 + wait_dma_id(MBD_slice); 78.281 + 78.282 + //clear_statistic(); 78.283 + dsputil_h264_init_cell(&h->dsp); 78.284 + ff_cropTbl_init(); 78.285 + init_pred_ptrs(&h->hpc); 78.286 + 78.287 + //send slice_buf to ppe 78.288 + spu_write_out_mbox((unsigned) h->slice_buf); 78.289 + h->sl_idx=0; 78.290 + // initialize tracing with paraver 78.291 + //trace_init_SPU(); 78.292 + h->frames =0; 78.293 + src_spe.count =0; 78.294 + h->mb_proc = 0; 78.295 + 78.296 + h->mb_id=0; 78.297 + h->mc_idx=0; 78.298 + h->mb_dec=0; 78.299 + h->mb_mc=0; 78.300 + h->mb_dma=0; 78.301 + h->next_mb_idx=0; 78.302 + 78.303 + h->blocking=0; 78.304 + 78.305 + 78.306 + H264spe* p = &h->spe; 78.307 + h->curr_line =p->spe_id; 78.308 + h->mb_total = p->mb_height*p->mb_width; 78.309 + int stride_y = 32; 78.310 + int stride_c = 16; 78.311 + //init block_offset array 78.312 + init_block_offset(stride_y, stride_c); 78.313 + for(;;){ 78.314 + spu_read_in_mbox(); 78.315 + 78.316 + h->s = &h->slice_buf[h->sl_idx]; 78.317 + h->sl_idx++; h->sl_idx%=2; 78.318 + 78.319 + if (h->s->state< 0){ 78.320 + break; 78.321 + } 78.322 + 78.323 + { 78.324 + if(!h->blocking){ 78.325 + init_mb_buffer(h); 78.326 + while((h->mb=(H264Mb *)get_next_mb(h))){ 78.327 + while(!dep_resolved(h)); 78.328 + //printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p- >spe_id); 78.329 + hl_decode_mb_internal(h, stride_y, stride_c); 78.330 + } 78.331 + update_tgt_spe_dep(h, 1); 78.332 + }else{ 78.333 + h->mb_id=0; 78.334 
+ while((h->mb=(H264Mb *)get_next_mb_blocking(h))){ 78.335 + while(!dep_resolved(h)); 78.336 + //printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p- >spe_id); 78.337 + hl_decode_mb_internal(h, stride_y, stride_c); 78.338 + } 78.339 + update_tgt_spe_dep(h, 1); 78.340 + } 78.341 + 78.342 + } 78.343 + 78.344 + h->frames++; 78.345 + 78.346 + if (p->spe_id == ((h->frames*p->mb_height -1)%p->spe_total)){ 78.347 + //printf("spe %d, %d\n", atomic_read(p->rl_cnt), h->frames); 78.348 + //MBSlice is copied beforehand. 78.349 + //only inc cnt. 78.350 + atomic_inc(p->rl_cnt); 78.351 + } 78.352 + { 78.353 + atomic_dec(p->cnt); 78.354 + } 78.355 + } 78.356 + 78.357 + return 0; 78.358 +} 78.359 +
79.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 79.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/types_spu.h Mon Aug 27 12:09:56 2012 +0200 79.3 @@ -0,0 +1,69 @@ 79.4 +/* 79.5 + * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu> 79.6 + * 79.7 + * This file is part of FFmpeg. 79.8 + * 79.9 + * FFmpeg is free software; you can redistribute it and/or 79.10 + * modify it under the terms of the GNU Lesser General Public 79.11 + * License as published by the Free Software Foundation; either 79.12 + * version 2.1 of the License, or (at your option) any later version. 79.13 + * 79.14 + * FFmpeg is distributed in the hope that it will be useful, 79.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 79.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 79.17 + * Lesser General Public License for more details. 79.18 + * 79.19 + * You should have received a copy of the GNU Lesser General Public 79.20 + * License along with FFmpeg; if not, write to the Free Software 79.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 79.22 + */ 79.23 + 79.24 +#ifndef TYPES_SPU_H 79.25 +#define TYPES_SPU_H 79.26 + 79.27 +/*********************************************************************** 79.28 + * Scalar types 79.29 + **********************************************************************/ 79.30 + typedef signed char int8_t; 79.31 + typedef signed short int16_t; 79.32 + typedef signed int int32_t; 79.33 + typedef unsigned char uint8_t; 79.34 + typedef unsigned short uint16_t; 79.35 + typedef unsigned int uint32_t; 79.36 + typedef unsigned long long uint64_t; 79.37 + 79.38 +// typedef short DCTELEM; // transform coeficients of dct 79.39 + 79.40 +/*********************************************************************** 79.41 + * Vector types 79.42 + **********************************************************************/ 79.43 + typedef vector signed int vsint32_t; 79.44 + typedef vector unsigned int vuint32_t; 
79.45 + typedef vector signed short vsint16_t; 79.46 + typedef vector unsigned short vuint16_t; 79.47 + typedef vector signed char vsint8_t; 79.48 + typedef vector unsigned char vuint8_t; 79.49 + 79.50 +/*********************************************************************** 79.51 + * Functions 79.52 + **********************************************************************/ 79.53 + typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h); 79.54 + typedef void (*h264_chroma_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h, int x, int y); 79.55 + typedef void (*h264_idct_func)(uint8_t *dst, short *block, int stride); 79.56 + typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); 79.57 + typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, 79.58 + int weights, int offset); 79.59 + typedef void(* intra_pred4x4)(uint8_t *src, uint8_t *topright, int stride); 79.60 + typedef void(* intra_pred16x16)(uint8_t *src, int stride); 79.61 + typedef void(* intra_pred8x8)(uint8_t *src, int stride); 79.62 + typedef void(* intra_pred8x8l)(uint8_t *src, int topleft, int topright, int stride); 79.63 + 79.64 + 79.65 +#define AVV(x...) {x} 79.66 + 79.67 + 79.68 +#endif // AVCODEC_TYPES_SPU_H 79.69 + 79.70 + 79.71 + 79.72 +
80.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 80.2 +++ b/ffmpeg_smp/h264dec/libavcodec/dsputil.c Mon Aug 27 12:09:56 2012 +0200 80.3 @@ -0,0 +1,1057 @@ 80.4 +/* 80.5 + * DSP utils 80.6 + * Copyright (c) 2000, 2001 Fabrice Bellard 80.7 + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 80.8 + * 80.9 + * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at> 80.10 + * 80.11 + * This file is part of FFmpeg. 80.12 + * 80.13 + * FFmpeg is free software; you can redistribute it and/or 80.14 + * modify it under the terms of the GNU Lesser General Public 80.15 + * License as published by the Free Software Foundation; either 80.16 + * version 2.1 of the License, or (at your option) any later version. 80.17 + * 80.18 + * FFmpeg is distributed in the hope that it will be useful, 80.19 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 80.20 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 80.21 + * Lesser General Public License for more details. 
80.22 + * 80.23 + * You should have received a copy of the GNU Lesser General Public 80.24 + * License along with FFmpeg; if not, write to the Free Software 80.25 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 80.26 + */ 80.27 + 80.28 +/** 80.29 + * @file 80.30 + * DSP utils 80.31 + */ 80.32 + 80.33 +#include "libavutil/log.h" 80.34 +#include "dsputil.h" 80.35 +#include "simple_idct.h" 80.36 +#include "mathops.h" 80.37 +#include "config.h" 80.38 + 80.39 +uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; 80.40 +uint32_t ff_squareTbl[512] = {0, }; 80.41 + 80.42 +const uint8_t ff_zigzag_direct[64] = { 80.43 + 0, 1, 8, 16, 9, 2, 3, 10, 80.44 + 17, 24, 32, 25, 18, 11, 4, 5, 80.45 + 12, 19, 26, 33, 40, 48, 41, 34, 80.46 + 27, 20, 13, 6, 7, 14, 21, 28, 80.47 + 35, 42, 49, 56, 57, 50, 43, 36, 80.48 + 29, 22, 15, 23, 30, 37, 44, 51, 80.49 + 58, 59, 52, 45, 38, 31, 39, 46, 80.50 + 53, 60, 61, 54, 47, 55, 62, 63 80.51 +}; 80.52 + 80.53 + 80.54 +#define PIXOP2(OPNAME, OP) \ 80.55 +static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.56 + int i;\ 80.57 + for(i=0; i<h; i++){\ 80.58 + OP(*((uint16_t*)(block )), AV_RN16(pixels ));\ 80.59 + pixels+=line_size;\ 80.60 + block +=line_size;\ 80.61 + }\ 80.62 +}\ 80.63 +static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.64 + int i;\ 80.65 + for(i=0; i<h; i++){\ 80.66 + OP(*((uint32_t*)(block )), AV_RN32(pixels ));\ 80.67 + pixels+=line_size;\ 80.68 + block +=line_size;\ 80.69 + }\ 80.70 +}\ 80.71 +static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.72 + int i;\ 80.73 + for(i=0; i<h; i++){\ 80.74 + OP(*((uint32_t*)(block )), AV_RN32(pixels ));\ 80.75 + OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\ 80.76 + pixels+=line_size;\ 80.77 + block +=line_size;\ 80.78 + }\ 80.79 +}\ 80.80 +static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t 
*pixels, int line_size, int h){\ 80.81 + OPNAME ## _pixels8_c(block, pixels, line_size, h);\ 80.82 +}\ 80.83 +\ 80.84 +static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ 80.85 + int src_stride1, int src_stride2, int h){\ 80.86 + int i;\ 80.87 + for(i=0; i<h; i++){\ 80.88 + uint32_t a,b;\ 80.89 + a= AV_RN32(&src1[i*src_stride1 ]);\ 80.90 + b= AV_RN32(&src2[i*src_stride2 ]);\ 80.91 + OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\ 80.92 + a= AV_RN32(&src1[i*src_stride1+4]);\ 80.93 + b= AV_RN32(&src2[i*src_stride2+4]);\ 80.94 + OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\ 80.95 + }\ 80.96 +}\ 80.97 +\ 80.98 +static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ 80.99 + int src_stride1, int src_stride2, int h){\ 80.100 + int i;\ 80.101 + for(i=0; i<h; i++){\ 80.102 + uint32_t a,b;\ 80.103 + a= AV_RN32(&src1[i*src_stride1 ]);\ 80.104 + b= AV_RN32(&src2[i*src_stride2 ]);\ 80.105 + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ 80.106 + a= AV_RN32(&src1[i*src_stride1+4]);\ 80.107 + b= AV_RN32(&src2[i*src_stride2+4]);\ 80.108 + OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\ 80.109 + }\ 80.110 +}\ 80.111 +\ 80.112 +static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ 80.113 + int src_stride1, int src_stride2, int h){\ 80.114 + int i;\ 80.115 + for(i=0; i<h; i++){\ 80.116 + uint32_t a,b;\ 80.117 + a= AV_RN32(&src1[i*src_stride1 ]);\ 80.118 + b= AV_RN32(&src2[i*src_stride2 ]);\ 80.119 + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ 80.120 + }\ 80.121 +}\ 80.122 +\ 80.123 +static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ 80.124 + int src_stride1, int src_stride2, int h){\ 80.125 + int i;\ 80.126 + for(i=0; i<h; i++){\ 80.127 + uint32_t a,b;\ 80.128 + a= 
AV_RN16(&src1[i*src_stride1 ]);\ 80.129 + b= AV_RN16(&src2[i*src_stride2 ]);\ 80.130 + OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ 80.131 + }\ 80.132 +}\ 80.133 +\ 80.134 +static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ 80.135 + int src_stride1, int src_stride2, int h){\ 80.136 + OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ 80.137 + OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ 80.138 +}\ 80.139 +\ 80.140 +static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ 80.141 + int src_stride1, int src_stride2, int h){\ 80.142 + OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ 80.143 + OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ 80.144 +}\ 80.145 +\ 80.146 +static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.147 + OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ 80.148 +}\ 80.149 +\ 80.150 +static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.151 + OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ 80.152 +}\ 80.153 +\ 80.154 +static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.155 + OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ 80.156 +}\ 80.157 +\ 80.158 +static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.159 + OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ 80.160 +}\ 80.161 +\ 80.162 +static inline void OPNAME ## _pixels8_l4(uint8_t *dst, 
const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ 80.163 + int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 80.164 + int i;\ 80.165 + for(i=0; i<h; i++){\ 80.166 + uint32_t a, b, c, d, l0, l1, h0, h1;\ 80.167 + a= AV_RN32(&src1[i*src_stride1]);\ 80.168 + b= AV_RN32(&src2[i*src_stride2]);\ 80.169 + c= AV_RN32(&src3[i*src_stride3]);\ 80.170 + d= AV_RN32(&src4[i*src_stride4]);\ 80.171 + l0= (a&0x03030303UL)\ 80.172 + + (b&0x03030303UL)\ 80.173 + + 0x02020202UL;\ 80.174 + h0= ((a&0xFCFCFCFCUL)>>2)\ 80.175 + + ((b&0xFCFCFCFCUL)>>2);\ 80.176 + l1= (c&0x03030303UL)\ 80.177 + + (d&0x03030303UL);\ 80.178 + h1= ((c&0xFCFCFCFCUL)>>2)\ 80.179 + + ((d&0xFCFCFCFCUL)>>2);\ 80.180 + OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.181 + a= AV_RN32(&src1[i*src_stride1+4]);\ 80.182 + b= AV_RN32(&src2[i*src_stride2+4]);\ 80.183 + c= AV_RN32(&src3[i*src_stride3+4]);\ 80.184 + d= AV_RN32(&src4[i*src_stride4+4]);\ 80.185 + l0= (a&0x03030303UL)\ 80.186 + + (b&0x03030303UL)\ 80.187 + + 0x02020202UL;\ 80.188 + h0= ((a&0xFCFCFCFCUL)>>2)\ 80.189 + + ((b&0xFCFCFCFCUL)>>2);\ 80.190 + l1= (c&0x03030303UL)\ 80.191 + + (d&0x03030303UL);\ 80.192 + h1= ((c&0xFCFCFCFCUL)>>2)\ 80.193 + + ((d&0xFCFCFCFCUL)>>2);\ 80.194 + OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.195 + }\ 80.196 +}\ 80.197 +\ 80.198 +static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.199 + OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ 80.200 +}\ 80.201 +\ 80.202 +static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.203 + OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ 80.204 +}\ 80.205 +\ 80.206 +static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.207 + OPNAME ## 
_pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ 80.208 +}\ 80.209 +\ 80.210 +static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 80.211 + OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ 80.212 +}\ 80.213 +\ 80.214 +static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ 80.215 + int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 80.216 + int i;\ 80.217 + for(i=0; i<h; i++){\ 80.218 + uint32_t a, b, c, d, l0, l1, h0, h1;\ 80.219 + a= AV_RN32(&src1[i*src_stride1]);\ 80.220 + b= AV_RN32(&src2[i*src_stride2]);\ 80.221 + c= AV_RN32(&src3[i*src_stride3]);\ 80.222 + d= AV_RN32(&src4[i*src_stride4]);\ 80.223 + l0= (a&0x03030303UL)\ 80.224 + + (b&0x03030303UL)\ 80.225 + + 0x01010101UL;\ 80.226 + h0= ((a&0xFCFCFCFCUL)>>2)\ 80.227 + + ((b&0xFCFCFCFCUL)>>2);\ 80.228 + l1= (c&0x03030303UL)\ 80.229 + + (d&0x03030303UL);\ 80.230 + h1= ((c&0xFCFCFCFCUL)>>2)\ 80.231 + + ((d&0xFCFCFCFCUL)>>2);\ 80.232 + OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.233 + a= AV_RN32(&src1[i*src_stride1+4]);\ 80.234 + b= AV_RN32(&src2[i*src_stride2+4]);\ 80.235 + c= AV_RN32(&src3[i*src_stride3+4]);\ 80.236 + d= AV_RN32(&src4[i*src_stride4+4]);\ 80.237 + l0= (a&0x03030303UL)\ 80.238 + + (b&0x03030303UL)\ 80.239 + + 0x01010101UL;\ 80.240 + h0= ((a&0xFCFCFCFCUL)>>2)\ 80.241 + + ((b&0xFCFCFCFCUL)>>2);\ 80.242 + l1= (c&0x03030303UL)\ 80.243 + + (d&0x03030303UL);\ 80.244 + h1= ((c&0xFCFCFCFCUL)>>2)\ 80.245 + + ((d&0xFCFCFCFCUL)>>2);\ 80.246 + OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.247 + }\ 80.248 +}\ 80.249 +static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ 80.250 + int dst_stride, int src_stride1, int src_stride2,int 
src_stride3,int src_stride4, int h){\ 80.251 + OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ 80.252 + OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ 80.253 +}\ 80.254 +static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ 80.255 + int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 80.256 + OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ 80.257 + OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ 80.258 +}\ 80.259 +\ 80.260 +static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ 80.261 +{\ 80.262 + int i, a0, b0, a1, b1;\ 80.263 + a0= pixels[0];\ 80.264 + b0= pixels[1] + 2;\ 80.265 + a0 += b0;\ 80.266 + b0 += pixels[2];\ 80.267 +\ 80.268 + pixels+=line_size;\ 80.269 + for(i=0; i<h; i+=2){\ 80.270 + a1= pixels[0];\ 80.271 + b1= pixels[1];\ 80.272 + a1 += b1;\ 80.273 + b1 += pixels[2];\ 80.274 +\ 80.275 + block[0]= (a1+a0)>>2; /* FIXME non put */\ 80.276 + block[1]= (b1+b0)>>2;\ 80.277 +\ 80.278 + pixels+=line_size;\ 80.279 + block +=line_size;\ 80.280 +\ 80.281 + a0= pixels[0];\ 80.282 + b0= pixels[1] + 2;\ 80.283 + a0 += b0;\ 80.284 + b0 += pixels[2];\ 80.285 +\ 80.286 + block[0]= (a1+a0)>>2;\ 80.287 + block[1]= (b1+b0)>>2;\ 80.288 + pixels+=line_size;\ 80.289 + block +=line_size;\ 80.290 + }\ 80.291 +}\ 80.292 +\ 80.293 +static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ 80.294 +{\ 80.295 + int i;\ 80.296 + const uint32_t a= AV_RN32(pixels );\ 80.297 + const uint32_t b= AV_RN32(pixels+1);\ 80.298 + uint32_t l0= 
(a&0x03030303UL)\ 80.299 + + (b&0x03030303UL)\ 80.300 + + 0x02020202UL;\ 80.301 + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ 80.302 + + ((b&0xFCFCFCFCUL)>>2);\ 80.303 + uint32_t l1,h1;\ 80.304 +\ 80.305 + pixels+=line_size;\ 80.306 + for(i=0; i<h; i+=2){\ 80.307 + uint32_t a= AV_RN32(pixels );\ 80.308 + uint32_t b= AV_RN32(pixels+1);\ 80.309 + l1= (a&0x03030303UL)\ 80.310 + + (b&0x03030303UL);\ 80.311 + h1= ((a&0xFCFCFCFCUL)>>2)\ 80.312 + + ((b&0xFCFCFCFCUL)>>2);\ 80.313 + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.314 + pixels+=line_size;\ 80.315 + block +=line_size;\ 80.316 + a= AV_RN32(pixels );\ 80.317 + b= AV_RN32(pixels+1);\ 80.318 + l0= (a&0x03030303UL)\ 80.319 + + (b&0x03030303UL)\ 80.320 + + 0x02020202UL;\ 80.321 + h0= ((a&0xFCFCFCFCUL)>>2)\ 80.322 + + ((b&0xFCFCFCFCUL)>>2);\ 80.323 + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.324 + pixels+=line_size;\ 80.325 + block +=line_size;\ 80.326 + }\ 80.327 +}\ 80.328 +\ 80.329 +static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ 80.330 +{\ 80.331 + int j;\ 80.332 + for(j=0; j<2; j++){\ 80.333 + int i;\ 80.334 + const uint32_t a= AV_RN32(pixels );\ 80.335 + const uint32_t b= AV_RN32(pixels+1);\ 80.336 + uint32_t l0= (a&0x03030303UL)\ 80.337 + + (b&0x03030303UL)\ 80.338 + + 0x02020202UL;\ 80.339 + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ 80.340 + + ((b&0xFCFCFCFCUL)>>2);\ 80.341 + uint32_t l1,h1;\ 80.342 +\ 80.343 + pixels+=line_size;\ 80.344 + for(i=0; i<h; i+=2){\ 80.345 + uint32_t a= AV_RN32(pixels );\ 80.346 + uint32_t b= AV_RN32(pixels+1);\ 80.347 + l1= (a&0x03030303UL)\ 80.348 + + (b&0x03030303UL);\ 80.349 + h1= ((a&0xFCFCFCFCUL)>>2)\ 80.350 + + ((b&0xFCFCFCFCUL)>>2);\ 80.351 + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.352 + pixels+=line_size;\ 80.353 + block +=line_size;\ 80.354 + a= AV_RN32(pixels );\ 80.355 + b= AV_RN32(pixels+1);\ 80.356 + l0= (a&0x03030303UL)\ 80.357 + + (b&0x03030303UL)\ 80.358 + + 
0x02020202UL;\ 80.359 + h0= ((a&0xFCFCFCFCUL)>>2)\ 80.360 + + ((b&0xFCFCFCFCUL)>>2);\ 80.361 + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.362 + pixels+=line_size;\ 80.363 + block +=line_size;\ 80.364 + }\ 80.365 + pixels+=4-line_size*(h+1);\ 80.366 + block +=4-line_size*h;\ 80.367 + }\ 80.368 +}\ 80.369 +\ 80.370 +static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ 80.371 +{\ 80.372 + int j;\ 80.373 + for(j=0; j<2; j++){\ 80.374 + int i;\ 80.375 + const uint32_t a= AV_RN32(pixels );\ 80.376 + const uint32_t b= AV_RN32(pixels+1);\ 80.377 + uint32_t l0= (a&0x03030303UL)\ 80.378 + + (b&0x03030303UL)\ 80.379 + + 0x01010101UL;\ 80.380 + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ 80.381 + + ((b&0xFCFCFCFCUL)>>2);\ 80.382 + uint32_t l1,h1;\ 80.383 +\ 80.384 + pixels+=line_size;\ 80.385 + for(i=0; i<h; i+=2){\ 80.386 + uint32_t a= AV_RN32(pixels );\ 80.387 + uint32_t b= AV_RN32(pixels+1);\ 80.388 + l1= (a&0x03030303UL)\ 80.389 + + (b&0x03030303UL);\ 80.390 + h1= ((a&0xFCFCFCFCUL)>>2)\ 80.391 + + ((b&0xFCFCFCFCUL)>>2);\ 80.392 + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.393 + pixels+=line_size;\ 80.394 + block +=line_size;\ 80.395 + a= AV_RN32(pixels );\ 80.396 + b= AV_RN32(pixels+1);\ 80.397 + l0= (a&0x03030303UL)\ 80.398 + + (b&0x03030303UL)\ 80.399 + + 0x01010101UL;\ 80.400 + h0= ((a&0xFCFCFCFCUL)>>2)\ 80.401 + + ((b&0xFCFCFCFCUL)>>2);\ 80.402 + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ 80.403 + pixels+=line_size;\ 80.404 + block +=line_size;\ 80.405 + }\ 80.406 + pixels+=4-line_size*(h+1);\ 80.407 + block +=4-line_size*h;\ 80.408 + }\ 80.409 +}\ 80.410 +\ 80.411 +CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\ 80.412 + 80.413 +#define op_avg(a, b) a = rnd_avg32(a, b) 80.414 + 80.415 +#define op_put(a, b) a = b 80.416 + 80.417 +PIXOP2(avg, op_avg) 80.418 +PIXOP2(put, op_put) 80.419 +#undef op_avg 80.420 +#undef op_put 80.421 + 80.422 + 
80.423 +#define H264_CHROMA_MC(OPNAME, OP)\ 80.424 +static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ 80.425 + const int A=(8-x)*(8-y);\ 80.426 + const int B=( x)*(8-y);\ 80.427 + const int C=(8-x)*( y);\ 80.428 + const int D=( x)*( y);\ 80.429 + int i;\ 80.430 + \ 80.431 + assert(x<8 && y<8 && x>=0 && y>=0);\ 80.432 +\ 80.433 + if(D){\ 80.434 + for(i=0; i<h; i++){\ 80.435 + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ 80.436 + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ 80.437 + dst+= stride;\ 80.438 + src+= stride;\ 80.439 + }\ 80.440 + }else{\ 80.441 + const int E= B+C;\ 80.442 + const int step= C ? stride : 1;\ 80.443 + for(i=0; i<h; i++){\ 80.444 + OP(dst[0], (A*src[0] + E*src[step+0]));\ 80.445 + OP(dst[1], (A*src[1] + E*src[step+1]));\ 80.446 + dst+= stride;\ 80.447 + src+= stride;\ 80.448 + }\ 80.449 + }\ 80.450 +}\ 80.451 +\ 80.452 +static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ 80.453 + const int A=(8-x)*(8-y);\ 80.454 + const int B=( x)*(8-y);\ 80.455 + const int C=(8-x)*( y);\ 80.456 + const int D=( x)*( y);\ 80.457 + int i;\ 80.458 + \ 80.459 + assert(x<8 && y<8 && x>=0 && y>=0);\ 80.460 +\ 80.461 + if(D){\ 80.462 + for(i=0; i<h; i++){\ 80.463 + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ 80.464 + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ 80.465 + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\ 80.466 + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\ 80.467 + dst+= stride;\ 80.468 + src+= stride;\ 80.469 + }\ 80.470 + }else{\ 80.471 + const int E= B+C;\ 80.472 + const int step= C ? 
stride : 1;\ 80.473 + for(i=0; i<h; i++){\ 80.474 + OP(dst[0], (A*src[0] + E*src[step+0]));\ 80.475 + OP(dst[1], (A*src[1] + E*src[step+1]));\ 80.476 + OP(dst[2], (A*src[2] + E*src[step+2]));\ 80.477 + OP(dst[3], (A*src[3] + E*src[step+3]));\ 80.478 + dst+= stride;\ 80.479 + src+= stride;\ 80.480 + }\ 80.481 + }\ 80.482 +}\ 80.483 +\ 80.484 +static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ 80.485 + const int A=(8-x)*(8-y);\ 80.486 + const int B=( x)*(8-y);\ 80.487 + const int C=(8-x)*( y);\ 80.488 + const int D=( x)*( y);\ 80.489 + int i;\ 80.490 + \ 80.491 + assert(x<8 && y<8 && x>=0 && y>=0);\ 80.492 +\ 80.493 + if(D){\ 80.494 + for(i=0; i<h; i++){\ 80.495 + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ 80.496 + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ 80.497 + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\ 80.498 + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\ 80.499 + OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\ 80.500 + OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\ 80.501 + OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\ 80.502 + OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\ 80.503 + dst+= stride;\ 80.504 + src+= stride;\ 80.505 + }\ 80.506 + }else{\ 80.507 + const int E= B+C;\ 80.508 + const int step= C ? 
stride : 1;\ 80.509 + for(i=0; i<h; i++){\ 80.510 + OP(dst[0], (A*src[0] + E*src[step+0]));\ 80.511 + OP(dst[1], (A*src[1] + E*src[step+1]));\ 80.512 + OP(dst[2], (A*src[2] + E*src[step+2]));\ 80.513 + OP(dst[3], (A*src[3] + E*src[step+3]));\ 80.514 + OP(dst[4], (A*src[4] + E*src[step+4]));\ 80.515 + OP(dst[5], (A*src[5] + E*src[step+5]));\ 80.516 + OP(dst[6], (A*src[6] + E*src[step+6]));\ 80.517 + OP(dst[7], (A*src[7] + E*src[step+7]));\ 80.518 + dst+= stride;\ 80.519 + src+= stride;\ 80.520 + }\ 80.521 + }\ 80.522 +} 80.523 + 80.524 +#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1) 80.525 +#define op_put(a, b) a = (((b) + 32)>>6) 80.526 + 80.527 +H264_CHROMA_MC(put_ , op_put) 80.528 +H264_CHROMA_MC(avg_ , op_avg) 80.529 +#undef op_avg 80.530 +#undef op_put 80.531 + 80.532 + 80.533 +#define H264_LOWPASS(OPNAME, OP, OP2) \ 80.534 +static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 80.535 + const int h=2;\ 80.536 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ 80.537 + int i;\ 80.538 + for(i=0; i<h; i++)\ 80.539 + {\ 80.540 + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ 80.541 + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ 80.542 + dst+=dstStride;\ 80.543 + src+=srcStride;\ 80.544 + }\ 80.545 +}\ 80.546 +\ 80.547 +static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 80.548 + const int w=2;\ 80.549 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ 80.550 + int i;\ 80.551 + for(i=0; i<w; i++)\ 80.552 + {\ 80.553 + const int srcB= src[-2*srcStride];\ 80.554 + const int srcA= src[-1*srcStride];\ 80.555 + const int src0= src[0 *srcStride];\ 80.556 + const int src1= src[1 *srcStride];\ 80.557 + const int src2= src[2 *srcStride];\ 80.558 + const int src3= src[3 *srcStride];\ 80.559 + const int src4= src[4 *srcStride];\ 80.560 + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ 80.561 
+ OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ 80.562 + dst++;\ 80.563 + src++;\ 80.564 + }\ 80.565 +}\ 80.566 +\ 80.567 +static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 80.568 + const int h=2;\ 80.569 + const int w=2;\ 80.570 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ 80.571 + int i;\ 80.572 + src -= 2*srcStride;\ 80.573 + for(i=0; i<h+5; i++)\ 80.574 + {\ 80.575 + tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ 80.576 + tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ 80.577 + tmp+=tmpStride;\ 80.578 + src+=srcStride;\ 80.579 + }\ 80.580 + tmp -= tmpStride*(h+5-2);\ 80.581 + for(i=0; i<w; i++)\ 80.582 + {\ 80.583 + const int tmpB= tmp[-2*tmpStride];\ 80.584 + const int tmpA= tmp[-1*tmpStride];\ 80.585 + const int tmp0= tmp[0 *tmpStride];\ 80.586 + const int tmp1= tmp[1 *tmpStride];\ 80.587 + const int tmp2= tmp[2 *tmpStride];\ 80.588 + const int tmp3= tmp[3 *tmpStride];\ 80.589 + const int tmp4= tmp[4 *tmpStride];\ 80.590 + OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ 80.591 + OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ 80.592 + dst++;\ 80.593 + tmp++;\ 80.594 + }\ 80.595 +}\ 80.596 +static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 80.597 + const int h=4;\ 80.598 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ 80.599 + int i;\ 80.600 + for(i=0; i<h; i++)\ 80.601 + {\ 80.602 + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ 80.603 + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ 80.604 + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\ 80.605 + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\ 80.606 + dst+=dstStride;\ 80.607 + src+=srcStride;\ 80.608 + }\ 80.609 +}\ 80.610 +\ 80.611 +static void OPNAME ## 
h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 80.612 + const int w=4;\ 80.613 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ 80.614 + int i;\ 80.615 + for(i=0; i<w; i++)\ 80.616 + {\ 80.617 + const int srcB= src[-2*srcStride];\ 80.618 + const int srcA= src[-1*srcStride];\ 80.619 + const int src0= src[0 *srcStride];\ 80.620 + const int src1= src[1 *srcStride];\ 80.621 + const int src2= src[2 *srcStride];\ 80.622 + const int src3= src[3 *srcStride];\ 80.623 + const int src4= src[4 *srcStride];\ 80.624 + const int src5= src[5 *srcStride];\ 80.625 + const int src6= src[6 *srcStride];\ 80.626 + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ 80.627 + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ 80.628 + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ 80.629 + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ 80.630 + dst++;\ 80.631 + src++;\ 80.632 + }\ 80.633 +}\ 80.634 +\ 80.635 +static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 80.636 + const int h=4;\ 80.637 + const int w=4;\ 80.638 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ 80.639 + int i;\ 80.640 + src -= 2*srcStride;\ 80.641 + for(i=0; i<h+5; i++)\ 80.642 + {\ 80.643 + tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ 80.644 + tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ 80.645 + tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\ 80.646 + tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\ 80.647 + tmp+=tmpStride;\ 80.648 + src+=srcStride;\ 80.649 + }\ 80.650 + tmp -= tmpStride*(h+5-2);\ 80.651 + for(i=0; i<w; i++)\ 80.652 + {\ 80.653 + const int tmpB= tmp[-2*tmpStride];\ 80.654 + const int tmpA= tmp[-1*tmpStride];\ 80.655 + const int tmp0= tmp[0 *tmpStride];\ 80.656 + const int tmp1= tmp[1 *tmpStride];\ 80.657 + const int tmp2= tmp[2 
*tmpStride];\ 80.658 + const int tmp3= tmp[3 *tmpStride];\ 80.659 + const int tmp4= tmp[4 *tmpStride];\ 80.660 + const int tmp5= tmp[5 *tmpStride];\ 80.661 + const int tmp6= tmp[6 *tmpStride];\ 80.662 + OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ 80.663 + OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ 80.664 + OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\ 80.665 + OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\ 80.666 + dst++;\ 80.667 + tmp++;\ 80.668 + }\ 80.669 +}\ 80.670 +\ 80.671 +static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 80.672 + const int h=8;\ 80.673 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ 80.674 + int i;\ 80.675 + for(i=0; i<h; i++)\ 80.676 + {\ 80.677 + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\ 80.678 + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\ 80.679 + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\ 80.680 + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\ 80.681 + OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\ 80.682 + OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\ 80.683 + OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\ 80.684 + OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\ 80.685 + dst+=dstStride;\ 80.686 + src+=srcStride;\ 80.687 + }\ 80.688 +}\ 80.689 +\ 80.690 +static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 80.691 + const int w=8;\ 80.692 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ 80.693 + int i;\ 80.694 + for(i=0; i<w; i++)\ 80.695 + {\ 80.696 + const int srcB= src[-2*srcStride];\ 80.697 + const int srcA= src[-1*srcStride];\ 80.698 + const int src0= src[0 *srcStride];\ 80.699 + const int src1= src[1 *srcStride];\ 
80.700 + const int src2= src[2 *srcStride];\ 80.701 + const int src3= src[3 *srcStride];\ 80.702 + const int src4= src[4 *srcStride];\ 80.703 + const int src5= src[5 *srcStride];\ 80.704 + const int src6= src[6 *srcStride];\ 80.705 + const int src7= src[7 *srcStride];\ 80.706 + const int src8= src[8 *srcStride];\ 80.707 + const int src9= src[9 *srcStride];\ 80.708 + const int src10=src[10*srcStride];\ 80.709 + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ 80.710 + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ 80.711 + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ 80.712 + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ 80.713 + OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\ 80.714 + OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\ 80.715 + OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\ 80.716 + OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\ 80.717 + dst++;\ 80.718 + src++;\ 80.719 + }\ 80.720 +}\ 80.721 +\ 80.722 +static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 80.723 + const int h=8;\ 80.724 + const int w=8;\ 80.725 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ 80.726 + int i;\ 80.727 + src -= 2*srcStride;\ 80.728 + for(i=0; i<h+5; i++)\ 80.729 + {\ 80.730 + tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\ 80.731 + tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\ 80.732 + tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\ 80.733 + tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\ 80.734 + tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\ 80.735 + tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\ 80.736 + tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\ 80.737 + tmp[7]= 
(src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\ 80.738 + tmp+=tmpStride;\ 80.739 + src+=srcStride;\ 80.740 + }\ 80.741 + tmp -= tmpStride*(h+5-2);\ 80.742 + for(i=0; i<w; i++)\ 80.743 + {\ 80.744 + const int tmpB= tmp[-2*tmpStride];\ 80.745 + const int tmpA= tmp[-1*tmpStride];\ 80.746 + const int tmp0= tmp[0 *tmpStride];\ 80.747 + const int tmp1= tmp[1 *tmpStride];\ 80.748 + const int tmp2= tmp[2 *tmpStride];\ 80.749 + const int tmp3= tmp[3 *tmpStride];\ 80.750 + const int tmp4= tmp[4 *tmpStride];\ 80.751 + const int tmp5= tmp[5 *tmpStride];\ 80.752 + const int tmp6= tmp[6 *tmpStride];\ 80.753 + const int tmp7= tmp[7 *tmpStride];\ 80.754 + const int tmp8= tmp[8 *tmpStride];\ 80.755 + const int tmp9= tmp[9 *tmpStride];\ 80.756 + const int tmp10=tmp[10*tmpStride];\ 80.757 + OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ 80.758 + OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ 80.759 + OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\ 80.760 + OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\ 80.761 + OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\ 80.762 + OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\ 80.763 + OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\ 80.764 + OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\ 80.765 + dst++;\ 80.766 + tmp++;\ 80.767 + }\ 80.768 +}\ 80.769 +\ 80.770 +static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 80.771 + OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\ 80.772 + OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\ 80.773 + src += 8*srcStride;\ 80.774 + dst += 8*dstStride;\ 80.775 + OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\ 80.776 + OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\ 80.777 +}\ 80.778 +\ 80.779 
+static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 80.780 + OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\ 80.781 + OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\ 80.782 + src += 8*srcStride;\ 80.783 + dst += 8*dstStride;\ 80.784 + OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\ 80.785 + OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\ 80.786 +}\ 80.787 +\ 80.788 +static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 80.789 + OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\ 80.790 + OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ 80.791 + src += 8*srcStride;\ 80.792 + dst += 8*dstStride;\ 80.793 + OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\ 80.794 + OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ 80.795 +}\ 80.796 + 80.797 +#define H264_MC(OPNAME, SIZE) \ 80.798 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\ 80.799 + OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\ 80.800 +}\ 80.801 +\ 80.802 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\ 80.803 + uint8_t half[SIZE*SIZE];\ 80.804 + put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\ 80.805 + OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\ 80.806 +}\ 80.807 +\ 80.808 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\ 80.809 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\ 80.810 +}\ 80.811 +\ 80.812 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\ 80.813 + uint8_t half[SIZE*SIZE];\ 80.814 + 
put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\ 80.815 + OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\ 80.816 +}\ 80.817 +\ 80.818 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\ 80.819 + uint8_t full[SIZE*(SIZE+5)];\ 80.820 + uint8_t * const full_mid= full + SIZE*2;\ 80.821 + uint8_t half[SIZE*SIZE];\ 80.822 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ 80.823 + put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\ 80.824 + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\ 80.825 +}\ 80.826 +\ 80.827 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\ 80.828 + uint8_t full[SIZE*(SIZE+5)];\ 80.829 + uint8_t * const full_mid= full + SIZE*2;\ 80.830 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ 80.831 + OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\ 80.832 +}\ 80.833 +\ 80.834 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\ 80.835 + uint8_t full[SIZE*(SIZE+5)];\ 80.836 + uint8_t * const full_mid= full + SIZE*2;\ 80.837 + uint8_t half[SIZE*SIZE];\ 80.838 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ 80.839 + put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\ 80.840 + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\ 80.841 +}\ 80.842 +\ 80.843 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\ 80.844 + uint8_t full[SIZE*(SIZE+5)];\ 80.845 + uint8_t * const full_mid= full + SIZE*2;\ 80.846 + uint8_t halfH[SIZE*SIZE];\ 80.847 + uint8_t halfV[SIZE*SIZE];\ 80.848 + put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ 80.849 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ 80.850 + put_h264_qpel ## SIZE ## _v_lowpass(halfV, 
full_mid, SIZE, SIZE);\ 80.851 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ 80.852 +}\ 80.853 +\ 80.854 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\ 80.855 + uint8_t full[SIZE*(SIZE+5)];\ 80.856 + uint8_t * const full_mid= full + SIZE*2;\ 80.857 + uint8_t halfH[SIZE*SIZE];\ 80.858 + uint8_t halfV[SIZE*SIZE];\ 80.859 + put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ 80.860 + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ 80.861 + put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ 80.862 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ 80.863 +}\ 80.864 +\ 80.865 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\ 80.866 + uint8_t full[SIZE*(SIZE+5)];\ 80.867 + uint8_t * const full_mid= full + SIZE*2;\ 80.868 + uint8_t halfH[SIZE*SIZE];\ 80.869 + uint8_t halfV[SIZE*SIZE];\ 80.870 + put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ 80.871 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ 80.872 + put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ 80.873 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ 80.874 +}\ 80.875 +\ 80.876 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\ 80.877 + uint8_t full[SIZE*(SIZE+5)];\ 80.878 + uint8_t * const full_mid= full + SIZE*2;\ 80.879 + uint8_t halfH[SIZE*SIZE];\ 80.880 + uint8_t halfV[SIZE*SIZE];\ 80.881 + put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ 80.882 + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ 80.883 + put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ 80.884 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ 80.885 +}\ 80.886 +\ 80.887 +static void OPNAME ## h264_qpel ## 
SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\ 80.888 + int16_t tmp[SIZE*(SIZE+5)];\ 80.889 + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\ 80.890 +}\ 80.891 +\ 80.892 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\ 80.893 + int16_t tmp[SIZE*(SIZE+5)];\ 80.894 + uint8_t halfH[SIZE*SIZE];\ 80.895 + uint8_t halfHV[SIZE*SIZE];\ 80.896 + put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ 80.897 + put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ 80.898 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ 80.899 +}\ 80.900 +\ 80.901 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\ 80.902 + int16_t tmp[SIZE*(SIZE+5)];\ 80.903 + uint8_t halfH[SIZE*SIZE];\ 80.904 + uint8_t halfHV[SIZE*SIZE];\ 80.905 + put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ 80.906 + put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ 80.907 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ 80.908 +}\ 80.909 +\ 80.910 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\ 80.911 + uint8_t full[SIZE*(SIZE+5)];\ 80.912 + uint8_t * const full_mid= full + SIZE*2;\ 80.913 + int16_t tmp[SIZE*(SIZE+5)];\ 80.914 + uint8_t halfV[SIZE*SIZE];\ 80.915 + uint8_t halfHV[SIZE*SIZE];\ 80.916 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ 80.917 + put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ 80.918 + put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ 80.919 + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ 80.920 +}\ 80.921 +\ 80.922 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\ 80.923 + uint8_t full[SIZE*(SIZE+5)];\ 80.924 + uint8_t * const 
full_mid= full + SIZE*2;\ 80.925 + int16_t tmp[SIZE*(SIZE+5)];\ 80.926 + uint8_t halfV[SIZE*SIZE];\ 80.927 + uint8_t halfHV[SIZE*SIZE];\ 80.928 + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ 80.929 + put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ 80.930 + put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ 80.931 + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ 80.932 +}\ 80.933 + 80.934 +#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1) 80.935 +#define op_put(a, b) a = cm[((b) + 16)>>5] 80.936 +#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1) 80.937 +#define op2_put(a, b) a = cm[((b) + 512)>>10] 80.938 + 80.939 +H264_LOWPASS(put_ , op_put, op2_put) 80.940 +H264_LOWPASS(avg_ , op_avg, op2_avg) 80.941 +H264_MC(put_, 2) 80.942 +H264_MC(put_, 4) 80.943 +H264_MC(put_, 8) 80.944 +H264_MC(put_, 16) 80.945 +H264_MC(avg_, 4) 80.946 +H264_MC(avg_, 8) 80.947 +H264_MC(avg_, 16) 80.948 + 80.949 +#undef op_avg 80.950 +#undef op_put 80.951 +#undef op2_avg 80.952 +#undef op2_put 80.953 + 80.954 +static void clear_block_c(DCTELEM *block) 80.955 +{ 80.956 + memset(block, 0, sizeof(DCTELEM)*64); 80.957 +} 80.958 + 80.959 +/** 80.960 + * memset(blocks, 0, sizeof(DCTELEM)*6*64) 80.961 + */ 80.962 +static void clear_blocks_c(DCTELEM *blocks) 80.963 +{ 80.964 + memset(blocks, 0, sizeof(DCTELEM)*6*64); 80.965 +} 80.966 + 80.967 +static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; } 80.968 + 80.969 +/* init static data */ 80.970 +av_cold void dsputil_static_init(void) 80.971 +{ 80.972 + int i; 80.973 + 80.974 + for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; 80.975 + for(i=0;i<MAX_NEG_CROP;i++) { 80.976 + ff_cropTbl[i] = 0; 80.977 + ff_cropTbl[i + MAX_NEG_CROP + 256] = 255; 80.978 + } 80.979 + 80.980 + for(i=0;i<512;i++) { 80.981 + ff_squareTbl[i] = (i - 256) * (i - 256); 80.982 + } 80.983 +} 80.984 + 80.985 +int 
ff_check_alignment(void){ 80.986 + static int did_fail=0; 80.987 + DECLARE_ALIGNED(16, int, aligned); 80.988 + 80.989 + if((intptr_t)&aligned & 15){ 80.990 + if(!did_fail){ 80.991 +#if HAVE_MMX || HAVE_ALTIVEC 80.992 + av_log(AV_LOG_ERROR, 80.993 + "Compiler did not align stack variables. Libavcodec has been miscompiled\n" 80.994 + "and may be very slow or crash. This is not a bug in libavcodec,\n" 80.995 + "but in the compiler. You may try recompiling using gcc >= 4.2.\n" 80.996 + "Do not report crashes to FFmpeg developers.\n"); 80.997 +#endif 80.998 + did_fail=1; 80.999 + } 80.1000 + return -1; 80.1001 + } 80.1002 + return 0; 80.1003 +} 80.1004 + 80.1005 +av_cold void dsputil_init(DSPContext* c) 80.1006 +{ 80.1007 + (void) avg_pixels2_c; // kill a warning, avg_pixels2_c is a macro created function. 80.1008 + ff_check_alignment(); 80.1009 + dsputil_static_init(); 80.1010 + 80.1011 + c->idct_put= ff_simple_idct_put; 80.1012 + c->idct_add= ff_simple_idct_add; 80.1013 + c->idct = ff_simple_idct; 80.1014 + 80.1015 + c->clear_block = clear_block_c; 80.1016 + c->clear_blocks = clear_blocks_c; 80.1017 + 80.1018 +#define dspfunc(PFX, IDX, NUM) \ 80.1019 + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \ 80.1020 + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \ 80.1021 + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \ 80.1022 + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \ 80.1023 + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \ 80.1024 + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \ 80.1025 + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \ 80.1026 + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \ 80.1027 + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \ 80.1028 + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \ 80.1029 + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \ 80.1030 + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \ 80.1031 + c->PFX ## 
_pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \ 80.1032 + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \ 80.1033 + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \ 80.1034 + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c 80.1035 + 80.1036 + 80.1037 + dspfunc(put_h264_qpel, 0, 16); 80.1038 + dspfunc(put_h264_qpel, 1, 8); 80.1039 + dspfunc(put_h264_qpel, 2, 4); 80.1040 + dspfunc(put_h264_qpel, 3, 2); 80.1041 + dspfunc(avg_h264_qpel, 0, 16); 80.1042 + dspfunc(avg_h264_qpel, 1, 8); 80.1043 + dspfunc(avg_h264_qpel, 2, 4); 80.1044 + 80.1045 +#undef dspfunc 80.1046 + c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c; 80.1047 + c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c; 80.1048 + c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c; 80.1049 + c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c; 80.1050 + c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c; 80.1051 + c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c; 80.1052 + 80.1053 + 80.1054 + c->prefetch= just_return; 80.1055 + 80.1056 + if (HAVE_MMX) dsputil_init_mmx (c); 80.1057 + if (ARCH_ARM) dsputil_init_arm (c); 80.1058 + if (HAVE_ALTIVEC) dsputil_init_ppc (c); //fixme PPC prefetch 80.1059 +} 80.1060 +
81.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 81.2 +++ b/ffmpeg_smp/h264dec/libavcodec/dsputil.h Mon Aug 27 12:09:56 2012 +0200 81.3 @@ -0,0 +1,465 @@ 81.4 +/* 81.5 + * DSP utils 81.6 + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard 81.7 + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 81.8 + * 81.9 + * This file is part of FFmpeg. 81.10 + * 81.11 + * FFmpeg is free software; you can redistribute it and/or 81.12 + * modify it under the terms of the GNU Lesser General Public 81.13 + * License as published by the Free Software Foundation; either 81.14 + * version 2.1 of the License, or (at your option) any later version. 81.15 + * 81.16 + * FFmpeg is distributed in the hope that it will be useful, 81.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 81.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 81.19 + * Lesser General Public License for more details. 81.20 + * 81.21 + * You should have received a copy of the GNU Lesser General Public 81.22 + * License along with FFmpeg; if not, write to the Free Software 81.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 81.24 + */ 81.25 + 81.26 +/** 81.27 + * @file 81.28 + * DSP utils. 
81.29 + * note, many functions in here may use MMX which trashes the FPU state, it is 81.30 + * absolutely necessary to call emms_c() between dsp & float/double code 81.31 + */ 81.32 + 81.33 +#ifndef AVCODEC_DSPUTIL_H 81.34 +#define AVCODEC_DSPUTIL_H 81.35 + 81.36 +#include "libavutil/intreadwrite.h" 81.37 +#include "avcodec.h" 81.38 +#include "h264_idct.h" 81.39 +// 81.40 +void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, 81.41 + const float *win, float add_bias, int len); 81.42 +void ff_float_to_int16_c(int16_t *dst, const float *src, long len); 81.43 +void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels); 81.44 + 81.45 +/* encoding scans */ 81.46 +extern const uint8_t ff_alternate_horizontal_scan[64]; 81.47 +extern const uint8_t ff_alternate_vertical_scan[64]; 81.48 +extern const uint8_t ff_zigzag_direct[64]; 81.49 +extern const uint8_t ff_zigzag248_direct[64]; 81.50 + 81.51 +/* pixel operations */ 81.52 +#define MAX_NEG_CROP 1024 81.53 + 81.54 +/* temporary */ 81.55 +extern uint32_t ff_squareTbl[512]; 81.56 +extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP]; 81.57 + 81.58 +/* VP3 DSP functions */ 81.59 +void ff_vp3_idct_c(DCTELEM *block/* align 16*/); 81.60 +void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); 81.61 +void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); 81.62 +void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/); 81.63 + 81.64 +void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values); 81.65 +void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values); 81.66 + 81.67 +/* VP6 DSP functions */ 81.68 +void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride, 81.69 + const int16_t *h_weights, const int16_t *v_weights); 81.70 + 81.71 +/* Bink functions */ 81.72 +void ff_bink_idct_c (DCTELEM *block); 81.73 +void 
ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block); 81.74 +void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block); 81.75 + 81.76 +/* CAVS functions */ 81.77 +void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride); 81.78 +void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride); 81.79 +void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride); 81.80 +void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride); 81.81 + 81.82 +/* VC1 functions */ 81.83 +void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd); 81.84 +void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd); 81.85 + 81.86 +/* EA functions */ 81.87 +void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block); 81.88 + 81.89 +/* 1/2^n downscaling functions from imgconvert.c */ 81.90 +void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); 81.91 +void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); 81.92 +void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); 81.93 +void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); 81.94 + 81.95 +void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 81.96 + int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); 81.97 + 81.98 +/* minimum alignment rules ;) 81.99 +If you notice errors in the align stuff, need more alignment for some ASM code 81.100 +for some CPU or need to use a function with less aligned data then send a mail 81.101 +to the ffmpeg-devel mailing list, ... 81.102 + 81.103 +!warning These alignments might not match reality, (missing attribute((align)) 81.104 +stuff somewhere possible). 
81.105 +I (Michael) did not check them, these are just the alignments which I think 81.106 +could be reached easily ... 81.107 + 81.108 +!future video codecs might need functions with less strict alignment 81.109 +*/ 81.110 + 81.111 +/* 81.112 +void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size); 81.113 +void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride); 81.114 +void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); 81.115 +void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); 81.116 +void clear_blocks_c(DCTELEM *blocks); 81.117 +*/ 81.118 + 81.119 +/* add and put pixel (decoding) */ 81.120 +// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16 81.121 +//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller then 4 81.122 +typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h); 81.123 +typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h); 81.124 +typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); 81.125 +typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); 81.126 + 81.127 +typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h); 81.128 + 81.129 +#define DEF_OLD_QPEL(name)\ 81.130 +void ff_put_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ 81.131 +void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ 81.132 +void ff_avg_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); 81.133 + 81.134 +DEF_OLD_QPEL(qpel16_mc11_old_c) 81.135 +DEF_OLD_QPEL(qpel16_mc31_old_c) 81.136 
+DEF_OLD_QPEL(qpel16_mc12_old_c) 81.137 +DEF_OLD_QPEL(qpel16_mc32_old_c) 81.138 +DEF_OLD_QPEL(qpel16_mc13_old_c) 81.139 +DEF_OLD_QPEL(qpel16_mc33_old_c) 81.140 +DEF_OLD_QPEL(qpel8_mc11_old_c) 81.141 +DEF_OLD_QPEL(qpel8_mc31_old_c) 81.142 +DEF_OLD_QPEL(qpel8_mc12_old_c) 81.143 +DEF_OLD_QPEL(qpel8_mc32_old_c) 81.144 +DEF_OLD_QPEL(qpel8_mc13_old_c) 81.145 +DEF_OLD_QPEL(qpel8_mc33_old_c) 81.146 + 81.147 +#define CALL_2X_PIXELS(a, b, n)\ 81.148 +static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ 81.149 + b(block , pixels , line_size, h);\ 81.150 + b(block+n, pixels+n, line_size, h);\ 81.151 +} 81.152 + 81.153 +/* motion estimation */ 81.154 +// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller then 2 81.155 +// although currently h<4 is not used as functions with width <8 are neither used nor implemented 81.156 +typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/; 81.157 + 81.158 +/** 81.159 + * Scantable. 81.160 + */ 81.161 +typedef struct ScanTable{ 81.162 + const uint8_t *scantable; 81.163 + uint8_t permutated[64]; 81.164 + uint8_t raster_end[64]; 81.165 +#if ARCH_PPC 81.166 + /** Used by dct_quantize_altivec to find last-non-zero */ 81.167 + DECLARE_ALIGNED(16, uint8_t, inverse)[64]; 81.168 +#endif 81.169 +} ScanTable; 81.170 + 81.171 +void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable); 81.172 + 81.173 +void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, 81.174 + int block_w, int block_h, 81.175 + int src_x, int src_y, int w, int h); 81.176 + 81.177 + 81.178 +/** 81.179 + * DSPContext. 
81.180 + */ 81.181 +typedef struct DSPContext { 81.182 + /* pixel ops : interface with DCT */ 81.183 + void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size); 81.184 + void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride); 81.185 + void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); 81.186 + void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); 81.187 + void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); 81.188 + void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); 81.189 + void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size); 81.190 + void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size); 81.191 + 81.192 + void (*clear_block)(DCTELEM *block/*align 16*/); 81.193 + void (*clear_blocks)(DCTELEM *blocks/*align 16*/); 81.194 + 81.195 + 81.196 + /** 81.197 + * Halfpel motion compensation with rounding (a+b+1)>>1. 81.198 + * this is an array[4][4] of motion compensation functions for 4 81.199 + * horizontal blocksizes (8,16) and the 4 halfpel positions<br> 81.200 + * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] 81.201 + * @param block destination where the result is stored 81.202 + * @param pixels source 81.203 + * @param line_size number of bytes in a horizontal line of block 81.204 + * @param h height 81.205 + */ 81.206 + op_pixels_func put_pixels_tab[4][4]; 81.207 + 81.208 + /** 81.209 + * Halfpel motion compensation with rounding (a+b+1)>>1. 
81.210 + * This is an array[4][4] of motion compensation functions for 4 81.211 + * horizontal blocksizes (8,16) and the 4 halfpel positions<br> 81.212 + * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] 81.213 + * @param block destination into which the result is averaged (a+b+1)>>1 81.214 + * @param pixels source 81.215 + * @param line_size number of bytes in a horizontal line of block 81.216 + * @param h height 81.217 + */ 81.218 + op_pixels_func avg_pixels_tab[4][4]; 81.219 + 81.220 + /** 81.221 + * Halfpel motion compensation with no rounding (a+b)>>1. 81.222 + * this is an array[2][4] of motion compensation functions for 2 81.223 + * horizontal blocksizes (8,16) and the 4 halfpel positions<br> 81.224 + * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] 81.225 + * @param block destination where the result is stored 81.226 + * @param pixels source 81.227 + * @param line_size number of bytes in a horizontal line of block 81.228 + * @param h height 81.229 + */ 81.230 + op_pixels_func put_no_rnd_pixels_tab[4][4]; 81.231 + 81.232 + /** 81.233 + * Halfpel motion compensation with no rounding (a+b)>>1. 
81.234 + * this is an array[2][4] of motion compensation functions for 2 81.235 + * horizontal blocksizes (8,16) and the 4 halfpel positions<br> 81.236 + * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] 81.237 + * @param block destination into which the result is averaged (a+b)>>1 81.238 + * @param pixels source 81.239 + * @param line_size number of bytes in a horizontal line of block 81.240 + * @param h height 81.241 + */ 81.242 + op_pixels_func avg_no_rnd_pixels_tab[4][4]; 81.243 + 81.244 + void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h); 81.245 + 81.246 + 81.247 + qpel_mc_func put_qpel_pixels_tab[2][16]; 81.248 + qpel_mc_func avg_qpel_pixels_tab[2][16]; 81.249 + qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16]; 81.250 + qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16]; 81.251 + qpel_mc_func put_mspel_pixels_tab[8]; 81.252 + 81.253 + /** 81.254 + * h264 Chroma MC 81.255 + */ 81.256 + h264_chroma_mc_func put_h264_chroma_pixels_tab[3]; 81.257 + h264_chroma_mc_func avg_h264_chroma_pixels_tab[3]; 81.258 + /* This is really one func used in VC-1 decoding */ 81.259 + h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3]; 81.260 + h264_chroma_mc_func avg_no_rnd_vc1_chroma_pixels_tab[3]; 81.261 + 81.262 + qpel_mc_func put_h264_qpel_pixels_tab[4][16]; 81.263 + qpel_mc_func avg_h264_qpel_pixels_tab[4][16]; 81.264 + 81.265 + qpel_mc_func put_2tap_qpel_pixels_tab[4][16]; 81.266 + qpel_mc_func avg_2tap_qpel_pixels_tab[4][16]; 81.267 + 81.268 + 81.269 + /* (I)DCT */ 81.270 + void (*fdct)(DCTELEM *block/* align 16*/); 81.271 + void (*fdct248)(DCTELEM *block/* align 16*/); 81.272 + 81.273 + /* IDCT really*/ 81.274 + void (*idct)(DCTELEM *block/* align 16*/); 81.275 + 81.276 + /** 81.277 + * block -> idct -> clip to unsigned 8 bit -> dest. 81.278 + * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...) 
81.279 + * @param line_size size in bytes of a horizontal line of dest 81.280 + */ 81.281 + void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); 81.282 + 81.283 + /** 81.284 + * block -> idct -> add dest -> clip to unsigned 8 bit -> dest. 81.285 + * @param line_size size in bytes of a horizontal line of dest 81.286 + */ 81.287 + void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); 81.288 + 81.289 + void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w); 81.290 +#define EDGE_WIDTH 32 81.291 + 81.292 + void (*prefetch)(void *mem, int stride, int h); 81.293 + 81.294 +} DSPContext; 81.295 + 81.296 +void dsputil_static_init(void); 81.297 +void dsputil_init(DSPContext* p); 81.298 + 81.299 +int ff_check_alignment(void); 81.300 + 81.301 +/** 81.302 + * permute block according to permuatation. 81.303 + * @param last last non zero element in scantable order 81.304 + */ 81.305 +void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last); 81.306 + 81.307 +void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type); 81.308 + 81.309 +#define BYTE_VEC32(c) ((c)*0x01010101UL) 81.310 + 81.311 +static inline uint32_t rnd_avg32(uint32_t a, uint32_t b) 81.312 +{ 81.313 + return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1); 81.314 +} 81.315 + 81.316 +static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b) 81.317 +{ 81.318 + return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1); 81.319 +} 81.320 + 81.321 + 81.322 +/** 81.323 + * Empty mmx state. 81.324 + * this must be called between any dsp function and float/double code. 
81.325 + * for example sin(); dsp->idct_put(); emms_c(); cos() 81.326 + */ 81.327 +#define emms_c() 81.328 + 81.329 +/* should be defined by architectures supporting 81.330 + one or more MultiMedia extension */ 81.331 +int mm_support(void); 81.332 +extern int mm_flags; 81.333 + 81.334 +void dsputil_init_arm(DSPContext* c); 81.335 +void dsputil_init_mmx(DSPContext* c); 81.336 +void dsputil_init_ppc(DSPContext* c); 81.337 + 81.338 +void ff_dsputil_init_dwt(DSPContext *c); 81.339 + 81.340 +#if HAVE_MMX 81.341 + 81.342 +#undef emms_c 81.343 + 81.344 +static inline void emms(void) 81.345 +{ 81.346 + __asm__ volatile ("emms;":::"memory"); 81.347 +} 81.348 + 81.349 + 81.350 +#define emms_c() \ 81.351 +{\ 81.352 + if (mm_flags & FF_MM_MMX)\ 81.353 + emms();\ 81.354 +} 81.355 + 81.356 +#elif ARCH_ARM 81.357 + 81.358 +#if HAVE_NEON 81.359 +# define STRIDE_ALIGN 16 81.360 +#endif 81.361 + 81.362 +#elif ARCH_PPC || ARCH_PPC64 || ARCH_CELL 81.363 + 81.364 +#define STRIDE_ALIGN 16 81.365 + 81.366 +#endif 81.367 + 81.368 +#ifndef STRIDE_ALIGN 81.369 +# define STRIDE_ALIGN 8 81.370 +#endif 81.371 + 81.372 +#define WRAPPER8_16(name8, name16)\ 81.373 +static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\ 81.374 + return name8(s, dst , src , stride, h)\ 81.375 + +name8(s, dst+8 , src+8 , stride, h);\ 81.376 +} 81.377 + 81.378 +#define WRAPPER8_16_SQ(name8, name16)\ 81.379 +static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\ 81.380 + int score=0;\ 81.381 + score +=name8(s, dst , src , stride, 8);\ 81.382 + score +=name8(s, dst+8 , src+8 , stride, 8);\ 81.383 + if(h==16){\ 81.384 + dst += 8*stride;\ 81.385 + src += 8*stride;\ 81.386 + score +=name8(s, dst , src , stride, 8);\ 81.387 + score +=name8(s, dst+8 , src+8 , stride, 8);\ 81.388 + }\ 81.389 + return score;\ 81.390 +} 81.391 + 81.392 +static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) 81.393 +{ 
81.394 + int i; 81.395 + for(i=0; i<h; i++) 81.396 + { 81.397 + AV_WN16(dst , AV_RN16(src )); 81.398 + dst+=dstStride; 81.399 + src+=srcStride; 81.400 + } 81.401 +} 81.402 + 81.403 +static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) 81.404 +{ 81.405 + int i; 81.406 + for(i=0; i<h; i++) 81.407 + { 81.408 + AV_WN32(dst , AV_RN32(src )); 81.409 + dst+=dstStride; 81.410 + src+=srcStride; 81.411 + } 81.412 +} 81.413 + 81.414 +static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) 81.415 +{ 81.416 + int i; 81.417 + for(i=0; i<h; i++) 81.418 + { 81.419 + AV_WN32(dst , AV_RN32(src )); 81.420 + AV_WN32(dst+4 , AV_RN32(src+4 )); 81.421 + dst+=dstStride; 81.422 + src+=srcStride; 81.423 + } 81.424 +} 81.425 + 81.426 +static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) 81.427 +{ 81.428 + int i; 81.429 + for(i=0; i<h; i++) 81.430 + { 81.431 + AV_WN32(dst , AV_RN32(src )); 81.432 + AV_WN32(dst+4 , AV_RN32(src+4 )); 81.433 + dst[8]= src[8]; 81.434 + dst+=dstStride; 81.435 + src+=srcStride; 81.436 + } 81.437 +} 81.438 + 81.439 +static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) 81.440 +{ 81.441 + int i; 81.442 + for(i=0; i<h; i++) 81.443 + { 81.444 + AV_WN32(dst , AV_RN32(src )); 81.445 + AV_WN32(dst+4 , AV_RN32(src+4 )); 81.446 + AV_WN32(dst+8 , AV_RN32(src+8 )); 81.447 + AV_WN32(dst+12, AV_RN32(src+12)); 81.448 + dst+=dstStride; 81.449 + src+=srcStride; 81.450 + } 81.451 +} 81.452 + 81.453 +static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) 81.454 +{ 81.455 + int i; 81.456 + for(i=0; i<h; i++) 81.457 + { 81.458 + AV_WN32(dst , AV_RN32(src )); 81.459 + AV_WN32(dst+4 , AV_RN32(src+4 )); 81.460 + AV_WN32(dst+8 , AV_RN32(src+8 )); 81.461 + AV_WN32(dst+12, AV_RN32(src+12)); 81.462 + dst[16]= src[16]; 81.463 + dst+=dstStride; 81.464 + 
src+=srcStride; 81.465 + } 81.466 +} 81.467 + 81.468 +#endif /* AVCODEC_DSPUTIL_H */
82.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 82.2 +++ b/ffmpeg_smp/h264dec/libavcodec/get_bits.h Mon Aug 27 12:09:56 2012 +0200 82.3 @@ -0,0 +1,325 @@ 82.4 +/* 82.5 + * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at> 82.6 + * 82.7 + * This file is part of FFmpeg. 82.8 + * 82.9 + * FFmpeg is free software; you can redistribute it and/or 82.10 + * modify it under the terms of the GNU Lesser General Public 82.11 + * License as published by the Free Software Foundation; either 82.12 + * version 2.1 of the License, or (at your option) any later version. 82.13 + * 82.14 + * FFmpeg is distributed in the hope that it will be useful, 82.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 82.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 82.17 + * Lesser General Public License for more details. 82.18 + * 82.19 + * You should have received a copy of the GNU Lesser General Public 82.20 + * License along with FFmpeg; if not, write to the Free Software 82.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 82.22 + */ 82.23 + 82.24 +/** 82.25 + * @file 82.26 + * bitstream reader API header. 
82.27 + */ 82.28 + 82.29 +#ifndef AVCODEC_GET_BITS_H 82.30 +#define AVCODEC_GET_BITS_H 82.31 + 82.32 +#include <stdint.h> 82.33 +#include <stdlib.h> 82.34 +#include <assert.h> 82.35 +#include "libavutil/bswap.h" 82.36 +#include "libavutil/common.h" 82.37 +#include "libavutil/intreadwrite.h" 82.38 +#include "libavutil/log.h" 82.39 +#include "mathops.h" 82.40 + 82.41 + 82.42 +typedef struct GetBitContext { 82.43 + uint8_t *rbsp; 82.44 + unsigned int rbsp_size; 82.45 + uint8_t *raw; 82.46 + const uint8_t *buffer, *buffer_end; 82.47 + unsigned int alloc_size; 82.48 + unsigned int buf_size; 82.49 + uint32_t *buffer_ptr; 82.50 + uint32_t cache0; 82.51 + uint32_t cache1; 82.52 + int bit_count; 82.53 + int size_in_bits; 82.54 +} GetBitContext; 82.55 + 82.56 +/* Bitstream reader API docs: 82.57 +name 82.58 + arbitrary name which is used as prefix for the internal variables 82.59 + 82.60 +gb 82.61 + getbitcontext 82.62 + 82.63 +OPEN_READER(name, gb) 82.64 + loads gb into local variables 82.65 + 82.66 +CLOSE_READER(name, gb) 82.67 + stores local vars in gb 82.68 + 82.69 +UPDATE_CACHE(name, gb) 82.70 + refills the internal cache from the bitstream 82.71 + after this call at least MIN_CACHE_BITS will be available, 82.72 + 82.73 +GET_CACHE(name, gb) 82.74 + will output the contents of the internal cache, next bit is MSB of 32 or 64 bit (FIXME 64bit) 82.75 + 82.76 +SHOW_UBITS(name, gb, num) 82.77 + will return the next num bits 82.78 + 82.79 +SHOW_SBITS(name, gb, num) 82.80 + will return the next num bits and do sign extension 82.81 + 82.82 +SKIP_BITS(name, gb, num) 82.83 + will skip over the next num bits 82.84 + note, this is equivalent to SKIP_CACHE; SKIP_COUNTER 82.85 + 82.86 +SKIP_CACHE(name, gb, num) 82.87 + will remove the next num bits from the cache (note SKIP_COUNTER MUST be called before UPDATE_CACHE / CLOSE_READER) 82.88 + 82.89 +SKIP_COUNTER(name, gb, num) 82.90 + will increment the internal bit counter (see SKIP_CACHE & SKIP_BITS) 82.91 + 82.92 
+LAST_SKIP_CACHE(name, gb, num) 82.93 + will remove the next num bits from the cache if it is needed for UPDATE_CACHE otherwise it will do nothing 82.94 + 82.95 +LAST_SKIP_BITS(name, gb, num) 82.96 + is equivalent to LAST_SKIP_CACHE; SKIP_COUNTER 82.97 + 82.98 +for examples see get_bits, show_bits, skip_bits, get_vlc 82.99 +*/ 82.100 + 82.101 +#define MIN_CACHE_BITS 32 82.102 + 82.103 +#define OPEN_READER(name, gb)\ 82.104 + int name##_bit_count=(gb)->bit_count;\ 82.105 + uint32_t name##_cache0= (gb)->cache0;\ 82.106 + uint32_t name##_cache1= (gb)->cache1;\ 82.107 + uint32_t * name##_buffer_ptr=(gb)->buffer_ptr;\ 82.108 + 82.109 +#define CLOSE_READER(name, gb)\ 82.110 + (gb)->bit_count= name##_bit_count;\ 82.111 + (gb)->cache0= name##_cache0;\ 82.112 + (gb)->cache1= name##_cache1;\ 82.113 + (gb)->buffer_ptr= name##_buffer_ptr;\ 82.114 + 82.115 +#define UPDATE_CACHE(name, gb)\ 82.116 + if(name##_bit_count > 0){\ 82.117 + const uint32_t next= be2me_32( *name##_buffer_ptr );\ 82.118 + name##_cache0 |= NEG_USR32(next,name##_bit_count);\ 82.119 + name##_cache1 |= next<<name##_bit_count;\ 82.120 + name##_buffer_ptr++;\ 82.121 + name##_bit_count-= 32;\ 82.122 + }\ 82.123 + 82.124 +#if ARCH_X86 82.125 +# define SKIP_CACHE(name, gb, num)\ 82.126 + __asm__(\ 82.127 + "shldl %2, %1, %0 \n\t"\ 82.128 + "shll %2, %1 \n\t"\ 82.129 + : "+r" (name##_cache0), "+r" (name##_cache1)\ 82.130 + : "Ic" ((uint8_t)(num))\ 82.131 + ); 82.132 +#else 82.133 +# define SKIP_CACHE(name, gb, num)\ 82.134 + name##_cache0 <<= (num);\ 82.135 + name##_cache0 |= NEG_USR32(name##_cache1,num);\ 82.136 + name##_cache1 <<= (num); 82.137 +#endif 82.138 + 82.139 +#define SKIP_COUNTER(name, gb, num)\ 82.140 + name##_bit_count += (num);\ 82.141 + 82.142 +#define SKIP_BITS(name, gb, num)\ 82.143 + {\ 82.144 + SKIP_CACHE(name, gb, num)\ 82.145 + SKIP_COUNTER(name, gb, num)\ 82.146 + }\ 82.147 + 82.148 +#define LAST_SKIP_BITS(name, gb, num) SKIP_BITS(name, gb, num) 82.149 +#define LAST_SKIP_CACHE(name, gb, num) 
SKIP_CACHE(name, gb, num) 82.150 + 82.151 +#define SHOW_UBITS(name, gb, num)\ 82.152 + NEG_USR32(name##_cache0, num) 82.153 + 82.154 +#define SHOW_SBITS(name, gb, num)\ 82.155 + NEG_SSR32(name##_cache0, num) 82.156 + 82.157 +#define GET_CACHE(name, gb)\ 82.158 + (name##_cache0) 82.159 + 82.160 +static inline int get_bits_count(const GetBitContext *s){ 82.161 + return ((uint8_t*)s->buffer_ptr - s->buffer)*8 - 32 + s->bit_count; 82.162 +} 82.163 + 82.164 +static inline void skip_bits_long(GetBitContext *s, int n){ 82.165 + OPEN_READER(re, s) 82.166 + re_bit_count += n; 82.167 + re_buffer_ptr += re_bit_count>>5; 82.168 + re_bit_count &= 31; 82.169 + re_cache0 = be2me_32( re_buffer_ptr[-1] ) << re_bit_count; 82.170 + re_cache1 = 0; 82.171 + UPDATE_CACHE(re, s) 82.172 + CLOSE_READER(re, s) 82.173 +} 82.174 + 82.175 +/** 82.176 + * read mpeg1 dc style vlc (sign bit + mantisse with no MSB). 82.177 + * if MSB not set it is negative 82.178 + * @param n length in bits 82.179 + * @author BERO 82.180 + */ 82.181 +static inline int get_xbits(GetBitContext *s, int n){ 82.182 + register int sign; 82.183 + register int32_t cache; 82.184 + OPEN_READER(re, s) 82.185 + UPDATE_CACHE(re, s) 82.186 + cache = GET_CACHE(re,s); 82.187 + sign=(~cache)>>31; 82.188 + LAST_SKIP_BITS(re, s, n) 82.189 + CLOSE_READER(re, s) 82.190 + return (NEG_USR32(sign ^ cache, n) ^ sign) - sign; 82.191 +} 82.192 + 82.193 +static inline int get_sbits(GetBitContext *s, int n){ 82.194 + register int tmp; 82.195 + OPEN_READER(re, s) 82.196 + UPDATE_CACHE(re, s) 82.197 + tmp= SHOW_SBITS(re, s, n); 82.198 + LAST_SKIP_BITS(re, s, n) 82.199 + CLOSE_READER(re, s) 82.200 + return tmp; 82.201 +} 82.202 + 82.203 +/** 82.204 + * reads 1-17 bits. 
82.205 + * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't 82.206 + */ 82.207 +static inline unsigned int get_bits(GetBitContext *s, int n){ 82.208 + register int tmp; 82.209 + OPEN_READER(re, s) 82.210 + UPDATE_CACHE(re, s) 82.211 + tmp= SHOW_UBITS(re, s, n); 82.212 + LAST_SKIP_BITS(re, s, n) 82.213 + CLOSE_READER(re, s) 82.214 + return tmp; 82.215 +} 82.216 + 82.217 +/** 82.218 + * shows 1-17 bits. 82.219 + * Note, the alt bitstream reader can read up to 25 bits, but the libmpeg2 reader can't 82.220 + */ 82.221 +static inline unsigned int show_bits(GetBitContext *s, int n){ 82.222 + register int tmp; 82.223 + OPEN_READER(re, s) 82.224 + UPDATE_CACHE(re, s) 82.225 + tmp= SHOW_UBITS(re, s, n); 82.226 +// CLOSE_READER(re, s) 82.227 + return tmp; 82.228 +} 82.229 + 82.230 +static inline void skip_bits(GetBitContext *s, int n){ 82.231 + //Note gcc seems to optimize this to s->index+=n for the ALT_READER :)) 82.232 + OPEN_READER(re, s) 82.233 + UPDATE_CACHE(re, s) 82.234 + LAST_SKIP_BITS(re, s, n) 82.235 + CLOSE_READER(re, s) 82.236 +} 82.237 + 82.238 +static inline unsigned int get_bits1(GetBitContext *s){ 82.239 + return get_bits(s, 1); 82.240 +} 82.241 + 82.242 +static inline unsigned int show_bits1(GetBitContext *s){ 82.243 + return show_bits(s, 1); 82.244 +} 82.245 + 82.246 +static inline void skip_bits1(GetBitContext *s){ 82.247 + skip_bits(s, 1); 82.248 +} 82.249 + 82.250 +/** 82.251 + * reads 0-32 bits. 82.252 + */ 82.253 +static inline unsigned int get_bits_long(GetBitContext *s, int n){ 82.254 + if(n<=MIN_CACHE_BITS) return get_bits(s, n); 82.255 + else{ 82.256 + int ret= get_bits(s, 16) << (n-16); 82.257 + return ret | get_bits(s, n-16); 82.258 + } 82.259 +} 82.260 + 82.261 +/** 82.262 + * reads 0-32 bits as a signed integer. 82.263 + */ 82.264 +static inline int get_sbits_long(GetBitContext *s, int n) { 82.265 + return sign_extend(get_bits_long(s, n), n); 82.266 +} 82.267 + 82.268 +/** 82.269 + * shows 0-32 bits. 
82.270 + */ 82.271 +static inline unsigned int show_bits_long(GetBitContext *s, int n){ 82.272 + if(n<=MIN_CACHE_BITS) return show_bits(s, n); 82.273 + else{ 82.274 + GetBitContext gb= *s; 82.275 + return get_bits_long(&gb, n); 82.276 + } 82.277 +} 82.278 + 82.279 +static inline int check_marker(GetBitContext *s, const char *msg) 82.280 +{ 82.281 + int bit= get_bits1(s); 82.282 + if(!bit) 82.283 + av_log(AV_LOG_INFO, "Marker bit missing %s\n", msg); 82.284 + 82.285 + return bit; 82.286 +} 82.287 + 82.288 +/** 82.289 + * init GetBitContext. 82.290 + * @param buffer bitstream buffer, must be FF_INPUT_BUFFER_PADDING_SIZE bytes larger then the actual read bits 82.291 + * because some optimized bitstream readers read 32 or 64 bit at once and could read over the end 82.292 + * @param bit_size the size of the buffer in bits 82.293 + * 82.294 + * While GetBitContext stores the buffer size, for performance reasons you are 82.295 + * responsible for checking for the buffer end yourself (take advantage of the padding)! 82.296 + */ 82.297 +static inline void init_get_bits(GetBitContext *s, 82.298 + const uint8_t *buffer, int bit_size) 82.299 +{ 82.300 + int buffer_size= (bit_size+7)>>3; 82.301 + if(buffer_size < 0 || bit_size < 0) { 82.302 + buffer_size = bit_size = 0; 82.303 + buffer = NULL; 82.304 + } 82.305 + 82.306 + s->buffer= buffer; 82.307 + s->size_in_bits= bit_size; 82.308 + s->buffer_end= buffer + buffer_size; 82.309 + 82.310 + s->buffer_ptr = (uint32_t*)((intptr_t)buffer&(~3)); 82.311 + s->bit_count = 32 + 8*((intptr_t)buffer&3); 82.312 + skip_bits_long(s, 0); 82.313 +} 82.314 + 82.315 +static inline void align_get_bits(GetBitContext *s) 82.316 +{ 82.317 + int n= (-get_bits_count(s)) & 7; 82.318 + if(n) skip_bits(s, n); 82.319 +} 82.320 + 82.321 +#define tprintf(p, ...) {} 82.322 + 82.323 +static inline int get_bits_left(GetBitContext *gb) 82.324 +{ 82.325 + return gb->size_in_bits - get_bits_count(gb); 82.326 +} 82.327 + 82.328 +#endif /* AVCODEC_GET_BITS_H */
83.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 83.2 +++ b/ffmpeg_smp/h264dec/libavcodec/golomb.c Mon Aug 27 12:09:56 2012 +0200 83.3 @@ -0,0 +1,184 @@ 83.4 +/* 83.5 + * exp golomb vlc stuff 83.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 83.7 + * 83.8 + * This file is part of FFmpeg. 83.9 + * 83.10 + * FFmpeg is free software; you can redistribute it and/or 83.11 + * modify it under the terms of the GNU Lesser General Public 83.12 + * License as published by the Free Software Foundation; either 83.13 + * version 2.1 of the License, or (at your option) any later version. 83.14 + * 83.15 + * FFmpeg is distributed in the hope that it will be useful, 83.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 83.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 83.18 + * Lesser General Public License for more details. 83.19 + * 83.20 + * You should have received a copy of the GNU Lesser General Public 83.21 + * License along with FFmpeg; if not, write to the Free Software 83.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 83.23 + */ 83.24 + 83.25 +/** 83.26 + * @file 83.27 + * @brief 83.28 + * exp golomb vlc stuff 83.29 + * @author Michael Niedermayer <michaelni@gmx.at> 83.30 + */ 83.31 + 83.32 +#include "libavutil/common.h" 83.33 + 83.34 +const uint8_t ff_log2_tab[256]={ 83.35 + 0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, 83.36 + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 83.37 + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, 83.38 + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, 83.39 + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 83.40 + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 83.41 + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 83.42 + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 83.43 +}; 83.44 + 83.45 +const 
uint8_t ff_golomb_vlc_len[512]={ 83.46 +14,13,12,12,11,11,11,11,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 83.47 +7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 83.48 +5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 83.49 +5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 83.50 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 83.51 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 83.52 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 83.53 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 83.54 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.55 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.56 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.57 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.58 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.59 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.60 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.61 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 83.62 +}; 83.63 + 83.64 +const uint8_t ff_ue_golomb_vlc_code[512]={ 83.65 +31,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30, 83.66 + 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14, 83.67 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 83.68 + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 83.69 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.70 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.71 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
83.72 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 83.73 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.74 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.75 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.76 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.77 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.78 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.79 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.80 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 83.81 +}; 83.82 + 83.83 +const int8_t ff_se_golomb_vlc_code[512]={ 83.84 + 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 8, -8, 9, -9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15, 83.85 + 4, 4, 4, 4, -4, -4, -4, -4, 5, 5, 5, 5, -5, -5, -5, -5, 6, 6, 6, 6, -6, -6, -6, -6, 7, 7, 7, 7, -7, -7, -7, -7, 83.86 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 83.87 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, 83.88 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.89 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.90 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 83.91 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 83.92 + 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.93 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.94 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.95 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.96 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.97 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.98 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.99 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.100 +}; 83.101 + 83.102 + 83.103 +const uint8_t ff_ue_golomb_len[256]={ 83.104 + 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,11, 83.105 +11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,13, 83.106 +13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, 83.107 +13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,15, 83.108 +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 83.109 +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 83.110 +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 83.111 +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,17, 83.112 +}; 83.113 + 83.114 +const uint8_t ff_interleaved_golomb_vlc_len[256]={ 83.115 +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, 83.116 +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, 83.117 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 83.118 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 83.119 
+9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, 83.120 +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5, 83.121 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 83.122 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 83.123 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.124 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.125 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.126 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.127 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.128 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.129 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.130 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 83.131 +}; 83.132 + 83.133 +const uint8_t ff_interleaved_ue_golomb_vlc_code[256]={ 83.134 + 15,16,7, 7, 17,18,8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 83.135 + 19,20,9, 9, 21,22,10,10,4, 4, 4, 4, 4, 4, 4, 4, 83.136 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.137 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.138 + 23,24,11,11,25,26,12,12,5, 5, 5, 5, 5, 5, 5, 5, 83.139 + 27,28,13,13,29,30,14,14,6, 6, 6, 6, 6, 6, 6, 6, 83.140 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 83.141 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 83.142 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.143 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.144 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.145 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.146 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.147 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.148 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.149 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.150 +}; 83.151 + 83.152 +const int8_t ff_interleaved_se_golomb_vlc_code[256]={ 83.153 + 8, -8, 4, 4, 9, -9, -4, -4, 2, 2, 2, 2, 2, 2, 2, 2, 83.154 + 10,-10, 5, 5, 11,-11, -5, -5, -2, -2, -2, -2, -2, -2, -2, -2, 83.155 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.156 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.157 + 12,-12, 6, 6, 13,-13, -6, -6, 3, 3, 3, 3, 3, 3, 3, 3, 83.158 + 14,-14, 7, 7, 15,-15, -7, -7, -3, -3, -3, -3, -3, -3, -3, -3, 83.159 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, 83.160 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 83.161 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.162 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.163 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.164 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.165 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.166 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.167 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.168 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.169 +}; 83.170 + 83.171 +const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]={ 83.172 +0, 1, 0, 0, 2, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 83.173 +4, 5, 2, 2, 6, 7, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 83.174 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.175 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.176 +8, 9, 4, 4, 10,11,5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 83.177 +12,13,6, 6, 14,15,7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 83.178 +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.179 +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 83.180 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.181 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.182 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.183 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.184 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.185 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.186 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83.187 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,};
84.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 84.2 +++ b/ffmpeg_smp/h264dec/libavcodec/golomb.h Mon Aug 27 12:09:56 2012 +0200 84.3 @@ -0,0 +1,410 @@ 84.4 +/* 84.5 + * exp golomb vlc stuff 84.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 84.7 + * Copyright (c) 2004 Alex Beregszaszi 84.8 + * 84.9 + * This file is part of FFmpeg. 84.10 + * 84.11 + * FFmpeg is free software; you can redistribute it and/or 84.12 + * modify it under the terms of the GNU Lesser General Public 84.13 + * License as published by the Free Software Foundation; either 84.14 + * version 2.1 of the License, or (at your option) any later version. 84.15 + * 84.16 + * FFmpeg is distributed in the hope that it will be useful, 84.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 84.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 84.19 + * Lesser General Public License for more details. 84.20 + * 84.21 + * You should have received a copy of the GNU Lesser General Public 84.22 + * License along with FFmpeg; if not, write to the Free Software 84.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 84.24 + */ 84.25 + 84.26 +/** 84.27 + * @file 84.28 + * @brief 84.29 + * exp golomb vlc stuff 84.30 + * @author Michael Niedermayer <michaelni@gmx.at> and Alex Beregszaszi 84.31 + */ 84.32 + 84.33 +#ifndef AVCODEC_GOLOMB_H 84.34 +#define AVCODEC_GOLOMB_H 84.35 + 84.36 +#include <stdint.h> 84.37 +#include "get_bits.h" 84.38 + 84.39 +#define INVALID_VLC 0x80000000 84.40 + 84.41 +extern const uint8_t ff_golomb_vlc_len[512]; 84.42 +extern const uint8_t ff_ue_golomb_vlc_code[512]; 84.43 +extern const int8_t ff_se_golomb_vlc_code[512]; 84.44 +extern const uint8_t ff_ue_golomb_len[256]; 84.45 + 84.46 +extern const uint8_t ff_interleaved_golomb_vlc_len[256]; 84.47 +extern const uint8_t ff_interleaved_ue_golomb_vlc_code[256]; 84.48 +extern const int8_t ff_interleaved_se_golomb_vlc_code[256]; 84.49 +extern const uint8_t 
ff_interleaved_dirac_golomb_vlc_code[256]; 84.50 + 84.51 + 84.52 + /** 84.53 + * read unsigned exp golomb code. 84.54 + */ 84.55 +static inline int get_ue_golomb(GetBitContext *gb){ 84.56 + unsigned int buf; 84.57 + int log; 84.58 + 84.59 + OPEN_READER(re, gb); 84.60 + UPDATE_CACHE(re, gb); 84.61 + buf=GET_CACHE(re, gb); 84.62 + 84.63 + if(buf >= (1<<27)){ 84.64 + buf >>= 32 - 9; 84.65 + LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); 84.66 + CLOSE_READER(re, gb); 84.67 + 84.68 + return ff_ue_golomb_vlc_code[buf]; 84.69 + }else{ 84.70 + log= 2*av_log2_c(buf) - 31; 84.71 + buf>>= log; 84.72 + buf--; 84.73 + LAST_SKIP_BITS(re, gb, 32 - log); 84.74 + CLOSE_READER(re, gb); 84.75 + 84.76 + return buf; 84.77 + } 84.78 +} 84.79 + 84.80 + /** 84.81 + * read unsigned exp golomb code, constraint to a max of 31. 84.82 + * the return value is undefined if the stored value exceeds 31. 84.83 + */ 84.84 +static inline int get_ue_golomb_31(GetBitContext *gb){ 84.85 + unsigned int buf; 84.86 + 84.87 + OPEN_READER(re, gb); 84.88 + UPDATE_CACHE(re, gb); 84.89 + buf=GET_CACHE(re, gb); 84.90 + 84.91 + buf >>= 32 - 9; 84.92 + LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); 84.93 + CLOSE_READER(re, gb); 84.94 + 84.95 + return ff_ue_golomb_vlc_code[buf]; 84.96 +} 84.97 + 84.98 +static inline int svq3_get_ue_golomb(GetBitContext *gb){ 84.99 + uint32_t buf; 84.100 + 84.101 + OPEN_READER(re, gb); 84.102 + UPDATE_CACHE(re, gb); 84.103 + buf=GET_CACHE(re, gb); 84.104 + 84.105 + if(buf&0xAA800000){ 84.106 + buf >>= 32 - 8; 84.107 + LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]); 84.108 + CLOSE_READER(re, gb); 84.109 + 84.110 + return ff_interleaved_ue_golomb_vlc_code[buf]; 84.111 + }else{ 84.112 + int ret = 1; 84.113 + 84.114 + while (1) { 84.115 + buf >>= 32 - 8; 84.116 + LAST_SKIP_BITS(re, gb, FFMIN(ff_interleaved_golomb_vlc_len[buf], 8)); 84.117 + 84.118 + if (ff_interleaved_golomb_vlc_len[buf] != 9){ 84.119 + ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1; 84.120 + ret |= 
ff_interleaved_dirac_golomb_vlc_code[buf]; 84.121 + break; 84.122 + } 84.123 + ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf]; 84.124 + UPDATE_CACHE(re, gb); 84.125 + buf = GET_CACHE(re, gb); 84.126 + } 84.127 + 84.128 + CLOSE_READER(re, gb); 84.129 + return ret - 1; 84.130 + } 84.131 +} 84.132 + 84.133 +/** 84.134 + * read unsigned truncated exp golomb code. 84.135 + */ 84.136 +static inline int get_te0_golomb(GetBitContext *gb, int range){ 84.137 + assert(range >= 1); 84.138 + 84.139 + if(range==1) return 0; 84.140 + else if(range==2) return get_bits1(gb)^1; 84.141 + else return get_ue_golomb(gb); 84.142 +} 84.143 + 84.144 +/** 84.145 + * read unsigned truncated exp golomb code. 84.146 + */ 84.147 +static inline int get_te_golomb(GetBitContext *gb, int range){ 84.148 + assert(range >= 1); 84.149 + 84.150 + if(range==2) return get_bits1(gb)^1; 84.151 + else return get_ue_golomb(gb); 84.152 +} 84.153 + 84.154 + 84.155 +/** 84.156 + * read signed exp golomb code. 84.157 + */ 84.158 +static inline int get_se_golomb(GetBitContext *gb){ 84.159 + unsigned int buf; 84.160 + int log; 84.161 + 84.162 + OPEN_READER(re, gb); 84.163 + UPDATE_CACHE(re, gb); 84.164 + buf=GET_CACHE(re, gb); 84.165 + 84.166 + if(buf >= (1<<27)){ 84.167 + buf >>= 32 - 9; 84.168 + LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); 84.169 + CLOSE_READER(re, gb); 84.170 + 84.171 + return ff_se_golomb_vlc_code[buf]; 84.172 + }else{ 84.173 + log= 2*av_log2_c(buf) - 31; 84.174 + buf>>= log; 84.175 + 84.176 + LAST_SKIP_BITS(re, gb, 32 - log); 84.177 + CLOSE_READER(re, gb); 84.178 + 84.179 + if(buf&1) buf= -(buf>>1); 84.180 + else buf= (buf>>1); 84.181 + 84.182 + return buf; 84.183 + } 84.184 +} 84.185 + 84.186 +static inline int svq3_get_se_golomb(GetBitContext *gb){ 84.187 + unsigned int buf; 84.188 + int log; 84.189 + 84.190 + OPEN_READER(re, gb); 84.191 + UPDATE_CACHE(re, gb); 84.192 + buf=GET_CACHE(re, gb); 84.193 + 84.194 + if(buf&0xAA800000){ 84.195 + buf >>= 32 - 8; 84.196 + 
LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]); 84.197 + CLOSE_READER(re, gb); 84.198 + 84.199 + return ff_interleaved_se_golomb_vlc_code[buf]; 84.200 + }else{ 84.201 + LAST_SKIP_BITS(re, gb, 8); 84.202 + UPDATE_CACHE(re, gb); 84.203 + buf |= 1 | (GET_CACHE(re, gb) >> 8); 84.204 + 84.205 + if((buf & 0xAAAAAAAA) == 0) 84.206 + return INVALID_VLC; 84.207 + 84.208 + for(log=31; (buf & 0x80000000) == 0; log--){ 84.209 + buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30); 84.210 + } 84.211 + 84.212 + LAST_SKIP_BITS(re, gb, 63 - 2*log - 8); 84.213 + CLOSE_READER(re, gb); 84.214 + 84.215 + return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1; 84.216 + } 84.217 +} 84.218 + 84.219 +static inline int dirac_get_se_golomb(GetBitContext *gb){ 84.220 + uint32_t buf; 84.221 + uint32_t ret; 84.222 + 84.223 + ret = svq3_get_ue_golomb(gb); 84.224 + 84.225 + if (ret) { 84.226 + OPEN_READER(re, gb); 84.227 + UPDATE_CACHE(re, gb); 84.228 + buf = SHOW_SBITS(re, gb, 1); 84.229 + LAST_SKIP_BITS(re, gb, 1); 84.230 + ret = (ret ^ buf) - buf; 84.231 + CLOSE_READER(re, gb); 84.232 + } 84.233 + 84.234 + return ret; 84.235 +} 84.236 + 84.237 +/** 84.238 + * read unsigned golomb rice code (ffv1). 
84.239 + */ 84.240 +static inline int get_ur_golomb(GetBitContext *gb, int k, int limit, int esc_len){ 84.241 + unsigned int buf; 84.242 + int log; 84.243 + 84.244 + OPEN_READER(re, gb); 84.245 + UPDATE_CACHE(re, gb); 84.246 + buf=GET_CACHE(re, gb); 84.247 + 84.248 + log= av_log2_c(buf); 84.249 + 84.250 + if(log > 31-limit){ 84.251 + buf >>= log - k; 84.252 + buf += (30-log)<<k; 84.253 + LAST_SKIP_BITS(re, gb, 32 + k - log); 84.254 + CLOSE_READER(re, gb); 84.255 + 84.256 + return buf; 84.257 + }else{ 84.258 + LAST_SKIP_BITS(re, gb, limit); 84.259 + UPDATE_CACHE(re, gb); 84.260 + 84.261 + buf = SHOW_UBITS(re, gb, esc_len); 84.262 + 84.263 + LAST_SKIP_BITS(re, gb, esc_len); 84.264 + CLOSE_READER(re, gb); 84.265 + 84.266 + return buf + limit - 1; 84.267 + } 84.268 +} 84.269 + 84.270 +/** 84.271 + * read unsigned golomb rice code (jpegls). 84.272 + */ 84.273 +static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit, int esc_len){ 84.274 + unsigned int buf; 84.275 + int log; 84.276 + 84.277 + OPEN_READER(re, gb); 84.278 + UPDATE_CACHE(re, gb); 84.279 + buf=GET_CACHE(re, gb); 84.280 + 84.281 + log= av_log2_c(buf); 84.282 + 84.283 + if(log - k >= 32-MIN_CACHE_BITS+(MIN_CACHE_BITS==32) && 32-log < limit){ 84.284 + buf >>= log - k; 84.285 + buf += (30-log)<<k; 84.286 + LAST_SKIP_BITS(re, gb, 32 + k - log); 84.287 + CLOSE_READER(re, gb); 84.288 + 84.289 + return buf; 84.290 + }else{ 84.291 + int i; 84.292 + for(i=0; SHOW_UBITS(re, gb, 1) == 0; i++){ 84.293 + LAST_SKIP_BITS(re, gb, 1); 84.294 + UPDATE_CACHE(re, gb); 84.295 + } 84.296 + SKIP_BITS(re, gb, 1); 84.297 + 84.298 + if(i < limit - 1){ 84.299 + if(k){ 84.300 + buf = SHOW_UBITS(re, gb, k); 84.301 + LAST_SKIP_BITS(re, gb, k); 84.302 + }else{ 84.303 + buf=0; 84.304 + } 84.305 + 84.306 + CLOSE_READER(re, gb); 84.307 + return buf + (i<<k); 84.308 + }else if(i == limit - 1){ 84.309 + buf = SHOW_UBITS(re, gb, esc_len); 84.310 + LAST_SKIP_BITS(re, gb, esc_len); 84.311 + CLOSE_READER(re, gb); 84.312 + 84.313 
+ return buf + 1; 84.314 + }else 84.315 + return -1; 84.316 + } 84.317 +} 84.318 + 84.319 +/** 84.320 + * read signed golomb rice code (ffv1). 84.321 + */ 84.322 +static inline int get_sr_golomb(GetBitContext *gb, int k, int limit, int esc_len){ 84.323 + int v= get_ur_golomb(gb, k, limit, esc_len); 84.324 + 84.325 + v++; 84.326 + if (v&1) return v>>1; 84.327 + else return -(v>>1); 84.328 + 84.329 +// return (v>>1) ^ -(v&1); 84.330 +} 84.331 + 84.332 +/** 84.333 + * read signed golomb rice code (flac). 84.334 + */ 84.335 +static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit, int esc_len){ 84.336 + int v= get_ur_golomb_jpegls(gb, k, limit, esc_len); 84.337 + return (v>>1) ^ -(v&1); 84.338 +} 84.339 + 84.340 +/** 84.341 + * read unsigned golomb rice code (shorten). 84.342 + */ 84.343 +static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k){ 84.344 + return get_ur_golomb_jpegls(gb, k, INT_MAX, 0); 84.345 +} 84.346 + 84.347 +/** 84.348 + * read signed golomb rice code (shorten). 
84.349 + */ 84.350 +static inline int get_sr_golomb_shorten(GetBitContext* gb, int k) 84.351 +{ 84.352 + int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0); 84.353 + if (uvar & 1) 84.354 + return ~(uvar >> 1); 84.355 + else 84.356 + return uvar >> 1; 84.357 +} 84.358 + 84.359 + 84.360 + 84.361 +#ifdef TRACE 84.362 + 84.363 +static inline int get_ue(GetBitContext *s, char *file, const char *func, int line){ 84.364 + int show= show_bits(s, 24); 84.365 + int pos= get_bits_count(s); 84.366 + int i= get_ue_golomb(s); 84.367 + int len= get_bits_count(s) - pos; 84.368 + int bits= show>>(24-len); 84.369 + 84.370 + print_bin(bits, len); 84.371 + 84.372 + av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line); 84.373 + 84.374 + return i; 84.375 +} 84.376 + 84.377 +static inline int get_se(GetBitContext *s, char *file, const char *func, int line){ 84.378 + int show= show_bits(s, 24); 84.379 + int pos= get_bits_count(s); 84.380 + int i= get_se_golomb(s); 84.381 + int len= get_bits_count(s) - pos; 84.382 + int bits= show>>(24-len); 84.383 + 84.384 + print_bin(bits, len); 84.385 + 84.386 + av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line); 84.387 + 84.388 + return i; 84.389 +} 84.390 + 84.391 +static inline int get_te(GetBitContext *s, int r, char *file, const char *func, int line){ 84.392 + int show= show_bits(s, 24); 84.393 + int pos= get_bits_count(s); 84.394 + int i= get_te0_golomb(s, r); 84.395 + int len= get_bits_count(s) - pos; 84.396 + int bits= show>>(24-len); 84.397 + 84.398 + print_bin(bits, len); 84.399 + 84.400 + av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line); 84.401 + 84.402 + return i; 84.403 +} 84.404 + 84.405 +#define get_ue_golomb(a) get_ue(a, __FILE__, __PRETTY_FUNCTION__, __LINE__) 84.406 +#define get_se_golomb(a) get_se(a, __FILE__, __PRETTY_FUNCTION__, __LINE__) 84.407 +#define get_te_golomb(a, r) 
get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__) 84.408 +#define get_te0_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__) 84.409 + 84.410 +#endif 84.411 + 84.412 + 84.413 +#endif /* AVCODEC_GOLOMB_H */
85.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 85.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264.c Mon Aug 27 12:09:56 2012 +0200 85.3 @@ -0,0 +1,215 @@ 85.4 +#include "config.h" 85.5 +#include "h264.h" 85.6 +#include "h264_misc.h" 85.7 +#include <math.h> 85.8 + 85.9 +H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int width, int height, h264_options *opts){ 85.10 + int i; 85.11 + const int mb_height = (height + 15) / 16; 85.12 + const int mb_width = (width + 15) / 16; 85.13 + const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16 85.14 + 85.15 + ff_init_cabac_states(); 85.16 + 85.17 + H264Context *h= av_mallocz(sizeof(H264Context)); 85.18 + 85.19 + start_timer(h, TOTAL); 85.20 + h->file_name = file_name; 85.21 + h->profile = opts->profile; 85.22 + for (i=0; i<PROFILE_STAGES; i++) 85.23 + h->total_time[i]=0; 85.24 + 85.25 + h->ifile=ifile; 85.26 + h->ofile =ofile; 85.27 + 85.28 + h->verbose =opts->verbose; 85.29 + h->no_mbd =opts->no_mbd; 85.30 + h->static_3d =opts->static_3d; 85.31 + h->pipe_bufs = opts->pipe_bufs; 85.32 + h->slice_bufs = opts->slice_bufs; 85.33 + 85.34 + h->ed_ppe_threads =0; 85.35 + if (opts->ppe_ed){ 85.36 + h->ed_ppe_threads = (opts->threads >opts->ppe_ed)? opts->ppe_ed :opts->threads; 85.37 + } 85.38 + 85.39 + h->threads = opts->threads - h->ed_ppe_threads; 85.40 + h->smt = opts->smt; 85.41 + if (h->smt){ 85.42 + h->threads *= 2; 85.43 + } 85.44 + 85.45 + h->num_frames = opts->numframes; 85.46 + 85.47 + h->frame_width = width; 85.48 + h->frame_height = height; 85.49 + 85.50 + while ((width/2) %STRIDE_ALIGN) 85.51 + width+=STRIDE_ALIGN; 85.52 + h->width = width; 85.53 + h->height = mb_height*16; 85.54 + 85.55 + h->mb_height = mb_height; 85.56 + h->mb_width = mb_width; 85.57 + h->mb_stride = mb_stride; 85.58 + h->b4_stride = mb_width*4 + 1; 85.59 + h->b_stride = mb_width*4; 85.60 + 85.61 + h->smb_width = opts->smb_size[0]; 85.62 + h->smb_height = opts->smb_size[1] < h->smb_width ? 
opts->smb_size[1] : h->smb_width; 85.63 + h->smbc = getSuperMBContext(h, h->smb_width, h->smb_height); 85.64 + 85.65 + h->wave_order = opts->wave_order; 85.66 + 85.67 + h->pipe_bufs = opts->pipe_bufs; 85.68 + 85.69 + h->max_dpb_cnt = DPB_SIZE + opts->pipe_bufs; 85.70 + h->free_dpb_cnt = h->max_dpb_cnt; 85.71 + h->dpb = av_mallocz (h->max_dpb_cnt* sizeof (DecodedPicture)); 85.72 + 85.73 + 85.74 + h->free_sb_cnt = h->threads*opts->slice_bufs + (h->no_mbd != 0) ; //one extra to overlap some latency of signaling/freeing slicebuffers in entropy only mode 85.75 + h->sb_size = h->free_sb_cnt; 85.76 + h->sb = av_mallocz(h->sb_size* sizeof(SliceBufferEntry)); 85.77 + 85.78 + h->rl_q.size = FFMAX(1, FFMIN( (h->height-3 - 512)/16, h->mb_width/2)) +1; 85.79 + h->rl_q.free = h->rl_q.size -1; 85.80 + h->rl_q.ready=0; 85.81 + h->rl_q.fi = h->rl_q.fo= 0; 85.82 + h->rl_q.queue = av_malloc(h->rl_q.size* sizeof(RingLineEntry*)); 85.83 + for (i=0; i<h->rl_q.size; i++){ 85.84 + if( posix_memalign((void**)&h->rl_q.queue[i],64,sizeof(RingLineEntry))) 85.85 + h->rl_q.queue[i]=NULL; 85.86 + h->rl_q.queue[i]->top = av_malloc(h->mb_width*sizeof(TopBorder)); 85.87 + } 85.88 + 85.89 + h->rl_q.queue[0]->prev_line = h->rl_q.queue[h->rl_q.size-1]; 85.90 + for (i=1; i<h->rl_q.size; i++){ 85.91 + h->rl_q.queue[i]->prev_line = h->rl_q.queue[i-1]; 85.92 + } 85.93 + 85.94 + if( HAVE_MMX | HAVE_ALTIVEC| HAVE_NEON ){ 85.95 + for(i=0; i<16; i++){ 85.96 + #define T(x) (x>>2) | ((x<<2) & 0xF) 85.97 + h->zigzag_scan[i] = T(zigzag_scan[i]); 85.98 + #undef T 85.99 + } 85.100 + for(i=0; i<64; i++){ 85.101 + #define T(x) (x>>3) | ((x&7)<<3) 85.102 + h->zigzag_scan8x8[i] = T(ff_zigzag_direct[i]); 85.103 + #undef T 85.104 + } 85.105 + }else{ 85.106 + memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t)); 85.107 + memcpy(h->zigzag_scan8x8, ff_zigzag_direct, 64*sizeof(uint8_t)); 85.108 + } 85.109 + 85.110 + pthread_mutex_init(&h->smb_lock, NULL); 85.111 + pthread_mutex_init(&h->sdl_lock, NULL); 85.112 + 
pthread_cond_init(&h->sdl_cond, NULL); 85.113 + 85.114 + ///pthread initialization 85.115 + pthread_mutex_init(&h->ilock, NULL); 85.116 + pthread_cond_init(&h->icond, NULL); 85.117 + pthread_mutex_init(&h->slock, NULL); 85.118 + pthread_cond_init(&h->scond, NULL); 85.119 + pthread_mutex_init(&h->tlock, NULL); 85.120 + pthread_cond_init(&h->tcond, NULL); 85.121 + pthread_mutex_init(&h->tdlock, NULL); 85.122 + pthread_cond_init(&h->tdcond, NULL); 85.123 + h->start =!opts->numamap; //default dont wait for start signal 85.124 + h->statmbd = opts->statmbd; 85.125 + h->rl_side_touch= opts->numamap; 85.126 + h->touch_start=0; 85.127 + h->setaff =opts->statsched; 85.128 + h->init_threads=0; 85.129 + 85.130 + pthread_mutex_init(&h->task_lock, NULL); 85.131 + pthread_cond_init(&h->task_cond, NULL); 85.132 + for (i=0; i<STAGES; i++){ 85.133 + pthread_mutex_init (&h->lock[i], NULL); 85.134 + pthread_cond_init (&h->cond[i], NULL); 85.135 + 85.136 + pthread_mutex_init (&h->sb_q[i].lock, NULL); 85.137 + pthread_cond_init (&h->sb_q[i].cond, NULL); 85.138 + h->sb_q[i].size = h->free_sb_cnt; //change to num threads later 85.139 + h->sb_q[i].queue = av_malloc(h->free_sb_cnt* sizeof(SliceBufferEntry*)); 85.140 + h->sb_q[i].cnt = h->sb_q[i].fi = h->sb_q[i].fo =0; 85.141 + } 85.142 + 85.143 +#if HAVE_LIBSDL2 85.144 + h->sdlq.size=2; 85.145 + h->sdlq.ready=2; 85.146 + h->sdlq.queue = av_malloc(2* sizeof(SDL_Texture*)); 85.147 + pthread_mutex_init (&h->sdlq.sdl_lock, NULL); 85.148 + pthread_cond_init (&h->sdlq.sdl_cond, NULL); 85.149 +#endif 85.150 + 85.151 + h->display=opts->display; 85.152 + h->fullscreen=opts->fullscreen; 85.153 + 85.154 + return h; 85.155 +} 85.156 + 85.157 + 85.158 +void free_h264dec_context(H264Context *h) { 85.159 + int i; 85.160 + 85.161 + for(i=0; i<h->max_dpb_cnt; i++) 85.162 + free_dp(&h->dpb[i]); 85.163 + av_free (h->dpb); 85.164 + 85.165 + for(i=0; i<h->sb_size; i++){ 85.166 + if (h->sb[i].initialized){ 85.167 + free_sb_entry(&h->sb[i]); 85.168 + } 85.169 + } 
85.170 + av_freep(&h->sb); 85.171 + 85.172 + for (i=0; i<h->rl_q.size; i++){ 85.173 + av_freep(&h->rl_q.queue[i]->top); 85.174 + av_freep(&h->rl_q.queue[i]); 85.175 + } 85.176 + av_freep(&h->rl_q.queue); 85.177 + 85.178 + ///pthread cleanup 85.179 + pthread_mutex_destroy (&h->task_lock); 85.180 + pthread_cond_destroy (&h->task_cond); 85.181 + for (i=0; i<STAGES; i++){ 85.182 + pthread_mutex_destroy (&h->lock[i]); 85.183 + pthread_cond_destroy (&h->cond[i]); 85.184 + 85.185 + pthread_mutex_destroy (&h->sb_q[i].lock); 85.186 + pthread_cond_destroy (&h->sb_q[i].cond); 85.187 + av_freep( &h->sb_q[i].queue); 85.188 + } 85.189 + pthread_mutex_destroy (&h->slock); 85.190 + pthread_cond_destroy (&h->scond); 85.191 + pthread_mutex_destroy (&h->ilock); 85.192 + pthread_cond_destroy (&h->icond); 85.193 + 85.194 + pthread_mutex_destroy(&h->smb_lock); 85.195 + pthread_mutex_destroy (&h->sdl_lock); 85.196 + pthread_cond_destroy (&h->sdl_cond); 85.197 +#if HAVE_LIBSDL2 85.198 + av_free(h->sdlq.queue); 85.199 + pthread_mutex_destroy (&h->sdlq.sdl_lock); 85.200 + pthread_cond_destroy (&h->sdlq.sdl_cond); 85.201 +#endif 85.202 + 85.203 + stop_timer(h, TOTAL); 85.204 + if (h->threads==0){ 85.205 + for (i=0; i<PROFILE_STAGES; i++) 85.206 + h->total_time[i] /= h->num_frames; 85.207 + double others = h->total_time[TOTAL]; 85.208 + for (i=1; i<PROFILE_STAGES; i++) 85.209 + others-=h->total_time[i]; 85.210 + if (h->profile == 1){ 85.211 + printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [MBREC %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED], h->total_time[REC], others); 85.212 + }else if (h->profile ==2){ 85.213 + printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [PRED %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED],h->total_time[REC], others); 85.214 + } 85.215 + } 85.216 + 85.217 + av_free(h); 85.218 +} 85.219 \ No newline at end of file
/* ffmpeg_smp/h264dec/libavcodec/h264.h */
/*
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

/**
* @file
* H.264 / AVC / MPEG4 part10 codec.
* @author Michael Niedermayer <michaelni@gmx.at>
*/

#ifndef H264_H
#define H264_H

#include "h264_entropy.h"
#include "h264_data.h"
#include "h264_mc.h"
#include "h264_misc.h"
#include "h264_dsp.h"
#include "h264_pred.h"
#include "h264_parser.h"
#include "h264_nal.h"
#include "h264_rec.h"
#include "h264_deblock.h"
#include "h264_types.h"

/**
* Run-time options consumed by get_h264dec_context().
*/
typedef struct h264_options{
    int statsched;      // copied to the context's setaff field
    int statmbd;        // copied to the context's statmbd field
    int numamap;        // when set the context's start flag is cleared and rl_side_touch is set
    int no_mbd;         // when non-zero one extra slice buffer is allocated (entropy only mode)
    int numframes;      // copied to the context's num_frames; divisor for the profiling averages
    int display;        // copied to the context's display field
    int fullscreen;     // copied to the context's fullscreen field
    int verbose;        // copied to the context's verbose field
    int ppe_ed; // only useful for Cell
    int profile;        // 1 or 2 selects which timing summary is printed on free
    int threads;        // requested thread count, before the ppe_ed share is subtracted
    int smb_size[2]; // only useful for OmpSs
    int wave_order;     // copied to the context's wave_order field
    int static_3d;      // copied to the context's static_3d field
    int pipe_bufs;      // added to DPB_SIZE to size the decoded picture buffer
    int slice_bufs;     // multiplied by the thread count to size the slice buffer pool
    int smt;            // when non-zero the worker thread count is doubled
}h264_options;

/* Decoder entry points, one per back end. */
int h264_decode_cell(H264Context *h);
int h264_decode_cell_seq(H264Context *h);

int h264_decode_ompss(H264Context *h);

int h264_decode_pthread(H264Context *h);
int h264_decode_seq(H264Context *h);


/* Create a decoder context from the given stream description and options. */
H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int frame_width, int frame_height, h264_options *opts);
/* Release a context obtained from get_h264dec_context(). */
void free_h264dec_context(H264Context *h);


#endif /* H264_H */
87.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 87.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_cell.c Mon Aug 27 12:09:56 2012 +0200 87.3 @@ -0,0 +1,1242 @@ 87.4 + 87.5 +#include "h264_types.h" 87.6 +#include "h264_parser.h" 87.7 +#include "h264_nal.h" 87.8 +#include "h264_entropy.h" 87.9 +#include "h264_rec.h" 87.10 +#include "h264_misc.h" 87.11 +#include "cell/h264_types_spu.h" 87.12 +#include "h264_pthread.h" 87.13 + 87.14 +#include <pthread.h> 87.15 +#include <assert.h> 87.16 +#include <unistd.h> 87.17 + 87.18 +#include <libspe2.h> 87.19 +#include <ppu_intrinsics.h> 87.20 +#include <cbe_mfc.h> 87.21 +#include <libsync.h> 87.22 + 87.23 +// spe global variables 87.24 +unsigned rl_cnt_var, rl_mutex_var, rl_cond_var; 87.25 +atomic_ea_t rl_cnt; 87.26 +cond_ea_t rl_cond; 87.27 +mutex_ea_t rl_lock; 87.28 + 87.29 +H264spe * spe_params; 87.30 +unsigned mutex_var[16]; 87.31 +unsigned cond_var[16]; 87.32 +unsigned atomic_var[16]; 87.33 + 87.34 +pthread_t * spe_tid; 87.35 +spe_context_ptr_t *spe_context; 87.36 +void** spe_control_area; 87.37 +void** spe_ls_area; 87.38 +H264slice **spe_slice_buf; 87.39 + 87.40 +H264spe * spe_ed_params; 87.41 +unsigned mutex_ed_var[16]; 87.42 +unsigned cond_ed_var[16]; 87.43 +unsigned atomic_ed_var[16]; 87.44 + 87.45 +pthread_t * spe_ed_tid; 87.46 +spe_context_ptr_t *spe_ed_context; 87.47 +void** spe_ed_control_area; 87.48 +void** spe_ed_ls_area; 87.49 +EDSlice_spu **spe_ed_slice_buf; 87.50 + 87.51 +//structs to propagate stop signal 87.52 +MBSlice last_slice; 87.53 +EDSlice last_ed_slice; 87.54 +DecodedPicture last_pic; 87.55 +RawFrame last_frm; 87.56 + 87.57 +static int direct_B_resolved(EDSlice *s, int *poc_list, int *poc_cnt){ 87.58 + int i; 87.59 + int cnt = *poc_cnt; 87.60 + for(i=0; i<cnt; i++){ 87.61 + if (poc_list[i]==s->ref_list[1][0]->poc){ 87.62 + *poc_cnt=i+1; 87.63 + while(++i<cnt) 87.64 + poc_list[i]=0; 87.65 + return 1; 87.66 + } 87.67 + } 87.68 + return 0; 87.69 +} 87.70 + 87.71 +static void update_IP_poc_list(int 
*poc_list, int *poc_cnt, int poc) { 87.72 + int i=0; 87.73 + int cnt = *poc_cnt; 87.74 + 87.75 + while (poc_list[i] > poc) { i++;} 87.76 + if ( i< cnt) 87.77 + memmove(&poc_list[i+1], &poc_list[i], (cnt-i)*sizeof(int)); 87.78 + 87.79 + poc_list[i]=poc; 87.80 + (*poc_cnt)++; 87.81 +} 87.82 + 87.83 +static void *spe_ed_thread(void *arg){ 87.84 + H264spe *params = (H264spe *)arg; 87.85 + unsigned int idx = params->idx; 87.86 + unsigned int runflags = 0; 87.87 + unsigned int entry = SPE_DEFAULT_ENTRY; 87.88 + // run SPE context 87.89 + spe_context_run(spe_ed_context[idx], &entry, runflags, (void*) params, NULL, NULL); 87.90 + // done - now exit thread 87.91 + pthread_exit(NULL); 87.92 +} 87.93 + 87.94 +static void create_spe_ED_threads(H264Context *h, int ip_threads, int b_threads) { 87.95 + int i; 87.96 + int num_threads = ip_threads+b_threads; 87.97 + spe_program_handle_t * spe_program = spe_image_open("spe_ed"); 87.98 + // reserve memory for spe thread id, context and argument addresses 87.99 + spe_ed_tid = av_malloc(num_threads * sizeof (pthread_t)); 87.100 + spe_ed_context = av_malloc(num_threads * sizeof (spe_context_ptr_t)); 87.101 + spe_ed_params = av_malloc(num_threads * sizeof (H264spe)); 87.102 + spe_ed_control_area = av_malloc(num_threads * sizeof (void*)); 87.103 + spe_ed_ls_area = av_malloc(num_threads * sizeof (void*)); 87.104 + spe_ed_slice_buf = av_malloc(num_threads * sizeof (void*)); 87.105 + 87.106 + if (spe_program == NULL) 87.107 + av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno)); 87.108 + 87.109 + for (i = 0; i < num_threads; i++) { 87.110 + // create context for spe program 87.111 + spe_ed_context[i] = spe_context_create(SPE_MAP_PS, NULL); 87.112 + if (spe_ed_context[i] == NULL) 87.113 + av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. 
error=%s \n", errno, strerror(errno)); 87.114 + // load SPE program into main memory 87.115 + if ((spe_program_load(spe_ed_context[i], spe_program)) == -1) 87.116 + av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno)); 87.117 + //get the control_area for fast mailboxing 87.118 + if ((spe_ed_control_area[i] = spe_ps_area_get(spe_ed_context[i], SPE_CONTROL_AREA)) == NULL) 87.119 + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno)); 87.120 + //get ls area for inter spe communication 87.121 + if ((spe_ed_ls_area[i] = spe_ls_area_get(spe_ed_context[i])) == NULL) 87.122 + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno)); 87.123 + } 87.124 + 87.125 + for (i = 0; i < ip_threads; i++) { 87.126 + spe_ed_params[i].mb_width = h->mb_width; 87.127 + spe_ed_params[i].mb_stride = h->mb_stride; 87.128 + spe_ed_params[i].mb_height = h->mb_height; 87.129 + spe_ed_params[i].type = EDIP; 87.130 + spe_ed_params[i].spe_id = i; 87.131 + spe_ed_params[i].idx = i; 87.132 + //spe_ed_params[i].spe_total = ip_threads; //not used 87.133 + //spe_params[i].slice_params= &slice_params; 87.134 + spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads]; 87.135 + spe_ed_params[i].tgt_spe = spe_ed_ls_area[(i+1)%num_threads]; 87.136 + 87.137 + spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i]; 87.138 + spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i]; 87.139 + spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0); 87.140 + 87.141 + mutex_init(spe_ed_params[i].lock); 87.142 + cond_init(spe_ed_params[i].cond); 87.143 + if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i])) 87.144 + av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); 87.145 + 87.146 + //slicebufaddr 87.147 + spe_ed_slice_buf[i] = (EDSlice_spu *) 
_spe_out_mbox_read(spe_ed_control_area[i]); 87.148 + av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); 87.149 + } 87.150 + for (int j = 0; j < b_threads; j++) { 87.151 + i = j+ip_threads; 87.152 + spe_ed_params[i].mb_width = h->mb_width; 87.153 + spe_ed_params[i].mb_stride = h->mb_stride; 87.154 + spe_ed_params[i].mb_height = h->mb_height; 87.155 + spe_ed_params[i].type = EDB; 87.156 + spe_ed_params[i].idx = i; 87.157 + spe_ed_params[i].spe_id = j; 87.158 + spe_ed_params[i].spe_total = b_threads; 87.159 + //spe_params[i].slice_params= &slice_params; 87.160 + //spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads]; 87.161 + spe_ed_params[i].tgt_spe = spe_ed_ls_area[((j+1)%b_threads) + ip_threads]; 87.162 + 87.163 + spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i]; 87.164 + spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i]; 87.165 + spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0); 87.166 + 87.167 + mutex_init(spe_ed_params[i].lock); 87.168 + cond_init(spe_ed_params[i].cond); 87.169 + if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i])) 87.170 + av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); 87.171 + 87.172 + //slicebufaddr 87.173 + spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]); 87.174 + av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); 87.175 + } 87.176 + spe_image_close(spe_program); 87.177 + 87.178 +} 87.179 + 87.180 +static void fill_EDSlice_spu(EDSlice_spu *dst, EDSlice *src){ 87.181 + dst->pps = src->pps; 87.182 + dst->mbs = src->mbs; 87.183 + dst->state = src->state; 87.184 + dst->qp_thresh = src->qp_thresh; 87.185 + dst->pic = *src->current_picture; 87.186 + 87.187 + dst->ref_count[0] = src->ref_count[0]; 87.188 + dst->ref_count[1] = src->ref_count[1]; 87.189 + dst->slice_type = src->slice_type; 87.190 + dst->slice_type_nos = 
src->slice_type_nos; 87.191 + dst->direct_8x8_inference_flag = src->direct_8x8_inference_flag; 87.192 + dst->list_count = src->list_count; 87.193 + dst->coded_pic_num = src->coded_pic_num; 87.194 + 87.195 + GetBitContext *gb = &src->gb; 87.196 + align_get_bits( gb); 87.197 + dst->bytestream_start = gb->buffer + get_bits_count(gb)/8; 87.198 + dst->byte_bufsize = (get_bits_left(gb) + 7)/8; 87.199 + 87.200 + dst->transform_bypass = src->transform_bypass; 87.201 + dst->direct_spatial_mv_pred = src->direct_spatial_mv_pred; 87.202 + memcpy(dst->map_col_to_list0, src->map_col_to_list0, 2*16*sizeof(int)); 87.203 + memcpy(dst->dist_scale_factor, src->dist_scale_factor, 16*sizeof(int)); 87.204 + dst->cabac_init_idc = src->cabac_init_idc; 87.205 + memcpy(dst->ref2frm, src->ref2frm, 2*64*sizeof(int)); 87.206 + dst->chroma_qp[0]= src->chroma_qp[0]; 87.207 + dst->chroma_qp[1]= src->chroma_qp[1]; 87.208 + dst->qscale = src->qscale; 87.209 + dst->last_qscale_diff = src->last_qscale_diff; 87.210 + 87.211 + if (src->slice_type_nos == FF_B_TYPE) dst->list1 = *src->ref_list[1][0]; 87.212 +} 87.213 + 87.214 +static void send_slice_to_spe_and_wait(EDSlice_spu *s, int id){ 87.215 + unsigned status; 87.216 + 87.217 + spe_mfcio_get(spe_ed_context[id], (unsigned) spe_ed_slice_buf[id], s, sizeof(EDSlice_spu), 14, 0, 0); 87.218 + spe_mfcio_tag_status_read(spe_ed_context[id], 1<<14, SPE_TAG_ALL, &status); 87.219 + 87.220 + 87.221 + _spe_in_mbox_write(spe_ed_control_area[id], 0); 87.222 + 87.223 + while (!spe_out_mbox_status(spe_ed_context[id])){ 87.224 + //pthread_yield(); 87.225 + usleep(1000); 87.226 + } 87.227 + _spe_out_mbox_read(spe_ed_control_area[id]); 87.228 +} 87.229 + 87.230 +static int decode_slice_entropy_cell(EntropyContext *ec, EDSlice *s, int id){ 87.231 + int i,j; 87.232 + 87.233 + if( !s->pps.cabac ){ 87.234 + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); 87.235 + return -1; 87.236 + } 87.237 + DECLARE_ALIGNED(16, EDSlice_spu, slice); 87.238 + 
fill_EDSlice_spu(&slice, s); 87.239 + 87.240 + send_slice_to_spe_and_wait(&slice, id); 87.241 + 87.242 + return 0; 87.243 +} 87.244 + 87.245 +static int decode_slice_entropy_cell_seq(H264Context *h, EntropyContext *ec, EDSlice *s){ 87.246 + int i,j; 87.247 + 87.248 + if( !s->pps.cabac ){ 87.249 + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); 87.250 + return -1; 87.251 + } 87.252 + DECLARE_ALIGNED(16, EDSlice_spu, slice); 87.253 + fill_EDSlice_spu(&slice, s); 87.254 + 87.255 + send_slice_to_spe_and_wait(&slice, 0); 87.256 + 87.257 + if (s->release_cnt>0) { 87.258 + for (int i=0; i<s->release_cnt; i++){ 87.259 + release_pib_entry(h, s->release_ref[i], 2); 87.260 + } 87.261 + s->release_cnt=0; 87.262 + } 87.263 + 87.264 + release_pib_entry(h, s->current_picture, 1); 87.265 + av_freep(&s->gb.raw); 87.266 + if (s->gb.rbsp) 87.267 + av_freep(&s->gb.rbsp); 87.268 + 87.269 + return 0; 87.270 +} 87.271 + 87.272 +static void *entr_IP_spe_thread(void *arg){ 87.273 + EDThreadContext *eip = (EDThreadContext *) arg; 87.274 + H264Context *h = eip->h; 87.275 +// printf("eip %d, pid %d\n", eip->thread_num, syscall(SYS_gettid)); 87.276 + for (int i=0; i<SLICE_BUFS; i++){ 87.277 + eip->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); 87.278 + } 87.279 + 87.280 + EntropyContext *ec = get_entropy_context(h); 87.281 + EDSlice *s; 87.282 + 87.283 + for(;;){ 87.284 + { 87.285 + pthread_mutex_lock(&eip->ed_lock); 87.286 + while (eip->ed_cnt <= 0) 87.287 + pthread_cond_wait(&eip->ed_cond, &eip->ed_lock); 87.288 + s = &eip->ed_q[eip->ed_fo]; 87.289 + eip->ed_fo++; eip->ed_fo %= MAX_SLICE_COUNT; 87.290 + pthread_mutex_unlock(&eip->ed_lock); 87.291 + } 87.292 + 87.293 + if (s->state<0) 87.294 + break; 87.295 + { 87.296 + pthread_mutex_lock(&eip->mbs_lock); 87.297 + while (eip->mbs_cnt <= 0) 87.298 + pthread_cond_wait(&eip->mbs_cond, &eip->mbs_lock); 87.299 + 87.300 + s->mbs = eip->mbs[eip->mbs_fo]; 87.301 + s->ed = eip; 87.302 + eip->mbs_cnt--; 87.303 + 
eip->mbs_fo++; eip->mbs_fo%=SLICE_BUFS; 87.304 + pthread_mutex_unlock(&eip->mbs_lock); 87.305 + } 87.306 + if (eip->cell){ 87.307 + decode_slice_entropy_cell(ec, s, eip->thread_num); 87.308 + }else{ 87.309 + decode_slice_entropy(ec, s); 87.310 + } 87.311 + 87.312 +// { 87.313 +// pthread_mutex_lock(&h->lock[ENTROPY2]); 87.314 +// h->ed_poc[h->ed_poc_fi++ % MAX_SLICE_COUNT] = s->current_picture->poc; 87.315 +// while (h->ed_poc_fi > h->ed_poc_fo + MAX_SLICE_COUNT) 87.316 +// h->ed_poc_fo++; 87.317 +// 87.318 +// pthread_cond_signal(&h->cond[ENTROPY2]); 87.319 +// pthread_mutex_unlock(&h->lock[ENTROPY2]); 87.320 +// } 87.321 + 87.322 + { 87.323 + pthread_mutex_lock(&h->lock[ENTROPY4]); 87.324 + while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) 87.325 + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); 87.326 + h->ed_reorder_q[h->ed_reorder_fi] = *s; 87.327 + h->ed_reorder_cnt++; 87.328 + h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; 87.329 + pthread_cond_signal(&h->cond[ENTROPY4]); 87.330 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 87.331 + } 87.332 + 87.333 + { 87.334 + pthread_mutex_lock(&eip->ed_lock); 87.335 + eip->ed_cnt--; 87.336 + pthread_cond_signal(&eip->ed_cond); 87.337 + pthread_mutex_unlock(&eip->ed_lock); 87.338 + } 87.339 + } 87.340 + 87.341 + free_entropy_context(ec); 87.342 + 87.343 + pthread_exit(NULL); 87.344 + return NULL; 87.345 +} 87.346 + 87.347 +static void *entr_B_spe_thread(void *arg){ 87.348 + EDThreadContext *eb = (EDThreadContext *) arg; 87.349 + H264Context *h = eb->h; 87.350 +// printf("eb %d, pid %d\n", eb->thread_num, syscall(SYS_gettid)); 87.351 + for (int i=0; i<SLICE_BUFS; i++){ 87.352 + eb->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); 87.353 + } 87.354 + 87.355 + EntropyContext *ec = get_entropy_context(h); 87.356 + EDSlice *s; 87.357 + 87.358 + for(;;){ 87.359 + { 87.360 + pthread_mutex_lock(&eb->ed_lock); 87.361 + while (eb->ed_cnt <= 0) 87.362 + pthread_cond_wait(&eb->ed_cond, &eb->ed_lock); 87.363 + 
s = &eb->ed_q[eb->ed_fo]; 87.364 + eb->ed_fo++; eb->ed_fo %= MAX_SLICE_COUNT; 87.365 + pthread_mutex_unlock(&eb->ed_lock); 87.366 + } 87.367 + 87.368 + if (s->state<0) 87.369 + break; 87.370 + { 87.371 + pthread_mutex_lock(&eb->mbs_lock); 87.372 + while (eb->mbs_cnt <= 0) 87.373 + pthread_cond_wait(&eb->mbs_cond, &eb->mbs_lock); 87.374 + s->mbs = eb->mbs[eb->mbs_fo]; 87.375 + s->ed = eb; 87.376 + eb->mbs_cnt--; 87.377 + eb->mbs_fo++; eb->mbs_fo%=SLICE_BUFS; 87.378 + pthread_mutex_unlock(&eb->mbs_lock); 87.379 + } 87.380 + //decode_B_slice_entropy(&hcabac, &cabac, s, eb, eb->prev_ed); 87.381 + decode_slice_entropy_cell(ec, s, eb->thread_num + h->edip_threads); 87.382 + 87.383 + { 87.384 + pthread_mutex_lock(&h->lock[ENTROPY4]); 87.385 + while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) 87.386 + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); 87.387 + h->ed_reorder_q[h->ed_reorder_fi] = *s; 87.388 + h->ed_reorder_cnt++; 87.389 + h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; 87.390 + pthread_cond_signal(&h->cond[ENTROPY4]); 87.391 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 87.392 + 87.393 + } 87.394 + 87.395 + { 87.396 + pthread_mutex_lock(&eb->ed_lock); 87.397 + eb->ed_cnt--; 87.398 + pthread_cond_signal(&eb->ed_cond); 87.399 + pthread_mutex_unlock(&eb->ed_lock); 87.400 + } 87.401 + } 87.402 + eb->lines_cnt++; 87.403 + 87.404 + free_entropy_context(ec); 87.405 + 87.406 + pthread_exit(NULL); 87.407 + return NULL; 87.408 +} 87.409 + 87.410 +static void *entr_B_distribute(void *arg){ 87.411 + H264Context *h = (H264Context *) arg; 87.412 + EDSlice *s; 87.413 + 87.414 + int i, n=0, poc; 87.415 + 87.416 +// printf("eb dist, pid %d\n", syscall(SYS_gettid)); 87.417 + 87.418 + for(i=0; i<h->edb_threads; i++){ 87.419 + h->b[i].h =h; 87.420 + h->b[i].thread_num =i; 87.421 + h->b[i].thread_total =h->edb_threads; 87.422 + pthread_mutex_init(&h->b[i].mbs_lock, NULL); 87.423 + pthread_cond_init(&h->b[i].mbs_cond, NULL); 87.424 + h->b[i].mbs_fo = 0; 87.425 + 
h->b[i].mbs_cnt = SLICE_BUFS; 87.426 + h->b[i].ed_fi =0; 87.427 + h->b[i].ed_fo =0; 87.428 + h->b[i].ed_cnt =0; 87.429 + h->b[i].lines_cnt =0; 87.430 + h->b[i].prev_ed = &h->b[(i-1 +h->edb_threads) % h->edb_threads]; 87.431 + pthread_mutex_init(&h->b[i].ed_lock, NULL); 87.432 + pthread_cond_init(&h->b[i].ed_cond, NULL); 87.433 + pthread_create(&h->ed_B_thr[i], NULL, entr_B_spe_thread, &h->b[i]); 87.434 + } 87.435 + 87.436 + for(;;){ 87.437 + { 87.438 + pthread_mutex_lock(&h->lock[ENTROPY3B]); 87.439 + while (h->ed_B_cnt<=0) 87.440 + pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); 87.441 + s= &h->ed_B_q[h->ed_B_fo]; 87.442 + h->ed_B_fo++; h->ed_B_fo %= MAX_SLICE_COUNT; 87.443 + pthread_mutex_unlock(&h->lock[ENTROPY3B]); 87.444 + 87.445 + } 87.446 + if (s->state<0) 87.447 + break; 87.448 + 87.449 + if (s->ref_list[1][0]->slice_type_nos != FF_B_TYPE){ 87.450 + while (poc < s->ref_list[1][0]->poc){ 87.451 + pthread_mutex_lock(&h->lock[ENTROPY2]); 87.452 + while (poc == h->ed_poc) 87.453 + pthread_cond_wait(&h->cond[ENTROPY2], &h->lock[ENTROPY2]); 87.454 + poc = h->ed_poc; 87.455 + pthread_mutex_unlock(&h->lock[ENTROPY2]); 87.456 + } 87.457 + } 87.458 + { 87.459 + pthread_mutex_lock(&h->b[n].ed_lock); 87.460 + while (h->b[n].ed_cnt >= MAX_SLICE_COUNT) 87.461 + pthread_cond_wait(&h->b[n].ed_cond, &h->b[n].ed_lock); 87.462 + h->b[n].ed_q[ h->b[n].ed_fi] = *s; 87.463 + h->b[n].ed_cnt++; 87.464 + h->b[n].ed_fi++; h->b[n].ed_fi %= MAX_SLICE_COUNT; 87.465 + pthread_cond_signal(&h->b[n].ed_cond); 87.466 + pthread_mutex_unlock(&h->b[n].ed_lock); 87.467 + 87.468 + n++; n%=h->edb_threads; 87.469 + } 87.470 + { 87.471 + pthread_mutex_lock(&h->lock[ENTROPY3B]); 87.472 + h->ed_B_cnt--; 87.473 + pthread_cond_signal(&h->cond[ENTROPY3B]); 87.474 + pthread_mutex_unlock(&h->lock[ENTROPY3B]); 87.475 + 87.476 + } 87.477 + 87.478 + } 87.479 + 87.480 + for (i=0; i<h->edb_threads; i++){ 87.481 + pthread_mutex_lock(&h->b[i].ed_lock); 87.482 + while (h->b[i].ed_cnt >= 
MAX_SLICE_COUNT) 87.483 + pthread_cond_wait(&h->b[i].ed_cond, &h->b[i].ed_lock); 87.484 + h->b[i].ed_q[ h->b[i].ed_fi] = *s; 87.485 + h->b[i].ed_cnt++; 87.486 + h->b[i].ed_fi++; h->b[i].ed_fi %= MAX_SLICE_COUNT; 87.487 + pthread_cond_signal(&h->b[i].ed_cond); 87.488 + pthread_mutex_unlock(&h->b[i].ed_lock); 87.489 + 87.490 + } 87.491 + for(int i=0; i<h->edb_threads; i++){ 87.492 + pthread_join(h->ed_B_thr[i], NULL); 87.493 + } 87.494 + pthread_exit(NULL); 87.495 + return NULL; 87.496 +} 87.497 + 87.498 + 87.499 +static void *entr_IPB_distribute(void *arg){ 87.500 + H264Context *h = (H264Context *) arg; 87.501 + EDSlice *s; 87.502 + int i,n=0; 87.503 + 87.504 + create_spe_ED_threads(h, h->edip_threads, h->edb_threads); 87.505 + pthread_create(&h->ed_B_dist, NULL, entr_B_distribute, h); 87.506 + for(i=0; i<h->edip_threads + h->edip_ppe_threads; i++){ 87.507 + h->ip[i].h =h; 87.508 + h->ip[i].cell = (i >= h->edip_ppe_threads); 87.509 + pthread_mutex_init(&h->ip[i].mbs_lock, NULL); 87.510 + pthread_cond_init(&h->ip[i].mbs_cond, NULL); 87.511 + h->ip[i].thread_num = i - h->edip_ppe_threads; 87.512 + h->ip[i].thread_total=h->edip_threads+ h->edip_ppe_threads; 87.513 + h->ip[i].mbs_fo = 0; 87.514 + h->ip[i].mbs_cnt = SLICE_BUFS; 87.515 + h->ip[i].ed_fi =0; 87.516 + h->ip[i].ed_fo =0; 87.517 + pthread_mutex_init(&h->ip[i].ed_lock, NULL); 87.518 + pthread_cond_init(&h->ip[i].ed_cond, NULL); 87.519 + pthread_create(&h->ed_IP_thr[i], NULL, entr_IP_spe_thread, &h->ip[i]); 87.520 + } 87.521 + 87.522 + for(;;){ 87.523 + { 87.524 + pthread_mutex_lock(&h->lock[ENTROPY]); 87.525 + while (h->ed_cnt<=0) 87.526 + pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]); 87.527 + s= &h->ed_q[h->ed_fo]; 87.528 + 87.529 + pthread_mutex_unlock(&h->lock[ENTROPY]); 87.530 + h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT; 87.531 + } 87.532 + if (s->state<0) 87.533 + break; 87.534 + 87.535 + assert(s->current_picture); 87.536 + if (s->slice_type_nos == FF_B_TYPE ) 87.537 + { 87.538 + 
pthread_mutex_lock(&h->lock[ENTROPY3B]); 87.539 + while (h->ed_B_cnt>=MAX_SLICE_COUNT) 87.540 + pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); 87.541 + h->ed_B_q[h->ed_B_fi] = *s; 87.542 + h->ed_B_cnt++; 87.543 + h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT; 87.544 + pthread_cond_signal(&h->cond[ENTROPY3B]); 87.545 + pthread_mutex_unlock(&h->lock[ENTROPY3B]); 87.546 + }else 87.547 + { 87.548 + ///round robin now, change to based on rawframes size. 87.549 + pthread_mutex_lock(&h->ip[n].ed_lock); 87.550 + while (h->ip[n].ed_cnt >= MAX_SLICE_COUNT) 87.551 + pthread_cond_wait(&h->ip[n].ed_cond, &h->ip[n].ed_lock); 87.552 + h->ip[n].ed_q[ h->ip[n].ed_fi] = *s; 87.553 + h->ip[n].ed_cnt++; 87.554 + h->ip[n].ed_fi++; h->ip[n].ed_fi %= MAX_SLICE_COUNT; 87.555 + pthread_cond_signal(&h->ip[n].ed_cond); 87.556 + pthread_mutex_unlock(&h->ip[n].ed_lock); 87.557 + 87.558 + n++; n %=(h->edip_threads+h->edip_ppe_threads); 87.559 + } 87.560 + { 87.561 + pthread_mutex_lock(&h->lock[ENTROPY]); 87.562 + h->ed_cnt--; 87.563 + pthread_cond_signal(&h->cond[ENTROPY]); 87.564 + pthread_mutex_unlock(&h->lock[ENTROPY]); 87.565 + 87.566 + } 87.567 + } 87.568 + 87.569 + { 87.570 + pthread_mutex_lock(&h->lock[ENTROPY3B]); 87.571 + while (h->ed_B_cnt>=MAX_SLICE_COUNT) 87.572 + pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); 87.573 + h->ed_B_q[h->ed_B_fi] = *s; 87.574 + h->ed_B_cnt++; 87.575 + h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT; 87.576 + pthread_cond_signal(&h->cond[ENTROPY3B]); 87.577 + pthread_mutex_unlock(&h->lock[ENTROPY3B]); 87.578 + } 87.579 + { 87.580 + for (i=0; i<h->edip_threads + h->edip_ppe_threads; i++){ 87.581 + pthread_mutex_lock(&h->ip[i].ed_lock); 87.582 + while (h->ip[i].ed_cnt >= MAX_SLICE_COUNT) 87.583 + pthread_cond_wait(&h->ip[i].ed_cond, &h->ip[i].ed_lock); 87.584 + h->ip[i].ed_q[ h->ip[i].ed_fi] = *s; 87.585 + h->ip[i].ed_cnt++; 87.586 + h->ip[i].ed_fi++; h->ip[i].ed_fi %= MAX_SLICE_COUNT; 87.587 + pthread_cond_signal(&h->ip[i].ed_cond); 
87.588 + pthread_mutex_unlock(&h->ip[i].ed_lock); 87.589 + } 87.590 + } 87.591 + { 87.592 + pthread_mutex_lock(&h->lock[ENTROPY4]); 87.593 + while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) 87.594 + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); 87.595 + h->ed_reorder_q[h->ed_reorder_fi] = *s; 87.596 + h->ed_reorder_cnt++; 87.597 + h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; 87.598 + pthread_cond_signal(&h->cond[ENTROPY4]); 87.599 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 87.600 + 87.601 + } 87.602 + pthread_join(h->ed_B_dist, NULL); 87.603 + for(i=0; i<h->edip_threads; i++){ 87.604 + pthread_join(h->ed_IP_thr[i], NULL); 87.605 + } 87.606 + pthread_exit(NULL); 87.607 + return NULL; 87.608 +} 87.609 + 87.610 +static pthread_t ed_IPB_dist; 87.611 +static void *entropy_IPB_cell_thread(void *arg){ 87.612 + H264Context *h = (H264Context *) arg; 87.613 + int i; 87.614 + EDSlice reorder[MAX_SLICE_COUNT]; 87.615 + int ip_poc[MAX_SLICE_COUNT][2]={0,}; 87.616 + int next_ip_id=0; 87.617 + int ip_poc_cnt=0; 87.618 + EDSlice *s; 87.619 + int reorder_cnt=0; 87.620 + unsigned next_pic_num=0; 87.621 + 87.622 + pthread_create(&ed_IPB_dist, NULL, entr_IPB_distribute, h); 87.623 + int count =0; 87.624 + for(;;){ 87.625 + //signals received from the entropy decoders 87.626 + { 87.627 + pthread_mutex_lock(&h->lock[ENTROPY4]); 87.628 + while (h->ed_reorder_cnt<=0) 87.629 + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); 87.630 + s= &h->ed_reorder_q[h->ed_reorder_fo]; 87.631 + h->ed_reorder_fo++; h->ed_reorder_fo %=MAX_SLICE_COUNT; 87.632 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 87.633 + } 87.634 + 87.635 + if (s->state >=0 && s->slice_type_nos != FF_B_TYPE){ 87.636 + for (i=0; i<ip_poc_cnt; i++){ 87.637 + if (s->ip_id < ip_poc[i][0]){ 87.638 + memmove(ip_poc[i+1], ip_poc[i], 2*(ip_poc_cnt-i)*sizeof(int)); 87.639 + break; 87.640 + } 87.641 + } 87.642 + ip_poc[i][0]= s->ip_id; 87.643 + ip_poc[i][1]= s->current_picture->poc; 87.644 + ip_poc_cnt++; 87.645 
+ 87.646 + while (next_ip_id == ip_poc[0][0]){ 87.647 + pthread_mutex_lock(&h->lock[ENTROPY2]); 87.648 + h->ed_poc = ip_poc[0][1]; 87.649 + 87.650 + pthread_cond_signal(&h->cond[ENTROPY2]); 87.651 + pthread_mutex_unlock(&h->lock[ENTROPY2]); 87.652 + memmove(ip_poc[0], ip_poc[1], 2*(ip_poc_cnt-1)*sizeof(int)); 87.653 + ip_poc_cnt--; 87.654 + next_ip_id++; 87.655 + } 87.656 + } 87.657 + 87.658 + for(i=reorder_cnt; i>0; i--){ 87.659 + if (s->coded_pic_num < reorder[i-1].coded_pic_num) 87.660 + break; 87.661 + reorder[i]=reorder[i-1]; 87.662 + } 87.663 + reorder[i]=*s; 87.664 + 87.665 + while(reorder_cnt>=0){ 87.666 + if (next_pic_num!=reorder[reorder_cnt].coded_pic_num){ 87.667 + break; 87.668 + } 87.669 + EDSlice *es = &reorder[reorder_cnt]; 87.670 + 87.671 + { 87.672 + pthread_mutex_lock(&h->lock[MBDEC]); 87.673 + while (h->mbdec_cnt >= MAX_SLICE_COUNT) 87.674 + pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); 87.675 + copyEDtoMBSlice(&h->mbdec_q[h->mbdec_fi], es); 87.676 + 87.677 + h->mbdec_cnt++; 87.678 + h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; 87.679 + pthread_cond_signal(&h->cond[MBDEC]); 87.680 + pthread_mutex_unlock(&h->lock[MBDEC]); 87.681 + 87.682 + } 87.683 + 87.684 + if (es->state<0) 87.685 + goto end; 87.686 + 87.687 + assert(es->current_picture); 87.688 + for (int i=0; i<es->release_cnt; i++){ 87.689 + release_pib_entry(h, es->release_ref[i], 2); 87.690 + } 87.691 + release_pib_entry(h, es->current_picture, 1); 87.692 + av_freep(&es->gb.raw); 87.693 + if (es->gb.rbsp) 87.694 + av_freep(&es->gb.rbsp); 87.695 + 87.696 + next_pic_num++; 87.697 + reorder_cnt--; 87.698 + } 87.699 + reorder_cnt++; 87.700 + 87.701 + { 87.702 + pthread_mutex_lock(&h->lock[ENTROPY4]); 87.703 + h->ed_reorder_cnt--; 87.704 + pthread_cond_signal(&h->cond[ENTROPY4]); 87.705 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 87.706 + } 87.707 + } 87.708 + 87.709 +end: 87.710 + pthread_join(ed_IPB_dist, NULL); 87.711 + pthread_exit(NULL); 87.712 + return NULL; 87.713 +} 87.714 + 
87.715 + 87.716 +static void fill_spe_slice(H264slice *dst, const MBSlice *src, H264Context *h){ 87.717 + dst->deblocking_filter =1; 87.718 + dst->linesize = src->current_picture->linesize[0]; 87.719 + dst->uvlinesize = src->current_picture->linesize[1]; 87.720 + dst->mb_width = h->mb_width; 87.721 + dst->mb_height = h->mb_height; 87.722 + dst->use_weight = src->use_weight; 87.723 + dst->use_weight_chroma = src->use_weight_chroma; 87.724 + dst->luma_log2_weight_denom = src->luma_log2_weight_denom; 87.725 + dst->chroma_log2_weight_denom = src->chroma_log2_weight_denom; 87.726 + 87.727 + //weights later 87.728 + memcpy(dst->luma_weight, src->luma_weight, 16*2*2*sizeof(int16_t)); 87.729 + memcpy(dst->chroma_weight, src->chroma_weight, 16*2*2*2*sizeof(int16_t)); 87.730 + memcpy(dst->implicit_weight, src->implicit_weight, 16*16*2*sizeof(int16_t)); 87.731 + 87.732 + for(int list=0; list<2; list++){ 87.733 + for (int i=0; i<src->ref_count[list]; i++){ 87.734 + Picture_spu *p_dst = &dst->ref_list[list][i]; 87.735 + DecodedPicture *p_src = src->ref_list[list][i]; 87.736 + if (p_src){ 87.737 + p_dst->data[0] = p_src->data[0]; 87.738 + p_dst->data[1] = p_src->data[1]; 87.739 + p_dst->data[2] = p_src->data[2]; 87.740 + } 87.741 + } 87.742 + } 87.743 + dst->state = src->state; 87.744 + 87.745 + dst->emu_edge_width =32; 87.746 + dst->emu_edge_height =32; 87.747 + dst->slice_type = src->slice_type; 87.748 + dst->slice_type_nos = src->slice_type_nos; 87.749 + dst->slice_alpha_c0_offset = src->slice_alpha_c0_offset; 87.750 + dst->slice_beta_offset = src->slice_beta_offset; 87.751 + 87.752 + memcpy(dst->chroma_qp_table, src->pps.chroma_qp_table, 2*64); 87.753 + 87.754 + dst->blocks = src->mbs; 87.755 + dst->dst_y = src->current_picture->data[0]; 87.756 + dst->dst_cb = src->current_picture->data[1]; 87.757 + dst->dst_cr = src->current_picture->data[2]; 87.758 +} 87.759 + 87.760 +static void decode_slice_mb_seq_cell(H264Context *h, MBRecContext *d, MBSlice *s, DecodedPicture *tmp){ 
87.761 + static int rl_fi=0; 87.762 + 87.763 + DECLARE_ALIGNED(16, H264slice, spe_slice); 87.764 + H264spe *p=&spe_params[0]; 87.765 + unsigned status; 87.766 + uint8_t *dst_y, *dst_cb, *dst_cr; 87.767 + 87.768 + DecodedPicture *dp; 87.769 + 87.770 + for (int i=0; i<2; i++){ 87.771 + for(int j=0; j< s->ref_count[i]; j++){ 87.772 + if (s->ref_list_cpn[i][j] ==-1) 87.773 + continue; 87.774 + int k; 87.775 + for (k=0; k<DPB_SIZE; k++){ 87.776 + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ 87.777 + s->ref_list[i][j] = &h->dpb[k]; 87.778 + break; 87.779 + } 87.780 + } 87.781 + } 87.782 + } 87.783 + 87.784 + dp = get_dpb_entry(h); 87.785 + init_dpb_entry(dp, s, d->width, d->height); 87.786 + 87.787 + if (h->no_mbd) 87.788 + return; 87.789 + 87.790 + 87.791 + fill_spe_slice(&spe_slice, s, h); 87.792 + spe_mfcio_get(spe_context[0], (unsigned) (spe_slice_buf[0] + rl_fi), &spe_slice, sizeof(H264slice), 15, 0, 0); 87.793 + spe_mfcio_tag_status_read(spe_context[0], 1<<15, SPE_TAG_ALL, &status); 87.794 + rl_fi++; rl_fi %= 2; 87.795 + 87.796 + _spe_in_mbox_write(spe_control_area[0], 0); 87.797 + while (atomic_read(rl_cnt)<=0){ 87.798 + //pthread_yield(); 87.799 + usleep(1000); 87.800 + } 87.801 + atomic_dec(rl_cnt); 87.802 + 87.803 + 87.804 +/** This is error free, no visual artifacts, however, md5sum fails.... 
(WTF) **/ 87.805 +// memcpy(tmp->data[0], s->current_picture->data[0], tmp->linesize[0]*h->mb_height*16); 87.806 +// memcpy(tmp->data[1], s->current_picture->data[1], tmp->linesize[1]*h->mb_height*8); 87.807 +// memcpy(tmp->data[2], s->current_picture->data[2], tmp->linesize[1]*h->mb_height*8); 87.808 +// 87.809 +// memset(s->current_picture->data[0], 0, tmp->linesize[0]*h->mb_height*16); 87.810 +// memset(s->current_picture->data[1], 0, tmp->linesize[1]*h->mb_height*8); 87.811 +// memset(s->current_picture->data[2], 0, tmp->linesize[1]*h->mb_height*8); 87.812 +// 87.813 +// decode_slice_mb_seq(d, s); 87.814 +// 87.815 +// for (int i=0; i<h->mb_height*16; i++){ 87.816 +// for (int j=0; j<h->width; j++){ 87.817 +// if (tmp->data[0][j + i*tmp->linesize[0]] != s->current_picture->data[0][j + i*tmp->linesize[0]]){ 87.818 +// printf("%d, %d, %d, %d\n", j, i, tmp->data[0][j + i*tmp->linesize[0]], s->current_picture->data[0][j + i*tmp->linesize[0]]); 87.819 +// return; 87.820 +// } 87.821 +// } 87.822 +// } 87.823 +// 87.824 +// for (int i=0; i<h->mb_height*8; i++){ 87.825 +// for (int j=0; j<h->width/2; j++){ 87.826 +// if (tmp->data[1][j + i*tmp->linesize[1]] != s->current_picture->data[1][j + i*tmp->linesize[1]]){ 87.827 +// printf("%d, %d, %d, %d\n", j, i, tmp->data[1][j + i*tmp->linesize[1]], s->current_picture->data[1][j + i*tmp->linesize[1]]); 87.828 +// return; 87.829 +// } 87.830 +// } 87.831 +// } 87.832 +// 87.833 +// for (int i=0; i<h->mb_height*8; i++){ 87.834 +// for (int j=0; j<h->width/2; j++){ 87.835 +// if (tmp->data[2][j + i*tmp->linesize[1]] != s->current_picture->data[2][j + i*tmp->linesize[1]]){ 87.836 +// printf("%d, %d, %d, %d\n", j, i, tmp->data[2][j + i*tmp->linesize[1]], s->current_picture->data[2][j + i*tmp->linesize[1]]); 87.837 +// return; 87.838 +// } 87.839 +// } 87.840 +// } 87.841 + 87.842 + 87.843 + //printf("dst_y %p\n", dst_y); 87.844 + 87.845 + 87.846 + for (int i=0; i<s->release_cnt; i++){ 87.847 + for(int j=0; j<DPB_SIZE; j++){ 
87.848 + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ 87.849 + release_dpb_entry(h, &h->dpb[j], 2); 87.850 + break; 87.851 + } 87.852 + } 87.853 + } 87.854 + s->release_cnt=0; 87.855 + 87.856 +} 87.857 + 87.858 +static void *h264_spe_thread(void * thread_args ) { 87.859 + H264spe *params = (H264spe *)thread_args; 87.860 + unsigned int spe_id = params->spe_id; 87.861 + unsigned int runflags = 0; 87.862 + unsigned int entry = SPE_DEFAULT_ENTRY; 87.863 + // run SPE context 87.864 + spe_context_run(spe_context[spe_id], &entry, runflags, (void*) params, NULL, NULL); 87.865 + // done - now exit thread 87.866 + pthread_exit(NULL); 87.867 +} 87.868 + 87.869 +static int create_spe_MBR_threads(H264Context *h, int num_threads) { 87.870 + int i; 87.871 + 87.872 + // reserve memory for spe thread id, context and argument addresses 87.873 + spe_tid = av_malloc(num_threads * sizeof (pthread_t)); 87.874 + spe_context = av_malloc(num_threads * sizeof (spe_context_ptr_t)); 87.875 + spe_params = av_malloc(num_threads * sizeof (H264spe)); 87.876 + spe_control_area = av_malloc(num_threads * sizeof (void*)); 87.877 + spe_ls_area = av_malloc(num_threads * sizeof (void*)); 87.878 + spe_slice_buf = av_malloc(num_threads * sizeof (void*)); 87.879 + 87.880 + spe_program_handle_t *spe_program = spe_image_open("spe_mbd"); 87.881 + 87.882 + if (spe_program == NULL) 87.883 + av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno)); 87.884 + 87.885 + for (i = 0; i < num_threads; i++) { 87.886 + // create context for spe program 87.887 + spe_context[i] = spe_context_create(SPE_MAP_PS, NULL); 87.888 + if (spe_context[i] == NULL) 87.889 + av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno)); 87.890 + // load SPE program into main memory 87.891 + if ((spe_program_load(spe_context[i], spe_program)) == -1) 87.892 + av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. 
error=%s \n", errno, strerror(errno)); 87.893 + //get the control_area for fast mailboxing 87.894 + if ((spe_control_area[i] = spe_ps_area_get(spe_context[i], SPE_CONTROL_AREA)) == NULL) 87.895 + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno)); 87.896 + //get ls area for inter spe communication 87.897 + if ((spe_ls_area[i] = spe_ls_area_get(spe_context[i])) == NULL) 87.898 + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno)); 87.899 + } 87.900 + 87.901 + for (i = 0; i < num_threads; i++) { 87.902 + spe_params[i].mb_width = h->mb_width; 87.903 + spe_params[i].mb_height = h->mb_height; 87.904 + spe_params[i].mb_stride = h->mb_stride; 87.905 + spe_params[i].spe_id = i; 87.906 + spe_params[i].spe_total = num_threads; 87.907 + //spe_params[i].slice_params= &slice_params; 87.908 + spe_params[i].src_spe = spe_ls_area[(i-1+num_threads)%num_threads]; 87.909 + spe_params[i].tgt_spe = spe_ls_area[(i+1)%num_threads]; 87.910 + 87.911 + spe_params[i].rl_lock = rl_lock; 87.912 + spe_params[i].rl_cond = rl_cond; 87.913 + spe_params[i].rl_cnt = rl_cnt; 87.914 + spe_params[i].lock = (mutex_ea_t) (unsigned) &mutex_var[i]; 87.915 + spe_params[i].cond = (cond_ea_t) (unsigned) &cond_var[i]; 87.916 + spe_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_var[i]; atomic_set(spe_params[i].cnt, 0); 87.917 + 87.918 + mutex_init(spe_params[i].lock); 87.919 + cond_init(spe_params[i].cond); 87.920 + if (pthread_create(&spe_tid[i], NULL, h264_spe_thread, (void *) &spe_params[i])) 87.921 + av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); 87.922 + 87.923 + //slicebufaddr 87.924 + spe_slice_buf[i] = (H264slice *) _spe_out_mbox_read(spe_control_area[i]); 87.925 + 87.926 + av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); 87.927 + } 87.928 + spe_image_close(spe_program); 87.929 + return 0; 87.930 +} 87.931 + 87.932 
+//_spe_out_mbox_read(spe_control_area[i]); 87.933 +/** 87.934 +* joins all the spe worker threads. 87.935 +*/ 87.936 +static void join_spe_worker_threads(H264slice *s, int num_threads, int *rl_fi) { 87.937 + int i; 87.938 + ///just to keep coding consistency. 87.939 + { 87.940 + for (i=0; i<num_threads; i++){ 87.941 + H264spe *p=&spe_params[i]; 87.942 + unsigned status; 87.943 + 87.944 + while (atomic_read(p->cnt)>=2) {//double buffered 87.945 + usleep(1000);//cond_wait(p->cond, p->lock); 87.946 + } 87.947 + 87.948 + spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), s, sizeof(H264slice), 15, 0, 0); 87.949 + spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status); 87.950 + //mutex_unlock(p->lock); 87.951 + _spe_in_mbox_write(spe_control_area[i], 0); 87.952 + } 87.953 + } 87.954 + 87.955 + for (i=0; i<num_threads; i++){ 87.956 + pthread_join(spe_tid[i], NULL); 87.957 + } 87.958 + 87.959 + for (i=0; i<num_threads; i++){ 87.960 + spe_context_destroy(spe_context[i]); 87.961 + } 87.962 + atomic_inc(rl_cnt); 87.963 + 87.964 + // destroy memory reserved for spe thread id, context and argument addresses 87.965 + av_freep(&spe_tid); 87.966 + av_freep(&spe_context); 87.967 + av_freep(&spe_params); 87.968 + av_freep(&spe_control_area); 87.969 + av_freep(&spe_slice_buf); 87.970 +} 87.971 + 87.972 + 87.973 +static void *rl_dist_thread(void *arg){ 87.974 + int i; 87.975 + H264Context *h = (H264Context *) arg; 87.976 + MBSlice *s; 87.977 + DecodedPicture *dp; 87.978 + int rl_fi[16]={0,}; 87.979 + DECLARE_ALIGNED(16, H264slice, spe_slice); 87.980 + 87.981 + create_spe_MBR_threads(h, h->rl_threads); 87.982 + for(;;){ 87.983 + { 87.984 + pthread_mutex_lock(&h->lock[MBDEC]); 87.985 + while (h->mbdec_cnt<=0) 87.986 + pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); 87.987 + s= &h->mbdec_q[h->mbdec_fo]; 87.988 + h->mbdec_fo++; h->mbdec_fo %= MAX_SLICE_COUNT; 87.989 + pthread_mutex_unlock(&h->lock[MBDEC]); 87.990 + } 87.991 + 87.992 + if 
(s->state<0){ 87.993 + break; 87.994 + } 87.995 + for (int i=0; i<2; i++){ 87.996 + for(int j=0; j< s->ref_count[i]; j++){ 87.997 + if (s->ref_list_cpn[i][j] ==-1) 87.998 + continue; 87.999 + int k; 87.1000 + for (k=0; k<DPB_SIZE; k++){ 87.1001 + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ 87.1002 + s->ref_list[i][j] = &h->dpb[k]; 87.1003 + break; 87.1004 + } 87.1005 + } 87.1006 + 87.1007 + } 87.1008 + } 87.1009 + dp = get_dpb_entry(h); 87.1010 + init_dpb_entry(dp, s, h->width, h->height); 87.1011 + assert(s->current_picture); 87.1012 + { 87.1013 + while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){ 87.1014 + usleep(1000); 87.1015 + } 87.1016 + h->mbrel_q[h->mbrel_fi] = *s; 87.1017 + 87.1018 + h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT; 87.1019 + } 87.1020 + { 87.1021 + if(h->no_mbd){ 87.1022 + atomic_inc(rl_cnt); 87.1023 + }else { 87.1024 + fill_spe_slice(&spe_slice, s, h); 87.1025 + for (i=0; i<h->rl_threads; i++){ 87.1026 + H264spe *p=&spe_params[i]; 87.1027 + unsigned status; 87.1028 + while (atomic_read(p->cnt)>=2){ //double buffered 87.1029 + usleep(1000); 87.1030 + //cond_wait(p->cond, p->lock); 87.1031 + } 87.1032 + spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), &spe_slice, sizeof(H264slice), 15, 0, 0); 87.1033 + spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status); 87.1034 + rl_fi[i]++; rl_fi[i] %= 2; 87.1035 + atomic_inc(p->cnt); 87.1036 + 87.1037 + _spe_in_mbox_write(spe_control_area[i], 0); 87.1038 + } 87.1039 + } 87.1040 + } 87.1041 + 87.1042 + { 87.1043 + pthread_mutex_lock(&h->lock[MBDEC]); 87.1044 + h->mbdec_cnt--; 87.1045 + pthread_cond_signal(&h->cond[MBDEC]); 87.1046 + pthread_mutex_unlock(&h->lock[MBDEC]); 87.1047 + } 87.1048 + 87.1049 + } 87.1050 + 87.1051 + { 87.1052 + while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){ 87.1053 + usleep(1000); 87.1054 + } 87.1055 + h->mbrel_q[h->mbrel_fi] = *s; 87.1056 + 87.1057 + h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT; 87.1058 + } 87.1059 
+ spe_slice.state=-1; 87.1060 + join_spe_worker_threads(&spe_slice, h->rl_threads, rl_fi); 87.1061 + pthread_exit(NULL); 87.1062 + return NULL; 87.1063 +} 87.1064 + 87.1065 +static void *mbdec_cell_thread(void *arg){ 87.1066 + H264Context *h = (H264Context *) arg; 87.1067 + 87.1068 + rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var; 87.1069 + rl_cond = (cond_ea_t) (unsigned) &rl_cond_var; 87.1070 + rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var; 87.1071 + atomic_set(rl_cnt, 0); 87.1072 + mutex_init(rl_lock); 87.1073 + cond_init(rl_cond); 87.1074 +// printf("mbdec, pid %d\n", syscall(SYS_gettid)); 87.1075 + pthread_create(&h->rl_dist_thr, NULL, rl_dist_thread, h); 87.1076 + 87.1077 + for(;;){ 87.1078 + MBSlice *s=NULL; 87.1079 + { 87.1080 + while (atomic_read(rl_cnt)<=0){ 87.1081 + usleep(1000); 87.1082 + } 87.1083 + s= &h->mbrel_q[h->mbrel_fo]; 87.1084 + h->mbrel_fo++; h->mbrel_fo %= MAX_SLICE_COUNT; 87.1085 + } 87.1086 + 87.1087 + if (s->state<0) 87.1088 + break; 87.1089 + 87.1090 + for (int i=0; i<s->release_cnt; i++){ 87.1091 + for(int j=0; j<DPB_SIZE; j++){ 87.1092 + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ 87.1093 + release_dpb_entry(h, &h->dpb[j], 2); 87.1094 + break; 87.1095 + } 87.1096 + } 87.1097 + } 87.1098 + 87.1099 + { 87.1100 + EDThreadContext *ed = s->ed; 87.1101 + pthread_mutex_lock(&ed->mbs_lock); 87.1102 + ed->mbs_cnt++; 87.1103 + pthread_cond_signal(&ed->mbs_cond); 87.1104 + pthread_mutex_unlock(&ed->mbs_lock); 87.1105 + } 87.1106 + 87.1107 + { 87.1108 + pthread_mutex_lock(&h->lock[WRITE]); 87.1109 + while (h->write_cnt>= DPB_SIZE) 87.1110 + pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); 87.1111 + assert(s); 87.1112 + assert(s->current_picture); 87.1113 + h->write_q[h->write_fi]= s->current_picture; 87.1114 + h->write_cnt++; 87.1115 + h->write_fi++; h->write_fi %= DPB_SIZE; 87.1116 + pthread_cond_signal(&h->cond[WRITE]); 87.1117 + pthread_mutex_unlock(&h->lock[WRITE]); 87.1118 + 87.1119 + } 87.1120 + { 87.1121 + atomic_dec(rl_cnt); 87.1122 + 
} 87.1123 + 87.1124 + } 87.1125 + 87.1126 + {//propagate exit 87.1127 + pthread_mutex_lock(&h->lock[WRITE]); 87.1128 + while (h->write_cnt>= DPB_SIZE) 87.1129 + pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); 87.1130 + last_pic.reference = -1; 87.1131 + h->write_q[h->write_fi] = &last_pic; 87.1132 + h->write_cnt++; 87.1133 + h->write_fi++; h->write_fi %= DPB_SIZE; 87.1134 + pthread_cond_signal(&h->cond[WRITE]); 87.1135 + pthread_mutex_unlock(&h->lock[WRITE]); 87.1136 + 87.1137 + } 87.1138 + pthread_join(h->rl_dist_thr, NULL); 87.1139 + pthread_exit(NULL); 87.1140 + return NULL; 87.1141 +} 87.1142 + 87.1143 +/* 87.1144 +* The following code is the main loop of the file converter 87.1145 +*/ 87.1146 +int h264_decode_cell(H264Context *h) { 87.1147 + 87.1148 + pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; 87.1149 + 87.1150 + start_timer(); 87.1151 + 87.1152 + pthread_create(&read_thr, NULL, read_thread, h); 87.1153 + pthread_create(&parsenal_thr, NULL, parsenal_thread, h); 87.1154 + pthread_create(&entropy_thr, NULL, entropy_IPB_cell_thread, h); 87.1155 + pthread_create(&mbdec_thr, NULL, mbdec_cell_thread, h); 87.1156 + pthread_create(&write_thr, NULL, write_thread, h); 87.1157 + 87.1158 + pthread_join(read_thr, NULL); 87.1159 + pthread_join(parsenal_thr, NULL); 87.1160 + pthread_join(entropy_thr, NULL); 87.1161 + pthread_join(mbdec_thr, NULL); 87.1162 + pthread_join(write_thr, NULL); 87.1163 + 87.1164 + return 0; 87.1165 +} 87.1166 + 87.1167 +/* 87.1168 +* The following code is the main loop of the file converter 87.1169 +*/ 87.1170 +int h264_decode_cell_seq(H264Context *h) { 87.1171 +ParserContext *pc; 87.1172 + NalContext *nc; 87.1173 + EntropyContext *ec; 87.1174 + MBRecContext *rc; 87.1175 + OutputContext *oc; 87.1176 + 87.1177 + RawFrame frm; 87.1178 + EDSlice slice, *s=&slice; 87.1179 + MBSlice mbslice, *s2=&mbslice; 87.1180 + PictureInfo *pic=NULL; 87.1181 + DecodedPicture *out; 87.1182 + int size; 87.1183 + int frames=0; 87.1184 + 
87.1185 + pc = get_parse_context(h->ifile); 87.1186 + nc = get_nal_context(h->width, h->height); 87.1187 + ec = get_entropy_context( h ); 87.1188 + rc = get_mbrec_context(h); 87.1189 + oc = get_output_context( h ); 87.1190 + 87.1191 + rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var; 87.1192 + rl_cond = (cond_ea_t) (unsigned) &rl_cond_var; 87.1193 + rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var; 87.1194 + atomic_set(rl_cnt, 0); 87.1195 + mutex_init(rl_lock); 87.1196 + cond_init(rl_cond); 87.1197 + 87.1198 + memset(s, 0, sizeof(EDSlice)); 87.1199 + ff_init_slice(nc, s); 87.1200 + s->mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb)); 87.1201 + 87.1202 + DecodedPicture tmp; 87.1203 + tmp.base[0]=0; 87.1204 + ///fix this when want to debug the Cell errors 87.1205 + //init_dpb_entry(&tmp, h->width, h->height); 87.1206 + 87.1207 + create_spe_ED_threads(h, 1, 0); 87.1208 + create_spe_MBR_threads(h, 1); 87.1209 + 87.1210 + start_timer(); 87.1211 + 87.1212 + while(!pc->final_frame && frames++ < h->num_frames){ 87.1213 + 87.1214 + av_read_frame_internal(pc, &frm); 87.1215 + 87.1216 + PictureInfo *pic=get_pib_entry(h); 87.1217 + ff_alloc_picture_info(nc, s, pic); 87.1218 + decode_nal_units(nc, s, &frm); 87.1219 + 87.1220 + copyEDtoMBSlice(s2, s); 87.1221 + decode_slice_entropy_cell_seq(h, ec, s); 87.1222 + 87.1223 + decode_slice_mb_seq_cell(h, rc, s2, &tmp); 87.1224 + 87.1225 + out =output_frame(h, oc, s2->current_picture, h->ofile, h->frame_width, h->frame_height); 87.1226 + 87.1227 + if (out){ 87.1228 + release_dpb_entry(h, out, 1); 87.1229 + } 87.1230 + print_report(oc->frame_number, oc->video_size, 0, h->verbose); 87.1231 + } 87.1232 + while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; 87.1233 + 87.1234 + print_report(oc->frame_number, oc->video_size, 1, h->verbose); 87.1235 + 87.1236 + /* finished ! 
*/ 87.1237 + av_freep(&s->mbs); 87.1238 + 87.1239 + free_parse_context(pc); 87.1240 + free_nal_context (nc); 87.1241 + free_entropy_context(ec); 87.1242 + free_mbrec_context(rc); 87.1243 + free_output_context(oc); 87.1244 + return 0; 87.1245 +}
88.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 88.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_data.h Mon Aug 27 12:09:56 2012 +0200 88.3 @@ -0,0 +1,243 @@ 88.4 +/* 88.5 + * H26L/H264/AVC/JVT/14496-10/... encoder/decoder 88.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 88.7 + * 88.8 + * This file is part of FFmpeg. 88.9 + * 88.10 + * FFmpeg is free software; you can redistribute it and/or 88.11 + * modify it under the terms of the GNU Lesser General Public 88.12 + * License as published by the Free Software Foundation; either 88.13 + * version 2.1 of the License, or (at your option) any later version. 88.14 + * 88.15 + * FFmpeg is distributed in the hope that it will be useful, 88.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 88.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 88.18 + * Lesser General Public License for more details. 88.19 + * 88.20 + * You should have received a copy of the GNU Lesser General Public 88.21 + * License along with FFmpeg; if not, write to the Free Software 88.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 88.23 + */ 88.24 + 88.25 +/** 88.26 + * @file 88.27 + * @brief 88.28 + * H264 / AVC / MPEG4 part10 codec data table 88.29 + * @author Michael Niedermayer <michaelni@gmx.at> 88.30 + */ 88.31 + 88.32 +#ifndef AVCODEC_H264DATA_H 88.33 +#define AVCODEC_H264DATA_H 88.34 + 88.35 +#include <stdint.h> 88.36 +#include "avcodec.h" 88.37 +//#include "h264.h" 88.38 + 88.39 +/* 88.40 +o-o o-o 88.41 + / / / 88.42 +o-o o-o 88.43 + ,---' 88.44 +o-o o-o 88.45 + / / / 88.46 +o-o o-o 88.47 +*/ 88.48 +//This table must be here because scan8[constant] must be known at compiletime 88.49 +static const uint8_t scan8[16 + 2*4]={ 88.50 + 4+1*8, 5+1*8, 4+2*8, 5+2*8, 88.51 + 6+1*8, 7+1*8, 6+2*8, 7+2*8, 88.52 + 4+3*8, 5+3*8, 4+4*8, 5+4*8, 88.53 + 6+3*8, 7+3*8, 6+4*8, 7+4*8, 88.54 + 1+1*8, 2+1*8, 88.55 + 1+2*8, 2+2*8, 88.56 + 1+4*8, 2+4*8, 88.57 + 1+5*8, 2+5*8, 
88.58 +}; 88.59 + 88.60 +static const uint8_t golomb_to_pict_type[5]= 88.61 +{FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE}; 88.62 + 88.63 +static const uint8_t golomb_to_intra4x4_cbp[48]={ 88.64 + 47, 31, 15, 0, 23, 27, 29, 30, 7, 11, 13, 14, 39, 43, 45, 46, 88.65 + 16, 3, 5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44, 1, 2, 4, 88.66 + 8, 17, 18, 20, 24, 6, 9, 22, 25, 32, 33, 34, 36, 40, 38, 41 88.67 +}; 88.68 + 88.69 +static const uint8_t golomb_to_inter_cbp[48]={ 88.70 + 0, 16, 1, 2, 4, 8, 32, 3, 5, 10, 12, 15, 47, 7, 11, 13, 88.71 + 14, 6, 9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46, 88.72 + 17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41 88.73 +}; 88.74 + 88.75 +static const uint8_t zigzag_scan[16]={ 88.76 + 0+0*4, 1+0*4, 0+1*4, 0+2*4, 88.77 + 1+1*4, 2+0*4, 3+0*4, 2+1*4, 88.78 + 1+2*4, 0+3*4, 1+3*4, 2+2*4, 88.79 + 3+1*4, 3+2*4, 2+3*4, 3+3*4, 88.80 +}; 88.81 + 88.82 +static const uint8_t field_scan[16]={ 88.83 + 0+0*4, 0+1*4, 1+0*4, 0+2*4, 88.84 + 0+3*4, 1+1*4, 1+2*4, 1+3*4, 88.85 + 2+0*4, 2+1*4, 2+2*4, 2+3*4, 88.86 + 3+0*4, 3+1*4, 3+2*4, 3+3*4, 88.87 +}; 88.88 + 88.89 +static const uint8_t luma_dc_zigzag_scan[16]={ 88.90 + 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64, 88.91 + 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64, 88.92 + 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64, 88.93 + 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64, 88.94 +}; 88.95 + 88.96 +static const uint8_t luma_dc_field_scan[16]={ 88.97 + 0*16 + 0*64, 2*16 + 0*64, 1*16 + 0*64, 0*16 + 2*64, 88.98 + 2*16 + 2*64, 3*16 + 0*64, 1*16 + 2*64, 3*16 + 2*64, 88.99 + 0*16 + 1*64, 2*16 + 1*64, 0*16 + 3*64, 2*16 + 3*64, 88.100 + 1*16 + 1*64, 3*16 + 1*64, 1*16 + 3*64, 3*16 + 3*64, 88.101 +}; 88.102 + 88.103 +static const uint8_t chroma_dc_scan[4]={ 88.104 + (0+0*2)*16, (1+0*2)*16, 88.105 + (0+1*2)*16, (1+1*2)*16, //FIXME 88.106 +}; 88.107 + 88.108 + 88.109 +static const uint8_t field_scan8x8[64]={ 88.110 + 0+0*8, 0+1*8, 0+2*8, 1+0*8, 88.111 + 1+1*8, 
0+3*8, 0+4*8, 1+2*8, 88.112 + 2+0*8, 1+3*8, 0+5*8, 0+6*8, 88.113 + 0+7*8, 1+4*8, 2+1*8, 3+0*8, 88.114 + 2+2*8, 1+5*8, 1+6*8, 1+7*8, 88.115 + 2+3*8, 3+1*8, 4+0*8, 3+2*8, 88.116 + 2+4*8, 2+5*8, 2+6*8, 2+7*8, 88.117 + 3+3*8, 4+1*8, 5+0*8, 4+2*8, 88.118 + 3+4*8, 3+5*8, 3+6*8, 3+7*8, 88.119 + 4+3*8, 5+1*8, 6+0*8, 5+2*8, 88.120 + 4+4*8, 4+5*8, 4+6*8, 4+7*8, 88.121 + 5+3*8, 6+1*8, 6+2*8, 5+4*8, 88.122 + 5+5*8, 5+6*8, 5+7*8, 6+3*8, 88.123 + 7+0*8, 7+1*8, 6+4*8, 6+5*8, 88.124 + 6+6*8, 6+7*8, 7+2*8, 7+3*8, 88.125 + 7+4*8, 7+5*8, 7+6*8, 7+7*8, 88.126 +}; 88.127 + 88.128 +typedef struct IMbInfo{ 88.129 + uint16_t type; 88.130 + uint8_t pred_mode; 88.131 + uint8_t cbp; 88.132 +} IMbInfo; 88.133 + 88.134 +static const IMbInfo i_mb_type_info[26]={ 88.135 +{MB_TYPE_INTRA4x4 , -1, -1}, 88.136 +{MB_TYPE_INTRA16x16, 2, 0}, 88.137 +{MB_TYPE_INTRA16x16, 1, 0}, 88.138 +{MB_TYPE_INTRA16x16, 0, 0}, 88.139 +{MB_TYPE_INTRA16x16, 3, 0}, 88.140 +{MB_TYPE_INTRA16x16, 2, 16}, 88.141 +{MB_TYPE_INTRA16x16, 1, 16}, 88.142 +{MB_TYPE_INTRA16x16, 0, 16}, 88.143 +{MB_TYPE_INTRA16x16, 3, 16}, 88.144 +{MB_TYPE_INTRA16x16, 2, 32}, 88.145 +{MB_TYPE_INTRA16x16, 1, 32}, 88.146 +{MB_TYPE_INTRA16x16, 0, 32}, 88.147 +{MB_TYPE_INTRA16x16, 3, 32}, 88.148 +{MB_TYPE_INTRA16x16, 2, 15+0}, 88.149 +{MB_TYPE_INTRA16x16, 1, 15+0}, 88.150 +{MB_TYPE_INTRA16x16, 0, 15+0}, 88.151 +{MB_TYPE_INTRA16x16, 3, 15+0}, 88.152 +{MB_TYPE_INTRA16x16, 2, 15+16}, 88.153 +{MB_TYPE_INTRA16x16, 1, 15+16}, 88.154 +{MB_TYPE_INTRA16x16, 0, 15+16}, 88.155 +{MB_TYPE_INTRA16x16, 3, 15+16}, 88.156 +{MB_TYPE_INTRA16x16, 2, 15+32}, 88.157 +{MB_TYPE_INTRA16x16, 1, 15+32}, 88.158 +{MB_TYPE_INTRA16x16, 0, 15+32}, 88.159 +{MB_TYPE_INTRA16x16, 3, 15+32}, 88.160 +{MB_TYPE_INTRA_PCM , -1, -1}, 88.161 +}; 88.162 + 88.163 +typedef struct PMbInfo{ 88.164 + uint16_t type; 88.165 + uint8_t partition_count; 88.166 +} PMbInfo; 88.167 + 88.168 +static const PMbInfo p_mb_type_info[5]={ 88.169 +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, 88.170 +{MB_TYPE_16x8 
|MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, 88.171 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2}, 88.172 +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 4}, 88.173 +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4}, 88.174 +}; 88.175 + 88.176 +static const PMbInfo p_sub_mb_type_info[4]={ 88.177 +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1}, 88.178 +{MB_TYPE_16x8 |MB_TYPE_P0L0 , 2}, 88.179 +{MB_TYPE_8x16 |MB_TYPE_P0L0 , 2}, 88.180 +{MB_TYPE_8x8 |MB_TYPE_P0L0 , 4}, 88.181 +}; 88.182 + 88.183 +static const PMbInfo b_mb_type_info[23]={ 88.184 +{MB_TYPE_DIRECT2|MB_TYPE_L0L1 , 1, }, 88.185 +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, 88.186 +{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, 88.187 +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, 88.188 +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, 88.189 +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, 88.190 +{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 88.191 +{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 88.192 +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, 88.193 +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L1, 2, }, 88.194 +{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, 88.195 +{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, 88.196 +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 88.197 +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 88.198 +{MB_TYPE_16x8 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 88.199 +{MB_TYPE_8x16 |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 88.200 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, 88.201 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0 , 2, }, 88.202 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 88.203 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 88.204 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 88.205 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 88.206 +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, 88.207 +}; 88.208 + 88.209 
+static const PMbInfo b_sub_mb_type_info[13]={ 88.210 +{MB_TYPE_DIRECT2 , 1, }, 88.211 +{MB_TYPE_16x16|MB_TYPE_P0L0 , 1, }, 88.212 +{MB_TYPE_16x16 |MB_TYPE_P0L1 , 1, }, 88.213 +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1 , 1, }, 88.214 +{MB_TYPE_16x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, 88.215 +{MB_TYPE_8x16 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 2, }, 88.216 +{MB_TYPE_16x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 88.217 +{MB_TYPE_8x16 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 2, }, 88.218 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 88.219 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, }, 88.220 +{MB_TYPE_8x8 |MB_TYPE_P0L0 |MB_TYPE_P1L0 , 4, }, 88.221 +{MB_TYPE_8x8 |MB_TYPE_P0L1 |MB_TYPE_P1L1, 4, }, 88.222 +{MB_TYPE_8x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, 88.223 +}; 88.224 + 88.225 +static const uint8_t dequant4_coeff_init[6][3]={ 88.226 + {10,13,16}, 88.227 + {11,14,18}, 88.228 + {13,16,20}, 88.229 + {14,18,23}, 88.230 + {16,20,25}, 88.231 + {18,23,29}, 88.232 +}; 88.233 + 88.234 +static const uint8_t dequant8_coeff_init_scan[16] = { 88.235 + 0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1 88.236 +}; 88.237 +static const uint8_t dequant8_coeff_init[6][6]={ 88.238 + {20,18,32,19,25,24}, 88.239 + {22,19,35,21,28,26}, 88.240 + {26,23,42,24,33,31}, 88.241 + {28,25,45,26,35,33}, 88.242 + {32,28,51,30,40,38}, 88.243 + {36,32,58,34,46,43}, 88.244 +}; 88.245 + 88.246 +#endif /* AVCODEC_H264DATA_H */
89.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 89.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_deblock.c Mon Aug 27 12:09:56 2012 +0200 89.3 @@ -0,0 +1,507 @@ 89.4 +/* 89.5 + * H.26L/H.264/AVC/JVT/14496-10/... loop filter 89.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 89.7 + * 89.8 + * This file is part of FFmpeg. 89.9 + * 89.10 + * FFmpeg is free software; you can redistribute it and/or 89.11 + * modify it under the terms of the GNU Lesser General Public 89.12 + * License as published by the Free Software Foundation; either 89.13 + * version 2.1 of the License, or (at your option) any later version. 89.14 + * 89.15 + * FFmpeg is distributed in the hope that it will be useful, 89.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 89.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 89.18 + * Lesser General Public License for more details. 89.19 + * 89.20 + * You should have received a copy of the GNU Lesser General Public 89.21 + * License along with FFmpeg; if not, write to the Free Software 89.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 89.23 + */ 89.24 + 89.25 +/** 89.26 + * @file 89.27 + * H.264 / AVC / MPEG4 part10 loop filter. 
89.28 + * @author Michael Niedermayer <michaelni@gmx.at> 89.29 + */ 89.30 + 89.31 +#include "dsputil.h" 89.32 +#include "mathops.h" 89.33 +#include "rectangle.h" 89.34 +#include "h264_types.h" 89.35 +#include "h264_misc.h" 89.36 +#include "h264_data.h" 89.37 +//#undef NDEBUG 89.38 +#include <assert.h> 89.39 + 89.40 +/* Deblocking filter (p153) */ 89.41 +static const uint8_t alpha_table[52*3] = { 89.42 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.43 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.44 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.45 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.46 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.47 + 0, 0, 0, 0, 0, 0, 4, 4, 5, 6, 89.48 + 7, 8, 9, 10, 12, 13, 15, 17, 20, 22, 89.49 + 25, 28, 32, 36, 40, 45, 50, 56, 63, 71, 89.50 + 80, 90,101,113,127,144,162,182,203,226, 89.51 + 255,255, 89.52 + 255,255,255,255,255,255,255,255,255,255,255,255,255, 89.53 + 255,255,255,255,255,255,255,255,255,255,255,255,255, 89.54 + 255,255,255,255,255,255,255,255,255,255,255,255,255, 89.55 + 255,255,255,255,255,255,255,255,255,255,255,255,255, 89.56 +}; 89.57 +static const uint8_t beta_table[52*3] = { 89.58 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.59 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.60 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.61 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.62 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89.63 + 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 89.64 + 3, 3, 3, 4, 4, 4, 6, 6, 7, 7, 89.65 + 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 89.66 + 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 89.67 + 18, 18, 89.68 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 89.69 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 89.70 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 89.71 + 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 89.72 +}; 89.73 +static const uint8_t tc0_table[52*3][4] = { 89.74 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.75 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 
0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.76 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.77 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.78 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.79 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.80 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.81 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.82 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.83 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.84 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, 89.85 + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, 89.86 + {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 }, 89.87 + {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, 89.88 + {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 }, 89.89 + {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 }, 89.90 + {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 }, 89.91 + {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 }, 89.92 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 89.93 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 89.94 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 89.95 + {-1,13,17,25 }, {-1,13,17,25 }, 
{-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 89.96 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 89.97 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 89.98 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 89.99 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 89.100 + {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, 89.101 +}; 89.102 + 89.103 +av_always_inline static void filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s) { 89.104 + const unsigned int index_a = qp + s->slice_alpha_c0_offset; 89.105 + const int alpha = alpha_table[index_a]; 89.106 + const int beta = beta_table[qp + s->slice_beta_offset]; 89.107 + if (alpha ==0 || beta == 0) return; 89.108 + 89.109 + if( bS[0] < 4 ) { 89.110 + int8_t tc[4]; 89.111 + tc[0] = tc0_table[index_a][bS[0]]; 89.112 + tc[1] = tc0_table[index_a][bS[1]]; 89.113 + tc[2] = tc0_table[index_a][bS[2]]; 89.114 + tc[3] = tc0_table[index_a][bS[3]]; 89.115 + mrc->hdsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc); 89.116 + } else { 89.117 + mrc->hdsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta); 89.118 + } 89.119 +} 89.120 + 89.121 +av_always_inline static void filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) { 89.122 + const unsigned int index_a = qp + s->slice_alpha_c0_offset; 89.123 + const int alpha = alpha_table[index_a]; 89.124 + const int beta = beta_table[qp + s->slice_beta_offset]; 89.125 + if (alpha ==0 || beta == 0) return; 89.126 + 89.127 + if( bS[0] < 4 ) { 89.128 + int8_t tc[4]; 89.129 + tc[0] = tc0_table[index_a][bS[0]]+1; 89.130 + tc[1] = tc0_table[index_a][bS[1]]+1; 89.131 + tc[2] = tc0_table[index_a][bS[2]]+1; 89.132 + tc[3] = 
tc0_table[index_a][bS[3]]+1; 89.133 + mrc->hdsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc); 89.134 + } else { 89.135 + mrc->hdsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta); 89.136 + } 89.137 +} 89.138 + 89.139 + 89.140 +av_always_inline static void filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) { 89.141 + const unsigned int index_a = qp + s->slice_alpha_c0_offset; 89.142 + const int alpha = alpha_table[index_a]; 89.143 + const int beta = beta_table[qp + s->slice_beta_offset]; 89.144 + if (alpha ==0 || beta == 0) return; 89.145 + 89.146 + if( bS[0] < 4 ) { 89.147 + int8_t tc[4]; 89.148 + tc[0] = tc0_table[index_a][bS[0]]; 89.149 + tc[1] = tc0_table[index_a][bS[1]]; 89.150 + tc[2] = tc0_table[index_a][bS[2]]; 89.151 + tc[3] = tc0_table[index_a][bS[3]]; 89.152 + mrc->hdsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc); 89.153 + } else { 89.154 + mrc->hdsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta); 89.155 + } 89.156 +} 89.157 + 89.158 +av_always_inline static void filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) { 89.159 + const unsigned int index_a = qp + s->slice_alpha_c0_offset; 89.160 + const int alpha = alpha_table[index_a]; 89.161 + const int beta = beta_table[qp + s->slice_beta_offset]; 89.162 + if (alpha ==0 || beta == 0) return; 89.163 + 89.164 + if( bS[0] < 4 ) { 89.165 + int8_t tc[4]; 89.166 + tc[0] = tc0_table[index_a][bS[0]]+1; 89.167 + tc[1] = tc0_table[index_a][bS[1]]+1; 89.168 + tc[2] = tc0_table[index_a][bS[2]]+1; 89.169 + tc[3] = tc0_table[index_a][bS[3]]+1; 89.170 + mrc->hdsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc); 89.171 + } else { 89.172 + mrc->hdsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta); 89.173 + } 89.174 +} 89.175 + 89.176 +static av_always_inline void filter_mb_dir(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, 
uint8_t *img_cb, uint8_t *img_cr, int dir) { 89.177 + const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type; 89.178 + const int qp_xy= m->qscale_mb_xy; 89.179 + const int qp_dir = dir == 0 ? m->qscale_left_mb_xy : m->qscale_top_mb_xy; 89.180 + const int linesize = mrc->linesize; 89.181 + const int uvlinesize = mrc->uvlinesize; 89.182 + const int mb_type = m->mb_type; 89.183 + int edge; 89.184 + const int edges = mrs->edges[dir]; 89.185 + 89.186 + if(mbm_type){ 89.187 + int16_t* bS=mrs->bS[dir][0]; 89.188 + /* Filter edge */ 89.189 + // Do not use s->qscale as luma quantizer because it has not the same 89.190 + // value in IPCM macroblocks. 89.191 + if(bS[0]+bS[1]+bS[2]+bS[3]){ 89.192 + int qp = ( qp_xy + qp_dir + 1 ) >> 1; 89.193 + if( dir == 0 ) { 89.194 + filter_mb_edgev( &img_y[0], linesize, bS, qp, mrc, s ); 89.195 + { 89.196 + int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; 89.197 + filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, mrc, s); 89.198 + filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, mrc, s); 89.199 + } 89.200 + } else { 89.201 + filter_mb_edgeh( &img_y[0], linesize, bS, qp, mrc, s ); 89.202 + { 89.203 + int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1; 89.204 + filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, mrc, s); 89.205 + filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, mrc, s); 89.206 + } 89.207 + } 89.208 + } 89.209 + } 89.210 + 89.211 + for( edge = 1; edge < edges; edge++ ) { 89.212 + int16_t* bS=mrs->bS[dir][edge]; 89.213 + int qp = qp_xy; 89.214 + 89.215 + if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) 89.216 + continue; 89.217 + 89.218 + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) 89.219 + continue; 89.220 + 89.221 + /* Filter edge */ 89.222 + // Do not use s->qscale as luma quantizer because it has not the same 89.223 + // value in IPCM macroblocks. 
89.224 + 89.225 + if( dir == 0 ) { 89.226 + filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, mrc, s); 89.227 + if( (edge&1) == 0 ) { 89.228 + filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s); 89.229 + filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s); 89.230 + } 89.231 + } else { 89.232 + filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, mrc, s ); 89.233 + if( (edge&1) == 0 ) { 89.234 + filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s); 89.235 + filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s); 89.236 + } 89.237 + } 89.238 + } 89.239 +} 89.240 + 89.241 +static int check_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, long b_idx, long bn_idx, int mvy_limit){ 89.242 + int v; 89.243 + v= mrs->ref_cache[0][b_idx] != mrs->ref_cache[0][bn_idx]; 89.244 + if(!v && mrs->ref_cache[0][b_idx]!=-1) 89.245 + // absolute value >= 7 | ... 
89.246 + v= ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) | 89.247 + ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit); 89.248 + 89.249 + if(s->list_count==2){ 89.250 + if(!v) 89.251 + v = (mrs->ref_cache[1][b_idx] != mrs->ref_cache[1][bn_idx]) | 89.252 + ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) | 89.253 + ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit); 89.254 + 89.255 + if(v){ 89.256 + if((mrs->ref_cache[0][b_idx] != mrs->ref_cache[1][bn_idx]) | 89.257 + (mrs->ref_cache[1][b_idx] != mrs->ref_cache[0][bn_idx])) 89.258 + return 1; 89.259 + return 89.260 + ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) | 89.261 + ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit) | 89.262 + ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) | 89.263 + ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit); 89.264 + } 89.265 + } 89.266 + 89.267 + return v; 89.268 +} 89.269 + 89.270 +static void calc_bS_values(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mvy_limit, int dir) { 89.271 + int mb_type = m->mb_type; 89.272 + int edge; 89.273 + const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type; 89.274 + 89.275 + // how often to recheck mv-based bS when iterating between edges 89.276 + static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1}, 89.277 + {0,3,1,1,3,3,3,3}}; 89.278 + const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7]; 89.279 + const int edges = mask_edge== 3 && !(m->cbp&15) ? 
1 : 4; 89.280 + // how often to recheck mv-based bS when iterating along each edge 89.281 + const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)); 89.282 + 89.283 + mrs->edges[dir]= edges; 89.284 + 89.285 + if(mbm_type){ 89.286 + int16_t* bS=mrs->bS[dir][0]; 89.287 + if( IS_INTRA(mb_type|mbm_type)) { 89.288 + AV_WN64A(bS, 0x0004000400040004ULL); 89.289 + } else { 89.290 + int i; 89.291 + int mv_done; 89.292 + if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { 89.293 + int b_idx= 8 + 4; 89.294 + int bn_idx= b_idx - (dir ? 8:1); 89.295 + 89.296 + bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, 8 + 4, bn_idx, mvy_limit); 89.297 + mv_done = 1; 89.298 + } 89.299 + else 89.300 + mv_done = 0; 89.301 + 89.302 + for( i = 0; i < 4; i++ ) { 89.303 + int x = dir == 0 ? 0 : i; 89.304 + int y = dir == 0 ? i : 0; 89.305 + int b_idx= 8 + 4 + x + 8*y; 89.306 + int bn_idx= b_idx - (dir ? 8:1); 89.307 + 89.308 + if( mrs->non_zero_count_cache[b_idx] | 89.309 + mrs->non_zero_count_cache[bn_idx] ) { 89.310 + bS[i] = 2; 89.311 + } 89.312 + else if(!mv_done) 89.313 + { 89.314 + bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); 89.315 + } 89.316 + } 89.317 + } 89.318 + } 89.319 + 89.320 + /* Calculate bS */ 89.321 + for( edge = 1; edge < edges; edge++ ) { 89.322 + int16_t* bS=mrs->bS[dir][edge]; 89.323 + 89.324 + if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type) 89.325 + continue; 89.326 + 89.327 + if( IS_INTRA(mb_type)) { 89.328 + AV_WN64A(bS, 0x0003000300030003ULL); 89.329 + } else { 89.330 + int i; 89.331 + int mv_done; 89.332 + 89.333 + if( edge & mask_edge ) { 89.334 + AV_ZERO64(bS); 89.335 + mv_done = 1; 89.336 + } 89.337 + else if( mask_par0 ) { 89.338 + int b_idx= 8 + 4 + edge * (dir ? 8:1); 89.339 + int bn_idx= b_idx - (dir ? 
8:1); 89.340 + 89.341 + bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); 89.342 + mv_done = 1; 89.343 + } 89.344 + else 89.345 + mv_done = 0; 89.346 + 89.347 + for( i = 0; i < 4; i++ ) { 89.348 + int x = dir == 0 ? edge : i; 89.349 + int y = dir == 0 ? i : edge; 89.350 + int b_idx= 8 + 4 + x + 8*y; 89.351 + int bn_idx= b_idx - (dir ? 8:1); 89.352 + 89.353 + if( mrs->non_zero_count_cache[b_idx] | 89.354 + mrs->non_zero_count_cache[bn_idx] ) { 89.355 + bS[i] = 2; 89.356 + } 89.357 + else if(!mv_done) 89.358 + { 89.359 + bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); 89.360 + } 89.361 + } 89.362 + 89.363 + if(bS[0]+bS[1]+bS[2]+bS[3] == 0) 89.364 + continue; 89.365 + } 89.366 + 89.367 + } 89.368 +} 89.369 + 89.370 + 89.371 +/** 89.372 +* 89.373 +* @return zero if the loop filter can be skiped 89.374 +*/ 89.375 +static int fill_filter_caches(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){ 89.376 + H264Mb *m_top = m - mrc->mb_width; 89.377 + H264Mb *m_left = m - 1; 89.378 + const int mb_x = m->mb_x; 89.379 + const int mb_y = m->mb_y; 89.380 + int top_type, left_type; 89.381 + int qp, top_qp, left_qp; 89.382 + int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice 89.383 + 89.384 + qp = m->qscale_mb_xy ; 89.385 + left_qp = m->qscale_left_mb_xy ; 89.386 + top_qp = m->qscale_top_mb_xy ; 89.387 + 89.388 + //for sufficiently low qp, filtering wouldn't do anything 89.389 + //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp 89.390 + if(qp <= qp_thresh 89.391 + && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh) 89.392 + && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){ 89.393 + return 0; 89.394 + } 89.395 + 89.396 + if(IS_INTRA(mb_type)){ 89.397 + return 1; 89.398 + } 89.399 + 89.400 + { 89.401 + int list; 89.402 + for(list=0; list<s->list_count; list++){ 89.403 + int8_t *ref; 89.404 + 89.405 + if(!USES_LIST(mb_type, 
list)){ 89.406 + fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); 89.407 + fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); 89.408 + AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u); 89.409 + AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u); 89.410 + AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u); 89.411 + AV_WN32A(&mrs->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u); 89.412 + continue; 89.413 + } 89.414 + 89.415 + ref = &mrs->ref_index[list][4*mb_x]; 89.416 + { 89.417 + int (*ref2frm)[64] =(void *) (s->ref2frm[0] + 2); 89.418 + AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); 89.419 + AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); 89.420 + ref += 2; 89.421 + 89.422 + AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); 89.423 + AV_WN32A(&mrs->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101); 89.424 + } 89.425 + } 89.426 + } 89.427 + 89.428 + /* 89.429 + 0 . T T. T T T T 89.430 + 1 L . .L . . . . 89.431 + 2 L . .L . . . . 89.432 + 3 . T TL . . . . 89.433 + 4 L . .L . . . . 89.434 + 5 L . .. . . . . 
89.435 + */ 89.436 + 89.437 + if (IS_SKIP(mb_type)){ 89.438 + memset(mrs->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui 89.439 + } 89.440 + 89.441 + //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec) 89.442 + top_type = mrs->top_type; 89.443 + left_type = mrs->left_type; 89.444 + if(top_type){ 89.445 + AV_COPY32(&mrs->non_zero_count_cache[4+8*0], &m_top->non_zero_count[3*4]); 89.446 + } 89.447 + 89.448 + if(left_type){ 89.449 + mrs->non_zero_count_cache[3+8*1]= m_left->non_zero_count[3+0*4]; 89.450 + mrs->non_zero_count_cache[3+8*2]= m_left->non_zero_count[3+1*4]; 89.451 + mrs->non_zero_count_cache[3+8*3]= m_left->non_zero_count[3+2*4]; 89.452 + mrs->non_zero_count_cache[3+8*4]= m_left->non_zero_count[3+3*4]; 89.453 + } 89.454 + 89.455 + if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){ 89.456 + int list; 89.457 + for(list=0; list<s->list_count; list++){ 89.458 + if(USES_LIST(top_type, list)){ 89.459 + const int b_xy= 4*mb_x + 3*mrc->b_stride; 89.460 + const int b8_x= 4*mb_x + 2; 89.461 + int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); 89.462 + AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]); 89.463 + 89.464 + mrs->ref_cache[list][scan8[0] + 0 - 1*8]= 89.465 + mrs->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 0]]; 89.466 + mrs->ref_cache[list][scan8[0] + 2 - 1*8]= 89.467 + mrs->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 1]]; 89.468 + }else{ 89.469 + AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]); 89.470 + AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u); 89.471 + } 89.472 + 89.473 + if(USES_LIST(left_type, list)){ 89.474 + const int b_x = 4*(mb_x-1) + 3; 89.475 + const int b8_x= 4*(mb_x-1) + 1; 89.476 + int (*ref2frm)[64] = (void *) (s->ref2frm[0] + 2); 89.477 + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 0 ], mrs->motion_val[list][b_x + 
mrc->b_stride*0]); 89.478 + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 8 ], mrs->motion_val[list][b_x + mrc->b_stride*1]); 89.479 + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +16 ], mrs->motion_val[list][b_x + mrc->b_stride*2]); 89.480 + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +24 ], mrs->motion_val[list][b_x + mrc->b_stride*3]); 89.481 + 89.482 + mrs->ref_cache[list][scan8[0] - 1 + 0 ]= 89.483 + mrs->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*0]]; 89.484 + mrs->ref_cache[list][scan8[0] - 1 +16 ]= 89.485 + mrs->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*1]]; 89.486 + 89.487 + }else{ 89.488 + AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 0 ]); 89.489 + AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 8 ]); 89.490 + AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +16 ]); 89.491 + AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +24 ]); 89.492 + 89.493 + mrs->ref_cache[list][scan8[0] - 1 + 0 ]= 89.494 + mrs->ref_cache[list][scan8[0] - 1 + 8 ]= 89.495 + mrs->ref_cache[list][scan8[0] - 1 + 16 ]= 89.496 + mrs->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED; 89.497 + } 89.498 + } 89.499 + } 89.500 + return 1; 89.501 +} 89.502 + 89.503 +void ff_h264_filter_mb(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) { 89.504 + if (fill_filter_caches(mrc, mrs, s, m, m->mb_type)){ 89.505 + calc_bS_values(mrc, mrs, s, m, 4, 0); 89.506 + calc_bS_values(mrc, mrs, s, m, 4, 1); 89.507 + filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 0); 89.508 + filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 1); 89.509 + } 89.510 +}
90.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 90.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_deblock.h Mon Aug 27 12:09:56 2012 +0200 90.3 @@ -0,0 +1,8 @@ 90.4 +#ifndef H264_LOOPFILTER_H 90.5 +#define H264_LOOPFILTER_H 90.6 + 90.7 +#include "h264_types.h" 90.8 + 90.9 +void ff_h264_filter_mb(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr); 90.10 + 90.11 +#endif
91.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 91.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_dsp.c Mon Aug 27 12:09:56 2012 +0200 91.3 @@ -0,0 +1,320 @@ 91.4 +/* 91.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 91.6 + * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at> 91.7 + * 91.8 + * This file is part of FFmpeg. 91.9 + * 91.10 + * FFmpeg is free software; you can redistribute it and/or 91.11 + * modify it under the terms of the GNU Lesser General Public 91.12 + * License as published by the Free Software Foundation; either 91.13 + * version 2.1 of the License, or (at your option) any later version. 91.14 + * 91.15 + * FFmpeg is distributed in the hope that it will be useful, 91.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 91.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 91.18 + * Lesser General Public License for more details. 91.19 + * 91.20 + * You should have received a copy of the GNU Lesser General Public 91.21 + * License along with FFmpeg; if not, write to the Free Software 91.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 91.23 + */ 91.24 + 91.25 +/** 91.26 + * @file 91.27 + * H.264 / AVC / MPEG4 part10 DSP functions. 
91.28 + * @author Michael Niedermayer <michaelni@gmx.at> 91.29 + */ 91.30 + 91.31 +#include <stdint.h> 91.32 +#include "avcodec.h" 91.33 +#include "h264_dsp.h" 91.34 + 91.35 +#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom ) 91.36 +#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) 91.37 +#define H264_WEIGHT(W,H) \ 91.38 +static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ 91.39 + int y; \ 91.40 + offset <<= log2_denom; \ 91.41 + if(log2_denom) offset += 1<<(log2_denom-1); \ 91.42 + for(y=0; y<H; y++, block += stride){ \ 91.43 + op_scale1(0); \ 91.44 + op_scale1(1); \ 91.45 + if(W==2) continue; \ 91.46 + op_scale1(2); \ 91.47 + op_scale1(3); \ 91.48 + if(W==4) continue; \ 91.49 + op_scale1(4); \ 91.50 + op_scale1(5); \ 91.51 + op_scale1(6); \ 91.52 + op_scale1(7); \ 91.53 + if(W==8) continue; \ 91.54 + op_scale1(8); \ 91.55 + op_scale1(9); \ 91.56 + op_scale1(10); \ 91.57 + op_scale1(11); \ 91.58 + op_scale1(12); \ 91.59 + op_scale1(13); \ 91.60 + op_scale1(14); \ 91.61 + op_scale1(15); \ 91.62 + } \ 91.63 +} \ 91.64 +static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ 91.65 + int y; \ 91.66 + offset = ((offset + 1) | 1) << log2_denom; \ 91.67 + for(y=0; y<H; y++, dst += stride, src += stride){ \ 91.68 + op_scale2(0); \ 91.69 + op_scale2(1); \ 91.70 + if(W==2) continue; \ 91.71 + op_scale2(2); \ 91.72 + op_scale2(3); \ 91.73 + if(W==4) continue; \ 91.74 + op_scale2(4); \ 91.75 + op_scale2(5); \ 91.76 + op_scale2(6); \ 91.77 + op_scale2(7); \ 91.78 + if(W==8) continue; \ 91.79 + op_scale2(8); \ 91.80 + op_scale2(9); \ 91.81 + op_scale2(10); \ 91.82 + op_scale2(11); \ 91.83 + op_scale2(12); \ 91.84 + op_scale2(13); \ 91.85 + op_scale2(14); \ 91.86 + op_scale2(15); \ 91.87 + } \ 91.88 +} 91.89 + 91.90 
+H264_WEIGHT(16,16) 91.91 +H264_WEIGHT(16,8) 91.92 +H264_WEIGHT(8,16) 91.93 +H264_WEIGHT(8,8) 91.94 +H264_WEIGHT(8,4) 91.95 +H264_WEIGHT(4,8) 91.96 +H264_WEIGHT(4,4) 91.97 +H264_WEIGHT(4,2) 91.98 +H264_WEIGHT(2,4) 91.99 +H264_WEIGHT(2,2) 91.100 + 91.101 +#undef op_scale1 91.102 +#undef op_scale2 91.103 +#undef H264_WEIGHT 91.104 + 91.105 +static av_always_inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) 91.106 +{ 91.107 + int i, d; 91.108 + for( i = 0; i < 4; i++ ) { 91.109 + if( tc0[i] < 0 ) { 91.110 + pix += 4*ystride; 91.111 + continue; 91.112 + } 91.113 + for( d = 0; d < 4; d++ ) { 91.114 + const int p0 = pix[-1*xstride]; 91.115 + const int p1 = pix[-2*xstride]; 91.116 + const int p2 = pix[-3*xstride]; 91.117 + const int q0 = pix[0]; 91.118 + const int q1 = pix[1*xstride]; 91.119 + const int q2 = pix[2*xstride]; 91.120 + 91.121 + if( FFABS( p0 - q0 ) < alpha && 91.122 + FFABS( p1 - p0 ) < beta && 91.123 + FFABS( q1 - q0 ) < beta ) { 91.124 + 91.125 + int tc = tc0[i]; 91.126 + int i_delta; 91.127 + 91.128 + if( FFABS( p2 - p0 ) < beta ) { 91.129 + if(tc0[i]) 91.130 + pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); 91.131 + tc++; 91.132 + } 91.133 + if( FFABS( q2 - q0 ) < beta ) { 91.134 + if(tc0[i]) 91.135 + pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); 91.136 + tc++; 91.137 + } 91.138 + 91.139 + i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); 91.140 + pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ 91.141 + pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ 91.142 + } 91.143 + pix += ystride; 91.144 + } 91.145 + } 91.146 +} 91.147 +static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 91.148 +{ 91.149 + h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); 91.150 +} 91.151 +static void h264_h_loop_filter_luma_c(uint8_t 
*pix, int stride, int alpha, int beta, int8_t *tc0) 91.152 +{ 91.153 + h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); 91.154 +} 91.155 + 91.156 +static av_always_inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) 91.157 +{ 91.158 + int d; 91.159 + for( d = 0; d < 16; d++ ) { 91.160 + const int p2 = pix[-3*xstride]; 91.161 + const int p1 = pix[-2*xstride]; 91.162 + const int p0 = pix[-1*xstride]; 91.163 + 91.164 + const int q0 = pix[ 0*xstride]; 91.165 + const int q1 = pix[ 1*xstride]; 91.166 + const int q2 = pix[ 2*xstride]; 91.167 + 91.168 + if( FFABS( p0 - q0 ) < alpha && 91.169 + FFABS( p1 - p0 ) < beta && 91.170 + FFABS( q1 - q0 ) < beta ) { 91.171 + 91.172 + if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ 91.173 + if( FFABS( p2 - p0 ) < beta) 91.174 + { 91.175 + const int p3 = pix[-4*xstride]; 91.176 + /* p0', p1', p2' */ 91.177 + pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; 91.178 + pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; 91.179 + pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; 91.180 + } else { 91.181 + /* p0' */ 91.182 + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; 91.183 + } 91.184 + if( FFABS( q2 - q0 ) < beta) 91.185 + { 91.186 + const int q3 = pix[3*xstride]; 91.187 + /* q0', q1', q2' */ 91.188 + pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; 91.189 + pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; 91.190 + pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; 91.191 + } else { 91.192 + /* q0' */ 91.193 + pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; 91.194 + } 91.195 + }else{ 91.196 + /* p0', q0' */ 91.197 + pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; 91.198 + pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; 91.199 + } 91.200 + } 91.201 + pix += ystride; 91.202 + } 91.203 +} 91.204 +static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 91.205 +{ 91.206 + h264_loop_filter_luma_intra_c(pix, stride, 
1, alpha, beta); 91.207 +} 91.208 +static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 91.209 +{ 91.210 + h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta); 91.211 +} 91.212 + 91.213 +static av_always_inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) 91.214 +{ 91.215 + int i, d; 91.216 + for( i = 0; i < 4; i++ ) { 91.217 + const int tc = tc0[i]; 91.218 + if( tc <= 0 ) { 91.219 + pix += 2*ystride; 91.220 + continue; 91.221 + } 91.222 + for( d = 0; d < 2; d++ ) { 91.223 + const int p0 = pix[-1*xstride]; 91.224 + const int p1 = pix[-2*xstride]; 91.225 + const int q0 = pix[0]; 91.226 + const int q1 = pix[1*xstride]; 91.227 + 91.228 + if( FFABS( p0 - q0 ) < alpha && 91.229 + FFABS( p1 - p0 ) < beta && 91.230 + FFABS( q1 - q0 ) < beta ) { 91.231 + 91.232 + int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); 91.233 + 91.234 + pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ 91.235 + pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ 91.236 + } 91.237 + pix += ystride; 91.238 + } 91.239 + } 91.240 +} 91.241 +static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 91.242 +{ 91.243 + h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0); 91.244 +} 91.245 +static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 91.246 +{ 91.247 + h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0); 91.248 +} 91.249 + 91.250 +static av_always_inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) 91.251 +{ 91.252 + int d; 91.253 + for( d = 0; d < 8; d++ ) { 91.254 + const int p0 = pix[-1*xstride]; 91.255 + const int p1 = pix[-2*xstride]; 91.256 + const int q0 = pix[0]; 91.257 + const int q1 = pix[1*xstride]; 91.258 + 91.259 + if( FFABS( p0 - q0 ) < alpha && 91.260 + FFABS( p1 - p0 ) < beta && 91.261 + FFABS( q1 
- q0 ) < beta ) { 91.262 + 91.263 + pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ 91.264 + pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ 91.265 + } 91.266 + pix += ystride; 91.267 + } 91.268 +} 91.269 +static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 91.270 +{ 91.271 + h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); 91.272 +} 91.273 +static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) 91.274 +{ 91.275 + h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); 91.276 +} 91.277 + 91.278 +void ff_h264dsp_init(H264DSPContext *c) 91.279 +{ 91.280 + c->h264_idct_add= ff_h264_idct_add_c; 91.281 + c->h264_idct8_add= ff_h264_idct8_add_c; 91.282 + c->h264_idct_dc_add= ff_h264_idct_dc_add_c; 91.283 + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; 91.284 + c->h264_idct_add16 = ff_h264_idct_add16_c; 91.285 + c->h264_idct8_add4 = ff_h264_idct8_add4_c; 91.286 + c->h264_idct_add8 = ff_h264_idct_add8_c; 91.287 + c->h264_idct_add16intra= ff_h264_idct_add16intra_c; 91.288 + 91.289 + c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; 91.290 + c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; 91.291 + c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; 91.292 + c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; 91.293 + c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; 91.294 + c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; 91.295 + c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; 91.296 + c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; 91.297 + c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; 91.298 + c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; 91.299 + c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; 91.300 + c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c; 91.301 + c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; 91.302 + c->biweight_h264_pixels_tab[3]= 
biweight_h264_pixels8x8_c; 91.303 + c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; 91.304 + c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; 91.305 + c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; 91.306 + c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; 91.307 + c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; 91.308 + c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; 91.309 + 91.310 + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; 91.311 + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; 91.312 + c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; 91.313 + c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; 91.314 + c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; 91.315 + c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; 91.316 + c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; 91.317 + c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; 91.318 + c->h264_loop_filter_strength= NULL; 91.319 + 91.320 + if (ARCH_ARM) ff_h264dsp_init_arm(c); 91.321 + if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c); 91.322 + if (HAVE_MMX) ff_h264dsp_init_x86(c); 91.323 +}
92.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 92.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_dsp.h Mon Aug 27 12:09:56 2012 +0200 92.3 @@ -0,0 +1,83 @@ 92.4 +/* 92.5 + * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at> 92.6 + * 92.7 + * This file is part of FFmpeg. 92.8 + * 92.9 + * FFmpeg is free software; you can redistribute it and/or 92.10 + * modify it under the terms of the GNU Lesser General Public 92.11 + * License as published by the Free Software Foundation; either 92.12 + * version 2.1 of the License, or (at your option) any later version. 92.13 + * 92.14 + * FFmpeg is distributed in the hope that it will be useful, 92.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 92.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 92.17 + * Lesser General Public License for more details. 92.18 + * 92.19 + * You should have received a copy of the GNU Lesser General Public 92.20 + * License along with FFmpeg; if not, write to the Free Software 92.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 92.22 + */ 92.23 + 92.24 +/** 92.25 + * @file 92.26 + * H.264 DSP functions. 
92.27 + * @author Michael Niedermayer <michaelni@gmx.at> 92.28 + */ 92.29 + 92.30 +#ifndef AVCODEC_H264DSP_H 92.31 +#define AVCODEC_H264DSP_H 92.32 + 92.33 +#include <stdint.h> 92.34 +#include "dsputil.h" 92.35 + 92.36 +//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); 92.37 +typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); 92.38 +typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); 92.39 + 92.40 +/** 92.41 + * Context for storing H.264 DSP functions 92.42 + */ 92.43 +typedef struct H264DSPContext{ 92.44 + /* weighted MC */ 92.45 + h264_weight_func weight_h264_pixels_tab[10]; 92.46 + h264_biweight_func biweight_h264_pixels_tab[10]; 92.47 + 92.48 + /* loop filter */ 92.49 + void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); 92.50 + void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0); 92.51 + /* v/h_loop_filter_luma_intra: align 16 */ 92.52 + void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); 92.53 + void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); 92.54 + void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0); 92.55 + void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0); 92.56 + void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); 92.57 + void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta); 92.58 + // h264_loop_filter_strength: simd only. 
the C version is inlined in h264.c 92.59 + void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], 92.60 + int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field); 92.61 + 92.62 + /* IDCT */ 92.63 + /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them 92.64 + NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them 92.65 + The reason for above, is that no 2 out of one list may use a different permutation. 92.66 + */ 92.67 + void (*h264_idct_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); 92.68 + void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); 92.69 + void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); 92.70 + void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); 92.71 + void (*h264_dct)(DCTELEM block[4][4]); 92.72 + void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); 92.73 + void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); 92.74 + void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); 92.75 + void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); 92.76 + 92.77 + qpel_mc_func (*qpel_put)[16]; 92.78 + qpel_mc_func (*qpel_avg)[16]; 92.79 +}H264DSPContext; 92.80 + 92.81 +void ff_h264dsp_init(H264DSPContext *c); 92.82 +void ff_h264dsp_init_arm(H264DSPContext *c); 92.83 +void ff_h264dsp_init_ppc(H264DSPContext *c); 92.84 +void ff_h264dsp_init_x86(H264DSPContext *c); 92.85 + 92.86 +#endif 
/* AVCODEC_H264DSP_H */
93.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 93.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_entropy.c Mon Aug 27 12:09:56 2012 +0200 93.3 @@ -0,0 +1,2065 @@ 93.4 +/* 93.5 + * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding 93.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 93.7 + * 93.8 + * This file is part of FFmpeg. 93.9 + * 93.10 + * FFmpeg is free software; you can redistribute it and/or 93.11 + * modify it under the terms of the GNU Lesser General Public 93.12 + * License as published by the Free Software Foundation; either 93.13 + * version 2.1 of the License, or (at your option) any later version. 93.14 + * 93.15 + * FFmpeg is distributed in the hope that it will be useful, 93.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 93.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 93.18 + * Lesser General Public License for more details. 93.19 + * 93.20 + * You should have received a copy of the GNU Lesser General Public 93.21 + * License along with FFmpeg; if not, write to the Free Software 93.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 93.23 + */ 93.24 + 93.25 +/** 93.26 + * @file 93.27 + * H.264 / AVC / MPEG4 part10 cabac decoding. 
93.28 + * @author Michael Niedermayer <michaelni@gmx.at> 93.29 + */ 93.30 + 93.31 +#include "avcodec.h" 93.32 +#include "h264_types.h" 93.33 +#include "h264_data.h" 93.34 +#include "cabac.h" 93.35 +#include "rectangle.h" 93.36 +#include "h264_misc.h" 93.37 + 93.38 +// #undef NDEBUG 93.39 +#include <assert.h> 93.40 + 93.41 +/* Cabac pre state table */ 93.42 + 93.43 +static const int8_t cabac_context_init_I[460][2] = 93.44 +{ 93.45 + /* 0 - 10 */ 93.46 + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, 93.47 + { 2, 54 }, { 3, 74 }, { -28,127 }, { -23, 104 }, 93.48 + { -6, 53 }, { -1, 54 }, { 7, 51 }, 93.49 + 93.50 + /* 11 - 23 unsused for I */ 93.51 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.52 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.53 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.54 + { 0, 0 }, 93.55 + 93.56 + /* 24- 39 */ 93.57 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.58 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.59 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.60 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.61 + 93.62 + /* 40 - 53 */ 93.63 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.64 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.65 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.66 + { 0, 0 }, { 0, 0 }, 93.67 + 93.68 + /* 54 - 59 */ 93.69 + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, 93.70 + { 0, 0 }, { 0, 0 }, 93.71 + 93.72 + /* 60 - 69 */ 93.73 + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, 93.74 + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, 93.75 + { 13, 41 }, { 3, 62 }, 93.76 + 93.77 + /* 70 -> 87 */ 93.78 + { 0, 11 }, { 1, 55 }, { 0, 69 }, { -17, 127 }, 93.79 + { -13, 102 },{ 0, 82 }, { -7, 74 }, { -21, 107 }, 93.80 + { -27, 127 },{ -31, 127 },{ -24, 127 }, { -18, 95 }, 93.81 + { -27, 127 },{ -21, 114 },{ -30, 127 }, { -17, 123 }, 93.82 + { -12, 115 },{ -16, 122 }, 93.83 + 93.84 + /* 88 -> 104 */ 93.85 + { -11, 115 },{ -12, 63 }, { -2, 68 }, { -15, 84 }, 93.86 + { -13, 104 },{ -3, 70 }, { -8, 93 }, { -10, 90 }, 93.87 + { -30, 127 },{ -1, 74 }, { -6, 97 
}, { -7, 91 }, 93.88 + { -20, 127 },{ -4, 56 }, { -5, 82 }, { -7, 76 }, 93.89 + { -22, 125 }, 93.90 + 93.91 + /* 105 -> 135 */ 93.92 + { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 }, 93.93 + { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, 93.94 + { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, 93.95 + { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, 93.96 + { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 }, 93.97 + { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 }, 93.98 + { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, 93.99 + { 14, 62 }, { -13, 108 },{ -15, 100 }, 93.100 + 93.101 + /* 136 -> 165 */ 93.102 + { -13, 101 },{ -13, 91 }, { -12, 94 }, { -10, 88 }, 93.103 + { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, 93.104 + { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, 93.105 + { 18, 59 }, { -8, 102 }, { -15, 100 }, { 0, 95 }, 93.106 + { -4, 75 }, { 2, 72 }, { -11, 75 }, { -3, 71 }, 93.107 + { 15, 46 }, { -13, 69 }, { 0, 62 }, { 0, 65 }, 93.108 + { 21, 37 }, { -15, 72 }, { 9, 57 }, { 16, 54 }, 93.109 + { 0, 62 }, { 12, 72 }, 93.110 + 93.111 + /* 166 -> 196 */ 93.112 + { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, 93.113 + { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, 93.114 + { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, 93.115 + { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, 93.116 + { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 }, 93.117 + { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, 93.118 + { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, 93.119 + { 0, 89 }, { 26, -19 }, { 22, -17 }, 93.120 + 93.121 + /* 197 -> 226 */ 93.122 + { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, 93.123 + { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 }, 93.124 + { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, 93.125 + { 41, 17 }, { 30, -6 }, { 27, 3 }, { 26, 22 }, 93.126 + { 37, -16 }, { 35, -4 }, { 38, -8 }, { 38, -3 }, 93.127 + { 37, 3 }, { 38, 5 }, { 42, 0 }, { 35, 16 }, 93.128 + { 39, 22 }, { 14, 48 }, { 27, 37 }, { 21, 60 }, 93.129 + { 12, 68 }, { 2, 97 }, 93.130 + 93.131 + /* 227 -> 
251 */ 93.132 + { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 }, 93.133 + { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, 93.134 + { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, 93.135 + { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, 93.136 + { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, 93.137 + { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, 93.138 + { -4, 65 }, 93.139 + 93.140 + /* 252 -> 275 */ 93.141 + { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, 93.142 + { -17, 110 },{ -11, 97 }, { -20, 84 }, { -11, 79 }, 93.143 + { -6, 73 }, { -4, 74 }, { -13, 86 }, { -13, 96 }, 93.144 + { -11, 97 }, { -19, 117 },{ -8, 78 }, { -5, 33 }, 93.145 + { -4, 48 }, { -2, 53 }, { -3, 62 }, { -13, 71 }, 93.146 + { -10, 79 }, { -12, 86 }, { -13, 90 }, { -14, 97 }, 93.147 + 93.148 + /* 276 a bit special (not used, bypass is used instead) */ 93.149 + { 0, 0 }, 93.150 + 93.151 + /* 277 -> 307 */ 93.152 + { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, 93.153 + { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, 93.154 + { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, 93.155 + { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, 93.156 + { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 }, 93.157 + { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, 93.158 + { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, 93.159 + { 9, 64 }, { -12, 104 },{ -11, 97 }, 93.160 + 93.161 + /* 308 -> 337 */ 93.162 + { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, 93.163 + { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, 93.164 + { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, 93.165 + { -1, 83 }, { -7, 99 }, { -14, 95 }, { 2, 95 }, 93.166 + { 0, 76 }, { -5, 74 }, { 0, 70 }, { -11, 75 }, 93.167 + { 1, 68 }, { 0, 65 }, { -14, 73 }, { 3, 62 }, 93.168 + { 4, 62 }, { -1, 68 }, { -13, 75 }, { 11, 55 }, 93.169 + { 5, 64 }, { 12, 70 }, 93.170 + 93.171 + /* 338 -> 368 */ 93.172 + { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, 93.173 + { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, 93.174 + { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 }, 
93.175 + { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, 93.176 + { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, 93.177 + { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, 93.178 + { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, 93.179 + { -12, 109 },{ 36, -35 }, { 36, -34 }, 93.180 + 93.181 + /* 369 -> 398 */ 93.182 + { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 }, 93.183 + { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, 93.184 + { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, 93.185 + { 13, 58 }, { 29, -3 }, { 26, 0 }, { 22, 30 }, 93.186 + { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 }, 93.187 + { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 }, 93.188 + { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 }, 93.189 + { 29, 39 }, { 19, 66 }, 93.190 + 93.191 + /* 399 -> 435 */ 93.192 + { 31, 21 }, { 31, 31 }, { 25, 50 }, 93.193 + { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11, 85 }, 93.194 + { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, 93.195 + { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, 93.196 + { -23, 68 }, { -24, 50 }, { -11, 74 }, { 23, -13 }, 93.197 + { 26, -13 }, { 40, -15 }, { 49, -14 }, { 44, 3 }, 93.198 + { 45, 6 }, { 44, 34 }, { 33, 54 }, { 19, 82 }, 93.199 + { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, 93.200 + { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, 93.201 + { 0, 68 }, { -9, 92 }, 93.202 + 93.203 + /* 436 -> 459 */ 93.204 + { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, 93.205 + { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, 93.206 + { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, 93.207 + { -9, 64 }, { -5, 58 }, { 2, 59 }, { 21, -10 }, 93.208 + { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, 93.209 + { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 } 93.210 +}; 93.211 + 93.212 +static const int8_t cabac_context_init_PB[3][460][2] = 93.213 +{ 93.214 + /* i_cabac_init_idc == 0 */ 93.215 + { 93.216 + /* 0 - 10 */ 93.217 + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, 93.218 + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, 93.219 + { 
-6, 53 }, { -1, 54 }, { 7, 51 }, 93.220 + 93.221 + /* 11 - 23 */ 93.222 + { 23, 33 }, { 23, 2 }, { 21, 0 }, { 1, 9 }, 93.223 + { 0, 49 }, { -37, 118 }, { 5, 57 }, { -13, 78 }, 93.224 + { -11, 65 }, { 1, 62 }, { 12, 49 }, { -4, 73 }, 93.225 + { 17, 50 }, 93.226 + 93.227 + /* 24 - 39 */ 93.228 + { 18, 64 }, { 9, 43 }, { 29, 0 }, { 26, 67 }, 93.229 + { 16, 90 }, { 9, 104 }, { -46, 127 }, { -20, 104 }, 93.230 + { 1, 67 }, { -13, 78 }, { -11, 65 }, { 1, 62 }, 93.231 + { -6, 86 }, { -17, 95 }, { -6, 61 }, { 9, 45 }, 93.232 + 93.233 + /* 40 - 53 */ 93.234 + { -3, 69 }, { -6, 81 }, { -11, 96 }, { 6, 55 }, 93.235 + { 7, 67 }, { -5, 86 }, { 2, 88 }, { 0, 58 }, 93.236 + { -3, 76 }, { -10, 94 }, { 5, 54 }, { 4, 69 }, 93.237 + { -3, 81 }, { 0, 88 }, 93.238 + 93.239 + /* 54 - 59 */ 93.240 + { -7, 67 }, { -5, 74 }, { -4, 74 }, { -5, 80 }, 93.241 + { -7, 72 }, { 1, 58 }, 93.242 + 93.243 + /* 60 - 69 */ 93.244 + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, 93.245 + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, 93.246 + { 13, 41 }, { 3, 62 }, 93.247 + 93.248 + /* 70 - 87 */ 93.249 + { 0, 45 }, { -4, 78 }, { -3, 96 }, { -27, 126 }, 93.250 + { -28, 98 }, { -25, 101 }, { -23, 67 }, { -28, 82 }, 93.251 + { -20, 94 }, { -16, 83 }, { -22, 110 }, { -21, 91 }, 93.252 + { -18, 102 }, { -13, 93 }, { -29, 127 }, { -7, 92 }, 93.253 + { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, 93.254 + { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, 93.255 + { -9, 92 }, { -8, 87 }, { -23, 126 }, { 5, 54 }, 93.256 + { 6, 60 }, { 6, 59 }, { 6, 69 }, { -1, 48 }, 93.257 + { 0, 68 }, { -4, 69 }, { -8, 88 }, 93.258 + 93.259 + /* 105 -> 165 */ 93.260 + { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, 93.261 + { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, 93.262 + { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, 93.263 + { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, 93.264 + { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, 93.265 + { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, 93.266 + { -7, 73 }, { -2, 73 }, { 13, 46 
}, { 9, 49 }, 93.267 + { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, 93.268 + { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, 93.269 + { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, 93.270 + { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, 93.271 + { 3, 64 }, { 1, 61 }, { 9, 63 }, { 7, 50 }, 93.272 + { 16, 39 }, { 5, 44 }, { 4, 52 }, { 11, 48 }, 93.273 + { -5, 60 }, { -1, 59 }, { 0, 59 }, { 22, 33 }, 93.274 + { 5, 44 }, { 14, 43 }, { -1, 78 }, { 0, 60 }, 93.275 + { 9, 69 }, 93.276 + 93.277 + /* 166 - 226 */ 93.278 + { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, 93.279 + { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, 93.280 + { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, 93.281 + { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 }, 93.282 + { 6, 57 }, { 7, 53 }, { 6, 52 }, { 6, 55 }, 93.283 + { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, 93.284 + { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, 93.285 + { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, 93.286 + { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, 93.287 + { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, 93.288 + { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 }, 93.289 + { 1, 67 }, { 5, 59 }, { 9, 67 }, { 16, 30 }, 93.290 + { 18, 32 }, { 18, 35 }, { 22, 29 }, { 24, 31 }, 93.291 + { 23, 38 }, { 18, 43 }, { 20, 41 }, { 11, 63 }, 93.292 + { 9, 59 }, { 9, 64 }, { -1, 94 }, { -2, 89 }, 93.293 + { -9, 108 }, 93.294 + 93.295 + /* 227 - 275 */ 93.296 + { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, 93.297 + { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, 93.298 + { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, 93.299 + { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 }, 93.300 + { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, 93.301 + { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, 93.302 + { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, 93.303 + { -3, 74 }, { -10, 90 }, { 0, 70 }, { -4, 29 }, 93.304 + { 5, 31 }, { 7, 42 }, { 1, 59 }, { -2, 58 }, 93.305 + { -3, 72 }, { -3, 81 }, { -11, 97 }, { 0, 58 }, 93.306 + { 8, 5 }, { 10, 14 }, { 14, 18 }, { 13, 
27 }, 93.307 + { 2, 40 }, { 0, 58 }, { -3, 70 }, { -6, 79 }, 93.308 + { -8, 85 }, 93.309 + 93.310 + /* 276 a bit special (not used, bypass is used instead) */ 93.311 + { 0, 0 }, 93.312 + 93.313 + /* 277 - 337 */ 93.314 + { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, 93.315 + { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, 93.316 + { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, 93.317 + { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, 93.318 + { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, 93.319 + { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, 93.320 + { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, 93.321 + { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, 93.322 + { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, 93.323 + { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, 93.324 + { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, 93.325 + { -2, 69 }, { -2, 59 }, { 6, 70 }, { 10, 44 }, 93.326 + { 9, 31 }, { 12, 43 }, { 3, 53 }, { 14, 34 }, 93.327 + { 10, 38 }, { -3, 52 }, { 13, 40 }, { 17, 32 }, 93.328 + { 7, 44 }, { 7, 38 }, { 13, 50 }, { 10, 57 }, 93.329 + { 26, 43 }, 93.330 + 93.331 + /* 338 - 398 */ 93.332 + { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, 93.333 + { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, 93.334 + { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 }, 93.335 + { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, 93.336 + { 53, -45 }, { 48, -26 }, { 65, -43 }, { 43, -19 }, 93.337 + { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, 93.338 + { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, 93.339 + { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, 93.340 + { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, 93.341 + { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, 93.342 + { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, 93.343 + { 8, 60 }, { 6, 63 }, { 17, 65 }, { 21, 24 }, 93.344 + { 23, 20 }, { 26, 23 }, { 27, 32 }, { 28, 23 }, 93.345 + { 28, 24 }, { 23, 40 }, { 24, 32 }, { 28, 29 }, 93.346 + { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 }, 93.347 
+ { 11, 86 }, 93.348 + 93.349 + /* 399 - 435 */ 93.350 + { 12, 40 }, { 11, 51 }, { 14, 59 }, 93.351 + { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, 93.352 + { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, 93.353 + { -16, 70 }, { -15, 67 }, { -20, 62 }, { -19, 70 }, 93.354 + { -16, 66 }, { -22, 65 }, { -20, 63 }, { 9, -2 }, 93.355 + { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, 93.356 + { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, 93.357 + { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, 93.358 + { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, 93.359 + { -8, 66 }, { -8, 76 }, 93.360 + 93.361 + /* 436 - 459 */ 93.362 + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, 93.363 + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, 93.364 + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, 93.365 + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 21, -13 }, 93.366 + { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, 93.367 + { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, 93.368 + }, 93.369 + 93.370 + /* i_cabac_init_idc == 1 */ 93.371 + { 93.372 + /* 0 - 10 */ 93.373 + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, 93.374 + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, 93.375 + { -6, 53 }, { -1, 54 }, { 7, 51 }, 93.376 + 93.377 + /* 11 - 23 */ 93.378 + { 22, 25 }, { 34, 0 }, { 16, 0 }, { -2, 9 }, 93.379 + { 4, 41 }, { -29, 118 }, { 2, 65 }, { -6, 71 }, 93.380 + { -13, 79 }, { 5, 52 }, { 9, 50 }, { -3, 70 }, 93.381 + { 10, 54 }, 93.382 + 93.383 + /* 24 - 39 */ 93.384 + { 26, 34 }, { 19, 22 }, { 40, 0 }, { 57, 2 }, 93.385 + { 41, 36 }, { 26, 69 }, { -45, 127 }, { -15, 101 }, 93.386 + { -4, 76 }, { -6, 71 }, { -13, 79 }, { 5, 52 }, 93.387 + { 6, 69 }, { -13, 90 }, { 0, 52 }, { 8, 43 }, 93.388 + 93.389 + /* 40 - 53 */ 93.390 + { -2, 69 },{ -5, 82 },{ -10, 96 },{ 2, 59 }, 93.391 + { 2, 75 },{ -3, 87 },{ -3, 100 },{ 1, 56 }, 93.392 + { -3, 74 },{ -6, 85 },{ 0, 59 },{ -3, 81 }, 93.393 + { -7, 86 },{ -5, 95 }, 93.394 + 93.395 + /* 54 - 59 */ 93.396 + { -1, 66 },{ -1, 77 },{ 1, 70 },{ 
-2, 86 }, 93.397 + { -5, 72 },{ 0, 61 }, 93.398 + 93.399 + /* 60 - 69 */ 93.400 + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, 93.401 + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, 93.402 + { 13, 41 }, { 3, 62 }, 93.403 + 93.404 + /* 70 - 104 */ 93.405 + { 13, 15 }, { 7, 51 }, { 2, 80 }, { -39, 127 }, 93.406 + { -18, 91 }, { -17, 96 }, { -26, 81 }, { -35, 98 }, 93.407 + { -24, 102 }, { -23, 97 }, { -27, 119 }, { -24, 99 }, 93.408 + { -21, 110 }, { -18, 102 }, { -36, 127 }, { 0, 80 }, 93.409 + { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, 93.410 + { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, 93.411 + { -12, 104 }, { -9, 91 }, { -31, 127 }, { 3, 55 }, 93.412 + { 7, 56 }, { 7, 55 }, { 8, 61 }, { -3, 53 }, 93.413 + { 0, 68 }, { -7, 74 }, { -9, 88 }, 93.414 + 93.415 + /* 105 -> 165 */ 93.416 + { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, 93.417 + { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, 93.418 + { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, 93.419 + { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, 93.420 + { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, 93.421 + { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, 93.422 + { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, 93.423 + { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, 93.424 + { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 }, 93.425 + { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, 93.426 + { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, 93.427 + { -4, 71 }, { 0, 58 }, { 7, 61 }, { 9, 41 }, 93.428 + { 18, 25 }, { 9, 32 }, { 5, 43 }, { 9, 47 }, 93.429 + { 0, 44 }, { 0, 51 }, { 2, 46 }, { 19, 38 }, 93.430 + { -4, 66 }, { 15, 38 }, { 12, 42 }, { 9, 34 }, 93.431 + { 0, 89 }, 93.432 + 93.433 + /* 166 - 226 */ 93.434 + { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, 93.435 + { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 }, 93.436 + { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, 93.437 + { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, 93.438 + { 29, 10 }, { 37, -5 }, { 51, -29 }, { 
39, -9 }, 93.439 + { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, 93.440 + { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, 93.441 + { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 }, 93.442 + { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, 93.443 + { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, 93.444 + { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, 93.445 + { 0, 75 }, { 2, 72 }, { 8, 77 }, { 14, 35 }, 93.446 + { 18, 31 }, { 17, 35 }, { 21, 30 }, { 17, 45 }, 93.447 + { 20, 42 }, { 18, 45 }, { 27, 26 }, { 16, 54 }, 93.448 + { 7, 66 }, { 16, 56 }, { 11, 73 }, { 10, 67 }, 93.449 + { -10, 116 }, 93.450 + 93.451 + /* 227 - 275 */ 93.452 + { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 }, 93.453 + { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, 93.454 + { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, 93.455 + { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, 93.456 + { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, 93.457 + { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 }, 93.458 + { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, 93.459 + { -5, 74 }, { -9, 86 }, { 2, 66 }, { -9, 34 }, 93.460 + { 1, 32 }, { 11, 31 }, { 5, 52 }, { -2, 55 }, 93.461 + { -2, 67 }, { 0, 73 }, { -8, 89 }, { 3, 52 }, 93.462 + { 7, 4 }, { 10, 8 }, { 17, 8 }, { 16, 19 }, 93.463 + { 3, 37 }, { -1, 61 }, { -5, 73 }, { -1, 70 }, 93.464 + { -4, 78 }, 93.465 + 93.466 + /* 276 a bit special (not used, bypass is used instead) */ 93.467 + { 0, 0 }, 93.468 + 93.469 + /* 277 - 337 */ 93.470 + { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, 93.471 + { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, 93.472 + { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, 93.473 + { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, 93.474 + { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, 93.475 + { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, 93.476 + { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, 93.477 + { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, 93.478 + { -2, 64 }, { 2, 61 
}, { -6, 67 }, { -3, 64 }, 93.479 + { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, 93.480 + { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, 93.481 + { -1, 70 }, { -9, 72 }, { 14, 60 }, { 16, 37 }, 93.482 + { 0, 47 }, { 18, 35 }, { 11, 37 }, { 12, 41 }, 93.483 + { 10, 41 }, { 2, 48 }, { 12, 41 }, { 13, 41 }, 93.484 + { 0, 59 }, { 3, 50 }, { 19, 40 }, { 3, 66 }, 93.485 + { 18, 50 }, 93.486 + 93.487 + /* 338 - 398 */ 93.488 + { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, 93.489 + { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, 93.490 + { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, 93.491 + { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, 93.492 + { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, 93.493 + { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, 93.494 + { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, 93.495 + { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, 93.496 + { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, 93.497 + { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 }, 93.498 + { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, 93.499 + { 12, 48 }, { 11, 49 }, { 26, 45 }, { 22, 22 }, 93.500 + { 23, 22 }, { 27, 21 }, { 33, 20 }, { 26, 28 }, 93.501 + { 30, 24 }, { 27, 34 }, { 18, 42 }, { 25, 39 }, 93.502 + { 18, 50 }, { 12, 70 }, { 21, 54 }, { 14, 71 }, 93.503 + { 11, 83 }, 93.504 + 93.505 + /* 399 - 435 */ 93.506 + { 25, 32 }, { 21, 49 }, { 21, 54 }, 93.507 + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, 93.508 + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, 93.509 + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, 93.510 + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 17, -10 }, 93.511 + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, 93.512 + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, 93.513 + { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, 93.514 + { -2, 52 }, { -9, 57 }, { -6, 63 }, { -4, 65 }, 93.515 + { -4, 67 }, { -7, 82 }, 93.516 + 93.517 + /* 436 - 459 */ 93.518 + { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, 93.519 + { -12, 72 }, 
{ -14, 68 }, { -3, 70 }, { -6, 76 }, 93.520 + { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, 93.521 + { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, 93.522 + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, 93.523 + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, 93.524 + }, 93.525 + 93.526 + /* i_cabac_init_idc == 2 */ 93.527 + { 93.528 + /* 0 - 10 */ 93.529 + { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, 93.530 + { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, 93.531 + { -6, 53 }, { -1, 54 }, { 7, 51 }, 93.532 + 93.533 + /* 11 - 23 */ 93.534 + { 29, 16 }, { 25, 0 }, { 14, 0 }, { -10, 51 }, 93.535 + { -3, 62 }, { -27, 99 }, { 26, 16 }, { -4, 85 }, 93.536 + { -24, 102 }, { 5, 57 }, { 6, 57 }, { -17, 73 }, 93.537 + { 14, 57 }, 93.538 + 93.539 + /* 24 - 39 */ 93.540 + { 20, 40 }, { 20, 10 }, { 29, 0 }, { 54, 0 }, 93.541 + { 37, 42 }, { 12, 97 }, { -32, 127 }, { -22, 117 }, 93.542 + { -2, 74 }, { -4, 85 }, { -24, 102 }, { 5, 57 }, 93.543 + { -6, 93 }, { -14, 88 }, { -6, 44 }, { 4, 55 }, 93.544 + 93.545 + /* 40 - 53 */ 93.546 + { -11, 89 },{ -15, 103 },{ -21, 116 },{ 19, 57 }, 93.547 + { 20, 58 },{ 4, 84 },{ 6, 96 },{ 1, 63 }, 93.548 + { -5, 85 },{ -13, 106 },{ 5, 63 },{ 6, 75 }, 93.549 + { -3, 90 },{ -1, 101 }, 93.550 + 93.551 + /* 54 - 59 */ 93.552 + { 3, 55 },{ -4, 79 },{ -2, 75 },{ -12, 97 }, 93.553 + { -7, 50 },{ 1, 60 }, 93.554 + 93.555 + /* 60 - 69 */ 93.556 + { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, 93.557 + { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, 93.558 + { 13, 41 }, { 3, 62 }, 93.559 + 93.560 + /* 70 - 104 */ 93.561 + { 7, 34 }, { -9, 88 }, { -20, 127 }, { -36, 127 }, 93.562 + { -17, 91 }, { -14, 95 }, { -25, 84 }, { -25, 86 }, 93.563 + { -12, 89 }, { -17, 91 }, { -31, 127 }, { -14, 76 }, 93.564 + { -18, 103 }, { -13, 90 }, { -37, 127 }, { 11, 80 }, 93.565 + { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, 93.566 + { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, 93.567 + { -11, 104 }, { -11, 91 }, { -30, 127 }, { 0, 65 }, 93.568 + { -2, 79 }, { 0, 72 
}, { -4, 92 }, { -6, 56 }, 93.569 + { 3, 68 }, { -8, 71 }, { -13, 98 }, 93.570 + 93.571 + /* 105 -> 165 */ 93.572 + { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, 93.573 + { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, 93.574 + { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, 93.575 + { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, 93.576 + { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 }, 93.577 + { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, 93.578 + { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, 93.579 + { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, 93.580 + { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, 93.581 + { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 }, 93.582 + { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, 93.583 + { 3, 65 }, { -7, 69 }, { 8, 77 }, { -10, 66 }, 93.584 + { 3, 62 }, { -3, 68 }, { -20, 81 }, { 0, 30 }, 93.585 + { 1, 7 }, { -3, 23 }, { -21, 74 }, { 16, 66 }, 93.586 + { -23, 124 }, { 17, 37 }, { 44, -18 }, { 50, -34 }, 93.587 + { -22, 127 }, 93.588 + 93.589 + /* 166 - 226 */ 93.590 + { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, 93.591 + { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, 93.592 + { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, 93.593 + { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 }, 93.594 + { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, 93.595 + { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, 93.596 + { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, 93.597 + { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, 93.598 + { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, 93.599 + { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, 93.600 + { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, 93.601 + { 20, 34 }, { 19, 31 }, { 27, 44 }, { 19, 16 }, 93.602 + { 15, 36 }, { 15, 36 }, { 21, 28 }, { 25, 21 }, 93.603 + { 30, 20 }, { 31, 12 }, { 27, 16 }, { 24, 42 }, 93.604 + { 0, 93 }, { 14, 56 }, { 15, 57 }, { 26, 38 }, 93.605 + { -24, 127 }, 93.606 + 93.607 + /* 227 - 275 */ 93.608 + { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, 93.609 + { 0, 59 }, { 
-14, 85 }, { -13, 89 }, { -13, 94 }, 93.610 + { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, 93.611 + { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, 93.612 + { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, 93.613 + { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, 93.614 + { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, 93.615 + { -12, 92 }, { -18, 108 }, { -4, 79 }, { -22, 69 }, 93.616 + { -16, 75 }, { -2, 58 }, { 1, 58 }, { -13, 78 }, 93.617 + { -9, 83 }, { -4, 81 }, { -13, 99 }, { -13, 81 }, 93.618 + { -6, 38 }, { -13, 62 }, { -6, 58 }, { -2, 59 }, 93.619 + { -16, 73 }, { -10, 76 }, { -13, 86 }, { -9, 83 }, 93.620 + { -10, 87 }, 93.621 + 93.622 + /* 276 a bit special (not used, bypass is used instead) */ 93.623 + { 0, 0 }, 93.624 + 93.625 + /* 277 - 337 */ 93.626 + { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 }, 93.627 + { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, 93.628 + { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, 93.629 + { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, 93.630 + { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, 93.631 + { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, 93.632 + { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 }, 93.633 + { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, 93.634 + { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 }, 93.635 + { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 }, 93.636 + { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, 93.637 + { -2, 76 }, { -18, 86 }, { 12, 70 }, { 5, 64 }, 93.638 + { -12, 70 }, { 11, 55 }, { 5, 56 }, { 0, 69 }, 93.639 + { 2, 65 }, { -6, 74 }, { 5, 54 }, { 7, 54 }, 93.640 + { -6, 76 }, { -11, 82 }, { -2, 77 }, { -2, 77 }, 93.641 + { 25, 42 }, 93.642 + 93.643 + /* 338 - 398 */ 93.644 + { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, 93.645 + { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, 93.646 + { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, 93.647 + { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, 93.648 + { 56, -46 }, { 62, -50 }, { 81, 
-67 }, { 45, -20 }, 93.649 + { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 }, 93.650 + { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, 93.651 + { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, 93.652 + { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, 93.653 + { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, 93.654 + { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 }, 93.655 + { 18, 31 }, { 19, 26 }, { 36, 24 }, { 24, 23 }, 93.656 + { 27, 16 }, { 24, 30 }, { 31, 29 }, { 22, 41 }, 93.657 + { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 }, 93.658 + { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 }, 93.659 + { 25, 61 }, 93.660 + 93.661 + /* 399 - 435 */ 93.662 + { 21, 33 }, { 19, 50 }, { 17, 61 }, 93.663 + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, 93.664 + { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, 93.665 + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, 93.666 + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, 93.667 + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, 93.668 + { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, 93.669 + { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, 93.670 + { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, 93.671 + { -6, 68 }, { -10, 79 }, 93.672 + 93.673 + /* 436 - 459 */ 93.674 + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, 93.675 + { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, 93.676 + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, 93.677 + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, 93.678 + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, 93.679 + { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, 93.680 + } 93.681 +}; 93.682 + 93.683 +static const uint8_t left_block_options[4][16]={ 93.684 + {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, 93.685 + {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, 93.686 + {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, 93.687 + {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 
2+3*8} 93.688 +}; 93.689 + 93.690 +static const uint8_t rem6[52]={ 93.691 +0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 93.692 +}; 93.693 + 93.694 +static const uint8_t div6[52]={ 93.695 +0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 93.696 +}; 93.697 + 93.698 +static void init_dequant8_coeff_table(H264Slice *s, EntropyContext *ec){ 93.699 + int i,q,x; 93.700 + const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; 93.701 + ec->dequant8_coeff[0] = ec->dequant8_buffer[0]; 93.702 + ec->dequant8_coeff[1] = ec->dequant8_buffer[1]; 93.703 + 93.704 + for(i=0; i<2; i++){ 93.705 + if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){ 93.706 + ec->dequant8_coeff[1] = ec->dequant8_buffer[0]; 93.707 + break; 93.708 + } 93.709 + 93.710 + for(q=0; q<52; q++){ 93.711 + int shift = div6[q]; 93.712 + int idx = rem6[q]; 93.713 + for(x=0; x<64; x++) 93.714 + ec->dequant8_coeff[i][q][transpose ? 
(x>>3)|((x&7)<<3) : x] = 93.715 + ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] * 93.716 + s->pps.scaling_matrix8[i][x]) << shift; 93.717 + } 93.718 + } 93.719 +} 93.720 + 93.721 +static void init_dequant4_coeff_table(H264Slice *s, EntropyContext *ec){ 93.722 + int i,j,q,x; 93.723 + const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; 93.724 + for(i=0; i<6; i++ ){ 93.725 + ec->dequant4_coeff[i] = ec->dequant4_buffer[i]; 93.726 + for(j=0; j<i; j++){ 93.727 + if(!memcmp(s->pps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){ 93.728 + ec->dequant4_coeff[i] = ec->dequant4_buffer[j]; 93.729 + break; 93.730 + } 93.731 + } 93.732 + if(j<i) 93.733 + continue; 93.734 + 93.735 + for(q=0; q<52; q++){ 93.736 + int shift = div6[q] + 2; 93.737 + int idx = rem6[q]; 93.738 + for(x=0; x<16; x++) 93.739 + ec->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] = 93.740 + ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] * 93.741 + s->pps.scaling_matrix4[i][x]) << shift; 93.742 + } 93.743 + } 93.744 +} 93.745 + 93.746 +void init_dequant_tables(H264Slice *s, EntropyContext *ec){ 93.747 + int i,x; 93.748 + 93.749 + init_dequant4_coeff_table(s, ec); 93.750 + if(s->pps.transform_8x8_mode) 93.751 + init_dequant8_coeff_table(s, ec); 93.752 + if(s->transform_bypass){ 93.753 + for(i=0; i<6; i++) 93.754 + for(x=0; x<16; x++) 93.755 + ec->dequant4_coeff[i][0][x] = 1<<6; 93.756 + if(s->pps.transform_8x8_mode) 93.757 + for(i=0; i<2; i++) 93.758 + for(x=0; x<64; x++) 93.759 + ec->dequant8_coeff[i][0][x] = 1<<6; 93.760 + } 93.761 +} 93.762 + 93.763 +void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c) { 93.764 + int i; 93.765 + const int8_t (*tab)[2]; 93.766 + 93.767 + if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I; 93.768 + else tab = cabac_context_init_PB[s->cabac_init_idc]; 93.769 + 93.770 + /* calculate pre-state */ 93.771 + for( i= 0; i < 460; i++ ) { 93.772 + int pre = 
2*(((tab[i][0] * ec->curr_qscale) >>4 ) + tab[i][1]) - 127; 93.773 + 93.774 + pre^= pre>>31; 93.775 + if(pre > 124) 93.776 + pre= 124 + (pre&1); 93.777 + 93.778 + c->cabac_state[i] = pre; 93.779 + } 93.780 +} 93.781 + 93.782 +static void fill_decode_neighbors(EntropyContext *ec, H264Slice *s){ 93.783 + H264Mb *m = ec->m; 93.784 + const int mb_x = m->mb_x; 93.785 + 93.786 + if (m->mb_y){ 93.787 + ec->top_type = ec->mb_type_top[mb_x]; 93.788 + ec->topright_type= ec->mb_type_top[mb_x+1]; 93.789 + ec->topleft_type = ec->mb_type_top[mb_x-1]; 93.790 + m->qscale_top_mb_xy = ec->qscale_top[mb_x]; 93.791 + } else { 93.792 + ec->top_type = 0; 93.793 + ec->topright_type= 0; 93.794 + ec->topleft_type = 0; 93.795 + m->qscale_top_mb_xy = 0; 93.796 + } 93.797 + 93.798 + ec->left_type = ec->mb_type[mb_x-1] ; 93.799 + m->qscale_left_mb_xy = ec->qscale[mb_x-1]; 93.800 + 93.801 +} 93.802 + 93.803 +static void fill_decode_caches(EntropyContext *ec, H264Slice *s, int mb_type){ 93.804 + H264Mb *m = ec->m; 93.805 + int topleft_type, top_type, topright_type, left_type; 93.806 + const uint8_t * left_block= left_block_options[0]; 93.807 + const int mb_x = m->mb_x; 93.808 + int i; 93.809 + 93.810 + topleft_type = ec->topleft_type; 93.811 + top_type = ec->top_type; 93.812 + topright_type= ec->topright_type; 93.813 + left_type = ec->left_type; 93.814 + 93.815 + if(!IS_SKIP(mb_type)){ 93.816 + if(top_type){ 93.817 + AV_COPY32(&ec->non_zero_count_cache[4+8*0], &ec->non_zero_count_top[mb_x][0]); 93.818 + ec->non_zero_count_cache[1+8*0]= ec->non_zero_count_top[mb_x][4]; 93.819 + ec->non_zero_count_cache[2+8*0]= ec->non_zero_count_top[mb_x][5]; 93.820 + ec->non_zero_count_cache[1+8*3]= ec->non_zero_count_top[mb_x][6]; 93.821 + ec->non_zero_count_cache[2+8*3]= ec->non_zero_count_top[mb_x][7]; 93.822 + 93.823 + }else { 93.824 + ec->non_zero_count_cache[1+8*0]= 93.825 + ec->non_zero_count_cache[2+8*0]= 93.826 + ec->non_zero_count_cache[1+8*3]= 93.827 + ec->non_zero_count_cache[2+8*3]= 93.828 + 
AV_WN32A(&ec->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 0 : 0x40404040); 93.829 + } 93.830 + 93.831 + if(left_type){ 93.832 + for (i=0; i<2; i++) { 93.833 + ec->non_zero_count_cache[3+8*1 + 2*8*i]= ec->non_zero_count_left[i*2+0]; 93.834 + ec->non_zero_count_cache[3+8*2 + 2*8*i]= ec->non_zero_count_left[i*2+1]; 93.835 + ec->non_zero_count_cache[0+8*1 + 3*8*i]= ec->non_zero_count_left[4+i*2+0]; 93.836 + ec->non_zero_count_cache[0+8*2 + 3*8*i]= ec->non_zero_count_left[4+i*2+1]; 93.837 + } 93.838 + } 93.839 + else{ 93.840 + for (i=0; i<2; i++) { 93.841 + ec->non_zero_count_cache[3+8*1 + 2*8*i]= 93.842 + ec->non_zero_count_cache[3+8*2 + 2*8*i]= 93.843 + ec->non_zero_count_cache[0+8*1 + 3*8*i]= 93.844 + ec->non_zero_count_cache[0+8*2 + 3*8*i]= !IS_INTRA(mb_type) ? 0 : 64; 93.845 + } 93.846 + } 93.847 + 93.848 + // top_cbp 93.849 + if(top_type) { 93.850 + ec->top_cbp = ec->cbp_top[mb_x]; 93.851 + } else { 93.852 + ec->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F; 93.853 + } 93.854 + // left_cbp 93.855 + if (left_type) { 93.856 + ec->left_cbp = (ec->cbp[mb_x-1] & 0x1f0) 93.857 + | ((ec->cbp[mb_x-1]>>(left_block[0]&(~1)))&2) 93.858 + | (((ec->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2); 93.859 + } else { 93.860 + ec->left_cbp = IS_INTRA(mb_type) ? 
0x1CF : 0x00F; 93.861 + } 93.862 + } 93.863 + 93.864 + if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){ 93.865 + int list; 93.866 + 93.867 + ec->ref_cache[0][scan8[5 ]+1] = ec->ref_cache[0][scan8[7 ]+1] = ec->ref_cache[0][scan8[13]+1] = 93.868 + ec->ref_cache[1][scan8[5 ]+1] = ec->ref_cache[1][scan8[7 ]+1] = ec->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE; 93.869 + 93.870 + for(list=0; list<s->list_count; list++){ 93.871 + if(!USES_LIST(mb_type, list)){ 93.872 + continue; 93.873 + } 93.874 + assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred)); 93.875 + 93.876 + if(USES_LIST(top_type, list)){ 93.877 + ec->ref_cache[list][scan8[0] + 0 - 1*8]= 93.878 + ec->ref_cache[list][scan8[0] + 1 - 1*8]= ec->ref_index_top[list][4*mb_x + 2]; 93.879 + ec->ref_cache[list][scan8[0] + 2 - 1*8]= 93.880 + ec->ref_cache[list][scan8[0] + 3 - 1*8]= ec->ref_index_top[list][4*mb_x + 3]; 93.881 + }else{ 93.882 + AV_WN32A(&ec->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); 93.883 + } 93.884 + 93.885 + if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){ 93.886 + for(i=0; i<2; i++){ 93.887 + int cache_idx = scan8[0] - 1 + i*2*8; 93.888 + if(USES_LIST(left_type, list)){ 93.889 + const int b8_x= 4*(mb_x-1) + 1; 93.890 + ec->ref_cache[list][cache_idx ]= ec->ref_index[list][b8_x + (left_block[0+i*2]&~1)]; 93.891 + ec->ref_cache[list][cache_idx+8]= ec->ref_index[list][b8_x + (left_block[1+i*2]&~1)]; 93.892 + }else{ 93.893 + ec->ref_cache[list][cache_idx ]= 93.894 + ec->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE); 93.895 + } 93.896 + } 93.897 + }else{ 93.898 + if(USES_LIST(left_type, list)){ 93.899 + const int b8_x= 4*(mb_x-1) + 1; 93.900 + ec->ref_cache[list][scan8[0] - 1]= ec->ref_index[list][b8_x + (left_block[0]&~1)]; 93.901 + }else{ 93.902 + ec->ref_cache[list][scan8[0] - 1]= left_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE; 93.903 + } 93.904 + } 93.905 + 93.906 + if(USES_LIST(topright_type, list)){ 93.907 + ec->ref_cache[list][scan8[0] + 4 - 1*8]= ec->ref_index_top[list][4*(mb_x+1) + 2]; 93.908 + }else{ 93.909 + ec->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; 93.910 + } 93.911 + if(ec->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ 93.912 + int topleft_partition= -1; 93.913 + if(USES_LIST(topleft_type, list)){ 93.914 + const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2); 93.915 + ec->ref_cache[list][scan8[0] - 1 - 1*8]= ec->ref_index_top[list][b8_x]; 93.916 + }else{ 93.917 + ec->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; 93.918 + } 93.919 + } 93.920 + 93.921 + if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) 93.922 + continue; 93.923 + 93.924 + if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) { 93.925 + ec->ref_cache[list][scan8[4 ]] = 93.926 + ec->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE; 93.927 + 93.928 + /* XXX beurk, Load mvd */ 93.929 + if(USES_LIST(top_type, list)){ 93.930 + AV_COPY64(ec->mvd_cache[list][scan8[0] + 0 - 1*8], ec->mvd_top[list][8*mb_x + 0]); 93.931 + }else{ 93.932 + AV_ZERO64(ec->mvd_cache[list][scan8[0] + 0 - 1*8]); 93.933 + } 93.934 + if(USES_LIST(left_type, list)){ 93.935 + AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 0*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[0]]); 93.936 + AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 1*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[1]]); 93.937 + }else{ 93.938 + AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 0*8]); 93.939 + AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 1*8]); 93.940 + } 93.941 + if(USES_LIST(left_type, list)){ 93.942 + AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 2*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[2]]); 93.943 + AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 3*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[3]]); 93.944 + }else{ 93.945 + AV_ZERO16(ec->mvd_cache 
[list][scan8[0] - 1 + 2*8]); 93.946 + AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 3*8]); 93.947 + } 93.948 + AV_ZERO16(ec->mvd_cache [list][scan8[4 ]]); 93.949 + AV_ZERO16(ec->mvd_cache [list][scan8[12]]); 93.950 + if(s->slice_type_nos == FF_B_TYPE){ 93.951 + fill_rectangle(&ec->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); 93.952 + 93.953 + if(IS_DIRECT(top_type)){ 93.954 + AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1)); 93.955 + }else if(IS_8X8(top_type)){ 93.956 + int b8_x = 4*mb_x; 93.957 + ec->direct_cache[scan8[0] + 0 - 1*8]= ec->direct_top[b8_x + 2]; 93.958 + ec->direct_cache[scan8[0] + 2 - 1*8]= ec->direct_top[b8_x + 3]; 93.959 + }else{ 93.960 + AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1)); 93.961 + } 93.962 + 93.963 + if(IS_DIRECT(left_type)) 93.964 + ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1; 93.965 + else if(IS_8X8(left_type)) 93.966 + ec->direct_cache[scan8[0] - 1 + 0*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)]; 93.967 + else 93.968 + ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1; 93.969 + 93.970 + if(IS_DIRECT(left_type)) 93.971 + ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1; 93.972 + else if(IS_8X8(left_type)) 93.973 + ec->direct_cache[scan8[0] - 1 + 2*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)]; 93.974 + else 93.975 + ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1; 93.976 + } 93.977 + } 93.978 + } 93.979 + } 93.980 + ec->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type); 93.981 +} 93.982 + 93.983 +static inline void write_back_non_zero_count(EntropyContext *ec, H264Slice *s){ 93.984 + H264Mb *m = ec->m; 93.985 + const int mb_x= m->mb_x; 93.986 + 93.987 + //bottom nnz 93.988 + AV_COPY32(&ec->non_zero_count[mb_x][0], &ec->non_zero_count_cache[4+8*4] ); 93.989 + ec->non_zero_count[mb_x][4] = ec->non_zero_count_cache[1+8*2]; 93.990 + ec->non_zero_count[mb_x][5] = ec->non_zero_count_cache[2+8*2]; 93.991 
+ ec->non_zero_count[mb_x][6] = ec->non_zero_count_cache[1+8*5]; 93.992 + ec->non_zero_count[mb_x][7] = ec->non_zero_count_cache[2+8*5]; 93.993 + 93.994 + for (int i=0; i<2; i++) { 93.995 + ec->non_zero_count_left[i*2+0] = ec->non_zero_count_cache[7+8*1 + 2*8*i]; 93.996 + ec->non_zero_count_left[i*2+1] = ec->non_zero_count_cache[7+8*2 + 2*8*i]; 93.997 + ec->non_zero_count_left[4+i*2+0] = ec->non_zero_count_cache[2+8*1 + 3*8*i]; 93.998 + ec->non_zero_count_left[4+i*2+1] = ec->non_zero_count_cache[2+8*2 + 3*8*i]; 93.999 + } 93.1000 + 93.1001 + AV_COPY32(&m->non_zero_count[ 0], &ec->non_zero_count_cache[4+8*1]); 93.1002 + AV_COPY32(&m->non_zero_count[ 4], &ec->non_zero_count_cache[4+8*2]); 93.1003 + AV_COPY32(&m->non_zero_count[ 8], &ec->non_zero_count_cache[4+8*3]); 93.1004 + AV_COPY32(&m->non_zero_count[12], &ec->non_zero_count_cache[4+8*4]); 93.1005 + 93.1006 + for (int i=0; i<2; i++) { 93.1007 + m->non_zero_count[16 + i*2 ] = ec->non_zero_count_cache[8*1 + 8*i + 1]; 93.1008 + m->non_zero_count[16 + i*2 +1] = ec->non_zero_count_cache[8*1 + 8*i + 2]; 93.1009 + m->non_zero_count[20 + i*2 ] = ec->non_zero_count_cache[8*4 + 8*i + 1]; 93.1010 + m->non_zero_count[20 + i*2 +1] = ec->non_zero_count_cache[8*4 + 8*i + 2]; 93.1011 + } 93.1012 +} 93.1013 + 93.1014 +static inline void write_back_motion(EntropyContext *ec, H264Slice *s, int mb_type){ 93.1015 + H264Mb *m = ec->m; 93.1016 + const int mb_x = m->mb_x; 93.1017 + const int b_x = 4*m->mb_x; //try mb2b(8)_xy 93.1018 + int list; 93.1019 + 93.1020 + for(list=0; list<s->list_count; list++){ 93.1021 + if(!USES_LIST(mb_type, list)) 93.1022 + continue; 93.1023 + 93.1024 + { 93.1025 + uint8_t (*mvd_dst)[2] = (void *) ec->mvd[list][8*mb_x]; 93.1026 + uint8_t (*mvd_src)[2] = &ec->mvd_cache[list][scan8[0]]; 93.1027 + if(IS_SKIP(mb_type)) 93.1028 + AV_ZERO128(mvd_dst); 93.1029 + else{ 93.1030 + AV_COPY64(mvd_dst, mvd_src + 8*3); 93.1031 + AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0); 93.1032 + AV_COPY16(mvd_dst + 3 + 2, mvd_src + 
3 + 8*1); 93.1033 + AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2); 93.1034 + } 93.1035 + } 93.1036 + int8_t *ref_index = &ec->ref_index[list][b_x]; 93.1037 + { 93.1038 + ref_index[0+0*2]= ec->ref_cache[list][scan8[0]]; 93.1039 + ref_index[1+0*2]= ec->ref_cache[list][scan8[4]]; 93.1040 + ref_index[0+1*2]= ec->ref_cache[list][scan8[8]]; 93.1041 + ref_index[1+1*2]= ec->ref_cache[list][scan8[12]]; 93.1042 + } 93.1043 + } 93.1044 + 93.1045 + if(s->slice_type_nos == FF_B_TYPE){ 93.1046 + if(IS_8X8(mb_type)){ 93.1047 + uint8_t *direct = &ec->direct[4*mb_x]; 93.1048 + direct[1] = m->sub_mb_type[1]>>1; 93.1049 + direct[2] = m->sub_mb_type[2]>>1; 93.1050 + direct[3] = m->sub_mb_type[3]>>1; 93.1051 + } 93.1052 + } 93.1053 +} 93.1054 + 93.1055 +static inline int get_dct8x8_allowed(EntropyContext *ec, H264Slice *s){ 93.1056 + H264Mb *m = ec->m; 93.1057 + if(s->direct_8x8_inference_flag) 93.1058 + return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL)); 93.1059 + else 93.1060 + return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL)); 93.1061 +} 93.1062 + 93.1063 +/** 93.1064 + * decodes a P_SKIP or B_SKIP macroblock 93.1065 + */ 93.1066 +static void decode_mb_skip(EntropyContext *ec, H264Slice *s){ 93.1067 + H264Mb *m = ec->m; 93.1068 + const int mb_x = m->mb_x; 93.1069 + int mb_type; 93.1070 + 93.1071 + if( s->slice_type_nos == FF_B_TYPE ) 93.1072 + mb_type= MB_TYPE_16x16|MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; 93.1073 + else 93.1074 + mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; 93.1075 + 93.1076 + fill_rectangle(&ec->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); 93.1077 + write_back_motion(ec, s, mb_type); 93.1078 + m->mb_type = ec->mb_type[mb_x] = mb_type; 93.1079 + m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale; 93.1080 + 93.1081 + AV_ZERO64(ec->non_zero_count[mb_x]); 93.1082 + AV_ZERO64(ec->non_zero_count_left); 93.1083 + memset(m->non_zero_count, 
0, 24); 93.1084 +} 93.1085 + 93.1086 +static int decode_cabac_intra_mb_type(EntropyContext *ec, H264Slice *s, CABACContext *c, int ctx_base, int intra_slice) { 93.1087 + uint8_t *state= &c->cabac_state[ctx_base]; 93.1088 + int mb_type; 93.1089 + 93.1090 + if(intra_slice){ 93.1091 + int ctx=0; 93.1092 + if( ec->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) 93.1093 + ctx++; 93.1094 + if( ec->top_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)) 93.1095 + ctx++; 93.1096 + if( get_cabac_noinline( c, &state[ctx] ) == 0 ) 93.1097 + return 0; /* I4x4 */ 93.1098 + state += 2; 93.1099 + }else{ 93.1100 + if( get_cabac_noinline( c, state ) == 0 ) 93.1101 + return 0; /* I4x4 */ 93.1102 + } 93.1103 + 93.1104 + if( get_cabac_terminate( c ) ) 93.1105 + return 25; /* PCM */ 93.1106 + 93.1107 + mb_type = 1; /* I16x16 */ 93.1108 + mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */ 93.1109 + if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */ 93.1110 + mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] ); 93.1111 + mb_type += 2 * get_cabac_noinline(c, &state[3+intra_slice] ); 93.1112 + mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] ); 93.1113 + return mb_type; 93.1114 +} 93.1115 + 93.1116 +static int decode_cabac_mb_skip(EntropyContext *ec, H264Slice *s, H264Mb *m, CABACContext *c) { 93.1117 + int ctx = 0; 93.1118 + 93.1119 + if( m->mb_x>0 && !IS_SKIP( ec->left_type )) 93.1120 + ctx++; 93.1121 + if( m->mb_y>0 && !IS_SKIP( ec->top_type )) 93.1122 + ctx++; 93.1123 + 93.1124 + if( s->slice_type_nos == FF_B_TYPE ) 93.1125 + ctx += 13; 93.1126 + return get_cabac_noinline(c, &c->cabac_state[11+ctx] ); 93.1127 +} 93.1128 + 93.1129 +static int decode_cabac_mb_intra4x4_pred_mode_delta( CABACContext *c) { 93.1130 + int mode = 0; 93.1131 + 93.1132 + if( get_cabac(c, &c->cabac_state[68] ) ) 93.1133 + return -1; 93.1134 + 93.1135 + mode += 1 * get_cabac(c, &c->cabac_state[69] ); 93.1136 + mode += 2 * get_cabac(c, &c->cabac_state[69] ); 
93.1137 + mode += 4 * get_cabac(c, &c->cabac_state[69] ); 93.1138 + 93.1139 + return mode; 93.1140 +} 93.1141 + 93.1142 +static int decode_cabac_mb_chroma_pre_mode(EntropyContext *ec, H264Slice *s, CABACContext *c) { 93.1143 + H264Mb *m = ec->m; 93.1144 + const int mb_x = m->mb_x; 93.1145 + 93.1146 + int ctx = 0; 93.1147 + 93.1148 + /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */ 93.1149 + if( ec->left_type && ec->chroma_pred_mode[mb_x-1] != 0 ) 93.1150 + ctx++; 93.1151 + 93.1152 + if( ec->top_type && ec->chroma_pred_mode_top[mb_x] != 0 ) 93.1153 + ctx++; 93.1154 + 93.1155 + if( get_cabac_noinline(c, &c->cabac_state[64+ctx] ) == 0 ) 93.1156 + return 0; 93.1157 + 93.1158 + if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) 93.1159 + return 1; 93.1160 + if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) 93.1161 + return 2; 93.1162 + else 93.1163 + return 3; 93.1164 +} 93.1165 + 93.1166 +static int decode_cabac_mb_cbp_luma(EntropyContext *ec, CABACContext *c) { 93.1167 + int cbp_b, cbp_a, ctx, cbp = 0; 93.1168 + 93.1169 + cbp_a = ec->left_cbp; 93.1170 + cbp_b = ec->top_cbp; 93.1171 + 93.1172 + ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04); 93.1173 + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]); 93.1174 + ctx = !(cbp & 0x01) + 2 * !(cbp_b & 0x08); 93.1175 + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1; 93.1176 + ctx = !(cbp_a & 0x08) + 2 * !(cbp & 0x01); 93.1177 + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 2; 93.1178 + ctx = !(cbp & 0x04) + 2 * !(cbp & 0x02); 93.1179 + cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3; 93.1180 + return cbp; 93.1181 +} 93.1182 +static int decode_cabac_mb_cbp_chroma(EntropyContext *ec, CABACContext *c) { 93.1183 + int ctx; 93.1184 + int cbp_a, cbp_b; 93.1185 + 93.1186 + cbp_a = (ec->left_cbp>>4)&0x03; 93.1187 + cbp_b = (ec-> top_cbp>>4)&0x03; 93.1188 + 93.1189 + ctx = 0; 93.1190 + if( cbp_a > 0 ) ctx++; 93.1191 + if( cbp_b > 0 ) ctx 
+= 2; 93.1192 + if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 ) 93.1193 + return 0; 93.1194 + 93.1195 + ctx = 4; 93.1196 + if( cbp_a == 2 ) ctx++; 93.1197 + if( cbp_b == 2 ) ctx += 2; 93.1198 + return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] ); 93.1199 +} 93.1200 + 93.1201 +static int decode_cabac_p_mb_sub_type( CABACContext *c) { 93.1202 + if( get_cabac(c, &c->cabac_state[21] ) ) 93.1203 + return 0; /* 8x8 */ 93.1204 + if( !get_cabac(c, &c->cabac_state[22] ) ) 93.1205 + return 1; /* 8x4 */ 93.1206 + if( get_cabac(c, &c->cabac_state[23] ) ) 93.1207 + return 2; /* 4x8 */ 93.1208 + return 3; /* 4x4 */ 93.1209 +} 93.1210 +static int decode_cabac_b_mb_sub_type(CABACContext *c) { 93.1211 + int type; 93.1212 + if( !get_cabac(c, &c->cabac_state[36] ) ) 93.1213 + return 0; /* B_Direct_8x8 */ 93.1214 + if( !get_cabac(c, &c->cabac_state[37] ) ) 93.1215 + return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */ 93.1216 + type = 3; 93.1217 + if( get_cabac(c, &c->cabac_state[38] ) ) { 93.1218 + if( get_cabac(c, &c->cabac_state[39] ) ) 93.1219 + return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */ 93.1220 + type += 4; 93.1221 + } 93.1222 + type += 2*get_cabac(c, &c->cabac_state[39] ); 93.1223 + type += get_cabac(c, &c->cabac_state[39] ); 93.1224 + return type; 93.1225 +} 93.1226 + 93.1227 +static int decode_cabac_mb_ref(EntropyContext *ec, H264Slice *s, CABACContext *c, int list, int n ) { 93.1228 + int refa = ec->ref_cache[list][scan8[n] - 1]; 93.1229 + int refb = ec->ref_cache[list][scan8[n] - 8]; 93.1230 + int ref = 0; 93.1231 + int ctx = 0; 93.1232 + 93.1233 + if( s->slice_type_nos == FF_B_TYPE) { 93.1234 + if( refa > 0 && !(ec->direct_cache[scan8[n] - 1]&(MB_TYPE_DIRECT2>>1)) ) 93.1235 + ctx++; 93.1236 + if( refb > 0 && !(ec->direct_cache[scan8[n] - 8]&(MB_TYPE_DIRECT2>>1)) ) 93.1237 + ctx += 2; 93.1238 + } else { 93.1239 + if( refa > 0 ) 93.1240 + ctx++; 93.1241 + if( refb > 0 ) 93.1242 + ctx += 2; 93.1243 + } 
93.1244 + 93.1245 + while( get_cabac(c, &c->cabac_state[54+ctx] ) ) { 93.1246 + ref++; 93.1247 + ctx = (ctx>>2)+4; 93.1248 + if(ref >= 32 /*h->ref_list[list]*/){ 93.1249 + return -1; 93.1250 + } 93.1251 + } 93.1252 + return ref; 93.1253 +} 93.1254 + 93.1255 +static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) { 93.1256 + int mvd; 93.1257 + 93.1258 + if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){ 93.1259 + *mvda= 0; 93.1260 + return 0; 93.1261 + } 93.1262 + 93.1263 + mvd= 1; 93.1264 + ctxbase+= 3; 93.1265 + while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) { 93.1266 + if( mvd < 4 ) 93.1267 + ctxbase++; 93.1268 + mvd++; 93.1269 + } 93.1270 + 93.1271 + if( mvd >= 9 ) { 93.1272 + int k = 3; 93.1273 + while( get_cabac_bypass(c ) ) { 93.1274 + mvd += 1 << k; 93.1275 + k++; 93.1276 + if(k>24){ 93.1277 + av_log(AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n"); 93.1278 + return INT_MIN; 93.1279 + } 93.1280 + } 93.1281 + while( k-- ) { 93.1282 + mvd += get_cabac_bypass(c )<<k; 93.1283 + } 93.1284 + *mvda=mvd < 70 ? 
mvd : 70; 93.1285 + }else 93.1286 + *mvda=mvd; 93.1287 + return get_cabac_bypass_sign(c, -mvd ); 93.1288 +} 93.1289 + 93.1290 +#define DECODE_CABAC_MB_MVD( ec, c, list, n )\ 93.1291 +{\ 93.1292 + int amvd0 = ec->mvd_cache[list][scan8[n] - 1][0] +\ 93.1293 + ec->mvd_cache[list][scan8[n] - 8][0];\ 93.1294 + int amvd1 = ec->mvd_cache[list][scan8[n] - 1][1] +\ 93.1295 + ec->mvd_cache[list][scan8[n] - 8][1];\ 93.1296 +\ 93.1297 + m->mvd[list][mp][0] = decode_cabac_mb_mvd( c, 40, amvd0, &mpx ); \ 93.1298 + m->mvd[list][mp][1] = decode_cabac_mb_mvd( c, 47, amvd1, &mpy ); \ 93.1299 + mp++; \ 93.1300 +} 93.1301 + 93.1302 +static av_always_inline int get_cabac_cbf_ctx(EntropyContext *ec, H264Slice *s, int cat, int idx, int is_dc ) { 93.1303 + int nza, nzb; 93.1304 + int ctx = 0; 93.1305 + 93.1306 + if( is_dc ) { 93.1307 + if( cat == 0 ) { 93.1308 + nza = ec->left_cbp&0x100; 93.1309 + nzb = ec-> top_cbp&0x100; 93.1310 + } else { 93.1311 + nza = (ec->left_cbp>>(6+idx))&0x01; 93.1312 + nzb = (ec-> top_cbp>>(6+idx))&0x01; 93.1313 + } 93.1314 + } else { 93.1315 + assert(cat == 1 || cat == 2 || cat == 4); 93.1316 + nza = ec->non_zero_count_cache[scan8[idx] - 1]; 93.1317 + nzb = ec->non_zero_count_cache[scan8[idx] - 8]; 93.1318 + } 93.1319 + 93.1320 + if( nza > 0 ) 93.1321 + ctx++; 93.1322 + 93.1323 + if( nzb > 0 ) 93.1324 + ctx += 2; 93.1325 + 93.1326 + return ctx + 4 * cat; 93.1327 +} 93.1328 + 93.1329 +DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = { 93.1330 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 93.1331 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 93.1332 + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 93.1333 + 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 93.1334 +}; 93.1335 + 93.1336 +static const int significant_coeff_flag_offset[2][6] = { 93.1337 + { 105+0, 105+15, 105+29, 105+44, 105+47, 402 }, 93.1338 + { 277+0, 277+15, 277+29, 277+44, 277+47, 436 } 93.1339 +}; 93.1340 +static const int last_coeff_flag_offset[2][6] = { 93.1341 + { 166+0, 
166+15, 166+29, 166+44, 166+47, 417 }, 93.1342 + { 338+0, 338+15, 338+29, 338+44, 338+47, 451 } 93.1343 +}; 93.1344 +static const int coeff_abs_level_m1_offset[6] = { 93.1345 + 227+0, 227+10, 227+20, 227+30, 227+39, 426 93.1346 +}; 93.1347 +static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { 93.1348 + { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, 93.1349 + 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, 93.1350 + 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11, 93.1351 + 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 }, 93.1352 + { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5, 93.1353 + 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11, 93.1354 + 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, 93.1355 + 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 } 93.1356 +}; 93.1357 +/* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). 93.1358 +* 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). 93.1359 +* map node ctx => cabac ctx for level=1 */ 93.1360 +static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; 93.1361 +/* map node ctx => cabac ctx for level>1 */ 93.1362 +static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; 93.1363 +static const uint8_t coeff_abs_level_transition[2][8] = { 93.1364 + /* update node ctx after decoding a level=1 */ 93.1365 + { 1, 2, 3, 3, 4, 5, 6, 7 }, 93.1366 + /* update node ctx after decoding a level>1 */ 93.1367 + { 4, 4, 4, 4, 5, 6, 7, 7 } 93.1368 +}; 93.1369 + 93.1370 +static av_always_inline void decode_cabac_residual_internal(EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) { 93.1371 + H264Mb *m = ec->m; 93.1372 + const int mb_x = m->mb_x; 93.1373 + int index[64]; 93.1374 + 93.1375 + int av_unused last; 93.1376 + int coeff_count = 0; 93.1377 + int node_ctx = 0; 93.1378 + 93.1379 + uint8_t *significant_coeff_ctx_base; 93.1380 + uint8_t *last_coeff_ctx_base; 93.1381 + uint8_t 
*abs_level_m1_ctx_base; 93.1382 + 93.1383 + /* read coded block flag */ 93.1384 + if( is_dc || cat != 5 ) { 93.1385 + if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( ec, s, cat, n, is_dc ) ] ) == 0 ) { 93.1386 + if( !is_dc ) 93.1387 + ec->non_zero_count_cache[scan8[n]] = 0; 93.1388 + return; 93.1389 + } 93.1390 + } 93.1391 + 93.1392 + significant_coeff_ctx_base = c->cabac_state 93.1393 + + significant_coeff_flag_offset[0][cat]; 93.1394 + last_coeff_ctx_base = c->cabac_state 93.1395 + + last_coeff_flag_offset[0][cat]; 93.1396 + abs_level_m1_ctx_base = c->cabac_state 93.1397 + + coeff_abs_level_m1_offset[cat]; 93.1398 + 93.1399 + if( !is_dc && cat == 5 ) { 93.1400 +#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \ 93.1401 + for(last= 0; last < coefs; last++) { \ 93.1402 + uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \ 93.1403 + if( get_cabac( c, sig_ctx )) { \ 93.1404 + uint8_t *last_ctx = last_coeff_ctx_base + last_off; \ 93.1405 + index[coeff_count++] = last; \ 93.1406 + if( get_cabac( c, last_ctx ) ) { \ 93.1407 + last= max_coeff; \ 93.1408 + break; \ 93.1409 + } \ 93.1410 + } \ 93.1411 + }\ 93.1412 + if( last == max_coeff -1 ) {\ 93.1413 + index[coeff_count++] = last;\ 93.1414 + } 93.1415 + 93.1416 + const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0]; 93.1417 + DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); 93.1418 + } else { 93.1419 + DECODE_SIGNIFICANCE( max_coeff - 1, last, last ); 93.1420 + } 93.1421 + assert(coeff_count > 0); 93.1422 + 93.1423 + if( is_dc ) { 93.1424 + if( cat == 0 ) 93.1425 + ec->cbp[mb_x] |= 0x100; 93.1426 + else 93.1427 + ec->cbp[mb_x] |= 0x40 << n; 93.1428 + } else { 93.1429 + if( cat == 5 ) 93.1430 + fill_rectangle(&ec->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1); 93.1431 + else { 93.1432 + assert( cat == 1 || cat == 2 || cat == 4 ); 93.1433 + ec->non_zero_count_cache[scan8[n]] = coeff_count; 93.1434 + } 93.1435 + } 93.1436 + 93.1437 + do { 93.1438 + 
uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base; 93.1439 + 93.1440 + int j= scantable[index[--coeff_count]]; 93.1441 + 93.1442 + if( get_cabac( c, ctx ) == 0 ) { 93.1443 + node_ctx = coeff_abs_level_transition[0][node_ctx]; 93.1444 + if( is_dc ) { 93.1445 + block[j] = get_cabac_bypass_sign( c, -1); 93.1446 + }else{ 93.1447 + block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6; 93.1448 + } 93.1449 + } else { 93.1450 + int coeff_abs = 2; 93.1451 + ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base; 93.1452 + node_ctx = coeff_abs_level_transition[1][node_ctx]; 93.1453 + 93.1454 + while( coeff_abs < 15 && get_cabac( c, ctx ) ) { 93.1455 + coeff_abs++; 93.1456 + } 93.1457 + 93.1458 + if( coeff_abs >= 15 ) { 93.1459 + int j = 0; 93.1460 + while( get_cabac_bypass( c ) ) { 93.1461 + j++; 93.1462 + } 93.1463 + 93.1464 + coeff_abs=1; 93.1465 + while( j-- ) { 93.1466 + coeff_abs += coeff_abs + get_cabac_bypass( c ); 93.1467 + } 93.1468 + coeff_abs+= 14; 93.1469 + } 93.1470 + 93.1471 + if( is_dc ) { 93.1472 + block[j] = get_cabac_bypass_sign( c, -coeff_abs ); 93.1473 + }else{ 93.1474 + block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6; 93.1475 + } 93.1476 + } 93.1477 + } while( coeff_count ); 93.1478 + 93.1479 +} 93.1480 + 93.1481 +static void decode_cabac_residual_dc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) { 93.1482 + decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, NULL, max_coeff, 1); 93.1483 +} 93.1484 + 93.1485 +static void decode_cabac_residual_nondc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) { 93.1486 + decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, qmul, max_coeff, 0); 93.1487 +} 93.1488 + 93.1489 +/** 93.1490 + * decodes a macroblock 93.1491 + * @return 0 if OK, AC_ERROR / 
DC_ERROR / MV_ERROR if an error is noticed 93.1492 + */ 93.1493 +int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c) { 93.1494 + H264Mb *m = ec->m; 93.1495 + int mb_x = m->mb_x; 93.1496 + int mb_type, partition_count, cbp = 0; 93.1497 + int dct8x8_allowed= s->pps.transform_8x8_mode; 93.1498 + 93.1499 + fill_decode_neighbors(ec, s); 93.1500 + 93.1501 + if( s->slice_type_nos != FF_I_TYPE ) { 93.1502 + int skip; 93.1503 + /* a skipped mb needs the aff flag from the following mb */ 93.1504 + skip = decode_cabac_mb_skip( ec, s, m, c); 93.1505 + 93.1506 + /* read skip flags */ 93.1507 + if( skip ) { 93.1508 + decode_mb_skip(ec, s); 93.1509 + m->cbp = ec->cbp[mb_x] = 0; 93.1510 + ec->chroma_pred_mode[mb_x] = 0; 93.1511 + ec->last_qscale_diff = 0; 93.1512 + return 0; 93.1513 + } 93.1514 + } 93.1515 + 93.1516 + if( s->slice_type_nos == FF_B_TYPE ) { 93.1517 + int ctx = 0; 93.1518 + 93.1519 + if( !IS_DIRECT( ec->left_type-1 ) ) 93.1520 + ctx++; 93.1521 + if( !IS_DIRECT( ec->top_type-1 ) ) 93.1522 + ctx++; 93.1523 + 93.1524 + if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){ 93.1525 + mb_type= 0; /* B_Direct_16x16 */ 93.1526 + }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) { 93.1527 + mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */ 93.1528 + }else{ 93.1529 + int bits; 93.1530 + bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3; 93.1531 + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2; 93.1532 + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1; 93.1533 + bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ); 93.1534 + if( bits < 8 ){ 93.1535 + mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */ 93.1536 + }else if( bits == 13 ){ 93.1537 + mb_type= decode_cabac_intra_mb_type(ec, s, c, 32, 0); 93.1538 + goto decode_intra_mb; 93.1539 + }else if( bits == 14 ){ 93.1540 + mb_type= 11; /* B_L1_L0_8x16 */ 93.1541 + }else if( bits == 15 ){ 93.1542 + mb_type= 22; /* B_8x8 */ 
93.1543 + }else{ 93.1544 + bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] ); 93.1545 + mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */ 93.1546 + } 93.1547 + } 93.1548 + partition_count= b_mb_type_info[mb_type].partition_count; 93.1549 + mb_type= b_mb_type_info[mb_type].type; 93.1550 + } else if( s->slice_type_nos == FF_P_TYPE ) { 93.1551 + if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) { 93.1552 + /* P-type */ 93.1553 + if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) { 93.1554 + /* P_L0_D16x16, P_8x8 */ 93.1555 + mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] ); 93.1556 + } else { 93.1557 + /* P_L0_D8x16, P_L0_D16x8 */ 93.1558 + mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] ); 93.1559 + } 93.1560 + partition_count= p_mb_type_info[mb_type].partition_count; 93.1561 + mb_type= p_mb_type_info[mb_type].type; 93.1562 + } else { 93.1563 + mb_type= decode_cabac_intra_mb_type(ec, s, c, 17, 0); 93.1564 + goto decode_intra_mb; 93.1565 + } 93.1566 + } else { 93.1567 + mb_type= decode_cabac_intra_mb_type(ec, s ,c, 3, 1); 93.1568 + if(s->slice_type == FF_SI_TYPE && mb_type) 93.1569 + mb_type--; 93.1570 + assert(s->slice_type_nos == FF_I_TYPE); 93.1571 +decode_intra_mb: 93.1572 + partition_count = 0; 93.1573 + cbp= i_mb_type_info[mb_type].cbp; 93.1574 + m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode; 93.1575 + mb_type= i_mb_type_info[mb_type].type; 93.1576 + } 93.1577 + 93.1578 + if(IS_INTRA_PCM(mb_type)) { 93.1579 + const uint8_t *ptr; 93.1580 + // We assume these blocks are very rare so we do not optimize it. 93.1581 + // FIXME The two following lines get the bitstream position in the cabac 93.1582 + // decode, I think it should be done by a function in cabac.h (or cabac.c). 
93.1583 + ptr=c->bytestream; 93.1584 + if(c->low&0x1) ptr--; 93.1585 + if(CABAC_BITS==16){ 93.1586 + if(c->low&0x1FF) ptr--; 93.1587 + } 93.1588 + //printf("pcm\n"); 93.1589 + // The pixels are stored in the same order as levels in h->mb array. 93.1590 + memcpy(m->mb, ptr, 256); ptr+=256; 93.1591 + memcpy(m->mb+128, ptr, 128); ptr+=128; 93.1592 + 93.1593 + ff_init_cabac_decoder(c, ptr, c->bytestream_end - ptr); 93.1594 + 93.1595 + // All blocks are present 93.1596 + m->cbp= ec->cbp[mb_x] = 0x1ef; 93.1597 + ec->chroma_pred_mode[mb_x] = 0; 93.1598 + // In deblocking, the quantizer is 0 93.1599 + m->qscale_mb_xy = ec->qscale[mb_x]= 0; 93.1600 + // All coeffs are present 93.1601 + memset(ec->non_zero_count[mb_x], 16, 8); 93.1602 + m->mb_type = ec->mb_type[mb_x]= mb_type; 93.1603 + ec->last_qscale_diff = 0; 93.1604 + 93.1605 + return 0; 93.1606 + } 93.1607 + 93.1608 + fill_decode_caches(ec, s, mb_type); 93.1609 + 93.1610 + int mp = 0; 93.1611 + if( IS_INTRA( mb_type ) ) { 93.1612 + int i, pred_mode; 93.1613 + if( IS_INTRA4x4( mb_type ) ) { 93.1614 + if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] ) ) { 93.1615 + mb_type |= MB_TYPE_8x8DCT; 93.1616 + for( i = 0; i < 16; i+=4 ) { 93.1617 + m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c); 93.1618 + } 93.1619 + } else { 93.1620 + for( i = 0; i < 16; i++ ) { 93.1621 + m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c); 93.1622 + } 93.1623 + } 93.1624 + } 93.1625 + 93.1626 + m->chroma_pred_mode= ec->chroma_pred_mode[mb_x] = 93.1627 + pred_mode = decode_cabac_mb_chroma_pre_mode( ec, s, c ); 93.1628 + 93.1629 + } else if( partition_count == 4 ) { 93.1630 + int i, j, sub_partition_count[4], list; 93.1631 + 93.1632 + if( s->slice_type_nos == FF_B_TYPE ) { 93.1633 + for( i = 0; i < 4; i++ ) { 93.1634 + m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c ); 93.1635 + sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; 
93.1636 + m->sub_mb_type[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].type; 93.1637 + } 93.1638 + if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] | 93.1639 + m->sub_mb_type[2] | m->sub_mb_type[3]) ) { 93.1640 + ec->ref_cache[0][scan8[4]] = 93.1641 + ec->ref_cache[1][scan8[4]] = 93.1642 + ec->ref_cache[0][scan8[12]] = 93.1643 + ec->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; 93.1644 + 93.1645 + for( i = 0; i < 4; i++ ) 93.1646 + fill_rectangle( &ec->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 ); 93.1647 + } 93.1648 + } else { 93.1649 + for( i = 0; i < 4; i++ ) { 93.1650 + m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c ); 93.1651 + sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count; 93.1652 + m->sub_mb_type[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].type; 93.1653 + } 93.1654 + } 93.1655 + 93.1656 + for( list = 0; list < s->list_count; list++ ) { 93.1657 + for( i = 0; i < 4; i++ ) { 93.1658 + if(IS_DIRECT(m->sub_mb_type[i])) continue; 93.1659 + if(IS_DIR(m->sub_mb_type[i], 0, list)){ 93.1660 + if( s->ref_count[list] > 1 ){ 93.1661 + m->ref_index[list][i] = decode_cabac_mb_ref(ec, s, c, list, 4*i ); 93.1662 + if(m->ref_index[list][i] >= s->ref_count[list]){ 93.1663 + av_log(AV_LOG_ERROR, "Reference %d >= %d\n", m->ref_index[list][i], s->ref_count[list]); 93.1664 + return -1; 93.1665 + } 93.1666 + }else 93.1667 + m->ref_index[list][i] = 0; 93.1668 + } else { 93.1669 + m->ref_index[list][i] = -1; 93.1670 + } 93.1671 + ec->ref_cache[list][ scan8[4*i] ]=ec->ref_cache[list][ scan8[4*i]+1 ]= 93.1672 + ec->ref_cache[list][ scan8[4*i]+8 ]=ec->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i]; 93.1673 + } 93.1674 + } 93.1675 + 93.1676 + if(dct8x8_allowed){ 93.1677 +// assert(0); 93.1678 + dct8x8_allowed = get_dct8x8_allowed(ec, s); 93.1679 + } 93.1680 + 93.1681 + for(list=0; list<s->list_count; list++){ 93.1682 + for(i=0; i<4; i++){ 93.1683 +// ec->ref_cache[list][ scan8[4*i] ]=ec->ref_cache[list][ scan8[4*i]+1 ]; 
93.1684 + if(IS_DIRECT(m->sub_mb_type[i])){ 93.1685 + fill_rectangle(ec->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2); 93.1686 + continue; 93.1687 + } 93.1688 + 93.1689 + if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){ 93.1690 + const int sub_mb_type= m->sub_mb_type[i]; 93.1691 + const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1; 93.1692 + for(j=0; j<sub_partition_count[i]; j++){ 93.1693 + int mpx, mpy; 93.1694 + const int index= 4*i + block_width*j; 93.1695 + uint8_t (* mvd_cache)[2]= &ec->mvd_cache[list][ scan8[index]]; 93.1696 + 93.1697 + DECODE_CABAC_MB_MVD( ec, c, list, index) 93.1698 + 93.1699 + if(IS_SUB_8X8(sub_mb_type)){ 93.1700 + mvd_cache[ 1 ][0]= 93.1701 + mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx; 93.1702 + mvd_cache[ 1 ][1]= 93.1703 + mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy; 93.1704 + }else if(IS_SUB_8X4(sub_mb_type)){ 93.1705 + mvd_cache[ 1 ][0]= mpx; 93.1706 + mvd_cache[ 1 ][1]= mpy; 93.1707 + }else if(IS_SUB_4X8(sub_mb_type)){ 93.1708 + mvd_cache[ 8 ][0]= mpx; 93.1709 + mvd_cache[ 8 ][1]= mpy; 93.1710 + } 93.1711 + mvd_cache[ 0 ][0]= mpx; 93.1712 + mvd_cache[ 0 ][1]= mpy; 93.1713 + } 93.1714 + }else{ 93.1715 + fill_rectangle(ec->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2); 93.1716 + } 93.1717 + } 93.1718 + } 93.1719 + } else if( IS_DIRECT(mb_type) ) { 93.1720 + mb_type |= MB_TYPE_16x16; 93.1721 + fill_rectangle(ec->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2); 93.1722 + fill_rectangle(ec->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2); 93.1723 + dct8x8_allowed &= s->direct_8x8_inference_flag; 93.1724 + } else { 93.1725 + int list, i; 93.1726 + if(IS_16X16(mb_type)){ 93.1727 + for(list=0; list<s->list_count; list++){ 93.1728 + if(IS_DIR(mb_type, 0, list)){ 93.1729 + int ref; 93.1730 + if(s->ref_count[list] > 1){ 93.1731 + ref= decode_cabac_mb_ref(ec, s, c, list, 0); 93.1732 + if(ref >= s->ref_count[list]){ 93.1733 + av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]); 93.1734 + return -1; 
93.1735 + } 93.1736 + }else 93.1737 + ref=0; 93.1738 + m->ref_index[list][0]= ref; 93.1739 + fill_rectangle(&ec->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); 93.1740 + } 93.1741 + } 93.1742 + for(list=0; list<s->list_count; list++){ 93.1743 + if(IS_DIR(mb_type, 0, list)){ 93.1744 + int mpx,mpy; 93.1745 + DECODE_CABAC_MB_MVD( ec, c, list, 0) 93.1746 + 93.1747 + fill_rectangle(ec->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2); 93.1748 + } 93.1749 + 93.1750 + } 93.1751 + } 93.1752 + else if(IS_16X8(mb_type)){ 93.1753 + for(list=0; list<s->list_count; list++){ 93.1754 + for(i=0; i<2; i++){ 93.1755 + if(IS_DIR(mb_type, i, list)){ 93.1756 + int ref; 93.1757 + if(s->ref_count[list] > 1){ 93.1758 + ref= decode_cabac_mb_ref(ec, s, c, list, 8*i ); 93.1759 + if(ref >= s->ref_count[list]){ 93.1760 + av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]); 93.1761 + return -1; 93.1762 + } 93.1763 + }else 93.1764 + ref=0; 93.1765 + m->ref_index[list][i]= ref; 93.1766 + fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); 93.1767 + }else{ 93.1768 + m->ref_index[list][i]= LIST_NOT_USED; 93.1769 + fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1); 93.1770 + } 93.1771 + } 93.1772 + } 93.1773 + for(list=0; list<s->list_count; list++){ 93.1774 + for(i=0; i<2; i++){ 93.1775 + if(IS_DIR(mb_type, i, list)){ 93.1776 + int mpx,mpy; 93.1777 + DECODE_CABAC_MB_MVD( ec, c, list, 8*i) 93.1778 + 93.1779 + fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2); 93.1780 + }else{ 93.1781 + fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2); 93.1782 + } 93.1783 + } 93.1784 + } 93.1785 + }else{ 93.1786 + assert(IS_8X16(mb_type)); 93.1787 + for(list=0; list<s->list_count; list++){ 93.1788 + for(i=0; i<2; i++){ 93.1789 + if(IS_DIR(mb_type, i, list)){ //FIXME optimize 93.1790 + int ref; 93.1791 + if(s->ref_count[list] > 1){ 93.1792 + ref= decode_cabac_mb_ref(ec, 
s, c, list, 4*i ); 93.1793 + if(ref >= s->ref_count[list]){ 93.1794 + av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]); 93.1795 + return -1; 93.1796 + } 93.1797 + }else 93.1798 + ref=0; 93.1799 + m->ref_index[list][i]= ref; 93.1800 + fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); 93.1801 + }else{ 93.1802 + m->ref_index[list][i]= LIST_NOT_USED; 93.1803 + fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1); 93.1804 + } 93.1805 + } 93.1806 + } 93.1807 + for(list=0; list<s->list_count; list++){ 93.1808 + for(i=0; i<2; i++){ 93.1809 + if(IS_DIR(mb_type, i, list)){ 93.1810 + int mpx,mpy; 93.1811 + DECODE_CABAC_MB_MVD( ec, c, list, 4*i) 93.1812 + 93.1813 + fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2); 93.1814 + }else{ 93.1815 + fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2); 93.1816 + } 93.1817 + } 93.1818 + } 93.1819 + } 93.1820 + } 93.1821 + 93.1822 + if( IS_INTER( mb_type ) ||(IS_DIRECT(mb_type))) { 93.1823 + ec->chroma_pred_mode[mb_x] = 0; 93.1824 + write_back_motion( ec, s, mb_type ); 93.1825 + } 93.1826 + 93.1827 + if( !IS_INTRA16x16( mb_type ) ) { 93.1828 + cbp = decode_cabac_mb_cbp_luma( ec, c); 93.1829 + cbp |= decode_cabac_mb_cbp_chroma( ec, c ) << 4; 93.1830 + } 93.1831 + 93.1832 + ec->cbp[mb_x] = m->cbp = cbp; 93.1833 + 93.1834 + if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) { 93.1835 + int t = get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] ); 93.1836 + mb_type |= MB_TYPE_8x8DCT * t; 93.1837 + } 93.1838 + m->mb_type = ec->mb_type[mb_x] = mb_type; 93.1839 + 93.1840 + if( cbp || IS_INTRA16x16( mb_type ) ) { 93.1841 + const uint8_t *scan, *scan8x8, *dc_scan; 93.1842 + const uint32_t *qmul; 93.1843 + 93.1844 + 93.1845 + if (s->transform_bypass && ec->curr_qscale){ 93.1846 + scan8x8= ff_zigzag_direct; 93.1847 + scan= zigzag_scan; 93.1848 + }else{ 93.1849 + scan8x8= ec->zigzag_scan8x8; 
93.1850 + scan= ec->zigzag_scan; 93.1851 + } 93.1852 + dc_scan= luma_dc_zigzag_scan; 93.1853 + 93.1854 + // decode_cabac_mb_dqp 93.1855 + if(get_cabac_noinline(c, &c->cabac_state[60 + (ec->last_qscale_diff != 0)])){ 93.1856 + int val = 1; 93.1857 + int ctx= 2; 93.1858 + 93.1859 + while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) { 93.1860 + ctx= 3; 93.1861 + val++; 93.1862 + if(val > 102){ //prevent infinite loop 93.1863 + av_log(AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", m->mb_x, m->mb_y); 93.1864 + return -1; 93.1865 + } 93.1866 + } 93.1867 + 93.1868 + if( val&0x01 ) 93.1869 + val= (val + 1)>>1 ; 93.1870 + else 93.1871 + val= -((val + 1)>>1); 93.1872 + ec->last_qscale_diff = val; 93.1873 + ec->curr_qscale += val; 93.1874 + if(((unsigned)ec->curr_qscale) > 51){ 93.1875 + if(ec->curr_qscale<0) ec->curr_qscale+= 52; 93.1876 + else ec->curr_qscale-= 52; 93.1877 + } 93.1878 + ec->chroma_qp[0] = get_chroma_qp( s, 0, ec->curr_qscale); 93.1879 + ec->chroma_qp[1] = get_chroma_qp( s, 1, ec->curr_qscale); 93.1880 + }else 93.1881 + ec->last_qscale_diff=0; 93.1882 + 93.1883 + memset(m->mb, 0, 16*16 * sizeof(DCTELEM)); 93.1884 + if( IS_INTRA16x16( mb_type ) ) { 93.1885 + int i; 93.1886 + 93.1887 + //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" ); 93.1888 + decode_cabac_residual_dc( ec, s, c, m->mb, 0, 0, dc_scan, 16); 93.1889 + qmul = ec->dequant4_coeff[0][ec->curr_qscale]; 93.1890 + if( cbp&15 ) { 93.1891 + for( i = 0; i < 16; i++ ) { 93.1892 + //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i ); 93.1893 + decode_cabac_residual_nondc( ec, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15); 93.1894 + } 93.1895 + } else { 93.1896 + fill_rectangle(&ec->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1); 93.1897 + } 93.1898 + h264_luma_dc_dequant_idct_c(m->mb, qmul[0]); 93.1899 + } else { 93.1900 + 93.1901 + int i8x8, i4x4; 93.1902 + for( i8x8 = 0; i8x8 < 4; i8x8++ ) { 93.1903 + if( cbp & (1<<i8x8) ) { 93.1904 + if( IS_8x8DCT(mb_type) ) { 93.1905 
+ decode_cabac_residual_nondc(ec, s, c, m->mb + 64*i8x8, 5, 4*i8x8, 93.1906 + scan8x8, ec->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][ec->curr_qscale], 64); 93.1907 + } else { 93.1908 + qmul = ec->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][ec->curr_qscale]; 93.1909 + for( i4x4 = 0; i4x4 < 4; i4x4++ ) { 93.1910 + const int index = 4*i8x8 + i4x4; 93.1911 + //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index ); 93.1912 +//START_TIMER 93.1913 + decode_cabac_residual_nondc(ec, s, c, m->mb + 16*index, 2, index, scan, qmul, 16); 93.1914 +//STOP_TIMER("decode_residual") 93.1915 + } 93.1916 + } 93.1917 + } else { 93.1918 + uint8_t * const nnz= &ec->non_zero_count_cache[ scan8[4*i8x8] ]; 93.1919 + nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0; 93.1920 + } 93.1921 + } 93.1922 + } 93.1923 + 93.1924 + if( cbp&0x30 ){ 93.1925 + memset(m->mb + 256, 0, 2*64 * sizeof(DCTELEM)); 93.1926 + for( int i = 0; i < 2; i++ ) { 93.1927 + const uint32_t dequant4_coeff = ec->dequant4_coeff[IS_INTRA(mb_type) ? 1+i:4+i][ec->chroma_qp[i]][0]; 93.1928 + 93.1929 + //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); 93.1930 + decode_cabac_residual_dc(ec, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4); 93.1931 + chroma_dc_dequant_idct_c(m->mb + 256 + 16*4*i, dequant4_coeff); 93.1932 + } 93.1933 + } 93.1934 + 93.1935 + if( cbp&0x20 ) { 93.1936 + int i, j; 93.1937 + for( i = 0; i < 2; i++ ) { 93.1938 + qmul = ec->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 
0:3)][ec->chroma_qp[i]]; 93.1939 + for( j = 0; j < 4; j++ ) { 93.1940 + const int index = 16 + 4 * i + j; 93.1941 + //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 ); 93.1942 + decode_cabac_residual_nondc( ec, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15); 93.1943 + } 93.1944 + } 93.1945 + } else { 93.1946 + uint8_t * const nnz= &ec->non_zero_count_cache[0]; 93.1947 + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = 93.1948 + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; 93.1949 + } 93.1950 + 93.1951 + } else { 93.1952 + uint8_t * const nnz= &ec->non_zero_count_cache[0]; 93.1953 + fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1); 93.1954 + nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] = 93.1955 + nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0; 93.1956 + ec->last_qscale_diff = 0; 93.1957 + } 93.1958 + 93.1959 + m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale; 93.1960 + write_back_non_zero_count(ec, s); 93.1961 + 93.1962 + 93.1963 + return 0; 93.1964 +} 93.1965 + 93.1966 +void free_entropy_context(EntropyContext *ec){ 93.1967 + av_freep(&ec->non_zero_count_row[0]); 93.1968 + av_freep(&ec->non_zero_count_row[1]); 93.1969 + av_freep(&ec->mvd_table[0][0]); 93.1970 + av_freep(&ec->mvd_table[0][1]); 93.1971 + av_freep(&ec->mvd_table[1][0]); 93.1972 + av_freep(&ec->mvd_table[1][1]); 93.1973 + 93.1974 + av_freep(&ec->direct_table[0]); 93.1975 + av_freep(&ec->direct_table[1]); 93.1976 + av_freep(&ec->chroma_pred_mode_table[0]); 93.1977 + av_freep(&ec->chroma_pred_mode_table[1]); 93.1978 + av_freep(&ec->cbp_table[0]); 93.1979 + av_freep(&ec->cbp_table[1]); 93.1980 + av_freep(&ec->qscale_table[0]); 93.1981 + av_freep(&ec->qscale_table[1]); 93.1982 + 93.1983 + av_freep(&ec->mb_type_table[0]); 93.1984 + av_freep(&ec->mb_type_table[1]); 93.1985 + av_freep(&ec->ref_index_table[0][0]); 93.1986 + 
av_freep(&ec->ref_index_table[0][1]); 93.1987 + av_freep(&ec->ref_index_table[1][0]); 93.1988 + av_freep(&ec->ref_index_table[1][1]); 93.1989 + 93.1990 + 93.1991 + av_free(ec); 93.1992 +} 93.1993 + 93.1994 +EntropyContext *get_entropy_context(H264Context *h){ 93.1995 + const int mb_height = h->mb_height; 93.1996 + const int mb_width = h->mb_width; 93.1997 + const int mb_stride = h->mb_stride; 93.1998 + 93.1999 + EntropyContext *ec = av_mallocz(sizeof(EntropyContext)); 93.2000 + 93.2001 + ec->mb_width = mb_width; 93.2002 + ec->mb_height = mb_height; 93.2003 + ec->b_stride = mb_width*4; 93.2004 + ec->mb_stride = mb_stride; 93.2005 + 93.2006 + FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[0], mb_stride * 8 * sizeof(uint8_t), fail) 93.2007 + FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[1], mb_stride * 8 * sizeof(uint8_t), fail) 93.2008 + 93.2009 + FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][0], 16*mb_stride * sizeof(uint8_t), fail); 93.2010 + FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][1], 16*mb_stride * sizeof(uint8_t), fail); 93.2011 + FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][0], 16*mb_stride * sizeof(uint8_t), fail); 93.2012 + FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][1], 16*mb_stride * sizeof(uint8_t), fail); 93.2013 + 93.2014 + FF_ALLOCZ_OR_GOTO(ec->direct_table[0], 4*mb_stride * sizeof(uint8_t) , fail); 93.2015 + FF_ALLOCZ_OR_GOTO(ec->direct_table[1], 4*mb_stride * sizeof(uint8_t) , fail); 93.2016 + 93.2017 + FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[0], mb_stride * sizeof(uint8_t), fail) 93.2018 + FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[1], mb_stride * sizeof(uint8_t), fail) 93.2019 + 93.2020 + FF_ALLOCZ_OR_GOTO(ec->cbp_table[0], mb_stride * sizeof(uint16_t), fail) 93.2021 + FF_ALLOCZ_OR_GOTO(ec->cbp_table[1], mb_stride * sizeof(uint16_t), fail) 93.2022 + 93.2023 + FF_ALLOCZ_OR_GOTO(ec->qscale_table[0], mb_stride * sizeof(uint8_t) , fail) 93.2024 + FF_ALLOCZ_OR_GOTO(ec->qscale_table[1], mb_stride * sizeof(uint8_t) , fail) 93.2025 + 93.2026 + FF_ALLOCZ_OR_GOTO(ec->mb_type_table[0] , 
(mb_stride+1) * sizeof(uint32_t), fail) 93.2027 + FF_ALLOCZ_OR_GOTO(ec->mb_type_table[1] , (mb_stride+1) * sizeof(uint32_t), fail) 93.2028 + 93.2029 + FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][0], 4*mb_stride * sizeof(int8_t), fail) 93.2030 + FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][0], 4*mb_stride * sizeof(int8_t), fail) 93.2031 + FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][1], 4*mb_stride * sizeof(int8_t), fail) 93.2032 + FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][1], 4*mb_stride * sizeof(int8_t), fail) 93.2033 + 93.2034 + ec->zigzag_scan = h->zigzag_scan; 93.2035 + ec->zigzag_scan8x8 = h->zigzag_scan8x8; 93.2036 + 93.2037 + return ec; 93.2038 +fail: 93.2039 + free_entropy_context(ec); 93.2040 + return NULL; 93.2041 +} 93.2042 + 93.2043 +void init_entropy_buf(EntropyContext *ec, H264Slice *s, int line){ 93.2044 + int top = (line+1)%2; 93.2045 + int cur = line%2; 93.2046 + 93.2047 + ec->non_zero_count_top = ec->non_zero_count_row[top]; 93.2048 + ec->non_zero_count = ec->non_zero_count_row[cur]; 93.2049 + ec->mvd_top[0] = ec->mvd_table[0][top]; 93.2050 + ec->mvd[0] = ec->mvd_table[0][cur]; 93.2051 + ec->mvd_top[1] = ec->mvd_table[1][top]; 93.2052 + ec->mvd[1] = ec->mvd_table[1][cur]; 93.2053 + ec->direct_top = ec->direct_table[top]; 93.2054 + ec->direct = ec->direct_table[cur]; 93.2055 + ec->chroma_pred_mode_top = ec->chroma_pred_mode_table[top]; 93.2056 + ec->chroma_pred_mode = ec->chroma_pred_mode_table[cur]; 93.2057 + ec->cbp_top = ec->cbp_table[top]; 93.2058 + ec->cbp = ec->cbp_table[cur]; 93.2059 + ec->qscale_top = ec->qscale_table[top] +1; 93.2060 + ec->qscale = ec->qscale_table[cur] +1; 93.2061 + ec->mb_type_top = ec->mb_type_table[top]+1; 93.2062 + ec->mb_type = ec->mb_type_table[cur]+1; 93.2063 + ec->ref_index_top[0] = ec->ref_index_table[0][top]; 93.2064 + ec->ref_index_top[1] = ec->ref_index_table[1][top]; 93.2065 + ec->ref_index[0] = ec->ref_index_table[0][cur]; 93.2066 + ec->ref_index[1] = ec->ref_index_table[1][cur]; 93.2067 + 93.2068 +}
94.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 94.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_entropy.h Mon Aug 27 12:09:56 2012 +0200 94.3 @@ -0,0 +1,20 @@ 94.4 +#ifndef H264_CABAC_H 94.5 +#define H264_CABAC_H 94.6 + 94.7 +#include "h264_types.h" 94.8 +#include "cabac.h" 94.9 + 94.10 +/** 94.11 + * decodes a CABAC coded macroblock 94.12 + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed 94.13 + */ 94.14 + 94.15 +int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c); 94.16 +void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c); 94.17 + 94.18 +int init_entropy_buf(EntropyContext *ec, H264Slice *s, int line); 94.19 +EntropyContext * get_entropy_context(H264Context *h); 94.20 +void init_dequant_tables(H264Slice *s, EntropyContext *ec); 94.21 +void free_entropy_context(EntropyContext *ec); 94.22 + 94.23 +#endif
95.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 95.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_idct.c Mon Aug 27 12:09:56 2012 +0200 95.3 @@ -0,0 +1,270 @@ 95.4 +/* 95.5 + * H.264 IDCT 95.6 + * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at> 95.7 + * 95.8 + * This file is part of FFmpeg. 95.9 + * 95.10 + * FFmpeg is free software; you can redistribute it and/or 95.11 + * modify it under the terms of the GNU Lesser General Public 95.12 + * License as published by the Free Software Foundation; either 95.13 + * version 2.1 of the License, or (at your option) any later version. 95.14 + * 95.15 + * FFmpeg is distributed in the hope that it will be useful, 95.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 95.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 95.18 + * Lesser General Public License for more details. 95.19 + * 95.20 + * You should have received a copy of the GNU Lesser General Public 95.21 + * License along with FFmpeg; if not, write to the Free Software 95.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 95.23 + */ 95.24 + 95.25 +/** 95.26 + * @file 95.27 + * H.264 IDCT. 
95.28 + * @author Michael Niedermayer <michaelni@gmx.at> 95.29 + */ 95.30 + 95.31 +#include "dsputil.h" 95.32 +#include "h264_data.h" 95.33 + 95.34 +static av_always_inline void idct_internal(uint8_t *dst, DCTELEM *block, int stride, int block_stride, int shift, int add){ 95.35 + int i; 95.36 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 95.37 + 95.38 + block[0] += 1<<(shift-1); 95.39 + 95.40 + for(i=0; i<4; i++){ 95.41 + const int z0= block[0 + block_stride*i] + block[2 + block_stride*i]; 95.42 + const int z1= block[0 + block_stride*i] - block[2 + block_stride*i]; 95.43 + const int z2= (block[1 + block_stride*i]>>1) - block[3 + block_stride*i]; 95.44 + const int z3= block[1 + block_stride*i] + (block[3 + block_stride*i]>>1); 95.45 + 95.46 + block[0 + block_stride*i]= z0 + z3; 95.47 + block[1 + block_stride*i]= z1 + z2; 95.48 + block[2 + block_stride*i]= z1 - z2; 95.49 + block[3 + block_stride*i]= z0 - z3; 95.50 + } 95.51 + 95.52 + for(i=0; i<4; i++){ 95.53 + const int z0= block[i + block_stride*0] + block[i + block_stride*2]; 95.54 + const int z1= block[i + block_stride*0] - block[i + block_stride*2]; 95.55 + const int z2= (block[i + block_stride*1]>>1) - block[i + block_stride*3]; 95.56 + const int z3= block[i + block_stride*1] + (block[i + block_stride*3]>>1); 95.57 + 95.58 + dst[i + 0*stride]= cm[ add*dst[i + 0*stride] + ((z0 + z3) >> shift) ]; 95.59 + dst[i + 1*stride]= cm[ add*dst[i + 1*stride] + ((z1 + z2) >> shift) ]; 95.60 + dst[i + 2*stride]= cm[ add*dst[i + 2*stride] + ((z1 - z2) >> shift) ]; 95.61 + dst[i + 3*stride]= cm[ add*dst[i + 3*stride] + ((z0 - z3) >> shift) ]; 95.62 + } 95.63 +} 95.64 + 95.65 +void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride){ 95.66 + idct_internal(dst, block, stride, 4, 6, 1); 95.67 +} 95.68 + 95.69 +void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block){ 95.70 + idct_internal(dst, block, stride, 8, 3, 1); 95.71 +} 95.72 + 95.73 +void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM 
*block){ 95.74 + idct_internal(dst, block, stride, 8, 3, 0); 95.75 +} 95.76 + 95.77 +void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){ 95.78 + int i; 95.79 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 95.80 + 95.81 + block[0] += 32; 95.82 + 95.83 + for( i = 0; i < 8; i++ ) 95.84 + { 95.85 + const int a0 = block[0+i*8] + block[4+i*8]; 95.86 + const int a2 = block[0+i*8] - block[4+i*8]; 95.87 + const int a4 = (block[2+i*8]>>1) - block[6+i*8]; 95.88 + const int a6 = (block[6+i*8]>>1) + block[2+i*8]; 95.89 + 95.90 + const int b0 = a0 + a6; 95.91 + const int b2 = a2 + a4; 95.92 + const int b4 = a2 - a4; 95.93 + const int b6 = a0 - a6; 95.94 + 95.95 + const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1); 95.96 + const int a3 = block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1); 95.97 + const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1); 95.98 + const int a7 = block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1); 95.99 + 95.100 + const int b1 = (a7>>2) + a1; 95.101 + const int b3 = a3 + (a5>>2); 95.102 + const int b5 = (a3>>2) - a5; 95.103 + const int b7 = a7 - (a1>>2); 95.104 + 95.105 + block[0+i*8] = b0 + b7; 95.106 + block[7+i*8] = b0 - b7; 95.107 + block[1+i*8] = b2 + b5; 95.108 + block[6+i*8] = b2 - b5; 95.109 + block[2+i*8] = b4 + b3; 95.110 + block[5+i*8] = b4 - b3; 95.111 + block[3+i*8] = b6 + b1; 95.112 + block[4+i*8] = b6 - b1; 95.113 + } 95.114 + for( i = 0; i < 8; i++ ) 95.115 + { 95.116 + const int a0 = block[i+0*8] + block[i+4*8]; 95.117 + const int a2 = block[i+0*8] - block[i+4*8]; 95.118 + const int a4 = (block[i+2*8]>>1) - block[i+6*8]; 95.119 + const int a6 = (block[i+6*8]>>1) + block[i+2*8]; 95.120 + 95.121 + const int b0 = a0 + a6; 95.122 + const int b2 = a2 + a4; 95.123 + const int b4 = a2 - a4; 95.124 + const int b6 = a0 - a6; 95.125 + 95.126 + const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1); 95.127 + const int a3 = block[i+1*8] 
+ block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1); 95.128 + const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1); 95.129 + const int a7 = block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1); 95.130 + 95.131 + const int b1 = (a7>>2) + a1; 95.132 + const int b3 = a3 + (a5>>2); 95.133 + const int b5 = (a3>>2) - a5; 95.134 + const int b7 = a7 - (a1>>2); 95.135 + 95.136 + dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ]; 95.137 + dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ]; 95.138 + dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ]; 95.139 + dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ]; 95.140 + dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ]; 95.141 + dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ]; 95.142 + dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ]; 95.143 + dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ]; 95.144 + } 95.145 +} 95.146 + 95.147 +// assumes all AC coefs are 0 95.148 +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ 95.149 + int i, j; 95.150 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 95.151 + int dc = (block[0] + 32) >> 6; 95.152 + for( j = 0; j < 4; j++ ) 95.153 + { 95.154 + for( i = 0; i < 4; i++ ) 95.155 + dst[i] = cm[ dst[i] + dc ]; 95.156 + dst += stride; 95.157 + } 95.158 +} 95.159 + 95.160 +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ 95.161 + int i, j; 95.162 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 95.163 + int dc = (block[0] + 32) >> 6; 95.164 + for( j = 0; j < 8; j++ ) 95.165 + { 95.166 + for( i = 0; i < 8; i++ ) 95.167 + dst[i] = cm[ dst[i] + dc ]; 95.168 + dst += stride; 95.169 + } 95.170 +} 95.171 + 95.172 +void ff_h264_idct_add16_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 95.173 + int i; 95.174 + for(i=0; i<16; i++){ 95.175 + int nnz = nnzc[ scan8[i] ]; 95.176 + if(nnz){ 95.177 
+ if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride); 95.178 + else idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1); 95.179 + } 95.180 + } 95.181 +} 95.182 + 95.183 +void ff_h264_idct_add16intra_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 95.184 + int i; 95.185 + for(i=0; i<16; i++){ 95.186 + if(nnzc[ scan8[i] ]) idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1); 95.187 + else if(block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride); 95.188 + } 95.189 +} 95.190 + 95.191 +void ff_h264_idct8_add4_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 95.192 + int i; 95.193 + for(i=0; i<16; i+=4){ 95.194 + int nnz = nnzc[ scan8[i] ]; 95.195 + if(nnz){ 95.196 + if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_c(dst + block_offset[i], block + i*16, stride); 95.197 + else ff_h264_idct8_add_c (dst + block_offset[i], block + i*16, stride); 95.198 + } 95.199 + } 95.200 +} 95.201 + 95.202 +void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 95.203 + int i; 95.204 + for(i=16; i<16+8; i++){ 95.205 + if(nnzc[ scan8[i] ]) 95.206 + ff_h264_idct_add_c (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 95.207 + else if(block[i*16]) 95.208 + ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 95.209 + } 95.210 +} 95.211 + 95.212 +/** 95.213 +* IDCT transforms the 16 dc values and dequantizes them. 
95.214 +* @param qp quantization parameter 95.215 +*/ 95.216 +void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul){ 95.217 + #define stride 16 95.218 + int i; 95.219 + int temp[16]; //FIXME check if this is a good idea 95.220 + static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; 95.221 + static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; 95.222 + 95.223 + //return; 95.224 + for(i=0; i<4; i++){ 95.225 + const int offset= y_offset[i]; 95.226 + const int z0= block[offset+stride*0] + block[offset+stride*4]; 95.227 + const int z1= block[offset+stride*0] - block[offset+stride*4]; 95.228 + const int z2= block[offset+stride*1] - block[offset+stride*5]; 95.229 + const int z3= block[offset+stride*1] + block[offset+stride*5]; 95.230 + 95.231 + temp[4*i+0]= z0+z3; 95.232 + temp[4*i+1]= z1+z2; 95.233 + temp[4*i+2]= z1-z2; 95.234 + temp[4*i+3]= z0-z3; 95.235 + } 95.236 + 95.237 + for(i=0; i<4; i++){ 95.238 + const int offset= x_offset[i]; 95.239 + const int z0= temp[4*0+i] + temp[4*2+i]; 95.240 + const int z1= temp[4*0+i] - temp[4*2+i]; 95.241 + const int z2= temp[4*1+i] - temp[4*3+i]; 95.242 + const int z3= temp[4*1+i] + temp[4*3+i]; 95.243 + 95.244 + block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual 95.245 + block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); 95.246 + block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); 95.247 + block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); 95.248 + } 95.249 +} 95.250 + 95.251 +#undef xStride 95.252 +#undef stride 95.253 + 95.254 +void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul){ 95.255 + const int stride= 16*2; 95.256 + const int xStride= 16; 95.257 + int a,b,c,d,e; 95.258 + 95.259 + a= block[stride*0 + xStride*0]; 95.260 + b= block[stride*0 + xStride*1]; 95.261 + c= block[stride*1 + xStride*0]; 95.262 + d= block[stride*1 + xStride*1]; 95.263 + 95.264 + e= a-b; 95.265 + a= a+b; 95.266 + b= c-d; 95.267 + c= c+d; 
95.268 + 95.269 + block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7; 95.270 + block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7; 95.271 + block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7; 95.272 + block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7; 95.273 +}
96.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 96.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_idct.h Mon Aug 27 12:09:56 2012 +0200 96.3 @@ -0,0 +1,19 @@ 96.4 +#ifndef H264_IDCT_H 96.5 +#define H264_IDCT_H 96.6 + 96.7 +#include "avcodec.h" 96.8 + 96.9 +void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride); 96.10 +void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride); 96.11 +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); 96.12 +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); 96.13 +void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); 96.14 +void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); 96.15 +void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); 96.16 +void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); 96.17 +void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); 96.18 +void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); 96.19 +void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul); 96.20 +void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul); 96.21 + 96.22 +#endif
97.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 97.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_mc.c Mon Aug 27 12:09:56 2012 +0200 97.3 @@ -0,0 +1,272 @@ 97.4 +#include "h264_types.h" 97.5 +#include "h264_data.h" 97.6 + 97.7 +static inline void mc_dir_part(MBRecContext *d, MBRecState *mrs, H264Mb *m, DecodedPicture *pic, int n, int square, 97.8 + int chroma_height, int delta, int list,uint8_t *dest_y, 97.9 + uint8_t *dest_cb, uint8_t *dest_cr, int src_x_offset, int src_y_offset, 97.10 + qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){ 97.11 + const int mx= mrs->mv_cache[list][ scan8[n] ][0] + src_x_offset*8; 97.12 + const int my= mrs->mv_cache[list][ scan8[n] ][1] + src_y_offset*8; 97.13 + const int luma_xy= (mx&3) + ((my&3)<<2); 97.14 + const int pic_width = 16*d->mb_width; 97.15 + const int pic_height = 16*d->mb_height; 97.16 + 97.17 + uint8_t *src_y, *src_cb, *src_cr; 97.18 + int ymx= mx>>2; 97.19 + int ymy= my>>2; 97.20 + int cmy= my>>3; 97.21 + int cmx= mx>>3; 97.22 + 97.23 + //truncate the motion vectors references 97.24 + if(ymy>= pic_height+2){ 97.25 + ymy=pic_height+1; 97.26 + }else if(ymy <=-19){ 97.27 + ymy=-18; 97.28 + } 97.29 + if(ymx>= pic_width+2){ 97.30 + ymx= pic_width+1; 97.31 + }else if(ymx<=-19){ 97.32 + ymx=-19; 97.33 + } 97.34 + 97.35 + src_y = pic->data[0] + ymx + ymy*d->linesize; 97.36 + qpix_op[luma_xy](dest_y, src_y, d->linesize); //FIXME try variable height perhaps? 
97.37 + if(!square){ 97.38 + qpix_op[luma_xy](dest_y + delta, src_y + delta, d->linesize); 97.39 + } 97.40 + 97.41 + if(cmy >= pic_height>>1){ 97.42 + cmy = (pic_height>>1) -1; 97.43 + }else if(cmy<=-9){ 97.44 + cmy=-8; 97.45 + } 97.46 + if(cmx >= pic_width>>1){ 97.47 + cmx = (pic_width>>1) -1; 97.48 + }else if(cmx<=-9){ 97.49 + cmx=-8; 97.50 + } 97.51 + 97.52 + src_cb= pic->data[1] + cmx + cmy*d->uvlinesize; 97.53 + src_cr= pic->data[2] + cmx + cmy*d->uvlinesize; 97.54 + 97.55 + chroma_op(dest_cb, src_cb, d->uvlinesize, chroma_height, mx&7, my&7); 97.56 + chroma_op(dest_cr, src_cr, d->uvlinesize, chroma_height, mx&7, my&7); 97.57 +} 97.58 + 97.59 +static inline void mc_part_std(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta, 97.60 + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, 97.61 + int x_offset, int y_offset, 97.62 + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, 97.63 + qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, 97.64 + int list0, int list1){ 97.65 + qpel_mc_func *qpix_op= qpix_put; 97.66 + h264_chroma_mc_func chroma_op= chroma_put; 97.67 + 97.68 + dest_y += 2*x_offset + 2*y_offset*d-> linesize; 97.69 + dest_cb += x_offset + y_offset*d->uvlinesize; 97.70 + dest_cr += x_offset + y_offset*d->uvlinesize; 97.71 + x_offset += 8*m->mb_x; 97.72 + y_offset += 8*m->mb_y; 97.73 + 97.74 + if(list0){ 97.75 + DecodedPicture *ref= s->dp_ref_list[0][ mrs->ref_cache[0][ scan8[n] ] ]; 97.76 + mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, 0, 97.77 + dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op); 97.78 + 97.79 + qpix_op= qpix_avg; 97.80 + chroma_op= chroma_avg; 97.81 + } 97.82 + 97.83 + if(list1){ 97.84 + DecodedPicture *ref= s->dp_ref_list[1][ mrs->ref_cache[1][ scan8[n] ] ]; 97.85 + mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, 1, 97.86 + dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_op, chroma_op); 97.87 + } 97.88 +} 97.89 + 97.90 
+static inline void mc_part_weighted(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta, 97.91 + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, 97.92 + int x_offset, int y_offset, 97.93 + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, 97.94 + h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, 97.95 + h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, 97.96 + int list0, int list1){ 97.97 + dest_y += 2*x_offset + 2*y_offset*d-> linesize; 97.98 + dest_cb += x_offset + y_offset*d->uvlinesize; 97.99 + dest_cr += x_offset + y_offset*d->uvlinesize; 97.100 + x_offset += 8*m->mb_x; 97.101 + y_offset += 8*m->mb_y; 97.102 + 97.103 + if(list0 && list1){ 97.104 + /* don't optimize for luma-only case, since B-frames usually 97.105 + * use implicit weights => chroma too. */ 97.106 + uint8_t *tmp_y = d->scratchpad_y + 2*x_offset +16 ; 97.107 + uint8_t *tmp_cb = d->scratchpad_cb + x_offset + 8; 97.108 + uint8_t *tmp_cr = d->scratchpad_cr + x_offset + 8; 97.109 + 97.110 +/* 97.111 + uint8_t *tmp_cb = d->scratchpad; 97.112 + uint8_t *tmp_cr = d->scratchpad + 8; 97.113 + uint8_t *tmp_y = d->scratchpad + 8*d->uvlinesize;*/ 97.114 + int refn0 = mrs->ref_cache[0][ scan8[n] ]; 97.115 + int refn1 = mrs->ref_cache[1][ scan8[n] ]; 97.116 + 97.117 + mc_dir_part(d, mrs, m, s->dp_ref_list[0][refn0], n, square, chroma_height, delta, 0, 97.118 + dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put); 97.119 + mc_dir_part(d, mrs, m, s->dp_ref_list[1][refn1], n, square, chroma_height, delta, 1, 97.120 + tmp_y, tmp_cb, tmp_cr, x_offset, y_offset, qpix_put, chroma_put); 97.121 + 97.122 + if(s->use_weight == 2){ 97.123 + int weight0 = s->implicit_weight[refn0][refn1][m->mb_y&1]; 97.124 + int weight1 = 64 - weight0; 97.125 + luma_weight_avg( dest_y, tmp_y, d-> linesize, 5, weight0, weight1, 0); 97.126 + chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, 5, weight0, weight1, 0); 
97.127 + chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, 5, weight0, weight1, 0); 97.128 + }else{ 97.129 + luma_weight_avg(dest_y, tmp_y, d->linesize, s->luma_log2_weight_denom, 97.130 + s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0], 97.131 + s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]); 97.132 + chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, s->chroma_log2_weight_denom, 97.133 + s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0], 97.134 + s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]); 97.135 + chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, s->chroma_log2_weight_denom, 97.136 + s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0], 97.137 + s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]); 97.138 + } 97.139 + }else{ 97.140 + int list = list1 ? 1 : 0; 97.141 + int refn = mrs->ref_cache[list][ scan8[n] ]; 97.142 + DecodedPicture *ref= s->dp_ref_list[list][refn]; 97.143 + mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, list, 97.144 + dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put); 97.145 + 97.146 + luma_weight_op(dest_y, d->linesize, s->luma_log2_weight_denom, 97.147 + s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]); 97.148 + if(s->use_weight_chroma){ 97.149 + chroma_weight_op(dest_cb, d->uvlinesize, s->chroma_log2_weight_denom, 97.150 + s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]); 97.151 + chroma_weight_op(dest_cr, d->uvlinesize, s->chroma_log2_weight_denom, 97.152 + s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]); 97.153 + } 97.154 + } 97.155 +} 97.156 + 97.157 +static inline void mc_part(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta, 97.158 + uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, 97.159 + int x_offset, int y_offset, 97.160 + qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, 97.161 
+ qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg, 97.162 + h264_weight_func *weight_op, h264_biweight_func *weight_avg, 97.163 + int list0, int list1){ 97.164 + if((s->use_weight==2 && list0 && list1 97.165 + && (s->implicit_weight[ mrs->ref_cache[0][scan8[n]] ][ mrs->ref_cache[1][scan8[n]] ][m->mb_y&1] != 32)) 97.166 + || s->use_weight==1) 97.167 + mc_part_weighted(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, 97.168 + x_offset, y_offset, qpix_put, chroma_put, 97.169 + weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1); 97.170 + else 97.171 + mc_part_std(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, 97.172 + x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1); 97.173 +} 97.174 + 97.175 +static inline void prefetch_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int list){ 97.176 + /* fetch pixels for estimated mv 4 macroblocks ahead 97.177 + * optimized for 64byte cache lines */ 97.178 + const int refn = mrs->ref_cache[list][scan8[0]]; 97.179 + 97.180 + if(refn >= 0){ 97.181 + const int mx= (mrs->mv_cache[list][scan8[0]][0]>>2) + 16*m->mb_x + 8; 97.182 + const int my= (mrs->mv_cache[list][scan8[0]][1]>>2) + 16*m->mb_y; 97.183 + uint8_t **src= s->dp_ref_list[list][refn]->data; 97.184 + int off= mx + (my + (m->mb_x&3)*4)*d->linesize + 64; 97.185 + 97.186 + d->dsp.prefetch(src[0]+off, d->linesize, 4); 97.187 + off= (mx>>1) + ((my>>1) + (m->mb_x&7))*d->uvlinesize + 64; 97.188 + d->dsp.prefetch(src[1]+off, src[2]-src[1], 2); 97.189 + } 97.190 +} 97.191 + 97.192 +void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, 97.193 + qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put), 97.194 + qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg), 97.195 + h264_weight_func *weight_op, h264_biweight_func *weight_avg){ 97.196 + const int mb_type= m->mb_type; 
97.197 + assert(IS_INTER(mb_type)); 97.198 + 97.199 + if (mb_type & MB_TYPE_L0) 97.200 + prefetch_motion(d, mrs, s, m, 0); 97.201 + if (mb_type & MB_TYPE_L1) 97.202 + prefetch_motion(d, mrs, s, m, 1); 97.203 + 97.204 + if(IS_16X16(mb_type)){ 97.205 + mc_part(d, mrs, s, m, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, 97.206 + qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], 97.207 + weight_op, weight_avg, 97.208 + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); 97.209 + }else if(IS_16X8(mb_type)){ 97.210 + mc_part(d, mrs, s, m, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0, 97.211 + qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], 97.212 + &weight_op[1], &weight_avg[1], 97.213 + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); 97.214 + mc_part(d, mrs, s, m, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4, 97.215 + qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], 97.216 + &weight_op[1], &weight_avg[1], 97.217 + IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); 97.218 + }else if(IS_8X16(mb_type)){ 97.219 + mc_part(d, mrs, s, m, 0, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 0, 0, 97.220 + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], 97.221 + &weight_op[2], &weight_avg[2], 97.222 + IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1)); 97.223 + mc_part(d, mrs, s, m, 4, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 4, 0, 97.224 + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], 97.225 + &weight_op[2], &weight_avg[2], 97.226 + IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); 97.227 + }else{ 97.228 + int i; 97.229 + 97.230 + assert(IS_8X8(mb_type)); 97.231 + 97.232 + for(i=0; i<4; i++){ 97.233 + const int sub_mb_type= m->sub_mb_type[i]; 97.234 + const int n= 4*i; 97.235 + int x_offset= (i&1)<<2; 97.236 + int y_offset= (i&2)<<1; 97.237 + 97.238 + if(IS_SUB_8X8(sub_mb_type)){ 97.239 + mc_part(d, mrs, s, m, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, 97.240 + qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], 97.241 + &weight_op[3], 
&weight_avg[3], 97.242 + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 97.243 + }else if(IS_SUB_8X4(sub_mb_type)){ 97.244 + mc_part(d, mrs, s, m, n, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset, 97.245 + qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], 97.246 + &weight_op[4], &weight_avg[4], 97.247 + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 97.248 + mc_part(d, mrs, s, m, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, 97.249 + qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], 97.250 + &weight_op[4], &weight_avg[4], 97.251 + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 97.252 + }else if(IS_SUB_4X8(sub_mb_type)){ 97.253 + mc_part(d, mrs, s, m, n, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, 97.254 + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], 97.255 + &weight_op[5], &weight_avg[5], 97.256 + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 97.257 + mc_part(d, mrs, s, m, n+1, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, 97.258 + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], 97.259 + &weight_op[5], &weight_avg[5], 97.260 + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 97.261 + }else{ 97.262 + int j; 97.263 + assert(IS_SUB_4X4(sub_mb_type)); 97.264 + for(j=0; j<4; j++){ 97.265 + int sub_x_offset= x_offset + 2*(j&1); 97.266 + int sub_y_offset= y_offset + (j&2); 97.267 + mc_part(d, mrs, s, m, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, 97.268 + qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], 97.269 + &weight_op[6], &weight_avg[6], 97.270 + IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); 97.271 + } 97.272 + } 97.273 + } 97.274 + } 97.275 +}
/*
 * hgweb diff section 98: new file
 * b/ffmpeg_smp/h264dec/libavcodec/h264_mc.h (Mon Aug 27 12:09:56 2012 +0200)
 *
 * Public interface of the H.264 inter-prediction (motion compensation) code.
 */
#ifndef H264_MC_H
#define H264_MC_H

#include "dsputil.h"
#include "h264_types.h"

/**
 * Inter-predict one macroblock into the given destination planes.
 *
 * The implementation asserts that the macroblock is inter coded, prefetches
 * reference data for each list the macroblock type uses and then runs one
 * prediction per partition (16x16, 16x8, 8x16 or 8x8 with sub-partitions).
 *
 * @param d          reconstruction context (line sizes, scratchpads, dsp hooks)
 * @param mrs        per-macroblock state holding the ref_cache / mv_cache
 * @param s          slice providing reference lists and weighting tables
 * @param m          macroblock being reconstructed
 * @param dest_y     destination for luma
 * @param dest_cb    destination for Cb
 * @param dest_cr    destination for Cr
 * @param qpix_put   luma "put" function tables, indexed first by block size
 * @param chroma_put chroma "put" functions, indexed by block size
 * @param qpix_avg   luma "avg" function tables (used for the second list)
 * @param chroma_avg chroma "avg" functions (used for the second list)
 * @param weight_op  uni-directional weighting functions, indexed by shape
 * @param weight_avg bi-directional weighting functions, indexed by shape
 */
void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
               qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
               qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
               h264_weight_func *weight_op, h264_biweight_func *weight_avg);

#endif
99.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 99.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_misc.c Mon Aug 27 12:09:56 2012 +0200 99.3 @@ -0,0 +1,944 @@ 99.4 +#include "config.h" 99.5 + 99.6 +#include "h264_types.h" 99.7 + 99.8 +#include <unistd.h> 99.9 +#include <sys/resource.h> 99.10 +#include <sys/time.h> 99.11 +#include <time.h> 99.12 +#include <pthread.h> 99.13 +#undef NDEBUG 99.14 +#include <assert.h> 99.15 + 99.16 +#if HAVE_LIBSDL2 99.17 +#include <SDL2/SDL.h> 99.18 +#if HAVE_LIBSDL_TTF 99.19 +#include <SDL/SDL_ttf.h> 99.20 +#endif 99.21 +#endif 99.22 + 99.23 +void start_timer(H264Context *h, int stage){ 99.24 + clock_gettime(CLOCK_REALTIME, &h->start_time[stage]); 99.25 +} 99.26 + 99.27 +void stop_timer(H264Context *h, int stage){ 99.28 + clock_gettime(CLOCK_REALTIME, &h->end_time[stage]); 99.29 + double time = (double) 1.e3*(h->end_time[stage].tv_sec - h->start_time[stage].tv_sec) + 1.e-6*(h->end_time[stage].tv_nsec - h->start_time[stage].tv_nsec); 99.30 + h->last_time [stage] = time; 99.31 + h->total_time[stage] += time; 99.32 +} 99.33 + 99.34 +void init_sb_entry(H264Context *h, SliceBufferEntry *sbe){ 99.35 + sbe->mbs = av_malloc(h->mb_width*h->mb_height* sizeof(H264Mb)); 99.36 + sbe->initialized = 1; 99.37 +} 99.38 + 99.39 +void free_sb_entry(SliceBufferEntry *sbe){ 99.40 + av_free(sbe->mbs); 99.41 + av_freep(&sbe->gb.raw); 99.42 + if (sbe->gb.rbsp) 99.43 + av_freep(&sbe->gb.rbsp); 99.44 + sbe->initialized = 0; 99.45 +} 99.46 + 99.47 +SliceBufferEntry *get_sb_entry(H264Context *h){ 99.48 + SliceBufferEntry *sb = NULL; 99.49 + 99.50 + pthread_mutex_lock(&h->lock[PARSE]); 99.51 + while (h->free_sb_cnt<=0) 99.52 + pthread_cond_wait(&h->cond[PARSE], &h->lock[PARSE]); 99.53 + /* use first free picture */ 99.54 + for(int i=0; i<h->sb_size; i++){ 99.55 + if(h->sb[i].state==0){ 99.56 + sb= &h->sb[i]; 99.57 + sb->state=1; 99.58 + sb->lines_taken=0; 99.59 + sb->lines_total=h->mb_height; 99.60 + break; 99.61 + } 99.62 + } 99.63 + h->free_sb_cnt--; 99.64 + 
99.65 + pthread_mutex_unlock(&h->lock[PARSE]); 99.66 + 99.67 + memset (&sb->slice, 0, sizeof(H264Slice)); 99.68 + 99.69 + return sb; 99.70 +} 99.71 + 99.72 +void release_sb_entry(H264Context *h, SliceBufferEntry *sb){ 99.73 + pthread_mutex_lock(&h->lock[PARSE]); 99.74 + 99.75 + sb->state = 0; 99.76 + h->free_sb_cnt++; 99.77 + pthread_cond_signal(&h->cond[PARSE]); 99.78 + 99.79 + pthread_mutex_unlock(&h->lock[PARSE]); 99.80 +} 99.81 + 99.82 +int init_dpb_entry(H264Context *h, DecodedPicture *pic, H264Slice *s, int width, int height){ 99.83 + int i; 99.84 + 99.85 + s->curr_pic=pic; 99.86 + pic->poc = s->poc; 99.87 + pic->key_frame = s->key_frame; 99.88 + pic->mmco_reset = s->mmco_reset; 99.89 + pic->reference = s->nal_ref_idc? 3:1; 99.90 + pic->cpn = s->coded_pic_num; 99.91 + 99.92 + if(pic->data[0]==NULL) { 99.93 + int size[3] = {0}; 99.94 + 99.95 + width+= EDGE_WIDTH*2; 99.96 + height+= EDGE_WIDTH*2; 99.97 + 99.98 + pic->linesize[0]= width; 99.99 + pic->linesize[1]= pic->linesize[2] = width>>1; 99.100 + 99.101 + size[0] = width*height; 99.102 + size[1] = size[2] = width*height>>2; 99.103 + 99.104 + for(i=0; i<3; i++){ 99.105 + pic->base[i]= av_malloc(size[i]); 99.106 + } 99.107 + 99.108 + pic->data[0] = pic->base[0] + (pic->linesize[0]*EDGE_WIDTH) + EDGE_WIDTH; 99.109 + pic->data[1] = pic->base[1] + (pic->linesize[1]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1); 99.110 + pic->data[2] = pic->base[2] + (pic->linesize[2]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1); 99.111 + } 99.112 + 99.113 + const int big_mb_num= h->mb_stride*(h->mb_height+1) + 1; //the +1 is needed so memset(,,stride*height) does not sig11 99.114 + const int mb_array_size= h->mb_stride*h->mb_height; 99.115 + const int b4_array_size= h->b4_stride*h->mb_height*4; 99.116 + 99.117 + if(pic->mb_type_base==NULL){ 99.118 + FF_ALLOCZ_OR_GOTO(pic->mb_type_base , big_mb_num * sizeof(uint32_t), fail) 99.119 + pic->mb_type= pic->mb_type_base + h->mb_stride+1; 99.120 + 99.121 + for(int i=0; i<2; i++){ 99.122 + 
FF_ALLOCZ_OR_GOTO(pic->motion_val_base[i], 2 * (b4_array_size+4) * sizeof(int16_t), fail) 99.123 + pic->motion_val[i]= pic->motion_val_base[i]+4; 99.124 + FF_ALLOCZ_OR_GOTO(pic->ref_index[i], 4*mb_array_size * sizeof(uint8_t), fail) 99.125 + } 99.126 + FF_ALLOCZ_OR_GOTO(pic->intra4x4_pred_mode, h->mb_width*h->mb_height * 4* sizeof(int8_t), fail) 99.127 + } 99.128 + 99.129 + return 0; 99.130 + fail: 99.131 + return -1; 99.132 +} 99.133 + 99.134 +void free_dp(DecodedPicture *pic){ 99.135 + if(pic->base[0]){ 99.136 + for (int i=0; i<3; i++){ 99.137 + av_free(pic->base[i]); 99.138 + pic->data[i]= NULL; 99.139 + } 99.140 + } 99.141 + if (pic->mb_type_base){ 99.142 + av_free(pic->mb_type_base); 99.143 + pic->mb_type= NULL; 99.144 + for(int i=0; i<2; i++){ 99.145 + av_free(pic->motion_val_base[i]); 99.146 + av_free(pic->ref_index[i]); 99.147 + } 99.148 + av_free(pic->intra4x4_pred_mode); 99.149 + } 99.150 +} 99.151 + 99.152 +DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s){ 99.153 + DecodedPicture *dp = NULL; 99.154 + 99.155 + pthread_mutex_lock(&h->lock[REORDER2]); 99.156 + while (h->free_dpb_cnt<=0){ 99.157 + #if OMPSS 99.158 + assert(0); 99.159 + #endif 99.160 + pthread_cond_wait(&h->cond[REORDER2], &h->lock[REORDER2]); 99.161 + } 99.162 + /* use first free picture */ 99.163 + for(int i=0; i<h->max_dpb_cnt; i++){ 99.164 + if(h->dpb[i].reference==0){ 99.165 + dp= &h->dpb[i]; 99.166 + break; 99.167 + } 99.168 + } 99.169 + assert(dp); 99.170 + init_dpb_entry(h, dp, s, h->width, h->height); 99.171 + h->free_dpb_cnt--; 99.172 + h->acdpb_cnt++; //debug 99.173 + pthread_mutex_unlock(&h->lock[REORDER2]); 99.174 + 99.175 + return dp; 99.176 +} 99.177 + 99.178 +void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode){ 99.179 + pthread_mutex_lock(&h->lock[REORDER2]); 99.180 + pic->reference &= ~mode; 99.181 + if (pic->reference == 0){ 99.182 + h->free_dpb_cnt++; 99.183 + h->reldpb_cnt++; //debug 99.184 + pthread_cond_signal(&h->cond[REORDER2]); 99.185 + } 
99.186 + pthread_mutex_unlock(&h->lock[REORDER2]); 99.187 +} 99.188 + 99.189 + 99.190 +/** 99.191 +* Extends the edges of a macroblock line. 99.192 +*/ 99.193 +void draw_edges(MBRecContext *d, H264Slice *s, int line){ 99.194 + int i; 99.195 + int mb_width=d->mb_width; 99.196 + int mb_height=d->mb_height; 99.197 + int last = (line+1 == mb_height); 99.198 + int lines = last?16:12; 99.199 + int linesize = d->linesize; 99.200 + int uvlinesize = d->uvlinesize; 99.201 + uint8_t *y = s->curr_pic->data[0] + 16*line*linesize; 99.202 + uint8_t *cb = s->curr_pic->data[1] + 8*line*uvlinesize; 99.203 + uint8_t *cr = s->curr_pic->data[2] + 8*line*uvlinesize; 99.204 + 99.205 + for (i=-4; i<lines; i++){ 99.206 + memset(y + i*linesize - EDGE_WIDTH, y[i*linesize], EDGE_WIDTH); 99.207 + memset(y + i*linesize + mb_width*16, y[i*linesize +mb_width*16 -1], EDGE_WIDTH); 99.208 + } 99.209 + for (i=-2; i<lines/2; i++){ 99.210 + memset(cb + i*uvlinesize - EDGE_WIDTH/2, cb[i*uvlinesize], EDGE_WIDTH/2); 99.211 + memset(cb + i*uvlinesize + mb_width*8, cb[i*uvlinesize +mb_width*8 -1], EDGE_WIDTH/2); 99.212 + memset(cr + i*uvlinesize - EDGE_WIDTH/2, cr[i*uvlinesize], EDGE_WIDTH/2); 99.213 + memset(cr + i*uvlinesize + mb_width*8, cr[i*uvlinesize +mb_width*8 -1], EDGE_WIDTH/2); 99.214 + } 99.215 + 99.216 + if (line==0){ 99.217 + y -= EDGE_WIDTH; 99.218 + cb -= EDGE_WIDTH/2; 99.219 + cr -= EDGE_WIDTH/2; 99.220 + for (i=1; i<=21; i++){ 99.221 + memcpy(y -i*linesize, y, linesize); 99.222 + } 99.223 + for (i=1; i<=9; i++){ 99.224 + memcpy(cb -i*uvlinesize, cb, uvlinesize); 99.225 + memcpy(cr -i*uvlinesize, cr, uvlinesize); 99.226 + } 99.227 + }else if (last){ 99.228 + y += -EDGE_WIDTH + 15*linesize; 99.229 + cb += -EDGE_WIDTH/2 + 7*uvlinesize; 99.230 + cr += -EDGE_WIDTH/2 + 7*uvlinesize; 99.231 + for (i=1; i<=21; i++){ 99.232 + memcpy(y +i*linesize, y, linesize); 99.233 + } 99.234 + for (i=1; i<=9; i++){ 99.235 + memcpy(cb +i*uvlinesize, cb, uvlinesize); 99.236 + memcpy(cr +i*uvlinesize, cr, 
uvlinesize); 99.237 + } 99.238 + } 99.239 +} 99.240 + 99.241 +static int64_t timer_start; 99.242 +int64_t av_gettime(void) { 99.243 + struct timeval tv; 99.244 + gettimeofday(&tv,NULL); 99.245 + return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; 99.246 +} 99.247 + 99.248 +void av_start_timer(){ 99.249 + timer_start = av_gettime(); 99.250 +} 99.251 + 99.252 +void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose) { 99.253 + static int64_t last_time = -1; 99.254 + static int64_t last_frame_number = 0; 99.255 + float t=0, t2=0; 99.256 + int64_t cur_time=0; 99.257 + 99.258 + if (!is_last_report) { 99.259 + /* display the report every 0.5 seconds */ 99.260 + cur_time = av_gettime(); 99.261 + if (last_time == -1) { 99.262 + last_time = cur_time; 99.263 + return; 99.264 + } 99.265 + if ((cur_time - last_time) < 500000) 99.266 + return; 99.267 + t = (cur_time-timer_start) / 1000000.0; 99.268 + t2 = (cur_time-last_time) / 1000000.0; 99.269 + } 99.270 + 99.271 + if (verbose){ 99.272 + fprintf(stderr, "frame=%5d avgfps=%3d curfps=%3d\r", frame_number, (int)(frame_number/t+0.5), (int)((frame_number - last_frame_number)/t2+0.5) ); 99.273 + fflush(stderr); 99.274 + } 99.275 + last_frame_number = frame_number; 99.276 + last_time = cur_time; 99.277 + 99.278 + if (is_last_report){ 99.279 + t = (av_gettime()-timer_start) / 1000000.0; 99.280 + fprintf(stderr, "%c[2Kframe=%5d avgfps=%3d\r", 27, frame_number, (int)(frame_number/t+0.5)); 99.281 + fprintf(stderr, "\n"); 99.282 + fprintf(stderr, "video:%1.0fkB\n", video_size/1024.0); 99.283 + fflush(stderr); 99.284 + } 99.285 +} 99.286 + 99.287 +/* Sort B-frames into display order */ 99.288 +static DecodedPicture *get_reordered_picture(OutputContext *w, int flush){ 99.289 + int i; 99.290 + int out_idx = 0; 99.291 + DecodedPicture *out = w->delayed_pic[0]; 99.292 + 99.293 + if (!out) 99.294 + return NULL; 99.295 + 99.296 + for(i=1; w->delayed_pic[i] && !w->delayed_pic[i]->key_frame && 
!w->delayed_pic[i]->mmco_reset; i++){ 99.297 + if(w->delayed_pic[i]->poc < out->poc){ 99.298 + out = w->delayed_pic[i]; 99.299 + out_idx = i; 99.300 + } 99.301 + } 99.302 + 99.303 + if(w->dp_cnt > MAX_DELAYED_PIC_COUNT || flush) { 99.304 + for(i=out_idx; w->delayed_pic[i]; i++) 99.305 + w->delayed_pic[i] = w->delayed_pic[i+1]; 99.306 + w->dp_cnt--; 99.307 + return out; 99.308 + } 99.309 + return NULL; 99.310 +} 99.311 + 99.312 +/** 99.313 +* Remove the extra borders, and places the three parts of the image after each other. 99.314 +*/ 99.315 +static int raw_encode(const DecodedPicture* src, int width, int height, unsigned char *dest) { 99.316 + int i, j; 99.317 +/** To write entire image including extra borders*/ 99.318 +// int w = src->linesize[0]; 99.319 +// int h = height+64; 99.320 +// int w2 = w>>1; 99.321 +// int h2 = h>>1; 99.322 +// int data_planes=3; 99.323 +// int size = w * h + 2 *w2*h2; 99.324 +// const unsigned char* s; 99.325 +// for (i=0; i<data_planes; i++) { 99.326 +// if (i == 1) { 99.327 +// w = w2; 99.328 +// h = h2; 99.329 +// } 99.330 +// s = src->base[i]; 99.331 +// for(j=0; j<h; j++) { 99.332 +// memcpy(dest, s, src->linesize[i]); 99.333 +// dest += w; 99.334 +// s += src->linesize[i]; 99.335 +// } 99.336 +// } 99.337 + 99.338 + int w = (width*8 + 7)/8; 99.339 + int h = height; 99.340 + int w2 =((width >>1) * 8 + 7) / 8; 99.341 + int h2 = ((height+1) >>1); //not sure about +1 99.342 + int data_planes=3; 99.343 + int size = w * h + 2 *w2*h2; 99.344 + const unsigned char* s; 99.345 + 99.346 + 99.347 + for (i=0; i<data_planes; i++) { 99.348 + if (i == 1) { 99.349 + w = w2; 99.350 + h = h2; 99.351 + } 99.352 + s = src->data[i]; 99.353 + for(j=0; j<h; j++) { 99.354 + memcpy(dest, s, w); 99.355 + dest += w; 99.356 + s += src->linesize[i]; 99.357 + } 99.358 + } 99.359 + return size; 99.360 +} 99.361 + 99.362 +#ifdef HAVE_LIBSDL2 99.363 +static SDL_Texture *get_next_texture(H264Context *h, int side){ 99.364 + SDLTextureQueue *sdlq = &h->sdlq; 99.365 
+ SDL_Texture *texture; 99.366 + pthread_mutex_lock (&sdlq->sdl_lock); 99.367 + if (side ){ //send 99.368 + while (sdlq->ready >= sdlq->size) 99.369 + pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock); 99.370 + texture = sdlq->queue[sdlq->fi]; 99.371 + sdlq->fi++; sdlq->fi %= sdlq->size; 99.372 + } else { //recv 99.373 + while (sdlq->ready <= 0 && !sdlq->exit) 99.374 + pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock); 99.375 + 99.376 + if (sdlq->ready == 0 && sdlq->exit){ 99.377 + texture = NULL; 99.378 + }else{ 99.379 + texture = sdlq->queue[sdlq->fo]; 99.380 + sdlq->fo++; sdlq->fo %= sdlq->size; 99.381 + } 99.382 + } 99.383 + pthread_mutex_unlock(&sdlq->sdl_lock); 99.384 + 99.385 + return texture; 99.386 +} 99.387 + 99.388 +static void signal_texture(H264Context *h, int side){ 99.389 + SDLTextureQueue *sdlq = &h->sdlq; 99.390 + pthread_mutex_lock (&sdlq->sdl_lock); 99.391 + if (side) 99.392 + sdlq->ready++; 99.393 + else 99.394 + sdlq->ready--; 99.395 + pthread_cond_signal(&sdlq->sdl_cond); 99.396 + pthread_mutex_unlock(&sdlq->sdl_lock); 99.397 +} 99.398 + 99.399 +void signal_sdl_exit(H264Context *h){ 99.400 + SDLTextureQueue *sdlq = &h->sdlq; 99.401 + pthread_mutex_lock (&sdlq->sdl_lock); 99.402 + sdlq->exit=1; 99.403 + pthread_cond_signal(&sdlq->sdl_cond); 99.404 + pthread_mutex_unlock(&sdlq->sdl_lock); 99.405 +} 99.406 + 99.407 +static void display_frame(H264Context *h, OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height, int dropable){ 99.408 + static int64_t last_time = -1; 99.409 + int64_t cur_time; 99.410 +// SDLContext *sdlc = h->sdlc; 99.411 + uint8_t *iyuv_pixels; 99.412 + int pitch; 99.413 + 99.414 + 99.415 + if (last_time == -1){ 99.416 + last_time = av_gettime(); 99.417 + } 99.418 + 99.419 + 99.420 + /* do not display frames that are less than 8.125 ms apart (120fps)*/ 99.421 + if (dropable){ 99.422 + cur_time = av_gettime(); 99.423 + 99.424 + if ((cur_time - last_time) < 8125) 99.425 + return; 99.426 + 99.427 
+ last_time =cur_time; 99.428 + } 99.429 + 99.430 + if(in_picture){ 99.431 + 99.432 + SDL_Texture *texture= get_next_texture(h, 1); 99.433 + 99.434 + SDL_LockTexture( texture, NULL, (void **)&iyuv_pixels, &pitch ); 99.435 + 99.436 + raw_encode(in_picture, frame_width, frame_height, iyuv_pixels); 99.437 + 99.438 + signal_texture(h, 1); 99.439 + } 99.440 +} 99.441 +#endif 99.442 + 99.443 +// TODO: Parallelize the raw_encode (either split frame or over frames) 99.444 +static void do_video_out(OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height) { 99.445 + int size=0; 99.446 + //remove extra borders 99.447 + 99.448 + if(in_picture) 99.449 + size= raw_encode(in_picture, frame_width, frame_height, w->bit_buffer); 99.450 + 99.451 + if (size < 0) { 99.452 + fprintf(stderr, "Video encoding failed\n"); 99.453 + }else { 99.454 + if (write(fd, w->bit_buffer, size)<0) 99.455 + fprintf(stderr, "Write frame failed\n"); 99.456 + } 99.457 + 99.458 + w->video_size += size; 99.459 +} 99.460 + 99.461 +DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height) { 99.462 + DecodedPicture *out; 99.463 + 99.464 + if (pic){ 99.465 + oc->delayed_pic[oc->dp_cnt++]=pic; 99.466 + out = get_reordered_picture(oc, 0); 99.467 + }else{ 99.468 + out = get_reordered_picture(oc, 1); 99.469 + } 99.470 + 99.471 + if (out){ 99.472 + if (fd){ 99.473 + do_video_out(oc, fd, out, frame_width, frame_height); 99.474 + }else{ 99.475 +#ifdef HAVE_LIBSDL2 99.476 + if (h->display){ 99.477 + display_frame(h, oc, fd, out, frame_width, frame_height, !(pic==NULL)); 99.478 + } 99.479 +#endif 99.480 + } 99.481 + oc->frame_number++; 99.482 + } 99.483 + 99.484 + return out; 99.485 +} 99.486 + 99.487 +OutputContext *get_output_context(H264Context *h){ 99.488 + const int frame_width=h->frame_width; 99.489 + const int frame_height=h->frame_height; 99.490 + const int frame_size = frame_width*frame_height; 99.491 + 99.492 + 
OutputContext *oc = av_mallocz(sizeof(OutputContext)); 99.493 + oc->bit_buffer_size= FFMAX(1024*256, frame_size*2); // oversize a little bit to allow extra border write 99.494 + oc->bit_buffer= av_mallocz(oc->bit_buffer_size); 99.495 + 99.496 + return oc; 99.497 +} 99.498 + 99.499 +void free_output_context(OutputContext *oc){ 99.500 + 99.501 + av_free(oc->bit_buffer); 99.502 + av_free(oc); 99.503 +} 99.504 + 99.505 +SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height){ 99.506 + SuperMBContext *smbc = av_mallocz(sizeof(SuperMBContext)); 99.507 + 99.508 + smbc->smb_width = smb_width; 99.509 + smbc->smb_height = smb_height; 99.510 + 99.511 + smbc->nsmb_height = h->mb_height / smbc->smb_height + (h->mb_height%smbc->smb_height ? 1:0); //only need one extra if mb_height was not dividable 99.512 + smbc->nsmb_width = h->mb_width / smbc->smb_width; 99.513 + while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width ) 99.514 + smbc->nsmb_width++; 99.515 + 99.516 + smbc->nsmb_3dheight= smbc->nsmb_height - ((h->mb_height/2)/smbc->smb_height +1); //assuming max motion vector of half the height 99.517 + 99.518 + smbc->smbs[0] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask)); 99.519 + smbc->smbs[1] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask)); 99.520 + for (int y=0, i=0; i<smbc->nsmb_height; i++, y+=smbc->smb_height){ 99.521 + for (int x=0, j=0; j<smbc->nsmb_width; j++, x+=smbc->smb_width){ 99.522 + smbc->smbs[0][i*smbc->nsmb_width +j].smb_y = y; 99.523 + smbc->smbs[0][i*smbc->nsmb_width +j].smb_x = x; 99.524 + smbc->smbs[1][i*smbc->nsmb_width +j].smb_y = y; 99.525 + smbc->smbs[1][i*smbc->nsmb_width +j].smb_x = x; 99.526 + } 99.527 + } 99.528 + 99.529 + smbc->refcount = 1; 99.530 + 99.531 + return smbc; 99.532 +} 99.533 + 99.534 +void freeSuperMBContext(SuperMBContext *smbc){ 99.535 + av_free(smbc->smbs[0]); 99.536 + av_free(smbc->smbs[1]); 99.537 + av_free(smbc); 99.538 +} 99.539 
+ 99.540 +SuperMBContext * acquire_smbc(H264Context *h ){ 99.541 + SuperMBContext *smbc; 99.542 + 99.543 + pthread_mutex_lock (&h->smb_lock); 99.544 + smbc = h->smbc; 99.545 + smbc->refcount++; 99.546 + pthread_mutex_unlock(&h->smb_lock); 99.547 + return smbc; 99.548 +} 99.549 + 99.550 +void release_smbc(H264Context *h, SuperMBContext *smbc){ 99.551 + pthread_mutex_lock (&h->smb_lock); 99.552 + smbc->refcount--; 99.553 + if (smbc->refcount==0){ 99.554 + freeSuperMBContext(smbc); 99.555 + } 99.556 + pthread_mutex_unlock(&h->smb_lock); 99.557 + 99.558 +} 99.559 + 99.560 + 99.561 +#ifdef HAVE_LIBSDL2 99.562 + 99.563 +// #if OMPSS 99.564 +static void draw_sb_border(H264Context *h, uint32_t *rgba_pixels, int smb_x, int smb_y){ 99.565 + int mb_width = h->mb_width; 99.566 + int mb_height = h->mb_height; 99.567 + int width = h->frame_width; 99.568 + int height = h->frame_height; 99.569 + 99.570 + int mb_x = smb_x * h->smb_width; 99.571 + int mb_y = smb_y * h->smb_height; 99.572 + 99.573 + uint32_t pix= 0x0000FFC0; 99.574 + 99.575 + for (int k=0, i=mb_y; i< mb_y + h->smb_height; i++, k++){ 99.576 + for (int l=0, j=mb_x -k ; j< mb_x - k + h->smb_width; j++, l++){ 99.577 + //outside frame 99.578 + if (i<0 || i>=mb_height || j<0 || j>=mb_width) { 99.579 + continue; 99.580 + } 99.581 + 99.582 + //draw top 99.583 + if (i==0 || k==0 || l==0){ 99.584 + int mx = j*16; 99.585 + int my = i*16; 99.586 + uint32_t *top = rgba_pixels + my*width + mx; 99.587 + int endx = mx+16 < width? 16: width-mx; 99.588 + 99.589 + for (int x = 0; x<endx; x++){ 99.590 + top[x] = pix; 99.591 + } 99.592 + } 99.593 + 99.594 + //draw bottom 99.595 + if (i==mb_height-1 || k==h->smb_height-1 || l==h->smb_width-1){ 99.596 + int mx = j*16; 99.597 + int my = i*16 + 15; my = my < height ? my: height-1; 99.598 + uint32_t *bottom = rgba_pixels + my*width + mx; 99.599 + int endx = mx+16 < width? 
16: width-mx; 99.600 + 99.601 + for (int x = 0; x<endx; x++){ 99.602 + bottom[x] = pix; 99.603 + } 99.604 + } 99.605 + 99.606 + //draw left 99.607 + if (j==0 || l==0 ){ 99.608 + int mx = j*16; 99.609 + int my = i*16; 99.610 + uint32_t *left = rgba_pixels + my*width + mx; 99.611 + int endy = my +16 < height ? 16: height - my; 99.612 + 99.613 + for (int y = 0; y<endy; y++){ 99.614 + left[y*width] = pix; 99.615 + } 99.616 + } 99.617 + 99.618 + //draw right 99.619 + if (j==mb_width -1 || l==h->smb_width-1 ){ 99.620 + int mx = j*16 + 15; mx = mx < width ? mx: width-1; 99.621 + int my = i*16; 99.622 + uint32_t *right = rgba_pixels + my*width + mx; 99.623 + int endy = my +16 < height ? 16: height - my; 99.624 + 99.625 + for (int y = 0; y<endy; y++){ 99.626 + right[y*width] = pix; 99.627 + } 99.628 + } 99.629 + } 99.630 + } 99.631 +} 99.632 + 99.633 +static void draw_sbmap (H264Context *h, SuperMBContext *smbc, SDLContext *sdlc){ 99.634 + int pitch; 99.635 + uint32_t *rgba_pixels; 99.636 + SDL_Texture *sbmap= sdlc->sbmap_texture; 99.637 + 99.638 + SDL_LockTexture( sbmap, NULL, (void **)&rgba_pixels, &pitch ); 99.639 + 99.640 + memset (rgba_pixels, 0, pitch * h->height); 99.641 + for (int i=0; i< smbc->nsmb_height; i++){ 99.642 + for (int j=0; j< smbc->nsmb_width; j++){ 99.643 + draw_sb_border(h, rgba_pixels, j, i); 99.644 + } 99.645 + } 99.646 + 99.647 + SDL_UnlockTexture( sbmap ); 99.648 +} 99.649 +// #endif 99.650 + 99.651 +// static void calc_sb_sizes (H264Context *h, SuperMBContext *smbc){ 99.652 +// smbc->smb_height = h->smb_height; 99.653 +// smbc->smb_width = h->smb_width; 99.654 +// 99.655 +// smbc->nsmb_height = h->mb_height / smbc->smb_height + (h->mb_height%smbc->smb_height ? 
1:0); //only need one extra if mb_height was not dividable 99.656 +// smbc->nsmb_width = h->mb_width / smbc->smb_width; 99.657 +// while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width ) 99.658 +// smbc->nsmb_width++; 99.659 +// } 99.660 + 99.661 + 99.662 +static void handle_key_event(H264Context *h, SDLContext *sdlc, SDL_Keysym keysym){ 99.663 + int arrow=0; 99.664 + 99.665 + switch (keysym.sym){ 99.666 + case SDLK_ESCAPE: 99.667 + if (sdlc->fullscreen){ 99.668 + SDL_SetWindowFullscreen(sdlc->window, SDL_FALSE); 99.669 + sdlc->fullscreen = 0; 99.670 + } 99.671 + break; 99.672 + case SDLK_SPACE: 99.673 + pthread_mutex_lock(&h->sdl_lock); 99.674 + sdlc->pause = !sdlc->pause; 99.675 + pthread_cond_signal(&h->sdl_cond); 99.676 + pthread_mutex_unlock(&h->sdl_lock); 99.677 + break; 99.678 + case SDLK_f: 99.679 + if (!sdlc->fullscreen){ 99.680 + if (keysym.mod == KMOD_LCTRL){ 99.681 +// SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full); 99.682 + SDL_SetWindowFullscreen(sdlc->window, SDL_TRUE); 99.683 + 99.684 + sdlc->fullscreen = 1; 99.685 + } 99.686 + } 99.687 + break; 99.688 + case SDLK_m: 99.689 + sdlc->showmap = !sdlc->showmap; 99.690 + break; 99.691 + case SDLK_UP: 99.692 + if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height < h->mb_height && h->smb_height < h->smb_width){ 99.693 + h->smb_height++; 99.694 + arrow =1; 99.695 + } 99.696 + break; 99.697 + case SDLK_DOWN: 99.698 + if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height > 1 ){ 99.699 + h->smb_height--; 99.700 + arrow =1; 99.701 + } 99.702 + break; 99.703 + case SDLK_LEFT: 99.704 + if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width > 1 && h->smb_width > h->smb_height){ 99.705 + h->smb_width--; 99.706 + arrow =1; 99.707 + } 99.708 + break; 99.709 + case SDLK_RIGHT: 99.710 + if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width < h->mb_width){ 99.711 + h->smb_width++; 99.712 + arrow =1; 99.713 + } 99.714 + break; 99.715 + } 99.716 + 99.717 + if 
(arrow){ 99.718 + SuperMBContext *smbc = getSuperMBContext(h, h->smb_width, h->smb_height); 99.719 + pthread_mutex_lock(&h->smb_lock); 99.720 + h->smbc->refcount--; 99.721 + if (h->smbc->refcount == 0) 99.722 + freeSuperMBContext(h->smbc); 99.723 + h->smbc = smbc; 99.724 + sdlc->updatemap =1; 99.725 + pthread_mutex_unlock(&h->smb_lock); 99.726 + } 99.727 +} 99.728 + 99.729 +void handle_window_event(H264Context *h, SDLContext *sdlc, SDL_WindowEvent winevent){ 99.730 + SDL_Rect nrect; 99.731 + switch (winevent.event){ 99.732 + case SDL_WINDOWEVENT_RESIZED: 99.733 + 99.734 + sdlc->win_w = winevent.data1; 99.735 + sdlc->win_h = winevent.data2; 99.736 + 99.737 + double aspect = (double) sdlc->win_w/ sdlc->win_h; 99.738 + if ( aspect < sdlc->aspect){ 99.739 + double r = (double) sdlc->win_w / sdlc->rect.w; 99.740 + double h = (double) sdlc->rect.h * r; 99.741 + 99.742 + nrect.y = lrint(( (double) sdlc->win_h - h)/2); 99.743 + nrect.h = lrint(h); 99.744 + 99.745 + nrect.x=0; 99.746 + nrect.w= sdlc->win_w; 99.747 + 99.748 + }else { 99.749 + double r = (double) sdlc->win_h / sdlc->rect.h; 99.750 + double w = (double) sdlc->rect.w * r; 99.751 + 99.752 + nrect.x = lrint(( (double) sdlc->win_w - w)/2); 99.753 + nrect.w = lrint(w); 99.754 + 99.755 + nrect.y=0; 99.756 + nrect.h= sdlc->win_h; 99.757 + } 99.758 + //prob better to lock 99.759 + sdlc->win_rect = nrect; 99.760 + sdlc->resized=1; 99.761 + break; 99.762 + } 99.763 +} 99.764 + 99.765 +void *sdl_event_listen_thread(void *arg){ 99.766 + H264Context *h = (H264Context *) arg; 99.767 + SDLContext *sdlc = h->sdlc; 99.768 + SDL_Event event; 99.769 + 99.770 + while ( SDL_WaitEvent(&event) ) { 99.771 + switch (event.type) { 99.772 + case SDL_KEYDOWN: 99.773 + handle_key_event(h, sdlc, event.key.keysym); 99.774 + break; 99.775 + case SDL_WINDOWEVENT: 99.776 + handle_window_event(h, sdlc, event.window); 99.777 + break; 99.778 + case SDL_QUIT: 99.779 + h->quit=1; 99.780 + goto finish; 99.781 + } 99.782 + } 99.783 +finish: 99.784 + 
pthread_exit(NULL); 99.785 + return NULL; 99.786 +} 99.787 + 99.788 +//XInitThreads not called in SDL2 library, causes crash 99.789 +//remove in future when fixed ... 99.790 +#include <X11/Xlib.h> 99.791 + 99.792 +SDLContext *get_SDL_context(H264Context *h){ 99.793 + const int frame_width=h->frame_width; 99.794 + const int frame_height=h->frame_height; 99.795 + 99.796 + SDLContext *sdlc = av_mallocz(sizeof(SDLContext)); 99.797 + sdlc->display = h->display; 99.798 + sdlc->fullscreen = h->fullscreen; 99.799 + 99.800 + sdlc->aspect = (double) frame_width / (double) frame_height; 99.801 + sdlc->rect.x =0; 99.802 + sdlc->rect.y =0; 99.803 + sdlc->rect.w =frame_width; 99.804 + sdlc->rect.h =frame_height; 99.805 + 99.806 + XInitThreads(); //workaround 99.807 + 99.808 + // Initializes the video subsystem 99.809 + if (SDL_Init(SDL_INIT_VIDEO) < 0) { 99.810 + fprintf(stderr, "Unable to init SDL: %s\n", SDL_GetError()); 99.811 + #undef exit 99.812 + exit(-1); 99.813 + } 99.814 + SDL_SetHint("SDL_HINT_RENDER_SCALE_QUALITY", "best"); 99.815 + SDL_SetHint("SDL_HINT_RENDER_OPENGL_SHADERS", "1"); 99.816 + 99.817 + SDL_GetDesktopDisplayMode(0, &sdlc->full); 99.818 + sdlc->full.format = SDL_PIXELFORMAT_IYUV; 99.819 + 99.820 + sdlc->wind = sdlc->full; 99.821 + if (sdlc->wind.w > frame_width) sdlc->wind.w = frame_width; 99.822 + if (sdlc->wind.h > frame_height) sdlc->wind.h = frame_height; 99.823 + 99.824 + sdlc->win_rect.x =0; 99.825 + sdlc->win_rect.y =0; 99.826 + sdlc->win_rect.w =sdlc->wind.w; 99.827 + sdlc->win_rect.h =sdlc->wind.h; 99.828 + 99.829 + if (sdlc->fullscreen){ 99.830 + sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, sdlc->full.w, sdlc->full.h, SDL_WINDOW_FULLSCREEN|SDL_WINDOW_SHOWN|SDL_WINDOW_RESIZABLE); 99.831 + SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full); 99.832 + } else { 99.833 + sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, sdlc->wind.w, sdlc->wind.h, 
SDL_WINDOW_RESIZABLE|SDL_WINDOW_SHOWN); 99.834 + SDL_SetWindowDisplayMode (sdlc->window, &sdlc->wind); 99.835 + } 99.836 + 99.837 + sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_ACCELERATED); 99.838 +// sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_SOFTWARE); 99.839 + 99.840 + h->sdlq.queue[0] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); 99.841 + h->sdlq.queue[1] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); 99.842 + 99.843 + sdlc->sbmap_texture = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height); 99.844 + SDL_SetTextureBlendMode(sdlc->sbmap_texture, SDL_BLENDMODE_BLEND); 99.845 + sdlc->updatemap = 1; 99.846 + 99.847 +#if HAVE_LIBSDL_TTF 99.848 + //not working with SDL 2.0, try again in future when supported 99.849 + if(TTF_Init()==-1) { 99.850 + printf("TTF_Init: %s\n", TTF_GetError()); 99.851 + exit(2); 99.852 + } 99.853 + 99.854 + // Load a font 99.855 + TTF_Font *font; 99.856 + font = TTF_OpenFont("/usr/share/fonts/truetype/freefont/FreeSans.ttf", 24); 99.857 + if (font == NULL) 99.858 + { 99.859 + printf("TTF_OpenFont() Failed: %s\n", TTF_GetError()); 99.860 + TTF_Quit(); 99.861 + exit(1); 99.862 + } 99.863 +#endif 99.864 + 99.865 + pthread_create(&sdlc->listen_thread, NULL, sdl_event_listen_thread, h); 99.866 + 99.867 + return sdlc; 99.868 + 99.869 +} 99.870 + 99.871 +void free_SDL_context(H264Context *h){ 99.872 + SDLContext *sdlc = h->sdlc; 99.873 + pthread_join(sdlc->listen_thread, NULL); 99.874 + 99.875 +#if HAVE_LIBSDL_TTF 99.876 + TTF_Quit(); 99.877 +#endif 99.878 + SDL_DestroyTexture(h->sdlq.queue[0]); 99.879 + SDL_DestroyTexture(h->sdlq.queue[1]); 99.880 + SDL_DestroyTexture(sdlc->sbmap_texture); 99.881 + SDL_DestroyRenderer(sdlc->renderer); 99.882 + SDL_DestroyWindow(sdlc->window); 99.883 + SDL_Quit(); 
99.884 + 99.885 +} 99.886 + 99.887 +void *sdl_thread(void *arg){ 99.888 + H264Context *h = (H264Context *) arg; 99.889 + 99.890 + SDLContext *sdlc = get_SDL_context(h); 99.891 + h->sdlc = sdlc; 99.892 + 99.893 + signal_texture(h, 0); 99.894 + signal_texture(h, 0); 99.895 + 99.896 + SDL_Texture *texture; 99.897 + for (;;){ 99.898 + pthread_mutex_lock(&h->sdl_lock); 99.899 + while (sdlc->pause){ 99.900 + pthread_cond_wait(&h->sdl_cond, &h->sdl_lock); 99.901 + } 99.902 + pthread_mutex_unlock(&h->sdl_lock); 99.903 + 99.904 + texture = get_next_texture(h, 0); 99.905 + if (texture == NULL) 99.906 + break; 99.907 + 99.908 + SDL_UnlockTexture(texture); 99.909 + 99.910 + //clear if resized 99.911 + if (sdlc->resized){ 99.912 + // KDE bug prob, reset viewport change after resize from max 99.913 + SDL_RenderSetViewport(sdlc->renderer, NULL); 99.914 + SDL_SetRenderDrawColor(sdlc->renderer, 0, 0, 0, 255); 99.915 + SDL_RenderClear(sdlc->renderer); 99.916 + sdlc->resized = 0; 99.917 + } 99.918 + 99.919 + SDL_RenderCopy(sdlc->renderer, texture, &sdlc->rect, &sdlc->win_rect); 99.920 + 99.921 + if (sdlc->showmap){ 99.922 + if (sdlc->updatemap){ 99.923 + SuperMBContext *smbc; 99.924 + pthread_mutex_lock (&h->smb_lock); 99.925 + smbc = h->smbc; 99.926 + smbc->refcount++; 99.927 + sdlc->updatemap=0; 99.928 + pthread_mutex_unlock(&h->smb_lock); 99.929 + 99.930 + draw_sbmap(h, smbc, sdlc); 99.931 + 99.932 + release_smbc(h, smbc); 99.933 + } 99.934 + SDL_RenderCopy(sdlc->renderer, sdlc->sbmap_texture, &sdlc->rect, &sdlc->win_rect); 99.935 + } 99.936 + 99.937 + SDL_RenderPresent(sdlc->renderer); 99.938 + signal_texture(h, 0); 99.939 + } 99.940 + 99.941 + free_SDL_context(h); 99.942 + 99.943 + pthread_exit(NULL); 99.944 + return NULL; 99.945 +} 99.946 +#endif 99.947 +
100.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 100.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_misc.h Mon Aug 27 12:09:56 2012 +0200 100.3 @@ -0,0 +1,52 @@ 100.4 +#ifndef H264_MISC_H 100.5 +#define H264_MISC_H 100.6 + 100.7 +#include "avcodec.h" 100.8 +#include "h264_types.h" 100.9 + 100.10 +void start_timer(H264Context *h, int stage); 100.11 +void stop_timer(H264Context *h, int stage); 100.12 + 100.13 +void init_sb_entry(H264Context *h, SliceBufferEntry *sbe); 100.14 +void free_sb_entry(SliceBufferEntry *sb); 100.15 +SliceBufferEntry *get_sb_entry(H264Context *h); 100.16 +void release_sb_entry(H264Context *h, SliceBufferEntry *sb); 100.17 + 100.18 +DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s); 100.19 +void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode); 100.20 + 100.21 +void draw_edges(MBRecContext *d, H264Slice *s, int line); 100.22 + 100.23 +int ff_init_slice(NalContext *n, H264Slice *s); 100.24 +void free_picture(PictureInfo *pic); 100.25 +void free_dp(DecodedPicture *pic); 100.26 + 100.27 +void av_start_timer(); 100.28 +int copyEDtoH264Slice(H264Slice *ms, H264Slice *es); 100.29 +void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose); 100.30 + 100.31 +int ff_alloc_picture_info(NalContext *n, H264Slice *s, PictureInfo *pic); 100.32 +DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height); 100.33 +OutputContext *get_output_context(H264Context *h); 100.34 +void free_output_context(OutputContext *oc); 100.35 + 100.36 +void freeSuperMBContext(SuperMBContext *smbc); 100.37 +SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height); 100.38 +void release_smbc(H264Context *h, SuperMBContext *smbc); 100.39 +SuperMBContext * acquire_smbc(H264Context *h ); 100.40 + 100.41 +#if HAVE_LIBSDL2 100.42 +void signal_sdl_exit(H264Context *h); 100.43 +void *sdl_thread(void *arg); 100.44 +SDLContext 
*get_SDL_context(H264Context *h); 100.45 +void free_SDL_context(SDLContext *sdlc); 100.46 +#endif 100.47 + 100.48 +/** 100.49 +* gets the chroma qp. 100.50 +*/ 100.51 +static inline int get_chroma_qp(H264Slice *s, int t, int qscale){ 100.52 + return s->pps.chroma_qp_table[t][qscale]; 100.53 +} 100.54 + 100.55 +#endif
101.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 101.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_nal.c Mon Aug 27 12:09:56 2012 +0200 101.3 @@ -0,0 +1,628 @@ 101.4 +#include "h264_types.h" 101.5 +#include "h264_data.h" 101.6 + 101.7 +#include "golomb.h" 101.8 +#include "h264_sei.h" 101.9 +#include "h264_refs.h" 101.10 +#include "h264_ps.h" 101.11 +#include "h264_pred_mode.h" 101.12 +#include "h264_misc.h" 101.13 + 101.14 +static int ff_h264_decode_rbsp_trailing(const uint8_t *src){ 101.15 + int v= *src; 101.16 + int r; 101.17 + 101.18 + for(r=1; r<9; r++){ 101.19 + if(v&1) return r; 101.20 + v>>=1; 101.21 + } 101.22 + return 0; 101.23 +} 101.24 + 101.25 +static int pred_weight_table(H264Slice *s, GetBitContext *gb){ 101.26 + int luma_def, chroma_def; 101.27 + 101.28 + s->use_weight= 0; 101.29 + s->use_weight_chroma= 0; 101.30 + s->luma_log2_weight_denom= get_ue_golomb(gb); 101.31 + s->chroma_log2_weight_denom= get_ue_golomb(gb); 101.32 + luma_def = 1<<s->luma_log2_weight_denom; 101.33 + chroma_def = 1<<s->chroma_log2_weight_denom; 101.34 + 101.35 + for(int list=0; list<2; list++){ 101.36 + for(int i=0; i<s->ref_count[list]; i++){ 101.37 + int luma_weight_flag, chroma_weight_flag; 101.38 + 101.39 + luma_weight_flag= get_bits1(gb); 101.40 + if(luma_weight_flag){ 101.41 + s->luma_weight[i][list][0]= get_se_golomb(gb); 101.42 + s->luma_weight[i][list][1]= get_se_golomb(gb); 101.43 + if( s->luma_weight[i][list][0] != luma_def 101.44 + || s->luma_weight[i][list][1] != 0) { 101.45 + s->use_weight= 1; 101.46 + } 101.47 + }else{ 101.48 + s->luma_weight[i][list][0]= luma_def; 101.49 + s->luma_weight[i][list][1]= 0; 101.50 + } 101.51 + 101.52 + chroma_weight_flag= get_bits1(gb); 101.53 + if(chroma_weight_flag){ 101.54 + int j; 101.55 + for(j=0; j<2; j++){ 101.56 + s->chroma_weight[i][list][j][0]= get_se_golomb(gb); 101.57 + s->chroma_weight[i][list][j][1]= get_se_golomb(gb); 101.58 + if( s->chroma_weight[i][list][j][0] != chroma_def 101.59 + || 
s->chroma_weight[i][list][j][1] != 0) { 101.60 + s->use_weight_chroma= 1; 101.61 + } 101.62 + } 101.63 + }else{ 101.64 + int j; 101.65 + for(j=0; j<2; j++){ 101.66 + s->chroma_weight[i][list][j][0]= chroma_def; 101.67 + s->chroma_weight[i][list][j][1]= 0; 101.68 + } 101.69 + } 101.70 + } 101.71 + if(s->slice_type_nos != FF_B_TYPE) break; 101.72 + } 101.73 + s->use_weight= s->use_weight || s->use_weight_chroma; 101.74 + return 0; 101.75 +} 101.76 + 101.77 +/** 101.78 +* Initialize implicit_weight table. 101.79 +*/ 101.80 +static void implicit_weight_table(H264Slice *s){ 101.81 + int ref0, ref1, cur_poc, ref_start, ref_count0, ref_count1; 101.82 + 101.83 + cur_poc = s->poc; 101.84 + if( s->ref_count[0] == 1 && s->ref_count[1] == 1 && s->ref_list[0][0]->poc + s->ref_list[1][0]->poc == 2*cur_poc){ 101.85 + s->use_weight= 0; 101.86 + s->use_weight_chroma= 0; 101.87 + return; 101.88 + } 101.89 + ref_start= 0; 101.90 + ref_count0= s->ref_count[0]; 101.91 + ref_count1= s->ref_count[1]; 101.92 + 101.93 + s->use_weight= 2; 101.94 + s->use_weight_chroma= 2; 101.95 + s->luma_log2_weight_denom= 5; 101.96 + s->chroma_log2_weight_denom= 5; 101.97 + 101.98 + for(ref0=ref_start; ref0 < ref_count0; ref0++){ 101.99 + int poc0 = s->ref_list[0][ref0]->poc; 101.100 + for(ref1=ref_start; ref1 < ref_count1; ref1++){ 101.101 + int poc1 = s->ref_list[1][ref1]->poc; 101.102 + int td = av_clip(poc1 - poc0, -128, 127); 101.103 + int w= 32; 101.104 + if(td){ 101.105 + int tb = av_clip(cur_poc - poc0, -128, 127); 101.106 + int tx = (16384 + (FFABS(td) >> 1)) / td; 101.107 + int dist_scale_factor = (tb*tx + 32) >> 8; 101.108 + if(dist_scale_factor >= -64 && dist_scale_factor <= 128) 101.109 + w = 64 - dist_scale_factor; 101.110 + } 101.111 + s->implicit_weight[ref0][ref1][0]= 101.112 + s->implicit_weight[ref0][ref1][1]= w; 101.113 + } 101.114 + } 101.115 +} 101.116 + 101.117 +/** 101.118 +* instantaneous decoder refresh. 
101.119 +*/ 101.120 +static void idr(NalContext *n, H264Slice *s){ 101.121 + ff_h264_remove_all_refs(n, s); 101.122 + n->prev_frame_num= 0; 101.123 + n->prev_frame_num_offset= 0; 101.124 + n->poc_offset += (n->prev_poc_msb<<16) + n->prev_poc_lsb; 101.125 + n->prev_poc_msb= 101.126 + n->prev_poc_lsb= 0; 101.127 +} 101.128 + 101.129 +static int init_poc(NalContext *n, H264Slice *s, GetBitContext *gb){ 101.130 + const int max_frame_num= 1<<n->sps.log2_max_frame_num; 101.131 + int frame_poc; 101.132 + 101.133 + if(n->sps.poc_type==0){ 101.134 + n->poc_lsb= get_bits(gb, n->sps.log2_max_poc_lsb); 101.135 + } 101.136 + 101.137 + if(n->sps.poc_type==1 && !n->sps.delta_pic_order_always_zero_flag){ 101.138 + n->delta_poc= get_se_golomb(gb); 101.139 + } 101.140 + 101.141 + n->frame_num_offset= n->prev_frame_num_offset; 101.142 + if(n->frame_num < n->prev_frame_num) 101.143 + n->frame_num_offset += max_frame_num; 101.144 + 101.145 + if(n->sps.poc_type==0){ 101.146 + const int max_poc_lsb= 1<<n->sps.log2_max_poc_lsb; 101.147 + 101.148 + if(n->poc_lsb < n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb >= max_poc_lsb/2) 101.149 + n->poc_msb = n->prev_poc_msb + max_poc_lsb; 101.150 + else if(n->poc_lsb > n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb < -max_poc_lsb/2) 101.151 + n->poc_msb = n->prev_poc_msb - max_poc_lsb; 101.152 + else 101.153 + n->poc_msb = n->prev_poc_msb; 101.154 + 101.155 + frame_poc = n->poc_msb + n->poc_lsb; 101.156 + }else if(n->sps.poc_type==1){ 101.157 + int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc; 101.158 + int i; 101.159 + 101.160 + if(n->sps.poc_cycle_length != 0) 101.161 + abs_frame_num = n->frame_num_offset + n->frame_num; 101.162 + else 101.163 + abs_frame_num = 0; 101.164 + 101.165 + if(s->nal_ref_idc==0 && abs_frame_num > 0) 101.166 + abs_frame_num--; 101.167 + 101.168 + expected_delta_per_poc_cycle = 0; 101.169 + for(i=0; i < n->sps.poc_cycle_length; i++) 101.170 + expected_delta_per_poc_cycle += n->sps.offset_for_ref_frame[ i ]; 
//FIXME integrate during sps parse 101.171 + 101.172 + if(abs_frame_num > 0){ 101.173 + int poc_cycle_cnt = (abs_frame_num - 1) / n->sps.poc_cycle_length; 101.174 + int frame_num_in_poc_cycle = (abs_frame_num - 1) % n->sps.poc_cycle_length; 101.175 + 101.176 + expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle; 101.177 + for(i = 0; i <= frame_num_in_poc_cycle; i++) 101.178 + expectedpoc = expectedpoc + n->sps.offset_for_ref_frame[ i ]; 101.179 + } else 101.180 + expectedpoc = 0; 101.181 + if(s->nal_ref_idc == 0) 101.182 + expectedpoc = expectedpoc + n->sps.offset_for_non_ref_pic; 101.183 + frame_poc = expectedpoc + n->delta_poc; 101.184 + }else{ 101.185 + int poc= 2*(n->frame_num_offset + n->frame_num); 101.186 + if(!s->nal_ref_idc) 101.187 + poc--; 101.188 + frame_poc= poc; 101.189 + } 101.190 + s->current_picture_info->poc= s->poc = frame_poc + n->poc_offset; 101.191 + s->coded_pic_num = n->coded_pic_num++; 101.192 + 101.193 + return 0; 101.194 +} 101.195 + 101.196 +static void ref2frame(NalContext *n, H264Slice *s){ 101.197 + for(int j=0; j<s->list_count; j++){ 101.198 + int *ref2frm= s->ref2frm[j]; 101.199 + 101.200 + ref2frm[0]= 101.201 + ref2frm[1]= -1; 101.202 + 101.203 + for(int i=0; i<s->ref_count[j]; i++){ 101.204 + ref2frm[i+2]= 15; 101.205 + if(s->ref_list[j][i]->cpn >=0){ 101.206 + int k; 101.207 + for(k=0; k<n->short_ref_count; k++){ 101.208 + if(n->short_ref[k]->cpn == s->ref_list[j][i]->cpn){ 101.209 + ref2frm[i+2]= k; 101.210 + break; 101.211 + } 101.212 + } 101.213 + } 101.214 + } 101.215 + } 101.216 +} 101.217 + 101.218 +/** 101.219 +* decodes a slice header. 101.220 +* This will also call MPV_common_init() and frame_start() as needed. 
101.221 +* 101.222 +* @param h h264context 101.223 +* @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding) 101.224 +* 101.225 +* @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded 101.226 +*/ 101.227 +static int decode_slice_header(NalContext *n, H264Slice *s, GetBitContext *gb){ 101.228 + unsigned int first_mb_in_slice; 101.229 + unsigned int pps_id; 101.230 + int num_ref_idx_active_override_flag; 101.231 + unsigned int slice_type, tmp; 101.232 + 101.233 + first_mb_in_slice= get_ue_golomb(gb); 101.234 + (void) first_mb_in_slice; 101.235 + 101.236 + slice_type= get_ue_golomb_31(gb); 101.237 + if(slice_type > 9){ 101.238 + av_log(AV_LOG_ERROR, "slice type too large (%d)\n", s->slice_type); 101.239 + return -1; 101.240 + } 101.241 + if(slice_type > 4) 101.242 + slice_type -= 5; 101.243 + 101.244 + slice_type= golomb_to_pict_type[ slice_type ]; 101.245 + 101.246 + s->slice_type= slice_type; 101.247 + s->slice_type_nos= slice_type & 3; 101.248 + s->current_picture_info->slice_type_nos = s->slice_type_nos; 101.249 + s->current_picture_info->reference= s->nal_ref_idc? 
2:0; 101.250 + s->key_frame = s->slice_type == FF_I_TYPE; 101.251 + 101.252 + pps_id= get_ue_golomb(gb); 101.253 + 101.254 + if(pps_id>=MAX_PPS_COUNT){ 101.255 + av_log(AV_LOG_ERROR, "pps_id out of range\n"); 101.256 + return -1; 101.257 + } 101.258 + if(!n->pps_buffers[pps_id]) { 101.259 + av_log(AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id); 101.260 + return -1; 101.261 + } 101.262 + s->pps= *n->pps_buffers[pps_id]; 101.263 + 101.264 + if(!n->sps_buffers[s->pps.sps_id]) { 101.265 + av_log(AV_LOG_ERROR, "non-existing SPS %u referenced\n", s->pps.sps_id); 101.266 + return -1; 101.267 + } 101.268 + n->sps = *n->sps_buffers[s->pps.sps_id]; 101.269 + 101.270 + n->mb_width= n->sps.mb_width; 101.271 + n->mb_height= n->sps.mb_height; 101.272 + 101.273 + int chroma444 = (n->sps.chroma_format_idc == 3); 101.274 + n->width = 16*n->mb_width - (2>>chroma444)*FFMIN(n->sps.crop_right, (8<<chroma444)-1); 101.275 + if(n->sps.frame_mbs_only_flag) 101.276 + n->height= 16*n->mb_height - (2>>chroma444)*FFMIN(n->sps.crop_bottom, (8<<chroma444)-1); 101.277 + else 101.278 + n->height= 16*n->mb_height - (4>>chroma444)*FFMIN(n->sps.crop_bottom, (8<<chroma444)-1); 101.279 + 101.280 + s->direct_8x8_inference_flag = n->sps.direct_8x8_inference_flag; 101.281 + s->transform_bypass = n->sps.transform_bypass; 101.282 + 101.283 + n->frame_num= get_bits(gb, n->sps.log2_max_frame_num); 101.284 + if(n->frame_num != n->prev_frame_num && n->frame_num != (n->prev_frame_num+1)%(1<<n->sps.log2_max_frame_num)){ 101.285 + av_log(AV_LOG_ERROR, "unexpected frame_num \n"); 101.286 + } 101.287 + 101.288 + s->current_picture_info->frame_num= n->frame_num; //FIXME frame_num cleanup 101.289 + n->max_pic_num= 1<< n->sps.log2_max_frame_num; 101.290 + 101.291 + if(s->nal_unit_type == NAL_IDR_SLICE){ 101.292 + get_ue_golomb(gb); /* idr_pic_id */ 101.293 + } 101.294 + 101.295 + init_poc(n, s, gb); 101.296 + 101.297 + if(s->pps.redundant_pic_cnt_present){ 101.298 + n->redundant_pic_count= get_ue_golomb(gb); 
101.299 + } 101.300 + 101.301 + //set defaults, might be overridden a few lines later 101.302 + s->ref_count[0]= s->pps.ref_count[0]; 101.303 + s->ref_count[1]= s->pps.ref_count[1]; 101.304 + 101.305 + if(s->slice_type_nos != FF_I_TYPE){ 101.306 + if(s->slice_type_nos == FF_B_TYPE){ 101.307 + s->direct_spatial_mv_pred= get_bits1(gb); 101.308 + } 101.309 + num_ref_idx_active_override_flag= get_bits1(gb); 101.310 + 101.311 + if(num_ref_idx_active_override_flag){ 101.312 + s->ref_count[0]= get_ue_golomb(gb) + 1; 101.313 + if(s->slice_type_nos==FF_B_TYPE) 101.314 + s->ref_count[1]= get_ue_golomb(gb) + 1; 101.315 + 101.316 + if(s->ref_count[0]-1 > 32-1 || s->ref_count[1]-1 > 32-1){ 101.317 + av_log(AV_LOG_ERROR, "reference overflow\n"); 101.318 + s->ref_count[0]= s->ref_count[1]= 1; 101.319 + return -1; 101.320 + } 101.321 + } 101.322 + if(s->slice_type_nos == FF_B_TYPE) 101.323 + s->list_count= 2; 101.324 + else 101.325 + s->list_count= 1; 101.326 + }else 101.327 + s->list_count= 0; 101.328 + 101.329 + 101.330 + if(s->slice_type_nos!=FF_I_TYPE){ 101.331 + ff_h264_fill_default_ref_list(n, s); 101.332 + ff_h264_decode_ref_pic_list_reordering(n, s, gb); 101.333 + ref2frame(n, s); 101.334 + 101.335 + for(int i=0; i<2; i++){ 101.336 + for(int j=0; j<s->ref_count[i]; j++){ 101.337 + if (s->ref_list[i][j]==NULL || s->ref_list[i][j]->reference < 2) // Don't know why sometimes the ref_count=1 while there are no references 101.338 + s->ref_list_cpn[i][j] = -1; 101.339 + else 101.340 + s->ref_list_cpn[i][j] = s->ref_list[i][j]->cpn; 101.341 + } 101.342 + } 101.343 + } 101.344 + 101.345 + if( (s->pps.weighted_pred && s->slice_type_nos == FF_P_TYPE ) 101.346 + || (s->pps.weighted_bipred_idc==1 && s->slice_type_nos== FF_B_TYPE ) ){ 101.347 + pred_weight_table(s, gb); 101.348 + } 101.349 + else if(s->pps.weighted_bipred_idc==2 && s->slice_type_nos== FF_B_TYPE){ 101.350 + implicit_weight_table( s); 101.351 + }else { 101.352 + s->use_weight = 0; 101.353 + } 101.354 + 101.355 + 
if(s->nal_ref_idc){ 101.356 + ff_h264_ref_pic_marking(n, s, gb); 101.357 + n->prev_poc_msb= n->poc_msb; 101.358 + n->prev_poc_lsb= n->poc_lsb; 101.359 + } 101.360 + 101.361 + n->prev_frame_num_offset= n->frame_num_offset; 101.362 + n->prev_frame_num= n->frame_num; 101.363 + 101.364 + if(s->slice_type_nos != FF_B_TYPE){ 101.365 + s->ip_id= n->ip_id++; 101.366 + } 101.367 + 101.368 + if(s->slice_type_nos==FF_B_TYPE && !s->direct_spatial_mv_pred){ 101.369 + ff_h264_direct_dist_scale_factor(s); 101.370 + } 101.371 + ff_h264_direct_ref_list_init(s); 101.372 + 101.373 + 101.374 + if( s->slice_type_nos != FF_I_TYPE && s->pps.cabac ){ 101.375 + tmp = get_ue_golomb_31(gb); 101.376 + if(tmp > 2){ 101.377 + av_log(AV_LOG_ERROR, "cabac_init_idc overflow\n"); 101.378 + return -1; 101.379 + } 101.380 + s->cabac_init_idc= tmp; 101.381 + } 101.382 + 101.383 + tmp = s->pps.init_qp + get_se_golomb(gb); 101.384 + if(tmp>51){ 101.385 + av_log(AV_LOG_ERROR, "QP %u out of range\n", tmp); 101.386 + return -1; 101.387 + } 101.388 + s->qscale= tmp; 101.389 + 101.390 + //FIXME qscale / qp ... 
stuff 101.391 + if(s->slice_type == FF_SP_TYPE){ 101.392 + get_bits1(gb); /* sp_for_switch_flag */ 101.393 + } 101.394 + if(s->slice_type==FF_SP_TYPE || s->slice_type == FF_SI_TYPE){ 101.395 + get_se_golomb(gb); /* slice_qs_delta */ 101.396 + } 101.397 + 101.398 + s->slice_alpha_c0_offset = 52; 101.399 + s->slice_beta_offset = 52; 101.400 + if( s->pps.deblocking_filter_parameters_present ) { 101.401 + tmp= get_ue_golomb_31(gb); 101.402 + if(tmp > 1){ 101.403 + av_log(AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp); 101.404 + return -1; 101.405 + } 101.406 + 101.407 + if(tmp < 2) 101.408 + tmp^= 1; // 1<->0 101.409 + 101.410 + if( tmp ) { 101.411 + s->slice_alpha_c0_offset += get_se_golomb(gb) << 1; 101.412 + s->slice_beta_offset += get_se_golomb(gb) << 1; 101.413 + if( (unsigned) s->slice_alpha_c0_offset > 104U 101.414 + ||(unsigned) s->slice_beta_offset > 104U){ 101.415 + av_log(AV_LOG_ERROR, "deblocking filter parameters %d %d out of range\n", s->slice_alpha_c0_offset, s->slice_beta_offset); 101.416 + return -1; 101.417 + } 101.418 + } 101.419 + } 101.420 + 101.421 + s->qp_thresh= 15 + 52 - FFMIN(s->slice_alpha_c0_offset, s->slice_beta_offset) - FFMAX3(0, s->pps.chroma_qp_index_offset[0], s->pps.chroma_qp_index_offset[1]); 101.422 + 101.423 + return 0; 101.424 +} 101.425 + 101.426 +PictureInfo *get_pib_entry(NalContext *nc, int coded_pic_num){ 101.427 + PictureInfo *pic = NULL; 101.428 + 101.429 + for(int i=0; i<MAX_REF_PIC_COUNT+1; i++){ 101.430 + if(nc->picture[i].reference==0){ 101.431 + pic= &nc->picture[i]; 101.432 + break; 101.433 + } 101.434 + } 101.435 + pic->cpn = coded_pic_num; 101.436 + 101.437 + return pic; 101.438 +} 101.439 + 101.440 +int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb1){ 101.441 + GetBitContext *gb = gb1; 101.442 + uint8_t *buf = gb1->raw; 101.443 + int buf_size = gb1->buf_size; 101.444 + int next_avc = buf_size; 101.445 + int buf_index=0; 101.446 + uint8_t *dst=NULL; 101.447 +// gb->raw = gb1->raw; 
101.448 +// gb->rbsp = NULL; 101.449 + s->release_cnt=0; 101.450 + ff_h264_reset_sei(n); 101.451 + 101.452 + s->current_picture_info = get_pib_entry(n, n->coded_pic_num); 101.453 + 101.454 + for(;;){ 101.455 + int consumed; 101.456 + int dst_length; 101.457 + int bit_length; 101.458 + const uint8_t *ptr; 101.459 + int err; 101.460 + 101.461 + if (buf_index >= buf_size){ 101.462 + break; 101.463 + } else { 101.464 + // start code prefix search 101.465 + for(; buf_index + 3 < buf_size; buf_index++){ 101.466 + // This should always succeed in the first iteration. 101.467 + if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1) 101.468 + break; 101.469 + } 101.470 + if(buf_index+3 >= buf_size) break; 101.471 + buf_index+=3; 101.472 + } 101.473 + 101.474 + { 101.475 + int length = next_avc - buf_index; 101.476 + int i, si, di; 101.477 + uint8_t *src= buf+buf_index; 101.478 + // src[0]&0x80; //forbidden bit 101.479 + s->nal_ref_idc= src[0]>>5; 101.480 + s->nal_unit_type= src[0]&0x1F; 101.481 + 101.482 + src++; length--; 101.483 + 101.484 + for(i=0; i+1<length; i+=2){ 101.485 + if(src[i]) continue; 101.486 + if(i>0 && src[i-1]==0) i--; 101.487 + if(i+2<length && src[i+1]==0 && src[i+2]<=3){ 101.488 + if(src[i+2]!=3){ 101.489 + /* startcode, so we must be past the end */ 101.490 + length=i; 101.491 + } 101.492 + break; 101.493 + } 101.494 + } 101.495 + 101.496 + if(i>=length-1){ //no escaped 0 101.497 + dst_length= length; 101.498 + consumed= length+1; //+1 for the header 101.499 + ptr=src; 101.500 + }else{ 101.501 + av_fast_malloc(&gb->rbsp, &gb->rbsp_size, length+FF_INPUT_BUFFER_PADDING_SIZE); 101.502 + dst = gb->rbsp; 101.503 +// if (dst){ 101.504 +// av_free(dst); 101.505 +// } 101.506 +// dst = av_malloc(length+FF_INPUT_BUFFER_PADDING_SIZE); 101.507 + 101.508 + if (dst == NULL){ 101.509 + return -1; 101.510 + } 101.511 + 101.512 + //printf("decoding esc\n"); 101.513 + memcpy(dst, src, i); 101.514 + si=di=i; 101.515 + while(si+2<length){ 101.516 + 
//remove escapes (very rare 1:2^22) 101.517 + if(src[si+2]>3){ 101.518 + dst[di++]= src[si++]; 101.519 + dst[di++]= src[si++]; 101.520 + }else if(src[si]==0 && src[si+1]==0){ 101.521 + if(src[si+2]==3){ //escape 101.522 + dst[di++]= 0; 101.523 + dst[di++]= 0; 101.524 + si+=3; 101.525 + continue; 101.526 + }else //next start code 101.527 + goto nsc; 101.528 + } 101.529 + 101.530 + dst[di++]= src[si++]; 101.531 + } 101.532 + while(si<length) 101.533 + dst[di++]= src[si++]; 101.534 + nsc: 101.535 + 101.536 + memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE); 101.537 + 101.538 + dst_length= di; 101.539 + consumed= si + 1;//+1 for the header 101.540 + //FIXME store exact number of bits in the getbitcontext (it is needed for decoding) 101.541 + ptr=dst; 101.542 +// gb->rbsp=ptr; 101.543 + } 101.544 + } 101.545 + if (ptr==NULL || dst_length < 0){ 101.546 + return -1; 101.547 + } 101.548 + 101.549 + //error prevention, should not touch dst_length 101.550 + while(ptr[dst_length - 1] == 0 && dst_length > 0) 101.551 + dst_length--; 101.552 + 101.553 + bit_length= !dst_length ? 
0 : (8*dst_length - ff_h264_decode_rbsp_trailing(ptr + dst_length - 1)); 101.554 + buf_index += consumed; 101.555 + 101.556 + err = 0; 101.557 + init_get_bits(gb, ptr, bit_length); 101.558 + switch(s->nal_unit_type){ 101.559 + case NAL_IDR_SLICE: 101.560 + idr(n, s); //FIXME ensure we don't loose some frames if there is reordering 101.561 + case NAL_SLICE: 101.562 + if((err = decode_slice_header(n, s, gb))) 101.563 + break; 101.564 + s->key_frame |= (s->nal_unit_type == NAL_IDR_SLICE) || (n->sei_recovery_frame_cnt >= 0); 101.565 + break; 101.566 + case NAL_DPA: 101.567 + case NAL_DPB: 101.568 + case NAL_DPC: 101.569 + av_log(AV_LOG_ERROR,"no slices/data partitioning support\n"); 101.570 + break; 101.571 + case NAL_SEI: 101.572 + ff_h264_decode_sei(n, gb); 101.573 + break; 101.574 + case NAL_SPS: 101.575 + ff_h264_decode_seq_parameter_set(n, gb); 101.576 + break; 101.577 + case NAL_PPS: 101.578 + ff_h264_decode_picture_parameter_set(n, gb, bit_length); 101.579 + break; 101.580 + case NAL_AUD: 101.581 + case NAL_END_SEQUENCE: 101.582 + case NAL_END_STREAM: 101.583 + case NAL_FILLER_DATA: 101.584 + case NAL_SPS_EXT: 101.585 + case NAL_AUXILIARY_SLICE: 101.586 + break; 101.587 + default: 101.588 + av_log(AV_LOG_ERROR, "Unknown NAL code: %d (%d bits)\n", s->nal_unit_type, bit_length); 101.589 + } 101.590 + if (err < 0) 101.591 + av_log(AV_LOG_ERROR, "decode_slice_header error\n"); 101.592 + 101.593 + } 101.594 + 101.595 + return buf_index; 101.596 +} 101.597 + 101.598 +NalContext *get_nal_context(int width, int height){ 101.599 + const int mb_height = (height + 15) / 16; 101.600 + const int mb_width = (width + 15) / 16; 101.601 + const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16 101.602 + 101.603 + NalContext *nc = av_mallocz(sizeof(NalContext)); 101.604 + nc->width = width; 101.605 + nc->height = height; 101.606 + nc->mb_height = mb_height; 101.607 + nc->mb_width = mb_width; 101.608 + nc->b4_stride = mb_width*4 + 1; 101.609 + nc->mb_stride = 
mb_stride; 101.610 + nc->outputed_poc = INT_MIN; 101.611 + 101.612 + for(int i=0; i<16; i++){ 101.613 + nc->picture[i].cpn =-1; 101.614 + } 101.615 + 101.616 + return nc; 101.617 +} 101.618 + 101.619 +void free_nal_context(NalContext *nc){ 101.620 + for(int i = 0; i < MAX_SPS_COUNT; i++){ 101.621 + if (nc->sps_buffers[i]){ 101.622 + av_free( nc->sps_buffers[i]); 101.623 + } 101.624 + } 101.625 + for(int i = 0; i < MAX_PPS_COUNT; i++){ 101.626 + if (nc->pps_buffers[i]){ 101.627 + av_free( nc->pps_buffers[i]); 101.628 + } 101.629 + } 101.630 + av_free(nc); 101.631 +}
102.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 102.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_nal.h Mon Aug 27 12:09:56 2012 +0200 102.3 @@ -0,0 +1,11 @@ 102.4 +#ifndef H264_NAL_H 102.5 +#define H264_NAL_H 102.6 + 102.7 +#include "avcodec.h" 102.8 +#include "h264_types.h" 102.9 + 102.10 +int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb); 102.11 +NalContext *get_nal_context(int width, int height); 102.12 +void free_nal_context(NalContext *nc); 102.13 + 102.14 +#endif
103.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 103.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_numa.c Mon Aug 27 12:09:56 2012 +0200 103.3 @@ -0,0 +1,33 @@ 103.4 + 103.5 +#include <pthread.h> 103.6 +#include "h264.h" 103.7 +#include "malloc.h" 103.8 + 103.9 +/* 103.10 +* Pthread version with affinity lock for ED and MBD threads. Deprecated 103.11 +*/ 103.12 +int av_transcode_pthread_affinity(int ifile, int ofile, int frame_width, int frame_height, h264_options *opts) { 103.13 + H264Context *h; 103.14 + pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; 103.15 + 103.16 + h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height, opts); 103.17 + timer_start = av_gettime(); 103.18 + 103.19 + pthread_create(&read_thr, NULL, read_thread, h); 103.20 + pthread_create(&parsenal_thr, NULL, parsenal_thread, h); 103.21 + pthread_create(&entropy_thr, NULL, entropy_IPB_thread, h); 103.22 + pthread_create(&mbdec_thr, NULL, mbdec_thread, h); 103.23 + pthread_create(&write_thr, NULL, write_thread, h); 103.24 + 103.25 + 103.26 + pthread_join(read_thr, NULL); 103.27 + pthread_join(parsenal_thr, NULL); 103.28 + pthread_join(entropy_thr, NULL); 103.29 + pthread_join(mbdec_thr, NULL); 103.30 + pthread_join(write_thr, NULL); 103.31 + 103.32 + /* finished ! */ 103.33 + ff_h264_decode_end(h); 103.34 + 103.35 + return 0; 103.36 +}
104.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 104.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_ompss.c Mon Aug 27 12:09:56 2012 +0200 104.3 @@ -0,0 +1,400 @@ 104.4 +/* 104.5 +* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 104.6 +* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 104.7 +* 104.8 +* This file is part of FFmpeg. 104.9 +* 104.10 +* FFmpeg is free software; you can redistribute it and/or 104.11 +* modify it under the terms of the GNU Lesser General Public 104.12 +* License as published by the Free Software Foundation; either 104.13 +* version 2.1 of the License, or (at your option) any later version. 104.14 +* 104.15 +* FFmpeg is distributed in the hope that it will be useful, 104.16 +* but WITHOUT ANY WARRANTY; without even the implied warranty of 104.17 +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 104.18 +* Lesser General Public License for more details. 104.19 +* 104.20 +* You should have received a copy of the GNU Lesser General Public 104.21 +* License along with FFmpeg; if not, write to the Free Software 104.22 +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 104.23 +*/ 104.24 +#include "h264_types.h" 104.25 +#include "h264_parser.h" 104.26 +#include "h264_nal.h" 104.27 +#include "h264_entropy.h" 104.28 +#include "h264_rec.h" 104.29 +#include "h264_pred_mode.h" 104.30 +#include "h264_misc.h" 104.31 +// #undef NDEBUG 104.32 +#include <assert.h> 104.33 + 104.34 +#pragma omp task inout(*pc, *nc) output(*sbe) 104.35 +static void parse_task(H264Context *h, ParserContext *pc, NalContext *nc, SliceBufferEntry *sbe){ 104.36 + H264Slice *s; 104.37 + 104.38 + if (!sbe->initialized){ 104.39 + init_sb_entry(h, sbe); 104.40 + sbe->lines_total=h->mb_height; 104.41 + } 104.42 + 104.43 + av_read_frame_internal(pc, &sbe->gb); 104.44 + s = &sbe->slice; 104.45 + 104.46 + decode_nal_units(nc, s, &sbe->gb); 104.47 +} 104.48 + 104.49 +#pragma omp task inout(*ec) inout(*sbe) 104.50 +static void 
decode_slice_entropy_task(H264Context *h, EntropyContext *ec, SliceBufferEntry *sbe){ 104.51 + int i,j; 104.52 + H264Slice *s = &sbe->slice; 104.53 + GetBitContext *gb = &sbe->gb; 104.54 + H264Mb *mbs = sbe->mbs; 104.55 +// GetBitContext *gb = s->gb; 104.56 + CABACContext *c = &ec->c; 104.57 + 104.58 + if( !s->pps.cabac ){ 104.59 + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); 104.60 + return ; 104.61 + } 104.62 + 104.63 + init_dequant_tables(s, ec); 104.64 + ec->curr_qscale = s->qscale; 104.65 + ec->last_qscale_diff = 0; 104.66 + ec->chroma_qp[0] = get_chroma_qp((H264Slice *) s, 0, s->qscale); 104.67 + ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale); 104.68 + 104.69 + /* realign */ 104.70 + align_get_bits( gb ); 104.71 + /* init cabac */ 104.72 + ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8); 104.73 + 104.74 + ff_h264_init_cabac_states(ec, s, c); 104.75 + 104.76 + for(j=0; j<ec->mb_height; j++){ 104.77 + init_entropy_buf(ec, s, j); 104.78 + for(i=0; i<ec->mb_width; i++){ 104.79 + int eos,ret; 104.80 + H264Mb *m = &mbs[i + j*ec->mb_width]; 104.81 + m->mb_x=i; 104.82 + m->mb_y=j; 104.83 + ec->m = m; 104.84 + 104.85 + ret = ff_h264_decode_mb_cabac(ec, s, c); 104.86 + eos = get_cabac_terminate( c); 104.87 + (void) eos; 104.88 + if( ret < 0 || c->bytestream > c->bytestream_end + 2) { 104.89 + av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream); 104.90 + return ; 104.91 + } 104.92 + } 104.93 + } 104.94 +} 104.95 + 104.96 +static void decode_super_mb_block(MBRecContext *d, H264Slice *s, SuperMBContext *smbc, H264Mb *mbs, int smb_x, int smb_y){ 104.97 + MBRecState mrs; 104.98 +// memset(&mrs, 0, sizeof(MBRecState)); 104.99 + 104.100 + for (int k=0, i= smb_y; i< smb_y + smbc->smb_height; i++, k++){ 104.101 + init_mbrec_context(d, &mrs, s, i); 104.102 + for (int j= smb_x -k ; j< smb_x - k + smbc->smb_width; j++){ 104.103 
+ if (i< d->mb_height && j >= 0 && j < d->mb_width){ 104.104 + h264_decode_mb_internal (d, &mrs, s, &mbs[i*d->mb_width+j]); 104.105 + } 104.106 + } 104.107 + } 104.108 +} 104.109 + 104.110 +#pragma omp task input(*d, *sbe, *ml, *mur) inout(*m) 104.111 +static void decode_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml, 104.112 +SuperMBTask *mur, SuperMBTask *m){ 104.113 + H264Slice *s = &sbe->slice; 104.114 + H264Mb *mbs = sbe->mbs; 104.115 + decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y); 104.116 +} 104.117 + 104.118 +#pragma omp task input(*d, *sbe) inout(*sm) 104.119 +static void draw_edges_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *sm, int line){ 104.120 + H264Slice *s = &sbe->slice; 104.121 + for (int i=line*smbc->smb_height; i< (line+1)*smbc->smb_height && i< d->mb_height; i++) 104.122 + draw_edges(d, s, i); 104.123 +} 104.124 + 104.125 +static void decode_mb_in_slice(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){ 104.126 + int i,j; 104.127 + 104.128 + SuperMBContext *smbc = acquire_smbc(h); 104.129 + int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width; 104.130 + SuperMBTask *smbs = smbc->smbs[0]; 104.131 + 104.132 + SuperMBTask *sm=NULL, *sml, *smur; 104.133 + for(j=0; j< smb_height; j++){ 104.134 + for(i=0; i< smb_width; i++){ 104.135 + sm = smbs + j*smb_width + i; 104.136 + sml = sm - ((i > 0) ? 1: 0); 104.137 + smur = sm + (((i < smb_width-1) && (j >0)) ? 
-smb_width+1: 0); 104.138 + decode_super_mb_task(d, sbe, smbc, sml, smur, sm); 104.139 + } 104.140 + draw_edges_task(d, sbe, smbc, sm, j); 104.141 + } 104.142 + #pragma omp taskwait on(*sm) 104.143 + 104.144 + release_smbc(h, smbc); 104.145 +} 104.146 + 104.147 +#pragma omp task inout(*d) inout(*sbe) 104.148 +static void decode_slice_mb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){ 104.149 + H264Slice *s = &sbe->slice; 104.150 + 104.151 + for (int i=0; i<2; i++){ 104.152 + for(int j=0; j< s->ref_count[i]; j++){ 104.153 + if (s->ref_list_cpn[i][j] ==-1) 104.154 + continue; 104.155 + int k; 104.156 + for (k=0; k< h->max_dpb_cnt; k++){ 104.157 + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ 104.158 + s->dp_ref_list[i][j] = &h->dpb[k]; 104.159 + break; 104.160 + } 104.161 + } 104.162 + } 104.163 + } 104.164 + 104.165 + #pragma omp critical (dpb) 104.166 + get_dpb_entry(h, s); 104.167 + 104.168 + if (!h->no_mbd){ 104.169 + decode_mb_in_slice (h, d, sbe); 104.170 + } 104.171 + 104.172 + for (int i=0; i<s->release_cnt; i++){ 104.173 + for(int j=0; j<h->max_dpb_cnt; j++){ 104.174 + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ 104.175 + #pragma omp critical (dpb) 104.176 + release_dpb_entry(h, &h->dpb[j], 2); 104.177 + break; 104.178 + } 104.179 + } 104.180 + } 104.181 + s->release_cnt=0; 104.182 +} 104.183 + 104.184 +// for static 3d wave 104.185 +/*-------------------------------------------------------------------------------*/ 104.186 +#pragma omp task input(*d, *sbe, *ml, *mur, *mprev) inout(*m) 104.187 +static void decode_3dwave_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml, 104.188 +SuperMBTask *mur, SuperMBTask *mprev, SuperMBTask *m){ 104.189 + H264Slice *s = &sbe->slice; 104.190 + H264Mb *mbs = sbe->mbs; 104.191 + 104.192 + decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y); 104.193 +} 104.194 + 104.195 +// int init_ref_count=0; 104.196 +#pragma omp task inout(*d, *sbe, 
*init) 104.197 +static void init_ref_list_and_get_dpb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe, int *init){ 104.198 + H264Slice *s = &sbe->slice; 104.199 + for (int i=0; i<2; i++){ 104.200 + for(int j=0; j< s->ref_count[i]; j++){ 104.201 + if (s->ref_list_cpn[i][j] ==-1) 104.202 + continue; 104.203 + int k; 104.204 + for (k=0; k<h->max_dpb_cnt; k++){ 104.205 + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ 104.206 + s->dp_ref_list[i][j] = &h->dpb[k]; 104.207 + break; 104.208 + } 104.209 + } 104.210 + } 104.211 + } 104.212 + 104.213 + #pragma omp critical (dpb) 104.214 + get_dpb_entry(h, s); 104.215 + 104.216 +} 104.217 + 104.218 +static SuperMBTask* add_decode_slice_3dwave_tasks(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc){ 104.219 + int i,j; 104.220 + 104.221 + int smb_3d_height =smbc->nsmb_3dheight; 104.222 + int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width; 104.223 + int smb_diff_prev = smb_height - smb_3d_height; 104.224 + SuperMBTask *sm=NULL, *sml, *smur, *smprev; 104.225 + 104.226 + SuperMBTask *smbs = smbc->smbs[smbc->index++]; smbc->index%=2; 104.227 + SuperMBTask *smbs_prev = smbc->smbs[smbc->index]; // index rotates -> next == prev 104.228 + 104.229 + for(j=0; j<smb_3d_height ; j++){ 104.230 + for(i=0; i< smb_width; i++){ 104.231 + sm = smbs + j*smb_width + i; 104.232 + sml = sm - ((i > 0) ? 1: 0); 104.233 + smur = sm + (((i < smb_width-1) && (j >0)) ? -smb_width+1: 0); 104.234 + smprev = smbs_prev + (j + smb_diff_prev+1)*smb_width -1; 104.235 + decode_3dwave_super_mb_task(d, sbe, smbc, sml, smur, smprev, sm); 104.236 + } 104.237 + draw_edges_task(d, sbe, smbc, sm, j); 104.238 + } 104.239 + 104.240 + for(; j< smb_height; j++){ 104.241 + for(i=0; i< smb_width; i++){ 104.242 + sm = smbs + j*smb_width + i; 104.243 + sml = sm - ((i > 0) ? 1: 0); 104.244 + smur = sm + (((i < smb_width-1) && (j >0)) ? 
-smb_width+1: 0); 104.245 + decode_super_mb_task(d, sbe, smbc, sml, smur, sm); 104.246 + } 104.247 + draw_edges_task(d, sbe, smbc, sm, j); 104.248 + } 104.249 + return sm; 104.250 +} 104.251 + 104.252 +#pragma omp task inout(*d, *sbe, *release) input (*lastsmb) 104.253 +static void release_ref_list_task(H264Context *h, SuperMBContext *smbc, MBRecContext *d, SliceBufferEntry *sbe, SuperMBTask *lastsmb, int *release){ 104.254 + H264Slice *s = &sbe->slice; 104.255 + for (int i=0; i<s->release_cnt; i++){ 104.256 + for(int j=0; j<h->max_dpb_cnt; j++){ 104.257 + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ 104.258 + #pragma omp critical (dpb) 104.259 + release_dpb_entry(h, &h->dpb[j], 2); 104.260 + break; 104.261 + } 104.262 + } 104.263 + } 104.264 + s->release_cnt=0; 104.265 + 104.266 + release_smbc(h, smbc); 104.267 + 104.268 +} 104.269 + 104.270 +// static void decode_mb_static_3dwave(H264Context *h, int mb_height, int mb_width, MBRecContext *d, H264Slice *s, H264Mb *mbs, SuperMBTask *smbs, SuperMBTask *smbs_prev){ 104.271 +// 104.272 +// } 104.273 +/*-------------------------------------------------------------------------------*/ 104.274 +//end for static 3d wave 104.275 + 104.276 +#pragma omp task inout (*oc) input(*sbe) 104.277 +static void output_task(H264Context *h, OutputContext *oc, SliceBufferEntry *sbe){ 104.278 + DecodedPicture* out =output_frame(h, oc, sbe->slice.curr_pic, h->ofile, h->frame_width, h->frame_height); 104.279 + if (out){ 104.280 + #pragma omp critical (dpb) 104.281 + release_dpb_entry(h, out, 1); 104.282 + } 104.283 + print_report(oc->frame_number, oc->video_size, 0, h->verbose); 104.284 +} 104.285 + 104.286 +/* 104.287 +* The following code is the main loop of the file converter 104.288 +*/ 104.289 +int h264_decode_ompss( H264Context *h) { 104.290 + const int bufs = h->pipe_bufs; 104.291 + 104.292 + ParserContext *pc; 104.293 + NalContext *nc; 104.294 + EntropyContext *ec[bufs]; 104.295 + MBRecContext *rc[2]; 104.296 + OutputContext *oc; 
104.297 + SliceBufferEntry *sbe; 104.298 + SuperMBContext *smbc; 104.299 + 104.300 + DecodedPicture *out; 104.301 + int frames=0; 104.302 + 104.303 +#if HAVE_LIBSDL2 104.304 + pthread_t sdl_thr; 104.305 + if (h->display){ 104.306 + pthread_create(&sdl_thr, NULL, sdl_thread, h); 104.307 + } 104.308 +#endif 104.309 + sbe= av_mallocz(sizeof(SliceBufferEntry) * bufs); 104.310 + 104.311 + 104.312 + pc = get_parse_context(h->ifile); 104.313 + nc = get_nal_context(h->width, h->height); 104.314 + 104.315 + for(int i=0; i<bufs; i++){ 104.316 + ec[i] = get_entropy_context( h ); 104.317 + } 104.318 + 104.319 + for(int i=0; i<2; i++){ 104.320 + rc[i] = get_mbrec_context(h); 104.321 + } 104.322 + 104.323 + oc = get_output_context( h ); 104.324 + 104.325 + av_start_timer(); 104.326 + int k=0; int init, release; 104.327 + if (h->static_3d && bufs < h->num_frames ){ 104.328 + int num_pre_ed =0; 104.329 + for (num_pre_ed=0; num_pre_ed< bufs -1 && !pc->final_frame; num_pre_ed++){ 104.330 + parse_task( h, pc, nc, &sbe[k%bufs] ); 104.331 + decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]); 104.332 + #pragma omp taskwait on(*pc) 104.333 + k++; 104.334 + } 104.335 + 104.336 + while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ 104.337 + parse_task( h, pc, nc, &sbe[k%bufs] ); 104.338 + decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]); 104.339 + 104.340 + k++; 104.341 + 104.342 + init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init); 104.343 + smbc = acquire_smbc(h); 104.344 + SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc); 104.345 + release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release); 104.346 + 104.347 + output_task (h, oc, &sbe[k%bufs]); 104.348 + #pragma omp taskwait on(*pc) 104.349 + } 104.350 + 104.351 + for (int i=0; i< num_pre_ed; i++){ 104.352 + k++; 104.353 + init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init); 104.354 + smbc = acquire_smbc(h); 104.355 + SuperMBTask *lastsmb= 
add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc); 104.356 + release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release); 104.357 + 104.358 + output_task (h, oc, &sbe[k%bufs]); 104.359 + } 104.360 + 104.361 + } else { 104.362 + while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ 104.363 + parse_task( h, pc, nc, &sbe[k%bufs] ); 104.364 + 104.365 + decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]); 104.366 + 104.367 + decode_slice_mb_task(h, rc[0], &sbe[k%bufs]); 104.368 + 104.369 + output_task (h, oc, &sbe[k%bufs]); 104.370 + #pragma omp taskwait on(*pc) 104.371 + k++; 104.372 + } 104.373 + } 104.374 + #pragma omp taskwait 104.375 + 104.376 + while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; 104.377 + 104.378 + print_report(oc->frame_number, oc->video_size, 1, h->verbose); 104.379 + h->num_frames = oc->frame_number; 104.380 + /* finished ! */ 104.381 + 104.382 + free_parse_context(pc); 104.383 + free_nal_context (nc); 104.384 + free_output_context(oc); 104.385 + for (int i=0; i<bufs; i++){ 104.386 + free_sb_entry(&sbe[i]); 104.387 + free_entropy_context(ec[i]); 104.388 + } 104.389 + av_free(sbe); 104.390 + 104.391 + for (int i=0; i<2; i++){ 104.392 + free_mbrec_context(rc[i]); 104.393 + } 104.394 + 104.395 +#if HAVE_LIBSDL2 104.396 + if (h->display){ 104.397 + signal_sdl_exit(h); 104.398 + pthread_join(sdl_thr, NULL); 104.399 + } 104.400 +#endif 104.401 + 104.402 + return 0; 104.403 +}
105.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 105.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_parser.c Mon Aug 27 12:09:56 2012 +0200 105.3 @@ -0,0 +1,224 @@ 105.4 +/* 105.5 + * H.26L/H.264/AVC/JVT/14496-10/... parser 105.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 105.7 + * 105.8 + * This file is part of FFmpeg. 105.9 + * 105.10 + * FFmpeg is free software; you can redistribute it and/or 105.11 + * modify it under the terms of the GNU Lesser General Public 105.12 + * License as published by the Free Software Foundation; either 105.13 + * version 2.1 of the License, or (at your option) any later version. 105.14 + * 105.15 + * FFmpeg is distributed in the hope that it will be useful, 105.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 105.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 105.18 + * Lesser General Public License for more details. 105.19 + * 105.20 + * You should have received a copy of the GNU Lesser General Public 105.21 + * License along with FFmpeg; if not, write to the Free Software 105.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 105.23 + */ 105.24 + 105.25 +/** 105.26 + * @file 105.27 + * H.264 / AVC / MPEG4 part10 parser. 
105.28 + * @author Michael Niedermayer <michaelni@gmx.at> 105.29 + */ 105.30 + 105.31 +#include <unistd.h> 105.32 + 105.33 +#include "golomb.h" 105.34 +#include "libavutil/error.h" 105.35 +#include "h264_types.h" 105.36 + 105.37 +#undef NDEBUG 105.38 +#include <assert.h> 105.39 + 105.40 +#define END_NOT_FOUND (-100) 105.41 + 105.42 +static int ff_h264_find_frame_end(ParserContext *s, const uint8_t *buf, int buf_size) 105.43 +{ 105.44 + int i; 105.45 + uint32_t state; 105.46 + 105.47 + state= s->state; 105.48 + if(state>13) 105.49 + state= 7; 105.50 + 105.51 + for(i=0; i<buf_size; i++){ 105.52 + if(state==7){ 105.53 + /* we check i<buf_size instead of i+3/7 because its simpler 105.54 + * and there should be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end 105.55 + */ 105.56 + while(i<buf_size && !((~*(const uint64_t*)(buf+i) & (*(const uint64_t*)(buf+i) - 0x0101010101010101ULL)) & 0x8080808080808080ULL)) 105.57 + i+=8; 105.58 + 105.59 + for(; i<buf_size; i++){ 105.60 + if(!buf[i]){ 105.61 + state=2; 105.62 + break; 105.63 + } 105.64 + } 105.65 + }else if(state<=2){ 105.66 + if(buf[i]==1) state^= 5; //2->7, 1->4, 0->5 105.67 + else if(buf[i]) state = 7; 105.68 + else state>>=1; //2->1, 1->0, 0->0 105.69 + }else if(state<=5){ 105.70 + int v= buf[i] & 0x1F; 105.71 + if(v==6 || v==7 || v==8 || v==9){ 105.72 + if(s->frame_start_found){ 105.73 + i++; 105.74 + goto found; 105.75 + } 105.76 + }else if(v==1 || v==2 || v==5){ 105.77 + if(s->frame_start_found){ 105.78 + state+=8; 105.79 + continue; 105.80 + }else 105.81 + s->frame_start_found = 1; 105.82 + } 105.83 + state= 7; 105.84 + }else{ 105.85 + if(buf[i] & 0x80) 105.86 + goto found; 105.87 + state= 7; 105.88 + } 105.89 + } 105.90 + s->state= state; 105.91 + return END_NOT_FOUND; 105.92 + 105.93 +found: 105.94 + s->state=7; 105.95 + s->frame_start_found= 0; 105.96 + return i-(state&5); 105.97 +} 105.98 + 105.99 +static int ff_combine_frame(ParserContext *s, GetBitContext *gb, int next, uint8_t **buf, int *buf_size) 105.100 
+{ 105.101 + int i; 105.102 + /* Copy overread bytes from last frame into buffer. */ 105.103 + for(i =0; s->overread_cnt>0; s->overread_cnt--, i++){ 105.104 + gb->raw[s->index++]= s->overread[i]; 105.105 + } 105.106 + 105.107 + /* EOF - END_NOT_FOUND means no next frame start is found in current partial read. If buf_size of the partial read is 0 we are at EOF */ 105.108 + if(!*buf_size && next == END_NOT_FOUND){ 105.109 + next= 0; 105.110 + } 105.111 + s->last_index= s->index; 105.112 + 105.113 + /* copy into buffer end return */ 105.114 + if(next == END_NOT_FOUND){ 105.115 + gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, (*buf_size) + s->index + FF_INPUT_BUFFER_PADDING_SIZE); 105.116 + memcpy(&gb->raw[s->index], *buf, *buf_size); 105.117 + s->index += *buf_size; 105.118 + return -1; 105.119 + } 105.120 + 105.121 + ///end found 105.122 + *buf_size= s->index + next; 105.123 + /* append to buffer */ 105.124 + 105.125 + gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, next + s->index + FF_INPUT_BUFFER_PADDING_SIZE); 105.126 + memcpy(&gb->raw[s->index], *buf, next + FF_INPUT_BUFFER_PADDING_SIZE ); 105.127 + s->index = 0; 105.128 + 105.129 + /* store overread bytes */ 105.130 + for(i=0; next < 0; next++, i++){ 105.131 + s->state = (s->state<<8) | gb->raw[s->last_index + next]; 105.132 + s->overread[i] = gb->raw[s->last_index + next]; 105.133 + s->overread_cnt++; 105.134 + } 105.135 + 105.136 + return 0; 105.137 +} 105.138 + 105.139 +static int h264_parse(ParserContext *s, GetBitContext *gb, 105.140 + uint8_t *buf, int buf_size) 105.141 +{ 105.142 + int next; 105.143 + 105.144 + next= ff_h264_find_frame_end(s, buf, buf_size); 105.145 + 105.146 + if (ff_combine_frame(s, gb, next, &buf, &buf_size) < 0) { 105.147 + gb->buf_size = 0; 105.148 + return buf_size; 105.149 + } 105.150 + 105.151 + if(next<0 && next != END_NOT_FOUND){ 105.152 + assert(s->last_index + next >= 0 ); 105.153 + ff_h264_find_frame_end(s, &gb->raw[s->last_index + next], -next); //update state 
105.154 + } 105.155 + 105.156 + gb->buf_size = buf_size; 105.157 + return next; 105.158 +} 105.159 + 105.160 +static int ff_raw_read_partial_packet(ParserContext *pc) 105.161 +{ 105.162 + int len= -1; 105.163 + 105.164 + if (!pc->eof_reached){ 105.165 + len = read( pc->ifile, pc->data, pc->buffer_size); 105.166 +// printf("read task %d\t%d\n", pc->ifile, len); fflush(NULL); 105.167 + if (len < pc->buffer_size) { 105.168 + pc->eof_reached = 1; 105.169 + } 105.170 + } 105.171 + 105.172 + return len; 105.173 +} 105.174 + 105.175 +void av_read_frame_internal(ParserContext *pc, GetBitContext *gb){ 105.176 + int len; 105.177 + uint8_t dummy_buf[FF_INPUT_BUFFER_PADDING_SIZE]={0}; 105.178 + av_fast_malloc(&gb->raw, &gb->alloc_size, 2048+FF_INPUT_BUFFER_PADDING_SIZE); 105.179 + 105.180 + //Parsing is performed before read, since there are ussually leftovers from parsing the previous frame. 105.181 + for(;;) { 105.182 + if (pc->cur_len>0){ 105.183 + len = h264_parse(pc, gb, pc->cur_ptr, pc->cur_len); 105.184 + if (len<0) 105.185 + len =0; 105.186 + //* increment read pointer */ 105.187 + pc->cur_ptr += len; 105.188 + pc->cur_len -= len; 105.189 + 105.190 + if (gb->buf_size) { 105.191 + break; 105.192 + } 105.193 + } 105.194 + 105.195 + //check for ret and not parser->eof_reached as one "read" can contain more than 1 frame 105.196 + pc->size= ff_raw_read_partial_packet(pc); 105.197 + if (pc->size < 0) { 105.198 + pc->final_frame =1; 105.199 + /* return the last frames, if any */ 105.200 + h264_parse(pc, gb, dummy_buf, 0); 105.201 + break; 105.202 + } 105.203 + pc->cur_ptr = pc->data; 105.204 + pc->cur_len = pc->size; 105.205 + } 105.206 + 105.207 + assert(gb->raw!=NULL); 105.208 + 105.209 +} 105.210 + 105.211 +ParserContext *get_parse_context(int ifile){ 105.212 + ParserContext *pc = av_mallocz(sizeof(ParserContext)); 105.213 + pc->buffer_size = 2048; 105.214 + pc->final_frame = 0; 105.215 + pc->cur_len= 0; 105.216 + pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE); 
105.217 + pc->size = 2048; 105.218 + pc->eof_reached =0; 105.219 + pc->ifile = ifile; 105.220 + 105.221 + return pc; 105.222 +} 105.223 + 105.224 +void free_parse_context(ParserContext *pc){ 105.225 + av_free(pc->data); 105.226 + av_free(pc); 105.227 +}
106.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 106.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_parser.h Mon Aug 27 12:09:56 2012 +0200 106.3 @@ -0,0 +1,10 @@ 106.4 +#ifndef H264_PARSER_H 106.5 +#define H264_PARSER_H 106.6 + 106.7 +#include "h264_types.h" 106.8 + 106.9 +void av_read_frame_internal(ParserContext *pc, GetBitContext *gb); 106.10 +ParserContext *get_parse_context(int ifile); 106.11 +void free_parse_context(ParserContext *pc); 106.12 + 106.13 +#endif
107.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 107.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pred.c Mon Aug 27 12:09:56 2012 +0200 107.3 @@ -0,0 +1,945 @@ 107.4 +/* 107.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 107.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 107.7 + * 107.8 + * This file is part of FFmpeg. 107.9 + * 107.10 + * FFmpeg is free software; you can redistribute it and/or 107.11 + * modify it under the terms of the GNU Lesser General Public 107.12 + * License as published by the Free Software Foundation; either 107.13 + * version 2.1 of the License, or (at your option) any later version. 107.14 + * 107.15 + * FFmpeg is distributed in the hope that it will be useful, 107.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 107.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 107.18 + * Lesser General Public License for more details. 107.19 + * 107.20 + * You should have received a copy of the GNU Lesser General Public 107.21 + * License along with FFmpeg; if not, write to the Free Software 107.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 107.23 + */ 107.24 + 107.25 +/** 107.26 + * @file 107.27 + * H.264 / AVC / MPEG4 part10 prediction functions. 
107.28 + * @author Michael Niedermayer <michaelni@gmx.at> 107.29 + */ 107.30 + 107.31 +#include "avcodec.h" 107.32 +#include "h264_pred.h" 107.33 +//#include "dsputil.h" 107.34 + 107.35 +static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){ 107.36 + (void) topright; 107.37 + const uint32_t a= ((uint32_t*)(src-stride))[0]; 107.38 + ((uint32_t*)(src+0*stride))[0]= a; 107.39 + ((uint32_t*)(src+1*stride))[0]= a; 107.40 + ((uint32_t*)(src+2*stride))[0]= a; 107.41 + ((uint32_t*)(src+3*stride))[0]= a; 107.42 +} 107.43 + 107.44 +static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){ 107.45 + (void) topright; 107.46 + ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101; 107.47 + ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101; 107.48 + ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101; 107.49 + ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101; 107.50 +} 107.51 + 107.52 +static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){ 107.53 + (void) topright; 107.54 + const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] 107.55 + + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; 107.56 + 107.57 + ((uint32_t*)(src+0*stride))[0]= 107.58 + ((uint32_t*)(src+1*stride))[0]= 107.59 + ((uint32_t*)(src+2*stride))[0]= 107.60 + ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; 107.61 +} 107.62 + 107.63 +static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){ 107.64 + (void) topright; 107.65 + const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2; 107.66 + 107.67 + ((uint32_t*)(src+0*stride))[0]= 107.68 + ((uint32_t*)(src+1*stride))[0]= 107.69 + ((uint32_t*)(src+2*stride))[0]= 107.70 + ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; 107.71 +} 107.72 + 107.73 +static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){ 107.74 + (void) topright; 107.75 + const 
int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2; 107.76 + 107.77 + ((uint32_t*)(src+0*stride))[0]= 107.78 + ((uint32_t*)(src+1*stride))[0]= 107.79 + ((uint32_t*)(src+2*stride))[0]= 107.80 + ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101; 107.81 +} 107.82 + 107.83 +static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){ 107.84 + (void) topright; 107.85 + ((uint32_t*)(src+0*stride))[0]= 107.86 + ((uint32_t*)(src+1*stride))[0]= 107.87 + ((uint32_t*)(src+2*stride))[0]= 107.88 + ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U; 107.89 +} 107.90 + 107.91 + 107.92 +#define LOAD_TOP_RIGHT_EDGE\ 107.93 + const int av_unused t4= topright[0];\ 107.94 + const int av_unused t5= topright[1];\ 107.95 + const int av_unused t6= topright[2];\ 107.96 + const int av_unused t7= topright[3];\ 107.97 + 107.98 +#define LOAD_DOWN_LEFT_EDGE\ 107.99 + const int av_unused l4= src[-1+4*stride];\ 107.100 + const int av_unused l5= src[-1+5*stride];\ 107.101 + const int av_unused l6= src[-1+6*stride];\ 107.102 + const int av_unused l7= src[-1+7*stride];\ 107.103 + 107.104 +#define LOAD_LEFT_EDGE\ 107.105 + const int av_unused l0= src[-1+0*stride];\ 107.106 + const int av_unused l1= src[-1+1*stride];\ 107.107 + const int av_unused l2= src[-1+2*stride];\ 107.108 + const int av_unused l3= src[-1+3*stride];\ 107.109 + 107.110 +#define LOAD_TOP_EDGE\ 107.111 + const int av_unused t0= src[ 0-1*stride];\ 107.112 + const int av_unused t1= src[ 1-1*stride];\ 107.113 + const int av_unused t2= src[ 2-1*stride];\ 107.114 + const int av_unused t3= src[ 3-1*stride];\ 107.115 + 107.116 +static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){ 107.117 + (void) topright; 107.118 + const int lt= src[-1-1*stride]; 107.119 + LOAD_TOP_EDGE 107.120 + LOAD_LEFT_EDGE 107.121 + 107.122 + src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2; 107.123 + src[0+2*stride]= 107.124 + src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2; 107.125 + src[0+1*stride]= 107.126 + 
src[1+2*stride]= 107.127 + src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2; 107.128 + src[0+0*stride]= 107.129 + src[1+1*stride]= 107.130 + src[2+2*stride]= 107.131 + src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2; 107.132 + src[1+0*stride]= 107.133 + src[2+1*stride]= 107.134 + src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2; 107.135 + src[2+0*stride]= 107.136 + src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; 107.137 + src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2; 107.138 +} 107.139 + 107.140 +static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){ 107.141 + LOAD_TOP_EDGE 107.142 + LOAD_TOP_RIGHT_EDGE 107.143 +// LOAD_LEFT_EDGE 107.144 + 107.145 + src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2; 107.146 + src[1+0*stride]= 107.147 + src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2; 107.148 + src[2+0*stride]= 107.149 + src[1+1*stride]= 107.150 + src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2; 107.151 + src[3+0*stride]= 107.152 + src[2+1*stride]= 107.153 + src[1+2*stride]= 107.154 + src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2; 107.155 + src[3+1*stride]= 107.156 + src[2+2*stride]= 107.157 + src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2; 107.158 + src[3+2*stride]= 107.159 + src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2; 107.160 + src[3+3*stride]=(t6 + 3*t7 + 2)>>2; 107.161 +} 107.162 + 107.163 +static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){ 107.164 + (void) topright; 107.165 + const int lt= src[-1-1*stride]; 107.166 + LOAD_TOP_EDGE 107.167 + LOAD_LEFT_EDGE 107.168 + 107.169 + src[0+0*stride]= 107.170 + src[1+2*stride]=(lt + t0 + 1)>>1; 107.171 + src[1+0*stride]= 107.172 + src[2+2*stride]=(t0 + t1 + 1)>>1; 107.173 + src[2+0*stride]= 107.174 + src[3+2*stride]=(t1 + t2 + 1)>>1; 107.175 + src[3+0*stride]=(t2 + t3 + 1)>>1; 107.176 + src[0+1*stride]= 107.177 + src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2; 107.178 + src[1+1*stride]= 107.179 + src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2; 107.180 + src[2+1*stride]= 107.181 + src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2; 107.182 + src[3+1*stride]=(t1 + 2*t2 + 
t3 + 2)>>2; 107.183 + src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2; 107.184 + src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; 107.185 +} 107.186 + 107.187 +static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){ 107.188 + LOAD_TOP_EDGE 107.189 + LOAD_TOP_RIGHT_EDGE 107.190 + 107.191 + src[0+0*stride]=(t0 + t1 + 1)>>1; 107.192 + src[1+0*stride]= 107.193 + src[0+2*stride]=(t1 + t2 + 1)>>1; 107.194 + src[2+0*stride]= 107.195 + src[1+2*stride]=(t2 + t3 + 1)>>1; 107.196 + src[3+0*stride]= 107.197 + src[2+2*stride]=(t3 + t4+ 1)>>1; 107.198 + src[3+2*stride]=(t4 + t5+ 1)>>1; 107.199 + src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2; 107.200 + src[1+1*stride]= 107.201 + src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2; 107.202 + src[2+1*stride]= 107.203 + src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2; 107.204 + src[3+1*stride]= 107.205 + src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2; 107.206 + src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2; 107.207 +} 107.208 + 107.209 +static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){ 107.210 + (void) topright; 107.211 + LOAD_LEFT_EDGE 107.212 + 107.213 + src[0+0*stride]=(l0 + l1 + 1)>>1; 107.214 + src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2; 107.215 + src[2+0*stride]= 107.216 + src[0+1*stride]=(l1 + l2 + 1)>>1; 107.217 + src[3+0*stride]= 107.218 + src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2; 107.219 + src[2+1*stride]= 107.220 + src[0+2*stride]=(l2 + l3 + 1)>>1; 107.221 + src[3+1*stride]= 107.222 + src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; 107.223 + src[3+2*stride]= 107.224 + src[1+3*stride]= 107.225 + src[0+3*stride]= 107.226 + src[2+2*stride]= 107.227 + src[2+3*stride]= 107.228 + src[3+3*stride]=l3; 107.229 +} 107.230 + 107.231 + 107.232 +static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){ 107.233 + (void) topright; 107.234 + const int lt= src[-1-1*stride]; 107.235 + LOAD_TOP_EDGE 107.236 + LOAD_LEFT_EDGE 107.237 + 107.238 + src[0+0*stride]= 107.239 + src[2+1*stride]=(lt + l0 + 1)>>1; 107.240 + 
src[1+0*stride]= 107.241 + src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2; 107.242 + src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2; 107.243 + src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2; 107.244 + src[0+1*stride]= 107.245 + src[2+2*stride]=(l0 + l1 + 1)>>1; 107.246 + src[1+1*stride]= 107.247 + src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2; 107.248 + src[0+2*stride]= 107.249 + src[2+3*stride]=(l1 + l2+ 1)>>1; 107.250 + src[1+2*stride]= 107.251 + src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2; 107.252 + src[0+3*stride]=(l2 + l3 + 1)>>1; 107.253 + src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2; 107.254 +} 107.255 + 107.256 +static void pred16x16_vertical_c(uint8_t *src, int stride){ 107.257 + int i; 107.258 + const uint32_t a= ((uint32_t*)(src-stride))[0]; 107.259 + const uint32_t b= ((uint32_t*)(src-stride))[1]; 107.260 + const uint32_t c= ((uint32_t*)(src-stride))[2]; 107.261 + const uint32_t d= ((uint32_t*)(src-stride))[3]; 107.262 + 107.263 + for(i=0; i<16; i++){ 107.264 + ((uint32_t*)(src+i*stride))[0]= a; 107.265 + ((uint32_t*)(src+i*stride))[1]= b; 107.266 + ((uint32_t*)(src+i*stride))[2]= c; 107.267 + ((uint32_t*)(src+i*stride))[3]= d; 107.268 + } 107.269 +} 107.270 + 107.271 +static void pred16x16_horizontal_c(uint8_t *src, int stride){ 107.272 + int i; 107.273 + 107.274 + for(i=0; i<16; i++){ 107.275 + ((uint32_t*)(src+i*stride))[0]= 107.276 + ((uint32_t*)(src+i*stride))[1]= 107.277 + ((uint32_t*)(src+i*stride))[2]= 107.278 + ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101; 107.279 + } 107.280 +} 107.281 + 107.282 +static void pred16x16_dc_c(uint8_t *src, int stride){ 107.283 + int i, dc=0; 107.284 + 107.285 + for(i=0;i<16; i++){ 107.286 + dc+= src[-1+i*stride]; 107.287 + } 107.288 + 107.289 + for(i=0;i<16; i++){ 107.290 + dc+= src[i-stride]; 107.291 + } 107.292 + 107.293 + dc= 0x01010101*((dc + 16)>>5); 107.294 + 107.295 + for(i=0; i<16; i++){ 107.296 + ((uint32_t*)(src+i*stride))[0]= 107.297 + ((uint32_t*)(src+i*stride))[1]= 107.298 + ((uint32_t*)(src+i*stride))[2]= 107.299 + 
((uint32_t*)(src+i*stride))[3]= dc; 107.300 + } 107.301 +} 107.302 + 107.303 +static void pred16x16_left_dc_c(uint8_t *src, int stride){ 107.304 + int i, dc=0; 107.305 + 107.306 + for(i=0;i<16; i++){ 107.307 + dc+= src[-1+i*stride]; 107.308 + } 107.309 + 107.310 + dc= 0x01010101*((dc + 8)>>4); 107.311 + 107.312 + for(i=0; i<16; i++){ 107.313 + ((uint32_t*)(src+i*stride))[0]= 107.314 + ((uint32_t*)(src+i*stride))[1]= 107.315 + ((uint32_t*)(src+i*stride))[2]= 107.316 + ((uint32_t*)(src+i*stride))[3]= dc; 107.317 + } 107.318 +} 107.319 + 107.320 +static void pred16x16_top_dc_c(uint8_t *src, int stride){ 107.321 + int i, dc=0; 107.322 + 107.323 + for(i=0;i<16; i++){ 107.324 + dc+= src[i-stride]; 107.325 + } 107.326 + dc= 0x01010101*((dc + 8)>>4); 107.327 + 107.328 + for(i=0; i<16; i++){ 107.329 + ((uint32_t*)(src+i*stride))[0]= 107.330 + ((uint32_t*)(src+i*stride))[1]= 107.331 + ((uint32_t*)(src+i*stride))[2]= 107.332 + ((uint32_t*)(src+i*stride))[3]= dc; 107.333 + } 107.334 +} 107.335 + 107.336 +static void pred16x16_128_dc_c(uint8_t *src, int stride){ 107.337 + int i; 107.338 + 107.339 + for(i=0; i<16; i++){ 107.340 + ((uint32_t*)(src+i*stride))[0]= 107.341 + ((uint32_t*)(src+i*stride))[1]= 107.342 + ((uint32_t*)(src+i*stride))[2]= 107.343 + ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U; 107.344 + } 107.345 +} 107.346 + 107.347 +static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){ 107.348 + int i, j, k; 107.349 + int a; 107.350 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 107.351 + const uint8_t * const src0 = src+7-stride; 107.352 + const uint8_t *src1 = src+8*stride-1; 107.353 + const uint8_t *src2 = src1-2*stride; // == src+6*stride-1; 107.354 + int H = src0[1] - src0[-1]; 107.355 + int V = src1[0] - src2[ 0]; 107.356 + for(k=2; k<=8; ++k) { 107.357 + src1 += stride; src2 -= stride; 107.358 + H += k*(src0[k] - src0[-k]); 107.359 + V += k*(src1[0] - src2[ 0]); 107.360 + } 107.361 + if(svq3){ 107.362 + H = ( 5*(H/4) 
) / 16; 107.363 + V = ( 5*(V/4) ) / 16; 107.364 + 107.365 + /* required for 100% accuracy */ 107.366 + i = H; H = V; V = i; 107.367 + }else if(rv40){ 107.368 + H = ( H + (H>>2) ) >> 4; 107.369 + V = ( V + (V>>2) ) >> 4; 107.370 + }else{ 107.371 + H = ( 5*H+32 ) >> 6; 107.372 + V = ( 5*V+32 ) >> 6; 107.373 + } 107.374 + 107.375 + a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); 107.376 + for(j=16; j>0; --j) { 107.377 + int b = a; 107.378 + a += V; 107.379 + for(i=-16; i<0; i+=4) { 107.380 + src[16+i] = cm[ (b ) >> 5 ]; 107.381 + src[17+i] = cm[ (b+ H) >> 5 ]; 107.382 + src[18+i] = cm[ (b+2*H) >> 5 ]; 107.383 + src[19+i] = cm[ (b+3*H) >> 5 ]; 107.384 + b += 4*H; 107.385 + } 107.386 + src += stride; 107.387 + } 107.388 +} 107.389 + 107.390 +static void pred16x16_plane_c(uint8_t *src, int stride){ 107.391 + pred16x16_plane_compat_c(src, stride, 0, 0); 107.392 +} 107.393 + 107.394 + 107.395 +static void pred8x8_vertical_c(uint8_t *src, int stride){ 107.396 + int i; 107.397 + const uint32_t a= ((uint32_t*)(src-stride))[0]; 107.398 + const uint32_t b= ((uint32_t*)(src-stride))[1]; 107.399 + 107.400 + for(i=0; i<8; i++){ 107.401 + ((uint32_t*)(src+i*stride))[0]= a; 107.402 + ((uint32_t*)(src+i*stride))[1]= b; 107.403 + } 107.404 +} 107.405 + 107.406 +static void pred8x8_horizontal_c(uint8_t *src, int stride){ 107.407 + int i; 107.408 + 107.409 + for(i=0; i<8; i++){ 107.410 + ((uint32_t*)(src+i*stride))[0]= 107.411 + ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101; 107.412 + } 107.413 +} 107.414 + 107.415 +static void pred8x8_128_dc_c(uint8_t *src, int stride){ 107.416 + int i; 107.417 + 107.418 + for(i=0; i<8; i++){ 107.419 + ((uint32_t*)(src+i*stride))[0]= 107.420 + ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U; 107.421 + } 107.422 +} 107.423 + 107.424 +static void pred8x8_left_dc_c(uint8_t *src, int stride){ 107.425 + int i; 107.426 + int dc0, dc2; 107.427 + 107.428 + dc0=dc2=0; 107.429 + for(i=0;i<4; i++){ 107.430 + dc0+= src[-1+i*stride]; 107.431 + dc2+= 
src[-1+(i+4)*stride]; 107.432 + } 107.433 + dc0= 0x01010101*((dc0 + 2)>>2); 107.434 + dc2= 0x01010101*((dc2 + 2)>>2); 107.435 + 107.436 + for(i=0; i<4; i++){ 107.437 + ((uint32_t*)(src+i*stride))[0]= 107.438 + ((uint32_t*)(src+i*stride))[1]= dc0; 107.439 + } 107.440 + for(i=4; i<8; i++){ 107.441 + ((uint32_t*)(src+i*stride))[0]= 107.442 + ((uint32_t*)(src+i*stride))[1]= dc2; 107.443 + } 107.444 +} 107.445 + 107.446 + 107.447 +static void pred8x8_top_dc_c(uint8_t *src, int stride){ 107.448 + int i; 107.449 + int dc0, dc1; 107.450 + 107.451 + dc0=dc1=0; 107.452 + for(i=0;i<4; i++){ 107.453 + dc0+= src[i-stride]; 107.454 + dc1+= src[4+i-stride]; 107.455 + } 107.456 + dc0= 0x01010101*((dc0 + 2)>>2); 107.457 + dc1= 0x01010101*((dc1 + 2)>>2); 107.458 + 107.459 + for(i=0; i<4; i++){ 107.460 + ((uint32_t*)(src+i*stride))[0]= dc0; 107.461 + ((uint32_t*)(src+i*stride))[1]= dc1; 107.462 + } 107.463 + for(i=4; i<8; i++){ 107.464 + ((uint32_t*)(src+i*stride))[0]= dc0; 107.465 + ((uint32_t*)(src+i*stride))[1]= dc1; 107.466 + } 107.467 +} 107.468 + 107.469 +static void pred8x8_dc_c(uint8_t *src, int stride){ 107.470 + int i; 107.471 + int dc0, dc1, dc2, dc3; 107.472 + 107.473 + dc0=dc1=dc2=0; 107.474 + for(i=0;i<4; i++){ 107.475 + dc0+= src[-1+i*stride] + src[i-stride]; 107.476 + dc1+= src[4+i-stride]; 107.477 + dc2+= src[-1+(i+4)*stride]; 107.478 + } 107.479 + dc3= 0x01010101*((dc1 + dc2 + 4)>>3); 107.480 + dc0= 0x01010101*((dc0 + 4)>>3); 107.481 + dc1= 0x01010101*((dc1 + 2)>>2); 107.482 + dc2= 0x01010101*((dc2 + 2)>>2); 107.483 + 107.484 + for(i=0; i<4; i++){ 107.485 + ((uint32_t*)(src+i*stride))[0]= dc0; 107.486 + ((uint32_t*)(src+i*stride))[1]= dc1; 107.487 + } 107.488 + for(i=4; i<8; i++){ 107.489 + ((uint32_t*)(src+i*stride))[0]= dc2; 107.490 + ((uint32_t*)(src+i*stride))[1]= dc3; 107.491 + } 107.492 +} 107.493 + 107.494 +//the following 4 function should not be optimized! 
107.495 +static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){ 107.496 + pred8x8_top_dc_c(src, stride); 107.497 + pred4x4_dc_c(src, NULL, stride); 107.498 +} 107.499 + 107.500 +static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){ 107.501 + pred8x8_dc_c(src, stride); 107.502 + pred4x4_top_dc_c(src, NULL, stride); 107.503 +} 107.504 + 107.505 +static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){ 107.506 + pred8x8_left_dc_c(src, stride); 107.507 + pred4x4_128_dc_c(src + 4*stride , NULL, stride); 107.508 + pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride); 107.509 +} 107.510 + 107.511 +static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){ 107.512 + pred8x8_left_dc_c(src, stride); 107.513 + pred4x4_128_dc_c(src , NULL, stride); 107.514 + pred4x4_128_dc_c(src + 4, NULL, stride); 107.515 +} 107.516 + 107.517 +static void pred8x8_plane_c(uint8_t *src, int stride){ 107.518 + int j, k; 107.519 + int a; 107.520 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 107.521 + const uint8_t * const src0 = src+3-stride; 107.522 + const uint8_t *src1 = src+4*stride-1; 107.523 + const uint8_t *src2 = src1-2*stride; // == src+2*stride-1; 107.524 + int H = src0[1] - src0[-1]; 107.525 + int V = src1[0] - src2[ 0]; 107.526 + for(k=2; k<=4; ++k) { 107.527 + src1 += stride; src2 -= stride; 107.528 + H += k*(src0[k] - src0[-k]); 107.529 + V += k*(src1[0] - src2[ 0]); 107.530 + } 107.531 + H = ( 17*H+16 ) >> 5; 107.532 + V = ( 17*V+16 ) >> 5; 107.533 + 107.534 + a = 16*(src1[0] + src2[8]+1) - 3*(V+H); 107.535 + for(j=8; j>0; --j) { 107.536 + int b = a; 107.537 + a += V; 107.538 + src[0] = cm[ (b ) >> 5 ]; 107.539 + src[1] = cm[ (b+ H) >> 5 ]; 107.540 + src[2] = cm[ (b+2*H) >> 5 ]; 107.541 + src[3] = cm[ (b+3*H) >> 5 ]; 107.542 + src[4] = cm[ (b+4*H) >> 5 ]; 107.543 + src[5] = cm[ (b+5*H) >> 5 ]; 107.544 + src[6] = cm[ (b+6*H) >> 5 ]; 107.545 + src[7] = cm[ (b+7*H) >> 5 ]; 107.546 + src += stride; 107.547 + } 107.548 +} 107.549 + 107.550 +#define SRC(x,y) 
src[(x)+(y)*stride] 107.551 +#define PL(y) \ 107.552 + const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; 107.553 +#define PREDICT_8x8_LOAD_LEFT \ 107.554 + const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \ 107.555 + + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ 107.556 + PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ 107.557 + const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 107.558 + 107.559 +#define PT(x) \ 107.560 + const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; 107.561 +#define PREDICT_8x8_LOAD_TOP \ 107.562 + const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \ 107.563 + + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ 107.564 + PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ 107.565 + const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \ 107.566 + + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 107.567 + 107.568 +#define PTR(x) \ 107.569 + t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; 107.570 +#define PREDICT_8x8_LOAD_TOPRIGHT \ 107.571 + int t8, t9, t10, t11, t12, t13, t14, t15; \ 107.572 + if(has_topright) { \ 107.573 + PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ 107.574 + t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ 107.575 + } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); 107.576 + 107.577 +#define PREDICT_8x8_LOAD_TOPLEFT \ 107.578 + const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 107.579 + 107.580 +#define PREDICT_8x8_DC(v) \ 107.581 + int y; \ 107.582 + for( y = 0; y < 8; y++ ) { \ 107.583 + ((uint32_t*)src)[0] = \ 107.584 + ((uint32_t*)src)[1] = v; \ 107.585 + src += stride; \ 107.586 + } 107.587 + 107.588 +static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.589 + (void) has_topleft; (void) has_topright; 107.590 + PREDICT_8x8_DC(0x80808080); 107.591 +} 107.592 + 107.593 +static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.594 + (void) has_topleft; (void) has_topright; 107.595 + 
PREDICT_8x8_LOAD_LEFT; 107.596 + const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; 107.597 + PREDICT_8x8_DC(dc); 107.598 +} 107.599 + 107.600 +static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.601 + PREDICT_8x8_LOAD_TOP; 107.602 + const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; 107.603 + PREDICT_8x8_DC(dc); 107.604 +} 107.605 + 107.606 +static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.607 + PREDICT_8x8_LOAD_LEFT; 107.608 + PREDICT_8x8_LOAD_TOP; 107.609 + const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7 107.610 + +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; 107.611 + PREDICT_8x8_DC(dc); 107.612 +} 107.613 + 107.614 +static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.615 + (void) has_topleft; (void) has_topright; 107.616 + PREDICT_8x8_LOAD_LEFT; 107.617 +#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\ 107.618 + ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y 107.619 + ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); 107.620 +#undef ROW 107.621 +} 107.622 + 107.623 +static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.624 + int y; 107.625 + PREDICT_8x8_LOAD_TOP; 107.626 + src[0] = t0; 107.627 + src[1] = t1; 107.628 + src[2] = t2; 107.629 + src[3] = t3; 107.630 + src[4] = t4; 107.631 + src[5] = t5; 107.632 + src[6] = t6; 107.633 + src[7] = t7; 107.634 + for( y = 1; y < 8; y++ ) 107.635 + *(uint64_t*)(src+y*stride) = *(uint64_t*)src; 107.636 +} 107.637 + 107.638 +static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.639 + PREDICT_8x8_LOAD_TOP; 107.640 + PREDICT_8x8_LOAD_TOPRIGHT; 107.641 + SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; 107.642 + SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; 107.643 + SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; 107.644 + 
SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; 107.645 + SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; 107.646 + SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; 107.647 + SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; 107.648 + SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; 107.649 + SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; 107.650 + SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; 107.651 + SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; 107.652 + SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; 107.653 + SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; 107.654 + SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; 107.655 + SRC(7,7)= (t14 + 3*t15 + 2) >> 2; 107.656 +} 107.657 + 107.658 +static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.659 + PREDICT_8x8_LOAD_TOP; 107.660 + PREDICT_8x8_LOAD_LEFT; 107.661 + PREDICT_8x8_LOAD_TOPLEFT; 107.662 + SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; 107.663 + SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; 107.664 + SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; 107.665 + SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; 107.666 + SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2; 107.667 + SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; 107.668 + SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; 107.669 + SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; 107.670 + SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; 107.671 + 
SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; 107.672 + SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; 107.673 + SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; 107.674 + SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; 107.675 + SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; 107.676 + SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; 107.677 +} 107.678 + 107.679 +static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.680 + PREDICT_8x8_LOAD_TOP; 107.681 + PREDICT_8x8_LOAD_LEFT; 107.682 + PREDICT_8x8_LOAD_TOPLEFT; 107.683 + SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; 107.684 + SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; 107.685 + SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; 107.686 + SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; 107.687 + SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; 107.688 + SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; 107.689 + SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; 107.690 + SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; 107.691 + SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; 107.692 + SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; 107.693 + SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; 107.694 + SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; 107.695 + SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; 107.696 + SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; 107.697 + SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; 107.698 + SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; 107.699 + SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; 107.700 + SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; 107.701 + SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2; 107.702 + SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; 107.703 + SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; 107.704 + SRC(7,0)= (t6 + t7 + 1) >> 1; 
107.705 +} 107.706 + 107.707 +static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.708 + PREDICT_8x8_LOAD_TOP; 107.709 + PREDICT_8x8_LOAD_LEFT; 107.710 + PREDICT_8x8_LOAD_TOPLEFT; 107.711 + SRC(0,7)= (l6 + l7 + 1) >> 1; 107.712 + SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; 107.713 + SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; 107.714 + SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; 107.715 + SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; 107.716 + SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; 107.717 + SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; 107.718 + SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; 107.719 + SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; 107.720 + SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; 107.721 + SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; 107.722 + SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; 107.723 + SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; 107.724 + SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; 107.725 + SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; 107.726 + SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; 107.727 + SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; 107.728 + SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; 107.729 + SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; 107.730 + SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; 107.731 + SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; 107.732 + SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; 107.733 +} 107.734 + 107.735 +static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.736 + PREDICT_8x8_LOAD_TOP; 107.737 + PREDICT_8x8_LOAD_TOPRIGHT; 107.738 + SRC(0,0)= (t0 + t1 + 1) >> 1; 107.739 + SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; 107.740 + SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; 107.741 + SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; 107.742 + 
SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; 107.743 + SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; 107.744 + SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; 107.745 + SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; 107.746 + SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; 107.747 + SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; 107.748 + SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; 107.749 + SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; 107.750 + SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; 107.751 + SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; 107.752 + SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; 107.753 + SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; 107.754 + SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; 107.755 + SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; 107.756 + SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; 107.757 + SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2; 107.758 + SRC(7,6)= (t10 + t11 + 1) >> 1; 107.759 + SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; 107.760 +} 107.761 + 107.762 +static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride){ 107.763 + (void) has_topleft; (void) has_topright; 107.764 + PREDICT_8x8_LOAD_LEFT; 107.765 + SRC(0,0)= (l0 + l1 + 1) >> 1; 107.766 + SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; 107.767 + SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; 107.768 + SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; 107.769 + SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; 107.770 + SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; 107.771 + SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; 107.772 + SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; 107.773 + SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; 107.774 + SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; 107.775 + SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= 
(l5 + l6 + 1) >> 1; 107.776 + SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; 107.777 + SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; 107.778 + SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; 107.779 + SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= 107.780 + SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= 107.781 + SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= 107.782 + SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; 107.783 +} 107.784 +#undef PREDICT_8x8_LOAD_LEFT 107.785 +#undef PREDICT_8x8_LOAD_TOP 107.786 +#undef PREDICT_8x8_LOAD_TOPLEFT 107.787 +#undef PREDICT_8x8_LOAD_TOPRIGHT 107.788 +#undef PREDICT_8x8_DC 107.789 +#undef PTR 107.790 +#undef PT 107.791 +#undef PL 107.792 +#undef SRC 107.793 + 107.794 +static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ 107.795 + int i; 107.796 + pix -= stride; 107.797 + for(i=0; i<4; i++){ 107.798 + uint8_t v = pix[0]; 107.799 + pix[1*stride]= v += block[0]; 107.800 + pix[2*stride]= v += block[4]; 107.801 + pix[3*stride]= v += block[8]; 107.802 + pix[4*stride]= v + block[12]; 107.803 + pix++; 107.804 + block++; 107.805 + } 107.806 +} 107.807 + 107.808 +static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ 107.809 + int i; 107.810 + for(i=0; i<4; i++){ 107.811 + uint8_t v = pix[-1]; 107.812 + pix[0]= v += block[0]; 107.813 + pix[1]= v += block[1]; 107.814 + pix[2]= v += block[2]; 107.815 + pix[3]= v + block[3]; 107.816 + pix+= stride; 107.817 + block+= 4; 107.818 + } 107.819 +} 107.820 + 107.821 +static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ 107.822 + int i; 107.823 + pix -= stride; 107.824 + for(i=0; i<8; i++){ 107.825 + uint8_t v = pix[0]; 107.826 + pix[1*stride]= v += block[0]; 107.827 + pix[2*stride]= v += block[8]; 107.828 + pix[3*stride]= v += block[16]; 107.829 + pix[4*stride]= v += block[24]; 107.830 + pix[5*stride]= v += block[32]; 107.831 + pix[6*stride]= v += block[40]; 107.832 
+ pix[7*stride]= v += block[48]; 107.833 + pix[8*stride]= v + block[56]; 107.834 + pix++; 107.835 + block++; 107.836 + } 107.837 +} 107.838 + 107.839 +static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ 107.840 + int i; 107.841 + for(i=0; i<8; i++){ 107.842 + uint8_t v = pix[-1]; 107.843 + pix[0]= v += block[0]; 107.844 + pix[1]= v += block[1]; 107.845 + pix[2]= v += block[2]; 107.846 + pix[3]= v += block[3]; 107.847 + pix[4]= v += block[4]; 107.848 + pix[5]= v += block[5]; 107.849 + pix[6]= v += block[6]; 107.850 + pix[7]= v + block[7]; 107.851 + pix+= stride; 107.852 + block+= 8; 107.853 + } 107.854 +} 107.855 + 107.856 +static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ 107.857 + int i; 107.858 + for(i=0; i<16; i++) 107.859 + pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); 107.860 +} 107.861 + 107.862 +static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ 107.863 + int i; 107.864 + for(i=0; i<16; i++) 107.865 + pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); 107.866 +} 107.867 + 107.868 +static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ 107.869 + int i; 107.870 + for(i=0; i<4; i++) 107.871 + pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); 107.872 +} 107.873 + 107.874 +static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ 107.875 + int i; 107.876 + for(i=0; i<4; i++) 107.877 + pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); 107.878 +} 107.879 + 107.880 + 107.881 +/** 107.882 + * Sets the intra prediction function pointers. 
107.883 + */ 107.884 +void ff_h264_pred_init(H264PredContext *h){ 107.885 + 107.886 + h->pred4x4[VERT_PRED ]= pred4x4_vertical_c; 107.887 + h->pred4x4[HOR_PRED ]= pred4x4_horizontal_c; 107.888 + h->pred4x4[DC_PRED ]= pred4x4_dc_c; 107.889 + h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c; 107.890 + h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c; 107.891 + h->pred4x4[VERT_RIGHT_PRED ]= pred4x4_vertical_right_c; 107.892 + h->pred4x4[HOR_DOWN_PRED ]= pred4x4_horizontal_down_c; 107.893 + h->pred4x4[VERT_LEFT_PRED ]= pred4x4_vertical_left_c; 107.894 + h->pred4x4[HOR_UP_PRED ]= pred4x4_horizontal_up_c; 107.895 + h->pred4x4[LEFT_DC_PRED ]= pred4x4_left_dc_c; 107.896 + h->pred4x4[TOP_DC_PRED ]= pred4x4_top_dc_c; 107.897 + h->pred4x4[DC_128_PRED ]= pred4x4_128_dc_c; 107.898 + 107.899 + h->pred8x8l[VERT_PRED ]= pred8x8l_vertical_c; 107.900 + h->pred8x8l[HOR_PRED ]= pred8x8l_horizontal_c; 107.901 + h->pred8x8l[DC_PRED ]= pred8x8l_dc_c; 107.902 + h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c; 107.903 + h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c; 107.904 + h->pred8x8l[VERT_RIGHT_PRED ]= pred8x8l_vertical_right_c; 107.905 + h->pred8x8l[HOR_DOWN_PRED ]= pred8x8l_horizontal_down_c; 107.906 + h->pred8x8l[VERT_LEFT_PRED ]= pred8x8l_vertical_left_c; 107.907 + h->pred8x8l[HOR_UP_PRED ]= pred8x8l_horizontal_up_c; 107.908 + h->pred8x8l[LEFT_DC_PRED ]= pred8x8l_left_dc_c; 107.909 + h->pred8x8l[TOP_DC_PRED ]= pred8x8l_top_dc_c; 107.910 + h->pred8x8l[DC_128_PRED ]= pred8x8l_128_dc_c; 107.911 + 107.912 + h->pred8x8[VERT_PRED8x8 ]= pred8x8_vertical_c; 107.913 + h->pred8x8[HOR_PRED8x8 ]= pred8x8_horizontal_c; 107.914 + h->pred8x8[PLANE_PRED8x8 ]= pred8x8_plane_c; 107.915 + 107.916 + h->pred8x8[DC_PRED8x8 ]= pred8x8_dc_c; 107.917 + h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c; 107.918 + h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c; 107.919 + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= pred8x8_mad_cow_dc_l0t; 107.920 + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= 
pred8x8_mad_cow_dc_0lt; 107.921 + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= pred8x8_mad_cow_dc_l00; 107.922 + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= pred8x8_mad_cow_dc_0l0; 107.923 + 107.924 + h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c; 107.925 + 107.926 + h->pred16x16[DC_PRED8x8 ]= pred16x16_dc_c; 107.927 + h->pred16x16[VERT_PRED8x8 ]= pred16x16_vertical_c; 107.928 + h->pred16x16[HOR_PRED8x8 ]= pred16x16_horizontal_c; 107.929 + h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_c; 107.930 + 107.931 + h->pred16x16[PLANE_PRED8x8 ]= pred16x16_plane_c; 107.932 + 107.933 + h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c; 107.934 + h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c; 107.935 + h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c; 107.936 + 107.937 + //special lossless h/v prediction for h264 107.938 + h->pred4x4_add [VERT_PRED ]= pred4x4_vertical_add_c; 107.939 + h->pred4x4_add [ HOR_PRED ]= pred4x4_horizontal_add_c; 107.940 + h->pred8x8l_add [VERT_PRED ]= pred8x8l_vertical_add_c; 107.941 + h->pred8x8l_add [ HOR_PRED ]= pred8x8l_horizontal_add_c; 107.942 + h->pred8x8_add [VERT_PRED8x8]= pred8x8_vertical_add_c; 107.943 + h->pred8x8_add [ HOR_PRED8x8]= pred8x8_horizontal_add_c; 107.944 + h->pred16x16_add[VERT_PRED8x8]= pred16x16_vertical_add_c; 107.945 + h->pred16x16_add[ HOR_PRED8x8]= pred16x16_horizontal_add_c; 107.946 + 107.947 + if (HAVE_NEON) ff_h264_pred_init_arm(h); 107.948 +}
/* diff hunk: --- /dev/null  +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pred.h  @@ -0,0 +1,90 @@ */
/*
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * H.264 / AVC / MPEG4 prediction functions.
 * @author Michael Niedermayer <michaelni@gmx.at>
 */

#ifndef AVCODEC_H264PRED_H
#define AVCODEC_H264PRED_H

#include "libavutil/common.h"
#include "dsputil.h"

/**
 * Prediction types
 *
 * These constants are used as indices into the function pointer tables of
 * H264PredContext. The first group (0..14) indexes pred4x4 / pred8x8l,
 * the *_PRED8x8 group (0..10) indexes pred8x8 / pred16x16.
 */
//@{
#define VERT_PRED             0
#define HOR_PRED              1
#define DC_PRED               2
#define DIAG_DOWN_LEFT_PRED   3
#define DIAG_DOWN_RIGHT_PRED  4
#define VERT_RIGHT_PRED       5
#define HOR_DOWN_PRED         6
#define VERT_LEFT_PRED        7
#define HOR_UP_PRED           8

#define LEFT_DC_PRED          9
#define TOP_DC_PRED           10
#define DC_128_PRED           11

#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN   12
#define HOR_UP_PRED_RV40_NODOWN           13
#define VERT_LEFT_PRED_RV40_NODOWN        14

/* note: the 8x8 group orders DC/HOR/VERT differently from the 4x4 group */
#define DC_PRED8x8            0
#define HOR_PRED8x8           1
#define VERT_PRED8x8          2
#define PLANE_PRED8x8         3

#define LEFT_DC_PRED8x8       4
#define TOP_DC_PRED8x8        5
#define DC_128_PRED8x8        6

#define ALZHEIMER_DC_L0T_PRED8x8 7
#define ALZHEIMER_DC_0LT_PRED8x8 8
#define ALZHEIMER_DC_L00_PRED8x8 9
#define ALZHEIMER_DC_0L0_PRED8x8 10
//@}

/**
 * Context for storing H.264 prediction functions
 *
 * Each member is a table of function pointers indexed by the prediction
 * type constants above; the table sizes match the highest constant of the
 * corresponding group plus one.
 */
typedef struct H264PredContext{
    /* 15 entries: indices 0..14 of the first constant group */
    void (*pred4x4  [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
    /* 12 entries: indices 0..11 (VERT_PRED .. DC_128_PRED) */
    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
    /* 11 entries: indices 0..10 of the *_PRED8x8 group */
    void (*pred8x8  [4+3+4])(uint8_t *src, int stride);
    /* 7 entries: indices 0..6 (DC_PRED8x8 .. DC_128_PRED8x8) */
    void (*pred16x16[4+3])(uint8_t *src, int stride);

    /* prediction with the residual added in place; ff_h264_pred_init fills
     * only the VERT_* / HOR_* slots of these tables */
    void (*pred4x4_add  [2])(uint8_t *pix/*align  4*/, const DCTELEM *block/*align 16*/, int stride);
    void (*pred8x8l_add [2])(uint8_t *pix/*align  8*/, const DCTELEM *block/*align 16*/, int stride);
    /* size 3 because VERT_PRED8x8 is 2 */
    void (*pred8x8_add  [3])(uint8_t *pix/*align  8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
    void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
}H264PredContext;

/* fills every table of h with the C implementations */
void ff_h264_pred_init(H264PredContext *h);
/* ARM-specific initialisation; called from ff_h264_pred_init when HAVE_NEON is set */
void ff_h264_pred_init_arm(H264PredContext *h);


#endif /* AVCODEC_H264PRED_H */
109.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 109.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.c Mon Aug 27 12:09:56 2012 +0200 109.3 @@ -0,0 +1,1013 @@ 109.4 +/* 109.5 + * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding 109.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 109.7 + * 109.8 + * This file is part of FFmpeg. 109.9 + * 109.10 + * FFmpeg is free software; you can redistribute it and/or 109.11 + * modify it under the terms of the GNU Lesser General Public 109.12 + * License as published by the Free Software Foundation; either 109.13 + * version 2.1 of the License, or (at your option) any later version. 109.14 + * 109.15 + * FFmpeg is distributed in the hope that it will be useful, 109.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 109.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 109.18 + * Lesser General Public License for more details. 109.19 + * 109.20 + * You should have received a copy of the GNU Lesser General Public 109.21 + * License along with FFmpeg; if not, write to the Free Software 109.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 109.23 + */ 109.24 + 109.25 +/** 109.26 + * @file 109.27 + * H.264 / AVC / MPEG4 part10 direct mb/block decoding. 
109.28 + * @author Michael Niedermayer <michaelni@gmx.at> 109.29 + */ 109.30 + 109.31 +#include "dsputil.h" 109.32 +#include "avcodec.h" 109.33 +#include "h264_data.h" 109.34 +#include "h264.h" 109.35 +#include "rectangle.h" 109.36 + 109.37 +//#undef NDEBUG 109.38 +#include <assert.h> 109.39 + 109.40 +static const uint8_t left_block_options[4][16]={ 109.41 + {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, 109.42 + {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, 109.43 + {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}, 109.44 + {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8} 109.45 +}; 109.46 + 109.47 + 109.48 +// static void check_cache_copy(MBRecContext *mrc, H264Slice *s, H264Mb *m){ 109.49 +// for (int list=0; list<2; list++){ 109.50 +// for (int i=0; i<40; i++){ 109.51 +// assert (m->ref_cache[list][i] == m->ref_cache_copy[list][i]); 109.52 +// assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy[list][i][0]); 109.53 +// assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy[list][i][1]); 109.54 +// } 109.55 +// } 109.56 +// } 109.57 + 109.58 +// static void check_cache_copy2(MBRecContext *mrc, H264Slice *s, H264Mb *m){ 109.59 +// for (int list=0; list<2; list++){ 109.60 +// for (int i=0; i<40; i++){ 109.61 +// assert (m->ref_cache[list][i] == m->ref_cache_copy2[list][i]); 109.62 +// assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy2[list][i][0]); 109.63 +// assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy2[list][i][1]); 109.64 +// } 109.65 +// } 109.66 +// } 109.67 + 109.68 +static void fill_decode_caches_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){ 109.69 + int topleft_type, top_type, topright_type, left_type; 109.70 + const uint8_t * left_block= left_block_options[0]; 109.71 + const int mb_x = m->mb_x; 109.72 + int i; 109.73 + 109.74 + mrs->top_type = mrs->mb_type_top[mb_x ]; 109.75 + mrs->left_type 
= mrs->mb_type [mb_x-1]; 109.76 + 109.77 + topleft_type = mrs->mb_type_top[mb_x-1]; 109.78 + top_type = mrs->mb_type_top[mb_x ]; 109.79 + topright_type= mrs->mb_type_top[mb_x+1]; 109.80 + left_type = mrs->mb_type [mb_x-1]; 109.81 + 109.82 + int type_mask= s->pps.constrained_intra_pred ? 1 : -1; 109.83 + 109.84 + if(!IS_SKIP(mb_type)){ 109.85 +// memset(mrc->non_zero_count_cache, 0, sizeof(mrc->non_zero_count_cache)); 109.86 + AV_COPY32(&mrs->non_zero_count_cache[4+8*1], &m->non_zero_count[ 0]); 109.87 + AV_COPY32(&mrs->non_zero_count_cache[4+8*2], &m->non_zero_count[ 4]); 109.88 + AV_COPY32(&mrs->non_zero_count_cache[4+8*3], &m->non_zero_count[ 8]); 109.89 + AV_COPY32(&mrs->non_zero_count_cache[4+8*4], &m->non_zero_count[12]); 109.90 + 109.91 + for (int i=0; i<2; i++) { 109.92 + mrs->non_zero_count_cache[8*1 + 8*i + 1] = m->non_zero_count[16 + i*2 ]; 109.93 + mrs->non_zero_count_cache[8*1 + 8*i + 2] = m->non_zero_count[16 + i*2 +1]; 109.94 + mrs->non_zero_count_cache[8*4 + 8*i + 1] = m->non_zero_count[20 + i*2 ]; 109.95 + mrs->non_zero_count_cache[8*4 + 8*i + 2] = m->non_zero_count[20 + i*2 +1]; 109.96 + } 109.97 + 109.98 + if(IS_INTRA(mb_type)){ 109.99 +// memset(mrc->intra4x4_pred_mode_cache, 0, sizeof(mrc->intra4x4_pred_mode_cache)); 109.100 + 109.101 + mrs->topleft_samples_available= 109.102 + mrs->top_samples_available= 109.103 + mrs->left_samples_available= 0xFFFF; 109.104 + mrs->topright_samples_available= 0xEEEA; 109.105 + 109.106 + if(!(top_type & type_mask)){ 109.107 + mrs->topleft_samples_available= 0xB3FF; 109.108 + mrs->top_samples_available= 0x33FF; 109.109 + mrs->topright_samples_available= 0x26EA; 109.110 + } 109.111 + 109.112 + if(!(left_type & type_mask)){ 109.113 + mrs->topleft_samples_available&= 0xDF5F; 109.114 + mrs->left_samples_available&= 0x5F5F; 109.115 + } 109.116 + 109.117 + if(!(topleft_type & type_mask)) 109.118 + mrs->topleft_samples_available&= 0x7FFF; 109.119 + 109.120 + if(!(topright_type & type_mask)) 109.121 + 
mrs->topright_samples_available&= 0xFBFF; 109.122 + 109.123 + if(IS_INTRA4x4(mb_type)){ 109.124 + if(IS_INTRA4x4(top_type)){ 109.125 + AV_COPY32(mrs->intra4x4_pred_mode_cache+4+8*0, &mrs->intra4x4_pred_mode_top[4*mb_x]); 109.126 + }else{ 109.127 + mrs->intra4x4_pred_mode_cache[4+8*0]= 109.128 + mrs->intra4x4_pred_mode_cache[5+8*0]= 109.129 + mrs->intra4x4_pred_mode_cache[6+8*0]= 109.130 + mrs->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask); 109.131 + } 109.132 + 109.133 + if(IS_INTRA4x4(left_type)){ 109.134 +#if OMPSS 109.135 + mrs->intra4x4_pred_mode_cache[3+8*1]= m->intra4x4_pred_mode_left[0]; 109.136 + mrs->intra4x4_pred_mode_cache[3+8*2]= m->intra4x4_pred_mode_left[1]; 109.137 + mrs->intra4x4_pred_mode_cache[3+8*3]= m->intra4x4_pred_mode_left[2]; 109.138 + mrs->intra4x4_pred_mode_cache[3+8*4]= m->intra4x4_pred_mode_left[3]; 109.139 +#else 109.140 + mrs->intra4x4_pred_mode_cache[3+8*1]= mrs->intra4x4_pred_mode_left[0]; 109.141 + mrs->intra4x4_pred_mode_cache[3+8*2]= mrs->intra4x4_pred_mode_left[1]; 109.142 + mrs->intra4x4_pred_mode_cache[3+8*3]= mrs->intra4x4_pred_mode_left[2]; 109.143 + mrs->intra4x4_pred_mode_cache[3+8*4]= mrs->intra4x4_pred_mode_left[3]; 109.144 +#endif 109.145 + }else{ 109.146 + mrs->intra4x4_pred_mode_cache[3+8*1]= 109.147 + mrs->intra4x4_pred_mode_cache[3+8*2]= 109.148 + mrs->intra4x4_pred_mode_cache[3+8*3]= 109.149 + mrs->intra4x4_pred_mode_cache[3+8*4]= 2 - 3*!(left_type & type_mask); 109.150 + } 109.151 + } 109.152 + } 109.153 + } 109.154 + 109.155 + if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){ 109.156 + int list; 109.157 + 109.158 +// memset(mrs->mv_cache, 0, sizeof(mrs->mv_cache)); 109.159 +// memset(mrs->ref_cache, 0, sizeof(mrs->ref_cache)); 109.160 + 109.161 + mrs->ref_cache[0][scan8[5 ]+1] = mrs->ref_cache[0][scan8[7 ]+1] = mrs->ref_cache[0][scan8[13]+1] = 109.162 + mrs->ref_cache[1][scan8[5 ]+1] = mrs->ref_cache[1][scan8[7 ]+1] = mrs->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE; 
109.163 + 109.164 + for(list=0; list<s->list_count; list++){ 109.165 + if(!USES_LIST(mb_type, list)){ 109.166 + continue; 109.167 + } 109.168 + assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred)); 109.169 + 109.170 + if(USES_LIST(top_type, list)){ 109.171 + const int b_xy= 4*mb_x + 3*mrc->b_stride; 109.172 + AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]); 109.173 + mrs->ref_cache[list][scan8[0] + 0 - 1*8]= 109.174 + mrs->ref_cache[list][scan8[0] + 1 - 1*8]= mrs->ref_index_top[list][4*mb_x + 2]; 109.175 + mrs->ref_cache[list][scan8[0] + 2 - 1*8]= 109.176 + mrs->ref_cache[list][scan8[0] + 3 - 1*8]= mrs->ref_index_top[list][4*mb_x + 3]; 109.177 + }else{ 109.178 + AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]); 109.179 + AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101); 109.180 + } 109.181 + 109.182 + if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){ 109.183 + for(i=0; i<2; i++){ 109.184 + int cache_idx = scan8[0] - 1 + i*2*8; 109.185 + if(USES_LIST(left_type, list)){ 109.186 + const int b_xy= 4*(mb_x-1) + 3; 109.187 + const int b8_x= 4*(mb_x-1) + 1; 109.188 + AV_COPY32(mrs->mv_cache[list][cache_idx ], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[0+i*2]]); 109.189 + AV_COPY32(mrs->mv_cache[list][cache_idx+8], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[1+i*2]]); 109.190 + mrs->ref_cache[list][cache_idx ]= mrs->ref_index[list][b8_x + (left_block[0+i*2]&~1)]; 109.191 + mrs->ref_cache[list][cache_idx+8]= mrs->ref_index[list][b8_x + (left_block[1+i*2]&~1)]; 109.192 + }else{ 109.193 + AV_ZERO32(mrs->mv_cache [list][cache_idx ]); 109.194 + AV_ZERO32(mrs->mv_cache [list][cache_idx+8]); 109.195 + mrs->ref_cache[list][cache_idx ]= 109.196 + mrs->ref_cache[list][cache_idx+8]= (left_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE); 109.197 + } 109.198 + } 109.199 + }else{ 109.200 + if(USES_LIST(left_type, list)){ 109.201 + const int b_x = 4*(mb_x-1) + 3; 109.202 + const int b8_x= 4*(mb_x-1) + 1; 109.203 + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1], mrs->motion_val[list][b_x + mrc->b_stride*left_block[0]]); 109.204 + mrs->ref_cache[list][scan8[0] - 1]= mrs->ref_index[list][b8_x + (left_block[0]&~1)]; 109.205 + }else{ 109.206 + AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1]); 109.207 + mrs->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; 109.208 + } 109.209 + } 109.210 + 109.211 + if(USES_LIST(topright_type, list)){ 109.212 + const int b_xy= 4*(mb_x+1) + 3*mrc->b_stride; 109.213 + AV_COPY32(mrs->mv_cache[list][scan8[0] + 4 - 1*8], mrs->motion_val_top[list][b_xy]); 109.214 + mrs->ref_cache[list][scan8[0] + 4 - 1*8]= mrs->ref_index_top[list][4*(mb_x+1) + 2]; 109.215 + }else{ 109.216 + AV_ZERO32(mrs->mv_cache [list][scan8[0] + 4 - 1*8]); 109.217 + mrs->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE; 109.218 + } 109.219 + if(mrs->ref_cache[list][scan8[0] + 4 - 1*8] < 0){ 109.220 + int topleft_partition= -1; 109.221 + if(USES_LIST(topleft_type, list)){ 109.222 + const int b_xy = 4*(mb_x-1) + 3 + mrc->b_stride + (topleft_partition & 2*mrc->b_stride); 109.223 + const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2); 109.224 + AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 - 1*8], mrs->motion_val_top[list][b_xy]); 109.225 + mrs->ref_cache[list][scan8[0] - 1 - 1*8]= mrs->ref_index_top[list][b8_x]; 109.226 + }else{ 109.227 + AV_ZERO32(mrs->mv_cache[list][scan8[0] - 1 - 1*8]); 109.228 + mrs->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? 
LIST_NOT_USED : PART_NOT_AVAILABLE; 109.229 + } 109.230 + } 109.231 + 109.232 + if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) 109.233 + continue; 109.234 + 109.235 + if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) { 109.236 + mrs->ref_cache[list][scan8[4 ]] = 109.237 + mrs->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE; 109.238 + AV_ZERO32(mrs->mv_cache [list][scan8[4 ]]); 109.239 + AV_ZERO32(mrs->mv_cache [list][scan8[12]]); 109.240 + } 109.241 + } 109.242 + } 109.243 +} 109.244 + 109.245 +static inline void write_back_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){ 109.246 + const int b_stride = mrc->b_stride; 109.247 + const int b_x = 4*m->mb_x; //try mb2b(8)_xy 109.248 + const int b8_x= 4*m->mb_x; 109.249 + int list; 109.250 + 109.251 + if(!USES_LIST(mb_type, 0)) 109.252 + fill_rectangle(&mrs->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1); 109.253 + 109.254 + for(list=0; list<s->list_count; list++){ 109.255 + int y; 109.256 + int16_t (*mv_dst)[2]; 109.257 + int16_t (*mv_src)[2]; 109.258 + 109.259 + if(!USES_LIST(mb_type, list)) 109.260 + continue; 109.261 + 109.262 + mv_dst = &mrs->motion_val[list][b_x]; 109.263 + mv_src = &mrs->mv_cache[list][scan8[0]]; 109.264 + for(y=0; y<4; y++){ 109.265 + AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); 109.266 + } 109.267 + 109.268 + { 109.269 + int8_t *ref_index = &mrs->ref_index[list][b8_x]; 109.270 + ref_index[0+0*2]= mrs->ref_cache[list][scan8[0]]; 109.271 + ref_index[1+0*2]= mrs->ref_cache[list][scan8[4]]; 109.272 + ref_index[0+1*2]= mrs->ref_cache[list][scan8[8]]; 109.273 + ref_index[1+1*2]= mrs->ref_cache[list][scan8[12]]; 109.274 + } 109.275 + } 109.276 +} 109.277 + 109.278 + 109.279 +/** 109.280 +* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks. 
109.281 +*/ 109.282 +static int check_intra4x4_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){ 109.283 + static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0}; 109.284 + static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED}; 109.285 + int i; 109.286 + 109.287 + if(!(mrs->top_samples_available&0x8000)){ 109.288 + for(i=0; i<4; i++){ 109.289 + int status= top[ mrs->intra4x4_pred_mode_cache[scan8[0] + i] ]; 109.290 + if(status<0){ 109.291 + av_log(AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y); 109.292 + return -1; 109.293 + } else if(status){ 109.294 + mrs->intra4x4_pred_mode_cache[scan8[0] + i]= status; 109.295 + } 109.296 + } 109.297 + } 109.298 + 109.299 + if((mrs->left_samples_available&0x8888)!=0x8888){ 109.300 + static const int mask[4]={0x8000,0x2000,0x80,0x20}; 109.301 + for(i=0; i<4; i++){ 109.302 + if(!(mrs->left_samples_available&mask[i])){ 109.303 + int status= left[ mrs->intra4x4_pred_mode_cache[scan8[0] + 8*i] ]; 109.304 + if(status<0){ 109.305 + av_log(AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y); 109.306 + return -1; 109.307 + } else if(status){ 109.308 + mrs->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status; 109.309 + } 109.310 + } 109.311 + } 109.312 + } 109.313 + return 0; 109.314 +} 109.315 + 109.316 +/** 109.317 +* checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks. 
109.318 +*/ 109.319 +static int check_intra_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mode){ 109.320 + static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1}; 109.321 + static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8}; 109.322 + 109.323 + if(mode > 6) { 109.324 + av_log(AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y); 109.325 + return -1; 109.326 + } 109.327 + 109.328 + if(!(mrs->top_samples_available&0x8000)){ 109.329 + mode= top[ mode ]; 109.330 + if(mode<0){ 109.331 + av_log(AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y); 109.332 + return -1; 109.333 + } 109.334 + } 109.335 + 109.336 + if((mrs->left_samples_available&0x8080) != 0x8080){ 109.337 + mode= left[ mode ]; 109.338 + if(mrs->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred 109.339 + mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(mrs->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8); 109.340 + } 109.341 + if(mode<0){ 109.342 + av_log(AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y); 109.343 + return -1; 109.344 + } 109.345 + } 109.346 + return mode; 109.347 +} 109.348 + 109.349 +/** 109.350 + * gets the predicted intra4x4 prediction mode. 
109.351 + */ 109.352 +static inline int pred_intra_mode(MBRecContext *mrc, MBRecState *mrs, int n){ 109.353 + const int index8= scan8[n]; 109.354 + const int left= mrs->intra4x4_pred_mode_cache[index8 - 1]; 109.355 + const int top = mrs->intra4x4_pred_mode_cache[index8 - 8]; 109.356 + const int min= FFMIN(left, top); 109.357 + 109.358 + if(min<0) return DC_PRED; 109.359 + else return min; 109.360 +} 109.361 + 109.362 +static void write_back_intra_pred_mode_rec(MBRecContext *mrc, MBRecState *mrs, H264Mb *m, int mb_x){ 109.363 + int8_t *mode= &mrs->intra4x4_pred_mode[4*mb_x]; 109.364 + 109.365 + AV_COPY32(mode, mrs->intra4x4_pred_mode_cache + 4 + 8*4); 109.366 +#if OMPSS 109.367 + if (m->mb_x < mrc->mb_width-1){ 109.368 + H264Mb *mr= m+1; 109.369 + mode = mr->intra4x4_pred_mode_left; 109.370 + mode[0]= mrs->intra4x4_pred_mode_cache[7+8*1]; 109.371 + mode[1]= mrs->intra4x4_pred_mode_cache[7+8*2]; 109.372 + mode[2]= mrs->intra4x4_pred_mode_cache[7+8*3]; 109.373 + mode[3]= mrs->intra4x4_pred_mode_cache[7+8*4]; 109.374 + } 109.375 +#else 109.376 + mode = mrs->intra4x4_pred_mode_left; 109.377 + mode[0]= mrs->intra4x4_pred_mode_cache[7+8*1]; 109.378 + mode[1]= mrs->intra4x4_pred_mode_cache[7+8*2]; 109.379 + mode[2]= mrs->intra4x4_pred_mode_cache[7+8*3]; 109.380 + mode[3]= mrs->intra4x4_pred_mode_cache[7+8*4]; 109.381 +#endif 109.382 +} 109.383 + 109.384 +static void pred_spatial_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){ 109.385 + int b4_stride = mrc->b_stride; 109.386 + const int mb_x = m->mb_x; 109.387 + int mb_type_col[2]; 109.388 + const int16_t (*l1mv0)[2], (*l1mv1)[2]; 109.389 + const int8_t *l1ref0, *l1ref1; 109.390 + const int is_b8x8 = IS_8X8(*mb_type); 109.391 + unsigned int sub_mb_type= MB_TYPE_L0L1; 109.392 + int i8, i4; 109.393 + int ref[2]; 109.394 + int mv[2]; 109.395 + int list; 109.396 + 109.397 + //assert(h->ref_list[1][0].reference&3); 109.398 + 109.399 +#define MB_TYPE_16x16_OR_INTRA 
(MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM) 109.400 + 109.401 + /* ref = min(neighbors) */ 109.402 + for(list=0; list<2; list++){ 109.403 + int left_ref = mrs->ref_cache[list][scan8[0] - 1]; 109.404 + int top_ref = mrs->ref_cache[list][scan8[0] - 8]; 109.405 + int refc = mrs->ref_cache[list][scan8[0] - 8 + 4]; 109.406 + const int16_t *C= mrs->mv_cache[list][ scan8[0] - 8 + 4]; 109.407 + if(refc == PART_NOT_AVAILABLE){ 109.408 + refc = mrs->ref_cache[list][scan8[0] - 8 - 1]; 109.409 + C = mrs->mv_cache[list][scan8[0] - 8 - 1]; 109.410 + } 109.411 + ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); 109.412 + if(ref[list] >= 0){ 109.413 + //this is just pred_motion() but with the cases removed that cannot happen for direct blocks 109.414 + const int16_t * const A= mrs->mv_cache[list][ scan8[0] - 1 ]; 109.415 + const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ]; 109.416 + 109.417 + int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]); 109.418 + if(match_count > 1){ //most common 109.419 + mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]), 109.420 + mid_pred(A[1], B[1], C[1]) ); 109.421 + }else { 109.422 + assert(match_count==1); 109.423 + if(left_ref==ref[list]){ 109.424 + mv[list]= AV_RN32A(A); 109.425 + }else if(top_ref==ref[list]){ 109.426 + mv[list]= AV_RN32A(B); 109.427 + }else{ 109.428 + mv[list]= AV_RN32A(C); 109.429 + } 109.430 + } 109.431 + }else{ 109.432 + int mask= ~(MB_TYPE_L0 << (2*list)); 109.433 + mv[list] = 0; 109.434 + ref[list] = -1; 109.435 + if(!is_b8x8) 109.436 + *mb_type &= mask; 109.437 + sub_mb_type &= mask; 109.438 + } 109.439 + } 109.440 + 109.441 + if(ref[0] < 0 && ref[1] < 0){ 109.442 + ref[0] = ref[1] = 0; 109.443 + if(!is_b8x8) 109.444 + *mb_type |= MB_TYPE_L0L1; 109.445 + sub_mb_type |= MB_TYPE_L0L1; 109.446 + } 109.447 + 109.448 + if(!(is_b8x8|mv[0]|mv[1])){ 109.449 + fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); 109.450 + 
fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); 109.451 + fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); 109.452 + fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); 109.453 + *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; 109.454 + return; 109.455 + } 109.456 + 109.457 + mb_type_col[0] = 109.458 + mb_type_col[1] = mrs->list1_mb_type[mb_x]; 109.459 + 109.460 + sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ 109.461 + if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ 109.462 + *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */ 109.463 + }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ 109.464 + *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); 109.465 + }else{ 109.466 + if(!s->direct_8x8_inference_flag){ 109.467 + /* FIXME save sub mb types from previous frames (or derive from MVs) 109.468 + * so we know exactly what block size to use */ 109.469 + sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */ 109.470 + } 109.471 + *mb_type |= MB_TYPE_8x8; 109.472 + } 109.473 + 109.474 + l1mv0 = (void *) &mrs->list1_motion_val[0][4*mb_x]; 109.475 + l1mv1 = (void *) &mrs->list1_motion_val[1][4*mb_x]; 109.476 + l1ref0 = &mrs->list1_ref_index [0][4*mb_x]; 109.477 + l1ref1 = &mrs->list1_ref_index [1][4*mb_x]; 109.478 +// if(!b8_stride){ 109.479 +// if(m->mb_y&1){ 109.480 +// l1ref0 += 2; 109.481 +// l1ref1 += 2; 109.482 +// l1mv0 += 2*b4_stride; 109.483 +// l1mv1 += 2*b4_stride; 109.484 +// } 109.485 +// } 109.486 + 109.487 + if(IS_16X16(*mb_type)){ 109.488 + int a,b; 109.489 + 109.490 + fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); 109.491 + fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); 109.492 + if(!IS_INTRA(mb_type_col[0]) && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1) 109.493 + || 
(l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1 109.494 + ))){ 109.495 + a=b=0; 109.496 + if(ref[0] > 0) 109.497 + a= mv[0]; 109.498 + if(ref[1] > 0) 109.499 + b= mv[1]; 109.500 + }else{ 109.501 + a= mv[0]; 109.502 + b= mv[1]; 109.503 + } 109.504 + fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, a, 4); 109.505 + fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, b, 4); 109.506 + }else{ 109.507 + int n=0; 109.508 + for(i8=0; i8<4; i8++){ 109.509 + const int x8 = i8&1; 109.510 + const int y8 = i8>>1; 109.511 + 109.512 + if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) 109.513 + continue; 109.514 + m->sub_mb_type[i8] = sub_mb_type; 109.515 + 109.516 + fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4); 109.517 + fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4); 109.518 + fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1); 109.519 + fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1); 109.520 + 109.521 + /* col_zero_flag */ 109.522 + if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 )) 109.523 + ){ 109.524 + const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? 
l1mv0 : l1mv1; 109.525 + if(IS_SUB_8X8(sub_mb_type)){ 109.526 + const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; 109.527 + if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ 109.528 + if(ref[0] == 0) 109.529 + fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); 109.530 + if(ref[1] == 0) 109.531 + fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); 109.532 + n+=4; 109.533 + } 109.534 + }else{ 109.535 + int k=0; 109.536 + for(i4=0; i4<4; i4++){ 109.537 + const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; 109.538 + if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){ 109.539 + if(ref[0] == 0) 109.540 + AV_ZERO32(mrs->mv_cache[0][scan8[i8*4+i4]]); 109.541 + if(ref[1] == 0) 109.542 + AV_ZERO32(mrs->mv_cache[1][scan8[i8*4+i4]]); 109.543 + k++; 109.544 + } 109.545 + } 109.546 + if(!(k&3)) 109.547 + m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8; 109.548 + n+=k; 109.549 + } 109.550 + } 109.551 + } 109.552 + if(!is_b8x8 && !(n&15)){ 109.553 + *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; 109.554 + } 109.555 + } 109.556 +} 109.557 + 109.558 +static void pred_temp_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){ 109.559 + const int mb_x = m->mb_x; 109.560 + int b4_stride = mrc->b_stride; 109.561 + int mb_type_col[2]; 109.562 + const int16_t (*l1mv0)[2], (*l1mv1)[2]; 109.563 + const int8_t *l1ref0, *l1ref1; 109.564 + const int is_b8x8 = IS_8X8(*mb_type); 109.565 + unsigned int sub_mb_type; 109.566 + int i8, i4; 109.567 + const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]}; 109.568 + const int *dist_scale_factor = s->dist_scale_factor; 109.569 + 109.570 + mb_type_col[0] = 109.571 + mb_type_col[1] = mrs->list1_mb_type[mb_x]; 109.572 + 109.573 + sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ 109.574 + if(!is_b8x8 && (mb_type_col[0] & 
MB_TYPE_16x16_OR_INTRA)){ 109.575 + *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */ 109.576 + }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ 109.577 + *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); 109.578 + }else{ 109.579 + if(!s->direct_8x8_inference_flag){ 109.580 + /* FIXME save sub mb types from previous frames (or derive from MVs) 109.581 + * so we know exactly what block size to use */ 109.582 + sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */ 109.583 + } 109.584 + *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; 109.585 + } 109.586 + 109.587 + l1mv0 = (void *) &mrs->list1_motion_val[0][4*mb_x]; 109.588 + l1mv1 = (void *) &mrs->list1_motion_val[1][4*mb_x]; 109.589 + l1ref0 = &mrs->list1_ref_index [0][4*mb_x]; 109.590 + l1ref1 = &mrs->list1_ref_index [1][4*mb_x]; 109.591 + 109.592 + /* one-to-one mv scaling */ 109.593 + if(IS_16X16(*mb_type)){ 109.594 + int ref, mv0, mv1; 109.595 + 109.596 + fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1); 109.597 + if(IS_INTRA(mb_type_col[0])){ 109.598 + ref=mv0=mv1=0; 109.599 + }else{ 109.600 + const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]] 109.601 + : map_col_to_list0[1][l1ref1[0]]; 109.602 + const int scale = dist_scale_factor[ref0]; 109.603 + const int16_t *mv_col = l1ref0[0] >= 0 ? 
l1mv0[0] : l1mv1[0]; 109.604 + int mv_l0[2]; 109.605 + mv_l0[0] = (scale * mv_col[0] + 128) >> 8; 109.606 + mv_l0[1] = (scale * mv_col[1] + 128) >> 8; 109.607 + ref= ref0; 109.608 + mv0= pack16to32(mv_l0[0],mv_l0[1]); 109.609 + mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]); 109.610 + } 109.611 + fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1); 109.612 + fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4); 109.613 + fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4); 109.614 + }else{ 109.615 + for(i8=0; i8<4; i8++){ 109.616 + const int x8 = i8&1; 109.617 + const int y8 = i8>>1; 109.618 + int ref0, scale; 109.619 + const int16_t (*l1mv)[2]= l1mv0; 109.620 + 109.621 + if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8])) 109.622 + continue; 109.623 + m->sub_mb_type[i8] = sub_mb_type; 109.624 + fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1); 109.625 + if(IS_INTRA(mb_type_col[0])){ 109.626 + fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1); 109.627 + fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4); 109.628 + fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4); 109.629 + continue; 109.630 + } 109.631 + 109.632 + ref0 = l1ref0[i8]; 109.633 + if(ref0 >= 0) 109.634 + ref0 = map_col_to_list0[0][ref0 ]; 109.635 + else{ 109.636 + ref0 = map_col_to_list0[1][l1ref1[i8]]; 109.637 + l1mv= l1mv1; 109.638 + } 109.639 + scale = dist_scale_factor[ref0]; 109.640 + 109.641 + fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1); 109.642 + if(IS_SUB_8X8(sub_mb_type)){ 109.643 + const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; 109.644 + int mx = (scale * mv_col[0] + 128) >> 8; 109.645 + int my = (scale * mv_col[1] + 128) >> 8; 109.646 + fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4); 109.647 + fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4); 109.648 + }else 109.649 + for(i4=0; i4<4; i4++){ 109.650 + const 
int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride]; 109.651 + int16_t *mv_l0 = mrs->mv_cache[0][scan8[i8*4+i4]]; 109.652 + mv_l0[0] = (scale * mv_col[0] + 128) >> 8; 109.653 + mv_l0[1] = (scale * mv_col[1] + 128) >> 8; 109.654 + AV_WN32A(mrs->mv_cache[1][scan8[i8*4+i4]], 109.655 + pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1])); 109.656 + } 109.657 + } 109.658 + } 109.659 +} 109.660 + 109.661 +void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){ 109.662 + if(s->direct_spatial_mv_pred){ 109.663 + pred_spatial_direct_motion_rec(mrc, mrs, s, m, mb_type); 109.664 + }else{ 109.665 + pred_temp_direct_motion_rec(mrc, mrs, s, m, mb_type); 109.666 + } 109.667 +} 109.668 + 109.669 +static inline int fetch_diagonal_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, const int16_t **C, int i, int list, int part_width){ 109.670 + const int topright_ref= mrs->ref_cache[list][ i - 8 + part_width ]; 109.671 + 109.672 + if(topright_ref != PART_NOT_AVAILABLE){ 109.673 + *C= mrs->mv_cache[list][ i - 8 + part_width ]; 109.674 + return topright_ref; 109.675 + }else{ 109.676 + *C= mrs->mv_cache[list][ i - 8 - 1 ]; 109.677 + return mrs->ref_cache[list][ i - 8 - 1 ]; 109.678 + } 109.679 +} 109.680 + 109.681 +/** 109.682 + * gets the predicted MV. 
109.683 + * @param n the block index 109.684 + * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4) 109.685 + * @param mx the x component of the predicted motion vector 109.686 + * @param my the y component of the predicted motion vector 109.687 + */ 109.688 +static inline void pred_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int part_width, int list, int ref, int * const mx, int * const my){ 109.689 + const int index8= scan8[n]; 109.690 + const int top_ref= mrs->ref_cache[list][ index8 - 8 ]; 109.691 + const int left_ref= mrs->ref_cache[list][ index8 - 1 ]; 109.692 + const int16_t * const A= mrs->mv_cache[list][ index8 - 1 ]; 109.693 + const int16_t * const B= mrs->mv_cache[list][ index8 - 8 ]; 109.694 + const int16_t * C; 109.695 + int diagonal_ref, match_count; 109.696 + 109.697 + assert(part_width==1 || part_width==2 || part_width==4); 109.698 + 109.699 +/* mv_cache 109.700 + B . . A T T T T 109.701 + U . . L . . , . 109.702 + U . . L . . . . 109.703 + U . . L . . , . 109.704 + . . . L . . . . 
109.705 +*/ 109.706 + 109.707 + diagonal_ref= fetch_diagonal_mv(mrc, mrs, s, &C, index8, list, part_width); 109.708 + match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref); 109.709 + 109.710 + if(match_count > 1){ //most common 109.711 + *mx= mid_pred(A[0], B[0], C[0]); 109.712 + *my= mid_pred(A[1], B[1], C[1]); 109.713 + }else if(match_count==1){ 109.714 + if(left_ref==ref){ 109.715 + *mx= A[0]; 109.716 + *my= A[1]; 109.717 + }else if(top_ref==ref){ 109.718 + *mx= B[0]; 109.719 + *my= B[1]; 109.720 + }else{ 109.721 + *mx= C[0]; 109.722 + *my= C[1]; 109.723 + } 109.724 + }else{ 109.725 + if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){ 109.726 + *mx= A[0]; 109.727 + *my= A[1]; 109.728 + }else{ 109.729 + *mx= mid_pred(A[0], B[0], C[0]); 109.730 + *my= mid_pred(A[1], B[1], C[1]); 109.731 + } 109.732 + } 109.733 + 109.734 +} 109.735 + 109.736 +/** 109.737 + * gets the directionally predicted 16x8 MV. 109.738 + * @param n the block index 109.739 + * @param mx the x component of the predicted motion vector 109.740 + * @param my the y component of the predicted motion vector 109.741 + */ 109.742 +static inline void pred_16x8_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){ 109.743 + if(n==0){ 109.744 + const int top_ref= mrs->ref_cache[list][ scan8[0] - 8 ]; 109.745 + const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ]; 109.746 + 109.747 + if(top_ref == ref){ 109.748 + *mx= B[0]; 109.749 + *my= B[1]; 109.750 + return; 109.751 + } 109.752 + }else{ 109.753 + const int left_ref= mrs->ref_cache[list][ scan8[8] - 1 ]; 109.754 + const int16_t * const A= mrs->mv_cache[list][ scan8[8] - 1 ]; 109.755 + 109.756 + if(left_ref == ref){ 109.757 + *mx= A[0]; 109.758 + *my= A[1]; 109.759 + return; 109.760 + } 109.761 + } 109.762 + 109.763 + //RARE 109.764 + pred_motion(mrc, mrs, s, n, 4, list, ref, mx, my); 109.765 +} 109.766 + 109.767 +/** 
109.768 + * gets the directionally predicted 8x16 MV. 109.769 + * @param n the block index 109.770 + * @param mx the x component of the predicted motion vector 109.771 + * @param my the y component of the predicted motion vector 109.772 + */ 109.773 +static inline void pred_8x16_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){ 109.774 + if(n==0){ 109.775 + const int left_ref= mrs->ref_cache[list][ scan8[0] - 1 ]; 109.776 + const int16_t * const A= mrs->mv_cache[list][ scan8[0] - 1 ]; 109.777 + 109.778 + if(left_ref == ref){ 109.779 + *mx= A[0]; 109.780 + *my= A[1]; 109.781 + return; 109.782 + } 109.783 + }else{ 109.784 + const int16_t * C; 109.785 + int diagonal_ref; 109.786 + 109.787 + diagonal_ref= fetch_diagonal_mv(mrc, mrs, s, &C, scan8[4], list, 2); 109.788 + if(diagonal_ref == ref){ 109.789 + *mx= C[0]; 109.790 + *my= C[1]; 109.791 + return; 109.792 + } 109.793 + } 109.794 + 109.795 + //RARE 109.796 + pred_motion(mrc, mrs, s, n, 2, list, ref, mx, my); 109.797 +} 109.798 + 109.799 +static inline void pred_pskip_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb * m, int * const mx, int * const my){ 109.800 + const int top_ref = mrs->ref_cache[0][ scan8[0] - 8 ]; 109.801 + const int left_ref= mrs->ref_cache[0][ scan8[0] - 1 ]; 109.802 + 109.803 + if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE 109.804 + || !( top_ref | AV_RN32A(mrs->mv_cache[0][ scan8[0] - 8 ])) 109.805 + || !(left_ref | AV_RN32A(mrs->mv_cache[0][ scan8[0] - 1 ]))){ 109.806 + 109.807 + *mx = *my = 0; 109.808 + return; 109.809 + } 109.810 + 109.811 + pred_motion(mrc, mrs, s, 0, 4, 0, 0, mx, my); 109.812 + 109.813 + return; 109.814 +} 109.815 + 109.816 +#define ADD_MVD(list) \ 109.817 +{ \ 109.818 + mx += m->mvd[list][mp][0]; \ 109.819 + my += m->mvd[list][mp][1]; \ 109.820 + mp++; \ 109.821 +} 109.822 + 109.823 +int pred_motion_mb_rec (MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){ 
109.824 + int mp=0; 109.825 + int mb_type = m->mb_type; 109.826 + const int mb_x = m->mb_x; 109.827 + 109.828 +// mrc->m =m; 109.829 + 109.830 + fill_decode_caches_rec(mrc, mrs, s, m, mb_type); 109.831 + if (IS_SKIP(mb_type)){ 109.832 + mb_type=0; 109.833 + 109.834 + if( s->slice_type_nos == FF_B_TYPE ) 109.835 + { 109.836 + mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP; 109.837 + ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type); 109.838 + } 109.839 + else 109.840 + { 109.841 + int mx, my; 109.842 + 109.843 + mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; //FIXME check required 109.844 + pred_pskip_motion(mrc, mrs, s, m, &mx, &my); 109.845 + fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); 109.846 + fill_rectangle(mrs->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4); 109.847 + } 109.848 + 109.849 + write_back_motion_rec(mrc, mrs, s, m, mb_type); 109.850 + m->mb_type = mrs->mb_type[mb_x]= mb_type; 109.851 + return 0; 109.852 + } 109.853 + 109.854 + 109.855 + if (IS_INTRA_PCM(mb_type)){ 109.856 + mrs->mb_type[mb_x] = mb_type; 109.857 + return 0; 109.858 + } 109.859 + else if (IS_INTRA(mb_type)){ 109.860 + int i, pred_mode; 109.861 + 109.862 + if( IS_INTRA4x4( mb_type ) ) { 109.863 + if ( IS_8x8DCT(mb_type) ) { 109.864 + for( i = 0; i < 16; i+=4 ) { 109.865 + int pred = pred_intra_mode(mrc, mrs, i ); 109.866 + int mode = m->intra4x4_pred_mode[i]; 109.867 + 109.868 + mode = mode < 0 ? pred : mode + ( mode >= pred ); 109.869 + fill_rectangle( &mrs->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 ); 109.870 + } 109.871 + } else { 109.872 + for( i = 0; i < 16; i++ ) { 109.873 + int pred = pred_intra_mode(mrc, mrs, i ); 109.874 + int mode = m->intra4x4_pred_mode[i]; 109.875 + mode = mode < 0 ? 
pred : mode + ( mode >= pred ); 109.876 + mrs->intra4x4_pred_mode_cache[ scan8[i] ] = mode; 109.877 + } 109.878 + } 109.879 + write_back_intra_pred_mode_rec(mrc, mrs, m, mb_x); 109.880 + if( check_intra4x4_pred_mode(mrc, mrs, s, m) < 0 ) return -1; 109.881 + } else { 109.882 + m->intra16x16_pred_mode= check_intra_pred_mode(mrc, mrs, s, m, m->intra16x16_pred_mode ); 109.883 + if( m->intra16x16_pred_mode < 0 ) return -1; 109.884 + } 109.885 + 109.886 + pred_mode = m->chroma_pred_mode; 109.887 + pred_mode= check_intra_pred_mode( mrc, mrs, s, m, pred_mode ); 109.888 + if( pred_mode < 0 ) return -1; 109.889 + m->chroma_pred_mode= pred_mode; 109.890 + 109.891 + } 109.892 + else if (IS_8X8(mb_type)){ 109.893 + int i, j, list; 109.894 + 109.895 + if( s->slice_type_nos == FF_B_TYPE ) { 109.896 + if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] | 109.897 + m->sub_mb_type[2] | m->sub_mb_type[3]) ) { 109.898 + ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type); 109.899 + mrs->ref_cache[0][scan8[4]] = 109.900 + mrs->ref_cache[1][scan8[4]] = 109.901 + mrs->ref_cache[0][scan8[12]] = 109.902 + mrs->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE; 109.903 + } 109.904 + } 109.905 + 109.906 + for(list=0; list<s->list_count; list++){ 109.907 + for(i=0; i<4; i++){ 109.908 + if(IS_DIRECT(m->sub_mb_type[i])){ 109.909 + mrs->ref_cache[list][ scan8[4*i] ]=mrs->ref_cache[list][ scan8[4*i]+1 ]; 109.910 + continue; 109.911 + } else { 109.912 + mrs->ref_cache[list][ scan8[4*i] ]=mrs->ref_cache[list][ scan8[4*i]+1 ]= 109.913 + mrs->ref_cache[list][ scan8[4*i]+8 ]=mrs->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i]; 109.914 + 109.915 + if(IS_DIR(m->sub_mb_type[i], 0, list) ){ 109.916 + const int sub_mb_type= m->sub_mb_type[i]; 109.917 + const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1; 109.918 + 109.919 + int sub_partition_count = IS_SUB_8X8(sub_mb_type) ? 1 : (IS_SUB_4X4(sub_mb_type)? 
4 :2); 109.920 + for(j=0; j<sub_partition_count; j++){ 109.921 + int mx, my; 109.922 + const int index= 4*i + block_width*j; 109.923 + int16_t (* mv_cache)[2]= &mrs->mv_cache[list][ scan8[index]]; 109.924 + pred_motion(mrc, mrs, s, index, block_width, list, mrs->ref_cache[list][ scan8[index] ], &mx, &my); 109.925 + 109.926 + ADD_MVD(list) 109.927 + 109.928 + if(IS_SUB_8X8(sub_mb_type)){ 109.929 + mv_cache[ 1 ][0]= 109.930 + mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx; 109.931 + mv_cache[ 1 ][1]= 109.932 + mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my; 109.933 + }else if(IS_SUB_8X4(sub_mb_type)){ 109.934 + mv_cache[ 1 ][0]= mx; 109.935 + mv_cache[ 1 ][1]= my; 109.936 + }else if(IS_SUB_4X8(sub_mb_type)){ 109.937 + mv_cache[ 8 ][0]= mx; 109.938 + mv_cache[ 8 ][1]= my; 109.939 + } 109.940 + mv_cache[ 0 ][0]= mx; 109.941 + mv_cache[ 0 ][1]= my; 109.942 + } 109.943 + }else{ 109.944 + fill_rectangle(mrs->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); 109.945 + } 109.946 + } 109.947 + } 109.948 + } 109.949 + } else if( IS_DIRECT(mb_type) ) { 109.950 + mb_type &= ~MB_TYPE_16x16; //FIXME not nice 109.951 + ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type); 109.952 + } 109.953 + else { 109.954 + int list, i; 109.955 + if(IS_16X16(mb_type)){ 109.956 + for(list=0; list<s->list_count; list++){ 109.957 + if(IS_DIR(mb_type, 0, list)){ 109.958 + int ref; 109.959 + int mx,my; 109.960 + 109.961 + ref = m->ref_index[list][0]; 109.962 + fill_rectangle(&mrs->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1); 109.963 + pred_motion(mrc, mrs, s, 0, 4, list, mrs->ref_cache[list][ scan8[0] ], &mx, &my); 109.964 + ADD_MVD(list) 109.965 + fill_rectangle(mrs->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); 109.966 + } 109.967 + } 109.968 + } 109.969 + else if(IS_16X8(mb_type)){ 109.970 + for(list=0; list<s->list_count; list++){ 109.971 + for(i=0; i<2; i++){ 109.972 + if(IS_DIR(mb_type, i, list)){ 109.973 + int ref; 109.974 + int mx,my; 109.975 + ref = m->ref_index[list][i]; 109.976 + 
fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1); 109.977 + 109.978 + pred_16x8_motion(mrc, mrs, s, 8*i, list, mrs->ref_cache[list][scan8[0] + 16*i], &mx, &my); 109.979 + ADD_MVD(list) 109.980 + 109.981 + fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); 109.982 + }else{ 109.983 + fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1); 109.984 + fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); 109.985 + } 109.986 + } 109.987 + } 109.988 + 109.989 + }else{ 109.990 + assert(IS_8X16(mb_type)); 109.991 + 109.992 + for(list=0; list<s->list_count; list++){ 109.993 + for(i=0; i<2; i++){ 109.994 + if(IS_DIR(mb_type, i, list)){ //FIXME optimize 109.995 + int ref; 109.996 + int mx,my; 109.997 + ref = m->ref_index[list][i]; 109.998 + fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1); 109.999 + pred_8x16_motion(mrc, mrs, s, i*4, list, mrs->ref_cache[list][ scan8[0] + 2*i ], &mx, &my); 109.1000 + ADD_MVD(list) 109.1001 + fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); 109.1002 + }else{ 109.1003 + fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1); 109.1004 + fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); 109.1005 + } 109.1006 + } 109.1007 + } 109.1008 + } 109.1009 + } 109.1010 + 109.1011 + if (IS_INTER(mb_type)||(IS_DIRECT(mb_type))) 109.1012 + write_back_motion_rec(mrc, mrs, s, m, mb_type); 109.1013 + m->mb_type = mrs->mb_type[mb_x]= mb_type; 109.1014 + 109.1015 + return 0; 109.1016 +}
110.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 110.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.h Mon Aug 27 12:09:56 2012 +0200 110.3 @@ -0,0 +1,10 @@ 110.4 +#ifndef H264_DIRECT_H 110.5 +#define H264_DIRECT_H 110.6 + 110.7 +#include "h264_types.h" 110.8 + 110.9 +void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int *mb_type); 110.10 +int pred_motion_mb_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m); 110.11 + 110.12 + 110.13 +#endif
111.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 111.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_ps.c Mon Aug 27 12:09:56 2012 +0200 111.3 @@ -0,0 +1,462 @@ 111.4 +/* 111.5 + * H.26L/H.264/AVC/JVT/14496-10/... parameter set decoding 111.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 111.7 + * 111.8 + * This file is part of FFmpeg. 111.9 + * 111.10 + * FFmpeg is free software; you can redistribute it and/or 111.11 + * modify it under the terms of the GNU Lesser General Public 111.12 + * License as published by the Free Software Foundation; either 111.13 + * version 2.1 of the License, or (at your option) any later version. 111.14 + * 111.15 + * FFmpeg is distributed in the hope that it will be useful, 111.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 111.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 111.18 + * Lesser General Public License for more details. 111.19 + * 111.20 + * You should have received a copy of the GNU Lesser General Public 111.21 + * License along with FFmpeg; if not, write to the Free Software 111.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 111.23 + */ 111.24 + 111.25 +/** 111.26 + * @file 111.27 + * H.264 / AVC / MPEG4 part10 parameter set decoding. 
111.28 + * @author Michael Niedermayer <michaelni@gmx.at> 111.29 + */ 111.30 + 111.31 +#include "dsputil.h" 111.32 +#include "avcodec.h" 111.33 +#include "h264_types.h" 111.34 +#include "h264_data.h" 111.35 +#include "golomb.h" 111.36 + 111.37 + 111.38 +//#undef NDEBUG 111.39 +#include <assert.h> 111.40 + 111.41 +static const int pixel_aspect[17][2]={ 111.42 + {0, 1}, 111.43 + {1, 1}, 111.44 + {12, 11}, 111.45 + {10, 11}, 111.46 + {16, 11}, 111.47 + {40, 33}, 111.48 + {24, 11}, 111.49 + {20, 11}, 111.50 + {32, 11}, 111.51 + {80, 33}, 111.52 + {18, 11}, 111.53 + {15, 11}, 111.54 + {64, 33}, 111.55 + {160,99}, 111.56 + {4, 3}, 111.57 + {3, 2}, 111.58 + {2, 1}, 111.59 +}; 111.60 + 111.61 +const uint8_t ff_h264_chroma_qp[52]={ 111.62 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 111.63 + 12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27, 111.64 + 28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37, 111.65 + 37,38,38,38,39,39,39,39 111.66 +}; 111.67 + 111.68 +static const uint8_t default_scaling4[2][16]={ 111.69 +{ 6,13,20,28, 111.70 + 13,20,28,32, 111.71 + 20,28,32,37, 111.72 + 28,32,37,42 111.73 +},{ 111.74 + 10,14,20,24, 111.75 + 14,20,24,27, 111.76 + 20,24,27,30, 111.77 + 24,27,30,34 111.78 +}}; 111.79 + 111.80 +static const uint8_t default_scaling8[2][64]={ 111.81 +{ 6,10,13,16,18,23,25,27, 111.82 + 10,11,16,18,23,25,27,29, 111.83 + 13,16,18,23,25,27,29,31, 111.84 + 16,18,23,25,27,29,31,33, 111.85 + 18,23,25,27,29,31,33,36, 111.86 + 23,25,27,29,31,33,36,38, 111.87 + 25,27,29,31,33,36,38,40, 111.88 + 27,29,31,33,36,38,40,42 111.89 +},{ 111.90 + 9,13,15,17,19,21,22,24, 111.91 + 13,13,17,19,21,22,24,25, 111.92 + 15,17,19,21,22,24,25,27, 111.93 + 17,19,21,22,24,25,27,28, 111.94 + 19,21,22,24,25,27,28,30, 111.95 + 21,22,24,25,27,28,30,32, 111.96 + 22,24,25,27,28,30,32,33, 111.97 + 24,25,27,28,30,32,33,35 111.98 +}}; 111.99 + 111.100 +static inline int decode_hrd_parameters(GetBitContext *gb, SPS *sps){ 111.101 + int cpb_count, i; 111.102 + cpb_count = get_ue_golomb_31(gb) + 1; 
111.103 + 111.104 + if(cpb_count > 32){ 111.105 + av_log(AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count); 111.106 + return -1; 111.107 + } 111.108 + 111.109 + get_bits(gb, 4); /* bit_rate_scale */ 111.110 + get_bits(gb, 4); /* cpb_size_scale */ 111.111 + for(i=0; i<cpb_count; i++){ 111.112 + get_ue_golomb(gb); /* bit_rate_value_minus1 */ 111.113 + get_ue_golomb(gb); /* cpb_size_value_minus1 */ 111.114 + get_bits1(gb); /* cbr_flag */ 111.115 + } 111.116 + sps->initial_cpb_removal_delay_length = get_bits(gb, 5) + 1; 111.117 + sps->cpb_removal_delay_length = get_bits(gb, 5) + 1; 111.118 + sps->dpb_output_delay_length = get_bits(gb, 5) + 1; 111.119 + sps->time_offset_length = get_bits(gb, 5); 111.120 + sps->cpb_cnt = cpb_count; 111.121 + return 0; 111.122 +} 111.123 + 111.124 +static inline int decode_vui_parameters(GetBitContext *gb, SPS *sps){ 111.125 + int aspect_ratio_info_present_flag; 111.126 + unsigned int aspect_ratio_idc; 111.127 + 111.128 + aspect_ratio_info_present_flag= get_bits1(gb); 111.129 + 111.130 + if( aspect_ratio_info_present_flag ) { 111.131 + aspect_ratio_idc= get_bits(gb, 8); 111.132 + if( aspect_ratio_idc == EXTENDED_SAR ) { 111.133 + sps->num= get_bits(gb, 16); 111.134 + sps->den= get_bits(gb, 16); 111.135 + }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(int[2])){ 111.136 + //sps->sar= pixel_aspect[aspect_ratio_idc]; 111.137 + }else{ 111.138 + av_log( AV_LOG_ERROR, "illegal aspect ratio idc %d\n", aspect_ratio_idc); 111.139 + // return -1; 111.140 + } 111.141 + }else{ 111.142 + sps->num= 111.143 + sps->den= 0; 111.144 + } 111.145 + 111.146 + if(get_bits1(gb)){ /* overscan_info_present_flag */ 111.147 + get_bits1(gb); /* overscan_appropriate_flag */ 111.148 + } 111.149 + 111.150 + sps->video_signal_type_present_flag = get_bits1(gb); 111.151 + if(sps->video_signal_type_present_flag){ 111.152 + get_bits(gb, 3); /* video_format */ 111.153 + sps->full_range = get_bits1(gb); /* video_full_range_flag */ 111.154 + 111.155 + 
sps->colour_description_present_flag = get_bits1(gb); 111.156 + if(sps->colour_description_present_flag){ 111.157 + sps->color_primaries = get_bits(gb, 8); /* colour_primaries */ 111.158 + sps->color_trc = get_bits(gb, 8); /* transfer_characteristics */ 111.159 + sps->colorspace = get_bits(gb, 8); /* matrix_coefficients */ 111.160 + if (sps->color_primaries >= AVCOL_PRI_NB) 111.161 + sps->color_primaries = AVCOL_PRI_UNSPECIFIED; 111.162 + if (sps->color_trc >= AVCOL_TRC_NB) 111.163 + sps->color_trc = AVCOL_TRC_UNSPECIFIED; 111.164 + if (sps->colorspace >= AVCOL_SPC_NB) 111.165 + sps->colorspace = AVCOL_SPC_UNSPECIFIED; 111.166 + } 111.167 + } 111.168 + 111.169 + if(get_bits1(gb)){ /* chroma_location_info_present_flag */ 111.170 + av_log(AV_LOG_ERROR, "chroma_location_info_present_flag found, but not supported\n"); 111.171 + (void) (get_ue_golomb(gb)+1); /* chroma_sample_location_type_top_field */ 111.172 + (void) get_ue_golomb(gb); /* chroma_sample_location_type_bottom_field */ 111.173 + } 111.174 + 111.175 + sps->timing_info_present_flag = get_bits1(gb); 111.176 + if(sps->timing_info_present_flag){ 111.177 + sps->num_units_in_tick = get_bits_long(gb, 32); 111.178 + sps->time_scale = get_bits_long(gb, 32); 111.179 + if(!sps->num_units_in_tick || !sps->time_scale){ 111.180 + av_log(AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick); 111.181 + return -1; 111.182 + } 111.183 + sps->fixed_frame_rate_flag = get_bits1(gb); 111.184 + } 111.185 + 111.186 + sps->nal_hrd_parameters_present_flag = get_bits1(gb); 111.187 + if(sps->nal_hrd_parameters_present_flag) 111.188 + if(decode_hrd_parameters(gb, sps) < 0) 111.189 + return -1; 111.190 + sps->vcl_hrd_parameters_present_flag = get_bits1(gb); 111.191 + if(sps->vcl_hrd_parameters_present_flag) 111.192 + if(decode_hrd_parameters(gb, sps) < 0) 111.193 + return -1; 111.194 + if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag) 
111.195 + get_bits1(gb); /* low_delay_hrd_flag */ 111.196 + sps->pic_struct_present_flag = get_bits1(gb); 111.197 + 111.198 + sps->bitstream_restriction_flag = get_bits1(gb); 111.199 + if(sps->bitstream_restriction_flag){ 111.200 + get_bits1(gb); /* motion_vectors_over_pic_boundaries_flag */ 111.201 + get_ue_golomb(gb); /* max_bytes_per_pic_denom */ 111.202 + get_ue_golomb(gb); /* max_bits_per_mb_denom */ 111.203 + get_ue_golomb(gb); /* log2_max_mv_length_horizontal */ 111.204 + get_ue_golomb(gb); /* log2_max_mv_length_vertical */ 111.205 + sps->num_reorder_frames= get_ue_golomb(gb); 111.206 + get_ue_golomb(gb); /*max_dec_frame_buffering*/ 111.207 + 111.208 + if(sps->num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){ 111.209 + av_log(AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames); 111.210 + return -1; 111.211 + } 111.212 + } 111.213 + 111.214 + return 0; 111.215 +} 111.216 + 111.217 +static void decode_scaling_list(GetBitContext *gb, uint8_t *factors, int size, const uint8_t *jvt_list, const uint8_t *fallback_list){ 111.218 + int i, last = 8, next = 8; 111.219 + const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct; 111.220 + if(!get_bits1(gb)) /* matrix not written, we use the predicted one */ 111.221 + memcpy(factors, fallback_list, size*sizeof(uint8_t)); 111.222 + else 111.223 + for(i=0;i<size;i++){ 111.224 + if(next) 111.225 + next = (last + get_se_golomb(gb)) & 0xff; 111.226 + if(!i && !next){ /* matrix not written, we use the preset one */ 111.227 + memcpy(factors, jvt_list, size*sizeof(uint8_t)); 111.228 + break; 111.229 + } 111.230 + last = factors[scan[i]] = next ? 
next : last; 111.231 + } 111.232 +} 111.233 + 111.234 +static void decode_scaling_matrices(GetBitContext *gb, SPS *sps, PPS *pps, int is_sps, 111.235 + uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){ 111.236 + int fallback_sps = !is_sps && sps->scaling_matrix_present; 111.237 + const uint8_t *fallback[4] = { 111.238 + fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0], 111.239 + fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1], 111.240 + fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0], 111.241 + fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1] 111.242 + }; 111.243 + if(get_bits1(gb)){ 111.244 + sps->scaling_matrix_present |= is_sps; 111.245 + decode_scaling_list(gb, scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y 111.246 + decode_scaling_list(gb, scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr 111.247 + decode_scaling_list(gb, scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb 111.248 + decode_scaling_list(gb, scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y 111.249 + decode_scaling_list(gb, scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr 111.250 + decode_scaling_list(gb, scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb 111.251 + if(is_sps || pps->transform_8x8_mode){ 111.252 + decode_scaling_list(gb, scaling_matrix8[0],64,default_scaling8[0],fallback[2]); // Intra, Y 111.253 + decode_scaling_list(gb, scaling_matrix8[1],64,default_scaling8[1],fallback[3]); // Inter, Y 111.254 + } 111.255 + } 111.256 +} 111.257 + 111.258 +int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb){ 111.259 + int profile_idc, level_idc; 111.260 + unsigned int sps_id; 111.261 + int i; 111.262 + SPS *sps; 111.263 + 111.264 + profile_idc= get_bits(gb, 8); 111.265 + get_bits1(gb); //constraint_set0_flag 111.266 + get_bits1(gb); //constraint_set1_flag 111.267 + 
get_bits1(gb); //constraint_set2_flag 111.268 + get_bits1(gb); //constraint_set3_flag 111.269 + get_bits(gb, 4); // reserved 111.270 + level_idc= get_bits(gb, 8); 111.271 + sps_id= get_ue_golomb_31(gb); 111.272 + 111.273 + if(sps_id >= MAX_SPS_COUNT) { 111.274 + av_log(AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id); 111.275 + return -1; 111.276 + } 111.277 + if (!n->sps_buffers[sps_id]) 111.278 + n->sps_buffers[sps_id]= av_mallocz(sizeof(SPS)); 111.279 + 111.280 + sps = n->sps_buffers[sps_id]; 111.281 + if(sps == NULL) 111.282 + return -1; 111.283 + 111.284 + sps->profile_idc= profile_idc; 111.285 + sps->level_idc= level_idc; 111.286 + 111.287 + memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4)); 111.288 + memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8)); 111.289 + sps->scaling_matrix_present = 0; 111.290 + 111.291 + if(sps->profile_idc >= 100){ //high profile 111.292 + sps->chroma_format_idc= get_ue_golomb_31(gb); 111.293 + if(sps->chroma_format_idc == 3) 111.294 + sps->residual_color_transform_flag = get_bits1(gb); 111.295 + sps->bit_depth_luma = get_ue_golomb(gb) + 8; 111.296 + sps->bit_depth_chroma = get_ue_golomb(gb) + 8; 111.297 + sps->transform_bypass = get_bits1(gb); 111.298 + decode_scaling_matrices(gb, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8); 111.299 + }else{ 111.300 + sps->chroma_format_idc= 1; 111.301 + sps->bit_depth_luma = 8; 111.302 + sps->bit_depth_chroma = 8; 111.303 + } 111.304 + 111.305 + sps->log2_max_frame_num= get_ue_golomb(gb) + 4; 111.306 + sps->poc_type= get_ue_golomb_31(gb); 111.307 + 111.308 + if(sps->poc_type == 0){ //FIXME #define 111.309 + sps->log2_max_poc_lsb= get_ue_golomb(gb) + 4; 111.310 + } else if(sps->poc_type == 1){//FIXME #define 111.311 + sps->delta_pic_order_always_zero_flag= get_bits1(gb); 111.312 + sps->offset_for_non_ref_pic= get_se_golomb(gb); 111.313 + sps->offset_for_top_to_bottom_field= get_se_golomb(gb); 111.314 + sps->poc_cycle_length = get_ue_golomb(gb); 111.315 + 
111.316 + if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){ 111.317 + av_log(AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length); 111.318 + goto fail; 111.319 + } 111.320 + 111.321 + for(i=0; i<sps->poc_cycle_length; i++) 111.322 + sps->offset_for_ref_frame[i]= get_se_golomb(gb); 111.323 + }else if(sps->poc_type != 2){ 111.324 + av_log(AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type); 111.325 + goto fail; 111.326 + } 111.327 + 111.328 + sps->ref_frame_count= get_ue_golomb_31(gb); 111.329 + if(sps->ref_frame_count >= 32){ 111.330 + av_log(AV_LOG_ERROR, "too many reference frames\n"); 111.331 + goto fail; 111.332 + } 111.333 + sps->gaps_in_frame_num_allowed_flag= get_bits1(gb); 111.334 + sps->mb_width = get_ue_golomb(gb) + 1; 111.335 + sps->mb_height= get_ue_golomb(gb) + 1; 111.336 + 111.337 + 111.338 + sps->frame_mbs_only_flag= get_bits1(gb); 111.339 + if(!sps->frame_mbs_only_flag){ 111.340 + av_log(AV_LOG_ERROR, "MBAFF support not included\n"); 111.341 + get_bits1(gb); 111.342 + }else 111.343 + sps->mb_aff= 0; 111.344 + 111.345 + sps->direct_8x8_inference_flag= get_bits1(gb); 111.346 + if(!sps->frame_mbs_only_flag && !sps->direct_8x8_inference_flag){ 111.347 + av_log(AV_LOG_ERROR, "This stream was generated by a broken encoder, invalid 8x8 inference\n"); 111.348 + goto fail; 111.349 + } 111.350 + 111.351 + sps->crop= get_bits1(gb); 111.352 + if(sps->crop){ 111.353 + sps->crop_left = get_ue_golomb(gb); 111.354 + sps->crop_right = get_ue_golomb(gb); 111.355 + sps->crop_top = get_ue_golomb(gb); 111.356 + sps->crop_bottom= get_ue_golomb(gb); 111.357 + if(sps->crop_left || sps->crop_top){ 111.358 + av_log( AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n"); 111.359 + } 111.360 + if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){ 111.361 + av_log( AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n"); 111.362 
+ } 111.363 + }else { 111.364 + 111.365 + sps->crop_left = 111.366 + sps->crop_right = 111.367 + sps->crop_top = 111.368 + sps->crop_bottom= 0; 111.369 + } 111.370 + 111.371 + sps->vui_parameters_present_flag= get_bits1(gb); 111.372 + if( sps->vui_parameters_present_flag ) 111.373 + if (decode_vui_parameters(gb, sps) < 0) 111.374 + goto fail; 111.375 + 111.376 + 111.377 + n->sps = *sps; 111.378 + 111.379 + if( sps->bitstream_restriction_flag){ 111.380 + n->has_b_frames = sps->num_reorder_frames; 111.381 + } 111.382 + else 111.383 + n->has_b_frames= MAX_DELAYED_PIC_COUNT; 111.384 + 111.385 + return 0; 111.386 +fail: 111.387 + av_free(sps); 111.388 + return -1; 111.389 +} 111.390 + 111.391 +static void 111.392 +build_qp_table(PPS *pps, int t, int index) 111.393 +{ 111.394 + int i; 111.395 + for(i = 0; i < 52; i++) 111.396 + pps->chroma_qp_table[t][i] = ff_h264_chroma_qp[av_clip(i + index, 0, 51)]; 111.397 +} 111.398 + 111.399 +int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length){ 111.400 + unsigned int pps_id= get_ue_golomb(gb); 111.401 + PPS *pps; 111.402 + 111.403 + if(pps_id >= MAX_PPS_COUNT) { 111.404 + av_log(AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id); 111.405 + return -1; 111.406 + } 111.407 + if (!n->pps_buffers[pps_id]) 111.408 + n->pps_buffers[pps_id]= av_mallocz(sizeof(PPS)); 111.409 + pps = n->pps_buffers[pps_id]; 111.410 + if(pps == NULL) 111.411 + return -1; 111.412 + pps->sps_id= get_ue_golomb_31(gb); 111.413 + if((unsigned)pps->sps_id>=MAX_SPS_COUNT || n->sps_buffers[pps->sps_id] == NULL){ 111.414 + av_log(AV_LOG_ERROR, "sps_id out of range\n"); 111.415 + goto fail; 111.416 + } 111.417 + 111.418 + pps->cabac= get_bits1(gb); 111.419 + pps->pic_order_present= get_bits1(gb); 111.420 + if(pps->pic_order_present){ 111.421 + av_log(AV_LOG_ERROR, "no interlaces support\n"); 111.422 + } 111.423 + pps->slice_group_count= get_ue_golomb(gb) + 1; 111.424 + if(pps->slice_group_count > 1 ){ 111.425 + 
pps->mb_slice_group_map_type= get_ue_golomb(gb); 111.426 + av_log(AV_LOG_ERROR, "multiple slices not supported\n"); 111.427 + } 111.428 + pps->ref_count[0]= get_ue_golomb(gb) + 1; 111.429 + pps->ref_count[1]= get_ue_golomb(gb) + 1; 111.430 + if(pps->ref_count[0]> 32 || pps->ref_count[1]> 32){ 111.431 + av_log(AV_LOG_ERROR, "reference overflow (pps)\n"); 111.432 + goto fail; 111.433 + } 111.434 + 111.435 + pps->weighted_pred= get_bits1(gb); 111.436 + pps->weighted_bipred_idc= get_bits(gb, 2); 111.437 + pps->init_qp= get_se_golomb(gb) + 26; 111.438 + pps->init_qs= get_se_golomb(gb) + 26; 111.439 + pps->chroma_qp_index_offset[0]= get_se_golomb(gb); 111.440 + pps->deblocking_filter_parameters_present= get_bits1(gb); 111.441 + pps->constrained_intra_pred= get_bits1(gb); 111.442 + pps->redundant_pic_cnt_present = get_bits1(gb); 111.443 + 111.444 + pps->transform_8x8_mode= 0; 111.445 + memcpy(pps->scaling_matrix4, n->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4)); 111.446 + memcpy(pps->scaling_matrix8, n->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8)); 111.447 + 111.448 + if(get_bits_count(gb) < bit_length){ 111.449 + pps->transform_8x8_mode= get_bits1(gb); 111.450 + decode_scaling_matrices(gb, n->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8); 111.451 + pps->chroma_qp_index_offset[1]= get_se_golomb(gb); //second_chroma_qp_index_offset 111.452 + } else { 111.453 + pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0]; 111.454 + } 111.455 + 111.456 + build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]); 111.457 + build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]); 111.458 + if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) 111.459 + pps->chroma_qp_diff= 1; 111.460 + 111.461 + return 0; 111.462 +fail: 111.463 + av_free(pps); 111.464 + return -1; 111.465 +}
112.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 112.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_ps.h Mon Aug 27 12:09:56 2012 +0200 112.3 @@ -0,0 +1,9 @@ 112.4 +#ifndef H264_PS_H 112.5 +#define H264_PS_H 112.6 + 112.7 +#include "h264_types.h" 112.8 + 112.9 +int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb); 112.10 +int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length); 112.11 + 112.12 +#endif
113.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 113.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pthread.c Mon Aug 27 12:09:56 2012 +0200 113.3 @@ -0,0 +1,604 @@ 113.4 +#include "config.h" 113.5 + 113.6 +#include "h264_types.h" 113.7 +#include "h264_parser.h" 113.8 +#include "h264_nal.h" 113.9 +#include "h264_entropy.h" 113.10 +#include "h264_rec.h" 113.11 +#include "h264_misc.h" 113.12 +// #undef NDEBUG 113.13 +#include <assert.h> 113.14 +#include <pthread.h> 113.15 + 113.16 +#define XOANON 1 113.17 + 113.18 +#ifdef XOANON 113.19 +static int ed_rec_affinity[40] = { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 113.20 + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 113.21 + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 113.22 + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39 }; 113.23 +static int ed_rec_smt_aff[80] = { 0, 40, 4, 44, 8, 48, 12, 52, 16, 56, 20, 60, 24, 64, 28, 68, 32, 72, 36, 76, 113.24 + 1, 41, 5, 45, 9, 49, 13, 53, 17, 57, 21, 61, 25, 65, 29, 69, 33, 73, 37, 77, 113.25 + 2, 42, 6, 46, 10, 50, 14, 54, 18, 58, 22, 62, 26, 66, 30, 70, 34, 74, 38, 78, 113.26 + 3, 43, 7, 47, 11, 51, 15, 55, 19, 59, 23, 63, 27, 67, 31, 71, 35, 75, 39, 79 }; 113.27 +#else 113.28 +static int ed_rec_affinity[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; 113.29 +static int ed_rec_smt_aff[20] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, }; 113.30 +#endif 113.31 + 113.32 +static int frames=0; 113.33 + 113.34 +static void notify_one_worker(H264Context *h){ 113.35 + pthread_mutex_lock(&h->task_lock); 113.36 + pthread_cond_signal(&h->task_cond); 113.37 + pthread_mutex_unlock(&h->task_lock); 113.38 +} 113.39 + 113.40 +static void notify_all_workers(H264Context *h){ 113.41 + pthread_mutex_lock(&h->task_lock); 113.42 + pthread_cond_broadcast(&h->task_cond); 113.43 + pthread_mutex_unlock(&h->task_lock); 113.44 +} 113.45 + 113.46 +static void push_sbe (SliceBufferQueue *sbq, SliceBufferEntry *sbe, int notify ){ 113.47 + pthread_mutex_lock(&sbq->lock); 113.48 + while (sbq->cnt >= sbq->size) 113.49 + pthread_cond_wait(&sbq->cond, 
&sbq->lock); 113.50 + sbq->queue[sbq->fi] = sbe; 113.51 + sbq->cnt++; 113.52 + sbq->fi++; sbq->fi %= sbq->size; 113.53 + if (notify) 113.54 + pthread_cond_signal(&sbq->cond); 113.55 + pthread_mutex_unlock(&sbq->lock); 113.56 +} 113.57 + 113.58 +static SliceBufferEntry* pop_sbe (SliceBufferQueue *sbq, int block){ 113.59 + SliceBufferEntry *sbe=NULL; 113.60 + 113.61 + pthread_mutex_lock(&sbq->lock); 113.62 + if (block){ 113.63 + while (sbq->cnt <= 0) 113.64 + pthread_cond_wait(&sbq->cond, &sbq->lock); 113.65 + }else { 113.66 + if (sbq->cnt <= 0) 113.67 + goto nonblock; 113.68 + } 113.69 + sbe = sbq->queue[sbq->fo]; 113.70 + sbq->cnt--; 113.71 + sbq->fo++; sbq->fo %= sbq->size; 113.72 + pthread_cond_signal(&sbq->cond); 113.73 +nonblock: 113.74 + pthread_mutex_unlock(&sbq->lock); 113.75 + 113.76 + return sbe; 113.77 +} 113.78 + 113.79 +// static void push_rle (RingLineQueue *rlq, SliceBufferEntry *sbe, int line, int notify){ 113.80 +// 113.81 +// //check for free slots 113.82 +// pthread_mutex_lock(&rlq->wslock); 113.83 +// while (rlq->free <= 0){ 113.84 +// pthread_cond_wait(&rlq->wscond, &rlq->wslock); 113.85 +// } 113.86 +// //free slot is available, decrement one in this lock 113.87 +// rlq->free--; 113.88 +// pthread_mutex_unlock(&rlq->wslock); 113.89 +// 113.90 +// pthread_mutex_lock(&rlq->swlock); 113.91 +// rlq->queue[rlq->fi]->sbe=sbe; 113.92 +// rlq->queue[rlq->fi]->line=line; 113.93 +// rlq->queue[rlq->fi]->mb_cnt=0; 113.94 +// rlq->fi++; rlq->fi %= rlq->size; 113.95 +// rlq->ready++; 113.96 +// if(notify) 113.97 +// pthread_cond_signal(&rlq->swcond); 113.98 +// pthread_mutex_unlock(&rlq->swlock); 113.99 +// } 113.100 + 113.101 +// static RingLineEntry* pop_rle (RingLineQueue *rlq, int block){ 113.102 +// RingLineEntry *rle=NULL; 113.103 +// 113.104 +// pthread_mutex_lock(&rlq->swlock); 113.105 +// if (block){ 113.106 +// while (rlq->ready <= 0) 113.107 +// pthread_cond_wait(&rlq->swcond, &rlq->swlock); 113.108 +// }else { 113.109 +// if (rlq->ready <= 0) 
113.110 +// goto nonblock; 113.111 +// } 113.112 +// rle = rlq->queue[rlq->fo]; 113.113 +// rlq->fo++; rlq->fo %= rlq->size; 113.114 +// rlq->ready--; 113.115 +// nonblock: 113.116 +// pthread_mutex_unlock(&rlq->swlock); 113.117 +// 113.118 +// return rle; 113.119 +// } 113.120 +// 113.121 +// static void rel_rle (RingLineQueue *rlq){ 113.122 +// pthread_mutex_lock(&rlq->wslock); 113.123 +// rlq->free++; 113.124 +// pthread_cond_signal(&rlq->wscond); 113.125 +// pthread_mutex_unlock(&rlq->wslock); 113.126 +// } 113.127 + 113.128 +static RingLineEntry* pop_rle (SliceBufferQueue *sbq, RingLineQueue *rlq, int *has_token){ 113.129 + RingLineEntry *rle=NULL; 113.130 + SliceBufferEntry *sbe=NULL; 113.131 + int line=-1; 113.132 + 113.133 + pthread_mutex_lock(&sbq->lock); 113.134 + if (sbq->cnt <= 0) 113.135 + goto unlock; 113.136 + sbe = sbq->queue[sbq->fo]; 113.137 + line = sbe->lines_taken; 113.138 + 113.139 + 113.140 + pthread_mutex_lock(&rlq->swlock); 113.141 + if (!*has_token){ 113.142 + if (rlq->free <= 0) 113.143 + goto unlock2; 113.144 + rlq->free--; 113.145 + *has_token=1; 113.146 + } 113.147 + rle = rlq->queue[rlq->fo]; 113.148 + rlq->fo++; rlq->fo %= rlq->size; 113.149 + rle->sbe=sbe; 113.150 + rle->line = line; 113.151 + rle->mb_cnt =0; 113.152 + if (++sbe->lines_taken >= sbe->lines_total){ 113.153 + sbq->cnt--; 113.154 + sbq->fo++; sbq->fo %= sbq->size; 113.155 + pthread_cond_signal(&sbq->cond); 113.156 + } 113.157 +unlock2: 113.158 + pthread_mutex_unlock(&rlq->swlock); 113.159 +unlock: 113.160 + pthread_mutex_unlock(&sbq->lock); 113.161 + 113.162 + 113.163 + return rle; 113.164 +} 113.165 + 113.166 +static void rel_rle (RingLineQueue *rlq, int *rec_token){ 113.167 + pthread_mutex_lock(&rlq->swlock); 113.168 + rlq->free++; 113.169 + *rec_token=0; 113.170 +// pthread_cond_signal(&rlq->swcond); 113.171 + pthread_mutex_unlock(&rlq->swlock); 113.172 + 113.173 +} 113.174 + 113.175 +//get either a entropy or a line reconstruct task 113.176 +static void 
pop_next_task(H264Context *h, SliceBufferEntry **psbe, RingLineEntry **prle, int *rec_token){ 113.177 + 113.178 + pthread_mutex_lock(&h->task_lock); 113.179 + 113.180 + for(;;){ 113.181 + if ( (*psbe = pop_sbe(&h->sb_q[ENTROPY], 0)) ){ 113.182 + if (*rec_token){ 113.183 + rel_rle(&h->rl_q, rec_token); 113.184 + pthread_cond_signal(&h->task_cond); 113.185 + } 113.186 + break; 113.187 + } 113.188 + else if ( (*prle = pop_rle(&h->sb_q[MBDEC], &h->rl_q, rec_token)) ) 113.189 + break; 113.190 + pthread_cond_wait(&h->task_cond, &h->task_lock); 113.191 + } 113.192 + 113.193 + pthread_mutex_unlock(&h->task_lock); 113.194 +} 113.195 + 113.196 +void *parse_thread(void *arg){ 113.197 + H264Context *h = (H264Context *) arg; 113.198 + ParserContext *pc = get_parse_context(h->ifile); 113.199 + NalContext *nc = get_nal_context(h->width, h->height); 113.200 + H264Slice *s; 113.201 + SliceBufferEntry *sbe = NULL; 113.202 + 113.203 + while(!pc->final_frame && frames++ <h->num_frames && !h->quit){ 113.204 + sbe = get_sb_entry(h); 113.205 + 113.206 + av_read_frame_internal(pc, &sbe->gb); 113.207 + s = &sbe->slice; 113.208 + 113.209 + decode_nal_units(nc, s, &sbe->gb); 113.210 + 113.211 + push_sbe(&h->sb_q[ENTROPY], sbe, 0); 113.212 + notify_one_worker(h); 113.213 + } 113.214 + 113.215 + if (!h->no_mbd){ 113.216 + sbe = get_sb_entry(h); 113.217 + sbe->state=-1; 113.218 + sbe->slice.coded_pic_num=nc->coded_pic_num; 113.219 + sbe->lines_total=h->threads; 113.220 + 113.221 + push_sbe(&h->sb_q[REORDER], sbe, 1); 113.222 + }else{ 113.223 + for (int i=0; i<h->threads; i++){ 113.224 + sbe = get_sb_entry(h); 113.225 + sbe->state=-1; 113.226 + push_sbe(&h->sb_q[ENTROPY], sbe, 1); 113.227 + notify_one_worker(h); 113.228 + } 113.229 + } 113.230 + free_nal_context(nc); 113.231 + free_parse_context(pc); 113.232 + 113.233 + pthread_exit(NULL); 113.234 + return NULL; 113.235 +} 113.236 + 113.237 +int decode_slice_entropy(EntropyContext *ec, SliceBufferEntry *sbe){ 113.238 + int i,j; 113.239 + 
H264Slice *s = &sbe->slice; 113.240 + GetBitContext *gb = &sbe->gb; 113.241 + CABACContext *c = &ec->c; 113.242 + H264Mb *mbs = sbe->mbs; 113.243 + 113.244 + if( !s->pps.cabac ){ 113.245 + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); 113.246 + return -1; 113.247 + } 113.248 + 113.249 + init_dequant_tables(s, ec); 113.250 + ec->curr_qscale = s->qscale; 113.251 + ec->last_qscale_diff = 0; 113.252 + ec->chroma_qp[0] = get_chroma_qp( s, 0, s->qscale); 113.253 + ec->chroma_qp[1] = get_chroma_qp( s, 1, s->qscale); 113.254 + 113.255 + /* realign */ 113.256 + align_get_bits( gb ); 113.257 + /* init cabac */ 113.258 + ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8); 113.259 + 113.260 + ff_h264_init_cabac_states(ec, s, c); 113.261 + 113.262 + for(j=0; j<ec->mb_height; j++){ 113.263 + init_entropy_buf(ec, s, j); 113.264 + for(i=0; i<ec->mb_width; i++){ 113.265 + int eos,ret; 113.266 + H264Mb *m = &mbs[i + j*ec->mb_width]; 113.267 + //memset(m, 0, sizeof(H264Mb)); 113.268 + m->mb_x=i; 113.269 + m->mb_y=j; 113.270 + ec->m = m; 113.271 + 113.272 + ret = ff_h264_decode_mb_cabac(ec, s, c); 113.273 + eos = get_cabac_terminate( c); (void) eos; 113.274 + 113.275 + if( ret < 0 || c->bytestream > c->bytestream_end + 2) { 113.276 + av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream); 113.277 + return -1; 113.278 + } 113.279 + } 113.280 + } 113.281 + 113.282 + return 0; 113.283 +} 113.284 + 113.285 +static int decode_slice_mb(MBRecContext *d, RingLineEntry *rle, int frames){ 113.286 + SliceBufferEntry *sbe= rle->sbe; 113.287 + H264Slice *s = &sbe->slice; 113.288 + H264Mb *mbs = sbe->mbs; 113.289 + 113.290 + int mb_width= d->mb_width; 113.291 + int i; 113.292 + const int line = rle->line; 113.293 + 113.294 + init_mbrec_context(d, d->mrs, s, line); 113.295 + 113.296 + H264Mb *m = &mbs[line*mb_width]; 113.297 + d->top=rle->prev_line->top; 113.298 + 
d->top_next=rle->top; 113.299 + 113.300 +// assert(rle->mb_cnt ==0); 113.301 + for(i=0; i< mb_width; i++){ 113.302 + if (frames || line>0){ 113.303 + while (rle->mb_cnt >= rle->prev_line->mb_cnt -1); 113.304 + } 113.305 + h264_decode_mb_internal( d, d->mrs, s, &m[i]); 113.306 + rle->mb_cnt++; 113.307 + } 113.308 + draw_edges(d, s, line); 113.309 + 113.310 + return 0; 113.311 +} 113.312 + 113.313 +// static int decode_slice_mb_static(MBRecContext *d, H264Slice *s, RLThreadContext *r, RLThreadContext *rp, int frames){ 113.314 +// int mb_height= d->mb_height; 113.315 +// int mb_width= d->mb_width; 113.316 +// int thread_num = r->thread_num; 113.317 +// int thread_total = r->thread_total; 113.318 +// int i; 113.319 +// int j = thread_num; 113.320 +// 113.321 +// r->mb_cnt=frames* mb_height*mb_width; 113.322 +// for(; j<mb_height; j+=thread_total){ 113.323 +// H264Mb *m = &s->mbs[j*mb_width]; 113.324 +// for(i=0; i< mb_width; i++){ 113.325 +// if (j>0){ 113.326 +// while (r->mb_cnt- (thread_num? 
0:mb_width) >= rp->mb_cnt-1); 113.327 +// } 113.328 +// h264_decode_mb_internal(d, s, m++); 113.329 +// r->mb_cnt++; 113.330 +// } 113.331 +// draw_edges(d, s, j); 113.332 +// } 113.333 +// return 0; 113.334 +// } 113.335 + 113.336 +static void *ed_rec_thread(void *arg){ 113.337 + H264Context *h = (H264Context*) arg; 113.338 + EntropyContext *ec=NULL; 113.339 + MBRecContext *mrc=NULL; 113.340 + 113.341 + RingLineEntry *rle=NULL; 113.342 + SliceBufferEntry *sbe=NULL; 113.343 + H264Slice *s; 113.344 + int rec_token=0; 113.345 + 113.346 + if (!h->no_mbd){ 113.347 + mrc = get_mbrec_context(h); 113.348 + } 113.349 + ec = get_entropy_context(h); 113.350 + 113.351 + for(;;){ 113.352 + pop_next_task(h, &sbe, &rle, &rec_token); 113.353 + if (sbe){ 113.354 + if (h->no_mbd && sbe->state<0){ 113.355 + break; 113.356 + } 113.357 + if (!sbe->initialized){ 113.358 + init_sb_entry(h, sbe); 113.359 + } 113.360 + decode_slice_entropy(ec, sbe); 113.361 + 113.362 + if (h->no_mbd){ 113.363 + release_sb_entry(h, sbe); 113.364 + sbe=NULL; 113.365 + } else { 113.366 + push_sbe(&h->sb_q[REORDER], sbe, 1); 113.367 + } 113.368 + } else if (rle){ 113.369 + if (rle->sbe->state<0) 113.370 + break; 113.371 + s = &rle->sbe->slice; 113.372 + 113.373 + decode_slice_mb(mrc, rle, s->coded_pic_num); 113.374 + 113.375 + if (rle->line == h->mb_height-1){ 113.376 + push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1); 113.377 + } 113.378 + rle->mb_cnt++; 113.379 + } 113.380 + } 113.381 + 113.382 + //make sure threads quit in order of rle assignment 113.383 + if (!h->no_mbd){ 113.384 + while (rle->prev_line->mb_cnt <= h->mb_width); 113.385 + rel_rle(&h->rl_q, &rec_token); 113.386 + notify_one_worker(h); 113.387 + rle->mb_cnt = h->mb_width +1; 113.388 + if (rle->line == h->threads-1){ 113.389 + push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1); 113.390 + } 113.391 + 113.392 + free_mbrec_context(mrc); 113.393 + } 113.394 + 113.395 + free_entropy_context(ec); 113.396 + 113.397 + pthread_exit(NULL); 113.398 + return NULL; 113.399 
+} 113.400 + 113.401 +static void *reorder_thread(void *arg){ 113.402 + H264Context *h = (H264Context *) arg; 113.403 + int i; 113.404 + SliceBufferEntry *reorder[h->sb_size]; 113.405 + SliceBufferEntry *sbe, *next_sbe; 113.406 + H264Slice *s; 113.407 + int reorder_cnt=0; 113.408 + unsigned next_pic_num=0; 113.409 + 113.410 + for(;;){ 113.411 + 113.412 + sbe = pop_sbe(&h->sb_q[REORDER], 1); 113.413 + 113.414 + s = &sbe->slice; 113.415 + for(i=reorder_cnt; i>0; i--){ 113.416 + if (s->coded_pic_num < reorder[i-1]->slice.coded_pic_num) 113.417 + break; 113.418 + reorder[i]=reorder[i-1]; 113.419 + } 113.420 + reorder[i]=sbe; 113.421 + 113.422 + while(reorder_cnt>=0){ 113.423 + if (next_pic_num!=reorder[reorder_cnt]->slice.coded_pic_num){ 113.424 + break; 113.425 + } 113.426 + next_sbe = reorder[reorder_cnt]; 113.427 + H264Slice *es = &next_sbe->slice; 113.428 + 113.429 + if (next_sbe->state<0) 113.430 + goto end; 113.431 + 113.432 + for (int i=0; i<2; i++){ 113.433 + for(int j=0; j< es->ref_count[i]; j++){ 113.434 + if (es->ref_list_cpn[i][j] ==-1) 113.435 + continue; 113.436 + int k; 113.437 + for (k=0; k<h->max_dpb_cnt; k++){ 113.438 + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == es->ref_list_cpn[i][j]){ 113.439 + es->dp_ref_list[i][j] = &h->dpb[k]; 113.440 + break; 113.441 + } 113.442 + } 113.443 + } 113.444 + } 113.445 + next_sbe->dp = get_dpb_entry(h, es); 113.446 + 113.447 + push_sbe(&h->sb_q[MBDEC], next_sbe, 0); 113.448 + notify_all_workers(h); 113.449 + 113.450 +// for (int i=0; i< h->mb_height; i++){ 113.451 +// push_rle(&h->rl_q, next_sbe, i, 0); 113.452 +// notify_one_worker(h); 113.453 +// } 113.454 + 113.455 + 113.456 + next_pic_num++; 113.457 + reorder_cnt--; 113.458 + } 113.459 + reorder_cnt++; 113.460 + } 113.461 + 113.462 +end: 113.463 + { 113.464 + push_sbe(&h->sb_q[MBDEC], next_sbe, 0); 113.465 + notify_all_workers(h); 113.466 + if (h->no_mbd){ 113.467 + push_sbe(&h->sb_q[OUTPUT], next_sbe, 1); 113.468 + } 113.469 +// for (int i=0; i< h->threads; 
i++){ 113.470 +// push_rle(&h->rl_q, next_sbe, i, 0); 113.471 +// notify_one_worker(h); 113.472 +// } 113.473 + } 113.474 + 113.475 + pthread_exit(NULL); 113.476 + return NULL; 113.477 +} 113.478 + 113.479 +void create_ed_rec_threads(H264Context *h){ 113.480 + cpu_set_t cpuset; 113.481 + int* aff; 113.482 + 113.483 + if (h->setaff){ 113.484 + aff = h->smt ? ed_rec_smt_aff : ed_rec_affinity ; 113.485 + for (int i=0; i<h->threads; i++){ 113.486 + pthread_attr_init(&h->ed_rec_attr[i]); 113.487 + CPU_ZERO(&cpuset); 113.488 + CPU_SET(aff[i], &cpuset); 113.489 + pthread_attr_setaffinity_np(&h->ed_rec_attr[i], sizeof(cpu_set_t), &cpuset); 113.490 + pthread_create(&h->ed_rec_thr[i], &h->ed_rec_attr[i], ed_rec_thread, h); 113.491 + } 113.492 + } else { 113.493 + for (int i=0; i<h->threads; i++){ 113.494 + pthread_create(&h->ed_rec_thr[i], NULL, ed_rec_thread, h); 113.495 + } 113.496 + } 113.497 +} 113.498 + 113.499 +void join_ed_rec_threads(H264Context *h){ 113.500 + for (int i=0; i< h->threads; i++){ 113.501 + pthread_join(h->ed_rec_thr[i], NULL); 113.502 + } 113.503 +} 113.504 + 113.505 +void *output_thread(void *arg){ 113.506 + H264Context *h = (H264Context *) arg; 113.507 + 113.508 + OutputContext *oc = get_output_context( h ); 113.509 + 113.510 + SliceBufferEntry *sbe = NULL; 113.511 + H264Slice *s=NULL; 113.512 + for(;;) { 113.513 + DecodedPicture *out, *dp; 113.514 + sbe = pop_sbe(&h->sb_q[OUTPUT], 1); 113.515 + 113.516 + if (sbe->state <0) 113.517 + break; 113.518 + 113.519 + s = &sbe->slice; 113.520 + for (int i=0; i<s->release_cnt; i++){ 113.521 + for(int j=0; j<h->max_dpb_cnt; j++){ 113.522 + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ 113.523 + release_dpb_entry(h, &h->dpb[j], 2); 113.524 + break; 113.525 + } 113.526 + } 113.527 + } 113.528 + 113.529 + dp=sbe->dp; 113.530 + release_sb_entry(h, sbe); 113.531 + 113.532 + out =output_frame(h, oc, dp, h->ofile, h->frame_width, h->frame_height); 113.533 + if (out){ 113.534 + release_dpb_entry(h, out, 1); 113.535 + } 
113.536 + 113.537 + print_report(oc->frame_number, oc->video_size, 0, h->verbose); 113.538 + 113.539 + } 113.540 + /* at the end of stream, we must flush the decoder buffers */ 113.541 + while (output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height)); 113.542 + print_report(oc->frame_number, oc->video_size, 1, h->verbose); 113.543 + 113.544 + free_output_context(oc); 113.545 + 113.546 + pthread_exit(NULL); 113.547 + return NULL; 113.548 +} 113.549 + 113.550 +/* 113.551 +* The following code is the main loop of the file converter 113.552 +*/ 113.553 +int h264_decode_pthread(H264Context *h) { 113.554 + pthread_t parse_thr, reorder_thr, output_thr; 113.555 + 113.556 + av_start_timer(); 113.557 + 113.558 + pthread_create(&parse_thr, NULL, parse_thread, h); 113.559 + if (!h->no_mbd){ 113.560 + pthread_create(&reorder_thr, NULL, reorder_thread, h); 113.561 + pthread_create(&output_thr, NULL, output_thread, h); 113.562 + } 113.563 +#if HAVE_LIBSDL2 113.564 + pthread_t sdl_thr; 113.565 + if (h->display){ 113.566 + pthread_create(&sdl_thr, NULL, sdl_thread, h); 113.567 + } 113.568 +#endif 113.569 + create_ed_rec_threads(h); 113.570 + 113.571 + 113.572 + if (h->rl_side_touch){ 113.573 + pthread_mutex_lock(&h->ilock); 113.574 + while (h->init_threads< h->threads) 113.575 + pthread_cond_wait(&h->icond, &h->ilock); 113.576 + pthread_mutex_unlock(&h->ilock); 113.577 + 113.578 + pthread_mutex_lock(&h->tlock); 113.579 + h->touch_start =1; 113.580 + pthread_cond_broadcast(&h->tcond); 113.581 + pthread_mutex_unlock(&h->tlock); 113.582 + 113.583 + pthread_mutex_lock(&h->tdlock); 113.584 + while (h->touch_done < h->threads) 113.585 + pthread_cond_wait(&h->tdcond, &h->tdlock); 113.586 + pthread_mutex_unlock(&h->tdlock); 113.587 + 113.588 + pthread_mutex_lock(&h->slock); 113.589 + h->start =1; 113.590 + pthread_cond_broadcast(&h->scond); 113.591 + pthread_mutex_unlock(&h->slock); 113.592 + } 113.593 + join_ed_rec_threads(h); 113.594 + pthread_join(parse_thr, NULL); 113.595 
+ if (!h->no_mbd){ 113.596 + pthread_join(reorder_thr, NULL); 113.597 + pthread_join(output_thr, NULL); 113.598 + } 113.599 +#if HAVE_LIBSDL2 113.600 + if (h->display) 113.601 + signal_sdl_exit(h); 113.602 + pthread_join(sdl_thr, NULL); 113.603 +#endif 113.604 + 113.605 + 113.606 + return 0; 113.607 +}
114.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 114.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pthread.h Mon Aug 27 12:09:56 2012 +0200 114.3 @@ -0,0 +1,14 @@ 114.4 +#ifndef H264_PTHREAD_H 114.5 +#define H264_PTHREAD_H 114.6 + 114.7 +#include "h264_types.h" 114.8 + 114.9 +int decode_B_slice_entropy(EntropyContext *ec, EDSlice *s, EDThreadContext *eb, EDThreadContext *eb_prev); 114.10 +int decode_slice_entropy(EntropyContext *hc, EDSlice *s); 114.11 + 114.12 +void *read_thread(void *arg); 114.13 +void *parsenal_thread(void *arg); 114.14 +void *mbrec_thread(void *arg); 114.15 +void *write_thread(void *arg); 114.16 + 114.17 +#endif
115.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 115.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_rec.c Mon Aug 27 12:09:56 2012 +0200 115.3 @@ -0,0 +1,412 @@ 115.4 +#include "config.h" 115.5 + 115.6 +#include "dsputil.h" 115.7 +#include "h264_types.h" 115.8 +#include "h264_data.h" 115.9 +#include "h264_mc.h" 115.10 +#include "h264_deblock.h" 115.11 +#include "h264_pred_mode.h" 115.12 +//#undef NDEBUG 115.13 +#include <assert.h> 115.14 + 115.15 +void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line){ 115.16 + DecodedPicture *pic = s->curr_pic; 115.17 + int mb_stride = mrc->mb_stride; 115.18 + int mb_width = mrc->mb_width; 115.19 + mrs->mb_type_top = pic->mb_type + (line -1)*mb_stride; 115.20 + mrs->mb_type = pic->mb_type + line*mb_stride; 115.21 + mrs->ref_index_top[0] = pic->ref_index[0] + 4*(line -1)*mb_stride; 115.22 + mrs->ref_index_top[1] = pic->ref_index[1] + 4*(line -1)*mb_stride; 115.23 + mrs->ref_index[0] = pic->ref_index[0] + 4*line*mb_stride; 115.24 + mrs->ref_index[1] = pic->ref_index[1] + 4*line*mb_stride; 115.25 + 115.26 + mrs->motion_val_top[0] = pic->motion_val[0] + 4*mb_width*4*(line-1); 115.27 + mrs->motion_val_top[1] = pic->motion_val[1] + 4*mb_width*4*(line-1); 115.28 + mrs->motion_val[0] = pic->motion_val[0] + 4*mb_width*4*line; 115.29 + mrs->motion_val[1] = pic->motion_val[1] + 4*mb_width*4*line; 115.30 + 115.31 + mrs->intra4x4_pred_mode_top = pic->intra4x4_pred_mode + 4*mb_width*(line-1); 115.32 + mrs->intra4x4_pred_mode = pic->intra4x4_pred_mode + 4*mb_width*line; 115.33 + 115.34 + mrs->non_zero_count_top = pic->non_zero_count + 8*mb_width*(line-1); 115.35 + mrs->non_zero_count = pic->non_zero_count + 8*mb_width*line; 115.36 + 115.37 + if (s->slice_type_nos == FF_B_TYPE){ 115.38 + mrs->list1_mb_type = s->dp_ref_list[1][0]->mb_type + line*mb_stride; 115.39 + mrs->list1_ref_index[0] = s->dp_ref_list[1][0]->ref_index[0] + 4*line*mb_stride; 115.40 + mrs->list1_ref_index[1] = s->dp_ref_list[1][0]->ref_index[1] + 
4*line*mb_stride; 115.41 + mrs->list1_motion_val[0] = s->dp_ref_list[1][0]->motion_val[0] + 4*mb_width*4*line; 115.42 + mrs->list1_motion_val[1] = s->dp_ref_list[1][0]->motion_val[1] + 4*mb_width*4*line; 115.43 + } 115.44 + 115.45 +} 115.46 + 115.47 +#if OMPSS 115.48 +static void backup_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ 115.49 + int i; 115.50 + uint8_t * top_border_y1 = m->top_border; 115.51 + uint8_t * top_border_y2 = m->top_border + 8; 115.52 + uint8_t * top_border_cb = m->top_border + 16; 115.53 + uint8_t * top_border_cr = m->top_border + 24; 115.54 + uint8_t * top_border_next = m->top_border_next; 115.55 + 115.56 + src_y -= linesize; 115.57 + src_cb -= uvlinesize; 115.58 + src_cr -= uvlinesize; 115.59 + 115.60 + m->left_border[0]= m->top_border[15]; 115.61 + for(i=1; i<17 ; i++){ 115.62 + m->left_border[i]= src_y[15 + i*linesize]; 115.63 + } 115.64 + 115.65 + *(uint64_t*)(top_border_y1) = *(uint64_t*)(src_y + 16*linesize); 115.66 + *(uint64_t*)(top_border_next) = *(uint64_t*)(src_y + 16*linesize); 115.67 + *(uint64_t*)(top_border_y2) = *(uint64_t*)(src_y +8+16*linesize); 115.68 + 115.69 + m->left_border[17]= m->top_border[16+7]; 115.70 + m->left_border[17+9]= m->top_border[24+7]; 115.71 + for(i=1; i<9; i++){ 115.72 + m->left_border[17 +i]= src_cb[7+i*uvlinesize]; 115.73 + m->left_border[17+9+i]= src_cr[7+i*uvlinesize]; 115.74 + } 115.75 + *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); 115.76 + *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); 115.77 +} 115.78 + 115.79 +static void xchg_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ 115.80 + int temp8, i; 115.81 + uint64_t temp64; 115.82 + 115.83 + uint8_t * top_border_y1 = m->top_border; 115.84 + uint8_t * top_border_y2 = m->top_border + 8; 115.85 + uint8_t * top_border_cb = m->top_border + 16; 115.86 + uint8_t * top_border_cr = m->top_border + 24; 
115.87 + uint8_t * top_border_next = m->top_border_next; 115.88 + 115.89 + int deblock_left; 115.90 + int deblock_top; 115.91 + 115.92 + deblock_left = (m->mb_x > 0); 115.93 + deblock_top = (m->mb_y > 0); 115.94 + 115.95 + src_y -= ( linesize + 1); 115.96 + src_cb -= (uvlinesize + 1); 115.97 + src_cr -= (uvlinesize + 1); 115.98 + 115.99 + #define XCHG(a,b,t,xchg)\ 115.100 + t= a;\ 115.101 + if(xchg)\ 115.102 + a= b;\ 115.103 + b= t; 115.104 + 115.105 + if(deblock_left){ 115.106 + for(i = !deblock_top; i<16; i++){ 115.107 + XCHG(m->left_border[i], src_y [i* linesize], temp8, xchg); 115.108 + } 115.109 + XCHG(m->left_border[i], src_y [i* linesize], temp8, 1); 115.110 + 115.111 + for(i = !deblock_top; i<8; i++){ 115.112 + XCHG(m->left_border[17 +i], src_cb[i*uvlinesize], temp8, xchg); 115.113 + XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, xchg); 115.114 + } 115.115 + XCHG(m->left_border[17 +i], src_cb[i*uvlinesize], temp8, 1); 115.116 + XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, 1); 115.117 + } 115.118 + 115.119 + if(deblock_top){ 115.120 + XCHG(*(uint64_t*)(top_border_y1) , *(uint64_t*)(src_y +1), temp64, xchg); 115.121 + XCHG(*(uint64_t*)(top_border_y2) , *(uint64_t*)(src_y +9), temp64, 1); 115.122 + XCHG(*(uint64_t*)(top_border_next), *(uint64_t*)(src_y +17), temp64, 1); 115.123 + 115.124 + XCHG(*(uint64_t*)(top_border_cb) , *(uint64_t*)(src_cb+1), temp64, 1); 115.125 + XCHG(*(uint64_t*)(top_border_cr) , *(uint64_t*)(src_cr+1), temp64, 1); 115.126 + } 115.127 +} 115.128 +#else 115.129 + 115.130 +static void backup_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ 115.131 + int i; 115.132 + uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y; 115.133 + uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb; 115.134 + uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr; 115.135 + 115.136 + uint8_t* left_border_y = d->left.unfiltered_y; 115.137 + uint8_t* 
left_border_cb = d->left.unfiltered_cb; 115.138 + uint8_t* left_border_cr = d->left.unfiltered_cr; 115.139 + 115.140 + src_y -= linesize; 115.141 + src_cb -= uvlinesize; 115.142 + src_cr -= uvlinesize; 115.143 + 115.144 + // There are two lines saved, the line above the top macroblock of a pair, 115.145 + // and the line above the bottom macroblock 115.146 + left_border_y[0] = top_border_y[15]; 115.147 + for(i=1; i<17; i++){ 115.148 + left_border_y[i] = src_y[15+i* linesize]; 115.149 + } 115.150 + *(uint64_t*)(top_border_y ) = *(uint64_t*)(src_y + 16*linesize); 115.151 + *(uint64_t*)(top_border_y +8) = *(uint64_t*)(src_y +8+16*linesize); 115.152 + 115.153 + left_border_cb[0] = top_border_cb[7]; 115.154 + left_border_cr[0] = top_border_cr[7]; 115.155 + for(i=1; i<9; i++){ 115.156 + left_border_cb[i] = src_cb[7+i*uvlinesize]; 115.157 + left_border_cr[i] = src_cr[7+i*uvlinesize]; 115.158 + } 115.159 + *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); 115.160 + *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); 115.161 +} 115.162 + 115.163 +static void xchg_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ 115.164 + 115.165 + int temp8, i; 115.166 + uint64_t temp64; 115.167 + int deblock_left; 115.168 + int deblock_top; 115.169 + 115.170 + uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y; 115.171 + uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb; 115.172 + uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr; 115.173 + uint8_t* top_border_y_next = d->top[m->mb_x +1].unfiltered_y; 115.174 + 115.175 + uint8_t* left_border_y = d->left.unfiltered_y; 115.176 + uint8_t* left_border_cb = d->left.unfiltered_cb; 115.177 + uint8_t* left_border_cr = d->left.unfiltered_cr; 115.178 + 115.179 + deblock_left = (m->mb_x > 0); 115.180 + deblock_top = (m->mb_y > 0); 115.181 + 115.182 + src_y -= ( linesize + 1); 115.183 + src_cb -= (uvlinesize + 1); 115.184 + src_cr -= 
(uvlinesize + 1); 115.185 + 115.186 + #define XCHG(a,b,t,xchg)\ 115.187 + t= a;\ 115.188 + if(xchg)\ 115.189 + a= b;\ 115.190 + b= t; 115.191 + 115.192 + if(deblock_left){ 115.193 + for(i = !deblock_top; i<16; i++){ 115.194 + XCHG(left_border_y[i], src_y [i* linesize], temp8, xchg); 115.195 + } 115.196 + XCHG(left_border_y[i], src_y [i* linesize], temp8, 1); 115.197 + 115.198 + for(i = !deblock_top; i<8; i++){ 115.199 + XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg); 115.200 + XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg); 115.201 + } 115.202 + XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1); 115.203 + XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1); 115.204 + } 115.205 + 115.206 + if(deblock_top){ 115.207 + XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg); 115.208 + XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1); 115.209 + if(m->mb_x+1 < d->mb_width){ 115.210 + XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1); 115.211 + } 115.212 + XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1); 115.213 + XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1); 115.214 + } 115.215 +} 115.216 + 115.217 +#endif 115.218 + 115.219 +void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m){ 115.220 + int i; 115.221 + const int mb_x= m->mb_x; 115.222 + const int mb_y= m->mb_y; 115.223 + int *block_offset = d->block_offset; 115.224 + 115.225 + void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); 115.226 + void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); 115.227 + 115.228 + int linesize = d->linesize; 115.229 + int uvlinesize = d->uvlinesize; 115.230 + 115.231 + uint8_t *dest_y = s->curr_pic->data[0] + (mb_x + mb_y * linesize ) * 16; 115.232 + uint8_t *dest_cb = s->curr_pic->data[1] + (mb_x + mb_y * uvlinesize) * 8; 115.233 + uint8_t *dest_cr = s->curr_pic->data[2] + (mb_x + mb_y * uvlinesize) 
* 8; 115.234 + 115.235 + pred_motion_mb_rec (d, mrs, s, m); 115.236 + 115.237 + const int mb_type= m->mb_type; 115.238 + 115.239 + d->dsp.prefetch(dest_y + (m->mb_x&3)*4*linesize + 64, d->linesize, 4); 115.240 + d->dsp.prefetch(dest_cb + (m->mb_x&7)*uvlinesize + 64, dest_cr - dest_cb, 2); 115.241 + 115.242 + if(IS_INTRA(mb_type)){ 115.243 +#if OMPSS 115.244 + xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); 115.245 +#else 115.246 + xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1); 115.247 +#endif 115.248 + 115.249 + d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cb, uvlinesize); 115.250 + d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cr, uvlinesize); 115.251 + 115.252 + if(IS_INTRA4x4(mb_type)){ 115.253 + if(IS_8x8DCT(mb_type)){ 115.254 + idct_dc_add = d->hdsp.h264_idct8_dc_add; 115.255 + idct_add = d->hdsp.h264_idct8_add; 115.256 + 115.257 + for(i=0; i<16; i+=4){ 115.258 + uint8_t * const ptr= dest_y + block_offset[i]; 115.259 + const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ]; 115.260 + 115.261 + const int nnz = mrs->non_zero_count_cache[ scan8[i] ]; 115.262 + d->hpc.pred8x8l[ dir ](ptr, (mrs->topleft_samples_available<<i)&0x8000, 115.263 + (mrs->topright_samples_available<<i)&0x4000, linesize); 115.264 + if(nnz){ 115.265 + if(nnz == 1 && m->mb[i*16]) 115.266 + idct_dc_add(ptr, m->mb + i*16, linesize); 115.267 + else 115.268 + idct_add (ptr, m->mb + i*16, linesize); 115.269 + } 115.270 + } 115.271 + }else{ 115.272 + idct_dc_add = d->hdsp.h264_idct_dc_add; 115.273 + idct_add = d->hdsp.h264_idct_add; 115.274 + 115.275 + for(i=0; i<16; i++){ 115.276 + uint8_t * const ptr= dest_y + block_offset[i]; 115.277 + const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ]; 115.278 + uint8_t *topright; 115.279 + int nnz, tr; 115.280 + if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ 115.281 + const int topright_avail= (mrs->topright_samples_available<<i)&0x8000; 115.282 + assert(mb_y || linesize <= block_offset[i]); 115.283 
+ if(!topright_avail){ 115.284 + tr= ptr[3 - linesize]*0x01010101; 115.285 + topright= (uint8_t*) &tr; 115.286 + }else 115.287 + topright= ptr + 4 - linesize; 115.288 + }else 115.289 + topright= NULL; 115.290 + 115.291 + d->hpc.pred4x4[ dir ](ptr, topright, linesize); 115.292 + nnz = mrs->non_zero_count_cache[ scan8[i] ]; 115.293 + if(nnz){ 115.294 + if(nnz == 1 && m->mb[i*16]) 115.295 + idct_dc_add(ptr, m->mb + i*16, linesize); 115.296 + else 115.297 + idct_add (ptr, m->mb + i*16, linesize); 115.298 + } 115.299 + } 115.300 + } 115.301 + }else{ 115.302 + d->hpc.pred16x16[ m->intra16x16_pred_mode ](dest_y , linesize); 115.303 + } 115.304 +#if OMPSS 115.305 + xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); 115.306 +#else 115.307 + xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0); 115.308 +#endif 115.309 + }else { 115.310 + hl_motion(d, mrs, s, m, dest_y, dest_cb, dest_cr, 115.311 + d->hdsp.qpel_put, d->dsp.put_h264_chroma_pixels_tab, 115.312 + d->hdsp.qpel_avg, d->dsp.avg_h264_chroma_pixels_tab, 115.313 + d->hdsp.weight_h264_pixels_tab, d->hdsp.biweight_h264_pixels_tab); 115.314 + } 115.315 + 115.316 + if(!IS_INTRA4x4(mb_type)){ 115.317 + 115.318 + if(IS_INTRA16x16(mb_type)){ 115.319 + 115.320 + d->hdsp.h264_idct_add16intra(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache); 115.321 + 115.322 + }else if(m->cbp&15){ 115.323 + 115.324 + if(IS_8x8DCT(mb_type)){ 115.325 + d->hdsp.h264_idct8_add4(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache); 115.326 + }else{ 115.327 + d->hdsp.h264_idct_add16(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache); 115.328 + } 115.329 + } 115.330 + } 115.331 + 115.332 + if(m->cbp&0x30){ 115.333 + uint8_t *dest[2] = {dest_cb, dest_cr}; 115.334 + 115.335 + idct_add = d->hdsp.h264_idct_add; 115.336 + idct_dc_add = d->hdsp.h264_idct_dc_add; 115.337 + for(i=16; i<16+8; i++){ 115.338 + if(mrs->non_zero_count_cache[ scan8[i] ]) 115.339 + idct_add 
(dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize); 115.340 + else if(m->mb[i*16]) 115.341 + idct_dc_add(dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize); 115.342 + } 115.343 + } 115.344 + 115.345 +#if OMPSS 115.346 + backup_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize); 115.347 + if (mb_x+1 <d->mb_width){ 115.348 + H264Mb *mr = m+1; 115.349 + memcpy(mr->left_border, m->left_border, sizeof(m->left_border)); 115.350 + } 115.351 + if (mb_y +1 <d->mb_height){ 115.352 + H264Mb *md = m + d->mb_width; 115.353 + memcpy(md->top_border, m->top_border, sizeof(m->top_border)); 115.354 + if (mb_x>0){ 115.355 + H264Mb *mdl = m + d->mb_width -1; 115.356 + memcpy(mdl->top_border_next, m->top_border_next, sizeof(m->top_border_next)); 115.357 + } 115.358 + } 115.359 +#else 115.360 + backup_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize); 115.361 + if (mb_y +1 <d->mb_height && d->top_next != d->top){ 115.362 + memcpy(&d->top_next[mb_x],&d->top[mb_x], sizeof(TopBorder)); 115.363 + } 115.364 +#endif 115.365 + 115.366 + ff_h264_filter_mb(d, mrs, s, m, dest_y, dest_cb, dest_cr); 115.367 +} 115.368 + 115.369 +MBRecContext *get_mbrec_context(H264Context *h){ 115.370 + MBRecContext *d = av_mallocz(sizeof(MBRecContext)); 115.371 + 115.372 + ff_h264dsp_init(&d->hdsp); 115.373 + ff_h264_pred_init(&d->hpc); 115.374 + dsputil_init(&d->dsp); 115.375 + 115.376 +#if !OMPSS 115.377 + d->mrs = av_mallocz(sizeof(MBRecState)); 115.378 +#endif 115.379 + d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab; 115.380 + d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab; 115.381 + d->mb_height = h->mb_height; 115.382 + d->mb_width = h->mb_width; 115.383 + d->mb_stride = h->mb_stride; 115.384 + d->b_stride = h->b_stride; 115.385 + d->height = h->height; 115.386 + d->width = h->width; 115.387 + d->linesize = h->width + EDGE_WIDTH*2; 115.388 + d->uvlinesize = d->linesize>>1; 115.389 + 115.390 + d->scratchpad_y = av_malloc(d->linesize*16*sizeof(uint8_t)); 115.391 
+ d->scratchpad_cb= av_malloc(d->uvlinesize*8*sizeof(uint8_t)); 115.392 + d->scratchpad_cr= av_malloc(d->uvlinesize*8*sizeof(uint8_t)); 115.393 + 115.394 + for (int i=0; i<16; i++){ 115.395 + d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3); 115.396 + } 115.397 + for (int i=0; i<4; i++){ 115.398 + d->block_offset[16+i]= 115.399 + d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3); 115.400 + } 115.401 + 115.402 + 115.403 + 115.404 + return d; 115.405 +} 115.406 + 115.407 +void free_mbrec_context(MBRecContext *d){ 115.408 +#if !OMPSS 115.409 + av_free(d->mrs); 115.410 +#endif 115.411 + av_free(d->scratchpad_y); 115.412 + av_free(d->scratchpad_cb); 115.413 + av_free(d->scratchpad_cr); 115.414 + av_free(d); 115.415 +}
116.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 116.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_rec.h Mon Aug 27 12:09:56 2012 +0200 116.3 @@ -0,0 +1,12 @@ 116.4 +#ifndef H264_REC_H 116.5 +#define H264_REC_H 116.6 + 116.7 +#include "h264_types.h" 116.8 + 116.9 +MBRecContext *get_mbrec_context(H264Context *h); 116.10 +void free_mbrec_context( MBRecContext *d); 116.11 +void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m); 116.12 + 116.13 +void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line); 116.14 + 116.15 +#endif
117.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 117.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_refs.c Mon Aug 27 12:09:56 2012 +0200 117.3 @@ -0,0 +1,461 @@ 117.4 +/* 117.5 + * H.26L/H.264/AVC/JVT/14496-10/... reference picture handling 117.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 117.7 + * 117.8 + * This file is part of FFmpeg. 117.9 + * 117.10 + * FFmpeg is free software; you can redistribute it and/or 117.11 + * modify it under the terms of the GNU Lesser General Public 117.12 + * License as published by the Free Software Foundation; either 117.13 + * version 2.1 of the License, or (at your option) any later version. 117.14 + * 117.15 + * FFmpeg is distributed in the hope that it will be useful, 117.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 117.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 117.18 + * Lesser General Public License for more details. 117.19 + * 117.20 + * You should have received a copy of the GNU Lesser General Public 117.21 + * License along with FFmpeg; if not, write to the Free Software 117.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 117.23 + */ 117.24 + 117.25 +/** 117.26 + * @file 117.27 + * H.264 / AVC / MPEG4 part10 reference picture handling. 117.28 + * @author Michael Niedermayer <michaelni@gmx.at> 117.29 + */ 117.30 + 117.31 +#include "dsputil.h" 117.32 +#include "h264_types.h" 117.33 +#include "golomb.h" 117.34 + 117.35 +//#undef NDEBUG 117.36 +#include <assert.h> 117.37 + 117.38 +static int build_def_list(PictureInfo **def, PictureInfo **in, int len, int is_long){ 117.39 + int i[2]={0}; 117.40 + int index=0; 117.41 + 117.42 + while(i[0]<len || i[1]<len){ 117.43 + while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference))) 117.44 + i[0]++; 117.45 + while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & 0))) 117.46 + i[1]++; 117.47 + if(i[0] < len){ 117.48 + in[ i[0] ]->pic_id= is_long ? 
i[0] : in[ i[0] ]->frame_num; 117.49 + def[index++]= in[ i[0]++ ]; 117.50 + } 117.51 + if(i[1] < len){ 117.52 + in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num; 117.53 + def[index++]= in[ i[1]++ ]; 117.54 + } 117.55 + } 117.56 + 117.57 + return index; 117.58 +} 117.59 + 117.60 +static int add_sorted(PictureInfo **sorted, PictureInfo **src, int len, int limit, int dir){ 117.61 + int i, best_poc; 117.62 + int out_i= 0; 117.63 + 117.64 + for(;;){ 117.65 + best_poc= dir ? INT_MIN : INT_MAX; 117.66 + 117.67 + for(i=0; i<len; i++){ 117.68 + const int poc= src[i]->poc; 117.69 + if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){ 117.70 + best_poc= poc; 117.71 + sorted[out_i]= src[i]; 117.72 + } 117.73 + } 117.74 + if(best_poc == (dir ? INT_MIN : INT_MAX)) 117.75 + break; 117.76 + limit= sorted[out_i++]->poc - dir; 117.77 + } 117.78 + return out_i; 117.79 +} 117.80 + 117.81 +int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s){ 117.82 + int i,len; 117.83 + 117.84 + if(s->slice_type_nos==FF_B_TYPE){ 117.85 + PictureInfo *sorted[32]; 117.86 + int cur_poc, list; 117.87 + int lens[2]; 117.88 + 117.89 + cur_poc= s->poc; 117.90 + 117.91 + for(list= 0; list<2; list++){ 117.92 + len= add_sorted(sorted, n->short_ref, n->short_ref_count, cur_poc, !list); 117.93 + len+=add_sorted(sorted+len, n->short_ref, n->short_ref_count, cur_poc, list); 117.94 + assert(len<=32); 117.95 + len= build_def_list(s->ref_list[list], sorted, len, 0); 117.96 + len+=build_def_list(s->ref_list[list] +len, n->long_ref, 16 , 1); 117.97 + assert(len<=32); 117.98 + 117.99 + for(int i=len; i<s->ref_count[list]; i++) 117.100 + s->ref_list[list][i] = NULL; 117.101 + 117.102 + lens[list]= len; 117.103 + } 117.104 + 117.105 + if(lens[0] == lens[1] && lens[1] > 1){ 117.106 + for(i=0; s->ref_list[0][i]->poc == s->ref_list[1][i]->poc && i<lens[0]; i++); 117.107 + 117.108 + if(i == lens[0]) 117.109 + FFSWAP(PictureInfo *, s->ref_list[1][0], s->ref_list[1][1]); 117.110 + } 117.111 + }else{ 117.112 
+ len = build_def_list(s->ref_list[0], n->short_ref, n->short_ref_count, 0); 117.113 + len+= build_def_list(s->ref_list[0] +len, n->long_ref, 16, 1); 117.114 + assert(len <= 32); 117.115 + for(i=len; i<s->ref_count[0]; i++) 117.116 + s->ref_list[0][i] = NULL; 117.117 + } 117.118 + 117.119 + return 0; 117.120 +} 117.121 + 117.122 +/** 117.123 +* print short term list 117.124 +*/ 117.125 +static void print_short_term(NalContext *n) { 117.126 + av_log(AV_LOG_DEBUG, "short term list:\n"); 117.127 + for(int i=0; i<n->short_ref_count; i++){ 117.128 + PictureInfo *pic= n->short_ref[i]; 117.129 + av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d ref:%d \n", i, pic->frame_num, pic->poc, pic->reference); 117.130 + } 117.131 +} 117.132 + 117.133 +/** 117.134 +* print long term list 117.135 +*/ 117.136 +static void print_long_term(NalContext *n) { 117.137 + uint32_t i; 117.138 + 117.139 + av_log(AV_LOG_DEBUG, "long term list:\n"); 117.140 + for(i = 0; i < 16; i++){ 117.141 + PictureInfo *pic= n->long_ref[i]; 117.142 + if (pic) { 117.143 + av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d\n", i, pic->frame_num, pic->poc); 117.144 + } 117.145 + } 117.146 +} 117.147 + 117.148 +int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb){ 117.149 + int list, index; 117.150 + 117.151 + print_short_term(n); 117.152 + print_long_term(n); 117.153 + 117.154 + for(list=0; list<s->list_count; list++){ 117.155 + 117.156 + if(get_bits1(gb)){ 117.157 + int frame_num = n->frame_num; 117.158 + unsigned int abs_diff_pic_num; 117.159 + for(index=0; ; index++){ 117.160 + unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(gb); 117.161 + int i=0; 117.162 + PictureInfo *ref = NULL; 117.163 + 117.164 + if(reordering_of_pic_nums_idc==3){ 117.165 + break; 117.166 + } 117.167 + if(index >= s->ref_count[list]){ 117.168 + av_log(AV_LOG_ERROR, "reference count overflow\n"); 117.169 + return -1; 117.170 + } 117.171 + 117.172 + if (reordering_of_pic_nums_idc>2){ 117.173 + av_log(AV_LOG_ERROR, 
"illegal reordering_of_pic_nums_idc\n"); 117.174 + return -1; 117.175 + } 117.176 + 117.177 + if (reordering_of_pic_nums_idc<2){ 117.178 + //av_log(AV_LOG_ERROR, "long term pic not supported\n"); 117.179 + 117.180 + abs_diff_pic_num= get_ue_golomb(gb) + 1; 117.181 + if(abs_diff_pic_num > (unsigned) n->max_pic_num){ 117.182 + av_log(AV_LOG_ERROR, "abs_diff_pic_num overflow\n"); 117.183 + return -1; 117.184 + } 117.185 + 117.186 + if(reordering_of_pic_nums_idc == 0) 117.187 + frame_num-= abs_diff_pic_num; 117.188 + else 117.189 + frame_num+= abs_diff_pic_num; 117.190 + frame_num &= n->max_pic_num - 1; 117.191 + 117.192 + for(i= 0 ; i<n->short_ref_count; i++){ 117.193 + ref = n->short_ref[i]; 117.194 + if(ref->frame_num == frame_num && ref->reference){ 117.195 + break; 117.196 + } 117.197 + } 117.198 + ref->pic_id= frame_num; 117.199 + }else{ 117.200 + int long_idx; 117.201 + long_idx= get_ue_golomb(gb); //long_term_pic_idx 117.202 + 117.203 + if(long_idx>31){ 117.204 + av_log(AV_LOG_ERROR, "long_term_pic_idx overflow\n"); 117.205 + return -1; 117.206 + } 117.207 + ref = n->long_ref[long_idx]; 117.208 + assert(!(ref && !ref->reference)); 117.209 + if(ref && (ref->reference)){ 117.210 + ref->pic_id= long_idx; 117.211 + assert(ref->long_ref); 117.212 + }else{ 117.213 + av_log(AV_LOG_ERROR, "reference picture missing during reorder\n"); 117.214 + } 117.215 + } 117.216 + 117.217 + if (i >= n->short_ref_count) { 117.218 + av_log(AV_LOG_ERROR, "reference picture missing during reorder\n"); 117.219 + return -1; 117.220 + } else { 117.221 + for(i=index; i+1 <s->ref_count[list]; i++){ 117.222 + 117.223 +// if(ref->frame_num == s->ref_list[list][i]->frame_num) 117.224 +// break; 117.225 + ///there is probably no need for a separate pic_id and frame_num 117.226 + if (s->ref_list[list][i]){ 117.227 + 117.228 + if(ref->long_ref == s->ref_list[list][i]->long_ref && ref->pic_id == s->ref_list[list][i]->pic_id) 117.229 + break; 117.230 + } 117.231 + } 117.232 + for(; i > index; i--){ 
117.233 + s->ref_list[list][i]= s->ref_list[list][i-1]; 117.234 + } 117.235 + s->ref_list[list][index]= ref; 117.236 + } 117.237 + } 117.238 + } 117.239 + } 117.240 + 117.241 +// //Check if everything went well 117.242 +// for(list=0; list<s->list_count; list++){ 117.243 +// //printf("ref_count %d list %d\n", s->ref_count[list], list); 117.244 +// for(index= 0; index < s->ref_count[list]; index++){ 117.245 +// //printf("%d\n", s->ref_list[list][index]->pic_id); 117.246 +// if(!s->ref_list[list][index]->data[0]){ 117.247 +// av_log(AV_LOG_ERROR, "Missing reference picture\n"); 117.248 +// return -1; 117.249 +// } 117.250 +// } 117.251 +// } 117.252 + 117.253 + return 0; 117.254 +} 117.255 + 117.256 +static PictureInfo *find_short(NalContext *n, int frame_num){ 117.257 + int i; 117.258 + for(i=0; i<n->short_ref_count; i++){ 117.259 + if(n->short_ref[i]->frame_num == frame_num) { 117.260 + return n->short_ref[i]; 117.261 + } 117.262 + } 117.263 + return NULL; 117.264 +} 117.265 + 117.266 +static int remove_short(NalContext *n, H264Slice *s, int frame_num, int release){ 117.267 + int i; 117.268 + 117.269 + for (i=0; i<n->short_ref_count; i++){ 117.270 + if (n->short_ref[i]->frame_num == frame_num){ 117.271 + if (release){ 117.272 + s->release_ref_cpn[s->release_cnt++] = n->short_ref[i]->cpn; 117.273 + n->short_ref[i]->reference &= ~2; 117.274 + } 117.275 + n->short_ref[i] = NULL; 117.276 + if (--n->short_ref_count) 117.277 + memmove(&n->short_ref[i], &n->short_ref[i+1], (n->short_ref_count - i)*sizeof(PictureInfo *)); 117.278 + return 0; 117.279 + } 117.280 + } 117.281 + return -1; 117.282 +} 117.283 + 117.284 +static void remove_long(NalContext *n, H264Slice *s, int i){ 117.285 + 117.286 + if (n->long_ref[i]){ 117.287 + s->release_ref_cpn[s->release_cnt++] = n->long_ref[i]->cpn; 117.288 + n->long_ref[i]->reference &= ~2; 117.289 + n->long_ref[i]->long_ref = 0; 117.290 + n->long_ref_count--; 117.291 + n->long_ref[i] = NULL; 117.292 + } 117.293 +} 117.294 + 117.295 
+void ff_h264_remove_all_refs(NalContext *n, H264Slice *s){ 117.296 + int i; 117.297 + 117.298 + while (n->short_ref[0]) 117.299 + remove_short(n, s, n->short_ref[0]->frame_num, 1); 117.300 + 117.301 + for(i=0; i<16; i++){ 117.302 + remove_long(n, s, i); 117.303 + } 117.304 + assert(n->short_ref_count==0); 117.305 + assert(n->long_ref_count==0); 117.306 +} 117.307 + 117.308 +int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb){ 117.309 + 117.310 + if(s->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields 117.311 + get_bits1(gb); //get_bits1(gb) -1; //broken link 117.312 + if(get_bits1(gb)){ 117.313 + av_log(AV_LOG_ERROR, "MMCO_LONG reference management not supported\n"); 117.314 + } 117.315 + }else{ 117.316 + if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag 117.317 + int i,j; 117.318 + for(i= 0; i<MAX_MMCO_COUNT; i++) { 117.319 + PictureInfo *pic; 117.320 + int short_pic_num=0; 117.321 + unsigned int long_arg=0; 117.322 + MMCOOpcode opcode= get_ue_golomb_31(gb); 117.323 + 117.324 + if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){ 117.325 + short_pic_num= (n->frame_num - get_ue_golomb(gb) - 1) & (n->max_pic_num - 1); 117.326 + } 117.327 + if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){ 117.328 + long_arg= get_ue_golomb_31(gb); 117.329 + if(long_arg >= 16){ 117.330 + av_log(AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode); 117.331 + return -1; 117.332 + } 117.333 + } 117.334 + 117.335 + if(opcode > (unsigned)MMCO_LONG){ 117.336 + av_log(AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode); 117.337 + return -1; 117.338 + } 117.339 + if(opcode == MMCO_END) 117.340 + break; 117.341 + 117.342 + switch (opcode){ 117.343 + case MMCO_SHORT2UNUSED: 117.344 + remove_short(n, s, short_pic_num, 1); 117.345 + break; 117.346 + case MMCO_SHORT2LONG: 117.347 + pic = find_short(n, short_pic_num); 117.348 + if (n->long_ref[long_arg] 
!= pic) 117.349 + remove_long(n, s, long_arg); 117.350 + remove_short(n, s, short_pic_num, 0); 117.351 + n->long_ref[long_arg]= pic; 117.352 + if (pic){ 117.353 + pic->long_ref=1; 117.354 + n->long_ref[long_arg]= pic; 117.355 + n->long_ref_count++; 117.356 + } 117.357 + break; 117.358 + case MMCO_LONG2UNUSED: 117.359 + assert(n->long_ref[long_arg]); 117.360 + remove_long(n, s, long_arg); 117.361 + break; 117.362 + case MMCO_SET_MAX_LONG: 117.363 + for(j=long_arg; j<16; j++) 117.364 + remove_long(n, s, j); 117.365 + break; 117.366 + case MMCO_RESET: 117.367 + while(n->short_ref_count) 117.368 + remove_short(n, s, n->short_ref[0]->frame_num, 1); 117.369 + 117.370 + for(j=0; j < 16; j++) 117.371 + remove_long(n, s, j); 117.372 + 117.373 + s->current_picture_info->poc= 117.374 + s->poc = 117.375 + n->poc_lsb= 117.376 + n->poc_msb= 117.377 + n->frame_num= 117.378 + s->current_picture_info->frame_num= 0; 117.379 + break; 117.380 + case MMCO_END: 117.381 + case MMCO_LONG: 117.382 + break; 117.383 + } 117.384 + } 117.385 + }else{// sliding window ref picture marking 117.386 + if(n->short_ref_count == n->sps.ref_frame_count) { 117.387 + s->release_ref_cpn[s->release_cnt++] = n->short_ref[n->short_ref_count - 1]->cpn; 117.388 + n->short_ref[n->short_ref_count - 1]->reference &= ~2; 117.389 + n->short_ref[ n->short_ref_count - 1 ] =NULL; 117.390 + n->short_ref_count--; 117.391 + } 117.392 + } 117.393 + } 117.394 + 117.395 + if(n->short_ref_count) 117.396 + memmove(&n->short_ref[1], &n->short_ref[0], n->short_ref_count*sizeof(PictureInfo *)); 117.397 + 117.398 + n->short_ref[0]= s->current_picture_info; 117.399 + n->short_ref_count++; 117.400 + 117.401 + return 0; 117.402 +} 117.403 + 117.404 +static int get_scale_factor(H264Slice *s, int poc, int poc1, int i){ 117.405 + int poc0 = s->ref_list[0][i]->poc; 117.406 + int td = av_clip(poc1 - poc0, -128, 127); 117.407 + if(td == 0 || s->ref_list[0][i]->long_ref){ 117.408 + return 256; 117.409 + }else{ 117.410 + int tb = 
av_clip(poc - poc0, -128, 127); 117.411 + int tx = (16384 + (FFABS(td) >> 1)) / td; 117.412 + return av_clip((tb*tx + 32) >> 6, -1024, 1023); 117.413 + } 117.414 +} 117.415 + 117.416 +void ff_h264_direct_dist_scale_factor(H264Slice *s){ 117.417 + const int poc = s->current_picture_info->poc; 117.418 + const int poc1 = s->ref_list[1][0]->poc; 117.419 + 117.420 + for(int i=0; i<s->ref_count[0]; i++){ 117.421 + s->dist_scale_factor[i] = get_scale_factor(s, poc, poc1, i); 117.422 + } 117.423 +} 117.424 + 117.425 +static void fill_colmap(H264Slice *s, int map[2][16], int list){ 117.426 + PictureInfo * const ref1 = s->ref_list[1][0]; 117.427 + int old_ref, rfield; 117.428 + 117.429 + /* bogus; fills in for missing frames */ 117.430 + memset(map[list], 0, sizeof(map[list])); 117.431 + 117.432 + for(rfield=0; rfield<2; rfield++){ 117.433 + for(old_ref=0; old_ref < ref1->ref_count[list]; old_ref++){ 117.434 + int poc = ref1->ref_poc[list][old_ref]; 117.435 + 117.436 + for(int j=0; j<s->ref_count[0]; j++){ 117.437 + if(s->ref_list[0][j]->poc == poc){ 117.438 + map[list][old_ref] = j; 117.439 + break; 117.440 + } 117.441 + } 117.442 + } 117.443 + } 117.444 +} 117.445 + 117.446 +void ff_h264_direct_ref_list_init(H264Slice *s){ 117.447 + PictureInfo * const cur = s->current_picture_info; 117.448 + int list; 117.449 + 117.450 + for(list=0; list<2; list++){ 117.451 + cur->ref_count[list] = s->ref_count[list]; 117.452 + for(int j=0; j<s->ref_count[list]; j++){ 117.453 + cur->ref_poc[list][j] = s->ref_list[list][j] ? s->ref_list[list][j]->poc : 0; 117.454 + } 117.455 + } 117.456 + 117.457 + if(s->slice_type_nos != FF_B_TYPE || s->direct_spatial_mv_pred) 117.458 + return; 117.459 + 117.460 + for(list=0; list<2; list++){ 117.461 + fill_colmap(s, s->map_col_to_list0, list); 117.462 + } 117.463 +} 117.464 +
118.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 118.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_refs.h Mon Aug 27 12:09:56 2012 +0200 118.3 @@ -0,0 +1,14 @@ 118.4 +#ifndef H264_REFS_H 118.5 +#define H264_REFS_H 118.6 + 118.7 +#include "avcodec.h" 118.8 +#include "h264_types.h" 118.9 + 118.10 +int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s); 118.11 +int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb); 118.12 +void ff_h264_remove_all_refs(NalContext *n, H264Slice *s); 118.13 +int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb); 118.14 +void ff_h264_direct_ref_list_init(H264Slice *s); 118.15 +void ff_h264_direct_dist_scale_factor(H264Slice *s); 118.16 + 118.17 +#endif
119.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 119.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_sei.c Mon Aug 27 12:09:56 2012 +0200 119.3 @@ -0,0 +1,191 @@ 119.4 +/* 119.5 + * H.26L/H.264/AVC/JVT/14496-10/... sei decoding 119.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 119.7 + * 119.8 + * This file is part of FFmpeg. 119.9 + * 119.10 + * FFmpeg is free software; you can redistribute it and/or 119.11 + * modify it under the terms of the GNU Lesser General Public 119.12 + * License as published by the Free Software Foundation; either 119.13 + * version 2.1 of the License, or (at your option) any later version. 119.14 + * 119.15 + * FFmpeg is distributed in the hope that it will be useful, 119.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 119.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 119.18 + * Lesser General Public License for more details. 119.19 + * 119.20 + * You should have received a copy of the GNU Lesser General Public 119.21 + * License along with FFmpeg; if not, write to the Free Software 119.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 119.23 + */ 119.24 + 119.25 +/** 119.26 + * @file 119.27 + * H.264 / AVC / MPEG4 part10 sei decoding. 
119.28 + * @author Michael Niedermayer <michaelni@gmx.at> 119.29 + */ 119.30 + 119.31 +#include "avcodec.h" 119.32 +#include "h264_types.h" 119.33 +#include "golomb.h" 119.34 + 119.35 +//#undef NDEBUG 119.36 +#include <assert.h> 119.37 + 119.38 +static const uint8_t sei_num_clock_ts_table[9]={ 119.39 + 1, 1, 1, 2, 2, 3, 3, 2, 3 119.40 +}; 119.41 + 119.42 +void ff_h264_reset_sei(NalContext *n) { 119.43 + n->sei_recovery_frame_cnt = -1; 119.44 + n->sei_dpb_output_delay = 0; 119.45 + n->sei_cpb_removal_delay = -1; 119.46 + n->sei_buffering_period_present = 0; 119.47 +} 119.48 + 119.49 +static int decode_picture_timing(NalContext *n, GetBitContext *gb){ 119.50 + if(n->sps.nal_hrd_parameters_present_flag || n->sps.vcl_hrd_parameters_present_flag){ 119.51 + n->sei_cpb_removal_delay = get_bits(gb, n->sps.cpb_removal_delay_length); 119.52 + n->sei_dpb_output_delay = get_bits(gb, n->sps.dpb_output_delay_length); 119.53 + } 119.54 + if(n->sps.pic_struct_present_flag){ 119.55 + unsigned int i, num_clock_ts; 119.56 + n->sei_pic_struct = get_bits(gb, 4); 119.57 + n->sei_ct_type = 0; 119.58 + 119.59 + if (n->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING) 119.60 + return -1; 119.61 + 119.62 + num_clock_ts = sei_num_clock_ts_table[n->sei_pic_struct]; 119.63 + 119.64 + for (i = 0 ; i < num_clock_ts ; i++){ 119.65 + if(get_bits(gb, 1)){ /* clock_timestamp_flag */ 119.66 + unsigned int full_timestamp_flag; 119.67 + n->sei_ct_type |= 1<<get_bits(gb, 2); 119.68 + skip_bits(gb, 1); /* nuit_field_based_flag */ 119.69 + skip_bits(gb, 5); /* counting_type */ 119.70 + full_timestamp_flag = get_bits(gb, 1); 119.71 + skip_bits(gb, 1); /* discontinuity_flag */ 119.72 + skip_bits(gb, 1); /* cnt_dropped_flag */ 119.73 + skip_bits(gb, 8); /* n_frames */ 119.74 + if(full_timestamp_flag){ 119.75 + skip_bits(gb, 6); /* seconds_value 0..59 */ 119.76 + skip_bits(gb, 6); /* minutes_value 0..59 */ 119.77 + skip_bits(gb, 5); /* hours_value 0..23 */ 119.78 + }else{ 119.79 + if(get_bits(gb, 1)){ /* 
seconds_flag */ 119.80 + skip_bits(gb, 6); /* seconds_value range 0..59 */ 119.81 + if(get_bits(gb, 1)){ /* minutes_flag */ 119.82 + skip_bits(gb, 6); /* minutes_value 0..59 */ 119.83 + if(get_bits(gb, 1)) /* hours_flag */ 119.84 + skip_bits(gb, 5); /* hours_value 0..23 */ 119.85 + } 119.86 + } 119.87 + } 119.88 + if(n->sps.time_offset_length > 0) 119.89 + skip_bits(gb, n->sps.time_offset_length); /* time_offset */ 119.90 + } 119.91 + } 119.92 + } 119.93 + return 0; 119.94 +} 119.95 + 119.96 +static int decode_unregistered_user_data(GetBitContext *gb, int size){ 119.97 + char user_data[16+256]; 119.98 + int e, build, i; 119.99 + 119.100 + if(size<16) 119.101 + return -1; 119.102 + 119.103 + for(i=0; i<(int) sizeof(user_data)-1 && i<size; i++){ 119.104 + user_data[i]= get_bits(gb, 8); 119.105 + } 119.106 + 119.107 + user_data[i]= 0; 119.108 + e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build); 119.109 + (void) e; 119.110 + for(; i<size; i++) 119.111 + skip_bits(gb, 8); 119.112 + 119.113 + return 0; 119.114 +} 119.115 + 119.116 +static int decode_recovery_point(NalContext *n, GetBitContext *gb){ 119.117 + 119.118 + n->sei_recovery_frame_cnt = get_ue_golomb(gb); 119.119 + skip_bits(gb, 4); /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */ 119.120 + 119.121 + return 0; 119.122 +} 119.123 + 119.124 +static int decode_buffering_period(NalContext *n, GetBitContext *gb){ 119.125 + unsigned int sps_id; 119.126 + int sched_sel_idx; 119.127 + SPS *sps; 119.128 + 119.129 + sps_id = get_ue_golomb_31(gb); 119.130 + if(sps_id > 31 || !n->sps_buffers[sps_id]) { 119.131 + av_log(AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id); 119.132 + return -1; 119.133 + } 119.134 + sps = n->sps_buffers[sps_id]; 119.135 + 119.136 + // NOTE: This is really so duplicated in the standard... 
See H.264, D.1.1 119.137 + if (sps->nal_hrd_parameters_present_flag) { 119.138 + for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) { 119.139 + n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length); 119.140 + skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset 119.141 + } 119.142 + } 119.143 + if (sps->vcl_hrd_parameters_present_flag) { 119.144 + for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) { 119.145 + n->initial_cpb_removal_delay[sched_sel_idx] = get_bits(gb, sps->initial_cpb_removal_delay_length); 119.146 + skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset 119.147 + } 119.148 + } 119.149 + 119.150 + n->sei_buffering_period_present = 1; 119.151 + return 0; 119.152 +} 119.153 + 119.154 +int ff_h264_decode_sei(NalContext *n, GetBitContext *gb){ 119.155 + while(get_bits_count(gb) + 16 < gb->size_in_bits){ 119.156 + int size, type; 119.157 + 119.158 + type=0; 119.159 + do{ 119.160 + type+= show_bits(gb, 8); 119.161 + }while(get_bits(gb, 8) == 255); 119.162 + 119.163 + size=0; 119.164 + do{ 119.165 + size+= show_bits(gb, 8); 119.166 + }while(get_bits(gb, 8) == 255); 119.167 + 119.168 + switch(type){ 119.169 + case SEI_TYPE_PIC_TIMING: // Picture timing SEI 119.170 + if(decode_picture_timing(n, gb) < 0) 119.171 + return -1; 119.172 + break; 119.173 + case SEI_TYPE_USER_DATA_UNREGISTERED: 119.174 + if(decode_unregistered_user_data(gb, size) < 0) 119.175 + return -1; 119.176 + break; 119.177 + case SEI_TYPE_RECOVERY_POINT: 119.178 + if(decode_recovery_point(n, gb) < 0) 119.179 + return -1; 119.180 + break; 119.181 + case SEI_BUFFERING_PERIOD: 119.182 + if(decode_buffering_period(n, gb) < 0) 119.183 + return -1; 119.184 + break; 119.185 + default: 119.186 + skip_bits(gb, 8*size); 119.187 + } 119.188 + 119.189 + //FIXME check bits here 119.190 + align_get_bits(gb); 119.191 + } 119.192 + 119.193 + return 0; 
119.194 +}
120.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 120.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_sei.h Mon Aug 27 12:09:56 2012 +0200 120.3 @@ -0,0 +1,7 @@ 120.4 +#ifndef H264_SEI_H 120.5 +#define H264_SEI_H 120.6 + 120.7 +int ff_h264_decode_sei(NalContext *n, GetBitContext *gb); 120.8 +void ff_h264_reset_sei(NalContext *n); 120.9 + 120.10 +#endif
121.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 121.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_seq.c Mon Aug 27 12:09:56 2012 +0200 121.3 @@ -0,0 +1,220 @@ 121.4 +/* 121.5 +* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder 121.6 +* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 121.7 +* 121.8 +* This file is part of FFmpeg. 121.9 +* 121.10 +* FFmpeg is free software; you can redistribute it and/or 121.11 +* modify it under the terms of the GNU Lesser General Public 121.12 +* License as published by the Free Software Foundation; either 121.13 +* version 2.1 of the License, or (at your option) any later version. 121.14 +* 121.15 +* FFmpeg is distributed in the hope that it will be useful, 121.16 +* but WITHOUT ANY WARRANTY; without even the implied warranty of 121.17 +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 121.18 +* Lesser General Public License for more details. 121.19 +* 121.20 +* You should have received a copy of the GNU Lesser General Public 121.21 +* License along with FFmpeg; if not, write to the Free Software 121.22 +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 121.23 +*/ 121.24 +#include "h264_types.h" 121.25 +#include "h264_parser.h" 121.26 +#include "h264_nal.h" 121.27 +#include "h264_entropy.h" 121.28 +#include "h264_rec.h" 121.29 +#include "h264_pred_mode.h" 121.30 +#include "h264_misc.h" 121.31 +// #undef NDEBUG 121.32 +#include <assert.h> 121.33 + 121.34 +static int decode_slice_entropy_seq(H264Context *h, EntropyContext *ec, H264Slice *s, GetBitContext *gb, H264Mb *mbs){ 121.35 + int i,j; 121.36 +// GetBitContext *gb = s->gb; 121.37 + CABACContext *c = &ec->c; 121.38 + 121.39 + if( !s->pps.cabac ){ 121.40 + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); 121.41 + return -1; 121.42 + } 121.43 + 121.44 + init_dequant_tables(s, ec); 121.45 + ec->curr_qscale = s->qscale; 121.46 + ec->last_qscale_diff = 0; 121.47 + ec->chroma_qp[0] = get_chroma_qp((H264Slice *) 
s, 0, s->qscale); 121.48 + ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale); 121.49 + 121.50 + /* realign */ 121.51 + align_get_bits( gb ); 121.52 + /* init cabac */ 121.53 + ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8); 121.54 + 121.55 + ff_h264_init_cabac_states(ec, s, c); 121.56 + 121.57 + for(j=0; j<ec->mb_height; j++){ 121.58 + init_entropy_buf(ec, s, j); 121.59 + for(i=0; i<ec->mb_width; i++){ 121.60 + int eos,ret; 121.61 + H264Mb *m = &mbs[i + j*ec->mb_width]; 121.62 + //memset(m, 0, sizeof(H264Mb)); 121.63 + m->mb_x=i; 121.64 + m->mb_y=j; 121.65 + ec->m = m; 121.66 + 121.67 + ret = ff_h264_decode_mb_cabac(ec, s, c); 121.68 + eos = get_cabac_terminate( c); 121.69 + (void) eos; 121.70 + if( ret < 0 || c->bytestream > c->bytestream_end + 2) { 121.71 + av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream); 121.72 + return -1; 121.73 + } 121.74 + } 121.75 + } 121.76 + 121.77 +// av_freep(&s->gb.raw); 121.78 +// if (s->gb.rbsp) 121.79 +// av_freep(&s->gb.rbsp); 121.80 + 121.81 + return 0; 121.82 +} 121.83 + 121.84 + 121.85 + 121.86 +/** 121.87 +* Sequential version 121.88 +*/ 121.89 +static void decode_slice_mb_seq(H264Context *h, MBRecContext *d, H264Slice *s2, H264Mb *mbs){ 121.90 + 121.91 + for (int i=0; i<2; i++){ 121.92 + for(int j=0; j< s2->ref_count[i]; j++){ 121.93 + if (s2->ref_list_cpn[i][j] ==-1) 121.94 + continue; 121.95 + int k; 121.96 + for (k=0; k<h->max_dpb_cnt; k++){ 121.97 + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s2->ref_list_cpn[i][j]){ 121.98 + s2->dp_ref_list[i][j] = &h->dpb[k]; 121.99 + break; 121.100 + } 121.101 + } 121.102 + } 121.103 + } 121.104 + 121.105 + get_dpb_entry(h, s2); 121.106 + 121.107 + if (!h->no_mbd){ 121.108 + for(int j=0; j<d->mb_height; j++){ 121.109 + init_mbrec_context(d, d->mrs, s2, j); 121.110 + if (h->profile) printf("\n[MBREC LINE %d ", j); 121.111 + for(int i=0; i<d->mb_width; 
i++){ 121.112 + 121.113 + if ((i & 0x7) == 0) start_timer(h, REC); 121.114 + H264Mb *m = &mbs[i + j*d->mb_width]; 121.115 + if (h->profile==2) 121.116 + pred_motion_mb_rec (d, d->mrs, s2, m); 121.117 + else{ 121.118 + h264_decode_mb_internal(d, d->mrs, s2, m); 121.119 + } 121.120 + stop_timer(h, REC); 121.121 + } 121.122 + draw_edges(d, s2, j); 121.123 + 121.124 + } 121.125 + } 121.126 + 121.127 + for (int i=0; i<s2->release_cnt; i++){ 121.128 + for(int j=0; j<h->max_dpb_cnt; j++){ 121.129 + if(h->dpb[j].cpn== s2->release_ref_cpn[i]){ 121.130 + release_dpb_entry(h, &h->dpb[j], 2); 121.131 + break; 121.132 + } 121.133 + } 121.134 + } 121.135 + s2->release_cnt=0; 121.136 +} 121.137 + 121.138 +/* 121.139 +* The following code is the main loop of the file converter 121.140 +*/ 121.141 +int h264_decode_seq( H264Context *h) { 121.142 + ParserContext *pc; 121.143 + NalContext *nc; 121.144 + EntropyContext *ec; 121.145 + MBRecContext *rc; 121.146 + OutputContext *oc; 121.147 + 121.148 + H264Slice slice, *s=&slice; 121.149 + H264Mb *mbs; 121.150 + DecodedPicture *out; 121.151 + int frames=0; 121.152 + 121.153 +#if HAVE_LIBSDL2 121.154 + pthread_t sdl_thr; 121.155 + if (h->display){ 121.156 + pthread_create(&sdl_thr, NULL, sdl_thread, h); 121.157 + } 121.158 +#endif 121.159 + 121.160 + pc = get_parse_context(h->ifile); 121.161 + nc = get_nal_context(h->width, h->height); 121.162 + 121.163 + memset(s, 0, sizeof(H264Slice)); 121.164 + mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb)); 121.165 + 121.166 + ec = get_entropy_context( h ); 121.167 + rc = get_mbrec_context(h); 121.168 + rc->top_next = rc->top = av_malloc( h->mb_width * sizeof(TopBorder)); 121.169 + 121.170 + oc = get_output_context( h ); 121.171 + 121.172 + av_start_timer(); 121.173 + GetBitContext gb = {0,}; 121.174 + while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ 121.175 + if (h->profile) start_timer(h, FRONT); 121.176 + av_read_frame_internal(pc, &gb); 121.177 + decode_nal_units(nc, 
s, &gb); 121.178 + if (h->profile) stop_timer(h, FRONT); 121.179 +// memset(s->mbs, 0, sizeof(H264Mb)*ec->mb_width*ec->mb_height); 121.180 + if (h->profile) start_timer(h, ED); 121.181 + decode_slice_entropy_seq(h, ec, s, &gb, mbs); 121.182 + if (h->profile) stop_timer(h, ED); 121.183 + 121.184 + if (h->profile) start_timer(h, REC); 121.185 + decode_slice_mb_seq(h, rc, s, mbs); 121.186 + if (h->profile) stop_timer(h, REC); 121.187 + 121.188 + out =output_frame(h, oc, s->curr_pic, h->ofile, h->frame_width, h->frame_height); 121.189 + if (out){ 121.190 + release_dpb_entry(h, out, 1); 121.191 + } 121.192 + 121.193 + print_report(oc->frame_number, oc->video_size, 0, h->verbose); 121.194 + if (h->profile == 3){ 121.195 + printf("[ENTROPY %.3fms] [MBREC %.3fms]\n", h->last_time[ED] , h->last_time[REC]); 121.196 + } 121.197 + } 121.198 + while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; 121.199 + 121.200 + print_report(oc->frame_number, oc->video_size, 1, h->verbose); 121.201 + h->num_frames = oc->frame_number; 121.202 + /* finished ! */ 121.203 + av_freep(&mbs); 121.204 + av_freep(&gb.raw); 121.205 + if (gb.rbsp) 121.206 + av_freep(&gb.rbsp); 121.207 + av_freep(&rc->top); 121.208 + 121.209 + free_parse_context(pc); 121.210 + free_nal_context (nc); 121.211 + free_entropy_context(ec); 121.212 + free_mbrec_context(rc); 121.213 + free_output_context(oc); 121.214 + 121.215 +#if HAVE_LIBSDL2 121.216 + if (h->display){ 121.217 + signal_sdl_exit(h); 121.218 + pthread_join(sdl_thr, NULL); 121.219 + } 121.220 +#endif 121.221 + 121.222 + return 0; 121.223 +}
122.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 122.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_types.h Mon Aug 27 12:09:56 2012 +0200 122.3 @@ -0,0 +1,658 @@ 122.4 +#ifndef H264_TYPES_H 122.5 +#define H264_TYPES_H 122.6 + 122.7 +#include "config.h" 122.8 +#ifdef HAVE_LIBSDL2 122.9 +#include <SDL2/SDL.h> 122.10 +#endif 122.11 + 122.12 +#include <pthread.h> 122.13 +#include "avcodec.h" 122.14 +#include "cabac.h" 122.15 +#include "h264_dsp.h" 122.16 +#include "h264_pred.h" 122.17 +#include "get_bits.h" 122.18 + 122.19 + 122.20 +#define MAX_REF_PIC_COUNT 16 122.21 +#define MAX_DELAYED_PIC_COUNT 16 122.22 + 122.23 +#define MAX_THREADS 80 122.24 + 122.25 +//#define MAX_PIC_COUNT (4*(MAX_REF_PIC_COUNT+MAX_DELAYED_PIC_COUNT)) 122.26 + 122.27 +#define DPB_SIZE 33 122.28 + 122.29 + 122.30 +//potsdam machine 8xX7560 without HT 122.31 +// static int edb_affinity [16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 122.32 +// static int edip_affinity[8] = {16, 17, 18, 19, 20, 21, 22, 23}; 122.33 +// 122.34 +// static int mbd_affinity[8][5] = { {24, 32, 40, 48, 56}, 122.35 +// {25, 33, 41, 49, 57}, 122.36 +// {26, 34, 42, 50, 58}, 122.37 +// {27, 35, 43, 51, 59}, 122.38 +// {28, 36, 44, 52, 60}, 122.39 +// {29, 37, 45, 53, 61}, 122.40 +// {30, 38, 46, 54, 62}, 122.41 +// {31, 39, 47, 55, 63}, }; 122.42 + 122.43 +// static int edb_affinity [22] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 58, 59, 60, 61 ,62, 63}; 122.44 +// static int edip_affinity[10] = {16, 17, 18, 19, 20, 21, 22, 23, 56, 57 }; 122.45 +// 122.46 +// static int mbd_affinity[8][5] = { {24, 32, 40, 48, 56}, 122.47 +// {25, 33, 41, 49, 57}, 122.48 +// {26, 34, 42, 50, 58}, 122.49 +// {27, 35, 43, 51, 59}, 122.50 +// {28, 36, 44, 52, 60}, 122.51 +// {29, 37, 45, 53, 61}, 122.52 +// {30, 38, 46, 54, 62}, 122.53 +// {31, 39, 47, 55, 63}, }; 122.54 +// //4 socket 122.55 +// static int edip_affinity[5] = {0, 1, 2, 3, 56}; 122.56 +// static int edb_affinity [12] = {8, 9, 10, 11, 16, 17, 18, 19, 
59, 58, 57, 51}; 122.57 +// 122.58 +// static int mbd_affinity[4][5] = { {24, 32, 40, 48, 56}, 122.59 +// {25, 33, 41, 49, 57}, 122.60 +// {26, 34, 42, 50, 58}, 122.61 +// {27, 35, 43, 51, 59}, }; 122.62 + 122.63 +// static int edip_affinity[3] = {0, 1, 49}; 122.64 +// static int edb_affinity [6] = {8, 9, 16, 17, 56, 57}; 122.65 +// 122.66 +// static int mbd_affinity[2][5] = { {24, 32, 40, 48, 56}, 122.67 +// {25, 33, 41, 49, 57}}; 122.68 + 122.69 +// static int edip_affinity[2] = {0, 8}; 122.70 +// static int edb_affinity [3] = {16, 24, 56}; 122.71 +// 122.72 +// static int mbd_affinity[1][4] = { {32, 40, 48, 56}, 122.73 +// }; 122.74 + 122.75 +/// for ducks_take_off_2160p 122.76 +// static int edip_affinity[2] = {0, 8}; 122.77 +// static int edb_affinity [3] = {16, 24, 32}; 122.78 +// 122.79 +// static int mbd_affinity[1][4] = {{ 40, 48, 56, 32}}; 122.80 + 122.81 +// static int edip_affinity[3] = {0, 1, 57}; 122.82 +// static int edb_affinity [7] = {8, 9, 16, 17, 24, 25, 56}; 122.83 +// 122.84 +// static int mbd_affinity[2][4] = { {32, 40, 48, 56}, 122.85 +// {33, 41, 49, 57}}; 122.86 + 122.87 +//4 socket 122.88 +// static int edip_affinity[6] = {0, 1, 2, 3, 59}; 122.89 +// static int edb_affinity [14] = {8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 58, 57}; 122.90 +// 122.91 +// static int mbd_affinity[4][4] = { {32, 40, 48, 56}, 122.92 +// {33, 41, 49, 57}, 122.93 +// {34, 42, 50, 58}, 122.94 +// {35, 43, 51, 59}, }; 122.95 + 122.96 + 122.97 +// static int edb_affinity [29] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 59, 60, 61, 62, 63}; 122.98 +// static int edip_affinity[11] = {24, 25, 26, 27, 28, 29, 30, 31, 63, 62, 61}; 122.99 +// 122.100 +// static int mbd_affinity[8][4] = {{32, 40, 48, 56}, 122.101 +// {33, 41, 49, 57}, 122.102 +// {34, 42, 50, 58}, 122.103 +// {35, 43, 51, 59}, 122.104 +// {36, 44, 52, 60}, 122.105 +// {37, 45, 53, 61}, 122.106 +// {38, 46, 54, 62}, 122.107 +// {39, 47, 55, 63}, }; 122.108 + 
122.109 +//potsdam machine 4xX7550 with HT 122.110 +// int edip_affinity[16] = {0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; 122.111 +// int edb_affinity [16] = {1, 9, 17, 25, 2, 10, 18, 26, 6, 14, 22, 30, 7, 15, 23, 31 }; 122.112 +// int edip_affinity[16] = {58, 50, 42, 34, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; 122.113 +// int edb_affinity [16] = {57, 49, 41, 33, 56, 48, 40, 32, 6, 14, 22, 30, 7, 15, 23, 31 }; 122.114 +// //int edb_affinity [16] = {4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 }; 122.115 +// //mb threads affinity on logical cores moving back to keep inteference with ed threads low 122.116 +// int mbd_affinity[4][8] = { {63, 62, 61, 60, 59, 58, 57, 56}, 122.117 +// {55, 54, 53, 52, 51, 50, 49, 48}, 122.118 +// {47, 46, 45, 44, 43, 42, 41, 40}, 122.119 +// {39, 38, 37, 36, 35, 34, 33, 32}, 122.120 +// }; 122.121 + 122.122 + 122.123 +// static int edip_affinity[2] = {0, 2}; 122.124 +// static int edb_affinity [4] = {1, 3, 2, 5}; 122.125 +// 122.126 +// static int mbd_affinity[1][4] = {{ 4, 6, 7, 5}}; 122.127 + 122.128 +enum{ 122.129 + PARSE=0, 122.130 + ENTROPY, 122.131 + REORDER, 122.132 + REORDER2, //second mutex-cond pair used in reorder_thread 122.133 + MBDEC, 122.134 + OUTPUT, 122.135 + STAGES 122.136 +}; 122.137 + 122.138 +//adhoc for profiling 122.139 +enum{ 122.140 + TOTAL=0, 122.141 + FRONT, 122.142 + ED, 122.143 + REC, 122.144 + PROFILE_STAGES 122.145 +}; 122.146 + 122.147 +/* bit input */ 122.148 +/* buffer, buffer_end and size_in_bits must be present and used by every reader */ 122.149 + 122.150 +/* frame parsing */ 122.151 +typedef struct ParserContext { 122.152 + //int64_t offset; ///< byte offset from starting packet start 122.153 + int ifile; 122.154 + int ofile; 122.155 + int buffer_size; 122.156 + int eof_reached; 122.157 + 122.158 + uint8_t *data; 122.159 + int size; 122.160 + uint8_t *cur_ptr; 122.161 + int cur_len; 122.162 + 122.163 + int64_t frame_offset; /* offset of the current frame */ 
122.164 + int64_t cur_offset; /* current offset (incremented by each av_parser_parse()) */ 122.165 + int64_t next_frame_offset; /* offset of the next frame */ 122.166 + int pict_type; 122.167 + int repeat_pict; //frame_duration = (1 + repeat_pict) * time_base. It is used by codecs like H.264 to display telecined material. 122.168 + int key_frame; //Set by parser to 1 for key frames and 0 for non-key frames. 122.169 + int64_t pos; // Byte position of currently parsed frame in stream. 122.170 + int64_t last_pos; //Previous frame byte position. 122.171 + int final_frame; 122.172 + 122.173 + uint8_t overread[5]; 122.174 + int overread_cnt; ///< the number of bytes which where irreversibly read from the next frame 122.175 + int index; 122.176 + int last_index; 122.177 + int frame_start_found; 122.178 + uint32_t state; ///< contains the last few bytes in MSB order 122.179 +} ParserContext; 122.180 + 122.181 +typedef struct NalContext { 122.182 + 122.183 + SPS *sps_buffers[MAX_SPS_COUNT]; 122.184 + PPS *pps_buffers[MAX_PPS_COUNT]; 122.185 + SPS sps; ///< current sps 122.186 + 122.187 + PictureInfo picture[16 + 1]; ///< Ref pic buffer used for deriving lists. Later linked with pic in dpb. 
122.188 + PictureInfo *release_ref[MAX_MMCO_COUNT]; 122.189 + PictureInfo *short_ref[32]; 122.190 + PictureInfo *long_ref[32]; 122.191 + int long_ref_count; ///< number of actual long term references 122.192 + int short_ref_count; ///< number of actual short term references 122.193 + 122.194 + //POC stuff 122.195 + uint32_t coded_pic_num; 122.196 + int poc_lsb; 122.197 + int poc_msb; 122.198 + uint32_t poc_offset; 122.199 + int delta_poc; 122.200 + int frame_num; 122.201 + int prev_poc_msb; ///< poc_msb of the last reference pic for POC type 0 122.202 + int prev_poc_lsb; ///< poc_lsb of the last reference pic for POC type 0 122.203 + int frame_num_offset; ///< for POC type 2 122.204 + int prev_frame_num_offset; ///< for POC type 2 122.205 + int prev_frame_num; ///< frame_num of the last pic for POC type 1/2 122.206 + 122.207 + int max_pic_num; 122.208 + int redundant_pic_count; 122.209 + int outputed_poc; 122.210 + int ip_id; 122.211 +// int b8_stride; ///< 2*mb_width+1 used for some 8x8 block arrays to allow simple addressing 122.212 + int b4_stride; ///< 4*mb_width+1 used for some 4x4 block arrays to allow simple addressing 122.213 + int mb_stride; ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11 122.214 + int mb_width; 122.215 + int mb_height; 122.216 + int width; 122.217 + int height; 122.218 + 122.219 + int has_b_frames; 122.220 + //pic_struct in picture timing SEI message 122.221 + SEI_PicStructType sei_pic_struct; 122.222 + // Bit set of clock types for fields/frames in picture timing SEI message. For each found ct_type, appropriate bit is set (e.g., bit 1 for interlaced). 
122.223 + int sei_ct_type; 122.224 + // dpb_output_delay in picture timing SEI message, see H.264 C.2.2 122.225 + int sei_dpb_output_delay; 122.226 + //cpb_removal_delay in picture timing SEI message, see H.264 C.1.2 122.227 + int sei_cpb_removal_delay; 122.228 + //recovery_frame_cnt from SEI message 122.229 + int sei_recovery_frame_cnt; 122.230 + // Timestamp stuff 122.231 + int sei_buffering_period_present; ///< Buffering period SEI flag 122.232 + int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs 122.233 + 122.234 +} NalContext; 122.235 + 122.236 +typedef struct EntropyContext{ 122.237 + CABACContext c; 122.238 + 122.239 + H264Mb *m; 122.240 + int top_cbp; 122.241 + int left_cbp; 122.242 + int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct 122.243 + 122.244 + uint32_t top_type; 122.245 + uint32_t left_type; 122.246 + uint32_t topright_type; 122.247 + uint32_t topleft_type; 122.248 + 122.249 + int curr_qscale; 122.250 + int chroma_qp[2]; //QPc 122.251 + int last_qscale_diff; 122.252 + 122.253 + uint32_t dequant4_buffer[6][52][16]; 122.254 + uint32_t dequant8_buffer[2][52][64]; 122.255 + uint32_t (*dequant4_coeff[6])[16]; 122.256 + uint32_t (*dequant8_coeff[2])[64]; 122.257 + 122.258 +// uint8_t (*non_zero_count_top)[32]; 122.259 +// uint8_t (*non_zero_count)[32]; 122.260 +// uint8_t (*non_zero_count_row[2])[32]; 122.261 + 122.262 + uint8_t (*non_zero_count_top)[8]; 122.263 + uint8_t (*non_zero_count)[8]; 122.264 + uint8_t (*non_zero_count_row[2])[8]; 122.265 + DECLARE_ALIGNED(8, uint8_t, non_zero_count_left[8]); 122.266 + 122.267 + uint8_t (*mvd_top[2])[2]; 122.268 + uint8_t (*mvd[2])[2]; 122.269 + uint8_t (*mvd_table[2][2])[2]; 122.270 + 122.271 + uint8_t *direct_top; 122.272 + uint8_t *direct; 122.273 + uint8_t *direct_table[2]; 122.274 + 122.275 + uint8_t *chroma_pred_mode_top; 122.276 + uint8_t *chroma_pred_mode; 122.277 + uint8_t *chroma_pred_mode_table[2]; 122.278 + 122.279 + uint16_t *cbp_top; 122.280 + 
uint16_t *cbp; 122.281 + uint16_t *cbp_table[2]; 122.282 + 122.283 + int8_t *qscale_top; 122.284 + int8_t *qscale; 122.285 + int8_t *qscale_table[2]; 122.286 + 122.287 + int8_t *ref_index_top[2]; 122.288 + int8_t *ref_index[2]; 122.289 + int8_t *ref_index_table[2][2]; 122.290 + 122.291 + uint32_t *mb_type_top; 122.292 + uint32_t *mb_type; 122.293 + uint32_t *mb_type_table[2]; 122.294 + 122.295 + int b_stride; 122.296 + int mb_stride; 122.297 + int mb_width; 122.298 + int mb_height; 122.299 + 122.300 + uint8_t *zigzag_scan; 122.301 + uint8_t *zigzag_scan8x8; 122.302 + uint8_t direct_cache[5*8]; 122.303 + 122.304 + DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]); 122.305 + DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; 122.306 + DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; 122.307 + DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; 122.308 + DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; 122.309 + 122.310 +} EntropyContext; 122.311 + 122.312 +typedef struct H264Slice { 122.313 + PPS pps; ///< current pps 122.314 + PictureInfo* current_picture_info; 122.315 + DecodedPicture* curr_pic; 122.316 + int slice_num; 122.317 + 122.318 + int release_ref_cpn[MAX_MMCO_COUNT]; 122.319 + int release_cnt; 122.320 + 122.321 + int qp_thresh; ///< QP threshold to skip loopfilter 122.322 + int use_weight; 122.323 + int use_weight_chroma; 122.324 + int luma_log2_weight_denom; 122.325 + int chroma_log2_weight_denom; 122.326 + 122.327 + int16_t luma_weight[16][2][2]; 122.328 + int16_t chroma_weight[16][2][2][2]; 122.329 + int16_t implicit_weight[16][16][2]; 122.330 + 122.331 + //poc number of ref_list int ref_poc[2][16] 122.332 + //In edslice this must becom Picture Info 122.333 + int ref_list_cpn[2][16]; 122.334 + PictureInfo *ref_list[2][16]; ///Reordered version of default_ref_list according to picture reordering in slice header 122.335 + DecodedPicture *dp_ref_list[2][16]; 122.336 + int ref_count[2]; ///< counts frames or fields, depending on current mb mode 
122.337 + 122.338 + int slice_type; 122.339 + int slice_type_nos; 122.340 + int slice_alpha_c0_offset; 122.341 + int slice_beta_offset; 122.342 + int direct_8x8_inference_flag; 122.343 + 122.344 + uint8_t list_count; 122.345 + uint32_t coded_pic_num; 122.346 + 122.347 + int poc; 122.348 + int key_frame; 122.349 + int mmco_reset; //FIXME not used? 122.350 + 122.351 + ///stuff only needed for nal/entropy decoding 122.352 +// H264Mb *m; 122.353 +// GetBitContext *gb; 122.354 + int ip_id; 122.355 + int transform_bypass; 122.356 + int direct_spatial_mv_pred; 122.357 + int map_col_to_list0[2][16]; 122.358 + int dist_scale_factor[16]; 122.359 + 122.360 + int cabac_init_idc; 122.361 + int nal_ref_idc; 122.362 + int nal_unit_type; 122.363 + 122.364 + int ref2frm[2][64]; ///< reference to frame number lists, the first 2 are for -2,-1 122.365 + 122.366 + int qscale; 122.367 + 122.368 +} H264Slice; 122.369 + 122.370 +typedef struct { 122.371 + H264Slice slice; 122.372 + H264Mb *mbs; 122.373 + DecodedPicture *dp; 122.374 + GetBitContext gb; 122.375 + 122.376 + int lines_taken; 122.377 + int lines_total; 122.378 + int state; // 0 free, 1 in use //1 wait for entropy, 2 wait for reconstruct. 
122.379 + int initialized; 122.380 +} SliceBufferEntry; 122.381 + 122.382 +typedef struct RingLineEntry{ 122.383 + union{ 122.384 + DECLARE_ALIGNED(64, volatile int32_t, mb_cnt); 122.385 + DECLARE_ALIGNED(64, int32_t, pad[16]); 122.386 + }; 122.387 + SliceBufferEntry *sbe; 122.388 + int id; 122.389 + int line; 122.390 + TopBorder *top; 122.391 + struct RingLineEntry *prev_line; 122.392 + 122.393 +} RingLineEntry; 122.394 + 122.395 +// #if OMPSS 122.396 +typedef struct SuperMBTask{ 122.397 + int smb_x; 122.398 + int smb_y; 122.399 +} SuperMBTask; 122.400 + 122.401 +typedef struct SuperMBContext{ 122.402 + int nsmb_width; //number of super macroblocks in picture width 122.403 + int nsmb_height; //number of super macroblocks in picture height 122.404 + int nsmb_3dheight; //number of super macroblocks in picture height - max motion vertical vector 122.405 + int smb_width; //width of a super macroblock 122.406 + int smb_height; //height of a super macroblock 122.407 + int refcount; 122.408 + int index; 122.409 + SuperMBTask *smbs[2]; 122.410 +} SuperMBContext; 122.411 +// #endif 122.412 + 122.413 +//scratchpad for decoding a macroblock 122.414 +typedef struct MBRecState{ 122.415 + int8_t *ref_index_top[2]; 122.416 + int8_t *ref_index[2]; 122.417 + int16_t (*motion_val_top[2])[2]; 122.418 + int16_t (*motion_val[2])[2]; 122.419 + uint32_t *mb_type_top; 122.420 + uint32_t *mb_type; 122.421 + 122.422 + int8_t *list1_ref_index[2]; 122.423 + int16_t (*list1_motion_val[2])[2]; 122.424 + uint32_t *list1_mb_type; 122.425 + 122.426 + int8_t *intra4x4_pred_mode_top; 122.427 + int8_t *intra4x4_pred_mode; 122.428 +#if !OMPSS 122.429 + int8_t intra4x4_pred_mode_left[4]; 122.430 +#endif 122.431 + int8_t *non_zero_count_top; 122.432 + int8_t *non_zero_count; 122.433 +// int8_t non_zero_count_left[8]; 122.434 + 122.435 + 122.436 + unsigned int topleft_samples_available; 122.437 + unsigned int topright_samples_available; 122.438 + unsigned int top_samples_available; 122.439 + unsigned 
int left_samples_available; 122.440 + 122.441 + int top_type; 122.442 + int left_type; 122.443 + 122.444 + DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]); 122.445 + DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2]; 122.446 + DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8]; 122.447 + DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8]; 122.448 + DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2]; 122.449 + 122.450 + DECLARE_ALIGNED(8, int16_t, bS)[2][4][4]; 122.451 + uint8_t edges[2]; 122.452 + 122.453 +}MBRecState ; 122.454 + 122.455 +typedef struct MBRecContext{ 122.456 + DSPContext dsp; ///< pointers for accelerated dsp functions 122.457 + H264DSPContext hdsp; 122.458 + H264PredContext hpc; 122.459 + 122.460 + MBRecState *mrs; 122.461 + RingLineEntry *rle; //debug 122.462 + 122.463 + uint8_t *scratchpad_y; ///implemented different on Cell 122.464 + uint8_t *scratchpad_cb; ///implemented different on Cell 122.465 + uint8_t *scratchpad_cr; ///implemented different on Cell 122.466 + 122.467 + int linesize; 122.468 + int uvlinesize; 122.469 + int mb_width; 122.470 + int mb_height; 122.471 + int mb_stride; 122.472 + int b_stride; 122.473 + int width; 122.474 + int height; 122.475 + 122.476 +#if !OMPSS // not used in OMPSS 122.477 + LeftBorder left; 122.478 + TopBorder *top; 122.479 + TopBorder *top_next; // next line top border 122.480 +#endif 122.481 + /* 122.482 + .UU.YYYY 122.483 + .UU.YYYY 122.484 + .vv.YYYY 122.485 + .VV.YYYY 122.486 + */ 122.487 + 122.488 + // block_offset[ 0..23] for frame macroblocks 122.489 + int block_offset[16+8]; 122.490 + 122.491 +} MBRecContext; 122.492 + 122.493 +#ifdef HAVE_LIBSDL2 122.494 +typedef struct SDLContext{ 122.495 + int display; 122.496 + int fullscreen; 122.497 + pthread_t listen_thread; 122.498 + 122.499 + SDL_DisplayMode full; 122.500 + SDL_DisplayMode wind; 122.501 + 122.502 + 122.503 + SDL_Renderer *renderer; 122.504 + SDL_Rect rect; 122.505 + SDL_Rect win_rect; 122.506 + SDL_Window *window; 122.507 + 
double aspect; 122.508 + int win_w; 122.509 + int win_h; 122.510 + int resized; 122.511 + 122.512 + SDL_Texture *sbmap_texture; 122.513 + int showmap; 122.514 + int updatemap; 122.515 + int pause; 122.516 + 122.517 +} SDLContext; 122.518 +#endif 122.519 + 122.520 +typedef struct OutputContext { 122.521 + int bit_buffer_size; 122.522 + uint8_t *bit_buffer; 122.523 + uint64_t video_size; 122.524 + int frame_number; 122.525 + DecodedPicture *delayed_pic[DPB_SIZE]; 122.526 + int dp_cnt; 122.527 + 122.528 +} OutputContext; 122.529 + 122.530 +typedef struct { 122.531 + pthread_mutex_t lock; 122.532 + pthread_cond_t cond; 122.533 + SliceBufferEntry **queue; 122.534 + int size; 122.535 + int cnt; 122.536 + int fi; 122.537 + int fo; 122.538 +} SliceBufferQueue; 122.539 + 122.540 +typedef struct { 122.541 + pthread_mutex_t wslock; 122.542 + pthread_cond_t wscond; 122.543 + pthread_mutex_t swlock; 122.544 + pthread_cond_t swcond; 122.545 + RingLineEntry **queue; 122.546 + int size; 122.547 + int ready; 122.548 + int free; 122.549 + int fi; 122.550 + int fo; 122.551 +} RingLineQueue; 122.552 + 122.553 +#if HAVE_LIBSDL2 122.554 +typedef struct { 122.555 + pthread_mutex_t sdl_lock; 122.556 + pthread_cond_t sdl_cond; 122.557 + SDL_Texture **queue; 122.558 + int size; 122.559 + int ready; 122.560 + int fi; 122.561 + int fo; 122.562 + int exit; 122.563 +} SDLTextureQueue; 122.564 +#endif 122.565 +/** 122.566 +* H264Context 122.567 +*/ 122.568 +typedef struct H264Context{ 122.569 + SliceBufferQueue sb_q[STAGES]; 122.570 + RingLineQueue rl_q; 122.571 + 122.572 + pthread_mutex_t lock[STAGES]; 122.573 + pthread_cond_t cond[STAGES]; 122.574 + 122.575 + pthread_mutex_t task_lock; 122.576 + pthread_cond_t task_cond; 122.577 + 122.578 + pthread_attr_t ed_rec_attr[MAX_THREADS]; 122.579 + pthread_t ed_rec_thr[MAX_THREADS]; 122.580 + 122.581 + int init_threads; 122.582 + pthread_mutex_t ilock; 122.583 + pthread_cond_t icond; 122.584 + 122.585 + const char *file_name; 122.586 + int profile; 
122.587 + int start; 122.588 + int touch_start; 122.589 + int setaff; 122.590 + int touch_done; 122.591 + int rl_side_touch; 122.592 + int statmbd; 122.593 + pthread_mutex_t slock; 122.594 + pthread_cond_t scond; 122.595 + pthread_mutex_t tlock; 122.596 + pthread_cond_t tcond; 122.597 + pthread_mutex_t tdlock; 122.598 + pthread_cond_t tdcond; 122.599 + 122.600 + int ed_ppe_threads; 122.601 + int threads; 122.602 + int smt; 122.603 + 122.604 + int acdpb_cnt; //debug 122.605 + int reldpb_cnt; 122.606 + 122.607 + int sb_size; 122.608 + SliceBufferEntry *sb; ///< Slice Syntax Buffer 122.609 + int free_sb_cnt; 122.610 + int slice_bufs; 122.611 + 122.612 + int max_dpb_cnt; 122.613 + DecodedPicture *dpb; ///< Decoded Picture Buffer 122.614 + int free_dpb_cnt; 122.615 + 122.616 + int ifile; 122.617 + int ofile; 122.618 + int frame_width; 122.619 + int frame_height; 122.620 + int num_frames; 122.621 + int width; 122.622 + int height; 122.623 + int mb_width; 122.624 + int mb_height; 122.625 + int mb_stride; ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11 122.626 + int b4_stride; 122.627 + int b_stride; 122.628 + 122.629 + int smb_height; 122.630 + int smb_width; 122.631 + pthread_mutex_t smb_lock; 122.632 + pthread_cond_t sdl_cond; 122.633 + pthread_mutex_t sdl_lock; 122.634 + SuperMBContext *smbc; 122.635 + 122.636 + int wave_order; 122.637 + int static_3d; 122.638 + int pipe_bufs; 122.639 + 122.640 + //shared tables used in entropy decoding 122.641 + uint8_t zigzag_scan[16]; 122.642 + uint8_t zigzag_scan8x8[64]; 122.643 + 122.644 + int verbose; 122.645 + int no_mbd; 122.646 + int display; 122.647 + int fullscreen; 122.648 + int quit; 122.649 +#ifdef HAVE_LIBSDL2 122.650 + SDLTextureQueue sdlq; 122.651 + SDLContext *sdlc; 122.652 +#endif 122.653 + 122.654 + struct timespec start_time[PROFILE_STAGES]; 122.655 + struct timespec end_time[PROFILE_STAGES]; 122.656 + double last_time[PROFILE_STAGES]; 122.657 + double 
total_time[PROFILE_STAGES]; 122.658 + 122.659 +}H264Context; 122.660 + 122.661 +#endif
123.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 123.2 +++ b/ffmpeg_smp/h264dec/libavcodec/mathops.h Mon Aug 27 12:09:56 2012 +0200 123.3 @@ -0,0 +1,145 @@ 123.4 +/* 123.5 + * simple math operations 123.6 + * Copyright (c) 2001, 2002 Fabrice Bellard 123.7 + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al 123.8 + * 123.9 + * This file is part of FFmpeg. 123.10 + * 123.11 + * FFmpeg is free software; you can redistribute it and/or 123.12 + * modify it under the terms of the GNU Lesser General Public 123.13 + * License as published by the Free Software Foundation; either 123.14 + * version 2.1 of the License, or (at your option) any later version. 123.15 + * 123.16 + * FFmpeg is distributed in the hope that it will be useful, 123.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 123.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 123.19 + * Lesser General Public License for more details. 123.20 + * 123.21 + * You should have received a copy of the GNU Lesser General Public 123.22 + * License along with FFmpeg; if not, write to the Free Software 123.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 123.24 + */ 123.25 +#ifndef AVCODEC_MATHOPS_H 123.26 +#define AVCODEC_MATHOPS_H 123.27 + 123.28 +#include "libavutil/common.h" 123.29 +#include "libavutil/internal.h" 123.30 + 123.31 +#if ARCH_ARM 123.32 +# include "arm/mathops.h" 123.33 +#elif ARCH_PPC 123.34 +# include "ppc/mathops.h" 123.35 +#elif ARCH_X86 123.36 +# include "x86/mathops.h" 123.37 +#endif 123.38 + 123.39 +/* generic implementation */ 123.40 + 123.41 +#ifndef MULL 123.42 +# define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s)) 123.43 +#endif 123.44 + 123.45 +#ifndef MULH 123.46 +//gcc 3.4 creates an incredibly bloated mess out of this 123.47 +//# define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32) 123.48 + 123.49 +static av_always_inline int MULH(int a, int b){ 123.50 + return ((int64_t)(a) * 
(int64_t)(b))>>32; 123.51 +} 123.52 +#endif 123.53 + 123.54 +#ifndef UMULH 123.55 +static av_always_inline unsigned UMULH(unsigned a, unsigned b){ 123.56 + return ((uint64_t)(a) * (uint64_t)(b))>>32; 123.57 +} 123.58 +#endif 123.59 + 123.60 +#ifndef MUL64 123.61 +# define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) 123.62 +#endif 123.63 + 123.64 +#ifndef MAC64 123.65 +# define MAC64(d, a, b) ((d) += MUL64(a, b)) 123.66 +#endif 123.67 + 123.68 +#ifndef MLS64 123.69 +# define MLS64(d, a, b) ((d) -= MUL64(a, b)) 123.70 +#endif 123.71 + 123.72 +/* signed 16x16 -> 32 multiply add accumulate */ 123.73 +#ifndef MAC16 123.74 +# define MAC16(rt, ra, rb) rt += (ra) * (rb) 123.75 +#endif 123.76 + 123.77 +/* signed 16x16 -> 32 multiply */ 123.78 +#ifndef MUL16 123.79 +# define MUL16(ra, rb) ((ra) * (rb)) 123.80 +#endif 123.81 + 123.82 +#ifndef MLS16 123.83 +# define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb)) 123.84 +#endif 123.85 + 123.86 +/* median of 3 */ 123.87 +#ifndef mid_pred 123.88 +#define mid_pred mid_pred 123.89 +static inline av_const int mid_pred(int a, int b, int c) 123.90 +{ 123.91 +#if 0 123.92 + int t= (a-b)&((a-b)>>31); 123.93 + a-=t; 123.94 + b+=t; 123.95 + b-= (b-c)&((b-c)>>31); 123.96 + b+= (a-b)&((a-b)>>31); 123.97 + 123.98 + return b; 123.99 +#else 123.100 + if(a>b){ 123.101 + if(c>b){ 123.102 + if(c>a) b=a; 123.103 + else b=c; 123.104 + } 123.105 + }else{ 123.106 + if(b>c){ 123.107 + if(c>a) b=c; 123.108 + else b=a; 123.109 + } 123.110 + } 123.111 + return b; 123.112 +#endif 123.113 +} 123.114 +#endif 123.115 + 123.116 +#ifndef sign_extend 123.117 +static inline av_const int sign_extend(int val, unsigned bits) 123.118 +{ 123.119 + return (val << (INT_BIT - bits)) >> (INT_BIT - bits); 123.120 +} 123.121 +#endif 123.122 + 123.123 +#ifndef zero_extend 123.124 +static inline av_const unsigned zero_extend(unsigned val, unsigned bits) 123.125 +{ 123.126 + return (val << (INT_BIT - bits)) >> (INT_BIT - bits); 123.127 +} 123.128 +#endif 123.129 + 123.130 +#ifndef 
COPY3_IF_LT 123.131 +#define COPY3_IF_LT(x, y, a, b, c, d)\ 123.132 +if ((y) < (x)) {\ 123.133 + (x) = (y);\ 123.134 + (a) = (b);\ 123.135 + (c) = (d);\ 123.136 +} 123.137 +#endif 123.138 + 123.139 +#ifndef NEG_SSR32 123.140 +# define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s))) 123.141 +#endif 123.142 + 123.143 +#ifndef NEG_USR32 123.144 +# define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s))) 123.145 +#endif 123.146 + 123.147 +#endif /* AVCODEC_MATHOPS_H */ 123.148 +
124.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 124.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.c Mon Aug 27 12:09:56 2012 +0200 124.3 @@ -0,0 +1,619 @@ 124.4 +/* 124.5 + * Copyright (c) 2002 Brian Foley 124.6 + * Copyright (c) 2002 Dieter Shirley 124.7 + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> 124.8 + * 124.9 + * This file is part of FFmpeg. 124.10 + * 124.11 + * FFmpeg is free software; you can redistribute it and/or 124.12 + * modify it under the terms of the GNU Lesser General Public 124.13 + * License as published by the Free Software Foundation; either 124.14 + * version 2.1 of the License, or (at your option) any later version. 124.15 + * 124.16 + * FFmpeg is distributed in the hope that it will be useful, 124.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 124.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 124.19 + * Lesser General Public License for more details. 124.20 + * 124.21 + * You should have received a copy of the GNU Lesser General Public 124.22 + * License along with FFmpeg; if not, write to the Free Software 124.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 124.24 + */ 124.25 + 124.26 +#include "config.h" 124.27 +#if HAVE_ALTIVEC_H 124.28 +#include <altivec.h> 124.29 +#endif 124.30 +#include "libavcodec/dsputil.h" 124.31 +#include "dsputil_ppc.h" 124.32 +#include "util_altivec.h" 124.33 +#include "types_altivec.h" 124.34 +#include "dsputil_altivec.h" 124.35 + 124.36 + 124.37 +static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) 124.38 +{ 124.39 + int i; 124.40 + vector unsigned char perm, bytes, *pixv; 124.41 + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 124.42 + vector signed short shorts; 124.43 + 124.44 + for (i = 0; i < 8; i++) { 124.45 + // Read potentially unaligned pixels. 
124.46 + // We're reading 16 pixels, and actually only want 8, 124.47 + // but we simply ignore the extras. 124.48 + perm = vec_lvsl(0, pixels); 124.49 + pixv = (vector unsigned char *) pixels; 124.50 + bytes = vec_perm(pixv[0], pixv[1], perm); 124.51 + 124.52 + // convert the bytes into shorts 124.53 + shorts = (vector signed short)vec_mergeh(zero, bytes); 124.54 + 124.55 + // save the data to the block, we assume the block is 16-byte aligned 124.56 + vec_st(shorts, i*16, (vector signed short*)block); 124.57 + 124.58 + pixels += line_size; 124.59 + } 124.60 +} 124.61 + 124.62 +static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, 124.63 + const uint8_t *s2, int stride) 124.64 +{ 124.65 + int i; 124.66 + vector unsigned char perm, bytes, *pixv; 124.67 + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 124.68 + vector signed short shorts1, shorts2; 124.69 + 124.70 + for (i = 0; i < 4; i++) { 124.71 + // Read potentially unaligned pixels 124.72 + // We're reading 16 pixels, and actually only want 8, 124.73 + // but we simply ignore the extras. 
124.74 + perm = vec_lvsl(0, s1); 124.75 + pixv = (vector unsigned char *) s1; 124.76 + bytes = vec_perm(pixv[0], pixv[1], perm); 124.77 + 124.78 + // convert the bytes into shorts 124.79 + shorts1 = (vector signed short)vec_mergeh(zero, bytes); 124.80 + 124.81 + // Do the same for the second block of pixels 124.82 + perm = vec_lvsl(0, s2); 124.83 + pixv = (vector unsigned char *) s2; 124.84 + bytes = vec_perm(pixv[0], pixv[1], perm); 124.85 + 124.86 + // convert the bytes into shorts 124.87 + shorts2 = (vector signed short)vec_mergeh(zero, bytes); 124.88 + 124.89 + // Do the subtraction 124.90 + shorts1 = vec_sub(shorts1, shorts2); 124.91 + 124.92 + // save the data to the block, we assume the block is 16-byte aligned 124.93 + vec_st(shorts1, 0, (vector signed short*)block); 124.94 + 124.95 + s1 += stride; 124.96 + s2 += stride; 124.97 + block += 8; 124.98 + 124.99 + 124.100 + // The code below is a copy of the code above... This is a manual 124.101 + // unroll. 124.102 + 124.103 + // Read potentially unaligned pixels 124.104 + // We're reading 16 pixels, and actually only want 8, 124.105 + // but we simply ignore the extras. 
124.106 + perm = vec_lvsl(0, s1); 124.107 + pixv = (vector unsigned char *) s1; 124.108 + bytes = vec_perm(pixv[0], pixv[1], perm); 124.109 + 124.110 + // convert the bytes into shorts 124.111 + shorts1 = (vector signed short)vec_mergeh(zero, bytes); 124.112 + 124.113 + // Do the same for the second block of pixels 124.114 + perm = vec_lvsl(0, s2); 124.115 + pixv = (vector unsigned char *) s2; 124.116 + bytes = vec_perm(pixv[0], pixv[1], perm); 124.117 + 124.118 + // convert the bytes into shorts 124.119 + shorts2 = (vector signed short)vec_mergeh(zero, bytes); 124.120 + 124.121 + // Do the subtraction 124.122 + shorts1 = vec_sub(shorts1, shorts2); 124.123 + 124.124 + // save the data to the block, we assume the block is 16-byte aligned 124.125 + vec_st(shorts1, 0, (vector signed short*)block); 124.126 + 124.127 + s1 += stride; 124.128 + s2 += stride; 124.129 + block += 8; 124.130 + } 124.131 +} 124.132 + 124.133 + 124.134 +static void clear_block_altivec(DCTELEM *block) { 124.135 + LOAD_ZERO; 124.136 + vec_st(zero_s16v, 0, block); 124.137 + vec_st(zero_s16v, 16, block); 124.138 + vec_st(zero_s16v, 32, block); 124.139 + vec_st(zero_s16v, 48, block); 124.140 + vec_st(zero_s16v, 64, block); 124.141 + vec_st(zero_s16v, 80, block); 124.142 + vec_st(zero_s16v, 96, block); 124.143 + vec_st(zero_s16v, 112, block); 124.144 +} 124.145 + 124.146 + 124.147 + 124.148 +/* next one assumes that ((line_size % 16) == 0) */ 124.149 +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 124.150 +{ 124.151 +POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); 124.152 + register vector unsigned char pixelsv1, pixelsv2; 124.153 + register vector unsigned char pixelsv1B, pixelsv2B; 124.154 + register vector unsigned char pixelsv1C, pixelsv2C; 124.155 + register vector unsigned char pixelsv1D, pixelsv2D; 124.156 + 124.157 + register vector unsigned char perm = vec_lvsl(0, pixels); 124.158 + int i; 124.159 + register int line_size_2 = line_size << 1; 124.160 
+ register int line_size_3 = line_size + line_size_2; 124.161 + register int line_size_4 = line_size << 2; 124.162 + 124.163 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); 124.164 +// hand-unrolling the loop by 4 gains about 15% 124.165 +// mininum execution time goes from 74 to 60 cycles 124.166 +// it's faster than -funroll-loops, but using 124.167 +// -funroll-loops w/ this is bad - 74 cycles again. 124.168 +// all this is on a 7450, tuning for the 7450 124.169 +#if 0 124.170 + for (i = 0; i < h; i++) { 124.171 + pixelsv1 = vec_ld(0, pixels); 124.172 + pixelsv2 = vec_ld(16, pixels); 124.173 + vec_st(vec_perm(pixelsv1, pixelsv2, perm), 124.174 + 0, block); 124.175 + pixels+=line_size; 124.176 + block +=line_size; 124.177 + } 124.178 +#else 124.179 + for (i = 0; i < h; i += 4) { 124.180 + pixelsv1 = vec_ld( 0, pixels); 124.181 + pixelsv2 = vec_ld(15, pixels); 124.182 + pixelsv1B = vec_ld(line_size, pixels); 124.183 + pixelsv2B = vec_ld(15 + line_size, pixels); 124.184 + pixelsv1C = vec_ld(line_size_2, pixels); 124.185 + pixelsv2C = vec_ld(15 + line_size_2, pixels); 124.186 + pixelsv1D = vec_ld(line_size_3, pixels); 124.187 + pixelsv2D = vec_ld(15 + line_size_3, pixels); 124.188 + vec_st(vec_perm(pixelsv1, pixelsv2, perm), 124.189 + 0, (unsigned char*)block); 124.190 + vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), 124.191 + line_size, (unsigned char*)block); 124.192 + vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), 124.193 + line_size_2, (unsigned char*)block); 124.194 + vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), 124.195 + line_size_3, (unsigned char*)block); 124.196 + pixels+=line_size_4; 124.197 + block +=line_size_4; 124.198 + } 124.199 +#endif 124.200 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); 124.201 +} 124.202 + 124.203 +/* next one assumes that ((line_size % 16) == 0) */ 124.204 +#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) 124.205 +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int 
h) 124.206 +{ 124.207 +POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); 124.208 + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 124.209 + register vector unsigned char perm = vec_lvsl(0, pixels); 124.210 + int i; 124.211 + 124.212 +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); 124.213 + 124.214 + for (i = 0; i < h; i++) { 124.215 + pixelsv1 = vec_ld( 0, pixels); 124.216 + pixelsv2 = vec_ld(16,pixels); 124.217 + blockv = vec_ld(0, block); 124.218 + pixelsv = vec_perm(pixelsv1, pixelsv2, perm); 124.219 + blockv = vec_avg(blockv,pixelsv); 124.220 + vec_st(blockv, 0, (unsigned char*)block); 124.221 + pixels+=line_size; 124.222 + block +=line_size; 124.223 + } 124.224 + 124.225 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); 124.226 +} 124.227 + 124.228 +/* next one assumes that ((line_size % 8) == 0) */ 124.229 +static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 124.230 +{ 124.231 +POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); 124.232 + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 124.233 + int i; 124.234 + 124.235 +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); 124.236 + 124.237 + for (i = 0; i < h; i++) { 124.238 + /* block is 8 bytes-aligned, so we're either in the 124.239 + left block (16 bytes-aligned) or in the right block (not) */ 124.240 + int rightside = ((unsigned long)block & 0x0000000F); 124.241 + 124.242 + blockv = vec_ld(0, block); 124.243 + pixelsv1 = vec_ld( 0, pixels); 124.244 + pixelsv2 = vec_ld(16, pixels); 124.245 + pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); 124.246 + 124.247 + if (rightside) { 124.248 + pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); 124.249 + } else { 124.250 + pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); 124.251 + } 124.252 + 124.253 + blockv = vec_avg(blockv, pixelsv); 124.254 + 124.255 + vec_st(blockv, 0, block); 124.256 + 124.257 + pixels += line_size; 124.258 + block += 
line_size; 124.259 + } 124.260 + 124.261 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); 124.262 +} 124.263 + 124.264 +/* next one assumes that ((line_size % 8) == 0) */ 124.265 +static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 124.266 +{ 124.267 +POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); 124.268 + register int i; 124.269 + register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 124.270 + register vector unsigned char blockv, temp1, temp2; 124.271 + register vector unsigned short pixelssum1, pixelssum2, temp3; 124.272 + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 124.273 + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 124.274 + 124.275 + temp1 = vec_ld(0, pixels); 124.276 + temp2 = vec_ld(16, pixels); 124.277 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 124.278 + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 124.279 + pixelsv2 = temp2; 124.280 + } else { 124.281 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); 124.282 + } 124.283 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.284 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.285 + pixelssum1 = vec_add((vector unsigned short)pixelsv1, 124.286 + (vector unsigned short)pixelsv2); 124.287 + pixelssum1 = vec_add(pixelssum1, vctwo); 124.288 + 124.289 +POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); 124.290 + for (i = 0; i < h ; i++) { 124.291 + int rightside = ((unsigned long)block & 0x0000000F); 124.292 + blockv = vec_ld(0, block); 124.293 + 124.294 + temp1 = vec_ld(line_size, pixels); 124.295 + temp2 = vec_ld(line_size + 16, pixels); 124.296 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 124.297 + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 124.298 + pixelsv2 = temp2; 124.299 + } else { 124.300 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); 
124.301 + } 124.302 + 124.303 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.304 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.305 + pixelssum2 = vec_add((vector unsigned short)pixelsv1, 124.306 + (vector unsigned short)pixelsv2); 124.307 + temp3 = vec_add(pixelssum1, pixelssum2); 124.308 + temp3 = vec_sra(temp3, vctwo); 124.309 + pixelssum1 = vec_add(pixelssum2, vctwo); 124.310 + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 124.311 + 124.312 + if (rightside) { 124.313 + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 124.314 + } else { 124.315 + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 124.316 + } 124.317 + 124.318 + vec_st(blockv, 0, block); 124.319 + 124.320 + block += line_size; 124.321 + pixels += line_size; 124.322 + } 124.323 + 124.324 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); 124.325 +} 124.326 + 124.327 +/* next one assumes that ((line_size % 8) == 0) */ 124.328 +static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 124.329 +{ 124.330 +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); 124.331 + register int i; 124.332 + register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 124.333 + register vector unsigned char blockv, temp1, temp2; 124.334 + register vector unsigned short pixelssum1, pixelssum2, temp3; 124.335 + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 124.336 + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 124.337 + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 124.338 + 124.339 + temp1 = vec_ld(0, pixels); 124.340 + temp2 = vec_ld(16, pixels); 124.341 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 124.342 + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 124.343 + pixelsv2 = temp2; 124.344 + } else { 124.345 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, 
pixels)); 124.346 + } 124.347 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.348 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.349 + pixelssum1 = vec_add((vector unsigned short)pixelsv1, 124.350 + (vector unsigned short)pixelsv2); 124.351 + pixelssum1 = vec_add(pixelssum1, vcone); 124.352 + 124.353 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); 124.354 + for (i = 0; i < h ; i++) { 124.355 + int rightside = ((unsigned long)block & 0x0000000F); 124.356 + blockv = vec_ld(0, block); 124.357 + 124.358 + temp1 = vec_ld(line_size, pixels); 124.359 + temp2 = vec_ld(line_size + 16, pixels); 124.360 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 124.361 + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 124.362 + pixelsv2 = temp2; 124.363 + } else { 124.364 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); 124.365 + } 124.366 + 124.367 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.368 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.369 + pixelssum2 = vec_add((vector unsigned short)pixelsv1, 124.370 + (vector unsigned short)pixelsv2); 124.371 + temp3 = vec_add(pixelssum1, pixelssum2); 124.372 + temp3 = vec_sra(temp3, vctwo); 124.373 + pixelssum1 = vec_add(pixelssum2, vcone); 124.374 + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 124.375 + 124.376 + if (rightside) { 124.377 + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 124.378 + } else { 124.379 + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 124.380 + } 124.381 + 124.382 + vec_st(blockv, 0, block); 124.383 + 124.384 + block += line_size; 124.385 + pixels += line_size; 124.386 + } 124.387 + 124.388 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); 124.389 +} 124.390 + 124.391 +/* next one assumes that ((line_size % 16) == 0) */ 124.392 +static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 124.393 +{ 124.394 
+POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); 124.395 + register int i; 124.396 + register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; 124.397 + register vector unsigned char blockv, temp1, temp2; 124.398 + register vector unsigned short temp3, temp4, 124.399 + pixelssum1, pixelssum2, pixelssum3, pixelssum4; 124.400 + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 124.401 + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 124.402 + 124.403 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); 124.404 + 124.405 + temp1 = vec_ld(0, pixels); 124.406 + temp2 = vec_ld(16, pixels); 124.407 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 124.408 + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 124.409 + pixelsv2 = temp2; 124.410 + } else { 124.411 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); 124.412 + } 124.413 + pixelsv3 = vec_mergel(vczero, pixelsv1); 124.414 + pixelsv4 = vec_mergel(vczero, pixelsv2); 124.415 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.416 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.417 + pixelssum3 = vec_add((vector unsigned short)pixelsv3, 124.418 + (vector unsigned short)pixelsv4); 124.419 + pixelssum3 = vec_add(pixelssum3, vctwo); 124.420 + pixelssum1 = vec_add((vector unsigned short)pixelsv1, 124.421 + (vector unsigned short)pixelsv2); 124.422 + pixelssum1 = vec_add(pixelssum1, vctwo); 124.423 + 124.424 + for (i = 0; i < h ; i++) { 124.425 + blockv = vec_ld(0, block); 124.426 + 124.427 + temp1 = vec_ld(line_size, pixels); 124.428 + temp2 = vec_ld(line_size + 16, pixels); 124.429 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 124.430 + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 124.431 + pixelsv2 = temp2; 124.432 + } else { 124.433 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); 124.434 + } 124.435 + 124.436 + pixelsv3 = 
vec_mergel(vczero, pixelsv1); 124.437 + pixelsv4 = vec_mergel(vczero, pixelsv2); 124.438 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.439 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.440 + 124.441 + pixelssum4 = vec_add((vector unsigned short)pixelsv3, 124.442 + (vector unsigned short)pixelsv4); 124.443 + pixelssum2 = vec_add((vector unsigned short)pixelsv1, 124.444 + (vector unsigned short)pixelsv2); 124.445 + temp4 = vec_add(pixelssum3, pixelssum4); 124.446 + temp4 = vec_sra(temp4, vctwo); 124.447 + temp3 = vec_add(pixelssum1, pixelssum2); 124.448 + temp3 = vec_sra(temp3, vctwo); 124.449 + 124.450 + pixelssum3 = vec_add(pixelssum4, vctwo); 124.451 + pixelssum1 = vec_add(pixelssum2, vctwo); 124.452 + 124.453 + blockv = vec_packsu(temp3, temp4); 124.454 + 124.455 + vec_st(blockv, 0, block); 124.456 + 124.457 + block += line_size; 124.458 + pixels += line_size; 124.459 + } 124.460 + 124.461 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); 124.462 +} 124.463 + 124.464 +/* next one assumes that ((line_size % 16) == 0) */ 124.465 +static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 124.466 +{ 124.467 +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); 124.468 + register int i; 124.469 + register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; 124.470 + register vector unsigned char blockv, temp1, temp2; 124.471 + register vector unsigned short temp3, temp4, 124.472 + pixelssum1, pixelssum2, pixelssum3, pixelssum4; 124.473 + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 124.474 + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 124.475 + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 124.476 + 124.477 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); 124.478 + 124.479 + temp1 = vec_ld(0, pixels); 124.480 + temp2 = vec_ld(16, 
pixels); 124.481 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 124.482 + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 124.483 + pixelsv2 = temp2; 124.484 + } else { 124.485 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); 124.486 + } 124.487 + pixelsv3 = vec_mergel(vczero, pixelsv1); 124.488 + pixelsv4 = vec_mergel(vczero, pixelsv2); 124.489 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.490 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.491 + pixelssum3 = vec_add((vector unsigned short)pixelsv3, 124.492 + (vector unsigned short)pixelsv4); 124.493 + pixelssum3 = vec_add(pixelssum3, vcone); 124.494 + pixelssum1 = vec_add((vector unsigned short)pixelsv1, 124.495 + (vector unsigned short)pixelsv2); 124.496 + pixelssum1 = vec_add(pixelssum1, vcone); 124.497 + 124.498 + for (i = 0; i < h ; i++) { 124.499 + blockv = vec_ld(0, block); 124.500 + 124.501 + temp1 = vec_ld(line_size, pixels); 124.502 + temp2 = vec_ld(line_size + 16, pixels); 124.503 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 124.504 + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 124.505 + pixelsv2 = temp2; 124.506 + } else { 124.507 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); 124.508 + } 124.509 + 124.510 + pixelsv3 = vec_mergel(vczero, pixelsv1); 124.511 + pixelsv4 = vec_mergel(vczero, pixelsv2); 124.512 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.513 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.514 + 124.515 + pixelssum4 = vec_add((vector unsigned short)pixelsv3, 124.516 + (vector unsigned short)pixelsv4); 124.517 + pixelssum2 = vec_add((vector unsigned short)pixelsv1, 124.518 + (vector unsigned short)pixelsv2); 124.519 + temp4 = vec_add(pixelssum3, pixelssum4); 124.520 + temp4 = vec_sra(temp4, vctwo); 124.521 + temp3 = vec_add(pixelssum1, pixelssum2); 124.522 + temp3 = vec_sra(temp3, vctwo); 124.523 + 124.524 + pixelssum3 = vec_add(pixelssum4, vcone); 124.525 + pixelssum1 = 
vec_add(pixelssum2, vcone); 124.526 + 124.527 + blockv = vec_packsu(temp3, temp4); 124.528 + 124.529 + vec_st(blockv, 0, block); 124.530 + 124.531 + block += line_size; 124.532 + pixels += line_size; 124.533 + } 124.534 + 124.535 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); 124.536 +} 124.537 + 124.538 +/* next one assumes that ((line_size % 8) == 0) */ 124.539 +static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 124.540 +{ 124.541 +POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); 124.542 + register int i; 124.543 + register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 124.544 + register vector unsigned char blockv, temp1, temp2, blocktemp; 124.545 + register vector unsigned short pixelssum1, pixelssum2, temp3; 124.546 + 124.547 + register const vector unsigned char vczero = (const vector unsigned char) 124.548 + vec_splat_u8(0); 124.549 + register const vector unsigned short vctwo = (const vector unsigned short) 124.550 + vec_splat_u16(2); 124.551 + 124.552 + temp1 = vec_ld(0, pixels); 124.553 + temp2 = vec_ld(16, pixels); 124.554 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); 124.555 + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 124.556 + pixelsv2 = temp2; 124.557 + } else { 124.558 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); 124.559 + } 124.560 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.561 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.562 + pixelssum1 = vec_add((vector unsigned short)pixelsv1, 124.563 + (vector unsigned short)pixelsv2); 124.564 + pixelssum1 = vec_add(pixelssum1, vctwo); 124.565 + 124.566 +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); 124.567 + for (i = 0; i < h ; i++) { 124.568 + int rightside = ((unsigned long)block & 0x0000000F); 124.569 + blockv = vec_ld(0, block); 124.570 + 124.571 + temp1 = vec_ld(line_size, pixels); 124.572 + temp2 = vec_ld(line_size + 16, pixels); 124.573 + pixelsv1 = vec_perm(temp1, 
temp2, vec_lvsl(line_size, pixels)); 124.574 + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 124.575 + pixelsv2 = temp2; 124.576 + } else { 124.577 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); 124.578 + } 124.579 + 124.580 + pixelsv1 = vec_mergeh(vczero, pixelsv1); 124.581 + pixelsv2 = vec_mergeh(vczero, pixelsv2); 124.582 + pixelssum2 = vec_add((vector unsigned short)pixelsv1, 124.583 + (vector unsigned short)pixelsv2); 124.584 + temp3 = vec_add(pixelssum1, pixelssum2); 124.585 + temp3 = vec_sra(temp3, vctwo); 124.586 + pixelssum1 = vec_add(pixelssum2, vctwo); 124.587 + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 124.588 + 124.589 + if (rightside) { 124.590 + blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 124.591 + } else { 124.592 + blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 124.593 + } 124.594 + 124.595 + blockv = vec_avg(blocktemp, blockv); 124.596 + vec_st(blockv, 0, block); 124.597 + 124.598 + block += line_size; 124.599 + pixels += line_size; 124.600 + } 124.601 + 124.602 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); 124.603 +} 124.604 + 124.605 +void dsputil_init_altivec(DSPContext* c) 124.606 +{ 124.607 + c->diff_pixels = diff_pixels_altivec; 124.608 + c->get_pixels = get_pixels_altivec; 124.609 + c->clear_block = clear_block_altivec; 124.610 + 124.611 + c->put_pixels_tab[0][0] = put_pixels16_altivec; 124.612 + /* the two functions do the same thing, so use the same code */ 124.613 + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; 124.614 + c->avg_pixels_tab[0][0] = avg_pixels16_altivec; 124.615 + c->avg_pixels_tab[1][0] = avg_pixels8_altivec; 124.616 + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; 124.617 + c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; 124.618 + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; 124.619 + c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; 124.620 + 
c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; 124.621 + 124.622 +}
125.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 125.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.h Mon Aug 27 12:09:56 2012 +0200 125.3 @@ -0,0 +1,52 @@ 125.4 +/* 125.5 + * Copyright (c) 2002 Brian Foley 125.6 + * Copyright (c) 2002 Dieter Shirley 125.7 + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> 125.8 + * 125.9 + * This file is part of FFmpeg. 125.10 + * 125.11 + * FFmpeg is free software; you can redistribute it and/or 125.12 + * modify it under the terms of the GNU Lesser General Public 125.13 + * License as published by the Free Software Foundation; either 125.14 + * version 2.1 of the License, or (at your option) any later version. 125.15 + * 125.16 + * FFmpeg is distributed in the hope that it will be useful, 125.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 125.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 125.19 + * Lesser General Public License for more details. 125.20 + * 125.21 + * You should have received a copy of the GNU Lesser General Public 125.22 + * License along with FFmpeg; if not, write to the Free Software 125.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 125.24 + */ 125.25 + 125.26 +#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H 125.27 +#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H 125.28 + 125.29 +#include <stdint.h> 125.30 +#include "libavcodec/dsputil.h" 125.31 + 125.32 +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); 125.33 + 125.34 +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); 125.35 + 125.36 +int has_altivec(void); 125.37 + 125.38 +void fdct_altivec(int16_t *block); 125.39 +void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, 125.40 + int x16, int y16, int rounder); 125.41 +void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); 125.42 +void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); 125.43 + 125.44 +void 
ff_vp3_idct_altivec(DCTELEM *block); 125.45 +void ff_vp3_idct_put_altivec(uint8_t *dest, int line_size, DCTELEM *block); 125.46 +void ff_vp3_idct_add_altivec(uint8_t *dest, int line_size, DCTELEM *block); 125.47 + 125.48 +void dsputil_h264_init_ppc(DSPContext* c); 125.49 + 125.50 +void dsputil_init_altivec(DSPContext* c); 125.51 +//void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx); 125.52 +//void float_init_altivec(DSPContext* c, AVCodecContext *avctx); 125.53 +//void int_init_altivec(DSPContext* c, AVCodecContext *avctx); 125.54 + 125.55 +#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */
126.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 126.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.c Mon Aug 27 12:09:56 2012 +0200 126.3 @@ -0,0 +1,48 @@ 126.4 +/* 126.5 + * Copyright (c) 2002 Brian Foley 126.6 + * Copyright (c) 2002 Dieter Shirley 126.7 + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> 126.8 + * 126.9 + * This file is part of FFmpeg. 126.10 + * 126.11 + * FFmpeg is free software; you can redistribute it and/or 126.12 + * modify it under the terms of the GNU Lesser General Public 126.13 + * License as published by the Free Software Foundation; either 126.14 + * version 2.1 of the License, or (at your option) any later version. 126.15 + * 126.16 + * FFmpeg is distributed in the hope that it will be useful, 126.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 126.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 126.19 + * Lesser General Public License for more details. 126.20 + * 126.21 + * You should have received a copy of the GNU Lesser General Public 126.22 + * License along with FFmpeg; if not, write to the Free Software 126.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 126.24 + */ 126.25 + 126.26 +#include "libavcodec/dsputil.h" 126.27 +#include "dsputil_ppc.h" 126.28 +#include "dsputil_altivec.h" 126.29 + 126.30 +static void prefetch_ppc(void *mem, int stride, int h) 126.31 +{ 126.32 + register const uint8_t *p = mem; 126.33 + do { 126.34 + __asm__ volatile ("dcbt 0,%0" : : "r" (p)); 126.35 + p+= stride; 126.36 + } while(--h); 126.37 +} 126.38 + 126.39 +void dsputil_init_ppc(DSPContext* c) 126.40 +{ 126.41 + c->prefetch = prefetch_ppc; 126.42 + 126.43 +#if HAVE_ALTIVEC 126.44 + dsputil_h264_init_ppc(c); 126.45 + dsputil_init_altivec(c); 126.46 + 126.47 + c->idct_put = idct_put_altivec; 126.48 + c->idct_add = idct_add_altivec; 126.49 + 126.50 +#endif /* HAVE_ALTIVEC */ 126.51 +}
127.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 127.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.h Mon Aug 27 12:09:56 2012 +0200 127.3 @@ -0,0 +1,154 @@ 127.4 +/* 127.5 + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> 127.6 + * 127.7 + * This file is part of FFmpeg. 127.8 + * 127.9 + * FFmpeg is free software; you can redistribute it and/or 127.10 + * modify it under the terms of the GNU Lesser General Public 127.11 + * License as published by the Free Software Foundation; either 127.12 + * version 2.1 of the License, or (at your option) any later version. 127.13 + * 127.14 + * FFmpeg is distributed in the hope that it will be useful, 127.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 127.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 127.17 + * Lesser General Public License for more details. 127.18 + * 127.19 + * You should have received a copy of the GNU Lesser General Public 127.20 + * License along with FFmpeg; if not, write to the Free Software 127.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 127.22 + */ 127.23 + 127.24 +#ifndef AVCODEC_PPC_DSPUTIL_PPC_H 127.25 +#define AVCODEC_PPC_DSPUTIL_PPC_H 127.26 + 127.27 +#include "config.h" 127.28 + 127.29 +#if CONFIG_POWERPC_PERF 127.30 +void powerpc_display_perf_report(void); 127.31 +/* the 604* have 2, the G3* have 4, the G4s have 6, 127.32 + and the G5 are completely different (they MUST use 127.33 + ARCH_PPC64, and let's hope all future 64 bis PPC 127.34 + will use the same PMCs... 
*/ 127.35 +#define POWERPC_NUM_PMC_ENABLED 6 127.36 +/* if you add to the enum below, also add to the perfname array 127.37 + in dsputil_ppc.c */ 127.38 +enum powerpc_perf_index { 127.39 + altivec_fft_num = 0, 127.40 + altivec_gmc1_num, 127.41 + altivec_dct_unquantize_h263_num, 127.42 + altivec_fdct, 127.43 + altivec_idct_add_num, 127.44 + altivec_idct_put_num, 127.45 + altivec_put_pixels16_num, 127.46 + altivec_avg_pixels16_num, 127.47 + altivec_avg_pixels8_num, 127.48 + altivec_put_pixels8_xy2_num, 127.49 + altivec_put_no_rnd_pixels8_xy2_num, 127.50 + altivec_put_pixels16_xy2_num, 127.51 + altivec_put_no_rnd_pixels16_xy2_num, 127.52 + altivec_hadamard8_diff8x8_num, 127.53 + altivec_hadamard8_diff16_num, 127.54 + altivec_avg_pixels8_xy2_num, 127.55 + powerpc_clear_blocks_dcbz32, 127.56 + powerpc_clear_blocks_dcbz128, 127.57 + altivec_put_h264_chroma_mc8_num, 127.58 + altivec_avg_h264_chroma_mc8_num, 127.59 + altivec_put_h264_qpel16_h_lowpass_num, 127.60 + altivec_avg_h264_qpel16_h_lowpass_num, 127.61 + altivec_put_h264_qpel16_v_lowpass_num, 127.62 + altivec_avg_h264_qpel16_v_lowpass_num, 127.63 + altivec_put_h264_qpel16_hv_lowpass_num, 127.64 + altivec_avg_h264_qpel16_hv_lowpass_num, 127.65 + powerpc_perf_total 127.66 +}; 127.67 +enum powerpc_data_index { 127.68 + powerpc_data_min = 0, 127.69 + powerpc_data_max, 127.70 + powerpc_data_sum, 127.71 + powerpc_data_num, 127.72 + powerpc_data_total 127.73 +}; 127.74 +extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; 127.75 + 127.76 +#if !ARCH_PPC64 127.77 +#define POWERP_PMC_DATATYPE unsigned long 127.78 +#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a)) 127.79 +#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a)) 127.80 +#if (POWERPC_NUM_PMC_ENABLED > 2) 127.81 +#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a)) 127.82 +#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a)) 127.83 +#else 
127.84 +#define POWERPC_GET_PMC3(a) do {} while (0) 127.85 +#define POWERPC_GET_PMC4(a) do {} while (0) 127.86 +#endif 127.87 +#if (POWERPC_NUM_PMC_ENABLED > 4) 127.88 +#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a)) 127.89 +#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a)) 127.90 +#else 127.91 +#define POWERPC_GET_PMC5(a) do {} while (0) 127.92 +#define POWERPC_GET_PMC6(a) do {} while (0) 127.93 +#endif 127.94 +#else /* ARCH_PPC64 */ 127.95 +#define POWERP_PMC_DATATYPE unsigned long long 127.96 +#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a)) 127.97 +#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a)) 127.98 +#if (POWERPC_NUM_PMC_ENABLED > 2) 127.99 +#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a)) 127.100 +#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a)) 127.101 +#else 127.102 +#define POWERPC_GET_PMC3(a) do {} while (0) 127.103 +#define POWERPC_GET_PMC4(a) do {} while (0) 127.104 +#endif 127.105 +#if (POWERPC_NUM_PMC_ENABLED > 4) 127.106 +#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a)) 127.107 +#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a)) 127.108 +#else 127.109 +#define POWERPC_GET_PMC5(a) do {} while (0) 127.110 +#define POWERPC_GET_PMC6(a) do {} while (0) 127.111 +#endif 127.112 +#endif /* ARCH_PPC64 */ 127.113 +#define POWERPC_PERF_DECLARE(a, cond) \ 127.114 + POWERP_PMC_DATATYPE \ 127.115 + pmc_start[POWERPC_NUM_PMC_ENABLED], \ 127.116 + pmc_stop[POWERPC_NUM_PMC_ENABLED], \ 127.117 + pmc_loop_index; 127.118 +#define POWERPC_PERF_START_COUNT(a, cond) do { \ 127.119 + POWERPC_GET_PMC6(pmc_start[5]); \ 127.120 + POWERPC_GET_PMC5(pmc_start[4]); \ 127.121 + POWERPC_GET_PMC4(pmc_start[3]); \ 127.122 + POWERPC_GET_PMC3(pmc_start[2]); \ 127.123 + POWERPC_GET_PMC2(pmc_start[1]); \ 127.124 + POWERPC_GET_PMC1(pmc_start[0]); \ 127.125 + } while (0) 127.126 +#define 
POWERPC_PERF_STOP_COUNT(a, cond) do { \ 127.127 + POWERPC_GET_PMC1(pmc_stop[0]); \ 127.128 + POWERPC_GET_PMC2(pmc_stop[1]); \ 127.129 + POWERPC_GET_PMC3(pmc_stop[2]); \ 127.130 + POWERPC_GET_PMC4(pmc_stop[3]); \ 127.131 + POWERPC_GET_PMC5(pmc_stop[4]); \ 127.132 + POWERPC_GET_PMC6(pmc_stop[5]); \ 127.133 + if (cond) { \ 127.134 + for(pmc_loop_index = 0; \ 127.135 + pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ 127.136 + pmc_loop_index++) { \ 127.137 + if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { \ 127.138 + POWERP_PMC_DATATYPE diff = \ 127.139 + pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ 127.140 + if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ 127.141 + perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ 127.142 + if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ 127.143 + perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ 127.144 + perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ 127.145 + perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ 127.146 + } \ 127.147 + } \ 127.148 + } \ 127.149 +} while (0) 127.150 +#else /* CONFIG_POWERPC_PERF */ 127.151 +// those are needed to avoid empty statements. 127.152 +#define POWERPC_PERF_DECLARE(a, cond) int altivec_placeholder __attribute__ ((unused)) 127.153 +#define POWERPC_PERF_START_COUNT(a, cond) do {} while (0) 127.154 +#define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0) 127.155 +#endif /* CONFIG_POWERPC_PERF */ 127.156 + 127.157 +#endif /* AVCODEC_PPC_DSPUTIL_PPC_H */
128.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 128.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/h264_altivec.c Mon Aug 27 12:09:56 2012 +0200 128.3 @@ -0,0 +1,1021 @@ 128.4 +/* 128.5 + * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> 128.6 + * 128.7 + * This file is part of FFmpeg. 128.8 + * 128.9 + * FFmpeg is free software; you can redistribute it and/or 128.10 + * modify it under the terms of the GNU Lesser General Public 128.11 + * License as published by the Free Software Foundation; either 128.12 + * version 2.1 of the License, or (at your option) any later version. 128.13 + * 128.14 + * FFmpeg is distributed in the hope that it will be useful, 128.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 128.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 128.17 + * Lesser General Public License for more details. 128.18 + * 128.19 + * You should have received a copy of the GNU Lesser General Public 128.20 + * License along with FFmpeg; if not, write to the Free Software 128.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 128.22 + */ 128.23 + 128.24 +#include "libavcodec/dsputil.h" 128.25 +#include "libavcodec/h264_data.h" 128.26 +#include "libavcodec/h264_dsp.h" 128.27 + 128.28 +#include "dsputil_ppc.h" 128.29 +#include "dsputil_altivec.h" 128.30 +#include "util_altivec.h" 128.31 +#include "types_altivec.h" 128.32 + 128.33 +#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s 128.34 +#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) 128.35 + 128.36 +#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC 128.37 +#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec 128.38 +#define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec 128.39 +#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num 128.40 +#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec 128.41 +#define PREFIX_h264_qpel16_h_lowpass_num 
altivec_put_h264_qpel16_h_lowpass_num 128.42 +#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec 128.43 +#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num 128.44 +#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec 128.45 +#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num 128.46 +#include "h264_template_altivec.c" 128.47 +#undef OP_U8_ALTIVEC 128.48 +#undef PREFIX_h264_chroma_mc8_altivec 128.49 +#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec 128.50 +#undef PREFIX_h264_chroma_mc8_num 128.51 +#undef PREFIX_h264_qpel16_h_lowpass_altivec 128.52 +#undef PREFIX_h264_qpel16_h_lowpass_num 128.53 +#undef PREFIX_h264_qpel16_v_lowpass_altivec 128.54 +#undef PREFIX_h264_qpel16_v_lowpass_num 128.55 +#undef PREFIX_h264_qpel16_hv_lowpass_altivec 128.56 +#undef PREFIX_h264_qpel16_hv_lowpass_num 128.57 + 128.58 +#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC 128.59 +#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec 128.60 +#define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec 128.61 +#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num 128.62 +#define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec 128.63 +#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num 128.64 +#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec 128.65 +#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num 128.66 +#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec 128.67 +#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num 128.68 +#include "h264_template_altivec.c" 128.69 +#undef OP_U8_ALTIVEC 128.70 +#undef PREFIX_h264_chroma_mc8_altivec 128.71 +#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec 128.72 +#undef PREFIX_h264_chroma_mc8_num 128.73 +#undef PREFIX_h264_qpel16_h_lowpass_altivec 128.74 
+#undef PREFIX_h264_qpel16_h_lowpass_num 128.75 +#undef PREFIX_h264_qpel16_v_lowpass_altivec 128.76 +#undef PREFIX_h264_qpel16_v_lowpass_num 128.77 +#undef PREFIX_h264_qpel16_hv_lowpass_altivec 128.78 +#undef PREFIX_h264_qpel16_hv_lowpass_num 128.79 + 128.80 +#define H264_MC(OPNAME, SIZE, CODETYPE) \ 128.81 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\ 128.82 + OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\ 128.83 +}\ 128.84 +\ 128.85 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \ 128.86 + DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ 128.87 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ 128.88 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ 128.89 +}\ 128.90 +\ 128.91 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.92 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\ 128.93 +}\ 128.94 +\ 128.95 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.96 + DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ 128.97 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ 128.98 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\ 128.99 +}\ 128.100 +\ 128.101 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.102 + DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ 128.103 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ 128.104 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ 128.105 +}\ 128.106 +\ 128.107 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.108 + OPNAME ## h264_qpel 
## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\ 128.109 +}\ 128.110 +\ 128.111 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.112 + DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ 128.113 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ 128.114 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\ 128.115 +}\ 128.116 +\ 128.117 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.118 + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ 128.119 + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ 128.120 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ 128.121 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ 128.122 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ 128.123 +}\ 128.124 +\ 128.125 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.126 + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ 128.127 + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ 128.128 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ 128.129 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ 128.130 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ 128.131 +}\ 128.132 +\ 128.133 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.134 + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ 128.135 + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ 128.136 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ 128.137 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ 128.138 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, 
halfV, stride, SIZE, SIZE);\ 128.139 +}\ 128.140 +\ 128.141 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.142 + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ 128.143 + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ 128.144 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ 128.145 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ 128.146 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ 128.147 +}\ 128.148 +\ 128.149 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.150 + DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ 128.151 + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\ 128.152 +}\ 128.153 +\ 128.154 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.155 + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ 128.156 + DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ 128.157 + DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ 128.158 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ 128.159 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ 128.160 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ 128.161 +}\ 128.162 +\ 128.163 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.164 + DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ 128.165 + DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ 128.166 + DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ 128.167 + put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ 128.168 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ 128.169 + OPNAME ## 
pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ 128.170 +}\ 128.171 +\ 128.172 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.173 + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ 128.174 + DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ 128.175 + DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ 128.176 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ 128.177 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ 128.178 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ 128.179 +}\ 128.180 +\ 128.181 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ 128.182 + DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ 128.183 + DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ 128.184 + DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ 128.185 + put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ 128.186 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ 128.187 + OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ 128.188 +}\ 128.189 + 128.190 +static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, 128.191 + const uint8_t * src2, int dst_stride, 128.192 + int src_stride1, int h) 128.193 +{ 128.194 + int i; 128.195 + vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; 128.196 + 128.197 + mask_ = vec_lvsl(0, src2); 128.198 + 128.199 + for (i = 0; i < h; i++) { 128.200 + 128.201 + tmp1 = vec_ld(i * src_stride1, src1); 128.202 + mask = vec_lvsl(i * src_stride1, src1); 128.203 + tmp2 = vec_ld(i * src_stride1 + 15, src1); 128.204 + 128.205 + a = vec_perm(tmp1, tmp2, mask); 128.206 + 128.207 + tmp1 = vec_ld(i * 16, src2); 128.208 + tmp2 = vec_ld(i * 16 + 15, src2); 128.209 + 128.210 + b = 
vec_perm(tmp1, tmp2, mask_); 128.211 + 128.212 + tmp1 = vec_ld(0, dst); 128.213 + mask = vec_lvsl(0, dst); 128.214 + tmp2 = vec_ld(15, dst); 128.215 + 128.216 + d = vec_avg(a, b); 128.217 + 128.218 + edges = vec_perm(tmp2, tmp1, mask); 128.219 + 128.220 + align = vec_lvsr(0, dst); 128.221 + 128.222 + tmp2 = vec_perm(d, edges, align); 128.223 + tmp1 = vec_perm(edges, d, align); 128.224 + 128.225 + vec_st(tmp2, 15, dst); 128.226 + vec_st(tmp1, 0 , dst); 128.227 + 128.228 + dst += dst_stride; 128.229 + } 128.230 +} 128.231 + 128.232 +static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, 128.233 + const uint8_t * src2, int dst_stride, 128.234 + int src_stride1, int h) 128.235 +{ 128.236 + int i; 128.237 + vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; 128.238 + 128.239 + mask_ = vec_lvsl(0, src2); 128.240 + 128.241 + for (i = 0; i < h; i++) { 128.242 + 128.243 + tmp1 = vec_ld(i * src_stride1, src1); 128.244 + mask = vec_lvsl(i * src_stride1, src1); 128.245 + tmp2 = vec_ld(i * src_stride1 + 15, src1); 128.246 + 128.247 + a = vec_perm(tmp1, tmp2, mask); 128.248 + 128.249 + tmp1 = vec_ld(i * 16, src2); 128.250 + tmp2 = vec_ld(i * 16 + 15, src2); 128.251 + 128.252 + b = vec_perm(tmp1, tmp2, mask_); 128.253 + 128.254 + tmp1 = vec_ld(0, dst); 128.255 + mask = vec_lvsl(0, dst); 128.256 + tmp2 = vec_ld(15, dst); 128.257 + 128.258 + d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b)); 128.259 + 128.260 + edges = vec_perm(tmp2, tmp1, mask); 128.261 + 128.262 + align = vec_lvsr(0, dst); 128.263 + 128.264 + tmp2 = vec_perm(d, edges, align); 128.265 + tmp1 = vec_perm(edges, d, align); 128.266 + 128.267 + vec_st(tmp2, 15, dst); 128.268 + vec_st(tmp1, 0 , dst); 128.269 + 128.270 + dst += dst_stride; 128.271 + } 128.272 +} 128.273 + 128.274 +/* Implemented but could be faster 128.275 +#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h) 128.276 +#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) 
avg_pixels16_l2(d,s1,s2,ds,s1s,16,h) 128.277 + */ 128.278 + 128.279 +H264_MC(put_, 16, altivec) 128.280 +H264_MC(avg_, 16, altivec) 128.281 + 128.282 + 128.283 +/**************************************************************************** 128.284 + * IDCT transform: 128.285 + ****************************************************************************/ 128.286 + 128.287 +#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \ 128.288 + /* 1st stage */ \ 128.289 + vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \ 128.290 + vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \ 128.291 + vz2 = vec_sra(vb1,vec_splat_u16(1)); \ 128.292 + vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \ 128.293 + vz3 = vec_sra(vb3,vec_splat_u16(1)); \ 128.294 + vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \ 128.295 + /* 2nd stage: output */ \ 128.296 + va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \ 128.297 + va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \ 128.298 + va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \ 128.299 + va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */ 128.300 + 128.301 +#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \ 128.302 + b0 = vec_mergeh( a0, a0 ); \ 128.303 + b1 = vec_mergeh( a1, a0 ); \ 128.304 + b2 = vec_mergeh( a2, a0 ); \ 128.305 + b3 = vec_mergeh( a3, a0 ); \ 128.306 + a0 = vec_mergeh( b0, b2 ); \ 128.307 + a1 = vec_mergel( b0, b2 ); \ 128.308 + a2 = vec_mergeh( b1, b3 ); \ 128.309 + a3 = vec_mergel( b1, b3 ); \ 128.310 + b0 = vec_mergeh( a0, a2 ); \ 128.311 + b1 = vec_mergel( a0, a2 ); \ 128.312 + b2 = vec_mergeh( a1, a3 ); \ 128.313 + b3 = vec_mergel( a1, a3 ) 128.314 + 128.315 +#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ 128.316 + vdst_orig = vec_ld(0, dst); \ 128.317 + vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ 128.318 + vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \ 128.319 + va = vec_add(va, vdst_ss); \ 128.320 + va_u8 = vec_packsu(va, zero_s16v); \ 128.321 + va_u32 = 
vec_splat((vec_u32)va_u8, 0); \ 128.322 + vec_ste(va_u32, element, (uint32_t*)dst); 128.323 + 128.324 +static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) 128.325 +{ 128.326 + vec_s16 va0, va1, va2, va3; 128.327 + vec_s16 vz0, vz1, vz2, vz3; 128.328 + vec_s16 vtmp0, vtmp1, vtmp2, vtmp3; 128.329 + vec_u8 va_u8; 128.330 + vec_u32 va_u32; 128.331 + vec_s16 vdst_ss; 128.332 + const vec_u16 v6us = vec_splat_u16(6); 128.333 + vec_u8 vdst, vdst_orig; 128.334 + vec_u8 vdst_mask = vec_lvsl(0, dst); 128.335 + int element = ((unsigned long)dst & 0xf) >> 2; 128.336 + LOAD_ZERO; 128.337 + 128.338 + block[0] += 32; /* add 32 as a DC-level for rounding */ 128.339 + 128.340 + vtmp0 = vec_ld(0,block); 128.341 + vtmp1 = vec_sld(vtmp0, vtmp0, 8); 128.342 + vtmp2 = vec_ld(16,block); 128.343 + vtmp3 = vec_sld(vtmp2, vtmp2, 8); 128.344 + 128.345 + VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); 128.346 + VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); 128.347 + VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); 128.348 + 128.349 + va0 = vec_sra(va0,v6us); 128.350 + va1 = vec_sra(va1,v6us); 128.351 + va2 = vec_sra(va2,v6us); 128.352 + va3 = vec_sra(va3,v6us); 128.353 + 128.354 + VEC_LOAD_U8_ADD_S16_STORE_U8(va0); 128.355 + dst += stride; 128.356 + VEC_LOAD_U8_ADD_S16_STORE_U8(va1); 128.357 + dst += stride; 128.358 + VEC_LOAD_U8_ADD_S16_STORE_U8(va2); 128.359 + dst += stride; 128.360 + VEC_LOAD_U8_ADD_S16_STORE_U8(va3); 128.361 +} 128.362 + 128.363 +#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ 128.364 + /* a0 = SRC(0) + SRC(4); */ \ 128.365 + vec_s16 a0v = vec_add(s0, s4); \ 128.366 + /* a2 = SRC(0) - SRC(4); */ \ 128.367 + vec_s16 a2v = vec_sub(s0, s4); \ 128.368 + /* a4 = (SRC(2)>>1) - SRC(6); */ \ 128.369 + vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \ 128.370 + /* a6 = (SRC(6)>>1) + SRC(2); */ \ 128.371 + vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \ 128.372 + /* b0 = a0 + a6; */ \ 128.373 + 
vec_s16 b0v = vec_add(a0v, a6v); \ 128.374 + /* b2 = a2 + a4; */ \ 128.375 + vec_s16 b2v = vec_add(a2v, a4v); \ 128.376 + /* b4 = a2 - a4; */ \ 128.377 + vec_s16 b4v = vec_sub(a2v, a4v); \ 128.378 + /* b6 = a0 - a6; */ \ 128.379 + vec_s16 b6v = vec_sub(a0v, a6v); \ 128.380 + /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ 128.381 + /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ 128.382 + vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ 128.383 + /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ 128.384 + /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ 128.385 + vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ 128.386 + /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ 128.387 + /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ 128.388 + vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ 128.389 + /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ 128.390 + vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ 128.391 + /* b1 = (a7>>2) + a1; */ \ 128.392 + vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \ 128.393 + /* b3 = a3 + (a5>>2); */ \ 128.394 + vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \ 128.395 + /* b5 = (a3>>2) - a5; */ \ 128.396 + vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \ 128.397 + /* b7 = a7 - (a1>>2); */ \ 128.398 + vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ 128.399 + /* DST(0, b0 + b7); */ \ 128.400 + d0 = vec_add(b0v, b7v); \ 128.401 + /* DST(1, b2 + b5); */ \ 128.402 + d1 = vec_add(b2v, b5v); \ 128.403 + /* DST(2, b4 + b3); */ \ 128.404 + d2 = vec_add(b4v, b3v); \ 128.405 + /* DST(3, b6 + b1); */ \ 128.406 + d3 = vec_add(b6v, b1v); \ 128.407 + /* DST(4, b6 - b1); */ \ 128.408 + d4 = vec_sub(b6v, b1v); \ 128.409 + /* DST(5, b4 - b3); */ \ 128.410 + d5 = vec_sub(b4v, b3v); \ 128.411 + /* DST(6, b2 - b5); */ \ 128.412 + d6 = vec_sub(b2v, b5v); \ 128.413 + /* DST(7, b0 - b7); */ \ 128.414 + d7 = 
vec_sub(b0v, b7v); \ 128.415 +} 128.416 + 128.417 +#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ 128.418 + /* unaligned load */ \ 128.419 + vec_u8 hv = vec_ld( 0, dest ); \ 128.420 + vec_u8 lv = vec_ld( 7, dest ); \ 128.421 + vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \ 128.422 + vec_s16 idct_sh6 = vec_sra(idctv, sixv); \ 128.423 + vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \ 128.424 + vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \ 128.425 + vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \ 128.426 + vec_u8 edgehv; \ 128.427 + /* unaligned store */ \ 128.428 + vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ 128.429 + vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ 128.430 + lv = vec_sel( lv, bodyv, edgelv ); \ 128.431 + vec_st( lv, 7, dest ); \ 128.432 + hv = vec_ld( 0, dest ); \ 128.433 + edgehv = vec_perm( zero_u8v, sel, perm_stv ); \ 128.434 + hv = vec_sel( hv, bodyv, edgehv ); \ 128.435 + vec_st( hv, 0, dest ); \ 128.436 + } 128.437 + 128.438 +static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { 128.439 + vec_s16 s0, s1, s2, s3, s4, s5, s6, s7; 128.440 + vec_s16 d0, d1, d2, d3, d4, d5, d6, d7; 128.441 + vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; 128.442 + 128.443 + vec_u8 perm_ldv = vec_lvsl(0, dst); 128.444 + vec_u8 perm_stv = vec_lvsr(8, dst); 128.445 + 128.446 + const vec_u16 onev = vec_splat_u16(1); 128.447 + const vec_u16 twov = vec_splat_u16(2); 128.448 + const vec_u16 sixv = vec_splat_u16(6); 128.449 + 128.450 + const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1}; 128.451 + LOAD_ZERO; 128.452 + 128.453 + dct[0] += 32; // rounding for the >>6 at the end 128.454 + 128.455 + s0 = vec_ld(0x00, (int16_t*)dct); 128.456 + s1 = vec_ld(0x10, (int16_t*)dct); 128.457 + s2 = vec_ld(0x20, (int16_t*)dct); 128.458 + s3 = vec_ld(0x30, (int16_t*)dct); 128.459 + s4 = vec_ld(0x40, (int16_t*)dct); 128.460 + s5 = vec_ld(0x50, 
(int16_t*)dct); 128.461 + s6 = vec_ld(0x60, (int16_t*)dct); 128.462 + s7 = vec_ld(0x70, (int16_t*)dct); 128.463 + 128.464 + IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, 128.465 + d0, d1, d2, d3, d4, d5, d6, d7); 128.466 + 128.467 + TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 ); 128.468 + 128.469 + IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7, 128.470 + idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7); 128.471 + 128.472 + ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel); 128.473 + ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel); 128.474 + ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel); 128.475 + ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel); 128.476 + ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel); 128.477 + ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel); 128.478 + ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); 128.479 + ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); 128.480 +} 128.481 + 128.482 +static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size) 128.483 +{ 128.484 + vec_s16 dc16; 128.485 + vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner; 128.486 + LOAD_ZERO; 128.487 + DECLARE_ALIGNED(16, int, dc); 128.488 + int i; 128.489 + 128.490 + dc = (block[0] + 32) >> 6; 128.491 + dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); 128.492 + 128.493 + if (size == 4) 128.494 + dc16 = vec_sld(dc16, zero_s16v, 8); 128.495 + dcplus = vec_packsu(dc16, zero_s16v); 128.496 + dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v); 128.497 + 128.498 + aligner = vec_lvsr(0, dst); 128.499 + dcplus = vec_perm(dcplus, dcplus, aligner); 128.500 + dcminus = vec_perm(dcminus, dcminus, aligner); 128.501 + 128.502 + for (i = 0; i < size; i += 4) { 128.503 + v0 = vec_ld(0, dst+0*stride); 128.504 + v1 = vec_ld(0, dst+1*stride); 128.505 + v2 
= vec_ld(0, dst+2*stride); 128.506 + v3 = vec_ld(0, dst+3*stride); 128.507 + 128.508 + v0 = vec_adds(v0, dcplus); 128.509 + v1 = vec_adds(v1, dcplus); 128.510 + v2 = vec_adds(v2, dcplus); 128.511 + v3 = vec_adds(v3, dcplus); 128.512 + 128.513 + v0 = vec_subs(v0, dcminus); 128.514 + v1 = vec_subs(v1, dcminus); 128.515 + v2 = vec_subs(v2, dcminus); 128.516 + v3 = vec_subs(v3, dcminus); 128.517 + 128.518 + vec_st(v0, 0, dst+0*stride); 128.519 + vec_st(v1, 0, dst+1*stride); 128.520 + vec_st(v2, 0, dst+2*stride); 128.521 + vec_st(v3, 0, dst+3*stride); 128.522 + 128.523 + dst += 4*stride; 128.524 + } 128.525 +} 128.526 + 128.527 +static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) 128.528 +{ 128.529 + h264_idct_dc_add_internal(dst, block, stride, 4); 128.530 +} 128.531 + 128.532 +static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) 128.533 +{ 128.534 + h264_idct_dc_add_internal(dst, block, stride, 8); 128.535 +} 128.536 + 128.537 +static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 128.538 + int i; 128.539 + for(i=0; i<16; i++){ 128.540 + int nnz = nnzc[ scan8[i] ]; 128.541 + if(nnz){ 128.542 + if(nnz==1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); 128.543 + else ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride); 128.544 + } 128.545 + } 128.546 +} 128.547 + 128.548 +static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 128.549 + int i; 128.550 + for(i=0; i<16; i++){ 128.551 + if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride); 128.552 + else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); 128.553 + } 128.554 +} 128.555 + 128.556 +static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM 
*block, int stride, const uint8_t nnzc[6*8]){ 128.557 + int i; 128.558 + for(i=0; i<16; i+=4){ 128.559 + int nnz = nnzc[ scan8[i] ]; 128.560 + if(nnz){ 128.561 + if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride); 128.562 + else ff_h264_idct8_add_altivec (dst + block_offset[i], block + i*16, stride); 128.563 + } 128.564 + } 128.565 +} 128.566 + 128.567 +static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 128.568 + int i; 128.569 + for(i=16; i<16+8; i++){ 128.570 + if(nnzc[ scan8[i] ]) 128.571 + ff_h264_idct_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 128.572 + else if(block[i*16]) 128.573 + h264_idct_dc_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 128.574 + } 128.575 +} 128.576 + 128.577 +#define transpose4x16(r0, r1, r2, r3) { \ 128.578 + register vec_u8 r4; \ 128.579 + register vec_u8 r5; \ 128.580 + register vec_u8 r6; \ 128.581 + register vec_u8 r7; \ 128.582 + \ 128.583 + r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ 128.584 + r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ 128.585 + r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ 128.586 + r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ 128.587 + \ 128.588 + r0 = vec_mergeh(r4, r6); /*all set 0*/ \ 128.589 + r1 = vec_mergel(r4, r6); /*all set 1*/ \ 128.590 + r2 = vec_mergeh(r5, r7); /*all set 2*/ \ 128.591 + r3 = vec_mergel(r5, r7); /*all set 3*/ \ 128.592 +} 128.593 + 128.594 +static inline void write16x4(uint8_t *dst, int dst_stride, 128.595 + register vec_u8 r0, register vec_u8 r1, 128.596 + register vec_u8 r2, register vec_u8 r3) { 128.597 + DECLARE_ALIGNED(16, unsigned char, result)[64]; 128.598 + uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; 128.599 + int int_dst_stride = dst_stride/4; 128.600 + 128.601 + vec_st(r0, 0, result); 128.602 + vec_st(r1, 16, result); 128.603 + vec_st(r2, 32, result); 128.604 + vec_st(r3, 48, result); 
128.605 + /* FIXME: there has to be a better way!!!! */ 128.606 + *dst_int = *src_int; 128.607 + *(dst_int+ int_dst_stride) = *(src_int + 1); 128.608 + *(dst_int+ 2*int_dst_stride) = *(src_int + 2); 128.609 + *(dst_int+ 3*int_dst_stride) = *(src_int + 3); 128.610 + *(dst_int+ 4*int_dst_stride) = *(src_int + 4); 128.611 + *(dst_int+ 5*int_dst_stride) = *(src_int + 5); 128.612 + *(dst_int+ 6*int_dst_stride) = *(src_int + 6); 128.613 + *(dst_int+ 7*int_dst_stride) = *(src_int + 7); 128.614 + *(dst_int+ 8*int_dst_stride) = *(src_int + 8); 128.615 + *(dst_int+ 9*int_dst_stride) = *(src_int + 9); 128.616 + *(dst_int+10*int_dst_stride) = *(src_int + 10); 128.617 + *(dst_int+11*int_dst_stride) = *(src_int + 11); 128.618 + *(dst_int+12*int_dst_stride) = *(src_int + 12); 128.619 + *(dst_int+13*int_dst_stride) = *(src_int + 13); 128.620 + *(dst_int+14*int_dst_stride) = *(src_int + 14); 128.621 + *(dst_int+15*int_dst_stride) = *(src_int + 15); 128.622 +} 128.623 + 128.624 +/** \brief performs a 6x16 transpose of data in src, and stores it to dst 128.625 + \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing 128.626 + out of unaligned_load() */ 128.627 +#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ 128.628 + register vec_u8 r0 = unaligned_load(0, src); \ 128.629 + register vec_u8 r1 = unaligned_load( src_stride, src); \ 128.630 + register vec_u8 r2 = unaligned_load(2* src_stride, src); \ 128.631 + register vec_u8 r3 = unaligned_load(3* src_stride, src); \ 128.632 + register vec_u8 r4 = unaligned_load(4* src_stride, src); \ 128.633 + register vec_u8 r5 = unaligned_load(5* src_stride, src); \ 128.634 + register vec_u8 r6 = unaligned_load(6* src_stride, src); \ 128.635 + register vec_u8 r7 = unaligned_load(7* src_stride, src); \ 128.636 + register vec_u8 r14 = unaligned_load(14*src_stride, src); \ 128.637 + register vec_u8 r15 = unaligned_load(15*src_stride, src); \ 128.638 + \ 128.639 + r8 = unaligned_load( 8*src_stride, src); \ 
128.640 + r9 = unaligned_load( 9*src_stride, src); \ 128.641 + r10 = unaligned_load(10*src_stride, src); \ 128.642 + r11 = unaligned_load(11*src_stride, src); \ 128.643 + r12 = unaligned_load(12*src_stride, src); \ 128.644 + r13 = unaligned_load(13*src_stride, src); \ 128.645 + \ 128.646 + /*Merge first pairs*/ \ 128.647 + r0 = vec_mergeh(r0, r8); /*0, 8*/ \ 128.648 + r1 = vec_mergeh(r1, r9); /*1, 9*/ \ 128.649 + r2 = vec_mergeh(r2, r10); /*2,10*/ \ 128.650 + r3 = vec_mergeh(r3, r11); /*3,11*/ \ 128.651 + r4 = vec_mergeh(r4, r12); /*4,12*/ \ 128.652 + r5 = vec_mergeh(r5, r13); /*5,13*/ \ 128.653 + r6 = vec_mergeh(r6, r14); /*6,14*/ \ 128.654 + r7 = vec_mergeh(r7, r15); /*7,15*/ \ 128.655 + \ 128.656 + /*Merge second pairs*/ \ 128.657 + r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \ 128.658 + r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \ 128.659 + r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \ 128.660 + r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \ 128.661 + r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \ 128.662 + r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \ 128.663 + r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \ 128.664 + r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \ 128.665 + \ 128.666 + /*Third merge*/ \ 128.667 + r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ 128.668 + r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ 128.669 + r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ 128.670 + r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \ 128.671 + r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \ 128.672 + r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \ 128.673 + /* Don't need to compute 3 and 7*/ \ 128.674 + \ 128.675 + /*Final merge*/ \ 128.676 + r8 = vec_mergeh(r0, r4); /*all set 0*/ \ 128.677 + r9 = vec_mergel(r0, r4); /*all set 1*/ \ 128.678 + r10 = vec_mergeh(r1, r5); /*all set 2*/ \ 128.679 + r11 = vec_mergel(r1, r5); /*all set 3*/ \ 128.680 + r12 = vec_mergeh(r2, r6); /*all set 4*/ \ 128.681 + 
r13 = vec_mergel(r2, r6); /*all set 5*/ \ 128.682 + /* Don't need to compute 14 and 15*/ \ 128.683 + \ 128.684 +} 128.685 + 128.686 +// out: o = |x-y| < a 128.687 +static inline vec_u8 diff_lt_altivec ( register vec_u8 x, 128.688 + register vec_u8 y, 128.689 + register vec_u8 a) { 128.690 + 128.691 + register vec_u8 diff = vec_subs(x, y); 128.692 + register vec_u8 diffneg = vec_subs(y, x); 128.693 + register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */ 128.694 + o = (vec_u8)vec_cmplt(o, a); 128.695 + return o; 128.696 +} 128.697 + 128.698 +static inline vec_u8 h264_deblock_mask ( register vec_u8 p0, 128.699 + register vec_u8 p1, 128.700 + register vec_u8 q0, 128.701 + register vec_u8 q1, 128.702 + register vec_u8 alpha, 128.703 + register vec_u8 beta) { 128.704 + 128.705 + register vec_u8 mask; 128.706 + register vec_u8 tempmask; 128.707 + 128.708 + mask = diff_lt_altivec(p0, q0, alpha); 128.709 + tempmask = diff_lt_altivec(p1, p0, beta); 128.710 + mask = vec_and(mask, tempmask); 128.711 + tempmask = diff_lt_altivec(q1, q0, beta); 128.712 + mask = vec_and(mask, tempmask); 128.713 + 128.714 + return mask; 128.715 +} 128.716 + 128.717 +// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) 128.718 +static inline vec_u8 h264_deblock_q1(register vec_u8 p0, 128.719 + register vec_u8 p1, 128.720 + register vec_u8 p2, 128.721 + register vec_u8 q0, 128.722 + register vec_u8 tc0) { 128.723 + 128.724 + register vec_u8 average = vec_avg(p0, q0); 128.725 + register vec_u8 temp; 128.726 + register vec_u8 uncliped; 128.727 + register vec_u8 ones; 128.728 + register vec_u8 max; 128.729 + register vec_u8 min; 128.730 + register vec_u8 newp1; 128.731 + 128.732 + temp = vec_xor(average, p2); 128.733 + average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ 128.734 + ones = vec_splat_u8(1); 128.735 + temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ 128.736 + uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ 128.737 + max = vec_adds(p1, tc0); 
128.738 + min = vec_subs(p1, tc0); 128.739 + newp1 = vec_max(min, uncliped); 128.740 + newp1 = vec_min(max, newp1); 128.741 + return newp1; 128.742 +} 128.743 + 128.744 +#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ 128.745 + \ 128.746 + const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ 128.747 + \ 128.748 + register vec_u8 pq0bit = vec_xor(p0,q0); \ 128.749 + register vec_u8 q1minus; \ 128.750 + register vec_u8 p0minus; \ 128.751 + register vec_u8 stage1; \ 128.752 + register vec_u8 stage2; \ 128.753 + register vec_u8 vec160; \ 128.754 + register vec_u8 delta; \ 128.755 + register vec_u8 deltaneg; \ 128.756 + \ 128.757 + q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ 128.758 + stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ 128.759 + stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ 128.760 + p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ 128.761 + stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \ 128.762 + pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \ 128.763 + stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \ 128.764 + stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \ 128.765 + vec160 = vec_ld(0, &A0v); \ 128.766 + deltaneg = vec_subs(vec160, stage2); /* -d */ \ 128.767 + delta = vec_subs(stage2, vec160); /* d */ \ 128.768 + deltaneg = vec_min(tc0masked, deltaneg); \ 128.769 + delta = vec_min(tc0masked, delta); \ 128.770 + p0 = vec_subs(p0, deltaneg); \ 128.771 + q0 = vec_subs(q0, delta); \ 128.772 + p0 = vec_adds(p0, delta); \ 128.773 + q0 = vec_adds(q0, deltaneg); \ 128.774 +} 128.775 + 128.776 +#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ 128.777 + DECLARE_ALIGNED(16, unsigned char, temp)[16]; \ 128.778 + register vec_u8 alphavec; \ 128.779 + register vec_u8 betavec; \ 128.780 + register vec_u8 mask; \ 128.781 + register vec_u8 p1mask; \ 128.782 + register vec_u8 q1mask; 
\ 128.783 + register vector signed char tc0vec; \ 128.784 + register vec_u8 finaltc0; \ 128.785 + register vec_u8 tc0masked; \ 128.786 + register vec_u8 newp1; \ 128.787 + register vec_u8 newq1; \ 128.788 + \ 128.789 + temp[0] = alpha; \ 128.790 + temp[1] = beta; \ 128.791 + alphavec = vec_ld(0, temp); \ 128.792 + betavec = vec_splat(alphavec, 0x1); \ 128.793 + alphavec = vec_splat(alphavec, 0x0); \ 128.794 + mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \ 128.795 + \ 128.796 + *((int *)temp) = *((int *)tc0); \ 128.797 + tc0vec = vec_ld(0, (signed char*)temp); \ 128.798 + tc0vec = vec_mergeh(tc0vec, tc0vec); \ 128.799 + tc0vec = vec_mergeh(tc0vec, tc0vec); \ 128.800 + mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ 128.801 + finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \ 128.802 + \ 128.803 + p1mask = diff_lt_altivec(p2, p0, betavec); \ 128.804 + p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ 128.805 + tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \ 128.806 + finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ 128.807 + newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ 128.808 + /*end if*/ \ 128.809 + \ 128.810 + q1mask = diff_lt_altivec(q2, q0, betavec); \ 128.811 + q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ 128.812 + tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \ 128.813 + finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ 128.814 + newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ 128.815 + /*end if*/ \ 128.816 + \ 128.817 + h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ 128.818 + p1 = newp1; \ 128.819 + q1 = newq1; \ 128.820 +} 128.821 + 128.822 +static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { 128.823 + 128.824 + if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { 128.825 + register vec_u8 p2 = vec_ld(-3*stride, pix); 128.826 + register vec_u8 p1 = vec_ld(-2*stride, pix); 128.827 + 
register vec_u8 p0 = vec_ld(-1*stride, pix); 128.828 + register vec_u8 q0 = vec_ld(0, pix); 128.829 + register vec_u8 q1 = vec_ld(stride, pix); 128.830 + register vec_u8 q2 = vec_ld(2*stride, pix); 128.831 + h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); 128.832 + vec_st(p1, -2*stride, pix); 128.833 + vec_st(p0, -1*stride, pix); 128.834 + vec_st(q0, 0, pix); 128.835 + vec_st(q1, stride, pix); 128.836 + } 128.837 +} 128.838 + 128.839 +static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { 128.840 + 128.841 + register vec_u8 line0, line1, line2, line3, line4, line5; 128.842 + if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) 128.843 + return; 128.844 + readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); 128.845 + h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); 128.846 + transpose4x16(line1, line2, line3, line4); 128.847 + write16x4(pix-2, stride, line1, line2, line3, line4); 128.848 +} 128.849 + 128.850 +static av_always_inline 128.851 +void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) 128.852 +{ 128.853 + int y, aligned; 128.854 + vec_u8 vblock; 128.855 + vec_s16 vtemp, vweight, voffset, v0, v1; 128.856 + vec_u16 vlog2_denom; 128.857 + DECLARE_ALIGNED(16, int32_t, temp)[4]; 128.858 + LOAD_ZERO; 128.859 + 128.860 + offset <<= log2_denom; 128.861 + if(log2_denom) offset += 1<<(log2_denom-1); 128.862 + temp[0] = log2_denom; 128.863 + temp[1] = weight; 128.864 + temp[2] = offset; 128.865 + 128.866 + vtemp = (vec_s16)vec_ld(0, temp); 128.867 + vlog2_denom = (vec_u16)vec_splat(vtemp, 1); 128.868 + vweight = vec_splat(vtemp, 3); 128.869 + voffset = vec_splat(vtemp, 5); 128.870 + aligned = !((unsigned long)block & 0xf); 128.871 + 128.872 + for (y=0; y<h; y++) { 128.873 + vblock = vec_ld(0, block); 128.874 + 128.875 + v0 = (vec_s16)vec_mergeh(zero_u8v, vblock); 128.876 + v1 = 
(vec_s16)vec_mergel(zero_u8v, vblock); 128.877 + 128.878 + if (w == 16 || aligned) { 128.879 + v0 = vec_mladd(v0, vweight, zero_s16v); 128.880 + v0 = vec_adds(v0, voffset); 128.881 + v0 = vec_sra(v0, vlog2_denom); 128.882 + } 128.883 + if (w == 16 || !aligned) { 128.884 + v1 = vec_mladd(v1, vweight, zero_s16v); 128.885 + v1 = vec_adds(v1, voffset); 128.886 + v1 = vec_sra(v1, vlog2_denom); 128.887 + } 128.888 + vblock = vec_packsu(v0, v1); 128.889 + vec_st(vblock, 0, block); 128.890 + 128.891 + block += stride; 128.892 + } 128.893 +} 128.894 + 128.895 +static av_always_inline 128.896 +void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, 128.897 + int weightd, int weights, int offset, int w, int h) 128.898 +{ 128.899 + int y, dst_aligned, src_aligned; 128.900 + vec_u8 vsrc, vdst; 128.901 + vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3; 128.902 + vec_u16 vlog2_denom; 128.903 + DECLARE_ALIGNED(16, int32_t, temp)[4]; 128.904 + LOAD_ZERO; 128.905 + 128.906 + offset = ((offset + 1) | 1) << log2_denom; 128.907 + temp[0] = log2_denom+1; 128.908 + temp[1] = weights; 128.909 + temp[2] = weightd; 128.910 + temp[3] = offset; 128.911 + 128.912 + vtemp = (vec_s16)vec_ld(0, temp); 128.913 + vlog2_denom = (vec_u16)vec_splat(vtemp, 1); 128.914 + vweights = vec_splat(vtemp, 3); 128.915 + vweightd = vec_splat(vtemp, 5); 128.916 + voffset = vec_splat(vtemp, 7); 128.917 + dst_aligned = !((unsigned long)dst & 0xf); 128.918 + src_aligned = !((unsigned long)src & 0xf); 128.919 + 128.920 + for (y=0; y<h; y++) { 128.921 + vdst = vec_ld(0, dst); 128.922 + vsrc = vec_ld(0, src); 128.923 + 128.924 + v0 = (vec_s16)vec_mergeh(zero_u8v, vdst); 128.925 + v1 = (vec_s16)vec_mergel(zero_u8v, vdst); 128.926 + v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc); 128.927 + v3 = (vec_s16)vec_mergel(zero_u8v, vsrc); 128.928 + 128.929 + if (w == 8) { 128.930 + if (src_aligned) 128.931 + v3 = v2; 128.932 + else 128.933 + v2 = v3; 128.934 + } 128.935 + 128.936 + if (w == 16 
|| dst_aligned) { 128.937 + v0 = vec_mladd(v0, vweightd, zero_s16v); 128.938 + v2 = vec_mladd(v2, vweights, zero_s16v); 128.939 + 128.940 + v0 = vec_adds(v0, voffset); 128.941 + v0 = vec_adds(v0, v2); 128.942 + v0 = vec_sra(v0, vlog2_denom); 128.943 + } 128.944 + if (w == 16 || !dst_aligned) { 128.945 + v1 = vec_mladd(v1, vweightd, zero_s16v); 128.946 + v3 = vec_mladd(v3, vweights, zero_s16v); 128.947 + 128.948 + v1 = vec_adds(v1, voffset); 128.949 + v1 = vec_adds(v1, v3); 128.950 + v1 = vec_sra(v1, vlog2_denom); 128.951 + } 128.952 + vdst = vec_packsu(v0, v1); 128.953 + vec_st(vdst, 0, dst); 128.954 + 128.955 + dst += stride; 128.956 + src += stride; 128.957 + } 128.958 +} 128.959 + 128.960 +#define H264_WEIGHT(W,H) \ 128.961 +static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ 128.962 + weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \ 128.963 +}\ 128.964 +static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ 128.965 + biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ 128.966 +} 128.967 + 128.968 +H264_WEIGHT(16,16) 128.969 +H264_WEIGHT(16, 8) 128.970 +H264_WEIGHT( 8,16) 128.971 +H264_WEIGHT( 8, 8) 128.972 +H264_WEIGHT( 8, 4) 128.973 + 128.974 +void dsputil_h264_init_ppc(DSPContext* c) { 128.975 + c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; 128.976 + c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; 128.977 + 128.978 +#define dspfunc(PFX, IDX, NUM) \ 128.979 + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ 128.980 + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ 128.981 + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \ 128.982 + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \ 128.983 + c->PFX ## _pixels_tab[IDX][ 4] 
= PFX ## NUM ## _mc01_altivec; \ 128.984 + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \ 128.985 + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \ 128.986 + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \ 128.987 + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \ 128.988 + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \ 128.989 + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \ 128.990 + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \ 128.991 + c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \ 128.992 + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \ 128.993 + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \ 128.994 + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec 128.995 + 128.996 + dspfunc(put_h264_qpel, 0, 16); 128.997 + dspfunc(avg_h264_qpel, 0, 16); 128.998 +#undef dspfunc 128.999 +} 128.1000 + 128.1001 +void ff_h264dsp_init_ppc(H264DSPContext *c){ 128.1002 + c->h264_idct_dc_add= h264_idct_dc_add_altivec; 128.1003 + c->h264_idct_add = ff_h264_idct_add_altivec; 128.1004 + c->h264_idct_add8 = ff_h264_idct_add8_altivec; 128.1005 + c->h264_idct_add16 = ff_h264_idct_add16_altivec; 128.1006 + c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec; 128.1007 + 128.1008 + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec; 128.1009 + c->h264_idct8_add = ff_h264_idct8_add_altivec; 128.1010 + c->h264_idct8_add4 = ff_h264_idct8_add4_altivec; 128.1011 + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; 128.1012 + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; 128.1013 + 128.1014 + c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; 128.1015 + c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; 128.1016 + c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; 128.1017 + c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; 
128.1018 + c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec; 128.1019 + c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec; 128.1020 + c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec; 128.1021 + c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec; 128.1022 + c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec; 128.1023 + c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec; 128.1024 +}
129.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 129.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/h264_template_altivec.c Mon Aug 27 12:09:56 2012 +0200 129.3 @@ -0,0 +1,783 @@ 129.4 +/* 129.5 + * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> 129.6 + * 129.7 + * This file is part of FFmpeg. 129.8 + * 129.9 + * FFmpeg is free software; you can redistribute it and/or 129.10 + * modify it under the terms of the GNU Lesser General Public 129.11 + * License as published by the Free Software Foundation; either 129.12 + * version 2.1 of the License, or (at your option) any later version. 129.13 + * 129.14 + * FFmpeg is distributed in the hope that it will be useful, 129.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 129.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 129.17 + * Lesser General Public License for more details. 129.18 + * 129.19 + * You should have received a copy of the GNU Lesser General Public 129.20 + * License along with FFmpeg; if not, write to the Free Software 129.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 129.22 + */ 129.23 + 129.24 +//#define DEBUG_ALIGNMENT 129.25 +#ifdef DEBUG_ALIGNMENT 129.26 +#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr)&0x0000000F)); /* fails unless ptr is 16-byte aligned */ 129.27 +#else 129.28 +#define ASSERT_ALIGNED(ptr) ; 129.29 +#endif 129.30 + 129.31 +/* this code assumes that stride % 16 == 0 */ 129.32 + 129.33 +#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \ 129.34 + vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\ 129.35 + vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\ 129.36 +\ 129.37 + psum = vec_mladd(vA, vsrc0ssH, BIAS1);\ 129.38 + psum = vec_mladd(vB, vsrc1ssH, psum);\ 129.39 + psum = vec_mladd(vC, vsrc2ssH, psum);\ 129.40 + psum = vec_mladd(vD, vsrc3ssH, psum);\ 129.41 + psum = BIAS2(psum);\ 129.42 + psum = vec_sr(psum, v6us);\ 129.43 +\ 129.44 + vdst = vec_ld(0, dst);\ 129.45 + ppsum = (vec_u8)vec_pack(psum, psum);\ 129.46 + 
vfdst = vec_perm(vdst, ppsum, fperm);\ 129.47 +\ 129.48 + OP_U8_ALTIVEC(fsum, vfdst, vdst);\ 129.49 +\ 129.50 + vec_st(fsum, 0, dst);\ 129.51 +\ 129.52 + vsrc0ssH = vsrc2ssH;\ 129.53 + vsrc1ssH = vsrc3ssH;\ 129.54 +\ 129.55 + dst += stride;\ 129.56 + src += stride; 129.57 + 129.58 +#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ 129.59 +\ 129.60 + vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\ 129.61 + vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\ 129.62 +\ 129.63 + psum = vec_mladd(vA, vsrc0ssH, v32ss);\ 129.64 + psum = vec_mladd(vE, vsrc1ssH, psum);\ 129.65 + psum = vec_sr(psum, v6us);\ 129.66 +\ 129.67 + vdst = vec_ld(0, dst);\ 129.68 + ppsum = (vec_u8)vec_pack(psum, psum);\ 129.69 + vfdst = vec_perm(vdst, ppsum, fperm);\ 129.70 +\ 129.71 + OP_U8_ALTIVEC(fsum, vfdst, vdst);\ 129.72 +\ 129.73 + vec_st(fsum, 0, dst);\ 129.74 +\ 129.75 + dst += stride;\ 129.76 + src += stride; 129.77 + 129.78 +#define noop(a) a 129.79 +#define add28(a) vec_add(v28ss, a) 129.80 + 129.81 +static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, 129.82 + int stride, int h, int x, int y) { 129.83 + POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); 129.84 + DECLARE_ALIGNED(16, signed int, ABCD)[4] = 129.85 + {((8 - x) * (8 - y)), 129.86 + (( x) * (8 - y)), 129.87 + ((8 - x) * ( y)), 129.88 + (( x) * ( y))}; 129.89 + register int i; 129.90 + vec_u8 fperm; 129.91 + const vec_s32 vABCD = vec_ld(0, ABCD); 129.92 + const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); 129.93 + const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); 129.94 + const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); 129.95 + const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); 129.96 + LOAD_ZERO; 129.97 + const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); 129.98 + const vec_u16 v6us = vec_splat_u16(6); 129.99 + register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 129.100 + register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 
1 : 0; 129.101 + 129.102 + vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; 129.103 + vec_u8 vsrc0uc, vsrc1uc; 129.104 + vec_s16 vsrc0ssH, vsrc1ssH; 129.105 + vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; 129.106 + vec_s16 vsrc2ssH, vsrc3ssH, psum; 129.107 + vec_u8 vdst, ppsum, vfdst, fsum; 129.108 + 129.109 + POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); 129.110 + 129.111 + if (((unsigned long)dst) % 16 == 0) { 129.112 + fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, 129.113 + 0x14, 0x15, 0x16, 0x17, 129.114 + 0x08, 0x09, 0x0A, 0x0B, 129.115 + 0x0C, 0x0D, 0x0E, 0x0F}; 129.116 + } else { 129.117 + fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, 129.118 + 0x04, 0x05, 0x06, 0x07, 129.119 + 0x18, 0x19, 0x1A, 0x1B, 129.120 + 0x1C, 0x1D, 0x1E, 0x1F}; 129.121 + } 129.122 + 129.123 + vsrcAuc = vec_ld(0, src); 129.124 + 129.125 + if (loadSecond) 129.126 + vsrcBuc = vec_ld(16, src); 129.127 + vsrcperm0 = vec_lvsl(0, src); 129.128 + vsrcperm1 = vec_lvsl(1, src); 129.129 + 129.130 + vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); 129.131 + if (reallyBadAlign) 129.132 + vsrc1uc = vsrcBuc; 129.133 + else 129.134 + vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); 129.135 + 129.136 + vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc); 129.137 + vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc); 129.138 + 129.139 + if (ABCD[3]) { 129.140 + if (!loadSecond) {// -> !reallyBadAlign 129.141 + for (i = 0 ; i < h ; i++) { 129.142 + vsrcCuc = vec_ld(stride + 0, src); 129.143 + vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 129.144 + vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 129.145 + 129.146 + CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) 129.147 + } 129.148 + } else { 129.149 + vec_u8 vsrcDuc; 129.150 + for (i = 0 ; i < h ; i++) { 129.151 + vsrcCuc = vec_ld(stride + 0, src); 129.152 + vsrcDuc = vec_ld(stride + 16, src); 129.153 + vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 129.154 + if (reallyBadAlign) 129.155 + vsrc3uc = vsrcDuc; 129.156 + else 129.157 + vsrc3uc = 
vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 129.158 + 129.159 + CHROMA_MC8_ALTIVEC_CORE(v32ss, noop) 129.160 + } 129.161 + } 129.162 + } else { 129.163 + const vec_s16 vE = vec_add(vB, vC); 129.164 + if (ABCD[2]) { // x == 0 B == 0 129.165 + if (!loadSecond) {// -> !reallyBadAlign 129.166 + for (i = 0 ; i < h ; i++) { 129.167 + vsrcCuc = vec_ld(stride + 0, src); 129.168 + vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 129.169 + CHROMA_MC8_ALTIVEC_CORE_SIMPLE 129.170 + 129.171 + vsrc0uc = vsrc1uc; 129.172 + } 129.173 + } else { 129.174 + vec_u8 vsrcDuc; 129.175 + for (i = 0 ; i < h ; i++) { 129.176 + vsrcCuc = vec_ld(stride + 0, src); 129.177 + vsrcDuc = vec_ld(stride + 15, src); 129.178 + vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 129.179 + CHROMA_MC8_ALTIVEC_CORE_SIMPLE 129.180 + 129.181 + vsrc0uc = vsrc1uc; 129.182 + } 129.183 + } 129.184 + } else { // y == 0 C == 0 129.185 + if (!loadSecond) {// -> !reallyBadAlign 129.186 + for (i = 0 ; i < h ; i++) { 129.187 + vsrcCuc = vec_ld(0, src); 129.188 + vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 129.189 + vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 129.190 + 129.191 + CHROMA_MC8_ALTIVEC_CORE_SIMPLE 129.192 + } 129.193 + } else { 129.194 + vec_u8 vsrcDuc; 129.195 + for (i = 0 ; i < h ; i++) { 129.196 + vsrcCuc = vec_ld(0, src); 129.197 + vsrcDuc = vec_ld(15, src); 129.198 + vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 129.199 + if (reallyBadAlign) 129.200 + vsrc1uc = vsrcDuc; 129.201 + else 129.202 + vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 129.203 + 129.204 + CHROMA_MC8_ALTIVEC_CORE_SIMPLE 129.205 + } 129.206 + } 129.207 + } 129.208 + } 129.209 + POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); 129.210 +} 129.211 + 129.212 +/* this code assume that stride % 16 == 0 */ 129.213 +static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { 129.214 + DECLARE_ALIGNED(16, signed int, ABCD)[4] = 129.215 + {((8 - x) * (8 - y)), 129.216 + (( 
x) * (8 - y)), 129.217 + ((8 - x) * ( y)), 129.218 + (( x) * ( y))}; 129.219 + register int i; 129.220 + vec_u8 fperm; 129.221 + const vec_s32 vABCD = vec_ld(0, ABCD); 129.222 + const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); 129.223 + const vec_s16 vB = vec_splat((vec_s16)vABCD, 3); 129.224 + const vec_s16 vC = vec_splat((vec_s16)vABCD, 5); 129.225 + const vec_s16 vD = vec_splat((vec_s16)vABCD, 7); 129.226 + LOAD_ZERO; 129.227 + const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); 129.228 + const vec_u16 v6us = vec_splat_u16(6); 129.229 + register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; 129.230 + register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; 129.231 + 129.232 + vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1; 129.233 + vec_u8 vsrc0uc, vsrc1uc; 129.234 + vec_s16 vsrc0ssH, vsrc1ssH; 129.235 + vec_u8 vsrcCuc, vsrc2uc, vsrc3uc; 129.236 + vec_s16 vsrc2ssH, vsrc3ssH, psum; 129.237 + vec_u8 vdst, ppsum, vfdst, fsum; 129.238 + 129.239 + if (((unsigned long)dst) % 16 == 0) { 129.240 + fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, 129.241 + 0x14, 0x15, 0x16, 0x17, 129.242 + 0x08, 0x09, 0x0A, 0x0B, 129.243 + 0x0C, 0x0D, 0x0E, 0x0F}; 129.244 + } else { 129.245 + fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, 129.246 + 0x04, 0x05, 0x06, 0x07, 129.247 + 0x18, 0x19, 0x1A, 0x1B, 129.248 + 0x1C, 0x1D, 0x1E, 0x1F}; 129.249 + } 129.250 + 129.251 + vsrcAuc = vec_ld(0, src); 129.252 + 129.253 + if (loadSecond) 129.254 + vsrcBuc = vec_ld(16, src); 129.255 + vsrcperm0 = vec_lvsl(0, src); 129.256 + vsrcperm1 = vec_lvsl(1, src); 129.257 + 129.258 + vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); 129.259 + if (reallyBadAlign) 129.260 + vsrc1uc = vsrcBuc; 129.261 + else 129.262 + vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); 129.263 + 129.264 + vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc); 129.265 + vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc); 129.266 + 129.267 + if 
(!loadSecond) {// -> !reallyBadAlign 129.268 + for (i = 0 ; i < h ; i++) { 129.269 + 129.270 + 129.271 + vsrcCuc = vec_ld(stride + 0, src); 129.272 + 129.273 + vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); 129.274 + vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); 129.275 + 129.276 + CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) 129.277 + } 129.278 + } else { 129.279 + vec_u8 vsrcDuc; 129.280 + for (i = 0 ; i < h ; i++) { 129.281 + vsrcCuc = vec_ld(stride + 0, src); 129.282 + vsrcDuc = vec_ld(stride + 16, src); 129.283 + 129.284 + vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); 129.285 + if (reallyBadAlign) 129.286 + vsrc3uc = vsrcDuc; 129.287 + else 129.288 + vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); 129.289 + 129.290 + CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) 129.291 + } 129.292 + } 129.293 +} 129.294 + 129.295 +#undef noop 129.296 +#undef add28 129.297 +#undef CHROMA_MC8_ALTIVEC_CORE 129.298 + 129.299 +/* this code assume stride % 16 == 0 */ 129.300 +static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 129.301 + POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); 129.302 + register int i; 129.303 + 129.304 + LOAD_ZERO; 129.305 + const vec_u8 permM2 = vec_lvsl(-2, src); 129.306 + const vec_u8 permM1 = vec_lvsl(-1, src); 129.307 + const vec_u8 permP0 = vec_lvsl(+0, src); 129.308 + const vec_u8 permP1 = vec_lvsl(+1, src); 129.309 + const vec_u8 permP2 = vec_lvsl(+2, src); 129.310 + const vec_u8 permP3 = vec_lvsl(+3, src); 129.311 + const vec_s16 v5ss = vec_splat_s16(5); 129.312 + const vec_u16 v5us = vec_splat_u16(5); 129.313 + const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 129.314 + const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 129.315 + 129.316 + vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 129.317 + 129.318 + register int align = ((((unsigned long)src) - 2) % 16); 129.319 + 129.320 + vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, 129.321 + 
srcP2A, srcP2B, srcP3A, srcP3B, 129.322 + srcM1A, srcM1B, srcM2A, srcM2B, 129.323 + sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 129.324 + pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 129.325 + psumA, psumB, sumA, sumB; 129.326 + 129.327 + vec_u8 sum, vdst, fsum; 129.328 + 129.329 + POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 129.330 + 129.331 + for (i = 0 ; i < 16 ; i ++) { 129.332 + vec_u8 srcR1 = vec_ld(-2, src); 129.333 + vec_u8 srcR2 = vec_ld(14, src); 129.334 + 129.335 + switch (align) { 129.336 + default: { 129.337 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.338 + srcM1 = vec_perm(srcR1, srcR2, permM1); 129.339 + srcP0 = vec_perm(srcR1, srcR2, permP0); 129.340 + srcP1 = vec_perm(srcR1, srcR2, permP1); 129.341 + srcP2 = vec_perm(srcR1, srcR2, permP2); 129.342 + srcP3 = vec_perm(srcR1, srcR2, permP3); 129.343 + } break; 129.344 + case 11: { 129.345 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.346 + srcM1 = vec_perm(srcR1, srcR2, permM1); 129.347 + srcP0 = vec_perm(srcR1, srcR2, permP0); 129.348 + srcP1 = vec_perm(srcR1, srcR2, permP1); 129.349 + srcP2 = vec_perm(srcR1, srcR2, permP2); 129.350 + srcP3 = srcR2; 129.351 + } break; 129.352 + case 12: { 129.353 + vec_u8 srcR3 = vec_ld(30, src); 129.354 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.355 + srcM1 = vec_perm(srcR1, srcR2, permM1); 129.356 + srcP0 = vec_perm(srcR1, srcR2, permP0); 129.357 + srcP1 = vec_perm(srcR1, srcR2, permP1); 129.358 + srcP2 = srcR2; 129.359 + srcP3 = vec_perm(srcR2, srcR3, permP3); 129.360 + } break; 129.361 + case 13: { 129.362 + vec_u8 srcR3 = vec_ld(30, src); 129.363 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.364 + srcM1 = vec_perm(srcR1, srcR2, permM1); 129.365 + srcP0 = vec_perm(srcR1, srcR2, permP0); 129.366 + srcP1 = srcR2; 129.367 + srcP2 = vec_perm(srcR2, srcR3, permP2); 129.368 + srcP3 = vec_perm(srcR2, srcR3, permP3); 129.369 + } break; 129.370 + case 14: { 129.371 + vec_u8 srcR3 = vec_ld(30, src); 129.372 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.373 + 
srcM1 = vec_perm(srcR1, srcR2, permM1); 129.374 + srcP0 = srcR2; 129.375 + srcP1 = vec_perm(srcR2, srcR3, permP1); 129.376 + srcP2 = vec_perm(srcR2, srcR3, permP2); 129.377 + srcP3 = vec_perm(srcR2, srcR3, permP3); 129.378 + } break; 129.379 + case 15: { 129.380 + vec_u8 srcR3 = vec_ld(30, src); 129.381 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.382 + srcM1 = srcR2; 129.383 + srcP0 = vec_perm(srcR2, srcR3, permP0); 129.384 + srcP1 = vec_perm(srcR2, srcR3, permP1); 129.385 + srcP2 = vec_perm(srcR2, srcR3, permP2); 129.386 + srcP3 = vec_perm(srcR2, srcR3, permP3); 129.387 + } break; 129.388 + } 129.389 + 129.390 + srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); 129.391 + srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); 129.392 + srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); 129.393 + srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); 129.394 + 129.395 + srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); 129.396 + srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); 129.397 + srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); 129.398 + srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); 129.399 + 129.400 + srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); 129.401 + srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); 129.402 + srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); 129.403 + srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); 129.404 + 129.405 + sum1A = vec_adds(srcP0A, srcP1A); 129.406 + sum1B = vec_adds(srcP0B, srcP1B); 129.407 + sum2A = vec_adds(srcM1A, srcP2A); 129.408 + sum2B = vec_adds(srcM1B, srcP2B); 129.409 + sum3A = vec_adds(srcM2A, srcP3A); 129.410 + sum3B = vec_adds(srcM2B, srcP3B); 129.411 + 129.412 + pp1A = vec_mladd(sum1A, v20ss, v16ss); 129.413 + pp1B = vec_mladd(sum1B, v20ss, v16ss); 129.414 + 129.415 + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 129.416 + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 129.417 + 129.418 + pp3A = vec_add(sum3A, pp1A); 129.419 + pp3B = vec_add(sum3B, pp1B); 129.420 + 129.421 + psumA = vec_sub(pp3A, pp2A); 129.422 + psumB = vec_sub(pp3B, 
pp2B); 129.423 + 129.424 + sumA = vec_sra(psumA, v5us); 129.425 + sumB = vec_sra(psumB, v5us); 129.426 + 129.427 + sum = vec_packsu(sumA, sumB); 129.428 + 129.429 + ASSERT_ALIGNED(dst); 129.430 + vdst = vec_ld(0, dst); 129.431 + 129.432 + OP_U8_ALTIVEC(fsum, sum, vdst); 129.433 + 129.434 + vec_st(fsum, 0, dst); 129.435 + 129.436 + src += srcStride; 129.437 + dst += dstStride; 129.438 + } 129.439 + POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); 129.440 +} 129.441 + 129.442 +/* this code assume stride % 16 == 0 */ 129.443 +static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { 129.444 + POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); 129.445 + 129.446 + register int i; 129.447 + 129.448 + LOAD_ZERO; 129.449 + const vec_u8 perm = vec_lvsl(0, src); 129.450 + const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 129.451 + const vec_u16 v5us = vec_splat_u16(5); 129.452 + const vec_s16 v5ss = vec_splat_s16(5); 129.453 + const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); 129.454 + 129.455 + uint8_t *srcbis = src - (srcStride * 2); 129.456 + 129.457 + const vec_u8 srcM2a = vec_ld(0, srcbis); 129.458 + const vec_u8 srcM2b = vec_ld(16, srcbis); 129.459 + const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm); 129.460 + //srcbis += srcStride; 129.461 + const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride); 129.462 + const vec_u8 srcM1b = vec_ld(16, srcbis); 129.463 + const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm); 129.464 + //srcbis += srcStride; 129.465 + const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride); 129.466 + const vec_u8 srcP0b = vec_ld(16, srcbis); 129.467 + const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm); 129.468 + //srcbis += srcStride; 129.469 + const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride); 129.470 + const vec_u8 srcP1b = vec_ld(16, srcbis); 129.471 + const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm); 129.472 + //srcbis += srcStride; 129.473 
+ const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride); 129.474 + const vec_u8 srcP2b = vec_ld(16, srcbis); 129.475 + const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm); 129.476 + //srcbis += srcStride; 129.477 + 129.478 + vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2); 129.479 + vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2); 129.480 + vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1); 129.481 + vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1); 129.482 + vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0); 129.483 + vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0); 129.484 + vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1); 129.485 + vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1); 129.486 + vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2); 129.487 + vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2); 129.488 + 129.489 + vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, 129.490 + psumA, psumB, sumA, sumB, 129.491 + srcP3ssA, srcP3ssB, 129.492 + sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; 129.493 + 129.494 + vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3; 129.495 + 129.496 + POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 129.497 + 129.498 + for (i = 0 ; i < 16 ; i++) { 129.499 + srcP3a = vec_ld(0, srcbis += srcStride); 129.500 + srcP3b = vec_ld(16, srcbis); 129.501 + srcP3 = vec_perm(srcP3a, srcP3b, perm); 129.502 + srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3); 129.503 + srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3); 129.504 + //srcbis += srcStride; 129.505 + 129.506 + sum1A = vec_adds(srcP0ssA, srcP1ssA); 129.507 + sum1B = vec_adds(srcP0ssB, srcP1ssB); 129.508 + sum2A = vec_adds(srcM1ssA, srcP2ssA); 129.509 + sum2B = vec_adds(srcM1ssB, srcP2ssB); 129.510 + sum3A = vec_adds(srcM2ssA, srcP3ssA); 129.511 + sum3B = vec_adds(srcM2ssB, srcP3ssB); 129.512 + 129.513 + srcM2ssA = srcM1ssA; 129.514 + srcM2ssB = srcM1ssB; 129.515 + srcM1ssA = srcP0ssA; 129.516 + srcM1ssB = srcP0ssB; 
129.517 + srcP0ssA = srcP1ssA; 129.518 + srcP0ssB = srcP1ssB; 129.519 + srcP1ssA = srcP2ssA; 129.520 + srcP1ssB = srcP2ssB; 129.521 + srcP2ssA = srcP3ssA; 129.522 + srcP2ssB = srcP3ssB; 129.523 + 129.524 + pp1A = vec_mladd(sum1A, v20ss, v16ss); 129.525 + pp1B = vec_mladd(sum1B, v20ss, v16ss); 129.526 + 129.527 + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 129.528 + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 129.529 + 129.530 + pp3A = vec_add(sum3A, pp1A); 129.531 + pp3B = vec_add(sum3B, pp1B); 129.532 + 129.533 + psumA = vec_sub(pp3A, pp2A); 129.534 + psumB = vec_sub(pp3B, pp2B); 129.535 + 129.536 + sumA = vec_sra(psumA, v5us); 129.537 + sumB = vec_sra(psumB, v5us); 129.538 + 129.539 + sum = vec_packsu(sumA, sumB); 129.540 + 129.541 + ASSERT_ALIGNED(dst); 129.542 + vdst = vec_ld(0, dst); 129.543 + 129.544 + OP_U8_ALTIVEC(fsum, sum, vdst); 129.545 + 129.546 + vec_st(fsum, 0, dst); 129.547 + 129.548 + dst += dstStride; 129.549 + } 129.550 + POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); 129.551 +} 129.552 + 129.553 +/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ 129.554 +static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { 129.555 + POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); 129.556 + register int i; 129.557 + LOAD_ZERO; 129.558 + const vec_u8 permM2 = vec_lvsl(-2, src); 129.559 + const vec_u8 permM1 = vec_lvsl(-1, src); 129.560 + const vec_u8 permP0 = vec_lvsl(+0, src); 129.561 + const vec_u8 permP1 = vec_lvsl(+1, src); 129.562 + const vec_u8 permP2 = vec_lvsl(+2, src); 129.563 + const vec_u8 permP3 = vec_lvsl(+3, src); 129.564 + const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); 129.565 + const vec_u32 v10ui = vec_splat_u32(10); 129.566 + const vec_s16 v5ss = vec_splat_s16(5); 129.567 + const vec_s16 v1ss = vec_splat_s16(1); 129.568 + const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); 129.569 + 
const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); 129.570 + 129.571 + register int align = ((((unsigned long)src) - 2) % 16); 129.572 + 129.573 + vec_s16 srcP0A, srcP0B, srcP1A, srcP1B, 129.574 + srcP2A, srcP2B, srcP3A, srcP3B, 129.575 + srcM1A, srcM1B, srcM2A, srcM2B, 129.576 + sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, 129.577 + pp1A, pp1B, pp2A, pp2B, psumA, psumB; 129.578 + 129.579 + const vec_u8 mperm = (const vec_u8) 129.580 + {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 129.581 + 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F}; 129.582 + int16_t *tmpbis = tmp; 129.583 + 129.584 + vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, 129.585 + tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, 129.586 + tmpP2ssA, tmpP2ssB; 129.587 + 129.588 + vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, 129.589 + pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, 129.590 + pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, 129.591 + ssumAe, ssumAo, ssumBe, ssumBo; 129.592 + vec_u8 fsum, sumv, sum, vdst; 129.593 + vec_s16 ssume, ssumo; 129.594 + 129.595 + POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 129.596 + src -= (2 * srcStride); 129.597 + for (i = 0 ; i < 21 ; i ++) { 129.598 + vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; 129.599 + vec_u8 srcR1 = vec_ld(-2, src); 129.600 + vec_u8 srcR2 = vec_ld(14, src); 129.601 + 129.602 + switch (align) { 129.603 + default: { 129.604 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.605 + srcM1 = vec_perm(srcR1, srcR2, permM1); 129.606 + srcP0 = vec_perm(srcR1, srcR2, permP0); 129.607 + srcP1 = vec_perm(srcR1, srcR2, permP1); 129.608 + srcP2 = vec_perm(srcR1, srcR2, permP2); 129.609 + srcP3 = vec_perm(srcR1, srcR2, permP3); 129.610 + } break; 129.611 + case 11: { 129.612 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.613 + srcM1 = vec_perm(srcR1, srcR2, permM1); 129.614 + srcP0 = vec_perm(srcR1, srcR2, permP0); 129.615 + srcP1 = vec_perm(srcR1, srcR2, permP1); 129.616 + srcP2 = 
vec_perm(srcR1, srcR2, permP2); 129.617 + srcP3 = srcR2; 129.618 + } break; 129.619 + case 12: { 129.620 + vec_u8 srcR3 = vec_ld(30, src); 129.621 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.622 + srcM1 = vec_perm(srcR1, srcR2, permM1); 129.623 + srcP0 = vec_perm(srcR1, srcR2, permP0); 129.624 + srcP1 = vec_perm(srcR1, srcR2, permP1); 129.625 + srcP2 = srcR2; 129.626 + srcP3 = vec_perm(srcR2, srcR3, permP3); 129.627 + } break; 129.628 + case 13: { 129.629 + vec_u8 srcR3 = vec_ld(30, src); 129.630 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.631 + srcM1 = vec_perm(srcR1, srcR2, permM1); 129.632 + srcP0 = vec_perm(srcR1, srcR2, permP0); 129.633 + srcP1 = srcR2; 129.634 + srcP2 = vec_perm(srcR2, srcR3, permP2); 129.635 + srcP3 = vec_perm(srcR2, srcR3, permP3); 129.636 + } break; 129.637 + case 14: { 129.638 + vec_u8 srcR3 = vec_ld(30, src); 129.639 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.640 + srcM1 = vec_perm(srcR1, srcR2, permM1); 129.641 + srcP0 = srcR2; 129.642 + srcP1 = vec_perm(srcR2, srcR3, permP1); 129.643 + srcP2 = vec_perm(srcR2, srcR3, permP2); 129.644 + srcP3 = vec_perm(srcR2, srcR3, permP3); 129.645 + } break; 129.646 + case 15: { 129.647 + vec_u8 srcR3 = vec_ld(30, src); 129.648 + srcM2 = vec_perm(srcR1, srcR2, permM2); 129.649 + srcM1 = srcR2; 129.650 + srcP0 = vec_perm(srcR2, srcR3, permP0); 129.651 + srcP1 = vec_perm(srcR2, srcR3, permP1); 129.652 + srcP2 = vec_perm(srcR2, srcR3, permP2); 129.653 + srcP3 = vec_perm(srcR2, srcR3, permP3); 129.654 + } break; 129.655 + } 129.656 + 129.657 + srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); 129.658 + srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); 129.659 + srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1); 129.660 + srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1); 129.661 + 129.662 + srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2); 129.663 + srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2); 129.664 + srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3); 129.665 + srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3); 
129.666 + 129.667 + srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1); 129.668 + srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1); 129.669 + srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2); 129.670 + srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2); 129.671 + 129.672 + sum1A = vec_adds(srcP0A, srcP1A); 129.673 + sum1B = vec_adds(srcP0B, srcP1B); 129.674 + sum2A = vec_adds(srcM1A, srcP2A); 129.675 + sum2B = vec_adds(srcM1B, srcP2B); 129.676 + sum3A = vec_adds(srcM2A, srcP3A); 129.677 + sum3B = vec_adds(srcM2B, srcP3B); 129.678 + 129.679 + pp1A = vec_mladd(sum1A, v20ss, sum3A); 129.680 + pp1B = vec_mladd(sum1B, v20ss, sum3B); 129.681 + 129.682 + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); 129.683 + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); 129.684 + 129.685 + psumA = vec_sub(pp1A, pp2A); 129.686 + psumB = vec_sub(pp1B, pp2B); 129.687 + 129.688 + vec_st(psumA, 0, tmp); 129.689 + vec_st(psumB, 16, tmp); 129.690 + 129.691 + src += srcStride; 129.692 + tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ 129.693 + } 129.694 + 129.695 + tmpM2ssA = vec_ld(0, tmpbis); 129.696 + tmpM2ssB = vec_ld(16, tmpbis); 129.697 + tmpbis += tmpStride; 129.698 + tmpM1ssA = vec_ld(0, tmpbis); 129.699 + tmpM1ssB = vec_ld(16, tmpbis); 129.700 + tmpbis += tmpStride; 129.701 + tmpP0ssA = vec_ld(0, tmpbis); 129.702 + tmpP0ssB = vec_ld(16, tmpbis); 129.703 + tmpbis += tmpStride; 129.704 + tmpP1ssA = vec_ld(0, tmpbis); 129.705 + tmpP1ssB = vec_ld(16, tmpbis); 129.706 + tmpbis += tmpStride; 129.707 + tmpP2ssA = vec_ld(0, tmpbis); 129.708 + tmpP2ssB = vec_ld(16, tmpbis); 129.709 + tmpbis += tmpStride; 129.710 + 129.711 + for (i = 0 ; i < 16 ; i++) { 129.712 + const vec_s16 tmpP3ssA = vec_ld(0, tmpbis); 129.713 + const vec_s16 tmpP3ssB = vec_ld(16, tmpbis); 129.714 + 129.715 + const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA); 129.716 + const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB); 129.717 + const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA); 129.718 + const vec_s16 sum2B = 
vec_adds(tmpM1ssB, tmpP2ssB); 129.719 + const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA); 129.720 + const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB); 129.721 + 129.722 + tmpbis += tmpStride; 129.723 + 129.724 + tmpM2ssA = tmpM1ssA; 129.725 + tmpM2ssB = tmpM1ssB; 129.726 + tmpM1ssA = tmpP0ssA; 129.727 + tmpM1ssB = tmpP0ssB; 129.728 + tmpP0ssA = tmpP1ssA; 129.729 + tmpP0ssB = tmpP1ssB; 129.730 + tmpP1ssA = tmpP2ssA; 129.731 + tmpP1ssB = tmpP2ssB; 129.732 + tmpP2ssA = tmpP3ssA; 129.733 + tmpP2ssB = tmpP3ssB; 129.734 + 129.735 + pp1Ae = vec_mule(sum1A, v20ss); 129.736 + pp1Ao = vec_mulo(sum1A, v20ss); 129.737 + pp1Be = vec_mule(sum1B, v20ss); 129.738 + pp1Bo = vec_mulo(sum1B, v20ss); 129.739 + 129.740 + pp2Ae = vec_mule(sum2A, v5ss); 129.741 + pp2Ao = vec_mulo(sum2A, v5ss); 129.742 + pp2Be = vec_mule(sum2B, v5ss); 129.743 + pp2Bo = vec_mulo(sum2B, v5ss); 129.744 + 129.745 + pp3Ae = vec_sra((vec_s32)sum3A, v16ui); 129.746 + pp3Ao = vec_mulo(sum3A, v1ss); 129.747 + pp3Be = vec_sra((vec_s32)sum3B, v16ui); 129.748 + pp3Bo = vec_mulo(sum3B, v1ss); 129.749 + 129.750 + pp1cAe = vec_add(pp1Ae, v512si); 129.751 + pp1cAo = vec_add(pp1Ao, v512si); 129.752 + pp1cBe = vec_add(pp1Be, v512si); 129.753 + pp1cBo = vec_add(pp1Bo, v512si); 129.754 + 129.755 + pp32Ae = vec_sub(pp3Ae, pp2Ae); 129.756 + pp32Ao = vec_sub(pp3Ao, pp2Ao); 129.757 + pp32Be = vec_sub(pp3Be, pp2Be); 129.758 + pp32Bo = vec_sub(pp3Bo, pp2Bo); 129.759 + 129.760 + sumAe = vec_add(pp1cAe, pp32Ae); 129.761 + sumAo = vec_add(pp1cAo, pp32Ao); 129.762 + sumBe = vec_add(pp1cBe, pp32Be); 129.763 + sumBo = vec_add(pp1cBo, pp32Bo); 129.764 + 129.765 + ssumAe = vec_sra(sumAe, v10ui); 129.766 + ssumAo = vec_sra(sumAo, v10ui); 129.767 + ssumBe = vec_sra(sumBe, v10ui); 129.768 + ssumBo = vec_sra(sumBo, v10ui); 129.769 + 129.770 + ssume = vec_packs(ssumAe, ssumBe); 129.771 + ssumo = vec_packs(ssumAo, ssumBo); 129.772 + 129.773 + sumv = vec_packsu(ssume, ssumo); 129.774 + sum = vec_perm(sumv, sumv, mperm); 129.775 + 129.776 + 
ASSERT_ALIGNED(dst); 129.777 + vdst = vec_ld(0, dst); 129.778 + 129.779 + OP_U8_ALTIVEC(fsum, sum, vdst); 129.780 + 129.781 + vec_st(fsum, 0, dst); 129.782 + 129.783 + dst += dstStride; 129.784 + } 129.785 + POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); 129.786 +}
130.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 130.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/idct_altivec.c Mon Aug 27 12:09:56 2012 +0200 130.3 @@ -0,0 +1,232 @@ 130.4 +/* 130.5 + * Copyright (c) 2001 Michel Lespinasse 130.6 + * 130.7 + * This file is part of FFmpeg. 130.8 + * 130.9 + * FFmpeg is free software; you can redistribute it and/or 130.10 + * modify it under the terms of the GNU Lesser General Public 130.11 + * License as published by the Free Software Foundation; either 130.12 + * version 2.1 of the License, or (at your option) any later version. 130.13 + * 130.14 + * FFmpeg is distributed in the hope that it will be useful, 130.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 130.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 130.17 + * Lesser General Public License for more details. 130.18 + * 130.19 + * You should have received a copy of the GNU Lesser General Public 130.20 + * License along with FFmpeg; if not, write to the Free Software 130.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 130.22 + */ 130.23 + 130.24 +/* 130.25 + * NOTE: This code is based on GPL code from the libmpeg2 project. The 130.26 + * author, Michel Lespinasses, has given explicit permission to release 130.27 + * under LGPL as part of FFmpeg. 130.28 + */ 130.29 + 130.30 +/* 130.31 + * FFmpeg integration by Dieter Shirley 130.32 + * 130.33 + * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 130.34 + * project. I've deleted all of the libmpeg2-specific code, renamed the 130.35 + * functions and reordered the function parameters. The only change to the 130.36 + * IDCT function itself was to factor out the partial transposition, and to 130.37 + * perform a full transpose at the end of the function. 
130.38 + */ 130.39 + 130.40 + 130.41 +#include <stdlib.h> /* malloc(), free() */ 130.42 +#include <string.h> 130.43 +#include "config.h" 130.44 +#if HAVE_ALTIVEC_H 130.45 +#include <altivec.h> 130.46 +#endif 130.47 +#include "libavcodec/dsputil.h" 130.48 +#include "types_altivec.h" 130.49 +#include "dsputil_ppc.h" 130.50 +#include "dsputil_altivec.h" 130.51 + 130.52 +#define IDCT_HALF \ 130.53 + /* 1st stage */ \ 130.54 + t1 = vec_mradds (a1, vx7, vx1 ); \ 130.55 + t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ 130.56 + t7 = vec_mradds (a2, vx5, vx3); \ 130.57 + t3 = vec_mradds (ma2, vx3, vx5); \ 130.58 + \ 130.59 + /* 2nd stage */ \ 130.60 + t5 = vec_adds (vx0, vx4); \ 130.61 + t0 = vec_subs (vx0, vx4); \ 130.62 + t2 = vec_mradds (a0, vx6, vx2); \ 130.63 + t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ 130.64 + t6 = vec_adds (t8, t3); \ 130.65 + t3 = vec_subs (t8, t3); \ 130.66 + t8 = vec_subs (t1, t7); \ 130.67 + t1 = vec_adds (t1, t7); \ 130.68 + \ 130.69 + /* 3rd stage */ \ 130.70 + t7 = vec_adds (t5, t2); \ 130.71 + t2 = vec_subs (t5, t2); \ 130.72 + t5 = vec_adds (t0, t4); \ 130.73 + t0 = vec_subs (t0, t4); \ 130.74 + t4 = vec_subs (t8, t3); \ 130.75 + t3 = vec_adds (t8, t3); \ 130.76 + \ 130.77 + /* 4th stage */ \ 130.78 + vy0 = vec_adds (t7, t1); \ 130.79 + vy7 = vec_subs (t7, t1); \ 130.80 + vy1 = vec_mradds (c4, t3, t5); \ 130.81 + vy6 = vec_mradds (mc4, t3, t5); \ 130.82 + vy2 = vec_mradds (c4, t4, t0); \ 130.83 + vy5 = vec_mradds (mc4, t4, t0); \ 130.84 + vy3 = vec_adds (t2, t6); \ 130.85 + vy4 = vec_subs (t2, t6); 130.86 + 130.87 + 130.88 +#define IDCT \ 130.89 + vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ 130.90 + vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ 130.91 + vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \ 130.92 + vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \ 130.93 + vec_u16 shift; \ 130.94 + \ 130.95 + c4 = vec_splat (constants[0], 0); \ 130.96 + a0 = vec_splat (constants[0], 1); \ 130.97 + a1 = vec_splat (constants[0], 
2); \ 130.98 + a2 = vec_splat (constants[0], 3); \ 130.99 + mc4 = vec_splat (constants[0], 4); \ 130.100 + ma2 = vec_splat (constants[0], 5); \ 130.101 + bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \ 130.102 + \ 130.103 + zero = vec_splat_s16 (0); \ 130.104 + shift = vec_splat_u16 (4); \ 130.105 + \ 130.106 + vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ 130.107 + vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ 130.108 + vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ 130.109 + vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ 130.110 + vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ 130.111 + vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ 130.112 + vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ 130.113 + vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ 130.114 + \ 130.115 + IDCT_HALF \ 130.116 + \ 130.117 + vx0 = vec_mergeh (vy0, vy4); \ 130.118 + vx1 = vec_mergel (vy0, vy4); \ 130.119 + vx2 = vec_mergeh (vy1, vy5); \ 130.120 + vx3 = vec_mergel (vy1, vy5); \ 130.121 + vx4 = vec_mergeh (vy2, vy6); \ 130.122 + vx5 = vec_mergel (vy2, vy6); \ 130.123 + vx6 = vec_mergeh (vy3, vy7); \ 130.124 + vx7 = vec_mergel (vy3, vy7); \ 130.125 + \ 130.126 + vy0 = vec_mergeh (vx0, vx4); \ 130.127 + vy1 = vec_mergel (vx0, vx4); \ 130.128 + vy2 = vec_mergeh (vx1, vx5); \ 130.129 + vy3 = vec_mergel (vx1, vx5); \ 130.130 + vy4 = vec_mergeh (vx2, vx6); \ 130.131 + vy5 = vec_mergel (vx2, vx6); \ 130.132 + vy6 = vec_mergeh (vx3, vx7); \ 130.133 + vy7 = vec_mergel (vx3, vx7); \ 130.134 + \ 130.135 + vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ 130.136 + vx1 = vec_mergel (vy0, vy4); \ 130.137 + vx2 = vec_mergeh (vy1, vy5); \ 130.138 + vx3 = vec_mergel (vy1, vy5); \ 130.139 + vx4 = vec_mergeh (vy2, vy6); \ 130.140 + vx5 = vec_mergel (vy2, vy6); \ 130.141 + vx6 = vec_mergeh (vy3, vy7); \ 130.142 + vx7 = vec_mergel (vy3, vy7); \ 
130.143 + \ 130.144 + IDCT_HALF \ 130.145 + \ 130.146 + shift = vec_splat_u16 (6); \ 130.147 + vx0 = vec_sra (vy0, shift); \ 130.148 + vx1 = vec_sra (vy1, shift); \ 130.149 + vx2 = vec_sra (vy2, shift); \ 130.150 + vx3 = vec_sra (vy3, shift); \ 130.151 + vx4 = vec_sra (vy4, shift); \ 130.152 + vx5 = vec_sra (vy5, shift); \ 130.153 + vx6 = vec_sra (vy6, shift); \ 130.154 + vx7 = vec_sra (vy7, shift); 130.155 + 130.156 + 130.157 +static const vec_s16 constants[5] = { 130.158 + {23170, 13573, 6518, 21895, -23170, -21895, 32, 31}, 130.159 + {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, 130.160 + {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, 130.161 + {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692}, 130.162 + {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} 130.163 +}; 130.164 + 130.165 +void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) 130.166 +{ 130.167 +POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); 130.168 + vec_s16 *block = (vec_s16*)blk; 130.169 + vec_u8 tmp; 130.170 + 130.171 +#if CONFIG_POWERPC_PERF 130.172 +POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); 130.173 +#endif 130.174 + IDCT 130.175 + 130.176 +#define COPY(dest,src) \ 130.177 + tmp = vec_packsu (src, src); \ 130.178 + vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ 130.179 + vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); 130.180 + 130.181 + COPY (dest, vx0) dest += stride; 130.182 + COPY (dest, vx1) dest += stride; 130.183 + COPY (dest, vx2) dest += stride; 130.184 + COPY (dest, vx3) dest += stride; 130.185 + COPY (dest, vx4) dest += stride; 130.186 + COPY (dest, vx5) dest += stride; 130.187 + COPY (dest, vx6) dest += stride; 130.188 + COPY (dest, vx7) 130.189 + 130.190 +POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); 130.191 +} 130.192 + 130.193 +void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) 130.194 +{ 130.195 +POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); 130.196 + vec_s16 *block = (vec_s16*)blk; 130.197 + vec_u8 tmp; 
130.198 + vec_s16 tmp2, tmp3; 130.199 + vec_u8 perm0; 130.200 + vec_u8 perm1; 130.201 + vec_u8 p0, p1, p; 130.202 + 130.203 +#if CONFIG_POWERPC_PERF 130.204 +POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); 130.205 +#endif 130.206 + 130.207 + IDCT 130.208 + 130.209 + p0 = vec_lvsl (0, dest); 130.210 + p1 = vec_lvsl (stride, dest); 130.211 + p = vec_splat_u8 (-1); 130.212 + perm0 = vec_mergeh (p, p0); 130.213 + perm1 = vec_mergeh (p, p1); 130.214 + 130.215 +#define ADD(dest,src,perm) \ 130.216 + /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ 130.217 + tmp = vec_ld (0, dest); \ 130.218 + tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \ 130.219 + tmp3 = vec_adds (tmp2, src); \ 130.220 + tmp = vec_packsu (tmp3, tmp3); \ 130.221 + vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \ 130.222 + vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); 130.223 + 130.224 + ADD (dest, vx0, perm0) dest += stride; 130.225 + ADD (dest, vx1, perm1) dest += stride; 130.226 + ADD (dest, vx2, perm0) dest += stride; 130.227 + ADD (dest, vx3, perm1) dest += stride; 130.228 + ADD (dest, vx4, perm0) dest += stride; 130.229 + ADD (dest, vx5, perm1) dest += stride; 130.230 + ADD (dest, vx6, perm0) dest += stride; 130.231 + ADD (dest, vx7, perm1) 130.232 + 130.233 +POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); 130.234 +} 130.235 +
131.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 131.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/mathops.h Mon Aug 27 12:09:56 2012 +0200 131.3 @@ -0,0 +1,79 @@ 131.4 +/* 131.5 + * simple math operations 131.6 + * Copyright (c) 2001, 2002 Fabrice Bellard 131.7 + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al 131.8 + * 131.9 + * This file is part of FFmpeg. 131.10 + * 131.11 + * FFmpeg is free software; you can redistribute it and/or 131.12 + * modify it under the terms of the GNU Lesser General Public 131.13 + * License as published by the Free Software Foundation; either 131.14 + * version 2.1 of the License, or (at your option) any later version. 131.15 + * 131.16 + * FFmpeg is distributed in the hope that it will be useful, 131.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 131.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 131.19 + * Lesser General Public License for more details. 131.20 + * 131.21 + * You should have received a copy of the GNU Lesser General Public 131.22 + * License along with FFmpeg; if not, write to the Free Software 131.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 131.24 + */ 131.25 + 131.26 +#ifndef AVCODEC_PPC_MATHOPS_H 131.27 +#define AVCODEC_PPC_MATHOPS_H 131.28 + 131.29 +#include <stdint.h> 131.30 +#include "config.h" 131.31 +#include "libavutil/common.h" 131.32 + 131.33 +#if HAVE_PPC4XX 131.34 +/* signed 16x16 -> 32 multiply add accumulate */ 131.35 +#define MAC16(rt, ra, rb) \ 131.36 + __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); 131.37 + 131.38 +/* signed 16x16 -> 32 multiply */ 131.39 +#define MUL16(ra, rb) \ 131.40 + ({ int __rt; \ 131.41 + __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ 131.42 + __rt; }) 131.43 +#endif 131.44 + 131.45 +#define MULH MULH 131.46 +static inline av_const int MULH(int a, int b){ 131.47 + int r; 131.48 + __asm__ ("mulhw %0, %1, %2" : "=r"(r) : 
"r"(a), "r"(b)); 131.49 + return r; 131.50 +} 131.51 + 131.52 +#if !ARCH_PPC64 131.53 +static inline av_const int64_t MAC64(int64_t d, int a, int b) 131.54 +{ 131.55 + union { uint64_t x; unsigned hl[2]; } x = { d }; 131.56 + int h, l; 131.57 + __asm__ ("mullw %3, %4, %5 \n\t" 131.58 + "mulhw %2, %4, %5 \n\t" 131.59 + "addc %1, %1, %3 \n\t" 131.60 + "adde %0, %0, %2 \n\t" 131.61 + : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) 131.62 + : "r"(a), "r"(b)); 131.63 + return x.x; 131.64 +} 131.65 +#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) 131.66 + 131.67 +static inline av_const int64_t MLS64(int64_t d, int a, int b) 131.68 +{ 131.69 + union { uint64_t x; unsigned hl[2]; } x = { d }; 131.70 + int h, l; 131.71 + __asm__ ("mullw %3, %4, %5 \n\t" 131.72 + "mulhw %2, %4, %5 \n\t" 131.73 + "subfc %1, %3, %1 \n\t" 131.74 + "subfe %0, %2, %0 \n\t" 131.75 + : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l) 131.76 + : "r"(a), "r"(b)); 131.77 + return x.x; 131.78 +} 131.79 +#define MLS64(d, a, b) ((d) = MLS64(d, a, b)) 131.80 +#endif 131.81 + 131.82 +#endif /* AVCODEC_PPC_MATHOPS_H */
/* File: ffmpeg_smp/h264dec/libavcodec/ppc/types_altivec.h */
/*
 * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_PPC_TYPES_ALTIVEC_H
#define AVCODEC_PPC_TYPES_ALTIVEC_H

/***********************************************************************
 * Vector types
 **********************************************************************/
#define vec_u8  vector unsigned char
#define vec_s8  vector signed char
#define vec_u16 vector unsigned short
#define vec_s16 vector signed short
#define vec_u32 vector unsigned int
#define vec_s32 vector signed int

/***********************************************************************
 * Null vector
 **********************************************************************/
/* Declares the local `zerov` that the zero_*v macros below refer to;
 * use as "LOAD_ZERO;" among a function's declarations. */
#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 )

/* Typed views of `zerov`.  Each expansion is fully parenthesized so the cast
 * stays intact next to higher-precedence operators (sizeof, postfix, ...). */
#define zero_u8v  ((vec_u8)  zerov)
#define zero_s8v  ((vec_s8)  zerov)
#define zero_u16v ((vec_u16) zerov)
#define zero_s16v ((vec_s16) zerov)
#define zero_u32v ((vec_u32) zerov)
#define zero_s32v ((vec_s32) zerov)

#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */
133.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 133.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/util_altivec.h Mon Aug 27 12:09:56 2012 +0200 133.3 @@ -0,0 +1,105 @@ 133.4 +/* 133.5 + * This file is part of FFmpeg. 133.6 + * 133.7 + * FFmpeg is free software; you can redistribute it and/or 133.8 + * modify it under the terms of the GNU Lesser General Public 133.9 + * License as published by the Free Software Foundation; either 133.10 + * version 2.1 of the License, or (at your option) any later version. 133.11 + * 133.12 + * FFmpeg is distributed in the hope that it will be useful, 133.13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 133.14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 133.15 + * Lesser General Public License for more details. 133.16 + * 133.17 + * You should have received a copy of the GNU Lesser General Public 133.18 + * License along with FFmpeg; if not, write to the Free Software 133.19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 133.20 + */ 133.21 + 133.22 +/** 133.23 + * @file 133.24 + * Contains misc utility macros and inline functions 133.25 + */ 133.26 + 133.27 +#ifndef AVCODEC_PPC_UTIL_ALTIVEC_H 133.28 +#define AVCODEC_PPC_UTIL_ALTIVEC_H 133.29 + 133.30 +#include <stdint.h> 133.31 + 133.32 +#include "config.h" 133.33 + 133.34 +#if HAVE_ALTIVEC_H 133.35 +#include <altivec.h> 133.36 +#endif 133.37 + 133.38 +// used to build registers permutation vectors (vcprm) 133.39 +// the 's' are for words in the _s_econd vector 133.40 +#define WORD_0 0x00,0x01,0x02,0x03 133.41 +#define WORD_1 0x04,0x05,0x06,0x07 133.42 +#define WORD_2 0x08,0x09,0x0a,0x0b 133.43 +#define WORD_3 0x0c,0x0d,0x0e,0x0f 133.44 +#define WORD_s0 0x10,0x11,0x12,0x13 133.45 +#define WORD_s1 0x14,0x15,0x16,0x17 133.46 +#define WORD_s2 0x18,0x19,0x1a,0x1b 133.47 +#define WORD_s3 0x1c,0x1d,0x1e,0x1f 133.48 + 133.49 +#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, 
WORD_ ## d} 133.50 +#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} 133.51 + 133.52 +// vcprmle is used to keep the same index as in the SSE version. 133.53 +// it's the same as vcprm, with the index inversed 133.54 +// ('le' is Little Endian) 133.55 +#define vcprmle(a,b,c,d) vcprm(d,c,b,a) 133.56 + 133.57 +// used to build inverse/identity vectors (vcii) 133.58 +// n is _n_egative, p is _p_ositive 133.59 +#define FLOAT_n -1. 133.60 +#define FLOAT_p 1. 133.61 + 133.62 + 133.63 +// Transpose 8x8 matrix of 16-bit elements (in-place) 133.64 +#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ 133.65 +do { \ 133.66 + vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \ 133.67 + vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \ 133.68 + \ 133.69 + A1 = vec_mergeh (a, e); \ 133.70 + B1 = vec_mergel (a, e); \ 133.71 + C1 = vec_mergeh (b, f); \ 133.72 + D1 = vec_mergel (b, f); \ 133.73 + E1 = vec_mergeh (c, g); \ 133.74 + F1 = vec_mergel (c, g); \ 133.75 + G1 = vec_mergeh (d, h); \ 133.76 + H1 = vec_mergel (d, h); \ 133.77 + \ 133.78 + A2 = vec_mergeh (A1, E1); \ 133.79 + B2 = vec_mergel (A1, E1); \ 133.80 + C2 = vec_mergeh (B1, F1); \ 133.81 + D2 = vec_mergel (B1, F1); \ 133.82 + E2 = vec_mergeh (C1, G1); \ 133.83 + F2 = vec_mergel (C1, G1); \ 133.84 + G2 = vec_mergeh (D1, H1); \ 133.85 + H2 = vec_mergel (D1, H1); \ 133.86 + \ 133.87 + a = vec_mergeh (A2, E2); \ 133.88 + b = vec_mergel (A2, E2); \ 133.89 + c = vec_mergeh (B2, F2); \ 133.90 + d = vec_mergel (B2, F2); \ 133.91 + e = vec_mergeh (C2, G2); \ 133.92 + f = vec_mergel (C2, G2); \ 133.93 + g = vec_mergeh (D2, H2); \ 133.94 + h = vec_mergel (D2, H2); \ 133.95 +} while (0) 133.96 + 133.97 + 133.98 +/** \brief loads unaligned vector \a *src with offset \a offset 133.99 + and returns it */ 133.100 +static inline vector unsigned char unaligned_load(int offset, uint8_t *src) 133.101 +{ 133.102 + register vector unsigned char first = vec_ld(offset, src); 133.103 + register vector 
unsigned char second = vec_ld(offset+15, src); 133.104 + register vector unsigned char mask = vec_lvsl(offset, src); 133.105 + return vec_perm(first, second, mask); 133.106 +} 133.107 + 133.108 +#endif /* AVCODEC_PPC_UTIL_ALTIVEC_H */
134.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 134.2 +++ b/ffmpeg_smp/h264dec/libavcodec/raw.h Mon Aug 27 12:09:56 2012 +0200 134.3 @@ -0,0 +1,39 @@ 134.4 +/* 134.5 + * Raw Video Codec 134.6 + * Copyright (c) 2001 Fabrice Bellard 134.7 + * 134.8 + * This file is part of FFmpeg. 134.9 + * 134.10 + * FFmpeg is free software; you can redistribute it and/or 134.11 + * modify it under the terms of the GNU Lesser General Public 134.12 + * License as published by the Free Software Foundation; either 134.13 + * version 2.1 of the License, or (at your option) any later version. 134.14 + * 134.15 + * FFmpeg is distributed in the hope that it will be useful, 134.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 134.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 134.18 + * Lesser General Public License for more details. 134.19 + * 134.20 + * You should have received a copy of the GNU Lesser General Public 134.21 + * License along with FFmpeg; if not, write to the Free Software 134.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 134.23 + */ 134.24 + 134.25 +/** 134.26 + * @file 134.27 + * Raw Video Codec 134.28 + */ 134.29 + 134.30 +#ifndef AVCODEC_RAW_H 134.31 +#define AVCODEC_RAW_H 134.32 + 134.33 +#include "avcodec.h" 134.34 + 134.35 +typedef struct PixelFormatTag { 134.36 + enum PixelFormat pix_fmt; 134.37 + unsigned int fourcc; 134.38 +} PixelFormatTag; 134.39 + 134.40 +extern const PixelFormatTag ff_raw_pixelFormatTags[]; 134.41 +int raw_init_encoder(AVCodecContext *avctx); 134.42 +#endif /* AVCODEC_RAW_H */
135.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 135.2 +++ b/ffmpeg_smp/h264dec/libavcodec/rectangle.h Mon Aug 27 12:09:56 2012 +0200 135.3 @@ -0,0 +1,92 @@ 135.4 +/* 135.5 + * rectangle filling function 135.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> 135.7 + * 135.8 + * This file is part of FFmpeg. 135.9 + * 135.10 + * FFmpeg is free software; you can redistribute it and/or 135.11 + * modify it under the terms of the GNU Lesser General Public 135.12 + * License as published by the Free Software Foundation; either 135.13 + * version 2.1 of the License, or (at your option) any later version. 135.14 + * 135.15 + * FFmpeg is distributed in the hope that it will be useful, 135.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 135.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 135.18 + * Lesser General Public License for more details. 135.19 + * 135.20 + * You should have received a copy of the GNU Lesser General Public 135.21 + * License along with FFmpeg; if not, write to the Free Software 135.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 135.23 + */ 135.24 + 135.25 +/** 135.26 + * @file 135.27 + * useful rectangle filling function 135.28 + * @author Michael Niedermayer <michaelni@gmx.at> 135.29 + */ 135.30 + 135.31 +#ifndef AVCODEC_RECTANGLE_H 135.32 +#define AVCODEC_RECTANGLE_H 135.33 + 135.34 +#include <assert.h> 135.35 +//#include "config.h" 135.36 +#include "libavutil/common.h" 135.37 +#include "dsputil.h" 135.38 + 135.39 +/** 135.40 + * fill a rectangle. 
135.41 + * @param h height of the rectangle, should be a constant 135.42 + * @param w width of the rectangle, should be a constant 135.43 + * @param size the size of val (1, 2 or 4), should be a constant 135.44 + */ 135.45 +static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ 135.46 + uint8_t *p= (uint8_t*)vp; 135.47 + assert(size==1 || size==2 || size==4); 135.48 + assert(w<=4); 135.49 + 135.50 + w *= size; 135.51 + stride *= size; 135.52 + 135.53 + assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0); 135.54 + assert((stride&(w-1))==0); 135.55 + if(w==2){ 135.56 + const uint16_t v= size==4 ? val : val*0x0101; 135.57 + *(uint16_t*)(p + 0*stride)= v; 135.58 + if(h==1) return; 135.59 + *(uint16_t*)(p + 1*stride)= v; 135.60 + if(h==2) return; 135.61 + *(uint16_t*)(p + 2*stride)= v; 135.62 + *(uint16_t*)(p + 3*stride)= v; 135.63 + }else if(w==4){ 135.64 + const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101; 135.65 + *(uint32_t*)(p + 0*stride)= v; 135.66 + if(h==1) return; 135.67 + *(uint32_t*)(p + 1*stride)= v; 135.68 + if(h==2) return; 135.69 + *(uint32_t*)(p + 2*stride)= v; 135.70 + *(uint32_t*)(p + 3*stride)= v; 135.71 + }else if(w==8){ 135.72 + const uint64_t v= size==2 ? 
val*0x0001000100010001ULL : val*0x0100000001ULL; 135.73 + *(uint64_t*)(p + 0*stride)= v; 135.74 + if(h==1) return; 135.75 + *(uint64_t*)(p + 1*stride)= v; 135.76 + if(h==2) return; 135.77 + *(uint64_t*)(p + 2*stride)= v; 135.78 + *(uint64_t*)(p + 3*stride)= v; 135.79 + }else if(w==16){ 135.80 + const uint64_t v= val*0x0100000001ULL; 135.81 + *(uint64_t*)(p + 0+0*stride)= v; 135.82 + *(uint64_t*)(p + 8+0*stride)= v; 135.83 + *(uint64_t*)(p + 0+1*stride)= v; 135.84 + *(uint64_t*)(p + 8+1*stride)= v; 135.85 + if(h==2) return; 135.86 + *(uint64_t*)(p + 0+2*stride)= v; 135.87 + *(uint64_t*)(p + 8+2*stride)= v; 135.88 + *(uint64_t*)(p + 0+3*stride)= v; 135.89 + *(uint64_t*)(p + 8+3*stride)= v; 135.90 + }else 135.91 + assert(0); 135.92 + assert(h==4); 135.93 +} 135.94 + 135.95 +#endif /* AVCODEC_RECTANGLE_H */
136.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 136.2 +++ b/ffmpeg_smp/h264dec/libavcodec/scratch.c Mon Aug 27 12:09:56 2012 +0200 136.3 @@ -0,0 +1,295 @@ 136.4 +static void *entropy_thread(void *arg){ 136.5 + H264Context *h = (H264Context *) arg; 136.6 + EDSlice *s; 136.7 + 136.8 + H264Cabac hcabac; 136.9 + CABACContext cabac; 136.10 + 136.11 + ff_init_cabac_states(); 136.12 + 136.13 + if (init_cabac(h, &hcabac)<0) 136.14 + return NULL; 136.15 + 136.16 + for(;;){ 136.17 + { 136.18 + pthread_mutex_lock(&h->lock[ENTROPY]); 136.19 + while (h->ed_cnt<=0) 136.20 + pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]); 136.21 + s= &h->ed_q[h->ed_fo]; 136.22 + pthread_mutex_unlock(&h->lock[ENTROPY]); 136.23 + h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT; 136.24 + } 136.25 + if (s->state<0) 136.26 + break; 136.27 + 136.28 + decode_slice_entropy(&hcabac, &cabac, s); 136.29 + 136.30 + { 136.31 + pthread_mutex_lock(&h->lock[MBDEC]); 136.32 + while (h->mbdec_cnt >= MAX_SLICE_COUNT) 136.33 + pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); 136.34 + h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s); 136.35 + h->mbdec_cnt++; 136.36 + h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; 136.37 + pthread_cond_signal(&h->cond[MBDEC]); 136.38 + pthread_mutex_unlock(&h->lock[MBDEC]); 136.39 + } 136.40 + { 136.41 + pthread_mutex_lock(&h->lock[ENTROPY]); 136.42 + h->ed_cnt--; 136.43 + pthread_cond_signal(&h->cond[ENTROPY]); 136.44 + pthread_mutex_unlock(&h->lock[ENTROPY]); 136.45 + } 136.46 + } 136.47 + 136.48 + { 136.49 + pthread_mutex_lock(&h->lock[MBDEC]); 136.50 + while (h->mbdec_cnt >= MAX_SLICE_COUNT) 136.51 + pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); 136.52 + h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s); 136.53 + h->mbdec_cnt++; 136.54 + h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; 136.55 + pthread_cond_signal(&h->cond[MBDEC]); 136.56 + pthread_mutex_unlock(&h->lock[MBDEC]); 136.57 + 136.58 + } 136.59 + 136.60 + free_cabac(&hcabac); 136.61 + 136.62 + pthread_exit(NULL); 136.63 
+ return NULL; 136.64 + 136.65 +} 136.66 +/* 136.67 +* The following code is the main loop of the file converter 136.68 +*/ 136.69 +int av_transcode_1ed(int ifile, int ofile, int frame_width, int frame_height) { 136.70 + H264Context *h; 136.71 + pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; 136.72 + 136.73 + h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height); 136.74 + 136.75 + timer_start = av_gettime(); 136.76 + 136.77 + // pthread_create(&read_thr, NULL, read_thread, h); 136.78 + // pthread_create(&parsenal_thr, NULL, parsenal_thread, h); 136.79 + pthread_create(&entropy_thr, NULL, entropy_mbd_thread, h); 136.80 + 136.81 + // pthread_create(&mbdec_thr, NULL, mbdec_thread, h); 136.82 + 136.83 + // pthread_create(&write_thr, NULL, write_thread, h); 136.84 + 136.85 + // pthread_join(read_thr, NULL); 136.86 + // pthread_join(parsenal_thr, NULL); 136.87 + pthread_join(entropy_thr, NULL); 136.88 + // pthread_join(mbdec_thr, NULL); 136.89 + // printf("before write_thr\n"); 136.90 + // pthread_join(write_thr, NULL); 136.91 + 136.92 + /* finished ! 
*/ 136.93 + ff_h264_decode_end(h); 136.94 + 136.95 + return 0; 136.96 +} 136.97 + 136.98 +static void reset_h264mb(EDSlice *s, int mb_width, int mb_height){ 136.99 + for (int i=0; i<mb_height; i++){ 136.100 + for (int j=0; j<mb_width; j++){ 136.101 + H264Mb *m = &s->mbs[i*mb_width + j]; 136.102 + 136.103 + m->left_mb_xy=0; 136.104 + m->top_mb_xy = 0; 136.105 + } 136.106 + } 136.107 +} 136.108 + 136.109 +static void *entropy_mbd_thread(void *arg){ 136.110 + H264Context *h = (H264Context *) arg; 136.111 + 136.112 + EDSlice slice, *s=&slice; 136.113 + MBSlice mbslice, *s2=&mbslice; 136.114 + H264Cabac hcabac; 136.115 + CABACContext cabac; 136.116 + int frames =0; 136.117 + MBDecContext mbdec, *d=&mbdec; 136.118 + int size=h->width*h->height; 136.119 + WriteContext write, *w=&write; 136.120 + AVCodecParserContext parser, *pc= &parser; 136.121 + NalContext nal, *n=&nal; 136.122 + 136.123 + 136.124 + memset(pc, 0, sizeof(AVCodecParserContext)); 136.125 + pc->buffer_size = 2048; 136.126 + pc->final_frame = 0; 136.127 + pc->cur_len= 0; 136.128 + pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE); 136.129 + pc->size = 2048; 136.130 + pc->eof_reached =0; 136.131 + pc->ifile = h->ifile; 136.132 + 136.133 + //init parse 136.134 + memset(n, 0, sizeof(NalContext)); 136.135 + n->width = h->width; 136.136 + n->height = h->height; 136.137 + n->mb_height = h->mb_height; 136.138 + n->mb_width = h->mb_width; 136.139 + n->b4_stride = n->mb_width*4 + 1; 136.140 + n->mb_stride = n->mb_width + 1; 136.141 + n->outputed_poc = INT_MIN; 136.142 +// memset(s, 0, sizeof(EDSlice)); 136.143 +// ff_init_slice(n, s); 136.144 +// 136.145 + 136.146 + memset(w, 0, sizeof(WriteContext)); 136.147 + w->bit_buffer_size= FFMAX(1024*256, 6*size + 200); 136.148 + w->bit_buffer= av_mallocz(w->bit_buffer_size); 136.149 + 136.150 + 136.151 + 136.152 + ff_h264dsp_init(&d->hdsp); 136.153 + ff_h264_pred_init(&d->hpc); 136.154 + dsputil_init(&d->dsp); 136.155 + d->hdsp.qpel_put= 
d->dsp.put_h264_qpel_pixels_tab; 136.156 + d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab; 136.157 + d->mb_height = (h->height + 15) / 16; 136.158 + d->mb_width = (h->width + 15) / 16; 136.159 + d->linesize = h->width + EDGE_WIDTH*2; 136.160 + d->uvlinesize = d->linesize>>1; 136.161 + 136.162 + for(int i=0; i<16; i++){ 136.163 + d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3); 136.164 + } 136.165 + for(int i=0; i<4; i++){ 136.166 + d->block_offset[16+i]= 136.167 + d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3); 136.168 + } 136.169 + 136.170 + d->scratchpad= av_mallocz((h->width+64)*4*16*2*sizeof(uint8_t)); 136.171 + 136.172 + ff_init_cabac_states(); 136.173 + 136.174 + if (init_cabac(h, &hcabac)<0) 136.175 + return NULL; 136.176 + 136.177 + while(!pc->final_frame && frames_max++ < 1000){ 136.178 + Picture *out; 136.179 + 136.180 + RawFrame *frm; 136.181 + Picture *pic=NULL; 136.182 + 136.183 + RawFrame frm_read; 136.184 + frm_read.state =0; 136.185 + av_read_frame_internal(pc, &frm_read); 136.186 + frm = &frm_read; 136.187 + 136.188 + if (frm->state < 0) 136.189 + break; 136.190 +/* 136.191 + { 136.192 + pthread_mutex_lock(&h->lock[PARSE2]); 136.193 + while (h->slice_cnt<=0) 136.194 + pthread_cond_wait(&h->cond[PARSE2], &h->lock[PARSE2]); 136.195 + h->slice_cnt--; 136.196 + s= &h->slices[h->slice_next++]; 136.197 + h->slice_next %= MAX_SLICE_COUNT; 136.198 + pthread_mutex_unlock(&h->lock[PARSE2]); 136.199 + }*/ 136.200 + ff_init_slice(n, s); 136.201 + reset_h264mb(s, n->mb_width, n->mb_height); 136.202 + for(int i=0; i<MAX_PIC_COUNT; i++){ 136.203 + if(h->picture[i].reference==0){ 136.204 + pic= &h->picture[i]; 136.205 + break; 136.206 + } 136.207 + } 136.208 +// { 136.209 +// pthread_mutex_lock(&h->lock[PARSE3]); 136.210 +// while (h->free_pic_cnt<=0) 136.211 +// pthread_cond_wait(&h->cond[PARSE3], &h->lock[PARSE3]); 136.212 +// h->free_pic_cnt--; 136.213 +// /* use 
first free picture */ 136.214 +// for(int i=0; i<MAX_PIC_COUNT; i++){ 136.215 +// if(h->picture[i].reference==0){ 136.216 +// pic= &h->picture[i]; 136.217 +// break; 136.218 +// } 136.219 +// } 136.220 +// pthread_mutex_unlock(&h->lock[PARSE3]); 136.221 +// } 136.222 + ff_alloc_picture(n, s, pic); 136.223 + 136.224 + decode_nal_units(n, s, frm, pic); 136.225 + 136.226 + 136.227 + decode_slice_entropy(&hcabac, &cabac, s); 136.228 + memcpy( s2, s, sizeof(MBSlice)); //this only copys the COMMON_SLICE part 136.229 + av_freep(&s->gb.raw); 136.230 + decode_slice_mb_seq(d, s2); 136.231 + 136.232 +// if (s2->release_cnt>0) { 136.233 +// int i; 136.234 +// for (i=0; i<s2->release_cnt; i++){ 136.235 +// if ((s2->release_ref[i]->reference & ~2) == 0) 136.236 +// default_release_buffer(h, s2->release_ref[i]); 136.237 +// else 136.238 +// s2->release_ref[i]->reference &= ~2; 136.239 +// } 136.240 +// s->release_cnt=0; 136.241 +// } 136.242 + 136.243 +if (s->release_cnt>0) { 136.244 + int i; 136.245 + for (i=0; i<s->release_cnt; i++){ 136.246 + s->release_ref[i]->reference &= ~2; 136.247 + } 136.248 + s->release_cnt=0; 136.249 +} 136.250 + 136.251 + 136.252 + { 136.253 + pthread_mutex_lock(&h->lock[PARSE2]); 136.254 + h->slice_cnt++; 136.255 + pthread_cond_signal(&h->cond[PARSE2]); 136.256 + pthread_mutex_unlock(&h->lock[PARSE2]); 136.257 + } 136.258 + 136.259 + out =output_frame(w, s2->current_picture, h->ofile, h->width, h->height); 136.260 + print_report(w->frame_number, w->video_size, 0); 136.261 + 136.262 + if (out){ 136.263 +// if ((out->reference & ~1) == 0) 136.264 +// default_release_buffer(h, out); 136.265 +// else 136.266 + out->reference &= ~1; 136.267 + } 136.268 + 136.269 + { 136.270 + pthread_mutex_lock(&h->lock[ENTROPY]); 136.271 + h->ed_cnt--; 136.272 + pthread_cond_signal(&h->cond[ENTROPY]); 136.273 + pthread_mutex_unlock(&h->lock[ENTROPY]); 136.274 + } 136.275 + } 136.276 + while (output_frame(w, NULL, h->ofile, h->width, h->height)); 136.277 + 
print_report(w->frame_number, w->video_size, 1); 136.278 + 136.279 + av_free(w->bit_buffer); 136.280 + 136.281 + {//propagate exit 136.282 + pthread_mutex_lock(&h->lock[WRITE]); 136.283 + while (h->write_cnt>= MAX_DELAYED_PIC_COUNT) 136.284 + pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); 136.285 + last_pic.reference = -1; 136.286 + h->write_q[h->write_fi] = &last_pic; 136.287 + h->write_cnt++; 136.288 + h->write_fi++; h->write_fi %= MAX_DELAYED_PIC_COUNT; 136.289 + pthread_cond_signal(&h->cond[WRITE]); 136.290 + pthread_mutex_unlock(&h->lock[WRITE]); 136.291 + 136.292 + } 136.293 + free_cabac(&hcabac); 136.294 + 136.295 + pthread_exit(NULL); 136.296 + return NULL; 136.297 + 136.298 +}
137.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 137.2 +++ b/ffmpeg_smp/h264dec/libavcodec/simple_idct.c Mon Aug 27 12:09:56 2012 +0200 137.3 @@ -0,0 +1,372 @@ 137.4 +/* 137.5 + * Simple IDCT 137.6 + * 137.7 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> 137.8 + * 137.9 + * This file is part of FFmpeg. 137.10 + * 137.11 + * FFmpeg is free software; you can redistribute it and/or 137.12 + * modify it under the terms of the GNU Lesser General Public 137.13 + * License as published by the Free Software Foundation; either 137.14 + * version 2.1 of the License, or (at your option) any later version. 137.15 + * 137.16 + * FFmpeg is distributed in the hope that it will be useful, 137.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 137.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 137.19 + * Lesser General Public License for more details. 137.20 + * 137.21 + * You should have received a copy of the GNU Lesser General Public 137.22 + * License along with FFmpeg; if not, write to the Free Software 137.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 137.24 + */ 137.25 + 137.26 +/** 137.27 + * @file 137.28 + * simpleidct in C. 
137.29 + */ 137.30 + 137.31 +/* 137.32 + based upon some outcommented c code from mpeg2dec (idct_mmx.c 137.33 + written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>) 137.34 + */ 137.35 +#include "avcodec.h" 137.36 +#include "dsputil.h" 137.37 +#include "mathops.h" 137.38 +#include "simple_idct.h" 137.39 + 137.40 +#if 0 137.41 +#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */ 137.42 +#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */ 137.43 +#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */ 137.44 +#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */ 137.45 +#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */ 137.46 +#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */ 137.47 +#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */ 137.48 +#define ROW_SHIFT 8 137.49 +#define COL_SHIFT 17 137.50 +#else 137.51 +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 137.52 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 137.53 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 137.54 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 137.55 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 137.56 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 137.57 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 137.58 +#define ROW_SHIFT 11 137.59 +#define COL_SHIFT 20 // 6 137.60 +#endif 137.61 + 137.62 +static inline void idctRowCondDC (DCTELEM * row) 137.63 +{ 137.64 + int a0, a1, a2, a3, b0, b1, b2, b3; 137.65 + uint64_t temp; 137.66 + 137.67 +#if HAVE_BIGENDIAN 137.68 +#define ROW0_MASK 0xffff000000000000LL 137.69 +#else 137.70 +#define ROW0_MASK 0xffffLL 137.71 +#endif 137.72 + if(sizeof(DCTELEM)==2){ 137.73 + if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | 137.74 + ((uint64_t *)row)[1]) == 0) { 137.75 + temp = (row[0] << 3) & 0xffff; 137.76 + temp += temp << 16; 137.77 + temp += temp << 32; 137.78 + ((uint64_t *)row)[0] = temp; 137.79 + ((uint64_t *)row)[1] = temp; 137.80 + return; 137.81 + } 137.82 + }else{ 137.83 + if 
(!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) { 137.84 + row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3; 137.85 + return; 137.86 + } 137.87 + } 137.88 + 137.89 + a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); 137.90 + a1 = a0; 137.91 + a2 = a0; 137.92 + a3 = a0; 137.93 + 137.94 + /* no need to optimize : gcc does it */ 137.95 + a0 += W2 * row[2]; 137.96 + a1 += W6 * row[2]; 137.97 + a2 -= W6 * row[2]; 137.98 + a3 -= W2 * row[2]; 137.99 + 137.100 + b0 = MUL16(W1, row[1]); 137.101 + MAC16(b0, W3, row[3]); 137.102 + b1 = MUL16(W3, row[1]); 137.103 + MAC16(b1, -W7, row[3]); 137.104 + b2 = MUL16(W5, row[1]); 137.105 + MAC16(b2, -W1, row[3]); 137.106 + b3 = MUL16(W7, row[1]); 137.107 + MAC16(b3, -W5, row[3]); 137.108 + 137.109 + temp = ((uint64_t*)row)[1]; 137.110 + 137.111 + if (temp != 0) { 137.112 + a0 += W4*row[4] + W6*row[6]; 137.113 + a1 += - W4*row[4] - W2*row[6]; 137.114 + a2 += - W4*row[4] + W2*row[6]; 137.115 + a3 += W4*row[4] - W6*row[6]; 137.116 + 137.117 + MAC16(b0, W5, row[5]); 137.118 + MAC16(b0, W7, row[7]); 137.119 + 137.120 + MAC16(b1, -W1, row[5]); 137.121 + MAC16(b1, -W5, row[7]); 137.122 + 137.123 + MAC16(b2, W7, row[5]); 137.124 + MAC16(b2, W3, row[7]); 137.125 + 137.126 + MAC16(b3, W3, row[5]); 137.127 + MAC16(b3, -W1, row[7]); 137.128 + } 137.129 + 137.130 + row[0] = (a0 + b0) >> ROW_SHIFT; 137.131 + row[7] = (a0 - b0) >> ROW_SHIFT; 137.132 + row[1] = (a1 + b1) >> ROW_SHIFT; 137.133 + row[6] = (a1 - b1) >> ROW_SHIFT; 137.134 + row[2] = (a2 + b2) >> ROW_SHIFT; 137.135 + row[5] = (a2 - b2) >> ROW_SHIFT; 137.136 + row[3] = (a3 + b3) >> ROW_SHIFT; 137.137 + row[4] = (a3 - b3) >> ROW_SHIFT; 137.138 +} 137.139 + 137.140 +static inline void idctSparseColPut (uint8_t *dest, int line_size, 137.141 + DCTELEM * col) 137.142 +{ 137.143 + int a0, a1, a2, a3, b0, b1, b2, b3; 137.144 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 137.145 + 137.146 + /* XXX: I did that only to give same values as previous code */ 137.147 + a0 = W4 * 
(col[8*0] + ((1<<(COL_SHIFT-1))/W4)); 137.148 + a1 = a0; 137.149 + a2 = a0; 137.150 + a3 = a0; 137.151 + 137.152 + a0 += + W2*col[8*2]; 137.153 + a1 += + W6*col[8*2]; 137.154 + a2 += - W6*col[8*2]; 137.155 + a3 += - W2*col[8*2]; 137.156 + 137.157 + b0 = MUL16(W1, col[8*1]); 137.158 + b1 = MUL16(W3, col[8*1]); 137.159 + b2 = MUL16(W5, col[8*1]); 137.160 + b3 = MUL16(W7, col[8*1]); 137.161 + 137.162 + MAC16(b0, + W3, col[8*3]); 137.163 + MAC16(b1, - W7, col[8*3]); 137.164 + MAC16(b2, - W1, col[8*3]); 137.165 + MAC16(b3, - W5, col[8*3]); 137.166 + 137.167 + if(col[8*4]){ 137.168 + a0 += + W4*col[8*4]; 137.169 + a1 += - W4*col[8*4]; 137.170 + a2 += - W4*col[8*4]; 137.171 + a3 += + W4*col[8*4]; 137.172 + } 137.173 + 137.174 + if (col[8*5]) { 137.175 + MAC16(b0, + W5, col[8*5]); 137.176 + MAC16(b1, - W1, col[8*5]); 137.177 + MAC16(b2, + W7, col[8*5]); 137.178 + MAC16(b3, + W3, col[8*5]); 137.179 + } 137.180 + 137.181 + if(col[8*6]){ 137.182 + a0 += + W6*col[8*6]; 137.183 + a1 += - W2*col[8*6]; 137.184 + a2 += + W2*col[8*6]; 137.185 + a3 += - W6*col[8*6]; 137.186 + } 137.187 + 137.188 + if (col[8*7]) { 137.189 + MAC16(b0, + W7, col[8*7]); 137.190 + MAC16(b1, - W5, col[8*7]); 137.191 + MAC16(b2, + W3, col[8*7]); 137.192 + MAC16(b3, - W1, col[8*7]); 137.193 + } 137.194 + 137.195 + dest[0] = cm[(a0 + b0) >> COL_SHIFT]; 137.196 + dest += line_size; 137.197 + dest[0] = cm[(a1 + b1) >> COL_SHIFT]; 137.198 + dest += line_size; 137.199 + dest[0] = cm[(a2 + b2) >> COL_SHIFT]; 137.200 + dest += line_size; 137.201 + dest[0] = cm[(a3 + b3) >> COL_SHIFT]; 137.202 + dest += line_size; 137.203 + dest[0] = cm[(a3 - b3) >> COL_SHIFT]; 137.204 + dest += line_size; 137.205 + dest[0] = cm[(a2 - b2) >> COL_SHIFT]; 137.206 + dest += line_size; 137.207 + dest[0] = cm[(a1 - b1) >> COL_SHIFT]; 137.208 + dest += line_size; 137.209 + dest[0] = cm[(a0 - b0) >> COL_SHIFT]; 137.210 +} 137.211 + 137.212 +static inline void idctSparseColAdd (uint8_t *dest, int line_size, 137.213 + DCTELEM * col) 137.214 
+{ 137.215 + int a0, a1, a2, a3, b0, b1, b2, b3; 137.216 + uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; 137.217 + 137.218 + /* XXX: I did that only to give same values as previous code */ 137.219 + a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); 137.220 + a1 = a0; 137.221 + a2 = a0; 137.222 + a3 = a0; 137.223 + 137.224 + a0 += + W2*col[8*2]; 137.225 + a1 += + W6*col[8*2]; 137.226 + a2 += - W6*col[8*2]; 137.227 + a3 += - W2*col[8*2]; 137.228 + 137.229 + b0 = MUL16(W1, col[8*1]); 137.230 + b1 = MUL16(W3, col[8*1]); 137.231 + b2 = MUL16(W5, col[8*1]); 137.232 + b3 = MUL16(W7, col[8*1]); 137.233 + 137.234 + MAC16(b0, + W3, col[8*3]); 137.235 + MAC16(b1, - W7, col[8*3]); 137.236 + MAC16(b2, - W1, col[8*3]); 137.237 + MAC16(b3, - W5, col[8*3]); 137.238 + 137.239 + if(col[8*4]){ 137.240 + a0 += + W4*col[8*4]; 137.241 + a1 += - W4*col[8*4]; 137.242 + a2 += - W4*col[8*4]; 137.243 + a3 += + W4*col[8*4]; 137.244 + } 137.245 + 137.246 + if (col[8*5]) { 137.247 + MAC16(b0, + W5, col[8*5]); 137.248 + MAC16(b1, - W1, col[8*5]); 137.249 + MAC16(b2, + W7, col[8*5]); 137.250 + MAC16(b3, + W3, col[8*5]); 137.251 + } 137.252 + 137.253 + if(col[8*6]){ 137.254 + a0 += + W6*col[8*6]; 137.255 + a1 += - W2*col[8*6]; 137.256 + a2 += + W2*col[8*6]; 137.257 + a3 += - W6*col[8*6]; 137.258 + } 137.259 + 137.260 + if (col[8*7]) { 137.261 + MAC16(b0, + W7, col[8*7]); 137.262 + MAC16(b1, - W5, col[8*7]); 137.263 + MAC16(b2, + W3, col[8*7]); 137.264 + MAC16(b3, - W1, col[8*7]); 137.265 + } 137.266 + 137.267 + dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)]; 137.268 + dest += line_size; 137.269 + dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)]; 137.270 + dest += line_size; 137.271 + dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)]; 137.272 + dest += line_size; 137.273 + dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)]; 137.274 + dest += line_size; 137.275 + dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)]; 137.276 + dest += line_size; 137.277 + dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)]; 137.278 
+ dest += line_size; 137.279 + dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)]; 137.280 + dest += line_size; 137.281 + dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)]; 137.282 +} 137.283 + 137.284 +static inline void idctSparseCol (DCTELEM * col) 137.285 +{ 137.286 + int a0, a1, a2, a3, b0, b1, b2, b3; 137.287 + 137.288 + /* XXX: I did that only to give same values as previous code */ 137.289 + a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); 137.290 + a1 = a0; 137.291 + a2 = a0; 137.292 + a3 = a0; 137.293 + 137.294 + a0 += + W2*col[8*2]; 137.295 + a1 += + W6*col[8*2]; 137.296 + a2 += - W6*col[8*2]; 137.297 + a3 += - W2*col[8*2]; 137.298 + 137.299 + b0 = MUL16(W1, col[8*1]); 137.300 + b1 = MUL16(W3, col[8*1]); 137.301 + b2 = MUL16(W5, col[8*1]); 137.302 + b3 = MUL16(W7, col[8*1]); 137.303 + 137.304 + MAC16(b0, + W3, col[8*3]); 137.305 + MAC16(b1, - W7, col[8*3]); 137.306 + MAC16(b2, - W1, col[8*3]); 137.307 + MAC16(b3, - W5, col[8*3]); 137.308 + 137.309 + if(col[8*4]){ 137.310 + a0 += + W4*col[8*4]; 137.311 + a1 += - W4*col[8*4]; 137.312 + a2 += - W4*col[8*4]; 137.313 + a3 += + W4*col[8*4]; 137.314 + } 137.315 + 137.316 + if (col[8*5]) { 137.317 + MAC16(b0, + W5, col[8*5]); 137.318 + MAC16(b1, - W1, col[8*5]); 137.319 + MAC16(b2, + W7, col[8*5]); 137.320 + MAC16(b3, + W3, col[8*5]); 137.321 + } 137.322 + 137.323 + if(col[8*6]){ 137.324 + a0 += + W6*col[8*6]; 137.325 + a1 += - W2*col[8*6]; 137.326 + a2 += + W2*col[8*6]; 137.327 + a3 += - W6*col[8*6]; 137.328 + } 137.329 + 137.330 + if (col[8*7]) { 137.331 + MAC16(b0, + W7, col[8*7]); 137.332 + MAC16(b1, - W5, col[8*7]); 137.333 + MAC16(b2, + W3, col[8*7]); 137.334 + MAC16(b3, - W1, col[8*7]); 137.335 + } 137.336 + 137.337 + col[0 ] = ((a0 + b0) >> COL_SHIFT); 137.338 + col[8 ] = ((a1 + b1) >> COL_SHIFT); 137.339 + col[16] = ((a2 + b2) >> COL_SHIFT); 137.340 + col[24] = ((a3 + b3) >> COL_SHIFT); 137.341 + col[32] = ((a3 - b3) >> COL_SHIFT); 137.342 + col[40] = ((a2 - b2) >> COL_SHIFT); 137.343 + col[48] = ((a1 - b1) 
>> COL_SHIFT); 137.344 + col[56] = ((a0 - b0) >> COL_SHIFT); 137.345 +} 137.346 + 137.347 +void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block) 137.348 +{ 137.349 + int i; 137.350 + for(i=0; i<8; i++) 137.351 + idctRowCondDC(block + i*8); 137.352 + 137.353 + for(i=0; i<8; i++) 137.354 + idctSparseColPut(dest + i, line_size, block + i); 137.355 +} 137.356 + 137.357 +void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block) 137.358 +{ 137.359 + int i; 137.360 + for(i=0; i<8; i++) 137.361 + idctRowCondDC(block + i*8); 137.362 + 137.363 + for(i=0; i<8; i++) 137.364 + idctSparseColAdd(dest + i, line_size, block + i); 137.365 +} 137.366 + 137.367 +void ff_simple_idct(DCTELEM *block) 137.368 +{ 137.369 + int i; 137.370 + for(i=0; i<8; i++) 137.371 + idctRowCondDC(block + i*8); 137.372 + 137.373 + for(i=0; i<8; i++) 137.374 + idctSparseCol(block + i); 137.375 +}
138.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 138.2 +++ b/ffmpeg_smp/h264dec/libavcodec/simple_idct.h Mon Aug 27 12:09:56 2012 +0200 138.3 @@ -0,0 +1,47 @@ 138.4 +/* 138.5 + * Simple IDCT 138.6 + * 138.7 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> 138.8 + * 138.9 + * This file is part of FFmpeg. 138.10 + * 138.11 + * FFmpeg is free software; you can redistribute it and/or 138.12 + * modify it under the terms of the GNU Lesser General Public 138.13 + * License as published by the Free Software Foundation; either 138.14 + * version 2.1 of the License, or (at your option) any later version. 138.15 + * 138.16 + * FFmpeg is distributed in the hope that it will be useful, 138.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 138.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 138.19 + * Lesser General Public License for more details. 138.20 + * 138.21 + * You should have received a copy of the GNU Lesser General Public 138.22 + * License along with FFmpeg; if not, write to the Free Software 138.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 138.24 + */ 138.25 + 138.26 +/** 138.27 + * @file 138.28 + * simple idct header. 
138.29 + */ 138.30 + 138.31 +#ifndef AVCODEC_SIMPLE_IDCT_H 138.32 +#define AVCODEC_SIMPLE_IDCT_H 138.33 + 138.34 +#include <stdint.h> 138.35 +#include "dsputil.h" 138.36 + 138.37 +void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block); 138.38 +void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block); 138.39 +void ff_simple_idct_mmx(int16_t *block); 138.40 +void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block); 138.41 +void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block); 138.42 +void ff_simple_idct(DCTELEM *block); 138.43 + 138.44 +void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block); 138.45 + 138.46 +void ff_simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block); 138.47 +void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block); 138.48 +void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block); 138.49 + 138.50 +#endif /* AVCODEC_SIMPLE_IDCT_H */
139.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 139.2 +++ b/ffmpeg_smp/h264dec/libavcodec/utils.c Mon Aug 27 12:09:56 2012 +0200 139.3 @@ -0,0 +1,68 @@ 139.4 +/* 139.5 + * utils for libavcodec 139.6 + * Copyright (c) 2001 Fabrice Bellard 139.7 + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 139.8 + * 139.9 + * This file is part of FFmpeg. 139.10 + * 139.11 + * FFmpeg is free software; you can redistribute it and/or 139.12 + * modify it under the terms of the GNU Lesser General Public 139.13 + * License as published by the Free Software Foundation; either 139.14 + * version 2.1 of the License, or (at your option) any later version. 139.15 + * 139.16 + * FFmpeg is distributed in the hope that it will be useful, 139.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 139.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 139.19 + * Lesser General Public License for more details. 139.20 + * 139.21 + * You should have received a copy of the GNU Lesser General Public 139.22 + * License along with FFmpeg; if not, write to the Free Software 139.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 139.24 + */ 139.25 + 139.26 +/** 139.27 + * @file 139.28 + * utils. 
139.29 + */ 139.30 + 139.31 +/* needed for mkstemp() */ 139.32 +#define _XOPEN_SOURCE 600 139.33 + 139.34 +#include "avcodec.h" 139.35 +#include "dsputil.h" 139.36 + 139.37 +#include <stdlib.h> 139.38 +#include <stdarg.h> 139.39 +#include <limits.h> 139.40 +#include <float.h> 139.41 +//#undef NDEBUG 139.42 +#include <assert.h> 139.43 + 139.44 +#include <fcntl.h> 139.45 + 139.46 +void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size) 139.47 +{ 139.48 + if(min_size < *size) 139.49 + return ptr; 139.50 + 139.51 + *size= FFMAX(17*min_size/16 + 32, min_size); 139.52 + 139.53 + ptr= av_realloc(ptr, *size); 139.54 + if(!ptr) //we could set this to the unmodified min_size but this is safer if the user lost the ptr and uses NULL now 139.55 + *size= 0; 139.56 + 139.57 + return ptr; 139.58 +} 139.59 + 139.60 +void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size) 139.61 +{ 139.62 + void **p = ptr; 139.63 + if (min_size < *size) 139.64 + return; 139.65 + *size= FFMAX(17*min_size/16 + 32, min_size); 139.66 + av_free(*p); 139.67 + *p = av_malloc(*size); 139.68 + if (!*p) *size = 0; 139.69 +} 139.70 + 139.71 +
140.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 140.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/cpuid.c Mon Aug 27 12:09:56 2012 +0200 140.3 @@ -0,0 +1,135 @@ 140.4 +/* 140.5 + * CPU detection code, extracted from mmx.h 140.6 + * (c)1997-99 by H. Dietz and R. Fisher 140.7 + * Converted to C and improved by Fabrice Bellard. 140.8 + * 140.9 + * This file is part of FFmpeg. 140.10 + * 140.11 + * FFmpeg is free software; you can redistribute it and/or 140.12 + * modify it under the terms of the GNU Lesser General Public 140.13 + * License as published by the Free Software Foundation; either 140.14 + * version 2.1 of the License, or (at your option) any later version. 140.15 + * 140.16 + * FFmpeg is distributed in the hope that it will be useful, 140.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 140.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 140.19 + * Lesser General Public License for more details. 140.20 + * 140.21 + * You should have received a copy of the GNU Lesser General Public 140.22 + * License along with FFmpeg; if not, write to the Free Software 140.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 140.24 + */ 140.25 + 140.26 +#include <stdlib.h> 140.27 +#include "libavutil/x86_cpu.h" 140.28 +#include "libavcodec/dsputil.h" 140.29 + 140.30 +#undef printf 140.31 + 140.32 +/* ebx saving is necessary for PIC. gcc seems unable to see it alone */ 140.33 +#define cpuid(index,eax,ebx,ecx,edx)\ 140.34 + __asm__ volatile\ 140.35 + ("mov %%"REG_b", %%"REG_S"\n\t"\ 140.36 + "cpuid\n\t"\ 140.37 + "xchg %%"REG_b", %%"REG_S\ 140.38 + : "=a" (eax), "=S" (ebx),\ 140.39 + "=c" (ecx), "=d" (edx)\ 140.40 + : "0" (index)); 140.41 + 140.42 +/* Function to test if multimedia instructions are supported... 
*/ 140.43 +int mm_support() 140.44 +{ 140.45 + int rval = 0; 140.46 + int eax, ebx, ecx, edx; 140.47 + int max_std_level, max_ext_level, std_caps=0, ext_caps=0; 140.48 + 140.49 +#if ARCH_X86_32 140.50 + x86_reg a, c; 140.51 + __asm__ volatile ( 140.52 + /* See if CPUID instruction is supported ... */ 140.53 + /* ... Get copies of EFLAGS into eax and ecx */ 140.54 + "pushfl\n\t" 140.55 + "pop %0\n\t" 140.56 + "mov %0, %1\n\t" 140.57 + 140.58 + /* ... Toggle the ID bit in one copy and store */ 140.59 + /* to the EFLAGS reg */ 140.60 + "xor $0x200000, %0\n\t" 140.61 + "push %0\n\t" 140.62 + "popfl\n\t" 140.63 + 140.64 + /* ... Get the (hopefully modified) EFLAGS */ 140.65 + "pushfl\n\t" 140.66 + "pop %0\n\t" 140.67 + : "=a" (a), "=c" (c) 140.68 + : 140.69 + : "cc" 140.70 + ); 140.71 + 140.72 + if (a == c) 140.73 + return 0; /* CPUID not supported */ 140.74 +#endif 140.75 + 140.76 + cpuid(0, max_std_level, ebx, ecx, edx); 140.77 + 140.78 + if(max_std_level >= 1){ 140.79 + cpuid(1, eax, ebx, ecx, std_caps); 140.80 + if (std_caps & (1<<23)) 140.81 + rval |= FF_MM_MMX; 140.82 + if (std_caps & (1<<25)) 140.83 + rval |= FF_MM_MMX2 140.84 +#if HAVE_SSE 140.85 + | FF_MM_SSE; 140.86 + if (std_caps & (1<<26)) 140.87 + rval |= FF_MM_SSE2; 140.88 + if (ecx & 1) 140.89 + rval |= FF_MM_SSE3; 140.90 + if (ecx & 0x00000200 ) 140.91 + rval |= FF_MM_SSSE3; 140.92 + if (ecx & 0x00080000 ) 140.93 + rval |= FF_MM_SSE4; 140.94 + if (ecx & 0x00100000 ) 140.95 + rval |= FF_MM_SSE42; 140.96 +#endif 140.97 + ; 140.98 + } 140.99 + 140.100 + cpuid(0x80000000, max_ext_level, ebx, ecx, edx); 140.101 + 140.102 + if(max_ext_level >= 0x80000001){ 140.103 + cpuid(0x80000001, eax, ebx, ecx, ext_caps); 140.104 + if (ext_caps & (1<<31)) 140.105 + rval |= FF_MM_3DNOW; 140.106 + if (ext_caps & (1<<30)) 140.107 + rval |= FF_MM_3DNOWEXT; 140.108 + if (ext_caps & (1<<23)) 140.109 + rval |= FF_MM_MMX; 140.110 + if (ext_caps & (1<<22)) 140.111 + rval |= FF_MM_MMX2; 140.112 + } 140.113 + 140.114 +#if 0 140.115 + 
av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n", 140.116 + (rval&FF_MM_MMX) ? "MMX ":"", 140.117 + (rval&FF_MM_MMX2) ? "MMX2 ":"", 140.118 + (rval&FF_MM_SSE) ? "SSE ":"", 140.119 + (rval&FF_MM_SSE2) ? "SSE2 ":"", 140.120 + (rval&FF_MM_SSE3) ? "SSE3 ":"", 140.121 + (rval&FF_MM_SSSE3) ? "SSSE3 ":"", 140.122 + (rval&FF_MM_SSE4) ? "SSE4.1 ":"", 140.123 + (rval&FF_MM_SSE42) ? "SSE4.2 ":"", 140.124 + (rval&FF_MM_3DNOW) ? "3DNow ":"", 140.125 + (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":""); 140.126 +#endif 140.127 + return rval; 140.128 +} 140.129 + 140.130 +#ifdef TEST 140.131 +int main ( void ) 140.132 +{ 140.133 + int mm_flags; 140.134 + mm_flags = mm_support(); 140.135 + printf("mm_support = 0x%08X\n",mm_flags); 140.136 + return 0; 140.137 +} 140.138 +#endif
141.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 141.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_mmx.c Mon Aug 27 12:09:56 2012 +0200 141.3 @@ -0,0 +1,304 @@ 141.4 +/* 141.5 + * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, 141.6 + * Loren Merritt 141.7 + * 141.8 + * This file is part of FFmpeg. 141.9 + * 141.10 + * FFmpeg is free software; you can redistribute it and/or 141.11 + * modify it under the terms of the GNU Lesser General Public 141.12 + * License as published by the Free Software Foundation; either 141.13 + * version 2.1 of the License, or (at your option) any later version. 141.14 + * 141.15 + * FFmpeg is distributed in the hope that it will be useful, 141.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 141.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 141.18 + * Lesser General Public License for more details. 141.19 + * 141.20 + * You should have received a copy of the GNU Lesser General Public 141.21 + * License along with FFmpeg; if not, write to the Free Software 141.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 141.23 + */ 141.24 + 141.25 +/** 141.26 + * MMX optimized version of (put|avg)_h264_chroma_mc8. 
141.27 + * H264_CHROMA_MC8_TMPL must be defined to the desired function name 141.28 + * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg 141.29 + * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function * * rnd_reg points at two 64-bit rounding constants: rnd_reg[0] is added before * the >> 6 of the bilinear path, rnd_reg[1] before the >> 3 of the 1-D path. * When x and y are both 0 the call is forwarded to H264_CHROMA_MC8_MV0. 141.30 + */ 141.31 +static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) 141.32 +{ 141.33 + DECLARE_ALIGNED(8, uint64_t, AA); 141.34 + DECLARE_ALIGNED(8, uint64_t, DD); 141.35 + int i; 141.36 + 141.37 + if(y==0 && x==0) { 141.38 + /* no filter needed */ 141.39 + H264_CHROMA_MC8_MV0(dst, src, stride, h); 141.40 + return; 141.41 + } 141.42 + 141.43 + assert(x<8 && y<8 && x>=0 && y>=0); 141.44 + 141.45 + if(y==0 || x==0) 141.46 + { 141.47 + /* 1 dimensional filter only */ 141.48 + const int dxy = x ? 1 : stride; /* offset of the second tap: next pixel when x != 0, next row otherwise */ 141.49 + 141.50 + __asm__ volatile( 141.51 + "movd %0, %%mm5\n\t" 141.52 + "movq %1, %%mm4\n\t" 141.53 + "movq %2, %%mm6\n\t" /* mm6 = rnd >> 3 */ 141.54 + "punpcklwd %%mm5, %%mm5\n\t" 141.55 + "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x+y (one of x, y is 0 in this branch) */ 141.56 + "pxor %%mm7, %%mm7\n\t" 141.57 + "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-B */ 141.58 + :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1))); 141.59 + 141.60 + for(i=0; i<h; i++) { 141.61 + __asm__ volatile( 141.62 + /* mm0 = src[0..7], mm2 = src[dxy..dxy+7] */ 141.63 + "movq %0, %%mm0\n\t" 141.64 + "movq %1, %%mm2\n\t" 141.65 + :: "m"(src[0]), "m"(src[dxy])); 141.66 + 141.67 + __asm__ volatile( 141.68 + /* [mm0,mm1] = A * src[0..7] */ 141.69 + /* [mm2,mm3] = B * src[dxy..dxy+7] */ 141.70 + "movq %%mm0, %%mm1\n\t" 141.71 + "movq %%mm2, %%mm3\n\t" 141.72 + "punpcklbw %%mm7, %%mm0\n\t" 141.73 + "punpckhbw %%mm7, %%mm1\n\t" 141.74 + "punpcklbw %%mm7, %%mm2\n\t" 141.75 + "punpckhbw %%mm7, %%mm3\n\t" 141.76 + "pmullw %%mm4, %%mm0\n\t" 141.77 + "pmullw %%mm4, %%mm1\n\t" 141.78 + "pmullw %%mm5, %%mm2\n\t" 141.79 + "pmullw %%mm5, %%mm3\n\t" 141.80 + 141.81 + /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) 
>> 3 */ 141.82 + "paddw %%mm6, %%mm0\n\t" 141.83 + "paddw %%mm6, %%mm1\n\t" 141.84 + "paddw %%mm2, %%mm0\n\t" 141.85 + "paddw %%mm3, %%mm1\n\t" 141.86 + "psrlw $3, %%mm0\n\t" 141.87 + "psrlw $3, %%mm1\n\t" 141.88 + "packuswb %%mm1, %%mm0\n\t" 141.89 + H264_CHROMA_OP(%0, %%mm0) 141.90 + "movq %%mm0, %0\n\t" 141.91 + : "=m" (dst[0])); 141.92 + 141.93 + src += stride; 141.94 + dst += stride; 141.95 + } 141.96 + return; 141.97 + } 141.98 + 141.99 + /* general case, bilinear */ 141.100 + __asm__ volatile("movd %2, %%mm4\n\t" 141.101 + "movd %3, %%mm6\n\t" 141.102 + "punpcklwd %%mm4, %%mm4\n\t" 141.103 + "punpcklwd %%mm6, %%mm6\n\t" 141.104 + "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ 141.105 + "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ 141.106 + "movq %%mm4, %%mm5\n\t" 141.107 + "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ 141.108 + "psllw $3, %%mm5\n\t" 141.109 + "psllw $3, %%mm6\n\t" 141.110 + "movq %%mm5, %%mm7\n\t" 141.111 + "paddw %%mm6, %%mm7\n\t" 141.112 + "movq %%mm4, %1\n\t" /* DD = x * y */ 141.113 + "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ 141.114 + "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ 141.115 + "paddw %4, %%mm4\n\t" 141.116 + "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ 141.117 + "pxor %%mm7, %%mm7\n\t" 141.118 + "movq %%mm4, %0\n\t" 141.119 + : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); 141.120 + 141.121 + __asm__ volatile( 141.122 + /* mm0 = src[0..7], mm1 = src[1..8] */ 141.123 + "movq %0, %%mm0\n\t" 141.124 + "movq %1, %%mm1\n\t" 141.125 + : : "m" (src[0]), "m" (src[1])); 141.126 + 141.127 + for(i=0; i<h; i++) { 141.128 + src += stride; 141.129 + 141.130 + __asm__ volatile( 141.131 + /* mm2 = A * src[0..3] + B * src[1..4] */ 141.132 + /* mm3 = A * src[4..7] + B * src[5..8] */ 141.133 + "movq %%mm0, %%mm2\n\t" 141.134 + "movq %%mm1, %%mm3\n\t" 141.135 + "punpckhbw %%mm7, %%mm0\n\t" 141.136 + "punpcklbw %%mm7, %%mm1\n\t" 141.137 + "punpcklbw %%mm7, %%mm2\n\t" 141.138 + "punpckhbw %%mm7, 
%%mm3\n\t" 141.139 + "pmullw %0, %%mm0\n\t" 141.140 + "pmullw %0, %%mm2\n\t" 141.141 + "pmullw %%mm5, %%mm1\n\t" 141.142 + "pmullw %%mm5, %%mm3\n\t" 141.143 + "paddw %%mm1, %%mm2\n\t" 141.144 + "paddw %%mm0, %%mm3\n\t" 141.145 + : : "m" (AA)); 141.146 + 141.147 + __asm__ volatile( 141.148 + /* [mm2,mm3] += C * src[0..7] */ 141.149 + "movq %0, %%mm0\n\t" 141.150 + "movq %%mm0, %%mm1\n\t" 141.151 + "punpcklbw %%mm7, %%mm0\n\t" 141.152 + "punpckhbw %%mm7, %%mm1\n\t" 141.153 + "pmullw %%mm6, %%mm0\n\t" 141.154 + "pmullw %%mm6, %%mm1\n\t" 141.155 + "paddw %%mm0, %%mm2\n\t" 141.156 + "paddw %%mm1, %%mm3\n\t" 141.157 + : : "m" (src[0])); 141.158 + 141.159 + __asm__ volatile( 141.160 + /* [mm2,mm3] += D * src[1..8] */ 141.161 + "movq %1, %%mm1\n\t" 141.162 + "movq %%mm1, %%mm0\n\t" 141.163 + "movq %%mm1, %%mm4\n\t" 141.164 + "punpcklbw %%mm7, %%mm0\n\t" 141.165 + "punpckhbw %%mm7, %%mm4\n\t" 141.166 + "pmullw %2, %%mm0\n\t" 141.167 + "pmullw %2, %%mm4\n\t" 141.168 + "paddw %%mm0, %%mm2\n\t" 141.169 + "paddw %%mm4, %%mm3\n\t" 141.170 + "movq %0, %%mm0\n\t" 141.171 + : : "m" (src[0]), "m" (src[1]), "m" (DD)); 141.172 + 141.173 + __asm__ volatile( 141.174 + /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */ 141.175 + "paddw %1, %%mm2\n\t" 141.176 + "paddw %1, %%mm3\n\t" 141.177 + "psrlw $6, %%mm2\n\t" 141.178 + "psrlw $6, %%mm3\n\t" 141.179 + "packuswb %%mm3, %%mm2\n\t" 141.180 + H264_CHROMA_OP(%0, %%mm2) 141.181 + "movq %%mm2, %0\n\t" 141.182 + : "=m" (dst[0]) : "m" (*rnd_reg)); 141.183 + dst+= stride; 141.184 + } 141.185 +} 141.186 + /* 4-pixel-wide bilinear chroma MC. mm2/mm3 hold x/y broadcast to words and mm4/mm5 hold 8-x/8-y. Each source row is filtered horizontally once ((8-x)*src[i] + x*src[i+1]) and kept alternately in mm6 and mm0, so it serves as the lower row of one output line and the upper row of the next. *rnd_reg is added before the final >> 6. The loop body is unrolled to two output rows and ends on "sub $2 / jnz", i.e. only when h reaches exactly 0 -- presumably callers always pass a positive even h; TODO confirm. */ 141.187 +static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) 141.188 +{ 141.189 + __asm__ volatile( 141.190 + "pxor %%mm7, %%mm7 \n\t" 141.191 + "movd %5, %%mm2 \n\t" 141.192 + "movd %6, %%mm3 \n\t" 141.193 + "movq "MANGLE(ff_pw_8)", %%mm4\n\t" 141.194 + "movq "MANGLE(ff_pw_8)", %%mm5\n\t" 141.195 + "punpcklwd %%mm2, %%mm2 \n\t" 141.196 + "punpcklwd %%mm3, %%mm3 
\n\t" 141.197 + "punpcklwd %%mm2, %%mm2 \n\t" 141.198 + "punpcklwd %%mm3, %%mm3 \n\t" 141.199 + "psubw %%mm2, %%mm4 \n\t" 141.200 + "psubw %%mm3, %%mm5 \n\t" 141.201 + 141.202 + "movd (%1), %%mm0 \n\t" 141.203 + "movd 1(%1), %%mm6 \n\t" 141.204 + "add %3, %1 \n\t" 141.205 + "punpcklbw %%mm7, %%mm0 \n\t" 141.206 + "punpcklbw %%mm7, %%mm6 \n\t" 141.207 + "pmullw %%mm4, %%mm0 \n\t" 141.208 + "pmullw %%mm2, %%mm6 \n\t" 141.209 + "paddw %%mm0, %%mm6 \n\t" 141.210 + 141.211 + "1: \n\t" 141.212 + "movd (%1), %%mm0 \n\t" 141.213 + "movd 1(%1), %%mm1 \n\t" 141.214 + "add %3, %1 \n\t" 141.215 + "punpcklbw %%mm7, %%mm0 \n\t" 141.216 + "punpcklbw %%mm7, %%mm1 \n\t" 141.217 + "pmullw %%mm4, %%mm0 \n\t" 141.218 + "pmullw %%mm2, %%mm1 \n\t" 141.219 + "paddw %%mm0, %%mm1 \n\t" 141.220 + "movq %%mm1, %%mm0 \n\t" 141.221 + "pmullw %%mm5, %%mm6 \n\t" 141.222 + "pmullw %%mm3, %%mm1 \n\t" 141.223 + "paddw %4, %%mm6 \n\t" 141.224 + "paddw %%mm6, %%mm1 \n\t" 141.225 + "psrlw $6, %%mm1 \n\t" 141.226 + "packuswb %%mm1, %%mm1 \n\t" 141.227 + H264_CHROMA_OP4((%0), %%mm1, %%mm6) 141.228 + "movd %%mm1, (%0) \n\t" 141.229 + "add %3, %0 \n\t" 141.230 + "movd (%1), %%mm6 \n\t" 141.231 + "movd 1(%1), %%mm1 \n\t" 141.232 + "add %3, %1 \n\t" 141.233 + "punpcklbw %%mm7, %%mm6 \n\t" 141.234 + "punpcklbw %%mm7, %%mm1 \n\t" 141.235 + "pmullw %%mm4, %%mm6 \n\t" 141.236 + "pmullw %%mm2, %%mm1 \n\t" 141.237 + "paddw %%mm6, %%mm1 \n\t" 141.238 + "movq %%mm1, %%mm6 \n\t" 141.239 + "pmullw %%mm5, %%mm0 \n\t" 141.240 + "pmullw %%mm3, %%mm1 \n\t" 141.241 + "paddw %4, %%mm0 \n\t" 141.242 + "paddw %%mm0, %%mm1 \n\t" 141.243 + "psrlw $6, %%mm1 \n\t" 141.244 + "packuswb %%mm1, %%mm1 \n\t" 141.245 + H264_CHROMA_OP4((%0), %%mm1, %%mm0) 141.246 + "movd %%mm1, (%0) \n\t" 141.247 + "add %3, %0 \n\t" 141.248 + "sub $2, %2 \n\t" 141.249 + "jnz 1b \n\t" 141.250 + : "+r"(dst), "+r"(src), "+r"(h) 141.251 + : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y) 141.252 + ); 141.253 +} 141.254 + 141.255 +#ifdef 
H264_CHROMA_MC2_TMPL 141.256 +static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 141.257 +{ 141.258 + int tmp = ((1<<16)-1)*x + 8; /* == (x << 16) + (8 - x): two 16-bit weights packed in one int */ 141.259 + int CD= tmp*y; /* low word C = (8-x)*y, high word D = x*y */ 141.260 + int AB= (tmp<<3) - CD; /* low word A = (8-x)*(8-y), high word B = x*(8-y); no borrow across words as long as 0 <= x, y <= 8 (not asserted here) */ 141.261 + __asm__ volatile( 141.262 + /* mm5 = {A,B,A,B} */ 141.263 + /* mm6 = {C,D,C,D} */ 141.264 + "movd %0, %%mm5\n\t" 141.265 + "movd %1, %%mm6\n\t" 141.266 + "punpckldq %%mm5, %%mm5\n\t" 141.267 + "punpckldq %%mm6, %%mm6\n\t" 141.268 + "pxor %%mm7, %%mm7\n\t" 141.269 + /* mm2 = src[0,1,1,2] */ 141.270 + "movd %2, %%mm2\n\t" 141.271 + "punpcklbw %%mm7, %%mm2\n\t" 141.272 + "pshufw $0x94, %%mm2, %%mm2\n\t" 141.273 + :: "r"(AB), "r"(CD), "m"(src[0])); 141.274 + 141.275 + 141.276 + __asm__ volatile( 141.277 + "1:\n\t" 141.278 + "add %4, %1\n\t" 141.279 + /* mm1 = A * src[0,1] + B * src[1,2] */ 141.280 + "movq %%mm2, %%mm1\n\t" 141.281 + "pmaddwd %%mm5, %%mm1\n\t" 141.282 + /* mm0 = src[0,1,1,2] */ 141.283 + "movd (%1), %%mm0\n\t" 141.284 + "punpcklbw %%mm7, %%mm0\n\t" 141.285 + "pshufw $0x94, %%mm0, %%mm0\n\t" 141.286 + /* mm1 += C * src[0,1] + D * src[1,2] */ 141.287 + "movq %%mm0, %%mm2\n\t" 141.288 + "pmaddwd %%mm6, %%mm0\n\t" 141.289 + "paddw %3, %%mm1\n\t" 141.290 + "paddw %%mm0, %%mm1\n\t" 141.291 + /* dst[0,1] = pack((mm1 + 32) >> 6) */ 141.292 + "psrlw $6, %%mm1\n\t" 141.293 + "packssdw %%mm7, %%mm1\n\t" 141.294 + "packuswb %%mm7, %%mm1\n\t" 141.295 + H264_CHROMA_OP4((%0), %%mm1, %%mm3) 141.296 + "movd %%mm1, %%esi\n\t" 141.297 + "movw %%si, (%0)\n\t" 141.298 + "add %4, %0\n\t" 141.299 + "sub $1, %2\n\t" 141.300 + "jnz 1b\n\t" 141.301 + : "+r" (dst), "+r"(src), "+r"(h) 141.302 + : "m" (ff_pw_32), "r"((x86_reg)stride) 141.303 + : "%esi"); 141.304 + 141.305 +} 141.306 +#endif 141.307 +
142.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 142.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_ssse3.c Mon Aug 27 12:09:56 2012 +0200 142.3 @@ -0,0 +1,208 @@ 142.4 +/* 142.5 + * Copyright (c) 2008 Loren Merritt 142.6 + * 142.7 + * This file is part of FFmpeg. 142.8 + * 142.9 + * FFmpeg is free software; you can redistribute it and/or 142.10 + * modify it under the terms of the GNU Lesser General Public 142.11 + * License as published by the Free Software Foundation; either 142.12 + * version 2.1 of the License, or (at your option) any later version. 142.13 + * 142.14 + * FFmpeg is distributed in the hope that it will be useful, 142.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 142.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 142.17 + * Lesser General Public License for more details. 142.18 + * 142.19 + * You should have received a copy of the GNU Lesser General Public 142.20 + * License along with FFmpeg; if not, write to the Free Software 142.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 142.22 + */ 142.23 + 142.24 +/** 142.25 + * SSSE3 optimized version of (put|avg)_h264_chroma_mc8. 
142.26 + * H264_CHROMA_MC8_TMPL must be defined to the desired function name 142.27 + * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function 142.28 + * AVG_OP must be defined to empty for put and the identity for avg * * rnd selects the rounding constant: non-zero uses ff_pw_4 (1-D path) or * ff_pw_32 (bilinear path), zero uses ff_pw_3 or ff_pw_28 respectively. * When x and y are both 0 the call is forwarded to H264_CHROMA_MC8_MV0. 142.29 + */ 142.30 +static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) 142.31 +{ 142.32 + if(y==0 && x==0) { 142.33 + /* no filter needed */ 142.34 + H264_CHROMA_MC8_MV0(dst, src, stride, h); 142.35 + return; 142.36 + } 142.37 + 142.38 + assert(x<8 && y<8 && x>=0 && y>=0); 142.39 + 142.40 + if(y==0 || x==0) 142.41 + { 142.42 + /* 1 dimensional filter only: with k = x+y (one of them is 0 here), 255*k+8 == (k<<8) + (8-k), i.e. the byte pair {8-k, k}, which is broadcast to every word of xmm7; xmm6 receives the rounding words */ 142.43 + __asm__ volatile( 142.44 + "movd %0, %%xmm7 \n\t" 142.45 + "movq %1, %%xmm6 \n\t" 142.46 + "pshuflw $0, %%xmm7, %%xmm7 \n\t" 142.47 + "movlhps %%xmm6, %%xmm6 \n\t" 142.48 + "movlhps %%xmm7, %%xmm7 \n\t" 142.49 + :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3)) 142.50 + ); 142.51 + 142.52 + if(x) { /* horizontal: each row is byte-interleaved with itself shifted by one pixel; two rows per iteration */ 142.53 + __asm__ volatile( 142.54 + "1: \n\t" 142.55 + "movq (%1), %%xmm0 \n\t" 142.56 + "movq 1(%1), %%xmm1 \n\t" 142.57 + "movq (%1,%3), %%xmm2 \n\t" 142.58 + "movq 1(%1,%3), %%xmm3 \n\t" 142.59 + "punpcklbw %%xmm1, %%xmm0 \n\t" 142.60 + "punpcklbw %%xmm3, %%xmm2 \n\t" 142.61 + "pmaddubsw %%xmm7, %%xmm0 \n\t" 142.62 + "pmaddubsw %%xmm7, %%xmm2 \n\t" 142.63 + AVG_OP("movq (%0), %%xmm4 \n\t") 142.64 + AVG_OP("movhps (%0,%3), %%xmm4 \n\t") 142.65 + "paddw %%xmm6, %%xmm0 \n\t" 142.66 + "paddw %%xmm6, %%xmm2 \n\t" 142.67 + "psrlw $3, %%xmm0 \n\t" 142.68 + "psrlw $3, %%xmm2 \n\t" 142.69 + "packuswb %%xmm2, %%xmm0 \n\t" 142.70 + AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") 142.71 + "movq %%xmm0, (%0) \n\t" 142.72 + "movhps %%xmm0, (%0,%3) \n\t" 142.73 + "sub $2, %2 \n\t" 142.74 + "lea (%1,%3,2), %1 \n\t" 142.75 + "lea (%0,%3,2), %0 \n\t" 142.76 + "jg 1b \n\t" 142.77 + :"+r"(dst), "+r"(src), "+r"(h) 142.78 + :"r"((x86_reg)stride) 142.79 + ); 142.80 + } else { /* vertical: each row is byte-interleaved with the row one stride below; two rows per iteration */ 142.81 + __asm__ volatile( 142.82 + "1: \n\t" 142.83 + "movq 
(%1), %%xmm0 \n\t" 142.84 + "movq (%1,%3), %%xmm1 \n\t" 142.85 + "movdqa %%xmm1, %%xmm2 \n\t" 142.86 + "movq (%1,%3,2), %%xmm3 \n\t" 142.87 + "punpcklbw %%xmm1, %%xmm0 \n\t" 142.88 + "punpcklbw %%xmm3, %%xmm2 \n\t" 142.89 + "pmaddubsw %%xmm7, %%xmm0 \n\t" 142.90 + "pmaddubsw %%xmm7, %%xmm2 \n\t" 142.91 + AVG_OP("movq (%0), %%xmm4 \n\t") 142.92 + AVG_OP("movhps (%0,%3), %%xmm4 \n\t") 142.93 + "paddw %%xmm6, %%xmm0 \n\t" 142.94 + "paddw %%xmm6, %%xmm2 \n\t" 142.95 + "psrlw $3, %%xmm0 \n\t" 142.96 + "psrlw $3, %%xmm2 \n\t" 142.97 + "packuswb %%xmm2, %%xmm0 \n\t" 142.98 + AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") 142.99 + "movq %%xmm0, (%0) \n\t" 142.100 + "movhps %%xmm0, (%0,%3) \n\t" 142.101 + "sub $2, %2 \n\t" 142.102 + "lea (%1,%3,2), %1 \n\t" 142.103 + "lea (%0,%3,2), %0 \n\t" 142.104 + "jg 1b \n\t" 142.105 + :"+r"(dst), "+r"(src), "+r"(h) 142.106 + :"r"((x86_reg)stride) 142.107 + ); 142.108 + } 142.109 + return; 142.110 + } 142.111 + 142.112 + /* general case, bilinear: x*255+8 == (x<<8) + (8-x), so xmm7 = byte pairs {(8-x)*(8-y), x*(8-y)} for the upper row, xmm6 = byte pairs {(8-x)*y, x*y} for the lower row, xmm5 = rounding words */ 142.113 + __asm__ volatile( 142.114 + "movd %0, %%xmm7 \n\t" 142.115 + "movd %1, %%xmm6 \n\t" 142.116 + "movdqa %2, %%xmm5 \n\t" 142.117 + "pshuflw $0, %%xmm7, %%xmm7 \n\t" 142.118 + "pshuflw $0, %%xmm6, %%xmm6 \n\t" 142.119 + "movlhps %%xmm7, %%xmm7 \n\t" 142.120 + "movlhps %%xmm6, %%xmm6 \n\t" 142.121 + :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) 142.122 + ); 142.123 + 142.124 + __asm__ volatile( 142.125 + "movq (%1), %%xmm0 \n\t" 142.126 + "movq 1(%1), %%xmm1 \n\t" 142.127 + "punpcklbw %%xmm1, %%xmm0 \n\t" 142.128 + "add %3, %1 \n\t" 142.129 + "1: \n\t" 142.130 + "movq (%1), %%xmm1 \n\t" 142.131 + "movq 1(%1), %%xmm2 \n\t" 142.132 + "movq (%1,%3), %%xmm3 \n\t" 142.133 + "movq 1(%1,%3), %%xmm4 \n\t" 142.134 + "lea (%1,%3,2), %1 \n\t" 142.135 + "punpcklbw %%xmm2, %%xmm1 \n\t" 142.136 + "punpcklbw %%xmm4, %%xmm3 \n\t" 142.137 + "movdqa %%xmm1, %%xmm2 \n\t" 142.138 + "movdqa %%xmm3, %%xmm4 \n\t" 142.139 + "pmaddubsw %%xmm7, %%xmm0 \n\t" 142.140 + "pmaddubsw %%xmm6, %%xmm1 
\n\t" 142.141 + "pmaddubsw %%xmm7, %%xmm2 \n\t" 142.142 + "pmaddubsw %%xmm6, %%xmm3 \n\t" 142.143 + "paddw %%xmm5, %%xmm0 \n\t" 142.144 + "paddw %%xmm5, %%xmm2 \n\t" 142.145 + "paddw %%xmm0, %%xmm1 \n\t" 142.146 + "paddw %%xmm2, %%xmm3 \n\t" 142.147 + "movdqa %%xmm4, %%xmm0 \n\t" 142.148 + "psrlw $6, %%xmm1 \n\t" 142.149 + "psrlw $6, %%xmm3 \n\t" 142.150 + AVG_OP("movq (%0), %%xmm2 \n\t") 142.151 + AVG_OP("movhps (%0,%3), %%xmm2 \n\t") 142.152 + "packuswb %%xmm3, %%xmm1 \n\t" 142.153 + AVG_OP("pavgb %%xmm2, %%xmm1 \n\t") 142.154 + "movq %%xmm1, (%0)\n\t" 142.155 + "movhps %%xmm1, (%0,%3)\n\t" 142.156 + "sub $2, %2 \n\t" 142.157 + "lea (%0,%3,2), %0 \n\t" 142.158 + "jg 1b \n\t" 142.159 + :"+r"(dst), "+r"(src), "+r"(h) 142.160 + :"r"((x86_reg)stride) 142.161 + ); 142.162 +} 142.163 + /* 4-pixel-wide bilinear variant: uses pmaddubsw on the 64-bit MMX registers with the same byte-pair coefficient packing as the bilinear path of the 8-wide function, i.e. mm7 = {(8-x)*(8-y), x*(8-y)} pairs and mm6 = {(8-x)*y, x*y} pairs. There is no rnd parameter; ff_pw_32 is always the rounding constant. Two output rows are written per loop iteration (sub $2 / jg). */ 142.164 +static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) 142.165 +{ 142.166 + __asm__ volatile( 142.167 + "movd %0, %%mm7 \n\t" 142.168 + "movd %1, %%mm6 \n\t" 142.169 + "movq %2, %%mm5 \n\t" 142.170 + "pshufw $0, %%mm7, %%mm7 \n\t" 142.171 + "pshufw $0, %%mm6, %%mm6 \n\t" 142.172 + :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) 142.173 + ); 142.174 + 142.175 + __asm__ volatile( 142.176 + "movd (%1), %%mm0 \n\t" 142.177 + "punpcklbw 1(%1), %%mm0 \n\t" 142.178 + "add %3, %1 \n\t" 142.179 + "1: \n\t" 142.180 + "movd (%1), %%mm1 \n\t" 142.181 + "movd (%1,%3), %%mm3 \n\t" 142.182 + "punpcklbw 1(%1), %%mm1 \n\t" 142.183 + "punpcklbw 1(%1,%3), %%mm3 \n\t" 142.184 + "lea (%1,%3,2), %1 \n\t" 142.185 + "movq %%mm1, %%mm2 \n\t" 142.186 + "movq %%mm3, %%mm4 \n\t" 142.187 + "pmaddubsw %%mm7, %%mm0 \n\t" 142.188 + "pmaddubsw %%mm6, %%mm1 \n\t" 142.189 + "pmaddubsw %%mm7, %%mm2 \n\t" 142.190 + "pmaddubsw %%mm6, %%mm3 \n\t" 142.191 + "paddw %%mm5, %%mm0 \n\t" 142.192 + "paddw %%mm5, %%mm2 \n\t" 142.193 + "paddw %%mm0, %%mm1 \n\t" 142.194 + "paddw %%mm2, %%mm3 \n\t" 142.195 + "movq %%mm4, %%mm0 \n\t" 142.196 + "psrlw $6, %%mm1 
\n\t" 142.197 + "psrlw $6, %%mm3 \n\t" 142.198 + "packuswb %%mm1, %%mm1 \n\t" 142.199 + "packuswb %%mm3, %%mm3 \n\t" 142.200 + AVG_OP("pavgb (%0), %%mm1 \n\t") 142.201 + AVG_OP("pavgb (%0,%3), %%mm3 \n\t") 142.202 + "movd %%mm1, (%0)\n\t" 142.203 + "movd %%mm3, (%0,%3)\n\t" 142.204 + "sub $2, %2 \n\t" 142.205 + "lea (%0,%3,2), %0 \n\t" 142.206 + "jg 1b \n\t" 142.207 + :"+r"(dst), "+r"(src), "+r"(h) 142.208 + :"r"((x86_reg)stride) 142.209 + ); 142.210 +} 142.211 +
143.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 143.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.c Mon Aug 27 12:09:56 2012 +0200 143.3 @@ -0,0 +1,821 @@ 143.4 +/* 143.5 + * MMX optimized DSP utils 143.6 + * Copyright (c) 2000, 2001 Fabrice Bellard 143.7 + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 143.8 + * 143.9 + * This file is part of FFmpeg. 143.10 + * 143.11 + * FFmpeg is free software; you can redistribute it and/or 143.12 + * modify it under the terms of the GNU Lesser General Public 143.13 + * License as published by the Free Software Foundation; either 143.14 + * version 2.1 of the License, or (at your option) any later version. 143.15 + * 143.16 + * FFmpeg is distributed in the hope that it will be useful, 143.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 143.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 143.19 + * Lesser General Public License for more details. 143.20 + * 143.21 + * You should have received a copy of the GNU Lesser General Public 143.22 + * License along with FFmpeg; if not, write to the Free Software 143.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 143.24 + * 143.25 + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> 143.26 + */ 143.27 + 143.28 +#include "libavutil/x86_cpu.h" 143.29 +#include "libavutil/internal.h" 143.30 +#include "libavcodec/dsputil.h" 143.31 +#include "libavcodec/h264_dsp.h" 143.32 +#include "dsputil_mmx.h" 143.33 + 143.34 + 143.35 +//#undef NDEBUG 143.36 +//#include <assert.h> 143.37 + 143.38 +int mm_flags; /* multimedia extension flags */ 143.39 + 143.40 +/* pixel operations */ 143.41 +DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; 143.42 +DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; 143.43 + 143.44 +DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = 143.45 +{0x8000000080000000ULL, 0x8000000080000000ULL}; 143.46 + 143.47 
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; 143.48 +DECLARE_ALIGNED(8, const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; 143.49 +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; 143.50 +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; 143.51 +DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; 143.52 +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; 143.53 +DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; 143.54 +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; 143.55 +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; 143.56 +DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; 143.57 +DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; 143.58 +DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; 143.59 +DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; 143.60 +DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; 143.61 + 143.62 +DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; 143.63 +DECLARE_ALIGNED(8, const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; 143.64 +DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; 143.65 +DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; 143.66 +DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; 143.67 +DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; 143.68 +DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; 143.69 +DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; 143.70 + 143.71 +DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; 143.72 +DECLARE_ALIGNED(16, const 
double, ff_pd_2)[2] = { 2.0, 2.0 }; 143.73 + 143.74 +#define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t" 143.75 +#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) 143.76 +#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) 143.77 + 143.78 +#define MOVQ_BFE(regd) \ 143.79 + __asm__ volatile ( \ 143.80 + "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ 143.81 + "paddb %%" #regd ", %%" #regd " \n\t" ::) 143.82 + 143.83 +#ifndef PIC 143.84 +#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) 143.85 +#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) 143.86 +#else 143.87 +// for shared library it's better to use this way for accessing constants 143.88 +// pcmpeqd -> -1 143.89 +#define MOVQ_BONE(regd) \ 143.90 + __asm__ volatile ( \ 143.91 + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ 143.92 + "psrlw $15, %%" #regd " \n\t" \ 143.93 + "packuswb %%" #regd ", %%" #regd " \n\t" ::) 143.94 + 143.95 +#define MOVQ_WTWO(regd) \ 143.96 + __asm__ volatile ( \ 143.97 + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ 143.98 + "psrlw $15, %%" #regd " \n\t" \ 143.99 + "psllw $1, %%" #regd " \n\t"::) 143.100 + 143.101 +#endif 143.102 + 143.103 +// using regr as temporary and for the output result 143.104 +// first argument is unmodified and second is trashed 143.105 +// regfe is supposed to contain 0xfefefefefefefefe 143.106 +#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ 143.107 + "movq " #rega ", " #regr " \n\t"\ 143.108 + "pand " #regb ", " #regr " \n\t"\ 143.109 + "pxor " #rega ", " #regb " \n\t"\ 143.110 + "pand " #regfe "," #regb " \n\t"\ 143.111 + "psrlq $1, " #regb " \n\t"\ 143.112 + "paddb " #regb ", " #regr " \n\t" 143.113 + 143.114 +#define PAVGB_MMX(rega, regb, regr, regfe) \ 143.115 + "movq " #rega ", " #regr " \n\t"\ 143.116 + "por " #regb ", " #regr " \n\t"\ 143.117 + "pxor " #rega ", " #regb " \n\t"\ 143.118 + "pand " #regfe "," #regb " \n\t"\ 143.119 + "psrlq $1, " #regb " \n\t"\ 
143.120 + "psubb " #regb ", " #regr " \n\t" 143.121 + 143.122 +// mm6 is supposed to contain 0xfefefefefefefefe 143.123 +#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ 143.124 + "movq " #rega ", " #regr " \n\t"\ 143.125 + "movq " #regc ", " #regp " \n\t"\ 143.126 + "pand " #regb ", " #regr " \n\t"\ 143.127 + "pand " #regd ", " #regp " \n\t"\ 143.128 + "pxor " #rega ", " #regb " \n\t"\ 143.129 + "pxor " #regc ", " #regd " \n\t"\ 143.130 + "pand %%mm6, " #regb " \n\t"\ 143.131 + "pand %%mm6, " #regd " \n\t"\ 143.132 + "psrlq $1, " #regb " \n\t"\ 143.133 + "psrlq $1, " #regd " \n\t"\ 143.134 + "paddb " #regb ", " #regr " \n\t"\ 143.135 + "paddb " #regd ", " #regp " \n\t" 143.136 + 143.137 +#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ 143.138 + "movq " #rega ", " #regr " \n\t"\ 143.139 + "movq " #regc ", " #regp " \n\t"\ 143.140 + "por " #regb ", " #regr " \n\t"\ 143.141 + "por " #regd ", " #regp " \n\t"\ 143.142 + "pxor " #rega ", " #regb " \n\t"\ 143.143 + "pxor " #regc ", " #regd " \n\t"\ 143.144 + "pand %%mm6, " #regb " \n\t"\ 143.145 + "pand %%mm6, " #regd " \n\t"\ 143.146 + "psrlq $1, " #regd " \n\t"\ 143.147 + "psrlq $1, " #regb " \n\t"\ 143.148 + "psubb " #regb ", " #regr " \n\t"\ 143.149 + "psubb " #regd ", " #regp " \n\t" 143.150 + 143.151 +/***********************************/ 143.152 +/* MMX2 specific */ 143.153 + 143.154 +#define DEF(x) x ## _mmx2 143.155 + 143.156 +/* Introduced only in MMX2 set */ 143.157 +#define PAVGB "pavgb" 143.158 +#define OP_AVG PAVGB 143.159 + 143.160 +#include "dsputil_mmx_avg_template.c" 143.161 + 143.162 +#undef DEF 143.163 +#undef PAVGB 143.164 +#undef OP_AVG 143.165 + 143.166 +#define put_no_rnd_pixels16_mmx put_pixels16_mmx 143.167 +#define put_no_rnd_pixels8_mmx put_pixels8_mmx 143.168 +#define put_pixels16_mmx2 put_pixels16_mmx 143.169 +#define put_pixels8_mmx2 put_pixels8_mmx 143.170 +#define put_pixels4_mmx2 put_pixels4_mmx 143.171 +#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx 
143.172 +#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx 143.173 +#define put_pixels16_3dnow put_pixels16_mmx 143.174 +#define put_pixels8_3dnow put_pixels8_mmx 143.175 +#define put_pixels4_3dnow put_pixels4_mmx 143.176 +#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx 143.177 +#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx 143.178 + 143.179 +/***********************************/ 143.180 +/* standard MMX */ 143.181 + 143.182 +void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) 143.183 +{ 143.184 + const DCTELEM *p; 143.185 + uint8_t *pix; 143.186 + 143.187 + /* read the pixels */ 143.188 + p = block; 143.189 + pix = pixels; 143.190 + /* unrolled loop */ 143.191 + __asm__ volatile( 143.192 + "movq %3, %%mm0 \n\t" 143.193 + "movq 8%3, %%mm1 \n\t" 143.194 + "movq 16%3, %%mm2 \n\t" 143.195 + "movq 24%3, %%mm3 \n\t" 143.196 + "movq 32%3, %%mm4 \n\t" 143.197 + "movq 40%3, %%mm5 \n\t" 143.198 + "movq 48%3, %%mm6 \n\t" 143.199 + "movq 56%3, %%mm7 \n\t" 143.200 + "packuswb %%mm1, %%mm0 \n\t" 143.201 + "packuswb %%mm3, %%mm2 \n\t" 143.202 + "packuswb %%mm5, %%mm4 \n\t" 143.203 + "packuswb %%mm7, %%mm6 \n\t" 143.204 + "movq %%mm0, (%0) \n\t" 143.205 + "movq %%mm2, (%0, %1) \n\t" 143.206 + "movq %%mm4, (%0, %1, 2) \n\t" 143.207 + "movq %%mm6, (%0, %2) \n\t" 143.208 + ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) 143.209 + :"memory"); 143.210 + pix += line_size*4; 143.211 + p += 32; 143.212 + 143.213 + // if here would be an exact copy of the code above 143.214 + // compiler would generate some very strange code 143.215 + // thus using "r" 143.216 + __asm__ volatile( 143.217 + "movq (%3), %%mm0 \n\t" 143.218 + "movq 8(%3), %%mm1 \n\t" 143.219 + "movq 16(%3), %%mm2 \n\t" 143.220 + "movq 24(%3), %%mm3 \n\t" 143.221 + "movq 32(%3), %%mm4 \n\t" 143.222 + "movq 40(%3), %%mm5 \n\t" 143.223 + "movq 48(%3), %%mm6 \n\t" 143.224 + "movq 56(%3), %%mm7 \n\t" 143.225 + "packuswb %%mm1, %%mm0 \n\t" 143.226 + 
"packuswb %%mm3, %%mm2 \n\t" 143.227 + "packuswb %%mm5, %%mm4 \n\t" 143.228 + "packuswb %%mm7, %%mm6 \n\t" 143.229 + "movq %%mm0, (%0) \n\t" 143.230 + "movq %%mm2, (%0, %1) \n\t" 143.231 + "movq %%mm4, (%0, %1, 2) \n\t" 143.232 + "movq %%mm6, (%0, %2) \n\t" 143.233 + ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) 143.234 + :"memory"); 143.235 +} 143.236 + 143.237 +DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] = 143.238 + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; 143.239 + 143.240 +#define put_signed_pixels_clamped_mmx_half(off) \ 143.241 + "movq "#off"(%2), %%mm1 \n\t"\ 143.242 + "movq 16+"#off"(%2), %%mm2 \n\t"\ 143.243 + "movq 32+"#off"(%2), %%mm3 \n\t"\ 143.244 + "movq 48+"#off"(%2), %%mm4 \n\t"\ 143.245 + "packsswb 8+"#off"(%2), %%mm1 \n\t"\ 143.246 + "packsswb 24+"#off"(%2), %%mm2 \n\t"\ 143.247 + "packsswb 40+"#off"(%2), %%mm3 \n\t"\ 143.248 + "packsswb 56+"#off"(%2), %%mm4 \n\t"\ 143.249 + "paddb %%mm0, %%mm1 \n\t"\ 143.250 + "paddb %%mm0, %%mm2 \n\t"\ 143.251 + "paddb %%mm0, %%mm3 \n\t"\ 143.252 + "paddb %%mm0, %%mm4 \n\t"\ 143.253 + "movq %%mm1, (%0) \n\t"\ 143.254 + "movq %%mm2, (%0, %3) \n\t"\ 143.255 + "movq %%mm3, (%0, %3, 2) \n\t"\ 143.256 + "movq %%mm4, (%0, %1) \n\t" 143.257 + 143.258 +void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) 143.259 +{ 143.260 + x86_reg line_skip = line_size; 143.261 + x86_reg line_skip3; 143.262 + 143.263 + __asm__ volatile ( 143.264 + "movq "MANGLE(ff_vector128)", %%mm0 \n\t" 143.265 + "lea (%3, %3, 2), %1 \n\t" 143.266 + put_signed_pixels_clamped_mmx_half(0) 143.267 + "lea (%0, %3, 4), %0 \n\t" 143.268 + put_signed_pixels_clamped_mmx_half(64) 143.269 + :"+&r" (pixels), "=&r" (line_skip3) 143.270 + :"r" (block), "r"(line_skip) 143.271 + :"memory"); 143.272 +} 143.273 + 143.274 +void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) 143.275 +{ 143.276 + const DCTELEM *p; 143.277 + uint8_t *pix; 143.278 + int i; 143.279 
+ 143.280 + /* read the pixels */ 143.281 + p = block; 143.282 + pix = pixels; 143.283 + MOVQ_ZERO(mm7); 143.284 + i = 4; 143.285 + do { 143.286 + __asm__ volatile( 143.287 + "movq (%2), %%mm0 \n\t" 143.288 + "movq 8(%2), %%mm1 \n\t" 143.289 + "movq 16(%2), %%mm2 \n\t" 143.290 + "movq 24(%2), %%mm3 \n\t" 143.291 + "movq %0, %%mm4 \n\t" 143.292 + "movq %1, %%mm6 \n\t" 143.293 + "movq %%mm4, %%mm5 \n\t" 143.294 + "punpcklbw %%mm7, %%mm4 \n\t" 143.295 + "punpckhbw %%mm7, %%mm5 \n\t" 143.296 + "paddsw %%mm4, %%mm0 \n\t" 143.297 + "paddsw %%mm5, %%mm1 \n\t" 143.298 + "movq %%mm6, %%mm5 \n\t" 143.299 + "punpcklbw %%mm7, %%mm6 \n\t" 143.300 + "punpckhbw %%mm7, %%mm5 \n\t" 143.301 + "paddsw %%mm6, %%mm2 \n\t" 143.302 + "paddsw %%mm5, %%mm3 \n\t" 143.303 + "packuswb %%mm1, %%mm0 \n\t" 143.304 + "packuswb %%mm3, %%mm2 \n\t" 143.305 + "movq %%mm0, %0 \n\t" 143.306 + "movq %%mm2, %1 \n\t" 143.307 + :"+m"(*pix), "+m"(*(pix+line_size)) 143.308 + :"r"(p) 143.309 + :"memory"); 143.310 + pix += line_size*2; 143.311 + p += 16; 143.312 + } while (--i); 143.313 +} 143.314 + 143.315 +static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) 143.316 +{ 143.317 + __asm__ volatile( 143.318 + "lea (%3, %3), %%"REG_a" \n\t" 143.319 + ASMALIGN(3) 143.320 + "1: \n\t" 143.321 + "movq (%1), %%mm0 \n\t" 143.322 + "movq (%1, %3), %%mm1 \n\t" 143.323 + "movq %%mm0, (%2) \n\t" 143.324 + "movq %%mm1, (%2, %3) \n\t" 143.325 + "add %%"REG_a", %1 \n\t" 143.326 + "add %%"REG_a", %2 \n\t" 143.327 + "movq (%1), %%mm0 \n\t" 143.328 + "movq (%1, %3), %%mm1 \n\t" 143.329 + "movq %%mm0, (%2) \n\t" 143.330 + "movq %%mm1, (%2, %3) \n\t" 143.331 + "add %%"REG_a", %1 \n\t" 143.332 + "add %%"REG_a", %2 \n\t" 143.333 + "subl $4, %0 \n\t" 143.334 + "jnz 1b \n\t" 143.335 + : "+g"(h), "+r" (pixels), "+r" (block) 143.336 + : "r"((x86_reg)line_size) 143.337 + : "%"REG_a, "memory" 143.338 + ); 143.339 +} 143.340 + 143.341 +static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, 
int line_size, int h) 143.342 +{ 143.343 + __asm__ volatile( 143.344 + "1: \n\t" 143.345 + "movdqu (%1), %%xmm0 \n\t" 143.346 + "movdqu (%1,%3), %%xmm1 \n\t" 143.347 + "movdqu (%1,%3,2), %%xmm2 \n\t" 143.348 + "movdqu (%1,%4), %%xmm3 \n\t" 143.349 + "movdqa %%xmm0, (%2) \n\t" 143.350 + "movdqa %%xmm1, (%2,%3) \n\t" 143.351 + "movdqa %%xmm2, (%2,%3,2) \n\t" 143.352 + "movdqa %%xmm3, (%2,%4) \n\t" 143.353 + "subl $4, %0 \n\t" 143.354 + "lea (%1,%3,4), %1 \n\t" 143.355 + "lea (%2,%3,4), %2 \n\t" 143.356 + "jnz 1b \n\t" 143.357 + : "+g"(h), "+r" (pixels), "+r" (block) 143.358 + : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) 143.359 + : "memory" 143.360 + ); 143.361 +} 143.362 + 143.363 +static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) 143.364 +{ 143.365 + __asm__ volatile( 143.366 + "1: \n\t" 143.367 + "movdqu (%1), %%xmm0 \n\t" 143.368 + "movdqu (%1,%3), %%xmm1 \n\t" 143.369 + "movdqu (%1,%3,2), %%xmm2 \n\t" 143.370 + "movdqu (%1,%4), %%xmm3 \n\t" 143.371 + "pavgb (%2), %%xmm0 \n\t" 143.372 + "pavgb (%2,%3), %%xmm1 \n\t" 143.373 + "pavgb (%2,%3,2), %%xmm2 \n\t" 143.374 + "pavgb (%2,%4), %%xmm3 \n\t" 143.375 + "movdqa %%xmm0, (%2) \n\t" 143.376 + "movdqa %%xmm1, (%2,%3) \n\t" 143.377 + "movdqa %%xmm2, (%2,%3,2) \n\t" 143.378 + "movdqa %%xmm3, (%2,%4) \n\t" 143.379 + "subl $4, %0 \n\t" 143.380 + "lea (%1,%3,4), %1 \n\t" 143.381 + "lea (%2,%3,4), %2 \n\t" 143.382 + "jnz 1b \n\t" 143.383 + : "+g"(h), "+r" (pixels), "+r" (block) 143.384 + : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) 143.385 + : "memory" 143.386 + ); 143.387 +} 143.388 + 143.389 +static void clear_block_sse(DCTELEM *block) 143.390 +{ 143.391 + __asm__ volatile( 143.392 + "xorps %%xmm0, %%xmm0 \n" 143.393 + "movaps %%xmm0, (%0) \n" 143.394 + "movaps %%xmm0, 16(%0) \n" 143.395 + "movaps %%xmm0, 32(%0) \n" 143.396 + "movaps %%xmm0, 48(%0) \n" 143.397 + "movaps %%xmm0, 64(%0) \n" 143.398 + "movaps %%xmm0, 80(%0) \n" 143.399 + "movaps %%xmm0, 96(%0) \n" 
143.400 + "movaps %%xmm0, 112(%0) \n" 143.401 + :: "r"(block) 143.402 + : "memory" 143.403 + ); 143.404 +} 143.405 +/* Zero 6*128 = 768 bytes starting at blocks, 128 bytes per loop pass; REG_a runs from -768 up to 0 and is used as the index. movaps needs blocks to be 16-byte aligned. */ 143.406 +static void clear_blocks_sse(DCTELEM *blocks) 143.407 +{ 143.408 + __asm__ volatile( 143.409 + "xorps %%xmm0, %%xmm0 \n" 143.410 + "mov %1, %%"REG_a" \n" 143.411 + "1: \n" 143.412 + "movaps %%xmm0, (%0, %%"REG_a") \n" 143.413 + "movaps %%xmm0, 16(%0, %%"REG_a") \n" 143.414 + "movaps %%xmm0, 32(%0, %%"REG_a") \n" 143.415 + "movaps %%xmm0, 48(%0, %%"REG_a") \n" 143.416 + "movaps %%xmm0, 64(%0, %%"REG_a") \n" 143.417 + "movaps %%xmm0, 80(%0, %%"REG_a") \n" 143.418 + "movaps %%xmm0, 96(%0, %%"REG_a") \n" 143.419 + "movaps %%xmm0, 112(%0, %%"REG_a") \n" 143.420 + "add $128, %%"REG_a" \n" 143.421 + " js 1b \n" 143.422 + : : "r" (((uint8_t *)blocks)+128*6), 143.423 + "i" (-128*6) 143.424 + : "%"REG_a, "memory" /* the stores go through %0, which is only an input operand, so the compiler must be told memory changes */ 143.425 + ); 143.426 +} 143.427 + 143.428 +static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ 143.429 + __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... 
143.430 + "movd %4, %%mm0 \n\t" 143.431 + "movd %5, %%mm1 \n\t" 143.432 + "movd %6, %%mm2 \n\t" 143.433 + "movd %7, %%mm3 \n\t" 143.434 + "punpcklbw %%mm1, %%mm0 \n\t" 143.435 + "punpcklbw %%mm3, %%mm2 \n\t" 143.436 + "movq %%mm0, %%mm1 \n\t" 143.437 + "punpcklwd %%mm2, %%mm0 \n\t" 143.438 + "punpckhwd %%mm2, %%mm1 \n\t" 143.439 + "movd %%mm0, %0 \n\t" 143.440 + "punpckhdq %%mm0, %%mm0 \n\t" 143.441 + "movd %%mm0, %1 \n\t" 143.442 + "movd %%mm1, %2 \n\t" 143.443 + "punpckhdq %%mm1, %%mm1 \n\t" 143.444 + "movd %%mm1, %3 \n\t" 143.445 + 143.446 + : "=m" (*(uint32_t*)(dst + 0*dst_stride)), 143.447 + "=m" (*(uint32_t*)(dst + 1*dst_stride)), 143.448 + "=m" (*(uint32_t*)(dst + 2*dst_stride)), 143.449 + "=m" (*(uint32_t*)(dst + 3*dst_stride)) 143.450 + : "m" (*(uint32_t*)(src + 0*src_stride)), 143.451 + "m" (*(uint32_t*)(src + 1*src_stride)), 143.452 + "m" (*(uint32_t*)(src + 2*src_stride)), 143.453 + "m" (*(uint32_t*)(src + 3*src_stride)) 143.454 + ); 143.455 +} 143.456 + 143.457 +#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ 143.458 +\ 143.459 +static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ 143.460 + OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ 143.461 +}\ 143.462 +\ 143.463 +static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.464 + uint64_t temp[8];\ 143.465 + uint8_t * const half= (uint8_t*)temp;\ 143.466 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ 143.467 + OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ 143.468 +}\ 143.469 +\ 143.470 +static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.471 + OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ 143.472 +}\ 143.473 +\ 143.474 +static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.475 + uint64_t temp[8];\ 143.476 + uint8_t * const half= (uint8_t*)temp;\ 143.477 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, 
src, 8, stride, 8);\ 143.478 + OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ 143.479 +}\ 143.480 +\ 143.481 +static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.482 + uint64_t temp[8];\ 143.483 + uint8_t * const half= (uint8_t*)temp;\ 143.484 + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ 143.485 + OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ 143.486 +}\ 143.487 +\ 143.488 +static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.489 + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ 143.490 +}\ 143.491 +\ 143.492 +static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.493 + uint64_t temp[8];\ 143.494 + uint8_t * const half= (uint8_t*)temp;\ 143.495 + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ 143.496 + OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ 143.497 +}\ 143.498 +static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.499 + uint64_t half[8 + 9];\ 143.500 + uint8_t * const halfH= ((uint8_t*)half) + 64;\ 143.501 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.502 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 143.503 + put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ 143.504 + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 143.505 + OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ 143.506 +}\ 143.507 +static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.508 + uint64_t half[8 + 9];\ 143.509 + uint8_t * const halfH= ((uint8_t*)half) + 64;\ 143.510 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.511 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 143.512 + put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ 143.513 + put ## RND ## 
mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 143.514 + OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ 143.515 +}\ 143.516 +static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.517 + uint64_t half[8 + 9];\ 143.518 + uint8_t * const halfH= ((uint8_t*)half) + 64;\ 143.519 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.520 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 143.521 + put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ 143.522 + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 143.523 + OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ 143.524 +}\ 143.525 +static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.526 + uint64_t half[8 + 9];\ 143.527 + uint8_t * const halfH= ((uint8_t*)half) + 64;\ 143.528 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.529 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 143.530 + put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ 143.531 + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 143.532 + OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ 143.533 +}\ 143.534 +static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.535 + uint64_t half[8 + 9];\ 143.536 + uint8_t * const halfH= ((uint8_t*)half) + 64;\ 143.537 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.538 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 143.539 + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 143.540 + OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ 143.541 +}\ 143.542 +static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.543 + uint64_t half[8 + 9];\ 143.544 + uint8_t * const halfH= ((uint8_t*)half) + 64;\ 143.545 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.546 + put ## 
RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 143.547 + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ 143.548 + OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ 143.549 +}\ 143.550 +static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.551 + uint64_t half[8 + 9];\ 143.552 + uint8_t * const halfH= ((uint8_t*)half);\ 143.553 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 143.554 + put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ 143.555 + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ 143.556 +}\ 143.557 +static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.558 + uint64_t half[8 + 9];\ 143.559 + uint8_t * const halfH= ((uint8_t*)half);\ 143.560 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 143.561 + put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ 143.562 + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ 143.563 +}\ 143.564 +static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.565 + uint64_t half[9];\ 143.566 + uint8_t * const halfH= ((uint8_t*)half);\ 143.567 + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ 143.568 + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ 143.569 +}\ 143.570 +static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ 143.571 + OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ 143.572 +}\ 143.573 +\ 143.574 +static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.575 + uint64_t temp[32];\ 143.576 + uint8_t * const half= (uint8_t*)temp;\ 143.577 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ 143.578 + OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ 143.579 +}\ 143.580 +\ 143.581 +static void OPNAME ## 
qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.582 + OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ 143.583 +}\ 143.584 +\ 143.585 +static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.586 + uint64_t temp[32];\ 143.587 + uint8_t * const half= (uint8_t*)temp;\ 143.588 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ 143.589 + OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ 143.590 +}\ 143.591 +\ 143.592 +static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.593 + uint64_t temp[32];\ 143.594 + uint8_t * const half= (uint8_t*)temp;\ 143.595 + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ 143.596 + OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ 143.597 +}\ 143.598 +\ 143.599 +static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.600 + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ 143.601 +}\ 143.602 +\ 143.603 +static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.604 + uint64_t temp[32];\ 143.605 + uint8_t * const half= (uint8_t*)temp;\ 143.606 + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ 143.607 + OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ 143.608 +}\ 143.609 +static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.610 + uint64_t half[16*2 + 17*2];\ 143.611 + uint8_t * const halfH= ((uint8_t*)half) + 256;\ 143.612 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.613 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 143.614 + put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ 143.615 + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 143.616 + OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ 143.617 
+}\ 143.618 +static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.619 + uint64_t half[16*2 + 17*2];\ 143.620 + uint8_t * const halfH= ((uint8_t*)half) + 256;\ 143.621 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.622 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 143.623 + put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ 143.624 + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 143.625 + OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ 143.626 +}\ 143.627 +static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.628 + uint64_t half[16*2 + 17*2];\ 143.629 + uint8_t * const halfH= ((uint8_t*)half) + 256;\ 143.630 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.631 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 143.632 + put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ 143.633 + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 143.634 + OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ 143.635 +}\ 143.636 +static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.637 + uint64_t half[16*2 + 17*2];\ 143.638 + uint8_t * const halfH= ((uint8_t*)half) + 256;\ 143.639 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.640 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 143.641 + put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ 143.642 + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 143.643 + OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ 143.644 +}\ 143.645 +static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.646 + uint64_t half[16*2 + 17*2];\ 143.647 + uint8_t * const halfH= ((uint8_t*)half) + 256;\ 143.648 + uint8_t * const halfHV= ((uint8_t*)half);\ 
143.649 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 143.650 + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 143.651 + OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ 143.652 +}\ 143.653 +static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.654 + uint64_t half[16*2 + 17*2];\ 143.655 + uint8_t * const halfH= ((uint8_t*)half) + 256;\ 143.656 + uint8_t * const halfHV= ((uint8_t*)half);\ 143.657 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 143.658 + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ 143.659 + OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ 143.660 +}\ 143.661 +static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.662 + uint64_t half[17*2];\ 143.663 + uint8_t * const halfH= ((uint8_t*)half);\ 143.664 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 143.665 + put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ 143.666 + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ 143.667 +}\ 143.668 +static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.669 + uint64_t half[17*2];\ 143.670 + uint8_t * const halfH= ((uint8_t*)half);\ 143.671 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 143.672 + put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ 143.673 + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ 143.674 +}\ 143.675 +static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 143.676 + uint64_t half[17*2];\ 143.677 + uint8_t * const halfH= ((uint8_t*)half);\ 143.678 + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ 143.679 + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ 143.680 +} 143.681 + 143.682 +#define 
PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" 143.683 +#define AVG_3DNOW_OP(a,b,temp, size) \ 143.684 +"mov" #size " " #b ", " #temp " \n\t"\ 143.685 +"pavgusb " #temp ", " #a " \n\t"\ 143.686 +"mov" #size " " #a ", " #b " \n\t" 143.687 +#define AVG_MMX2_OP(a,b,temp, size) \ 143.688 +"mov" #size " " #b ", " #temp " \n\t"\ 143.689 +"pavgb " #temp ", " #a " \n\t"\ 143.690 +"mov" #size " " #a ", " #b " \n\t" 143.691 +/* PREFETCH(name, op): defines name(mem, stride, h), which issues instruction op on h addresses spaced stride bytes apart starting at mem. The do/while decrements h before testing it, so h must be at least 1. */ 143.692 +#define PREFETCH(name, op) \ 143.693 +static void name(void *mem, int stride, int h){\ 143.694 + const uint8_t *p= mem;\ 143.695 + do{\ 143.696 + __asm__ volatile(#op" %0" :: "m"(*p));\ 143.697 + p+= stride;\ 143.698 + }while(--h);\ 143.699 +} 143.700 +PREFETCH(prefetch_mmx2, prefetcht0) 143.701 +#undef PREFETCH 143.702 + 143.703 +#include "h264dsp_mmx.c" 143.704 + 143.705 +void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); 143.706 +void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); 143.707 +void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); 143.708 +void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); 143.709 +void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); 143.710 +/* Install x86 SIMD routines into the function tables of c, selected by the CPU flag bits that mm_support() returns. */ 143.711 +void dsputil_init_mmx(DSPContext* c) 143.712 +{ 143.713 + mm_flags = mm_support(); 143.714 + 143.715 + if (mm_flags & FF_MM_MMX) { /* NOTE(review): clear_block_sse/clear_blocks_sse execute SSE xorps/movaps and prefetch_mmx2 executes prefetcht0, yet they are installed when only FF_MM_MMX is set -- confirm every target CPU also has SSE */ 143.716 + c->clear_block = clear_block_sse; 143.717 + c->clear_blocks = clear_blocks_sse; 143.718 + c->prefetch = prefetch_mmx2; 143.719 + 143.720 +/* H264_QPEL_FUNCS(x, y, CPU): store the mc<x><y> functions for CPU into slot x+y*4 of the put/avg qpel tables; row [0] takes the qpel16 variants, row [1] the qpel8 variants. */ 143.721 +#define H264_QPEL_FUNCS(x, y, CPU)\ 143.722 + c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ 143.723 + c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ 143.724 + c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ 143.725 + c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; 143.726 + 
143.727 + if((mm_flags & FF_MM_SSE2)){ 143.728 + c->put_pixels_tab[0][0] = put_pixels16_sse2; 143.729 + c->avg_pixels_tab[0][0] = avg_pixels16_sse2; 143.730 + 143.731 + } 143.732 + if(mm_flags & FF_MM_SSE2){ 143.733 + H264_QPEL_FUNCS(0, 1, sse2); 143.734 + H264_QPEL_FUNCS(0, 2, sse2); 143.735 + H264_QPEL_FUNCS(0, 3, sse2); 143.736 + H264_QPEL_FUNCS(1, 1, sse2); 143.737 + H264_QPEL_FUNCS(1, 2, sse2); 143.738 + H264_QPEL_FUNCS(1, 3, sse2); 143.739 + H264_QPEL_FUNCS(2, 1, sse2); 143.740 + H264_QPEL_FUNCS(2, 2, sse2); 143.741 + H264_QPEL_FUNCS(2, 3, sse2); 143.742 + H264_QPEL_FUNCS(3, 1, sse2); 143.743 + H264_QPEL_FUNCS(3, 2, sse2); 143.744 + H264_QPEL_FUNCS(3, 3, sse2); 143.745 + } 143.746 +#if HAVE_SSSE3 143.747 + if(mm_flags & FF_MM_SSSE3){ 143.748 + H264_QPEL_FUNCS(1, 0, ssse3); 143.749 + H264_QPEL_FUNCS(1, 1, ssse3); 143.750 + H264_QPEL_FUNCS(1, 2, ssse3); 143.751 + H264_QPEL_FUNCS(1, 3, ssse3); 143.752 + H264_QPEL_FUNCS(2, 0, ssse3); 143.753 + H264_QPEL_FUNCS(2, 1, ssse3); 143.754 + H264_QPEL_FUNCS(2, 2, ssse3); 143.755 + H264_QPEL_FUNCS(2, 3, ssse3); 143.756 + H264_QPEL_FUNCS(3, 0, ssse3); 143.757 + H264_QPEL_FUNCS(3, 1, ssse3); 143.758 + H264_QPEL_FUNCS(3, 2, ssse3); 143.759 + H264_QPEL_FUNCS(3, 3, ssse3); 143.760 + 143.761 + c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; 143.762 + c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; 143.763 + c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; 143.764 + c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; 143.765 + } 143.766 +#endif 143.767 + 143.768 + 143.769 + } 143.770 +} 143.771 + 143.772 +void ff_h264dsp_init_x86(H264DSPContext *c) 143.773 +{ 143.774 + mm_flags = mm_support(); 143.775 + 143.776 + if (mm_flags & FF_MM_MMX) { 143.777 + c->h264_idct_dc_add= 143.778 + c->h264_idct_add= ff_h264_idct_add_mmx; 143.779 + c->h264_idct8_dc_add= 143.780 + c->h264_idct8_add= ff_h264_idct8_add_mmx; 143.781 + 143.782 + if (mm_flags & FF_MM_MMX2) { 143.783 + 
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; 143.784 + c->h264_idct_add8 = ff_h264_idct_add8_mmx2; 143.785 + c->h264_idct_add16 = ff_h264_idct_add16_mmx2; 143.786 + c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; 143.787 + 143.788 + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; 143.789 + c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; 143.790 + 143.791 + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; 143.792 + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; 143.793 + c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; 143.794 + c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; 143.795 + c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; 143.796 + c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; 143.797 + c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; 143.798 + 143.799 + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; 143.800 + c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; 143.801 + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; 143.802 + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; 143.803 + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; 143.804 + c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; 143.805 + c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; 143.806 + c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; 143.807 + 143.808 + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; 143.809 + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; 143.810 + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; 143.811 + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; 143.812 + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; 143.813 + c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; 143.814 + c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; 143.815 + c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; 143.816 + } 143.817 + 
if(mm_flags & FF_MM_SSE2){ 143.818 + c->h264_idct8_add = ff_h264_idct8_add_sse2; 143.819 + c->h264_idct8_add4= ff_h264_idct8_add4_sse2; 143.820 + } 143.821 + 143.822 + } 143.823 +} 143.824 +
144.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 144.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.h Mon Aug 27 12:09:56 2012 +0200 144.3 @@ -0,0 +1,170 @@ 144.4 +/* 144.5 + * MMX optimized DSP utils 144.6 + * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> 144.7 + * 144.8 + * This file is part of FFmpeg. 144.9 + * 144.10 + * FFmpeg is free software; you can redistribute it and/or 144.11 + * modify it under the terms of the GNU Lesser General Public 144.12 + * License as published by the Free Software Foundation; either 144.13 + * version 2.1 of the License, or (at your option) any later version. 144.14 + * 144.15 + * FFmpeg is distributed in the hope that it will be useful, 144.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 144.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 144.18 + * Lesser General Public License for more details. 144.19 + * 144.20 + * You should have received a copy of the GNU Lesser General Public 144.21 + * License along with FFmpeg; if not, write to the Free Software 144.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 144.23 + */ 144.24 + 144.25 +#ifndef AVCODEC_X86_DSPUTIL_MMX_H 144.26 +#define AVCODEC_X86_DSPUTIL_MMX_H 144.27 + 144.28 +#include <stdint.h> 144.29 +#include "libavcodec/dsputil.h" 144.30 + 144.31 +typedef struct { uint64_t a, b; } xmm_reg; 144.32 + 144.33 +extern const uint64_t ff_bone; 144.34 +extern const uint64_t ff_wtwo; 144.35 + 144.36 +extern const uint64_t ff_pdw_80000000[2]; 144.37 + 144.38 +extern const uint64_t ff_pw_3; 144.39 +extern const uint64_t ff_pw_4; 144.40 +extern const xmm_reg ff_pw_5; 144.41 +extern const xmm_reg ff_pw_8; 144.42 +extern const uint64_t ff_pw_15; 144.43 +extern const xmm_reg ff_pw_16; 144.44 +extern const uint64_t ff_pw_20; 144.45 +extern const xmm_reg ff_pw_28; 144.46 +extern const xmm_reg ff_pw_32; 144.47 +extern const uint64_t ff_pw_42; 144.48 +extern const xmm_reg ff_pw_64; 144.49 +extern 
const uint64_t ff_pw_96; 144.50 +extern const uint64_t ff_pw_128; 144.51 +extern const uint64_t ff_pw_255; 144.52 + 144.53 +extern const uint64_t ff_pb_1; 144.54 +extern const uint64_t ff_pb_3; 144.55 +extern const uint64_t ff_pb_7; 144.56 +extern const uint64_t ff_pb_1F; 144.57 +extern const uint64_t ff_pb_3F; 144.58 +extern const uint64_t ff_pb_81; 144.59 +extern const uint64_t ff_pb_A1; 144.60 +extern const uint64_t ff_pb_FC; 144.61 + 144.62 +extern const double ff_pd_1[2]; 144.63 +extern const double ff_pd_2[2]; 144.64 + 144.65 +#define LOAD4(stride,in,a,b,c,d)\ 144.66 + "movq 0*"#stride"+"#in", "#a"\n\t"\ 144.67 + "movq 1*"#stride"+"#in", "#b"\n\t"\ 144.68 + "movq 2*"#stride"+"#in", "#c"\n\t"\ 144.69 + "movq 3*"#stride"+"#in", "#d"\n\t" 144.70 + 144.71 +#define STORE4(stride,out,a,b,c,d)\ 144.72 + "movq "#a", 0*"#stride"+"#out"\n\t"\ 144.73 + "movq "#b", 1*"#stride"+"#out"\n\t"\ 144.74 + "movq "#c", 2*"#stride"+"#out"\n\t"\ 144.75 + "movq "#d", 3*"#stride"+"#out"\n\t" 144.76 + 144.77 +/* in/out: mma=mma+mmb, mmb=mmb-mma */ 144.78 +#define SUMSUB_BA( a, b ) \ 144.79 + "paddw "#b", "#a" \n\t"\ 144.80 + "paddw "#b", "#b" \n\t"\ 144.81 + "psubw "#a", "#b" \n\t" 144.82 + 144.83 +#define SBUTTERFLY(a,b,t,n,m)\ 144.84 + "mov" #m " " #a ", " #t " \n\t" /* abcd */\ 144.85 + "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ 144.86 + "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ 144.87 + 144.88 +#define TRANSPOSE4(a,b,c,d,t)\ 144.89 + SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ 144.90 + SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ 144.91 + SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ 144.92 + SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ 144.93 + 144.94 +// e,f,g,h can be memory 144.95 +// out: a,d,t,c 144.96 +#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\ 144.97 + "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\ 144.98 + "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\ 144.99 + "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 c3 g3 */\ 144.100 + 
"punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\ 144.101 + SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\ 144.102 + /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\ 144.103 + SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\ 144.104 + /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\ 144.105 + SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\ 144.106 + /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\ 144.107 + SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\ 144.108 + /* c= a3 b3 c3 d3 e3 f3 g3 h3 */ 144.109 + 144.110 +#if ARCH_X86_64 144.111 +// permutes 01234567 -> 05736421 144.112 +#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ 144.113 + SBUTTERFLY(a,b,%%xmm8,wd,dqa)\ 144.114 + SBUTTERFLY(c,d,b,wd,dqa)\ 144.115 + SBUTTERFLY(e,f,d,wd,dqa)\ 144.116 + SBUTTERFLY(g,h,f,wd,dqa)\ 144.117 + SBUTTERFLY(a,c,h,dq,dqa)\ 144.118 + SBUTTERFLY(%%xmm8,b,c,dq,dqa)\ 144.119 + SBUTTERFLY(e,g,b,dq,dqa)\ 144.120 + SBUTTERFLY(d,f,g,dq,dqa)\ 144.121 + SBUTTERFLY(a,e,f,qdq,dqa)\ 144.122 + SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\ 144.123 + SBUTTERFLY(h,b,d,qdq,dqa)\ 144.124 + SBUTTERFLY(c,g,b,qdq,dqa)\ 144.125 + "movdqa %%xmm8, "#g" \n\t" 144.126 +#else 144.127 +#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ 144.128 + "movdqa "#h", "#t" \n\t"\ 144.129 + SBUTTERFLY(a,b,h,wd,dqa)\ 144.130 + "movdqa "#h", 16"#t" \n\t"\ 144.131 + "movdqa "#t", "#h" \n\t"\ 144.132 + SBUTTERFLY(c,d,b,wd,dqa)\ 144.133 + SBUTTERFLY(e,f,d,wd,dqa)\ 144.134 + SBUTTERFLY(g,h,f,wd,dqa)\ 144.135 + SBUTTERFLY(a,c,h,dq,dqa)\ 144.136 + "movdqa "#h", "#t" \n\t"\ 144.137 + "movdqa 16"#t", "#h" \n\t"\ 144.138 + SBUTTERFLY(h,b,c,dq,dqa)\ 144.139 + SBUTTERFLY(e,g,b,dq,dqa)\ 144.140 + SBUTTERFLY(d,f,g,dq,dqa)\ 144.141 + SBUTTERFLY(a,e,f,qdq,dqa)\ 144.142 + SBUTTERFLY(h,d,e,qdq,dqa)\ 144.143 + "movdqa "#h", 16"#t" \n\t"\ 144.144 + "movdqa "#t", "#h" \n\t"\ 144.145 + SBUTTERFLY(h,b,d,qdq,dqa)\ 144.146 + SBUTTERFLY(c,g,b,qdq,dqa)\ 144.147 + "movdqa 16"#t", "#g" \n\t" 144.148 +#endif 144.149 + 144.150 +#define MOVQ_WONE(regd) \ 144.151 + 
__asm__ volatile ( \ 144.152 + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ 144.153 + "psrlw $15, %%" #regd ::) 144.154 + 144.155 +void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); 144.156 +void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); 144.157 +void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); 144.158 + 144.159 +void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); 144.160 +void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); 144.161 +void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); 144.162 +void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); 144.163 + 144.164 +void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); 144.165 +void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd); 144.166 + 144.167 +void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag, 144.168 + double *autoc); 144.169 + 144.170 +void ff_mmx_idct(DCTELEM *block); 144.171 +void ff_mmxext_idct(DCTELEM *block); 144.172 + 144.173 +#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
145.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 145.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx_avg_template.c Mon Aug 27 12:09:56 2012 +0200 145.3 @@ -0,0 +1,250 @@ 145.4 +/* 145.5 + * DSP utils : average functions are compiled twice for 3dnow/mmx2 145.6 + * Copyright (c) 2000, 2001 Fabrice Bellard 145.7 + * Copyright (c) 2002-2004 Michael Niedermayer 145.8 + * 145.9 + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> 145.10 + * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> 145.11 + * and improved by Zdenek Kabelac <kabi@users.sf.net> 145.12 + * 145.13 + * This file is part of FFmpeg. 145.14 + * 145.15 + * FFmpeg is free software; you can redistribute it and/or 145.16 + * modify it under the terms of the GNU Lesser General Public 145.17 + * License as published by the Free Software Foundation; either 145.18 + * version 2.1 of the License, or (at your option) any later version. 145.19 + * 145.20 + * FFmpeg is distributed in the hope that it will be useful, 145.21 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 145.22 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 145.23 + * Lesser General Public License for more details. 
145.24 + * 145.25 + * You should have received a copy of the GNU Lesser General Public 145.26 + * License along with FFmpeg; if not, write to the Free Software 145.27 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 145.28 + */ 145.29 + 145.30 +static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 145.31 +{ 145.32 + __asm__ volatile( 145.33 + "testl $1, %0 \n\t" 145.34 + " jz 1f \n\t" 145.35 + "movq (%1), %%mm0 \n\t" 145.36 + "movq (%2), %%mm1 \n\t" 145.37 + "add %4, %1 \n\t" 145.38 + "add $8, %2 \n\t" 145.39 + PAVGB" %%mm1, %%mm0 \n\t" 145.40 + "movq %%mm0, (%3) \n\t" 145.41 + "add %5, %3 \n\t" 145.42 + "decl %0 \n\t" 145.43 + "1: \n\t" 145.44 + "movq (%1), %%mm0 \n\t" 145.45 + "add %4, %1 \n\t" 145.46 + "movq (%1), %%mm1 \n\t" 145.47 + "add %4, %1 \n\t" 145.48 + PAVGB" (%2), %%mm0 \n\t" 145.49 + PAVGB" 8(%2), %%mm1 \n\t" 145.50 + "movq %%mm0, (%3) \n\t" 145.51 + "add %5, %3 \n\t" 145.52 + "movq %%mm1, (%3) \n\t" 145.53 + "add %5, %3 \n\t" 145.54 + "movq (%1), %%mm0 \n\t" 145.55 + "add %4, %1 \n\t" 145.56 + "movq (%1), %%mm1 \n\t" 145.57 + "add %4, %1 \n\t" 145.58 + PAVGB" 16(%2), %%mm0 \n\t" 145.59 + PAVGB" 24(%2), %%mm1 \n\t" 145.60 + "movq %%mm0, (%3) \n\t" 145.61 + "add %5, %3 \n\t" 145.62 + "movq %%mm1, (%3) \n\t" 145.63 + "add %5, %3 \n\t" 145.64 + "add $32, %2 \n\t" 145.65 + "subl $4, %0 \n\t" 145.66 + "jnz 1b \n\t" 145.67 + 145.68 + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 145.69 + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 145.70 + :"memory"); 145.71 +//the following should be used, though better not with gcc ... 
145.72 +/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 145.73 + :"r"(src1Stride), "r"(dstStride) 145.74 + :"memory");*/ 145.75 +} 145.76 + 145.77 +static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 145.78 +{ 145.79 + __asm__ volatile( 145.80 + "testl $1, %0 \n\t" 145.81 + " jz 1f \n\t" 145.82 + "movq (%1), %%mm0 \n\t" 145.83 + "movq (%2), %%mm1 \n\t" 145.84 + "add %4, %1 \n\t" 145.85 + "add $8, %2 \n\t" 145.86 + PAVGB" %%mm1, %%mm0 \n\t" 145.87 + PAVGB" (%3), %%mm0 \n\t" 145.88 + "movq %%mm0, (%3) \n\t" 145.89 + "add %5, %3 \n\t" 145.90 + "decl %0 \n\t" 145.91 + "1: \n\t" 145.92 + "movq (%1), %%mm0 \n\t" 145.93 + "add %4, %1 \n\t" 145.94 + "movq (%1), %%mm1 \n\t" 145.95 + "add %4, %1 \n\t" 145.96 + PAVGB" (%2), %%mm0 \n\t" 145.97 + PAVGB" 8(%2), %%mm1 \n\t" 145.98 + PAVGB" (%3), %%mm0 \n\t" 145.99 + "movq %%mm0, (%3) \n\t" 145.100 + "add %5, %3 \n\t" 145.101 + PAVGB" (%3), %%mm1 \n\t" 145.102 + "movq %%mm1, (%3) \n\t" 145.103 + "add %5, %3 \n\t" 145.104 + "movq (%1), %%mm0 \n\t" 145.105 + "add %4, %1 \n\t" 145.106 + "movq (%1), %%mm1 \n\t" 145.107 + "add %4, %1 \n\t" 145.108 + PAVGB" 16(%2), %%mm0 \n\t" 145.109 + PAVGB" 24(%2), %%mm1 \n\t" 145.110 + PAVGB" (%3), %%mm0 \n\t" 145.111 + "movq %%mm0, (%3) \n\t" 145.112 + "add %5, %3 \n\t" 145.113 + PAVGB" (%3), %%mm1 \n\t" 145.114 + "movq %%mm1, (%3) \n\t" 145.115 + "add %5, %3 \n\t" 145.116 + "add $32, %2 \n\t" 145.117 + "subl $4, %0 \n\t" 145.118 + "jnz 1b \n\t" 145.119 + 145.120 + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 145.121 + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 145.122 + :"memory"); 145.123 +//the following should be used, though better not with gcc ... 
145.124 +/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 145.125 + :"r"(src1Stride), "r"(dstStride) 145.126 + :"memory");*/ 145.127 +} 145.128 + 145.129 + 145.130 +static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 145.131 +{ 145.132 + __asm__ volatile( 145.133 + "testl $1, %0 \n\t" 145.134 + " jz 1f \n\t" 145.135 + "movq (%1), %%mm0 \n\t" 145.136 + "movq 8(%1), %%mm1 \n\t" 145.137 + PAVGB" (%2), %%mm0 \n\t" 145.138 + PAVGB" 8(%2), %%mm1 \n\t" 145.139 + "add %4, %1 \n\t" 145.140 + "add $16, %2 \n\t" 145.141 + "movq %%mm0, (%3) \n\t" 145.142 + "movq %%mm1, 8(%3) \n\t" 145.143 + "add %5, %3 \n\t" 145.144 + "decl %0 \n\t" 145.145 + "1: \n\t" 145.146 + "movq (%1), %%mm0 \n\t" 145.147 + "movq 8(%1), %%mm1 \n\t" 145.148 + "add %4, %1 \n\t" 145.149 + PAVGB" (%2), %%mm0 \n\t" 145.150 + PAVGB" 8(%2), %%mm1 \n\t" 145.151 + "movq %%mm0, (%3) \n\t" 145.152 + "movq %%mm1, 8(%3) \n\t" 145.153 + "add %5, %3 \n\t" 145.154 + "movq (%1), %%mm0 \n\t" 145.155 + "movq 8(%1), %%mm1 \n\t" 145.156 + "add %4, %1 \n\t" 145.157 + PAVGB" 16(%2), %%mm0 \n\t" 145.158 + PAVGB" 24(%2), %%mm1 \n\t" 145.159 + "movq %%mm0, (%3) \n\t" 145.160 + "movq %%mm1, 8(%3) \n\t" 145.161 + "add %5, %3 \n\t" 145.162 + "add $32, %2 \n\t" 145.163 + "subl $2, %0 \n\t" 145.164 + "jnz 1b \n\t" 145.165 + 145.166 + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 145.167 + 145.168 + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 145.169 + :"memory"); 145.170 +//the following should be used, though better not with gcc ... 
145.171 +/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 145.172 + :"r"(src1Stride), "r"(dstStride) 145.173 + :"memory");*/ 145.174 +} 145.175 + 145.176 +static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 145.177 +{ 145.178 + __asm__ volatile( 145.179 + "testl $1, %0 \n\t" 145.180 + " jz 1f \n\t" 145.181 + "movq (%1), %%mm0 \n\t" 145.182 + "movq 8(%1), %%mm1 \n\t" 145.183 + PAVGB" (%2), %%mm0 \n\t" 145.184 + PAVGB" 8(%2), %%mm1 \n\t" 145.185 + "add %4, %1 \n\t" 145.186 + "add $16, %2 \n\t" 145.187 + PAVGB" (%3), %%mm0 \n\t" 145.188 + PAVGB" 8(%3), %%mm1 \n\t" 145.189 + "movq %%mm0, (%3) \n\t" 145.190 + "movq %%mm1, 8(%3) \n\t" 145.191 + "add %5, %3 \n\t" 145.192 + "decl %0 \n\t" 145.193 + "1: \n\t" 145.194 + "movq (%1), %%mm0 \n\t" 145.195 + "movq 8(%1), %%mm1 \n\t" 145.196 + "add %4, %1 \n\t" 145.197 + PAVGB" (%2), %%mm0 \n\t" 145.198 + PAVGB" 8(%2), %%mm1 \n\t" 145.199 + PAVGB" (%3), %%mm0 \n\t" 145.200 + PAVGB" 8(%3), %%mm1 \n\t" 145.201 + "movq %%mm0, (%3) \n\t" 145.202 + "movq %%mm1, 8(%3) \n\t" 145.203 + "add %5, %3 \n\t" 145.204 + "movq (%1), %%mm0 \n\t" 145.205 + "movq 8(%1), %%mm1 \n\t" 145.206 + "add %4, %1 \n\t" 145.207 + PAVGB" 16(%2), %%mm0 \n\t" 145.208 + PAVGB" 24(%2), %%mm1 \n\t" 145.209 + PAVGB" (%3), %%mm0 \n\t" 145.210 + PAVGB" 8(%3), %%mm1 \n\t" 145.211 + "movq %%mm0, (%3) \n\t" 145.212 + "movq %%mm1, 8(%3) \n\t" 145.213 + "add %5, %3 \n\t" 145.214 + "add $32, %2 \n\t" 145.215 + "subl $2, %0 \n\t" 145.216 + "jnz 1b \n\t" 145.217 + 145.218 + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) 145.219 + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) 145.220 + :"memory"); 145.221 +//the following should be used, though better not with gcc ... 
145.222 +/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) 145.223 + :"r"(src1Stride), "r"(dstStride) 145.224 + :"memory");*/ 145.225 +} 145.226 + 145.227 +static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) 145.228 +{ 145.229 + __asm__ volatile( 145.230 + "lea (%3, %3), %%"REG_a" \n\t" 145.231 + "1: \n\t" 145.232 + "movq (%2), %%mm0 \n\t" 145.233 + "movq (%2, %3), %%mm1 \n\t" 145.234 + PAVGB" (%1), %%mm0 \n\t" 145.235 + PAVGB" (%1, %3), %%mm1 \n\t" 145.236 + "movq %%mm0, (%2) \n\t" 145.237 + "movq %%mm1, (%2, %3) \n\t" 145.238 + "add %%"REG_a", %1 \n\t" 145.239 + "add %%"REG_a", %2 \n\t" 145.240 + "movq (%2), %%mm0 \n\t" 145.241 + "movq (%2, %3), %%mm1 \n\t" 145.242 + PAVGB" (%1), %%mm0 \n\t" 145.243 + PAVGB" (%1, %3), %%mm1 \n\t" 145.244 + "add %%"REG_a", %1 \n\t" 145.245 + "movq %%mm0, (%2) \n\t" 145.246 + "movq %%mm1, (%2, %3) \n\t" 145.247 + "add %%"REG_a", %2 \n\t" 145.248 + "subl $4, %0 \n\t" 145.249 + "jnz 1b \n\t" 145.250 + :"+g"(h), "+S"(pixels), "+D"(block) 145.251 + :"r" ((x86_reg)line_size) 145.252 + :"%"REG_a, "memory"); 145.253 +}
146.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 146.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/h264dsp_mmx.c Mon Aug 27 12:09:56 2012 +0200 146.3 @@ -0,0 +1,1741 @@ 146.4 +/* 146.5 + * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt 146.6 + * 146.7 + * This file is part of FFmpeg. 146.8 + * 146.9 + * FFmpeg is free software; you can redistribute it and/or 146.10 + * modify it under the terms of the GNU Lesser General Public 146.11 + * License as published by the Free Software Foundation; either 146.12 + * version 2.1 of the License, or (at your option) any later version. 146.13 + * 146.14 + * FFmpeg is distributed in the hope that it will be useful, 146.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 146.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 146.17 + * Lesser General Public License for more details. 146.18 + * 146.19 + * You should have received a copy of the GNU Lesser General Public 146.20 + * License along with FFmpeg; if not, write to the Free Software 146.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 146.22 + */ 146.23 + 146.24 +#include "dsputil_mmx.h" 146.25 + 146.26 +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; 146.27 +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; 146.28 + 146.29 +/***********************************/ 146.30 +/* IDCT */ 146.31 + 146.32 +#define SUMSUB_BADC( a, b, c, d ) \ 146.33 + "paddw "#b", "#a" \n\t"\ 146.34 + "paddw "#d", "#c" \n\t"\ 146.35 + "paddw "#b", "#b" \n\t"\ 146.36 + "paddw "#d", "#d" \n\t"\ 146.37 + "psubw "#a", "#b" \n\t"\ 146.38 + "psubw "#c", "#d" \n\t" 146.39 + 146.40 +#define SUMSUBD2_AB( a, b, t ) \ 146.41 + "movq "#b", "#t" \n\t"\ 146.42 + "psraw $1 , "#b" \n\t"\ 146.43 + "paddw "#a", "#b" \n\t"\ 146.44 + "psraw $1 , "#a" \n\t"\ 146.45 + "psubw "#t", "#a" \n\t" 146.46 + 146.47 +#define IDCT4_1D( s02, s13, d02, d13, t ) \ 146.48 + SUMSUB_BA ( s02, 
d02 )\ 146.49 + SUMSUBD2_AB( s13, d13, t )\ 146.50 + SUMSUB_BADC( d13, s02, s13, d02 ) 146.51 + 146.52 +#define STORE_DIFF_4P( p, t, z ) \ 146.53 + "psraw $6, "#p" \n\t"\ 146.54 + "movd (%0), "#t" \n\t"\ 146.55 + "punpcklbw "#z", "#t" \n\t"\ 146.56 + "paddsw "#t", "#p" \n\t"\ 146.57 + "packuswb "#z", "#p" \n\t"\ 146.58 + "movd "#p", (%0) \n\t" 146.59 + 146.60 +/* Adds the inverse 4x4 transform of block to the 4x4 pixels at dst. block is read as four movq rows (byte offsets 0, 8, 16, 24); ff_pw_32 is added between the two 1-D passes; STORE_DIFF_4P then shifts each row right by 6, adds it to the four pixels at dst with signed saturation, packs to unsigned bytes and stores, with dst stepping by stride. The three asm statements hand values to one another in mm0-mm7. */ static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) 146.61 +{ 146.62 + /* Load dct coeffs */ 146.63 + __asm__ volatile( 146.64 + "movq (%0), %%mm0 \n\t" 146.65 + "movq 8(%0), %%mm1 \n\t" 146.66 + "movq 16(%0), %%mm2 \n\t" 146.67 + "movq 24(%0), %%mm3 \n\t" 146.68 + :: "r"(block) : "memory" /* block[] is only reachable through the register operand, so earlier C stores to it must be visible here */ ); 146.69 + 146.70 + __asm__ volatile( 146.71 + /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ 146.72 + IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) 146.73 + 146.74 + "movq %0, %%mm6 \n\t" 146.75 + /* in: 1,4,0,2 out: 1,2,3,0 */ 146.76 + TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) 146.77 + 146.78 + "paddw %%mm6, %%mm3 \n\t" 146.79 + 146.80 + /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ 146.81 + IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) 146.82 + 146.83 + "pxor %%mm7, %%mm7 \n\t" 146.84 + :: "m"(ff_pw_32)); 146.85 + 146.86 + __asm__ volatile( 146.87 + STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) 146.88 + "add %1, %0 \n\t" 146.89 + STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) 146.90 + "add %1, %0 \n\t" 146.91 + STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) 146.92 + "add %1, %0 \n\t" 146.93 + STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) 146.94 + : "+r"(dst) 146.95 + : "r" ((x86_reg)stride) 146.96 + : "memory" /* the pixel rows are written through dst, which is passed as a register */ ); 146.97 +} 146.98 + 146.99 +/* One 1-D pass over four columns of an 8x8 block: loads the eight movq rows at byte offsets 0, 16, ..., 112 from block and leaves the eight result rows in mm0-mm7 (nothing is stored). */ static inline void h264_idct8_1d(int16_t *block) 146.100 +{ 146.101 + __asm__ volatile( 146.102 + "movq 112(%0), %%mm7 \n\t" 146.103 + "movq 80(%0), %%mm0 \n\t" 146.104 + "movq 48(%0), %%mm3 \n\t" 146.105 + "movq 16(%0), %%mm5 \n\t" 146.106 + 146.107 + "movq %%mm0, %%mm4 \n\t" 146.108 + "movq %%mm5, %%mm1 \n\t" 146.109 + "psraw $1, %%mm4 \n\t" 146.110 + "psraw $1, %%mm1 \n\t" 146.111 + "paddw %%mm0, 
%%mm4 \n\t" 146.112 + "paddw %%mm5, %%mm1 \n\t" 146.113 + "paddw %%mm7, %%mm4 \n\t" 146.114 + "paddw %%mm0, %%mm1 \n\t" 146.115 + "psubw %%mm5, %%mm4 \n\t" 146.116 + "paddw %%mm3, %%mm1 \n\t" 146.117 + 146.118 + "psubw %%mm3, %%mm5 \n\t" 146.119 + "psubw %%mm3, %%mm0 \n\t" 146.120 + "paddw %%mm7, %%mm5 \n\t" 146.121 + "psubw %%mm7, %%mm0 \n\t" 146.122 + "psraw $1, %%mm3 \n\t" 146.123 + "psraw $1, %%mm7 \n\t" 146.124 + "psubw %%mm3, %%mm5 \n\t" 146.125 + "psubw %%mm7, %%mm0 \n\t" 146.126 + 146.127 + "movq %%mm4, %%mm3 \n\t" 146.128 + "movq %%mm1, %%mm7 \n\t" 146.129 + "psraw $2, %%mm1 \n\t" 146.130 + "psraw $2, %%mm3 \n\t" 146.131 + "paddw %%mm5, %%mm3 \n\t" 146.132 + "psraw $2, %%mm5 \n\t" 146.133 + "paddw %%mm0, %%mm1 \n\t" 146.134 + "psraw $2, %%mm0 \n\t" 146.135 + "psubw %%mm4, %%mm5 \n\t" 146.136 + "psubw %%mm0, %%mm7 \n\t" 146.137 + 146.138 + "movq 32(%0), %%mm2 \n\t" 146.139 + "movq 96(%0), %%mm6 \n\t" 146.140 + "movq %%mm2, %%mm4 \n\t" 146.141 + "movq %%mm6, %%mm0 \n\t" 146.142 + "psraw $1, %%mm4 \n\t" 146.143 + "psraw $1, %%mm6 \n\t" 146.144 + "psubw %%mm0, %%mm4 \n\t" 146.145 + "paddw %%mm2, %%mm6 \n\t" 146.146 + 146.147 + "movq (%0), %%mm2 \n\t" 146.148 + "movq 64(%0), %%mm0 \n\t" 146.149 + SUMSUB_BA( %%mm0, %%mm2 ) 146.150 + SUMSUB_BA( %%mm6, %%mm0 ) 146.151 + SUMSUB_BA( %%mm4, %%mm2 ) 146.152 + SUMSUB_BA( %%mm7, %%mm6 ) 146.153 + SUMSUB_BA( %%mm5, %%mm4 ) 146.154 + SUMSUB_BA( %%mm3, %%mm2 ) 146.155 + SUMSUB_BA( %%mm1, %%mm0 ) 146.156 + :: "r"(block) 146.157 + : "memory" /* block[] is read through a register pointer only; the caller updates block[0] from C immediately before calling, so that store must not be moved past these loads */ ); 146.158 +} 146.159 + 146.160 +/* Adds the inverse 8x8 transform of block to dst. block[0] is biased by 32 in place (the caller's buffer is modified); each 4-column half goes through h264_idct8_1d and is transposed into the local b2, b2 then goes through h264_idct8_1d again, every value is shifted right by 6 and the result is handed to add_pixels_clamped_mmx(b2, dst, stride). */ static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) 146.161 +{ 146.162 + int i; 146.163 + DECLARE_ALIGNED(8, int16_t, b2)[64]; 146.164 + 146.165 + block[0] += 32; 146.166 + 146.167 + for(i=0; i<2; i++){ 146.168 + DECLARE_ALIGNED(8, uint64_t, tmp); 146.169 + 146.170 + h264_idct8_1d(block+4*i); 146.171 + 146.172 + __asm__ volatile( 146.173 + "movq %%mm7, %0 \n\t" 146.174 + TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) 146.175 + "movq %%mm0, 
8(%1) \n\t" 146.176 + "movq %%mm6, 24(%1) \n\t" 146.177 + "movq %%mm7, 40(%1) \n\t" 146.178 + "movq %%mm4, 56(%1) \n\t" 146.179 + "movq %0, %%mm7 \n\t" 146.180 + TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) 146.181 + "movq %%mm7, (%1) \n\t" 146.182 + "movq %%mm1, 16(%1) \n\t" 146.183 + "movq %%mm0, 32(%1) \n\t" 146.184 + "movq %%mm3, 48(%1) \n\t" 146.185 + : "=m"(tmp) 146.186 + : "r"(b2+32*i) 146.187 + : "memory" 146.188 + ); 146.189 + } 146.190 + 146.191 + for(i=0; i<2; i++){ 146.192 + h264_idct8_1d(b2+4*i); 146.193 + 146.194 + __asm__ volatile( 146.195 + "psraw $6, %%mm7 \n\t" 146.196 + "psraw $6, %%mm6 \n\t" 146.197 + "psraw $6, %%mm5 \n\t" 146.198 + "psraw $6, %%mm4 \n\t" 146.199 + "psraw $6, %%mm3 \n\t" 146.200 + "psraw $6, %%mm2 \n\t" 146.201 + "psraw $6, %%mm1 \n\t" 146.202 + "psraw $6, %%mm0 \n\t" 146.203 + 146.204 + "movq %%mm7, (%0) \n\t" 146.205 + "movq %%mm5, 16(%0) \n\t" 146.206 + "movq %%mm3, 32(%0) \n\t" 146.207 + "movq %%mm1, 48(%0) \n\t" 146.208 + "movq %%mm0, 64(%0) \n\t" 146.209 + "movq %%mm2, 80(%0) \n\t" 146.210 + "movq %%mm4, 96(%0) \n\t" 146.211 + "movq %%mm6, 112(%0) \n\t" 146.212 + :: "r"(b2+4*i) 146.213 + : "memory" 146.214 + ); 146.215 + } 146.216 + 146.217 + add_pixels_clamped_mmx(b2, dst, stride); 146.218 +} 146.219 + 146.220 +#define STORE_DIFF_8P( p, d, t, z )\ 146.221 + "movq "#d", "#t" \n"\ 146.222 + "psraw $6, "#p" \n"\ 146.223 + "punpcklbw "#z", "#t" \n"\ 146.224 + "paddsw "#t", "#p" \n"\ 146.225 + "packuswb "#p", "#p" \n"\ 146.226 + "movq "#p", "#d" \n" 146.227 + 146.228 +#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\ 146.229 + "movdqa "#c", "#a" \n"\ 146.230 + "movdqa "#g", "#e" \n"\ 146.231 + "psraw $1, "#c" \n"\ 146.232 + "psraw $1, "#g" \n"\ 146.233 + "psubw "#e", "#c" \n"\ 146.234 + "paddw "#a", "#g" \n"\ 146.235 + "movdqa "#b", "#e" \n"\ 146.236 + "psraw $1, "#e" \n"\ 146.237 + "paddw "#b", "#e" \n"\ 146.238 + "paddw "#d", "#e" \n"\ 146.239 + "paddw "#f", "#e" \n"\ 146.240 + "movdqa "#f", "#a" \n"\ 146.241 + "psraw $1, 
"#a" \n"\ 146.242 + "paddw "#f", "#a" \n"\ 146.243 + "paddw "#h", "#a" \n"\ 146.244 + "psubw "#b", "#a" \n"\ 146.245 + "psubw "#d", "#b" \n"\ 146.246 + "psubw "#d", "#f" \n"\ 146.247 + "paddw "#h", "#b" \n"\ 146.248 + "psubw "#h", "#f" \n"\ 146.249 + "psraw $1, "#d" \n"\ 146.250 + "psraw $1, "#h" \n"\ 146.251 + "psubw "#d", "#b" \n"\ 146.252 + "psubw "#h", "#f" \n"\ 146.253 + "movdqa "#e", "#d" \n"\ 146.254 + "movdqa "#a", "#h" \n"\ 146.255 + "psraw $2, "#d" \n"\ 146.256 + "psraw $2, "#h" \n"\ 146.257 + "paddw "#f", "#d" \n"\ 146.258 + "paddw "#b", "#h" \n"\ 146.259 + "psraw $2, "#f" \n"\ 146.260 + "psraw $2, "#b" \n"\ 146.261 + "psubw "#f", "#e" \n"\ 146.262 + "psubw "#a", "#b" \n"\ 146.263 + "movdqa 0x00(%1), "#a" \n"\ 146.264 + "movdqa 0x40(%1), "#f" \n"\ 146.265 + SUMSUB_BA(f, a)\ 146.266 + SUMSUB_BA(g, f)\ 146.267 + SUMSUB_BA(c, a)\ 146.268 + SUMSUB_BA(e, g)\ 146.269 + SUMSUB_BA(b, c)\ 146.270 + SUMSUB_BA(h, a)\ 146.271 + SUMSUB_BA(d, f) 146.272 + 146.273 +static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) 146.274 +{ 146.275 + __asm__ volatile( 146.276 + "movdqa 0x10(%1), %%xmm1 \n" 146.277 + "movdqa 0x20(%1), %%xmm2 \n" 146.278 + "movdqa 0x30(%1), %%xmm3 \n" 146.279 + "movdqa 0x50(%1), %%xmm5 \n" 146.280 + "movdqa 0x60(%1), %%xmm6 \n" 146.281 + "movdqa 0x70(%1), %%xmm7 \n" 146.282 + H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) 146.283 + TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) 146.284 + "paddw %4, %%xmm4 \n" 146.285 + "movdqa %%xmm4, 0x00(%1) \n" 146.286 + "movdqa %%xmm2, 0x40(%1) \n" 146.287 + H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) 146.288 + "movdqa %%xmm6, 0x60(%1) \n" 146.289 + "movdqa %%xmm7, 0x70(%1) \n" 146.290 + "pxor %%xmm7, %%xmm7 \n" 146.291 + STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7) 146.292 + STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7) 146.293 + STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7) 
146.294 + STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7) 146.295 + "lea (%0,%2,4), %0 \n" 146.296 + STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7) 146.297 + STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7) 146.298 + "movdqa 0x60(%1), %%xmm0 \n" 146.299 + "movdqa 0x70(%1), %%xmm1 \n" 146.300 + STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7) 146.301 + STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7) 146.302 + :"+r"(dst) 146.303 + :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) 146.304 + :"memory" /* block[] is used as scratch and the pixel rows are written through dst, all via register pointers */ ); 146.305 +} 146.306 + 146.307 +/* DC-only 4x4 case: dc = (block[0] + 32) >> 6 is broadcast as +dc in mm0 and -dc in mm1, each packed with unsigned saturation (so one of the two is all zero); every 4-pixel row then gets paddusb mm0 followed by psubusb mm1, i.e. a saturating add of a signed dc. */ static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) 146.308 +{ 146.309 + int dc = (block[0] + 32) >> 6; 146.310 + __asm__ volatile( 146.311 + "movd %0, %%mm0 \n\t" 146.312 + "pshufw $0, %%mm0, %%mm0 \n\t" 146.313 + "pxor %%mm1, %%mm1 \n\t" 146.314 + "psubw %%mm0, %%mm1 \n\t" 146.315 + "packuswb %%mm0, %%mm0 \n\t" 146.316 + "packuswb %%mm1, %%mm1 \n\t" 146.317 + ::"r"(dc) 146.318 + ); 146.319 + __asm__ volatile( 146.320 + "movd %0, %%mm2 \n\t" 146.321 + "movd %1, %%mm3 \n\t" 146.322 + "movd %2, %%mm4 \n\t" 146.323 + "movd %3, %%mm5 \n\t" 146.324 + "paddusb %%mm0, %%mm2 \n\t" 146.325 + "paddusb %%mm0, %%mm3 \n\t" 146.326 + "paddusb %%mm0, %%mm4 \n\t" 146.327 + "paddusb %%mm0, %%mm5 \n\t" 146.328 + "psubusb %%mm1, %%mm2 \n\t" 146.329 + "psubusb %%mm1, %%mm3 \n\t" 146.330 + "psubusb %%mm1, %%mm4 \n\t" 146.331 + "psubusb %%mm1, %%mm5 \n\t" 146.332 + "movd %%mm2, %0 \n\t" 146.333 + "movd %%mm3, %1 \n\t" 146.334 + "movd %%mm4, %2 \n\t" 146.335 + "movd %%mm5, %3 \n\t" 146.336 + :"+m"(*(uint32_t*)(dst+0*stride)), 146.337 + "+m"(*(uint32_t*)(dst+1*stride)), 146.338 + "+m"(*(uint32_t*)(dst+2*stride)), 146.339 + "+m"(*(uint32_t*)(dst+3*stride)) 146.340 + ); 146.341 +} 146.342 + 146.343 +/* Same DC-only update for an 8x8 block: two iterations of four 8-pixel rows. */ static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) 146.344 +{ 146.345 + int dc = (block[0] + 32) >> 6; 146.346 + int y; 146.347 + __asm__ volatile( 146.348 + "movd %0, %%mm0 \n\t" 146.349 + "pshufw $0, 
%%mm0, %%mm0 \n\t" 146.350 + "pxor %%mm1, %%mm1 \n\t" 146.351 + "psubw %%mm0, %%mm1 \n\t" 146.352 + "packuswb %%mm0, %%mm0 \n\t" 146.353 + "packuswb %%mm1, %%mm1 \n\t" 146.354 + ::"r"(dc) 146.355 + ); 146.356 + for(y=2; y--; dst += 4*stride){ 146.357 + __asm__ volatile( 146.358 + "movq %0, %%mm2 \n\t" 146.359 + "movq %1, %%mm3 \n\t" 146.360 + "movq %2, %%mm4 \n\t" 146.361 + "movq %3, %%mm5 \n\t" 146.362 + "paddusb %%mm0, %%mm2 \n\t" 146.363 + "paddusb %%mm0, %%mm3 \n\t" 146.364 + "paddusb %%mm0, %%mm4 \n\t" 146.365 + "paddusb %%mm0, %%mm5 \n\t" 146.366 + "psubusb %%mm1, %%mm2 \n\t" 146.367 + "psubusb %%mm1, %%mm3 \n\t" 146.368 + "psubusb %%mm1, %%mm4 \n\t" 146.369 + "psubusb %%mm1, %%mm5 \n\t" 146.370 + "movq %%mm2, %0 \n\t" 146.371 + "movq %%mm3, %1 \n\t" 146.372 + "movq %%mm4, %2 \n\t" 146.373 + "movq %%mm5, %3 \n\t" 146.374 + :"+m"(*(uint64_t*)(dst+0*stride)), 146.375 + "+m"(*(uint64_t*)(dst+1*stride)), 146.376 + "+m"(*(uint64_t*)(dst+2*stride)), 146.377 + "+m"(*(uint64_t*)(dst+3*stride)) 146.378 + ); 146.379 + } 146.380 +} 146.381 + 146.382 +//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split 146.383 +static const uint8_t scan8[16 + 2*4]={ 146.384 + 4+1*8, 5+1*8, 4+2*8, 5+2*8, 146.385 + 6+1*8, 7+1*8, 6+2*8, 7+2*8, 146.386 + 4+3*8, 5+3*8, 4+4*8, 5+4*8, 146.387 + 6+3*8, 7+3*8, 6+4*8, 7+4*8, 146.388 + 1+1*8, 2+1*8, 146.389 + 1+2*8, 2+2*8, 146.390 + 1+4*8, 2+4*8, 146.391 + 1+5*8, 2+5*8, 146.392 +}; 146.393 + 146.394 +static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 146.395 + int i; 146.396 + for(i=0; i<16; i++){ 146.397 + int nnz = nnzc[ scan8[i] ]; 146.398 + if(nnz){ 146.399 + if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 146.400 + else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); 146.401 + } 146.402 + } 146.403 +} 146.404 + 146.405 +static void 
ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 146.406 + int i; 146.407 + for(i=0; i<16; i++){ 146.408 + if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); 146.409 + else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 146.410 + } 146.411 +} 146.412 + 146.413 +static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 146.414 + int i; 146.415 + for(i=0; i<16; i+=4){ 146.416 + int nnz = nnzc[ scan8[i] ]; 146.417 + if(nnz){ 146.418 + if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 146.419 + else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride); 146.420 + } 146.421 + } 146.422 +} 146.423 + 146.424 +static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 146.425 + int i; 146.426 + for(i=0; i<16; i+=4){ 146.427 + int nnz = nnzc[ scan8[i] ]; 146.428 + if(nnz){ 146.429 + if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); 146.430 + else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride); 146.431 + } 146.432 + } 146.433 +} 146.434 + 146.435 +static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ 146.436 + int i; 146.437 + for(i=16; i<16+8; i++){ 146.438 + if(nnzc[ scan8[i] ]) 146.439 + ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 146.440 + else if(block[i*16]) 146.441 + ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); 146.442 + } 146.443 +} 146.444 + 146.445 +/***********************************/ 146.446 +/* deblocking */ 146.447 + 146.448 +// out: o = |x-y|>a 146.449 +// clobbers: t 146.450 +#define 
DIFF_GT_MMX(x,y,a,o,t)\ 146.451 + "movq "#y", "#t" \n\t"\ 146.452 + "movq "#x", "#o" \n\t"\ 146.453 + "psubusb "#x", "#t" \n\t"\ 146.454 + "psubusb "#y", "#o" \n\t"\ 146.455 + "por "#t", "#o" \n\t"\ 146.456 + "psubusb "#a", "#o" \n\t" 146.457 + 146.458 +// out: o = 0xFF in each byte where |x-y|<=a, 0 where |x-y|>a (the final pcmpeqb gives the opposite sense to DIFF_GT_MMX) 146.459 +// clobbers: t 146.460 +#define DIFF_GT2_MMX(x,y,a,o,t)\ 146.461 + "movq "#y", "#t" \n\t"\ 146.462 + "movq "#x", "#o" \n\t"\ 146.463 + "psubusb "#x", "#t" \n\t"\ 146.464 + "psubusb "#y", "#o" \n\t"\ 146.465 + "psubusb "#a", "#t" \n\t"\ 146.466 + "psubusb "#a", "#o" \n\t"\ 146.467 + "pcmpeqb "#t", "#o" \n\t"\ 146.468 + 146.469 +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 146.470 +// out: mm5=beta-1, mm7=mask 146.471 +// clobbers: mm4,mm6 146.472 +#define H264_DEBLOCK_MASK(alpha1, beta1) \ 146.473 + "pshufw $0, "#alpha1", %%mm4 \n\t"\ 146.474 + "pshufw $0, "#beta1 ", %%mm5 \n\t"\ 146.475 + "packuswb %%mm4, %%mm4 \n\t"\ 146.476 + "packuswb %%mm5, %%mm5 \n\t"\ 146.477 + DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ 146.478 + DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ 146.479 + "por %%mm4, %%mm7 \n\t"\ 146.480 + DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ 146.481 + "por %%mm4, %%mm7 \n\t"\ 146.482 + "pxor %%mm6, %%mm6 \n\t"\ 146.483 + "pcmpeqb %%mm6, %%mm7 \n\t" 146.484 + 146.485 +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) 146.486 +// out: mm1=p0' mm2=q0' 146.487 +// clobbers: mm0,3-6 (the pb_3f argument is accepted but never referenced by the expansion) 146.488 +#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ 146.489 + "movq %%mm1 , %%mm5 \n\t"\ 146.490 + "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ 146.491 + "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ 146.492 + "pcmpeqb %%mm4 , %%mm4 \n\t"\ 146.493 + "pxor %%mm4 , %%mm3 \n\t"\ 146.494 + "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ 146.495 + "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ 146.496 + "pxor %%mm1 , %%mm4 \n\t"\ 146.497 + "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ 146.498 + 
"pavgb %%mm5 , %%mm3 \n\t"\ 146.499 + "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ 146.500 + "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ 146.501 + "psubusb %%mm3 , %%mm6 \n\t"\ 146.502 + "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ 146.503 + "pminub %%mm7 , %%mm6 \n\t"\ 146.504 + "pminub %%mm7 , %%mm3 \n\t"\ 146.505 + "psubusb %%mm6 , %%mm1 \n\t"\ 146.506 + "psubusb %%mm3 , %%mm2 \n\t"\ 146.507 + "paddusb %%mm3 , %%mm1 \n\t"\ 146.508 + "paddusb %%mm6 , %%mm2 \n\t" 146.509 + 146.510 +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %9=ff_bone (the expansion hard-codes operand %9, so any asm statement using this macro must bind ff_bone as operand 9) 146.511 +// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) 146.512 +// clobbers: q2, tmp, tc0 146.513 +#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ 146.514 + "movq %%mm1, "#tmp" \n\t"\ 146.515 + "pavgb %%mm2, "#tmp" \n\t"\ 146.516 + "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ 146.517 + "pxor "q2addr", "#tmp" \n\t"\ 146.518 + "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ 146.519 + "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ 146.520 + "movq "#p1", "#tmp" \n\t"\ 146.521 + "psubusb "#tc0", "#tmp" \n\t"\ 146.522 + "paddusb "#p1", "#tc0" \n\t"\ 146.523 + "pmaxub "#tmp", "#q2" \n\t"\ 146.524 + "pminub "#tc0", "#q2" \n\t"\ 146.525 + "movq "#q2", "q1addr" \n\t" 146.526 + 146.527 +static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 146.528 +{ 146.529 + DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; 146.530 + 146.531 + __asm__ volatile( 146.532 + "movq (%2,%4), %%mm0 \n\t" //p1 146.533 + "movq (%2,%4,2), %%mm1 \n\t" //p0 146.534 + "movq (%3), %%mm2 \n\t" //q0 146.535 + "movq (%3,%4), %%mm3 \n\t" //q1 146.536 + H264_DEBLOCK_MASK(%7, %8) 146.537 + 146.538 + "movd %6, %%mm4 \n\t" 146.539 + "punpcklbw %%mm4, %%mm4 \n\t" 146.540 + "punpcklwd %%mm4, %%mm4 \n\t" 146.541 + "pcmpeqb %%mm3, %%mm3 \n\t" 146.542 + "movq %%mm4, %%mm6 \n\t" 146.543 + "pcmpgtb %%mm3, %%mm4 \n\t" 146.544 + "movq %%mm6, %1 \n\t" 146.545 + "pand %%mm4, %%mm7 \n\t" 146.546 + 
"movq %%mm7, %0 \n\t" 146.547 + 146.548 + /* filter p1 */ 146.549 + "movq (%2), %%mm3 \n\t" //p2 146.550 + DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // mm6 = byte mask of |p2-p0|<=beta-1 146.551 + "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta 146.552 + "pand %1, %%mm7 \n\t" // mask & tc0 146.553 + "movq %%mm7, %%mm4 \n\t" 146.554 + "psubb %%mm6, %%mm7 \n\t" 146.555 + "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 146.556 + H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4) 146.557 + 146.558 + /* filter q1 */ 146.559 + "movq (%3,%4,2), %%mm4 \n\t" //q2 146.560 + DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // mm6 = byte mask of |q2-q0|<=beta-1 146.561 + "pand %0, %%mm6 \n\t" 146.562 + "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower than 146.563 + "pand %%mm6, %%mm5 \n\t" 146.564 + "psubb %%mm6, %%mm7 \n\t" 146.565 + "movq (%3,%4), %%mm3 \n\t" 146.566 + H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6) 146.567 + 146.568 + /* filter p0, q0 */ 146.569 + H264_DEBLOCK_P0_Q0(%9, unused) 146.570 + "movq %%mm1, (%2,%4,2) \n\t" 146.571 + "movq %%mm2, (%3) \n\t" 146.572 + 146.573 + : "=m"(tmp0[0]), "=m"(tmp0[1]) 146.574 + : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), 146.575 + "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), 146.576 + "m"(ff_bone) 146.577 + : "memory" /* the p1, q1, p0 and q0 rows are stored through the register pointers %2 and %3, which the operand list does not describe as memory */ ); 146.578 +} 146.579 + 146.580 +/* Filters two 8-pixel halves (pix and pix+8) with alpha-1 and beta-1; a half is skipped when both of its tc0 entries are negative, since only then is the sign bit of their AND set. */ static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 146.581 +{ 146.582 + if((tc0[0] & tc0[1]) >= 0) 146.583 + h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); 146.584 + if((tc0[2] & tc0[3]) >= 0) 146.585 + h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); 146.586 +} 146.587 +/* Same filter in the other direction: each 8-row half is transposed into the local trans buffer, filtered there with stride 8, and the four columns starting at pix-2 are transposed back. */ static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 146.588 +{ 146.589 + //FIXME: could cut some load/stores by merging transpose with filter 146.590 + // also, it only needs to transpose 6x8 146.591 + DECLARE_ALIGNED(8, uint8_t, 
trans)[8*8]; 146.592 + int i; 146.593 + for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { 146.594 + if((tc0[0] & tc0[1]) < 0) 146.595 + continue; 146.596 + transpose4x4(trans, pix-4, 8, stride); 146.597 + transpose4x4(trans +4*8, pix, 8, stride); 146.598 + transpose4x4(trans+4, pix-4+4*stride, 8, stride); 146.599 + transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); 146.600 + h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); 146.601 + transpose4x4(pix-2, trans +2*8, stride, 8); 146.602 + transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8); 146.603 + } 146.604 +} 146.605 + 146.606 +/* Core filter for 8 pixels: loads the rows at pix-2*stride, pix-stride, pix and pix+stride as p1, p0, q0, q1, builds the alpha/beta mask, ANDs it with the four tc0 bytes (each duplicated once by punpcklbw), and rewrites only the p0 and q0 rows. */ static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 146.607 +{ 146.608 + __asm__ volatile( 146.609 + "movq (%0), %%mm0 \n\t" //p1 146.610 + "movq (%0,%2), %%mm1 \n\t" //p0 146.611 + "movq (%1), %%mm2 \n\t" //q0 146.612 + "movq (%1,%2), %%mm3 \n\t" //q1 146.613 + H264_DEBLOCK_MASK(%4, %5) 146.614 + "movd %3, %%mm6 \n\t" 146.615 + "punpcklbw %%mm6, %%mm6 \n\t" 146.616 + "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask 146.617 + H264_DEBLOCK_P0_Q0(%6, %7) 146.618 + "movq %%mm1, (%0,%2) \n\t" 146.619 + "movq %%mm2, (%1) \n\t" 146.620 + 146.621 + :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), 146.622 + "r"(*(uint32_t*)tc0), 146.623 + "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) 146.624 + : "memory" /* the p0 and q0 rows are stored through the register pointers %0 and %1 */ ); 146.625 +} 146.626 + 146.627 +static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 146.628 +{ 146.629 + h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); 146.630 +} 146.631 + 146.632 +static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) 146.633 +{ 146.634 + //FIXME: could cut some load/stores by merging transpose with filter 146.635 + DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; 146.636 + transpose4x4(trans, pix-2, 8, stride); 146.637 + transpose4x4(trans+4, pix-2+4*stride, 8, stride); 146.638 + 
h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); 146.639 + transpose4x4(pix-2, trans, stride, 8); 146.640 + transpose4x4(pix-2+4*stride, trans+4, stride, 8); 146.641 +} 146.642 + 146.643 +// p0 = (p0 + q1 + 2*p1 + 2) >> 2 146.644 +#define H264_FILTER_CHROMA4(p0, p1, q1, one) \ 146.645 + "movq "#p0", %%mm4 \n\t"\ 146.646 + "pxor "#q1", %%mm4 \n\t"\ 146.647 + "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ 146.648 + "pavgb "#q1", "#p0" \n\t"\ 146.649 + "psubusb %%mm4, "#p0" \n\t"\ 146.650 + "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ 146.651 + 146.652 +/* Intra variant of the 8-pixel core filter: there is no tc0; p0' and q0' come from H264_FILTER_CHROMA4 and replace p0/q0 only in the bytes selected by the alpha/beta mask in mm7 (take the difference, AND with the mask, add it back). */ static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) 146.653 +{ 146.654 + __asm__ volatile( 146.655 + "movq (%0), %%mm0 \n\t" 146.656 + "movq (%0,%2), %%mm1 \n\t" 146.657 + "movq (%1), %%mm2 \n\t" 146.658 + "movq (%1,%2), %%mm3 \n\t" 146.659 + H264_DEBLOCK_MASK(%3, %4) 146.660 + "movq %%mm1, %%mm5 \n\t" 146.661 + "movq %%mm2, %%mm6 \n\t" 146.662 + H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' 146.663 + H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' 146.664 + "psubb %%mm5, %%mm1 \n\t" 146.665 + "psubb %%mm6, %%mm2 \n\t" 146.666 + "pand %%mm7, %%mm1 \n\t" 146.667 + "pand %%mm7, %%mm2 \n\t" 146.668 + "paddb %%mm5, %%mm1 \n\t" 146.669 + "paddb %%mm6, %%mm2 \n\t" 146.670 + "movq %%mm1, (%0,%2) \n\t" 146.671 + "movq %%mm2, (%1) \n\t" 146.672 + :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), 146.673 + "m"(alpha1), "m"(beta1), "m"(ff_bone) 146.674 + : "memory" /* the p0 and q0 rows are stored through the register pointers %0 and %1 */ ); 146.675 +} 146.676 + 146.677 +static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) 146.678 +{ 146.679 + h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); 146.680 +} 146.681 + 146.682 +static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) 146.683 +{ 146.684 + //FIXME: could cut some load/stores by merging transpose with filter 146.685 + DECLARE_ALIGNED(8, uint8_t, 
trans)[8*4]; 146.686 + transpose4x4(trans, pix-2, 8, stride); 146.687 + transpose4x4(trans+4, pix-2+4*stride, 8, stride); 146.688 + h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); 146.689 + transpose4x4(pix-2, trans, stride, 8); 146.690 + transpose4x4(pix-2+4*stride, trans+4, stride, 8); 146.691 +} 146.692 + 146.693 +static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], 146.694 + int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { 146.695 + int dir; 146.696 + __asm__ volatile( 146.697 + "movq %0, %%mm7 \n" 146.698 + "movq %1, %%mm6 \n" 146.699 + ::"m"(ff_pb_1), "m"(ff_pb_3) 146.700 + ); 146.701 + if(field) 146.702 + __asm__ volatile( 146.703 + "movq %0, %%mm6 \n" 146.704 + ::"m"(ff_pb_3_1) 146.705 + ); 146.706 + __asm__ volatile( 146.707 + "movq %%mm6, %%mm5 \n" 146.708 + "paddb %%mm5, %%mm5 \n" 146.709 + :); 146.710 + 146.711 + // could do a special case for dir==0 && edges==1, but it only reduces the 146.712 + // average filter time by 1.2% 146.713 + for( dir=1; dir>=0; dir-- ) { 146.714 + const x86_reg d_idx = dir ? -8 : -1; 146.715 + const int mask_mv = dir ? mask_mv1 : mask_mv0; 146.716 + DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 
0 : 0xffffffffffffffffULL; 146.717 + int b_idx, edge; 146.718 + for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { 146.719 + __asm__ volatile( 146.720 + "pand %0, %%mm0 \n\t" 146.721 + ::"m"(mask_dir) 146.722 + ); 146.723 + if(!(mask_mv & edge)) { 146.724 + if(bidir) { 146.725 + __asm__ volatile( 146.726 + "movd (%1,%0), %%mm2 \n" 146.727 + "punpckldq 40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] } 146.728 + "pshufw $0x44, (%1), %%mm0 \n" // { ref0[b], ref0[b] } 146.729 + "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] } 146.730 + "pshufw $0x4E, %%mm2, %%mm3 \n" 146.731 + "psubb %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } 146.732 + "psubb %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } 146.733 + "1: \n" 146.734 + "por %%mm1, %%mm0 \n" 146.735 + "movq (%2,%0,4), %%mm1 \n" 146.736 + "movq 8(%2,%0,4), %%mm2 \n" 146.737 + "movq %%mm1, %%mm3 \n" 146.738 + "movq %%mm2, %%mm4 \n" 146.739 + "psubw (%2), %%mm1 \n" 146.740 + "psubw 8(%2), %%mm2 \n" 146.741 + "psubw 160(%2), %%mm3 \n" 146.742 + "psubw 168(%2), %%mm4 \n" 146.743 + "packsswb %%mm2, %%mm1 \n" 146.744 + "packsswb %%mm4, %%mm3 \n" 146.745 + "paddb %%mm6, %%mm1 \n" 146.746 + "paddb %%mm6, %%mm3 \n" 146.747 + "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit 146.748 + "psubusb %%mm5, %%mm3 \n" 146.749 + "packsswb %%mm3, %%mm1 \n" 146.750 + "add $40, %0 \n" 146.751 + "cmp $40, %0 \n" 146.752 + "jl 1b \n" 146.753 + "sub $80, %0 \n" 146.754 + "pshufw $0x4E, %%mm1, %%mm1 \n" 146.755 + "por %%mm1, %%mm0 \n" 146.756 + "pshufw $0x4E, %%mm0, %%mm1 \n" 146.757 + "pminub %%mm1, %%mm0 \n" 146.758 + ::"r"(d_idx), 146.759 + "r"(ref[0]+b_idx), 146.760 + "r"(mv[0]+b_idx) 146.761 + ); 146.762 + } else { 146.763 + __asm__ volatile( 146.764 + "movd (%1), %%mm0 \n" 146.765 + "psubb (%1,%0), %%mm0 \n" // ref[b] != ref[bn] 146.766 + "movq (%2), %%mm1 \n" 146.767 + "movq 8(%2), %%mm2 \n" 146.768 + "psubw (%2,%0,4), %%mm1 \n" 146.769 + "psubw 8(%2,%0,4), %%mm2 \n" 146.770 + 
"packsswb %%mm2, %%mm1 \n" 146.771 + "paddb %%mm6, %%mm1 \n" 146.772 + "psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit 146.773 + "packsswb %%mm1, %%mm1 \n" 146.774 + "por %%mm1, %%mm0 \n" 146.775 + ::"r"(d_idx), 146.776 + "r"(ref[0]+b_idx), 146.777 + "r"(mv[0]+b_idx) 146.778 + ); 146.779 + } 146.780 + } 146.781 + __asm__ volatile( 146.782 + "movd %0, %%mm1 \n" 146.783 + "por %1, %%mm1 \n" // nnz[b] || nnz[bn] 146.784 + ::"m"(nnz[b_idx]), 146.785 + "m"(nnz[b_idx+d_idx]) 146.786 + ); 146.787 + __asm__ volatile( 146.788 + "pminub %%mm7, %%mm1 \n" 146.789 + "pminub %%mm7, %%mm0 \n" 146.790 + "psllw $1, %%mm1 \n" 146.791 + "pxor %%mm2, %%mm2 \n" 146.792 + "pmaxub %%mm0, %%mm1 \n" 146.793 + "punpcklbw %%mm2, %%mm1 \n" 146.794 + "movq %%mm1, %0 \n" 146.795 + :"=m"(*bS[dir][edge]) 146.796 + ::"memory" 146.797 + ); 146.798 + } 146.799 + edges = 4; 146.800 + step = 1; 146.801 + } 146.802 + __asm__ volatile( 146.803 + "movq (%0), %%mm0 \n\t" 146.804 + "movq 8(%0), %%mm1 \n\t" 146.805 + "movq 16(%0), %%mm2 \n\t" 146.806 + "movq 24(%0), %%mm3 \n\t" 146.807 + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) 146.808 + "movq %%mm0, (%0) \n\t" 146.809 + "movq %%mm3, 8(%0) \n\t" 146.810 + "movq %%mm4, 16(%0) \n\t" 146.811 + "movq %%mm2, 24(%0) \n\t" 146.812 + ::"r"(bS[0]) 146.813 + :"memory" 146.814 + ); 146.815 +} 146.816 + 146.817 +/***********************************/ 146.818 +/* motion compensation */ 146.819 + 146.820 +#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ 146.821 + "mov"#q" "#C", "#T" \n\t"\ 146.822 + "mov"#d" (%0), "#F" \n\t"\ 146.823 + "paddw "#D", "#T" \n\t"\ 146.824 + "psllw $2, "#T" \n\t"\ 146.825 + "psubw "#B", "#T" \n\t"\ 146.826 + "psubw "#E", "#T" \n\t"\ 146.827 + "punpcklbw "#Z", "#F" \n\t"\ 146.828 + "pmullw %4, "#T" \n\t"\ 146.829 + "paddw %5, "#A" \n\t"\ 146.830 + "add %2, %0 \n\t"\ 146.831 + "paddw "#F", "#A" \n\t"\ 146.832 + "paddw "#A", "#T" \n\t"\ 146.833 + "psraw $5, "#T" \n\t"\ 146.834 + "packuswb "#T", "#T" \n\t"\ 146.835 + OP(T, (%1), A, 
d)\ 146.836 + "add %3, %1 \n\t" 146.837 + 146.838 +#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ 146.839 + "mov"#q" "#C", "#T" \n\t"\ 146.840 + "mov"#d" (%0), "#F" \n\t"\ 146.841 + "paddw "#D", "#T" \n\t"\ 146.842 + "psllw $2, "#T" \n\t"\ 146.843 + "paddw %4, "#A" \n\t"\ 146.844 + "psubw "#B", "#T" \n\t"\ 146.845 + "psubw "#E", "#T" \n\t"\ 146.846 + "punpcklbw "#Z", "#F" \n\t"\ 146.847 + "pmullw %3, "#T" \n\t"\ 146.848 + "paddw "#F", "#A" \n\t"\ 146.849 + "add %2, %0 \n\t"\ 146.850 + "paddw "#A", "#T" \n\t"\ 146.851 + "mov"#q" "#T", "#OF"(%1) \n\t" 146.852 + 146.853 +#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) 146.854 +#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) 146.855 +#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) 146.856 +#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) 146.857 + 146.858 + 146.859 +#define QPEL_H264(OPNAME, OP, MMX)\ 146.860 +\ 146.861 +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 146.862 + int h=8;\ 146.863 + __asm__ volatile(\ 146.864 + "pxor %%mm7, %%mm7 \n\t"\ 146.865 + "movq %0, %%mm6 \n\t"\ 146.866 + :: "m"(ff_pw_5)\ 146.867 + );\ 146.868 + do{\ 146.869 + __asm__ volatile(\ 146.870 + "movq (%0), %%mm0 \n\t"\ 146.871 + "movq 1(%0), %%mm2 \n\t"\ 146.872 + "movq %%mm0, %%mm1 \n\t"\ 146.873 + "movq %%mm2, %%mm3 \n\t"\ 146.874 + "punpcklbw %%mm7, %%mm0 \n\t"\ 146.875 + "punpckhbw %%mm7, %%mm1 \n\t"\ 146.876 + "punpcklbw %%mm7, %%mm2 \n\t"\ 146.877 + "punpckhbw %%mm7, %%mm3 \n\t"\ 146.878 + "paddw %%mm2, %%mm0 \n\t"\ 146.879 + "paddw %%mm3, %%mm1 \n\t"\ 146.880 + "psllw $2, %%mm0 \n\t"\ 146.881 + "psllw $2, %%mm1 \n\t"\ 146.882 + "movq -1(%0), %%mm2 \n\t"\ 146.883 + "movq 2(%0), %%mm4 \n\t"\ 146.884 + "movq %%mm2, %%mm3 \n\t"\ 146.885 + "movq %%mm4, %%mm5 \n\t"\ 146.886 + "punpcklbw %%mm7, 
%%mm2 \n\t"\ 146.887 + "punpckhbw %%mm7, %%mm3 \n\t"\ 146.888 + "punpcklbw %%mm7, %%mm4 \n\t"\ 146.889 + "punpckhbw %%mm7, %%mm5 \n\t"\ 146.890 + "paddw %%mm4, %%mm2 \n\t"\ 146.891 + "paddw %%mm3, %%mm5 \n\t"\ 146.892 + "psubw %%mm2, %%mm0 \n\t"\ 146.893 + "psubw %%mm5, %%mm1 \n\t"\ 146.894 + "pmullw %%mm6, %%mm0 \n\t"\ 146.895 + "pmullw %%mm6, %%mm1 \n\t"\ 146.896 + "movd -2(%0), %%mm2 \n\t"\ 146.897 + "movd 7(%0), %%mm5 \n\t"\ 146.898 + "punpcklbw %%mm7, %%mm2 \n\t"\ 146.899 + "punpcklbw %%mm7, %%mm5 \n\t"\ 146.900 + "paddw %%mm3, %%mm2 \n\t"\ 146.901 + "paddw %%mm5, %%mm4 \n\t"\ 146.902 + "movq %5, %%mm5 \n\t"\ 146.903 + "paddw %%mm5, %%mm2 \n\t"\ 146.904 + "paddw %%mm5, %%mm4 \n\t"\ 146.905 + "paddw %%mm2, %%mm0 \n\t"\ 146.906 + "paddw %%mm4, %%mm1 \n\t"\ 146.907 + "psraw $5, %%mm0 \n\t"\ 146.908 + "psraw $5, %%mm1 \n\t"\ 146.909 + "movq (%2), %%mm4 \n\t"\ 146.910 + "packuswb %%mm1, %%mm0 \n\t"\ 146.911 + PAVGB" %%mm4, %%mm0 \n\t"\ 146.912 + OP(%%mm0, (%1),%%mm5, q)\ 146.913 + "add %4, %0 \n\t"\ 146.914 + "add %4, %1 \n\t"\ 146.915 + "add %3, %2 \n\t"\ 146.916 + : "+a"(src), "+c"(dst), "+d"(src2)\ 146.917 + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ 146.918 + "m"(ff_pw_16)\ 146.919 + : "memory"\ 146.920 + );\ 146.921 + }while(--h);\ 146.922 +}\ 146.923 +\ 146.924 +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 146.925 + int w = size>>4;\ 146.926 + do{\ 146.927 + int h = size;\ 146.928 + __asm__ volatile(\ 146.929 + "1: \n\t"\ 146.930 + "movq (%0), %%mm0 \n\t"\ 146.931 + "movq 8(%0), %%mm3 \n\t"\ 146.932 + "movq 2(%0), %%mm1 \n\t"\ 146.933 + "movq 10(%0), %%mm4 \n\t"\ 146.934 + "paddw %%mm4, %%mm0 \n\t"\ 146.935 + "paddw %%mm3, %%mm1 \n\t"\ 146.936 + "paddw 18(%0), %%mm3 \n\t"\ 146.937 + "paddw 16(%0), %%mm4 \n\t"\ 146.938 + "movq 4(%0), %%mm2 \n\t"\ 146.939 + "movq 12(%0), %%mm5 \n\t"\ 146.940 + "paddw 6(%0), %%mm2 \n\t"\ 146.941 + "paddw 14(%0), %%mm5 
\n\t"\ 146.942 + "psubw %%mm1, %%mm0 \n\t"\ 146.943 + "psubw %%mm4, %%mm3 \n\t"\ 146.944 + "psraw $2, %%mm0 \n\t"\ 146.945 + "psraw $2, %%mm3 \n\t"\ 146.946 + "psubw %%mm1, %%mm0 \n\t"\ 146.947 + "psubw %%mm4, %%mm3 \n\t"\ 146.948 + "paddsw %%mm2, %%mm0 \n\t"\ 146.949 + "paddsw %%mm5, %%mm3 \n\t"\ 146.950 + "psraw $2, %%mm0 \n\t"\ 146.951 + "psraw $2, %%mm3 \n\t"\ 146.952 + "paddw %%mm2, %%mm0 \n\t"\ 146.953 + "paddw %%mm5, %%mm3 \n\t"\ 146.954 + "psraw $6, %%mm0 \n\t"\ 146.955 + "psraw $6, %%mm3 \n\t"\ 146.956 + "packuswb %%mm3, %%mm0 \n\t"\ 146.957 + OP(%%mm0, (%1),%%mm7, q)\ 146.958 + "add $48, %0 \n\t"\ 146.959 + "add %3, %1 \n\t"\ 146.960 + "decl %2 \n\t"\ 146.961 + " jnz 1b \n\t"\ 146.962 + : "+a"(tmp), "+c"(dst), "+g"(h)\ 146.963 + : "S"((x86_reg)dstStride)\ 146.964 + : "memory"\ 146.965 + );\ 146.966 + tmp += 8 - size*24;\ 146.967 + dst += 8 - size*dstStride;\ 146.968 + }while(w--);\ 146.969 +}\ 146.970 +\ 146.971 +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 146.972 + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 146.973 + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 146.974 + src += 8*dstStride;\ 146.975 + dst += 8*dstStride;\ 146.976 + src2 += 8*src2Stride;\ 146.977 + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 146.978 + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 146.979 +}\ 146.980 +static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 146.981 +{\ 146.982 + do{\ 146.983 + __asm__ volatile(\ 146.984 + "movq (%1), %%mm0 \n\t"\ 146.985 + "movq 8(%1), %%mm1 \n\t"\ 146.986 + "movq 48(%1), %%mm2 \n\t"\ 146.987 + "movq 8+48(%1), %%mm3 \n\t"\ 146.988 + "psraw $5, %%mm0 \n\t"\ 146.989 + "psraw $5, %%mm1 
\n\t"\ 146.990 + "psraw $5, %%mm2 \n\t"\ 146.991 + "psraw $5, %%mm3 \n\t"\ 146.992 + "packuswb %%mm1, %%mm0 \n\t"\ 146.993 + "packuswb %%mm3, %%mm2 \n\t"\ 146.994 + PAVGB" (%0), %%mm0 \n\t"\ 146.995 + PAVGB" (%0,%3), %%mm2 \n\t"\ 146.996 + OP(%%mm0, (%2), %%mm5, q)\ 146.997 + OP(%%mm2, (%2,%4), %%mm5, q)\ 146.998 + ::"a"(src8), "c"(src16), "d"(dst),\ 146.999 + "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ 146.1000 + :"memory");\ 146.1001 + src8 += 2L*src8Stride;\ 146.1002 + src16 += 48;\ 146.1003 + dst += 2L*dstStride;\ 146.1004 + }while(h-=2);\ 146.1005 +}\ 146.1006 +static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 146.1007 +{\ 146.1008 + OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ 146.1009 + OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ 146.1010 +}\ 146.1011 + 146.1012 + 146.1013 +#if ARCH_X86_64 146.1014 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 146.1015 +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 146.1016 + int h=16;\ 146.1017 + __asm__ volatile(\ 146.1018 + "pxor %%xmm15, %%xmm15 \n\t"\ 146.1019 + "movdqa %6, %%xmm14 \n\t"\ 146.1020 + "movdqa %7, %%xmm13 \n\t"\ 146.1021 + "1: \n\t"\ 146.1022 + "lddqu 6(%0), %%xmm1 \n\t"\ 146.1023 + "lddqu -2(%0), %%xmm7 \n\t"\ 146.1024 + "movdqa %%xmm1, %%xmm0 \n\t"\ 146.1025 + "punpckhbw %%xmm15, %%xmm1 \n\t"\ 146.1026 + "punpcklbw %%xmm15, %%xmm0 \n\t"\ 146.1027 + "punpcklbw %%xmm15, %%xmm7 \n\t"\ 146.1028 + "movdqa %%xmm1, %%xmm2 \n\t"\ 146.1029 + "movdqa %%xmm0, %%xmm6 \n\t"\ 146.1030 + "movdqa %%xmm1, %%xmm3 \n\t"\ 146.1031 + "movdqa %%xmm0, %%xmm8 \n\t"\ 146.1032 + "movdqa %%xmm1, %%xmm4 \n\t"\ 146.1033 + "movdqa %%xmm0, %%xmm9 \n\t"\ 146.1034 + "movdqa %%xmm0, %%xmm12 \n\t"\ 146.1035 + "movdqa %%xmm1, %%xmm11 \n\t"\ 146.1036 + "palignr $10,%%xmm0, 
%%xmm11\n\t"\ 146.1037 + "palignr $10,%%xmm7, %%xmm12\n\t"\ 146.1038 + "palignr $2, %%xmm0, %%xmm4 \n\t"\ 146.1039 + "palignr $2, %%xmm7, %%xmm9 \n\t"\ 146.1040 + "palignr $4, %%xmm0, %%xmm3 \n\t"\ 146.1041 + "palignr $4, %%xmm7, %%xmm8 \n\t"\ 146.1042 + "palignr $6, %%xmm0, %%xmm2 \n\t"\ 146.1043 + "palignr $6, %%xmm7, %%xmm6 \n\t"\ 146.1044 + "paddw %%xmm0 ,%%xmm11 \n\t"\ 146.1045 + "palignr $8, %%xmm0, %%xmm1 \n\t"\ 146.1046 + "palignr $8, %%xmm7, %%xmm0 \n\t"\ 146.1047 + "paddw %%xmm12,%%xmm7 \n\t"\ 146.1048 + "paddw %%xmm3, %%xmm2 \n\t"\ 146.1049 + "paddw %%xmm8, %%xmm6 \n\t"\ 146.1050 + "paddw %%xmm4, %%xmm1 \n\t"\ 146.1051 + "paddw %%xmm9, %%xmm0 \n\t"\ 146.1052 + "psllw $2, %%xmm2 \n\t"\ 146.1053 + "psllw $2, %%xmm6 \n\t"\ 146.1054 + "psubw %%xmm1, %%xmm2 \n\t"\ 146.1055 + "psubw %%xmm0, %%xmm6 \n\t"\ 146.1056 + "paddw %%xmm13,%%xmm11 \n\t"\ 146.1057 + "paddw %%xmm13,%%xmm7 \n\t"\ 146.1058 + "pmullw %%xmm14,%%xmm2 \n\t"\ 146.1059 + "pmullw %%xmm14,%%xmm6 \n\t"\ 146.1060 + "lddqu (%2), %%xmm3 \n\t"\ 146.1061 + "paddw %%xmm11,%%xmm2 \n\t"\ 146.1062 + "paddw %%xmm7, %%xmm6 \n\t"\ 146.1063 + "psraw $5, %%xmm2 \n\t"\ 146.1064 + "psraw $5, %%xmm6 \n\t"\ 146.1065 + "packuswb %%xmm2,%%xmm6 \n\t"\ 146.1066 + "pavgb %%xmm3, %%xmm6 \n\t"\ 146.1067 + OP(%%xmm6, (%1), %%xmm4, dqa)\ 146.1068 + "add %5, %0 \n\t"\ 146.1069 + "add %5, %1 \n\t"\ 146.1070 + "add %4, %2 \n\t"\ 146.1071 + "decl %3 \n\t"\ 146.1072 + "jg 1b \n\t"\ 146.1073 + : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ 146.1074 + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ 146.1075 + "m"(ff_pw_5), "m"(ff_pw_16)\ 146.1076 + : "memory"\ 146.1077 + );\ 146.1078 +} 146.1079 +#else // ARCH_X86_64 146.1080 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 146.1081 +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 146.1082 + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 146.1083 + 
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 146.1084 + src += 8*dstStride;\ 146.1085 + dst += 8*dstStride;\ 146.1086 + src2 += 8*src2Stride;\ 146.1087 + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 146.1088 + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 146.1089 +} 146.1090 +#endif // ARCH_X86_64 146.1091 + 146.1092 +#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ 146.1093 +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 146.1094 + int h=8;\ 146.1095 + __asm__ volatile(\ 146.1096 + "pxor %%xmm7, %%xmm7 \n\t"\ 146.1097 + "movdqa %0, %%xmm6 \n\t"\ 146.1098 + :: "m"(ff_pw_5)\ 146.1099 + );\ 146.1100 + do{\ 146.1101 + __asm__ volatile(\ 146.1102 + "lddqu -2(%0), %%xmm1 \n\t"\ 146.1103 + "movdqa %%xmm1, %%xmm0 \n\t"\ 146.1104 + "punpckhbw %%xmm7, %%xmm1 \n\t"\ 146.1105 + "punpcklbw %%xmm7, %%xmm0 \n\t"\ 146.1106 + "movdqa %%xmm1, %%xmm2 \n\t"\ 146.1107 + "movdqa %%xmm1, %%xmm3 \n\t"\ 146.1108 + "movdqa %%xmm1, %%xmm4 \n\t"\ 146.1109 + "movdqa %%xmm1, %%xmm5 \n\t"\ 146.1110 + "palignr $2, %%xmm0, %%xmm4 \n\t"\ 146.1111 + "palignr $4, %%xmm0, %%xmm3 \n\t"\ 146.1112 + "palignr $6, %%xmm0, %%xmm2 \n\t"\ 146.1113 + "palignr $8, %%xmm0, %%xmm1 \n\t"\ 146.1114 + "palignr $10,%%xmm0, %%xmm5 \n\t"\ 146.1115 + "paddw %%xmm5, %%xmm0 \n\t"\ 146.1116 + "paddw %%xmm3, %%xmm2 \n\t"\ 146.1117 + "paddw %%xmm4, %%xmm1 \n\t"\ 146.1118 + "psllw $2, %%xmm2 \n\t"\ 146.1119 + "movq (%2), %%xmm3 \n\t"\ 146.1120 + "psubw %%xmm1, %%xmm2 \n\t"\ 146.1121 + "paddw %5, %%xmm0 \n\t"\ 146.1122 + "pmullw %%xmm6, %%xmm2 \n\t"\ 146.1123 + "paddw %%xmm0, %%xmm2 \n\t"\ 146.1124 + "psraw $5, %%xmm2 \n\t"\ 146.1125 + "packuswb %%xmm2, %%xmm2 \n\t"\ 146.1126 + "pavgb %%xmm3, %%xmm2 \n\t"\ 146.1127 + OP(%%xmm2, (%1), %%xmm4, q)\ 146.1128 + "add %4, %0 \n\t"\ 146.1129 + "add %4, %1 \n\t"\ 146.1130 
+ "add %3, %2 \n\t"\ 146.1131 + : "+a"(src), "+c"(dst), "+d"(src2)\ 146.1132 + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ 146.1133 + "m"(ff_pw_16)\ 146.1134 + : "memory"\ 146.1135 + );\ 146.1136 + }while(--h);\ 146.1137 +}\ 146.1138 +QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 146.1139 +\ 146.1140 +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 146.1141 + int h=8;\ 146.1142 + __asm__ volatile(\ 146.1143 + "pxor %%xmm7, %%xmm7 \n\t"\ 146.1144 + "movdqa %5, %%xmm6 \n\t"\ 146.1145 + "1: \n\t"\ 146.1146 + "lddqu -2(%0), %%xmm1 \n\t"\ 146.1147 + "movdqa %%xmm1, %%xmm0 \n\t"\ 146.1148 + "punpckhbw %%xmm7, %%xmm1 \n\t"\ 146.1149 + "punpcklbw %%xmm7, %%xmm0 \n\t"\ 146.1150 + "movdqa %%xmm1, %%xmm2 \n\t"\ 146.1151 + "movdqa %%xmm1, %%xmm3 \n\t"\ 146.1152 + "movdqa %%xmm1, %%xmm4 \n\t"\ 146.1153 + "movdqa %%xmm1, %%xmm5 \n\t"\ 146.1154 + "palignr $2, %%xmm0, %%xmm4 \n\t"\ 146.1155 + "palignr $4, %%xmm0, %%xmm3 \n\t"\ 146.1156 + "palignr $6, %%xmm0, %%xmm2 \n\t"\ 146.1157 + "palignr $8, %%xmm0, %%xmm1 \n\t"\ 146.1158 + "palignr $10,%%xmm0, %%xmm5 \n\t"\ 146.1159 + "paddw %%xmm5, %%xmm0 \n\t"\ 146.1160 + "paddw %%xmm3, %%xmm2 \n\t"\ 146.1161 + "paddw %%xmm4, %%xmm1 \n\t"\ 146.1162 + "psllw $2, %%xmm2 \n\t"\ 146.1163 + "psubw %%xmm1, %%xmm2 \n\t"\ 146.1164 + "paddw %6, %%xmm0 \n\t"\ 146.1165 + "pmullw %%xmm6, %%xmm2 \n\t"\ 146.1166 + "paddw %%xmm0, %%xmm2 \n\t"\ 146.1167 + "psraw $5, %%xmm2 \n\t"\ 146.1168 + "packuswb %%xmm2, %%xmm2 \n\t"\ 146.1169 + OP(%%xmm2, (%1), %%xmm4, q)\ 146.1170 + "add %3, %0 \n\t"\ 146.1171 + "add %4, %1 \n\t"\ 146.1172 + "decl %2 \n\t"\ 146.1173 + " jnz 1b \n\t"\ 146.1174 + : "+a"(src), "+c"(dst), "+g"(h)\ 146.1175 + : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\ 146.1176 + "m"(ff_pw_5), "m"(ff_pw_16)\ 146.1177 + : "memory"\ 146.1178 + );\ 146.1179 +}\ 146.1180 +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int 
srcStride){\ 146.1181 + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 146.1182 + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 146.1183 + src += 8*srcStride;\ 146.1184 + dst += 8*dstStride;\ 146.1185 + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 146.1186 + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 146.1187 +}\ 146.1188 + 146.1189 +#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ 146.1190 +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 146.1191 + src -= 2*srcStride;\ 146.1192 + \ 146.1193 + __asm__ volatile(\ 146.1194 + "pxor %%xmm7, %%xmm7 \n\t"\ 146.1195 + "movq (%0), %%xmm0 \n\t"\ 146.1196 + "add %2, %0 \n\t"\ 146.1197 + "movq (%0), %%xmm1 \n\t"\ 146.1198 + "add %2, %0 \n\t"\ 146.1199 + "movq (%0), %%xmm2 \n\t"\ 146.1200 + "add %2, %0 \n\t"\ 146.1201 + "movq (%0), %%xmm3 \n\t"\ 146.1202 + "add %2, %0 \n\t"\ 146.1203 + "movq (%0), %%xmm4 \n\t"\ 146.1204 + "add %2, %0 \n\t"\ 146.1205 + "punpcklbw %%xmm7, %%xmm0 \n\t"\ 146.1206 + "punpcklbw %%xmm7, %%xmm1 \n\t"\ 146.1207 + "punpcklbw %%xmm7, %%xmm2 \n\t"\ 146.1208 + "punpcklbw %%xmm7, %%xmm3 \n\t"\ 146.1209 + "punpcklbw %%xmm7, %%xmm4 \n\t"\ 146.1210 + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 146.1211 + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ 146.1212 + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 146.1213 + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 146.1214 + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ 146.1215 + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ 146.1216 + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 146.1217 + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ 146.1218 + \ 146.1219 + : "+a"(src), "+c"(dst)\ 
146.1220 + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 146.1221 + : "memory"\ 146.1222 + );\ 146.1223 + if(h==16){\ 146.1224 + __asm__ volatile(\ 146.1225 + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 146.1226 + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 146.1227 + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ 146.1228 + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ 146.1229 + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 146.1230 + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ 146.1231 + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 146.1232 + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 146.1233 + \ 146.1234 + : "+a"(src), "+c"(dst)\ 146.1235 + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 146.1236 + : "memory"\ 146.1237 + );\ 146.1238 + }\ 146.1239 +}\ 146.1240 +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 146.1241 + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ 146.1242 +}\ 146.1243 +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 146.1244 + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ 146.1245 + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 146.1246 +} 146.1247 + 146.1248 +static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ 146.1249 + int w = (size+8)>>3; 146.1250 + src -= 2*srcStride+2; 146.1251 + while(w--){ 146.1252 + __asm__ volatile( 146.1253 + "pxor %%xmm7, %%xmm7 \n\t" 146.1254 + "movq (%0), %%xmm0 \n\t" 146.1255 + "add %2, %0 \n\t" 146.1256 + "movq (%0), %%xmm1 \n\t" 
146.1257 + "add %2, %0 \n\t" 146.1258 + "movq (%0), %%xmm2 \n\t" 146.1259 + "add %2, %0 \n\t" 146.1260 + "movq (%0), %%xmm3 \n\t" 146.1261 + "add %2, %0 \n\t" 146.1262 + "movq (%0), %%xmm4 \n\t" 146.1263 + "add %2, %0 \n\t" 146.1264 + "punpcklbw %%xmm7, %%xmm0 \n\t" 146.1265 + "punpcklbw %%xmm7, %%xmm1 \n\t" 146.1266 + "punpcklbw %%xmm7, %%xmm2 \n\t" 146.1267 + "punpcklbw %%xmm7, %%xmm3 \n\t" 146.1268 + "punpcklbw %%xmm7, %%xmm4 \n\t" 146.1269 + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) 146.1270 + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) 146.1271 + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) 146.1272 + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) 146.1273 + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) 146.1274 + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) 146.1275 + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) 146.1276 + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) 146.1277 + : "+a"(src) 146.1278 + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) 146.1279 + : "memory" 146.1280 + ); 146.1281 + if(size==16){ 146.1282 + __asm__ volatile( 146.1283 + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) 146.1284 + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) 146.1285 + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) 146.1286 + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) 146.1287 + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) 146.1288 + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) 146.1289 + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) 146.1290 + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) 146.1291 + : "+a"(src) 146.1292 + : "c"(tmp), 
"S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) 146.1293 + : "memory" 146.1294 + ); 146.1295 + } 146.1296 + tmp += 8; 146.1297 + src += 8 - (size+5)*srcStride; 146.1298 + } 146.1299 +} 146.1300 + 146.1301 +#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ 146.1302 +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 146.1303 + int h = size;\ 146.1304 + if(size == 16){\ 146.1305 + __asm__ volatile(\ 146.1306 + "1: \n\t"\ 146.1307 + "movdqa 32(%0), %%xmm4 \n\t"\ 146.1308 + "movdqa 16(%0), %%xmm5 \n\t"\ 146.1309 + "movdqa (%0), %%xmm7 \n\t"\ 146.1310 + "movdqa %%xmm4, %%xmm3 \n\t"\ 146.1311 + "movdqa %%xmm4, %%xmm2 \n\t"\ 146.1312 + "movdqa %%xmm4, %%xmm1 \n\t"\ 146.1313 + "movdqa %%xmm4, %%xmm0 \n\t"\ 146.1314 + "palignr $10, %%xmm5, %%xmm0 \n\t"\ 146.1315 + "palignr $8, %%xmm5, %%xmm1 \n\t"\ 146.1316 + "palignr $6, %%xmm5, %%xmm2 \n\t"\ 146.1317 + "palignr $4, %%xmm5, %%xmm3 \n\t"\ 146.1318 + "palignr $2, %%xmm5, %%xmm4 \n\t"\ 146.1319 + "paddw %%xmm5, %%xmm0 \n\t"\ 146.1320 + "paddw %%xmm4, %%xmm1 \n\t"\ 146.1321 + "paddw %%xmm3, %%xmm2 \n\t"\ 146.1322 + "movdqa %%xmm5, %%xmm6 \n\t"\ 146.1323 + "movdqa %%xmm5, %%xmm4 \n\t"\ 146.1324 + "movdqa %%xmm5, %%xmm3 \n\t"\ 146.1325 + "palignr $8, %%xmm7, %%xmm4 \n\t"\ 146.1326 + "palignr $2, %%xmm7, %%xmm6 \n\t"\ 146.1327 + "palignr $10, %%xmm7, %%xmm3 \n\t"\ 146.1328 + "paddw %%xmm6, %%xmm4 \n\t"\ 146.1329 + "movdqa %%xmm5, %%xmm6 \n\t"\ 146.1330 + "palignr $6, %%xmm7, %%xmm5 \n\t"\ 146.1331 + "palignr $4, %%xmm7, %%xmm6 \n\t"\ 146.1332 + "paddw %%xmm7, %%xmm3 \n\t"\ 146.1333 + "paddw %%xmm6, %%xmm5 \n\t"\ 146.1334 + \ 146.1335 + "psubw %%xmm1, %%xmm0 \n\t"\ 146.1336 + "psubw %%xmm4, %%xmm3 \n\t"\ 146.1337 + "psraw $2, %%xmm0 \n\t"\ 146.1338 + "psraw $2, %%xmm3 \n\t"\ 146.1339 + "psubw %%xmm1, %%xmm0 \n\t"\ 146.1340 + "psubw %%xmm4, %%xmm3 \n\t"\ 146.1341 + "paddw %%xmm2, %%xmm0 \n\t"\ 146.1342 + "paddw %%xmm5, %%xmm3 \n\t"\ 146.1343 
+ "psraw $2, %%xmm0 \n\t"\ 146.1344 + "psraw $2, %%xmm3 \n\t"\ 146.1345 + "paddw %%xmm2, %%xmm0 \n\t"\ 146.1346 + "paddw %%xmm5, %%xmm3 \n\t"\ 146.1347 + "psraw $6, %%xmm0 \n\t"\ 146.1348 + "psraw $6, %%xmm3 \n\t"\ 146.1349 + "packuswb %%xmm0, %%xmm3 \n\t"\ 146.1350 + OP(%%xmm3, (%1), %%xmm7, dqa)\ 146.1351 + "add $48, %0 \n\t"\ 146.1352 + "add %3, %1 \n\t"\ 146.1353 + "decl %2 \n\t"\ 146.1354 + " jnz 1b \n\t"\ 146.1355 + : "+a"(tmp), "+c"(dst), "+g"(h)\ 146.1356 + : "S"((x86_reg)dstStride)\ 146.1357 + : "memory"\ 146.1358 + );\ 146.1359 + }else{\ 146.1360 + __asm__ volatile(\ 146.1361 + "1: \n\t"\ 146.1362 + "movdqa 16(%0), %%xmm1 \n\t"\ 146.1363 + "movdqa (%0), %%xmm0 \n\t"\ 146.1364 + "movdqa %%xmm1, %%xmm2 \n\t"\ 146.1365 + "movdqa %%xmm1, %%xmm3 \n\t"\ 146.1366 + "movdqa %%xmm1, %%xmm4 \n\t"\ 146.1367 + "movdqa %%xmm1, %%xmm5 \n\t"\ 146.1368 + "palignr $10, %%xmm0, %%xmm5 \n\t"\ 146.1369 + "palignr $8, %%xmm0, %%xmm4 \n\t"\ 146.1370 + "palignr $6, %%xmm0, %%xmm3 \n\t"\ 146.1371 + "palignr $4, %%xmm0, %%xmm2 \n\t"\ 146.1372 + "palignr $2, %%xmm0, %%xmm1 \n\t"\ 146.1373 + "paddw %%xmm5, %%xmm0 \n\t"\ 146.1374 + "paddw %%xmm4, %%xmm1 \n\t"\ 146.1375 + "paddw %%xmm3, %%xmm2 \n\t"\ 146.1376 + "psubw %%xmm1, %%xmm0 \n\t"\ 146.1377 + "psraw $2, %%xmm0 \n\t"\ 146.1378 + "psubw %%xmm1, %%xmm0 \n\t"\ 146.1379 + "paddw %%xmm2, %%xmm0 \n\t"\ 146.1380 + "psraw $2, %%xmm0 \n\t"\ 146.1381 + "paddw %%xmm2, %%xmm0 \n\t"\ 146.1382 + "psraw $6, %%xmm0 \n\t"\ 146.1383 + "packuswb %%xmm0, %%xmm0 \n\t"\ 146.1384 + OP(%%xmm0, (%1), %%xmm7, q)\ 146.1385 + "add $48, %0 \n\t"\ 146.1386 + "add %3, %1 \n\t"\ 146.1387 + "decl %2 \n\t"\ 146.1388 + " jnz 1b \n\t"\ 146.1389 + : "+a"(tmp), "+c"(dst), "+g"(h)\ 146.1390 + : "S"((x86_reg)dstStride)\ 146.1391 + : "memory"\ 146.1392 + );\ 146.1393 + }\ 146.1394 +} 146.1395 + 146.1396 +#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ 146.1397 +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, 
int dstStride, int tmpStride, int srcStride, int size){\ 146.1398 + put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ 146.1399 + OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ 146.1400 +}\ 146.1401 +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 146.1402 + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ 146.1403 +}\ 146.1404 +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 146.1405 + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ 146.1406 +}\ 146.1407 + 146.1408 +#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 146.1409 +#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 146.1410 +#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 146.1411 +#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 146.1412 +#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 146.1413 +#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 146.1414 +#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 146.1415 +#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 146.1416 + 146.1417 +#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 146.1418 +#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 146.1419 +#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 146.1420 +#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 146.1421 +#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 146.1422 +#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 146.1423 +#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 146.1424 +#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 146.1425 + 146.1426 +#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 146.1427 +#define 
avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 146.1428 +#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 146.1429 +#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 146.1430 + 146.1431 +#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 146.1432 +#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 146.1433 +#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 146.1434 +#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 146.1435 + 146.1436 +#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 146.1437 +#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 146.1438 + 146.1439 +#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ 146.1440 +H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ 146.1441 +H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ 146.1442 +H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ 146.1443 +H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ 146.1444 + 146.1445 +// static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ 146.1446 +// put_pixels16_sse2(dst, src, stride, 16); 146.1447 +// } 146.1448 +// static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ 146.1449 +// avg_pixels16_sse2(dst, src, stride, 16); 146.1450 +// } 146.1451 +#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 146.1452 +#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 146.1453 + 146.1454 +#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ 146.1455 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ 146.1456 + OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ 146.1457 +}\ 146.1458 + 146.1459 +#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ 146.1460 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1461 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, 
src, stride, stride);\ 146.1462 +}\ 146.1463 +\ 146.1464 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1465 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ 146.1466 +}\ 146.1467 +\ 146.1468 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1469 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ 146.1470 +}\ 146.1471 + 146.1472 +#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ 146.1473 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1474 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 146.1475 + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 146.1476 + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ 146.1477 +}\ 146.1478 +\ 146.1479 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1480 + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ 146.1481 +}\ 146.1482 +\ 146.1483 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1484 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 146.1485 + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 146.1486 + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ 146.1487 +}\ 146.1488 + 146.1489 +#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ 146.1490 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1491 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 146.1492 + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 146.1493 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 146.1494 +}\ 146.1495 +\ 146.1496 +static void OPNAME ## h264_qpel 
## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1497 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 146.1498 + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 146.1499 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 146.1500 +}\ 146.1501 +\ 146.1502 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1503 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 146.1504 + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 146.1505 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 146.1506 +}\ 146.1507 +\ 146.1508 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1509 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 146.1510 + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 146.1511 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 146.1512 +}\ 146.1513 +\ 146.1514 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1515 + DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ 146.1516 + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ 146.1517 +}\ 146.1518 +\ 146.1519 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1520 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 146.1521 + uint8_t * const halfHV= temp;\ 146.1522 + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 146.1523 + assert(((int)temp & 7) == 0);\ 146.1524 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 146.1525 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ 146.1526 +}\ 146.1527 +\ 146.1528 +static void 
OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1529 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 146.1530 + uint8_t * const halfHV= temp;\ 146.1531 + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 146.1532 + assert(((int)temp & 7) == 0);\ 146.1533 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 146.1534 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ 146.1535 +}\ 146.1536 +\ 146.1537 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1538 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 146.1539 + uint8_t * const halfHV= temp;\ 146.1540 + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 146.1541 + assert(((int)temp & 7) == 0);\ 146.1542 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 146.1543 + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ 146.1544 +}\ 146.1545 +\ 146.1546 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ 146.1547 + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 146.1548 + uint8_t * const halfHV= temp;\ 146.1549 + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 146.1550 + assert(((int)temp & 7) == 0);\ 146.1551 + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 146.1552 + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ 146.1553 +}\ 146.1554 + 146.1555 +#define H264_MC_4816(MMX)\ 146.1556 +H264_MC(put_, 4, MMX, 8)\ 146.1557 +H264_MC(put_, 8, MMX, 8)\ 146.1558 +H264_MC(put_, 16,MMX, 8)\ 146.1559 +H264_MC(avg_, 4, MMX, 8)\ 146.1560 +H264_MC(avg_, 8, MMX, 8)\ 146.1561 +H264_MC(avg_, 16,MMX, 8)\ 146.1562 + 146.1563 +#define H264_MC_816(QPEL, XMM)\ 
146.1564 +QPEL(put_, 8, XMM, 16)\ 146.1565 +QPEL(put_, 16,XMM, 16)\ 146.1566 +QPEL(avg_, 8, XMM, 16)\ 146.1567 +QPEL(avg_, 16,XMM, 16)\ 146.1568 + 146.1569 + 146.1570 +#define AVG_3DNOW_OP(a,b,temp, size) \ 146.1571 +"mov" #size " " #b ", " #temp " \n\t"\ 146.1572 +"pavgusb " #temp ", " #a " \n\t"\ 146.1573 +"mov" #size " " #a ", " #b " \n\t" 146.1574 +#define AVG_MMX2_OP(a,b,temp, size) \ 146.1575 +"mov" #size " " #b ", " #temp " \n\t"\ 146.1576 +"pavgb " #temp ", " #a " \n\t"\ 146.1577 +"mov" #size " " #a ", " #b " \n\t" 146.1578 + 146.1579 +///this does not get detected correctly, uncomment on AMD machine 146.1580 +#ifdef HAVE_AMD3DNOW 146.1581 +#define PAVGB "pavgusb" 146.1582 +//QPEL_H264(put_, PUT_OP, 3dnow) 146.1583 +//QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) 146.1584 +#undef PAVGB 146.1585 +#endif 146.1586 + 146.1587 +#define PAVGB "pavgb" 146.1588 +QPEL_H264(put_, PUT_OP, mmx2) 146.1589 +QPEL_H264(avg_, AVG_MMX2_OP, mmx2) 146.1590 +QPEL_H264_V_XMM(put_, PUT_OP, sse2) 146.1591 +QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) 146.1592 +QPEL_H264_HV_XMM(put_, PUT_OP, sse2) 146.1593 +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) 146.1594 +#if HAVE_SSSE3 146.1595 +QPEL_H264_H_XMM(put_, PUT_OP, ssse3) 146.1596 +QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) 146.1597 +QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) 146.1598 +QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) 146.1599 +QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) 146.1600 +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) 146.1601 +#endif 146.1602 +#undef PAVGB 146.1603 + 146.1604 +H264_MC_816(H264_MC_V, sse2) 146.1605 +H264_MC_816(H264_MC_HV, sse2) 146.1606 +#if HAVE_SSSE3 146.1607 +H264_MC_816(H264_MC_H, ssse3) 146.1608 +H264_MC_816(H264_MC_HV, ssse3) 146.1609 +#endif 146.1610 + 146.1611 +/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ 146.1612 +DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = { 146.1613 + 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL 146.1614 +}; 
#if HAVE_SSSE3
/*
 * Instantiate the SSSE3 chroma motion-compensation template twice by
 * re-including it with different macro settings:
 *   - "put" variant: AVG_OP(X) expands to nothing,
 *   - "avg" variant: AVG_OP(X) expands to X.
 * Each inclusion is followed by a thin wrapper that calls the generated mc8
 * function with a trailing argument of 1 (presumably the "rnd" selector that
 * the wrapper name refers to -- confirm against the template file).
 */
#define AVG_OP(X)
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_ssse3.c"
static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}

#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
#define AVG_OP(X) X
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_ssse3.c"
static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
#endif

/***********************************/
/* weighted prediction */

/**
 * In-place explicit weighted prediction of a w x h block of 8-bit samples:
 *
 *   dst[i] = clip_uint8((dst[i] * weight + rounded_offset) >> log2_denom)
 *
 * with rounded_offset = offset * 2^log2_denom + (2^log2_denom >> 1).
 * Arithmetic is done on 16-bit lanes (pmullw keeps the low 16 bits of each
 * product, paddsw saturates), so the operands must stay within that range.
 *
 * Two rows are processed per outer iteration and four pixels per inner
 * iteration, so h is consumed in steps of 2 and w in steps of 4.
 *
 * The constants are loaded into mm4..mm7 by a first asm statement and are
 * read by the per-pixel asm statements that follow; nothing may touch the
 * MMX registers in between. The function does not execute emms.
 *
 * @param dst        top-left sample of the block, modified in place
 * @param stride     distance in bytes between two rows of dst
 * @param log2_denom right shift applied after weighting
 * @param weight     multiplicative weight
 * @param offset     additive offset, in output sample units
 * @param w          block width in pixels
 * @param h          block height in pixels
 */
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
{
    int x, y;
    /* offset may be negative; multiply by the power of two rather than
     * left-shifting a negative int, which is undefined behaviour. */
    offset *= 1 << log2_denom;
    offset += (1 << log2_denom) >> 1;
    __asm__ volatile(
        "movd    %0, %%mm4          \n\t" /* mm4 = weight              */
        "movd    %1, %%mm5          \n\t" /* mm5 = rounded offset      */
        "movd    %2, %%mm6          \n\t" /* mm6 = shift count         */
        "pshufw  $0, %%mm4, %%mm4   \n\t" /* broadcast to all 4 words  */
        "pshufw  $0, %%mm5, %%mm5   \n\t"
        "pxor    %%mm7, %%mm7       \n\t" /* mm7 = 0 for (un)packing   */
        /* "rm", not "g": movd has no immediate source form, and "g" would
         * let the compiler substitute a constant after inlining. */
        :: "rm"(weight), "rm"(offset), "rm"(log2_denom)
    );
    for(y=0; y<h; y+=2){
        for(x=0; x<w; x+=4){
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t" /* 4 pixels of row y     */
                "movd      %1,    %%mm1 \n\t" /* 4 pixels of row y+1   */
                "punpcklbw %%mm7, %%mm0 \n\t" /* bytes -> words        */
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm4, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddsw    %%mm5, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm1 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "psraw     %%mm6, %%mm1 \n\t"
                "packuswb  %%mm7, %%mm0 \n\t" /* words -> clipped bytes */
                "packuswb  %%mm7, %%mm1 \n\t"
                "movd      %%mm0, %0    \n\t"
                "movd      %%mm1, %1    \n\t"
                : "+m"(*(uint32_t*)(dst+x)),
                  "+m"(*(uint32_t*)(dst+x+stride))
            );
        }
        dst += 2*stride;
    }
}

/**
 * In-place bidirectional weighted prediction of a w x h block:
 *
 *   dst[i] = clip_uint8((dst[i] * weightd + src[i] * weights + o)
 *                       >> (log2_denom + 1))
 *
 * with o = ((offset + 1) | 1) * 2^log2_denom. Arithmetic is done on 16-bit
 * lanes as in ff_h264_weight_WxH_mmx2().
 *
 * One row is processed per outer iteration, four pixels per inner iteration.
 * dst and src share the same stride. Constants live in mm3..mm7 across the
 * asm statements; the function does not execute emms.
 *
 * @param dst        first prediction and destination, modified in place
 * @param src        second prediction, read only
 * @param stride     distance in bytes between two rows of dst and of src
 * @param log2_denom base shift; the applied shift is log2_denom + 1
 * @param weightd    weight applied to dst samples
 * @param weights    weight applied to src samples
 * @param offset     additive offset before rounding adjustment
 * @param w          block width in pixels
 * @param h          block height in pixels
 */
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
{
    int x, y;
    /* Multiplication instead of '<<': the left operand can be negative. */
    offset = ((offset + 1) | 1) * (1 << log2_denom);
    __asm__ volatile(
        "movd    %0, %%mm3          \n\t" /* mm3 = weightd             */
        "movd    %1, %%mm4          \n\t" /* mm4 = weights             */
        "movd    %2, %%mm5          \n\t" /* mm5 = adjusted offset     */
        "movd    %3, %%mm6          \n\t" /* mm6 = shift count         */
        "pshufw  $0, %%mm3, %%mm3   \n\t"
        "pshufw  $0, %%mm4, %%mm4   \n\t"
        "pshufw  $0, %%mm5, %%mm5   \n\t"
        "pxor    %%mm7, %%mm7       \n\t" /* mm7 = 0 for unpacking     */
        /* "rm" keeps immediates out of movd (see the weight function). */
        :: "rm"(weightd), "rm"(weights), "rm"(offset), "rm"(log2_denom+1)
    );
    for(y=0; y<h; y++){
        for(x=0; x<w; x+=4){
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t" /* 4 pixels of dst       */
                "movd      %1,    %%mm1 \n\t" /* 4 pixels of src       */
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm3, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddsw    %%mm1, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm0 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"
                : "+m"(*(uint32_t*)(dst+x))
                :  "m"(*(uint32_t*)(src+x))
            );
        }
        src += stride;
        dst += stride;
    }
}

/*
 * Generate the fixed-size entry points ff_h264_biweight_WxH_mmx2 and
 * ff_h264_weight_WxH_mmx2 for one block size by forwarding to the generic
 * implementations above with W and H as constants.
 */
#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
} \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT( 4, 8)
H264_WEIGHT( 4, 4)
H264_WEIGHT( 4, 2)
147.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 147.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/mathops.h Mon Aug 27 12:09:56 2012 +0200 147.3 @@ -0,0 +1,67 @@ 147.4 +/* 147.5 + * simple math operations 147.6 + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al 147.7 + * 147.8 + * This file is part of FFmpeg. 147.9 + * 147.10 + * FFmpeg is free software; you can redistribute it and/or 147.11 + * modify it under the terms of the GNU Lesser General Public 147.12 + * License as published by the Free Software Foundation; either 147.13 + * version 2.1 of the License, or (at your option) any later version. 147.14 + * 147.15 + * FFmpeg is distributed in the hope that it will be useful, 147.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 147.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 147.18 + * Lesser General Public License for more details. 147.19 + * 147.20 + * You should have received a copy of the GNU Lesser General Public 147.21 + * License along with FFmpeg; if not, write to the Free Software 147.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 147.23 + */ 147.24 + 147.25 +#ifndef AVCODEC_X86_MATHOPS_H 147.26 +#define AVCODEC_X86_MATHOPS_H 147.27 + 147.28 +#include "config.h" 147.29 +#include "libavutil/common.h" 147.30 + 147.31 +#if ARCH_X86_32 147.32 +#define MULL(ra, rb, shift) \ 147.33 + ({ int rt, dummy; __asm__ (\ 147.34 + "imull %3 \n\t"\ 147.35 + "shrdl %4, %%edx, %%eax \n\t"\ 147.36 + : "=a"(rt), "=d"(dummy)\ 147.37 + : "a" ((int)ra), "rm" ((int)rb), "i"(shift));\ 147.38 + rt; }) 147.39 + 147.40 +#define MULH(ra, rb) \ 147.41 + ({ int rt, dummy;\ 147.42 + __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)ra), "rm" ((int)rb));\ 147.43 + rt; }) 147.44 + 147.45 +#define MUL64(ra, rb) \ 147.46 + ({ int64_t rt;\ 147.47 + __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)ra), "g" ((int)rb));\ 147.48 + rt; }) 147.49 +#endif 147.50 + 147.51 +// avoid +32 for 
shift optimization (gcc should do that ...) 147.52 +#define NEG_SSR32 NEG_SSR32 147.53 +static inline int32_t NEG_SSR32( int32_t a, int8_t s){ 147.54 + __asm__ ("sarl %1, %0\n\t" 147.55 + : "+r" (a) 147.56 + : "ic" ((uint8_t)(-s)) 147.57 + ); 147.58 + return a; 147.59 +} 147.60 + 147.61 +#define NEG_USR32 NEG_USR32 147.62 +static inline uint32_t NEG_USR32(uint32_t a, int8_t s){ 147.63 + __asm__ ("shrl %1, %0\n\t" 147.64 + : "+r" (a) 147.65 + : "ic" ((uint8_t)(-s)) 147.66 + ); 147.67 + return a; 147.68 +} 147.69 + 147.70 +#endif /* AVCODEC_X86_MATHOPS_H */
/*
 * mmx.h
 * Copyright (C) 1997-2001 H. Dietz and R. Fisher
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef AVCODEC_X86_MMX_H
#define AVCODEC_X86_MMX_H

#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected.

/*
 * One-instruction inline-asm wrappers. Register names are pasted into the
 * instruction text as plain tokens (mm0, xmm1, eax, ...), so the compiler
 * is not told which registers an instruction reads or writes.
 *
 * Naming: <insn>_<src>2<dst>, where i = immediate, m = memory (a C lvalue),
 * r = register. Operand order in the emitted text is AT&T (source first).
 */

/* op $imm, %reg */
#define         mmx_i2r(op,imm,reg) \
        __asm__ volatile (#op " %0, %%" #reg \
                              : /* nothing */ \
                              : "i" (imm) )

/* op mem, %reg */
#define         mmx_m2r(op,mem,reg) \
        __asm__ volatile (#op " %0, %%" #reg \
                              : /* nothing */ \
                              : "m" (mem))

/* op %reg, mem */
#define         mmx_r2m(op,reg,mem) \
        __asm__ volatile (#op " %%" #reg ", %0" \
                              : "=m" (mem) \
                              : /* nothing */ )

/* op %regs, %regd -- no asm operands, hence a single '%' per register */
#define         mmx_r2r(op,regs,regd) \
        __asm__ volatile (#op " %" #regs ", %" #regd)


#define         emms() __asm__ volatile ("emms")

#define         movd_m2r(var,reg)           mmx_m2r (movd, var, reg)
#define         movd_r2m(reg,var)           mmx_r2m (movd, reg, var)
#define         movd_r2r(regs,regd)         mmx_r2r (movd, regs, regd)

#define         movq_m2r(var,reg)           mmx_m2r (movq, var, reg)
#define         movq_r2m(reg,var)           mmx_r2m (movq, reg, var)
#define         movq_r2r(regs,regd)         mmx_r2r (movq, regs, regd)

#define         packssdw_m2r(var,reg)       mmx_m2r (packssdw, var, reg)
#define         packssdw_r2r(regs,regd)     mmx_r2r (packssdw, regs, regd)
#define         packsswb_m2r(var,reg)       mmx_m2r (packsswb, var, reg)
#define         packsswb_r2r(regs,regd)     mmx_r2r (packsswb, regs, regd)

#define         packuswb_m2r(var,reg)       mmx_m2r (packuswb, var, reg)
#define         packuswb_r2r(regs,regd)     mmx_r2r (packuswb, regs, regd)

#define         paddb_m2r(var,reg)          mmx_m2r (paddb, var, reg)
#define         paddb_r2r(regs,regd)        mmx_r2r (paddb, regs, regd)
#define         paddd_m2r(var,reg)          mmx_m2r (paddd, var, reg)
#define         paddd_r2r(regs,regd)        mmx_r2r (paddd, regs, regd)
#define         paddw_m2r(var,reg)          mmx_m2r (paddw, var, reg)
#define         paddw_r2r(regs,regd)        mmx_r2r (paddw, regs, regd)

#define         paddsb_m2r(var,reg)         mmx_m2r (paddsb, var, reg)
#define         paddsb_r2r(regs,regd)       mmx_r2r (paddsb, regs, regd)
#define         paddsw_m2r(var,reg)         mmx_m2r (paddsw, var, reg)
#define         paddsw_r2r(regs,regd)       mmx_r2r (paddsw, regs, regd)

#define         paddusb_m2r(var,reg)        mmx_m2r (paddusb, var, reg)
#define         paddusb_r2r(regs,regd)      mmx_r2r (paddusb, regs, regd)
#define         paddusw_m2r(var,reg)        mmx_m2r (paddusw, var, reg)
#define         paddusw_r2r(regs,regd)      mmx_r2r (paddusw, regs, regd)

#define         pand_m2r(var,reg)           mmx_m2r (pand, var, reg)
#define         pand_r2r(regs,regd)         mmx_r2r (pand, regs, regd)

#define         pandn_m2r(var,reg)          mmx_m2r (pandn, var, reg)
#define         pandn_r2r(regs,regd)        mmx_r2r (pandn, regs, regd)

#define         pcmpeqb_m2r(var,reg)        mmx_m2r (pcmpeqb, var, reg)
#define         pcmpeqb_r2r(regs,regd)      mmx_r2r (pcmpeqb, regs, regd)
#define         pcmpeqd_m2r(var,reg)        mmx_m2r (pcmpeqd, var, reg)
#define         pcmpeqd_r2r(regs,regd)      mmx_r2r (pcmpeqd, regs, regd)
#define         pcmpeqw_m2r(var,reg)        mmx_m2r (pcmpeqw, var, reg)
#define         pcmpeqw_r2r(regs,regd)      mmx_r2r (pcmpeqw, regs, regd)

#define         pcmpgtb_m2r(var,reg)        mmx_m2r (pcmpgtb, var, reg)
#define         pcmpgtb_r2r(regs,regd)      mmx_r2r (pcmpgtb, regs, regd)
#define         pcmpgtd_m2r(var,reg)        mmx_m2r (pcmpgtd, var, reg)
#define         pcmpgtd_r2r(regs,regd)      mmx_r2r (pcmpgtd, regs, regd)
#define         pcmpgtw_m2r(var,reg)        mmx_m2r (pcmpgtw, var, reg)
#define         pcmpgtw_r2r(regs,regd)      mmx_r2r (pcmpgtw, regs, regd)

#define         pmaddwd_m2r(var,reg)        mmx_m2r (pmaddwd, var, reg)
#define         pmaddwd_r2r(regs,regd)      mmx_r2r (pmaddwd, regs, regd)

#define         pmulhw_m2r(var,reg)         mmx_m2r (pmulhw, var, reg)
#define         pmulhw_r2r(regs,regd)       mmx_r2r (pmulhw, regs, regd)

#define         pmullw_m2r(var,reg)         mmx_m2r (pmullw, var, reg)
#define         pmullw_r2r(regs,regd)       mmx_r2r (pmullw, regs, regd)

#define         por_m2r(var,reg)            mmx_m2r (por, var, reg)
#define         por_r2r(regs,regd)          mmx_r2r (por, regs, regd)

#define         pslld_i2r(imm,reg)          mmx_i2r (pslld, imm, reg)
#define         pslld_m2r(var,reg)          mmx_m2r (pslld, var, reg)
#define         pslld_r2r(regs,regd)        mmx_r2r (pslld, regs, regd)
#define         psllq_i2r(imm,reg)          mmx_i2r (psllq, imm, reg)
#define         psllq_m2r(var,reg)          mmx_m2r (psllq, var, reg)
#define         psllq_r2r(regs,regd)        mmx_r2r (psllq, regs, regd)
#define         psllw_i2r(imm,reg)          mmx_i2r (psllw, imm, reg)
#define         psllw_m2r(var,reg)          mmx_m2r (psllw, var, reg)
#define         psllw_r2r(regs,regd)        mmx_r2r (psllw, regs, regd)

#define         psrad_i2r(imm,reg)          mmx_i2r (psrad, imm, reg)
#define         psrad_m2r(var,reg)          mmx_m2r (psrad, var, reg)
#define         psrad_r2r(regs,regd)        mmx_r2r (psrad, regs, regd)
#define         psraw_i2r(imm,reg)          mmx_i2r (psraw, imm, reg)
#define         psraw_m2r(var,reg)          mmx_m2r (psraw, var, reg)
#define         psraw_r2r(regs,regd)        mmx_r2r (psraw, regs, regd)

#define         psrld_i2r(imm,reg)          mmx_i2r (psrld, imm, reg)
#define         psrld_m2r(var,reg)          mmx_m2r (psrld, var, reg)
#define         psrld_r2r(regs,regd)        mmx_r2r (psrld, regs, regd)
#define         psrlq_i2r(imm,reg)          mmx_i2r (psrlq, imm, reg)
#define         psrlq_m2r(var,reg)          mmx_m2r (psrlq, var, reg)
#define         psrlq_r2r(regs,regd)        mmx_r2r (psrlq, regs, regd)
#define         psrlw_i2r(imm,reg)          mmx_i2r (psrlw, imm, reg)
#define         psrlw_m2r(var,reg)          mmx_m2r (psrlw, var, reg)
#define         psrlw_r2r(regs,regd)        mmx_r2r (psrlw, regs, regd)

#define         psubb_m2r(var,reg)          mmx_m2r (psubb, var, reg)
#define         psubb_r2r(regs,regd)        mmx_r2r (psubb, regs, regd)
#define         psubd_m2r(var,reg)          mmx_m2r (psubd, var, reg)
#define         psubd_r2r(regs,regd)        mmx_r2r (psubd, regs, regd)
#define         psubw_m2r(var,reg)          mmx_m2r (psubw, var, reg)
#define         psubw_r2r(regs,regd)        mmx_r2r (psubw, regs, regd)

#define         psubsb_m2r(var,reg)         mmx_m2r (psubsb, var, reg)
#define         psubsb_r2r(regs,regd)       mmx_r2r (psubsb, regs, regd)
#define         psubsw_m2r(var,reg)         mmx_m2r (psubsw, var, reg)
#define         psubsw_r2r(regs,regd)       mmx_r2r (psubsw, regs, regd)

#define         psubusb_m2r(var,reg)        mmx_m2r (psubusb, var, reg)
#define         psubusb_r2r(regs,regd)      mmx_r2r (psubusb, regs, regd)
#define         psubusw_m2r(var,reg)        mmx_m2r (psubusw, var, reg)
#define         psubusw_r2r(regs,regd)      mmx_r2r (psubusw, regs, regd)

#define         punpckhbw_m2r(var,reg)      mmx_m2r (punpckhbw, var, reg)
#define         punpckhbw_r2r(regs,regd)    mmx_r2r (punpckhbw, regs, regd)
#define         punpckhdq_m2r(var,reg)      mmx_m2r (punpckhdq, var, reg)
#define         punpckhdq_r2r(regs,regd)    mmx_r2r (punpckhdq, regs, regd)
#define         punpckhwd_m2r(var,reg)      mmx_m2r (punpckhwd, var, reg)
#define         punpckhwd_r2r(regs,regd)    mmx_r2r (punpckhwd, regs, regd)

#define         punpcklbw_m2r(var,reg)      mmx_m2r (punpcklbw, var, reg)
#define         punpcklbw_r2r(regs,regd)    mmx_r2r (punpcklbw, regs, regd)
#define         punpckldq_m2r(var,reg)      mmx_m2r (punpckldq, var, reg)
#define         punpckldq_r2r(regs,regd)    mmx_r2r (punpckldq, regs, regd)
#define         punpcklwd_m2r(var,reg)      mmx_m2r (punpcklwd, var, reg)
#define         punpcklwd_r2r(regs,regd)    mmx_r2r (punpcklwd, regs, regd)

#define         pxor_m2r(var,reg)           mmx_m2r (pxor, var, reg)
#define         pxor_r2r(regs,regd)         mmx_r2r (pxor, regs, regd)


/* 3DNOW extensions */

#define         pavgusb_m2r(var,reg)        mmx_m2r (pavgusb, var, reg)
#define         pavgusb_r2r(regs,regd)      mmx_r2r (pavgusb, regs, regd)


/* AMD MMX extensions - also available in intel SSE */


/* op $imm, mem, %reg */
#define         mmx_m2ri(op,mem,reg,imm) \
        __asm__ volatile (#op " %1, %0, %%" #reg \
                              : /* nothing */ \
                              : "m" (mem), "i" (imm))
/* op $imm, %regs, %regd */
#define         mmx_r2ri(op,regs,regd,imm) \
        __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \
                              : /* nothing */ \
                              : "i" (imm) )

/* prefetch<hint> mem */
#define         mmx_fetch(mem,hint) \
        __asm__ volatile ("prefetch" #hint " %0" \
                              : /* nothing */ \
                              : "m" (mem))


/* maskmovq has two register operands and no immediate, so it goes through
 * mmx_r2r; routing it through the four-parameter mmx_r2ri with only three
 * arguments could not be expanded by the preprocessor. */
#define         maskmovq(regs,maskreg)      mmx_r2r (maskmovq, regs, maskreg)

#define         movntq_r2m(mmreg,var)       mmx_r2m (movntq, mmreg, var)

#define         pavgb_m2r(var,reg)          mmx_m2r (pavgb, var, reg)
#define         pavgb_r2r(regs,regd)        mmx_r2r (pavgb, regs, regd)
#define         pavgw_m2r(var,reg)          mmx_m2r (pavgw, var, reg)
#define         pavgw_r2r(regs,regd)        mmx_r2r (pavgw, regs, regd)

#define         pextrw_r2r(mmreg,reg,imm)   mmx_r2ri (pextrw, mmreg, reg, imm)

#define         pinsrw_r2r(reg,mmreg,imm)   mmx_r2ri (pinsrw, reg, mmreg, imm)

#define         pmaxsw_m2r(var,reg)         mmx_m2r (pmaxsw, var, reg)
#define         pmaxsw_r2r(regs,regd)       mmx_r2r (pmaxsw, regs, regd)

#define         pmaxub_m2r(var,reg)         mmx_m2r (pmaxub, var, reg)
#define         pmaxub_r2r(regs,regd)       mmx_r2r (pmaxub, regs, regd)

#define         pminsw_m2r(var,reg)         mmx_m2r (pminsw, var, reg)
#define         pminsw_r2r(regs,regd)       mmx_r2r (pminsw, regs, regd)

#define         pminub_m2r(var,reg)         mmx_m2r (pminub, var, reg)
#define         pminub_r2r(regs,regd)       mmx_r2r (pminub, regs, regd)

/* Emit the instruction the macro is named after; the former "movmskps"
 * text is a different (SSE floating-point) instruction. */
#define         pmovmskb(mmreg,reg) \
        __asm__ volatile ("pmovmskb %" #mmreg ", %" #reg)

#define         pmulhuw_m2r(var,reg)        mmx_m2r (pmulhuw, var, reg)
#define         pmulhuw_r2r(regs,regd)      mmx_r2r (pmulhuw, regs, regd)

#define         prefetcht0(mem)             mmx_fetch (mem, t0)
#define         prefetcht1(mem)             mmx_fetch (mem, t1)
#define         prefetcht2(mem)             mmx_fetch (mem, t2)
#define         prefetchnta(mem)            mmx_fetch (mem, nta)

#define         psadbw_m2r(var,reg)         mmx_m2r (psadbw, var, reg)
#define         psadbw_r2r(regs,regd)       mmx_r2r (psadbw, regs, regd)

#define         pshufw_m2r(var,reg,imm)     mmx_m2ri(pshufw, var, reg, imm)
#define         pshufw_r2r(regs,regd,imm)   mmx_r2ri(pshufw, regs, regd, imm)

#define         sfence() __asm__ volatile ("sfence\n\t")

/* SSE2 */
#define         pshufhw_m2r(var,reg,imm)    mmx_m2ri(pshufhw, var, reg, imm)
#define         pshufhw_r2r(regs,regd,imm)  mmx_r2ri(pshufhw, regs, regd, imm)
#define         pshuflw_m2r(var,reg,imm)    mmx_m2ri(pshuflw, var, reg, imm)
#define         pshuflw_r2r(regs,regd,imm)  mmx_r2ri(pshuflw, regs, regd, imm)

#define         pshufd_r2r(regs,regd,imm)   mmx_r2ri(pshufd, regs, regd, imm)

#define         movdqa_m2r(var,reg)         mmx_m2r (movdqa, var, reg)
#define         movdqa_r2m(reg,var)         mmx_r2m (movdqa, reg, var)
#define         movdqa_r2r(regs,regd)       mmx_r2r (movdqa, regs, regd)
#define         movdqu_m2r(var,reg)         mmx_m2r (movdqu, var, reg)
#define         movdqu_r2m(reg,var)         mmx_r2m (movdqu, reg, var)
#define         movdqu_r2r(regs,regd)       mmx_r2r (movdqu, regs, regd)

#define         pmullw_r2m(reg,var)         mmx_r2m (pmullw, reg, var)

#define         pslldq_i2r(imm,reg)         mmx_i2r (pslldq, imm, reg)
#define         psrldq_i2r(imm,reg)         mmx_i2r (psrldq, imm, reg)

#define         punpcklqdq_r2r(regs,regd)   mmx_r2r (punpcklqdq, regs, regd)
#define         punpckhqdq_r2r(regs,regd)   mmx_r2r (punpckhqdq, regs, regd)


#endif /* AVCODEC_X86_MMX_H */
/* File: ffmpeg_smp/h264dec/libavutil/arm/bswap.h */
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * ARM-specific byte swapping routines.
 *
 * Each routine provided here is paired with a self-referential
 * "#define name name" so that code testing "#ifndef name" afterwards can
 * tell that an implementation already exists.
 */

#ifndef AVUTIL_ARM_BSWAP_H
#define AVUTIL_ARM_BSWAP_H

#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"

/* Variant for the ARM compiler (armcc), which uses its own asm syntax. */
#ifdef __ARMCC_VERSION

#if HAVE_ARMV6
#define bswap_16 bswap_16
/** Swap bytes using the rev16 instruction (armcc embedded-asm form). */
static av_always_inline av_const unsigned bswap_16(unsigned x)
{
    __asm { rev16 x, x }
    return x;
}

#define bswap_32 bswap_32
/** Swap the bytes of a 32-bit word using the armcc __rev intrinsic. */
static av_always_inline av_const uint32_t bswap_32(uint32_t x)
{
    return __rev(x);
}
#endif /* HAVE_ARMV6 */

/* Variant for compilers that accept GNU-style inline assembly. */
#elif HAVE_INLINE_ASM

#if HAVE_ARMV6
#define bswap_16 bswap_16
/** Swap bytes using the rev16 instruction; x is both input and output. */
static av_always_inline av_const unsigned bswap_16(unsigned x)
{
    __asm__("rev16 %0, %0" : "+r"(x));
    return x;
}
#endif

#define bswap_32 bswap_32
/**
 * Swap the bytes of a 32-bit word.
 * Uses a single rev instruction when HAVE_ARMV6 is set, otherwise a
 * four-instruction eor/bic/mov/eor sequence with one scratch register.
 */
static av_always_inline av_const uint32_t bswap_32(uint32_t x)
{
#if HAVE_ARMV6
    __asm__("rev %0, %0" : "+r"(x));
#else
    uint32_t t;
    /* t is an early-clobber output ("=&r"): it is written before x has
     * been fully consumed, so it must not share a register with x. */
    __asm__ ("eor %1, %0, %0, ror #16 \n\t"
             "bic %1, %1, #0xFF0000 \n\t"
             "mov %0, %0, ror #8 \n\t"
             "eor %0, %0, %1, lsr #8 \n\t"
             : "+r"(x), "=&r"(t));
#endif /* HAVE_ARMV6 */
    return x;
}

#endif /* __ARMCC_VERSION */

#endif /* AVUTIL_ARM_BSWAP_H */
/* File: ffmpeg_smp/h264dec/libavutil/arm/intreadwrite.h */
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * ARM inline-asm implementations of the native-endian read/write helpers
 * AV_RN16/32/64 and AV_WN16/32/64.
 *
 * They are only compiled when both HAVE_FAST_UNALIGNED and HAVE_INLINE_ASM
 * are set. Each helper is paired with a self-referential
 * "#define name name" so that "#ifndef name" tests can detect it.
 *
 * NOTE(review): av_always_inline is used but attributes.h is not included
 * here; presumably the including header provides it - confirm.
 */

#ifndef AVUTIL_ARM_INTREADWRITE_H
#define AVUTIL_ARM_INTREADWRITE_H

#include <stdint.h>
#include "config.h"

#if HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM

#define AV_RN16 AV_RN16
/** Load a 16-bit value from p with a single ldrh instruction. */
static av_always_inline uint16_t AV_RN16(const void *p)
{
    uint16_t v;
    __asm__ ("ldrh %0, %1" : "=r"(v) : "m"(*(const uint16_t *)p));
    return v;
}

#define AV_WN16 AV_WN16
/** Store the 16-bit value v to p with a single strh instruction. */
static av_always_inline void AV_WN16(void *p, uint16_t v)
{
    __asm__ ("strh %1, %0" : "=m"(*(uint16_t *)p) : "r"(v));
}

#define AV_RN32 AV_RN32
/** Load a 32-bit value from p with a single ldr instruction. */
static av_always_inline uint32_t AV_RN32(const void *p)
{
    uint32_t v;
    __asm__ ("ldr %0, %1" : "=r"(v) : "m"(*(const uint32_t *)p));
    return v;
}

#define AV_WN32 AV_WN32
/** Store the 32-bit value v to p with a single str instruction. */
static av_always_inline void AV_WN32(void *p, uint32_t v)
{
    __asm__ ("str %1, %0" : "=m"(*(uint32_t *)p) : "r"(v));
}

#define AV_RN64 AV_RN64
/**
 * Load a 64-bit value from p as two consecutive 32-bit ldr loads; the two
 * halves are reassembled through a union in memory order.
 */
static av_always_inline uint64_t AV_RN64(const void *p)
{
    union { uint64_t v; uint32_t hl[2]; } v;
    /* The first output is early-clobber ("=&r"): it is written while the
     * second memory operand has not been read yet. */
    __asm__ ("ldr %0, %2 \n\t"
             "ldr %1, %3 \n\t"
             : "=&r"(v.hl[0]), "=r"(v.hl[1])
             : "m"(*(const uint32_t*)p), "m"(*((const uint32_t*)p+1)));
    return v.v;
}

#define AV_WN64 AV_WN64
/**
 * Store the 64-bit value v to p as two consecutive 32-bit str stores; the
 * value is split into halves through a union in memory order.
 */
static av_always_inline void AV_WN64(void *p, uint64_t v)
{
    union { uint64_t v; uint32_t hl[2]; } vv = { v };
    __asm__ ("str %2, %0 \n\t"
             "str %3, %1 \n\t"
             : "=m"(*(uint32_t*)p), "=m"(*((uint32_t*)p+1))
             : "r"(vv.hl[0]), "r"(vv.hl[1]));
}

#endif /* HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM */

#endif /* AVUTIL_ARM_INTREADWRITE_H */
/* File: ffmpeg_smp/h264dec/libavutil/arm/timer.h */
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * ARM implementation of AV_READ_TIME, available only when inline asm is
 * enabled and the compiler targets ARMv7-A (__ARM_ARCH_7A__).
 */

#ifndef AVUTIL_ARM_TIMER_H
#define AVUTIL_ARM_TIMER_H

#include <stdint.h>
#include "config.h"

#if HAVE_INLINE_ASM && defined(__ARM_ARCH_7A__)

#define AV_READ_TIME read_time

/**
 * Read a counter from coprocessor p15 (register c9, c13, 0).
 *
 * NOTE(review): presumably this is the PMU cycle counter and requires that
 * the counter is enabled and accessible from the current mode - confirm
 * against the ARM architecture manual.
 *
 * @return the counter value; only an unsigned-int-wide value is read, which
 *         is then zero-extended to 64 bits, so the result wraps at the
 *         width of unsigned
 */
static inline uint64_t read_time(void)
{
    unsigned cc;
    __asm__ volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc));
    return cc;
}

#endif /* HAVE_INLINE_ASM && __ARM_ARCH_7A__ */

#endif /* AVUTIL_ARM_TIMER_H */
/* File: ffmpeg_smp/h264dec/libavutil/attributes.h */
/*
 * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * Macro definitions for various function/variable attributes
 *
 * Every attribute macro is wrapped in "#ifndef" so that a definition made
 * before this header is included takes precedence. On compilers where an
 * attribute is not available the macro expands to nothing (or to a plain
 * fallback), so annotated code still compiles.
 */

#ifndef AVUTIL_ATTRIBUTES_H
#define AVUTIL_ATTRIBUTES_H

/**
 * Evaluates to non-zero when compiling with a GNU-compatible compiler whose
 * version is at least x.y, and to 0 otherwise. Usable in "#if" and in
 * ordinary expressions.
 *
 * The arguments and the "==/&&" pair are parenthesized so that expression
 * arguments and the ||/&& mix cannot regroup the comparison.
 */
#ifdef __GNUC__
#    define AV_GCC_VERSION_AT_LEAST(x,y) \
        (__GNUC__ > (x) || (__GNUC__ == (x) && __GNUC_MINOR__ >= (y)))
#else
#    define AV_GCC_VERSION_AT_LEAST(x,y) 0
#endif

/** Request inlining regardless of the optimizer's heuristics. */
#ifndef av_always_inline
#if AV_GCC_VERSION_AT_LEAST(3,1)
#    define av_always_inline __attribute__((always_inline)) inline
#else
#    define av_always_inline inline
#endif
#endif

/** Forbid inlining of the annotated function. */
#ifndef av_noinline
#if AV_GCC_VERSION_AT_LEAST(3,1)
#    define av_noinline __attribute__((noinline))
#else
#    define av_noinline
#endif
#endif

/** Mark a function with the GCC "pure" attribute. */
#ifndef av_pure
#if AV_GCC_VERSION_AT_LEAST(3,1)
#    define av_pure __attribute__((pure))
#else
#    define av_pure
#endif
#endif

/** Mark a function with the GCC "const" attribute. */
#ifndef av_const
#if AV_GCC_VERSION_AT_LEAST(2,6)
#    define av_const __attribute__((const))
#else
#    define av_const
#endif
#endif

/** Mark a function with the GCC "cold" attribute (not used for ICC <= 1110). */
#ifndef av_cold
#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,3)
#    define av_cold __attribute__((cold))
#else
#    define av_cold
#endif
#endif

/** Mark a function with the GCC "flatten" attribute (not used for ICC <= 1110). */
#ifndef av_flatten
#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,1)
#    define av_flatten __attribute__((flatten))
#else
#    define av_flatten
#endif
#endif

/** Mark a declaration as deprecated so that its use produces a warning. */
#ifndef attribute_deprecated
#if AV_GCC_VERSION_AT_LEAST(3,1)
#    define attribute_deprecated __attribute__((deprecated))
#else
#    define attribute_deprecated
#endif
#endif

/** Suppress "unused" warnings for the annotated entity. */
#ifndef av_unused
#if defined(__GNUC__)
#    define av_unused __attribute__((unused))
#else
#    define av_unused
#endif
#endif

/**
 * Declarator wrapper for a variable that is intentionally left
 * uninitialized: on GCC (but not ICC) it expands to the self-assignment
 * "x=x", elsewhere to plain "x".
 */
#ifndef av_uninit
#if defined(__GNUC__) && !defined(__ICC)
#    define av_uninit(x) x=x
#else
#    define av_uninit(x) x
#endif
#endif

/** __builtin_constant_p where available, otherwise always 0. */
#ifdef __GNUC__
#    define av_builtin_constant_p __builtin_constant_p
#else
#    define av_builtin_constant_p(x) 0
#endif

#endif /* AVUTIL_ATTRIBUTES_H */
153.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 153.2 +++ b/ffmpeg_smp/h264dec/libavutil/bswap.h Mon Aug 27 12:09:56 2012 +0200 153.3 @@ -0,0 +1,95 @@ 153.4 +/* 153.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> 153.6 + * 153.7 + * This file is part of FFmpeg. 153.8 + * 153.9 + * FFmpeg is free software; you can redistribute it and/or 153.10 + * modify it under the terms of the GNU Lesser General Public 153.11 + * License as published by the Free Software Foundation; either 153.12 + * version 2.1 of the License, or (at your option) any later version. 153.13 + * 153.14 + * FFmpeg is distributed in the hope that it will be useful, 153.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 153.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 153.17 + * Lesser General Public License for more details. 153.18 + * 153.19 + * You should have received a copy of the GNU Lesser General Public 153.20 + * License along with FFmpeg; if not, write to the Free Software 153.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 153.22 + */ 153.23 + 153.24 +/** 153.25 + * @file 153.26 + * byte swapping routines 153.27 + */ 153.28 + 153.29 +#ifndef AVUTIL_BSWAP_H 153.30 +#define AVUTIL_BSWAP_H 153.31 + 153.32 +#include <stdint.h> 153.33 +#include "config.h" 153.34 +#include "attributes.h" 153.35 + 153.36 +#if ARCH_ARM 153.37 +# include "arm/bswap.h" 153.38 +#elif ARCH_X86 153.39 +# include "x86/bswap.h" 153.40 +#endif 153.41 + 153.42 +#ifndef bswap_16 153.43 +static av_always_inline av_const uint16_t bswap_16(uint16_t x) 153.44 +{ 153.45 + x= (x>>8) | (x<<8); 153.46 + return x; 153.47 +} 153.48 +#endif 153.49 + 153.50 +#ifndef bswap_32 153.51 +static av_always_inline av_const uint32_t bswap_32(uint32_t x) 153.52 +{ 153.53 + x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF); 153.54 + x= (x>>16) | (x<<16); 153.55 + return x; 153.56 +} 153.57 +#endif 153.58 + 153.59 +#ifndef bswap_64 153.60 +static inline 
uint64_t av_const bswap_64(uint64_t x) 153.61 +{ 153.62 +#if 0 153.63 + x= ((x<< 8)&0xFF00FF00FF00FF00ULL) | ((x>> 8)&0x00FF00FF00FF00FFULL); 153.64 + x= ((x<<16)&0xFFFF0000FFFF0000ULL) | ((x>>16)&0x0000FFFF0000FFFFULL); 153.65 + return (x>>32) | (x<<32); 153.66 +#else 153.67 + union { 153.68 + uint64_t ll; 153.69 + uint32_t l[2]; 153.70 + } w, r; 153.71 + w.ll = x; 153.72 + r.l[0] = bswap_32 (w.l[1]); 153.73 + r.l[1] = bswap_32 (w.l[0]); 153.74 + return r.ll; 153.75 +#endif 153.76 +} 153.77 +#endif 153.78 + 153.79 +// be2me ... big-endian to machine-endian 153.80 +// le2me ... little-endian to machine-endian 153.81 + 153.82 +#if HAVE_BIGENDIAN 153.83 +#define be2me_16(x) (x) 153.84 +#define be2me_32(x) (x) 153.85 +#define be2me_64(x) (x) 153.86 +#define le2me_16(x) bswap_16(x) 153.87 +#define le2me_32(x) bswap_32(x) 153.88 +#define le2me_64(x) bswap_64(x) 153.89 +#else 153.90 +#define be2me_16(x) bswap_16(x) 153.91 +#define be2me_32(x) bswap_32(x) 153.92 +#define be2me_64(x) bswap_64(x) 153.93 +#define le2me_16(x) (x) 153.94 +#define le2me_32(x) (x) 153.95 +#define le2me_64(x) (x) 153.96 +#endif 153.97 + 153.98 +#endif /* AVUTIL_BSWAP_H */
154.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 154.2 +++ b/ffmpeg_smp/h264dec/libavutil/common.h Mon Aug 27 12:09:56 2012 +0200 154.3 @@ -0,0 +1,298 @@ 154.4 +/* 154.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> 154.6 + * 154.7 + * This file is part of FFmpeg. 154.8 + * 154.9 + * FFmpeg is free software; you can redistribute it and/or 154.10 + * modify it under the terms of the GNU Lesser General Public 154.11 + * License as published by the Free Software Foundation; either 154.12 + * version 2.1 of the License, or (at your option) any later version. 154.13 + * 154.14 + * FFmpeg is distributed in the hope that it will be useful, 154.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 154.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 154.17 + * Lesser General Public License for more details. 154.18 + * 154.19 + * You should have received a copy of the GNU Lesser General Public 154.20 + * License along with FFmpeg; if not, write to the Free Software 154.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 154.22 + */ 154.23 + 154.24 +/** 154.25 + * @file 154.26 + * common internal and external API header 154.27 + */ 154.28 + 154.29 +#ifndef AVUTIL_COMMON_H 154.30 +#define AVUTIL_COMMON_H 154.31 + 154.32 +#include <ctype.h> 154.33 +#include <errno.h> 154.34 +#include <inttypes.h> 154.35 +#include <limits.h> 154.36 +#include <math.h> 154.37 +#include <stdio.h> 154.38 +#include <stdlib.h> 154.39 +#include <string.h> 154.40 +#include "attributes.h" 154.41 + 154.42 +//rounded division & shift 154.43 +#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b)) 154.44 +/* assume b>0 */ 154.45 +#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b)) 154.46 +#define FFABS(a) ((a) >= 0 ? (a) : (-(a))) 154.47 +#define FFSIGN(a) ((a) > 0 ? 1 : -1) 154.48 + 154.49 +#define FFMAX(a,b) ((a) > (b) ? 
(a) : (b)) 154.50 +#define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c) 154.51 +#define FFMIN(a,b) ((a) > (b) ? (b) : (a)) 154.52 +#define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c) 154.53 + 154.54 +#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0) 154.55 +#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0])) 154.56 +#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1)) 154.57 + 154.58 +/* misc math functions */ 154.59 +extern const uint8_t ff_log2_tab[256]; 154.60 + 154.61 +static inline av_const int av_log2_c(unsigned int v) 154.62 +{ 154.63 + int n = 0; 154.64 + if (v & 0xffff0000) { 154.65 + v >>= 16; 154.66 + n += 16; 154.67 + } 154.68 + if (v & 0xff00) { 154.69 + v >>= 8; 154.70 + n += 8; 154.71 + } 154.72 + n += ff_log2_tab[v]; 154.73 + 154.74 + return n; 154.75 +} 154.76 + 154.77 +static inline av_const int av_log2_16bit_c(unsigned int v) 154.78 +{ 154.79 + int n = 0; 154.80 + if (v & 0xff00) { 154.81 + v >>= 8; 154.82 + n += 8; 154.83 + } 154.84 + n += ff_log2_tab[v]; 154.85 + 154.86 + return n; 154.87 +} 154.88 + 154.89 +#ifdef HAVE_AV_CONFIG_H 154.90 +# include "config.h" 154.91 +#endif 154.92 + 154.93 +/** 154.94 + * Clips a signed integer value into the amin-amax range. 154.95 + * @param a value to clip 154.96 + * @param amin minimum value of the clip range 154.97 + * @param amax maximum value of the clip range 154.98 + * @return clipped value 154.99 + */ 154.100 +static inline av_const int av_clip(int a, int amin, int amax) 154.101 +{ 154.102 + if (a < amin) return amin; 154.103 + else if (a > amax) return amax; 154.104 + else return a; 154.105 +} 154.106 + 154.107 +/** 154.108 + * Clips a signed integer value into the 0-255 range. 154.109 + * @param a value to clip 154.110 + * @return clipped value 154.111 + */ 154.112 +static inline av_const uint8_t av_clip_uint8(int a) 154.113 +{ 154.114 + if (a&(~0xFF)) return (-a)>>31; 154.115 + else return a; 154.116 +} 154.117 + 154.118 +/** 154.119 + * Clips a signed integer value into the 0-65535 range. 
154.120 + * @param a value to clip 154.121 + * @return clipped value 154.122 + */ 154.123 +static inline av_const uint16_t av_clip_uint16(int a) 154.124 +{ 154.125 + if (a&(~0xFFFF)) return (-a)>>31; 154.126 + else return a; 154.127 +} 154.128 + 154.129 +/** 154.130 + * Clips a signed integer value into the -32768,32767 range. 154.131 + * @param a value to clip 154.132 + * @return clipped value 154.133 + */ 154.134 +static inline av_const int16_t av_clip_int16(int a) 154.135 +{ 154.136 + if ((a+0x8000) & ~0xFFFF) return (a>>31) ^ 0x7FFF; 154.137 + else return a; 154.138 +} 154.139 + 154.140 +/** 154.141 + * Clips a signed 64-bit integer value into the -2147483648,2147483647 range. 154.142 + * @param a value to clip 154.143 + * @return clipped value 154.144 + */ 154.145 +static inline av_const int32_t av_clipl_int32(int64_t a) 154.146 +{ 154.147 + if ((a+0x80000000u) & ~UINT64_C(0xFFFFFFFF)) return (a>>63) ^ 0x7FFFFFFF; 154.148 + else return a; 154.149 +} 154.150 + 154.151 +/** 154.152 + * Clips a float value into the amin-amax range. 154.153 + * @param a value to clip 154.154 + * @param amin minimum value of the clip range 154.155 + * @param amax maximum value of the clip range 154.156 + * @return clipped value 154.157 + */ 154.158 +static inline av_const float av_clipf(float a, float amin, float amax) 154.159 +{ 154.160 + if (a < amin) return amin; 154.161 + else if (a > amax) return amax; 154.162 + else return a; 154.163 +} 154.164 + 154.165 +/** Computes ceil(log2(x)). 154.166 + * @param x value used to compute ceil(log2(x)) 154.167 + * @return computed ceiling of log2(x) 154.168 + */ 154.169 +static inline av_const int av_ceil_log2(int x) 154.170 +{ 154.171 + return av_log2_c((x - 1) << 1); 154.172 +} 154.173 + 154.174 +#define MKTAG(a,b,c,d) (a | (b << 8) | (c << 16) | (d << 24)) 154.175 +#define MKBETAG(a,b,c,d) (d | (c << 8) | (b << 16) | (a << 24)) 154.176 + 154.177 +/*! 
154.178 + * \def GET_UTF8(val, GET_BYTE, ERROR) 154.179 + * Converts a UTF-8 character (up to 4 bytes long) to its 32-bit UCS-4 encoded form 154.180 + * \param val is the output and should be of type uint32_t. It holds the converted 154.181 + * UCS-4 character and should be a left value. 154.182 + * \param GET_BYTE gets UTF-8 encoded bytes from any proper source. It can be 154.183 + * a function or a statement whose return value or evaluated value is of type 154.184 + * uint8_t. It will be executed up to 4 times for values in the valid UTF-8 range, 154.185 + * and up to 7 times in the general case. 154.186 + * \param ERROR action that should be taken when an invalid UTF-8 byte is returned 154.187 + * from GET_BYTE. It should be a statement that jumps out of the macro, 154.188 + * like exit(), goto, return, break, or continue. 154.189 + */ 154.190 +#define GET_UTF8(val, GET_BYTE, ERROR)\ 154.191 + val= GET_BYTE;\ 154.192 + {\ 154.193 + int ones= 7 - av_log2(val ^ 255);\ 154.194 + if(ones==1)\ 154.195 + ERROR\ 154.196 + val&= 127>>ones;\ 154.197 + while(--ones > 0){\ 154.198 + int tmp= GET_BYTE - 128;\ 154.199 + if(tmp>>6)\ 154.200 + ERROR\ 154.201 + val= (val<<6) + tmp;\ 154.202 + }\ 154.203 + } 154.204 + 154.205 +/*! 154.206 + * \def GET_UTF16(val, GET_16BIT, ERROR) 154.207 + * Converts a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form 154.208 + * \param val is the output and should be of type uint32_t. It holds the converted 154.209 + * UCS-4 character and should be a left value. 154.210 + * \param GET_16BIT gets two bytes of UTF-16 encoded data converted to native endianness. 154.211 + * It can be a function or a statement whose return value or evaluated value is of type 154.212 + * uint16_t. It will be executed up to 2 times. 154.213 + * \param ERROR action that should be taken when an invalid UTF-16 surrogate is 154.214 + * returned from GET_BYTE. 
It should be a statement that jumps out of the macro, 154.215 + * like exit(), goto, return, break, or continue. 154.216 + */ 154.217 +#define GET_UTF16(val, GET_16BIT, ERROR)\ 154.218 + val = GET_16BIT;\ 154.219 + {\ 154.220 + unsigned int hi = val - 0xD800;\ 154.221 + if (hi < 0x800) {\ 154.222 + val = GET_16BIT - 0xDC00;\ 154.223 + if (val > 0x3FFU || hi > 0x3FFU)\ 154.224 + ERROR\ 154.225 + val += (hi<<10) + 0x10000;\ 154.226 + }\ 154.227 + }\ 154.228 + 154.229 +/*! 154.230 + * \def PUT_UTF8(val, tmp, PUT_BYTE) 154.231 + * Converts a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long). 154.232 + * \param val is an input-only argument and should be of type uint32_t. It holds 154.233 + * a UCS-4 encoded Unicode character that is to be converted to UTF-8. If 154.234 + * val is given as a function it is executed only once. 154.235 + * \param tmp is a temporary variable and should be of type uint8_t. It 154.236 + * represents an intermediate value during conversion that is to be 154.237 + * output by PUT_BYTE. 154.238 + * \param PUT_BYTE writes the converted UTF-8 bytes to any proper destination. 154.239 + * It could be a function or a statement, and uses tmp as the input byte. 154.240 + * For example, PUT_BYTE could be "*output++ = tmp;" PUT_BYTE will be 154.241 + * executed up to 4 times for values in the valid UTF-8 range and up to 154.242 + * 7 times in the general case, depending on the length of the converted 154.243 + * Unicode character. 
154.244 + */ 154.245 +#define PUT_UTF8(val, tmp, PUT_BYTE)\ 154.246 + {\ 154.247 + int bytes, shift;\ 154.248 + uint32_t in = val;\ 154.249 + if (in < 0x80) {\ 154.250 + tmp = in;\ 154.251 + PUT_BYTE\ 154.252 + } else {\ 154.253 + bytes = (av_log2(in) + 4) / 5;\ 154.254 + shift = (bytes - 1) * 6;\ 154.255 + tmp = (256 - (256 >> bytes)) | (in >> shift);\ 154.256 + PUT_BYTE\ 154.257 + while (shift >= 6) {\ 154.258 + shift -= 6;\ 154.259 + tmp = 0x80 | ((in >> shift) & 0x3f);\ 154.260 + PUT_BYTE\ 154.261 + }\ 154.262 + }\ 154.263 + } 154.264 + 154.265 +/*! 154.266 + * \def PUT_UTF16(val, tmp, PUT_16BIT) 154.267 + * Converts a 32-bit Unicode character to its UTF-16 encoded form (2 or 4 bytes). 154.268 + * \param val is an input-only argument and should be of type uint32_t. It holds 154.269 + * a UCS-4 encoded Unicode character that is to be converted to UTF-16. If 154.270 + * val is given as a function it is executed only once. 154.271 + * \param tmp is a temporary variable and should be of type uint16_t. It 154.272 + * represents an intermediate value during conversion that is to be 154.273 + * output by PUT_16BIT. 154.274 + * \param PUT_16BIT writes the converted UTF-16 data to any proper destination 154.275 + * in desired endianness. It could be a function or a statement, and uses tmp 154.276 + * as the input byte. For example, PUT_BYTE could be "*output++ = tmp;" 154.277 + * PUT_BYTE will be executed 1 or 2 times depending on input character. 
154.278 + */ 154.279 +#define PUT_UTF16(val, tmp, PUT_16BIT)\ 154.280 + {\ 154.281 + uint32_t in = val;\ 154.282 + if (in < 0x10000) {\ 154.283 + tmp = in;\ 154.284 + PUT_16BIT\ 154.285 + } else {\ 154.286 + tmp = 0xD800 | ((in - 0x10000) >> 10);\ 154.287 + PUT_16BIT\ 154.288 + tmp = 0xDC00 | ((in - 0x10000) & 0x3FF);\ 154.289 + PUT_16BIT\ 154.290 + }\ 154.291 + }\ 154.292 + 154.293 + 154.294 + 154.295 +#include "mem.h" 154.296 + 154.297 +#ifdef HAVE_AV_CONFIG_H 154.298 +# include "internal.h" 154.299 +#endif /* HAVE_AV_CONFIG_H */ 154.300 + 154.301 +#endif /* AVUTIL_COMMON_H */
/* File: ffmpeg_smp/h264dec/libavutil/error.h */
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * error code definitions
 */

#ifndef AVUTIL_ERROR_H
#define AVUTIL_ERROR_H

#include <errno.h>
#include "common.h"

/* error handling */
/* The sign of EDOM tells whether the platform's E* constants are positive
 * (the usual case, so they are negated here) or already negative. */
#if EDOM > 0
#define AVERROR(e) (-(e))   ///< Returns a negative error code from a POSIX error code, to return from library functions.
#define AVUNERROR(e) (-(e)) ///< Returns a POSIX error code from a library function error return value.
#else
/* Some platforms have E* and errno already negated. */
#define AVERROR(e) (e)
#define AVUNERROR(e) (e)
#endif

#define AVERROR_EOF AVERROR(EPIPE) ///< End of file; shares its value with AVERROR(EPIPE)


/**
 * Puts a description of the AVERROR code errnum in errbuf.
 * In case of failure the global variable errno is set to indicate the
 * error.
 *
 * @param errnum      the AVERROR code to describe
 * @param errbuf      the buffer that receives the description
 * @param errbuf_size the size in bytes of errbuf
 * @return 0 on success, a negative value otherwise
 */
int av_strerror(int errnum, char *errbuf, size_t errbuf_size);

#endif /* AVUTIL_ERROR_H */
/* File: ffmpeg_smp/h264dec/libavutil/internal.h */
/*
 * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * common internal API header
 */

#ifndef AVUTIL_INTERNAL_H
#define AVUTIL_INTERNAL_H

/* Unless DEBUG is requested, define NDEBUG before <assert.h> is included so
 * that assert() compiles to nothing. */
#if !defined(DEBUG) && !defined(NDEBUG)
#    define NDEBUG
#endif

#include <limits.h>
#include <stdint.h>
#include <stddef.h>
#include <assert.h>
#include "config.h"
#include "attributes.h"
#include "timer.h"



/* Fallback integer limits, used only when <stdint.h> did not provide them. */
#ifndef INT16_MIN
#define INT16_MIN       (-0x7fff - 1)
#endif

#ifndef INT16_MAX
#define INT16_MAX       0x7fff
#endif

#ifndef INT32_MIN
#define INT32_MIN       (-0x7fffffff - 1)
#endif

#ifndef INT32_MAX
#define INT32_MAX       0x7fffffff
#endif

#ifndef UINT32_MAX
#define UINT32_MAX      0xffffffff
#endif

#ifndef INT64_MIN
#define INT64_MIN       (-0x7fffffffffffffffLL - 1)
#endif

#ifndef INT64_MAX
#define INT64_MAX INT64_C(9223372036854775807)
#endif

#ifndef UINT64_MAX
#define UINT64_MAX UINT64_C(0xFFFFFFFFFFFFFFFF)
#endif

/* Number of bits in an int. */
#ifndef INT_BIT
#    define INT_BIT (CHAR_BIT * sizeof(int))
#endif

/* Fallback only; <stddef.h> above normally defines offsetof already.
 * NOTE(review): the cast to unsigned int would truncate the offset where
 * pointers are wider than unsigned int. */
#ifndef offsetof
#    define offsetof(T, F) ((unsigned int)((char *)&((T *)0)->F))
#endif

/* Use to export labels from asm. */
/* In this tree all three variants simply stringify their argument. */
#define LABEL_MANGLE(a) #a
#define LOCAL_MANGLE(a) #a
#define MANGLE(a) #a

// Use rip-relative addressing if compiling PIC code on x86-64.
// #if ARCH_X86_64 && defined(PIC)
// #    define LOCAL_MANGLE(a) #a "(%%rip)"
// #else
// #    define LOCAL_MANGLE(a) #a
// #endif
//
// #define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)

/* debug stuff */

/* dprintf macros */
/* Forwards to av_log() at AV_LOG_DEBUG level when DEBUG is defined and
 * expands to nothing otherwise. */
#ifdef DEBUG
#    define dprintf(pctx, ...) av_log(pctx, AV_LOG_DEBUG, __VA_ARGS__)
#else
#    define dprintf(pctx, ...)
#endif

/* Log the source location at AV_LOG_ERROR level, then call abort(). */
#define av_abort()      do { av_log(NULL, AV_LOG_ERROR, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0)

/* math */


/* avoid usage of dangerous/inappropriate system functions */
/* Each listed function name is redefined to an undeclared identifier whose
 * name states the reason, so any use fails at build time. The malloc, free
 * and exit bans are commented out in this tree. */
// #undef  malloc
// #define malloc please_use_av_malloc
// #undef  free
// #define free please_use_av_free
#undef  realloc
#define realloc please_use_av_realloc
#undef  time
#define time time_is_forbidden_due_to_security_issues
#undef  rand
#define rand rand_is_forbidden_due_to_state_trashing_use_av_lfg_get
#undef  srand
#define srand srand_is_forbidden_due_to_state_trashing_use_av_lfg_init
#undef  random
#define random random_is_forbidden_due_to_state_trashing_use_av_lfg_get
#undef  sprintf
#define sprintf sprintf_is_forbidden_due_to_security_issues_use_snprintf
//#undef  exit
//#define exit exit_is_forbidden
#ifndef LIBAVFORMAT_BUILD

#undef  puts
#define puts please_use_av_log_instead_of_puts
#undef  perror
#define perror please_use_av_log_instead_of_perror
#endif

/*
 * FF_ALLOC_OR_GOTO(p, size, label): assign av_malloc(size) to p; if the
 * result is NULL for a non-zero size, log an error and jump to label.
 * The expansion is a bare brace block, not do { } while (0), so it does not
 * behave like a single statement in front of an "else". size is expanded
 * twice.
 *
 * NOTE(review): av_log() is called here with (level, format) while
 * dprintf() and av_abort() in this file pass (context, level, format).
 * The av_log prototype is not visible from this header - confirm which
 * form matches and align the other call sites.
 */
#define FF_ALLOC_OR_GOTO(p, size, label)\
{\
    p = av_malloc(size);\
    if (p == NULL && (size) != 0) {\
        av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\
        goto label;\
    }\
}

/*
 * FF_ALLOCZ_OR_GOTO(p, size, label): same as FF_ALLOC_OR_GOTO but allocates
 * with av_mallocz(). The same notes apply.
 */
#define FF_ALLOCZ_OR_GOTO(p, size, label)\
{\
    p = av_mallocz(size);\
    if (p == NULL && (size) != 0) {\
        av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\
        goto label;\
    }\
}


/**
 * Returns NULL if CONFIG_SMALL is true, otherwise the argument
 * without modification. Used to disable the definition of strings
 * (for example AVCodec long_names).
 */
#if CONFIG_SMALL
#   define NULL_IF_CONFIG_SMALL(x) NULL
#else
#   define NULL_IF_CONFIG_SMALL(x) x
#endif

#endif /* AVUTIL_INTERNAL_H */
157.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 157.2 +++ b/ffmpeg_smp/h264dec/libavutil/intreadwrite.h Mon Aug 27 12:09:56 2012 +0200 157.3 @@ -0,0 +1,498 @@ 157.4 +/* 157.5 + * This file is part of FFmpeg. 157.6 + * 157.7 + * FFmpeg is free software; you can redistribute it and/or 157.8 + * modify it under the terms of the GNU Lesser General Public 157.9 + * License as published by the Free Software Foundation; either 157.10 + * version 2.1 of the License, or (at your option) any later version. 157.11 + * 157.12 + * FFmpeg is distributed in the hope that it will be useful, 157.13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 157.14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 157.15 + * Lesser General Public License for more details. 157.16 + * 157.17 + * You should have received a copy of the GNU Lesser General Public 157.18 + * License along with FFmpeg; if not, write to the Free Software 157.19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 157.20 + */ 157.21 + 157.22 +#ifndef AVUTIL_INTREADWRITE_H 157.23 +#define AVUTIL_INTREADWRITE_H 157.24 + 157.25 +#include <stdint.h> 157.26 +#include "config.h" 157.27 +#include "bswap.h" 157.28 +#include "common.h" 157.29 + 157.30 +typedef union { 157.31 + uint64_t u64; 157.32 + uint32_t u32[2]; 157.33 + uint16_t u16[4]; 157.34 + uint8_t u8 [8]; 157.35 + double f64; 157.36 + float f32[2]; 157.37 +} __attribute__((__may_alias__)) av_alias64; 157.38 + 157.39 +typedef union { 157.40 + uint32_t u32; 157.41 + uint16_t u16[2]; 157.42 + uint8_t u8 [4]; 157.43 + float f32; 157.44 +} __attribute__((__may_alias__)) av_alias32; 157.45 + 157.46 +typedef union { 157.47 + uint16_t u16; 157.48 + uint8_t u8 [2]; 157.49 +} __attribute__((__may_alias__)) av_alias16 ; 157.50 + 157.51 +/* 157.52 + * Arch-specific headers can provide any combination of 157.53 + * AV_[RW][BLN](16|24|32|64) and AV_(COPY|SWAP|ZERO)(64|128) macros. 
157.54 + * Preprocessor symbols must be defined, even if these are implemented 157.55 + * as inline functions. 157.56 + */ 157.57 + 157.58 +#if ARCH_ARM 157.59 +# include "arm/intreadwrite.h" 157.60 +#elif ARCH_PPC 157.61 +# include "ppc/intreadwrite.h" 157.62 +#elif ARCH_X86 157.63 +# include "x86/intreadwrite.h" 157.64 +#endif 157.65 + 157.66 +/* 157.67 + * Map AV_RNXX <-> AV_R[BL]XX for all variants provided by per-arch headers. 157.68 + */ 157.69 + 157.70 +#if HAVE_BIGENDIAN 157.71 + 157.72 +# if defined(AV_RN16) && !defined(AV_RB16) 157.73 +# define AV_RB16(p) AV_RN16(p) 157.74 +# elif !defined(AV_RN16) && defined(AV_RB16) 157.75 +# define AV_RN16(p) AV_RB16(p) 157.76 +# endif 157.77 + 157.78 +# if defined(AV_WN16) && !defined(AV_WB16) 157.79 +# define AV_WB16(p, v) AV_WN16(p, v) 157.80 +# elif !defined(AV_WN16) && defined(AV_WB16) 157.81 +# define AV_WN16(p, v) AV_WB16(p, v) 157.82 +# endif 157.83 + 157.84 +# if defined(AV_RN24) && !defined(AV_RB24) 157.85 +# define AV_RB24(p) AV_RN24(p) 157.86 +# elif !defined(AV_RN24) && defined(AV_RB24) 157.87 +# define AV_RN24(p) AV_RB24(p) 157.88 +# endif 157.89 + 157.90 +# if defined(AV_WN24) && !defined(AV_WB24) 157.91 +# define AV_WB24(p, v) AV_WN24(p, v) 157.92 +# elif !defined(AV_WN24) && defined(AV_WB24) 157.93 +# define AV_WN24(p, v) AV_WB24(p, v) 157.94 +# endif 157.95 + 157.96 +# if defined(AV_RN32) && !defined(AV_RB32) 157.97 +# define AV_RB32(p) AV_RN32(p) 157.98 +# elif !defined(AV_RN32) && defined(AV_RB32) 157.99 +# define AV_RN32(p) AV_RB32(p) 157.100 +# endif 157.101 + 157.102 +# if defined(AV_WN32) && !defined(AV_WB32) 157.103 +# define AV_WB32(p, v) AV_WN32(p, v) 157.104 +# elif !defined(AV_WN32) && defined(AV_WB32) 157.105 +# define AV_WN32(p, v) AV_WB32(p, v) 157.106 +# endif 157.107 + 157.108 +# if defined(AV_RN64) && !defined(AV_RB64) 157.109 +# define AV_RB64(p) AV_RN64(p) 157.110 +# elif !defined(AV_RN64) && defined(AV_RB64) 157.111 +# define AV_RN64(p) AV_RB64(p) 157.112 +# endif 157.113 + 157.114 
+# if defined(AV_WN64) && !defined(AV_WB64) 157.115 +# define AV_WB64(p, v) AV_WN64(p, v) 157.116 +# elif !defined(AV_WN64) && defined(AV_WB64) 157.117 +# define AV_WN64(p, v) AV_WB64(p, v) 157.118 +# endif 157.119 + 157.120 +#else /* HAVE_BIGENDIAN */ 157.121 + 157.122 +# if defined(AV_RN16) && !defined(AV_RL16) 157.123 +# define AV_RL16(p) AV_RN16(p) 157.124 +# elif !defined(AV_RN16) && defined(AV_RL16) 157.125 +# define AV_RN16(p) AV_RL16(p) 157.126 +# endif 157.127 + 157.128 +# if defined(AV_WN16) && !defined(AV_WL16) 157.129 +# define AV_WL16(p, v) AV_WN16(p, v) 157.130 +# elif !defined(AV_WN16) && defined(AV_WL16) 157.131 +# define AV_WN16(p, v) AV_WL16(p, v) 157.132 +# endif 157.133 + 157.134 +# if defined(AV_RN24) && !defined(AV_RL24) 157.135 +# define AV_RL24(p) AV_RN24(p) 157.136 +# elif !defined(AV_RN24) && defined(AV_RL24) 157.137 +# define AV_RN24(p) AV_RL24(p) 157.138 +# endif 157.139 + 157.140 +# if defined(AV_WN24) && !defined(AV_WL24) 157.141 +# define AV_WL24(p, v) AV_WN24(p, v) 157.142 +# elif !defined(AV_WN24) && defined(AV_WL24) 157.143 +# define AV_WN24(p, v) AV_WL24(p, v) 157.144 +# endif 157.145 + 157.146 +# if defined(AV_RN32) && !defined(AV_RL32) 157.147 +# define AV_RL32(p) AV_RN32(p) 157.148 +# elif !defined(AV_RN32) && defined(AV_RL32) 157.149 +# define AV_RN32(p) AV_RL32(p) 157.150 +# endif 157.151 + 157.152 +# if defined(AV_WN32) && !defined(AV_WL32) 157.153 +# define AV_WL32(p, v) AV_WN32(p, v) 157.154 +# elif !defined(AV_WN32) && defined(AV_WL32) 157.155 +# define AV_WN32(p, v) AV_WL32(p, v) 157.156 +# endif 157.157 + 157.158 +# if defined(AV_RN64) && !defined(AV_RL64) 157.159 +# define AV_RL64(p) AV_RN64(p) 157.160 +# elif !defined(AV_RN64) && defined(AV_RL64) 157.161 +# define AV_RN64(p) AV_RL64(p) 157.162 +# endif 157.163 + 157.164 +# if defined(AV_WN64) && !defined(AV_WL64) 157.165 +# define AV_WL64(p, v) AV_WN64(p, v) 157.166 +# elif !defined(AV_WN64) && defined(AV_WL64) 157.167 +# define AV_WN64(p, v) AV_WL64(p, v) 157.168 +# 
endif 157.169 + 157.170 +#endif /* !HAVE_BIGENDIAN */ 157.171 + 157.172 +/* 157.173 + * Define AV_[RW]N helper macros to simplify definitions not provided 157.174 + * by per-arch headers. 157.175 + */ 157.176 + 157.177 + 157.178 + 157.179 +#if defined(__DECC) 157.180 + 157.181 +# define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p))) 157.182 +# define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v)) 157.183 + 157.184 +#else 157.185 + 157.186 +#ifndef AV_RB16 157.187 +# define AV_RB16(x) \ 157.188 + ((((const uint8_t*)(x))[0] << 8) | \ 157.189 + ((const uint8_t*)(x))[1]) 157.190 +#endif 157.191 +#ifndef AV_WB16 157.192 +# define AV_WB16(p, d) do { \ 157.193 + ((uint8_t*)(p))[1] = (d); \ 157.194 + ((uint8_t*)(p))[0] = (d)>>8; \ 157.195 + } while(0) 157.196 +#endif 157.197 + 157.198 +#ifndef AV_RL16 157.199 +# define AV_RL16(x) \ 157.200 + ((((const uint8_t*)(x))[1] << 8) | \ 157.201 + ((const uint8_t*)(x))[0]) 157.202 +#endif 157.203 +#ifndef AV_WL16 157.204 +# define AV_WL16(p, d) do { \ 157.205 + ((uint8_t*)(p))[0] = (d); \ 157.206 + ((uint8_t*)(p))[1] = (d)>>8; \ 157.207 + } while(0) 157.208 +#endif 157.209 + 157.210 +#ifndef AV_RB32 157.211 +# define AV_RB32(x) \ 157.212 + ((((const uint8_t*)(x))[0] << 24) | \ 157.213 + (((const uint8_t*)(x))[1] << 16) | \ 157.214 + (((const uint8_t*)(x))[2] << 8) | \ 157.215 + ((const uint8_t*)(x))[3]) 157.216 +#endif 157.217 +#ifndef AV_WB32 157.218 +# define AV_WB32(p, d) do { \ 157.219 + ((uint8_t*)(p))[3] = (d); \ 157.220 + ((uint8_t*)(p))[2] = (d)>>8; \ 157.221 + ((uint8_t*)(p))[1] = (d)>>16; \ 157.222 + ((uint8_t*)(p))[0] = (d)>>24; \ 157.223 + } while(0) 157.224 +#endif 157.225 + 157.226 +#ifndef AV_RL32 157.227 +# define AV_RL32(x) \ 157.228 + ((((const uint8_t*)(x))[3] << 24) | \ 157.229 + (((const uint8_t*)(x))[2] << 16) | \ 157.230 + (((const uint8_t*)(x))[1] << 8) | \ 157.231 + ((const uint8_t*)(x))[0]) 157.232 +#endif 157.233 +#ifndef AV_WL32 157.234 +# define AV_WL32(p, d) do { \ 157.235 + 
((uint8_t*)(p))[0] = (d); \ 157.236 + ((uint8_t*)(p))[1] = (d)>>8; \ 157.237 + ((uint8_t*)(p))[2] = (d)>>16; \ 157.238 + ((uint8_t*)(p))[3] = (d)>>24; \ 157.239 + } while(0) 157.240 +#endif 157.241 + 157.242 +#ifndef AV_RB64 157.243 +# define AV_RB64(x) \ 157.244 + (((uint64_t)((const uint8_t*)(x))[0] << 56) | \ 157.245 + ((uint64_t)((const uint8_t*)(x))[1] << 48) | \ 157.246 + ((uint64_t)((const uint8_t*)(x))[2] << 40) | \ 157.247 + ((uint64_t)((const uint8_t*)(x))[3] << 32) | \ 157.248 + ((uint64_t)((const uint8_t*)(x))[4] << 24) | \ 157.249 + ((uint64_t)((const uint8_t*)(x))[5] << 16) | \ 157.250 + ((uint64_t)((const uint8_t*)(x))[6] << 8) | \ 157.251 + (uint64_t)((const uint8_t*)(x))[7]) 157.252 +#endif 157.253 +#ifndef AV_WB64 157.254 +# define AV_WB64(p, d) do { \ 157.255 + ((uint8_t*)(p))[7] = (d); \ 157.256 + ((uint8_t*)(p))[6] = (d)>>8; \ 157.257 + ((uint8_t*)(p))[5] = (d)>>16; \ 157.258 + ((uint8_t*)(p))[4] = (d)>>24; \ 157.259 + ((uint8_t*)(p))[3] = (d)>>32; \ 157.260 + ((uint8_t*)(p))[2] = (d)>>40; \ 157.261 + ((uint8_t*)(p))[1] = (d)>>48; \ 157.262 + ((uint8_t*)(p))[0] = (d)>>56; \ 157.263 + } while(0) 157.264 +#endif 157.265 + 157.266 +#ifndef AV_RL64 157.267 +# define AV_RL64(x) \ 157.268 + (((uint64_t)((const uint8_t*)(x))[7] << 56) | \ 157.269 + ((uint64_t)((const uint8_t*)(x))[6] << 48) | \ 157.270 + ((uint64_t)((const uint8_t*)(x))[5] << 40) | \ 157.271 + ((uint64_t)((const uint8_t*)(x))[4] << 32) | \ 157.272 + ((uint64_t)((const uint8_t*)(x))[3] << 24) | \ 157.273 + ((uint64_t)((const uint8_t*)(x))[2] << 16) | \ 157.274 + ((uint64_t)((const uint8_t*)(x))[1] << 8) | \ 157.275 + (uint64_t)((const uint8_t*)(x))[0]) 157.276 +#endif 157.277 +#ifndef AV_WL64 157.278 +# define AV_WL64(p, d) do { \ 157.279 + ((uint8_t*)(p))[0] = (d); \ 157.280 + ((uint8_t*)(p))[1] = (d)>>8; \ 157.281 + ((uint8_t*)(p))[2] = (d)>>16; \ 157.282 + ((uint8_t*)(p))[3] = (d)>>24; \ 157.283 + ((uint8_t*)(p))[4] = (d)>>32; \ 157.284 + ((uint8_t*)(p))[5] = (d)>>40; \ 157.285 + 
((uint8_t*)(p))[6] = (d)>>48; \ 157.286 + ((uint8_t*)(p))[7] = (d)>>56; \ 157.287 + } while(0) 157.288 +#endif 157.289 + 157.290 +#if HAVE_BIGENDIAN 157.291 +# define AV_RN(s, p) AV_RB##s(p) 157.292 +# define AV_WN(s, p, v) AV_WB##s(p, v) 157.293 +#else 157.294 +# define AV_RN(s, p) AV_RL##s(p) 157.295 +# define AV_WN(s, p, v) AV_WL##s(p, v) 157.296 +#endif 157.297 + 157.298 +#endif /* HAVE_FAST_UNALIGNED */ 157.299 + 157.300 +#ifndef AV_RN16 157.301 +# define AV_RN16(p) AV_RN(16, p) 157.302 +#endif 157.303 + 157.304 +#ifndef AV_RN32 157.305 +# define AV_RN32(p) AV_RN(32, p) 157.306 +#endif 157.307 + 157.308 +#ifndef AV_RN64 157.309 +# define AV_RN64(p) AV_RN(64, p) 157.310 +#endif 157.311 + 157.312 +#ifndef AV_WN16 157.313 +# define AV_WN16(p, v) AV_WN(16, p, v) 157.314 +#endif 157.315 + 157.316 +#ifndef AV_WN32 157.317 +# define AV_WN32(p, v) AV_WN(32, p, v) 157.318 +#endif 157.319 + 157.320 +#ifndef AV_WN64 157.321 +# define AV_WN64(p, v) AV_WN(64, p, v) 157.322 +#endif 157.323 + 157.324 +#if HAVE_BIGENDIAN 157.325 +# define AV_RB(s, p) AV_RN##s(p) 157.326 +# define AV_WB(s, p, v) AV_WN##s(p, v) 157.327 +# define AV_RL(s, p) bswap_##s(AV_RN##s(p)) 157.328 +# define AV_WL(s, p, v) AV_WN##s(p, bswap_##s(v)) 157.329 +#else 157.330 +# define AV_RB(s, p) bswap_##s(AV_RN##s(p)) 157.331 +# define AV_WB(s, p, v) AV_WN##s(p, bswap_##s(v)) 157.332 +# define AV_RL(s, p) AV_RN##s(p) 157.333 +# define AV_WL(s, p, v) AV_WN##s(p, v) 157.334 +#endif 157.335 + 157.336 +#define AV_RB8(x) (((const uint8_t*)(x))[0]) 157.337 +#define AV_WB8(p, d) do { ((uint8_t*)(p))[0] = (d); } while(0) 157.338 + 157.339 +#define AV_RL8(x) AV_RB8(x) 157.340 +#define AV_WL8(p, d) AV_WB8(p, d) 157.341 + 157.342 +#ifndef AV_RB16 157.343 +# define AV_RB16(p) AV_RB(16, p) 157.344 +#endif 157.345 +#ifndef AV_WB16 157.346 +# define AV_WB16(p, v) AV_WB(16, p, v) 157.347 +#endif 157.348 + 157.349 +#ifndef AV_RL16 157.350 +# define AV_RL16(p) AV_RL(16, p) 157.351 +#endif 157.352 +#ifndef AV_WL16 157.353 +# 
define AV_WL16(p, v) AV_WL(16, p, v) 157.354 +#endif 157.355 + 157.356 +#ifndef AV_RB32 157.357 +# define AV_RB32(p) AV_RB(32, p) 157.358 +#endif 157.359 +#ifndef AV_WB32 157.360 +# define AV_WB32(p, v) AV_WB(32, p, v) 157.361 +#endif 157.362 + 157.363 +#ifndef AV_RL32 157.364 +# define AV_RL32(p) AV_RL(32, p) 157.365 +#endif 157.366 +#ifndef AV_WL32 157.367 +# define AV_WL32(p, v) AV_WL(32, p, v) 157.368 +#endif 157.369 + 157.370 +#ifndef AV_RB64 157.371 +# define AV_RB64(p) AV_RB(64, p) 157.372 +#endif 157.373 +#ifndef AV_WB64 157.374 +# define AV_WB64(p, v) AV_WB(64, p, v) 157.375 +#endif 157.376 + 157.377 +#ifndef AV_RL64 157.378 +# define AV_RL64(p) AV_RL(64, p) 157.379 +#endif 157.380 +#ifndef AV_WL64 157.381 +# define AV_WL64(p, v) AV_WL(64, p, v) 157.382 +#endif 157.383 + 157.384 +#ifndef AV_RB24 157.385 +# define AV_RB24(x) \ 157.386 + ((((const uint8_t*)(x))[0] << 16) | \ 157.387 + (((const uint8_t*)(x))[1] << 8) | \ 157.388 + ((const uint8_t*)(x))[2]) 157.389 +#endif 157.390 +#ifndef AV_WB24 157.391 +# define AV_WB24(p, d) do { \ 157.392 + ((uint8_t*)(p))[2] = (d); \ 157.393 + ((uint8_t*)(p))[1] = (d)>>8; \ 157.394 + ((uint8_t*)(p))[0] = (d)>>16; \ 157.395 + } while(0) 157.396 +#endif 157.397 + 157.398 +#ifndef AV_RL24 157.399 +# define AV_RL24(x) \ 157.400 + ((((const uint8_t*)(x))[2] << 16) | \ 157.401 + (((const uint8_t*)(x))[1] << 8) | \ 157.402 + ((const uint8_t*)(x))[0]) 157.403 +#endif 157.404 +#ifndef AV_WL24 157.405 +# define AV_WL24(p, d) do { \ 157.406 + ((uint8_t*)(p))[0] = (d); \ 157.407 + ((uint8_t*)(p))[1] = (d)>>8; \ 157.408 + ((uint8_t*)(p))[2] = (d)>>16; \ 157.409 + } while(0) 157.410 +#endif 157.411 + 157.412 +/* 157.413 + * The AV_[RW]NA macros access naturally aligned data 157.414 + * in a type-safe way. 
157.415 + */ 157.416 + 157.417 +#define AV_RNA(s, p) (((const av_alias##s*)(p))->u##s) 157.418 +#define AV_WNA(s, p, v) (((av_alias##s*)(p))->u##s = (v)) 157.419 + 157.420 +#ifndef AV_RN16A 157.421 +# define AV_RN16A(p) AV_RNA(16, p) 157.422 +#endif 157.423 + 157.424 +#ifndef AV_RN32A 157.425 +# define AV_RN32A(p) AV_RNA(32, p) 157.426 +#endif 157.427 + 157.428 +#ifndef AV_RN64A 157.429 +# define AV_RN64A(p) AV_RNA(64, p) 157.430 +#endif 157.431 + 157.432 +#ifndef AV_WN16A 157.433 +# define AV_WN16A(p, v) AV_WNA(16, p, v) 157.434 +#endif 157.435 + 157.436 +#ifndef AV_WN32A 157.437 +# define AV_WN32A(p, v) AV_WNA(32, p, v) 157.438 +#endif 157.439 + 157.440 +#ifndef AV_WN64A 157.441 +# define AV_WN64A(p, v) AV_WNA(64, p, v) 157.442 +#endif 157.443 + 157.444 +/* Parameters for AV_COPY*, AV_SWAP*, AV_ZERO* must be 157.445 + * naturally aligned. They may be implemented using MMX, 157.446 + * so emms_c() must be called before using any float code 157.447 + * afterwards. 157.448 + */ 157.449 + 157.450 +#define AV_COPY(n, d, s) \ 157.451 + (((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n) 157.452 + 157.453 +#ifndef AV_COPY16 157.454 +# define AV_COPY16(d, s) AV_COPY(16, d, s) 157.455 +#endif 157.456 + 157.457 +#ifndef AV_COPY32 157.458 +# define AV_COPY32(d, s) AV_COPY(32, d, s) 157.459 +#endif 157.460 + 157.461 +#ifndef AV_COPY64 157.462 +# define AV_COPY64(d, s) AV_COPY(64, d, s) 157.463 +#endif 157.464 + 157.465 +#ifndef AV_COPY128 157.466 +# define AV_COPY128(d, s) \ 157.467 + do { \ 157.468 + AV_COPY64(d, s); \ 157.469 + AV_COPY64((char*)(d)+8, (char*)(s)+8); \ 157.470 + } while(0) 157.471 +#endif 157.472 + 157.473 +#define AV_SWAP(n, a, b) FFSWAP(av_alias##n, *(av_alias##n*)(a), *(av_alias##n*)(b)) 157.474 + 157.475 +#ifndef AV_SWAP64 157.476 +# define AV_SWAP64(a, b) AV_SWAP(64, a, b) 157.477 +#endif 157.478 + 157.479 +#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0) 157.480 + 157.481 +#ifndef AV_ZERO16 157.482 +# define AV_ZERO16(d) AV_ZERO(16, d) 
157.483 +#endif 157.484 + 157.485 +#ifndef AV_ZERO32 157.486 +# define AV_ZERO32(d) AV_ZERO(32, d) 157.487 +#endif 157.488 + 157.489 +#ifndef AV_ZERO64 157.490 +# define AV_ZERO64(d) AV_ZERO(64, d) 157.491 +#endif 157.492 + 157.493 +#ifndef AV_ZERO128 157.494 +# define AV_ZERO128(d) \ 157.495 + do { \ 157.496 + AV_ZERO64(d); \ 157.497 + AV_ZERO64((char*)(d)+8); \ 157.498 + } while(0) 157.499 +#endif 157.500 + 157.501 +#endif /* AVUTIL_INTREADWRITE_H */
158.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 158.2 +++ b/ffmpeg_smp/h264dec/libavutil/log.c Mon Aug 27 12:09:56 2012 +0200 158.3 @@ -0,0 +1,111 @@ 158.4 +/* 158.5 + * log functions 158.6 + * Copyright (c) 2003 Michel Bardiaux 158.7 + * 158.8 + * This file is part of FFmpeg. 158.9 + * 158.10 + * FFmpeg is free software; you can redistribute it and/or 158.11 + * modify it under the terms of the GNU Lesser General Public 158.12 + * License as published by the Free Software Foundation; either 158.13 + * version 2.1 of the License, or (at your option) any later version. 158.14 + * 158.15 + * FFmpeg is distributed in the hope that it will be useful, 158.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 158.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 158.18 + * Lesser General Public License for more details. 158.19 + * 158.20 + * You should have received a copy of the GNU Lesser General Public 158.21 + * License along with FFmpeg; if not, write to the Free Software 158.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 158.23 + */ 158.24 + 158.25 +/** 158.26 + * @file 158.27 + * logging functions 158.28 + */ 158.29 +#include "error.h" 158.30 +#include <unistd.h> 158.31 +#include <stdlib.h> 158.32 +#include "log.h" 158.33 + 158.34 + 158.35 +static int av_log_level = AV_LOG_INFO; 158.36 + 158.37 +static int use_ansi_color=-1; 158.38 + 158.39 +#undef fprintf 158.40 +static void colored_fputs(int color, const char *str){ 158.41 + if(use_ansi_color<0){ 158.42 +#if HAVE_ISATTY && !defined(_WIN32) 158.43 + use_ansi_color= getenv("TERM") && !getenv("NO_COLOR") && isatty(2); 158.44 +#else 158.45 + use_ansi_color= 0; 158.46 +#endif 158.47 + } 158.48 + 158.49 + if(use_ansi_color){ 158.50 + fprintf(stderr, "\033[%d;3%dm", color>>4, color&15); 158.51 + } 158.52 + fputs(str, stderr); 158.53 + if(use_ansi_color){ 158.54 + fprintf(stderr, "\033[0m"); 158.55 + } 158.56 +} 158.57 + 158.58 +void 
av_log_default_callback(int level, const char* fmt, va_list vl) 158.59 +{ 158.60 + static int print_prefix=1; 158.61 + static int count; 158.62 + static char line[1024], prev[1024]; 158.63 + static const uint8_t color[]={0x41,0x41,0x11,0x03,9,9,9}; 158.64 + 158.65 + if(level>av_log_level) 158.66 + return; 158.67 +#undef fprintf 158.68 + 158.69 + line[0]=0; 158.70 + 158.71 + vsnprintf(line + strlen(line), sizeof(line) - strlen(line), fmt, vl); 158.72 + 158.73 + print_prefix= line[strlen(line)-1] == '\n'; 158.74 + if(print_prefix && !strcmp(line, prev)){ 158.75 + count++; 158.76 + return; 158.77 + } 158.78 + if(count>0){ 158.79 + fprintf(stderr, " Last message repeated %d times\n", count); 158.80 + count=0; 158.81 + } 158.82 + colored_fputs(color[av_clip(level>>3, 0, 6)], line); 158.83 + strcpy(prev, line); 158.84 +} 158.85 + 158.86 +static void (*av_log_callback)(int, const char*, va_list) = av_log_default_callback; 158.87 + 158.88 +void av_log(int level, const char *fmt, ...) 158.89 +{ 158.90 + va_list vl; 158.91 + va_start(vl, fmt); 158.92 + av_vlog(level, fmt, vl); 158.93 + va_end(vl); 158.94 +} 158.95 + 158.96 +void av_vlog(int level, const char *fmt, va_list vl) 158.97 +{ 158.98 + av_log_callback(level, fmt, vl); 158.99 +} 158.100 + 158.101 +int av_log_get_level(void) 158.102 +{ 158.103 + return av_log_level; 158.104 +} 158.105 + 158.106 +void av_log_set_level(int level) 158.107 +{ 158.108 + av_log_level = level; 158.109 +} 158.110 + 158.111 +void av_log_set_callback(void (*callback)(int, const char*, va_list)) 158.112 +{ 158.113 + av_log_callback = callback; 158.114 +}
/*
 * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_LOG_H
#define AVUTIL_LOG_H

#include <stdarg.h>
//#include "avutil.h"

/**
 * Describes the class of an AVClass context structure. That is an
 * arbitrary struct of which the first field is a pointer to an
 * AVClass struct (e.g. AVCodecContext, AVFormatContext etc.).
 *
 * @note None of the logging functions declared in this header takes a
 *       context pointer, so they do not use this structure.
 */
typedef struct {
    /**
     * The name of the class; usually it is the same name as the
     * context structure type to which the AVClass is associated.
     */
    const char* class_name;

    /**
     * A pointer to a function which returns the name of a context
     * instance ctx associated with the class.
     */
    const char* (*item_name)(void* ctx);

    /**
     * a pointer to the first option specified in the class if any or NULL
     *
     * @see av_set_default_options()
     */
    const struct AVOption *option;

    /**
     * LIBAVUTIL_VERSION with which this structure was created.
     * This is used to allow fields to be added without requiring major
     * version bumps everywhere.
     */
    int version;
} AVClass;

/* av_log API */

/**
 * Lower than every message level defined below.
 */
#define AV_LOG_QUIET    -8

/**
 * Something went really wrong and we will crash now.
 */
#define AV_LOG_PANIC     0

/**
 * Something went wrong and recovery is not possible.
 * For example, no header was found for a format which depends
 * on headers or an illegal combination of parameters is used.
 */
#define AV_LOG_FATAL     8

/**
 * Something went wrong and cannot losslessly be recovered.
 * However, not all future data is affected.
 */
#define AV_LOG_ERROR    16

/**
 * Something somehow does not look correct. This may or may not
 * lead to problems. An example would be the use of '-vstrict -2'.
 */
#define AV_LOG_WARNING  24

#define AV_LOG_INFO     32
#define AV_LOG_VERBOSE  40

/**
 * Stuff which is only useful for libav* developers.
 */
#define AV_LOG_DEBUG    48

/**
 * Sends the specified message to the log if the level is less than or equal
 * to the current av_log_level. By default, all logging messages are sent to
 * stderr. This behavior can be altered by setting a different av_vlog callback
 * function.
 *
 * Unlike the function of the same name in stock FFmpeg, this variant takes
 * no context pointer: the level is the first argument.
 *
 * @param level The importance level of the message, lower values signifying
 * higher importance.
 * @param fmt The format string (printf-compatible) that specifies how
 * subsequent arguments are converted to output.
 * @see av_vlog
 */
void av_log(int level, const char *fmt, ...);

/**
 * Same as av_log(), but takes the format arguments as a va_list.
 *
 * @param level The importance level of the message.
 * @param fmt   The printf-compatible format string.
 * @param vl    The arguments referenced by fmt.
 */
void av_vlog(int level, const char *fmt, va_list vl);

/**
 * Returns the current log level threshold.
 */
int av_log_get_level(void);

/**
 * Sets the log level threshold.
 *
 * @param level The new threshold, one of the AV_LOG_* values.
 */
void av_log_set_level(int level);

/**
 * Installs the function that receives every message passed to
 * av_log() / av_vlog().
 *
 * @param callback The new log sink; it receives the level, the format
 *                 string and the va_list.
 */
void av_log_set_callback(void (*callback)(int, const char*, va_list));

/**
 * The log sink that is installed by default.
 *
 * @param level The importance level of the message.
 * @param fmt   The printf-compatible format string.
 * @param vl    The arguments referenced by fmt.
 */
void av_log_default_callback(int level, const char* fmt, va_list vl);

#endif /* AVUTIL_LOG_H */
160.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 160.2 +++ b/ffmpeg_smp/h264dec/libavutil/mem.c Mon Aug 27 12:09:56 2012 +0200 160.3 @@ -0,0 +1,127 @@ 160.4 +/* 160.5 + * default memory allocator for libavutil 160.6 + * Copyright (c) 2002 Fabrice Bellard 160.7 + * 160.8 + * This file is part of FFmpeg. 160.9 + * 160.10 + * FFmpeg is free software; you can redistribute it and/or 160.11 + * modify it under the terms of the GNU Lesser General Public 160.12 + * License as published by the Free Software Foundation; either 160.13 + * version 2.1 of the License, or (at your option) any later version. 160.14 + * 160.15 + * FFmpeg is distributed in the hope that it will be useful, 160.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 160.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 160.18 + * Lesser General Public License for more details. 160.19 + * 160.20 + * You should have received a copy of the GNU Lesser General Public 160.21 + * License along with FFmpeg; if not, write to the Free Software 160.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 160.23 + */ 160.24 + 160.25 +/** 160.26 + * @file 160.27 + * default memory allocator for libavutil 160.28 + */ 160.29 + 160.30 +#include "config.h" 160.31 + 160.32 +#include <limits.h> 160.33 +#include <stdlib.h> 160.34 +#include <stdint.h> 160.35 +#include <string.h> 160.36 +#if HAVE_MALLOC_H 160.37 +#include <malloc.h> 160.38 +#endif 160.39 + 160.40 +#include "mem.h" 160.41 + 160.42 +/* here we can use OS-dependent allocation functions */ 160.43 +#undef free 160.44 +#undef malloc 160.45 +#undef realloc 160.46 + 160.47 +#ifdef MALLOC_PREFIX 160.48 + 160.49 +#define malloc AV_JOIN(MALLOC_PREFIX, malloc) 160.50 +#define memalign AV_JOIN(MALLOC_PREFIX, memalign) 160.51 +#define posix_memalign AV_JOIN(MALLOC_PREFIX, posix_memalign) 160.52 +#define realloc AV_JOIN(MALLOC_PREFIX, realloc) 160.53 +#define free AV_JOIN(MALLOC_PREFIX, free) 160.54 + 
160.55 +void *malloc(size_t size); 160.56 +void *memalign(size_t align, size_t size); 160.57 +int posix_memalign(void **ptr, size_t align, size_t size); 160.58 +void *realloc(void *ptr, size_t size); 160.59 +void free(void *ptr); 160.60 + 160.61 +#endif /* MALLOC_PREFIX */ 160.62 + 160.63 + 160.64 +/* You can redefine av_malloc and av_free in your project to use your 160.65 + memory allocator. You do not need to suppress this file because the 160.66 + linker will do it automatically. */ 160.67 + 160.68 +void *av_malloc(unsigned int size) 160.69 +{ 160.70 + void *ptr = NULL; 160.71 + /* let's disallow possible ambiguous cases */ 160.72 + if(size > (INT_MAX-16) ) 160.73 + return NULL; 160.74 + 160.75 +//FIXME: when no aligned mallocs vector code should be disabled. 160.76 +#if HAVE_POSIX_MEMALIGN 160.77 + if (posix_memalign(&ptr,16,size)) 160.78 + ptr = NULL; 160.79 +#elif HAVE_MEMALIGN 160.80 + ptr = memalign(16,size); 160.81 +#else 160.82 + ptr = malloc(size); 160.83 +#endif 160.84 + return ptr; 160.85 +} 160.86 + 160.87 +void *av_realloc(void *ptr, unsigned int size) 160.88 +{ 160.89 + /* let's disallow possible ambiguous cases */ 160.90 + if(size > (INT_MAX-16) ) 160.91 + return NULL; 160.92 + 160.93 + return realloc(ptr, size); 160.94 + 160.95 +} 160.96 + 160.97 +void av_free(void *ptr) 160.98 +{ 160.99 + /* XXX: this test should not be needed on most libcs */ 160.100 + if (ptr) 160.101 + free(ptr); 160.102 + 160.103 +} 160.104 + 160.105 +void av_freep(void *arg) 160.106 +{ 160.107 + void **ptr= (void**)arg; 160.108 + av_free(*ptr); 160.109 + *ptr = NULL; 160.110 +} 160.111 + 160.112 +void *av_mallocz(unsigned int size) 160.113 +{ 160.114 + void *ptr = av_malloc(size); 160.115 + if (ptr) 160.116 + memset(ptr, 0, size); 160.117 + return ptr; 160.118 +} 160.119 + 160.120 +char *av_strdup(const char *s) 160.121 +{ 160.122 + char *ptr= NULL; 160.123 + if(s){ 160.124 + int len = strlen(s) + 1; 160.125 + ptr = av_malloc(len); 160.126 + if (ptr) 160.127 + memcpy(ptr, 
s, len); 160.128 + } 160.129 + return ptr; 160.130 +}
161.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 161.2 +++ b/ffmpeg_smp/h264dec/libavutil/mem.h Mon Aug 27 12:09:56 2012 +0200 161.3 @@ -0,0 +1,143 @@ 161.4 +/* 161.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> 161.6 + * 161.7 + * This file is part of FFmpeg. 161.8 + * 161.9 + * FFmpeg is free software; you can redistribute it and/or 161.10 + * modify it under the terms of the GNU Lesser General Public 161.11 + * License as published by the Free Software Foundation; either 161.12 + * version 2.1 of the License, or (at your option) any later version. 161.13 + * 161.14 + * FFmpeg is distributed in the hope that it will be useful, 161.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 161.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 161.17 + * Lesser General Public License for more details. 161.18 + * 161.19 + * You should have received a copy of the GNU Lesser General Public 161.20 + * License along with FFmpeg; if not, write to the Free Software 161.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 161.22 + */ 161.23 + 161.24 +/** 161.25 + * @file 161.26 + * memory handling functions 161.27 + */ 161.28 + 161.29 +#ifndef AVUTIL_MEM_H 161.30 +#define AVUTIL_MEM_H 161.31 + 161.32 +#include "attributes.h" 161.33 +#include "config.h" 161.34 + 161.35 +#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v 161.36 +#define DECLARE_ALIGNED_16(t,v) t __attribute__ ((aligned (16))) v 161.37 +#define DECLARE_ASM_CONST(n,t,v) static const t __attribute__((used)) __attribute__ ((aligned (n))) v 161.38 + 161.39 +#if AV_GCC_VERSION_AT_LEAST(3,1) 161.40 + #define av_malloc_attrib __attribute__((__malloc__)) 161.41 +#else 161.42 + #define av_malloc_attrib 161.43 +#endif 161.44 + 161.45 +/** 161.46 + * Allocates a block of size bytes with alignment suitable for all 161.47 + * memory accesses (including vectors if available on the CPU). 
161.48 + * @param size Size in bytes for the memory block to be allocated. 161.49 + * @return Pointer to the allocated block, NULL if the block cannot 161.50 + * be allocated. 161.51 + * @see av_mallocz() 161.52 + */ 161.53 +void *av_malloc(unsigned int size) av_malloc_attrib; 161.54 + 161.55 +/** 161.56 + * Allocates or reallocates a block of memory. 161.57 + * If ptr is NULL and size > 0, allocates a new block. If 161.58 + * size is zero, frees the memory block pointed to by ptr. 161.59 + * @param size Size in bytes for the memory block to be allocated or 161.60 + * reallocated. 161.61 + * @param ptr Pointer to a memory block already allocated with 161.62 + * av_malloc(z)() or av_realloc() or NULL. 161.63 + * @return Pointer to a newly reallocated block or NULL if the block 161.64 + * cannot be reallocated or the function is used to free the memory block. 161.65 + * @see av_fast_realloc() 161.66 + */ 161.67 +void *av_realloc(void *ptr, unsigned int size); 161.68 + 161.69 +/** 161.70 + * Reallocates the given block if it is not large enough, otherwise it 161.71 + * does nothing. 161.72 + * 161.73 + * @see av_realloc 161.74 + */ 161.75 +void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size); 161.76 + 161.77 +/** 161.78 + * Allocates a buffer, reusing the given one if large enough. 161.79 + * 161.80 + * Contrary to av_fast_realloc the current buffer contents might not be 161.81 + * preserved and on error the old buffer is freed, thus no special 161.82 + * handling to avoid memleaks is necessary. 161.83 + * 161.84 + * @param ptr pointer to pointer to already allocated buffer, overwritten with pointer to new buffer 161.85 + * @param size size of the buffer *ptr points to 161.86 + * @param min_size minimum size of *ptr buffer after returning, *ptr will be NULL and 161.87 + * *size 0 if an error occurred. 
161.88 + */ 161.89 +void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size); 161.90 + 161.91 +/** 161.92 + * Frees a memory block which has been allocated with av_malloc(z)() or 161.93 + * av_realloc(). 161.94 + * @param ptr Pointer to the memory block which should be freed. 161.95 + * @note ptr = NULL is explicitly allowed. 161.96 + * @note It is recommended that you use av_freep() instead. 161.97 + * @see av_freep() 161.98 + */ 161.99 + 161.100 +void av_free(void *ptr); 161.101 + 161.102 +/** 161.103 + * Allocates a block of size bytes with alignment suitable for all 161.104 + * memory accesses (including vectors if available on the CPU) and 161.105 + * zeroes all the bytes of the block. 161.106 + * @param size Size in bytes for the memory block to be allocated. 161.107 + * @return Pointer to the allocated block, NULL if it cannot be allocated. 161.108 + * @see av_malloc() 161.109 + */ 161.110 +void *av_mallocz(unsigned int size) av_malloc_attrib; 161.111 + 161.112 +/** 161.113 + * Duplicates the string s. 161.114 + * @param s string to be duplicated 161.115 + * @return Pointer to a newly allocated string containing a 161.116 + * copy of s or NULL if the string cannot be allocated. 161.117 + */ 161.118 +char *av_strdup(const char *s) av_malloc_attrib; 161.119 + 161.120 +/** 161.121 + * Frees a memory block which has been allocated with av_malloc(z)() or 161.122 + * av_realloc() and set the pointer pointing to it to NULL. 161.123 + * @param ptr Pointer to the pointer to the memory block which should 161.124 + * be freed. 
161.125 + * @see av_free() 161.126 + */ 161.127 +void av_freep(void *ptr); 161.128 + 161.129 + 161.130 +static av_always_inline uint32_t pack16to32(int a, int b){ 161.131 +#if HAVE_BIGENDIAN 161.132 + return (b&0xFFFF) + (a<<16); 161.133 +#else 161.134 + return (a&0xFFFF) + (b<<16); 161.135 +#endif 161.136 +} 161.137 + 161.138 +static av_always_inline uint16_t pack8to16(int a, int b){ 161.139 +#if HAVE_BIGENDIAN 161.140 + return (b&0xFF) + (a<<8); 161.141 +#else 161.142 + return (a&0xFF) + (b<<8); 161.143 +#endif 161.144 +} 161.145 + 161.146 +#endif /* AVUTIL_MEM_H */
/* File: ffmpeg_smp/h264dec/libavutil/pixfmt.h */
/*
 * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_PIXFMT_H
#define AVUTIL_PIXFMT_H

/**
 * @file
 * pixel format definitions
 *
 * @warning This file has to be considered an internal but installed
 * header, so it should not be directly included in your projects.
 */

/**
 * Pixel format. Notes:
 *
 * PIX_FMT_RGB32 is handled in an endian-specific manner. An RGBA
 * color is put together as:
 *  (A << 24) | (R << 16) | (G << 8) | B
 * This is stored as BGRA on little-endian CPU architectures and ARGB on
 * big-endian CPUs.
 *
 * When the pixel format is palettized RGB (PIX_FMT_PAL8), the palettized
 * image data is stored in AVFrame.data[0]. The palette is transported in
 * AVFrame.data[1], is 1024 bytes long (256 4-byte entries) and is
 * formatted the same as in PIX_FMT_RGB32 described above (i.e., it is
 * also endian-specific). Note also that the individual RGB palette
 * components stored in AVFrame.data[1] should be in the range 0..255.
 * This is important as many custom PAL8 video codecs that were designed
 * to run on the IBM VGA graphics adapter use 6-bit palette components.
 *
 * For all the 8bit per pixel formats, an RGB32 palette is in data[1] like
 * for pal8. This palette is filled in automatically by the function
 * allocating the picture.
 *
 * Note, make sure that all newly added big endian formats have pix_fmt&1==1
 * and that all newly added little endian formats have pix_fmt&1==0
 * this allows simpler detection of big vs little endian.
 */
enum PixelFormat {
    PIX_FMT_NONE= -1,
    PIX_FMT_YUV420P,   ///< planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
    PIX_FMT_YUYV422,   ///< packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
    PIX_FMT_RGB24,     ///< packed RGB 8:8:8, 24bpp, RGBRGB...
    PIX_FMT_BGR24,     ///< packed RGB 8:8:8, 24bpp, BGRBGR...
    PIX_FMT_YUV422P,   ///< planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
    PIX_FMT_YUV444P,   ///< planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples)
    PIX_FMT_YUV410P,   ///< planar YUV 4:1:0,  9bpp, (1 Cr & Cb sample per 4x4 Y samples)
    PIX_FMT_YUV411P,   ///< planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples)
    PIX_FMT_GRAY8,     ///<        Y        ,  8bpp
    PIX_FMT_MONOWHITE, ///<        Y        ,  1bpp, 0 is white, 1 is black
    PIX_FMT_MONOBLACK, ///<        Y        ,  1bpp, 0 is black, 1 is white
    PIX_FMT_PAL8,      ///< 8 bit with PIX_FMT_RGB32 palette
    PIX_FMT_YUVJ420P,  ///< planar YUV 4:2:0, 12bpp, full scale (JPEG)
    PIX_FMT_YUVJ422P,  ///< planar YUV 4:2:2, 16bpp, full scale (JPEG)
    PIX_FMT_YUVJ444P,  ///< planar YUV 4:4:4, 24bpp, full scale (JPEG)
    PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing
    PIX_FMT_XVMC_MPEG2_IDCT,
    PIX_FMT_UYVY422,   ///< packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
    PIX_FMT_UYYVYY411, ///< packed YUV 4:1:1, 12bpp, Cb Y0 Y1 Cr Y2 Y3
    PIX_FMT_BGR8,      ///< packed RGB 3:3:2,  8bpp, (msb)2B 3G 3R(lsb)
    PIX_FMT_BGR4,      ///< packed RGB 1:2:1,  4bpp, (msb)1B 2G 1R(lsb)
    PIX_FMT_BGR4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1B 2G 1R(lsb)
    PIX_FMT_RGB8,      ///< packed RGB 3:3:2,  8bpp, (msb)2R 3G 3B(lsb)
    PIX_FMT_RGB4,      ///< packed RGB 1:2:1,  4bpp, (msb)1R 2G 1B(lsb)
    PIX_FMT_RGB4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1R 2G 1B(lsb)
    PIX_FMT_NV12,      ///< planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 for UV
    PIX_FMT_NV21,      ///< as above, but U and V bytes are swapped

    PIX_FMT_ARGB,      ///< packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
    PIX_FMT_RGBA,      ///< packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
    PIX_FMT_ABGR,      ///< packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
    PIX_FMT_BGRA,      ///< packed BGRA 8:8:8:8, 32bpp, BGRABGRA...

    PIX_FMT_GRAY16BE,  ///<        Y        , 16bpp, big-endian
    PIX_FMT_GRAY16LE,  ///<        Y        , 16bpp, little-endian
    PIX_FMT_YUV440P,   ///< planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples)
    PIX_FMT_YUVJ440P,  ///< planar YUV 4:4:0 full scale (JPEG)
    PIX_FMT_YUVA420P,  ///< planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples)
    PIX_FMT_VDPAU_H264,///< H.264 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
    PIX_FMT_VDPAU_MPEG1,///< MPEG-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
    PIX_FMT_VDPAU_MPEG2,///< MPEG-2 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
    PIX_FMT_VDPAU_WMV3,///< WMV3 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
    PIX_FMT_VDPAU_VC1, ///< VC-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
    PIX_FMT_RGB48BE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, big-endian
    PIX_FMT_RGB48LE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, little-endian

    PIX_FMT_RGB565BE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), big-endian
    PIX_FMT_RGB565LE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), little-endian
    PIX_FMT_RGB555BE,  ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), big-endian, most significant bit to 0
    PIX_FMT_RGB555LE,  ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), little-endian, most significant bit to 0

    PIX_FMT_BGR565BE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), big-endian
    PIX_FMT_BGR565LE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), little-endian
    PIX_FMT_BGR555BE,  ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), big-endian, most significant bit to 1
    PIX_FMT_BGR555LE,  ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), little-endian, most significant bit to 1

    PIX_FMT_VAAPI_MOCO, ///< HW acceleration through VA API at motion compensation entry-point, Picture.data[3] contains a vaapi_render_state struct which contains macroblocks as well as various fields extracted from headers
    PIX_FMT_VAAPI_IDCT, ///< HW acceleration through VA API at IDCT entry-point, Picture.data[3] contains a vaapi_render_state struct which contains fields extracted from headers
    PIX_FMT_VAAPI_VLD,  ///< HW decoding through VA API, Picture.data[3] contains a vaapi_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers

    PIX_FMT_YUV420P16LE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
    PIX_FMT_YUV420P16BE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
    PIX_FMT_YUV422P16LE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
    PIX_FMT_YUV422P16BE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
    PIX_FMT_YUV444P16LE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
    PIX_FMT_YUV444P16BE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
    PIX_FMT_VDPAU_MPEG4,  ///< MPEG4 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
    PIX_FMT_DXVA2_VLD,    ///< HW decoding through DXVA2, Picture.data[3] contains a LPDIRECT3DSURFACE9 pointer

    PIX_FMT_RGB444BE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), big-endian, most significant bits to 0
    PIX_FMT_RGB444LE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), little-endian, most significant bits to 0
    PIX_FMT_BGR444BE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), big-endian, most significant bits to 1
    PIX_FMT_BGR444LE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), little-endian, most significant bits to 1
    PIX_FMT_Y400A,     ///< 8bit gray, 8bit alpha
    PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
};

/*
 * PIX_FMT_NE(be, le) expands to the PIX_FMT_ constant matching the native
 * byte order: PIX_FMT_<be> when HAVE_BIGENDIAN is non-zero, PIX_FMT_<le>
 * otherwise.
 *
 * NOTE(review): this header includes nothing, so HAVE_BIGENDIAN has to be
 * defined by whatever was included before it (presumably config.h - confirm).
 * An undefined macro evaluates to 0 inside #if, which silently selects the
 * little-endian aliases.
 */
#if HAVE_BIGENDIAN
#   define PIX_FMT_NE(be, le) PIX_FMT_##be
#else
#   define PIX_FMT_NE(be, le) PIX_FMT_##le
#endif

/* Native-endian aliases for the packed 32-bit RGB layouts. */
#define PIX_FMT_RGB32   PIX_FMT_NE(ARGB, BGRA)
#define PIX_FMT_RGB32_1 PIX_FMT_NE(RGBA, ABGR)
#define PIX_FMT_BGR32   PIX_FMT_NE(ABGR, RGBA)
#define PIX_FMT_BGR32_1 PIX_FMT_NE(BGRA, ARGB)

/* Native-endian aliases for the formats that exist in explicit BE/LE variants. */
#define PIX_FMT_GRAY16 PIX_FMT_NE(GRAY16BE, GRAY16LE)
#define PIX_FMT_RGB48  PIX_FMT_NE(RGB48BE,  RGB48LE)
#define PIX_FMT_RGB565 PIX_FMT_NE(RGB565BE, RGB565LE)
#define PIX_FMT_RGB555 PIX_FMT_NE(RGB555BE, RGB555LE)
#define PIX_FMT_RGB444 PIX_FMT_NE(RGB444BE, RGB444LE)
#define PIX_FMT_BGR565 PIX_FMT_NE(BGR565BE, BGR565LE)
#define PIX_FMT_BGR555 PIX_FMT_NE(BGR555BE, BGR555LE)
#define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE)

#define PIX_FMT_YUV420P16 PIX_FMT_NE(YUV420P16BE, YUV420P16LE)
#define PIX_FMT_YUV422P16 PIX_FMT_NE(YUV422P16BE, YUV422P16LE)
#define PIX_FMT_YUV444P16 PIX_FMT_NE(YUV444P16BE, YUV444P16LE)

#endif /* AVUTIL_PIXFMT_H */
163.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 163.2 +++ b/ffmpeg_smp/h264dec/libavutil/ppc/intreadwrite.h Mon Aug 27 12:09:56 2012 +0200 163.3 @@ -0,0 +1,108 @@ 163.4 +/* 163.5 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> 163.6 + * 163.7 + * This file is part of FFmpeg. 163.8 + * 163.9 + * FFmpeg is free software; you can redistribute it and/or 163.10 + * modify it under the terms of the GNU Lesser General Public 163.11 + * License as published by the Free Software Foundation; either 163.12 + * version 2.1 of the License, or (at your option) any later version. 163.13 + * 163.14 + * FFmpeg is distributed in the hope that it will be useful, 163.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 163.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 163.17 + * Lesser General Public License for more details. 163.18 + * 163.19 + * You should have received a copy of the GNU Lesser General Public 163.20 + * License along with FFmpeg; if not, write to the Free Software 163.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 163.22 + */ 163.23 + 163.24 +#ifndef AVUTIL_PPC_INTREADWRITE_H 163.25 +#define AVUTIL_PPC_INTREADWRITE_H 163.26 + 163.27 +#include <stdint.h> 163.28 +#include "config.h" 163.29 + 163.30 +#if HAVE_XFORM_ASM 163.31 + 163.32 +#define AV_RL16 AV_RL16 163.33 +static av_always_inline uint16_t AV_RL16(const void *p) 163.34 +{ 163.35 + uint16_t v; 163.36 + __asm__ ("lhbrx %0, %y1" : "=r"(v) : "Z"(*(const uint16_t*)p)); 163.37 + return v; 163.38 +} 163.39 + 163.40 +#define AV_WL16 AV_WL16 163.41 +static av_always_inline void AV_WL16(void *p, uint16_t v) 163.42 +{ 163.43 + __asm__ ("sthbrx %1, %y0" : "=Z"(*(uint16_t*)p) : "r"(v)); 163.44 +} 163.45 + 163.46 +#define AV_RL32 AV_RL32 163.47 +static av_always_inline uint32_t AV_RL32(const void *p) 163.48 +{ 163.49 + uint32_t v; 163.50 + __asm__ ("lwbrx %0, %y1" : "=r"(v) : "Z"(*(const uint32_t*)p)); 163.51 + return v; 163.52 +} 
163.53 + 163.54 +#define AV_WL32 AV_WL32 163.55 +static av_always_inline void AV_WL32(void *p, uint32_t v) 163.56 +{ 163.57 + __asm__ ("stwbrx %1, %y0" : "=Z"(*(uint32_t*)p) : "r"(v)); 163.58 +} 163.59 + 163.60 +#if HAVE_LDBRX 163.61 + 163.62 +#define AV_RL64 AV_RL64 163.63 +static av_always_inline uint64_t AV_RL64(const void *p) 163.64 +{ 163.65 + uint64_t v; 163.66 + __asm__ ("ldbrx %0, %y1" : "=r"(v) : "Z"(*(const uint64_t*)p)); 163.67 + return v; 163.68 +} 163.69 + 163.70 +#define AV_WL64 AV_WL64 163.71 +static av_always_inline void AV_WL64(void *p, uint64_t v) 163.72 +{ 163.73 + __asm__ ("stdbrx %1, %y0" : "=Z"(*(uint64_t*)p) : "r"(v)); 163.74 +} 163.75 + 163.76 +#else 163.77 + 163.78 +#define AV_RL64 AV_RL64 163.79 +static av_always_inline uint64_t AV_RL64(const void *p) 163.80 +{ 163.81 + union { uint64_t v; uint32_t hl[2]; } v; 163.82 + __asm__ ("lwbrx %0, %y2 \n\t" 163.83 + "lwbrx %1, %y3 \n\t" 163.84 + : "=&r"(v.hl[1]), "=r"(v.hl[0]) 163.85 + : "Z"(*(const uint32_t*)p), "Z"(*((const uint32_t*)p+1))); 163.86 + return v.v; 163.87 +} 163.88 + 163.89 +#define AV_WL64 AV_WL64 163.90 +static av_always_inline void AV_WL64(void *p, uint64_t v) 163.91 +{ 163.92 + union { uint64_t v; uint32_t hl[2]; } vv = { v }; 163.93 + __asm__ ("stwbrx %2, %y0 \n\t" 163.94 + "stwbrx %3, %y1 \n\t" 163.95 + : "=Z"(*(uint32_t*)p), "=Z"(*((uint32_t*)p+1)) 163.96 + : "r"(vv.hl[1]), "r"(vv.hl[0])); 163.97 +} 163.98 + 163.99 +#endif /* HAVE_LDBRX */ 163.100 + 163.101 +#endif /* HAVE_XFORM_ASM */ 163.102 + 163.103 +/* 163.104 + * GCC fails miserably on the packed struct version which is used by 163.105 + * default, so we override it here. 163.106 + */ 163.107 + 163.108 +#define AV_RB64(p) (*(const uint64_t *)(p)) 163.109 +#define AV_WB64(p, v) (*(uint64_t *)(p) = (v)) 163.110 + 163.111 +#endif /* AVUTIL_PPC_INTREADWRITE_H */
/* File: ffmpeg_smp/h264dec/libavutil/ppc/timer.h */
/*
 * Copyright (c) 2005 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_PPC_TIMER_H
#define AVUTIL_PPC_TIMER_H

#include <stdint.h>

/* Names the function that returns the current timestamp on this architecture. */
#define AV_READ_TIME read_time

/**
 * Reads the 64-bit PowerPC time base using 32-bit accesses.
 *
 * The upper half is read before and after the lower half; if the two
 * upper reads differ, the sequence is retried, so the returned halves
 * always belong to the same 64-bit value.
 *
 * @return the time base as (upper << 32) | lower
 */
static inline uint64_t read_time(void)
{
    uint32_t tbu, tbl, temp;

     /* from section 2.2.1 of the 32-bit PowerPC PEM */
     __asm__ volatile(
         "1:\n"
         "mftbu  %2\n"      /* temp = upper half (first read)   */
         "mftb   %0\n"      /* tbl  = lower half                */
         "mftbu  %1\n"      /* tbu  = upper half (second read)  */
         "cmpw   %2,%1\n"   /* did the upper half change?       */
         "bne    1b\n"      /* yes: start over                  */
     : "=r"(tbl), "=r"(tbu), "=r"(temp)
     :
     : "cc");

     return (((uint64_t)tbu)<<32) | (uint64_t)tbl;
}

#endif /* AVUTIL_PPC_TIMER_H */
165.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 165.2 +++ b/ffmpeg_smp/h264dec/libavutil/timer.h Mon Aug 27 12:09:56 2012 +0200 165.3 @@ -0,0 +1,69 @@ 165.4 +/** 165.5 + * @file 165.6 + * high precision timer, useful to profile code 165.7 + * 165.8 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> 165.9 + * 165.10 + * This file is part of FFmpeg. 165.11 + * 165.12 + * FFmpeg is free software; you can redistribute it and/or 165.13 + * modify it under the terms of the GNU Lesser General Public 165.14 + * License as published by the Free Software Foundation; either 165.15 + * version 2.1 of the License, or (at your option) any later version. 165.16 + * 165.17 + * FFmpeg is distributed in the hope that it will be useful, 165.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 165.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 165.20 + * Lesser General Public License for more details. 165.21 + * 165.22 + * You should have received a copy of the GNU Lesser General Public 165.23 + * License along with FFmpeg; if not, write to the Free Software 165.24 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 165.25 + */ 165.26 + 165.27 +#ifndef AVUTIL_TIMER_H 165.28 +#define AVUTIL_TIMER_H 165.29 + 165.30 +#include <stdlib.h> 165.31 +#include <stdint.h> 165.32 +#include "config.h" 165.33 + 165.34 +#if ARCH_ARM 165.35 +# include "arm/timer.h" 165.36 +#elif ARCH_PPC 165.37 +# include "ppc/timer.h" 165.38 +#elif ARCH_X86 165.39 +# include "x86/timer.h" 165.40 +#endif 165.41 + 165.42 +#if !defined(AV_READ_TIME) && HAVE_GETHRTIME 165.43 +# define AV_READ_TIME gethrtime 165.44 +#endif 165.45 + 165.46 +#ifdef AV_READ_TIME 165.47 +#define START_TIMER \ 165.48 +uint64_t tend;\ 165.49 +uint64_t tstart= AV_READ_TIME();\ 165.50 + 165.51 +#define STOP_TIMER(id) \ 165.52 +tend= AV_READ_TIME();\ 165.53 +{\ 165.54 + static uint64_t tsum=0;\ 165.55 + static int tcount=0;\ 165.56 + static int tskip_count=0;\ 
165.57 + if(tcount<2 || tend - tstart < 8*tsum/tcount || tend - tstart < 2000){\ 165.58 + tsum+= tend - tstart;\ 165.59 + tcount++;\ 165.60 + }else\ 165.61 + tskip_count++;\ 165.62 + if(((tcount+tskip_count)&(tcount+tskip_count-1))==0){\ 165.63 + av_log(NULL, AV_LOG_ERROR, "%"PRIu64" dezicycles in %s, %d runs, %d skips\n",\ 165.64 + tsum*10/tcount, id, tcount, tskip_count);\ 165.65 + }\ 165.66 +} 165.67 +#else 165.68 +#define START_TIMER 165.69 +#define STOP_TIMER(id) {} 165.70 +#endif 165.71 + 165.72 +#endif /* AVUTIL_TIMER_H */
/* File: ffmpeg_smp/h264dec/libavutil/x86/bswap.h */
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * byte swapping routines
 */

#ifndef AVUTIL_X86_BSWAP_H
#define AVUTIL_X86_BSWAP_H

#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"

/*
 * Each routine is announced with "#define name name" so that other code
 * can detect the x86 version with #ifdef (presumably the generic bswap
 * header - confirm).
 */

/** Swaps the two bytes of x by rotating the 16-bit register by 8 bits. */
#define bswap_16 bswap_16
static av_always_inline av_const uint16_t bswap_16(uint16_t x)
{
    __asm__("rorw $8, %0" : "+r"(x));
    return x;
}

/**
 * Reverses the byte order of x with the bswap instruction.
 *
 * The HAVE_BSWAP switch and its rotate-based fallback are commented out
 * below, so bswap is used unconditionally.
 */
#define bswap_32 bswap_32
static av_always_inline av_const uint32_t bswap_32(uint32_t x)
{
// #if HAVE_BSWAP
    __asm__("bswap   %0" : "+r" (x));
// #else
//     __asm__("rorw    $8,  %w0 \n\t"
//             "rorl    $16, %0  \n\t"
//             "rorw    $8,  %w0"
//             : "+r"(x));
// #endif
    return x;
}

#if ARCH_X86_64
/** Reverses the byte order of a 64-bit value; only provided when ARCH_X86_64 is set. */
#define bswap_64 bswap_64
static inline uint64_t av_const bswap_64(uint64_t x)
{
    /* "0"(x) ties the input to the same register as output operand 0 */
    __asm__("bswap  %0": "=r" (x) : "0" (x));
    return x;
}
#endif

#endif /* AVUTIL_X86_BSWAP_H */
/* File: ffmpeg_smp/h264dec/libavutil/x86/intreadwrite.h */
/*
 * Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_X86_INTREADWRITE_H
#define AVUTIL_X86_INTREADWRITE_H

#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"

/*
 * Each helper is announced with "#define NAME NAME" so that other code can
 * detect the x86 version with #ifdef (presumably the generic
 * intreadwrite.h - confirm).
 */
#if HAVE_MMX

#if defined(__MMX__)

/**
 * Copies 8 bytes from s to d through MMX register mm0.
 * NOTE(review): no emms is issued here; callers that also use x87
 * floating point presumably have to handle that - confirm.
 */
#define AV_COPY64 AV_COPY64
static av_always_inline void AV_COPY64(void *d, const void *s)
{
    __asm__("movq   %1, %%mm0  \n\t"
            "movq   %%mm0, %0  \n\t"
            : "=m"(*(uint64_t*)d)
            : "m" (*(const uint64_t*)s)
            : "mm0");
}

/** Exchanges the 8 bytes at a with the 8 bytes at b through mm0 and mm1. */
#define AV_SWAP64 AV_SWAP64
static av_always_inline void AV_SWAP64(void *a, void *b)
{
    __asm__("movq   %1, %%mm0  \n\t"
            "movq   %0, %%mm1  \n\t"
            "movq   %%mm0, %0  \n\t"
            "movq   %%mm1, %1  \n\t"
            : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b)
            ::"mm0", "mm1");
}

/** Writes 8 zero bytes to d (pxor clears mm0, movq stores it). */
#define AV_ZERO64 AV_ZERO64
static av_always_inline void AV_ZERO64(void *d)
{
    __asm__("pxor %%mm0, %%mm0  \n\t"
            "movq %%mm0, %0     \n\t"
            : "=m"(*(uint64_t*)d)
            :: "mm0");
}

#endif /* defined(__MMX__) */

#ifdef __SSE__

/**
 * Copies 16 bytes from s to d through xmm0.
 * Uses movaps, which requires both addresses to be 16-byte aligned.
 */
#define AV_COPY128 AV_COPY128
static av_always_inline void AV_COPY128(void *d, const void *s)
{
    /* 16-byte dummy type so the memory operands cover the whole block */
    struct v {uint64_t v[2];};

    __asm__("movaps   %1, %%xmm0  \n\t"
            "movaps   %%xmm0, %0  \n\t"
            : "=m"(*(struct v*)d)
            : "m" (*(const struct v*)s)
            : "xmm0");
}

#endif /* __SSE__ */

#ifdef __SSE2__

/**
 * Writes 16 zero bytes to d through xmm0.
 * Uses movdqa, which requires d to be 16-byte aligned.
 */
#define AV_ZERO128 AV_ZERO128
static av_always_inline void AV_ZERO128(void *d)
{
    /* 16-byte dummy type so the memory operand covers the whole block */
    struct v {uint64_t v[2];};

    __asm__("pxor %%xmm0, %%xmm0  \n\t"
            "movdqa   %%xmm0, %0  \n\t"
            : "=m"(*(struct v*)d)
            :: "xmm0");
}

#endif /* __SSE2__ */

#endif /* HAVE_MMX */

#endif /* AVUTIL_X86_INTREADWRITE_H */
/* File: ffmpeg_smp/h264dec/libavutil/x86/timer.h */
/*
 * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_X86_TIMER_H
#define AVUTIL_X86_TIMER_H

#include <stdint.h>

/* Names the function that returns the current timestamp on this architecture. */
#define AV_READ_TIME read_time

/**
 * Reads the x86 time-stamp counter with rdtsc.
 *
 * @return the 64-bit counter value
 */
static inline uint64_t read_time(void)
{
    uint32_t lo, hi;

    /* rdtsc delivers the low half in EAX ("=a") and the high half in EDX ("=d") */
    __asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));

    return ((uint64_t)hi << 32) | lo;
}

#endif /* AVUTIL_X86_TIMER_H */
/* File: ffmpeg_smp/h264dec/libavutil/x86_cpu.h */
/*
 * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_X86_CPU_H
#define AVUTIL_X86_CPU_H

#include <stdint.h>
#include "config.h"

/*
 * Register-name helpers for inline assembly.
 *
 * REG_x / REG_SP / REG_BP are string literals and REGx / REGSP / REGBP are
 * bare tokens naming the pointer-sized general-purpose registers of the
 * target. PTR_SIZE is the pointer size in bytes as a string, and x86_reg
 * is a signed integer type of that width.
 */
#if ARCH_X86_64
#    define REG_a "rax"
#    define REG_b "rbx"
#    define REG_c "rcx"
#    define REG_d "rdx"
#    define REG_D "rdi"
#    define REG_S "rsi"
#    define PTR_SIZE "8"
typedef int64_t x86_reg;

#    define REG_SP "rsp"
#    define REG_BP "rbp"
#    define REGBP   rbp
#    define REGa    rax
#    define REGb    rbx
#    define REGc    rcx
#    define REGd    rdx
#    define REGSP   rsp

#elif ARCH_X86_32

#    define REG_a "eax"
#    define REG_b "ebx"
#    define REG_c "ecx"
#    define REG_d "edx"
#    define REG_D "edi"
#    define REG_S "esi"
#    define PTR_SIZE "4"
typedef int32_t x86_reg;

#    define REG_SP "esp"
#    define REG_BP "ebp"
#    define REGBP   ebp
#    define REGa    eax
#    define REGb    ebx
#    define REGc    ecx
#    define REGd    edx
#    define REGSP   esp
#else
/* Neither x86 variant: only the x86_reg type is provided, no register names. */
typedef int x86_reg;
#endif

/* Disabled: BROKEN_RELOCATIONS is never defined by this header. */
// #if ARCH_X86_64 && defined(PIC)
// #    define BROKEN_RELOCATIONS 1
// #endif

#endif /* AVUTIL_X86_CPU_H */
