FFmpeg: libpostproc/postprocess.c Source File

00001 /*
00002  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
00003  *
00004  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or modify
00009  * it under the terms of the GNU General Public License as published by
00010  * the Free Software Foundation; either version 2 of the License, or
00011  * (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00028 /*
00029                         C       MMX     MMX2    3DNow   AltiVec
00030 isVertDC                Ec      Ec                      Ec
00031 isVertMinMaxOk          Ec      Ec                      Ec
00032 doVertLowPass           E               e       e       Ec
00033 doVertDefFilter         Ec      Ec      e       e       Ec
00034 isHorizDC               Ec      Ec                      Ec
00035 isHorizMinMaxOk         a       E                       Ec
00036 doHorizLowPass          E               e       e       Ec
00037 doHorizDefFilter        Ec      Ec      e       e       Ec
00038 do_a_deblock            Ec      E       Ec      E
00039 deRing                  E               e       e*      Ecp
00040 Vertical RKAlgo1        E               a       a
00041 Horizontal RKAlgo1                      a       a
00042 Vertical X1#            a               E       E
00043 Horizontal X1#          a               E       E
00044 LinIpolDeinterlace      e               E       E*
00045 CubicIpolDeinterlace    a               e       e*
00046 LinBlendDeinterlace     e               E       E*
00047 MedianDeinterlace#      E       Ec      Ec
00048 TempDeNoiser#           E               e       e       Ec
00049 
00050 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
00051 # more or less selfinvented filters so the exactness is not too meaningful
00052 E = Exact implementation
00053 e = almost exact implementation (slightly different rounding,...)
00054 a = alternative / approximate impl
00055 c = checked against the other implementations (-vo md5)
00056 p = partially optimized, still some work to do
00057 */
00058 
00059 /*
00060 TODO:
00061 reduce the time wasted on the mem transfer
00062 unroll stuff if instructions depend too much on the prior one
00063 move YScale thing to the end instead of fixing QP
00064 write a faster and higher quality deblocking filter :)
00065 make the mainloop more flexible (variable number of blocks at once
00066         (the if/else stuff per block is slowing things down)
00067 compare the quality & speed of all filters
00068 split this huge file
00069 optimize c versions
00070 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
00071 ...
00072 */
00073 
00074 //Changelog: use the Subversion log
00075 
00076 #include "config.h"
00077 #include "libavutil/avutil.h"
00078 #include <inttypes.h>
00079 #include <stdio.h>
00080 #include <stdlib.h>
00081 #include <string.h>
00082 //#undef HAVE_MMX2
00083 //#define HAVE_AMD3DNOW
00084 //#undef HAVE_MMX
00085 //#undef ARCH_X86
00086 //#define DEBUG_BRIGHTNESS
00087 #include "postprocess.h"
00088 #include "postprocess_internal.h"
00089 
00090 unsigned postproc_version(void)
00091 {
00092     return LIBPOSTPROC_VERSION_INT;
00093 }
00094 
00095 #if HAVE_ALTIVEC_H
00096 #include <altivec.h>
00097 #endif
00098 
/* Size in bytes of the scratch buffer the mode string is copied into
 * by pp_get_mode_by_name_and_quality(). */
00099 #define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter options[] pointer array used while parsing. */
00100 #define OPTIONS_ARRAY_SIZE 10
/* Edge length, in samples, of the blocks the filters in this file iterate over. */
00101 #define BLOCK_SIZE 8
/* NOTE(review): not referenced in the visible part of this file; presumably
 * used by the included templates -- confirm. */
00102 #define TEMP_STRIDE 8
00103 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
00104 
00105 #if ARCH_X86
/* 64-bit packed constants. The wNN values repeat a 16-bit word four times,
 * the bNN values repeat a byte eight times. They are not referenced in the
 * visible C code; presumably they are consumed by the inline asm in the
 * included templates -- confirm. */
00106 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
00107 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
00108 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
00109 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
00110 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
00111 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
00112 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
00113 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
00114 #endif
00115 
/* Threshold for the deringing filter, judging by the name; its use is not
 * visible in this part of the file -- confirm against the templates. */
00116 DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
00117 
00118 
/*
 * Table of all selectable filters, terminated by an all-NULL/zero entry.
 *
 * The mode parser matches a user-supplied name against both string columns
 * with strcmp() and then reads the chromDefault, minLumQuality,
 * minChromQuality and mask members of the matching entry.
 * NOTE(review): struct PPFilter is declared elsewhere; the columns below are
 * presumably {shortName, longName, chromDefault, minLumQuality,
 * minChromQuality, mask} in that order -- confirm against the declaration.
 */
00119 static struct PPFilter filters[]=
00120 {
00121     {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
00122     {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
00123 /*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
00124     {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
00125     {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
00126     {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
00127     {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
00128     {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
00129     {"dr", "dering",                1, 5, 6, DERING},
00130     {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
00131     {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
00132     {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
00133     {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
00134     {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
00135     {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
00136     {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
00137     {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
00138     {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
00139     {NULL, NULL,0,0,0,0} //End Marker
00140 };
00141 
/*
 * Alias table for the mode parser, stored as consecutive (alias, expansion)
 * string pairs and terminated by a single NULL.
 *
 * When a filter name equals the entry at an even index, the string at the
 * following odd index is spliced into the mode string in its place.
 */
00142 static const char *replaceTable[]=
00143 {
00144     "default",      "hb:a,vb:a,dr:a",
00145     "de",           "hb:a,vb:a,dr:a",
00146     "fast",         "h1:a,v1:a,dr:a",
00147     "fa",           "h1:a,v1:a,dr:a",
00148     "ac",           "ha:a:128:7,va:a,dr:a",
00149     NULL //End Marker
00150 };
00151 
00152 
00153 #if ARCH_X86
/** Issue an x86 "prefetchnta" instruction for the address @p p. */
static inline void prefetchnta(void *p)
{
    __asm__ volatile("prefetchnta (%0)\n\t" : : "r" (p));
}
00160 
/** Issue an x86 "prefetcht0" instruction for the address @p p. */
static inline void prefetcht0(void *p)
{
    __asm__ volatile("prefetcht0 (%0)\n\t" : : "r" (p));
}
00167 
/** Issue an x86 "prefetcht1" instruction for the address @p p. */
static inline void prefetcht1(void *p)
{
    __asm__ volatile("prefetcht1 (%0)\n\t" : : "r" (p));
}
00174 
/** Issue an x86 "prefetcht2" instruction for the address @p p. */
static inline void prefetcht2(void *p)
{
    __asm__ volatile("prefetcht2 (%0)\n\t" : : "r" (p));
}
00181 #endif
00182 
00183 /* The horizontal functions exist only in C because the MMX
00184  * code is faster with vertical filters and transposing. */
00185 
00189 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
00190 {
00191     int numEq= 0;
00192     int y;
00193     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
00194     const int dcThreshold= dcOffset*2 + 1;
00195 
00196     for(y=0; y<BLOCK_SIZE; y++){
00197         if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
00198         if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
00199         if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
00200         if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
00201         if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
00202         if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
00203         if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
00204         src+= stride;
00205     }
00206     return numEq > c->ppMode.flatnessThreshold;
00207 }
00208 
00212 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c)
00213 {
00214     int numEq= 0;
00215     int y;
00216     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
00217     const int dcThreshold= dcOffset*2 + 1;
00218 
00219     src+= stride*4; // src points to begin of the 8x8 Block
00220     for(y=0; y<BLOCK_SIZE-1; y++){
00221         if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
00222         if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
00223         if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
00224         if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
00225         if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
00226         if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
00227         if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
00228         if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
00229         src+= stride;
00230     }
00231     return numEq > c->ppMode.flatnessThreshold;
00232 }
00233 
/**
 * Cheap horizontal range check over 8 rows.
 *
 * Each row is probed with a single pair of columns; the pairs cycle through
 * (0,5), (2,7), (4,1), (6,3) from row to row. The check fails as soon as one
 * probed difference falls outside [-2*QP, 2*QP].
 *
 * @param src    first sample of the first row
 * @param stride distance between two rows, in bytes
 * @param QP     quantiser used to scale the allowed difference
 * @return 1 if every probed pair is within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const uint8_t probe[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for (row = 0; row < 8; row++) {
        const int left  = src[probe[row & 3][0]];
        const int right = src[probe[row & 3][1]];

        /* the unsigned cast turns the two-sided range test into one compare */
        if ((unsigned)(left - right + 2*QP) > 4*QP)
            return 0;
        src += stride;
    }
    return 1;
}
00256 
00257 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
00258 {
00259 #if 1
00260 #if 1
00261     int x;
00262     src+= stride*4;
00263     for(x=0; x<BLOCK_SIZE; x+=4){
00264         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
00265         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
00266         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
00267         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
00268     }
00269 #else
00270     int x;
00271     src+= stride*3;
00272     for(x=0; x<BLOCK_SIZE; x++){
00273         if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
00274     }
00275 #endif
00276     return 1;
00277 #else
00278     int x;
00279     src+= stride*4;
00280     for(x=0; x<BLOCK_SIZE; x++){
00281         int min=255;
00282         int max=0;
00283         int y;
00284         for(y=0; y<8; y++){
00285             int v= src[x + y*stride];
00286             if(v>max) max=v;
00287             if(v<min) min=v;
00288         }
00289         if(max-min > 2*QP) return 0;
00290     }
00291     return 1;
00292 #endif
00293 }
00294 
00295 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c)
00296 {
00297     if( isHorizDC_C(src, stride, c) ){
00298         if( isHorizMinMaxOk_C(src, stride, c->QP) )
00299             return 1;
00300         else
00301             return 0;
00302     }else{
00303         return 2;
00304     }
00305 }
00306 
00307 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c)
00308 {
00309     if( isVertDC_C(src, stride, c) ){
00310         if( isVertMinMaxOk_C(src, stride, c->QP) )
00311             return 1;
00312         else
00313             return 0;
00314     }else{
00315         return 2;
00316     }
00317 }
00318 
00319 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
00320 {
00321     int y;
00322     for(y=0; y<BLOCK_SIZE; y++){
00323         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
00324 
00325         if(FFABS(middleEnergy) < 8*c->QP){
00326             const int q=(dst[3] - dst[4])/2;
00327             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
00328             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
00329 
00330             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
00331             d= FFMAX(d, 0);
00332 
00333             d= (5*d + 32) >> 6;
00334             d*= FFSIGN(-middleEnergy);
00335 
00336             if(q>0)
00337             {
00338                 d= d<0 ? 0 : d;
00339                 d= d>q ? q : d;
00340             }
00341             else
00342             {
00343                 d= d>0 ? 0 : d;
00344                 d= d<q ? q : d;
00345             }
00346 
00347             dst[3]-= d;
00348             dst[4]+= d;
00349         }
00350         dst+= stride;
00351     }
00352 }
00353 
00358 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
00359 {
00360     int y;
00361     for(y=0; y<BLOCK_SIZE; y++){
00362         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
00363         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
00364 
00365         int sums[10];
00366         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
00367         sums[1] = sums[0] - first  + dst[3];
00368         sums[2] = sums[1] - first  + dst[4];
00369         sums[3] = sums[2] - first  + dst[5];
00370         sums[4] = sums[3] - first  + dst[6];
00371         sums[5] = sums[4] - dst[0] + dst[7];
00372         sums[6] = sums[5] - dst[1] + last;
00373         sums[7] = sums[6] - dst[2] + last;
00374         sums[8] = sums[7] - dst[3] + last;
00375         sums[9] = sums[8] - dst[4] + last;
00376 
00377         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
00378         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
00379         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
00380         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
00381         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
00382         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
00383         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
00384         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
00385 
00386         dst+= stride;
00387     }
00388 }
00389 
00398 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
00399 {
00400     int y;
00401     static uint64_t *lut= NULL;
00402     if(lut==NULL)
00403     {
00404         int i;
00405         lut = av_malloc(256*8);
00406         for(i=0; i<256; i++)
00407         {
00408             int v= i < 128 ? 2*i : 2*(i-256);
00409 /*
00410 //Simulate 112242211 9-Tap filter
00411             uint64_t a= (v/16)  & 0xFF;
00412             uint64_t b= (v/8)   & 0xFF;
00413             uint64_t c= (v/4)   & 0xFF;
00414             uint64_t d= (3*v/8) & 0xFF;
00415 */
00416 //Simulate piecewise linear interpolation
00417             uint64_t a= (v/16)   & 0xFF;
00418             uint64_t b= (v*3/16) & 0xFF;
00419             uint64_t c= (v*5/16) & 0xFF;
00420             uint64_t d= (7*v/16) & 0xFF;
00421             uint64_t A= (0x100 - a)&0xFF;
00422             uint64_t B= (0x100 - b)&0xFF;
00423             uint64_t C= (0x100 - c)&0xFF;
00424             uint64_t D= (0x100 - c)&0xFF;
00425 
00426             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
00427                        (D<<24) | (C<<16) | (B<<8)  | (A);
00428             //lut[i] = (v<<32) | (v<<24);
00429         }
00430     }
00431 
00432     for(y=0; y<BLOCK_SIZE; y++){
00433         int a= src[1] - src[2];
00434         int b= src[3] - src[4];
00435         int c= src[5] - src[6];
00436 
00437         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
00438 
00439         if(d < QP){
00440             int v = d * FFSIGN(-b);
00441 
00442             src[1] +=v/8;
00443             src[2] +=v/4;
00444             src[3] +=3*v/8;
00445             src[4] -=3*v/8;
00446             src[5] -=v/4;
00447             src[6] -=v/8;
00448         }
00449         src+=stride;
00450     }
00451 }
00452 
00456 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
00457     int y;
00458     const int QP= c->QP;
00459     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
00460     const int dcThreshold= dcOffset*2 + 1;
00461 //START_TIMER
00462     src+= step*4; // src points to begin of the 8x8 Block
00463     for(y=0; y<8; y++){
00464         int numEq= 0;
00465 
00466         if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
00467         if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
00468         if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
00469         if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
00470         if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
00471         if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
00472         if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
00473         if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
00474         if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
00475         if(numEq > c->ppMode.flatnessThreshold){
00476             int min, max, x;
00477 
00478             if(src[0] > src[step]){
00479                 max= src[0];
00480                 min= src[step];
00481             }else{
00482                 max= src[step];
00483                 min= src[0];
00484             }
00485             for(x=2; x<8; x+=2){
00486                 if(src[x*step] > src[(x+1)*step]){
00487                         if(src[x    *step] > max) max= src[ x   *step];
00488                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
00489                 }else{
00490                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
00491                         if(src[ x   *step] < min) min= src[ x   *step];
00492                 }
00493             }
00494             if(max-min < 2*QP){
00495                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
00496                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
00497 
00498                 int sums[10];
00499                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
00500                 sums[1] = sums[0] - first       + src[3*step];
00501                 sums[2] = sums[1] - first       + src[4*step];
00502                 sums[3] = sums[2] - first       + src[5*step];
00503                 sums[4] = sums[3] - first       + src[6*step];
00504                 sums[5] = sums[4] - src[0*step] + src[7*step];
00505                 sums[6] = sums[5] - src[1*step] + last;
00506                 sums[7] = sums[6] - src[2*step] + last;
00507                 sums[8] = sums[7] - src[3*step] + last;
00508                 sums[9] = sums[8] - src[4*step] + last;
00509 
00510                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
00511                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
00512                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
00513                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
00514                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
00515                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
00516                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
00517                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
00518             }
00519         }else{
00520             const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
00521 
00522             if(FFABS(middleEnergy) < 8*QP){
00523                 const int q=(src[3*step] - src[4*step])/2;
00524                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
00525                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
00526 
00527                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
00528                 d= FFMAX(d, 0);
00529 
00530                 d= (5*d + 32) >> 6;
00531                 d*= FFSIGN(-middleEnergy);
00532 
00533                 if(q>0){
00534                     d= d<0 ? 0 : d;
00535                     d= d>q ? q : d;
00536                 }else{
00537                     d= d>0 ? 0 : d;
00538                     d= d<q ? q : d;
00539                 }
00540 
00541                 src[3*step]-= d;
00542                 src[4*step]+= d;
00543             }
00544         }
00545 
00546         src += stride;
00547     }
00548 /*if(step==16){
00549     STOP_TIMER("step16")
00550 }else{
00551     STOP_TIMER("stepX")
00552 }*/
00553 }
00554 
00555 //Note: there are C, MMX, MMX2 and 3DNow versions; there is no combined 3DNow+MMX2 version
00556 //Plain C versions
00557 #if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT
00558 #define COMPILE_C
00559 #endif
00560 
00561 #if HAVE_ALTIVEC
00562 #define COMPILE_ALTIVEC
00563 #endif //HAVE_ALTIVEC
00564 
00565 #if ARCH_X86
00566 
00567 #if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
00568 #define COMPILE_MMX
00569 #endif
00570 
00571 #if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
00572 #define COMPILE_MMX2
00573 #endif
00574 
00575 #if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
00576 #define COMPILE_3DNOW
00577 #endif
00578 #endif /* ARCH_X86 */
00579 
00580 #undef HAVE_MMX
00581 #define HAVE_MMX 0
00582 #undef HAVE_MMX2
00583 #define HAVE_MMX2 0
00584 #undef HAVE_AMD3DNOW
00585 #define HAVE_AMD3DNOW 0
00586 #undef HAVE_ALTIVEC
00587 #define HAVE_ALTIVEC 0
00588 
00589 #ifdef COMPILE_C
00590 #define RENAME(a) a ## _C
00591 #include "postprocess_template.c"
00592 #endif
00593 
00594 #ifdef COMPILE_ALTIVEC
00595 #undef RENAME
00596 #undef HAVE_ALTIVEC
00597 #define HAVE_ALTIVEC 1
00598 #define RENAME(a) a ## _altivec
00599 #include "postprocess_altivec_template.c"
00600 #include "postprocess_template.c"
00601 #endif
00602 
00603 //MMX versions
00604 #ifdef COMPILE_MMX
00605 #undef RENAME
00606 #undef HAVE_MMX
00607 #define HAVE_MMX 1
00608 #define RENAME(a) a ## _MMX
00609 #include "postprocess_template.c"
00610 #endif
00611 
00612 //MMX2 versions
00613 #ifdef COMPILE_MMX2
00614 #undef RENAME
00615 #undef HAVE_MMX
00616 #undef HAVE_MMX2
00617 #define HAVE_MMX 1
00618 #define HAVE_MMX2 1
00619 #define RENAME(a) a ## _MMX2
00620 #include "postprocess_template.c"
00621 #endif
00622 
00623 //3DNOW versions
00624 #ifdef COMPILE_3DNOW
00625 #undef RENAME
00626 #undef HAVE_MMX
00627 #undef HAVE_MMX2
00628 #undef HAVE_AMD3DNOW
00629 #define HAVE_MMX 1
00630 #define HAVE_MMX2 0
00631 #define HAVE_AMD3DNOW 1
00632 #define RENAME(a) a ## _3DNow
00633 #include "postprocess_template.c"
00634 #endif
00635 
00636 // minor note: the HAVE_xyz macros are messed up from this line on, so do not rely on them below.
00637 
00638 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
00639         const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
00640 {
00641     PPContext *c= (PPContext *)vc;
00642     PPMode *ppMode= (PPMode *)vm;
00643     c->ppMode= *ppMode; //FIXME
00644 
00645     // Using ifs here as they are faster than function pointers although the
00646     // difference would not be measurable here but it is much better because
00647     // someone might exchange the CPU whithout restarting MPlayer ;)
00648 #if CONFIG_RUNTIME_CPUDETECT
00649 #if ARCH_X86
00650     // ordered per speed fastest first
00651     if(c->cpuCaps & PP_CPU_CAPS_MMX2)
00652         postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00653     else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
00654         postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00655     else if(c->cpuCaps & PP_CPU_CAPS_MMX)
00656         postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00657     else
00658         postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00659 #else
00660 #if HAVE_ALTIVEC
00661     if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
00662             postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00663     else
00664 #endif
00665             postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00666 #endif
00667 #else //CONFIG_RUNTIME_CPUDETECT
00668 #if   HAVE_MMX2
00669             postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00670 #elif HAVE_AMD3DNOW
00671             postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00672 #elif HAVE_MMX
00673             postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00674 #elif HAVE_ALTIVEC
00675             postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00676 #else
00677             postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
00678 #endif
00679 #endif 
00680 }
00681 
00682 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
00683 //        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
00684 
/**
 * Help text for the -pp command line option.
 *
 * Every short and long filter name listed here must be spelled exactly as in
 * the filters[] table, because the mode parser matches names with a
 * case-sensitive strcmp().
 */
#if LIBPOSTPROC_VERSION_INT < (52<<16)
const char *const pp_help=
#else
const char pp_help[] =
#endif
"Available postprocessing filters:\n"
"Filters                        Options\n"
"short  long name       short   long option     Description\n"
"*      *               a       autoq           CPU power dependent enabler\n"
"                       c       chrom           chrominance filtering enabled\n"
"                       y       nochrom         chrominance filtering disabled\n"
"                       n       noluma          luma filtering disabled\n"
"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
"       1. difference factor: default=32, higher -> more deblocking\n"
"       2. flatness threshold: default=39, lower -> more deblocking\n"
"                       the h & v deblocking filters share these\n"
"                       so you can't set different thresholds for h / v\n"
"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
"ha     ahdeblock       (2 threshold)           horizontal deblocking filter\n"
"va     avdeblock       (2 threshold)           vertical deblocking filter\n"
"h1     x1hdeblock                              experimental h deblock filter 1\n"
"v1     x1vdeblock                              experimental v deblock filter 1\n"
"dr     dering                                  deringing filter\n"
"al     autolevels                              automatic brightness / contrast\n"
"                       f        fullyrange     stretch luminance to (0..255)\n"
"lb     linblenddeint                           linear blend deinterlacer\n"
"li     linipoldeint                            linear interpolating deinterlace\n"
"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
"md     mediandeint                             median deinterlacer\n"
"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
"l5     lowpass5                                FIR lowpass deinterlacer\n"
"de     default                                 hb:a,vb:a,dr:a\n"
"fa     fast                                    h1:a,v1:a,dr:a\n"
"ac                                             ha:a:128:7,va:a,dr:a\n"
"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
"fq     forcequant      <quantizer>             force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb                                   de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
00734 
00735 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
00736 {
00737     char temp[GET_MODE_BUFFER_SIZE];
00738     char *p= temp;
00739     static const char filterDelimiters[] = ",/";
00740     static const char optionDelimiters[] = ":";
00741     struct PPMode *ppMode;
00742     char *filterToken;
00743 
00744     ppMode= av_malloc(sizeof(PPMode));
00745 
00746     ppMode->lumMode= 0;
00747     ppMode->chromMode= 0;
00748     ppMode->maxTmpNoise[0]= 700;
00749     ppMode->maxTmpNoise[1]= 1500;
00750     ppMode->maxTmpNoise[2]= 3000;
00751     ppMode->maxAllowedY= 234;
00752     ppMode->minAllowedY= 16;
00753     ppMode->baseDcDiff= 256/8;
00754     ppMode->flatnessThreshold= 56-16-1;
00755     ppMode->maxClippedThreshold= 0.01;
00756     ppMode->error=0;
00757 
00758     strncpy(temp, name, GET_MODE_BUFFER_SIZE);
00759 
00760     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
00761 
00762     for(;;){
00763         char *filterName;
00764         int q= 1000000; //PP_QUALITY_MAX;
00765         int chrom=-1;
00766         int luma=-1;
00767         char *option;
00768         char *options[OPTIONS_ARRAY_SIZE];
00769         int i;
00770         int filterNameOk=0;
00771         int numOfUnknownOptions=0;
00772         int enable=1; //does the user want us to enabled or disabled the filter
00773 
00774         filterToken= strtok(p, filterDelimiters);
00775         if(filterToken == NULL) break;
00776         p+= strlen(filterToken) + 1; // p points to next filterToken
00777         filterName= strtok(filterToken, optionDelimiters);
00778         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
00779 
00780         if(*filterName == '-'){
00781             enable=0;
00782             filterName++;
00783         }
00784 
00785         for(;;){ //for all options
00786             option= strtok(NULL, optionDelimiters);
00787             if(option == NULL) break;
00788 
00789             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
00790             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
00791             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
00792             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
00793             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
00794             else{
00795                 options[numOfUnknownOptions] = option;
00796                 numOfUnknownOptions++;
00797             }
00798             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
00799         }
00800         options[numOfUnknownOptions] = NULL;
00801 
00802         /* replace stuff from the replace Table */
00803         for(i=0; replaceTable[2*i]!=NULL; i++){
00804             if(!strcmp(replaceTable[2*i], filterName)){
00805                 int newlen= strlen(replaceTable[2*i + 1]);
00806                 int plen;
00807                 int spaceLeft;
00808 
00809                 if(p==NULL) p= temp, *p=0;      //last filter
00810                 else p--, *p=',';               //not last filter
00811 
00812                 plen= strlen(p);
00813                 spaceLeft= p - temp + plen;
00814                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE){
00815                     ppMode->error++;
00816                     break;
00817                 }
00818                 memmove(p + newlen, p, plen+1);
00819                 memcpy(p, replaceTable[2*i + 1], newlen);
00820                 filterNameOk=1;
00821             }
00822         }
00823 
00824         for(i=0; filters[i].shortName!=NULL; i++){
00825             if(   !strcmp(filters[i].longName, filterName)
00826                || !strcmp(filters[i].shortName, filterName)){
00827                 ppMode->lumMode &= ~filters[i].mask;
00828                 ppMode->chromMode &= ~filters[i].mask;
00829 
00830                 filterNameOk=1;
00831                 if(!enable) break; // user wants to disable it
00832 
00833                 if(q >= filters[i].minLumQuality && luma)
00834                     ppMode->lumMode|= filters[i].mask;
00835                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
00836                     if(q >= filters[i].minChromQuality)
00837                             ppMode->chromMode|= filters[i].mask;
00838 
00839                 if(filters[i].mask == LEVEL_FIX){
00840                     int o;
00841                     ppMode->minAllowedY= 16;
00842                     ppMode->maxAllowedY= 234;
00843                     for(o=0; options[o]!=NULL; o++){
00844                         if(  !strcmp(options[o],"fullyrange")
00845                            ||!strcmp(options[o],"f")){
00846                             ppMode->minAllowedY= 0;
00847                             ppMode->maxAllowedY= 255;
00848                             numOfUnknownOptions--;
00849                         }
00850                     }
00851                 }
00852                 else if(filters[i].mask == TEMP_NOISE_FILTER)
00853                 {
00854                     int o;
00855                     int numOfNoises=0;
00856 
00857                     for(o=0; options[o]!=NULL; o++){
00858                         char *tail;
00859                         ppMode->maxTmpNoise[numOfNoises]=
00860                             strtol(options[o], &tail, 0);
00861                         if(tail!=options[o]){
00862                             numOfNoises++;
00863                             numOfUnknownOptions--;
00864                             if(numOfNoises >= 3) break;
00865                         }
00866                     }
00867                 }
00868                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
00869                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
00870                     int o;
00871 
00872                     for(o=0; options[o]!=NULL && o<2; o++){
00873                         char *tail;
00874                         int val= strtol(options[o], &tail, 0);
00875                         if(tail==options[o]) break;
00876 
00877                         numOfUnknownOptions--;
00878                         if(o==0) ppMode->baseDcDiff= val;
00879                         else ppMode->flatnessThreshold= val;
00880                     }
00881                 }
00882                 else if(filters[i].mask == FORCE_QUANT){
00883                     int o;
00884                     ppMode->forcedQuant= 15;
00885 
00886                     for(o=0; options[o]!=NULL && o<1; o++){
00887                         char *tail;
00888                         int val= strtol(options[o], &tail, 0);
00889                         if(tail==options[o]) break;
00890 
00891                         numOfUnknownOptions--;
00892                         ppMode->forcedQuant= val;
00893                     }
00894                 }
00895             }
00896         }
00897         if(!filterNameOk) ppMode->error++;
00898         ppMode->error += numOfUnknownOptions;
00899     }
00900 
00901     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
00902     if(ppMode->error){
00903         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
00904         av_free(ppMode);
00905         return NULL;
00906     }
00907     return ppMode;
00908 }
00909 
00910 void pp_free_mode(pp_mode *mode){
00911     av_free(mode);
00912 }
00913 
/**
 * Replace the buffer stored in *p with a new one obtained from av_mallocz().
 * The previous buffer is freed first, so its contents are not preserved.
 *
 * @param p         address of the buffer pointer to replace
 * @param alignment not used by this implementation
 * @param size      number of bytes requested for the new buffer
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p = fresh;
}
00918 
00919 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
00920     int mbWidth = (width+15)>>4;
00921     int mbHeight= (height+15)>>4;
00922     int i;
00923 
00924     c->stride= stride;
00925     c->qpStride= qpStride;
00926 
00927     reallocAlign((void **)&c->tempDst, 8, stride*24);
00928     reallocAlign((void **)&c->tempSrc, 8, stride*24);
00929     reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
00930     reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
00931     for(i=0; i<256; i++)
00932             c->yHistogram[i]= width*height/64*15/256;
00933 
00934     for(i=0; i<3; i++){
00935         //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
00936         reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
00937         reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
00938     }
00939 
00940     reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
00941     reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
00942     reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
00943     reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
00944 }
00945 
/**
 * Name callback stored in av_codec_context_class.
 *
 * @param ptr ignored
 * @return the constant string "postproc"
 */
static const char *context_to_name(void *ptr)
{
    (void)ptr;   /* the name does not depend on the object */
    return "postproc";
}
00949 
00950 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
00951 
00952 pp_context *pp_get_context(int width, int height, int cpuCaps){
00953     PPContext *c= av_malloc(sizeof(PPContext));
00954     int stride= (width+15)&(~15);    //assumed / will realloc if needed
00955     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
00956 
00957     memset(c, 0, sizeof(PPContext));
00958     c->av_class = &av_codec_context_class;
00959     c->cpuCaps= cpuCaps;
00960     if(cpuCaps&PP_FORMAT){
00961         c->hChromaSubSample= cpuCaps&0x3;
00962         c->vChromaSubSample= (cpuCaps>>4)&0x3;
00963     }else{
00964         c->hChromaSubSample= 1;
00965         c->vChromaSubSample= 1;
00966     }
00967 
00968     reallocBuffers(c, width, height, stride, qpStride);
00969 
00970     c->frameNum=-1;
00971 
00972     return c;
00973 }
00974 
00975 void pp_free_context(void *vc){
00976     PPContext *c = (PPContext*)vc;
00977     int i;
00978 
00979     for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
00980     for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
00981 
00982     av_free(c->tempBlocks);
00983     av_free(c->yHistogram);
00984     av_free(c->tempDst);
00985     av_free(c->tempSrc);
00986     av_free(c->deintTemp);
00987     av_free(c->stdQPTable);
00988     av_free(c->nonBQPTable);
00989     av_free(c->forcedQPTable);
00990 
00991     memset(c, 0, sizeof(PPContext));
00992 
00993     av_free(c);
00994 }
00995 
00996 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
00997                      uint8_t * dst[3], const int dstStride[3],
00998                      int width, int height,
00999                      const QP_STORE_T *QP_store,  int QPStride,
01000                      pp_mode *vm,  void *vc, int pict_type)
01001 {
01002     int mbWidth = (width+15)>>4;
01003     int mbHeight= (height+15)>>4;
01004     PPMode *mode = (PPMode*)vm;
01005     PPContext *c = (PPContext*)vc;
01006     int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
01007     int absQPStride = FFABS(QPStride);
01008 
01009     // c->stride and c->QPStride are always positive
01010     if(c->stride < minStride || c->qpStride < absQPStride)
01011         reallocBuffers(c, width, height,
01012                        FFMAX(minStride, c->stride),
01013                        FFMAX(c->qpStride, absQPStride));
01014 
01015     if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){
01016         int i;
01017         QP_store= c->forcedQPTable;
01018         absQPStride = QPStride = 0;
01019         if(mode->lumMode & FORCE_QUANT)
01020             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
01021         else
01022             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
01023     }
01024 
01025     if(pict_type & PP_PICT_TYPE_QP2){
01026         int i;
01027         const int count= mbHeight * absQPStride;
01028         for(i=0; i<(count>>2); i++){
01029             ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
01030         }
01031         for(i<<=2; i<count; i++){
01032             c->stdQPTable[i] = QP_store[i]>>1;
01033         }
01034         QP_store= c->stdQPTable;
01035         QPStride= absQPStride;
01036     }
01037 
01038     if(0){
01039         int x,y;
01040         for(y=0; y<mbHeight; y++){
01041             for(x=0; x<mbWidth; x++){
01042                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
01043             }
01044             av_log(c, AV_LOG_INFO, "\n");
01045         }
01046         av_log(c, AV_LOG_INFO, "\n");
01047     }
01048 
01049     if((pict_type&7)!=3){
01050         if (QPStride >= 0){
01051             int i;
01052             const int count= mbHeight * QPStride;
01053             for(i=0; i<(count>>2); i++){
01054                 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
01055             }
01056             for(i<<=2; i<count; i++){
01057                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
01058             }
01059         } else {
01060             int i,j;
01061             for(i=0; i<mbHeight; i++) {
01062                 for(j=0; j<absQPStride; j++) {
01063                     c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
01064                 }
01065             }
01066         }
01067     }
01068 
01069     av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
01070            mode->lumMode, mode->chromMode);
01071 
01072     postProcess(src[0], srcStride[0], dst[0], dstStride[0],
01073                 width, height, QP_store, QPStride, 0, mode, c);
01074 
01075     width  = (width )>>c->hChromaSubSample;
01076     height = (height)>>c->vChromaSubSample;
01077 
01078     if(mode->chromMode){
01079         postProcess(src[1], srcStride[1], dst[1], dstStride[1],
01080                     width, height, QP_store, QPStride, 1, mode, c);
01081         postProcess(src[2], srcStride[2], dst[2], dstStride[2],
01082                     width, height, QP_store, QPStride, 2, mode, c);
01083     }
01084     else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
01085         linecpy(dst[1], src[1], height, srcStride[1]);
01086         linecpy(dst[2], src[2], height, srcStride[2]);
01087     }else{
01088         int y;
01089         for(y=0; y<height; y++){
01090             memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
01091             memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
01092         }
01093     }
01094 }
01095