mmsfb_blit_blend_argb_to_yv12.cpp

00001 /***************************************************************************
00002  *   Copyright (C) 2005-2007 Stefan Schwarzer, Jens Schneider,             *
00003  *                           Matthias Hardt, Guido Madaus                  *
00004  *                                                                         *
00005  *   Copyright (C) 2007-2008 BerLinux Solutions GbR                        *
00006  *                           Stefan Schwarzer & Guido Madaus               *
00007  *                                                                         *
00008  *   Copyright (C) 2009-2013 BerLinux Solutions GmbH                       *
00009  *                                                                         *
00010  *   Authors:                                                              *
00011  *      Stefan Schwarzer   <stefan.schwarzer@diskohq.org>,                 *
00012  *      Matthias Hardt     <matthias.hardt@diskohq.org>,                   *
00013  *      Jens Schneider     <jens.schneider@diskohq.org>,                   *
00014  *      Guido Madaus       <guido.madaus@diskohq.org>,                     *
00015  *      Patrick Helterhoff <patrick.helterhoff@diskohq.org>,               *
00016  *      René Bählkow       <rene.baehlkow@diskohq.org>                     *
00017  *                                                                         *
00018  *   This library is free software; you can redistribute it and/or         *
00019  *   modify it under the terms of the GNU Lesser General Public            *
00020  *   License version 2.1 as published by the Free Software Foundation.     *
00021  *                                                                         *
00022  *   This library is distributed in the hope that it will be useful,       *
00023  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
00024  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     *
00025  *   Lesser General Public License for more details.                       *
00026  *                                                                         *
00027  *   You should have received a copy of the GNU Lesser General Public      *
00028  *   License along with this library; if not, write to the                 *
00029  *   Free Software Foundation, Inc.,                                       *
00030  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA            *
00031  **************************************************************************/
00032 
00033 #include "mmsgui/fb/mmsfbconv.h"
00034 
00035 #ifdef __HAVE_PF_ARGB__
00036 #ifdef __HAVE_PF_YV12__
00037 
00038 #include "mmstools/mmstools.h"
00039 
00040 
00041 #ifdef __HAVE_SSE__
00042 
// Constant tables used as memory operands by the MMX load/convert macros below.
//
// X1 is combined with the loaded source via pand: the mask 0x00ff00ff keeps
// the low byte of every 16-bit word.
// NOTE(review): v4si / v4six are declared elsewhere; the macros reference these
// objects as (*NAME), so they are presumably array-like vector types - confirm.
00043     v4si X1 = { 0x00ff00ff, 0x00ff00ff };
// Per-component weights fed to pmaddwd (multiply 16-bit words, add adjacent
// products). The *_RBRB tables are applied to the masked source (low byte of
// each word), the *_GG tables to the source shifted right by 8 per word (high
// byte of each word); the second weight of every *_GG pair is 0.
// NOTE(review): the names suggest the low bytes are B/R and the high bytes G/A
// of an ARGB pixel, and the values match the usual integer RGB->YUV weights -
// this relies on v4six holding four 16-bit lanes; confirm against its declaration.
00044     v4six Y_RBRB = { 25, 66, 25, 66 };
00045     v4six Y_GG   = { 129, 0, 129, 0 };
00046     v4six U_RBRB = { 112, -38, 112, -38 };
00047     v4six U_GG   = { -74, 0, -74, 0 };
00048     v4six V_RBRB = { -18, 112, -18, 112 };
00049     v4six V_GG   = { -94, 0, -94, 0 };
00050 
// Offsets added (paddd) after the weighted sum has been shifted right by 8:
// YY is added to the Y result, UV to both the U and the V result.
00051     v4six YY = { 16, 0, 16, 0 };
00052     v4six UV = { 128, 0, 128, 0 };
00053 
00054 
00055 
// MMSFB_BLIT_BLEND_ARGB_TO_YV12_LOAD_SRC_ALPHA
//
// Loads 64 bits from *ssrc->i and converts them to alpha-weighted Y/U/V values
// that are left in MMX registers for the code following the macro.
//
// Names that must be in scope where the macro is expanded:
//   ssrc (pointer with a member i), X1, Y_RBRB, Y_GG, U_RBRB, U_GG,
//   V_RBRB, V_GG, YY, UV and TTTT.
//
// Register state after the macro:
//   mm0 = source AND X1            (low byte of every 16-bit word)
//   mm1 = source >> 8 per word     (high byte of every 16-bit word)
//   mm2 = TTTT - A, where A is mm1 >> 16 per dword (the top byte of each
//         32-bit source value, labelled "A" by the asm comments)
//   mm3 = (((mm0 x Y_RBRB + mm1 x Y_GG) >> 8) + YY) * A   (pmullw: low 16 bits)
//   mm4 = (((mm0 x U_RBRB + mm1 x U_GG) >> 8) + UV) * A
//   mm5 = (((mm0 x V_RBRB + mm1 x V_GG) >> 8) + UV) * A
//   mm7 = 0 (used as scratch during the calculation, cleared by the last step)
//
// NOTE(review): the five asm statements declare no outputs and no clobbers,
// yet values are handed from one statement to the next in mm0-mm7. The
// compiler is not told about this register usage - confirm this is safe with
// the compilers/flags in use.
00056 #define MMSFB_BLIT_BLEND_ARGB_TO_YV12_LOAD_SRC_ALPHA                    \
00057             __asm__ __volatile__ (                                      \
00058                     "###########################################\n\t"   \
00059                     "# load src: x1 -> mm0, x2 -> mm1, A -> mm2 \n\t"   \
00060                     "movq       %[src],     %%mm0               \n\t"   \
00061                     "movq       %%mm0,      %%mm1               \n\t"   \
00062                     "pand       %[X1],      %%mm0               \n\t"   \
00063                     "psrlw      $8,         %%mm1               \n\t"   \
00064                     "movq       %%mm1,      %%mm2               \n\t"   \
00065                     "psrld      $16,        %%mm2               \n\t"   \
00066                     "###########################################\n\t"   \
00067                     : /* no outputs */                                  \
00068                     : [src] "m" (*ssrc->i), [X1] "m" (*X1)              \
00069                     );                                                  \
00070             __asm__ __volatile__ (                                      \
00071                     "###########################################\n\t"   \
00072                     "# calc Y in mm3                            \n\t"   \
00073                     "movq       %%mm0,      %%mm3               \n\t"   \
00074                     "pmaddwd    %[Y_RBRB],  %%mm3               \n\t"   \
00075                     "movq       %%mm1,      %%mm7               \n\t"   \
00076                     "pmaddwd    %[Y_GG],    %%mm7               \n\t"   \
00077                     "paddd      %%mm7,      %%mm3               \n\t"   \
00078                     "psrld      $8,         %%mm3               \n\t"   \
00079                     "paddd      %[YY],      %%mm3               \n\t"   \
00080                     "pmullw     %%mm2,      %%mm3               \n\t"   \
00081                     "###########################################\n\t"   \
00082                     : /* no outputs */                                  \
00083                     : [Y_RBRB] "m" (*Y_RBRB), [Y_GG] "m" (*Y_GG), [YY] "m" (*YY)    \
00084                     );                                                  \
00085             __asm__ __volatile__ (                                      \
00086                     "###########################################\n\t"   \
00087                     "# calc U in mm4                            \n\t"   \
00088                     "movq       %%mm0,      %%mm4               \n\t"   \
00089                     "pmaddwd    %[U_RBRB],  %%mm4               \n\t"   \
00090                     "movq       %%mm1,      %%mm7               \n\t"   \
00091                     "pmaddwd    %[U_GG],    %%mm7               \n\t"   \
00092                     "paddd      %%mm7,      %%mm4               \n\t"   \
00093                     "psrld      $8,         %%mm4               \n\t"   \
00094                     "paddd      %[UV],      %%mm4               \n\t"   \
00095                     "pmullw     %%mm2,      %%mm4               \n\t"   \
00096                     "###########################################\n\t"   \
00097                     : /* no outputs */                                  \
00098                     : [U_RBRB] "m" (*U_RBRB), [U_GG] "m" (*U_GG), [UV] "m" (*UV)    \
00099                     );                                                  \
00100             __asm__ __volatile__ (                                      \
00101                     "###########################################\n\t"   \
00102                     "# calc V in mm5                            \n\t"   \
00103                     "movq       %%mm0,      %%mm5               \n\t"   \
00104                     "pmaddwd    %[V_RBRB],  %%mm5               \n\t"   \
00105                     "movq       %%mm1,      %%mm7               \n\t"   \
00106                     "pmaddwd    %[V_GG],    %%mm7               \n\t"   \
00107                     "paddd      %%mm7,      %%mm5               \n\t"   \
00108                     "psrld      $8,         %%mm5               \n\t"   \
00109                     "paddd      %[UV],      %%mm5               \n\t"   \
00110                     "pmullw     %%mm2,      %%mm5               \n\t"   \
00111                     "###########################################\n\t"   \
00112                     : /* no outputs */                                  \
00113                     : [V_RBRB] "m" (*V_RBRB), [V_GG] "m" (*V_GG), [UV] "m" (*UV)    \
00114                     );                                                  \
00115             __asm__ __volatile__ (                                      \
00116                     "###########################################\n\t"   \
00117                     "# calc A in mm2                            \n\t"   \
00118                     "movq       %%mm2,      %%mm7               \n\t"   \
00119                     "movq       %[TTTT],    %%mm2               \n\t"   \
00120                     "psubd      %%mm7,      %%mm2               \n\t"   \
00121                     "###########################################\n\t"   \
00122                     "# important: clear mm7!!!                  \n\t"   \
00123                     "pxor       %%mm7,      %%mm7               \n\t"   \
00124                     "###########################################\n\t"   \
00125                     : /* no outputs */                                  \
00126                     : [TTTT] "m" (*TTTT)                                \
00127                     );
00128 
00129 
00130 
00131 
// MMSFB_BLIT_BLEND_ARGB_TO_YV12_LOAD_SRC
//
// Loads 64 bits from *ssrc->i and converts them to Y/U/V values without any
// alpha weighting; the results are left in MMX registers for the code that
// follows the macro.
//
// Names that must be in scope where the macro is expanded:
//   ssrc (pointer with a member i), X1, Y_RBRB, Y_GG, U_RBRB, U_GG,
//   V_RBRB, V_GG, YY and UV.
//
// Register state after the macro:
//   mm0 = source AND X1            (low byte of every 16-bit word)
//   mm1 = source >> 8 per word     (high byte of every 16-bit word)
//   mm3 = ((mm0 x Y_RBRB + mm1 x Y_GG) >> 8) + YY
//   mm4 = ((mm0 x U_RBRB + mm1 x U_GG) >> 8) + UV
//   mm5 = ((mm0 x V_RBRB + mm1 x V_GG) >> 8) + UV
//   mm7 = 0 (used as scratch during the calculation, cleared by the last step;
//         the blit code uses it afterwards as the zero operand of psadbw)
//   mm2 is not touched by this macro.
//
// NOTE(review): the four asm statements declare no outputs and no clobbers,
// yet values are handed from one statement to the next in mm0-mm7. The
// compiler is not told about this register usage - confirm this is safe with
// the compilers/flags in use.
00132 #define MMSFB_BLIT_BLEND_ARGB_TO_YV12_LOAD_SRC                          \
00133             __asm__ __volatile__ (                                      \
00134                     "###########################################\n\t"   \
00135                     "# load src: x1 -> mm0, x2 -> mm1           \n\t"   \
00136                     "movq       %[src],     %%mm0               \n\t"   \
00137                     "movq       %%mm0,      %%mm1               \n\t"   \
00138                     "pand       %[X1],      %%mm0               \n\t"   \
00139                     "psrlw      $8,         %%mm1               \n\t"   \
00140                     "###########################################\n\t"   \
00141                     : /* no outputs */                                  \
00142                     : [src] "m" (*ssrc->i), [X1] "m" (*X1)              \
00143                     );                                                  \
00144             __asm__ __volatile__ (                                      \
00145                     "###########################################\n\t"   \
00146                     "# calc Y in mm3                            \n\t"   \
00147                     "movq       %%mm0,      %%mm3               \n\t"   \
00148                     "pmaddwd    %[Y_RBRB],  %%mm3               \n\t"   \
00149                     "movq       %%mm1,      %%mm7               \n\t"   \
00150                     "pmaddwd    %[Y_GG],    %%mm7               \n\t"   \
00151                     "paddd      %%mm7,      %%mm3               \n\t"   \
00152                     "psrld      $8,         %%mm3               \n\t"   \
00153                     "paddd      %[YY],      %%mm3               \n\t"   \
00154                     "###########################################\n\t"   \
00155                     : /* no outputs */                                  \
00156                     : [Y_RBRB] "m" (*Y_RBRB), [Y_GG] "m" (*Y_GG), [YY] "m" (*YY)    \
00157                     );                                                  \
00158             __asm__ __volatile__ (                                      \
00159                     "###########################################\n\t"   \
00160                     "# calc U in mm4                            \n\t"   \
00161                     "movq       %%mm0,      %%mm4               \n\t"   \
00162                     "pmaddwd    %[U_RBRB],  %%mm4               \n\t"   \
00163                     "movq       %%mm1,      %%mm7               \n\t"   \
00164                     "pmaddwd    %[U_GG],    %%mm7               \n\t"   \
00165                     "paddd      %%mm7,      %%mm4               \n\t"   \
00166                     "psrld      $8,         %%mm4               \n\t"   \
00167                     "paddd      %[UV],      %%mm4               \n\t"   \
00168                     "###########################################\n\t"   \
00169                     : /* no outputs */                                  \
00170                     : [U_RBRB] "m" (*U_RBRB), [U_GG] "m" (*U_GG), [UV] "m" (*UV)    \
00171                     );                                                  \
00172             __asm__ __volatile__ (                                      \
00173                     "###########################################\n\t"   \
00174                     "# calc V in mm5                            \n\t"   \
00175                     "movq       %%mm0,      %%mm5               \n\t"   \
00176                     "pmaddwd    %[V_RBRB],  %%mm5               \n\t"   \
00177                     "movq       %%mm1,      %%mm7               \n\t"   \
00178                     "pmaddwd    %[V_GG],    %%mm7               \n\t"   \
00179                     "paddd      %%mm7,      %%mm5               \n\t"   \
00180                     "psrld      $8,         %%mm5               \n\t"   \
00181                     "paddd      %[UV],      %%mm5               \n\t"   \
00182                     "###########################################\n\t"   \
00183                     "# important: clear mm7!!!                  \n\t"   \
00184                     "pxor       %%mm7,      %%mm7               \n\t"   \
00185                     "###########################################\n\t"   \
00186                     : /* no outputs */                                  \
00187                     : [V_RBRB] "m" (*V_RBRB), [V_GG] "m" (*V_GG), [UV] "m" (*UV)    \
00188                     );
00189 
00190 
00191 
00192 
00193 
00194 
00195 #endif
00196 
00197 
00198 void mmsfb_blit_blend_argb_to_yv12(MMSFBExternalSurfaceBuffer *extbuf, int src_height, int sx, int sy, int sw, int sh,
00199                                    unsigned char *dst, int dst_pitch, int dst_height, int dx, int dy) {
00200 
00201     // first time?
00202     static bool firsttime = true;
00203     if (firsttime) {
00204         printf("DISKO: Using accelerated blend ARGB to YV12.\n");
00205         firsttime = false;
00206     }
00207 
00208     // get the first source ptr/pitch
00209     unsigned int *src = (unsigned int *)extbuf->ptr;
00210     int src_pitch = extbuf->pitch;
00211 
00212     // prepare...
00213     int  src_pitch_pix      = src_pitch >> 2;
00214     int dst_pitch_pix       = dst_pitch;
00215     int dst_pitch_pix_half  = dst_pitch_pix >> 1;
00216 
00217     src+= sx + sy * src_pitch_pix;
00218 
00219     // check the surface range
00220     if (dst_pitch_pix - dx < sw - sx)
00221         sw = dst_pitch_pix - dx - sx;
00222     if (dst_height - dy < sh - sy)
00223         sh = dst_height - dy - sy;
00224     if ((sw <= 0)||(sh <= 0))
00225         return;
00226 
00227     unsigned int OLDSRC  = (*src) + 1;
00228 
00229     unsigned int old_y;
00230     unsigned int old_u;
00231     unsigned int old_v;
00232 
00233     int  src_pixels = src_pitch_pix * sh;
00234 
00235     // check odd/even
00236     bool odd_left   = (dx & 0x01);
00237     bool odd_top    = (dy & 0x01);
00238     bool odd_right  = ((dx + sw) & 0x01);
00239     bool odd_bottom = ((dy + sh) & 0x01);
00240 
00241     // pointer to the pixel components of the first pixel
00242     unsigned char *dst_y = dst + dx + dy * dst_pitch_pix;
00243     unsigned char *dst_u = dst + dst_pitch_pix * dst_height + dst_pitch_pix_half * (dst_height >> 1) + (dx >> 1) + (dy >> 1) * dst_pitch_pix_half;
00244     unsigned char *dst_v = dst + dst_pitch_pix * dst_height                                          + (dx >> 1) + (dy >> 1) * dst_pitch_pix_half;
00245 
00246     // offsets to the other three pixels
00247     unsigned int dst_y2_offs = 1;
00248     unsigned int dst_y3_offs = dst_pitch;
00249     unsigned int src2_offs = 1;
00250     unsigned int src3_offs = src_pitch_pix;
00251 
00252     // arithmetic mean
00253     register unsigned int d_u;
00254     register unsigned int d_v;
00255 
00256     // draw odd pixels around the even rectangle
00257     if (odd_top && odd_left) {
00258         // odd top-left pixel
00259         register unsigned int SRC;
00260         register unsigned int A;
00261 
00262         // for arithmetic mean we have to set U and V from pixels outside the current rectangle
00263         d_u = (*dst_u) * 3;
00264         d_v = (*dst_v) * 3;
00265 
00266         // calculate my pixel...
00267         MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(*src, *dst_y, *dst_u, *dst_v, d_u+=, d_v+=);
00268 
00269         // calulate the arithmetic mean
00270         *dst_u = d_u >> 2;
00271         *dst_v = d_v >> 2;
00272     }
00273 
00274     if (odd_top && odd_right) {
00275         // odd top-right pixel
00276         MMSFB_CONV_BLEND_ARGB_TO_YV12_PUSHPTR;
00277 
00278         // go to the pixel in the current line
00279         src   += sw - 1;
00280         dst_y += sw - 1;
00281         if (odd_left) {
00282             dst_u += sw >> 1;
00283             dst_v += sw >> 1;
00284         }
00285         else {
00286             dst_u += (sw - 1) >> 1;
00287             dst_v += (sw - 1) >> 1;
00288         }
00289 
00290         register unsigned int SRC;
00291         register unsigned int A;
00292 
00293         // for arithmetic mean we have to set U and V from pixels outside the current rectangle
00294         d_u = (*dst_u) * 3;
00295         d_v = (*dst_v) * 3;
00296 
00297         // calculate my pixel...
00298         MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(*src, *dst_y, *dst_u, *dst_v, d_u+=, d_v+=);
00299 
00300         // calulate the arithmetic mean
00301         *dst_u = d_u >> 2;
00302         *dst_v = d_v >> 2;
00303 
00304         // restore the pointers
00305         MMSFB_CONV_BLEND_ARGB_TO_YV12_POPPTR;
00306     }
00307 
00308     if (odd_bottom && odd_left) {
00309         // odd bottom-left pixel
00310         MMSFB_CONV_BLEND_ARGB_TO_YV12_PUSHPTR;
00311 
00312         // go to the line
00313         src   += src_pitch_pix * (sh-1);
00314         dst_y += dst_pitch_pix * (sh-1);
00315         if (odd_top) {
00316             dst_u += dst_pitch_pix_half * (sh >> 1);
00317             dst_v += dst_pitch_pix_half * (sh >> 1);
00318         }
00319         else {
00320             dst_u += dst_pitch_pix_half * ((sh-1) >> 1);
00321             dst_v += dst_pitch_pix_half * ((sh-1) >> 1);
00322         }
00323 
00324         register unsigned int SRC;
00325         register unsigned int A;
00326 
00327         // for arithmetic mean we have to set U and V from pixels outside the current rectangle
00328         d_u = (*dst_u) * 3;
00329         d_v = (*dst_v) * 3;
00330 
00331         // calculate my pixel...
00332         MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(*src, *dst_y, *dst_u, *dst_v, d_u+=, d_v+=);
00333 
00334         // calulate the arithmetic mean
00335         *dst_u = d_u >> 2;
00336         *dst_v = d_v >> 2;
00337 
00338         // restore the pointers
00339         MMSFB_CONV_BLEND_ARGB_TO_YV12_POPPTR;
00340     }
00341 
00342     if (odd_bottom && odd_right) {
00343         // odd bottom-right pixel
00344         MMSFB_CONV_BLEND_ARGB_TO_YV12_PUSHPTR;
00345 
00346         // go to the line
00347         src   += src_pitch_pix * (sh-1);
00348         dst_y += dst_pitch_pix * (sh-1);
00349         if (odd_top) {
00350             dst_u += dst_pitch_pix_half * (sh >> 1);
00351             dst_v += dst_pitch_pix_half * (sh >> 1);
00352         }
00353         else {
00354             dst_u += dst_pitch_pix_half * ((sh-1) >> 1);
00355             dst_v += dst_pitch_pix_half * ((sh-1) >> 1);
00356         }
00357 
00358         // go to the pixel in the current line
00359         src   += sw - 1;
00360         dst_y += sw - 1;
00361         if (odd_left) {
00362             dst_u += sw >> 1;
00363             dst_v += sw >> 1;
00364         }
00365         else {
00366             dst_u += (sw - 1) >> 1;
00367             dst_v += (sw - 1) >> 1;
00368         }
00369 
00370         register unsigned int SRC;
00371         register unsigned int A;
00372 
00373         // for arithmetic mean we have to set U and V from pixels outside the current rectangle
00374         d_u = (*dst_u) * 3;
00375         d_v = (*dst_v) * 3;
00376 
00377         // calculate my pixel...
00378         MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(*src, *dst_y, *dst_u, *dst_v, d_u+=, d_v+=);
00379 
00380         // calulate the arithmetic mean
00381         *dst_u = d_u >> 2;
00382         *dst_v = d_v >> 2;
00383 
00384         // restore the pointers
00385         MMSFB_CONV_BLEND_ARGB_TO_YV12_POPPTR;
00386     }
00387 
00388     if (odd_top) {
00389         // odd top line
00390         MMSFB_CONV_BLEND_ARGB_TO_YV12_PUSHPTR;
00391 
00392         // calculate start and end
00393         unsigned int *line_end = src + sw;
00394         if (odd_left) {
00395             src++;
00396             dst_y++;
00397             dst_u++;
00398             dst_v++;
00399             line_end--;
00400         }
00401         if (odd_right)
00402             line_end--;
00403 
00404         // through the line
00405         while (src < line_end) {
00406             register unsigned int SRC;
00407             register unsigned int A;
00408 
00409             // for arithmetic mean we have to set U and V from pixels outside the current rectangle
00410             d_u = (*dst_u) << 1;
00411             d_v = (*dst_v) << 1;
00412 
00413             // calculate my two pixels...
00414             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(*src, *dst_y, *dst_u, *dst_v, d_u+=, d_v+=);
00415             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(src[src2_offs], dst_y[dst_y2_offs], *dst_u, *dst_v, d_u+=, d_v+=);
00416 
00417             // calulate the arithmetic mean
00418             *dst_u = d_u >> 2;
00419             *dst_v = d_v >> 2;
00420 
00421             // go to the next two pixels
00422             src+=2;
00423             dst_y+=2;
00424             dst_u++;
00425             dst_v++;
00426         }
00427 
00428         // restore the pointers
00429         MMSFB_CONV_BLEND_ARGB_TO_YV12_POPPTR;
00430     }
00431 
00432     if (odd_bottom) {
00433         // odd bottom line
00434         MMSFB_CONV_BLEND_ARGB_TO_YV12_PUSHPTR;
00435 
00436         // calculate start and end
00437         src   += src_pitch_pix * (sh-1);
00438         dst_y += dst_pitch_pix * (sh-1);
00439         if (odd_top) {
00440             dst_u += dst_pitch_pix_half * (sh >> 1);
00441             dst_v += dst_pitch_pix_half * (sh >> 1);
00442         }
00443         else {
00444             dst_u += dst_pitch_pix_half * ((sh-1) >> 1);
00445             dst_v += dst_pitch_pix_half * ((sh-1) >> 1);
00446         }
00447 
00448         unsigned int *line_end = src + sw;
00449         if (odd_left) {
00450             src++;
00451             dst_y++;
00452             dst_u++;
00453             dst_v++;
00454             line_end--;
00455         }
00456         if (odd_right)
00457             line_end--;
00458 
00459         // through the line
00460         while (src < line_end) {
00461             register unsigned int SRC;
00462             register unsigned int A;
00463 
00464             // for arithmetic mean we have to set U and V from pixels outside the current rectangle
00465             d_u = (*dst_u) << 1;
00466             d_v = (*dst_v) << 1;
00467 
00468             // calculate my two pixels...
00469             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(*src, *dst_y, *dst_u, *dst_v, d_u+=, d_v+=);
00470             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(src[src2_offs], dst_y[dst_y2_offs], *dst_u, *dst_v, d_u+=, d_v+=);
00471 
00472             // calulate the arithmetic mean
00473             *dst_u = d_u >> 2;
00474             *dst_v = d_v >> 2;
00475 
00476             // go to the next two pixels
00477             src+=2;
00478             dst_y+=2;
00479             dst_u++;
00480             dst_v++;
00481         }
00482 
00483         // restore the pointers
00484         MMSFB_CONV_BLEND_ARGB_TO_YV12_POPPTR;
00485     }
00486 
00487     if (odd_left) {
00488         // odd left line
00489         MMSFB_CONV_BLEND_ARGB_TO_YV12_PUSHPTR;
00490 
00491         // calculate start and end
00492         unsigned int *src_end = src + src_pixels;
00493         int src_pitch_diff    = src_pitch_pix << 1;
00494         int dst_pitch_diff    = dst_pitch_pix << 1;
00495         int dst_pitch_uvdiff  = dst_pitch_pix_half;
00496         if (odd_top) {
00497             src     += src_pitch_pix;
00498             src_end -= src_pitch_pix;
00499             dst_y   += dst_pitch_pix;
00500             dst_u   += dst_pitch_pix_half;
00501             dst_v   += dst_pitch_pix_half;
00502         }
00503         if (odd_bottom)
00504             src_end -= src_pitch_pix;
00505 
00506         // through all lines
00507         while (src < src_end) {
00508             // for the first pixel in the line
00509             register unsigned int SRC;
00510             register unsigned int A;
00511 
00512             // for arithmetic mean we have to set U and V from pixels outside the current rectangle
00513             d_u = (*dst_u) << 1;
00514             d_v = (*dst_v) << 1;
00515 
00516             // calculate my two pixels...
00517             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(*src, *dst_y, *dst_u, *dst_v, d_u+=, d_v+=);
00518             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(src[src3_offs], dst_y[dst_y3_offs], *dst_u, *dst_v, d_u+=, d_v+=);
00519 
00520             // calulate the arithmetic mean
00521             *dst_u = d_u >> 2;
00522             *dst_v = d_v >> 2;
00523 
00524             // go to the next two lines
00525             src   += src_pitch_diff;
00526             dst_y += dst_pitch_diff;
00527             dst_u += dst_pitch_uvdiff;
00528             dst_v += dst_pitch_uvdiff;
00529         }
00530 
00531         // restore the pointers
00532         MMSFB_CONV_BLEND_ARGB_TO_YV12_POPPTR;
00533     }
00534 
00535     if (odd_right) {
00536         // odd right line
00537         MMSFB_CONV_BLEND_ARGB_TO_YV12_PUSHPTR;
00538 
00539         // calculate start and end
00540         unsigned int *src_end = src + src_pixels;
00541         int src_pitch_diff    = src_pitch_pix << 1;
00542         int dst_pitch_diff    = dst_pitch_pix << 1;
00543         int dst_pitch_uvdiff  = dst_pitch_pix_half;
00544         src   += sw - 1;
00545         dst_y += sw - 1;
00546         if (odd_left) {
00547             dst_u += sw >> 1;
00548             dst_v += sw >> 1;
00549         }
00550         else {
00551             dst_u += (sw - 1) >> 1;
00552             dst_v += (sw - 1) >> 1;
00553         }
00554         if (odd_top) {
00555             src     += src_pitch_pix;
00556             src_end -= src_pitch_pix;
00557             dst_y   += dst_pitch_pix;
00558             dst_u   += dst_pitch_pix_half;
00559             dst_v   += dst_pitch_pix_half;
00560         }
00561         if (odd_bottom)
00562             src_end -= src_pitch_pix;
00563 
00564         // through all lines
00565         while (src < src_end) {
00566             // for the first pixel in the line
00567             register unsigned int SRC;
00568             register unsigned int A;
00569 
00570             // for arithmetic mean we have to set U and V from pixels outside the current rectangle
00571             d_u = (*dst_u) << 1;
00572             d_v = (*dst_v) << 1;
00573 
00574             // calculate my two pixels...
00575             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(*src, *dst_y, *dst_u, *dst_v, d_u+=, d_v+=);
00576             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(src[src3_offs], dst_y[dst_y3_offs], *dst_u, *dst_v, d_u+=, d_v+=);
00577 
00578             // calulate the arithmetic mean
00579             *dst_u = d_u >> 2;
00580             *dst_v = d_v >> 2;
00581 
00582             // go to the next two lines
00583             src   += src_pitch_diff;
00584             dst_y += dst_pitch_diff;
00585             dst_u += dst_pitch_uvdiff;
00586             dst_v += dst_pitch_uvdiff;
00587         }
00588 
00589         // restore the pointers
00590         MMSFB_CONV_BLEND_ARGB_TO_YV12_POPPTR;
00591     }
00592 
00593     // calc even positions...
00594     if (odd_top) {
00595         // odd top
00596         dy++;
00597         sh--;
00598         src+=src_pitch_pix;
00599         src_pixels-=src_pitch_pix;
00600         dst_y+=dst_pitch;
00601         dst_u+=dst_pitch >> 1;
00602         dst_v+=dst_pitch >> 1;
00603     }
00604 
00605     if (odd_bottom) {
00606         // odd bottom
00607         src_height--;
00608         src_pixels-=src_pitch_pix;
00609     }
00610 
00611     if (odd_left) {
00612         // odd left
00613         dx++;
00614         sw--;
00615         src++;
00616         dst_y++;
00617         dst_u++;
00618         dst_v++;
00619     }
00620 
00621     if (odd_right) {
00622         // odd right
00623         sw--;
00624     }
00625 
00626     // now we are even aligned and can go through a optimized loop
00627     ////////////////////////////////////////////////////////////////////////
00628 
00629 #ifndef __HAVE_SSE__
00630     unsigned int dst_y4_offs = dst_y3_offs + 1;
00631     unsigned int src4_offs = src3_offs + 1;
00632 
00633     // without mmx/sse
00634     unsigned int *src_end = src + src_pixels;
00635     int src_pitch_diff = (src_pitch_pix << 1) - sw;
00636     int dst_pitch_diff = (dst_pitch_pix << 1) - sw;
00637     int dst_pitch_uvdiff = (dst_pitch_pix - sw) >> 1;
00638 
00639     // for all lines
00640     while (src < src_end) {
00641         // for all pixels in the line
00642         unsigned int *line_end = src + sw;
00643 
00644         // go through two lines in parallel (square 2x2 pixel)
00645         while (src < line_end) {
00646             register unsigned int SRC;
00647             register unsigned int A;
00648 
00649             // calculate the four pixels...
00650             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(*src, *dst_y, *dst_u, *dst_v, d_u=, d_v=);
00651             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(src[src2_offs], dst_y[dst_y2_offs], *dst_u, *dst_v, d_u+=, d_v+=);
00652             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(src[src3_offs], dst_y[dst_y3_offs], *dst_u, *dst_v, d_u+=, d_v+=);
00653             MMSFB_CONV_BLEND_ARGB_TO_YV12_PIXEL(src[src4_offs], dst_y[dst_y4_offs], *dst_u, *dst_v, d_u+=, d_v+=);
00654 
00655             // calulate the arithmetic mean
00656             *dst_u = d_u >> 2;
00657             *dst_v = d_v >> 2;
00658 
00659             // go to the next two pixels
00660             src  +=2;
00661             dst_y+=2;
00662             dst_u++;
00663             dst_v++;
00664         }
00665 
00666         // go to the next two lines
00667         src   += src_pitch_diff;
00668         dst_y += dst_pitch_diff;
00669         dst_u += dst_pitch_uvdiff;
00670         dst_v += dst_pitch_uvdiff;
00671     }
00672 
00673 #else
00674 
00675     // with mmx/sse
00676 //  static v4six TTT = { 0,0,0,0 };
00677     static v4six TTTT = { 0x100,0,0x100,0 };
00678 
00679 
00680 
00681 
00682 
00683     _v4si *src_end = (_v4si *)(src + src_pixels);
00684     _v4si *ssrc = (_v4si *)src;
00685     int src_pitch_diff = (src_pitch_pix << 1) - sw;
00686     int dst_pitch_diff = (dst_pitch_pix << 1) - sw;
00687     int dst_pitch_uvdiff = (dst_pitch_pix - sw) >> 1;
00688 
00689 
00690     src3_offs = src3_offs>>1;
00691     sw = sw >> 1;
00692     src_pitch_diff = src_pitch_diff >> 1;
00693 
00694 
00695 
00696     int src3_offsX = src3_offs-1;
00697     int dst_y3_offsX = dst_y3_offs-2;
00698 
00699     _v4si   OLDSRC_MMX;
00700     OLDSRC_MMX.i[0] = ssrc->i[0]+1;
00701     OLDSRC_MMX.i[1] = ssrc->i[1]+1;
00702 
00703     // for all lines
00704     while (ssrc < src_end) {
00705         // for all pixels in the line
00706         _v4si *line_end = ssrc + sw;
00707 
00708         // go through two lines in parallel (square 2x2 pixel)
00709         while (ssrc < line_end) {
00710             if ((ssrc->c[3]==0xff)&&(ssrc->c[7]==0xff)) {
00711                 // alpha channel == 0xff for both pixels
00712                 if ((ssrc->i[0] != OLDSRC_MMX.i[0])||(ssrc->i[1] != OLDSRC_MMX.i[1])) {
00713                     // convert argb source to yv12
00714                     MMSFB_BLIT_BLEND_ARGB_TO_YV12_LOAD_SRC;
00715                     OLDSRC_MMX = *ssrc;
00716                 }
00717 
00718                 __asm__ __volatile__ (
00719                         "###########################################\n\t"
00720                         "# save the two Y values                    \n\t"
00721                         "pextrw     $0,         %%mm3,      %%eax   \n\t"
00722                         "pextrw     $2,         %%mm3,      %%ecx   \n\t"
00723                         "mov        %%cl,       %%ah                \n\t"
00724                         "mov        %%ax,       %[dst_y]            \n\t"
00725                         "###########################################\n\t"
00726                         "# load reg mm0 with the U value            \n\t"
00727                         "movq       %%mm4,      %%mm0               \n\t"
00728                         "psadbw     %%mm7,      %%mm0               \n\t"
00729                         "# save the U result in mm6                 \n\t"
00730                         "movq       %%mm0,      %%mm6               \n\t"
00731                         "###########################################\n\t"
00732                         "# load reg mm0 with the V value            \n\t"
00733                         "movq       %%mm5,      %%mm0               \n\t"
00734                         "psadbw     %%mm7,      %%mm0               \n\t"
00735                         "pextrw     $0,         %%mm0,      %%eax   \n\t"
00736                         "# save the V result in mm6                 \n\t"
00737                         "pinsrw     $2,         %%eax,      %%mm6   \n\t"
00738                         "###########################################\n\t"
00739                         : [dst_y] "=m" (*dst_y)             // outputs
00740                         :                                   // inputs
00741                         : "cc", "%eax", "%ecx"              // clobbers
00742                         );
00743 
00744             }
00745             else
00746             if ((!ssrc->c[3])&&(!ssrc->c[7])) {
00747                 // alpha channel == 0x00 for both pixels
00748                 if ((ssrc->i[0] != OLDSRC_MMX.i[0])||(ssrc->i[1] != OLDSRC_MMX.i[1])) {
00749                     // pixel value has changed
00750                     OLDSRC_MMX = *ssrc;
00751                 }
00752 
00753                 // calculate U/V values anyway, because we don't know whether the next two pixels also have alpha 0x00
00754                 __asm__ __volatile__ (
00755                         "###########################################\n\t"
00756                         "# load reg eax with the U value            \n\t"
00757                         "xor        %%eax,      %%eax               \n\t"
00758                         "mov        %[dst_u],   %%al                \n\t"
00759                         "# calc U * 2                               \n\t"
00760                         "shl        $1,         %%ax                \n\t"
00761                         "# save the U result in mm6                 \n\t"
00762                         "pinsrw     $0,         %%eax,      %%mm6   \n\t"
00763                         "###########################################\n\t"
00764                         "# load reg eax with the V value            \n\t"
00765                         "xor        %%eax,      %%eax               \n\t"
00766                         "mov        %[dst_v],   %%al                \n\t"
00767                         "# calc V * 2                               \n\t"
00768                         "shl        $1,         %%ax                \n\t"
00769                         "# save the V result in mm6                 \n\t"
00770                         "pinsrw     $2,         %%eax,      %%mm6   \n\t"
00771                         "###########################################\n\t"
00772                         :                                               // outputs
00773                         : [dst_u] "m" (*dst_u), [dst_v] "m" (*dst_v)    // inputs
00774                         : "cc", "%eax"                                  // clobbers
00775                         );
00776             }
00777             else {
00778                 if ((ssrc->i[0] != OLDSRC_MMX.i[0])||(ssrc->i[1] != OLDSRC_MMX.i[1])) {
00779                     // convert argb source to yv12
00780                     MMSFB_BLIT_BLEND_ARGB_TO_YV12_LOAD_SRC_ALPHA;
00781                     OLDSRC_MMX = *ssrc;
00782                 }
00783                 __asm__ __volatile__ (
00784                         "###########################################\n\t"
00785                         "# load reg mm0 with the two Y values       \n\t"
00786                         "pxor       %%mm0,      %%mm0               \n\t"
00787                         "mov        %[dst_y],   %%ax                \n\t"
00788                         "mov        %%ax,       %%cx                \n\t"
00789                         "xor        %%ah,       %%ah                \n\t"
00790                         "shr        $8,         %%cx                \n\t"
00791                         "pinsrw     $0,         %%eax,      %%mm0   \n\t"
00792                         "pinsrw     $2,         %%ecx,      %%mm0   \n\t"
00793                         "# calc Y                                   \n\t"
00794                         "pmullw     %%mm2,      %%mm0               \n\t"
00795                         "paddw      %%mm3,      %%mm0               \n\t"
00796                         "psrlw      $8,         %%mm0               \n\t"
00797                         "# save the two Y results                   \n\t"
00798                         "pextrw     $0,         %%mm0,      %%eax   \n\t"
00799                         "pextrw     $2,         %%mm0,      %%ecx   \n\t"
00800                         "mov        %%cl,       %%ah                \n\t"
00801                         "mov        %%ax,       %[dst_y]            \n\t"
00802                         "###########################################\n\t"
00803                         : [dst_y] "+m" (*dst_y)             // outputs
00804                         :                                   // inputs
00805                         : "cc", "%eax", "%ecx"              // clobbers
00806                         );
00807 
00808                 __asm__ __volatile__ (
00809                         "###########################################\n\t"
00810                         "# load reg mm0 with the U value            \n\t"
00811                         "xor        %%eax,      %%eax               \n\t"
00812                         "mov        %[dst_u],   %%al                \n\t"
00813                         "pinsrw     $0,         %%eax,      %%mm0   \n\t"
00814                         "pinsrw     $2,         %%eax,      %%mm0   \n\t"
00815                         "# calc U                                   \n\t"
00816                         "pmullw     %%mm2,      %%mm0               \n\t"
00817                         "paddw      %%mm4,      %%mm0               \n\t"
00818                         "psrlw      $8,         %%mm0               \n\t"
00819                         "psadbw     %%mm7,      %%mm0               \n\t"
00820                         "# save the U result in mm6                 \n\t"
00821                         "movq       %%mm0,      %%mm6               \n\t"
00822                         "###########################################\n\t"
00823                         "# load reg mm0 with the V value            \n\t"
00824                         "xor        %%eax,      %%eax               \n\t"
00825                         "mov        %[dst_v],   %%al                \n\t"
00826                         "pinsrw     $0,         %%eax,      %%mm0   \n\t"
00827                         "pinsrw     $2,         %%eax,      %%mm0   \n\t"
00828                         "# calc V                                   \n\t"
00829                         "pmullw     %%mm2,      %%mm0               \n\t"
00830                         "paddw      %%mm5,      %%mm0               \n\t"
00831                         "psrlw      $8,         %%mm0               \n\t"
00832                         "psadbw     %%mm7,      %%mm0               \n\t"
00833                         "# save the V result in mm6                 \n\t"
00834                         "pextrw     $0,         %%mm0,      %%eax   \n\t"
00835                         "pinsrw     $2,         %%eax,      %%mm6   \n\t"
00836                         "###########################################\n\t"
00837                         :                                               // outputs
00838                         : [dst_u] "m" (*dst_u), [dst_v] "m" (*dst_v)    // inputs
00839                         : "cc", "%eax"                                  // clobbers
00840                         );
00841             }
00842 
00843             ssrc+=src3_offs;
00844             dst_y+=dst_y3_offs;
00845 
00846             if ((ssrc->c[3]==0xff)&&(ssrc->c[7]==0xff)) {
00847                 // alpha channel == 0xff for both pixels
00848                 if ((ssrc->i[0] != OLDSRC_MMX.i[0])||(ssrc->i[1] != OLDSRC_MMX.i[1])) {
00849                     // pixel value has changed, convert argb source to yv12
00850                     MMSFB_BLIT_BLEND_ARGB_TO_YV12_LOAD_SRC;
00851                     OLDSRC_MMX = *ssrc;
00852 
00853                     // calculate the U/V values
00854                     __asm__ __volatile__ (
00855                             "###########################################\n\t"
00856                             "# load reg mm0 with the U value            \n\t"
00857                             "movq       %%mm4,      %%mm0               \n\t"
00858                             "psadbw     %%mm7,      %%mm0               \n\t"
00859                             "# save the U result to memory              \n\t"
00860                             "paddw      %%mm6,      %%mm0               \n\t"
00861                             "pextrw     $0,         %%mm0,      %%eax   \n\t"
00862                             "shr        $2,         %%eax               \n\t"
00863                             "mov        %%al,       %[dst_u]            \n\t"
00864                             "###########################################\n\t"
00865                             "# load reg mm0 with the V value            \n\t"
00866                             "movq       %%mm5,      %%mm0               \n\t"
00867                             "psadbw     %%mm7,      %%mm0               \n\t"
00868                             "# save the V result to memory              \n\t"
00869                             "pextrw     $0,         %%mm0,      %%eax   \n\t"
00870                             "pextrw     $2,         %%mm6,      %%ecx   \n\t"
00871                             "add        %%ecx,      %%eax               \n\t"
00872                             "shr        $2,         %%eax               \n\t"
00873                             "mov        %%al,       %[dst_v]            \n\t"
00874                             "###########################################\n\t"
00875                             : [dst_u] "=m" (*dst_u), [dst_v] "=m" (*dst_v)  // outputs
00876                             :                                               // inputs
00877                             : "cc", "%eax", "%ecx"                          // clobbers
00878                             );
00879                 }
00880                 else {
00881                     // pixel value has NOT changed, so we can use an optimized calculation of U and V
00882 
00883                     // calculate the U/V values
00884                     __asm__ __volatile__ (
00885                             "###########################################\n\t"
00886                             "# save the U result to memory              \n\t"
00887                             "pextrw     $0,         %%mm6,      %%eax   \n\t"
00888                             "shr        $1,         %%eax               \n\t"
00889                             "mov        %%al,       %[dst_u]            \n\t"
00890                             "###########################################\n\t"
00891                             "# save the V result to memory              \n\t"
00892                             "pextrw     $2,         %%mm6,      %%eax   \n\t"
00893                             "shr        $1,         %%eax               \n\t"
00894                             "mov        %%al,       %[dst_v]            \n\t"
00895                             "###########################################\n\t"
00896                             : [dst_u] "=m" (*dst_u), [dst_v] "=m" (*dst_v)  // outputs
00897                             :                                               // inputs
00898                             : "cc", "%eax"                                  // clobbers
00899                             );
00900                 }
00901 
00902                 // calculate the two Y values
00903                 __asm__ __volatile__ (
00904                         "###########################################\n\t"
00905                         "# save the two Y values                    \n\t"
00906                         "pextrw     $0,         %%mm3,      %%eax   \n\t"
00907                         "pextrw     $2,         %%mm3,      %%ecx   \n\t"
00908                         "mov        %%cl,       %%ah                \n\t"
00909                         "mov        %%ax,       %[dst_y]            \n\t"
00910                         "###########################################\n\t"
00911                         : [dst_y] "=m" (*dst_y)             // outputs
00912                         :                                   // inputs
00913                         : "cc", "%eax", "%ecx"              // clobbers
00914                         );
00915             }
00916             else
00917             if ((!ssrc->c[3])&&(!ssrc->c[7])) {
00918                 // alpha channel == 0x00 for both pixels
00919                 if ((ssrc->i[0] != OLDSRC_MMX.i[0])||(ssrc->i[1] != OLDSRC_MMX.i[1])) {
00920                     // pixel value has changed, calculate U/V values
00921                     __asm__ __volatile__ (
00922                             "###########################################\n\t"
00923                             "# load reg eax with the U value            \n\t"
00924                             "xor        %%eax,      %%eax               \n\t"
00925                             "mov        %[dst_u],   %%al                \n\t"
00926                             "# calc U * 2                               \n\t"
00927                             "shl        $1,         %%ax                \n\t"
00928                             "# save the U result to memory              \n\t"
00929                             "pextrw     $0,         %%mm6,      %%ecx   \n\t"
00930                             "add        %%ecx,      %%eax               \n\t"
00931                             "shr        $2,         %%eax               \n\t"
00932                             "mov        %%al,       %[dst_u]            \n\t"
00933                             "###########################################\n\t"
00934                             : [dst_u] "+m" (*dst_u)             // outputs
00935                             :                                   // inputs
00936                             : "cc", "%eax", "%ecx"              // clobbers
00937                             );
00938 
00939                     __asm__ __volatile__ (
00940                             "###########################################\n\t"
00941                             "# load reg eax with the V value            \n\t"
00942                             "xor        %%eax,      %%eax               \n\t"
00943                             "mov        %[dst_v],   %%al                \n\t"
00944                             "# calc V * 2                               \n\t"
00945                             "shl        $1,         %%ax                \n\t"
00946                             "# save the V result to memory              \n\t"
00947                             "pextrw     $2,         %%mm6,      %%ecx   \n\t"
00948                             "add        %%ecx,      %%eax               \n\t"
00949                             "shr        $2,         %%eax               \n\t"
00950                             "mov        %%al,       %[dst_v]            \n\t"
00951                             "###########################################\n\t"
00952                             : [dst_v] "+m" (*dst_v)             // outputs
00953                             :                                   // inputs
00954                             : "cc", "%eax", "%ecx"              // clobbers
00955                             );
00956                 }
00957             }
00958             else {
00959                 // remaining case: the two pixels are neither both opaque (0xff) nor both transparent (0x00)
00960                 if ((ssrc->i[0] != OLDSRC_MMX.i[0])||(ssrc->i[1] != OLDSRC_MMX.i[1])) {
00961                     // pixel value has changed, convert argb source to yv12
00962                     MMSFB_BLIT_BLEND_ARGB_TO_YV12_LOAD_SRC_ALPHA;
00963                     OLDSRC_MMX = *ssrc;
00964 
00965                     // calculate the U value
00966                     __asm__ __volatile__ (
00967                             "###########################################\n\t"
00968                             "# load reg mm0 with the U value            \n\t"
00969                             "xor        %%eax,      %%eax               \n\t"
00970                             "mov        %[dst_u],   %%al                \n\t"
00971                             "pinsrw     $0,         %%eax,      %%mm0   \n\t"
00972                             "pinsrw     $2,         %%eax,      %%mm0   \n\t"
00973                             "# calc U                                   \n\t"
00974                             "pmullw     %%mm2,      %%mm0               \n\t"
00975                             "paddw      %%mm4,      %%mm0               \n\t"
00976                             "psrlw      $8,         %%mm0               \n\t"
00977                             "psadbw     %%mm7,      %%mm0               \n\t"
00978                             "# save the U result to memory              \n\t"
00979                             "paddw      %%mm6,      %%mm0               \n\t"
00980                             "pextrw     $0,         %%mm0,      %%eax   \n\t"
00981                             "shr        $2,         %%eax               \n\t"
00982                             "mov        %%al,       %[dst_u]            \n\t"
00983                             "###########################################\n\t"
00984                             : [dst_u] "+m" (*dst_u)             // outputs
00985                             :                                   // inputs
00986                             : "cc", "%eax"                      // clobbers
00987                             );
00988 
00989                     // calculate the V value
00990                     __asm__ __volatile__ (
00991                             "###########################################\n\t"
00992                             "# load reg mm0 with the V value            \n\t"
00993                             "xor        %%eax,      %%eax               \n\t"
00994                             "mov        %[dst_v],   %%al                \n\t"
00995                             "pinsrw     $0,         %%eax,      %%mm0   \n\t"
00996                             "pinsrw     $2,         %%eax,      %%mm0   \n\t"
00997                             "# calc V                                   \n\t"
00998                             "pmullw     %%mm2,      %%mm0               \n\t"
00999                             "paddw      %%mm5,      %%mm0               \n\t"
01000                             "psrlw      $8,         %%mm0               \n\t"
01001                             "psadbw     %%mm7,      %%mm0               \n\t"
01002                             "# save the V result to memory              \n\t"
01003                             "pextrw     $0,         %%mm0,      %%eax   \n\t"
01004                             "pextrw     $2,         %%mm6,      %%ecx   \n\t"
01005                             "add        %%ecx,      %%eax               \n\t"
01006                             "shr        $2,         %%eax               \n\t"
01007                             "mov        %%al,       %[dst_v]            \n\t"
01008                             "###########################################\n\t"
01009                             : [dst_v] "+m" (*dst_v)             // outputs
01010                             :                                   // inputs
01011                             : "cc", "%eax", "%ecx"              // clobbers
01012                             );
01013                 }
01014                 else {
01015                     // pixel value has NOT changed, so we can use an optimized calculation of U and V
01016 
01017                     // calculate the U/V values, we do not need to load the destination because of same pixel value
01018                     __asm__ __volatile__ (
01019                             "###########################################\n\t"
01020                             "# save the U result to memory              \n\t"
01021                             "pextrw     $0,         %%mm6,      %%eax   \n\t"
01022                             "shr        $1,         %%eax               \n\t"
01023                             "mov        %%al,       %[dst_u]            \n\t"
01024                             "###########################################\n\t"
01025                             "# save the V result to memory              \n\t"
01026                             "pextrw     $2,         %%mm6,      %%eax   \n\t"
01027                             "shr        $1,         %%eax               \n\t"
01028                             "mov        %%al,       %[dst_v]            \n\t"
01029                             "###########################################\n\t"
01030                             : [dst_u] "=m" (*dst_u), [dst_v] "=m" (*dst_v)  // outputs
01031                             :                                               // inputs
01032                             : "cc", "%eax"                                  // clobbers
01033                             );
01034                 }
01035 
01036                 // calculate the two Y values
01037                 __asm__ __volatile__ (
01038                         "###########################################\n\t"
01039                         "# load reg mm0 with the two Y values       \n\t"
01040                         "pxor       %%mm0,      %%mm0               \n\t"
01041                         "mov        %[dst_y],   %%ax                \n\t"
01042                         "mov        %%ax,       %%cx                \n\t"
01043                         "xor        %%ah,       %%ah                \n\t"
01044                         "shr        $8,         %%cx                \n\t"
01045                         "pinsrw     $0,         %%eax,      %%mm0   \n\t"
01046                         "pinsrw     $2,         %%ecx,      %%mm0   \n\t"
01047                         "# calc Y                                   \n\t"
01048                         "pmullw     %%mm2,      %%mm0               \n\t"
01049                         "paddw      %%mm3,      %%mm0               \n\t"
01050                         "psrlw      $8,         %%mm0               \n\t"
01051                         "# save the two Y results                   \n\t"
01052                         "pextrw     $0,         %%mm0,      %%eax   \n\t"
01053                         "pextrw     $2,         %%mm0,      %%ecx   \n\t"
01054                         "mov        %%cl,       %%ah                \n\t"
01055                         "mov        %%ax,       %[dst_y]            \n\t"
01056                         : [dst_y] "+m" (*dst_y)             // outputs
01057                         :                                   // inputs
01058                         : "cc", "%eax", "%ecx"              // clobbers
01059                         );
01060 
01061             }
01062 
01063 
01064 
01065             // go to the next two pixels
01066             ssrc-=src3_offsX;
01067             dst_y-=dst_y3_offsX;
01068             dst_u++;
01069             dst_v++;
01070         }
01071 
01072         // go to the next two lines
01073         ssrc  += src_pitch_diff;
01074         dst_y += dst_pitch_diff;
01075         dst_u += dst_pitch_uvdiff;
01076         dst_v += dst_pitch_uvdiff;
01077     }
01078 
01079 
01080     __asm__ __volatile__ (
01081             "###########################################\n\t"
01082             "# clear the MMX state                      \n\t"
01083             "emms                                       \n\t"
01084             "###########################################\n\t"
01085             );
01086 #endif
01087 
01088 }
01089 
01090 #endif
01091 #endif