shithub: libvpx

--- a/build/make/configure.sh

+++ b/build/make/configure.sh

@@ -980,6 +980,9 @@

         esac

fi

+    # for sysconf(3) and friends.

+    check_header unistd.h

     # glibc needs these

     if enabled linux; then

         add_cflags -D_LARGEFILE_SOURCE

--- a/configure

+++ b/configure

@@ -211,6 +211,7 @@

     alt_tree_layout

     pthread_h

     sys_mman_h

+    unistd_h

 EXPERIMENT_LIST="

     extend_qrange

--- a/vp8/common/arm/neon/loopfilter_neon.asm

+++ b/vp8/common/arm/neon/loopfilter_neon.asm

@@ -308,7 +308,6 @@

 ; q9    q2

 ; q10   q3

 |vp8_loop_filter_neon| PROC

-    ldr         r12, _lf_coeff_

     ; vp8_filter_mask

     vabd.u8     q11, q3, q4                 ; abs(p3 - p2)

@@ -339,7 +338,7 @@

     vqadd.u8    q9, q9, q2                  ; a = b + a

     vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1

-    vld1.u8     {q0}, [r12]!

+    vmov.u8     q0, #0x80                   ; 0x80

     ; vp8_filter() function

     ; convert to signed

@@ -348,7 +347,7 @@

     veor        q5, q5, q0                  ; ps1

     veor        q8, q8, q0                  ; qs1

-    vld1.u8     {q10}, [r12]!

+    vmov.u8     q10, #3                     ; #3

     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)

     vsubl.s8    q11, d15, d13

@@ -367,7 +366,7 @@

     vaddw.s8    q2, q2, d2

     vaddw.s8    q11, q11, d3

-    vld1.u8     {q9}, [r12]!

+    vmov.u8     q9, #4                      ; #4

     ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))

     vqmovn.s16  d2, q2

@@ -398,13 +397,5 @@

     ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|

 ;-----------------

-_lf_coeff_

-    DCD     lf_coeff

-lf_coeff

-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080

-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303

-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404

-    DCD     0x01010101, 0x01010101, 0x01010101, 0x01010101

END

--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm

+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm

@@ -22,20 +22,19 @@

 ; r1    int p, //pitch

 ; r2    const signed char *flimit,

 ; r3    const signed char *limit,

-; stack(r4) const signed char *thresh,

+; stack(r4) const signed char *thresh (unused)

 ; //stack(r5)   int count --unused

 |vp8_loop_filter_simple_horizontal_edge_neon| PROC

     sub         r0, r0, r1, lsl #1          ; move src pointer down by 2 lines

-    ldr         r12, _lfhy_coeff_

     vld1.u8     {q5}, [r0], r1              ; p1

     vld1.s8     {d2[], d3[]}, [r2]          ; flimit

     vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13

     vld1.u8     {q6}, [r0], r1              ; p0

-    vld1.u8     {q0}, [r12]!                ; 0x80

+    vmov.u8     q0, #0x80                   ; 0x80

     vld1.u8     {q7}, [r0], r1              ; q0

-    vld1.u8     {q10}, [r12]!               ; 0x03

+    vmov.u8     q10, #0x03                  ; 0x03

     vld1.u8     {q8}, [r0]                  ; q1

     ;vp8_filter_mask() function

@@ -66,7 +65,7 @@

     vadd.s16    q11, q2, q2                 ;  3 * ( qs0 - ps0)

     vadd.s16    q12, q3, q3

-    vld1.u8     {q9}, [r12]!                ; 0x04

+    vmov.u8     q9, #0x04                   ; 0x04

     vadd.s16    q2, q2, q11

     vadd.s16    q3, q3, q12

@@ -104,12 +103,5 @@

     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|

 ;-----------------

-_lfhy_coeff_

-    DCD     lfhy_coeff

-lfhy_coeff

-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080

-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303

-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404

END

--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm

+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm

@@ -22,7 +22,7 @@

 ; r1    int p, //pitch

 ; r2    const signed char *flimit,

 ; r3    const signed char *limit,

-; stack(r4) const signed char *thresh,

+; stack(r4) const signed char *thresh (unused)

 ; //stack(r5)   int count --unused

 |vp8_loop_filter_simple_vertical_edge_neon| PROC

@@ -32,7 +32,6 @@

     vld1.s8     {d2[], d3[]}, [r2]          ; flimit

     vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13

     vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r0], r1

-    ldr         r12, _vlfy_coeff_

     vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r1

     vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r0], r1

     vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r1

@@ -41,11 +40,11 @@

     vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r0], r1

     vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1

-    vld1.u8     {q0}, [r12]!                ; 0x80

+    vmov.u8     q0, #0x80                ; 0x80

     vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1

-    vld1.u8     {q11}, [r12]!               ; 0x03

+    vmov.u8     q11, #0x03              ; 0x03

     vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1

-    vld1.u8     {q12}, [r12]!               ; 0x04

+    vmov.u8     q12, #0x04               ; 0x04

     vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1

     vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1

     vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1

@@ -145,12 +144,5 @@

     ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|

 ;-----------------

-_vlfy_coeff_

-    DCD     vlfy_coeff

-vlfy_coeff

-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080

-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303

-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404

END

--- a/vp8/common/arm/neon/mbloopfilter_neon.asm

+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm

@@ -372,7 +372,6 @@

 ; q10   q3

 |vp8_mbloop_filter_neon| PROC

-    ldr         r12, _mblf_coeff_

     ; vp8_filter_mask

     vabd.u8     q11, q3, q4                 ; abs(p3 - p2)

@@ -396,7 +395,7 @@

     vld1.s8     {d4[], d5[]}, [r2]          ; flimit

-    vld1.u8     {q0}, [r12]!

+    vmov.u8     q0, #0x80                   ; 0x80

     vadd.u8     q2, q2, q2                  ; flimit * 2

     vadd.u8     q2, q2, q1                  ; flimit * 2 +  limit

@@ -431,12 +430,12 @@

     vadd.s16    q2, q2, q10

     vadd.s16    q13, q13, q11

-    vld1.u8     {q12}, [r12]!               ; #3

+    vmov.u8     q12, #3                     ; #3

     vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)

     vaddw.s8    q13, q13, d3

-    vld1.u8     {q11}, [r12]!               ; #4

+    vmov.u8     q11, #4                     ; #4

     ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))

     vqmovn.s16  d2, q2

@@ -444,16 +443,16 @@

     vand        q1, q1, q15                 ; vp8_filter &= mask

-    vld1.u8     {q15}, [r12]!               ; #63

-    ;

+    vmov.u16    q15, #63                    ; #63

     vand        q13, q1, q14                ; Filter2 &= hev

-    vld1.u8     {d7}, [r12]!                ; #9

+    vmov.u8     d7, #9                      ; #9

     vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)

     vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)

-    vld1.u8     {d6}, [r12]!                ; #18

+    vmov.u8     d6, #18                     ; #18

     vshr.s8     q2, q2, #3                  ; Filter1 >>= 3

     vshr.s8     q13, q13, #3                ; Filter2 >>= 3

@@ -463,7 +462,7 @@

     vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)

-    vld1.u8     {d5}, [r12]!                ; #27

+    vmov.u8     d5, #27                     ; #27

     vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)

@@ -506,15 +505,5 @@

     ENDP        ; |vp8_mbloop_filter_neon|

 ;-----------------

-_mblf_coeff_

-    DCD     mblf_coeff

-mblf_coeff

-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080

-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303

-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404

-    DCD     0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f

-    DCD     0x09090909, 0x09090909, 0x12121212, 0x12121212

-    DCD     0x1b1b1b1b, 0x1b1b1b1b

END

--- a/vp8/common/generic/systemdependent.c

+++ b/vp8/common/generic/systemdependent.c

@@ -17,9 +17,54 @@

 #include "vp8/common/idct.h"

 #include "vp8/common/onyxc_int.h"

+#if CONFIG_MULTITHREAD

+#if HAVE_UNISTD_H

+#include <unistd.h>

+#elif defined(_WIN32)

+#include <windows.h>

+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);

+#endif

+#endif

 extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);

 extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);

+#if CONFIG_MULTITHREAD

+/* Return the number of processor cores reported by the platform,

+ * never less than 1. Takes no arguments. */

+static int get_cpu_count()

+{

+    /* Default used when no detection branch below is compiled in. */

+    int core_count = 16;

+    /* HAVE_UNISTD_H is tested first, so the _WIN32 branch is only

+     * compiled when unistd.h is not available. */

+#if HAVE_UNISTD_H

+#if defined(_SC_NPROCESSORS_ONLN)

+    core_count = sysconf(_SC_NPROCESSORS_ONLN);

+#elif defined(_SC_NPROC_ONLN)

+    /* Alternate sysconf name, used only when _SC_NPROCESSORS_ONLN

+     * is not defined. */

+    core_count = sysconf(_SC_NPROC_ONLN);

+#endif

+#elif defined(_WIN32)

+    {

+        PGNSI pGNSI;

+        SYSTEM_INFO sysinfo;

+        /* Call GetNativeSystemInfo if supported or

+         * GetSystemInfo otherwise. */

+        /* The function is looked up at run time in kernel32.dll

+         * rather than linked directly, so a NULL result can select

+         * the GetSystemInfo fallback. */

+        pGNSI = (PGNSI) GetProcAddress(

+                GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");

+        if (pGNSI != NULL)

+            pGNSI(&sysinfo);

+        else

+            GetSystemInfo(&sysinfo);

+        core_count = sysinfo.dwNumberOfProcessors;

+    }

+#else

+    /* other platforms: no detection, core_count keeps its default */

+#endif

+    /* Clamp zero or negative results to 1 (presumably covers a failed

+     * sysconf call; confirm against the sysconf specification). */

+    return core_count > 0 ? core_count : 1;

+}

+#endif

 void vp8_machine_specific_config(VP8_COMMON *ctx)

 #if CONFIG_RUNTIME_CPU_DETECT

@@ -98,4 +143,7 @@

 #endif

+#if CONFIG_MULTITHREAD

+    ctx->processor_core_count = get_cpu_count();

+#endif /* CONFIG_MULTITHREAD */

--- a/vp8/common/onyxc_int.h

+++ b/vp8/common/onyxc_int.h

@@ -196,6 +196,9 @@

 #if CONFIG_RUNTIME_CPU_DETECT

     VP8_COMMON_RTCD rtcd;

 #endif

+#if CONFIG_MULTITHREAD

+    int processor_core_count;

+#endif

     struct postproc_state  postproc_state;

 } VP8_COMMON;

--- a/vp8/decoder/threading.c

+++ b/vp8/decoder/threading.c

@@ -439,12 +439,18 @@

     pbi->b_multithreaded_rd = 0;

     pbi->allocated_decoding_thread_count = 0;

-    core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads;

+    /* limit decoding threads to the max number of token partitions */

+    core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;

+    /* limit decoding threads to the available cores */

+    if (core_count > pbi->common.processor_core_count)

+        core_count = pbi->common.processor_core_count;

     if (core_count > 1)

         pbi->b_multithreaded_rd = 1;

-        pbi->decoding_thread_count = core_count -1;

+        pbi->decoding_thread_count = core_count - 1;

         CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));

         CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));

--- a/vp8/encoder/ethreading.c

+++ b/vp8/encoder/ethreading.c

@@ -459,15 +459,15 @@

     cpi->b_multi_threaded = 0;

     cpi->encoding_thread_count = 0;

-    cpi->processor_core_count = 32; //vp8_get_proc_core_count();

-    if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)

+    if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)

         int ithread;

         int th_count = cpi->oxcf.multi_threaded - 1;

-        if (cpi->oxcf.multi_threaded > cpi->processor_core_count)

-            th_count = cpi->processor_core_count - 1;

+        /* don't allocate more threads than cores available */

+        if (cpi->oxcf.multi_threaded > cm->processor_core_count)

+            th_count = cm->processor_core_count - 1;

         /* we have th_count + 1 (main) threads processing one row each */

         /* no point to have more threads than the sync range allows */

@@ -514,6 +514,7 @@

             LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;

             sem_init(&cpi->h_event_start_lpf, 0, 0);

+            sem_init(&cpi->h_event_end_picklpf, 0, 0);

             sem_init(&cpi->h_event_end_lpf, 0, 0);

             lpfthd->ptr1 = (void *)cpi;

@@ -547,6 +548,7 @@

         sem_destroy(&cpi->h_event_end_encoding);

         sem_destroy(&cpi->h_event_end_lpf);

+        sem_destroy(&cpi->h_event_end_picklpf);

         sem_destroy(&cpi->h_event_start_lpf);

         //free thread related resources

--- a/vp8/encoder/onyx_if.c

+++ b/vp8/encoder/onyx_if.c

@@ -3211,7 +3211,7 @@

 #if CONFIG_MULTITHREAD

     if (cpi->b_multi_threaded)

-        sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */

+        sem_post(&cpi->h_event_end_picklpf); /* signal that we have set filter_level */

 #endif

     if (cm->filter_level > 0)

@@ -4221,7 +4221,7 @@

 #if CONFIG_MULTITHREAD

     /* wait that filter_level is picked so that we can continue with stream packing */

     if (cpi->b_multi_threaded)

-        sem_wait(&cpi->h_event_end_lpf);

+        sem_wait(&cpi->h_event_end_picklpf);

 #endif

     // build the bitstream

--- a/vp8/encoder/onyx_int.h

+++ b/vp8/encoder/onyx_int.h

@@ -580,7 +580,6 @@

     // multithread data

     int * mt_current_mb_col;

     int mt_sync_range;

-    int processor_core_count;

     int b_multi_threaded;

     int encoding_thread_count;

@@ -595,6 +594,7 @@

     sem_t *h_event_start_encoding;

     sem_t h_event_end_encoding;

     sem_t h_event_start_lpf;

+    sem_t h_event_end_picklpf;

     sem_t h_event_end_lpf;

 #endif

--

⑨