Fix small pixel drawing on little endian systems

If the pixel size (bpp) is smaller than the word size (long) on a little
endian system, pixel data is written to the wrong part of the word.

Fix this by reversing the shifts on little endian systems.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
diff --git a/drawops/bitstream.c b/drawops/bitstream.c
index b3e0933..06c7404 100644
--- a/drawops/bitstream.c
+++ b/drawops/bitstream.c
@@ -13,6 +13,14 @@
 #include "bitstream.h"
 #include "fb.h"
 
+#if __BYTE_ORDER == __LITTLE_ENDIAN /* little endian: build the masks with left shifts */
+#define FIRST_MASK(idx)		(~0UL << (idx)) /* all-ones word shifted by idx bits */
+#define LAST_MASK(idx, n)	(~(~0UL << (((idx)+(n)) % BITS_PER_LONG))) /* 0 when idx+n is a multiple of BITS_PER_LONG */
+#else /* any other byte order: same masks, built with right shifts */
+#define FIRST_MASK(idx)		(~0UL >> (idx)) /* all-ones word shifted by idx bits */
+#define LAST_MASK(idx, n)	(~(~0UL >> (((idx)+(n)) % BITS_PER_LONG))) /* 0 when idx+n is a multiple of BITS_PER_LONG */
+#endif /* __BYTE_ORDER == __LITTLE_ENDIAN */
+
 
     /*
      *  Compose two values, using a bitmask as decision value
@@ -42,8 +50,8 @@
 	return;
 
     shift = dst_idx-src_idx;
-    first = ~0UL >> dst_idx;
-    last = ~(~0UL >> ((dst_idx+n) % BITS_PER_LONG));
+    first = FIRST_MASK(dst_idx);
+    last = LAST_MASK(dst_idx, n);
 
     if (!shift) {
 	// Same alignment for source and dest
@@ -190,8 +198,8 @@
     }
 
     shift = dst_idx-src_idx;
-    first = ~0UL << (BITS_PER_LONG-1-dst_idx);
-    last = ~(~0UL << (BITS_PER_LONG-1-((dst_idx-n) % BITS_PER_LONG)));
+    first = FIRST_MASK(BITS_PER_LONG-1-dst_idx);
+    last = LAST_MASK(BITS_PER_LONG-1-dst_idx, n);
 
     if (!shift) {
 	// Same alignment for source and dest
@@ -328,8 +336,8 @@
 	return;
 
     shift = dst_idx-src_idx;
-    first = ~0UL >> dst_idx;
-    last = ~(~0UL >> ((dst_idx+n) % BITS_PER_LONG));
+    first = FIRST_MASK(dst_idx);
+    last = LAST_MASK(dst_idx, n);
 
     if (!shift) {
 	// Same alignment for source and dest
@@ -465,8 +473,8 @@
     val |= val << 32;
 #endif
 
-    first = ~0UL >> dst_idx;
-    last = ~(~0UL >> ((dst_idx+n) % BITS_PER_LONG));
+    first = FIRST_MASK(dst_idx);
+    last = LAST_MASK(dst_idx, n);
 
     if (dst_idx+n <= BITS_PER_LONG) {
 	// Single word
@@ -520,8 +528,8 @@
     if (!n)
 	return;
 
-    first = ~0UL >> dst_idx;
-    last = ~(~0UL >> ((dst_idx+n) % BITS_PER_LONG));
+    first = FIRST_MASK(dst_idx);
+    last = LAST_MASK(dst_idx, n);
 
     if (dst_idx+n <= BITS_PER_LONG) {
 	// Single word
diff --git a/drawops/cfb.c b/drawops/cfb.c
index 1d8c88e..4a5a1ab 100644
--- a/drawops/cfb.c
+++ b/drawops/cfb.c
@@ -104,7 +104,7 @@
 void cfb_draw_hline(u32 x, u32 y, u32 length, pixel_t pixel)
 {
     unsigned long *dst;
-    int dst_idx, left;
+    int dst_idx, left, right;
     u32 bpp = fb_var.bits_per_pixel;
 
     dst = (unsigned long *)((unsigned long)fb & ~(BYTES_PER_LONG-1));
@@ -118,15 +118,22 @@
 	u32 pat = pixel_to_pat32(pixel);
 	bitfill32(dst, dst_idx, pat, length*bpp);
     } else {
-	unsigned long pat = pixel_to_pat(pixel, (left-dst_idx) % bpp);
-	bitfill(dst, dst_idx, pat, left, bpp-left, length*bpp);
+	unsigned long pat;
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+	right = left;
+	left = bpp-left;
+#else
+	right = bpp-left;
+#endif
+	pat = pixel_to_pat(pixel, (left-dst_idx) % bpp);
+	bitfill(dst, dst_idx, pat, left, right, length*bpp);
     }
 }
 
 void cfb_fill_rect(u32 x, u32 y, u32 width, u32 height, pixel_t pixel)
 {
     unsigned long *dst;
-    int dst_idx, left;
+    int dst_idx, left, right;
     u32 bpp = fb_var.bits_per_pixel;
 
     dst = (unsigned long *)((unsigned long)fb & ~(BYTES_PER_LONG-1));
@@ -143,9 +150,15 @@
 	    dst_idx += next_line*8;
 	}
     } else {
-	unsigned long pat = pixel_to_pat(pixel, (left-dst_idx) % bpp);
-	int right = bpp-left;
+	unsigned long pat;
 	int r;
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+	right = left;
+	left = bpp-left;
+#else
+	right = bpp-left;
+#endif
+	pat = pixel_to_pat(pixel, (left-dst_idx) % bpp);
 	while (height--) {
 	    dst += dst_idx >> SHIFT_PER_LONG;
 	    dst_idx &= (BITS_PER_LONG-1);
diff --git a/include/types.h b/include/types.h
index 8b11ee1..33066fd 100644
--- a/include/types.h
+++ b/include/types.h
@@ -9,6 +9,8 @@
  *  more details.
  */
 
+#include <endian.h>
+
 
     /*
      *  Fixed size quantities