lm4f: Improve read/write endpoint performance

Modify lm4f_ep_read/write_packet() to take advantage of 32-bit and 16-bit
accesses to the USB FIFO, as opposed to using only 8-bit accesses. This
change increases endpoint performance in a number of configurations.

On the host side, we test with usb_bulk_bench:
    https://github.com/mrnuke/usb_bulk_bench
using the following invocation:
    usb_bulk_bench -d c03e:b007 -a -t 64 -q 32 -e [ep] [-I/-O]

On the device side, we use the usb_bulk_dev example:
    https://github.com/mrnuke/libopencm3-examples/tree/stellaris
The example is in:
    examples/lm4f/stellaris-ek-lm4f120xl/usb_bulk_dev
(This example will be available in libopencm3-examples in the near future.)

The endpoint configuration is the following:
 * EP1 OUT - interrupt driven RX endpoint
 * EP2 IN  - interrupt driven TX endpoint
 * EP3 OUT - polled RX endpoint
 * EP4 IN  - polled TX endpoint
 * EP5 OUT - polled RX endpoint with unaligned buffer
 * EP6 IN  - polled TX endpoint with unaligned buffer

We test the speed in each configuration, using different system clock
frequencies. We run the tests once without the patch applied, and once
with the patch applied. The results are given below:

Before patch (numbers in KiB/s):
freq:  80 MHz  57 MHz  40 MHz  30 MHz  20 MHz  16 MHz
EP1       562     562     562     562     562     550
EP2       936     872     812     812     687     625
EP3      1062     890     700     600     562     562
EP4       900     812     812     750     625     562
EP5      1062     890     700     600     562     562
EP6       930     812     812     750     625     562

With patch (numbers in KiB/s):
freq:  80 MHz  57 MHz  40 MHz  30 MHz  20 MHz  16 MHz
EP1      1062    1062    1062     690     562     562
EP2      1125     936     936     936     870     812
EP3      1062     960     750     750     562     562
EP4       936     936     870     870     770     700
EP5      1062     900     700     630     562     562
EP6       930     930     870     870     740     650

Percent change in speed (*):
freq:  80 MHz  57 MHz  40 MHz  30 MHz  20 MHz  16 MHz
EP1      89.0    89.0    89.0    22.8     0.0     2.2
EP2      20.2     7.3    15.3    15.3    26.6    29.9
EP3       0.0     7.9     7.1    25.0     0.0     0.0
EP4       4.0    15.3     7.1    16.0    23.2    24.6
EP5       0.0     1.1     0.0     5.0     0.0     0.0
EP6       0.0    14.5     7.1    16.0    18.4    15.7

(*) Numbers given as percent change relative to the speed before applying
this patch.

We see throughput increases across the board: every configuration is at
least as fast as before, and most are faster.

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
12 years ago · 2184eb2b13
1 changed files with 24 additions and 8 deletions
--- a/lib/lm4f/usb_lm4f.c
+++ b/lib/lm4f/usb_lm4f.c
@@ -364,12 +364,18 @@ static u16 lm4f_ep_write_packet(usbd_device *usbd_dev, u8 addr,
 	}

 	/*
-	 * For some reason, using 16 or 32-bit transfers to the FIFO does not
-	 * work well.
+	 * We don't need to worry about buf not being aligned. If it's not,
+	 * the reads are downgraded to 8-bit in hardware. We lose a bit of
+	 * performance, but we don't crash.
 	 */
-	for (i = 0; i < len; i++)
-		USB_FIFO8(ep) = ((u8 *)buf)[i];
-
+	for (i = 0; i < (len & ~0x3); i += 4)
+		USB_FIFO32(ep) = *((u32 *)(buf + i));
+	if (len & 0x2) {
+		USB_FIFO16(ep) = *((u16 *)(buf + i));
+		i += 2;
+	}
+	if (len & 0x1)
+		USB_FIFO8(ep)  = *((u8 *)(buf + i));

 	if (ep == 0) {
 		/*
@@ -394,7 +400,6 @@ static u16 lm4f_ep_read_packet(usbd_device *usbd_dev, u8 addr, void *buf, u16 le
 {
 	(void)usbd_dev;

-	u8 * buffy = buf;
 	u16 rlen;
 	u8 ep = addr & 0xf;

@@ -402,8 +407,19 @@ static u16 lm4f_ep_read_packet(usbd_device *usbd_dev, u8 addr, void *buf, u16 le

 	rlen = (fifoin > len) ? len : fifoin;

-	for (len = 0; len < rlen; len++)
-		buffy[len] = USB_FIFO8(ep);
+	/*
+	 * We don't need to worry about buf not being aligned. If it's not,
+	 * the writes are downgraded to 8-bit in hardware. We lose a bit of
+	 * performance, but we don't crash.
+	 */
+	for (len = 0; len < (rlen & ~0x3); len += 4)
+		*((u32 *)(buf + len)) = USB_FIFO32(ep);
+	if (rlen & 0x2) {
+		*((u16 *)(buf + len)) = USB_FIFO16(ep);
+		len += 2;
+	}
+	if (rlen & 0x1)
+		*((u8 *)(buf + len)) = USB_FIFO8(ep);

 	if (ep == 0) {
 		/*