Slight improvements to data throughput, which may assist SCSI hosts with short timeouts (v6.2.7)
author Michael McMaster <michael@codesrc.com>
Mon, 5 Aug 2019 10:07:37 +0000 (20:07 +1000)
committer Michael McMaster <michael@codesrc.com>
Wed, 9 Oct 2019 09:13:01 +0000 (19:13 +1000)
12 files changed:
CHANGELOG
STM32CubeMX/SCSI2SD-V6/Inc/stm32f2xx_it.h
STM32CubeMX/SCSI2SD-V6/Src/gpio.c
STM32CubeMX/SCSI2SD-V6/Src/stm32f2xx_it.c
rtl/fpga_bitmap.o
src/firmware/config.c
src/firmware/disk.c
src/firmware/scsi.c
src/firmware/scsi.h
src/firmware/scsiPhy.c
src/firmware/scsiPhy.h
src/firmware/sd.h

index f30692b..a6955ff 100644 (file)
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+20191009               6.2.7
+       - Slight improvements to data throughput, which may assist SCSI hosts with
+       short timeouts.
+
 20190529               6.2.5
        - Add scsi mode page 0 support
        - Fix SD card hotswap bug when the SCSI host is constantly polling
index 9f13ad0..4c7075a 100755 (executable)
@@ -46,6 +46,7 @@
 /* Exported functions ------------------------------------------------------- */
 
 void SysTick_Handler(void);
+void EXTI3_IRQHandler(void);
 void EXTI4_IRQHandler(void);
 void SDIO_IRQHandler(void);
 void DMA2_Stream3_IRQHandler(void);
index ba18a6d..5682f25 100755 (executable)
@@ -69,11 +69,17 @@ void MX_GPIO_Init(void)
   __GPIOD_CLK_ENABLE();
 
   /*Configure GPIO pins : PEPin PEPin PEPin PEPin */
-  GPIO_InitStruct.Pin = FPGA_GPIO2_Pin|FPGA_GPIO3_Pin|UNUSED_PE5_Pin|UNUSED_PE6_Pin;
+  GPIO_InitStruct.Pin = FPGA_GPIO2_Pin|UNUSED_PE5_Pin|UNUSED_PE6_Pin;
   GPIO_InitStruct.Mode = GPIO_MODE_INPUT;
   GPIO_InitStruct.Pull = GPIO_PULLDOWN;
   HAL_GPIO_Init(GPIOE, &GPIO_InitStruct);
 
+  /*Configure GPIO pin : PE3 */
+  GPIO_InitStruct.Pin = FPGA_GPIO3_Pin;
+  GPIO_InitStruct.Mode = GPIO_MODE_INPUT;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  HAL_GPIO_Init(GPIOE, &GPIO_InitStruct);
+
   /*Configure GPIO pin : PE4 */
   GPIO_InitStruct.Pin = GPIO_PIN_4;
   GPIO_InitStruct.Mode = GPIO_MODE_IT_RISING;
index 3ed95bb..efa2c5a 100755 (executable)
@@ -72,6 +72,7 @@ void SysTick_Handler(void)
 /* please refer to the startup file (startup_stm32f2xx.s).                    */
 /******************************************************************************/
 
+
 /**
 * @brief This function handles EXTI line4 interrupt.
 */
index 857b042..e877e44 100644 (file)
Binary files a/rtl/fpga_bitmap.o and b/rtl/fpga_bitmap.o differ
index 0ea55b2..f9b288c 100755 (executable)
@@ -37,7 +37,7 @@
 \r
 #include <string.h>\r
 \r
-static const uint16_t FIRMWARE_VERSION = 0x0625;\r
+static const uint16_t FIRMWARE_VERSION = 0x0627;\r
 \r
 // 1 flash row\r
 static const uint8_t DEFAULT_CONFIG[128] =\r
index 961387f..315db1c 100755 (executable)
@@ -18,6 +18,8 @@
 \r
 #include "stm32f2xx.h"\r
 \r
+#include <assert.h>\r
+\r
 // For SD write direct routines\r
 #include "sdio.h"\r
 #include "bsp_driver_sd.h"\r
@@ -561,14 +563,17 @@ void scsiDiskPoll()
                int scsiActive __attribute__((unused)) = 0; // unused if DMA disabled\r
                int sdActive = 0;\r
 \r
-               uint32_t partialScsiChunk = 0;\r
-\r
-               // Start reading from the SD card FIRST, because we change state and\r
-               // wait for SCSI signals\r
-               int dataInStarted = 0;\r
+               // It's highly unlikely that someone is going to use huge transfers\r
+               // per scsi command, but if they do it'll be slower than usual.\r
+               uint32_t totalScsiBytes = transfer.blocks * bytesPerSector;\r
+               int useSlowDataCount = totalScsiBytes >= SCSI_XFER_MAX;\r
+               if (!useSlowDataCount)\r
+               {\r
+                       scsiSetDataCount(totalScsiBytes);\r
+               }\r
 \r
                while ((i < totalSDSectors) &&\r
-                       (!dataInStarted || likely(scsiDev.phase == DATA_IN)) &&\r
+                       likely(scsiDev.phase == DATA_IN) &&\r
                        likely(!scsiDev.resetFlag))\r
                {\r
                        int completedDmaSectors;\r
@@ -588,12 +593,13 @@ void scsiDiskPoll()
 \r
                        if (!sdActive &&\r
                                (prep - i < buffers) &&\r
-                               (prep < totalSDSectors))\r
+                               (prep < totalSDSectors) &&\r
+                               ((totalSDSectors - prep) >= sdPerScsi) &&\r
+                               (likely(!useSlowDataCount) || scsiPhyComplete()))\r
                        {\r
                                // Start an SD transfer if we have space.\r
                                uint32_t startBuffer = prep % buffers;\r
                                uint32_t sectors = totalSDSectors - prep;\r
-\r
                                uint32_t freeBuffers = buffers - (prep - i);\r
 \r
                                uint32_t contiguousBuffers = buffers - startBuffer;\r
@@ -603,6 +609,12 @@ void scsiDiskPoll()
 \r
                                if (sectors > 128) sectors = 128; // 65536 DMA limit !!\r
 \r
+                               // Round-down when we have odd sector sizes.\r
+                               if (sdPerScsi != 1)\r
+                               {\r
+                                       sectors = (sectors / sdPerScsi) * sdPerScsi;\r
+                               }\r
+\r
                                for (int dodgy = 0; dodgy < sectors; dodgy++)\r
                                {\r
                                        scsiDev.data[SD_SECTOR_SIZE * (startBuffer + dodgy) + 510] = 0xAA;\r
@@ -613,6 +625,11 @@ void scsiDiskPoll()
 \r
                                sdActive = sectors;\r
 \r
+                               if (useSlowDataCount)\r
+                               {\r
+                                       scsiSetDataCount((sectors / sdPerScsi) * bytesPerSector);\r
+                               }\r
+\r
                                // Wait now that the SD card is busy\r
                                // Chances are we've probably already waited sufficient time,\r
                                // but it's hard to measure microseconds cheaply. So just wait\r
@@ -624,26 +641,6 @@ void scsiDiskPoll()
                                }\r
                        }\r
 \r
-#ifdef SCSI_FSMC_DMA\r
-                       #error this code not updated for 256 max bytes in scsi fifo\r
-                       if (scsiActive && scsiPhyComplete() && scsiWriteDMAPoll())\r
-                       {\r
-                               scsiActive = 0;\r
-                               i++;\r
-                               scsiPhyFifoFlip();\r
-                       }\r
-                       if (!scsiActive && ((prep - i) > 0))\r
-                       {\r
-                               int dmaBytes = SD_SECTOR_SIZE;\r
-                               if ((i % sdPerScsi) == (sdPerScsi - 1))\r
-                               {\r
-                                       dmaBytes = bytesPerSector % SD_SECTOR_SIZE;\r
-                                       if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE;\r
-                               }\r
-                               scsiWriteDMA(&scsiDev.data[SD_SECTOR_SIZE * (i % buffers)], dmaBytes);\r
-                               scsiActive = 1;\r
-                       }\r
-#else\r
                        if ((prep - i) > 0)\r
                        {\r
                                int dmaBytes = SD_SECTOR_SIZE;\r
@@ -653,42 +650,11 @@ void scsiDiskPoll()
                                        if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE;\r
                                }\r
 \r
-                               // Manually unrolled loop for performance.\r
-                               // -Os won't unroll this for us automatically,\r
-                               // especially since scsiPhyTx does volatile stuff.\r
-                               // Reduces bus utilisation by making the fsmc split\r
-                               // 32bits into 2 16 bit writes.\r
-\r
-                               uint16_t* scsiDmaData = (uint16_t*) &(scsiDev.data[SD_SECTOR_SIZE * (i % buffers) + partialScsiChunk]);\r
-\r
-                               uint32_t chunk = ((dmaBytes - partialScsiChunk) > SCSI_FIFO_DEPTH)\r
-                                       ? SCSI_FIFO_DEPTH : (dmaBytes - partialScsiChunk);\r
-\r
-                               int k = 0;\r
-                               for (; k + 4 < (chunk + 1) / 2; k += 4)\r
-                               {\r
-                                       scsiPhyTx32(scsiDmaData[k], scsiDmaData[k+1]);\r
-                                       scsiPhyTx32(scsiDmaData[k+2], scsiDmaData[k+3]);\r
-                               }\r
-                               for (; k < (chunk + 1) / 2; ++k)\r
-                               {\r
-                                       scsiPhyTx(scsiDmaData[k]);\r
-                               }\r
-                               while (!scsiPhyComplete() && !scsiDev.resetFlag)\r
-                               {\r
-                                       __WFE(); // Wait for event\r
-                               }\r
-                               scsiPhyFifoFlip();\r
-                               scsiSetDataCount(chunk);\r
+                               uint8_t* scsiDmaData = &(scsiDev.data[SD_SECTOR_SIZE * (i % buffers)]);\r
+                               scsiWritePIO(scsiDmaData, dmaBytes);\r
 \r
-                               partialScsiChunk += chunk;\r
-                               if (partialScsiChunk == dmaBytes)\r
-                               {\r
-                                       partialScsiChunk = 0;\r
-                                       ++i;\r
-                               }\r
+                               ++i;\r
                        }\r
-#endif\r
                }\r
 \r
                if (phaseChangeDelayUs > 0 && !scsiDev.resetFlag) // zero bytes ?\r
@@ -699,13 +665,14 @@ void scsiDiskPoll()
 \r
                // We've finished transferring the data to the FPGA, now wait until it's\r
                // written to he SCSI bus.\r
+               __disable_irq();\r
                while (!scsiPhyComplete() &&\r
                        likely(scsiDev.phase == DATA_IN) &&\r
                        likely(!scsiDev.resetFlag))\r
                {\r
-                       __WFE(); // Wait for event\r
+                       __WFI();\r
                }\r
-\r
+               __enable_irq();\r
 \r
                if (scsiDev.phase == DATA_IN)\r
                {\r
@@ -727,22 +694,28 @@ void scsiDiskPoll()
                                transfer.lba);\r
                int i = 0;\r
                int clearBSY = 0;\r
+               int extraSectors = 0;\r
 \r
                int parityError = 0;\r
                int enableParity = scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY;\r
 \r
+               uint32_t scsiSpeed = s2s_getScsiRateMBs();\r
+\r
+               uint32_t maxSectors = sizeof(scsiDev.data) / SD_SECTOR_SIZE;\r
+\r
+               static_assert(SCSI_XFER_MAX >= sizeof(scsiDev.data), "Assumes SCSI_XFER_MAX >= sizeof(scsiDev.data)");\r
+\r
+               // Start reading and filling fifos as soon as possible.\r
+               scsiSetDataCount(transfer.blocks * bytesPerSector);\r
+\r
                while ((i < totalSDSectors) &&\r
                        likely(scsiDev.phase == DATA_OUT) &&\r
-                       likely(!scsiDev.resetFlag) &&\r
-                       likely(!parityError || !enableParity))\r
+                       likely(!scsiDev.resetFlag))\r
+                       // KEEP GOING to ensure FIFOs are in a good state.\r
+                       // likely(!parityError || !enableParity))\r
                {\r
-                       // Well, until we have some proper non-blocking SD code, we must\r
-                       // do this in a half-duplex fashion. We need to write as much as\r
-                       // possible in each SD card transaction.\r
-                       uint32_t maxSectors = sizeof(scsiDev.data) / SD_SECTOR_SIZE;\r
                        uint32_t rem = totalSDSectors - i;\r
-                       uint32_t sectors =\r
-                               rem < maxSectors ? rem : maxSectors;\r
+                       uint32_t sectors = rem < maxSectors ? rem : maxSectors;\r
 \r
                        if (bytesPerSector == SD_SECTOR_SIZE)\r
                        {\r
@@ -750,19 +723,20 @@ void scsiDiskPoll()
                                // no flow control. This can be handled if a) the scsi interface\r
                                // doesn't block and b) we read enough SCSI sectors first so that\r
                                // the SD interface cannot catch up.\r
+                               int prevExtraSectors = extraSectors;\r
                                uint32_t totalBytes = sectors * SD_SECTOR_SIZE;\r
-                               uint32_t readAheadBytes = sectors * SD_SECTOR_SIZE;\r
+                               extraSectors = 0;\r
+\r
+                               int32_t readAheadBytes = totalBytes;\r
                                uint32_t sdSpeed = s2s_getSdRateMBs() + (scsiDev.sdUnderrunCount / 2);\r
-                               uint32_t scsiSpeed = s2s_getScsiRateMBs();\r
                                // if (have blind writes)\r
                                if (scsiSpeed > 0 && scsiDev.sdUnderrunCount < 16)\r
                                {\r
                                        // readAhead = sectors * (sd / scsi - 1 + 0.1);\r
-                                       readAheadBytes = totalBytes * sdSpeed / scsiSpeed - totalBytes + SCSI_FIFO_DEPTH;\r
-                                       if (readAheadBytes < SCSI_FIFO_DEPTH)\r
-                                       {\r
-                                               readAheadBytes = SCSI_FIFO_DEPTH;\r
-                                       }\r
+                                       readAheadBytes = totalBytes * sdSpeed / scsiSpeed - totalBytes;\r
+\r
+                                       // Round up to nearest FIFO size.\r
+                                       readAheadBytes = ((readAheadBytes / SCSI_FIFO_DEPTH) + 1) * SCSI_FIFO_DEPTH;\r
 \r
                                        if (readAheadBytes > totalBytes)\r
                                        {\r
@@ -770,60 +744,58 @@ void scsiDiskPoll()
                                        }\r
                                }\r
 \r
-                               uint32_t chunk = (readAheadBytes > SCSI_FIFO_DEPTH) ? SCSI_FIFO_DEPTH : readAheadBytes;\r
-                               scsiSetDataCount(chunk);\r
+                               uint32_t prevExtraBytes = prevExtraSectors * SD_SECTOR_SIZE;\r
+                               uint32_t scsiBytesRead = prevExtraBytes;\r
+                               readAheadBytes -= prevExtraBytes; // Must be signed!\r
 \r
-                               uint32_t scsiBytesRead = 0;\r
-                               while (scsiBytesRead < readAheadBytes)\r
+                               if (readAheadBytes > 0)\r
                                {\r
-                                       while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))\r
-                                       {\r
-                                               __WFE(); // Wait for event\r
-                                       }\r
-                                       parityError |= scsiParityError();\r
-                                       scsiPhyFifoFlip();\r
-                                       uint32_t nextChunk = ((totalBytes - scsiBytesRead - chunk) > SCSI_FIFO_DEPTH)\r
-                                               ? SCSI_FIFO_DEPTH : (totalBytes - scsiBytesRead - chunk);\r
-\r
-                                       if (nextChunk > 0) scsiSetDataCount(nextChunk);\r
-                                       scsiReadPIO(&scsiDev.data[scsiBytesRead], chunk);\r
-                                       scsiBytesRead += chunk;\r
-                                       chunk = nextChunk;\r
+                                       scsiReadPIO(\r
+                                               &scsiDev.data[scsiBytesRead],\r
+                                               readAheadBytes,\r
+                                               &parityError);\r
+                                       scsiBytesRead += readAheadBytes;\r
                                }\r
 \r
                                HAL_SD_WriteBlocks_DMA(&hsd, (uint32_t*) (&scsiDev.data[0]), (i + sdLBA) * 512ll, SD_SECTOR_SIZE, sectors);\r
 \r
-                               while (scsiBytesRead < totalBytes)\r
+                               int underrun = 0;\r
+                               if (scsiBytesRead < totalBytes && !scsiDev.resetFlag)\r
                                {\r
-                                       while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))\r
-                                       {\r
-                                               __WFE(); // Wait for event\r
-                                       }\r
-                                       parityError |= scsiParityError();\r
-                                       scsiPhyFifoFlip();\r
-                                       uint32_t nextChunk = ((totalBytes - scsiBytesRead - chunk) > SCSI_FIFO_DEPTH)\r
-                                               ? SCSI_FIFO_DEPTH : (totalBytes - scsiBytesRead - chunk);\r
-\r
-                                       if (nextChunk > 0) scsiSetDataCount(nextChunk);\r
-                                       scsiReadPIO(&scsiDev.data[scsiBytesRead], chunk);\r
-                                       scsiBytesRead += chunk;\r
-                                       chunk = nextChunk;\r
+                                       scsiReadPIO(\r
+                                               &scsiDev.data[scsiBytesRead],\r
+                                               totalBytes - readAheadBytes,\r
+                                               &parityError);\r
+\r
+                                       // Oh dear, SD finished first.\r
+                                       underrun = hsd.DmaTransferCplt;\r
+\r
+                                       scsiBytesRead += (totalBytes - readAheadBytes);\r
                                }\r
 \r
-                               // Oh dear, SD finished first.\r
-                               int underrun = totalBytes > readAheadBytes && hsd.DmaTransferCplt;\r
+                               if (!underrun && rem > sectors)\r
+                               {\r
+                                       // We probably have some time to waste reading more here.\r
+                                       // While noting this is going to drop us down into\r
+                                       // half-duplex operation (hence why we read max / 4 only)\r
+\r
+                                       extraSectors = rem - sectors > (maxSectors / 4)\r
+                                               ? (maxSectors / 4)\r
+                                               : rem - sectors;\r
+\r
+                                       scsiReadPIO(\r
+                                               &scsiDev.data[0],\r
+                                               extraSectors * SD_SECTOR_SIZE,\r
+                                               &parityError);\r
+                               }\r
 \r
                                uint32_t dmaFinishTime = s2s_getTime_ms();\r
-                               while (!hsd.SdTransferCplt &&\r
+                               while ((!hsd.SdTransferCplt ||\r
+                                               __HAL_SD_SDIO_GET_FLAG(&hsd, SDIO_FLAG_TXACT)) &&\r
                                        s2s_elapsedTime_ms(dmaFinishTime) < 180)\r
                                {\r
                                        // Wait while keeping BSY.\r
                                }\r
-                               while((__HAL_SD_SDIO_GET_FLAG(&hsd, SDIO_FLAG_TXACT)) &&\r
-                                       s2s_elapsedTime_ms(dmaFinishTime) < 180)\r
-                               {\r
-                                       // Wait for SD card while keeping BSY.\r
-                               }\r
 \r
                                if (i + sectors >= totalSDSectors &&\r
                                        !underrun &&\r
@@ -842,14 +814,14 @@ void scsiDiskPoll()
 \r
                                HAL_SD_CheckWriteOperation(&hsd, (uint32_t)SD_DATATIMEOUT);\r
 \r
-                               if (underrun)\r
+                               if (underrun && (!parityError || !enableParity))\r
                                {\r
                                        // Try again. Data is still in memory.\r
                                        sdTmpWrite(&scsiDev.data[0], i + sdLBA, sectors);\r
                                        scsiDev.sdUnderrunCount++;\r
                                }\r
-                               i += sectors;\r
 \r
+                               i += sectors;\r
                        }\r
                        else\r
                        {\r
@@ -857,11 +829,7 @@ void scsiDiskPoll()
                                // do this in a half-duplex fashion. We need to write as much as\r
                                // possible in each SD card transaction.\r
                                // use sg_dd from sg_utils3 tools to test.\r
-                               uint32_t maxSectors = sizeof(scsiDev.data) / SD_SECTOR_SIZE;\r
-                               uint32_t rem = totalSDSectors - i;\r
-                               uint32_t sectors = rem < maxSectors ? rem : maxSectors;\r
-                               int scsiSector;\r
-                               for (scsiSector = i; scsiSector < i + sectors; ++scsiSector)\r
+                               for (int scsiSector = i; scsiSector < i + sectors; ++scsiSector)\r
                                {\r
                                        int dmaBytes = SD_SECTOR_SIZE;\r
                                        if ((scsiSector % sdPerScsi) == (sdPerScsi - 1))\r
@@ -869,9 +837,10 @@ void scsiDiskPoll()
                                                dmaBytes = bytesPerSector % SD_SECTOR_SIZE;\r
                                                if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE;\r
                                        }\r
-                                       scsiRead(&scsiDev.data[SD_SECTOR_SIZE * (scsiSector - i)], dmaBytes, &parityError);\r
+\r
+                                       scsiReadPIO(&scsiDev.data[SD_SECTOR_SIZE * (scsiSector - i)], dmaBytes, &parityError);\r
                                }\r
-                               if (!parityError)\r
+                               if (!parityError || !enableParity)\r
                                {\r
                                        sdTmpWrite(&scsiDev.data[0], i + sdLBA, sectors);\r
                                }\r
@@ -879,6 +848,15 @@ void scsiDiskPoll()
                        }\r
                }\r
 \r
+               // Should already be complete here as we've ready the FIFOs\r
+               // by now. Check anyway.\r
+               __disable_irq();\r
+               while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))\r
+               {\r
+                       __WFI();\r
+               }\r
+               __enable_irq();\r
+\r
                if (clearBSY)\r
                {\r
                        enter_BusFree();\r
index 4864426..25f0283 100755 (executable)
@@ -303,7 +303,6 @@ static void process_Command()
        {\r
                scsiRead(scsiDev.cdb + 6, scsiDev.cdbLen - 6, &parityError);\r
        }\r
-\r
        command = scsiDev.cdb[0];\r
 \r
        // Prefer LUN's set by IDENTIFY messages for newer hosts.\r
index 5480a6a..cbfa980 100755 (executable)
@@ -106,7 +106,9 @@ typedef struct
 typedef struct
 {
        // TODO reduce this buffer size and add a proper cache
-       uint8_t data[MAX_SECTOR_SIZE * 8]; // Must be aligned for DMA
+       // Must be aligned for DMA
+       // 65536 bytes is the DMA limit
+       uint8_t data[MAX_SECTOR_SIZE * 8];
 
        TargetState targets[S2S_MAX_TARGETS];
        TargetState* target;
index deb67b2..9337b13 100755 (executable)
@@ -30,7 +30,8 @@
 static uint8_t asyncTimings[][4] =\r
 {\r
 /* Speed,    Assert,    Deskew,    Hold,    Glitch */\r
-{/*1.5MB/s*/ 28,        18,        13,      15},\r
+{/*1.5MB/s*/ 28,        18,        7,      15},\r
+//{/*1.5MB/s*/ 63,        31,        7,      15},\r
 {/*3.3MB/s*/ 13,        6,         6,       13},\r
 {/*5MB/s*/   9,         6,         6,       6}, // 80ns\r
 {/*safe*/    3,         6,         6,       6}, // Probably safe\r
@@ -106,8 +107,6 @@ static DMA_HandleTypeDef fsmcToMem;
 volatile uint8_t scsiRxDMAComplete;\r
 volatile uint8_t scsiTxDMAComplete;\r
 \r
-uint8_t scsiPhyFifoSel = 0; // global\r
-\r
 // scsi IRQ handler is initialised by the STM32 HAL. Connected to\r
 // PE4\r
 // Note: naming is important to ensure this function is listed in the\r
@@ -120,15 +119,18 @@ void EXTI4_IRQHandler()
                // Clear interrupt flag\r
                __HAL_GPIO_EXTI_CLEAR_IT(GPIO_PIN_4);\r
 \r
-               scsiDev.resetFlag = scsiDev.resetFlag || scsiStatusRST();\r
+               uint8_t statusFlags = *SCSI_STS_SCSI;\r
+\r
+               scsiDev.resetFlag = scsiDev.resetFlag || (statusFlags & 0x04);\r
 \r
                // selFlag is required for Philips P2000C which releases it after 600ns\r
                // without waiting for BSY.\r
                // Also required for some early Mac Plus roms\r
-               scsiDev.selFlag = *SCSI_STS_SELECTED;\r
+               if (statusFlags & 0x08) // Check SEL flag\r
+               {\r
+                       scsiDev.selFlag = *SCSI_STS_SELECTED;\r
+               }\r
        }\r
-\r
-       __SEV(); // Set event. See corresponding __WFE() calls.\r
 }\r
 \r
 static void assertFail()\r
@@ -145,92 +147,215 @@ static void assertFail()
 void\r
 scsiSetDataCount(uint32_t count)\r
 {\r
-       *SCSI_DATA_CNT_HI = count >> 8;\r
+       *SCSI_DATA_CNT_HI = (count >> 16) & 0xff;\r
+       *SCSI_DATA_CNT_MID = (count >> 8) & 0xff;\r
        *SCSI_DATA_CNT_LO = count & 0xff;\r
        *SCSI_DATA_CNT_SET = 1;\r
 }\r
 \r
+int scsiFifoReady(void)\r
+{\r
+       __NOP();\r
+       HAL_GPIO_ReadPin(GPIOE, FPGA_GPIO3_Pin);\r
+       __NOP();\r
+       return HAL_GPIO_ReadPin(GPIOE, FPGA_GPIO3_Pin) != 0;\r
+}\r
+\r
 uint8_t\r
 scsiReadByte(void)\r
 {\r
-#if FIFODEBUG\r
-       if (!scsiPhyFifoAltEmpty()) {\r
-               // Force a lock-up.\r
-               assertFail();\r
-       }\r
-#endif\r
        scsiSetDataCount(1);\r
 \r
+       // Ready immediately. setDataCount resets fifos\r
+\r
        while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))\r
        {\r
-               __WFE(); // Wait for event\r
+               __WFI(); // Wait for interrupt\r
        }\r
-       scsiPhyFifoFlip();\r
+       __enable_irq();\r
+\r
        uint8_t val = scsiPhyRx();\r
        // TODO scsiDev.parityError = scsiDev.parityError || SCSI_Parity_Error_Read();\r
 \r
-#if FIFODEBUG\r
-       if (!scsiPhyFifoEmpty()) {\r
-               int j = 0;\r
-               uint8_t k __attribute((unused));\r
-               while (!scsiPhyFifoEmpty()) { k = scsiPhyRx(); ++j; }\r
-\r
-               // Force a lock-up.\r
-               assertFail();\r
-       }\r
-#endif\r
        return val;\r
 }\r
 \r
 \r
 void\r
-scsiReadPIO(uint8_t* data, uint32_t count)\r
+scsiReadPIO(uint8_t* data, uint32_t count, int* parityError)\r
 {\r
        uint16_t* fifoData = (uint16_t*)data;\r
+       uint32_t count16 = (count + 1) / 2;\r
 \r
-       for (int i = 0; i < (count + 1) / 2; ++i)\r
+       int i = 0;\r
+       while ((i  < count16) && likely(!scsiDev.resetFlag))\r
        {\r
-               fifoData[i] = scsiPhyRx(); // TODO ASSUMES LITTLE ENDIAN\r
-       }\r
-}\r
-\r
-void\r
-scsiReadDMA(uint8_t* data, uint32_t count)\r
-{\r
-       // Prepare DMA transfer\r
-       dmaInProgress = 1;\r
-\r
-       scsiTxDMAComplete = 1; // TODO not used much\r
-       scsiRxDMAComplete = 0; // TODO not used much\r
+               // Wait until FIFO is full (or complete)\r
+               while (!scsiFifoReady() && likely(!scsiDev.resetFlag))\r
+               {\r
+                       // spin\r
+               }\r
 \r
-       HAL_DMA_Start(\r
-               &fsmcToMem,\r
-               (uint32_t) SCSI_FIFO_DATA,\r
-               (uint32_t) data,\r
-               (count + 1) / 2);\r
-}\r
+               if (count16 - i >= SCSI_FIFO_DEPTH16)\r
+               {\r
+                       uint32_t chunk16 = SCSI_FIFO_DEPTH16;\r
 \r
-int\r
-scsiReadDMAPoll()\r
-{\r
-       int complete = __HAL_DMA_GET_COUNTER(&fsmcToMem) == 0;\r
-       complete = complete && (HAL_DMA_PollForTransfer(&fsmcToMem, HAL_DMA_FULL_TRANSFER, 0xffffffff) == HAL_OK);\r
-       if (complete)\r
-       {\r
-               scsiTxDMAComplete = 1; // TODO MM FIX IRQ\r
-               scsiRxDMAComplete = 1;\r
+                       // Let gcc unroll the loop as much as possible.\r
+                       for (uint32_t k = 0; k + 128 <= chunk16; k += 128)\r
+                       {\r
+                               fifoData[i + k] = scsiPhyRx();\r
+                               fifoData[i + k + 1] = scsiPhyRx();\r
+                               fifoData[i + k + 2] = scsiPhyRx();\r
+                               fifoData[i + k + 3] = scsiPhyRx();\r
+                               fifoData[i + k + 4] = scsiPhyRx();\r
+                               fifoData[i + k + 5] = scsiPhyRx();\r
+                               fifoData[i + k + 6] = scsiPhyRx();\r
+                               fifoData[i + k + 7] = scsiPhyRx();\r
+                               fifoData[i + k + 8] = scsiPhyRx();\r
+                               fifoData[i + k + 9] = scsiPhyRx();\r
+                               fifoData[i + k + 10] = scsiPhyRx();\r
+                               fifoData[i + k + 11] = scsiPhyRx();\r
+                               fifoData[i + k + 12] = scsiPhyRx();\r
+                               fifoData[i + k + 13] = scsiPhyRx();\r
+                               fifoData[i + k + 14] = scsiPhyRx();\r
+                               fifoData[i + k + 15] = scsiPhyRx();\r
+                               fifoData[i + k + 16] = scsiPhyRx();\r
+                               fifoData[i + k + 17] = scsiPhyRx();\r
+                               fifoData[i + k + 18] = scsiPhyRx();\r
+                               fifoData[i + k + 19] = scsiPhyRx();\r
+                               fifoData[i + k + 20] = scsiPhyRx();\r
+                               fifoData[i + k + 21] = scsiPhyRx();\r
+                               fifoData[i + k + 22] = scsiPhyRx();\r
+                               fifoData[i + k + 23] = scsiPhyRx();\r
+                               fifoData[i + k + 24] = scsiPhyRx();\r
+                               fifoData[i + k + 25] = scsiPhyRx();\r
+                               fifoData[i + k + 26] = scsiPhyRx();\r
+                               fifoData[i + k + 27] = scsiPhyRx();\r
+                               fifoData[i + k + 28] = scsiPhyRx();\r
+                               fifoData[i + k + 29] = scsiPhyRx();\r
+                               fifoData[i + k + 30] = scsiPhyRx();\r
+                               fifoData[i + k + 31] = scsiPhyRx();\r
+                               fifoData[i + k + 32] = scsiPhyRx();\r
+                               fifoData[i + k + 33] = scsiPhyRx();\r
+                               fifoData[i + k + 34] = scsiPhyRx();\r
+                               fifoData[i + k + 35] = scsiPhyRx();\r
+                               fifoData[i + k + 36] = scsiPhyRx();\r
+                               fifoData[i + k + 37] = scsiPhyRx();\r
+                               fifoData[i + k + 38] = scsiPhyRx();\r
+                               fifoData[i + k + 39] = scsiPhyRx();\r
+                               fifoData[i + k + 40] = scsiPhyRx();\r
+                               fifoData[i + k + 41] = scsiPhyRx();\r
+                               fifoData[i + k + 42] = scsiPhyRx();\r
+                               fifoData[i + k + 43] = scsiPhyRx();\r
+                               fifoData[i + k + 44] = scsiPhyRx();\r
+                               fifoData[i + k + 45] = scsiPhyRx();\r
+                               fifoData[i + k + 46] = scsiPhyRx();\r
+                               fifoData[i + k + 47] = scsiPhyRx();\r
+                               fifoData[i + k + 48] = scsiPhyRx();\r
+                               fifoData[i + k + 49] = scsiPhyRx();\r
+                               fifoData[i + k + 50] = scsiPhyRx();\r
+                               fifoData[i + k + 51] = scsiPhyRx();\r
+                               fifoData[i + k + 52] = scsiPhyRx();\r
+                               fifoData[i + k + 53] = scsiPhyRx();\r
+                               fifoData[i + k + 54] = scsiPhyRx();\r
+                               fifoData[i + k + 55] = scsiPhyRx();\r
+                               fifoData[i + k + 56] = scsiPhyRx();\r
+                               fifoData[i + k + 57] = scsiPhyRx();\r
+                               fifoData[i + k + 58] = scsiPhyRx();\r
+                               fifoData[i + k + 59] = scsiPhyRx();\r
+                               fifoData[i + k + 60] = scsiPhyRx();\r
+                               fifoData[i + k + 61] = scsiPhyRx();\r
+                               fifoData[i + k + 62] = scsiPhyRx();\r
+                               fifoData[i + k + 63] = scsiPhyRx();\r
+                               fifoData[i + k + 64] = scsiPhyRx();\r
+                               fifoData[i + k + 65] = scsiPhyRx();\r
+                               fifoData[i + k + 66] = scsiPhyRx();\r
+                               fifoData[i + k + 67] = scsiPhyRx();\r
+                               fifoData[i + k + 68] = scsiPhyRx();\r
+                               fifoData[i + k + 69] = scsiPhyRx();\r
+                               fifoData[i + k + 70] = scsiPhyRx();\r
+                               fifoData[i + k + 71] = scsiPhyRx();\r
+                               fifoData[i + k + 72] = scsiPhyRx();\r
+                               fifoData[i + k + 73] = scsiPhyRx();\r
+                               fifoData[i + k + 74] = scsiPhyRx();\r
+                               fifoData[i + k + 75] = scsiPhyRx();\r
+                               fifoData[i + k + 76] = scsiPhyRx();\r
+                               fifoData[i + k + 77] = scsiPhyRx();\r
+                               fifoData[i + k + 78] = scsiPhyRx();\r
+                               fifoData[i + k + 79] = scsiPhyRx();\r
+                               fifoData[i + k + 80] = scsiPhyRx();\r
+                               fifoData[i + k + 81] = scsiPhyRx();\r
+                               fifoData[i + k + 82] = scsiPhyRx();\r
+                               fifoData[i + k + 83] = scsiPhyRx();\r
+                               fifoData[i + k + 84] = scsiPhyRx();\r
+                               fifoData[i + k + 85] = scsiPhyRx();\r
+                               fifoData[i + k + 86] = scsiPhyRx();\r
+                               fifoData[i + k + 87] = scsiPhyRx();\r
+                               fifoData[i + k + 88] = scsiPhyRx();\r
+                               fifoData[i + k + 89] = scsiPhyRx();\r
+                               fifoData[i + k + 90] = scsiPhyRx();\r
+                               fifoData[i + k + 91] = scsiPhyRx();\r
+                               fifoData[i + k + 92] = scsiPhyRx();\r
+                               fifoData[i + k + 93] = scsiPhyRx();\r
+                               fifoData[i + k + 94] = scsiPhyRx();\r
+                               fifoData[i + k + 95] = scsiPhyRx();\r
+                               fifoData[i + k + 96] = scsiPhyRx();\r
+                               fifoData[i + k + 97] = scsiPhyRx();\r
+                               fifoData[i + k + 98] = scsiPhyRx();\r
+                               fifoData[i + k + 99] = scsiPhyRx();\r
+                               fifoData[i + k + 100] = scsiPhyRx();\r
+                               fifoData[i + k + 101] = scsiPhyRx();\r
+                               fifoData[i + k + 102] = scsiPhyRx();\r
+                               fifoData[i + k + 103] = scsiPhyRx();\r
+                               fifoData[i + k + 104] = scsiPhyRx();\r
+                               fifoData[i + k + 105] = scsiPhyRx();\r
+                               fifoData[i + k + 106] = scsiPhyRx();\r
+                               fifoData[i + k + 107] = scsiPhyRx();\r
+                               fifoData[i + k + 108] = scsiPhyRx();\r
+                               fifoData[i + k + 109] = scsiPhyRx();\r
+                               fifoData[i + k + 110] = scsiPhyRx();\r
+                               fifoData[i + k + 111] = scsiPhyRx();\r
+                               fifoData[i + k + 112] = scsiPhyRx();\r
+                               fifoData[i + k + 113] = scsiPhyRx();\r
+                               fifoData[i + k + 114] = scsiPhyRx();\r
+                               fifoData[i + k + 115] = scsiPhyRx();\r
+                               fifoData[i + k + 116] = scsiPhyRx();\r
+                               fifoData[i + k + 117] = scsiPhyRx();\r
+                               fifoData[i + k + 118] = scsiPhyRx();\r
+                               fifoData[i + k + 119] = scsiPhyRx();\r
+                               fifoData[i + k + 120] = scsiPhyRx();\r
+                               fifoData[i + k + 121] = scsiPhyRx();\r
+                               fifoData[i + k + 122] = scsiPhyRx();\r
+                               fifoData[i + k + 123] = scsiPhyRx();\r
+                               fifoData[i + k + 124] = scsiPhyRx();\r
+                               fifoData[i + k + 125] = scsiPhyRx();\r
+                               fifoData[i + k + 126] = scsiPhyRx();\r
+                               fifoData[i + k + 127] = scsiPhyRx();\r
+                       }\r
 \r
-               dmaInProgress = 0;\r
-#if 0\r
-               // TODO MM scsiDev.parityError = scsiDev.parityError || SCSI_Parity_Error_Read();\r
-#endif\r
-               return 1;\r
+                       i += chunk16;\r
+               }\r
+               else\r
+               {\r
+                       uint32_t chunk16 = count16 - i;\r
 \r
+                       uint32_t k = 0;\r
+                       for (; k + 4 <= chunk16; k += 4)\r
+                       {\r
+                               fifoData[i + k] = scsiPhyRx();\r
+                               fifoData[i + 1 + k] = scsiPhyRx();\r
+                               fifoData[i + 2 + k] = scsiPhyRx();\r
+                               fifoData[i + 3 + k] = scsiPhyRx();\r
+                       }\r
+                       for (; k < chunk16; ++k)\r
+                       {\r
+                               fifoData[i + k] = scsiPhyRx();\r
+                       }\r
+                       i += chunk16;\r
+               }\r
        }\r
-       else\r
-       {\r
-               return 0;\r
-       }\r
+\r
+       *parityError |= scsiParityError();\r
 }\r
 \r
 void\r
@@ -239,208 +364,173 @@ scsiRead(uint8_t* data, uint32_t count, int* parityError)
        int i = 0;\r
        *parityError = 0;\r
 \r
-\r
-       uint32_t chunk = ((count - i) > SCSI_FIFO_DEPTH)\r
-               ? SCSI_FIFO_DEPTH : (count - i);\r
-#ifdef SCSI_FSMC_DMA\r
-       if (chunk >= 16)\r
-       {\r
-               // DMA is doing 32bit transfers.\r
-               chunk = chunk & 0xFFFFFFF8;\r
-       }\r
-#endif\r
-       scsiSetDataCount(chunk);\r
-\r
        while (i < count && likely(!scsiDev.resetFlag))\r
        {\r
-               while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))\r
-               {\r
-                       __WFE(); // Wait for event\r
-               }\r
-               *parityError |= scsiParityError();\r
-               scsiPhyFifoFlip();\r
+               uint32_t chunk = ((count - i) > SCSI_XFER_MAX)\r
+                       ? SCSI_XFER_MAX : (count - i);\r
+               scsiSetDataCount(chunk);\r
 \r
-               uint32_t nextChunk = ((count - i - chunk) > SCSI_FIFO_DEPTH)\r
-                       ? SCSI_FIFO_DEPTH : (count - i - chunk);\r
-#ifdef SCSI_FSMC_DMA\r
-               if (nextChunk >= 16)\r
-               {\r
-                       nextChunk = nextChunk & 0xFFFFFFF8;\r
-               }\r
-#endif\r
-               if (nextChunk > 0)\r
-               {\r
-                       scsiSetDataCount(nextChunk);\r
-               }\r
+               scsiReadPIO(data + i, chunk, parityError);\r
 \r
-#ifdef SCSI_FSMC_DMA\r
-               if (chunk < 16)\r
-#endif\r
-               {\r
-                       scsiReadPIO(data + i, chunk);\r
-               }\r
-#ifdef SCSI_FSMC_DMA\r
-               else\r
+               __disable_irq();\r
+               while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))\r
                {\r
-                       scsiReadDMA(data + i, chunk);\r
-\r
-                       while (!scsiReadDMAPoll() && likely(!scsiDev.resetFlag))\r
-                       {\r
-                       };\r
+                       __WFI();\r
                }\r
-#endif\r
-\r
+               __enable_irq();\r
 \r
                i += chunk;\r
-               chunk = nextChunk;\r
        }\r
-#if FIFODEBUG\r
-               if (!scsiPhyFifoEmpty() || !scsiPhyFifoAltEmpty()) {\r
-                       int j = 0;\r
-                       while (!scsiPhyFifoEmpty()) { scsiPhyRx(); ++j; }\r
-                       scsiPhyFifoFlip();\r
-                       int k = 0;\r
-                       while (!scsiPhyFifoEmpty()) { scsiPhyRx(); ++k; }\r
-                       // Force a lock-up.\r
-                       assertFail();\r
-               }\r
-#endif\r
 }\r
 \r
 void\r
 scsiWriteByte(uint8_t value)\r
 {\r
-#if FIFODEBUG\r
-       if (!scsiPhyFifoEmpty()) {\r
-               // Force a lock-up.\r
-               assertFail();\r
-       }\r
-#endif\r
-       scsiPhyTx(value);\r
-       scsiPhyFifoFlip();\r
-\r
        scsiSetDataCount(1);\r
+       scsiPhyTx(value);\r
 \r
+       __disable_irq();\r
        while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))\r
        {\r
-               __WFE(); // Wait for event\r
-       }\r
-\r
-#if FIFODEBUG\r
-       if (!scsiPhyFifoAltEmpty()) {\r
-               // Force a lock-up.\r
-               assertFail();\r
+               __WFI();\r
        }\r
-#endif\r
+       __enable_irq();\r
 }\r
 \r
-static void\r
+void\r
 scsiWritePIO(const uint8_t* data, uint32_t count)\r
 {\r
        uint16_t* fifoData = (uint16_t*)data;\r
-       for (int i = 0; i < (count + 1) / 2; ++i)\r
+       uint32_t count16 = (count + 1) / 2;\r
+\r
+       int i = 0;\r
+       while ((i  < count16) && likely(!scsiDev.resetFlag))\r
        {\r
-               scsiPhyTx(fifoData[i]);\r
-       }\r
-}\r
+               while (!scsiFifoReady() && likely(!scsiDev.resetFlag))\r
+               {\r
+                       // Spin\r
+               }\r
 \r
-void\r
-scsiWriteDMA(const uint8_t* data, uint32_t count)\r
-{\r
-       // Prepare DMA transfer\r
-       dmaInProgress = 1;\r
+               if (count16 - i >= SCSI_FIFO_DEPTH16)\r
+               {\r
+                       uint32_t chunk16 = SCSI_FIFO_DEPTH16;\r
 \r
-       scsiTxDMAComplete = 0;\r
-       scsiRxDMAComplete = 1;\r
+                       // Let gcc unroll the loop as much as possible.\r
+                       for (uint32_t k = 0; k + 128 <= chunk16; k += 128)\r
+                       {\r
+                               scsiPhyTx32(fifoData[i + k], fifoData[i + k + 1]);\r
+                               scsiPhyTx32(fifoData[i + 2 + k], fifoData[i + k + 3]);\r
+                               scsiPhyTx32(fifoData[i + 4 + k], fifoData[i + k + 5]);\r
+                               scsiPhyTx32(fifoData[i + 6 + k], fifoData[i + k + 7]);\r
+                               scsiPhyTx32(fifoData[i + 8 + k], fifoData[i + k + 9]);\r
+                               scsiPhyTx32(fifoData[i + 10 + k], fifoData[i + k + 11]);\r
+                               scsiPhyTx32(fifoData[i + 12 + k], fifoData[i + k + 13]);\r
+                               scsiPhyTx32(fifoData[i + 14 + k], fifoData[i + k + 15]);\r
+                               scsiPhyTx32(fifoData[i + 16 + k], fifoData[i + k + 17]);\r
+                               scsiPhyTx32(fifoData[i + 18 + k], fifoData[i + k + 19]);\r
+                               scsiPhyTx32(fifoData[i + 20 + k], fifoData[i + k + 21]);\r
+                               scsiPhyTx32(fifoData[i + 22 + k], fifoData[i + k + 23]);\r
+                               scsiPhyTx32(fifoData[i + 24 + k], fifoData[i + k + 25]);\r
+                               scsiPhyTx32(fifoData[i + 26 + k], fifoData[i + k + 27]);\r
+                               scsiPhyTx32(fifoData[i + 28 + k], fifoData[i + k + 29]);\r
+                               scsiPhyTx32(fifoData[i + 30 + k], fifoData[i + k + 31]);\r
+\r
+                               scsiPhyTx32(fifoData[i + 32 + k], fifoData[i + k + 33]);\r
+                               scsiPhyTx32(fifoData[i + 34 + k], fifoData[i + k + 35]);\r
+                               scsiPhyTx32(fifoData[i + 36 + k], fifoData[i + k + 37]);\r
+                               scsiPhyTx32(fifoData[i + 38 + k], fifoData[i + k + 39]);\r
+                               scsiPhyTx32(fifoData[i + 40 + k], fifoData[i + k + 41]);\r
+                               scsiPhyTx32(fifoData[i + 42 + k], fifoData[i + k + 43]);\r
+                               scsiPhyTx32(fifoData[i + 44 + k], fifoData[i + k + 45]);\r
+                               scsiPhyTx32(fifoData[i + 46 + k], fifoData[i + k + 47]);\r
+                               scsiPhyTx32(fifoData[i + 48 + k], fifoData[i + k + 49]);\r
+                               scsiPhyTx32(fifoData[i + 50 + k], fifoData[i + k + 51]);\r
+                               scsiPhyTx32(fifoData[i + 52 + k], fifoData[i + k + 53]);\r
+                               scsiPhyTx32(fifoData[i + 54 + k], fifoData[i + k + 55]);\r
+                               scsiPhyTx32(fifoData[i + 56 + k], fifoData[i + k + 57]);\r
+                               scsiPhyTx32(fifoData[i + 58 + k], fifoData[i + k + 59]);\r
+                               scsiPhyTx32(fifoData[i + 60 + k], fifoData[i + k + 61]);\r
+                               scsiPhyTx32(fifoData[i + 62 + k], fifoData[i + k + 63]);\r
+\r
+                               scsiPhyTx32(fifoData[i + 64 + k], fifoData[i + k + 65]);\r
+                               scsiPhyTx32(fifoData[i + 66 + k], fifoData[i + k + 67]);\r
+                               scsiPhyTx32(fifoData[i + 68 + k], fifoData[i + k + 69]);\r
+                               scsiPhyTx32(fifoData[i + 70 + k], fifoData[i + k + 71]);\r
+                               scsiPhyTx32(fifoData[i + 72 + k], fifoData[i + k + 73]);\r
+                               scsiPhyTx32(fifoData[i + 74 + k], fifoData[i + k + 75]);\r
+                               scsiPhyTx32(fifoData[i + 76 + k], fifoData[i + k + 77]);\r
+                               scsiPhyTx32(fifoData[i + 78 + k], fifoData[i + k + 79]);\r
+                               scsiPhyTx32(fifoData[i + 80 + k], fifoData[i + k + 81]);\r
+                               scsiPhyTx32(fifoData[i + 82 + k], fifoData[i + k + 83]);\r
+                               scsiPhyTx32(fifoData[i + 84 + k], fifoData[i + k + 85]);\r
+                               scsiPhyTx32(fifoData[i + 86 + k], fifoData[i + k + 87]);\r
+                               scsiPhyTx32(fifoData[i + 88 + k], fifoData[i + k + 89]);\r
+                               scsiPhyTx32(fifoData[i + 90 + k], fifoData[i + k + 91]);\r
+                               scsiPhyTx32(fifoData[i + 92 + k], fifoData[i + k + 93]);\r
+                               scsiPhyTx32(fifoData[i + 94 + k], fifoData[i + k + 95]);\r
+\r
+                               scsiPhyTx32(fifoData[i + 96 + k], fifoData[i + k + 97]);\r
+                               scsiPhyTx32(fifoData[i + 98 + k], fifoData[i + k + 99]);\r
+                               scsiPhyTx32(fifoData[i + 100 + k], fifoData[i + k + 101]);\r
+                               scsiPhyTx32(fifoData[i + 102 + k], fifoData[i + k + 103]);\r
+                               scsiPhyTx32(fifoData[i + 104 + k], fifoData[i + k + 105]);\r
+                               scsiPhyTx32(fifoData[i + 106 + k], fifoData[i + k + 107]);\r
+                               scsiPhyTx32(fifoData[i + 108 + k], fifoData[i + k + 109]);\r
+                               scsiPhyTx32(fifoData[i + 110 + k], fifoData[i + k + 111]);\r
+                               scsiPhyTx32(fifoData[i + 112 + k], fifoData[i + k + 113]);\r
+                               scsiPhyTx32(fifoData[i + 114 + k], fifoData[i + k + 115]);\r
+                               scsiPhyTx32(fifoData[i + 116 + k], fifoData[i + k + 117]);\r
+                               scsiPhyTx32(fifoData[i + 118 + k], fifoData[i + k + 119]);\r
+                               scsiPhyTx32(fifoData[i + 120 + k], fifoData[i + k + 121]);\r
+                               scsiPhyTx32(fifoData[i + 122 + k], fifoData[i + k + 123]);\r
+                               scsiPhyTx32(fifoData[i + 124 + k], fifoData[i + k + 125]);\r
+                               scsiPhyTx32(fifoData[i + 126 + k], fifoData[i + k + 127]);\r
 \r
-       HAL_DMA_Start(\r
-               &memToFSMC,\r
-               (uint32_t) data,\r
-               (uint32_t) SCSI_FIFO_DATA,\r
-               count / 4);\r
-}\r
+                       }\r
 \r
-int\r
-scsiWriteDMAPoll()\r
-{\r
-       int complete = __HAL_DMA_GET_COUNTER(&memToFSMC) == 0;\r
-       complete = complete && (HAL_DMA_PollForTransfer(&memToFSMC, HAL_DMA_FULL_TRANSFER, 0xffffffff) == HAL_OK);\r
-       if (complete)\r
-       {\r
-               scsiTxDMAComplete = 1; // TODO MM FIX IRQ\r
-               scsiRxDMAComplete = 1;\r
+                       i += chunk16;\r
+               }\r
+               else\r
+               {\r
+                       uint32_t chunk16 = count16 - i;\r
 \r
-               dmaInProgress = 0;\r
-               return 1;\r
-       }\r
-       else\r
-       {\r
-               return 0;\r
+                       uint32_t k = 0;\r
+                       for (; k + 4 <= chunk16; k += 4)\r
+                       {\r
+                               scsiPhyTx32(fifoData[i + k], fifoData[i + k + 1]);\r
+                               scsiPhyTx32(fifoData[i + k + 2], fifoData[i + k + 3]);\r
+                       }\r
+                       for (; k < chunk16; ++k)\r
+                       {\r
+                               scsiPhyTx(fifoData[i + k]);\r
+                       }\r
+                       i += chunk16;\r
+               }\r
        }\r
 }\r
 \r
+\r
 void\r
 scsiWrite(const uint8_t* data, uint32_t count)\r
 {\r
        int i = 0;\r
        while (i < count && likely(!scsiDev.resetFlag))\r
        {\r
-               uint32_t chunk = ((count - i) > SCSI_FIFO_DEPTH)\r
-                       ? SCSI_FIFO_DEPTH : (count - i);\r
-\r
-#if FIFODEBUG\r
-               if (!scsiPhyFifoEmpty()) {\r
-                       // Force a lock-up.\r
-                       assertFail();\r
-               }\r
-#endif\r
-\r
-#ifdef SCSI_FSMC_DMA\r
-               if (chunk < 16)\r
-#endif\r
-               {\r
-                       scsiWritePIO(data + i, chunk);\r
-               }\r
-#ifdef SCSI_FSMC_DMA\r
-               else\r
-               {\r
-                       // DMA is doing 32bit transfers.\r
-                       chunk = chunk & 0xFFFFFFF8;\r
-                       scsiWriteDMA(data + i, chunk);\r
+               uint32_t chunk = ((count - i) > SCSI_XFER_MAX)\r
+                       ? SCSI_XFER_MAX : (count - i);\r
+               scsiSetDataCount(chunk);\r
 \r
-                       while (!scsiWriteDMAPoll() && likely(!scsiDev.resetFlag))\r
-                       {\r
-                       }\r
-               }\r
-#endif\r
+               scsiWritePIO(data + i, chunk);\r
 \r
+               __disable_irq();\r
                while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))\r
                {\r
-                       __WFE(); // Wait for event\r
+                       __WFI();\r
                }\r
+               __enable_irq();\r
 \r
-#if FIFODEBUG\r
-               if (!scsiPhyFifoAltEmpty()) {\r
-                       // Force a lock-up.\r
-                       assertFail();\r
-               }\r
-#endif\r
-\r
-               scsiPhyFifoFlip();\r
-               scsiSetDataCount(chunk);\r
                i += chunk;\r
        }\r
-       while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))\r
-       {\r
-               __WFE(); // Wait for event\r
-       }\r
-\r
-#if FIFODEBUG\r
-       if (!scsiPhyFifoAltEmpty()) {\r
-               // Force a lock-up.\r
-               assertFail();\r
-       }\r
-#endif\r
 }\r
 \r
 static inline void busSettleDelay(void)\r
@@ -498,10 +588,6 @@ uint32_t scsiEnterPhaseImmediate(int newPhase)
 \r
        int oldPhase = *SCSI_CTRL_PHASE;\r
 \r
-       if (!scsiDev.resetFlag && (!scsiPhyFifoEmpty() || !scsiPhyFifoAltEmpty())) {\r
-               // Force a lock-up.\r
-               assertFail();\r
-       }\r
        if (newPhase != oldPhase)\r
        {\r
                if ((newPhase == DATA_IN || newPhase == DATA_OUT) &&\r
@@ -639,8 +725,6 @@ void scsiPhyReset()
 \r
        *SCSI_CTRL_PHASE = 0x00;\r
        *SCSI_CTRL_BSY = 0x00;\r
-       scsiPhyFifoSel = 0;\r
-       *SCSI_FIFO_SEL = 0;\r
        *SCSI_CTRL_DBX = 0;\r
 \r
        *SCSI_CTRL_SYNC_OFFSET = 0;\r
@@ -674,6 +758,31 @@ void scsiPhyReset()
        }\r
        #endif\r
 \r
+       // PIO Benchmark code\r
+       // Currently 16.7MB/s.\r
+       //#define PIO_BENCHMARK 1\r
+       #ifdef PIO_BENCHMARK\r
+       while(1)\r
+       {\r
+               s2s_ledOn();\r
+\r
+               scsiEnterPhase(DATA_IN); // Need IO flag set for fifo ready flag\r
+\r
+               // 100MB\r
+               for (int i = 0; i < (100LL * 1024 * 1024 / SCSI_FIFO_DEPTH); ++i)\r
+               {\r
+                       scsiSetDataCount(1); // Resets fifos.\r
+\r
+                       // Shouldn't block\r
+                       scsiDev.resetFlag = 0;\r
+                       scsiWritePIO(&scsiDev.data[0], SCSI_FIFO_DEPTH);\r
+               }\r
+               s2s_ledOff();\r
+\r
+               for(int i = 0; i < 10; ++i) s2s_delay_ms(1000);\r
+       }\r
+       #endif\r
+\r
        #ifdef SCSI_FREQ_TEST\r
        while(1)\r
        {\r
@@ -749,8 +858,6 @@ void scsiPhyInit()
        *SCSI_CTRL_IDMASK = 0x00; // Reset in scsiPhyConfig\r
        *SCSI_CTRL_PHASE = 0x00;\r
        *SCSI_CTRL_BSY = 0x00;\r
-       scsiPhyFifoSel = 0;\r
-       *SCSI_FIFO_SEL = 0;\r
        *SCSI_CTRL_DBX = 0;\r
 \r
        *SCSI_CTRL_SYNC_OFFSET = 0;\r
index 19f0aa6..c2288db 100755 (executable)
@@ -20,8 +20,8 @@
 #define SCSI_CTRL_IDMASK ((volatile uint8_t*)0x60000000)
 #define SCSI_CTRL_PHASE ((volatile uint8_t*)0x60000002)
 #define SCSI_CTRL_BSY ((volatile uint8_t*)0x60000004)
-#define SCSI_FIFO_SEL ((volatile uint8_t*)0x60000006)
-#define SCSI_DATA_CNT_HI ((volatile uint8_t*)0x60000008)
+#define SCSI_DATA_CNT_HI ((volatile uint8_t*)0x60000006)
+#define SCSI_DATA_CNT_MID ((volatile uint8_t*)0x60000008)
 #define SCSI_DATA_CNT_LO ((volatile uint8_t*)0x6000000A)
 #define SCSI_DATA_CNT_SET ((volatile uint8_t*)0x6000000C)
 #define SCSI_CTRL_DBX ((volatile uint8_t*)0x6000000E)
@@ -35,7 +35,7 @@
 #define SCSI_CTRL_SEL_TIMING ((volatile uint8_t*)0x60000018)
 
 #define SCSI_STS_FIFO ((volatile uint8_t*)0x60000020)
-#define SCSI_STS_ALTFIFO ((volatile uint8_t*)0x60000022)
+// Obsolete #define SCSI_STS_ALTFIFO ((volatile uint8_t*)0x60000022)
 #define SCSI_STS_FIFO_COMPLETE ((volatile uint8_t*)0x60000024)
 #define SCSI_STS_SELECTED ((volatile uint8_t*)0x60000026)
 #define SCSI_STS_SCSI ((volatile uint8_t*)0x60000028)
 #define SCSI_STS_PARITY_ERR ((volatile uint8_t*)0x6000002C)
 
 #define SCSI_FIFO_DATA ((volatile uint16_t*)0x60000040)
-#define SCSI_FIFO_DEPTH 256
 
+#define SCSI_FIFO_DEPTH 512
+#define SCSI_FIFO_DEPTH16 (SCSI_FIFO_DEPTH / 2)
+#define SCSI_XFER_MAX 524288
 
-#define scsiPhyFifoFull() ((*SCSI_STS_FIFO & 0x01) == 0x01)
-#define scsiPhyFifoEmpty() ((*SCSI_STS_FIFO & 0x02) == 0x02)
-#define scsiPhyFifoAltEmpty() ((*SCSI_STS_ALTFIFO & 0x02) == 0x02)
+// Check if FIFO is empty or full.
+// Replaced with method due to delays
+// #define scsiFifoReady() (HAL_GPIO_ReadPin(GPIOE, FPGA_GPIO3_Pin) != 0)
 
-#define scsiPhyFifoFlip() \
-{\
-       scsiPhyFifoSel ^= 1; \
-       *SCSI_FIFO_SEL = scsiPhyFifoSel; \
-}
+#define scsiPhyFifoFull() ((*SCSI_STS_FIFO & 0x01) != 0)
+#define scsiPhyFifoEmpty() ((*SCSI_STS_FIFO & 0x02) != 0)
 
 #define scsiPhyTx(val) *SCSI_FIFO_DATA = (val)
 
 #define scsiPhyRx() *SCSI_FIFO_DATA
 #define scsiPhyComplete() ((*SCSI_STS_FIFO_COMPLETE & 0x01) == 0x01)
 
-#define scsiStatusATN() ((*SCSI_STS_SCSI & 0x01) == 0x01)
-#define scsiStatusBSY() ((*SCSI_STS_SCSI & 0x02) == 0x02)
-#define scsiStatusRST() ((*SCSI_STS_SCSI & 0x04) == 0x04)
-#define scsiStatusSEL() ((*SCSI_STS_SCSI & 0x08) == 0x08)
-#define scsiStatusACK() ((*SCSI_STS_SCSI & 0x10) == 0x10)
+#define scsiStatusATN() ((*SCSI_STS_SCSI & 0x01) != 0)
+#define scsiStatusBSY() ((*SCSI_STS_SCSI & 0x02) != 0)
+#define scsiStatusRST() ((*SCSI_STS_SCSI & 0x04) != 0)
+#define scsiStatusSEL() ((*SCSI_STS_SCSI & 0x08) != 0)
+#define scsiStatusACK() ((*SCSI_STS_SCSI & 0x10) != 0)
 
-#define scsiParityError() ((*SCSI_STS_PARITY_ERR & 0x1) == 0x1)
+#define scsiParityError() ((*SCSI_STS_PARITY_ERR & 0x1) != 0)
 
 // Disable DMA due to errata with the STM32F205 DMA2 controller when
 // concurrently transferring FSMC (with FIFO) and APB (ie. sdio)
 // peripherals.
 #undef SCSI_FSMC_DMA
 
-extern uint8_t scsiPhyFifoSel;
-
 void scsiPhyInit(void);
 void scsiPhyConfig(void);
 void scsiPhyReset(void);
+int scsiFifoReady(void);
 
 void scsiEnterPhase(int phase);
 uint32_t scsiEnterPhaseImmediate(int phase);
@@ -111,7 +109,8 @@ void scsiReadDMA(uint8_t* data, uint32_t count);
 int scsiReadDMAPoll();
 
 // Low-level.
-void scsiReadPIO(uint8_t* data, uint32_t count);
+void scsiReadPIO(uint8_t* data, uint32_t count, int* parityError);
+void scsiWritePIO(const uint8_t* data, uint32_t count);
 
 void scsiWriteDMA(const uint8_t* data, uint32_t count);
 int scsiWriteDMAPoll();
index 0a9d632..b82f119 100755 (executable)
@@ -34,14 +34,9 @@ extern SdDevice sdDev;
 
 int sdInit(void);
 
-void sdWriteMultiSectorPrep(uint32_t sdLBA, uint32_t sdSectors);
-void sdWriteMultiSectorDMA(uint8_t* outputBuffer);
-int sdWriteSectorDMAPoll();
-
 void sdReadDMA(uint32_t lba, uint32_t sectors, uint8_t* outputBuffer);
 int sdReadDMAPoll(uint32_t remainingSectors);
 void sdCompleteTransfer();
 
-void sdPoll();
 
 #endif