1 Star 2 Fork 0

huaxiu/espusb

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
usb_notes.txt 12.36 KB
一键复制 编辑 原始数据 按行查看 历史
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545
//ESP8266 instruction timing guide:
//
// nop : 1
// add, addi, or, extui : 2 (Sort of)
// movi: 1
// l32r: 7 (Sort of)
// l32i.n: 1 (Seems to be always)
//
// 1 Clock cycle:
// add.n a10, a10, a11
//
// 2 Clock Cycles:
// add.n a10, a10, a11
// add.n a10, a10, a11
//
// 2 Clock Cycles:
// add.n a10, a10, a11
// extui a10, a10, 24, 3
//
// 7 Clocks:
// l32r a10, 4010027c <_DoubleExceptionVector+0x20c>
//
// Also 7 Clocks:
// addi.n a10, a10, -1
// l32r a10, 4010027c <_DoubleExceptionVector+0x20c>
//
// 9 Clocks:
// addi.n a10, a10, -1
// addi.n a11, a11, -1
// l32r a11, 4010027c <_DoubleExceptionVector+0x20c>
//
// 14 Clocks
// l32r a10, 4010027c <_DoubleExceptionVector+0x20c>
// l32r a11, 40100280 <_DoubleExceptionVector+0x210>
//
// 1 Clocks
// l32i.n a10, a2, 0
//
// 2 Clocks
// add.n a10, a10, a11
// l32i.n a10, a2, 0
//
// 3 Clocks (Second operation depends on first)
// l32i.n a10, a2, 0
// (Nop or no nop, still 3 cycles)
// add.n a10, a10, a11
//
//
// 2 Clocks (Second operation does not depend on first)
// l32i.n a10, a2, 0
// add.n a11, a11, a11
//
// 3 Clocks (@a2 == 0) (Branch not taken)
// l32i.n a10, a2, 0
// nop.n
// bgeui a10, 5, 40100b60 <end>
//
// 5 Clocks (@a2 == 6) (Branch taken) A taken branch costs 2 additional cycles
// l32i.n a10, a2, 0
// nop.n
// bgeui a10, 5, 40100b60 <end>
//
//
// Big note: with regular jumps and comparative jumps, if the target's address ends with 11 (binary), then it will take one extra cycle.
//
//
// 3 Clocks
// j end
// ...
// end:
//
// 4 Clocks: (Branch not taken)
// l32i a10, a2, 0
// bgeui a10, 6, 40100b58 <end>
// addi a10, a10, 1
//
// 4 Clocks: (Branch not taken)
// l32i a10, a2, 0
// extui a10, a10, 23, 1
// bgeui a10, 6, 40100b58 <end>
//
// 6 Clocks: (Branch Taken)
// l32i a10, a2, 0
// (Free to do unrelated instruction here)
// extui a10, a10, 1, 4
// bgeui a10, 6, 40100b58 <end>
//
//
// Function Calls:
//
//
// 5 Clocks: No function call.
// nop.n
// nop.n
// j 40100b58 <end>
// my_func:
// ret.n
// end:
//
// 10 Clocks: Include function call
// call0 40100b54 <my_func>
// j 40100b56 <end>
// <my_func>:
// ret.n
// end:
//
// 10 Clocks: Include function call, and an instruction in the beginning
// call0 40100b54 <my_func>
// j 40100b58 <end>
// <my_func>:
// nop.n
// ret.n
// <end>:
//
// 12 Clocks: (WHYYYYYYY WHY Jump from 10 to 12??)
// <intrs>:
// call0 40100b54 <my_func>
// j 40100b5a <end>
// <my_func>:
// nop.n
// nop.n
// ret.n
//
// 13 Clocks:
//<intrs>:
// call0 40100b54 <my_func>
// j 40100b5e <end>
// <my_func>:
// addi a1, a1, -16
// nop.n
// addi a1, a1, 16
// ret.n
//
// 15 Clocks:
// <intrs>:
// call0 40100b54 <my_func>
// j 40100b62 <end>
// <my_func>:
// addi a1, a1, -16
// s32i.n a0, a1, 0
// nop.n
// l32i.n a0, a1, 0
// addi a1, a1, 16
// ret.n
// <end>:
//
//
// 21 Clocks:
// <intrs>:
// call0 40100b54 <my_func>
// j 40100b6e <end>
// <my_func>:
// addi a1, a1, -16
// s32i.n a0, a1, 0
// s32i.n a2, a1, 4
// s32i.n a3, a1, 8
// s32i.n a4, a1, 12
// nop.n
// l32i.n a4, a1, 12
// l32i.n a3, a1, 8
// l32i.n a2, a1, 4
// l32i.n a0, a1, 0
// addi a1, a1, 16
// ret.n
// <end>:
//
// For the following, we will be replacing 'nop' with the following:
//
// This takes 1 clock...
// nop
//
// These take 20 Clocks... WHYYYY?Y??YY?
// l32r a2, 40100280 <_DoubleExceptionVector+0x210> (These are actually movi a2, (some constant))
// l32r a3, 40100270 <_DoubleExceptionVector+0x200>
// l32r a4, 40100284 <_DoubleExceptionVector+0x214>
//
// 14 Clocks:
// l32r a2, 40100280 <_DoubleExceptionVector+0x210>
// l32r a3, 40100270 <_DoubleExceptionVector+0x200>
//
// 7 Clocks:
// l32r a3, 40100270 <_DoubleExceptionVector+0x200>
//
// These take 25 Clocks... Ok... this is getting weird.
// l32r a0, 40100280 <_DoubleExceptionVector+0x210> (These are actually movi a2, (some constant))
// l32r a2, 40100280 <_DoubleExceptionVector+0x210>
// l32r a3, 40100270 <_DoubleExceptionVector+0x200>
// l32r a4, 40100284 <_DoubleExceptionVector+0x214>
//
// 33 Clock... WAT??
// l32r a0, 40100280 <_DoubleExceptionVector+0x210>
// l32r a2, 40100280 <_DoubleExceptionVector+0x210>
// l32r a4, 40100280 <_DoubleExceptionVector+0x210>
// l32r a0, 40100280 <_DoubleExceptionVector+0x210>
// l32r a2, 40100280 <_DoubleExceptionVector+0x210>
//
// 9 Clocks: (If you want to cleverly pull data out of a table)
// l32r a0, 40100160 <_DoubleExceptionVector+0xf0>
// (ONE Optional nop here)
// l32i.n a1, a0, 0
//
//
// BIG NOTE About jumping. If the instruction you are jumping to ends in a 11 as the last two bits, it will take ONE extra cycle to complete.
// JUMPS TAKE ONE EXTRA CYCLE WHEN 11 is last two bits: CONFIRMED. This lets us place 3 instructions in and it will still take just as long!
//
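// Whenever reading with any kind of load, the next instruction cannot use the loaded data for free: a dependent instruction right after the load stalls for one extra cycle (see the 3-clock l32i.n/add.n example above).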
//
// mul16u a, b, c <<1 cycle, always.
//
//
/*
this is 5 cycles... Multiplies take two cycles to complete, but only one to issue.
mul16u a7, a5, a6
bbsi a7, 0, skipper
_nop.n
_nop
skipper:
...ALSO 5
mul16u a7, a5, a6
nop
bbsi a7, 0, skipper
_nop.n
_nop
skipper:
... ALSO 5
mul16u a7, a5, a6
mul16u a5, a5, a6
bbsi a7, 0, skipper
_nop.n
_nop.n
skipper:
...BUT if we use one multiply in another multiply it stalls. (6 cycles)
mul16u a7, a5, a6
mul16u a7, a5, a7
bbsi a7, 0, skipper
_nop.n
_nop.n
skipper:
*/
/*
Calling...
.align 4
my_test_call:
ret
elsewhere..
_call0 my_test_call
//IF the last two bits of the address the CALL line is on are:
// 00 = 7 cycles
// 01 = 6 cycles
// 10 = 6 cycles
// 11 = 7 cycles
*/
/*
Revisiting l32
//A4 = some place within iram, aligned. Unaligned reads will crash regardless of if they're from iram or dram.
//7, 8 cycles (Or +1 if there is a dependency)
//
// If l32r's address ends in:
// 00: 7 cycles
// 01: 7 cycles
// 10: 8 cycles
// 11: 8 cycles
_l32r a6, my_test_data
add a6, a7, a7
//l32i behaves exactly the same!!! Btw! Can only load from an address that ends in 00.
You can't seem to l8ui from iram. It seems to always crash.
//A4 = some place within dram
//2 Cycles
_l32i.n a6, a4, 0
add a7, a7, a7
//3 Cycles
_l32i.n a7, a4, 0
add a7, a7, a7
//2 Cycles (If a4 is aligned or not, regardless)
_l8ui a6, a4, 0
add a7, a7, a7 (3 cycles if the dependency exists)
*/
////////////////////////////////////////////////////
#if 0
//NOTES: We are hooked up to USB by GPIO12: D-? (White), GPIO14: D-, GPIO5 EXTDEBUG
//NOTE(review): the line above labels both GPIO12 and GPIO14 as D-, while the
// macros below treat bit 12 as "plus" and bit 14 as "minus" -- confirm wiring.
//
//GPIO bit numbers, kept as string literals so they can be pasted directly into
// inline-assembly templates (e.g. "bbci a2, " STDPLUS ", label").
#define STDPLUS "12"
#define STDMINUS "14"
//GPSET loads a2 = 0x60000304, a4 = 0x60000308 and a3 = 32 (bit 5, i.e. GPIO5)
// so that HIGH and LOW below can each toggle the pin with a single store.
//NOTE(review): a2, a3 and a4 are written without being declared as clobbers.
#define GPSET __asm__ __volatile__ ("movi a2, 0x60000304\nmovi a4, 0x60000308\nmovi.n a3, 32\n"); //GPIO5
//LOW stores a3 to the address held in a4; HIGH stores a3 to the address held
// in a2. Both rely on GPSET having loaded those registers beforehand.
#define LOW __asm__ __volatile__ ("s32i.n a3, a4, 0"); //One instruction.
#define HIGH __asm__ __volatile__ ("s32i.n a3, a2, 0");
//LOWS / HIGHS are the variants used inside gpio_intr(): they store a14 to the
// address held in a13 (LOWS) or a12 (HIGHS). gpio_intr() loads a12, a13 and a14
// with STR_PIN_OUT_SET, STR_PIN_OUT_CLEAR and 1<<DEBUGPIN respectively.
//NOTE(review): "a3" is listed as a clobber although the template never writes it.
#define LOWS __asm__ __volatile__ ("s32i.n a14, a13, 0":::"a3");
#define HIGHS __asm__ __volatile__ ("s32i.n a14, a12, 0":::"a3");
//Detailed analysis of some useful stuff and performance tweaking: http://naberius.de/2015/05/14/esp8266-gpio-output-performance/
//Reverse engineered boot ROM can be helpful, too: http://cholla.mmto.org/esp8266/bootrom/boot.txt
//ACKNOWLEDGE_INTERRUPT clears the pending GPIO interrupt status. It is the
// hand-written equivalent of:
// uint32_t gpio_status = GPIO_REG_READ(GPIO_STATUS_ADDRESS);
// GPIO_REG_WRITE(GPIO_STATUS_W1TC_ADDRESS, gpio_status); //clear interrupt status
//
//GPIO Base Register(PERIPHS_GPIO_BASEADDR): 0x60000300 + GPIO Status Register: 0x1c = 0x6000031c
//GPIO_STATUS_W1TC_ADDRESS = 0x24 (so the write goes to 0x60000324)
//
//The template overwrites a2 and a3 and accesses memory-mapped registers, so
// both registers and "memory" are declared as clobbers; without them the
// compiler is free to keep live values in a2/a3 across this statement.
#define ACKNOWLEDGE_INTERRUPT __asm__ __volatile__ ("movi a2, 0x6000031c \n l32i.n a3, a2, 0 \n movi a2, 0x60000324 \n s32i.n a3, a2, 0" : : : "a2", "a3", "memory");
// ETS_INTR_DISABLE(ETS_GPIO_INUM) ETS_GPIO_INUM=4,
//#define ETS_INTR_ENABLE(inum) \
// ets_isr_unmask((1<<inum))
//#define ETS_INTR_DISABLE(inum) \
// ets_isr_mask((1<<inum))
//These still aren't very fast. I am sure more can be done.
//Disable / enable one interrupt by calling _xtos_ints_off() / _xtos_ints_on()
// with the mask 1<<4; 4 is the ETS_GPIO_INUM value quoted in the note above.
//Each macro already ends in ';', so "MACRO;" at the call site expands to an
// extra empty statement.
#define FAST_ETS_INTR_DISABLE_GPIO _xtos_ints_off(1<<4);
#define FAST_ETS_INTR_ENABLE_GPIO _xtos_ints_on(1<<4);
/* Charles' useful assembly guide for the ESP:
* don't jump if bit in register set: bbci a2, 14, 40100379 <gpio_intr+0x11>
* jump if bit in register set: bbsi a2, 14, 40100379 <gpio_intr+0x11>
* movi a2, 0x60000304 < move the address of
* movi.n a3, 4096 < set the value
* s32i.n a3, a4, 0 < Update it in memory
*/
//Read the CCOUNT special register (the counter the timing notes above measure
// "clocks" with).
//XXX Careful: the result is deliberately returned as a signed "int" so that
// callers doing arithmetic on it let the value wrap.
static inline int __attribute__((always_inline)) get_ccount(void)
{
	unsigned cycles;

	__asm__ __volatile__ ("rsr %0, ccount" : "=r"(cycles));
	return (int)cycles;
}
//static inline unsigned __attribute__((always_inline)) wait_bit()
//wait_bit: advance the deadline "bittime" by one bit time (53 CPU cycles) and
// busy-wait until CCOUNT reaches it. Expects an int variable named "bittime"
// in the enclosing scope. Used as a bare statement (no trailing ';').
//
//The deadline test is done on the signed *difference* (deadline - now) rather
// than by comparing the two absolute counts: a direct "bittime > get_ccount()"
// gives the wrong answer whenever the counter crosses the signed-overflow
// point between setting the deadline and reaching it. The additions and
// subtractions are carried out in unsigned arithmetic so wrap-around is well
// defined.
#define wait_bit \
{ \
	bittime = (int)((unsigned)bittime + 53u); \
	while( (int)((unsigned)bittime - (unsigned)get_ccount()) > 0 ); \
}
//WBV ("wait-bit value"): evaluates to 1 if the DPLUS bit is set in the word
// read through "pins", otherwise 0. Expects a pointer named "pins" in the
// enclosing scope; _BV and DPLUS are defined outside this snippet.
#define WBV (((*pins)&_BV(DPLUS))?1:0)
//Plan of attack:
//1: Spin at beginning waiting for a bit transition.
//2: Start "wait bit"ting every 53 cycles.
//Iteration limit for the C polling loops (only compiled under SLOW_START in
// gpio_intr); the assembly polling loops use their own hard-coded count of 1023.
#define TIMEOUT 0xff
/*
 * gpio_intr() - experimental GPIO interrupt handler that samples the D+ line
 * by busy-waiting on CCOUNT (53 cycles per bit).
 *
 * Sequence:
 *  1. Load a11..a14 with the pin-input address, the output set/clear
 *     addresses and the debug-pin mask, then turn the GPIO interrupt off.
 *  2. Wait (with a 1023-iteration timeout each) for D+ to be low, then high.
 *  3. Snapshot CCOUNT (minus 12 cycles) as the bit-timing reference.
 *  4. Sample every 53 cycles until two consecutive low samples are seen.
 *  5. Sample 16 further bits into bits[].
 *  6. Acknowledge the GPIO interrupt and turn it back on.
 * HIGHS / LOWS toggle the debug pin around each step.
 *
 * Changes relative to the original version:
 *  - Registers written by the assembly (a9, a10) are now declared to the
 *    compiler, as clobbers or as the "bittime" operand, instead of being
 *    modified behind its back.
 *  - The CCOUNT polling loops compare the signed difference (deadline - now)
 *    instead of the two absolute values, so they keep working when CCOUNT
 *    crosses the signed-overflow point.
 *  - The unused local "j" is removed.
 *
 * NOTE(review): the assembly still jumps between separate asm statements
 * (to end_gpio_intr) and reads a11..a14 by hard-coded name; GCC is not told
 * about either. The labels are plain global-style labels, so this function
 * must not be inlined or duplicated by the compiler.
 * NOTE(review): bits[] is filled but never read while the printf block is
 * commented out.
 */
void gpio_intr(void)
{
	register unsigned to asm ("a9"); //timeout
	register int bittime asm ("a10");
	volatile register uint32_t * pins asm ("a11");
	//Variables used for debugging.
	volatile register uint32_t * pin_out_set asm ("a12");
	volatile register uint32_t * pin_out_clr asm ("a13");
	volatile register int psel asm("a14");
	uint32_t bits[16];
	unsigned i;
	//Initialize common-use variables
	__asm__ __volatile__("\n\
	movi a11, "STR_PIN_IN"\n\
	movi a12, "STR_PIN_OUT_SET"\n\
	movi a13, "STR_PIN_OUT_CLEAR"\n\
	movi a14, %0\n\
	" : : "M"(1<<DEBUGPIN) : "a11", "a12", "a13", "a14" );
	FAST_ETS_INTR_DISABLE_GPIO;
	HIGHS
//#define SLOW_START
#ifdef SLOW_START
	//Wait for it to go low.
	__asm__ __volatile__("mstart:\n" );
	for( to = TIMEOUT; to; to-- )
	{
		if( !(PIN_IN & _BV(DPLUS)) ) break;
	}
	//Wait for it to go high.
	for( to = TIMEOUT; to; to-- )
	{
		if( (PIN_IN & _BV(DPLUS)) ) break;
	}
	__asm__ __volatile__("mend:\n" );
#else
	//First, we wait for the signal to go (or be) low. Then we wait for the signal to go high.
	//a9 is the countdown and a2 the sampled pin word; both are clobbered.
	__asm__ __volatile__("\n\
	movi a9, 1023 #timeout \n\
	find_low:\n\
	l32i.n a2, a11, 0\n\
	bbci a2, " STDPLUS ", done_low\n\
	addi.n a9, a9, -1\n\
	bnez a9, find_low\n\
	j end_gpio_intr\n\
	done_low:\n\
	movi a9, 1023 #timeout \n\
	find_high:\n\
	l32i.n a2, a11, 0\n\
	bbsi a2, " STDPLUS ", done_high\n\
	addi.n a9, a9, -1\n\
	bnez a9, find_high\n\
	j end_gpio_intr\n\
	done_high:\n\
	" : : : "a2", "a9" );
#endif
	LOWS
	// bittime = get_ccount() + 42 - 53; //Each bit is 53 cycles, we are shooting for the middle of the coming bit, but give us back a few us..
	//GCC Seems pretty smart, this actually translates to moving the value into r10 and adding some values.
	//Scratch that. GCC messes this up, too.
	//This does effectively take a snapshot of the start of the tracking.
	//bittime is pinned to a10, so %0 expands to a10 exactly as before; making
	// it an output operand tells GCC that the variable is written here.
	__asm__ __volatile__("\n\
	rsr %0, ccount\n\
	addi %0, %0, -12\n\
	" : "=r"(bittime) );
	/*
	//Code for assembly below.
	//Wait for a double 0.
	for( to = TIMEOUT; to; to-- )
	{
	HIGHS
	wait_bit
	LOWS
	if( !(PIN_IN&_BV(DPLUS)) )
	{
	HIGHS
	wait_bit
	LOWS
	if( !(PIN_IN&_BV(DPLUS)) ) break;
	}
	}
	*/
	//Wait for a double-low bit.
	//R10 (bittime, %0) contains when we should stop reading next.
	//Each wait loop computes a3 = deadline - ccount and keeps spinning while
	// that signed difference is >= 0 (wrap-safe form of "deadline >= now").
	__asm__ __volatile__("\n\
	movi a9, 1023 #timeout \n\
	find_double_low:\n\
	addi.n a9, a9, -1\n\
	beqz a9, end_gpio_intr\n\
	addi %0, %0, 53\n\
	\n\
	wait_double_loop_int:\n\
	rsr a3, ccount\n\
	sub a3, %0, a3\n\
	bgez a3, wait_double_loop_int\n\
	# Time to read another bit.\n\
	l32i.n a3, a11, 0\n\
	memw\n\
	bbsi a3, " STDPLUS ", find_double_low\n\
	# Got a 0! Let's see if that happens again.\n\
	addi %0, %0, 53\n\
	wait_double_loop_int2:\n\
	rsr a3, ccount\n\
	sub a3, %0, a3\n\
	bgez a3, wait_double_loop_int2\n\
	# Poll again.\n\
	l32i.n a3, a11, 0\n\
	memw\n\
	bbsi a3, " STDPLUS ", find_double_low\n\
	" : "+r"(bittime) : : "a3", "a9" );
	//Ok! We now can use wait_bit to wait for the next bit then receive using WBV.
	for(i = 0; i < 16; i++ )
	{
		HIGHS
		wait_bit
		LOWS
		bits[i] = WBV;
		// GPSET; //Setup GPIO
		// LOW;
		// HIGH;
	}
	HIGHS
	/*
	printf( "\n" );
	for(i = 0; i < 16; i++ )
	{
	printf( "%d", bits[i] );
	}
	LOWS
	printf( "\n" );*/
	//Common exit point; the timeout paths in the assembly above jump here.
	__asm__ __volatile__("end_gpio_intr:");
	LOWS
	ACKNOWLEDGE_INTERRUPT;
	//ETS_GPIO_INTR_ENABLE();
	FAST_ETS_INTR_ENABLE_GPIO;
	//XT_CLI;
}
#endif
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
C
1
https://gitee.com/huaxiuyiwei/espusb.git
git@gitee.com:huaxiuyiwei/espusb.git
huaxiuyiwei
espusb
espusb
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385