/*************************************************************************** * Copyright (C) 2015, 2016 by Terraneo Federico * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * As a special exception, if other files instantiate templates or use * * macros or inline functions from this file, or you compile this file * * and link it with other works to produce a work based on this file, * * this file does not by itself cause the resulting work to be covered * * by the GNU General Public License. However the source code for this * * file must still be made available in accordance with the GNU General * * Public License. This exception does not invalidate any other reasons * * why a work based on this file might be covered by the GNU General * * Public License. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, see * ***************************************************************************/ #include "timeconversion.h" #ifdef TEST_ALGORITHM #include #include #include #include #include static bool print=true; #define P(x) if(print) std::cout<<#x<<'='<>32; } /** * \param a 32 bit unsigned number * \param b 32 bit unsigned number * \return a * b as a 64 unsigned number */ static inline unsigned long long mul32x32to64(unsigned int a, unsigned int b) { //Casts are to produce a 64 bit result. Compiles to a single asm instruction //in processors having 32x32 multiplication with 64 bit result return static_cast(a)*static_cast(b); } unsigned long long mul64x32d32(unsigned long long a, unsigned int bi, unsigned int bf) { /* * The implemntation is a standard multiplication algorithm: * | 64bit | 32bit | Frac part * ---------------------------------------------------------------------- * | hi(a) | lo(a) | 0 x * | | bi | bf = * ====================================================================== * | hi(bi*lo(a)) | lo(bi*lo(a)) | 0 | - * hi(bi*hi(a)) | +lo(bi*hi(a)) | | | * | | +hi(bf*lo(a)) | lo(bf*lo(a)) | 0 * | +hi(bf*hi(a)) | +lo(bf*hi(a)) | | * ---------------------------------------------------------------------- * 96bit | 64bit | 32bit | Frac part * (Discarded) | <------ returned part ------> | (Disacrded) * * Note that [hi(bi*lo(a))|lo(bi*lo(a))] and [hi(bf*hi(a))|lo(bf*hi(a))] * are two 64 bit numbers with the same alignment as the result, so * result can be rewritten as * bi*lo(a) + bf*hi(a) + hi(bf*lo(a)) + lo(bi*hi(a))<<32 * * The arithmetic rounding is implemented by adding one to the result if * lo(bf*lo(a)) has bit 31 set. That is, if the fractional part is >=0.5. * * This code (without arithmetic rounding) takes around 30..40 clock cycles * on Cortex-A9 (arm/thumb2), Cortex-M3 (thumb2), Cortex-M4 (thumb2), * ARM7TDMI (arm), with GCC 4.7.3 and optimization levels -Os, -O2, -O3. * * TODO: this code is *much* slower with architectures missing a 32x32 * multiplication with 64 bit result, probably in the thousands of clock * cycles. An example arch is the Cortex-M0. How to deal with those arch? */ unsigned int aLo=lo(a); unsigned int aHi=hi(a); unsigned long long result=mul32x32to64(bi,aLo); result+=mul32x32to64(bf,aHi); unsigned long long bfaLo=mul32x32to64(bf,aLo); result+=hi(bfaLo); //Uncommenting this adds arithmetic rounding (round to nearest value). //Leaving it commented out always rounds towards lowest and makes the //algorithm significantly faster by not requiring a branch //if(bfaLo & 0x80000000) result++; //This is a 32x32 -> 32 multiplication (upper 32 bits disacrded), with //the 32 bits shifted to the upper word of a 64 bit number result+=static_cast(bi*aHi)<<32; //Caller is responsible to never call this with values that produce overflow return result; } /** * \param x a long long * \return true if x>=0 */ static inline bool sign(long long x) { return x>=0; } /** * \param x a long long * \return |x| * * Note that compared to llabs this function returns an unsigned number, and * is also sure to be inlined */ static inline unsigned long long uabs(long long x) { long long result= sign(x) ? x : -x; return static_cast(result); } /* * A note on the issues of tick to nanosecond conversion and back. * Converting a number in tick to nanoseconds requires multiplying the ticks by * a coefficient M, while doing the opposite requires a multiplication by 1/M. * This code assumes that the tick frequency is less than 1GHz, so that M > 1, * and thus 1/M < 1. Doing an exact multiplication would be too expensive * computationally, so the coefficients are rounded in a 32.32 fixed point * representation. In this representation, numbers are in the form * A+B/2^32, where A and B are two unsigned 32 bit numbers. * * The essential requirements on this approximation are two: * - the error of the conversion from tick to nanosecond should be as small * as possible * - if a value in ticks is converted to nanoseconds and then converted back to * ticks, the result in ticks should be again the original value in ticks * that we started with, with a very small error (say less than two ticks). * For this reason, the error for the conversion from nanosecond to tick * should be equal and opposite in sign to the tick to nanosecond error. * * The reason for the first requirement is easy to understand, the tick to * nanosecond conversion is used when reading the hardware clock, so errors * in this conversion result in errors in time measurement. * Also, this requirement is easy to achieve with the given representation, * as the M coefficient is greater than 1, and a fixed point representation * has a high resolution when storing large numbers. * Tests done with TEST_ALGORITHM have shown that the tick to nanosecond is * zero for some relevant frequencies, and is generally below 0.04ppm, * or an error that accumulates at a rate of 1.26 seconds per year of uptime, * which is lower than that of most clock crystals. * * The second requirement requires some more explanation. Assume a typical use * case for the timing subsystem: a task gets the current time, adds a small * delay, say 1ms, and does an absolute sleep till that time point. * The former operation, getting the time, requires a tick to nanosecond * conversion, while the latter a nanosecond to tick for setting the interrupt. * If these two conversions, that are here called a "round trip" would introduce * an error of many ticks, that would make the sleep imprecise. At first * this may seem a non-issue, as the errors are in the parts per million range, * but the key point is that we are working with absolute times. So after * one year of uptime, a mere mismatch of 0.04ppm between the two * conversion coefficients will make our 1ms sleep become a 1261ms one, * or -depending on the error sign- a sleep in the past! * * This issue is also difficult to solve, as the back conversion coefficient is * less than 1, and the fixed point stores small numbers with less resolution, * so the error of the nanosecond to tick conversion can grow as large as * 10ppm for tick frequencies in the 10KHz range. The solution to this problem * is twofold. First, an offline optimization of the back conversion coefficient * is done int the TimeConversion constructor, with the aim of having its error * as much as possible equal and opposite in sign to the tick to nanosecond * error. * Then, an online round trip adjustment is done in ns2tick(), by computing the * round trip offset and subtracting it to the conversion, in order to make * sure the round trip error is less than two ticks. * As the online adjustment is a little expensive, the result is cached, and for * conversion which are "near" to the one at the previous nanosecond to tick * call the same offset is used. The range that is considered "near" is * again computed offline, as it depends on how good the offline optimization * was at making one conversion coefficient the opposite of the other, but is * generally in the range of a few seconds to a few tens of seconds. */ // // class TimeConversion // long long TimeConversion::ns2tick(long long ns) { /* * This algorithm does the online adjustment to compensate for round trip * error that accumulates over time. It is probably the least intuitive * algorithm in this file, so it requires careful explanation. * If we are close enough to te last adjustment we simply perform the * conversion and add adjustOffsetNs, which is the cached value. Otherwise, * we need to recompute the offset. This part is a straightforward caching. * * The first implementation of the recomputation was a simple * * adjustOffsetNs=ns-convert(convert(ns,toTick),toNs); * lastAdjustTimeNs=ns; * * which is straightforward, but it failed when tested with very long * ns values (100+ years) and low tick frequencies (8MHz or less). * The reason for this is that the round trip computation first converts * the given ns value to ticks, which incurs in the error that we want to * get rid of, and then converts it back to nanoseconds, resulting in a * value that we call ns2. However, since we are adjusting the * first conversion to be equal to the second one, the adjust value so * obtained is good for compensating the error in a range of amplitude * adjustIntervalNs around ns2, NOT around ns! * So, if the error grows so large that ns2-ns is greater than * adjustIntervalNs, the simple round trip written above does not zero * the error. To solve this, an iterative algorithm is needed. * * Somewhere else ns2tick * in the kernel * * tick tick2 --+ * | ^ | * | | | * V | V * ns -----------+ ns2 * * The following diagram explains it further. Soewhere else in the kernel * a value in ticks was read from the hardware counter, and converted to ns. * Within ns2tick we don't know what tick is, and if we're doing a round * trip adjustment we start by back converting ns, resulting in tick2 * which is different from tick, because the point of all this is that * the conversion from ns to tick is imprecise. If we then complete the * round trip and naively compute adjustOffsetNs as the difference from * ns2 and ns, the resulting adjustment when applied to ns will only * give us the correct answer, tick, if the difference between ns and ns2 * is less than adjustIntervalNs. * * The algorithm that fixes this works as follows. Before the for loop, * there's an adjustOffsetNs=0 statement. * At the first iteration we try to do a round trip around * ns+adjustOffsetNs, but adjustOffsetNs is zero, so we get the same result * as the previous round trip described above. As the result of this * round trip gets us ns2, which as said earlier is the center of the * validity range for the round trip, we store it in our lastAdjustTimeNs * variable. Then we compute the adjust offset as the original time point, * ns+adjustOffsetNs, minus the round trip value ns2. * Then we check the validity range of our find. The check is whether we * are in a range which is half the size of our real validity range. This * is a compromise, as using the full validity range would lower the number * of iterations of the algorithm that will produce a correct answer, but * will diminish the value of caching, as ns2tick will be likely called with * values close to the previous ns one, and if we lastAdjustTimeNs compared * to ns is at edge of the range, later calls are likely to require a * readjustment. To utilize the caching range fully, we would ideally want * to stop when ns==lastAdjustTimeNs, but this will require too many * iterations and would slow down the algorithm. * Anyway, if a single iteration isn't enough, we iterate again, using * the previous adjustOffsetNs as a guess to get make tick2 closer to * tick this time. The maximum number of iterations that was observed is * just 3, so the iteration is particularly efficient. * As you can see, the for loop has a hard limit of 5 iterations. This is * NOT expected to occur in practice, and has only been done to be 100% * sure that this algorithm won't turn into an infinite loop when given * off-design numbers, such as negative numbers. * The last thing worth mentioning is that a previous implementation omitted * the adjustOffsetNs=0 before the for loop. As explained, each iteration * works by using the previous adjustOffsetNs as a guess for the next one, * and the initializtion of was omitted to use the previous value as a guess * for the new adjustment. While this was shown to reduce the number of * iterations in some cases, if ns2tick() was first called with a very large * value, and then with a very small one, a negative adjustOffsetNs would * result in underflow and caused wrong results to bbe produced. * Also, the casts to unsigned are needed because ns+adjustOffsetNs was * shown to overflow for values of ns close to the maximum number a * long long can hold and positive adjustOffsetNs, but uns+adjustOffsetNs * can hold positive numbers far greater than its signed couterpart. * * This algorithm was benchmarked on an efm32 Cortex-M4 microcontroller * running at 48MHz, and this is the runtime: * using cached value 115 cycles * readjust 1 iteration 284 cycles * readjust 2 iterations 438 cycles */ //Negative numbers for ns are not allowed, cast is safe auto uns=static_cast(ns); if(uabs(static_cast(uns-lastAdjustTimeNs))>adjustIntervalNs) { adjustOffsetNs=0; for(int i=0;i<5;i++) { #ifdef TEST_ALGORITHM ITERATION; #endif //TEST_ALGORITHM lastAdjustTimeNs=convert(convert(uns+adjustOffsetNs,toTick),toNs); adjustOffsetNs=static_cast((uns+adjustOffsetNs)-lastAdjustTimeNs); if(uabs(static_cast(uns-lastAdjustTimeNs))<(adjustIntervalNs>>1)) break; } #ifdef TEST_ALGORITHM NL; #endif //TEST_ALGORITHM } return static_cast(convert(uns+adjustOffsetNs,toTick)); } TimeConversion::TimeConversion(unsigned int hz) : lastAdjustTimeNs(0), adjustOffsetNs(0) { // // As a first part, compute the initial toNs and toTick coefficients // float hzf=static_cast(hz); toNs=floatToFactor(1e9f/hzf); toTick=floatToFactor(hzf/1e9f); // // Then perform the bisection algorithm to optimize toTick offline // /* * For choosing the time point on which to do the toTick coefficient * optimization, we need both an as high as possible number, but also * without fear of overflow. 1<<62 is ~146years, but even if during the * bisection the number nearly doubles, no overflow occurs. */ const unsigned long long longUptimeNs=1ULL<<62; const unsigned long long longUptimeTick=convert(longUptimeNs,toTick); /* * Max correction 25ppm (1/400000=25ppm). This value is a guess based on * the observed correction values when running with TEST_ALGORITHM, and has * the advantage that the number of iterations is lower when hz is lower, * thus being less time consuming on slow CPUs */ int aDelta=toTick.fractionalPart()/40000; int bDelta=-aDelta; long long aError=computeRoundTripError(longUptimeTick,aDelta); long long bError=computeRoundTripError(longUptimeTick,bDelta); if(sign(aError)!=sign(bError)) { //Different sign, do a binary search for(;;) { #ifdef TEST_ALGORITHM ITERATION; #endif //TEST_ALGORITHM int cDelta=(aDelta+bDelta)/2; if(cDelta==aDelta || cDelta==bDelta) break; long long cError=computeRoundTripError(longUptimeTick,cDelta); if(sign(aError)==sign(cError)) { aDelta=cDelta; aError=cError; } else { bDelta=cDelta; bError=cError; } } } int delta=uabs(aError)>=1; adjustIntervalNs>>=1; } adjustIntervalNs>>=1; #ifdef TEST_ALGORITHM double adjustInterval=static_cast(adjustIntervalNs)/1e9; P(adjustInterval); NL; #endif //TEST_ALGORITHM /* * This constructor was benchmarked on an efm32 Cortex-M4 microcontroller * running at 48MHz, and this is the runtime: * tick freq 32768 2458 cycles * tick freq 400MHz 4800 cycles */ } long long TimeConversion::computeRoundTripError(unsigned long long tick, int delta) const { auto adjustedToTick=toTick+delta; unsigned long long ns=convert(tick,toNs); unsigned long long roundTrip=convert(ns,adjustedToTick); return static_cast(tick-roundTrip); } TimeConversionFactor TimeConversion::floatToFactor(float x) { const float twoPower32=4294967296.f; //2^32 as a float unsigned int i=x; unsigned int f=(x-i)*twoPower32; return TimeConversionFactor(i,f); } } //namespace miosix //Testsuite for multiplication algorithm and factor computation. Compile with: // g++ -std=c++11 -O2 -DTEST_ALGORITHM -o test timeconversion.cpp; ./test #ifdef TEST_ALGORITHM using namespace std; using namespace miosix; void printRoundTripError(TimeConversion& tc, long long tick) { long long roundTripError=tick-tc.ns2tick(tc.tick2ns(tick)); P(roundTripError); NL; } long double coefficient(unsigned int bi, unsigned int bd) { //Recompute rounded value using those coefficients. The aim of this test //is to assess the multiplication algorithm, so we use integer and //fractional part back-converted to a long double that contains the exact //same value as the fixed point number const long double twoPower32=4294967296.; //2^32 as a long double return static_cast(bd)/twoPower32+static_cast(bi); } void test(double b, unsigned int bi, unsigned int bd, int iterations) { long double br=coefficient(bi,bd); P(bi); P(bd); //This is the error of the 32.32 representation and factor computation double errorPPM=(b-br)/b*1e6; P(errorPPM); srand(0); for(int i=0;i(aHi)<<32 | static_cast(aLo); unsigned long long result=mul64x32d32(a,bi,bd); //Our reference is a long double multiplication unsigned long long reference=static_cast(a)*br; assert(uabs(result-reference)<2); } NL; } void testns2tick(TimeConversion& tc, int iterations) { //First, we get the tick value that results in the maximum ns value //that fits in a long long unsigned int bi,bd; bi=tc.getTick2nsConversion().integerPart(); bd=tc.getTick2nsConversion().fractionalPart(); long double toNs=coefficient(bi,bd); long long maxTick=numeric_limits::max()/toNs; //Care about rounding while(tc.tick2ns(maxTick)<0) maxTick--; print=false; srand(0); //Fully random test for(int i=0;i(rand() & 0x7fffffff)<<32 | static_cast(rand()); a=a % maxTick; assert(uabs(tc.ns2tick(tc.tick2ns(a))-a)<2); } //Large and small test, was proven to check some corner cases for(int i=0;i(rand() & 0x7fffffff)<<32 | static_cast(rand()); a=a % maxTick; assert(uabs(tc.ns2tick(tc.tick2ns(a))-a)<2); long long b=static_cast(rand()); b=b % maxTick; assert(uabs(tc.ns2tick(tc.tick2ns(b))-b)<2); } //Largest and small test for(int i=0;i(rand()); b=b % maxTick; assert(uabs(tc.ns2tick(tc.tick2ns(b))-b)<2); } print=true; } int main() { vector freqs={10000, 32768, 100000, 1e6, 8e6, 24e6, 48e6, 72e6, 84e6, 120e6, 168e6, 180e6, 400e6}; for(double d : freqs) { cout<<"==== "<(tc.getAdjustInterval())/1e9*d*0.49; printRoundTripError(tc,tick-delta); printRoundTripError(tc,tick+delta); b=1e9/d; bi=tc.getTick2nsConversion().integerPart(); bd=tc.getTick2nsConversion().fractionalPart(); test(b,bi,bd,1000000); b=d/1e9; bi=tc.getNs2tickConversion().integerPart(); bd=tc.getNs2tickConversion().fractionalPart(); test(b,bi,bd,1000000); testns2tick(tc,1000000); cout<