From 6ba960ce47229a1d34235cd718a0c66d6073f7f6 Mon Sep 17 00:00:00 2001 From: geeksville Date: Tue, 19 May 2020 14:54:47 -0700 Subject: [PATCH] one hop reliable ready for testing --- docs/software/mesh-alg.md | 21 ++++++++----- src/mesh/ReliableRouter.cpp | 61 +++++++++++++++++++++++++++++++++++-- src/mesh/ReliableRouter.h | 51 +++++++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 10 deletions(-) diff --git a/docs/software/mesh-alg.md b/docs/software/mesh-alg.md index 4bf2afa3..fe5d9752 100644 --- a/docs/software/mesh-alg.md +++ b/docs/software/mesh-alg.md @@ -8,12 +8,13 @@ reliable messaging tasks (stage one for DSR): - DONE add a max hops parameter, use it for broadcast as well (0 means adjacent only, 1 is one forward etc...). Store as three bits in the header. - DONE add a 'snoopReceived' hook for all messages that pass through our node. - DONE use the same 'recentmessages' array used for broadcast msgs to detect duplicate retransmitted messages. -- in the router receive path?, send an ack packet if want_ack was set and we are the final destination. FIXME, for now don't handle multihop or merging of data replies with these acks. -- keep a list of packets waiting for acks -- for each message keep a count of # retries (max of three). Local to the node, only for the most immediate hop, ignorant of multihop routing. -- delay some random time for each retry (large enough to allow for acks to come in) -- once an ack comes in, remove the packet from the retry list and deliver the ack to the original sender -- after three retries, deliver a no-ack packet to the original sender (i.e. the phone app or mesh router service) +- DONE in the router receive path?, send an ack packet if want_ack was set and we are the final destination. FIXME, for now don't handle multihop or merging of data replies with these acks. +- DONE keep a list of packets waiting for acks +- DONE for each message keep a count of # retries (max of three). Local to the node, only for the most immediate hop, ignorant of multihop routing. +- DONE delay some random time for each retry (large enough to allow for acks to come in) +- DONE once an ack comes in, remove the packet from the retry list and deliver the ack to the original sender +- DONE after three retries, deliver a no-ack packet to the original sender (i.e. the phone app or mesh router service) +- test one hop ack/nak with the python framework dsr tasks @@ -21,9 +22,15 @@ dsr tasks - when sending, if destnodeinfo.next_hop is zero (and no message is already waiting for an arp for that node), startRouteDiscovery() for that node. Queue the message in the 'waiting for arp queue' so we can send it later when then the arp completes. - otherwise, use next_hop and start sending a message (with ack request) towards that node. - Don't use broadcasts for the network pings (close open github issue) +- add ignoreSenders to myNodeInfo to allow testing different mesh topologies by refusing to see certain senders +- test multihop delivery with the python framework -optimizations: +optimizations / low priority: +- low priority: think more careful about reliable retransmit intervals +- make ReliableRouter.pending threadsafe +- bump up PacketPool size for all the new ack/nak/routing packets +- handle 51 day rollover in doRetransmissions - use a priority queue for the messages waiting to send. Send acks first, then routing messages, then data messages, then broadcasts? when we receive any packet diff --git a/src/mesh/ReliableRouter.cpp b/src/mesh/ReliableRouter.cpp index 6ea884ca..02833f8c 100644 --- a/src/mesh/ReliableRouter.cpp +++ b/src/mesh/ReliableRouter.cpp @@ -83,17 +83,72 @@ void ReliableRouter::sendAckNak(bool isAck, NodeNum to, PacketId idFrom) send(p); } +#define NUM_RETRANSMISSIONS 3 + +PendingPacket::PendingPacket(MeshPacket *p) +{ + packet = p; + numRetransmissions = NUM_RETRANSMISSIONS - 1; // We subtract one, because we assume the user just did the first send + setNextTx(); +} + /** * Stop any retransmissions we are doing of the specified node/packet ID pair */ -void ReliableRouter::stopRetransmission(NodeNum from, PacketId id) {} +void ReliableRouter::stopRetransmission(NodeNum from, PacketId id) +{ + auto key = GlobalPacketId(from, id); + stopRetransmission(key); +} +void ReliableRouter::stopRetransmission(GlobalPacketId key) +{ + auto old = pending.find(key); // If we have an old record, someone messed up because id got reused + if (old != pending.end()) { + auto numErased = pending.erase(key); + assert(numErased == 1); + packetPool.release(old->second.packet); + } +} /** * Add p to the list of packets to retransmit occasionally. We will free it once we stop retransmitting. */ -void ReliableRouter::startRetransmission(MeshPacket *p) {} +void ReliableRouter::startRetransmission(MeshPacket *p) +{ + auto id = GlobalPacketId(p); + auto rec = PendingPacket(p); + + stopRetransmission(p->from, p->id); + pending[id] = rec; +} /** * Do any retransmissions that are scheduled (FIXME - for the time being called from loop) */ -void ReliableRouter::doRetransmissions() {} \ No newline at end of file +void ReliableRouter::doRetransmissions() +{ + uint32_t now = millis(); + + // FIXME, we should use a better datastructure rather than walking through this map. + // for(auto el: pending) { + for (auto it = pending.begin(), nextIt = it; it != pending.end(); it = nextIt) { + ++nextIt; // we use this odd pattern because we might be deleting it... + auto &p = it->second; + + // FIXME, handle 51 day rolloever here!!! + if (p.nextTxMsec <= now) { + if (p.numRetransmissions == 0) { + DEBUG_MSG("Reliable send failed, returning a nak\n"); + sendAckNak(false, p.packet->from, p.packet->id); + stopRetransmission(it->first); + } else { + DEBUG_MSG("Sending reliable retransmission\n"); + send(packetPool.allocCopy(*p.packet)); + + // Queue again + --p.numRetransmissions; + p.setNextTx(); + } + } + } +} \ No newline at end of file diff --git a/src/mesh/ReliableRouter.h b/src/mesh/ReliableRouter.h index 75bedfb6..3798d9d6 100644 --- a/src/mesh/ReliableRouter.h +++ b/src/mesh/ReliableRouter.h @@ -2,6 +2,54 @@ #include "FloodingRouter.h" #include "PeriodicTask.h" +#include + +/** + * An identifier for a globalally unique message - a pair of the sending nodenum and the packet id assigned + * to that message + */ +struct GlobalPacketId { + NodeNum node; + PacketId id; + + bool operator==(const GlobalPacketId &p) const { return node == p.node && id == p.id; } + + GlobalPacketId(const MeshPacket *p) + { + node = p->from; + id = p->id; + } + + GlobalPacketId(NodeNum _from, PacketId _id) + { + node = _from; + id = _id; + } +}; + +/** + * A packet queued for retransmission + */ +struct PendingPacket { + MeshPacket *packet; + + /** The next time we should try to retransmit this packet */ + uint32_t nextTxMsec; + + /** Starts at NUM_RETRANSMISSIONS -1(normally 3) and counts down. Once zero it will be removed from the list */ + uint8_t numRetransmissions; + + PendingPacket() {} + PendingPacket(MeshPacket *p); + + void setNextTx() { nextTxMsec = millis() + random(10 * 1000, 12 * 1000); } +}; + +class GlobalPacketIdHashFunction +{ + public: + size_t operator()(const GlobalPacketId &p) const { return (hash()(p.node)) ^ (hash()(p.id)); } +}; /** * This is a mixin that extends Router with the ability to do (one hop only) reliable message sends. @@ -9,6 +57,8 @@ class ReliableRouter : public FloodingRouter { private: + unordered_map pending; + public: /** * Constructor @@ -50,6 +100,7 @@ class ReliableRouter : public FloodingRouter * Stop any retransmissions we are doing of the specified node/packet ID pair */ void stopRetransmission(NodeNum from, PacketId id); + void stopRetransmission(GlobalPacketId p); /** * Add p to the list of packets to retransmit occasionally. We will free it once we stop retransmitting.