/* * Copyright (c) 2016, 2017, 2018 Jonas 'Sortie' Termansen. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * net/tcp.cpp * Transmission Control Protocol. */ // TODO: Plan: // // - TCP_OFFSET_DECODE. // - Finish socket closing and lifetime. // - Implement waiting 2 MSL after socket is closed. // - Implement sending back RST and such. // - Improve retransmission logic under error conditions. // - os-test all the things. // - Work through the receive/transmit details according to the standards. // - PUSH. // - URG? // - Nagle's algorithm, MSG_MORE, TCP_CORK, TCP_NODELAY, etc. // - TCP options. // - Buffer and segment sizes (respect TCP_MSS). // - Efficient receieve queue when out of order. // - Efficient backlog / half-open? Avoid denial of service attacks? // - Measure average round trip time for efficient retransmission? // - High speed extensions? // - Anti-congestion extensions? // - Selective acknowledgements. // - Documentation. // TODO: Read RFC 793 and comment where each requirement is implemented. // TODO: Read RFC 1122's section on TCP. // TODO: Write tcp(4) documentation. // TODO: os-test tcp. #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef IOV_MAX #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ipv4.h" #include "tcp.h" // TODO: Why 2 times the maximum window size? Document this? #define BUFFER_SIZE (2 * 64 * 1024) namespace Sortix { namespace TCP { class TCPSocket; // TODO: Implement PUSH. // TODO: Implement URG? union tcp_sockaddr { sa_family_t family; struct sockaddr_in in; struct sockaddr_in6 in6; }; // The TCP states per STD 7 (RFC 793). enum tcp_state { TCP_STATE_CLOSED, TCP_STATE_LISTEN, TCP_STATE_SYN_SENT, TCP_STATE_SYN_RECV, TCP_STATE_ESTAB, TCP_STATE_FIN_WAIT_1, TCP_STATE_CLOSE_WAIT, TCP_STATE_FIN_WAIT_2, TCP_STATE_CLOSING, TCP_STATE_LAST_ACK, TCP_STATE_TIME_WAIT, }; enum tcp_special { TCP_SPECIAL_NOT, TCP_SPECIAL_PENDING, TCP_SPECIAL_WINDOW, TCP_SPECIAL_ACKED, }; // TODO: Document tcp_lock, and lifetimes. static kthread_mutex_t tcp_lock = KTHREAD_MUTEX_INITIALIZER; static TCPSocket** bindings_v4; static TCPSocket** bindings_v6; void Init() { if ( !(bindings_v4 = new TCPSocket*[65536]) || !(bindings_v6 = new TCPSocket*[65536]) ) Panic("Failed to allocate TCP Socket bindings"); for ( size_t i = 0; i < 65536; i++ ) { bindings_v4[i] = NULL; bindings_v6[i] = NULL; } } static inline bool mod32_le(tcp_seq a, tcp_seq b) { return (int32_t) (a - b) <= 0; } static inline bool mod32_lt(tcp_seq a, tcp_seq b) { return (int32_t) (a - b) < 0; } static inline bool mod32_ge(tcp_seq a, tcp_seq b) { return (int32_t) (a - b) >= 0; } static inline bool mod32_gt(tcp_seq a, tcp_seq b) { return (int32_t) (a - b) > 0; } static bool IsSupportedAddressFamily(int af) { return af == AF_INET /* TODO: || af == AF_INET6 */; } static size_t AddressFamilySize(int af) { switch ( af ) { case AF_INET: return sizeof(struct sockaddr_in); case AF_INET6: return sizeof(struct sockaddr_in6); } return 0; } // The TCP socket implementation. It is separate from the class TCPSocketNode // as that class is reference counted, but this class manages its own lifetime // so the socket is properly shut down after all references are closed. // // Bound sockets are in a double linked list starting from the appropriate // bindings array indexed by the port, and then the sockets on that port are // doubly linked using prev_socket and next_socket. // // Half-open sockets are in a doubly linked list starting from connecting_half // in the listening socket, and then doubly linked with connecting_prev and // connecting_next (with connecting_parent going back to the listening socket). // // Ready sockets that have not yet been accepted are in a doubly linked list // starting from connecting_ready in the listening socket, and then doubly // linked with connecting_prev and connecting_next (with connecting_parent going // back to the listening socket). // TODO: Document the socket lifetime. // TODO: Document the state in tcp(4). class TCPSocket { friend void HandleIPv4(Ref pkt, const struct in_addr* src, const struct in_addr* dst, bool dst_broadcast); public: TCPSocket(int af); ~TCPSocket(); void close(); void destroy(); Ref accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, int flags); int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); int listen(ioctx_t* ctx, int backlog); ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags); ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, int flags); ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count); ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags); ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count); int poll(ioctx_t* ctx, PollNode* node); int getsockopt(ioctx_t* ctx, int level, int option_name, void* option_value, size_t* option_size_ptr); int setsockopt(ioctx_t* ctx, int level, int option_name, const void* option_value, size_t option_size); int shutdown(ioctx_t* ctx, int how); int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); public: void ProcessPacket(Ref pkt, union tcp_sockaddr* pkt_src, union tcp_sockaddr* pkt_dst); void ReceivePacket(Ref pkt, union tcp_sockaddr* pkt_src, union tcp_sockaddr* pkt_dst); void TransmitWork(); void OnTimer(); // TODO: Review this function carefully. inline bool needs_destruction() { return delete_on_close && !transmit_ongoing && !timer_armed && state == TCP_STATE_CLOSED; } private: short PollEventStatus(); bool ImportAddress(ioctx_t* ctx, union tcp_sockaddr* dest, const void* addr, size_t addrsize); bool CanBind(union tcp_sockaddr new_local); bool BindDefault(const union tcp_sockaddr* new_local_ptr); void UpdateWindow(uint16_t new_window); bool Transmit(); void ScheduleTransmit(); void DoScheduleTransmit(); void SetTimer(); ssize_t recv_unlocked(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); ssize_t send_unlocked(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags); int shutdown_unlocked(int how); bool close_unlocked(); public: // The previous socket bound on the same port in the address family. TCPSocket* prev_socket; // The next socket bound on the same port in the address family. TCPSocket* next_socket; // The first half-connected socket in our listening queue. TCPSocket* connecting_half; // The first ready socket in our listening queue. TCPSocket* connecting_ready; // The previous half-connected or ready socket in our listening queue. TCPSocket* connecting_prev; // The next half-connected or ready socket in our listening queue. TCPSocket* connecting_next; // The listening socket this socket is in the listening queue for. TCPSocket* connecting_parent; // Condition variable that is signaled when new data can be received. kthread_cond_t receive_cond; // Condition variable that is signaled when new data can be transmitted. kthread_cond_t transmit_cond; // The local socket name, or the any address port 0 if not set. union tcp_sockaddr local; // The remote socket name, or the any address port 0 if not set. union tcp_sockaddr remote; // The network interface the socket is bound to, or 0 if none. unsigned int ifindex; // Whether the socket has been bound to a port. bool bound; // Whether the socket is receiving datagrams. bool remoted; // Whether SO_REUSEADDR is set. bool reuseaddr; private: // The timer used for retransmissions and timing out the connection. Timer timer; // The poll channel to publish poll bit changes on. PollChannel poll_channel; // The queue of incoming packets whose sequence numbers are too high to // process right now, sorted by increasing sequence number. Ref receive_queue; // TODO: Not a good way to keep track of this. // The offset at which data begins in the incoming ring buffer. size_t incoming_offset; // The amount of bytes in the incoming ring buffer. size_t incoming_used; // The offset at which data begins in the outgoing ring buffer. size_t outgoing_offset; // The amount of bytes in the outgoing ring buffer. size_t outgoing_used; // Send unacknowledged (STD 7, RFC 793). tcp_seq send_una; // Send next (STD 7, RFC 793). tcp_seq send_nxt; // Send window (STD 7, RFC 793). tcp_seq send_wnd; // Send urgent pointer (STD 7, RFC 793). tcp_seq send_up; // Segment sequence number used for last window update (STD 7, RFC 793). tcp_seq send_wl1; // Segment acknowledgment number used for last window update (STD 7, RFC // 793). tcp_seq send_wl2; // Next sequence to send (STD 7, RFC 793). tcp_seq send_pos; // Initial send sequence number (STD 7, RFC 793). tcp_seq iss; // Receive next (STD 7, RFC 793). tcp_seq recv_nxt; // Receive window (STD 7, RFC 793). tcp_seq recv_wnd; // Receive urgent pointer (STD 7, RFC 793). tcp_seq recv_up; // Last sequence acked (STD 7, RFC 793). tcp_seq recv_acked; // Last window size advertised (STD 7, RFC 793). tcp_seq recv_wndlast; // Initial receive sequence number (STD 7, RFC 793). tcp_seq irs; // The address family to which this socket belongs. int af; // Set to an errno value if a socket error has occured, or 0 otherwise. int sockerr; // The number of sockets in the listening queue. int backlog_used; // The maximum number of sockets in the listening queue. int backlog_max; // The number of retransmissions that have occured since the last // acknowledgement from the remote socket. unsigned int retransmissions; // The current TCP state. enum tcp_state state; // The state of the outgoing SYN. enum tcp_special outgoing_syn; // The state of the outgoing FIN. enum tcp_special outgoing_fin; // Whether SYN has been received from the remote socket. bool has_syn; // Whether FIN has been received from the remote socket. bool has_fin; // Whether a transmission has been scheduled. bool transmit_scheduled; // Whether the current owner of the tcp_lock should schedule a transmission // after unlocking the lock. // TODO: Confirm the above comment is true. bool transmit_do_schedule_worker; // Whether a transmission is currently happening. // TODO: Confirm the above comment is true. bool transmit_ongoing; // Whether the socket should be deleted when it's closed. // TODO: Clarify the above comment. bool delete_on_close; // Whether the timer is pending. bool timer_armed; // The incoming ring buffer. unsigned char incoming[BUFFER_SIZE]; // The outgoing ring buffer. unsigned char outgoing[BUFFER_SIZE]; }; // The TCP socket Inode with a reference counted lifetime. The backend class // TCPSocket is separate as it may stay alive for a little while after all // references to it has been lost. class TCPSocketNode : public AbstractInode { public: TCPSocketNode(TCPSocket* socket); virtual ~TCPSocketNode(); virtual Ref accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, int flags); virtual int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); virtual int connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); virtual int listen(ioctx_t* ctx, int backlog); virtual ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); virtual ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags); virtual ssize_t send(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags); virtual ssize_t sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, int flags); virtual ssize_t read(ioctx_t* ctx, uint8_t* buf, size_t count); virtual ssize_t write(ioctx_t* ctx, const uint8_t* buf, size_t count); virtual int poll(ioctx_t* ctx, PollNode* node); virtual int getsockopt(ioctx_t* ctx, int level, int option_name, void* option_value, size_t* option_size_ptr); virtual int setsockopt(ioctx_t* ctx, int level, int option_name, const void* option_value, size_t option_size); virtual int shutdown(ioctx_t* ctx, int how); virtual int getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); virtual int getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize); private: TCPSocket* socket; }; void TCPSocket__OnTimer(Clock* /*clock*/, Timer* /*timer*/, void* user) { ((TCPSocket*) user)->OnTimer(); } TCPSocket::TCPSocket(int af) { prev_socket = NULL; next_socket = NULL; connecting_half = NULL; connecting_ready = NULL; connecting_prev = NULL; connecting_next = NULL; connecting_parent = NULL; receive_cond = KTHREAD_COND_INITIALIZER; transmit_cond = KTHREAD_COND_INITIALIZER; memset(&local, 0, sizeof(local)); memset(&remote, 0, sizeof(remote)); ifindex = 0; bound = false; remoted = false; reuseaddr = false; // timer is initialized by its constructor. timer.Attach(Time::GetClock(CLOCK_MONOTONIC)); // poll_channel is initialized by its constructor. // receive_queue is initialized by its constructor. incoming_offset = 0; incoming_used = 0; outgoing_offset = 0; outgoing_used = 0; send_una = 0; send_nxt = 0; send_wnd = 0; send_up = 0; send_wl1 = 0; send_wl2 = 0; send_pos = 0; iss = 0; recv_nxt = 0; recv_wnd = 0; recv_up = 0; recv_acked = 0; recv_wndlast = 0; irs = 0; this->af = af; sockerr = 0; backlog_used = 0; backlog_max = 0; retransmissions = 0; state = TCP_STATE_CLOSED; outgoing_syn = TCP_SPECIAL_NOT; outgoing_fin = TCP_SPECIAL_NOT; has_syn = false; has_fin = false; transmit_scheduled = false; transmit_do_schedule_worker = false; transmit_ongoing = false; delete_on_close = false; timer_armed = false; memset(incoming, 0, sizeof(incoming)); memset(outgoing, 0, sizeof(outgoing)); } TCPSocket::~TCPSocket() { // TODO: TCP_STATE_LISTEN can't reach this location, as all // needs_destruction() requires the CLOSED state, and close_unlocked // puts us in the CLOSED state. Let's make it official that closing // a listening socket puts it in the CLOSED state? assert(state == TCP_STATE_CLOSED || state == TCP_STATE_LISTEN); assert(!bound); assert(!prev_socket); assert(!next_socket); assert(!connecting_half); assert(!connecting_half); assert(!connecting_ready); assert(!connecting_prev); assert(!connecting_next); assert(!connecting_parent); } // TODO: Review this function carefully. void TCPSocket::close() { kthread_mutex_lock(&tcp_lock); bool do_delete = close_unlocked(); bool do_schedule_worker = transmit_do_schedule_worker; transmit_do_schedule_worker = false; kthread_mutex_unlock(&tcp_lock); if ( do_delete ) { // destroy() was called by close_unlocked. delete this; } else if ( do_schedule_worker ) DoScheduleTransmit(); } // TODO: Review this function carefully. bool TCPSocket::close_unlocked() // tcp_lock taken { if ( state != TCP_STATE_CLOSED && state != TCP_STATE_LISTEN && state != TCP_STATE_SYN_SENT && state != TCP_STATE_SYN_RECV /* TODO: ??? */ ) { shutdown_unlocked(SHUT_RDWR); delete_on_close = true; return false; } if ( timer_armed && timer.TryCancel() ) timer_armed = false; if ( transmit_ongoing || timer_armed ) { delete_on_close = true; return false; } // TODO: Is this right in the TCP_STATE_SYN_SENT and TCP_STATE_SYN_RECV // cases? Should the socket be kept around for some MSLs or such? state = TCP_STATE_CLOSED; destroy(); return true; } void TCPSocket::destroy() // tcp_lock taken { if ( bound ) { if ( af == AF_INET ) { uint16_t port = be16toh(local.in.sin_port); if ( prev_socket ) prev_socket->next_socket = next_socket; else bindings_v4[port] = next_socket; if ( next_socket ) next_socket->prev_socket = prev_socket; } else if ( af == AF_INET6 ) { uint16_t port = be16toh(local.in6.sin6_port); if ( prev_socket ) prev_socket->next_socket = next_socket; else bindings_v6[port] = next_socket; if ( next_socket ) next_socket->prev_socket = prev_socket; } prev_socket = NULL; next_socket = NULL; bound = false; } while ( connecting_half || connecting_ready ) { TCPSocket* socket; if ( connecting_half ) { socket = connecting_half; connecting_half = socket->connecting_next; if ( connecting_half ) connecting_half->connecting_prev = NULL; } else { socket = connecting_ready; connecting_ready = socket->connecting_next; if ( connecting_ready ) connecting_ready->connecting_prev = NULL; } socket->connecting_prev = NULL; socket->connecting_next = NULL; socket->connecting_parent = NULL; // TODO: This should send RST to the other side. (But probably not for // half-open connections). // TODO: According to what should that happen? socket->close_unlocked(); backlog_used--; } if ( connecting_parent ) { if ( connecting_prev ) connecting_prev->connecting_next = connecting_next; else if ( state == TCP_STATE_SYN_RECV ) connecting_parent->connecting_half = connecting_next; else connecting_parent->connecting_ready = connecting_next; if ( connecting_next ) connecting_next->connecting_prev = connecting_prev; connecting_prev = NULL; connecting_next = NULL; // TODO: Review backlog_used is accounted correctly. connecting_parent->backlog_used--; connecting_parent = NULL; } } Ref TCPSocket::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr, int flags) { if ( flags & ~(0) ) return errno = EINVAL, Ref(NULL); if ( addr && !addrsize_ptr ) return errno = EINVAL, Ref(NULL); ScopedLock lock(&tcp_lock); if ( state != TCP_STATE_LISTEN ) return errno = EINVAL, Ref(NULL); while ( !connecting_ready ) { if ( ctx->dflags & O_NONBLOCK ) return errno = EWOULDBLOCK, Ref(NULL); if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) ) return errno = EINTR, Ref(NULL); } TCPSocket* socket = connecting_ready; if ( addr ) { size_t addrsize; if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) ) return Ref(NULL); size_t af_addrsize = AddressFamilySize(af); if ( af_addrsize < addrsize ) addrsize = af_addrsize; if ( !ctx->copy_to_dest(addr, &socket->remote, addrsize) ) return Ref(NULL); if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) ) return Ref(NULL); } Ref result(new TCPSocketNode(socket)); if ( !result ) return Ref(NULL); connecting_ready = socket->connecting_next; if ( connecting_ready ) connecting_ready->connecting_prev = NULL; socket->connecting_prev = NULL; socket->connecting_next = NULL; socket->connecting_parent = NULL; backlog_used--; return result; } bool TCPSocket::ImportAddress(ioctx_t* ctx, union tcp_sockaddr* dest, const void* addr, size_t addrsize) { // TODO: os-test whether AF_UNSPEC can disconnect. if ( addrsize != AddressFamilySize(af) ) return errno = EINVAL, -1; union tcp_sockaddr copy; memset(©, 0, sizeof(copy)); if ( !ctx->copy_from_src(©, addr, addrsize) ) return false; if ( copy.family != af ) return errno = EAFNOSUPPORT, false; memcpy(dest, ©, sizeof(copy)); return true; } bool TCPSocket::CanBind(union tcp_sockaddr new_local) // tcp_lock taken { if ( af == AF_INET ) { // TODO: os-test binding to broadcast addresses. // Bind to either the any address or the address of a network interface. if ( new_local.in.sin_addr.s_addr != htobe32(INADDR_ANY) ) { // TODO: What happens to sockets if the network interface changes // its address? ScopedLock ifs_lock(&netifs_lock); bool found = false; for ( unsigned int i = 1; i < netifs_count; i++ ) { NetworkInterface* netif = netifs[i]; if ( !netif ) continue; ScopedLock cfg_lock(&netif->cfg_lock); if ( memcmp(&netif->cfg.inet.address, &new_local.in.sin_addr, sizeof(struct in_addr)) == 0 ) { found = true; break; } } // No interface had the correct address. if ( !found ) return errno = EADDRNOTAVAIL, false; } uint16_t port = be16toh(new_local.in.sin_port); for ( TCPSocket* socket = bindings_v4[port]; socket; socket = socket->next_socket ) { // TODO: os-test how SO_REUSEADDR works for TCP. if ( new_local.in.sin_addr.s_addr == htobe32(INADDR_ANY) && !(reuseaddr && socket->reuseaddr) ) return errno = EADDRINUSE, false; if ( socket->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) && !(reuseaddr && socket->reuseaddr) ) return errno = EADDRINUSE, false; if ( new_local.in.sin_addr.s_addr == socket->local.in.sin_addr.s_addr ) return errno = EADDRINUSE, false; } } else if ( af == AF_INET6 ) { // TODO: IPv6 support for seeing if any interface has the address. if ( true ) return errno = EAFNOSUPPORT, false; uint16_t port = be16toh(new_local.in6.sin6_port); if ( bindings_v6[port] ) return errno = EADDRINUSE, false; for ( TCPSocket* socket = bindings_v6[port]; socket; socket = socket->next_socket ) { if ( !memcmp(&new_local.in6.sin6_addr, &in6addr_any, sizeof(in6addr_any)) && !(reuseaddr && socket->reuseaddr) ) if ( !memcmp(&socket->local.in6.sin6_addr, &in6addr_any, sizeof(in6addr_any)) && !(reuseaddr && socket->reuseaddr) ) if ( !memcmp(&new_local.in6.sin6_addr, &socket->local.in6.sin6_addr, sizeof(new_local.in6.sin6_addr)) ) return errno = EADDRINUSE, false; } } else return errno = EAFNOSUPPORT, false; return true; } int TCPSocket::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) { ScopedLock lock(&tcp_lock); if ( bound ) return errno = EINVAL, -1; union tcp_sockaddr new_local; if ( !ImportAddress(ctx, &new_local, addr, addrsize) ) return -1; uint16_t port; if ( af == AF_INET ) port = be16toh(new_local.in.sin_port); else if ( af == AF_INET6 ) port = be16toh(new_local.in6.sin6_port); else return errno = EAFNOSUPPORT, -1; // TODO: Binding to the any address needs to pick the appropriate source // interface and bind to its address. (Or really? udp doesn't? // os-test?) // TODO: os-test a server listening on any, and then getsockname a // connection received on that port. if ( port == 0 ) return BindDefault(&new_local) ? 0 : -1; if ( !CanBind(new_local) ) return -1; if ( af == AF_INET ) { uint16_t port = be16toh(new_local.in.sin_port); if ( bindings_v4[port] ) bindings_v4[port]->prev_socket = this; next_socket = bindings_v4[port]; prev_socket = NULL; bindings_v4[port] = this; } else if ( af == AF_INET6 ) { uint16_t port = be16toh(new_local.in6.sin6_port); if ( bindings_v6[port] ) return errno = EADDRINUSE, -1; next_socket = bindings_v6[port]; prev_socket = NULL; bindings_v6[port] = this; } else return errno = EAFNOSUPPORT, -1; memcpy(&local, &new_local, sizeof(new_local)); bound = true; return 0; } // tcp_lock locked bool TCPSocket::BindDefault(const union tcp_sockaddr* new_local_ptr) { // TODO: This allocator becomes increasingly biased as more ports are // allocated. // TODO: Try not to allocate recently used ports. union tcp_sockaddr new_local; if ( new_local_ptr ) memcpy(&new_local, new_local_ptr, sizeof(union tcp_sockaddr)); else { memset(&new_local, 0, sizeof(new_local)); if ( af == AF_INET ) { new_local.in.sin_family = AF_INET; new_local.in.sin_addr.s_addr = htobe32(INADDR_ANY); } else if ( af == AF_INET6 ) { new_local.in6.sin6_family = AF_INET6; new_local.in6.sin6_addr = in6addr_any; } else return errno = EAFNOSUPPORT, false; } uint16_t start = 32768; // TODO: Document in tcp(4). uint16_t end = 61000; // TODO: Document in tcp(4). uint16_t count = end - start; uint16_t offset = arc4random_uniform(count); for ( uint16_t i = 0; i < count; i++ ) { uint16_t j = offset + i; if ( count <= j ) j -= count; uint16_t port = start + j; if ( af == AF_INET ) new_local.in.sin_port = htobe16(port); else if ( af == AF_INET6 ) new_local.in6.sin6_port = htobe16(port); else return errno = EAFNOSUPPORT, false; if ( !CanBind(new_local) ) { if ( errno == EADDRINUSE ) continue; return false; } if ( af == AF_INET ) { if ( bindings_v4[port] ) bindings_v4[port]->prev_socket = this; next_socket = bindings_v4[port]; prev_socket = NULL; bindings_v4[port] = this; } else if ( af == AF_INET6 ) { if ( bindings_v6[port] ) bindings_v6[port]->prev_socket = this; next_socket = bindings_v6[port]; prev_socket = NULL; bindings_v6[port] = this; } else return errno = EAFNOSUPPORT, false; memcpy(&local, &new_local, sizeof(new_local)); bound = true; return true; } return errno = EAGAIN, false; } bool TCPSocket::Transmit() { if ( state == TCP_STATE_CLOSED ) return (errno = sockerr ? sockerr : ENOTCONN), false; // Move new outgoing data into the transmission window if there is room. tcp_seq window_available = (tcp_seq) (send_una + send_wnd - send_nxt); if ( window_available && outgoing_syn == TCP_SPECIAL_PENDING ) { send_nxt++; outgoing_syn = TCP_SPECIAL_WINDOW; window_available--; } if ( window_available ) { tcp_seq window_data = (tcp_seq)(send_nxt - send_una); if ( outgoing_syn == TCP_SPECIAL_WINDOW ) window_data--; if ( outgoing_fin == TCP_SPECIAL_WINDOW ) window_data--; assert(window_data <= outgoing_used); size_t outgoing_new = outgoing_used - window_data; tcp_seq amount = window_available; if ( outgoing_new < amount ) amount = outgoing_new; send_nxt += amount; window_available -= amount; } if ( window_available && outgoing_fin == TCP_SPECIAL_PENDING ) { send_nxt++; outgoing_fin = TCP_SPECIAL_WINDOW; window_available--; } // Transmit packets. while ( mod32_lt(send_pos, send_nxt) || (has_syn && mod32_lt(recv_acked, recv_nxt)) || recv_wnd != recv_wndlast ) { size_t mtu; union tcp_sockaddr sendfrom; if ( af == AF_INET ) { if ( !IPv4::GetSourceIP(&local.in.sin_addr, &remote.in.sin_addr, &sendfrom.in.sin_addr, ifindex, &mtu) ) { // TODO: Retry after timer. Close if permanent failure? return false; } } // TODO: IPv6 support. else return errno = EAFNOSUPPORT, false; if ( mtu < sizeof(struct tcphdr) ) { // TODO: Should the connection be closed here? return errno = EINVAL, false; } mtu -= sizeof(struct tcphdr); Ref pkt = GetPacket(); if ( !pkt ) { // TODO: Retry after timer. return false; } pkt->length = sizeof(struct tcphdr); unsigned char* out = pkt->from; struct tcphdr hdr; if ( af == AF_INET ) { hdr.th_sport = local.in.sin_port; hdr.th_dport = remote.in.sin_port; } else if ( af == AF_INET6 ) { hdr.th_sport = local.in6.sin6_port; hdr.th_dport = remote.in6.sin6_port; } else return errno = EAFNOSUPPORT, false; hdr.th_seq = htobe32(send_pos); hdr.th_offset = TCP_OFFSET_ENCODE(sizeof(struct tcphdr) / 4); hdr.th_flags = 0; tcp_seq send_nxtpos = send_pos; assert(mod32_le(send_nxtpos, send_nxt)); if ( outgoing_syn == TCP_SPECIAL_WINDOW && send_nxtpos == send_una ) { hdr.th_flags |= TH_SYN; send_nxtpos++; } assert(mod32_le(send_nxtpos, send_nxt)); if ( has_syn ) { // TODO: RFC 1122 4.2.2.6: // "TCP SHOULD send an MSS (Maximum Segment Size) option in // every SYN segment when its receive MSS differs from the // default 536, and MAY send it always." // "If an MSS option is not received at connection setup, TCP // MUST assume a default send MSS of 536 (576-40)." hdr.th_flags |= TH_ACK; hdr.th_ack = htobe32(recv_nxt); } else hdr.th_ack = htobe32(0); hdr.th_win = htobe16(recv_wnd); hdr.th_urp = htobe16(0); hdr.th_sum = htobe16(0); tcp_seq window_data = (tcp_seq)(send_nxt - send_pos); if ( send_pos == send_una && outgoing_syn == TCP_SPECIAL_WINDOW ) window_data--; if ( mod32_lt(send_pos, send_nxt) && outgoing_fin == TCP_SPECIAL_WINDOW ) window_data--; if ( window_data ) { size_t amount = mtu < window_data ? mtu : window_data; assert(outgoing_offset <= sizeof(outgoing)); tcp_seq window_length = (tcp_seq) (send_nxtpos - send_una); if ( outgoing_syn == TCP_SPECIAL_WINDOW ) window_length--; assert(window_length <= sizeof(outgoing)); size_t outgoing_end = outgoing_offset + window_length; if ( sizeof(outgoing) <= outgoing_end ) outgoing_end -= sizeof(outgoing); assert(outgoing_end < sizeof(outgoing)); size_t until_end = sizeof(outgoing) - outgoing_end; size_t first = until_end < amount ? until_end : amount; assert(first <= sizeof(outgoing)); assert(first <= sizeof(outgoing) - outgoing_end); size_t second = amount - first; assert(second <= sizeof(outgoing)); memcpy(out + sizeof(hdr), outgoing + outgoing_end, first); if ( second ) memcpy(out + sizeof(hdr) + first, outgoing, second); pkt->length += amount; send_nxtpos += amount; } assert(mod32_le(send_nxtpos, send_nxt)); if ( outgoing_fin == TCP_SPECIAL_WINDOW && send_nxtpos + 1 == send_nxt ) { hdr.th_flags |= TH_FIN; send_nxtpos++; } assert(mod32_le(send_nxtpos, send_nxt)); memcpy(out, &hdr, sizeof(hdr)); uint16_t checksum = 0; if ( af == AF_INET ) { checksum = IPv4::ipsum_buf(checksum, &sendfrom.in.sin_addr, sizeof(struct in_addr)); checksum = IPv4::ipsum_buf(checksum, &remote.in.sin_addr, sizeof(struct in_addr)); } else if ( af == AF_INET6 ) { checksum = IPv4::ipsum_buf(checksum, &sendfrom.in6.sin6_addr, sizeof(struct in6_addr)); checksum = IPv4::ipsum_buf(checksum, &remote.in6.sin6_addr, sizeof(struct in6_addr)); } else return errno = EAFNOSUPPORT, false; checksum = IPv4::ipsum_word(checksum, IPPROTO_TCP); checksum = IPv4::ipsum_word(checksum, pkt->length); checksum = IPv4::ipsum_buf(checksum, out, pkt->length); hdr.th_sum = htobe16(IPv4::ipsum_finish(checksum)); memcpy(out, &hdr, sizeof(hdr)); if ( af == AF_INET ) { if ( !IPv4::Send(pkt, &sendfrom.in.sin_addr, &remote.in.sin_addr, IPPROTO_TCP, ifindex, false) ) { // TODO: Retry after timer. Close if permanent failure? // TODO: Should the initial SYN fail quick & hard if permanent? sockerr = errno; kthread_cond_broadcast(&transmit_cond); kthread_cond_broadcast(&receive_cond); poll_channel.Signal(PollEventStatus()); return false; } } // TODO: IPv6 support. else return errno = EAFNOSUPPORT, false; if ( has_syn ) recv_acked = recv_nxt; recv_wndlast = recv_wnd; assert(mod32_le(send_nxtpos, send_nxt)); send_pos = send_nxtpos; // TODO: Also set timer to retry in case of any errors above. if ( mod32_lt(send_una, send_pos) && !timer_armed ) SetTimer(); } return true; } void TCPSocket::TransmitWork() { ScopedLock lock(&tcp_lock); transmit_scheduled = false; transmit_ongoing = false; Transmit(); // TODO: What if the timer needs to be cancelled due to the new close state? if ( needs_destruction() ) { destroy(); delete this; } } void TCPSocket::OnTimer() { ScopedLock lock(&tcp_lock); timer_armed = false; // TODO: What if the timer needs to be cancelled due to the new close state? if ( needs_destruction() ) { destroy(); delete this; return; } if ( mod32_lt(send_una, send_pos) ) { retransmissions++; // TODO: IMPLEMENTATION DETAILS the number of retransmissions. if ( 6 /* Or some suitable constant */ < retransmissions ) { // TODO: Consider the connection closed. sockerr = ETIMEDOUT; kthread_cond_broadcast(&transmit_cond); kthread_cond_broadcast(&receive_cond); poll_channel.Signal(PollEventStatus()); return; } send_pos = send_una; Transmit(); // TODO: needs_destruction() could be true here? } } static void TCPSocket__Transmit(void* ctx) { ((TCPSocket*) ctx)->TransmitWork(); } void TCPSocket::ScheduleTransmit() // tcp_lock locked { if ( transmit_scheduled ) return; transmit_scheduled = true; transmit_do_schedule_worker = true; transmit_ongoing = true; } void TCPSocket::DoScheduleTransmit() // tcp_lock unlocked { Worker::Schedule(TCPSocket__Transmit, this); } void TCPSocket::SetTimer() // tcp_lock locked { if ( timer_armed ) { if ( !timer.TryCancel() ) return; timer_armed = false; } if ( mod32_le(send_una, send_pos) ) { struct itimerspec timeout; memset(&timeout, 0, sizeof(timeout)); // TODO: IMPLEMENTATION DETAILS the timeout algorithm. // TODO: BUGS measure latency and adapt to it for first retransmission. timeout.it_value.tv_sec = 1 + 1 * retransmissions; // TODO: Review timer lifetime. timer.Set(&timeout, NULL, TIMER_FUNC_MAY_DEALLOCATE_TIMER, TCPSocket__OnTimer, this); timer_armed = true; } } void TCPSocket::ProcessPacket(Ref pkt, union tcp_sockaddr* pkt_src, union tcp_sockaddr* pkt_dst) // tcp_lock locked { const unsigned char* in = pkt->from + pkt->offset; size_t inlen = pkt->length - pkt->offset; struct tcphdr hdr; memcpy(&hdr, in, sizeof(hdr)); hdr.th_sport = be16toh(hdr.th_sport); hdr.th_dport = be16toh(hdr.th_dport); hdr.th_seq = be32toh(hdr.th_seq); hdr.th_ack = be32toh(hdr.th_ack); hdr.th_win = be16toh(hdr.th_win); hdr.th_urp = be16toh(hdr.th_urp); in += sizeof(hdr); inlen -= sizeof(hdr); if ( state == TCP_STATE_CLOSED ) { if ( hdr.th_flags & TH_RST ) return; // TODO: ACK the RST. // TODO: Also do this if there isn't any bound socket. return; } else if ( state == TCP_STATE_LISTEN ) { if ( hdr.th_flags & TH_RST ) return; if ( hdr.th_flags & TH_ACK ) { // TODO: Send . return; } if ( !(hdr.th_flags & TH_SYN) ) return; // TODO: BUGS how this leads to denial of service. if ( backlog_used == backlog_max ) return; // TODO: Use SYN cache to mitigate SYN flood attack. TCPSocket* socket = new TCPSocket(af); if ( !socket ) return; assert(pkt_src); assert(pkt_dst); socket->remote = *pkt_src; socket->local = *pkt_dst; socket->remoted = true; socket->bound = true; // TODO: Do we know for sure that such a connection doesn't already // exist? I don't think that's possible but a check seems prudent // to avoid duplicate connections. if ( af == AF_INET ) { uint16_t port = be16toh(socket->local.in.sin_port); socket->prev_socket = NULL; socket->next_socket = bindings_v4[port]; if ( socket->next_socket ) socket->next_socket->prev_socket = socket; bindings_v4[port] = socket; } else if ( af == AF_INET6 ) { uint16_t port = be16toh(socket->local.in6.sin6_port); socket->prev_socket = NULL; socket->next_socket = bindings_v6[port]; if ( socket->next_socket ) socket->next_socket->prev_socket = socket; bindings_v6[port] = socket; } socket->iss = arc4random(); socket->send_una = socket->iss; socket->send_nxt = socket->iss; socket->send_wnd = 1; socket->send_pos = socket->iss; socket->outgoing_syn = TCP_SPECIAL_PENDING; socket->recv_wnd = TCP_MAXWIN; socket->recv_acked = hdr.th_seq; socket->recv_nxt = hdr.th_seq + 1; socket->irs = hdr.th_seq; socket->has_syn = true; socket->state = TCP_STATE_SYN_RECV; socket->UpdateWindow(hdr.th_win); socket->connecting_parent = this; socket->connecting_prev = NULL; socket->connecting_next = connecting_half; if ( socket->connecting_next ) socket->connecting_next->connecting_prev = socket; connecting_half = socket; socket->Transmit(); return; } else if ( state == TCP_STATE_SYN_SENT ) { if ( hdr.th_flags & TH_ACK ) { if ( mod32_le(hdr.th_ack, iss) || mod32_gt(hdr.th_ack, send_nxt) ) { if ( hdr.th_flags & TH_RST ) return; // TODO: Send RST. } if ( !(mod32_le(send_una, hdr.th_ack) && mod32_le(hdr.th_ack, send_nxt)) ) return; } if ( hdr.th_flags & TH_RST ) { sockerr = ECONNREFUSED; state = TCP_STATE_CLOSED; kthread_cond_broadcast(&receive_cond); // Wake up connect. kthread_cond_broadcast(&transmit_cond); poll_channel.Signal(PollEventStatus()); return; } if ( !(hdr.th_flags & TH_SYN) ) return; recv_acked = hdr.th_seq; recv_nxt = hdr.th_seq + 1; irs = hdr.th_seq; has_syn = true; UpdateWindow(hdr.th_win); // TODO: Drop packet if the packet contains data/FIN beyond the SYN? if ( hdr.th_flags & TH_ACK ) { send_una = hdr.th_ack; retransmissions = 0; SetTimer(); // TODO: Dequeue any segments on the retransmission queue before // send_una. // TODO: How can the above happen? if ( mod32_le(iss, send_una) ) // TODO: Or lt? { outgoing_syn = TCP_SPECIAL_ACKED; state = TCP_STATE_ESTAB; kthread_cond_broadcast(&receive_cond); // Wake up connect. } } else { state = TCP_STATE_SYN_RECV; } return; } bool acceptable = false; if ( inlen == 0 && recv_wnd == 0 ) acceptable = hdr.th_seq == recv_nxt; else if ( inlen == 0 && 0 < recv_wnd ) acceptable = mod32_le(recv_nxt, hdr.th_seq) && mod32_lt(hdr.th_seq, recv_nxt + recv_wnd); else if ( 0 < inlen && 0 < recv_wnd ) { tcp_seq seg_end = (tcp_seq) (hdr.th_seq + inlen - 1); acceptable = (mod32_le(recv_nxt, hdr.th_seq) && mod32_lt(hdr.th_seq, recv_nxt + recv_wnd)) || (mod32_le(recv_nxt, seg_end) && mod32_lt(seg_end, recv_nxt + recv_wnd)); } if ( !acceptable ) { if ( hdr.th_flags & TH_RST ) return; // Send . recv_acked = recv_nxt - 1; return; } uint16_t real_seq = hdr.th_seq; if ( mod32_lt(hdr.th_seq, recv_nxt) && hdr.th_flags & TH_SYN ) { hdr.th_flags &= ~TH_SYN; hdr.th_seq++; } if ( mod32_lt(hdr.th_seq, recv_nxt) ) { tcp_seq skip = recv_nxt - hdr.th_seq; if ( inlen < skip ) skip = inlen; hdr.th_seq += skip; in += skip; inlen -= skip; } if ( mod32_lt(hdr.th_seq, recv_nxt) && hdr.th_flags & TH_FIN ) { hdr.th_flags &= ~TH_FIN; hdr.th_seq++; } if ( mod32_lt(hdr.th_seq, recv_nxt) ) return; if ( mod32_gt(hdr.th_seq, recv_nxt) ) { Ref prev; Ref iter = receive_queue; // TODO: For n packets in the worst order, this scales O(n^2). // TODO: This wastes a packet per byte in the worst case. while ( iter ) { const unsigned char* iter_in = iter->from + iter->offset; const unsigned char* iter_in_seq = iter_in + offsetof(struct tcphdr, th_seq); tcp_seq iter_seq; memcpy(&iter_seq, iter_in_seq, sizeof(iter_seq)); iter_seq = be32toh(iter_seq); if ( mod32_le(real_seq, iter_seq) ) break; // TODO: Handle duplicate and overlapping segments. prev = iter; iter = iter->next; } if ( prev ) { pkt->next = prev->next; prev->next = pkt; } else { pkt->next = receive_queue; receive_queue = pkt; } return; } // TODO: Potentially trim the end if it exceeds the receive window. if ( hdr.th_flags & TH_RST ) { if ( state == TCP_STATE_SYN_RECV ) { // TODO: If passive open (listening), then return to the LISTEN // state and return. state = TCP_STATE_CLOSED; } else if ( state == TCP_STATE_ESTAB || state == TCP_STATE_FIN_WAIT_1 || state == TCP_STATE_FIN_WAIT_2 || state == TCP_STATE_CLOSE_WAIT ) { state = TCP_STATE_CLOSED; } else // CLOSING, LAST_ACK, TIME_WAIT. { state = TCP_STATE_CLOSED; } sockerr = ECONNRESET; kthread_cond_broadcast(&receive_cond); kthread_cond_broadcast(&transmit_cond); poll_channel.Signal(PollEventStatus()); return; } if ( hdr.th_flags & TH_SYN ) { // TODO: Send RST. sockerr = ECONNRESET; kthread_cond_broadcast(&receive_cond); kthread_cond_broadcast(&transmit_cond); poll_channel.Signal(PollEventStatus()); return; } if ( !(hdr.th_flags & TH_ACK) ) return; if ( state == TCP_STATE_SYN_RECV ) { if ( mod32_le(send_una, hdr.th_ack) && mod32_le(hdr.th_ack, send_nxt) ) { state = TCP_STATE_ESTAB; if ( connecting_parent ) { if ( connecting_prev ) connecting_prev->connecting_next = connecting_next; else connecting_parent->connecting_half = connecting_next; if ( connecting_next ) connecting_next->connecting_prev = connecting_prev; // TODO: This inserts the connection to the front of the // accept queue, rather than the end, which is unfair to // connections that have been waiting longer. connecting_prev = NULL; connecting_next = connecting_parent->connecting_ready; if ( connecting_next ) connecting_next->connecting_prev = this; connecting_parent->connecting_ready = this; kthread_cond_broadcast(&connecting_parent->receive_cond); uint16_t status = connecting_parent->PollEventStatus(); connecting_parent->poll_channel.Signal(status); } } else { // TODO: Send . Transmit(); return; } } // TODO: This is only supposed to happen if state == TCP_STATE_ESTAB (or // when TCP_STATE_RECV transitions to TCP_STATE_ESTAB) but I want it // in other cases as well (FIN_WAIT_1 at least). Check RFC 793. if ( mod32_lt(send_una, hdr.th_ack) && mod32_le(hdr.th_ack, send_nxt) ) { // TODO: If in CLOSING state and FIN is not ACKed, drop segment. // TODO: If in LAST-ACK state only ACK of our FIN can arrive. // TODO: If in TIME-WAIT state, only a retransmission of the remote // FIN can arrive. If so, ACK it and restart the 2 MSL timeout. tcp_seq old_send_una = send_una; tcp_seq acked = (tcp_seq) (hdr.th_ack - send_una); if ( outgoing_syn == TCP_SPECIAL_WINDOW && 0 < acked ) { outgoing_syn = TCP_SPECIAL_ACKED; acked--; send_una++; } tcp_seq window_data = (tcp_seq)(send_nxt - send_una); if ( outgoing_fin == TCP_SPECIAL_WINDOW ) window_data--; if ( window_data && acked ) { size_t amount = window_data < acked ? window_data : acked; assert(outgoing_offset < sizeof(outgoing)); outgoing_offset += amount; if ( sizeof(outgoing) <= outgoing_offset ) outgoing_offset -= sizeof(outgoing); assert(outgoing_offset < sizeof(outgoing)); assert(amount <= outgoing_used); outgoing_used -= amount; kthread_cond_broadcast(&transmit_cond); poll_channel.Signal(PollEventStatus()); acked -= amount; send_una += amount; } bool fin_was_acked = false; if ( outgoing_fin == TCP_SPECIAL_WINDOW && 0 < acked ) { outgoing_fin = TCP_SPECIAL_ACKED; acked--; send_una++; fin_was_acked = true; } if ( send_una != old_send_una ) { // TODO: Possibly recalculate the average time to contact remote. retransmissions = 0; SetTimer(); } if ( fin_was_acked ) { if ( state == TCP_STATE_FIN_WAIT_1 ) state = TCP_STATE_FIN_WAIT_2; else if ( state == TCP_STATE_CLOSING ) { state = TCP_STATE_CLOSING; // TODO: Wait 2 MSL and then close: state = TCP_STATE_CLOSED; return; } else if ( state == TCP_STATE_LAST_ACK ) { state = TCP_STATE_CLOSED; return; } } } // TODO: If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be // ignored. If the ACK acks something not yet sent (SEG.ACK > // SND.NXT) then send an ACK, drop the segment, and return. if ( mod32_lt(send_wl1, hdr.th_seq) || (send_wl1 == hdr.th_seq && mod32_le(send_wl2, hdr.th_ack)) ) { UpdateWindow(hdr.th_win); send_wl1 = hdr.th_seq; send_wl2 = hdr.th_ack; } if ( state == TCP_STATE_ESTAB || state == TCP_STATE_FIN_WAIT_1 || state == TCP_STATE_FIN_WAIT_2 ) { assert(incoming_offset < sizeof(incoming)); assert(incoming_used <= sizeof(incoming)); size_t available = sizeof(incoming) - incoming_used; size_t amount = available < inlen ? available : inlen; assert(amount <= sizeof(incoming)); assert(amount <= available); size_t newat = incoming_offset + incoming_used; if ( sizeof(incoming) <= newat ) newat -= sizeof(incoming); assert(newat < sizeof(incoming)); size_t until_end = sizeof(incoming) - newat; assert(until_end <= sizeof(incoming)); size_t first = until_end < amount ? until_end : amount; assert(first <= amount); assert(first <= sizeof(incoming)); size_t second = amount - first; assert(second <= amount); assert(second <= sizeof(incoming)); assert(first + second == amount); assert(first + second <= sizeof(incoming)); assert(first + second <= available); memcpy(incoming + newat, in, first); if ( second ) memcpy(incoming, in + first, second); incoming_used += amount; available = sizeof(incoming) - incoming_used; if ( available < recv_wnd ) recv_wnd = available; recv_nxt = hdr.th_seq + amount; if ( amount == inlen && (hdr.th_flags & TH_FIN) ) { recv_nxt++; has_fin = true; } // TODO: Is it possible that amount == 0? kthread_cond_broadcast(&receive_cond); poll_channel.Signal(PollEventStatus()); } if ( hdr.th_flags & TH_FIN ) { // TODO: Are any of these states even possible here? I suppose it must // be, at least for SYN-SENT, as the remote can close at any time. if ( state == TCP_STATE_CLOSED || state == TCP_STATE_LISTEN || state == TCP_STATE_SYN_SENT ) return; // TODO: HACK: ACK the FIN. This would be done if we didn't enter CLOSED // immediately, it also doesn't retransmit the ACK in case the // ACK gets lost and the remote sends the FIN again. Transmit(); if ( state == TCP_STATE_SYN_RECV || state == TCP_STATE_ESTAB ) { state = TCP_STATE_CLOSE_WAIT; kthread_cond_broadcast(&receive_cond); poll_channel.Signal(PollEventStatus()); } else if ( state == TCP_STATE_FIN_WAIT_1 ) { if ( outgoing_fin == TCP_SPECIAL_ACKED ) { // TODO: Is this case possible? Wouldn't we be in FIN-WAIT-2? state = TCP_STATE_TIME_WAIT; // TODO: Start the TIME-WAIT timer and turn off other timers. state = TCP_STATE_CLOSED; } else { state = TCP_STATE_CLOSING; // TODO: Are we supposed to wait 2 MSL and then close? state = TCP_STATE_CLOSED; } } else if ( state == TCP_STATE_FIN_WAIT_2 ) { state = TCP_STATE_TIME_WAIT; // TODO: Start the TIME-WAIT timer and turn off other timers. state = TCP_STATE_CLOSED; } else if ( state == TCP_STATE_TIME_WAIT ) { // TODO: Restart the 2 MSL TIME-WAIT timeout. } } } void TCPSocket::ReceivePacket(Ref pktnew, union tcp_sockaddr* pkt_src, union tcp_sockaddr* pkt_dst) // tcp_lock locked { if ( pktnew ) ProcessPacket(pktnew, pkt_src, pkt_dst); while ( receive_queue ) { Ref pkt = receive_queue; const unsigned char* in = pkt->from + pkt->offset; const unsigned char* in_seq = in + offsetof(struct tcphdr, th_seq); tcp_seq seq; memcpy(&seq, in_seq, sizeof(seq)); seq = be32toh(seq); if ( mod32_gt(seq, recv_nxt) ) break; receive_queue = pkt->next; if ( seq == recv_nxt ) ProcessPacket(pkt, pkt_src, pkt_dst); } ScheduleTransmit(); } void TCPSocket::UpdateWindow(uint16_t new_window) { tcp_seq pending = (tcp_seq) (send_nxt - send_una); if ( new_window < pending ) send_nxt = (tcp_seq) (send_una + pending); send_wnd = new_window; } int TCPSocket::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) { ScopedLock lock(&tcp_lock); // TODO: os-test listen + connect, what errno? if ( state != TCP_STATE_CLOSED ) return errno = EISCONN, -1; // TODO: Another errno if listening? union tcp_sockaddr new_remote; if ( !ImportAddress(ctx, &new_remote, addr, addrsize) ) return -1; if ( af == AF_INET ) { // Verify the port is non-zero. if ( be16toh(new_remote.in.sin_port) == 0 ) return errno = EADDRNOTAVAIL, -1; } else return errno = EAFNOSUPPORT, -1; // TODO: os-test AF_UNSPEC // If the socket is not bound, find a route to the remote address and bind // to the appropriate source address. if ( !bound ) { union tcp_sockaddr new_local; memset(&new_local, 0, sizeof(new_local)); if ( af == AF_INET ) { struct in_addr any; any.s_addr = htobe32(INADDR_ANY); new_local.in.sin_family = AF_INET; if ( !IPv4::GetSourceIP(&any, &new_remote.in.sin_addr, &new_local.in.sin_addr, ifindex, NULL) ) return -1; new_local.in.sin_port = htobe16(0); } else return errno = EAFNOSUPPORT, -1; if ( !BindDefault(&new_local) ) return -1; } // Test if there is a route from the local address to the remote address. // TODO: Does TCP also do this? Note that connecting to the any address // should be forbidden, right? if ( af == AF_INET ) { if ( !IPv4::GetSourceIP(&local.in.sin_addr, &new_remote.in.sin_addr, NULL, ifindex, NULL) ) return -1; } else return errno = EAFNOSUPPORT, -1; memcpy(&remote, &new_remote, sizeof(new_remote)); remoted = true; iss = arc4random(); //recv_wnd = UINT16_MAX; // TODO? recv_wnd = TCP_MAXWIN; send_una = iss; send_nxt = iss; send_wnd = 1; send_pos = iss; outgoing_syn = TCP_SPECIAL_PENDING; state = TCP_STATE_SYN_SENT; if ( !Transmit() ) { // TODO: This is not recoverable. Is that correct? // TODO: os-test whether reconnect is possible after failed connect? return -1; } while ( !sockerr && (state == TCP_STATE_SYN_SENT || state == TCP_STATE_SYN_RECV) ) { // TODO: non-blocking connect. // TODO: os-test non-blocking connect. if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) ) return errno = EINTR, -1; } if ( sockerr ) return errno = sockerr, -1; return 0; } int TCPSocket::listen(ioctx_t* /*ctx*/, int backlog) { if ( backlog < 0 ) return errno = EINVAL, -1; if ( backlog == 0 || SOMAXCONN < backlog ) backlog = SOMAXCONN; ScopedLock lock(&tcp_lock); if ( !bound ) return errno = EDESTADDRREQ, -1; // TODO: Does this allow listening on a closed normal socket? // TODO: os-test a regular connection, close, and then try to listen. if ( state != TCP_STATE_CLOSED ) return errno = EINVAL, -1; backlog_used = 0; backlog_max = backlog; memset(&remote, 0, sizeof(remote)); if ( af == AF_INET ) { remote.in.sin_family = AF_INET; remote.in.sin_addr.s_addr = htobe32(INADDR_ANY); } else if ( af == AF_INET6 ) { remote.in6.sin6_family = AF_INET6; remote.in6.sin6_addr = in6addr_any; } else return errno = EAFNOSUPPORT, -1; remoted = true; state = TCP_STATE_LISTEN; return 0; } ssize_t TCPSocket::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags) { if ( flags & ~(MSG_PEEK | MSG_WAITALL) ) // TODO: MSG_OOB. return errno = EINVAL, -1; kthread_mutex_lock(&tcp_lock); ssize_t result = recv_unlocked(ctx, buf, count, flags); bool do_schedule_worker = transmit_do_schedule_worker; transmit_do_schedule_worker = false; kthread_mutex_unlock(&tcp_lock); if ( do_schedule_worker ) DoScheduleTransmit(); return result; } ssize_t TCPSocket::recvmsg(ioctx_t* ctx, struct msghdr* msg_ptr, int flags) { struct msghdr msg; if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) return -1; if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen ) return errno = EINVAL, -1; // TODO: Reject if non-null msg_name, msg_control? size_t iov_size = msg.msg_iovlen * sizeof(struct iovec); struct iovec* iov = new struct iovec[msg.msg_iovlen]; if ( !iov ) return -1; if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_size) ) return delete[] iov, -1; msg.msg_iov = iov; kthread_mutex_lock(&tcp_lock); ssize_t result = 0; for ( int i = 0; i < msg.msg_iovlen && result < SSIZE_MAX; i++ ) { size_t maximum = SSIZE_MAX - (size_t) result; uint8_t* buf = (uint8_t*) iov[i].iov_base; size_t count = iov[i].iov_len < maximum ? iov[i].iov_len : maximum; // TODO: What about an empty iov_len vs. EOF? ssize_t amount = recv_unlocked(ctx, buf, count, flags); if ( amount < 0 ) { if ( result == 0 ) result = -1; break; } result += amount; if ( (size_t) amount != count ) break; } bool do_schedule_worker = transmit_do_schedule_worker; transmit_do_schedule_worker = false; kthread_mutex_unlock(&tcp_lock); if ( do_schedule_worker ) DoScheduleTransmit(); delete[] iov; if ( !ctx->copy_to_dest(msg_ptr, &msg, sizeof(msg)) ) return -1; return result; } ssize_t TCPSocket::recv_unlocked(ioctx_t* ctx, uint8_t* buf, size_t count, int flags) { if ( flags & ~(MSG_PEEK | MSG_WAITALL) ) // TODO: MSG_OOB. return errno = EINVAL, -1; if ( sockerr ) return errno = sockerr, -1; // TODO: os-test non-blocking connect + immediate recv. // TODO: Proper state check. if ( state != TCP_STATE_ESTAB && state != TCP_STATE_CLOSE_WAIT && state != TCP_STATE_FIN_WAIT_1 && state != TCP_STATE_FIN_WAIT_2) return errno = ENOTCONN, -1; size_t sofar = 0; while ( sofar < count ) { while ( !(incoming_used || has_fin) ) { // TODO: Would has_fin be true in TCP_STATE_CLOSE_WAIT? if ( state == TCP_STATE_CLOSE_WAIT ) return sofar; if ( sofar && !(flags & MSG_WAITALL) ) return sofar; // TODO: Is this the right precedence for errors? if ( sockerr ) return sofar ? sofar : (errno = sockerr, -1); if ( ctx->dflags & O_NONBLOCK ) return sofar ? sofar : (errno = EWOULDBLOCK, -1); if ( !kthread_cond_wait_signal(&receive_cond, &tcp_lock) ) return sofar ? sofar : (errno = EINTR, -1); if ( sockerr ) return sofar ? sofar : (errno = sockerr, -1); } if ( incoming_used == 0 && has_fin ) return sofar; uint8_t* data = buf + sofar; size_t left = count - sofar; assert(incoming_used <= sizeof(incoming)); size_t amount = incoming_used < left ? incoming_used : left; assert(incoming_offset < sizeof(incoming)); size_t until_end = sizeof(incoming) - incoming_offset; size_t first = until_end < amount ? until_end : amount; size_t second = amount - first; if ( !ctx->copy_to_dest(data, incoming + incoming_offset, first) ) return sofar ? sofar : -1; if ( second && !ctx->copy_to_dest(data + first, incoming, second) ) return sofar ? sofar : -1; sofar += amount; if ( flags & MSG_PEEK ) return sofar; incoming_offset += amount; if ( sizeof(incoming) <= incoming_offset ) incoming_offset -= sizeof(incoming); assert(incoming_offset < sizeof(incoming)); incoming_used -= amount; size_t window_possible = sizeof(incoming) - incoming_used; if ( UINT16_MAX < window_possible ) window_possible = UINT16_MAX; if ( TCP_MAXWIN < window_possible ) window_possible = TCP_MAXWIN; // TODO: Should this be done outside of the established state? if ( !sockerr && recv_wnd != window_possible ) { recv_wnd = window_possible; ScheduleTransmit(); } } return sofar; } ssize_t TCPSocket::send(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags) { // TODO: MSG_MORE (and implement TCP_CORK). if ( flags & ~(MSG_NOSIGNAL) ) // TODO: MSG_OOB, MSG_DONTROUTE. return errno = EINVAL, -1; kthread_mutex_lock(&tcp_lock); ssize_t result = send_unlocked(ctx, buf, count, flags); bool do_schedule_worker = transmit_do_schedule_worker; transmit_do_schedule_worker = false; kthread_mutex_unlock(&tcp_lock); if ( do_schedule_worker ) DoScheduleTransmit(); return result; } ssize_t TCPSocket::sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, int flags) { // TODO: MSG_MORE (and implement TCP_CORK). if ( flags & ~(MSG_NOSIGNAL) ) // TODO: MSG_OOB, MSG_DONTROUTE. return errno = EINVAL, -1; struct msghdr msg; if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) return -1; if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen ) return errno = EINVAL, -1; // TODO: Reject if non-null msg_name, msg_control? size_t iov_size = msg.msg_iovlen * sizeof(struct iovec); struct iovec* iov = new struct iovec[msg.msg_iovlen]; if ( !iov ) return -1; if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_size) ) return delete[] iov, -1; msg.msg_iov = iov; kthread_mutex_lock(&tcp_lock); ssize_t result = 0; for ( int i = 0; i < msg.msg_iovlen && result < SSIZE_MAX; i++ ) { size_t maximum = SSIZE_MAX - (size_t) result; const uint8_t* buf = (const uint8_t*) iov[i].iov_base; size_t count = iov[i].iov_len < maximum ? iov[i].iov_len : maximum; ssize_t amount = send_unlocked(ctx, buf, count, flags); if ( amount < 0 ) { if ( result == 0 ) result = -1; break; } result += amount; if ( (size_t) amount != count ) break; } bool do_schedule_worker = transmit_do_schedule_worker; transmit_do_schedule_worker = false; kthread_mutex_unlock(&tcp_lock); if ( do_schedule_worker ) DoScheduleTransmit(); delete[] iov; return result; } ssize_t TCPSocket::send_unlocked(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags) { // TODO: Implement SIGPIPE/EPIPE if !MSG_NOSIGNAL. // TODO: os-test SIGPIPE/EPIPE if !MSG_NOSIGNAL. (void) flags; if ( sockerr ) return errno = sockerr, -1; // TODO: Proper state check. // TODO: FIN-WAIT-1, FIN-WAIT-2, CLOSING, LAST-ACK, TIME-WAIT. if ( state != TCP_STATE_ESTAB && state != TCP_STATE_CLOSE_WAIT ) return errno = ENOTCONN, -1; size_t sofar = 0; while ( sofar < count ) { // TODO: Also wake on connection termination. while ( outgoing_used == sizeof(outgoing) ) { if ( sofar ) return sofar; if ( sockerr ) return errno = sockerr, -1; if ( ctx->dflags & O_NONBLOCK ) return errno = EWOULDBLOCK; if ( !kthread_cond_wait_signal(&transmit_cond, &tcp_lock) ) return errno = EINTR, -1; } const uint8_t* data = buf + sofar; size_t left = count - sofar; assert(outgoing_offset < sizeof(outgoing)); assert(outgoing_used <= sizeof(outgoing)); size_t available = sizeof(outgoing) - outgoing_used; size_t amount = available < left ? available : left; size_t newat = outgoing_offset + outgoing_used; if ( sizeof(outgoing) <= newat ) newat -= sizeof(outgoing); assert(newat < sizeof(outgoing)); size_t until_end = sizeof(outgoing) - newat; size_t first = until_end < amount ? until_end : amount; size_t second = amount - first; if ( !ctx->copy_from_src(outgoing + newat, data, first) ) return sofar ? sofar : -1; if ( second && !ctx->copy_from_src(outgoing, data + first, second) ) return sofar ? sofar : -1; outgoing_used += amount; assert(outgoing_used <= sizeof(outgoing)); sofar += amount; // TODO: If there's a sent packet that hasn't been acknowledged, and // there isn't a full packet yet, then just buffer and don't // transmit yet. // TODO: TCP_NODELAY, TCP_NOPUSH, MSG_MORE. // TODO: Set PUSH appropriately. ScheduleTransmit(); } return sofar; } ssize_t TCPSocket::read(ioctx_t* ctx, uint8_t* buf, size_t count) { return recv(ctx, buf, count, 0); } ssize_t TCPSocket::write(ioctx_t* ctx, const uint8_t* buf, size_t count) { return send(ctx, buf, count, 0); } short TCPSocket::PollEventStatus() { // TODO: os-test the poll bits. // TODO: OOB poll bits. short status = 0; if ( connecting_ready ) status |= POLLIN | POLLRDNORM; if ( incoming_used ) status |= POLLIN | POLLRDNORM; if ( outgoing_used < sizeof(outgoing) ) status |= POLLOUT | POLLWRNORM; if ( state == TCP_STATE_CLOSE_WAIT || state == TCP_STATE_LAST_ACK || state == TCP_STATE_TIME_WAIT || state == TCP_STATE_CLOSED ) status |= POLLHUP; if ( sockerr ) status |= POLLERR; return status; } int TCPSocket::poll(ioctx_t* /*ctx*/, PollNode* node) { ScopedLock lock(&tcp_lock); short ret_status = PollEventStatus() & node->events; if ( ret_status ) { node->master->revents |= ret_status; return 0; } poll_channel.Register(node); return errno = EAGAIN, -1; } int TCPSocket::getsockopt(ioctx_t* ctx, int level, int option_name, void* option_value, size_t* option_size_ptr) { ScopedLock lock(&tcp_lock); if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE ) { ScopedLock lock(&netifs_lock); const char* ifname = ""; if ( ifindex < netifs_count && netifs[ifindex] ) ifname = netifs[ifindex]->ifinfo.name; size_t option_size; if ( !CopyFromUser(&option_size, option_size_ptr, sizeof(option_size)) ) return -1; size_t len = strlen(ifname); size_t size = len + 1; if ( option_size < size ) return errno = ERANGE; if ( !CopyToUser(option_value, ifname, size) || !CopyToUser(option_size_ptr, &size, sizeof(size)) ) return -1; return 0; } uintmax_t result = 0; if ( level == IPPROTO_TCP ) { switch ( option_name ) { //case TCP_NODELAY: break; // TODO: Transmit if turned on? //case TCP_MAXSEG: break; // TODO: Do I want this initially? //case TCP_NOPUSH: break; // TODO: Do I want this initially? // TODO: TCP_CORK? default: return errno = ENOPROTOOPT, -1; } } else if ( level == SOL_SOCKET ) { switch ( option_name ) { case SO_BINDTOINDEX: result = ifindex; break; case SO_DEBUG: result = 0; break; case SO_DOMAIN: result = af; break; case SO_ERROR: result = sockerr; break; case SO_PROTOCOL: result = IPPROTO_TCP; break; case SO_RCVBUF: result = sizeof(incoming); break; case SO_REUSEADDR: result = reuseaddr; break; case SO_SNDBUF: result = sizeof(outgoing); break; case SO_TYPE: result = SOCK_STREAM; break; // TODO: And document these: // TODO: SO_ACCEPTCONN // TODO: SO_LINGER // TODO: SO_OOBINLINE // TODO: SO_RCVLOWAT // TODO: SO_RCVTIMEO // TODO: SO_SNDLOWAT // TODO: SO_SNDTIMEO // TODO: SO_DONTROUTE // TODO: SO_BROADCAST default: return errno = ENOPROTOOPT, -1; } } else return errno = EINVAL, -1; if ( !sockopt_return_uintmax(result, ctx, option_value, option_size_ptr) ) return -1; return 0; } // TODO: os-test socket options on shut down sockets. POSIX says EINVAL. // TODO: os-test the errno for an invalid protocol. // TODO: os-test the errno for an invalid option at a protocol level. int TCPSocket::setsockopt(ioctx_t* ctx, int level, int option_name, const void* option_value, size_t option_size) { ScopedLock lock(&tcp_lock); if ( level == SOL_SOCKET && option_name == SO_BINDTODEVICE ) { char ifname[IF_NAMESIZE]; if ( sizeof(ifname) < option_size ) option_size = sizeof(ifname); if ( !CopyFromUser(ifname, option_value, option_size) ) return -1; if ( strnlen(ifname, option_size) == sizeof(ifname) ) return errno = ENODEV, -1; ifname[option_size] = '\0'; ScopedLock lock(&netifs_lock); for ( size_t i = 1; i < netifs_count; i++ ) { if ( netifs[i] && !strcmp(ifname, netifs[i]->ifinfo.name) ) { ifindex = i; return 0; } } return errno = ENODEV, -1; } uintmax_t value; if ( !sockopt_fetch_uintmax(&value, ctx, option_value, option_size) ) return -1; if ( level == IPPROTO_TCP ) { switch ( option_name ) { case TCP_NODELAY: break; // TODO: Transmit if turned on? case TCP_MAXSEG: break; // TODO: Do I want this initially? case TCP_NOPUSH: break; // TODO: Do I want this initially? // TODO: TCP_CORK? default: return errno = ENOPROTOOPT, -1; } } else if ( level == SOL_SOCKET ) { switch ( option_name ) { case SO_BINDTOINDEX: if ( UINT_MAX < value ) return errno = EINVAL, -1; ifindex = value; break; case SO_DEBUG: if ( value != 0 ) return errno = EPERM, -1; break; case SO_KEEPALIVE: break; // TODO: Implement this. case SO_REUSEADDR: reuseaddr = value; break; case SO_LINGER: break; // TODO: Implement this. case SO_RCVBUF: break; // TODO: Implement this. case SO_SNDBUF: break; // TODO: Implement this. // TODO: And document these: // TODO: SO_BROADCAST // TODO: SO_DONTROUTE // TODO: SO_LINGER // TODO: SO_RCVLOWAT // TODO: SO_RCVTIMEO // TODO: SO_SNDLOWAT // TODO: SO_SNDTIMEO default: return errno = ENOPROTOOPT, -1; } } else return errno = EINVAL, -1; return 0; } int TCPSocket::shutdown(ioctx_t* /*ctx*/, int how) { kthread_mutex_lock(&tcp_lock); int result = shutdown_unlocked(how); bool do_schedule_worker = transmit_do_schedule_worker; transmit_do_schedule_worker = false; kthread_mutex_unlock(&tcp_lock); if ( do_schedule_worker ) DoScheduleTransmit(); return result; } int TCPSocket::shutdown_unlocked(int how) // tcp_lock taken { // TODO: Proper state check. if ( state != TCP_STATE_ESTAB && state != TCP_STATE_CLOSE_WAIT ) return errno = ENOTCONN, -1; if ( how & SHUT_WR && outgoing_fin == TCP_SPECIAL_NOT ) { if ( state == TCP_STATE_ESTAB || state == TCP_STATE_CLOSE_WAIT ) { outgoing_fin = TCP_SPECIAL_PENDING; if ( state == TCP_STATE_ESTAB ) state = TCP_STATE_FIN_WAIT_1; else state = TCP_STATE_LAST_ACK; ScheduleTransmit(); } else { // TODO: Other states. } } return 0; } int TCPSocket::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr) { ScopedLock lock(&tcp_lock); if ( !remoted || state == TCP_STATE_LISTEN ) return errno = ENOTCONN, -1; size_t addrsize; if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) ) return -1; if ( af == AF_INET ) { if ( sizeof(remote.in) < addrsize ) addrsize = sizeof(remote.in); } else if ( af == AF_INET6 ) { if ( sizeof(remote.in6) < addrsize ) addrsize = sizeof(remote.in6); } else return errno = EAFNOSUPPORT, -1; if ( !ctx->copy_to_dest(addr, &remote, addrsize) ) return -1; if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) ) return -1; return 0; } int TCPSocket::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize_ptr) { ScopedLock lock(&tcp_lock); size_t addrsize; if ( !ctx->copy_from_src(&addrsize, addrsize_ptr, sizeof(addrsize)) ) return -1; if ( af == AF_INET ) { if ( sizeof(local.in) < addrsize ) addrsize = sizeof(local.in); } else if ( af == AF_INET6 ) { if ( sizeof(local.in6) < addrsize ) addrsize = sizeof(local.in6); } else return errno = EAFNOSUPPORT, -1; if ( !ctx->copy_to_dest(addr, &local, addrsize) ) return -1; if ( !ctx->copy_to_dest(addrsize_ptr, &addrsize, sizeof(addrsize)) ) return -1; return 0; } // TODO: os-test fstat on a socket. TCPSocketNode::TCPSocketNode(TCPSocket* socket) { this->socket = socket; Process* process = CurrentProcess(); inode_type = INODE_TYPE_STREAM; dev = (dev_t) this; ino = (ino_t) this; type = S_IFSOCK; kthread_mutex_lock(&process->idlock); stat_uid = process->uid; stat_gid = process->gid; kthread_mutex_unlock(&process->idlock); stat_mode = 0600 | this->type; } TCPSocketNode::~TCPSocketNode() { socket->close(); } Ref TCPSocketNode::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, int flags) { return socket->accept4(ctx, addr, addrsize, flags); } int TCPSocketNode::bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) { return socket->bind(ctx, addr, addrsize); } int TCPSocketNode::connect(ioctx_t* ctx, const uint8_t* addr, size_t addrsize) { return socket->connect(ctx, addr, addrsize); } int TCPSocketNode::listen(ioctx_t* ctx, int backlog) { return socket->listen(ctx, backlog); } ssize_t TCPSocketNode::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags) { return socket->recv(ctx, buf, count, flags); } ssize_t TCPSocketNode::recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags) { return socket->recvmsg(ctx, msg, flags); } ssize_t TCPSocketNode::send(ioctx_t* ctx, const uint8_t* buf, size_t count, int flags) { return socket->send(ctx, buf, count, flags); } ssize_t TCPSocketNode::sendmsg(ioctx_t* ctx, const struct msghdr* msg, int flags) { return socket->sendmsg(ctx, msg, flags); } ssize_t TCPSocketNode::read(ioctx_t* ctx, uint8_t* buf, size_t count) { return socket->read(ctx, buf, count); } ssize_t TCPSocketNode::write(ioctx_t* ctx, const uint8_t* buf, size_t count) { return socket->write(ctx, buf, count); } int TCPSocketNode::poll(ioctx_t* ctx, PollNode* node) { return socket->poll(ctx, node); } int TCPSocketNode::getsockopt(ioctx_t* ctx, int level, int option_name, void* option_value, size_t* option_size_ptr) { return socket->getsockopt(ctx, level, option_name, option_value, option_size_ptr); } int TCPSocketNode::setsockopt(ioctx_t* ctx, int level, int option_name, const void* option_value, size_t option_size) { return socket->setsockopt(ctx, level, option_name, option_value, option_size); } int TCPSocketNode::shutdown(ioctx_t* ctx, int how) { return socket->shutdown(ctx, how); } int TCPSocketNode::getpeername(ioctx_t* ctx, uint8_t* addr, size_t* addrsize) { return socket->getpeername(ctx, addr, addrsize); } int TCPSocketNode::getsockname(ioctx_t* ctx, uint8_t* addr, size_t* addrsize) { return socket->getsockname(ctx, addr, addrsize); } void HandleIPv4(Ref pkt, const struct in_addr* src, const struct in_addr* dst, bool dst_broadcast) { // TODO: What if src is the any address? if ( dst_broadcast ) return; const unsigned char* in = pkt->from + pkt->offset; size_t inlen = pkt->length - pkt->offset; struct tcphdr hdr; if ( inlen < sizeof(hdr) ) return; if ( UINT16_MAX < inlen ) return; memcpy(&hdr, in, sizeof(hdr)); hdr.th_sport = be16toh(hdr.th_sport); hdr.th_dport = be16toh(hdr.th_dport); hdr.th_sum = be16toh(hdr.th_sum); uint16_t sum = 0; sum = IPv4::ipsum_buf(sum, src, sizeof(struct in_addr)); sum = IPv4::ipsum_buf(sum, dst, sizeof(struct in_addr)); sum = IPv4::ipsum_word(sum, IPPROTO_TCP); sum = IPv4::ipsum_word(sum, inlen); sum = IPv4::ipsum_buf(sum, in, inlen); if ( sum != 0 && sum != 0xFFFF ) return; if ( TCP_OFFSET_DECODE(hdr.th_offset) < sizeof(hdr) / 4 ) return; // TODO: This is the only use of TCP_OFFSET_DECODE. That seems wrong. // TODO: Reject if src is the any address. Right? Is this the best place to // handle it? What about udp and ping? They'll want to receive from // any, but they don't want to respond except by broadcast, right? // Port 0 is not valid. if ( hdr.th_sport == 0 || hdr.th_dport == 0 ) return; // TODO: TCP options. Respect TCPOPT_MAXSEG. TCPSocket* socket = NULL; TCPSocket* socket_listener = NULL; TCPSocket* any_socket = NULL; TCPSocket* any_socket_listener = NULL; ScopedLock lock(&tcp_lock); for ( TCPSocket* iter = bindings_v4[hdr.th_dport]; !socket && iter; iter = iter->next_socket ) { // TODO: If a TCP socket is bound, and then connected to, what happens? // What if the TCP socket then connects to the other side? if ( !iter->remoted ) continue; // The datagram was sent to the socket's local address. if ( !memcmp(&iter->local.in.sin_addr, dst, sizeof(*dst)) ) { // The first priority is to receive on a socket with the correct // local address and the correct remote address. if ( !memcmp(&iter->remote.in.sin_addr, src, sizeof(*src)) && be16toh(iter->remote.in.sin_port) == hdr.th_sport ) socket = iter; // The second priority is to receive on a socket with the correct // local address and listening for connections from any address. else if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) ) socket_listener = iter; } // The socket is bound to the any address. if ( iter->local.in.sin_addr.s_addr == htobe32(INADDR_ANY) ) { // The third priority is to receive on a socket bound to the any // address and the correct remote address. // local address and correct remote address. // TODO: Can a socket bound to the any address be connected? Is the // any_socket case possible, and desirable or harmful? if ( !memcmp(&iter->remote.in.sin_addr, src, sizeof(*src)) && be16toh(iter->remote.in.sin_port) == hdr.th_sport ) any_socket = iter; // The fourth priority is to receive on a socket bound to the any // address and listening for connections from any address. else if ( iter->remote.in.sin_addr.s_addr == htobe32(INADDR_ANY) ) any_socket_listener = iter; } } if ( !socket ) socket = socket_listener; if ( !socket ) socket = any_socket; if ( !socket ) socket = any_socket_listener; // No socket wanted to receive the packet. if ( !socket ) { // TODO: Send RST. return; } // If the socket is bound to a network interface, require the packet to // have been received on that network interface. if ( socket->ifindex && socket->ifindex != pkt->netif->ifinfo.linkid ) { // TODO: Send RST. return; } union tcp_sockaddr pkt_src; pkt_src.in.sin_family = AF_INET; pkt_src.in.sin_addr = *src; pkt_src.in.sin_port = htobe16(hdr.th_sport); union tcp_sockaddr pkt_dst; pkt_dst.in.sin_family = AF_INET; pkt_dst.in.sin_addr = *dst; pkt_dst.in.sin_port = htobe16(hdr.th_dport); // Receive the packet on the socket. socket->ReceivePacket(pkt, &pkt_src, &pkt_dst); // Destroy the socket if needed or schedule a transmit if needed. if ( socket->needs_destruction() ) { socket->destroy(); delete socket; } else if ( socket->transmit_do_schedule_worker ) { socket->transmit_do_schedule_worker = false; lock.Reset(); socket->DoScheduleTransmit(); } } Ref Socket(int af) { if ( !IsSupportedAddressFamily(af) ) return errno = EAFNOSUPPORT, Ref(NULL); TCPSocket* socket = new TCPSocket(af); if ( !socket ) return Ref(); Ref result(new TCPSocketNode(socket)); if ( !result ) { // destroy() is not called because there is no state that needs to be // cleaned up that requires the tcp_lock to be taken. return delete socket, Ref(); } return result; } } // namespace TCP } // namespace Sortix