diff --git a/Makefile b/Makefile index c6520223..895c366c 100644 --- a/Makefile +++ b/Makefile @@ -177,7 +177,7 @@ sysroot-system: sysroot-fsh sysroot-base-headers echo 'ID=sortix' && \ echo 'VERSION_ID="$(VERSION)"' && \ echo 'PRETTY_NAME="Sortix $(VERSION)"' && \ - echo 'SORTIX_ABI=1.1' && \ + echo 'SORTIX_ABI=1.2' && \ true) > "$(SYSROOT)/etc/sortix-release" echo /etc/sortix-release >> "$(SYSROOT)/tix/manifest/system" ln -sf sortix-release "$(SYSROOT)/etc/os-release" diff --git a/kernel/descriptor.cpp b/kernel/descriptor.cpp index f7f8791a..681afbf6 100644 --- a/kernel/descriptor.cpp +++ b/kernel/descriptor.cpp @@ -232,6 +232,16 @@ bool Descriptor::IsSeekable() return seekable; } +bool Descriptor::pass() +{ + return vnode->pass(); +} + +void Descriptor::unpass() +{ + vnode->unpass(); +} + int Descriptor::sync(ioctx_t* ctx) { // TODO: Possible denial-of-service attack if someone opens the file without diff --git a/kernel/fs/user.cpp b/kernel/fs/user.cpp index fd7f866e..68938ca4 100644 --- a/kernel/fs/user.cpp +++ b/kernel/fs/user.cpp @@ -195,6 +195,8 @@ class Unode : public Inode public: Unode(Ref server, ino_t ino, mode_t type); virtual ~Unode(); + virtual bool pass(); + virtual void unpass(); virtual void linked(); virtual void unlinked(); virtual int sync(ioctx_t* ctx); @@ -766,6 +768,15 @@ void Unode::UnexpectedResponse(Channel* channel, struct fsm_msg_header* hdr) errno = EIO; } +bool Unode::pass() +{ + return true; +} + +void Unode::unpass() +{ +} + void Unode::linked() { } diff --git a/kernel/include/sortix/kernel/descriptor.h b/kernel/include/sortix/kernel/descriptor.h index facc2102..f99a17a0 100644 --- a/kernel/include/sortix/kernel/descriptor.h +++ b/kernel/include/sortix/kernel/descriptor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2013, 2014, 2015, 2016, 2017 Jonas 'Sortie' Termansen. + * Copyright (c) 2012-2017, 2021 Jonas 'Sortie' Termansen. 
* * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -58,6 +58,8 @@ public: Ref Fork(); bool SetFlags(int new_dflags); int GetFlags(); + bool pass(); + void unpass(); int sync(ioctx_t* ctx); int stat(ioctx_t* ctx, struct stat* st); int statvfs(ioctx_t* ctx, struct statvfs* stvfs); diff --git a/kernel/include/sortix/kernel/inode.h b/kernel/include/sortix/kernel/inode.h index 250f9090..55dd9ce4 100644 --- a/kernel/include/sortix/kernel/inode.h +++ b/kernel/include/sortix/kernel/inode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2013, 2014, 2015, 2016, 2017 Jonas 'Sortie' Termansen. + * Copyright (c) 2012-2017, 2021 Jonas 'Sortie' Termansen. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -54,6 +54,8 @@ public: /* These must never change after construction and is read-only. */ public: virtual ~Inode() { } + virtual bool pass() = 0; + virtual void unpass() = 0; virtual void linked() = 0; virtual void unlinked() = 0; virtual int sync(ioctx_t* ctx) = 0; @@ -165,6 +167,8 @@ protected: public: AbstractInode(); virtual ~AbstractInode(); + virtual bool pass(); + virtual void unpass(); virtual void linked(); virtual void unlinked(); virtual int sync(ioctx_t* ctx); diff --git a/kernel/include/sortix/kernel/pipe.h b/kernel/include/sortix/kernel/pipe.h index effd53da..54c385cf 100644 --- a/kernel/include/sortix/kernel/pipe.h +++ b/kernel/include/sortix/kernel/pipe.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, 2012, 2013, 2014, 2017 Jonas 'Sortie' Termansen. + * Copyright (c) 2011, 2012, 2013, 2014, 2017, 2021 Jonas 'Sortie' Termansen. 
* * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -43,6 +43,8 @@ public: bool SetSIGPIPEDelivery(bool deliver_sigpipe); size_t Size(); bool Resize(size_t new_size); + bool pass(); + void unpass(); ssize_t readv(ioctx_t* ctx, const struct iovec* iov, int iovcnt); ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags); diff --git a/kernel/include/sortix/kernel/refcount.h b/kernel/include/sortix/kernel/refcount.h index 1620c9a8..eebb93fa 100644 --- a/kernel/include/sortix/kernel/refcount.h +++ b/kernel/include/sortix/kernel/refcount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2013 Jonas 'Sortie' Termansen. + * Copyright (c) 2012, 2013, 2014, 2017 Jonas 'Sortie' Termansen. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -124,6 +124,21 @@ public: size_t Refcount() const { return obj ? obj->Refcount : 0; } bool IsUnique() const { return obj->IsUnique(); } + // Leak a reference and allow recreating it later from an integer. + uintptr_t Export() + { + if ( obj ) + obj->Refer_Renamed(); + return (uintptr_t) obj; + } + + // Restore a leaked reference from an integer. + void Import(uintptr_t ptr) + { + Reset(); + obj = (T*) ptr; + } + private: T* obj; diff --git a/kernel/include/sortix/kernel/vnode.h b/kernel/include/sortix/kernel/vnode.h index 0940ffad..cae8d505 100644 --- a/kernel/include/sortix/kernel/vnode.h +++ b/kernel/include/sortix/kernel/vnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2013, 2014, 2015, 2016, 2017 Jonas 'Sortie' Termansen. + * Copyright (c) 2012-2017, 2021 Jonas 'Sortie' Termansen. 
* * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -55,6 +55,8 @@ public: /* These must never change after construction and is read-only. */ public: Vnode(Ref inode, Ref mountedat, ino_t rootino, dev_t rootdev); virtual ~Vnode(); + bool pass(); + void unpass(); int sync(ioctx_t* ctx); int stat(ioctx_t* ctx, struct stat* st); int statvfs(ioctx_t* ctx, struct statvfs* stvfs); diff --git a/kernel/include/sortix/mode.h b/kernel/include/sortix/mode.h index 0c26396d..dc33020b 100644 --- a/kernel/include/sortix/mode.h +++ b/kernel/include/sortix/mode.h @@ -61,6 +61,8 @@ #define S_IFFACTORY 0x10000 /* Don't run the factory method if simply stat'ing the inode. */ #define S_IFFACTORY_NOSTAT 0x20000 +/* The file object must never be wrapped in another file object. */ +#define S_IFNEVERWRAP 0x40000 #endif #endif diff --git a/kernel/inode.cpp b/kernel/inode.cpp index 38d6fe78..6fabed46 100644 --- a/kernel/inode.cpp +++ b/kernel/inode.cpp @@ -62,6 +62,15 @@ AbstractInode::~AbstractInode() { } +bool AbstractInode::pass() +{ + return true; +} + +void AbstractInode::unpass() +{ +} + void AbstractInode::linked() { InterlockedIncrement(&stat_nlink); diff --git a/kernel/io.cpp b/kernel/io.cpp index ed40713c..77a50233 100644 --- a/kernel/io.cpp +++ b/kernel/io.cpp @@ -836,6 +836,8 @@ int sys_mkpartition(int fd, off_t start, off_t length, int flags) Ref inner_inode = desc->vnode->inode; desc.Reset(); + if ( inner_inode->type & S_IFNEVERWRAP ) + return errno = EPERM, -1; if ( !S_ISBLK(inner_inode->type) && !S_ISREG(inner_inode->type) ) return errno = EPERM, -1; if ( start < 0 || length < 0 ) diff --git a/kernel/net/fs.cpp b/kernel/net/fs.cpp index 2e913585..73f4c22a 100644 --- a/kernel/net/fs.cpp +++ b/kernel/net/fs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2014, 2016, 2017 Jonas 'Sortie' Termansen. + * Copyright (c) 2013, 2014, 2016, 2017, 2021 Jonas 'Sortie' Termansen. 
* * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -83,6 +83,8 @@ class StreamSocket : public AbstractInode public: StreamSocket(uid_t owner, gid_t group, mode_t mode, Ref manager); virtual ~StreamSocket(); + virtual bool pass(); + virtual void unpass(); virtual Ref accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, int flags); virtual int bind(ioctx_t* ctx, const uint8_t* addr, size_t addrsize); @@ -163,7 +165,10 @@ StreamSocket::StreamSocket(uid_t owner, gid_t group, mode_t mode, inode_type = INODE_TYPE_STREAM; dev = (dev_t) manager.Get(); ino = (ino_t) this; - this->type = S_IFSOCK; + // Never allow wrapping filesystem sockets as they need to be able to + // recognize themselves when passing filesystems, to prevent reference + // cycle loops. + this->type = S_IFSOCK | S_IFNEVERWRAP; this->stat_uid = owner; this->stat_gid = group; this->stat_mode = (mode & S_SETABLE) | this->type; @@ -191,6 +196,23 @@ StreamSocket::~StreamSocket() free(bound_address); } +bool StreamSocket::pass() +{ + if ( outgoing.pass() ) + { + if ( incoming.pass() ) + return true; + outgoing.unpass(); + } + return false; +} + +void StreamSocket::unpass() +{ + outgoing.unpass(); + incoming.unpass(); +} + Ref StreamSocket::accept4(ioctx_t* ctx, uint8_t* addr, size_t* addrsize, int flags) { diff --git a/kernel/partition.cpp b/kernel/partition.cpp index a2847209..818df86f 100644 --- a/kernel/partition.cpp +++ b/kernel/partition.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2015 Jonas 'Sortie' Termansen. + * Copyright (c) 2013, 2015, 2017 Jonas 'Sortie' Termansen. 
* * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -39,6 +39,7 @@ Partition::Partition(Ref inner_inode, off_t start, off_t length) assert(0 <= start); assert(0 <= length); assert(length <= OFF_MAX - start); + assert(!(inner_inode->type & S_IFNEVERWRAP)); assert(S_ISBLK(inner_inode->type) || S_ISREG(inner_inode->type)); this->dev = (dev_t) this; this->ino = (ino_t) this; diff --git a/kernel/pipe.cpp b/kernel/pipe.cpp index c409c6af..2fbe0519 100644 --- a/kernel/pipe.cpp +++ b/kernel/pipe.cpp @@ -51,10 +51,24 @@ namespace Sortix { +// A segment contains an optional leading ancillary data buffer and then a +// normal data buffer. +struct segment_header +{ + size_t ancillary; + size_t normal; +}; + +// A pipe communication channel in one direction. +// +// The pipe uses a ring buffer containing segments. Each segment is stored as +// its header, followed by an optional ancillary data buffer (cmsg(3)) and a +// normal data buffer. Writing data will append to the last segment, if any, if +// it has a compatible type and isn't finished. 
class PipeChannel { public: - PipeChannel(uint8_t* buffer, size_t buffersize); + PipeChannel(uint8_t* buffer, size_t buffer_size); ~PipeChannel(); void CloseReading(); void CloseWriting(); @@ -64,6 +78,29 @@ public: size_t WriteSize(); bool ReadResize(size_t new_size); bool WriteResize(size_t new_size); + bool WriteBuffer(bool (*copy_from_src)(void*, const void*, size_t), + const void* src_ptr, + size_t amount, + size_t position); + bool ReadBuffer(bool (*copy_to_dest)(void*, const void*, size_t), + void* dest_ptr, + size_t amount, + size_t position); + bool Enqueue(bool (*copy_from_src)(void* dest, const void* src, size_t n), + const void* src, + size_t amount, + bool is_ancillary); + void Unenqueue(bool (*copy_to_dest)(void*, const void*, size_t), + void* dest_ptr, + size_t amount, + size_t saved_last_header_position); + bool Dequeue(bool (*copy_to_dest)(void* dest, const void* src, size_t n), + void* dest, + size_t amount, + bool peek, + size_t peek_offset); + bool pass(); + void unpass(); ssize_t readv(ioctx_t* ctx, const struct iovec* iov, int iovcnt); ssize_t recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags); ssize_t recvmsg(ioctx_t* ctx, struct msghdr* msg, int flags); @@ -82,55 +119,130 @@ private: private: PollChannel read_poll_channel; PollChannel write_poll_channel; - kthread_mutex_t pipelock; - kthread_cond_t readcond; - kthread_cond_t writecond; - uint8_t* buffer; + // Lock protecting the pipe's data structrues. + kthread_mutex_t pipe_lock; + // Lock protecting passing_count and passing_count only. + kthread_mutex_t pass_lock; + // Condition for when readers can wake up. + kthread_cond_t read_cond; + // Condition for when writers can wake up. + kthread_cond_t write_cond; + // The id of a thread that has plegded to write and can be yielded to. uintptr_t sender_system_tid; + // The id of a thread that has plegded to receive and can be yielded to. 
uintptr_t receiver_system_tid; - size_t bufferoffset; - size_t bufferused; - size_t buffersize; + // The ring buffer containing segments. + uint8_t* buffer; + // The offset in the ring buffer where data begins. + size_t buffer_offset; + // The amount of data used after buffer_offset in the ring buffer. + size_t buffer_used; + // The size of the ring buffer. + size_t buffer_size; + // The position of the last header in the ring buffer, a new header is + // required if last_header_position == buffer_used. + size_t last_header_position; + // The buffer size as far as the reader is concerned (the writer decides). size_t pretended_read_buffer_size; - size_t pledged_read; + // This many readers have pledged to write. size_t pledged_write; + // This many writers have pledged to read. + size_t pledged_read; + // How many file descriptors are passed on this socket. + size_t passing_count; + // How many times this socket itself is being passed. + size_t passed_count; + // Atomically incremented on read/write close, number two does the delete. unsigned long closers; - bool anyreading; - bool anywriting; + // Whether anyone still has this channel open for reading. + bool any_reading; + // Whether anyone still has this channel open for writing. + bool any_writing; + // Whether writing with no readers will send SIGPIPE. 
bool is_sigpipe_enabled; }; -PipeChannel::PipeChannel(uint8_t* buffer, size_t buffersize) +PipeChannel::PipeChannel(uint8_t* buffer, size_t buffer_size) { - pipelock = KTHREAD_MUTEX_INITIALIZER; - readcond = KTHREAD_COND_INITIALIZER; - writecond = KTHREAD_COND_INITIALIZER; - this->buffer = buffer; - this->buffersize = buffersize; - bufferoffset = bufferused = 0; - anyreading = anywriting = true; - is_sigpipe_enabled = true; + pipe_lock = KTHREAD_MUTEX_INITIALIZER; + pass_lock = KTHREAD_MUTEX_INITIALIZER; + read_cond = KTHREAD_COND_INITIALIZER; + write_cond = KTHREAD_COND_INITIALIZER; sender_system_tid = 0; receiver_system_tid = 0; - pledged_read = 0; + this->buffer = buffer; + buffer_offset = 0; + buffer_used = 0; + this->buffer_size = buffer_size; + last_header_position = 0; + pretended_read_buffer_size = buffer_size; pledged_write = 0; + pledged_read = 0; + passing_count = 0; + passed_count = 0; closers = 0; + any_reading = true; + any_writing = true; + is_sigpipe_enabled = true; } PipeChannel::~PipeChannel() { + // Drain the ring buffer contents and deference passed file descriptors. 
+ while ( buffer_used ) + { + struct segment_header header; + assert(sizeof(header) <= buffer_used); + ReadBuffer(CopyToKernel, &header, sizeof(header), 0); + while ( 0 < header.ancillary ) + { + struct cmsghdr cmsg; + assert(sizeof(cmsg) <= header.ancillary); + assert(sizeof(header) <= buffer_used); + Dequeue(CopyToKernel, &cmsg, sizeof(cmsg), false, 0); + header.ancillary -= sizeof(cmsg); + size_t data = cmsg.cmsg_len - sizeof(struct cmsghdr); + if ( cmsg.cmsg_level == SOL_SOCKET && cmsg.cmsg_type == SCM_RIGHTS ) + { + size_t fds = data / sizeof(int); + for ( size_t i = 0; i < fds; i++ ) + { + uintptr_t ptr; + assert(sizeof(header) <= buffer_used); + Dequeue(CopyToKernel, &ptr, sizeof(ptr), false, 0); + header.ancillary -= sizeof(ptr); + Ref desc; + desc.Import(ptr); + passing_count--; + } + } + else + { + assert(sizeof(header) <= buffer_used); + Dequeue(NULL, NULL, data, false, 0); + header.ancillary -= data; + } + } + if ( header.normal ) + { + assert(sizeof(header) <= buffer_used); + Dequeue(NULL, NULL, header.normal, false, 0); + header.normal -= header.normal; + } + } + assert(!passed_count && !passing_count); delete[] buffer; } void PipeChannel::CloseReading() { - kthread_mutex_lock(&pipelock); - anyreading = false; - kthread_cond_broadcast(&writecond); + kthread_mutex_lock(&pipe_lock); + any_reading = false; + kthread_cond_broadcast(&write_cond); read_poll_channel.Signal(ReadPollEventStatus()); write_poll_channel.Signal(WritePollEventStatus()); - kthread_mutex_unlock(&pipelock); + kthread_mutex_unlock(&pipe_lock); unsigned long count = InterlockedIncrement(&closers).n; if ( count == 2 ) delete this; @@ -138,19 +250,203 @@ void PipeChannel::CloseReading() void PipeChannel::CloseWriting() { - kthread_mutex_lock(&pipelock); - anywriting = false; - kthread_cond_broadcast(&readcond); + kthread_mutex_lock(&pipe_lock); + any_writing = false; + kthread_cond_broadcast(&read_cond); read_poll_channel.Signal(ReadPollEventStatus()); 
write_poll_channel.Signal(WritePollEventStatus()); - kthread_mutex_unlock(&pipelock); + kthread_mutex_unlock(&pipe_lock); unsigned long count = InterlockedIncrement(&closers).n; if ( count == 2 ) delete this; } -ssize_t PipeChannel::recv(ioctx_t* ctx, uint8_t* buf, size_t count, - int flags) +bool PipeChannel::WriteBuffer(bool (*copy_from_src)(void*, const void*, size_t), + const void* src_ptr, + size_t amount, + size_t position) +{ + size_t write_offset = (buffer_offset + position) % buffer_size; + size_t linear = buffer_size - write_offset; + size_t first = linear < amount ? linear : amount; + const unsigned char* src = (const unsigned char*) src_ptr; + if ( !copy_from_src(buffer + write_offset, src, first) ) + return false; + if ( first < amount && + !copy_from_src(buffer, src + first, amount - first) ) + return false; + return true; +} + +bool PipeChannel::ReadBuffer(bool (*copy_to_dest)(void*, const void*, size_t), + void* dest_ptr, + size_t amount, + size_t position) +{ + size_t offset = (buffer_offset + position) % buffer_size; + size_t linear = buffer_size - offset; + size_t first = linear < amount ? linear : amount; + unsigned char* dest = (unsigned char*) dest_ptr; + if ( !copy_to_dest(dest, buffer + offset, first) ) + return false; + if ( first < amount && + !copy_to_dest(dest + first, buffer, amount - first) ) + return false; + return true; +} + +bool PipeChannel::Enqueue(bool (*copy_from_src)(void*, const void*, size_t), + const void* src_ptr, + size_t amount, + bool is_ancillary) +{ + struct segment_header header = { 0, 0 }; + assert(last_header_position <= buffer_used); + // Try to unify with a previous segment header if any or make a new one. + bool has_header = buffer_used - last_header_position; + if ( has_header ) + ReadBuffer(CopyToKernel, &header, sizeof(header), last_header_position); + if ( is_ancillary ) + { + // Ancillary data must be at the start of the segment. 
+ if ( header.normal ) + { + last_header_position = buffer_used; + assert(last_header_position <= buffer_used); + has_header = false; + memset(&header, 0, sizeof(header)); + } + header.ancillary += amount; + } + else + header.normal += amount; + assert(has_header || sizeof(header) <= buffer_size - buffer_used); + size_t header_size = has_header ? 0 : sizeof(header); + size_t position = buffer_used + header_size; + assert(amount <= buffer_size - position); + if ( !WriteBuffer(copy_from_src, src_ptr, amount, position) ) + return false; + WriteBuffer(CopyFromKernel, &header, sizeof(header), last_header_position); + buffer_used += amount + header_size; + kthread_cond_broadcast(&read_cond); + read_poll_channel.Signal(ReadPollEventStatus()); + write_poll_channel.Signal(WritePollEventStatus()); + return true; +} + +void PipeChannel::Unenqueue(bool (*copy_to_dest)(void*, const void*, size_t), + void* dest_ptr, + size_t amount, + size_t saved_last_header_position) +{ + struct segment_header header; + assert(sizeof(header) <= buffer_used); + assert(last_header_position <= buffer_used); + // Remove the data from the last segment header. + ReadBuffer(CopyToKernel, &header, sizeof(header), last_header_position); + if ( header.ancillary ) + { + assert(amount <= header.ancillary); + header.ancillary -= amount; + } + else + { + assert(amount <= header.normal); + header.normal -= amount; + } + assert(amount <= buffer_used - sizeof(header)); + if ( copy_to_dest ) + ReadBuffer(CopyToKernel, dest_ptr, amount, buffer_used - amount); + buffer_used -= amount; + // Remove the last segment header if it becomes empty. 
+ if ( header.ancillary || header.normal ) + WriteBuffer(CopyFromKernel, &header, sizeof(header), + last_header_position); + else + { + buffer_used -= sizeof(header); + last_header_position = saved_last_header_position; + } + kthread_cond_broadcast(&write_cond); + read_poll_channel.Signal(ReadPollEventStatus()); + write_poll_channel.Signal(WritePollEventStatus()); +} + +bool PipeChannel::Dequeue(bool (*copy_to_dest)(void*, const void*, size_t), + void* dest_ptr, + size_t amount, + bool peek, + size_t peek_offset) +{ + assert(peek || !peek_offset); + struct segment_header header; + assert(sizeof(header) <= buffer_used); + assert(last_header_position <= buffer_used); + assert(peek_offset <= buffer_used - sizeof(header)); + if ( copy_to_dest && + !ReadBuffer(copy_to_dest, dest_ptr, amount, + sizeof(header) + peek_offset) ) + return false; + if ( !peek ) + { + // Remove the data from the segment. + ReadBuffer(CopyToKernel, &header, sizeof(header), 0); + if ( header.ancillary ) + { + assert(amount <= header.ancillary); + header.ancillary -= amount; + } + else + { + assert(amount <= header.normal); + header.normal -= amount; + } + // Shift the segment header. + if ( header.ancillary || header.normal ) + WriteBuffer(CopyFromKernel, &header, sizeof(header), amount); + else + amount += sizeof(header); + buffer_offset = (buffer_offset + amount) % buffer_size; + buffer_used -= amount; + if ( amount <= last_header_position ) + last_header_position -= amount; + else + last_header_position = 0; + kthread_cond_broadcast(&write_cond); + read_poll_channel.Signal(ReadPollEventStatus()); + write_poll_channel.Signal(WritePollEventStatus()); + // Realign the buffer if it becomes empty. + if ( !buffer_used ) + { + buffer_offset = 0; + last_header_position = 0; + } + } + return true; +} + +bool PipeChannel::pass() +{ + // The Unix socket is being passed on another socket. 
+ ScopedLock lock(&pass_lock); + // If this socket has descriptors passed on it, then refuse to pass it over + // another socket to avoid reference count cycles. + if ( passing_count ) + return false; + passed_count++; + return true; +} + +void PipeChannel::unpass() +{ + // The Unix socket is no longer being passed on another socket. + ScopedLock lock(&pass_lock); + assert(passed_count); + assert(!passing_count); + passed_count--; +} + +ssize_t PipeChannel::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags) { struct iovec iov; memset(&iov, 0, sizeof(iov)); @@ -174,6 +470,7 @@ ssize_t PipeChannel::readv(ioctx_t* ctx, const struct iovec* iov, int iovcnt) ssize_t PipeChannel::recvmsg(ioctx_t* ctx, struct msghdr* msg_ptr, int flags) { + // recvmsg can only be called through Unix sockets and not regular pipes. struct msghdr msg; if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) return -1; @@ -183,30 +480,209 @@ ssize_t PipeChannel::recvmsg(ioctx_t* ctx, struct msghdr* msg_ptr, int flags) struct iovec* iov = new struct iovec[msg.msg_iovlen]; if ( !iov ) return -1; - if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_size) ) + struct iovec* user_iov = msg.msg_iov; + if ( !ctx->copy_from_src(iov, user_iov, iov_size) ) return delete[] iov, -1; msg.msg_iov = iov; - size_t result = recvmsg_internal(ctx, &msg, flags); + ssize_t result = recvmsg_internal(ctx, &msg, flags); + msg.msg_iov = user_iov; delete[] iov; if ( !ctx->copy_to_dest(msg_ptr, &msg, sizeof(msg)) ) return -1; return result; } -ssize_t PipeChannel::recvmsg_internal(ioctx_t* ctx, struct msghdr* msg, +ssize_t PipeChannel::recvmsg_internal(ioctx_t* ctx, + struct msghdr* msg, int flags) { - if ( flags & ~(MSG_PEEK | MSG_WAITALL) ) + msg->msg_flags = 0; + if ( flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC | + MSG_CMSG_CLOFORK) ) + return errno = EINVAL, -1; + if ( SSIZE_MAX < TruncateIOVec(msg->msg_iov, msg->msg_iovlen, SSIZE_MAX) ) return errno = EINVAL, -1; Thread* this_thread = CurrentThread(); 
this_thread->yield_to_tid = sender_system_tid; - ScopedLockSignal lock(&pipelock); + ScopedLockSignal lock(&pipe_lock); if ( !lock.IsAcquired() ) return errno = EINTR, -1; - ssize_t so_far = 0; + receiver_system_tid = this_thread->system_tid; + // Receive the first segment only in the ring buffer and wait for it. + while ( !buffer_used ) + { + // EOF if there are no writers left. + if ( !any_writing ) + { + msg->msg_controllen = 0; + return 0; + } + // If a thread has plegded to send, yield to it. + this_thread->yield_to_tid = sender_system_tid; + if ( pledged_write ) + { + pledged_read++; + kthread_mutex_unlock(&pipe_lock); + kthread_yield(); + kthread_mutex_lock(&pipe_lock); + pledged_read--; + continue; + } + // Wait for data to arrive in the ring buffer. + if ( ctx->dflags & O_NONBLOCK ) + return errno = EWOULDBLOCK, -1; + pledged_read++; + bool interrupted = !kthread_cond_wait_signal(&read_cond, &pipe_lock); + pledged_read--; + if ( interrupted ) + return errno = EINTR, -1; + } + // Peeking iterates through the segment without removing data. + bool peek = flags & MSG_PEEK; size_t peeked = 0; - if ( SSIZE_MAX < TruncateIOVec(msg->msg_iov, msg->msg_iovlen, SSIZE_MAX) ) - return errno = EINVAL, -1; + // The remaining user-space control data (if any), incremented as it is + // being written to, and the final values are used to tell the caller how + // much control data was written. + unsigned char* control = (unsigned char*) msg->msg_control; + size_t control_length = msg->msg_controllen; + // Whether reading the control data failed, and whether it was due to a + // harmless truncation (in which case the system call doesn't fail). + bool failed = false; + bool truncated = false; + // Read the ancillary data, if any, and discard it if the caller didn't + // expect control data or if reading the control the data failed. 
+ struct segment_header header; + assert(sizeof(header) <= buffer_used); + ReadBuffer(CopyToKernel, &header, sizeof(header), 0); + while ( 0 < header.ancillary ) + { + // Read the nested cmsg header and find the control message type. + struct cmsghdr cmsg; + assert(sizeof(cmsg) <= header.ancillary); + Dequeue(CopyToKernel, &cmsg, sizeof(cmsg), peek, peeked); + if ( peek ) + peeked += sizeof(cmsg); + header.ancillary -= sizeof(cmsg); + // Determine how much data the caller gets after any truncation and + // correct the control message header given to the caller. + size_t data = cmsg.cmsg_len - sizeof(struct cmsghdr); + size_t truncated_data = data; + if ( cmsg.cmsg_level == SOL_SOCKET && cmsg.cmsg_type == SCM_RIGHTS ) + truncated_data = (data / sizeof(int)) * sizeof(int); + size_t available_data = + sizeof(cmsg) <= control_length ? control_length - sizeof(cmsg) : 0; + if ( available_data < truncated_data ) + truncated_data = available_data; + cmsg.cmsg_len = sizeof(struct cmsghdr) + truncated_data; + // Copy the control message header to the caller. + if ( sizeof(cmsg) <= control_length && + ctx->copy_to_dest(control, &cmsg, sizeof(cmsg)) ) + { + control += sizeof(cmsg); + control_length -= sizeof(cmsg); + } + else if ( !failed ) + truncated = failed = true; + // Passed file descriptors needs to be unserialized and allocated in the + // process file descriptor table. + if ( cmsg.cmsg_level == SOL_SOCKET && cmsg.cmsg_type == SCM_RIGHTS ) + { + int fdflags = 0; + if ( flags & MSG_CMSG_CLOEXEC ) + fdflags |= FD_CLOEXEC; + if ( flags & MSG_CMSG_CLOFORK ) + fdflags |= FD_CLOFORK; + Process* process = CurrentProcess(); + Ref dtable = process->GetDTable(); + size_t fds = data / sizeof(int); + // Preallocate the needed file descriptor slots so it doesn't fail + // later and it becomes possible to do an all-or-nothing operation. 
+ int reservation = 0; + if ( !failed && + (INT_MAX < fds || !dtable->Reserve(fds, &reservation)) ) + { + errno = EMSGSIZE; + failed = true; + } + for ( size_t i = 0; i < fds; i++ ) + { + uintptr_t ptr; + Dequeue(CopyToKernel, &ptr, sizeof(ptr), peek, peeked); + if ( peek ) + peeked += sizeof(ptr); + header.ancillary -= sizeof(ptr); + Ref desc; + desc.Import(ptr); + if ( peek ) + desc.Export(); + else + { + desc->unpass(); + kthread_mutex_lock(&pass_lock); + passing_count--; + kthread_mutex_unlock(&pass_lock); + } + if ( failed ) + continue; + if ( control_length < sizeof(int) ) + { + truncated = failed = true; + continue; + } + int fd = dtable->Allocate(desc, fdflags, 0, &reservation); + assert(0 <= fd); + if ( !ctx->copy_to_dest(control, &fd, sizeof(fd)) ) + { + // The file descriptor leaks because the caller was faulty. + failed = true; + continue; + } + control += sizeof(fd); + control_length -= sizeof(fd); + } + dtable->Unreserve(&reservation); + } + else + { + // Transfer the control data directly to the caller and truncate it + // if it was too long. + size_t amount = control_length < data ? control_length : data; + if ( failed || + !Dequeue(ctx->copy_to_dest, control, amount, peek, peeked) ) + { + failed = true; + amount = data; + Dequeue(NULL, NULL, amount, peek, peeked); + } + if ( !failed && amount < data ) + failed = truncated = true; + if ( peek ) + peeked += data; + header.ancillary -= data; + } + if ( !failed ) + { + // Zero the alignment padding between the caller's control messages. + size_t misaligned = CMSG_ALIGN(data) - data; + if ( control_length <= misaligned && + ctx->zero_dest(control, misaligned) ) + { + control += misaligned; + control_length -= misaligned; + } + else if ( header.ancillary ) + truncated = failed = true; + } + } + // Fail if the control data couldn't be read and was discarded. It's not an + // error if the control data was simply truncated. 
+ if ( !truncated && failed ) + return -1; + msg->msg_controllen -= control_length; + if ( truncated ) + msg->msg_flags |= MSG_CTRUNC; + // Read the regular data in the segment. + ssize_t so_far = 0; int iov_i = 0; size_t iov_offset = 0; while ( iov_i < msg->msg_iovlen && so_far < SSIZE_MAX ) @@ -224,57 +700,64 @@ ssize_t PipeChannel::recvmsg_internal(ioctx_t* ctx, struct msghdr* msg, continue; } receiver_system_tid = this_thread->system_tid; - while ( anywriting && bufferused <= peeked ) + // If we depleted the segment, try to wait for additional data if a + // writer has committed to writing more data or if MSG_WAITALL wants + // all its data. + while ( !header.normal && any_writing ) { - if ( (flags & MSG_PEEK) && so_far ) + if ( peek ) + return so_far; + // If the segment is empty but the buffer is non-empty, then there + // already is another segment which shouldn't be received yet. + if ( buffer_used ) return so_far; this_thread->yield_to_tid = sender_system_tid; - if ( pledged_read ) + if ( pledged_write ) { - pledged_write++; - kthread_mutex_unlock(&pipelock); + // Yield to the thread that has pledged to write more data. + pledged_read++; + kthread_mutex_unlock(&pipe_lock); kthread_yield(); - kthread_mutex_lock(&pipelock); - pledged_write--; - continue; + kthread_mutex_lock(&pipe_lock); + pledged_read--; } - if ( !(flags & MSG_WAITALL) && so_far ) + else + { + // Wait for the remaining data to arrive if MSG_WAITALL. + if ( !(flags & MSG_WAITALL) && so_far ) + return so_far; + if ( ctx->dflags & O_NONBLOCK ) + return errno = EWOULDBLOCK, -1; + pledged_read++; + bool interrupted = + !kthread_cond_wait_signal(&read_cond, &pipe_lock); + pledged_read--; + if ( interrupted ) + return so_far ? so_far : (errno = EINTR, -1); + } + if ( !buffer_used ) + continue; + // Reread the header as another thread has updated it. 
+ assert(sizeof(header) <= buffer_used); + ReadBuffer(CopyToKernel, &header, sizeof(header), 0); + // Don't cross into another segment with ancillary data. + if ( header.ancillary ) return so_far; - if ( ctx->dflags & O_NONBLOCK ) - return errno = EWOULDBLOCK, -1; - pledged_write++; - bool interrupted = !kthread_cond_wait_signal(&readcond, &pipelock); - pledged_write--; - if ( interrupted ) - return so_far ? so_far : (errno = EINTR, -1); } - size_t used = bufferused - peeked; - if ( !used && !anywriting ) - return so_far; + // EOF if there are no writers left. + if ( !header.normal && !any_writing ) + break; + // Transfer the normal data to the caller. size_t amount = count; - if ( used < amount ) - amount = used; - size_t offset = bufferoffset; - if ( peeked ) - offset = (bufferoffset + peeked) % buffersize; - size_t linear = buffersize - offset; - if ( linear < amount ) - amount = linear; - assert(amount); - if ( !ctx->copy_to_dest(buf, buffer + offset, amount) ) + if ( header.normal < amount ) + amount = header.normal; + if ( !Dequeue(ctx->copy_to_dest, buf, amount, peek, peeked) ) return so_far ? so_far : -1; so_far += amount; - if ( flags & MSG_PEEK ) + if ( peek ) peeked += amount; - else - { - bufferoffset = (bufferoffset + amount) % buffersize; - bufferused -= amount; - kthread_cond_broadcast(&writecond); - read_poll_channel.Signal(ReadPollEventStatus()); - write_poll_channel.Signal(WritePollEventStatus()); - } iov_offset += amount; + header.normal -= amount; if ( iov_offset == iov->iov_len ) { iov_i++; @@ -307,14 +790,18 @@ ssize_t PipeChannel::writev(ioctx_t* ctx, const struct iovec* iov, int iovcnt) return sendmsg_internal(ctx, &msg, 0); } -ssize_t PipeChannel::sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, +ssize_t PipeChannel::sendmsg(ioctx_t* ctx, + const struct msghdr* msg_ptr, int flags) { + // sendmsg can only be called through Unix sockets and not regular pipes. 
struct msghdr msg; if ( !ctx->copy_from_src(&msg, msg_ptr, sizeof(msg)) ) return -1; if ( msg.msg_iovlen < 0 || IOV_MAX < msg.msg_iovlen ) return errno = EINVAL, -1; + if ( msg.msg_name ) + return errno = EISCONN, -1; size_t iov_size = msg.msg_iovlen * sizeof(struct iovec); struct iovec* iov = new struct iovec[msg.msg_iovlen]; if ( !iov ) @@ -322,65 +809,284 @@ ssize_t PipeChannel::sendmsg(ioctx_t* ctx, const struct msghdr* msg_ptr, if ( !ctx->copy_from_src(iov, msg.msg_iov, iov_size) ) return delete[] iov, -1; msg.msg_iov = iov; - size_t result = sendmsg_internal(ctx, &msg, flags); + ssize_t result = sendmsg_internal(ctx, &msg, flags); delete[] iov; return result; } -ssize_t PipeChannel::sendmsg_internal(ioctx_t* ctx, const struct msghdr* msg, +ssize_t PipeChannel::sendmsg_internal(ioctx_t* ctx, + const struct msghdr* msg, int flags) { - if ( flags & ~(MSG_WAITALL | MSG_NOSIGNAL) ) + if ( flags & ~(MSG_NOSIGNAL) ) return errno = EINVAL, -1; + if ( SSIZE_MAX < TruncateIOVec(msg->msg_iov, msg->msg_iovlen, SSIZE_MAX) ) + return errno = EINVAL, -1; + // Measure how much control data buffer space is required to make sure it + // can be sent in a single attempt without unnecessary truncation or + // fragmentation. This is only used as an estimate, if the control data + // changes between the two reads of it, then the caller loses the ability + // to know how much control data was sent. + unsigned char* control_ptr = (unsigned char*) msg->msg_control; + size_t required = 1; // At least one free byte, plus control below. + for ( size_t control_offset = 0; control_offset < msg->msg_controllen; ) + { + // Read the next control message header. 
+ size_t control_left = msg->msg_controllen - control_offset; + struct cmsghdr cmsg; + if ( control_left < sizeof(cmsg) ) + return errno = EINVAL, -1; + unsigned char* cmsg_ptr = control_ptr + control_offset; + if ( !ctx->copy_from_src(&cmsg, cmsg_ptr, sizeof(cmsg)) ) + return -1; + if ( cmsg.cmsg_len < sizeof(struct cmsghdr) || + control_left < cmsg.cmsg_len ) + return errno = EINVAL, -1; + // Determine how much space is needed for the message. + size_t needed = cmsg.cmsg_len; + size_t data_size = cmsg.cmsg_len - sizeof(struct cmsghdr); + if ( cmsg.cmsg_level == SOL_SOCKET && cmsg.cmsg_type == SCM_RIGHTS ) + { + size_t pointers_size; + if ( __builtin_mul_overflow(data_size / sizeof(int), + sizeof(uintptr_t), &pointers_size) || + __builtin_add_overflow(sizeof(struct cmsghdr), pointers_size, + &needed) ) + return errno = EMSGSIZE, -1; + } + if ( __builtin_add_overflow(required, needed, &required) ) + return errno = EMSGSIZE, -1; + control_offset += CMSG_ALIGN(cmsg.cmsg_len); + } Thread* this_thread = CurrentThread(); this_thread->yield_to_tid = receiver_system_tid; - ScopedLockSignal lock(&pipelock); + ScopedLockSignal lock(&pipe_lock); if ( !lock.IsAcquired() ) return errno = EINTR, -1; sender_system_tid = this_thread->system_tid; - if ( SSIZE_MAX < TruncateIOVec(msg->msg_iov, msg->msg_iovlen, SSIZE_MAX) ) - return errno = EINVAL, -1; + bool need_header; + struct segment_header header; + // Wait until we can send all the control data in one attempt plus one byte. + while ( true ) + { + // Send SIGPIPE or fail with EPIPE if there are no readers left. + if ( !any_reading ) + { + if ( is_sigpipe_enabled && !(flags & MSG_NOSIGNAL) ) + CurrentThread()->DeliverSignal(SIGPIPE); + return errno = EPIPE, -1; + } + if ( buffer_size < sizeof(header) ) + return errno = EMSGSIZE, -1; + if ( buffer_size - sizeof(header) < required ) + return errno = EMSGSIZE, -1; + // Check whether there is an existing segment header we can combine. 
+ need_header = true; + if ( last_header_position < buffer_used ) + { + assert(sizeof(header) <= buffer_used - last_header_position); + ReadBuffer(CopyToKernel, &header, sizeof(header), + last_header_position); + need_header = msg->msg_controllen && header.normal; + } + // Check if there's enough buffer space for the request now. + size_t available = buffer_size - buffer_used; + if ( need_header && + sizeof(header) <= available && + required <= available - sizeof(header) ) + break; + if ( !need_header && required <= available ) + break; + // Wait for more ring buffer space to be available. + if ( ctx->dflags & O_NONBLOCK ) + return errno = EWOULDBLOCK, -1; + this_thread->yield_to_tid = receiver_system_tid; + if ( !kthread_cond_wait_signal(&write_cond, &pipe_lock) ) + return errno = EINTR, -1; + } + // Write the control data now that we've taken steps avoid truncation. + for ( size_t control_offset = 0; control_offset < msg->msg_controllen; ) + { + // Read the next control message header. + size_t control_left = msg->msg_controllen - control_offset; + struct cmsghdr cmsg; + if ( control_left < sizeof(cmsg) ) + return errno = EINVAL, -1; + unsigned char* cmsg_ptr = control_ptr + control_offset; + if ( !ctx->copy_from_src(&cmsg, cmsg_ptr, sizeof(cmsg)) ) + return -1; + if ( cmsg.cmsg_len < sizeof(struct cmsghdr) || + control_left < cmsg.cmsg_len ) + return errno = EINVAL, -1; + // Determine how much space is needed for the message. 
+ size_t needed = cmsg.cmsg_len; + size_t data_size = cmsg.cmsg_len - sizeof(struct cmsghdr); + if ( cmsg.cmsg_level == SOL_SOCKET && cmsg.cmsg_type == SCM_RIGHTS ) + { + size_t pointers_size; + if ( __builtin_mul_overflow(data_size / sizeof(int), + sizeof(uintptr_t), &pointers_size) || + __builtin_add_overflow(sizeof(struct cmsghdr), pointers_size, + &needed) ) + return errno = EMSGSIZE, -1; + } + else + return errno = EINVAL, -1; + // Reject the control message if the ancillary data changed midway and + // more buffer space is required than was provisioned above. + if ( need_header && + __builtin_add_overflow(needed, sizeof(header), &needed) ) + return errno = EMSGSIZE, -1; + if ( buffer_size - buffer_used < needed ) + return errno = EMSGSIZE, -1; + // Take into account potentially needing a segment header. + size_t saved_last_header_position = last_header_position; + Enqueue(CopyFromKernel, &cmsg, sizeof(cmsg), true); + need_header = false; + unsigned char* data_ptr = control_ptr + control_offset + sizeof(cmsg); + bool failed = false; + // File descriptors needs to be serialized as a raw pointer to the + // underlying object which may be larger than an int. + if ( cmsg.cmsg_level == SOL_SOCKET && cmsg.cmsg_type == SCM_RIGHTS ) + { + Process* process = CurrentProcess(); + Ref dtable = process->GetDTable(); + size_t fds = data_size / sizeof(int); + for ( size_t i = 0; i < fds; i++ ) + { + unsigned char* buf = data_ptr + sizeof(int) * i; + int fd; + if ( !(failed = !ctx->copy_from_src(&fd, buf, sizeof(fd))) ) + { + Ref desc = dtable->Get(fd); + if ( !(failed = !desc) ) + { + // File descriptors are reference counted and there must + // never any reference count cycles, which can happen if + // a socket is sent on a socket. Additionally a tall + // tower of reference counted objects containing + // reference counted objects can overflow the stack in + // the destructors. 
Other operating systems have a full + // garbage collection system to avoid these problems, + // but here these properties are protected with three + // strict rules. + kthread_mutex_lock(&pass_lock); + // 1. Sockets themselves being sent cannot be sent on. + if ( passed_count ) + failed = true; + // 2. Sockets cannot be sent on themselves (either + // endpoint). Prevent it by marking themselves as + // being sent on before asking if the descriptor is + // not being sent on. + passing_count++; + kthread_mutex_unlock(&pass_lock); + // 3. Sockets cannot send another socket being sent on. + if ( failed || (failed = !desc->pass()) ) + { + errno = EPERM; + kthread_mutex_lock(&pass_lock); + passing_count--; + kthread_mutex_unlock(&pass_lock); + } + else + { + // Pass file descriptors as a leaked reference to + // the underlying reference counted descriptor. + uintptr_t ptr = desc.Export(); + Enqueue(CopyFromKernel, &ptr, sizeof(ptr), true); + } + } + } + // If any file descriptors couldn't be sent, undo the entire + // control message, so the caller knows that either none of them + // got sent or all of them. + if ( failed ) + { + for ( ; i; i-- ) + { + uintptr_t ptr; + Unenqueue(CopyToKernel, &ptr, sizeof(ptr), + saved_last_header_position); + Ref desc; + desc.Import(ptr); + desc->unpass(); + kthread_mutex_lock(&pass_lock); + passing_count--; + kthread_mutex_unlock(&pass_lock); + } + break; + } + } + } + else + { + if ( !Enqueue(ctx->copy_from_src, data_ptr, data_size, true) ) + failed = true; + } + // Undo the control message header if the control message data couldn't + // be sent. + if ( failed ) + { + Unenqueue(NULL, NULL, sizeof(cmsg), saved_last_header_position); + return -1; + } + control_offset += CMSG_ALIGN(cmsg.cmsg_len); + } + // Write the normal data to the ring buffer. 
ssize_t so_far = 0; int iov_i = 0; size_t iov_offset = 0; while ( iov_i < msg->msg_iovlen && so_far < SSIZE_MAX ) { - size_t maxcount = SSIZE_MAX - so_far; + size_t max_count = SSIZE_MAX - so_far; struct iovec* iov = &msg->msg_iov[iov_i]; const uint8_t* buf = (const uint8_t*) iov->iov_base + iov_offset; size_t count = iov->iov_len - iov_offset; - if ( maxcount < count ) - count = maxcount; + if ( max_count < count ) + count = max_count; if ( count == 0 ) { iov_i++; iov_offset = 0; continue; } + // Handle when the buffer space is exhausted. sender_system_tid = this_thread->system_tid; - while ( anyreading && bufferused == buffersize ) + size_t overhead = need_header ? sizeof(header) : 0; + while ( any_reading && buffer_size - buffer_used <= overhead ) { + // Yield to the thread that pledged to read more data. this_thread->yield_to_tid = receiver_system_tid; - if ( pledged_write ) + if ( pledged_read ) { - pledged_read++; - kthread_mutex_unlock(&pipelock); + pledged_write++; + kthread_mutex_unlock(&pipe_lock); kthread_yield(); - kthread_mutex_lock(&pipelock); - pledged_read--; - continue; + kthread_mutex_lock(&pipe_lock); + pledged_write--; } - if ( so_far && !(flags & MSG_WAITALL) ) - return so_far; - if ( ctx->dflags & O_NONBLOCK ) - return errno = EWOULDBLOCK, -1; - pledged_read++; - bool interrupted = !kthread_cond_wait_signal(&writecond, &pipelock); - pledged_read--; - if ( interrupted ) - return errno = EINTR, -1; + else + { + // If the estimated of the required buffer space was accurate, + // it will always have been possible to write at least one byte + // of data. + if ( so_far ) + return so_far; + // Wait for more buffer space to become available. + if ( ctx->dflags & O_NONBLOCK ) + return so_far ? so_far : errno = EWOULDBLOCK, -1; + pledged_write++; + bool interrupted = + !kthread_cond_wait_signal(&write_cond, &pipe_lock); + pledged_write--; + if ( interrupted ) + return so_far ? 
so_far : errno = EINTR, -1; + } + need_header = last_header_position == buffer_used; + overhead = need_header ? sizeof(header) : 0; } - if ( !anyreading ) + // Send SIGPIPE or fail with EPIPE if there are no readers left. + if ( !any_reading ) { if ( so_far ) return so_far; @@ -388,21 +1094,14 @@ ssize_t PipeChannel::sendmsg_internal(ioctx_t* ctx, const struct msghdr* msg, CurrentThread()->DeliverSignal(SIGPIPE); return errno = EPIPE, -1; } + // Write the normal data to the ring buffer. + size_t available = buffer_size - buffer_used - overhead; size_t amount = count; - if ( buffersize - bufferused < amount ) - amount = buffersize - bufferused; - size_t writeoffset = (bufferoffset + bufferused) % buffersize; - size_t linear = buffersize - writeoffset; - if ( linear < amount ) - amount = linear; - assert(amount); - if ( !ctx->copy_from_src(buffer + writeoffset, buf, amount) ) + if ( available < amount ) + amount = available; + if ( !Enqueue(ctx->copy_from_src, buf, amount, false) ) return so_far ? so_far : -1; - bufferused += amount; so_far += amount; - kthread_cond_broadcast(&readcond); - read_poll_channel.Signal(ReadPollEventStatus()); - write_poll_channel.Signal(WritePollEventStatus()); iov_offset += amount; if ( iov_offset == iov->iov_len ) { @@ -416,9 +1115,9 @@ ssize_t PipeChannel::sendmsg_internal(ioctx_t* ctx, const struct msghdr* msg, short PipeChannel::ReadPollEventStatus() { short status = 0; - if ( !anywriting && !bufferused ) + if ( !any_writing && !buffer_used ) status |= POLLHUP; - if ( bufferused ) + if ( buffer_used ) status |= POLLIN | POLLRDNORM; return status; } @@ -426,16 +1125,18 @@ short PipeChannel::ReadPollEventStatus() short PipeChannel::WritePollEventStatus() { short status = 0; - if ( !anyreading ) + if ( !any_reading ) status |= POLLERR; - if ( anyreading && bufferused != buffersize ) + bool need_header = last_header_position == buffer_size; + size_t needed_space = need_header ? 
sizeof(segment_header) : 0; + if ( any_reading && needed_space < buffer_size - buffer_used ) status |= POLLOUT | POLLWRNORM; return status; } int PipeChannel::read_poll(ioctx_t* /*ctx*/, PollNode* node) { - ScopedLock lock(&pipelock); + ScopedLock lock(&pipe_lock); short ret_status = ReadPollEventStatus() & node->events; if ( ret_status ) return node->master->revents |= ret_status, 0; @@ -445,7 +1146,7 @@ int PipeChannel::read_poll(ioctx_t* /*ctx*/, PollNode* node) int PipeChannel::write_poll(ioctx_t* /*ctx*/, PollNode* node) { - ScopedLock lock(&pipelock); + ScopedLock lock(&pipe_lock); short ret_status = WritePollEventStatus() & node->events; if ( ret_status ) return node->master->revents |= ret_status, 0; @@ -455,62 +1156,68 @@ int PipeChannel::write_poll(ioctx_t* /*ctx*/, PollNode* node) bool PipeChannel::GetSIGPIPEDelivery() { - ScopedLock lock(&pipelock); + ScopedLock lock(&pipe_lock); return is_sigpipe_enabled; } void PipeChannel::SetSIGPIPEDelivery(bool deliver_sigpipe) { - ScopedLock lock(&pipelock); + ScopedLock lock(&pipe_lock); is_sigpipe_enabled = deliver_sigpipe; } size_t PipeChannel::ReadSize() { - ScopedLock lock(&pipelock); + ScopedLock lock(&pipe_lock); return pretended_read_buffer_size; } size_t PipeChannel::WriteSize() { - ScopedLock lock(&pipelock); - return buffersize; + ScopedLock lock(&pipe_lock); + return buffer_size; } bool PipeChannel::ReadResize(size_t new_size) { - ScopedLock lock(&pipelock); + ScopedLock lock(&pipe_lock); if ( !new_size ) return errno = EINVAL, false; - // The read and write end share the same buffer, so let the write end decide - // how big a buffer it wants and pretend the read end can decide too. + // The reading and writing ends share the same buffer, so let the writing + // end decide how big a buffer it wants and pretend the reading end can + // decide too. 
pretended_read_buffer_size = new_size; return true; } bool PipeChannel::WriteResize(size_t new_size) { - ScopedLock lock(&pipelock); + ScopedLock lock(&pipe_lock); if ( !new_size ) return errno = EINVAL, false; - size_t MAX_PIPE_SIZE = 2 * 1024 * 1024; - if ( MAX_PIPE_SIZE < new_size ) - new_size = MAX_PIPE_SIZE; + size_t max_pipe_size = 2 * 1024 * 1024; + if ( max_pipe_size < new_size ) + new_size = max_pipe_size; + + size_t min_pipe_size = sizeof(segment_header) + 1; + if ( new_size < min_pipe_size ) + new_size = min_pipe_size; // Refuse to lose data if the the new size would cause truncation. - if ( new_size < bufferused ) - new_size = bufferused; + if ( new_size < buffer_used ) + new_size = buffer_used; uint8_t* new_buffer = new uint8_t[new_size]; if ( !new_buffer ) return false; - for ( size_t i = 0; i < bufferused; i++ ) - new_buffer[i] = buffer[(bufferoffset + i) % buffersize]; + for ( size_t i = 0; i < buffer_used; i++ ) + new_buffer[i] = buffer[(buffer_offset + i) % buffer_size]; delete[] buffer; buffer = new_buffer; - buffersize = new_size; + buffer_size = new_size; + buffer_offset = 0; return true; } @@ -556,6 +1263,17 @@ void PipeEndpoint::Disconnect() channel = NULL; } +bool PipeEndpoint::pass() +{ + return channel ? channel->pass() : false; +} + +void PipeEndpoint::unpass() +{ + if ( channel ) + channel->unpass(); +} + ssize_t PipeEndpoint::recv(ioctx_t* ctx, uint8_t* buf, size_t count, int flags) { if ( !reading ) @@ -687,9 +1405,15 @@ class PipeNode : public AbstractInode public: PipeNode(dev_t dev, uid_t owner, gid_t group, mode_t mode); virtual ~PipeNode(); + virtual bool pass(); + virtual void unpass(); virtual ssize_t readv(ioctx_t* ctx, const struct iovec* iov, int iovcnt); virtual ssize_t writev(ioctx_t* ctx, const struct iovec* iov, int iovcnt); virtual int poll(ioctx_t* ctx, PollNode* node); + // Pipes must not provide sendmsg/recvmsg that can do file descriptor + // passing. 
S_IFNEVERWRAP in type must be set if this was to be supported, + // and the kernel would need to be audited for the assumption that only + // filesystem sockets can do file descriptor passing. public: bool Connect(PipeNode* destination); @@ -720,6 +1444,16 @@ PipeNode::~PipeNode() { } +bool PipeNode::pass() +{ + return endpoint.pass(); +} + +void PipeNode::unpass() +{ + endpoint.unpass(); +} + ssize_t PipeNode::readv(ioctx_t* ctx, const struct iovec* iov, int iovcnt) { return endpoint.readv(ctx, iov, iovcnt); diff --git a/kernel/vnode.cpp b/kernel/vnode.cpp index 27ee4586..613a9d84 100644 --- a/kernel/vnode.cpp +++ b/kernel/vnode.cpp @@ -216,6 +216,16 @@ int Vnode::unmount(ioctx_t* ctx, const char* filename, int flags) return 0; } +bool Vnode::pass() +{ + return inode->pass(); +} + +void Vnode::unpass() +{ + inode->unpass(); +} + int Vnode::sync(ioctx_t* ctx) { return inode->sync(ctx); diff --git a/libc/include/sys/socket.h b/libc/include/sys/socket.h index b78a45ab..cfb67ce2 100644 --- a/libc/include/sys/socket.h +++ b/libc/include/sys/socket.h @@ -97,9 +97,21 @@ struct cmsghdr #define SCM_RIGHTS 1 -/* TODO: CMSG_DATA(cmsg) */ -/* TODO: CMSG_NXTHDR(cmsg) */ -/* TODO: CMSH_FIRSTHDR(cmsg) */ +#define CMSG_ALIGN(value) \ + (-(-(size_t)(value) & ~(__alignof__(struct cmsghdr) - 1))) +#define CMSG_SPACE(size) (sizeof(struct cmsghdr) + CMSG_ALIGN(size)) +#define CMSG_LEN(size) (sizeof(struct cmsghdr) + (size)) +#define CMSG_DATA(cmsg) ((unsigned char*) ((struct cmsghdr*) (cmsg) + 1)) +#define CMSG_FIRSTHDR(mhdr) \ + ((mhdr)->msg_controllen < sizeof(struct cmsghdr) ? \ + (struct cmsghdr*) 0 : \ + (struct cmsghdr*) (mhdr)->msg_control) +#define CMSG_NXTHDR(mhdr, cmsg) \ + ((cmsg)->cmsg_len < sizeof(struct cmsghdr) || \ + (char*) (mhdr)->msg_control + (mhdr)->msg_controllen - (char*) (cmsg) <= \ + CMSG_ALIGN((cmsg)->cmsg_len) ? 
\ + (struct cmsghdr*) 0 : \ + (struct cmsghdr*) (((char*) (cmsg)) + CMSG_ALIGN((cmsg)->cmsg_len))) struct linger { @@ -140,6 +152,7 @@ struct linger #define MSG_WAITALL (1<<7) #define MSG_DONTWAIT (1<<8) #define MSG_CMSG_CLOEXEC (1<<9) +#define MSG_CMSG_CLOFORK (1<<10) #define AF_UNSPEC 0 #define AF_INET 1 diff --git a/regress/Makefile b/regress/Makefile index c9407b2a..b0910068 100644 --- a/regress/Makefile +++ b/regress/Makefile @@ -25,6 +25,10 @@ test-pthread-once \ test-pthread-self \ test-pthread-tls \ test-signal-raise \ +test-unix-socket-fd-cycle \ +test-unix-socket-fd-leak \ +test-unix-socket-fd-pass \ +test-unix-socket-fd-trunc \ test-unix-socket-name \ test-unix-socket-shutdown \ diff --git a/regress/test-unix-socket-fd-cycle.c b/regress/test-unix-socket-fd-cycle.c new file mode 100644 index 00000000..6231065f --- /dev/null +++ b/regress/test-unix-socket-fd-cycle.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * test-unix-socket-fd-cycle.c + * Tests whether Unix socket file descriptor passing cycles are rejected. 
+ */ + +#include +#include +#include + +#include +#include +#include + +#include "test.h" + +int main(void) +{ + int a_fds[2]; + test_assert(socketpair(AF_UNIX, SOCK_STREAM, 0, a_fds) == 0); + int b_fds[2]; + test_assert(socketpair(AF_UNIX, SOCK_STREAM, 0, b_fds) == 0); + + struct msghdr mhdr; + char buf[1] = { 0 }; + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + alignas(struct cmsghdr) char cmsgdata[CMSG_SPACE(sizeof(int))]; + ssize_t amount; + struct cmsghdr* cmsg; + + // Passing a Unix socket on itself isn't permitted. + buf[0] = 'X'; + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cmsgdata; + mhdr.msg_controllen = sizeof(cmsgdata); + cmsg = CMSG_FIRSTHDR(&mhdr); + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + int* cdata = (int*) CMSG_DATA(cmsg); + *cdata = a_fds[1]; + amount = sendmsg(a_fds[1], &mhdr, 0); + test_assertx(amount < 0); + test_assert(errno == EPERM); + + // Passing a Unix socket on its other end isn't permitted. + *cdata = a_fds[0]; + amount = sendmsg(a_fds[1], &mhdr, 0); + test_assertx(amount < 0); + test_assert(errno == EPERM); + + // Passing a Unix socket (with no fds passed) on another Unix socket (which + // itself isn't being passed) is allowed. + *cdata = b_fds[1]; + amount = sendmsg(a_fds[1], &mhdr, 0); + test_assert(0 <= amount); + test_assertx(amount == 1); + + // A Unix socket (itself being passed) is not permitted to pass fds. + // b_fds[1] is already being sent on a_fds[1] (to a_fds[0]). + FILE* file; + test_assert((file = tmpfile())); + *cdata = fileno(file); + amount = sendmsg(b_fds[1], &mhdr, 0); + test_assertx(amount < 0); + test_assert(errno == EPERM); + fclose(file); + + // A Unix socket is not permitted to send a socket with fds being sent. + // b_fds[1] is already being sent on a_fds[1] (to a_fds[0]). 
+ *cdata = a_fds[1]; + amount = sendmsg(b_fds[0], &mhdr, 0); + test_assertx(amount < 0); + test_assert(errno == EPERM); + + // Receive b_fds[1] being sent on a_fds[1] to a_fds[0]. + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cmsgdata; + mhdr.msg_controllen = sizeof(cmsgdata); + amount = recvmsg(a_fds[0], &mhdr, 0); + test_assert(0 <= amount); + test_assertx(amount == 1); + test_assertx(buf[0] == 'X'); + test_assertx(!(mhdr.msg_flags & MSG_CTRUNC)); + test_assertx(!mhdr.msg_flags); + test_assertx(mhdr.msg_controllen); + cmsg = CMSG_FIRSTHDR(&mhdr); + test_assertx(cmsg); + test_assertx(cmsg->cmsg_level == SOL_SOCKET); + test_assertx(cmsg->cmsg_type == SCM_RIGHTS); + test_assertx(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); + cdata = (int*) CMSG_DATA(cmsg); + int file_fd = *cdata; + test_assertx(0 <= file_fd); + struct stat gotten_st; + test_assert(fstat(file_fd, &gotten_st) == 0); + struct stat expected_st; + test_assert(fstat(b_fds[1], &expected_st) == 0); + test_assertx(gotten_st.st_ino == expected_st.st_ino); + test_assertx(gotten_st.st_dev == expected_st.st_dev); + test_assertx(!CMSG_NXTHDR(&mhdr, cmsg)); + close(file_fd); + + return 0; +} diff --git a/regress/test-unix-socket-fd-leak.c b/regress/test-unix-socket-fd-leak.c new file mode 100644 index 00000000..8abc2e57 --- /dev/null +++ b/regress/test-unix-socket-fd-leak.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * test-unix-socket-fd-leak.c + * Tests whether leaking file descriptors over a Unix socket works. + */ + +#include +#include +#include + +#include +#include +#include + +#include "test.h" + +int main(void) +{ + int fds[2]; + test_assert(socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == 0); + + FILE* file; + test_assert((file = tmpfile())); + + struct msghdr mhdr; + char buf[1] = { 0 }; + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + alignas(struct cmsghdr) char cmsgdata[CMSG_SPACE(sizeof(int))]; + + buf[0] = 'X'; + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cmsgdata; + mhdr.msg_controllen = sizeof(cmsgdata); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&mhdr); + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + int* cdata = (int*) CMSG_DATA(cmsg); + *cdata = fileno(file); + ssize_t amount = sendmsg(fds[1], &mhdr, 0); + test_assert(0 <= amount); + test_assertx(amount == 1); + + fclose(file); + close(fds[0]); + close(fds[1]); + + return 0; +} diff --git a/regress/test-unix-socket-fd-pass.c b/regress/test-unix-socket-fd-pass.c new file mode 100644 index 00000000..5b082010 --- /dev/null +++ b/regress/test-unix-socket-fd-pass.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2017, 2021 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * test-unix-socket-fd-pass.c + * Tests whether passing a file descriptor over an Unix socket works. + */ + +#include +#include +#include + +#include +#include +#include + +#include "test.h" + +int main(void) +{ + int fds[2]; + test_assert(socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == 0); + + FILE* file; + test_assert((file = tmpfile())); + + struct stat expected_st; + test_assert(fstat(fileno(file), &expected_st) == 0); + + struct msghdr mhdr; + char buf[1] = { 0 }; + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + alignas(struct cmsghdr) char cmsgdata[CMSG_SPACE(sizeof(int))]; + + pid_t child_pid; + test_assert(0 <= (child_pid = fork())); + + if ( child_pid == 0 ) + { + close(fds[0]); + buf[0] = 'X'; + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cmsgdata; + mhdr.msg_controllen = sizeof(cmsgdata); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&mhdr); + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + int* cdata = (int*) CMSG_DATA(cmsg); + *cdata = fileno(file); + ssize_t amount = sendmsg(fds[1], &mhdr, 0); + test_assert(0 <= amount); + test_assertx(amount == 1); + _exit(0); + } + + close(fds[1]); + fclose(file); + + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cmsgdata; + mhdr.msg_controllen = sizeof(cmsgdata); + ssize_t amount = recvmsg(fds[0], &mhdr, MSG_PEEK); + 
test_assert(0 <= amount); + test_assertx(amount == 1); + test_assertx(buf[0] == 'X'); + test_assertx(!(mhdr.msg_flags & MSG_CTRUNC)); + test_assertx(!mhdr.msg_flags); + test_assertx(mhdr.msg_controllen); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&mhdr); + test_assertx(cmsg); + test_assertx(cmsg->cmsg_level == SOL_SOCKET); + test_assertx(cmsg->cmsg_type == SCM_RIGHTS); + test_assertx(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); + int* cdata = (int*) CMSG_DATA(cmsg); + int file_fd = *cdata; + test_assertx(0 <= file_fd); + struct stat gotten_st; + test_assert(fstat(file_fd, &gotten_st) == 0); + test_assertx(gotten_st.st_ino == expected_st.st_ino); + test_assertx(gotten_st.st_dev == expected_st.st_dev); + test_assertx(!CMSG_NXTHDR(&mhdr, cmsg)); + close(file_fd); + + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cmsgdata; + mhdr.msg_controllen = sizeof(cmsgdata); + amount = recvmsg(fds[0], &mhdr, 0); + test_assert(0 <= amount); + test_assertx(amount == 1); + test_assertx(buf[0] == 'X'); + test_assertx(!(mhdr.msg_flags & MSG_CTRUNC)); + test_assertx(!mhdr.msg_flags); + test_assertx(mhdr.msg_controllen); + cmsg = CMSG_FIRSTHDR(&mhdr); + test_assertx(cmsg); + test_assertx(cmsg->cmsg_level == SOL_SOCKET); + test_assertx(cmsg->cmsg_type == SCM_RIGHTS); + test_assertx(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); + cdata = (int*) CMSG_DATA(cmsg); + file_fd = *cdata; + test_assertx(0 <= file_fd); + test_assert(fstat(file_fd, &gotten_st) == 0); + test_assertx(gotten_st.st_ino == expected_st.st_ino); + test_assertx(gotten_st.st_dev == expected_st.st_dev); + test_assertx(!CMSG_NXTHDR(&mhdr, cmsg)); + close(file_fd); + + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cmsgdata; + mhdr.msg_controllen = sizeof(cmsgdata); + amount = recvmsg(fds[0], &mhdr, 0); + test_assert(0 <= amount); + test_assertx(amount == 0); + test_assertx(!mhdr.msg_flags); + test_assertx(!mhdr.msg_controllen); + + 
int code; + test_assert(waitpid(child_pid, &code, 0) == child_pid); + test_assert(WIFEXITED(code) && WEXITSTATUS(code) == 0); + + return 0; +} diff --git a/regress/test-unix-socket-fd-trunc.c b/regress/test-unix-socket-fd-trunc.c new file mode 100644 index 00000000..843f8a94 --- /dev/null +++ b/regress/test-unix-socket-fd-trunc.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2021 Jonas 'Sortie' Termansen. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * test-unix-socket-fd-trunc.c + * Tests having too little control data when passing file descriptors. 
+ */ + +#include +#include +#include + +#include +#include +#include + +#include "test.h" + +int main(void) +{ + int fds[2]; + test_assert(socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == 0); + + FILE* file1; + test_assert((file1 = tmpfile())); + FILE* file2; + test_assert((file2 = tmpfile())); + + struct stat expected_st; + test_assert(fstat(fileno(file1), &expected_st) == 0); + + struct msghdr mhdr; + char buf[1] = { 0 }; + struct iovec iov; + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + alignas(struct cmsghdr) char cmsgdata[CMSG_SPACE(sizeof(int) * 2)]; + + buf[0] = 'X'; + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cmsgdata; + mhdr.msg_controllen = sizeof(cmsgdata); + struct cmsghdr* cmsg = CMSG_FIRSTHDR(&mhdr); + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * 2); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + int* cdata = (int*) CMSG_DATA(cmsg); + cdata[0] = fileno(file1); + cdata[1] = fileno(file2); + ssize_t amount = sendmsg(fds[1], &mhdr, 0); + test_assert(0 <= amount); + test_assertx(amount == 1); + + fclose(file1); + fclose(file2); + + alignas(struct cmsghdr) + char cmsgdatasmall[CMSG_ALIGN(sizeof(struct cmsghdr)) + sizeof(int)]; + memset(&mhdr, 0, sizeof(mhdr)); + mhdr.msg_iov = &iov; + mhdr.msg_iovlen = 1; + mhdr.msg_control = cmsgdatasmall; + mhdr.msg_controllen = sizeof(cmsgdatasmall); + amount = recvmsg(fds[0], &mhdr, 0); + test_assert(0 <= amount); + test_assertx(amount == 1); + test_assertx(buf[0] == 'X'); + test_assertx(mhdr.msg_flags == MSG_CTRUNC); + test_assertx(mhdr.msg_controllen); + test_assertx(mhdr.msg_controllen == sizeof(cmsgdatasmall)); + cmsg = CMSG_FIRSTHDR(&mhdr); + test_assertx(cmsg); + test_assertx(cmsg->cmsg_level == SOL_SOCKET); + test_assertx(cmsg->cmsg_type == SCM_RIGHTS); + test_assertx(cmsg->cmsg_len == CMSG_LEN(sizeof(int))); + cdata = (int*) CMSG_DATA(cmsg); + int file_fd = *cdata; + test_assertx(0 <= file_fd); + struct stat gotten_st; + 
 test_assert(fstat(file_fd, &gotten_st) == 0);
+ test_assertx(gotten_st.st_ino == expected_st.st_ino);
+ test_assertx(gotten_st.st_dev == expected_st.st_dev);
+ test_assertx(!CMSG_NXTHDR(&mhdr, cmsg));
+ close(file_fd);
+
+ close(fds[0]);
+ close(fds[1]);
+
+ return 0;
+}
diff --git a/regress/test.h b/regress/test.h
index 5776a0d5..f0add8c3 100644
--- a/regress/test.h
+++ b/regress/test.h
@@ -21,6 +21,7 @@
 #define TEST_H
 
 #include
+#include
 #include
 #include
 #include
diff --git a/share/man/man7/following-development.7 b/share/man/man7/following-development.7
index c3a9599a..a6e46054 100644
--- a/share/man/man7/following-development.7
+++ b/share/man/man7/following-development.7
@@ -69,6 +69,14 @@ releasing Sortix x.y, foo."
to allow the maintainer to easily
.Xr grep 1
for it after a release.
.Sh CHANGES
+.Ss Implement file descriptor passing
+The
+.Dv SCM_RIGHTS
+control message has been implemented, allowing file descriptors to be passed
+over
+.Dv AF_UNIX
+sockets.
+This is a minor compatible ABI change.
.Ss Implement threading primitives that truly sleep
The
.Xr futex 2