@inproceedings{zhong25:_oasis,
address = {Seoul, South Korea},
author = {Yuhong Zhong and Daniel S. Berger and Pantea Zardoshti and
Enrique Saurez and Jacob Nelson and Dan R. K. Ports and
Antonis Psistakis and Joshua Fried and Asaf Cidon},
booktitle = {Proceedings of the 31st {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '25)},
month = oct,
organization = {{ACM}},
title = {Oasis: Pooling {PCIe} Devices Over {CXL} to Boost
Utilization},
year = {2025},
abstract = {PCIe devices, such as NICs and SSDs, are frequently
underutilized in cloud platforms. Pooling PCIe devices
would enable multiplexing these devices across hosts and
thus reduce total cost of ownership. However, PCIe switches
are expensive. We design Oasis, a system that pools PCIe
devices in software over CXL memory pools. CXL memory pools
are already being deployed to boost datacenter memory
utilization and reduce costs. Once CXL pools are in place,
they can serve as an efficient data path between hosts and
PCIe devices. Oasis provides a control plane and datapath
over CXL pools, mapping and routing PCIe device traffic
across host boundaries. PCIe devices with different
functionalities can be supported by adding an Oasis engine
for each device class. We implement an Oasis network engine
to demonstrate NIC pooling. Our evaluation shows that Oasis
improves NIC utilization by 2x and handles NIC failover
with only a 50 ms interruption.},
doi = {10.1145/3731569.3764812},
pdf = {papers/oasis-sosp25.pdf},
monthnum = {10}
}
@inproceedings{ports24:_futur_cloud_networ_system,
address = {Kyoto, Japan},
author = {Dan R. K. Ports},
booktitle = {Proceedings of the 15th Asia-Pacific Workshop on Systems
({APSYS} '24)},
month = sep,
note = {Keynote address},
organization = {{ACM}},
title = {The Future of Cloud Networking is Systems},
year = {2024},
abstract = {As cloud platforms evolve, the boundaries between
networking and systems are increasingly blurred. Cloud
networking not only asks us to solve a challenging systems
problem -- providing an efficient, reliable, and secure
virtual infrastructure -- but also offers us opportunities to
rethink our approach to classic challenges in distributed
systems. This keynote will explore the potential of
systems/networking co-design through the lens of a
cloud-scale hardware-accelerated load balancing platform.
Specifically, I will discuss how programmable networking
technology, including programmable switches and smart NICs,
enables these load balancers to achieve dramatically higher
efficiency than existing software solutions while
simultaneously offering increased flexibility for custom,
application-specific load balancing logic. Looking to the
future, I’ll describe three opportunities that this
design offers for redefining the architecture of
distributed systems with new load-balancing, migration, and
snapshotting algorithms, paving the way to a new generation
of high-performance, resilient, and scalable cloud
services.},
slidespdf = {papers/cloudnet-keynote-apsys24-slides.pdf},
monthnum = {09}
}
@inproceedings{yu24:_beaver,
address = {Santa Clara, CA, USA},
author = {Liangcheng Yu and Xiao Zhang and Haoran Zhang and John
Sonchack and Dan R. K. Ports and Vincent Liu},
booktitle = {Proceedings of the 18th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'24)},
month = jul,
organization = {{USENIX}},
title = {Beaver: Practical Partial Snapshots for Distributed Cloud
Services},
year = {2024},
abstract = {Distributed snapshots are a classic class of protocols
used for capturing a causally consistent view of states
across machines. Although effective, existing protocols
presume an isolated universe of processes to snapshot and
require instrumentation and coordination of them all. This
assumption does not match today’s cloud services—it is
not always practical to instrument all involved processes
nor realistic to assume zero interaction of the machines of
interest with the external world. To bridge this gap,
this paper presents Beaver, the first practical partial
snapshot protocol that ensures causal consistency under
external traffic interference. Beaver presents a unique
design point that tightly couples its protocol with the
underlying regularities of the data center environment. By
exploiting the placement of software load balancers in data
center networks and their associated communication pattern,
Beaver not only requires minimal changes to today’s data
center operations but also eliminates any form of blocking
to existing communication, thus incurring near-zero
overhead to user traffic. We demonstrate Beaver’s
effectiveness through extensive testbed experiments and
novel use cases.},
pdf = {papers/beaver-osdi24.pdf},
cats = {https://discuss.systems/@dan/112232483230067973},
monthnum = {07}
}
@inproceedings{choi23:_capyb,
address = {Seoul, South Korea},
author = {Inho Choi and Nimesh Wadekar and Raj Joshi and Dan R. K.
Ports and Irene Zhang and Jialin Li},
booktitle = {Proceedings of the 14th Asia-Pacific Workshop on Systems
({APSYS} '23)},
month = aug,
organization = {{ACM}},
title = {Capybara: Microsecond-Scale Live TCP Migration},
year = {2023},
abstract = {Latency-critical μs-scale data center applications are
susceptible to server load spikes. The issue is
particularly challenging for services using long-lived TCP
connections. This paper introduces Capybara, a highly
efficient and versatile live TCP migration system. Capybara
builds atop a deterministic, kernel-bypassed TCP stack
running in a library OS to realize its μs-scale TCP
migration mechanism. Using modern programmable switches,
Capybara implements migration-aware dynamic packet
forwarding and transient packet buffering, further reducing
system interference during live TCP migration. Capybara can
transparently migrate a running TCP connection in 4 μs on
average. It improves the average migration latency by
about 12 times compared to a Linux kernel-based solution.},
doi = {10.1145/3609510.3609813},
pdf = {papers/capybara-apsys23.pdf},
monthnum = {08}
}
@inproceedings{choi23:_hydra,
address = {Boston, MA, USA},
author = {Inho Choi and Ellis Michael and Yunfan Li and Dan R. K.
Ports and Jialin Li},
booktitle = {Proceedings of the 20th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'23)},
month = apr,
organization = {{USENIX}},
title = {Hydra: Serialization-Free Network Ordering for Strongly
Consistent Distributed Applications},
year = {2023},
abstract = {A large class of distributed systems, e.g., state machine
replication and fault-tolerant distributed databases, rely
on establishing a consistent order of operations on groups
of nodes in the system. Traditionally, application-level
distributed protocols such as Paxos and two-phase locking
provide the ordering guarantees. To reduce the performance
overhead imposed by these protocols, a recent line of work
proposes to move the responsibility of ensuring operation
ordering into the network – by sequencing requests
through a centralized network sequencer. This network
sequencing approach yields significant application-level
performance improvements, but routing all requests through
a single sequencer comes with several fundamental
limitations, including a sequencer scalability bottleneck,
prolonged system downtime during sequencer failover, and
worsened network-level load balancing. Our work,
Hydra, overcomes these limitations by using a distributed
set of network sequencers to provide network ordering.
Hydra leverages loosely synchronized clocks on network
sequencers to establish message ordering across them,
per-sequencer sequence numbers to detect message drops, and
periodic timestamp messages to enforce progress when some
sequencers are idle. To demonstrate the benefit of Hydra,
we co-designed a state machine replication protocol and a
distributed transactional system using the Hydra network
primitive. Compared to serialization-based network ordering
systems, Hydra shows equivalent performance improvement
over traditional approaches in both applications, but with
significantly higher scalability, shorter sequencer
failover time, and better network-level load balancing.},
pdf = {papers/hydra-nsdi23.pdf},
monthnum = {04}
}
@inproceedings{lerner23:_datab_moder_networ,
author = {Alberto Lerner and Carsten Binnig and Philippe
Cudr{\'e}-Mauroux and Rana Hussein and Matthias Jasny and
Theo Jepsen and Dan R. K. Ports and Lasse Thostrup and
Tobias Ziegler},
booktitle = {Proceedings of the 49th {I}nternational {C}onference on
{V}ery {L}arge {D}ata {B}ases ({VLDB} '23)},
key = {{VLDB '23}},
month = aug,
note = {Tutorial},
title = {Databases on Modern Networks: A Decade of Research that
now comes into Practice},
year = {2023},
abstract = {Modern cloud networks are a fundamental pillar of
data-intensive applications. They provide high-speed
transaction (packet) rates and low overhead, enabling, for
instance, truly scalable database designs. These networks,
however, are fundamentally different from conventional
ones. Arguably, the two key discerning technologies are
RDMA and programmable network devices. Today, these
technologies are no longer niche and are widely deployed
across all major cloud vendors. The
question is thus not if but how a new breed of
data-intensive applications can benefit from modern
networks, given the perceived difficulty in using and
programming them. This tutorial addresses these challenges
by exposing how the underlying principles changed as the
network evolved and by presenting the new system design
opportunities they opened. In the process, we also discuss
several hard-earned lessons accumulated by making the
transition first-hand.},
doi = {10.14778/3611540.3611579},
pdf = {papers/dbtutorial-vldb23.pdf},
monthnum = {08}
}
@inproceedings{liu23:_slimem,
address = {Hong Kong, China},
author = {Ziyuan Liu and Zhixiong Niu and Ran Shu and Liang Gao and
Guohong Lai and Na Wang and Zongying He and Jacob Nelson
and Dan R. K. Ports and Lihua Yuan and Peng Cheng and
Yongqiang Xiong},
booktitle = {Proceedings of the 7th Asia-Pacific Workshop on Networking
({APNet '23})},
month = jul,
organization = {{ACM}},
title = {SlimeMold: Hardware Load Balancer at Scale in Datacenter},
year = {2023},
abstract = {Stateful load balancers (LB) are essential services in
cloud data centers, playing a crucial role in enhancing the
availability and capacity of applications. Numerous studies
have proposed methods to improve the throughput,
connections per second, and concurrent flows of single LBs.
For instance, with the advancement of programmable
switches, hardware-based load balancers (HLB) have become
mainstream due to their high efficiency. However,
programmable switches still face the issue of limited
registers and table entries, preventing them from fully
meeting the performance requirements of data centers. In
this paper, rather than solely focusing on enhancing
individual HLBs, we introduce SlimeMold, which enables HLBs
to work collaboratively at scale as an integrated LB system
in data centers. First, we design a novel HLB building
block capable of achieving load balancing and exchanging
states with other building blocks in the data plane. Next,
we decouple forwarding and state operations, organizing the
states using our proposed 2-level mapping mechanism.
Finally, we optimize the system with flow caching and table
entry balancing. We implement a real HLB building block
using the Broadcom 56788 SmartToR chip, which attains line
rate for state read and >1M OPS for flow write operations.
Our simulation demonstrates full scalability in large-scale
experiments, supporting 454 million concurrent flows with
512 state-hosting building blocks.},
doi = {10.1145/3600061.3600067},
pdf = {papers/slimemold-apnet23.pdf},
slidespdf = {papers/slimemold-apnet23-slides.pdf},
monthnum = {07}
}
@inproceedings{yuan23:_rambd,
address = {Montreal, QC, Canada},
author = {Yifan Yuan and Jinghan Huang and Yan Sun and Tianchen Wang
and Jacob Nelson and Dan Ports and Yipeng Wang and Ren Wang
and Charlie Tai and Nam Sung Kim},
booktitle = {Proceedings of the 29th International Symposium on High
Performance Computer Architecture ({HPCA} '23)},
month = feb,
organization = {{IEEE}},
title = {{RAMBDA}: {RDMA}-driven Acceleration Framework for
Memory-intensive us-scale Datacenter Applications},
year = {2023},
abstract = {Responding to the "datacenter tax" and "killer
microseconds" problems for memory-intensive datacenter
applications, diverse solutions, including Smart NIC-based
ones, have been proposed. Nonetheless, they often suffer
from the high overhead of communication over network and/or
PCIe links. To tackle the limitations of current
solutions, this paper proposes RAMBDA, an RDMA-driven
acceleration framework for boosting the performance of
memory-intensive us-scale datacenter applications. RAMBDA
is a holistic network and architecture co-design solution
that leverages current RDMA and emerging cache-coherent
off-chip interconnect technologies
and consists of the following four hardware and software
components: (1) unified abstraction of inter- and
intra-machine communications synergistically managed by
one-sided RDMA write and cache-coherent memory write; (2)
efficient notification of requests to accelerators assisted
by cache coherence; (3) cache-coherent accelerator
architecture directly interacting with NIC; and (4)
adaptive device-to-host data transfer for modern server
memory systems comprising both DRAM and NVM exploiting
state-of-the-art features in CPUs and PCIe. We prototype
RAMBDA with a commercial system and evaluate three popular
datacenter applications: (1) in-memory key-value store, (2)
chain replication-based distributed transaction system, and
(3) deep learning recommendation model inference. The
evaluation shows that RAMBDA provides 30.1∼69.1\% lower
latency, up to 2.5x higher throughput, and ~3x higher
energy efficiency than the current state-of-the-art
solutions.},
doi = {10.1109/HPCA56546.2023.10071127},
pdf = {papers/rambda-hpca23.pdf},
monthnum = {02}
}
@inproceedings{zeno23:_swish,
address = {Haifa, Israel},
author = {Lior Zeno and Dan R. K. Ports and Jacob Nelson and
Daehyeok Kim and Shir Landau Feibish and Idit Keidar and
Arik Rinberg and Alon Rashelbach and Igor De-Paula and Mark
Silberstein},
booktitle = {Proceedings of the 16th {ACM} International System and
Storage Conference ({SYSTOR} '23)},
month = jul,
note = {Highlights Session},
organization = {{ACM}},
title = {{SwiSh}: Distributed Shared State Abstractions for
Programmable Switches},
year = {2023},
abstract = {We design and evaluate SwiSh, a distributed
shared state management layer for data-plane P4
programs. SwiSh enables running scalable stateful
distributed network functions on programmable switches
entirely in the data-plane. We explore several schemes to
build a shared variable abstraction, which differ in
consistency, performance, and in-switch implementation
complexity. We introduce the novel Strong
Delayed-Writes (SDW) protocol which offers consistent
snapshots of shared data-plane objects with semantics known
as strong r-relaxed linearizability, enabling
implementation of distributed concurrent sketches with
precise error bounds.
We implement strong, eventual,
and SDW consistency protocols in Tofino switches, and
compare their performance in microbenchmarks and three
realistic network functions, NAT, DDoS detector, and rate
limiter. Our results demonstrate that general
distributed state management in the data plane is
practical, and outperforms any centralized solution by up
to four orders of magnitude in update throughput and
replication latency.},
monthnum = {07}
}
@inproceedings{liu22:_disag_data_collec_approac_loss_toler_applic,
address = {Fuzhou, China},
author = {Ziyuan Liu and Zhixiong Niu and Ran Shu and Wenxue Cheng
and Peng Cheng and Yongqiang Xiong and Lihua Yuan and Jacob
Nelson and Dan R. K. Ports},
booktitle = {Proceedings of the 6th Asia-Pacific Workshop on Networking
({APNet '22})},
month = jul,
organization = {{ACM}},
title = {A Disaggregate Data Collecting Approach for Loss-Tolerant
Applications},
year = {2022},
abstract = {Datacenters generate operational data at an extremely high
rate, and data center operators collect and analyze it
for problem diagnosis, resource utilization improvement,
and performance optimization. However, existing data
collection methods fail to efficiently aggregate and store
data at extremely high speed and scale. In this paper, we
explore a new approach that leverages programmable switches
to aggregate data and directly write data to the
destination storage. Our proposed data collection system,
ALT, uses programmable switches to control NVMe SSDs on
remote hosts without the involvement of a remote CPU. To
tolerate loss, ALT uses an elegant data structure to enable
efficient data recovery when retrieving the collected data.
We implement a prototype of our system on a Tofino-based
programmable switch. Our evaluation shows that ALT can
saturate an SSD’s peak performance without any CPU
involvement.},
doi = {10.1145/3542637.3542646},
pdf = {papers/alt-apnet22.pdf},
monthnum = {07}
}
@inproceedings{yuan22:_unloc_power_inlin_float_point,
address = {Renton, WA, USA},
author = {Yifan Yuan and Omar Alama and Jiawei Fei and Jacob Nelson
and Dan R. K. Ports and Amedeo Sapio and Marco Canini and
Nam Sung Kim},
booktitle = {Proceedings of the 19th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'22)},
month = apr,
organization = {{USENIX}},
title = {Unlocking the Power of Inline Floating-Point Operations on
Programmable Switches},
year = {2022},
abstract = {The advent of switches with programmable dataplanes has
enabled the rapid development of new network functionality,
as well as providing a platform for acceleration of a broad
range of application-level functionality. However, existing
switch hardware was not designed with application
acceleration in mind, and thus applications requiring
operations or datatypes not used in traditional network
protocols must resort to expensive workarounds.
Applications involving floating point data, including
distributed training for machine learning and distributed
query processing, are key examples. In this paper, we
propose FPISA, a floating point representation designed to
work efficiently in programmable switches. We first
implement FPISA on an Intel Tofino switch, but find that it
has limitations that impact throughput and accuracy. We
then propose hardware changes to address these limitations
based on the open-source Banzai switch architecture, and
synthesize them in a 15-nm standard-cell library to
demonstrate their feasibility. Finally, we use FPISA to
implement accelerators for training for machine learning as
an example application, and evaluate its performance on a
switch implementing our changes using emulation. We find
that FPISA allows distributed training to use one to three
fewer CPU cores and provide up to 85.9\% better throughput
than SwitchML in a CPU-constrained environment.},
pdf = {papers/fpisa-nsdi22.pdf},
slidespdf = {papers/fpisa-nsdi22-slides.pdf},
video = {https://www.usenix.org/conference/nsdi22/presentation/yuan},
monthnum = {04}
}
@inproceedings{zeno22:_swish,
address = {Renton, WA, USA},
author = {Lior Zeno and Dan R. K. Ports and Jacob Nelson and
Daehyeok Kim and Shir Landau Feibish and Idit Keidar and
Arik Rinberg and Alon Rashelbach and Igor De-Paula and Mark
Silberstein},
booktitle = {Proceedings of the 19th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'22)},
month = apr,
organization = {{USENIX}},
title = {{SwiSh}: Distributed Shared State Abstractions for
Programmable Switches},
year = {2022},
abstract = {We design and evaluate SwiSh, a distributed
shared state management layer for data-plane P4
programs. SwiSh enables running scalable stateful
distributed network functions on programmable switches
entirely in the data-plane. We explore several schemes to
build a shared variable abstraction, which differ in
consistency, performance, and in-switch implementation
complexity. We introduce the novel Strong
Delayed-Writes (SDW) protocol which offers consistent
snapshots of shared data-plane objects with semantics known
as strong r-relaxed linearizability, enabling
implementation of distributed concurrent sketches with
precise error bounds.
We implement strong, eventual,
and SDW consistency protocols in Tofino switches, and
compare their performance in microbenchmarks and three
realistic network functions, NAT, DDoS detector, and rate
limiter. Our results demonstrate that general
distributed state management in the data plane is
practical, and outperforms any centralized solution by up
to four orders of magnitude in update throughput and
replication latency.},
pdf = {papers/swish-nsdi22.pdf},
slidespdf = {papers/swish-nsdi22-slides.pdf},
video = {https://www.usenix.org/conference/nsdi22/presentation/zeno},
monthnum = {04}
}
@inproceedings{zhu22:_netvr,
address = {Renton, WA, USA},
author = {Hang Zhu and Tao Wang and Yi Hong and Dan R. K. Ports and
Anirudh Sivaraman and Xin Jin},
booktitle = {Proceedings of the 19th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'22)},
month = apr,
organization = {{USENIX}},
title = {{NetVRM}: Virtual Register Memory for Programmable
Networks},
year = {2022},
abstract = {Programmable networks are enabling a new class of
applications that leverage the line-rate processing
capability and on-chip register memory of the switch data
plane. Yet the status quo is focused on developing
approaches that share the register memory statically. We
present NetVRM, a network management system that supports
dynamic register memory sharing between multiple concurrent
applications on a programmable network and is readily
deployable on commodity programmable switches. NetVRM
provides a virtual register memory abstraction that enables
applications to share the register memory in the data
plane, and abstracts away the underlying details. In
principle, NetVRM supports any memory allocation algorithm
given the virtual register memory abstraction. It also
provides a default memory allocation algorithm that
exploits the observation that applications have diminishing
returns on additional memory. NetVRM provides an extension
of P4, P4VRM, for developing applications with virtual
register memory, and a compiler to generate data plane
programs and control plane APIs. Testbed experiments show
that NetVRM generalizes to a diverse variety of
applications, and that its utility-based dynamic allocation
policy outperforms static resource allocation.
Specifically, it improves the mean satisfaction ratio
(i.e., the fraction of a network application’s lifetime
that it meets its utility target) by 1.6-2.2x under a range
of workloads.},
pdf = {papers/netvrm-nsdi22.pdf},
slidespdf = {papers/netvrm-nsdi22-slides.pdf},
video = {https://www.usenix.org/conference/nsdi22/presentation/zhu},
monthnum = {04}
}
@inproceedings{burke21:_prism,
address = {Virtual Conference},
author = {Matthew Burke and Sowmya Dharanipragada and Shannon Joyner
and Adriana Szekeres and Jacob Nelson and Irene Zhang and
Dan R. K. Ports},
booktitle = {Proceedings of the 28th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '21)},
month = oct,
organization = {{ACM}},
title = {{PRISM}: Rethinking the {RDMA} Interface for Distributed
Systems},
year = {2021},
abstract = {Remote Direct Memory Access (RDMA) has been used to
accelerate a variety of distributed systems, by providing
low-latency, CPU-bypassing access to a remote host's
memory. However, most of the distributed protocols used in
these systems cannot easily be expressed in terms of the
simple memory READs and WRITEs provided by RDMA. As a
result, designers face a choice between introducing
additional protocol complexity (e.g., additional round
trips) or forgoing the benefits of RDMA entirely. This
paper argues that an extension to the RDMA interface can
resolve this dilemma. We introduce the PRISM interface,
which extends the RDMA interface with four new primitives:
indirection, allocation, enhanced compare-and-swap, and
operation chaining. These increase the expressivity of the
RDMA interface, while still being implementable using the
same underlying hardware features. We show their utility by
designing three new applications using PRISM primitives
that require little to no server-side CPU involvement: (1)
PRISM-KV, a key-value store; (2) PRISM-RS, a replicated
block store; and (3) PRISM-TX, a distributed transaction
protocol. Using a software-based implementation of the
PRISM primitives, we show that these systems outperform
prior RDMA-based equivalents.},
doi = {10.1145/3477132.3483587},
pdf = {papers/prism-sosp21.pdf},
video = {https://youtu.be/yZDw9uBMHxI},
monthnum = {10}
}
@inproceedings{kim21:_redpl,
address = {Virtual Conference},
author = {Daehyeok Kim and Jacob Nelson and Dan R. K. Ports and Vyas
Sekar and Srinivasan Seshan},
booktitle = {Proceedings of {ACM} SIGCOMM 2021},
month = aug,
organization = {{ACM}},
title = {{RedPlane}: Enabling Fault Tolerant Stateful In-Switch
Applications},
year = {2021},
abstract = {Many recent efforts have demonstrated the performance
benefits of running datacenter functions (e.g., NATs, load
balancers, monitoring) on programmable switches. However, a
key missing piece remains: fault tolerance. This is
especially critical as the network is no longer stateless
and pure endpoint recovery does not suffice. In this paper,
we design and implement RedPlane, a fault-tolerant state
store for stateful in-switch applications. RedPlane provides
in-switch applications with consistent access to their state,
even if the switch they run on fails or traffic is rerouted
to an alternative switch. We address key challenges in
devising a practical, provably correct replication protocol
and implementing it in the switch data plane. Our
evaluations show that RedPlane incurs negligible overhead
and enables end-to-end applications to rapidly recover from
switch failures.},
doi = {10.1145/3452296.3472905},
pdf = {papers/redplane-sigcomm21.pdf},
code = {https://github.com/daehyeok-kim/redplane-public},
video = {papers/redplane-sigcomm21-video.mp4},
monthnum = {08}
}
@inproceedings{sapio21:_scalin_distr_machin_learn_in_networ_aggreg,
address = {Boston, MA, USA},
author = {Amedeo Sapio and Marco Canini and Chen-Yu Ho and Jacob
Nelson and Panos Kalnis and Changhoon Kim and Arvind
Krishnamurthy and Masoud Moshref and Dan R. K. Ports and
Peter Richtarik},
booktitle = {Proceedings of the 18th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'21)},
month = apr,
organization = {{USENIX}},
title = {Scaling Distributed Machine Learning with In-Network
Aggregation},
year = {2021},
abstract = {Training machine learning models in parallel is an
increasingly important workload. We accelerate distributed
parallel training by designing a communication primitive
that uses a programmable switch dataplane to execute a key
step of the training process. Our approach, SwitchML,
reduces the volume of exchanged data by aggregating the
model updates from multiple workers in the network. We
co-design the switch processing with the end-host protocols
and ML frameworks to provide an efficient solution that
speeds up training by up to 5.5x for a number of real-world
benchmark models.},
pdf = {papers/switchml-nsdi21.pdf},
slidespdf = {papers/switchml-nsdi21-slides.pdf},
code = {https://github.com/p4lang/p4app-switchML},
video = {https://www.youtube.com/watch?v=FIZsXfeZrvE},
cats = {https://discuss.systems/@dan/109373463921944039},
monthnum = {04}
}
@inproceedings{li20:_effic_portab_virtual_nvme_storag_arm_socs,
address = {Lausanne, Switzerland},
author = {Huaicheng Li and Mingzhe Hao and Stanko Novakovic and
Vaibhav Gogte and Sriram Govindan and Dan R. K. Ports and
Irene Zhang and Ricardo Bianchini and Haryadi S. Gunawi and
Anirudh Badam},
booktitle = {Proceedings of the 25th {I}nternational {C}onference on
{A}rchitectural {S}upport for {P}rogramming {L}anguages and
{O}perating {S}ystems ({ASPLOS '20})},
month = apr,
organization = {{ACM}},
title = {{LeapIO}: Efficient and Portable Virtual {NVMe} Storage on
{ARM SoCs}},
year = {2020},
abstract = {Today's cloud storage stack is extremely resource hungry,
burning 10-20\% of datacenter x86 cores, a major "storage
tax" that cloud providers must pay. Yet, the complex cloud
storage stack is not completely offload-ready to today's IO
accelerators. We present LeapIO, a new cloud storage stack
that leverages ARM-based co-processors to offload complex
storage services. LeapIO addresses many deployment
challenges, such as hardware fungibility, software
portability, virtualizability, composability, and
efficiency. It uses a set of OS/software techniques and new
hardware properties that provide a uniform address space
across the x86 and ARM cores and expose virtual NVMe
storage to unmodified guest VMs, at a performance that is
competitive with bare-metal servers.},
doi = {10.1145/3373376.3378531},
pdf = {papers/leapio-asplos20.pdf},
monthnum = {04}
}
@inproceedings{li20:_pegas,
address = {Banff, AL, Canada},
author = {Jialin Li and Jacob Nelson and Ellis Michael and Xin Jin
and Dan R. K. Ports},
booktitle = {Proceedings of the 14th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'20)},
month = nov,
organization = {{USENIX}},
title = {Pegasus: Tolerating Skewed Workloads in Distributed
Storage with In-Network Coherence Directories},
year = {2020},
abstract = {High performance distributed storage systems face the
challenge of load imbalance caused by skewed and dynamic
workloads. This paper introduces Pegasus, a new storage
system that leverages new-generation programmable switch
ASICs to balance load across storage servers. Pegasus uses
selective replication of the most popular objects in the
data store to distribute load. Using a novel in-network
coherence directory, the Pegasus switch tracks and manages
the location of replicated objects. This allows it to
achieve load-aware forwarding and dynamic rebalancing for
replicated keys, while still guaranteeing data coherence
and consistency. The Pegasus design is practical to
implement as it stores only forwarding metadata in the
switch data plane. The resulting system improves the
throughput of a distributed in-memory key-value store by
more than 10x under a latency SLO -- results which hold
across a large set of workloads with varying degrees of
skew, read/write ratio, object sizes, and dynamism.},
pdf = {papers/pegasus-osdi20.pdf},
slidespdf = {papers/pegasus-osdi20-slides.pdf},
code = {https://github.com/NUS-Systems-Lab/pegasus},
video = {https://www.youtube.com/watch?v=55Aki75fAr4},
cats = {https://discuss.systems/@dan/109355268209462963},
monthnum = {11}
}
@inproceedings{szekeres20:_meerk,
address = {Heraklion, Crete, Greece},
author = {Adriana Szekeres and Michael Whittaker and Naveen Kr.
Sharma and Jialin Li and Arvind Krishnamurthy and Irene
Zhang and Dan R. K. Ports},
booktitle = {Proceedings of the 15th {ACM} {SIGOPS} {E}uro{S}ys
({EuroSys '20})},
month = apr,
organization = {{ACM}},
title = {Meerkat: Scalable Replicated Transactions Following the
Zero-Coordination Principle},
year = {2020},
abstract = {Traditionally, the high cost of network communication
between servers has hidden the impact of cross-core
coordination in replicated systems. However, new
technologies, like kernel-bypass networking and faster
network links, have exposed hidden bottlenecks in
distributed systems. This paper explores how to build
multicore-scalable, replicated storage systems. We
introduce a new guideline for their design, called the
Zero-Coordination Principle. We use this principle to
design a new multicore-scalable, in-memory, replicated,
key-value store, called Meerkat.
Unlike existing
systems, Meerkat eliminates all cross-core and
cross-replica coordination, both of which pose a
scalability bottleneck. Our experiments found that Meerkat
is able to scale up to 80 hyper-threads and execute 8.3
million transactions per second. Meerkat represents an
improvement of 12x over state-of-the-art, fault-tolerant,
in-memory, transactional storage systems built using
leader-based replication and a shared transaction log.},
doi = {10.1145/3342195.3387529},
pdf = {papers/meerkat-eurosys20.pdf},
monthnum = {04}
}
@inproceedings{wang20:_multit,
address = {Boston, MA, USA},
author = {Tao Wang and Hang Zhu and Fabian Ruffy and Xin Jin and
Anirudh Sivaraman and Dan R. K. Ports and Aurojit Panda},
booktitle = {Proceedings of the 12th Workshop on Hot Topics in Cloud
Computing ({HotCloud} '20)},
month = jul,
organization = {{USENIX}},
title = {Multitenancy for fast and programmable networks in the
cloud},
year = {2020},
abstract = {Fast and programmable network devices are now readily
available, both in the form of programmable switches and
smart network-interface cards. Going forward, we envision
that these devices will be widely deployed in the networks
of cloud providers (e.g., AWS, Azure, and GCP) and exposed
as a programmable surface for cloud customers—similar to
how cloud customers can today rent CPUs, GPUs, FPGAs, and
ML accelerators. Making this vision a reality requires us
to develop a mechanism to share the resources of a
programmable network device across multiple cloud tenants.
In other words, we need to provide multitenancy on these
devices. In this position paper, we design compile-time and
run-time approaches to multitenancy. We present preliminary
results showing that our design provides both efficient
utilization of the resources of a programmable network
device and isolation of tenant programs from each other.},
pdf = {papers/multitenancy-hotcloud20.pdf},
monthnum = {07}
}
@inproceedings{zeno20:_swish,
address = {Chicago, IL, USA},
author = {Lior Zeno and Dan R. K. Ports and Jacob Nelson and Mark
Silberstein},
booktitle = {Proceedings of the 19th Workshop on Hot Topics in Networks
({HotNets} '20)},
month = nov,
organization = {{ACM}},
title = {{SwiShmem}: Distributed Shared State Abstractions for
Programmable Switches},
year = {2020},
abstract = {Programmable switches provide an appealing platform for
running network functions (NFs), such as NATs, firewalls
and DDoS detectors, entirely in the data plane, at staggering
multi-Tbps processing rates. However, to be used in real
deployments with a complex multi-switch topology, one NF
instance must be deployed on each switch, which together
act as a single logical NF. This requirement poses
significant challenges in particular for stateful NFs, due
to the need to manage distributed shared NF
state among the switches. While considered a solved
problem in classical distributed systems, data-plane state
sharing requires addressing several unique challenges: high
data rate, limited switch memory, and packet loss. We
present the design of SwiShmem, the first distributed
shared state management layer for data-plane P4
programs, which facilitates the implementation of stateful
distributed NFs on programmable switches. We first analyze
the access patterns and consistency requirements of popular
NFs that lend themselves to in-switch execution, and then
discuss the design and implementation options while
highlighting open research questions.},
doi = {10.1145/3422604.3425946},
pdf = {papers/swishmem-hotnets20.pdf},
monthnum = {11}
}
@inproceedings{ports19:_when_shoul_networ_be_comput,
address = {Bertinoro, Italy},
author = {Dan R. K. Ports and Jacob Nelson},
booktitle = {Proceedings of the 17th Workshop on Hot Topics in
Operating Systems ({HotOS} '19)},
month = may,
organization = {{ACM}},
title = {When Should The Network Be The Computer?},
year = {2019},
abstract = {Researchers have repurposed programmable network devices
to place small amounts of application computation in the
network, sometimes yielding orders-of-magnitude performance
gains. At the same time, effectively using these devices
requires careful use of limited resources and managing
deployment challenges. This paper provides a framework
for principled use of in-network processing. We provide a
set of guidelines for building robust and deployable
in-network primitives, along with a taxonomy to help identify
which applications can benefit from in-network processing
and what types of devices they should use.},
doi = {10.1145/3317550.3321439},
pdf = {papers/innetwork-hotos19.pdf},
slidespdf = {papers/innetwork-hotos19-slides.pdf},
monthnum = {05}
}
@inproceedings{michael18:_towar_causal_datac_networ,
address = {Porto, Portugal},
author = {Ellis Michael and Dan R. K. Ports},
booktitle = {Proceedings of the 2018 Workshop on Principles and
Practice of Consistency for Distributed Data ({PaPoC}
'18)},
month = apr,
organization = {{ACM}},
title = {Towards Causal Datacenter Networks},
year = {2018},
abstract = {Traditionally, distributed systems conservatively assume
an asynchronous network. However, recent work on the
co-design of networks and distributed systems has shown
that stronger ordering properties are achievable in
datacenter networks and yield performance improvements for
the distributed systems they support. We build on that
trend and ask whether it is possible for the datacenter
network to order all messages in a protocol-agnostic way.
This approach, which we call omnisequencing, would ensure
causal delivery of all messages, making consistency a
network-level guarantee.},
doi = {10.1145/3194261.3194269},
pdf = {papers/causal-papoc18.pdf},
monthnum = {04}
}
@inproceedings{gudmundsdottir17:_demon_inter_analy_perfor_measur_viska,
address = {Chicago, IL, USA},
author = {Helga Gudmundsdottir and Babak Salimi and Magdalena
Balazinska and Dan R. K. Ports and Dan Suciu},
booktitle = {Proceedings of the 2017 {ACM} {SIGMOD} {I}nternational
{C}onference on {M}anagement of {D}ata},
month = may,
note = {Demonstration},
organization = {{ACM}},
title = {A Demonstration of Interactive Analysis of Performance
Measurements with {Viska}},
year = {2017},
abstract = {The ultimate goal of system performance analysis is to
identify the underlying causes of performance
differences between different systems and different
workloads. We make it easier to achieve this goal with
Viska, a new tool for generating and interpreting
performance measurement results. Viska leverages
cutting-edge techniques from big data analytics and data
visualization to aid and automate this analysis, helping
users derive meaningful and statistically sound conclusions
using state-of-the-art causal inference and hypothesis
testing techniques.},
doi = {10.1145/3035918.3056448},
pdf = {papers/viska-sigmod17demo.pdf},
monthnum = {05}
}
@inproceedings{li17:_eris,
address = {Shanghai, China},
author = {Jialin Li and Ellis Michael and Dan R. K. Ports},
booktitle = {Proceedings of the 26th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '17)},
month = oct,
organization = {{ACM}},
title = {{Eris}: Coordination-Free Consistent Transactions using
Network Multi-Sequencing},
year = {2017},
abstract = {Distributed storage systems aim to provide strong
consistency and isolation guarantees on an architecture
that is partitioned across multiple shards for scalability
and replicated for fault-tolerance. Traditionally,
achieving all of these goals has required an expensive
combination of atomic commitment and replication protocols
-- introducing extensive coordination overhead. Our system,
Eris, takes a very different approach. It moves a core
piece of concurrency control functionality, which we term
multi-sequencing, into the datacenter network itself. This
network primitive takes on the responsibility for
consistently ordering transactions, and a new lightweight
transaction protocol ensures atomicity. The end result is
that Eris avoids both replication and transaction
coordination overhead: we show that it can process a large
class of distributed transactions in a single round-trip
from the client to the storage system without any explicit
coordination between shards or replicas. It provides
atomicity, consistency, and fault-tolerance with less than
10\% overhead -- achieving throughput 4.5--35x higher and
latency 72--80\% lower than a conventional design on
standard benchmarks.},
doi = {10.1145/3132747.3132751},
pdf = {papers/eris-sosp17.pdf},
monthnum = {10}
}
@inproceedings{michael17:_recov_shared_objec_without_stabl_storag,
address = {Vienna, Austria},
author = {Ellis Michael and Dan R. K. Ports and Naveen Kr. Sharma
and Adriana Szekeres},
booktitle = {Proceedings of the 31st International Symposium on
Distributed Computing ({DISC} '17)},
key = {DISC '17},
month = oct,
title = {Recovering Shared Objects Without Stable Storage},
year = {2017},
abstract = {This paper considers the problem of building
fault-tolerant shared objects when processes can crash and
recover but lose their persistent state on recovery. This
Diskless Crash-Recovery (DCR) model matches the way many
long-lived systems are built. We show that it presents new
challenges, as operations that are recorded at a quorum may
not persist after some of the processes in that quorum
crash and then recover. To address this problem, we
introduce the notion of crash-consistent quorums,
where no recoveries happen during the quorum responses. We
show that relying on crash-consistent quorums enables a
recovery procedure that can recover all operations that
successfully finished. Crash-consistent quorums can be
easily identified using a mechanism we term the crash
vector, which tracks the causal relationship between
crashes, recoveries, and other operations.
We apply
crash-consistent quorums and crash vectors to build two
storage primitives. We give a new algorithm for
multi-reader multi-writer atomic registers in the DCR model
that guarantees safety under all conditions and termination
under a natural condition. It improves on the best prior
protocol for this problem by requiring fewer rounds, fewer
nodes to participate in the quorum, and a less restrictive
liveness condition. We also present a more efficient
single-reader, single-writer atomic set---a virtual
stable storage abstraction. It can be used to lift any
existing algorithm from the traditional Crash-Recovery with
Stable Storage model to the DCR model. We examine a
specific application, state machine replication, and show
that existing diskless protocols can violate their
correctness guarantees, while ours offers a general and
correct solution.},
pdf = {papers/recovery-disc17.pdf},
monthnum = {10}
}
@inproceedings{salimi17:_zaliq,
author = {Babak Salimi and Corey Cole and Dan R. K. Ports and Dan
Suciu},
booktitle = {Proceedings of the 43rd {I}nternational {C}onference on
{V}ery {L}arge {D}ata {B}ases ({VLDB} '17)},
key = {{VLDB '17}},
month = aug,
note = {Demonstration},
title = {ZaliQL: Causal Inference from Observational Data at
Scale},
year = {2017},
abstract = {Causal inference from observational data is a subject of
active research and development in statistics and computer
science. Many statistical software packages have been
developed for this purpose. However, these toolkits do not
scale to large datasets. We propose and demonstrate ZaliQL:
a SQL-based framework for drawing causal inference from
observational data. ZaliQL supports the state-of-the-art
methods for causal inference and runs at scale within the
PostgreSQL database system. In addition, we built a visual
interface to wrap around ZaliQL. In our demonstration, we
will use this GUI to show a live investigation of the
causal effect of different weather conditions on flight
delays.},
doi = {10.14778/3137765.3137818},
pdf = {papers/zaliql-vldb17demo.pdf},
monthnum = {08}
}
@inproceedings{gudmondsdottir16:_viska,
address = {Savannah, GA, USA},
author = {Helga Gudmundsdottir and Babak Salimi and Magdalena
Balazinska and Dan R. K. Ports and Dan Suciu},
booktitle = {Proceedings of the 12th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'16)},
month = nov,
note = {Poster},
organization = {{USENIX}},
title = {Viska: Enabling Interactive Analysis of Performance
Measurements},
year = {2016},
abstract = {Much of systems research consists of performance analysis
-- to learn when one system outperforms another, to
identify architectural choices responsible for the
difference, or to identify performance anomalies in
particular workloads, for example. However, despite recent
advances in data analytics and interactive data
visualization, the tools we use for performance analysis
remain remarkably primitive. The Viska project aims to
close this gap by providing a new toolkit for systems
researchers to generate and interpret performance
measurement results, helping users derive meaningful and
statistically sound conclusions. Viska leverages
cutting-edge techniques from big data analytics and data
visualization to aid and automate this analysis.},
monthnum = {11}
}
@inproceedings{holt16:_discip_incon_consis_types,
address = {Santa Clara, CA, USA},
author = {Brandon Holt and James Bornholt and Irene Zhang and Dan R.
K. Ports and Mark Oskin and Luis Ceze},
booktitle = {Proceedings of the 7th Symposium on Cloud Computing
({SOCC} '16)},
month = oct,
organization = {{ACM}},
title = {Disciplined Inconsistency with Consistency Types},
year = {2016},
abstract = {Distributed applications and web services, such as online
stores or social networks, are expected to be scalable,
available, responsive, and fault-tolerant. To meet these
steep requirements in the face of high round-trip
latencies, network partitions, server failures, and load
spikes, applications use eventually consistent datastores
that allow them to weaken the consistency of some data.
However, making this transition is highly error-prone
because relaxed consistency models are notoriously
difficult to understand and test. In this work, we
propose a new programming model for distributed data that
makes consistency properties explicit and uses a type
system to enforce consistency safety. With the
Inconsistent, Performance-bound, Approximate (IPA)
storage system, programmers specify performance targets and
correctness requirements as constraints on persistent data
structures and handle uncertainty about the result of
datastore reads using new consistency types. We
implement a prototype of this model in Scala on top of an
existing datastore, Cassandra, and use it to make
performance/correctness tradeoffs in two applications: a
ticket sales service and a Twitter clone. Our evaluation
shows that IPA prevents consistency-based programming
errors and adapts consistency automatically in response to
changing network conditions, performing comparably to weak
consistency and 2-10x faster than strong consistency.},
doi = {10.1145/2987550.2987559},
pdf = {papers/ipa-socc16.pdf},
monthnum = {10}
}
@inproceedings{li16:_fast_replic_nopax,
address = {Savannah, GA, USA},
author = {Jialin Li and Ellis Michael and Adriana Szekeres and
Naveen Kr. Sharma and Dan R. K. Ports},
booktitle = {Proceedings of the 12th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'16)},
month = nov,
organization = {{USENIX}},
title = {Just Say {NO} to {Paxos} Overhead: Replacing Consensus
with Network Ordering},
year = {2016},
abstract = {Distributed applications use replication, implemented by
protocols like Paxos, to ensure data availability and
transparently mask server failures. This paper presents a
new approach to achieving replication in the data center
without the performance cost of traditional methods. Our
work carefully divides replication responsibility between
the network and protocol layers. The network orders
requests but does not ensure reliable delivery -- using a
new primitive we call ordered unreliable multicast (OUM).
This primitive can be implemented at
near-zero cost in the data center. Our new replication
protocol, Network-Ordered Paxos (NOPaxos), exploits network
ordering to provide strongly consistent replication without
coordination. The resulting system not only outperforms
both latency- and throughput-optimized protocols on their
respective metrics, but also yields throughput within 2\%
and latency within 16 us of an unreplicated system --
providing replication without the performance cost.},
pdf = {papers/nopaxos-osdi16.pdf},
code = {https://github.com/uwsyslab/nopaxos/},
monthnum = {11}
}
@inproceedings{holt15:_claret,
address = {Bordeaux, France},
author = {Brandon Holt and Irene Zhang and Dan R. K. Ports and Mark
Oskin and Luis Ceze},
booktitle = {Proceedings of the 2015 Workshop on Principles and
Practice of Consistency for Distributed Data ({PaPoC}
'15)},
month = apr,
organization = {{ACM}},
title = {Claret: Using Data Types for Highly Concurrent Distributed
Transactions},
year = {2015},
abstract = {Out of the many NoSQL databases in use today, some that
provide simple data structures for records, such as Redis
and MongoDB, are now becoming popular. Building
applications out of these complex data types provides a way
to communicate intent to the database system without
sacrificing flexibility or committing to a fixed schema.
Currently this capability is leveraged in limited ways,
such as to ensure related values are co-located, or for
atomic updates. There are many ways data types can be used
to make databases more efficient that are not yet being
exploited. We explore several ways of leveraging
abstract data type (ADT) semantics in databases, focusing
primarily on commutativity. Using a Twitter clone as a case
study, we show that using commutativity can reduce
transaction abort rates for high-contention, update-heavy
workloads that arise in real social networks. We conclude
that ADTs are a good abstraction for database records,
providing a safe and expressive programming model with
ample opportunities for optimization, making databases
safer and more scalable.},
doi = {10.1145/2745947.2745951},
pdf = {papers/claret-papoc15.pdf},
monthnum = {04}
}
@inproceedings{holt15:_claret_poster,
address = {Bordeaux, France},
author = {Brandon Holt and Irene Zhang and Dan R. K. Ports and Mark
Oskin and Luis Ceze},
booktitle = {Proceedings of the 10th {ACM} {SIGOPS} {E}uro{S}ys
({EuroSys '15})},
month = apr,
note = {Poster},
organization = {{ACM}},
title = {Claret: Using Data Types for Highly Concurrent Distributed
Transactions},
year = {2015},
monthnum = {04}
}
@inproceedings{ports15:_desig_distr_system_using_approx,
address = {Oakland, CA, USA},
author = {Dan R. K. Ports and Jialin Li and Vincent Liu and Naveen
Kr. Sharma and Arvind Krishnamurthy},
booktitle = {Proceedings of the 12th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'15)},
month = may,
organization = {{USENIX}},
title = {Designing Distributed Systems Using Approximate Synchrony
in Datacenter Networks},
year = {2015},
abstract = {Distributed systems are traditionally designed
independently from the underlying network, making
worst-case assumptions (e.g., complete asynchrony) about
its behavior. However, many of today's distributed
applications are deployed in data centers, where the
network is more reliable, predictable, and extensible. In
these environments, it is possible to co-design distributed
systems with their network layer, and doing so can offer
substantial benefits. This paper explores network-level
mechanisms for providing Mostly-Ordered Multicast
(MOM): a best-effort ordering property for concurrent
multicast operations. Using this primitive, we design
Speculative Paxos, a state machine replication
protocol that relies on the network to order requests in
the normal case. This approach leads to substantial
performance benefits: under realistic data center
conditions, Speculative Paxos can provide 40\% lower
latency and 2.6x higher throughput than the standard Paxos
protocol. It offers lower latency than a latency-optimized
protocol (Fast Paxos) with the same throughput as a
throughput-optimized protocol (batching).},
pdf = {papers/specpaxos-nsdi15.pdf},
slidespdf = {papers/specpaxos-nsdi15-slides.pdf},
award = {Best Paper Award},
code = {https://github.com/uwsyslab/specpaxos/},
monthnum = {05}
}
@inproceedings{sharma15:_transtorm_poster,
address = {Monterey, CA, USA},
author = {Naveen Kr. Sharma and Brandon Holt and Irene Zhang and Dan
R. K. Ports and Marcos Aguilera},
booktitle = {Proceedings of the 25th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '15)},
month = oct,
note = {Poster},
organization = {{ACM}},
title = {Transtorm: a benchmark suite for transactional key-value
storage systems},
year = {2015},
monthnum = {10}
}
@inproceedings{zhang15:_build_consis_trans_incon_replic,
address = {Monterey, CA, USA},
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
booktitle = {Proceedings of the 25th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '15)},
month = oct,
organization = {{ACM}},
title = {Building Consistent Transactions with Inconsistent
Replication},
year = {2015},
abstract = {Application programmers increasingly prefer distributed
storage systems with strong consistency and distributed
transactions (e.g., Google's Spanner) for their strong
guarantees and ease of use. Unfortunately, existing
transactional storage systems are expensive to use -- in
part because they require costly replication protocols,
like Paxos, for fault tolerance. In this paper, we present
a new approach that makes transactional storage systems
more affordable: we eliminate consistency from the
replication protocol while still providing distributed
transactions with strong consistency to applications.
We present TAPIR -- the Transactional Application Protocol
for Inconsistent Replication -- the first transaction
protocol to use a novel replication protocol, called
inconsistent replication, that provides fault tolerance
without consistency. By enforcing strong consistency only
in the transaction protocol, TAPIR can commit transactions
in a single round-trip and order distributed transactions
without centralized coordination. We demonstrate the use of
TAPIR in a transactional key-value store, TAPIR-KV.
Compared to conventional systems, TAPIR-KV provides better
latency and throughput.},
doi = {10.1145/2815400.2815404},
pdf = {papers/tapir-sosp15.pdf},
code = {https://github.com/uwsyslab/tapir},
monthnum = {10}
}
@inproceedings{li14:_tales_tail,
address = {Seattle, WA, USA},
author = {Jialin Li and Naveen Kr. Sharma and Dan R. K. Ports and
Steven D. Gribble},
booktitle = {Proceedings of the 5th Symposium on Cloud Computing
({SOCC} '14)},
month = nov,
organization = {{ACM}},
title = {Tales of the Tail: Hardware, {OS}, and Application-level
Sources of Tail Latency},
year = {2014},
abstract = {Interactive services often have large-scale parallel
implementations. To deliver fast responses, the median and
tail latencies of a service's components must be low. In
this paper, we explore the hardware, OS, and
application-level sources of poor tail latency in high
throughput servers executing on multi-core machines. We
model these network services as a queuing system in order
to establish the best-achievable latency distribution.
Using fine-grained measurements of three different servers
(a null RPC service, Memcached, and Nginx) on Linux, we
then explore why these servers exhibit significantly worse
tail latencies than queuing models alone predict. The
underlying causes include interference from background
processes, request re-ordering caused by poor scheduling or
constrained concurrency models, suboptimal interrupt
routing, CPU power saving mechanisms, and NUMA effects.
We systematically eliminate these factors and show that
Memcached can achieve a median latency of 11 us and a
99.9th percentile latency of 32 us at 80\% utilization on a
four-core system. In comparison, a naive deployment of
Memcached at the same utilization on a single-core system
has a median latency of 100 us and a 99.9th percentile
latency of 5 ms. Finally, we demonstrate that tradeoffs
exist between throughput, energy, and tail latency.},
doi = {10.1145/2670979.2670988},
pdf = {papers/latency-socc14.pdf},
monthnum = {11}
}
@inproceedings{peter14:_arrak,
address = {Broomfield, CO, USA},
author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
Ports and Doug Woos and Arvind Krishnamurthy and Thomas
Anderson and Timothy Roscoe},
booktitle = {Proceedings of the 11th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'14)},
month = oct,
organization = {{USENIX}},
title = {Arrakis: The Operating System is the Control Plane},
year = {2014},
abstract = {Recent device hardware trends enable a new approach to the
design of network server operating systems. In a
traditional operating system, the kernel mediates access to
device hardware by server applications, to enforce process
isolation as well as network and disk security. We have
designed and implemented a new operating system, Arrakis,
that splits the traditional role of the kernel in two.
Applications have direct access to virtualized I/O devices,
allowing most I/O operations to skip the kernel entirely,
while the kernel is re-engineered to provide network and
disk protection without kernel mediation of every
operation. We describe the hardware and software changes
needed to take advantage of this new abstraction, and we
illustrate its power by showing improvements of 2-5x in
latency and 9x in throughput for a popular persistent NoSQL
store relative to a well-tuned Linux implementation.},
pdf = {papers/arrakis-osdi14.pdf},
award = {Jay Lepreau Best Paper Award},
code = {https://github.com/UWNetworksLab/arrakis},
monthnum = {10}
}
@inproceedings{peter14:_towar_high_perfor_applic_level_storag_manag,
address = {Philadelphia, PA, USA},
author = {Simon Peter and Jialin Li and Doug Woos and Irene Zhang
and Dan R. K. Ports and Thomas Anderson and Arvind
Krishnamurthy and Mark Zbikowski},
booktitle = {Proceedings of the 6th {USENIX} Workshop on Hot Topics in
Storage and File Systems ({HotStorage} '14)},
month = jun,
organization = {{USENIX}},
title = {Towards High-Performance Application-Level Storage
Management},
year = {2014},
abstract = {We propose a radical re-architecture of the traditional
operating system storage stack to move the kernel off the
data path. Leveraging virtualized I/O hardware for disk and
flash storage, most read and write I/O operations go
directly to application code. The kernel dynamically
allocates extents, manages the virtual to physical binding,
and performs name translation. The benefit is to
dramatically reduce the CPU overhead of storage operations
while improving application flexibility.},
pdf = {papers/arrakis-hotstorage14.pdf},
monthnum = {06}
}
@inproceedings{zhang14:_optim_replic_two_phase_commit,
address = {Beijing, China},
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
booktitle = {Proceedings of the 5th Asia-Pacific Workshop on Systems
({APSYS} '14)},
key = {APSys 2014},
month = jun,
note = {Poster and extended abstract},
title = {Optimistic Replicated Two-Phase Commit},
year = {2014},
monthnum = {06}
}
@inproceedings{zhuo14:_machin_fault_toler_reliab_datac_system,
address = {Beijing, China},
author = {Danyang Zhuo and Qiao Zhang and Dan R. K. Ports and Arvind
Krishnamurthy and Thomas Anderson},
booktitle = {Proceedings of the 5th Asia-Pacific Workshop on Systems
({APSYS} '14)},
key = {APSys 2014},
month = jun,
title = {Machine Fault Tolerance for Reliable Datacenter Systems},
year = {2014},
abstract = {Although rare in absolute terms, undetected CPU, memory,
and disk errors occur often enough at data center scale to
significantly affect overall system reliability and
availability. In this paper, we propose a new failure
model, called Machine Fault Tolerance, and a new
abstraction, a replicated write-once trusted table, to
provide improved resilience to these types of failures.
Since most machine failures manifest in application server
and operating system code, we assume a Byzantine model for
those parts of the system. However, by assuming that the
hypervisor and network are trustworthy, we are able to
reduce the overhead of machine-fault masking to be close to
that of non-Byzantine Paxos.},
doi = {10.1145/2637166.2637235},
pdf = {papers/mft-apsys14.pdf},
monthnum = {06}
}
@inproceedings{cheng12:_abstr_usabl_infor_flow_contr_aeolus,
address = {Boston, MA, USA},
author = {Winnie Cheng and Dan R. K. Ports and David Schultz and
Victoria Popic and Aaron Blankstein and James Cowling and
Dorothy Curtis and Liuba Shrira and Barbara Liskov},
booktitle = {Proceedings of the 2012 {USENIX} {A}nnual {T}echnical
{C}onference},
month = jun,
organization = {{USENIX}},
title = {Abstractions for Usable Information Flow Control in
{Aeolus}},
year = {2012},
abstract = {Despite the increasing importance of protecting
confidential data, building secure software remains as
challenging as ever. This paper describes Aeolus, a new
platform for building secure distributed applications.
Aeolus uses information flow control to provide
confidentiality and data integrity. It differs from
previous information flow control systems in a way that we
believe makes it easier to understand and use. Aeolus uses
a new, simpler security model, the first to combine a
standard principal-based scheme for authority management
with thread-granularity information flow tracking. The
principal hierarchy matches the way developers already
reason about authority and access control, and the
coarse-grained information flow tracking eases the task of
defining a program's security restrictions. In addition,
Aeolus provides a number of new mechanisms (authority
closures, compound tags, boxes, and shared volatile state)
that support common design patterns in secure application
design.},
pdf = {papers/aeolus-usenix12.pdf},
slidespdf = {papers/aeolus-usenix12-slides.pdf},
code = {http://pmg.csail.mit.edu/aeolus/#sw},
monthnum = {06}
}
@inproceedings{ports10:_trans_consis_autom_manag_applic_data_cache,
address = {Vancouver, BC, Canada},
author = {Dan R. K. Ports and Austin T. Clements and Irene Zhang and
Samuel Madden and Barbara Liskov},
booktitle = {Proceedings of the 9th {USENIX} {S}ymposium on {O}perating
{S}ystems {D}esign and {I}mplementation ({OSDI} '10)},
month = oct,
organization = {{USENIX}},
title = {Transactional Consistency and Automatic Management in an
Application Data Cache},
year = {2010},
abstract = {Distributed in-memory application data caches like
memcached are a popular solution for scaling
database-driven web sites. These systems are easy to add to
existing deployments, and increase performance
significantly by reducing load on both the database and
application servers. Unfortunately, such caches do not
integrate well with the database or the application. They
cannot maintain transactional consistency across the entire
system, violating the isolation properties of the
underlying database. They leave the application responsible
for locating data in the cache and keeping it up to date, a
frequent source of application complexity and programming
errors. Addressing both of these problems, we introduce
a transactional cache, TxCache, with a simple programming
model. TxCache ensures that any data seen within a
transaction, whether it comes from the cache or the
database, reflects a slightly stale but consistent snapshot
of the database. TxCache makes it easy to add caching to an
application by simply designating functions as cacheable;
it automatically caches their results, and invalidates the
cached data as the underlying database changes. Our
experiments found that adding TxCache increased the
throughput of a web application by up to 5.2x, only
slightly less than a non-transactional cache, showing that
consistency does not have to come at the price of
performance.},
pdf = {papers/txcache-osdi10.pdf},
psgz = {papers/txcache-osdi10.ps.gz},
slidespdf = {papers/txcache-osdi10-slides.pdf},
code = {https://github.com/drkp/txcache/},
monthnum = {10}
}
@inproceedings{cowling09:_census,
address = {San Diego, CA, USA},
author = {James Cowling and Dan R. K. Ports and Barbara Liskov and
Raluca Ada Popa and Abhijeet Gaikwad},
booktitle = {Proceedings of the 2009 {USENIX} {A}nnual {T}echnical
{C}onference},
month = jun,
organization = {{USENIX}},
title = {Census: Location-Aware Membership Management for
Large-Scale Distributed Systems},
year = {2009},
abstract = {We present Census, a platform for building large-scale
distributed applications. Census provides a membership
service and a multicast mechanism. The membership service
provides every node with a consistent view of the system
membership, which may be global or partitioned into
location-based regions. Census distributes membership
updates with low overhead, propagates changes promptly, and
is resilient to both crashes and Byzantine failures. We
believe that Census is the first system to provide a
consistent membership abstraction at very large scale,
greatly simplifying the design of applications built atop
large deployments such as multi-site data centers.
Census builds on a novel multicast mechanism that is
closely integrated with the membership service. It
organizes nodes into a reliable overlay composed of
multiple distribution trees, using network coordinates to
minimize latency. Unlike other multicast systems, it avoids
the cost of using distributed algorithms to construct and
maintain trees. Instead, each node independently produces
the same trees from the consistent membership view. Census
uses this multicast mechanism to distribute membership
updates, along with application-provided messages.
We
evaluate the platform under simulation and on a real-world
deployment on PlanetLab. We find that it imposes minimal
bandwidth overhead, is able to react quickly to node
failures and changes in the system membership, and can
scale to substantial size.},
pdf = {papers/census-usenix09.pdf},
psgz = {papers/census-usenix09.ps.gz},
slidespdf = {papers/census-usenix09-slides.pdf},
monthnum = {06}
}
@inproceedings{ports09:_trans_cachin_applic_data_recen_snaps,
address = {Big Sky, MT, USA},
author = {Dan R. K. Ports and Austin T. Clements and Irene Y. Zhang
and Samuel Madden and Barbara Liskov},
booktitle = {Proceedings of the 22nd {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '09)},
month = oct,
note = {Work in Progress report},
organization = {{ACM}},
title = {Transactional Caching of Application Data using Recent
Snapshots},
year = {2009},
abstract = {Many of today's well-known websites use application data
caches to reduce the bottleneck load on the database, as
well as the computational load on the application servers.
Distributed in-memory shared caches, exemplified by
memcached, are one popular approach. These caches
typically provide a get/put interface, akin to a
distributed hash table; the application chooses what data
to keep in the cache and keeps it up to date. By storing
the cache entirely in memory and horizontally partitioning
among nodes, in-memory caches provide quick response times
and ease of scaling. However, existing caches have no
notion of transactional consistency: there is no way to
ensure that two accesses to the cache reflect a view of the
database at the same point in time. While the backing
database goes to great lengths to ensure this property
(serializable isolation), the caching layer violates these
guarantees. The resulting inconsistencies can have
unpleasant consequences if exposed to the user (e.g.,
attributing the latest bid to the wrong user on an auction
site), or add complexity to application code by forcing it
to cope with temporarily violated invariants.
We argue
that transactional semantics are not incompatible with
cache performance and scalability. We introduce a
transactional cache, TxCache, which guarantees that
all values retrieved from the cache or database during a
transaction reflect a consistent snapshot of the database.
TxCache also strives to simplify application design by
helping manage the cache. Instead of requiring applications
to manually insert and check for values in the cache,
TxCache provides a library with which programmers simply
designate functions as cacheable, and the library checks
the cache for previous calls with the same arguments. In
particular, and unlike memcached, TxCache does not require
applications to explicitly invalidate cached values;
correctly identifying the values to invalidate is difficult
because it requires global reasoning about the
application.},
pdf = {papers/txcache-sosp09wip-abstract.pdf},
psgz = {papers/txcache-sosp09wip-abstract.ps.gz},
slidespdf = {papers/txcache-sosp09wip-slides.pdf},
monthnum = {10}
}
@inproceedings{chen08:_overs,
address = {Seattle, WA, USA},
author = {Xiaoxin Chen and Tal Garfinkel and E. Christopher Lewis
and Pratap Subrahmanyam and Carl A. Waldspurger and Dan
Boneh and Jeffrey Dwoskin and Dan R. K. Ports},
booktitle = {Proceedings of the 13th {I}nternational {C}onference on
{A}rchitectural {S}upport for {P}rogramming {L}anguages and
{O}perating {S}ystems ({ASPLOS '08})},
month = mar,
organization = {{ACM}},
title = {Overshadow: A Virtualization-Based Approach to
Retrofitting Protection in Commodity Operating Systems},
year = {2008},
abstract = {Commodity operating systems entrusted with securing
sensitive data are remarkably large and complex, and
consequently, frequently prone to compromise. To address
this limitation, we introduce a virtual-machine-based
system called Overshadow that protects the privacy and
integrity of application data, even in the event of a total
OS compromise. Overshadow presents an application with a
normal view of its resources, but the OS with an encrypted
view. This allows the operating system to carry out the
complex task of managing an application's resources,
without allowing it to read or modify them. Thus,
Overshadow offers a last line of defense for application
data. Overshadow builds on multi-shadowing, a novel
mechanism that presents different views of ``physical''
memory, depending on the context performing the access.
This primitive offers an additional dimension of protection
beyond the hierarchical protection domains implemented by
traditional operating systems and processor architectures.
We present the design and implementation of Overshadow
and show how its new protection semantics can be integrated
with existing systems. Our design has been fully
implemented and used to protect a wide range of unmodified
legacy applications running on an unmodified Linux
operating system. We evaluate the performance of our
implementation, demonstrating that this approach is
practical.},
doi = {10.1145/1346281.1346284},
pdf = {papers/overshadow-asplos08.pdf},
psgz = {papers/overshadow-asplos08.ps.gz},
monthnum = {03}
}
@inproceedings{ports08:_towar_applic_secur_untrus_operat_system,
address = {San Jose, CA, USA},
author = {Dan R. K. Ports and Tal Garfinkel},
booktitle = {Proceedings of the 3rd Workshop on Hot Topics in Security
(HotSec '08)},
month = jul,
organization = {{USENIX}},
title = {Towards Application Security on Untrusted Operating
Systems},
year = {2008},
abstract = {Complexity in commodity operating systems makes
compromises inevitable. Consequently, a great deal of work
has examined how to protect security-critical portions of
applications from the OS through mechanisms such as
microkernels, virtual machine monitors, and new processor
architectures. Unfortunately, most work has focused on CPU
and memory isolation and neglected OS semantics. Thus,
while much is known about how to prevent OS and application
processes from modifying each other, far less is understood
about how different OS components can undermine application
security if they turn malicious. We consider this
problem in the context of our work on Overshadow, a
virtual-machine-based system for retrofitting protection in
commodity operating systems. We explore how malicious
behavior in each major OS subsystem can undermine
application security, and present potential mitigations.
While our discussion is presented in terms of Overshadow
and Linux, many of the problems and solutions are
applicable to other systems where trusted applications rely
on untrusted, potentially malicious OS components.},
pdf = {papers/overshadow-hotsec08.pdf},
psgz = {papers/overshadow-hotsec08.ps.gz},
slidespdf = {papers/overshadow-hotsec08-slides.pdf},
monthnum = {07}
}
@inproceedings{clements05:_arpeg,
address = {Ithaca, NY, USA},
author = {Austin T. Clements and Dan R. K. Ports and David R.
Karger},
booktitle = {Proceedings of the 4th International Workshop on
Peer-to-Peer Systems ({IPTPS} '05)},
key = {IPTPS '05},
month = feb,
pages = {58--68},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
title = {Arpeggio: Metadata Searching and Content Sharing with
{C}hord},
volume = {3640},
year = {2005},
abstract = {Arpeggio is a peer-to-peer file-sharing network based on
the Chord lookup primitive. Queries for data whose metadata
matches a certain criterion are performed efficiently by
using a distributed keyword-set index, augmented with
index-side filtering. We introduce index gateways, a
technique for minimizing index maintenance overhead.
Because file data is large, Arpeggio employs subrings to
track live source peers without the cost of inserting the
data itself into the network. Finally, we introduce
postfetching, a technique that uses information in the
index to improve the availability of rare files. The result
is a system that provides efficient query operations with
the scalability and reliability advantages of full
decentralization, and a content distribution system tuned
to the requirements and capabilities of a peer-to-peer
network.},
pdf = {papers/arpeggio-iptps05.pdf},
psgz = {papers/arpeggio-iptps05.ps.gz},
slidespdf = {papers/arpeggio-iptps05-slides.pdf},
monthnum = {02}
}
@inproceedings{ports05:_persif,
address = {Brighton, United Kingdom},
author = {Dan R. K. Ports and Austin T. Clements and Erik D.
Demaine},
booktitle = {Proceedings of the 20th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '05)},
month = oct,
note = {Poster and extended abstract},
organization = {{ACM}},
title = {{PersiFS}: A Versioned File System with an Efficient
Representation},
year = {2005},
doi = {10.1145/1095810.1118598},
monthnum = {10}
}
@inproceedings{clements04:_arpeg,
address = {Cambridge, MA, USA},
author = {Austin T. Clements and Dan R. K. Ports and David R.
Karger},
booktitle = {Proceedings of the 2nd Project IRIS Student Workshop
({ISW} '04)},
key = {ISW '04},
month = nov,
note = {Poster and extended abstract.},
title = {Arpeggio: Efficient Metadata-based Searching and File
Transfer with {DHTs}},
year = {2004},
abstract = {Arpeggio is a peer-to-peer file-sharing network
based on the Chord distributed hash table. Queries for
files whose metadata matches a certain criterion are
performed efficiently by using a distributed
keyword-set index, augmented with index-side
filtering. We introduce metadata gateways, a
technique for minimizing index maintenance overhead.
Arpeggio also uses the DHT for indirect
storage of file contents, maintaining pointers from
content to the live peers that provide it. Finally, we
introduce postfetching, a technique that uses
information in the index to improve the availability of
rare files. The result is a system that provides efficient
query operations with the scalability and reliability
advantages of full decentralization, and a content
distribution system tuned to the requirements of a
peer-to-peer file-sharing network.},
monthnum = {11}
}
@mastersthesis{ports07:_metad_index_in_struc_peer,
address = {Cambridge, MA, USA},
author = {Dan R. K. Ports},
month = feb,
school = {Massachusetts Institute of Technology},
type = {M.Eng. thesis},
title = {Arpeggio: Metadata Indexing in a Structured Peer-to-Peer
Network},
year = {2007},
abstract = {Peer-to-peer networks require an efficient means for
performing searches for files by metadata keywords.
Unfortunately, current methods usually sacrifice either
scalability or recall. Arpeggio is a peer-to-peer
file-sharing network that uses the Chord lookup primitive
as a basis for constructing a distributed keyword-set index,
augmented with index-side filtering, to address this
problem. We introduce index gateways, a technique for
minimizing index maintenance overhead. Arpeggio also
includes a content distribution system for finding source
peers for a file; we present a novel system that uses Chord
subrings to track live source peers without the cost of
inserting the data itself into the network, and supports
postfetching: using information in the index to improve the
availability of rare files. The result is a system that
provides efficient query operations with the scalability
and reliability advantages of full decentralization. We use
analysis and simulation results to show that our indexing
system has reasonable storage and bandwidth costs, and
improves load distribution.},
doi = {1721.1/41663},
pdf = {papers/arpeggio-meng.pdf},
psgz = {papers/arpeggio-meng.ps.gz},
monthnum = {02}
}
@article{liu25:_hyper,
author = {Ziyuan Liu and Zhixiong Niu and Ran Shu and Wenxue Cheng
and Lihua Yuan and Jacob Nelson and Dan R. K. Ports and
Peng Cheng and Yongqiang Xiong},
journal = {{IEEE} Transactions on Cloud Computing},
month = apr,
number = {2},
pages = {498-511},
title = {{HyperDrive}: Direct Network Telemetry Storage via
Programmable Switches},
volume = {13},
year = {2025},
abstract = {In cloud datacenter operations, telemetry and logs are
indispensable, enabling essential services such as network
diagnostics, auditing, and knowledge discovery. The
escalating scale of data centers, coupled with increased
bandwidth and finer-grained telemetry, results in an
overwhelming volume of data. This proliferation poses
significant storage challenges for telemetry systems. In
this paper, we introduce HyperDrive, an innovative system
designed to efficiently store large volumes of telemetry
and logs in data centers using programmable switches. This
in-network approach effectively mitigates bandwidth
bottlenecks commonly associated with traditional
endpoint-based methods. To our knowledge, we are the first
to use a programmable switch to directly control storage,
bypassing the CPU to achieve the best performance. With
merely 21\% of a switch's resources, our HyperDrive
implementation showcases remarkable scalability and
efficiency. Through rigorous evaluation, it has
demonstrated linear scaling capabilities, efficiently
managing 12 SSDs on a single server with minimal host
overhead. In an eight-server testbed, HyperDrive achieved
an impressive throughput of approximately 730Gbps,
underscoring its potential to transform data center
telemetry and logging practices.},
doi = {10.1109/TCC.2025.3543477},
monthnum = {04}
}
@techreport{szekeres20:_makin_distr_mobil_applic_safe,
author = {Adriana Szekeres and Irene Zhang and Katelin Bailey and
Isaac Ackerman and Haichen Shen and Franziska Roesner and
Dan R. K. Ports and Arvind Krishnamurthy and Henry M.
Levy},
institution = {arXiv},
month = aug,
number = {2008.06536},
type = {arXiv preprint},
title = {Making Distributed Mobile Applications {SAFE}: Enforcing
User Privacy Policies on Untrusted Applications with Secure
Application Flow Enforcement},
year = {2020},
abstract = {Today's mobile devices sense, collect, and store huge
amounts of personal information, which users share with
family and friends through a wide range of applications.
Once users give applications access to their data, they
must implicitly trust that the apps correctly maintain data
privacy. As we know from both experience and
all-too-frequent press articles, that trust is often
misplaced. While users do not trust applications, they do
trust their mobile devices and operating systems.
Unfortunately, sharing applications are not limited to
mobile clients but must also run on cloud services to share
data between users. In this paper, we leverage the trust
that users have in their mobile OSes to vet cloud services.
To do so, we define a new Secure Application Flow
Enforcement (SAFE) framework, which requires cloud services
to attest to a system stack that will enforce policies
provided by the mobile OS for user data. We implement a
mobile OS that enforces SAFE policies on unmodified mobile
apps and two systems for enforcing policies on untrusted
cloud services. Using these prototypes, we demonstrate that
it is possible to enforce existing user privacy policies on
unmodified applications.},
doi = {10.48550/arXiv.2008.06536},
pdf = {papers/agate-arxiv20.pdf},
monthnum = {08}
}
@techreport{sapio19:_scalin_distr_machin_learn_in_networ_aggreg,
author = {Amedeo Sapio and Marco Canini and Chen-Yu Ho and Jacob
Nelson and Panos Kalnis and Changhoon Kim and Arvind
Krishnamurthy and Masoud Moshref and Dan R. K. Ports and
Peter Richtarik},
institution = {arXiv},
month = feb,
note = {version 2, Sep. 2020},
number = {1903.06701},
type = {arXiv preprint},
title = {Scaling Distributed Machine Learning with In-Network
Aggregation},
year = {2019},
abstract = {Training machine learning models in parallel is an
increasingly important workload. We accelerate distributed
parallel training by designing a communication primitive
that uses a programmable switch dataplane to execute a key
step of the training process. Our approach, SwitchML,
reduces the volume of exchanged data by aggregating the
model updates from multiple workers in the network. We
co-design the switch processing with the end-host protocols
and ML frameworks to provide an efficient solution that
speeds up training by up to 5.5× for a number of
real-world benchmark models.},
doi = {10.48550/arXiv.1903.06701},
pdf = {papers/switchml-arxiv-v2.pdf},
supersededby = {sapio21:_scalin_distr_machin_learn_in_networ_aggreg},
supersededas = {arXiv version (2020)},
monthnum = {02}
}
@techreport{szekeres19:_meerk,
address = {Seattle, WA, USA},
author = {Adriana Szekeres and Michael Whittaker and Naveen Kr.
Sharma and Jialin Li and Arvind Krishnamurthy and Irene
Zhang and Dan R. K. Ports},
institution = {University of Washington CSE},
month = nov,
number = {UW-CSE-2019-11-02},
title = {Meerkat: Scalable Replicated Transactions Following the
Zero-Coordination Principle},
year = {2019},
abstract = {Traditionally, the high cost of network communication
between servers has hidden the impact of cross-core
coordination in replicated systems. However, new
technologies, like kernel-bypass networking and faster
network links, have exposed hidden bottlenecks in
distributed systems. This paper explores how to build
multicore-scalable, replicated storage systems. We
introduce a new guideline for their design, called the
Zero-Coordination Principle. We use this principle to
design a new multicore-scalable, in-memory, replicated,
key-value store, called Meerkat.
Unlike existing
systems, Meerkat eliminates all cross-core and
cross-replica coordination, both of which pose a
scalability bottleneck. Our experiments found that Meerkat
is able to scale up to 80 hyper-threads and execute 8.3
million transactions per second. Meerkat represents an
improvement of 12x on state-of-the-art, fault-tolerant,
in-memory, transactional storage systems built using
leader-based replication and a shared transaction log.},
pdf = {papers/meerkat-tr19.pdf},
monthnum = {11}
}
@article{zhu19:_harmon,
author = {Hang Zhu and Zhihao Bai and Jialin Li and Ellis Michael
and Dan R. K. Ports and Ion Stoica and Xin Jin},
journal = {Proceedings of the VLDB Endowment},
month = nov,
number = {3},
pages = {376--389},
title = {Harmonia: Near-Linear Scalability for Replicated Storage
with In-Network Conflict Detection},
volume = {13},
year = {2019},
abstract = {Distributed storage employs replication to mask failures
and improve availability. However, these systems typically
exhibit a hard tradeoff between consistency and
performance. Ensuring consistency introduces coordination
overhead, and as a result the system throughput does not
scale with the number of replicas. We present Harmonia, a
replicated storage architecture that exploits the
capability of new-generation programmable switches to
obviate this tradeoff by providing near-linear scalability
without sacrificing consistency. To achieve this goal,
Harmonia detects read-write conflicts in the network, which
enables any replica to serve reads for objects with no
pending writes. Harmonia implements this functionality at
line rate, thus imposing no performance overhead. We have
implemented a prototype of Harmonia on a cluster of
commodity servers connected by a Barefoot Tofino switch,
and have integrated it with Redis. We demonstrate the
generality of our approach by supporting a variety of
replication protocols, including primary-backup, chain
replication, Viewstamped Replication, and NOPaxos.
Experimental results show that Harmonia improves the
throughput of these protocols by up to 10x for a
replication factor of 10, providing near-linear scalability
up to the limit of our testbed.},
doi = {10.14778/3368289.3368301},
pdf = {papers/harmonia-vldb20.pdf},
monthnum = {11}
}
@techreport{zhu19:_harmon_arxiv,
author = {Hang Zhu and Zhihao Bai and Jialin Li and Ellis Michael
and Dan R. K. Ports and Ion Stoica and Xin Jin},
institution = {arXiv},
month = apr,
number = {1904.08964},
type = {arXiv preprint},
title = {Harmonia: Near-Linear Scalability for Replicated Storage
with In-Network Conflict Detection},
year = {2019},
abstract = {Distributed storage employs replication to mask failures
and improve availability. However, these systems typically
exhibit a hard tradeoff between consistency and
performance. Ensuring consistency introduces coordination
overhead, and as a result the system throughput does not
scale with the number of replicas. We present Harmonia, a
replicated storage architecture that exploits the
capability of new-generation programmable switches to
obviate this tradeoff by providing near-linear scalability
without sacrificing consistency. To achieve this goal,
Harmonia detects read-write conflicts in the network, which
enables any replica to serve reads for objects with no
pending writes. Harmonia implements this functionality at
line rate, thus imposing no performance overhead. We have
implemented a prototype of Harmonia on a cluster of
commodity servers connected by a Barefoot Tofino switch,
and have integrated it with Redis. We demonstrate the
generality of our approach by supporting a variety of
replication protocols, including primary-backup, chain
replication, Viewstamped Replication, and NOPaxos.
Experimental results show that Harmonia improves the
throughput of these protocols by up to 10x for a
replication factor of 10, providing near-linear scalability
up to the limit of our testbed.},
doi = {10.48550/arXiv.1904.08964},
pdf = {papers/harmonia-arxiv19.pdf},
monthnum = {04}
}
@techreport{li18:_pegas,
address = {Seattle, WA, USA},
author = {Jialin Li and Jacob Nelson and Xin Jin and Dan R. K.
Ports},
institution = {University of Washington CSE},
month = dec,
number = {UW-CSE-18-12-01},
title = {Pegasus: Load-Aware Selective Replication with an
In-Network Coherence Directory},
year = {2018},
abstract = {High performance distributed storage systems face the
challenge of load imbalance caused by skewed and dynamic
workloads. This paper introduces Pegasus, a new storage
architecture that leverages new-generation programmable
switch ASICs to balance load across storage servers.
Pegasus uses selective replication of the most popular
objects in the data store to distribute load. Using a novel
in-network coherence directory, the Pegasus switch tracks
and manages the location of replicated objects. This allows
it to achieve load-aware forwarding and dynamic rebalancing
for replicated keys, while still guaranteeing data
coherence. The Pegasus design is practical to implement as
it stores only forwarding metadata in the switch data
plane. The resulting system improves the 99\% tail latency
of a distributed in-memory key-value store by more than
95\%, and yields up to a 9x throughput improvement under a
latency SLO -- results which hold across a large set of
workloads with varying degrees of skewness, read/write
ratio, and dynamism.},
pdf = {papers/pegasus-tr18.pdf},
supersededby = {li20:_pegas},
supersededas = {Tech. Report (2018)},
monthnum = {12}
}
@article{zhang18:_build_consis_trans_incon_replic,
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
journal = {{ACM} Transactions on Computer Systems},
month = dec,
number = {4},
pages = {12},
title = {Building Consistent Transactions with Inconsistent
Replication},
volume = {35},
year = {2018},
abstract = {Application programmers increasingly prefer distributed
storage systems with strong consistency and distributed
transactions (e.g., Google's Spanner) for their strong
guarantees and ease of use. Unfortunately, existing
transactional storage systems are expensive to use -- in
part because they require costly replication protocols,
like Paxos, for fault tolerance. In this paper, we present
a new approach that makes transactional storage systems
more affordable: we eliminate consistency from the
replication protocol while still providing distributed
transactions with strong consistency to applications.
We present TAPIR -- the Transactional Application Protocol
for Inconsistent Replication -- the first transaction
protocol to use a novel replication protocol, called
inconsistent replication, that provides fault tolerance
without consistency. By enforcing strong consistency only
in the transaction protocol, TAPIR can commit transactions
in a single round-trip and order distributed transactions
without centralized coordination. We demonstrate the use of
TAPIR in a transactional key-value store, TAPIR-KV.
Compared to conventional systems, TAPIR-KV provides both
better latency and better throughput.},
doi = {10.1145/3269981},
pdf = {papers/tapir-tocs18.pdf},
code = {https://github.com/uwsyslab/tapir/},
cats = {https://discuss.systems/@dan/109389114238305747},
monthnum = {12}
}
@techreport{li17:_eris_tr,
address = {Seattle, WA, USA},
author = {Jialin Li and Ellis Michael and Dan R. K. Ports},
institution = {University of Washington CSE},
month = oct,
number = {UW-CSE-TR-17-10-01},
title = {{Eris}: Coordination-Free Consistent Transactions using
Network Multi-Sequencing (Extended Version)},
year = {2017},
abstract = {Distributed storage systems aim to provide strong
consistency and isolation guarantees on an architecture
that is partitioned across multiple shards for scalability
and replicated for fault-tolerance. Traditionally,
achieving all of these goals has required an expensive
combination of atomic commitment and replication protocols
-- introducing extensive coordination overhead. Our system,
Eris, takes a very different approach. It moves a core
piece of concurrency control functionality, which we term
multi-sequencing, into the datacenter network itself. This
network primitive takes on the responsibility for
consistently ordering transactions, and a new lightweight
transaction protocol ensures atomicity. The end result is
that Eris avoids both replication and transaction
coordination overhead: we show that it can process a large
class of distributed transactions in a single round-trip
from the client to the storage system without any explicit
coordination between shards or replicas. It provides
atomicity, consistency, and fault-tolerance with less than
10\% overhead -- achieving throughput 4.5--35x higher and
latency 72--80\% lower than a conventional design on
standard benchmarks.},
pdf = {papers/eris-tr17.pdf},
monthnum = {10}
}
@techreport{michael17:_recov_shared_objec_without_stabl,
address = {Seattle, WA, USA},
author = {Ellis Michael and Dan R. K. Ports and Naveen Kr. Sharma
and Adriana Szekeres},
institution = {University of Washington CSE},
month = aug,
number = {UW-CSE-17-08-01},
title = {Recovering Shared Objects Without Stable Storage (Extended
Version)},
year = {2017},
abstract = {This paper considers the problem of building
fault-tolerant shared objects when processes can crash and
recover but lose their persistent state on recovery. This
Diskless Crash-Recovery (DCR) model matches the way many
long-lived systems are built. We show that it presents new
challenges, as operations that are recorded at a quorum may
not persist after some of the processes in that quorum
crash and then recover. To address this problem, we
introduce the notion of crash-consistent quorums,
where no recoveries happen during the quorum responses. We
show that relying on crash-consistent quorums enables a
recovery procedure that can recover all operations that
successfully finished. Crash-consistent quorums can be
easily identified using a mechanism we term the crash
vector, which tracks the causal relationship between
crashes, recoveries, and other operations.
We apply
crash-consistent quorums and crash vectors to build two
storage primitives. We give a new algorithm for
multi-reader multi-writer atomic registers in the DCR model
that guarantees safety under all conditions and termination
under a natural condition. It improves on the best prior
protocol for this problem by requiring fewer rounds, fewer
nodes to participate in the quorum, and a less restrictive
liveness condition. We also present a more efficient
single-reader, single-writer atomic set---a virtual
stable storage abstraction. It can be used to lift any
existing algorithm from the traditional Crash-Recovery with
Stable Storage model to the DCR model. We examine a
specific application, state machine replication, and show
that existing diskless protocols can violate their
correctness guarantees, while ours offers a general and
correct solution.},
pdf = {papers/recovery-tr17.pdf},
monthnum = {08}
}
@techreport{holt16:_discip_incon,
address = {Seattle, WA, USA},
author = {Brandon Holt and James Bornholt and Irene Zhang and Dan R.
K. Ports and Mark Oskin and Luis Ceze},
institution = {University of Washington CSE},
month = jun,
number = {UW-CSE-TR-16-06-01},
title = {Disciplined Inconsistency},
year = {2016},
abstract = {Distributed applications and web services, such as online
stores or social networks, are expected to be scalable,
available, responsive, and fault-tolerant. To meet these
steep requirements in the face of high round-trip
latencies, network partitions, server failures, and load
spikes, applications use eventually consistent datastores
that allow them to weaken the consistency of some data.
However, making this transition is highly error-prone
because relaxed consistency models are notoriously
difficult to understand and test. In this work, we
propose a new programming model for distributed data that
makes consistency properties explicit and uses a type
system to enforce consistency safety. With the
Inconsistent, Performance-bound, Approximate (IPA)
storage system, programmers specify performance targets and
correctness requirements as constraints on persistent data
structures and handle uncertainty about the result of
datastore reads using new ``consistency types''. We implement
a prototype of this model in Scala on top of an existing
datastore, Cassandra, and use it to make
performance/correctness tradeoffs in two applications: a
ticket sales service and a Twitter clone. Our evaluation
shows that IPA prevents consistency-based programming
errors and adapts consistency automatically in response to
changing network conditions, performing comparably to weak
consistency and 2-10x faster than strong consistency.},
pdf = {papers/ipa-tr16.pdf},
supersededby = {holt16:_discip_incon_consis_types},
supersededas = {Tech. Report (2016)},
monthnum = {06}
}
@techreport{li16:_fast_replic_nopax_tr,
address = {Seattle, WA, USA},
author = {Jialin Li and Ellis Michael and Adriana Szekeres and
Naveen Kr. Sharma and Dan R. K. Ports},
institution = {University of Washington CSE},
number = {UW-CSE-TR-16-09-02},
title = {Just Say {NO} to {Paxos} Overhead: Replacing Consensus
with Network Ordering (Extended Version)},
year = {2016},
abstract = {Distributed applications use replication, implemented by
protocols like Paxos, to ensure data availability and
transparently mask server failures. This paper presents a
new approach to achieving replication in the data center
without the performance cost of traditional methods. Our
work carefully divides replication responsibility between
the network and protocol layers. The network orders
requests but does not ensure reliable delivery -- using a
new primitive we call ordered unreliable multicast (OUM).
This primitive can be implemented at near-zero cost
in the data center. Our new replication
protocol, Network-Ordered Paxos (NOPaxos), exploits network
ordering to provide strongly consistent replication without
coordination. The resulting system not only outperforms
both latency- and throughput-optimized protocols on their
respective metrics, but also yields throughput within 2\%
and latency within 16 us of an unreplicated system --
providing replication without the performance cost.},
pdf = {papers/nopaxos-tr16.pdf},
code = {https://github.com/uwsyslab/nopaxos/},
monthnum = {}
}
@techreport{michael16:_provid_stabl_storag_diskl_crash,
address = {Seattle, WA, USA},
author = {Ellis Michael and Dan R. K. Ports and Naveen Kr. Sharma
and Adriana Szekeres},
institution = {University of Washington CSE},
month = aug,
number = {UW-CSE-TR-16-08-02},
title = {Providing Stable Storage for the Diskless Crash-Recovery
Failure Model},
year = {2016},
abstract = {Many classic protocols in the fault tolerant distributed
computing literature assume a Crash-Fail model in which
processes either are up, or have crashed and are
permanently down. While this model is useful, it does not
fully capture the difficulties many real systems must
contend with. In particular, real-world systems are
long-lived and must have a recovery mechanism so that
crashed processes can rejoin the system and restore its
fault-tolerance. When processes are assumed to have access
to stable storage that is persistent across failures, the
Crash-Recovery model is trivial. However, because disk
failures are common and because having a disk on a
protocol's critical path is often a performance concern,
diskless recovery protocols are needed. While such
protocols do exist in the state machine replication
literature, several well-known protocols have flawed
recovery mechanisms. We examine these errors to elucidate
the problem of diskless recovery and present our own
protocol for providing virtual stable storage, transforming
any protocol in the Crash-Recovery with stable storage
model into a protocol in the Diskless Crash-Recovery
model.},
pdf = {papers/diskless-tr16.pdf},
monthnum = {08}
}
@article{zhang16:_when_is_operat_order_requir,
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
journal = {{IEEE} Data Engineering Bulletin},
month = mar,
number = {1},
pages = {27--38},
title = {When Is Operation Ordering Required in Replicated
Transactional Storage?},
volume = {39},
year = {2016},
abstract = {Today's replicated transactional storage systems typically
have a layered architecture, combining protocols for
transaction coordination, consistent replication, and
concurrency control. These systems generally require costly
strongly-consistent replication protocols like Paxos, which
assign a total order to all operations. To avoid this cost,
we ask whether all replicated operations in these systems
need to be strictly ordered. Recent research has yielded
replication protocols that can avoid unnecessary ordering,
e.g., by exploiting commutative operations, but it is not
clear how to apply these to replicated transaction
processing systems. We answer this question by analyzing
existing transaction processing designs in terms of which
replicated operations require ordering and which simply
require fault tolerance. We describe how this analysis
leads to our recent work on TAPIR, a transaction protocol
that efficiently provides strict serializability by using a
new replication protocol that provides fault tolerance but
not ordering for most operations.},
pdf = {papers/ordering-debull16.pdf},
monthnum = {03}
}
@article{peter15:_arrak,
author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
Ports and Doug Woos and Arvind Krishnamurthy and Thomas
Anderson and Timothy Roscoe},
journal = {{ACM} Transactions on Computer Systems},
month = nov,
number = {4},
title = {Arrakis: The Operating System Is the Control Plane},
volume = {33},
year = {2015},
abstract = {Recent device hardware trends enable a new approach to the
design of network server operating systems. In a
traditional operating system, the kernel mediates access to
device hardware by server applications to enforce process
isolation as well as network and disk security. We have
designed and implemented a new operating system, Arrakis,
that splits the traditional role of the kernel in two.
Applications have direct access to virtualized I/O devices,
allowing most I/O operations to skip the kernel entirely,
while the kernel is re-engineered to provide network and
disk protection without kernel mediation of every
operation. We describe the hardware and software changes
needed to take advantage of this new abstraction, and we
illustrate its power by showing improvements of 2-5x in
latency and 9x in throughput for a popular persistent NoSQL
store relative to a well-tuned Linux implementation.},
doi = {10.1145/2812806},
pdf = {papers/arrakis-tocs15.pdf},
code = {https://github.com/UWNetworksLab/arrakis},
monthnum = {11}
}
@techreport{zhang15:_build_consis_trans_incon_replic_exten_version,
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
institution = {University of Washington CSE},
month = oct,
number = {UW-CSE-2014-12-01 v2},
title = {Building Consistent Transactions with Inconsistent
Replication (Extended Version)},
year = {2015},
abstract = {Application programmers increasingly prefer distributed
storage systems with strong consistency and distributed
transactions (e.g., Google's Spanner) for their strong
guarantees and ease of use. Unfortunately, existing
transactional storage systems are expensive to use -- in
part because they require costly replication protocols,
like Paxos, for fault tolerance. In this paper, we present
a new approach that makes transactional storage systems
more affordable: we eliminate consistency from the
replication protocol while still providing distributed
transactions with strong consistency to applications.
We present TAPIR -- the Transactional Application Protocol
for Inconsistent Replication -- the first transaction
protocol to use a novel replication protocol, called
inconsistent replication, that provides fault tolerance
without consistency. By enforcing strong consistency only
in the transaction protocol, TAPIR can commit transactions
in a single round-trip and order distributed transactions
without centralized coordination. We demonstrate the use of
TAPIR in a transactional key-value store, TAPIR-KV.
Compared to conventional systems, TAPIR-KV provides better
latency and throughput.},
pdf = {papers/tapir-tr-v2.pdf},
code = {https://github.com/uwsyslab/tapir/},
cats = {https://discuss.systems/@dan/109389114238305747},
monthnum = {10}
}
@techreport{li14:_tales_tail_tr,
address = {Seattle, WA, USA},
author = {Jialin Li and Naveen Kr. Sharma and Dan R. K. Ports and
Steven D. Gribble},
institution = {University of Washington CSE},
month = apr,
number = {UW-CSE-14-04-01},
title = {Tales of the Tail: Hardware, {OS}, and Application-level
Sources of Tail Latency},
year = {2014},
abstract = {Interactive services often have large-scale parallel
implementations. To deliver fast responses, the median and
tail latencies of a service's components must be low. In
this paper, we explore the hardware, OS, and
application-level sources of poor tail latency in high
throughput servers executing on multi-core machines. We
first review the basic queuing theory that governs service
latency. Using fine-grained measurements of three different
servers (a null RPC service, Memcached, and Nginx) on
Linux, we then explore why these servers exhibit
significantly worse tail latencies than queuing models
alone predict. The underlying causes include interference
from background processes, request re-ordering caused by
poor scheduling or constrained concurrency models,
suboptimal interrupt routing, CPU power saving mechanisms,
and NUMA effects.
We systematically eliminate these
factors and show that Memcached can achieve a median
latency of 11 us and a 99.9th percentile latency of 32 us
at 75\% utilization. In comparison, a naive deployment of
Memcached has a median latency of 33 us and a 99.9th
percentile latency of 14 ms. Finally, we demonstrate that a
tradeoff often exists between throughput and tail latency.},
pdf = {papers/latency-tr14.pdf},
supersededby = {li14:_tales_tail},
supersededas = {Tech. Report},
monthnum = {04}
}
@techreport{peter14:_arrak_tr_v2,
address = {Seattle, WA, USA},
author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
Ports and Arvind Krishnamurthy and Thomas Anderson and
Timothy Roscoe},
institution = {University of Washington CSE},
month = may,
number = {UW-CSE-13-10-01, version 2.0},
title = {Arrakis: The Operating System is the Control Plane},
year = {2014},
abstract = {Recent device hardware trends enable a new approach to the
design of network server operating systems. In a
traditional operating system, the kernel mediates access to
device hardware by server applications, to enforce process
isolation as well as network and disk security. We have
designed and implemented a new operating system, Arrakis,
that splits the traditional role of the kernel in two.
Applications have direct access to virtualized I/O devices,
allowing most I/O operations to skip the kernel entirely,
while the kernel is re-engineered to provide network and
disk protection without kernel mediation of every
operation. We describe the hardware and software changes
needed to take advantage of this new abstraction, and we
illustrate its power by showing 2-5x end-to-end latency and
9x throughput improvements for a popular persistent NoSQL
store relative to a well-tuned Linux implementation.},
pdf = {papers/arrakis-tr-ver2.pdf},
supersededby = {peter14:_arrak},
supersededas = {Tech. Report (v2.0, 2014)},
monthnum = {05}
}
@techreport{zhang14:_build_consis_trans_incon_replic,
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
institution = {University of Washington CSE},
month = dec,
number = {UW-CSE-2014-12-01},
title = {Building Consistent Transactions with Inconsistent
Replication},
year = {2014},
pdf = {papers/tapir-tr14.pdf},
supersededby = {zhang15:_build_consis_trans_incon_replic_exten_version},
supersededas = {Tech. Report (2014)},
monthnum = {12}
}
@techreport{hornyack13:_study_virtual_memor_usage_implic_large_memor,
address = {Seattle, WA},
author = {Peter Hornyack and Luis Ceze and Steven D. Gribble and Dan
R. K. Ports and Henry M. Levy},
institution = {University of Washington CSE},
title = {A Study of Virtual Memory Usage and Implications for Large
Memory},
year = {2013},
abstract = {The mechanisms now used to implement virtual memory --
pages, page tables, and TLBs -- have worked remarkably well
for over fifty years. However, they are beginning to show
their age due to current trends, such as significant
increases in physical memory size, emerging data-intensive
applications, and imminent non-volatile main memory. These
trends call into question whether page-based
address-translation and protection mechanisms remain viable
solutions in the future. In this paper, we present a
detailed study of how modern applications use virtual
memory. Among other topics, our study examines the
footprint of mapped regions, the use of memory protection,
and the overhead of TLBs. Our results suggest that a
segment-based translation mechanism, together with a
fine-grained protection mechanism, merit consideration for
future systems.},
pdf = {papers/vmstudy-tr13.pdf},
monthnum = {}
}
@techreport{peter13:_arrak,
address = {Seattle, WA, USA},
author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
Ports and Arvind Krishnamurthy and Thomas Anderson and
Timothy Roscoe},
institution = {University of Washington CSE},
month = oct,
number = {UW-CSE-13-10-01},
title = {Arrakis: The Operating System is the Control Plane},
year = {2013},
abstract = {Recent device hardware trends enable a new approach to the
design of network servers. In a traditional operating
system, the kernel mediates access to device hardware by
server applications, to enforce process isolation as well
as network and disk security. We have designed and
implemented a new operating system, Arrakis, that splits
the traditional role of the kernel in two. Applications
have direct access to virtualized I/O devices, allowing
most I/O operations to skip the kernel entirely. The
Arrakis kernel operates only in the control plane. We
describe the hardware and software changes needed to
take advantage of this new abstraction, and we illustrate
its power by showing significant latency and throughput
improvements for network server applications relative to a
well-tuned Linux implementation.},
pdf = {papers/arrakis-tr.pdf},
supersededby = {peter14:_arrak},
supersededas = {Tech. Report (v1.0, 2013)},
monthnum = {10}
}
@phdthesis{ports12:_applic_level_cachin_trans_consis,
address = {Cambridge, MA, USA},
author = {Dan R. K. Ports},
month = jun,
school = {Massachusetts Institute of Technology},
type = {Ph.D. thesis},
title = {Application-Level Caching with Transactional Consistency},
year = {2012},
abstract = {Distributed in-memory application data caches like
memcached are a popular solution for scaling
database-driven web sites. These systems increase
performance significantly by reducing load on both the
database and application servers. Unfortunately, such
caches present two challenges for application developers.
First, they cannot ensure that the application sees a
consistent view of the data within a transaction, violating
the isolation properties of the underlying database.
Second, they leave the application responsible for locating
data in the cache and keeping it up to date, a frequent
source of application complexity and programming errors.
This thesis addresses both of these problems in a new
cache called TxCache. TxCache is a transactional cache: it
ensures that any data seen within a transaction, whether
from the cache or the database, reflects a slightly stale
but consistent snapshot of the database. TxCache also
offers a simple programming model. Application developers
simply designate certain functions as cacheable, and the
system automatically caches their results and invalidates
the cached data as the underlying database changes.
Our
experiments found that TxCache can substantially increase
the performance of a web application: on the RUBiS
benchmark, it increases throughput by up to 5.2x relative
to a system without caching. More importantly, on this
application, TxCache achieves performance comparable
(within 5\%) to that of a non-transactional cache, showing
that consistency does not have to come at the price of
performance.},
doi = {1721.1/75448},
pdf = {papers/thesis.pdf},
monthnum = {06}
}
@article{ports12:_serial_snaps_isolat_postg,
author = {Dan R. K. Ports and Kevin Grittner},
journal = {Proceedings of the VLDB Endowment},
month = aug,
number = {12},
pages = {1850--1861},
title = {Serializable Snapshot Isolation in {PostgreSQL}},
volume = {5},
year = {2012},
abstract = {This paper describes our experience implementing
PostgreSQL's new serializable isolation level. It is based
on the recently-developed Serializable Snapshot Isolation
(SSI) technique. This is the first implementation of SSI in
a production database release as well as the first in a
database that did not previously have a lock-based
serializable isolation level. We reflect on our experience
and describe how we overcame some of the resulting
challenges, including the implementation of a new lock
manager, a technique for ensuring memory usage is bounded,
and integration with other PostgreSQL features. We also
introduce an extension to SSI that improves performance for
read-only transactions. We evaluate PostgreSQL's
serializable isolation level using several benchmarks and
show that it achieves performance only slightly below that
of snapshot isolation, and significantly outperforms the
traditional two-phase locking approach on read-intensive
workloads.},
doi = {10.48550/arXiv.1208.4179},
pdf = {papers/ssi-vldb12.pdf},
slidespdf = {papers/ssi-vldb12-slides.pdf},
code = {https://www.postgresql.org/},
monthnum = {08}
}