@inproceedings{zhong25:_oasis,
address = {Seoul, South Korea},
author = {Yuhong Zhong and Daniel S. Berger and Pantea Zardoshti and
Enrique Saurez and Jacob Nelson and Dan R. K. Ports and
Antonis Psistakis and Joshua Fried and Asaf Cidon},
booktitle = {Proceedings of the 31st {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '25)},
month = oct,
organization = {{ACM}},
title = {Oasis: Pooling {PCIe} Devices Over {CXL} to Boost
Utilization},
year = {2025},
abstract = {PCIe devices, such as NICs and SSDs, are frequently
underutilized in cloud platforms. Pooling PCIe devices
would enable multiplexing these devices across hosts and
thus reduce total cost of ownership. However, PCIe switches
are expensive. We design Oasis, a system that pools PCIe
devices in software over CXL memory pools. CXL memory pools
are already being deployed to boost datacenter memory
utilization and reduce costs. Once CXL pools are in place,
they can serve as an efficient data path between hosts and
PCIe devices. Oasis provides a control plane and datapath
over CXL pools, mapping and routing PCIe device traffic
across host boundaries. PCIe devices with different
functionalities can be supported by adding an Oasis engine
for each device class. We implement an Oasis network engine
to demonstrate NIC pooling. Our evaluation shows that Oasis
improves NIC utilization by 2x and handles NIC failover
with only a 50 ms interruption.},
doi = {10.1145/3731569.3764812},
pdf = {papers/oasis-sosp25.pdf},
monthnum = {10}
}
@inproceedings{ports24:_futur_cloud_networ_system,
address = {Kyoto, Japan},
author = {Dan R. K. Ports},
booktitle = {Proceedings of the 15th Asia-Pacific Workshop on Systems
({APSYS} '24)},
month = sep,
note = {Keynote address},
organization = {{ACM}},
title = {The Future of Cloud Networking is Systems},
year = {2024},
abstract = {As cloud platforms evolve, the boundaries between
networking and systems are increasingly blurred. Cloud
networking not only asks us to solve a challenging systems
problem -- providing an efficient, reliable, and secure
virtual infrastructure -- but also offers us opportunities to
rethink our approach to classic challenges in distributed
systems. This keynote will explore the potential of
systems/networking co-design through the lens of a
cloud-scale hardware-accelerated load balancing platform.
Specifically, I will discuss how programmable networking
technology, including programmable switches and smart NICs,
enables these load balancers to achieve dramatically higher
efficiency than existing software solutions while
simultaneously offering increased flexibility for custom,
application-specific load balancing logic. Looking to the
future, I’ll describe three opportunities that this
design offers for redefining the architecture of
distributed systems with new load-balancing, migration, and
snapshotting algorithms, paving the way to a new generation
of high-performance, resilient, and scalable cloud
services.},
slidespdf = {papers/cloudnet-keynote-apsys24-slides.pdf},
monthnum = {09}
}
@inproceedings{yu24:_beaver,
address = {Santa Clara, CA, USA},
author = {Liangcheng Yu and Xiao Zhang and Haoran Zhang and John
Sonchack and Dan R. K. Ports and Vincent Liu},
booktitle = {Proceedings of the 18th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'24)},
month = jul,
organization = {{USENIX}},
title = {Beaver: Practical Partial Snapshots for Distributed Cloud
Services},
year = {2024},
abstract = {Distributed snapshots are a classic class of protocols
used for capturing a causally consistent view of states
across machines. Although effective, existing protocols
presume an isolated universe of processes to snapshot and
require instrumentation and coordination of them all. This
assumption does not match today’s cloud services—it is
not always practical to instrument all involved processes
nor realistic to assume zero interaction of the machines of
interest with the external world. To bridge this gap,
this paper presents Beaver, the first practical partial
snapshot protocol that ensures causal consistency under
external traffic interference. Beaver presents a unique
design point that tightly couples its protocol with the
underlying regularities of the data center environment. By
exploiting the placement of software load balancers in data
center networks and their associated communication pattern,
Beaver not only requires minimal changes to today’s data
center operations but also eliminates any form of blocking
to existing communication, thus incurring near-zero
overhead to user traffic. We demonstrate Beaver’s
effectiveness through extensive testbed experiments and
novel use cases.},
pdf = {papers/beaver-osdi24.pdf},
cats = {https://discuss.systems/@dan/112232483230067973},
monthnum = {07}
}
@inproceedings{choi23:_capyb,
address = {Seoul, South Korea},
author = {Inho Choi and Nimesh Wadekar and Raj Joshi and Dan R. K.
Ports and Irene Zhang and Jialin Li},
booktitle = {Proceedings of the 14th Asia-Pacific Workshop on Systems
({APSYS} '23)},
month = aug,
organization = {{ACM}},
title = {Capybara: Microsecond-Scale Live TCP Migration},
year = {2023},
abstract = {Latency-critical μs-scale data center applications are
susceptible to server load spikes. The issue is
particularly challenging for services using long-lived TCP
connections. This paper introduces Capybara, a highly
efficient and versatile live TCP migration system. Capybara
builds atop a deterministic, kernel-bypassed TCP stack
running in a library OS to realize its μs-scale TCP
migration mechanism. Using modern programmable switches,
Capybara implements migration-aware dynamic packet
forwarding and transient packet buffering, further reducing
system interference during live TCP migration. Capybara can
transparently migrate a running TCP connection in 4 μs on
average. It improves the average migration latency by
about 12 times compared to a Linux kernel-based solution.},
doi = {10.1145/3609510.3609813},
pdf = {papers/capybara-apsys23.pdf},
monthnum = {08}
}
@inproceedings{choi23:_hydra,
address = {Boston, MA, USA},
author = {Inho Choi and Ellis Michael and Yunfan Li and Dan R. K.
Ports and Jialin Li},
booktitle = {Proceedings of the 20th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'23)},
month = apr,
organization = {{USENIX}},
title = {Hydra: Serialization-Free Network Ordering for Strongly
Consistent Distributed Applications},
year = {2023},
abstract = {A large class of distributed systems, e.g., state machine
replication and fault-tolerant distributed databases, rely
on establishing a consistent order of operations on groups
of nodes in the system. Traditionally, application-level
distributed protocols such as Paxos and two-phase locking
provide the ordering guarantees. To reduce the performance
overhead imposed by these protocols, a recent line of work
proposes to move the responsibility of ensuring operation
ordering into the network – by sequencing requests
through a centralized network sequencer. This network
sequencing approach yields significant application-level
performance improvements, but routing all requests through
a single sequencer comes with several fundamental
limitations, including a sequencer scalability bottleneck,
prolonged system downtime during sequencer failover, and
worsened network-level load balancing. Our work,
Hydra, overcomes these limitations by using a distributed
set of network sequencers to provide network ordering.
Hydra leverages loosely synchronized clocks on network
sequencers to establish message ordering across them,
per-sequencer sequence numbers to detect message drops, and
periodic timestamp messages to enforce progress when some
sequencers are idle. To demonstrate the benefit of Hydra,
we co-designed a state machine replication protocol and a
distributed transactional system using the Hydra network
primitive. Compared to serialization-based network ordering
systems, Hydra shows equivalent performance improvement
over traditional approaches in both applications, but with
significantly higher scalability, shorter sequencer
failover time, and better network-level load balancing.},
pdf = {papers/hydra-nsdi23.pdf},
monthnum = {04}
}
@inproceedings{lerner23:_datab_moder_networ,
author = {Alberto Lerner and Carsten Binnig and Philippe
Cudr{\'e}-Mauroux and Rana Hussein and Matthias Jasny and
Theo Jepsen and Dan R. K. Ports and Lasse Thostrup and
Tobias Ziegler},
booktitle = {Proceedings of the 49th {I}nternational {C}onference on
{V}ery {L}arge {D}ata {B}ases ({VLDB} '23)},
key = {{VLDB '23}},
month = aug,
note = {Tutorial},
title = {Databases on Modern Networks: A Decade of Research that
now comes into Practice},
year = {2023},
abstract = {Modern cloud networks are a fundamental pillar of
data-intensive applications. They provide high-speed
transaction (packet) rates and low overhead, enabling, for
instance, truly scalable database designs. These networks,
however, are fundamentally different from conventional
ones. Arguably, the two key discerning technologies are
RDMA and programmable network devices. Today, these
technologies are no longer niche and are widely deployed
across all major cloud vendors. The
question is thus not if but how a new breed of
data-intensive applications can benefit from modern
networks, given the perceived difficulty in using and
programming them. This tutorial addresses these challenges
by exposing how the underlying principles changed as the
network evolved and by presenting the new system design
opportunities they opened. In the process, we also discuss
several hard-earned lessons accumulated by making the
transition first-hand.},
doi = {10.14778/3611540.3611579},
pdf = {papers/dbtutorial-vldb23.pdf},
monthnum = {08}
}
@inproceedings{liu23:_slimem,
address = {Hong Kong, China},
author = {Ziyuan Liu and Zhixiong Niu and Ran Shu and Liang Gao and
Guohong Lai and Na Wang and Zongying He and Jacob Nelson
and Dan R. K. Ports and Lihua Yuan and Peng Cheng and
Yongqiang Xiong},
booktitle = {Proceedings of the 7th Asia-Pacific Workshop on Networking
({APNet '23})},
month = jul,
organization = {{ACM}},
title = {SlimeMold: Hardware Load Balancer at Scale in Datacenter},
year = {2023},
abstract = {Stateful load balancers (LB) are essential services in
cloud data centers, playing a crucial role in enhancing the
availability and capacity of applications. Numerous studies
have proposed methods to improve the throughput,
connections per second, and concurrent flows of single LBs.
For instance, with the advancement of programmable
switches, hardware-based load balancers (HLB) have become
mainstream due to their high efficiency. However,
programmable switches still face the issue of limited
registers and table entries, preventing them from fully
meeting the performance requirements of data centers. In
this paper, rather than solely focusing on enhancing
individual HLBs, we introduce SlimeMold, which enables HLBs
to work collaboratively at scale as an integrated LB system
in data centers. First, we design a novel HLB building
block capable of achieving load balancing and exchanging
states with other building blocks in the data plane. Next,
we decouple forwarding and state operations, organizing the
states using our proposed 2-level mapping mechanism.
Finally, we optimize the system with flow caching and table
entry balancing. We implement a real HLB building block
using the Broadcom 56788 SmartToR chip, which attains line
rate for state read and >1M OPS for flow write operations.
Our simulation demonstrates full scalability in large-scale
experiments, supporting 454 million concurrent flows with
512 state-hosting building blocks.},
doi = {10.1145/3600061.3600067},
pdf = {papers/slimemold-apnet23.pdf},
slidespdf = {papers/slimemold-apnet23-slides.pdf},
monthnum = {07}
}
@inproceedings{yuan23:_rambd,
address = {Montreal, QC, Canada},
author = {Yifan Yuan and Jinghan Huang and Yan Sun and Tianchen Wang
and Jacob Nelson and Dan Ports and Yipeng Wang and Ren Wang
and Charlie Tai and Nam Sung Kim},
booktitle = {Proceedings of the 29th International Symposium on High
Performance Computer Architecture ({HPCA} '23)},
month = feb,
organization = {{IEEE}},
title = {{RAMBDA}: {RDMA}-driven Acceleration Framework for
Memory-intensive us-scale Datacenter Applications},
year = {2023},
abstract = {Responding to the "datacenter tax" and "killer
microseconds" problems for memory-intensive datacenter
applications, diverse solutions, including Smart NIC-based
ones, have been proposed. Nonetheless, they often suffer
from the high overhead of communication over network and/or
PCIe links. To tackle the limitations of current
solutions, this paper proposes RAMBDA, an RDMA-driven
acceleration framework for boosting the performance of
memory-intensive us-scale datacenter applications. RAMBDA
is a holistic network and architecture co-design solution
that leverages current RDMA and emerging cache-coherent
off-chip interconnect technologies
and consists of the following four hardware and software
components: (1) unified abstraction of inter- and
intra-machine communications synergistically managed by
one-sided RDMA write and cache-coherent memory write; (2)
efficient notification of requests to accelerators assisted
by cache coherence; (3) cache-coherent accelerator
architecture directly interacting with NIC; and (4)
adaptive device-to-host data transfer for modern server
memory systems comprising both DRAM and NVM exploiting
state-of-the-art features in CPUs and PCIe. We prototype
RAMBDA with a commercial system and evaluate three popular
datacenter applications: (1) in-memory key-value store, (2)
chain replication-based distributed transaction system, and
(3) deep learning recommendation model inference. The
evaluation shows that RAMBDA provides 30.1∼69.1\% lower
latency, up to 2.5x higher throughput, and ~3x higher
energy efficiency than the current state-of-the-art
solutions.},
doi = {10.1109/HPCA56546.2023.10071127},
pdf = {papers/rambda-hpca23.pdf},
monthnum = {02}
}
@inproceedings{zeno23:_swish,
address = {Haifa, Israel},
author = {Lior Zeno and Dan R. K. Ports and Jacob Nelson and
Daehyeok Kim and Shir Landau Feibish and Idit Keidar and
Arik Rinberg and Alon Rashelbach and Igor De-Paula and Mark
Silberstein},
booktitle = {Proceedings of the 16th {ACM} International System and
Storage Conference ({SYSTOR} '23)},
month = jul,
note = {Highlights Session},
organization = {{ACM}},
title = {{SwiSh}: Distributed Shared State Abstractions for
Programmable Switches},
year = {2023},
abstract = {We design and evaluate SwiSh, a distributed
shared state management layer for data-plane P4
programs. SwiSh enables running scalable stateful
distributed network functions on programmable switches
entirely in the data-plane. We explore several schemes to
build a shared variable abstraction, which differ in
consistency, performance, and in-switch implementation
complexity. We introduce the novel Strong
Delayed-Writes (SDW) protocol which offers consistent
snapshots of shared data-plane objects with semantics known
as strong r-relaxed linearizability, enabling
implementation of distributed concurrent sketches with
precise error bounds.
We implement strong, eventual,
and SDW consistency protocols in Tofino switches, and
compare their performance in microbenchmarks and three
realistic network functions, NAT, DDoS detector, and rate
limiter. Our results demonstrate that general
distributed state management in the data plane is
practical, and outperforms any centralized solution by up
to four orders of magnitude in update throughput and
replication latency.},
monthnum = {07}
}
@inproceedings{liu22:_disag_data_collec_approac_loss_toler_applic,
address = {Fuzhou, China},
author = {Ziyuan Liu and Zhixiong Niu and Ran Shu and Wenxue Cheng
and Peng Cheng and Yongqiang Xiong and Lihua Yuan and Jacob
Nelson and Dan R. K. Ports},
booktitle = {Proceedings of the 6th Asia-Pacific Workshop on Networking
({APNet '22})},
month = jul,
organization = {{ACM}},
title = {A Disaggregate Data Collecting Approach for Loss-Tolerant
Applications},
year = {2022},
abstract = {Datacenters generate operational data at an extremely high
rate, and data center operators collect and analyze it
for problem diagnosis, resource utilization improvement,
and performance optimization. However, existing data
collection methods fail to efficiently aggregate and store
data at extremely high speed and scale. In this paper, we
explore a new approach that leverages programmable switches
to aggregate data and directly write data to the
destination storage. Our proposed data collection system,
ALT, uses programmable switches to control NVMe SSDs on
remote hosts without the involvement of a remote CPU. To
tolerate loss, ALT uses an elegant data structure to enable
efficient data recovery when retrieving the collected data.
We implement a prototype of our system on a Tofino-based
programmable switch. Our evaluation shows that ALT can
saturate an SSD’s peak performance without any CPU
involvement.},
doi = {10.1145/3542637.3542646},
pdf = {papers/alt-apnet22.pdf},
monthnum = {07}
}
@inproceedings{yuan22:_unloc_power_inlin_float_point,
address = {Renton, WA, USA},
author = {Yifan Yuan and Omar Alama and Jiawei Fei and Jacob Nelson
and Dan R. K. Ports and Amedeo Sapio and Marco Canini and
Nam Sung Kim},
booktitle = {Proceedings of the 19th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'22)},
month = apr,
organization = {{USENIX}},
title = {Unlocking the Power of Inline Floating-Point Operations on
Programmable Switches},
year = {2022},
abstract = {The advent of switches with programmable dataplanes has
enabled the rapid development of new network functionality,
as well as providing a platform for acceleration of a broad
range of application-level functionality. However, existing
switch hardware was not designed with application
acceleration in mind, and thus applications requiring
operations or datatypes not used in traditional network
protocols must resort to expensive workarounds.
Applications involving floating point data, including
distributed training for machine learning and distributed
query processing, are key examples. In this paper, we
propose FPISA, a floating point representation designed to
work efficiently in programmable switches. We first
implement FPISA on an Intel Tofino switch, but find that it
has limitations that impact throughput and accuracy. We
then propose hardware changes to address these limitations
based on the open-source Banzai switch architecture, and
synthesize them in a 15-nm standard-cell library to
demonstrate their feasibility. Finally, we use FPISA to
implement accelerators for training for machine learning as
an example application, and evaluate its performance on a
switch implementing our changes using emulation. We find
that FPISA allows distributed training to use one to three
fewer CPU cores and provide up to 85.9\% better throughput
than SwitchML in a CPU-constrained environment.},
pdf = {papers/fpisa-nsdi22.pdf},
slidespdf = {papers/fpisa-nsdi22-slides.pdf},
video = {https://www.usenix.org/conference/nsdi22/presentation/yuan},
monthnum = {04}
}
@inproceedings{zeno22:_swish,
address = {Renton, WA, USA},
author = {Lior Zeno and Dan R. K. Ports and Jacob Nelson and
Daehyeok Kim and Shir Landau Feibish and Idit Keidar and
Arik Rinberg and Alon Rashelbach and Igor De-Paula and Mark
Silberstein},
booktitle = {Proceedings of the 19th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'22)},
month = apr,
organization = {{USENIX}},
title = {{SwiSh}: Distributed Shared State Abstractions for
Programmable Switches},
year = {2022},
abstract = {We design and evaluate SwiSh, a distributed
shared state management layer for data-plane P4
programs. SwiSh enables running scalable stateful
distributed network functions on programmable switches
entirely in the data-plane. We explore several schemes to
build a shared variable abstraction, which differ in
consistency, performance, and in-switch implementation
complexity. We introduce the novel Strong
Delayed-Writes (SDW) protocol which offers consistent
snapshots of shared data-plane objects with semantics known
as strong r-relaxed linearizability, enabling
implementation of distributed concurrent sketches with
precise error bounds.
We implement strong, eventual,
and SDW consistency protocols in Tofino switches, and
compare their performance in microbenchmarks and three
realistic network functions, NAT, DDoS detector, and rate
limiter. Our results demonstrate that general
distributed state management in the data plane is
practical, and outperforms any centralized solution by up
to four orders of magnitude in update throughput and
replication latency.},
pdf = {papers/swish-nsdi22.pdf},
slidespdf = {papers/swish-nsdi22-slides.pdf},
video = {https://www.usenix.org/conference/nsdi22/presentation/zeno},
monthnum = {04}
}
@inproceedings{zhu22:_netvr,
address = {Renton, WA, USA},
author = {Hang Zhu and Tao Wang and Yi Hong and Dan R. K. Ports and
Anirudh Sivaraman and Xin Jin},
booktitle = {Proceedings of the 19th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'22)},
month = apr,
organization = {{USENIX}},
title = {{NetVRM}: Virtual Register Memory for Programmable
Networks},
year = {2022},
abstract = {Programmable networks are enabling a new class of
applications that leverage the line-rate processing
capability and on-chip register memory of the switch data
plane. Yet the status quo is focused on developing
approaches that share the register memory statically. We
present NetVRM, a network management system that supports
dynamic register memory sharing between multiple concurrent
applications on a programmable network and is readily
deployable on commodity programmable switches. NetVRM
provides a virtual register memory abstraction that enables
applications to share the register memory in the data
plane, and abstracts away the underlying details. In
principle, NetVRM supports any memory allocation algorithm
given the virtual register memory abstraction. It also
provides a default memory allocation algorithm that
exploits the observation that applications have diminishing
returns on additional memory. NetVRM provides an extension
of P4, P4VRM, for developing applications with virtual
register memory, and a compiler to generate data plane
programs and control plane APIs. Testbed experiments show
that NetVRM generalizes to a diverse variety of
applications, and that its utility-based dynamic allocation
policy outperforms static resource allocation.
Specifically, it improves the mean satisfaction ratio
(i.e., the fraction of a network application’s lifetime
that it meets its utility target) by 1.6-2.2x under a range
of workloads.},
pdf = {papers/netvrm-nsdi22.pdf},
slidespdf = {papers/netvrm-nsdi22-slides.pdf},
video = {https://www.usenix.org/conference/nsdi22/presentation/zhu},
monthnum = {04}
}
@inproceedings{burke21:_prism,
address = {Virtual Conference},
author = {Matthew Burke and Sowmya Dharanipragada and Shannon Joyner
and Adriana Szekeres and Jacob Nelson and Irene Zhang and
Dan R. K. Ports},
booktitle = {Proceedings of the 28th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '21)},
month = oct,
organization = {{ACM}},
title = {{PRISM}: Rethinking the {RDMA} Interface for Distributed
Systems},
year = {2021},
abstract = {Remote Direct Memory Access (RDMA) has been used to
accelerate a variety of distributed systems, by providing
low-latency, CPU-bypassing access to a remote host's
memory. However, most of the distributed protocols used in
these systems cannot easily be expressed in terms of the
simple memory READs and WRITEs provided by RDMA. As a
result, designers face a choice between introducing
additional protocol complexity (e.g., additional round
trips) or forgoing the benefits of RDMA entirely. This
paper argues that an extension to the RDMA interface can
resolve this dilemma. We introduce the PRISM interface,
which extends the RDMA interface with four new primitives:
indirection, allocation, enhanced compare-and-swap, and
operation chaining. These increase the expressivity of the
RDMA interface, while still being implementable using the
same underlying hardware features. We show their utility by
designing three new applications using PRISM primitives
that require little to no server-side CPU involvement: (1)
PRISM-KV, a key-value store; (2) PRISM-RS, a replicated
block store; and (3) PRISM-TX, a distributed transaction
protocol. Using a software-based implementation of the
PRISM primitives, we show that these systems outperform
prior RDMA-based equivalents.},
doi = {10.1145/3477132.3483587},
pdf = {papers/prism-sosp21.pdf},
video = {https://youtu.be/yZDw9uBMHxI},
monthnum = {10}
}
@inproceedings{kim21:_redpl,
address = {Virtual Conference},
author = {Daehyeok Kim and Jacob Nelson and Dan R. K. Ports and Vyas
Sekar and Srinivasan Seshan},
booktitle = {Proceedings of {ACM} SIGCOMM 2021},
month = aug,
organization = {{ACM}},
title = {{RedPlane}: Enabling Fault Tolerant Stateful In-Switch
Applications},
year = {2021},
abstract = {Many recent efforts have demonstrated the performance
benefits of running datacenter functions (e.g., NATs, load
balancers, monitoring) on programmable switches. However, a
key missing piece remains: fault tolerance. This is
especially critical as the network is no longer stateless
and pure endpoint recovery does not suffice. In this paper,
we design and implement RedPlane, a fault-tolerant state
store for stateful in-switch applications. RedPlane provides
in-switch applications with consistent access to their state,
even if the switch they run on fails or traffic is rerouted
to an alternative switch. We address key challenges in
devising a practical, provably correct replication protocol
and implementing it in the switch data plane. Our
evaluations show that RedPlane incurs negligible overhead
and enables end-to-end applications to rapidly recover from
switch failures.},
doi = {10.1145/3452296.3472905},
pdf = {papers/redplane-sigcomm21.pdf},
code = {https://github.com/daehyeok-kim/redplane-public},
video = {papers/redplane-sigcomm21-video.mp4},
monthnum = {08}
}
@inproceedings{sapio21:_scalin_distr_machin_learn_in_networ_aggreg,
address = {Boston, MA, USA},
author = {Amedeo Sapio and Marco Canini and Chen-Yu Ho and Jacob
Nelson and Panos Kalnis and Changhoon Kim and Arvind
Krishnamurthy and Masoud Moshref and Dan R. K. Ports and
Peter Richtarik},
booktitle = {Proceedings of the 18th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'21)},
month = apr,
organization = {{USENIX}},
title = {Scaling Distributed Machine Learning with In-Network
Aggregation},
year = {2021},
abstract = {Training machine learning models in parallel is an
increasingly important workload. We accelerate distributed
parallel training by designing a communication primitive
that uses a programmable switch dataplane to execute a key
step of the training process. Our approach, SwitchML,
reduces the volume of exchanged data by aggregating the
model updates from multiple workers in the network. We
co-design the switch processing with the end-host protocols
and ML frameworks to provide an efficient solution that
speeds up training by up to 5.5x for a number of real-world
benchmark models.},
pdf = {papers/switchml-nsdi21.pdf},
slidespdf = {papers/switchml-nsdi21-slides.pdf},
code = {https://github.com/p4lang/p4app-switchML},
video = {https://www.youtube.com/watch?v=FIZsXfeZrvE},
cats = {https://discuss.systems/@dan/109373463921944039},
monthnum = {04}
}
@inproceedings{li20:_effic_portab_virtual_nvme_storag_arm_socs,
address = {Lausanne, Switzerland},
author = {Huaicheng Li and Mingzhe Hao and Stanko Novakovic and
Vaibhav Gogte and Sriram Govindan and Dan R. K. Ports and
Irene Zhang and Ricardo Bianchini and Haryadi S. Gunawi and
Anirudh Badam},
booktitle = {Proceedings of the 25th {I}nternational {C}onference on
{A}rchitectural {S}upport for {P}rogramming {L}anguages and
{O}perating {S}ystems ({ASPLOS '20})},
month = apr,
organization = {{ACM}},
title = {{LeapIO}: Efficient and Portable Virtual {NVMe} Storage on
{ARM SoCs}},
year = {2020},
abstract = {Today's cloud storage stack is extremely resource hungry,
burning 10-20\% of datacenter x86 cores, a major "storage
tax" that cloud providers must pay. Yet, the complex cloud
storage stack is not completely offload-ready to today's IO
accelerators. We present LeapIO, a new cloud storage stack
that leverages ARM-based co-processors to offload complex
storage services. LeapIO addresses many deployment
challenges, such as hardware fungibility, software
portability, virtualizability, composability, and
efficiency. It uses a set of OS/software techniques and new
hardware properties that provide a uniform address space
across the x86 and ARM cores and expose virtual NVMe
storage to unmodified guest VMs, at a performance that is
competitive with bare-metal servers.},
doi = {10.1145/3373376.3378531},
pdf = {papers/leapio-asplos20.pdf},
monthnum = {04}
}
@inproceedings{li20:_pegas,
address = {Banff, AL, Canada},
author = {Jialin Li and Jacob Nelson and Ellis Michael and Xin Jin
and Dan R. K. Ports},
booktitle = {Proceedings of the 14th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'20)},
month = nov,
organization = {{USENIX}},
title = {Pegasus: Tolerating Skewed Workloads in Distributed
Storage with In-Network Coherence Directories},
year = {2020},
abstract = {High performance distributed storage systems face the
challenge of load imbalance caused by skewed and dynamic
workloads. This paper introduces Pegasus, a new storage
system that leverages new-generation programmable switch
ASICs to balance load across storage servers. Pegasus uses
selective replication of the most popular objects in the
data store to distribute load. Using a novel in-network
coherence directory, the Pegasus switch tracks and manages
the location of replicated objects. This allows it to
achieve load-aware forwarding and dynamic rebalancing for
replicated keys, while still guaranteeing data coherence
and consistency. The Pegasus design is practical to
implement as it stores only forwarding metadata in the
switch data plane. The resulting system improves the
throughput of a distributed in-memory key-value store by
more than 10x under a latency SLO -- results which hold
across a large set of workloads with varying degrees of
skew, read/write ratio, object sizes, and dynamism.},
pdf = {papers/pegasus-osdi20.pdf},
slidespdf = {papers/pegasus-osdi20-slides.pdf},
code = {https://github.com/NUS-Systems-Lab/pegasus},
video = {https://www.youtube.com/watch?v=55Aki75fAr4},
cats = {https://discuss.systems/@dan/109355268209462963},
monthnum = {11}
}
@inproceedings{szekeres20:_meerk,
address = {Heraklion, Crete, Greece},
author = {Adriana Szekeres and Michael Whittaker and Naveen Kr.
Sharma and Jialin Li and Arvind Krishnamurthy and Irene
Zhang and Dan R. K. Ports},
booktitle = {Proceedings of the 15th {ACM} {SIGOPS} {E}uro{S}ys
({EuroSys '20})},
month = apr,
organization = {{ACM}},
title = {Meerkat: Scalable Replicated Transactions Following the
Zero-Coordination Principle},
year = {2020},
abstract = {Traditionally, the high cost of network communication
between servers has hidden the impact of cross-core
coordination in replicated systems. However, new
technologies, like kernel-bypass networking and faster
network links, have exposed hidden bottlenecks in
distributed systems. This paper explores how to build
multicore-scalable, replicated storage systems. We
introduce a new guideline for their design, called the
Zero-Coordination Principle. We use this principle to
design a new multicore-scalable, in-memory, replicated,
key-value store, called Meerkat.
Unlike existing
systems, Meerkat eliminates all cross-core and
cross-replica coordination, both of which pose a
scalability bottleneck. Our experiments found that Meerkat
is able to scale up to 80 hyper-threads and execute 8.3
million transactions per second. Meerkat represents an
improvement of 12x over state-of-the-art, fault-tolerant,
in-memory, transactional storage systems built using
leader-based replication and a shared transaction log.},
doi = {10.1145/3342195.3387529},
pdf = {papers/meerkat-eurosys20.pdf},
monthnum = {04}
}
@inproceedings{wang20:_multit,
address = {Boston, MA, USA},
author = {Tao Wang and Hang Zhu and Fabian Ruffy and Xin Jin and
Anirudh Sivaraman and Dan R. K. Ports and Aurojit Panda},
booktitle = {Proceedings of the 12th Workshop on Hot Topics in Cloud
Computing ({HotCloud} '20)},
month = jul,
organization = {{USENIX}},
title = {Multitenancy for fast and programmable networks in the
cloud},
year = {2020},
abstract = {Fast and programmable network devices are now readily
available, both in the form of programmable switches and
smart network-interface cards. Going forward, we envision
that these devices will be widely deployed in the networks
of cloud providers (e.g., AWS, Azure, and GCP) and exposed
as a programmable surface for cloud customers—similar to
how cloud customers can today rent CPUs, GPUs, FPGAs, and
ML accelerators. Making this vision a reality requires us
to develop a mechanism to share the resources of a
programmable network device across multiple cloud tenants.
In other words, we need to provide multitenancy on these
devices. In this position paper, we design compile-time and
run-time approaches to multitenancy. We present preliminary
results showing that our design provides both efficient
utilization of the resources of a programmable network
device and isolation of tenant programs from each other.},
pdf = {papers/multitenancy-hotcloud20.pdf},
monthnum = {07}
}
@inproceedings{zeno20:_swish,
address = {Chicago, IL, USA},
author = {Lior Zeno and Dan R. K. Ports and Jacob Nelson and Mark
Silberstein},
booktitle = {Proceedings of the 19th Workshop on Hot Topics in Networks
({HotNets} '20)},
month = nov,
organization = {{ACM}},
title = {{SwiShmem}: Distributed Shared State Abstractions for
Programmable Switches},
year = {2020},
abstract = {Programmable switches provide an appealing platform for
running network functions (NFs), such as NATs, firewalls
and DDoS detectors, entirely in the data plane, at staggering
multi-Tbps processing rates. However, to be used in real
deployments with a complex multi-switch topology, one NF
instance must be deployed on each switch, which together
act as a single logical NF. This requirement poses
significant challenges in particular for stateful NFs, due
to the need to manage distributed shared NF
state among the switches. While considered a solved
problem in classical distributed systems, data-plane state
sharing requires addressing several unique challenges: high
data rate, limited switch memory, and packet loss. We
present the design of SwiShmem, the first distributed
shared state management layer for data-plane P4
programs, which facilitates the implementation of stateful
distributed NFs on programmable switches. We first analyze
the access patterns and consistency requirements of popular
NFs that lend themselves to in-switch execution, and then
discuss the design and implementation options while
highlighting open research questions.},
doi = {10.1145/3422604.3425946},
pdf = {papers/swishmem-hotnets20.pdf},
monthnum = {11}
}
@inproceedings{ports19:_when_shoul_networ_be_comput,
address = {Bertinoro, Italy},
author = {Dan R. K. Ports and Jacob Nelson},
booktitle = {Proceedings of the 17th Workshop on Hot Topics in
Operating Systems ({HotOS} '19)},
month = may,
organization = {{ACM}},
title = {When Should The Network Be The Computer?},
year = {2019},
abstract = {Researchers have repurposed programmable network devices
to place small amounts of application computation in the
network, sometimes yielding orders-of-magnitude performance
gains. At the same time, effectively using these devices
requires careful use of limited resources and managing
deployment challenges. This paper provides a framework
for principled use of in-network processing. We provide a
set of guidelines for building robust and deployable
in-network primitives, along with a taxonomy to help identify
which applications can benefit from in-network processing
and what types of devices they should use.},
doi = {10.1145/3317550.3321439},
pdf = {papers/innetwork-hotos19.pdf},
slidespdf = {papers/innetwork-hotos19-slides.pdf},
monthnum = {05}
}
@inproceedings{michael18:_towar_causal_datac_networ,
address = {Porto, Portugal},
author = {Ellis Michael and Dan R. K. Ports},
booktitle = {Proceedings of the 2018 Workshop on Principles and
Practice of Consistency for Distributed Data ({PaPoC}
'18)},
month = apr,
organization = {{ACM}},
title = {Towards Causal Datacenter Networks},
year = {2018},
abstract = {Traditionally, distributed systems conservatively assume
an asynchronous network. However, recent work on the
co-design of networks and distributed systems has shown
that stronger ordering properties are achievable in
datacenter networks and yield performance improvements for
the distributed systems they support. We build on that
trend and ask whether it is possible for the datacenter
network to order all messages in a protocol-agnostic way.
This approach, which we call omnisequencing, would ensure
causal delivery of all messages, making consistency a
network-level guarantee.},
doi = {10.1145/3194261.3194269},
pdf = {papers/causal-papoc18.pdf},
monthnum = {04}
}
@inproceedings{gudmundsdottir17:_demon_inter_analy_perfor_measur_viska,
address = {Chicago, IL, USA},
author = {Helga Gudmundsdottir and Babak Salimi and Magdalena
Balazinska and Dan R. K. Ports and Dan Suciu},
booktitle = {Proceedings of the 2017 {ACM} {SIGMOD} {I}nternational
{C}onference on {M}anagement of {D}ata},
month = may,
note = {Demonstration},
organization = {{ACM}},
title = {A Demonstration of Interactive Analysis of Performance
Measurements with {Viska}},
year = {2017},
abstract = {The ultimate goal of system performance analysis is to
identify the underlying causes of performance
differences between different systems and different
workloads. We make it easier to achieve this goal with
Viska, a new tool for generating and interpreting
performance measurement results. Viska leverages
cutting-edge techniques from big data analytics and data
visualization to aid and automate this analysis, helping
users derive meaningful and statistically sound conclusions
using state-of-the-art causal inference and hypothesis
testing techniques.},
doi = {10.1145/3035918.3056448},
pdf = {papers/viska-sigmod17demo.pdf},
monthnum = {05}
}
@inproceedings{li17:_eris,
address = {Shanghai, China},
author = {Jialin Li and Ellis Michael and Dan R. K. Ports},
booktitle = {Proceedings of the 26th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '17)},
month = oct,
organization = {{ACM}},
title = {{Eris}: Coordination-Free Consistent Transactions using
Network Multi-Sequencing},
year = {2017},
abstract = {Distributed storage systems aim to provide strong
consistency and isolation guarantees on an architecture
that is partitioned across multiple shards for scalability
and replicated for fault-tolerance. Traditionally,
achieving all of these goals has required an expensive
combination of atomic commitment and replication protocols
-- introducing extensive coordination overhead. Our system,
Eris, takes a very different approach. It moves a core
piece of concurrency control functionality, which we term
multi-sequencing, into the datacenter network itself. This
network primitive takes on the responsibility for
consistently ordering transactions, and a new lightweight
transaction protocol ensures atomicity. The end result is
that Eris avoids both replication and transaction
coordination overhead: we show that it can process a large
class of distributed transactions in a single round-trip
from the client to the storage system without any explicit
coordination between shards or replicas. It provides
atomicity, consistency, and fault-tolerance with less than
10\% overhead -- achieving throughput 4.5--35x higher and
latency 72--80\% lower than a conventional design on
standard benchmarks.},
doi = {10.1145/3132747.3132751},
pdf = {papers/eris-sosp17.pdf},
monthnum = {10}
}
@inproceedings{michael17:_recov_shared_objec_without_stabl_storag,
address = {Vienna, Austria},
author = {Ellis Michael and Dan R. K. Ports and Naveen Kr. Sharma
and Adriana Szekeres},
booktitle = {Proceedings of the 31st International Symposium on
Distributed Computing ({DISC} '17)},
key = {DISC '17},
month = oct,
title = {Recovering Shared Objects Without Stable Storage},
year = {2017},
abstract = {This paper considers the problem of building
fault-tolerant shared objects when processes can crash and
recover but lose their persistent state on recovery. This
Diskless Crash-Recovery (DCR) model matches the way many
long-lived systems are built. We show that it presents new
challenges, as operations that are recorded at a quorum may
not persist after some of the processes in that quorum
crash and then recover. To address this problem, we
introduce the notion of crash-consistent quorums,
where no recoveries happen during the quorum responses. We
show that relying on crash-consistent quorums enables a
recovery procedure that can recover all operations that
successfully finished. Crash-consistent quorums can be
easily identified using a mechanism we term the crash
vector, which tracks the causal relationship between
crashes, recoveries, and other operations.
We apply
crash-consistent quorums and crash vectors to build two
storage primitives. We give a new algorithm for
multi-reader multi-writer atomic registers in the DCR model
that guarantees safety under all conditions and termination
under a natural condition. It improves on the best prior
protocol for this problem by requiring fewer rounds, fewer
nodes to participate in the quorum, and a less restrictive
liveness condition. We also present a more efficient
single-reader, single-writer atomic set---a virtual
stable storage abstraction. It can be used to lift any
existing algorithm from the traditional Crash-Recovery with
Stable Storage model to the DCR model. We examine a
specific application, state machine replication, and show
that existing diskless protocols can violate their
correctness guarantees, while ours offers a general and
correct solution.},
pdf = {papers/recovery-disc17.pdf},
monthnum = {10}
}
@inproceedings{salimi17:_zaliq,
author = {Babak Salimi and Corey Cole and Dan R. K. Ports and Dan
Suciu},
booktitle = {Proceedings of the 43rd {I}nternational {C}onference on
{V}ery {L}arge {D}ata {B}ases ({VLDB} '17)},
key = {{VLDB '17}},
month = aug,
note = {Demonstration},
title = {ZaliQL: Causal Inference from Observational Data at
Scale},
year = {2017},
abstract = {Causal inference from observational data is a subject of
active research and development in statistics and computer
science. Many statistical software packages have been
developed for this purpose. However, these toolkits do not
scale to large datasets. We propose and demonstrate ZaliQL:
a SQL-based framework for drawing causal inference from
observational data. ZaliQL supports the state-of-the-art
methods for causal inference and runs at scale within the
PostgreSQL database system. In addition, we built a visual
interface to wrap around ZaliQL. In our demonstration, we
will use this GUI to show a live investigation of the
causal effect of different weather conditions on flight
delays.},
doi = {10.14778/3137765.3137818},
pdf = {papers/zaliql-vldb17demo.pdf},
monthnum = {08}
}
@inproceedings{gudmondsdottir16:_viska,
address = {Savannah, GA, USA},
author = {Helga Gudmundsdottir and Babak Salimi and Magdalena
Balazinska and Dan R. K. Ports and Dan Suciu},
booktitle = {Proceedings of the 12th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'16)},
month = nov,
note = {Poster},
organization = {{USENIX}},
title = {Viska: Enabling Interactive Analysis of Performance
Measurements},
year = {2016},
abstract = {Much of systems research consists of performance analysis
-- to learn when one system outperforms another, to
identify architectural choices responsible for the
difference, or to identify performance anomalies in
particular workloads, for example. However, despite recent
advances in data analytics and interactive data
visualization, the tools we use for performance analysis
remain remarkably primitive. The Viska project aims to
close this gap by providing a new toolkit for systems
researchers to generate and interpret performance
measurement results, helping users derive meaningful and
statistically sound conclusions. Viska leverages
cutting-edge techniques from big data analytics and data
visualization to aid and automate this analysis.},
monthnum = {11}
}
@inproceedings{holt16:_discip_incon_consis_types,
address = {Santa Clara, CA, USA},
author = {Brandon Holt and James Bornholt and Irene Zhang and Dan R.
K. Ports and Mark Oskin and Luis Ceze},
booktitle = {Proceedings of the 7th Symposium on Cloud Computing
({SOCC} '16)},
month = oct,
organization = {{ACM}},
title = {Disciplined Inconsistency with Consistency Types},
year = {2016},
abstract = {Distributed applications and web services, such as online
stores or social networks, are expected to be scalable,
available, responsive, and fault-tolerant. To meet these
steep requirements in the face of high round-trip
latencies, network partitions, server failures, and load
spikes, applications use eventually consistent datastores
that allow them to weaken the consistency of some data.
However, making this transition is highly error-prone
because relaxed consistency models are notoriously
difficult to understand and test. In this work, we
propose a new programming model for distributed data that
makes consistency properties explicit and uses a type
system to enforce consistency safety. With the
Inconsistent, Performance-bound, Approximate (IPA)
storage system, programmers specify performance targets and
correctness requirements as constraints on persistent data
structures and handle uncertainty about the result of
datastore reads using new consistency types. We
implement a prototype of this model in Scala on top of an
existing datastore, Cassandra, and use it to make
performance/correctness tradeoffs in two applications: a
ticket sales service and a Twitter clone. Our evaluation
shows that IPA prevents consistency-based programming
errors and adapts consistency automatically in response to
changing network conditions, performing comparably to weak
consistency and 2-10x faster than strong consistency.},
doi = {10.1145/2987550.2987559},
pdf = {papers/ipa-socc16.pdf},
monthnum = {10}
}
@inproceedings{li16:_fast_replic_nopax,
address = {Savannah, GA, USA},
author = {Jialin Li and Ellis Michael and Adriana Szekeres and
Naveen Kr. Sharma and Dan R. K. Ports},
booktitle = {Proceedings of the 12th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'16)},
month = nov,
organization = {{USENIX}},
title = {Just Say {NO} to {Paxos} Overhead: Replacing Consensus
with Network Ordering},
year = {2016},
abstract = {Distributed applications use replication, implemented by
protocols like Paxos, to ensure data availability and
transparently mask server failures. This paper presents a
new approach to achieving replication in the data center
without the performance cost of traditional methods. Our
work carefully divides replication responsibility between
the network and protocol layers. The network orders
requests but does not ensure reliable delivery -- using a
new primitive we call ordered unreliable multicast (OUM).
This primitive can be implemented at
near-zero cost in the data center. Our new replication
protocol, Network-Ordered Paxos (NOPaxos), exploits network
ordering to provide strongly consistent replication without
coordination. The resulting system not only outperforms
both latency- and throughput-optimized protocols on their
respective metrics, but also yields throughput within 2\%
and latency within 16 us of an unreplicated system --
providing replication without the performance cost.},
pdf = {papers/nopaxos-osdi16.pdf},
code = {https://github.com/uwsyslab/nopaxos/},
monthnum = {11}
}
@inproceedings{holt15:_claret,
address = {Bordeaux, France},
author = {Brandon Holt and Irene Zhang and Dan R. K. Ports and Mark
Oskin and Luis Ceze},
booktitle = {Proceedings of the 2015 Workshop on Principles and
Practice of Consistency for Distributed Data ({PaPoC}
'15)},
month = apr,
organization = {{ACM}},
title = {Claret: Using Data Types for Highly Concurrent Distributed
Transactions},
year = {2015},
abstract = {Out of the many NoSQL databases in use today, some that
provide simple data structures for records, such as Redis
and MongoDB, are now becoming popular. Building
applications out of these complex data types provides a way
to communicate intent to the database system without
sacrificing flexibility or committing to a fixed schema.
Currently this capability is leveraged in limited ways,
such as to ensure related values are co-located, or for
atomic updates. There are many ways data types can be used
to make databases more efficient that are not yet being
exploited. We explore several ways of leveraging
abstract data type (ADT) semantics in databases, focusing
primarily on commutativity. Using a Twitter clone as a case
study, we show that using commutativity can reduce
transaction abort rates for high-contention, update-heavy
workloads that arise in real social networks. We conclude
that ADTs are a good abstraction for database records,
providing a safe and expressive programming model with
ample opportunities for optimization, making databases
safer and more scalable.},
doi = {10.1145/2745947.2745951},
pdf = {papers/claret-papoc15.pdf},
monthnum = {04}
}
@inproceedings{holt15:_claret_poster,
address = {Bordeaux, France},
author = {Brandon Holt and Irene Zhang and Dan R. K. Ports and Mark
Oskin and Luis Ceze},
booktitle = {Proceedings of the 10th {ACM} {SIGOPS} {E}uro{S}ys
({EuroSys '15})},
month = apr,
note = {Poster},
organization = {{ACM}},
title = {Claret: Using Data Types for Highly Concurrent Distributed
Transactions},
year = {2015},
monthnum = {04}
}
@inproceedings{ports15:_desig_distr_system_using_approx,
address = {Oakland, CA, USA},
author = {Dan R. K. Ports and Jialin Li and Vincent Liu and Naveen
Kr. Sharma and Arvind Krishnamurthy},
booktitle = {Proceedings of the 12th {USENIX} {S}ymposium on
{N}etworked {S}ystems {D}esign and {I}mplementation ({NSDI}
'15)},
month = may,
organization = {{USENIX}},
title = {Designing Distributed Systems Using Approximate Synchrony
in Datacenter Networks},
year = {2015},
abstract = {Distributed systems are traditionally designed
independently from the underlying network, making
worst-case assumptions (e.g., complete asynchrony) about
its behavior. However, many of today's distributed
applications are deployed in data centers, where the
network is more reliable, predictable, and extensible. In
these environments, it is possible to co-design distributed
systems with their network layer, and doing so can offer
substantial benefits. This paper explores network-level
mechanisms for providing Mostly-Ordered Multicast
(MOM): a best-effort ordering property for concurrent
multicast operations. Using this primitive, we design
Speculative Paxos, a state machine replication
protocol that relies on the network to order requests in
the normal case. This approach leads to substantial
performance benefits: under realistic data center
conditions, Speculative Paxos can provide 40\% lower
latency and 2.6x higher throughput than the standard Paxos
protocol. It offers lower latency than a latency-optimized
protocol (Fast Paxos) with the same throughput as a
throughput-optimized protocol (batching).},
pdf = {papers/specpaxos-nsdi15.pdf},
slidespdf = {papers/specpaxos-nsdi15-slides.pdf},
award = {Best Paper Award},
code = {https://github.com/uwsyslab/specpaxos/},
monthnum = {05}
}
@inproceedings{sharma15:_transtorm_poster,
address = {Monterey, CA, USA},
author = {Naveen Kr. Sharma and Brandon Holt and Irene Zhang and Dan
R. K. Ports and Marcos Aguilera},
booktitle = {Proceedings of the 25th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '15)},
month = oct,
note = {Poster},
organization = {{ACM}},
title = {Transtorm: a benchmark suite for transactional key-value
storage systems},
year = {2015},
monthnum = {10}
}
@inproceedings{zhang15:_build_consis_trans_incon_replic,
address = {Monterey, CA, USA},
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
booktitle = {Proceedings of the 25th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '15)},
month = oct,
organization = {{ACM}},
title = {Building Consistent Transactions with Inconsistent
Replication},
year = {2015},
abstract = {Application programmers increasingly prefer distributed
storage systems with strong consistency and distributed
transactions (e.g., Google's Spanner) for their strong
guarantees and ease of use. Unfortunately, existing
transactional storage systems are expensive to use -- in
part because they require costly replication protocols,
like Paxos, for fault tolerance. In this paper, we present
a new approach that makes transactional storage systems
more affordable: we eliminate consistency from the
replication protocol while still providing distributed
transactions with strong consistency to applications.
We present TAPIR -- the Transactional Application Protocol
for Inconsistent Replication -- the first transaction
protocol to use a novel replication protocol, called
inconsistent replication, that provides fault tolerance
without consistency. By enforcing strong consistency only
in the transaction protocol, TAPIR can commit transactions
in a single round-trip and order distributed transactions
without centralized coordination. We demonstrate the use of
TAPIR in a transactional key-value store, TAPIR-KV.
Compared to conventional systems, TAPIR-KV provides better
latency and throughput.},
doi = {10.1145/2815400.2815404},
pdf = {papers/tapir-sosp15.pdf},
code = {https://github.com/uwsyslab/tapir},
monthnum = {10}
}
@inproceedings{li14:_tales_tail,
address = {Seattle, WA, USA},
author = {Jialin Li and Naveen Kr. Sharma and Dan R. K. Ports and
Steven D. Gribble},
booktitle = {Proceedings of the 5th Symposium on Cloud Computing
({SOCC} '14)},
month = nov,
organization = {{ACM}},
title = {Tales of the Tail: Hardware, {OS}, and Application-level
Sources of Tail Latency},
year = {2014},
abstract = {Interactive services often have large-scale parallel
implementations. To deliver fast responses, the median and
tail latencies of a service's components must be low. In
this paper, we explore the hardware, OS, and
application-level sources of poor tail latency in high
throughput servers executing on multi-core machines. We
model these network services as a queuing system in order
to establish the best-achievable latency distribution.
Using fine-grained measurements of three different servers
(a null RPC service, Memcached, and Nginx) on Linux, we
then explore why these servers exhibit significantly worse
tail latencies than queuing models alone predict. The
underlying causes include interference from background
processes, request re-ordering caused by poor scheduling or
constrained concurrency models, suboptimal interrupt
routing, CPU power saving mechanisms, and NUMA effects.
We systematically eliminate these factors and show that
Memcached can achieve a median latency of 11 us and a
99.9th percentile latency of 32 us at 80\% utilization on a
four-core system. In comparison, a naive deployment of
Memcached at the same utilization on a single-core system
has a median latency of 100 us and a 99.9th percentile
latency of 5 ms. Finally, we demonstrate that tradeoffs
exist between throughput, energy, and tail latency.},
doi = {10.1145/2670979.2670988},
pdf = {papers/latency-socc14.pdf},
monthnum = {11}
}
@inproceedings{peter14:_arrak,
address = {Broomfield, CO, USA},
author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
Ports and Doug Woos and Arvind Krishnamurthy and Thomas
Anderson and Timothy Roscoe},
booktitle = {Proceedings of the 11th {USENIX} {S}ymposium on
{O}perating {S}ystems {D}esign and {I}mplementation ({OSDI}
'14)},
month = oct,
organization = {{USENIX}},
title = {Arrakis: The Operating System is the Control Plane},
year = {2014},
abstract = {Recent device hardware trends enable a new approach to the
design of network server operating systems. In a
traditional operating system, the kernel mediates access to
device hardware by server applications, to enforce process
isolation as well as network and disk security. We have
designed and implemented a new operating system, Arrakis,
that splits the traditional role of the kernel in two.
Applications have direct access to virtualized I/O devices,
allowing most I/O operations to skip the kernel entirely,
while the kernel is re-engineered to provide network and
disk protection without kernel mediation of every
operation. We describe the hardware and software changes
needed to take advantage of this new abstraction, and we
illustrate its power by showing improvements of 2-5x in
latency and 9x in throughput for a popular persistent NoSQL
store relative to a well-tuned Linux implementation.},
pdf = {papers/arrakis-osdi14.pdf},
award = {Jay Lepreau Best Paper Award},
code = {https://github.com/UWNetworksLab/arrakis},
monthnum = {10}
}
@inproceedings{peter14:_towar_high_perfor_applic_level_storag_manag,
address = {Philadelphia, PA, USA},
author = {Simon Peter and Jialin Li and Doug Woos and Irene Zhang
and Dan R. K. Ports and Thomas Anderson and Arvind
Krishnamurthy and Mark Zbikowski},
booktitle = {Proceedings of the 6th {USENIX} Workshop on Hot Topics in
Storage and File Systems ({HotStorage} '14)},
month = jun,
organization = {{USENIX}},
title = {Towards High-Performance Application-Level Storage
Management},
year = {2014},
abstract = {We propose a radical re-architecture of the traditional
operating system storage stack to move the kernel off the
data path. Leveraging virtualized I/O hardware for disk and
flash storage, most read and write I/O operations go
directly to application code. The kernel dynamically
allocates extents, manages the virtual to physical binding,
and performs name translation. The benefit is to
dramatically reduce the CPU overhead of storage operations
while improving application flexibility.},
pdf = {papers/arrakis-hotstorage14.pdf},
monthnum = {06}
}
@inproceedings{zhang14:_optim_replic_two_phase_commit,
address = {Beijing, China},
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
booktitle = {Proceedings of the 5th Asia-Pacific Workshop on Systems
({APSYS} '14)},
key = {APSys 2014},
month = jun,
note = {Poster and extended abstract},
title = {Optimistic Replicated Two-Phase Commit},
year = {2014},
monthnum = {06}
}
@inproceedings{zhuo14:_machin_fault_toler_reliab_datac_system,
address = {Beijing, China},
author = {Danyang Zhuo and Qiao Zhang and Dan R. K. Ports and Arvind
Krishnamurthy and Thomas Anderson},
booktitle = {Proceedings of the 5th Asia-Pacific Workshop on Systems
({APSYS} '14)},
key = {APSys 2014},
month = jun,
title = {Machine Fault Tolerance for Reliable Datacenter Systems},
year = {2014},
abstract = {Although rare in absolute terms, undetected CPU, memory,
and disk errors occur often enough at data center scale to
significantly affect overall system reliability and
availability. In this paper, we propose a new failure
model, called Machine Fault Tolerance, and a new
abstraction, a replicated write-once trusted table, to
provide improved resilience to these types of failures.
Since most machine failures manifest in application server
and operating system code, we assume a Byzantine model for
those parts of the system. However, by assuming that the
hypervisor and network are trustworthy, we are able to
reduce the overhead of machine-fault masking to be close to
that of non-Byzantine Paxos.},
doi = {10.1145/2637166.2637235},
pdf = {papers/mft-apsys14.pdf},
monthnum = {06}
}
@inproceedings{cheng12:_abstr_usabl_infor_flow_contr_aeolus,
address = {Boston, MA, USA},
author = {Winnie Cheng and Dan R. K. Ports and David Schultz and
Victoria Popic and Aaron Blankstein and James Cowling and
Dorothy Curtis and Liuba Shrira and Barbara Liskov},
booktitle = {Proceedings of the 2012 {USENIX} {A}nnual {T}echnical
{C}onference},
month = jun,
organization = {{USENIX}},
title = {Abstractions for Usable Information Flow Control in
{Aeolus}},
year = {2012},
abstract = {Despite the increasing importance of protecting
confidential data, building secure software remains as
challenging as ever. This paper describes Aeolus, a new
platform for building secure distributed applications.
Aeolus uses information flow control to provide
confidentiality and data integrity. It differs from
previous information flow control systems in a way that we
believe makes it easier to understand and use. Aeolus uses
a new, simpler security model, the first to combine a
standard principal-based scheme for authority management
with thread-granularity information flow tracking. The
principal hierarchy matches the way developers already
reason about authority and access control, and the
coarse-grained information flow tracking eases the task of
defining a program's security restrictions. In addition,
Aeolus provides a number of new mechanisms (authority
closures, compound tags, boxes, and shared volatile state)
that support common design patterns in secure application
design.},
pdf = {papers/aeolus-usenix12.pdf},
slidespdf = {papers/aeolus-usenix12-slides.pdf},
code = {http://pmg.csail.mit.edu/aeolus/#sw},
monthnum = {06}
}
@inproceedings{ports10:_trans_consis_autom_manag_applic_data_cache,
address = {Vancouver, BC, Canada},
author = {Dan R. K. Ports and Austin T. Clements and Irene Zhang and
Samuel Madden and Barbara Liskov},
booktitle = {Proceedings of the 9th {USENIX} {S}ymposium on {O}perating
{S}ystems {D}esign and {I}mplementation ({OSDI} '10)},
month = oct,
organization = {{USENIX}},
title = {Transactional Consistency and Automatic Management in an
Application Data Cache},
year = {2010},
abstract = {Distributed in-memory application data caches like
memcached are a popular solution for scaling
database-driven web sites. These systems are easy to add to
existing deployments, and increase performance
significantly by reducing load on both the database and
application servers. Unfortunately, such caches do not
integrate well with the database or the application. They
cannot maintain transactional consistency across the entire
system, violating the isolation properties of the
underlying database. They leave the application responsible
for locating data in the cache and keeping it up to date, a
frequent source of application complexity and programming
errors. Addressing both of these problems, we introduce
a transactional cache, TxCache, with a simple programming
model. TxCache ensures that any data seen within a
transaction, whether it comes from the cache or the
database, reflects a slightly stale but consistent snapshot
of the database. TxCache makes it easy to add caching to an
application by simply designating functions as cacheable;
it automatically caches their results, and invalidates the
cached data as the underlying database changes. Our
experiments found that adding TxCache increased the
throughput of a web application by up to 5.2x, only
slightly less than a non-transactional cache, showing that
consistency does not have to come at the price of
performance.},
pdf = {papers/txcache-osdi10.pdf},
psgz = {papers/txcache-osdi10.ps.gz},
slidespdf = {papers/txcache-osdi10-slides.pdf},
code = {https://github.com/drkp/txcache/},
monthnum = {10}
}
@inproceedings{cowling09:_census,
address = {San Diego, CA, USA},
author = {James Cowling and Dan R. K. Ports and Barbara Liskov and
Raluca Ada Popa and Abhijeet Gaikwad},
booktitle = {Proceedings of the 2009 {USENIX} {A}nnual {T}echnical
{C}onference},
month = jun,
organization = {{USENIX}},
title = {Census: Location-Aware Membership Management for
Large-Scale Distributed Systems},
year = {2009},
abstract = {We present Census, a platform for building large-scale
distributed applications. Census provides a membership
service and a multicast mechanism. The membership service
provides every node with a consistent view of the system
membership, which may be global or partitioned into
location-based regions. Census distributes membership
updates with low overhead, propagates changes promptly, and
is resilient to both crashes and Byzantine failures. We
believe that Census is the first system to provide a
consistent membership abstraction at very large scale,
greatly simplifying the design of applications built atop
large deployments such as multi-site data centers.
Census builds on a novel multicast mechanism that is
closely integrated with the membership service. It
organizes nodes into a reliable overlay composed of
multiple distribution trees, using network coordinates to
minimize latency. Unlike other multicast systems, it avoids
the cost of using distributed algorithms to construct and
maintain trees. Instead, each node independently produces
the same trees from the consistent membership view. Census
uses this multicast mechanism to distribute membership
updates, along with application-provided messages.
We
evaluate the platform under simulation and on a real-world
deployment on PlanetLab. We find that it imposes minimal
bandwidth overhead, is able to react quickly to node
failures and changes in the system membership, and can
scale to substantial size.},
pdf = {papers/census-usenix09.pdf},
psgz = {papers/census-usenix09.ps.gz},
slidespdf = {papers/census-usenix09-slides.pdf},
monthnum = {06}
}
@inproceedings{ports09:_trans_cachin_applic_data_recen_snaps,
address = {Big Sky, MT, USA},
author = {Dan R. K. Ports and Austin T. Clements and Irene Y. Zhang
and Samuel Madden and Barbara Liskov},
booktitle = {Proceedings of the 22nd {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '09)},
month = oct,
note = {Work in Progress report},
organization = {{ACM}},
title = {Transactional Caching of Application Data using Recent
Snapshots},
year = {2009},
abstract = {Many of today's well-known websites use application data
caches to reduce the bottleneck load on the database, as
well as the computational load on the application servers.
Distributed in-memory shared caches, exemplified by
memcached, are one popular approach. These caches
typically provide a get/put interface, akin to a
distributed hash table; the application chooses what data
to keep in the cache and keeps it up to date. By storing
the cache entirely in memory and horizontally partitioning
among nodes, in-memory caches provide quick response times
and ease of scaling. However, existing caches have no
notion of transactional consistency: there is no way to
ensure that two accesses to the cache reflect a view of the
database at the same point in time. While the backing
database goes to great lengths to ensure this property
(serializable isolation), the caching layer violates these
guarantees. The resulting inconsistencies can have
unpleasant consequences if exposed to the user (e.g.,
attributing the latest bid to the wrong user on an auction
site), or add complexity to application code by forcing it
to cope with temporarily violated invariants.
We argue
that transactional semantics are not incompatible with
cache performance and scalability. We introduce a
transactional cache, TxCache, which guarantees that
all values retrieved from the cache or database during a
transaction reflect a consistent snapshot of the database.
TxCache also strives to simplify application design by
helping manage the cache. Instead of requiring applications
to manually insert and check for values in the cache,
TxCache provides a library with which programmers simply
designate functions as cacheable, and the library checks
the cache for previous calls with the same arguments. In
particular, and unlike memcached, TxCache does not require
applications to explicitly invalidate cached values;
correctly identifying the values to invalidate is difficult
because it requires global reasoning about the
application.},
pdf = {papers/txcache-sosp09wip-abstract.pdf},
psgz = {papers/txcache-sosp09wip-abstract.ps.gz},
slidespdf = {papers/txcache-sosp09wip-slides.pdf},
monthnum = {10}
}
@inproceedings{chen08:_overs,
address = {Seattle, WA, USA},
author = {Xiaoxin Chen and Tal Garfinkel and E. Christopher Lewis
and Pratap Subrahmanyam and Carl A. Waldspurger and Dan
Boneh and Jeffrey Dwoskin and Dan R. K. Ports},
booktitle = {Proceedings of the 13th {I}nternational {C}onference on
{A}rchitectural {S}upport for {P}rogramming {L}anguages and
{O}perating {S}ystems ({ASPLOS '08})},
month = mar,
organization = {{ACM}},
title = {Overshadow: A Virtualization-Based Approach to
Retrofitting Protection in Commodity Operating Systems},
year = {2008},
abstract = {Commodity operating systems entrusted with securing
sensitive data are remarkably large and complex, and
consequently, frequently prone to compromise. To address
this limitation, we introduce a virtual-machine-based
system called Overshadow that protects the privacy and
integrity of application data, even in the event of a total
OS compromise. Overshadow presents an application with a
normal view of its resources, but the OS with an encrypted
view. This allows the operating system to carry out the
complex task of managing an application's resources,
without allowing it to read or modify them. Thus,
Overshadow offers a last line of defense for application
data. Overshadow builds on multi-shadowing, a novel
mechanism that presents different views of ``physical''
memory, depending on the context performing the access.
This primitive offers an additional dimension of protection
beyond the hierarchical protection domains implemented by
traditional operating systems and processor architectures.
We present the design and implementation of Overshadow
and show how its new protection semantics can be integrated
with existing systems. Our design has been fully
implemented and used to protect a wide range of unmodified
legacy applications running on an unmodified Linux
operating system. We evaluate the performance of our
implementation, demonstrating that this approach is
practical.},
doi = {10.1145/1346281.1346284},
pdf = {papers/overshadow-asplos08.pdf},
psgz = {papers/overshadow-asplos08.ps.gz},
monthnum = {03}
}
@inproceedings{ports08:_towar_applic_secur_untrus_operat_system,
address = {San Jose, CA, USA},
author = {Dan R. K. Ports and Tal Garfinkel},
booktitle = {Proceedings of the 3rd Workshop on Hot Topics in Security
(HotSec '08)},
month = jul,
organization = {{USENIX}},
title = {Towards Application Security on Untrusted Operating
Systems},
year = {2008},
abstract = {Complexity in commodity operating systems makes
compromises inevitable. Consequently, a great deal of work
has examined how to protect security-critical portions of
applications from the OS through mechanisms such as
microkernels, virtual machine monitors, and new processor
architectures. Unfortunately, most work has focused on CPU
and memory isolation and neglected OS semantics. Thus,
while much is known about how to prevent OS and application
processes from modifying each other, far less is understood
about how different OS components can undermine application
security if they turn malicious. We consider this
problem in the context of our work on Overshadow, a
virtual-machine-based system for retrofitting protection in
commodity operating systems. We explore how malicious
behavior in each major OS subsystem can undermine
application security, and present potential mitigations.
While our discussion is presented in terms of Overshadow
and Linux, many of the problems and solutions are
applicable to other systems where trusted applications rely
on untrusted, potentially malicious OS components.},
pdf = {papers/overshadow-hotsec08.pdf},
psgz = {papers/overshadow-hotsec08.ps.gz},
slidespdf = {papers/overshadow-hotsec08-slides.pdf},
monthnum = {07}
}
@inproceedings{clements05:_arpeg,
address = {Ithaca, NY, USA},
author = {Austin T. Clements and Dan R. K. Ports and David R.
Karger},
booktitle = {Proceedings of the 4th International Workshop on
Peer-to-Peer Systems ({IPTPS} '05)},
key = {IPTPS '05},
month = feb,
pages = {58--68},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
title = {Arpeggio: Metadata Searching and Content Sharing with
{C}hord},
volume = {3640},
year = {2005},
abstract = {Arpeggio is a peer-to-peer file-sharing network based on
the Chord lookup primitive. Queries for data whose metadata
matches a certain criterion are performed efficiently by
using a distributed keyword-set index, augmented with
index-side filtering. We introduce index gateways, a
technique for minimizing index maintenance overhead.
Because file data is large, Arpeggio employs subrings to
track live source peers without the cost of inserting the
data itself into the network. Finally, we introduce
postfetching, a technique that uses information in the
index to improve the availability of rare files. The result
is a system that provides efficient query operations with
the scalability and reliability advantages of full
decentralization, and a content distribution system tuned
to the requirements and capabilities of a peer-to-peer
network.},
pdf = {papers/arpeggio-iptps05.pdf},
psgz = {papers/arpeggio-iptps05.ps.gz},
slidespdf = {papers/arpeggio-iptps05-slides.pdf},
monthnum = {02}
}
@inproceedings{ports05:_persif,
address = {Brighton, United Kingdom},
author = {Dan R. K. Ports and Austin T. Clements and Erik D.
Demaine},
booktitle = {Proceedings of the 20th {ACM} {S}ymposium on {O}perating
{S}ystems {P}rinciples ({SOSP} '05)},
month = oct,
note = {Poster and extended abstract},
organization = {{ACM}},
title = {{PersiFS}: A Versioned File System with an Efficient
Representation},
year = {2005},
doi = {10.1145/1095810.1118598},
monthnum = {10}
}
@inproceedings{clements04:_arpeg,
address = {Cambridge, MA, USA},
author = {Austin T. Clements and Dan R. K. Ports and David R.
Karger},
booktitle = {Proceedings of the 2nd Project IRIS Student Workshop
({ISW} '04)},
key = {ISW '04},
month = nov,
note = {Poster and extended abstract.},
title = {Arpeggio: Efficient Metadata-based Searching and File
Transfer with {DHTs}},
year = {2004},
abstract = {Arpeggio is a peer-to-peer file-sharing network
based on the Chord distributed hash table. Queries for
files whose metadata matches a certain criterion are
performed efficiently by using a distributed
keyword-set index, augmented with index-side
filtering. We introduce metadata gateways, a
technique for minimizing index maintenance overhead.
Arpeggio also uses the DHT for indirect
storage of file contents, maintaining pointers from
content to the live peers that provide it. Finally, we
introduce postfetching, a technique that uses
information in the index to improve the availability of
rare files. The result is a system that provides efficient
query operations with the scalability and reliability
advantages of full decentralization, and a content
distribution system tuned to the requirements of a
peer-to-peer file-sharing network.},
monthnum = {11}
}
@mastersthesis{ports07:_metad_index_in_struc_peer,
address = {Cambridge, MA, USA},
author = {Dan R. K. Ports},
month = feb,
school = {Massachusetts Institute of Technology},
type = {M.Eng. thesis},
title = {Arpeggio: Metadata Indexing in a Structured Peer-to-Peer
Network},
year = {2007},
abstract = {Peer-to-peer networks require an efficient means for
performing searches for files by metadata keywords.
Unfortunately, current methods usually sacrifice either
scalability or recall. Arpeggio is a peer-to-peer
file-sharing network that uses the Chord lookup primitive
as a basis for constructing a distributed keyword-set index,
augmented with index-side filtering, to address this
problem. We introduce index gateways, a technique for
minimizing index maintenance overhead. Arpeggio also
includes a content distribution system for finding source
peers for a file; we present a novel system that uses Chord
subrings to track live source peers without the cost of
inserting the data itself into the network, and supports
postfetching: using information in the index to improve the
availability of rare files. The result is a system that
provides efficient query operations with the scalability
and reliability advantages of full decentralization. We use
analysis and simulation results to show that our indexing
system has reasonable storage and bandwidth costs, and
improves load distribution.},
doi = {1721.1/41663},
pdf = {papers/arpeggio-meng.pdf},
psgz = {papers/arpeggio-meng.ps.gz},
monthnum = {02}
}
@article{liu25:_hyper,
author = {Ziyuan Liu and Zhixiong Niu and Ran Shu and Wenxue Cheng
and Lihua Yuan and Jacob Nelson and Dan R. K. Ports and
Peng Cheng and Yongqiang Xiong},
journal = {{IEEE} Transactions on Cloud Computing},
month = apr,
number = {2},
pages = {498-511},
title = {{HyperDrive}: Direct Network Telemetry Storage via
Programmable Switches},
volume = {13},
year = {2025},
abstract = {In cloud datacenter operations, telemetry and logs are
indispensable, enabling essential services such as network
diagnostics, auditing, and knowledge discovery. The
escalating scale of data centers, coupled with increased
bandwidth and finer-grained telemetry, results in an
overwhelming volume of data. This proliferation poses
significant storage challenges for telemetry systems. In
this paper, we introduce HyperDrive, an innovative system
designed to efficiently store large volumes of telemetry
and logs in data centers using programmable switches. This
in-network approach effectively mitigates bandwidth
bottlenecks commonly associated with traditional
endpoint-based methods. To our knowledge, we are the first
to use a programmable switch to directly control storage,
bypassing the CPU to achieve the best performance. With
merely 21\% of a switch's resources, our HyperDrive
implementation showcases remarkable scalability and
efficiency. Through rigorous evaluation, it has
demonstrated linear scaling capabilities, efficiently
managing 12 SSDs on a single server with minimal host
overhead. In an eight-server testbed, HyperDrive achieved
an impressive throughput of approximately 730Gbps,
underscoring its potential to transform data center
telemetry and logging practices.},
doi = {10.1109/TCC.2025.3543477},
monthnum = {04}
}
@techreport{szekeres20:_makin_distr_mobil_applic_safe,
author = {Adriana Szekeres and Irene Zhang and Katelin Bailey and
Isaac Ackerman and Haichen Shen and Franziska Roesner and
Dan R. K. Ports and Arvind Krishnamurthy and Henry M.
Levy},
institution = {arXiv},
month = aug,
number = {2008.06536},
type = {arXiv preprint},
title = {Making Distributed Mobile Applications {SAFE}: Enforcing
User Privacy Policies on Untrusted Applications with Secure
Application Flow Enforcement},
year = {2020},
abstract = {Today's mobile devices sense, collect, and store huge
amounts of personal information, which users share with
family and friends through a wide range of applications.
Once users give applications access to their data, they
must implicitly trust that the apps correctly maintain data
privacy. As we know from both experience and
all-too-frequent press articles, that trust is often
misplaced. While users do not trust applications, they do
trust their mobile devices and operating systems.
Unfortunately, sharing applications are not limited to
mobile clients but must also run on cloud services to share
data between users. In this paper, we leverage the trust
that users have in their mobile OSes to vet cloud services.
To do so, we define a new Secure Application Flow
Enforcement (SAFE) framework, which requires cloud services
to attest to a system stack that will enforce policies
provided by the mobile OS for user data. We implement a
mobile OS that enforces SAFE policies on unmodified mobile
apps and two systems for enforcing policies on untrusted
cloud services. Using these prototypes, we demonstrate that
it is possible to enforce existing user privacy policies on
unmodified applications.},
doi = {10.48550/arXiv.2008.06536},
pdf = {papers/agate-arxiv20.pdf},
monthnum = {08}
}
@techreport{sapio19:_scalin_distr_machin_learn_in_networ_aggreg,
author = {Amedeo Sapio and Marco Canini and Chen-Yu Ho and Jacob
Nelson and Panos Kalnis and Changhoon Kim and Arvind
Krishnamurthy and Masoud Moshref and Dan R. K. Ports and
Peter Richtarik},
institution = {arXiv},
month = feb,
note = {version 2, Sep. 2020},
number = {1903.06701},
type = {arXiv preprint},
title = {Scaling Distributed Machine Learning with In-Network
Aggregation},
year = {2019},
abstract = {Training machine learning models in parallel is an
increasingly important workload. We accelerate distributed
parallel training by designing a communication primitive
that uses a programmable switch dataplane to execute a key
step of the training process. Our approach, SwitchML,
reduces the volume of exchanged data by aggregating the
model updates from multiple workers in the network. We
co-design the switch processing with the end-host protocols
and ML frameworks to provide an efficient solution that
speeds up training by up to 5.5× for a number of
real-world benchmark models.},
doi = {10.48550/arXiv.1903.06701},
pdf = {papers/switchml-arxiv-v2.pdf},
supersededby = {sapio21:_scalin_distr_machin_learn_in_networ_aggreg},
supersededas = {arXiv version (2020)},
monthnum = {02}
}
@techreport{szekeres19:_meerk,
address = {Seattle, WA, USA},
author = {Adriana Szekeres and Michael Whittaker and Naveen Kr.
Sharma and Jialin Li and Arvind Krishnamurthy and Irene
Zhang and Dan R. K. Ports},
institution = {University of Washington CSE},
month = nov,
number = {UW-CSE-2019-11-02},
title = {Meerkat: Scalable Replicated Transactions Following the
Zero-Coordination Principle},
year = {2019},
abstract = {Traditionally, the high cost of network communication
between servers has hidden the impact of cross-core
coordination in replicated systems. However, new
technologies, like kernel-bypass networking and faster
network links, have exposed hidden bottlenecks in
distributed systems. This paper explores how to build
multicore-scalable, replicated storage systems. We
introduce a new guideline for their design, called the
Zero-Coordination Principle. We use this principle to
design a new multicore-scalable, in-memory, replicated,
key-value store, called Meerkat.
Unlike existing
systems, Meerkat eliminates all cross-core and
cross-replica coordination, both of which pose a
scalability bottleneck. Our experiments found that Meerkat
is able to scale up to 80 hyper-threads and execute 8.3
million transactions per second. Meerkat represents an
improvement of 12x on state-of-the-art, fault-tolerant,
in-memory, transactional storage systems built using
leader-based replication and a shared transaction log.},
pdf = {papers/meerkat-tr19.pdf},
monthnum = {11}
}
@article{zhu19:_harmon,
author = {Hang Zhu and Zhihao Bai and Jialin Li and Ellis Michael
and Dan R. K. Ports and Ion Stoica and Xin Jin},
journal = {Proceedings of the VLDB Endowment},
month = nov,
number = {3},
pages = {376--389},
title = {Harmonia: Near-Linear Scalability for Replicated Storage
with In-Network Conflict Detection},
volume = {13},
year = {2019},
abstract = {Distributed storage employs replication to mask failures
and improve availability. However, these systems typically
exhibit a hard tradeoff between consistency and
performance. Ensuring consistency introduces coordination
overhead, and as a result the system throughput does not
scale with the number of replicas. We present Harmonia, a
replicated storage architecture that exploits the
capability of new-generation programmable switches to
obviate this tradeoff by providing near-linear scalability
without sacrificing consistency. To achieve this goal,
Harmonia detects read-write conflicts in the network, which
enables any replica to serve reads for objects with no
pending writes. Harmonia implements this functionality at
line rate, thus imposing no performance overhead. We have
implemented a prototype of Harmonia on a cluster of
commodity servers connected by a Barefoot Tofino switch,
and have integrated it with Redis. We demonstrate the
generality of our approach by supporting a variety of
replication protocols, including primary-backup, chain
replication, Viewstamped Replication, and NOPaxos.
Experimental results show that Harmonia improves the
throughput of these protocols by up to 10x for a
replication factor of 10, providing near-linear scalability
up to the limit of our testbed.},
doi = {10.14778/3368289.3368301},
pdf = {papers/harmonia-vldb20.pdf},
monthnum = {11}
}
@techreport{zhu19:_harmon_arxiv,
author = {Hang Zhu and Zhihao Bai and Jialin Li and Ellis Michael
and Dan R. K. Ports and Ion Stoica and Xin Jin},
institution = {arXiv},
month = apr,
number = {1904.08964},
type = {arXiv preprint},
title = {Harmonia: Near-Linear Scalability for Replicated Storage
with In-Network Conflict Detection},
year = {2019},
abstract = {Distributed storage employs replication to mask failures
and improve availability. However, these systems typically
exhibit a hard tradeoff between consistency and
performance. Ensuring consistency introduces coordination
overhead, and as a result the system throughput does not
scale with the number of replicas. We present Harmonia, a
replicated storage architecture that exploits the
capability of new-generation programmable switches to
obviate this tradeoff by providing near-linear scalability
without sacrificing consistency. To achieve this goal,
Harmonia detects read-write conflicts in the network, which
enables any replica to serve reads for objects with no
pending writes. Harmonia implements this functionality at
line rate, thus imposing no performance overhead. We have
implemented a prototype of Harmonia on a cluster of
commodity servers connected by a Barefoot Tofino switch,
and have integrated it with Redis. We demonstrate the
generality of our approach by supporting a variety of
replication protocols, including primary-backup, chain
replication, Viewstamped Replication, and NOPaxos.
Experimental results show that Harmonia improves the
throughput of these protocols by up to 10x for a
replication factor of 10, providing near-linear scalability
up to the limit of our testbed.},
doi = {10.48550/arXiv.1904.08964},
pdf = {papers/harmonia-arxiv19.pdf},
monthnum = {04}
}
@techreport{li18:_pegas,
address = {Seattle, WA, USA},
author = {Jialin Li and Jacob Nelson and Xin Jin and Dan R. K.
Ports},
institution = {University of Washington CSE},
month = dec,
number = {UW-CSE-18-12-01},
title = {Pegasus: Load-Aware Selective Replication with an
In-Network Coherence Directory},
year = {2018},
abstract = {High performance distributed storage systems face the
challenge of load imbalance caused by skewed and dynamic
workloads. This paper introduces Pegasus, a new storage
architecture that leverages new-generation programmable
switch ASICs to balance load across storage servers.
Pegasus uses selective replication of the most popular
objects in the data store to distribute load. Using a novel
in-network coherence directory, the Pegasus switch tracks
and manages the location of replicated objects. This allows
it to achieve load-aware forwarding and dynamic rebalancing
for replicated keys, while still guaranteeing data
coherence. The Pegasus design is practical to implement as
it stores only forwarding metadata in the switch data
plane. The resulting system improves the 99\% tail latency
of a distributed in-memory key-value store by more than
95\%, and yields up to a 9x throughput improvement under a
latency SLO -- results which hold across a large set of
workloads with varying degrees of skewness, read/write
ratio, and dynamism.},
pdf = {papers/pegasus-tr18.pdf},
supersededby = {li20:_pegas},
supersededas = {Tech. Report (2018)},
monthnum = {12}
}
@article{zhang18:_build_consis_trans_incon_replic,
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
journal = {{ACM} Transactions on Computer Systems},
month = dec,
number = {4},
pages = {12},
title = {Building Consistent Transactions with Inconsistent
Replication},
volume = {35},
year = {2018},
abstract = {Application programmers increasingly prefer distributed
storage systems with strong consistency and distributed
transactions (e.g., Google's Spanner) for their strong
guarantees and ease of use. Unfortunately, existing
transactional storage systems are expensive to use -- in
part because they require costly replication protocols,
like Paxos, for fault tolerance. In this paper, we present
a new approach that makes transactional storage systems
more affordable: we eliminate consistency from the
replication protocol while still providing distributed
transactions with strong consistency to applications.
We present TAPIR -- the Transactional Application Protocol
for Inconsistent Replication -- the first transaction
protocol to use a novel replication protocol, called
inconsistent replication, that provides fault tolerance
without consistency. By enforcing strong consistency only
in the transaction protocol, TAPIR can commit transactions
in a single round-trip and order distributed transactions
without centralized coordination. We demonstrate the use of
TAPIR in a transactional key-value store, TAPIR-KV.
Compared to conventional systems, TAPIR-KV provides both
better latency and better throughput.},
doi = {10.1145/3269981},
pdf = {papers/tapir-tocs18.pdf},
code = {https://github.com/uwsyslab/tapir/},
cats = {https://discuss.systems/@dan/109389114238305747},
monthnum = {12}
}
@techreport{li17:_eris_tr,
address = {Seattle, WA, USA},
author = {Jialin Li and Ellis Michael and Dan R. K. Ports},
institution = {University of Washington CSE},
month = oct,
number = {UW-CSE-TR-17-10-01},
title = {{Eris}: Coordination-Free Consistent Transactions using
Network Multi-Sequencing (Extended Version)},
year = {2017},
abstract = {Distributed storage systems aim to provide strong
consistency and isolation guarantees on an architecture
that is partitioned across multiple shards for scalability
and replicated for fault-tolerance. Traditionally,
achieving all of these goals has required an expensive
combination of atomic commitment and replication protocols
-- introducing extensive coordination overhead. Our system,
Eris, takes a very different approach. It moves a core
piece of concurrency control functionality, which we term
multi-sequencing, into the datacenter network itself. This
network primitive takes on the responsibility for
consistently ordering transactions, and a new lightweight
transaction protocol ensures atomicity. The end result is
that Eris avoids both replication and transaction
coordination overhead: we show that it can process a large
class of distributed transactions in a single round-trip
from the client to the storage system without any explicit
coordination between shards or replicas. It provides
atomicity, consistency, and fault-tolerance with less than
10\% overhead -- achieving throughput 4.5--35x higher and
latency 72--80\% lower than a conventional design on
standard benchmarks.},
pdf = {papers/eris-tr17.pdf},
monthnum = {10}
}
@techreport{michael17:_recov_shared_objec_without_stabl,
address = {Seattle, WA, USA},
author = {Ellis Michael and Dan R. K. Ports and Naveen Kr. Sharma
and Adriana Szekeres},
institution = {University of Washington CSE},
month = aug,
number = {UW-CSE-17-08-01},
title = {Recovering Shared Objects Without Stable Storage (Extended
Version)},
year = {2017},
abstract = {This paper considers the problem of building
fault-tolerant shared objects when processes can crash and
recover but lose their persistent state on recovery. This
Diskless Crash-Recovery (DCR) model matches the way many
long-lived systems are built. We show that it presents new
challenges, as operations that are recorded at a quorum may
not persist after some of the processes in that quorum
crash and then recover. To address this problem, we
introduce the notion of crash-consistent quorums,
where no recoveries happen during the quorum responses. We
show that relying on crash-consistent quorums enables a
recovery procedure that can recover all operations that
successfully finished. Crash-consistent quorums can be
easily identified using a mechanism we term the crash
vector, which tracks the causal relationship between
crashes, recoveries, and other operations.
We apply
crash-consistent quorums and crash vectors to build two
storage primitives. We give a new algorithm for
multi-reader multi-writer atomic registers in the DCR model
that guarantees safety under all conditions and termination
under a natural condition. It improves on the best prior
protocol for this problem by requiring fewer rounds, fewer
nodes to participate in the quorum, and a less restrictive
liveness condition. We also present a more efficient
single-reader, single-writer atomic set---a virtual
stable storage abstraction. It can be used to lift any
existing algorithm from the traditional Crash-Recovery with
Stable Storage model to the DCR model. We examine a
specific application, state machine replication, and show
that existing diskless protocols can violate their
correctness guarantees, while ours offers a general and
correct solution.},
pdf = {papers/recovery-tr17.pdf},
monthnum = {08}
}
@techreport{holt16:_discip_incon,
address = {Seattle, WA, USA},
author = {Brandon Holt and James Bornholt and Irene Zhang and Dan R.
K. Ports and Mark Oskin and Luis Ceze},
institution = {University of Washington CSE},
month = jun,
number = {UW-CSE-TR-16-06-01},
title = {Disciplined Inconsistency},
year = {2016},
abstract = {Distributed applications and web services, such as online
stores or social networks, are expected to be scalable,
available, responsive, and fault-tolerant. To meet these
steep requirements in the face of high round-trip
latencies, network partitions, server failures, and load
spikes, applications use eventually consistent datastores
that allow them to weaken the consistency of some data.
However, making this transition is highly error-prone
because relaxed consistency models are notoriously
difficult to understand and test. In this work, we
propose a new programming model for distributed data that
makes consistency properties explicit and uses a type
system to enforce consistency safety. With the
Inconsistent, Performance-bound, Approximate (IPA)
storage system, programmers specify performance targets and
correctness requirements as constraints on persistent data
structures and handle uncertainty about the result of
datastore reads using new ``consistency types''. We implement
a prototype of this model in Scala on top of an existing
datastore, Cassandra, and use it to make
performance/correctness tradeoffs in two applications: a
ticket sales service and a Twitter clone. Our evaluation
shows that IPA prevents consistency-based programming
errors and adapts consistency automatically in response to
changing network conditions, performing comparably to weak
consistency and 2-10x faster than strong consistency.},
pdf = {papers/ipa-tr16.pdf},
supersededby = {holt16:_discip_incon_consis_types},
supersededas = {Tech. Report (2016)},
monthnum = {06}
}
@techreport{li16:_fast_replic_nopax_tr,
address = {Seattle, WA, USA},
author = {Jialin Li and Ellis Michael and Adriana Szekeres and
Naveen Kr. Sharma and Dan R. K. Ports},
institution = {University of Washington CSE},
number = {UW-CSE-TR-16-09-02},
title = {Just Say {NO} to {Paxos} Overhead: Replacing Consensus
with Network Ordering (Extended Version)},
year = {2016},
abstract = {Distributed applications use replication, implemented by
protocols like Paxos, to ensure data availability and
transparently mask server failures. This paper presents a
new approach to achieving replication in the data center
without the performance cost of traditional methods. Our
work carefully divides replication responsibility between
the network and protocol layers. The network orders
requests but does not ensure reliable delivery -- using a
new primitive we call ordered unreliable multicast (OUM).
This primitive can be implemented at near-zero cost
in the data center. Our new replication
protocol, Network-Ordered Paxos (NOPaxos), exploits network
ordering to provide strongly consistent replication without
coordination. The resulting system not only outperforms
both latency- and throughput-optimized protocols on their
respective metrics, but also yields throughput within 2\%
and latency within 16 us of an unreplicated system --
providing replication without the performance cost.},
pdf = {papers/nopaxos-tr16.pdf},
code = {https://github.com/uwsyslab/nopaxos/},
monthnum = {}
}
@techreport{michael16:_provid_stabl_storag_diskl_crash,
address = {Seattle, WA, USA},
author = {Ellis Michael and Dan R. K. Ports and Naveen Kr. Sharma
and Adriana Szekeres},
institution = {University of Washington CSE},
month = aug,
number = {UW-CSE-TR-16-08-02},
title = {Providing Stable Storage for the Diskless Crash-Recovery
Failure Model},
year = {2016},
abstract = {Many classic protocols in the fault tolerant distributed
computing literature assume a Crash-Fail model in which
processes either are up, or have crashed and are
permanently down. While this model is useful, it does not
fully capture the difficulties many real systems must
contend with. In particular, real-world systems are
long-lived and must have a recovery mechanism so that
crashed processes can rejoin the system and restore its
fault-tolerance. When processes are assumed to have access
to stable storage that is persistent across failures, the
Crash-Recovery model is trivial. However, because disk
failures are common and because having a disk on a
protocol's critical path is often a performance concern,
diskless recovery protocols are needed. While such
protocols do exist in the state machine replication
literature, several well-known protocols have flawed
recovery mechanisms. We examine these errors to elucidate
the problem of diskless recovery and present our own
protocol for providing virtual stable storage, transforming
any protocol in the Crash-Recovery with stable storage
model into a protocol in the Diskless Crash-Recovery
model.},
pdf = {papers/diskless-tr16.pdf},
monthnum = {08}
}
@article{zhang16:_when_is_operat_order_requir,
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
journal = {{IEEE} Data Engineering Bulletin},
month = mar,
number = {1},
pages = {27--38},
title = {When Is Operation Ordering Required in Replicated
Transactional Storage?},
volume = {39},
year = {2016},
abstract = {Today's replicated transactional storage systems typically
have a layered architecture, combining protocols for
transaction coordination, consistent replication, and
concurrency control. These systems generally require costly
strongly-consistent replication protocols like Paxos, which
assign a total order to all operations. To avoid this cost,
we ask whether all replicated operations in these systems
need to be strictly ordered. Recent research has yielded
replication protocols that can avoid unnecessary ordering,
e.g., by exploiting commutative operations, but it is not
clear how to apply these to replicated transaction
processing systems. We answer this question by analyzing
existing transaction processing designs in terms of which
replicated operations require ordering and which simply
require fault tolerance. We describe how this analysis
leads to our recent work on TAPIR, a transaction protocol
that efficiently provides strict serializability by using a
new replication protocol that provides fault tolerance but
not ordering for most operations.},
pdf = {papers/ordering-debull16.pdf},
monthnum = {03}
}
@article{peter15:_arrak,
author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
Ports and Doug Woos and Arvind Krishnamurthy and Thomas
Anderson and Timothy Roscoe},
journal = {{ACM} Transactions on Computer Systems},
month = nov,
number = {4},
title = {Arrakis: The Operating System Is the Control Plane},
volume = {33},
year = {2015},
abstract = {Recent device hardware trends enable a new approach to the
design of network server operating systems. In a
traditional operating system, the kernel mediates access to
device hardware by server applications to enforce process
isolation as well as network and disk security. We have
designed and implemented a new operating system, Arrakis,
that splits the traditional role of the kernel in two.
Applications have direct access to virtualized I/O devices,
allowing most I/O operations to skip the kernel entirely,
while the kernel is re-engineered to provide network and
disk protection without kernel mediation of every
operation. We describe the hardware and software changes
needed to take advantage of this new abstraction, and we
illustrate its power by showing improvements of 2-5x in
latency and 9x in throughput for a popular persistent NoSQL
store relative to a well-tuned Linux implementation.},
doi = {10.1145/2812806},
pdf = {papers/arrakis-tocs15.pdf},
code = {https://github.com/UWNetworksLab/arrakis},
monthnum = {11}
}
@techreport{zhang15:_build_consis_trans_incon_replic_exten_version,
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
institution = {University of Washington CSE},
month = oct,
number = {UW-CSE-2014-12-01 v2},
title = {Building Consistent Transactions with Inconsistent
Replication (Extended Version)},
year = {2015},
abstract = {Application programmers increasingly prefer distributed
storage systems with strong consistency and distributed
transactions (e.g., Google's Spanner) for their strong
guarantees and ease of use. Unfortunately, existing
transactional storage systems are expensive to use -- in
part because they require costly replication protocols,
like Paxos, for fault tolerance. In this paper, we present
a new approach that makes transactional storage systems
more affordable: we eliminate consistency from the
replication protocol while still providing distributed
transactions with strong consistency to applications.
We present TAPIR -- the Transactional Application Protocol
for Inconsistent Replication -- the first transaction
protocol to use a novel replication protocol, called
inconsistent replication, that provides fault tolerance
without consistency. By enforcing strong consistency only
in the transaction protocol, TAPIR can commit transactions
in a single round-trip and order distributed transactions
without centralized coordination. We demonstrate the use of
TAPIR in a transactional key-value store, TAPIR-KV.
Compared to conventional systems, TAPIR-KV provides better
latency and throughput.},
pdf = {papers/tapir-tr-v2.pdf},
code = {https://github.com/uwsyslab/tapir/},
cats = {https://discuss.systems/@dan/109389114238305747},
monthnum = {10}
}
@techreport{li14:_tales_tail_tr,
address = {Seattle, WA, USA},
author = {Jialin Li and Naveen Kr. Sharma and Dan R. K. Ports and
Steven D. Gribble},
institution = {University of Washington CSE},
month = apr,
number = {UW-CSE-14-04-01},
title = {Tales of the Tail: Hardware, {OS}, and Application-level
Sources of Tail Latency},
year = {2014},
abstract = {Interactive services often have large-scale parallel
implementations. To deliver fast responses, the median and
tail latencies of a service's components must be low. In
this paper, we explore the hardware, OS, and
application-level sources of poor tail latency in high
throughput servers executing on multi-core machines. We
first review the basic queuing theory that governs service
latency. Using fine-grained measurements of three different
servers (a null RPC service, Memcached, and Nginx) on
Linux, we then explore why these servers exhibit
significantly worse tail latencies than queuing models
alone predict. The underlying causes include interference
from background processes, request re-ordering caused by
poor scheduling or constrained concurrency models,
suboptimal interrupt routing, CPU power saving mechanisms,
and NUMA effects.
We systematically eliminate these
factors and show that Memcached can achieve a median
latency of 11 us and a 99.9th percentile latency of 32 us
at 75\% utilization. In comparison, a naive deployment of
Memcached has a median latency of 33 us and a 99.9th
percentile latency of 14 ms. Finally, we demonstrate that a
tradeoff often exists between throughput and tail latency.},
pdf = {papers/latency-tr14.pdf},
supersededby = {li14:_tales_tail},
supersededas = {Tech. Report},
monthnum = {04}
}
@techreport{peter14:_arrak_tr_v2,
address = {Seattle, WA, USA},
author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
Ports and Arvind Krishnamurthy and Thomas Anderson and
Timothy Roscoe},
institution = {University of Washington CSE},
month = may,
number = {UW-CSE-13-10-01, version 2.0},
title = {Arrakis: The Operating System is the Control Plane},
year = {2014},
abstract = {Recent device hardware trends enable a new approach to the
design of network server operating systems. In a
traditional operating system, the kernel mediates access to
device hardware by server applications, to enforce process
isolation as well as network and disk security. We have
designed and implemented a new operating system, Arrakis,
that splits the traditional role of the kernel in two.
Applications have direct access to virtualized I/O devices,
allowing most I/O operations to skip the kernel entirely,
while the kernel is re-engineered to provide network and
disk protection without kernel mediation of every
operation. We describe the hardware and software changes
needed to take advantage of this new abstraction, and we
illustrate its power by showing 2-5x end-to-end latency and
9x throughput improvements for a popular persistent NoSQL
store relative to a well-tuned Linux implementation.},
pdf = {papers/arrakis-tr-ver2.pdf},
supersededby = {peter14:_arrak},
supersededas = {Tech. Report (v2.0, 2014)},
monthnum = {05}
}
@techreport{zhang14:_build_consis_trans_incon_replic,
author = {Irene Zhang and Naveen Kr. Sharma and Adriana Szekeres and
Arvind Krishnamurthy and Dan R. K. Ports},
institution = {University of Washington CSE},
month = dec,
number = {UW-CSE-2014-12-01},
title = {Building Consistent Transactions with Inconsistent
Replication},
year = {2014},
pdf = {papers/tapir-tr14.pdf},
supersededby = {zhang15:_build_consis_trans_incon_replic_exten_version},
supersededas = {Tech. Report (2014)},
monthnum = {12}
}
@techreport{hornyack13:_study_virtual_memor_usage_implic_large_memor,
address = {Seattle, WA},
author = {Peter Hornyack and Luis Ceze and Steven D. Gribble and Dan
R. K. Ports and Henry M. Levy},
institution = {University of Washington CSE},
title = {A Study of Virtual Memory Usage and Implications for Large
Memory},
year = {2013},
abstract = {The mechanisms now used to implement virtual memory --
pages, page tables, and TLBs -- have worked remarkably well
for over fifty years. However, they are beginning to show
their age due to current trends, such as significant
increases in physical memory size, emerging data-intensive
applications, and imminent non-volatile main memory. These
trends call into question whether page-based
address-translation and protection mechanisms remain viable
solutions in the future. In this paper, we present a
detailed study of how modern applications use virtual
memory. Among other topics, our study examines the
footprint of mapped regions, the use of memory protection,
and the overhead of TLBs. Our results suggest that a
segment-based translation mechanism, together with a
fine-grained protection mechanism, merit consideration for
future systems.},
pdf = {papers/vmstudy-tr13.pdf},
monthnum = {}
}
@techreport{peter13:_arrak,
address = {Seattle, WA, USA},
author = {Simon Peter and Jialin Li and Irene Zhang and Dan R. K.
Ports and Arvind Krishnamurthy and Thomas Anderson and
Timothy Roscoe},
institution = {University of Washington CSE},
month = oct,
number = {UW-CSE-13-10-01},
title = {Arrakis: The Operating System is the Control Plane},
year = {2013},
abstract = {Recent device hardware trends enable a new approach to the
design of network servers. In a traditional operating
system, the kernel mediates access to device hardware by
server applications, to enforce process isolation as well
as network and disk security. We have designed and
implemented a new operating system, Arrakis, that splits
the traditional role of the kernel in two. Applications
have direct access to virtualized I/O devices, allowing
most I/O operations to skip the kernel entirely. The
Arrakis kernel operates only in the control plane. We
describe the hardware and software changes needed to
take advantage of this new abstraction, and we illustrate
its power by showing significant latency and throughput
improvements for network server applications relative to a
well-tuned Linux implementation.},
pdf = {papers/arrakis-tr.pdf},
supersededby = {peter14:_arrak},
supersededas = {Tech. Report (v1.0, 2013)},
monthnum = {10}
}
@phdthesis{ports12:_applic_level_cachin_trans_consis,
address = {Cambridge, MA, USA},
author = {Dan R. K. Ports},
month = jun,
school = {Massachusetts Institute of Technology},
type = {Ph.D. thesis},
title = {Application-Level Caching with Transactional Consistency},
year = {2012},
abstract = {Distributed in-memory application data caches like
memcached are a popular solution for scaling
database-driven web sites. These systems increase
performance significantly by reducing load on both the
database and application servers. Unfortunately, such
caches present two challenges for application developers.
First, they cannot ensure that the application sees a
consistent view of the data within a transaction, violating
the isolation properties of the underlying database.
Second, they leave the application responsible for locating
data in the cache and keeping it up to date, a frequent
source of application complexity and programming errors.
This thesis addresses both of these problems in a new
cache called TxCache. TxCache is a transactional cache: it
ensures that any data seen within a transaction, whether
from the cache or the database, reflects a slightly stale
but consistent snapshot of the database. TxCache also
offers a simple programming model. Application developers
simply designate certain functions as cacheable, and the
system automatically caches their results and invalidates
the cached data as the underlying database changes.
Our
experiments found that TxCache can substantially increase
the performance of a web application: on the RUBiS
benchmark, it increases throughput by up to 5.2x relative
to a system without caching. More importantly, on this
application, TxCache achieves performance comparable
(within 5\%) to that of a non-transactional cache, showing
that consistency does not have to come at the price of
performance.},
doi = {1721.1/75448},
pdf = {papers/thesis.pdf},
monthnum = {06}
}
@article{ports12:_serial_snaps_isolat_postg,
author = {Dan R. K. Ports and Kevin Grittner},
journal = {Proceedings of the VLDB Endowment},
month = aug,
number = {12},
pages = {1850--1861},
title = {Serializable Snapshot Isolation in {PostgreSQL}},
volume = {5},
year = {2012},
abstract = {This paper describes our experience implementing
PostgreSQL's new serializable isolation level. It is based
on the recently-developed Serializable Snapshot Isolation
(SSI) technique. This is the first implementation of SSI in
a production database release as well as the first in a
database that did not previously have a lock-based
serializable isolation level. We reflect on our experience
and describe how we overcame some of the resulting
challenges, including the implementation of a new lock
manager, a technique for ensuring memory usage is bounded,
and integration with other PostgreSQL features. We also
introduce an extension to SSI that improves performance for
read-only transactions. We evaluate PostgreSQL's
serializable isolation level using several benchmarks and
show that it achieves performance only slightly below that
of snapshot isolation, and significantly outperforms the
traditional two-phase locking approach on read-intensive
workloads.},
doi = {10.48550/arXiv.1208.4179},
pdf = {papers/ssi-vldb12.pdf},
slidespdf = {papers/ssi-vldb12-slides.pdf},
code = {https://www.postgresql.org/},
monthnum = {08}
}